diff --git a/.ci/generate-buildkite-pipeline-premerge b/.ci/generate-buildkite-pipeline-premerge
index 4ebf304e23d587..2e503c867403bc 100755
--- a/.ci/generate-buildkite-pipeline-premerge
+++ b/.ci/generate-buildkite-pipeline-premerge
@@ -68,7 +68,7 @@ function compute-projects-to-test() {
       done
     ;;
     clang)
-      for p in clang-tools-extra compiler-rt flang libc lldb openmp cross-project-tests; do
+      for p in clang-tools-extra compiler-rt flang lldb cross-project-tests; do
         echo $p
       done
     ;;
@@ -224,7 +224,7 @@ fi
 # needs while letting them run on the infrastructure provided by LLVM.
 
 # Figure out which projects need to be built on each platform
-all_projects="bolt clang-tools-extra compiler-rt cross-project-tests flang libc libclc lld lldb llvm mlir openmp polly pstl"
+all_projects="bolt clang clang-tools-extra compiler-rt cross-project-tests flang libc libclc lld lldb llvm mlir openmp polly pstl"
 modified_projects="$(keep-modified-projects ${all_projects})"
 
 linux_projects_to_test=$(exclude-linux $(compute-projects-to-test ${modified_projects}))
diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index 85850848b220c4..561da6a588c016 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -32,6 +32,9 @@
 /clang/www/cxx_dr_status.html @Endilll
 /clang/www/make_cxx_dr_status @Endilll
 
+clang/lib/AST/Interp/ @tbaederr
+clang/test/AST/Interp/ @tbaederr
+
 /lldb/ @JDevlieghere
 
 # MLIR Interfaces.
@@ -105,3 +108,6 @@
 
 # BOLT
 /bolt/ @aaupov @maksfb @rafaelauler @ayermolo @dcci
+
+# Bazel build system.
+/utils/bazel/ @rupprecht
diff --git a/.github/new-prs-labeler.yml b/.github/new-prs-labeler.yml
index 9a580c6d6984e1..a0428336d300f9 100644
--- a/.github/new-prs-labeler.yml
+++ b/.github/new-prs-labeler.yml
@@ -629,6 +629,8 @@ backend:DirectX:
   - '**/*DirectX*/**'
   - '**/*DXIL*/**'
   - '**/*dxil*/**'
+  - '**/*DXContainer*'
+  - '**/*DXContainer*/**'
 
 backend:SPIR-V:
   - clang/lib/Driver/ToolChains/SPIRV.*
@@ -933,3 +935,6 @@ openmp:libomp:
 
 openmp:libomptarget:
   - any: ['openmp/**', '!openmp/runtime/**']
+
+bazel:
+  - utils/bazel/**
diff --git a/.github/workflows/llvm-tests.yml b/.github/workflows/llvm-tests.yml
index 127628d76f1913..64d60bc3da45e1 100644
--- a/.github/workflows/llvm-tests.yml
+++ b/.github/workflows/llvm-tests.yml
@@ -42,6 +42,7 @@ jobs:
       BASELINE_REF: ${{ steps.vars.outputs.BASELINE_REF }}
       ABI_HEADERS: ${{ steps.vars.outputs.ABI_HEADERS }}
       BASELINE_VERSION_MAJOR: ${{ steps.vars.outputs.BASELINE_VERSION_MAJOR }}
+      BASELINE_VERSION_MINOR: ${{ steps.vars.outputs.BASELINE_VERSION_MINOR }}
       LLVM_VERSION_MAJOR: ${{ steps.version.outputs.LLVM_VERSION_MAJOR }}
       LLVM_VERSION_MINOR: ${{ steps.version.outputs.LLVM_VERSION_MINOR }}
       LLVM_VERSION_PATCH: ${{ steps.version.outputs.LLVM_VERSION_PATCH }}
@@ -58,7 +59,14 @@ jobs:
       - name: Setup Variables
         id: vars
         run: |
-          if [ ${{ steps.version.outputs.LLVM_VERSION_MINOR }} -ne 0 ] || [ ${{ steps.version.outputs.LLVM_VERSION_PATCH }} -eq 0 ]; then
+          # C++ ABI:
+          # 18.1.0 we aren't doing ABI checks.
+          # 18.1.1 We want to check 18.1.0.
+          # C ABI:
+          # 18.1.0 We want to check 17.0.x
+          # 18.1.1 We want to check 18.1.0
+          echo "BASELINE_VERSION_MINOR=1" >> "$GITHUB_OUTPUT"
+          if [ ${{ steps.version.outputs.LLVM_VERSION_PATCH }} -eq 0 ]; then
             {
               echo "BASELINE_VERSION_MAJOR=$(( ${{ steps.version.outputs.LLVM_VERSION_MAJOR }} - 1))"
               echo "ABI_HEADERS=llvm-c"
@@ -82,7 +90,7 @@ jobs:
         include:
           - name: build-baseline
             llvm_version_major: ${{ needs.abi-dump-setup.outputs.BASELINE_VERSION_MAJOR }}
-            ref: llvmorg-${{ needs.abi-dump-setup.outputs.BASELINE_VERSION_MAJOR }}.0.0
+            ref: llvmorg-${{ needs.abi-dump-setup.outputs.BASELINE_VERSION_MAJOR }}.${{ needs.abi-dump-setup.outputs.BASELINE_VERSION_MINOR }}.0
             repo: llvm/llvm-project
           - name: build-latest
             llvm_version_major: ${{ needs.abi-dump-setup.outputs.LLVM_VERSION_MAJOR }}
diff --git a/.github/workflows/pr-code-format.yml b/.github/workflows/pr-code-format.yml
index bbb3789782f486..efd1983293ef8c 100644
--- a/.github/workflows/pr-code-format.yml
+++ b/.github/workflows/pr-code-format.yml
@@ -52,7 +52,7 @@ jobs:
       - name: Install clang-format
         uses: aminya/setup-cpp@v1
         with:
-          clangformat: 17.0.1
+          clangformat: 18.1.1
 
       - name: Setup Python env
         uses: actions/setup-python@v4
diff --git a/bolt/include/bolt/Core/DIEBuilder.h b/bolt/include/bolt/Core/DIEBuilder.h
index 4debf936335472..06084819ec0b3a 100644
--- a/bolt/include/bolt/Core/DIEBuilder.h
+++ b/bolt/include/bolt/Core/DIEBuilder.h
@@ -209,7 +209,13 @@ class DIEBuilder {
   void updateReferences();
 
   /// Update the Offset and Size of DIE, populate DebugNames table.
-  uint32_t finalizeDIEs(DWARFUnit &CU, DIE &Die, uint32_t &CurOffset);
+  /// Along with current CU, and DIE being processed and the new DIE offset to
+  /// be updated, it takes in Parents vector that can be empty if this DIE has
+  /// no parents.
+  uint32_t
+  finalizeDIEs(DWARFUnit &CU, DIE &Die,
+               std::vector<std::optional<BOLTDWARF5AccelTableData *>> &Parents,
+               uint32_t &CurOffset);
 
   void registerUnit(DWARFUnit &DU, bool NeedSort);
 
diff --git a/bolt/include/bolt/Core/DebugNames.h b/bolt/include/bolt/Core/DebugNames.h
index 84c448aa1354cc..1f17f1ae4d139b 100644
--- a/bolt/include/bolt/Core/DebugNames.h
+++ b/bolt/include/bolt/Core/DebugNames.h
@@ -36,6 +36,9 @@ class BOLTDWARF5AccelTableData : public DWARF5AccelTableData {
   bool isTU() const { return DWARF5AccelTableData::isTU(); }
   std::optional<unsigned> getSecondUnitID() const { return SecondUnitID; }
 
+  void setPatchOffset(uint64_t PatchOffset) { OffsetVal = PatchOffset; }
+  uint64_t getPatchOffset() const { return std::get<uint64_t>(OffsetVal); }
+
 private:
   std::optional<unsigned> SecondUnitID;
 };
@@ -49,10 +52,12 @@ class DWARF5AcceleratorTable {
       Abbrev->~DebugNamesAbbrev();
   }
   /// Add DWARF5 Accelerator table entry.
-  /// Input is DWARFUnit being processed, DIE that belongs to it, and potential
-  /// SkeletonCU if the Unit comes from a DWO section.
-  void addAccelTableEntry(DWARFUnit &Unit, const DIE &Die,
-                          const std::optional<uint64_t> &DWOID);
+  /// Input is DWARFUnit being processed, DIE that belongs to it, potential
+  /// DWOID if the Unit comes from a DWO section, and potential parent entry.
+  std::optional<BOLTDWARF5AccelTableData *>
+  addAccelTableEntry(DWARFUnit &Unit, const DIE &Die,
+                     const std::optional<uint64_t> &DWOID,
+                     std::optional<BOLTDWARF5AccelTableData *> &Parent);
   /// Set current unit being processed.
   void setCurrentUnit(DWARFUnit &Unit, const uint64_t UnitStartOffset);
   /// Emit Accelerator table.
@@ -121,6 +126,8 @@ class DWARF5AcceleratorTable {
   llvm::DenseMap<llvm::hash_code, uint64_t> StrCacheToOffsetMap;
   // Contains DWO ID to CUList Index.
   llvm::DenseMap<uint64_t, uint32_t> CUOffsetsToPatch;
+  // Contains a map of Entry ID to Entry relative offset.
+  llvm::DenseMap<uint64_t, uint32_t> EntryRelativeOffsets;
   /// Adds Unit to either CUList, LocalTUList or ForeignTUList.
   /// Input Unit being processed, and DWO ID if Unit is being processed comes
   /// from a DWO section.
@@ -143,7 +150,7 @@ class DWARF5AcceleratorTable {
   /// Write Entries.
   void writeEntries();
   /// Write an Entry.
-  void writeEntry(const BOLTDWARF5AccelTableData &Entry);
+  void writeEntry(BOLTDWARF5AccelTableData &Entry);
   /// Write augmentation_string for BOLT.
   void writeAugmentationString();
   /// Emit out Header for DWARF5 Accelerator table.
diff --git a/bolt/include/bolt/Core/MCPlusBuilder.h b/bolt/include/bolt/Core/MCPlusBuilder.h
index 6bb76d1b917db3..96b58f54162344 100644
--- a/bolt/include/bolt/Core/MCPlusBuilder.h
+++ b/bolt/include/bolt/Core/MCPlusBuilder.h
@@ -487,10 +487,9 @@ class MCPlusBuilder {
     llvm_unreachable("not implemented");
   }
 
-  virtual bool createDirectCall(MCInst &Inst, const MCSymbol *Target,
+  virtual void createDirectCall(MCInst &Inst, const MCSymbol *Target,
                                 MCContext *Ctx, bool IsTailCall) {
     llvm_unreachable("not implemented");
-    return false;
   }
 
   virtual MCPhysReg getX86R11() const { llvm_unreachable("not implemented"); }
@@ -1534,15 +1533,13 @@ class MCPlusBuilder {
   }
 
   /// Create a no-op instruction.
-  virtual bool createNoop(MCInst &Inst) const {
+  virtual void createNoop(MCInst &Inst) const {
     llvm_unreachable("not implemented");
-    return false;
   }
 
   /// Create a return instruction.
-  virtual bool createReturn(MCInst &Inst) const {
+  virtual void createReturn(MCInst &Inst) const {
     llvm_unreachable("not implemented");
-    return false;
   }
 
   /// Store \p Target absolute address to \p RegName
@@ -1556,32 +1553,30 @@ class MCPlusBuilder {
 
   /// Creates a new unconditional branch instruction in Inst and set its operand
   /// to TBB.
-  ///
-  /// Returns true on success.
-  virtual bool createUncondBranch(MCInst &Inst, const MCSymbol *TBB,
+  virtual void createUncondBranch(MCInst &Inst, const MCSymbol *TBB,
                                   MCContext *Ctx) const {
     llvm_unreachable("not implemented");
-    return false;
+  }
+
+  /// Create a version of unconditional jump that has the largest span for a
+  /// single instruction with direct target.
+  virtual void createLongUncondBranch(MCInst &Inst, const MCSymbol *Target,
+                                      MCContext *Ctx) const {
+    llvm_unreachable("not implemented");
   }
 
   /// Creates a new call instruction in Inst and sets its operand to
   /// Target.
-  ///
-  /// Returns true on success.
-  virtual bool createCall(MCInst &Inst, const MCSymbol *Target,
+  virtual void createCall(MCInst &Inst, const MCSymbol *Target,
                           MCContext *Ctx) {
     llvm_unreachable("not implemented");
-    return false;
   }
 
   /// Creates a new tail call instruction in Inst and sets its operand to
   /// Target.
-  ///
-  /// Returns true on success.
-  virtual bool createTailCall(MCInst &Inst, const MCSymbol *Target,
+  virtual void createTailCall(MCInst &Inst, const MCSymbol *Target,
                               MCContext *Ctx) {
     llvm_unreachable("not implemented");
-    return false;
   }
 
   virtual void createLongTailCall(InstructionListType &Seq,
@@ -1590,43 +1585,36 @@ class MCPlusBuilder {
   }
 
   /// Creates a trap instruction in Inst.
-  ///
-  /// Returns true on success.
-  virtual bool createTrap(MCInst &Inst) const {
+  virtual void createTrap(MCInst &Inst) const {
     llvm_unreachable("not implemented");
-    return false;
   }
 
   /// Creates an instruction to bump the stack pointer just like a call.
-  virtual bool createStackPointerIncrement(MCInst &Inst, int Size = 8,
+  virtual void createStackPointerIncrement(MCInst &Inst, int Size = 8,
                                            bool NoFlagsClobber = false) const {
     llvm_unreachable("not implemented");
-    return false;
   }
 
   /// Creates an instruction to move the stack pointer just like a ret.
-  virtual bool createStackPointerDecrement(MCInst &Inst, int Size = 8,
+  virtual void createStackPointerDecrement(MCInst &Inst, int Size = 8,
                                            bool NoFlagsClobber = false) const {
     llvm_unreachable("not implemented");
-    return false;
   }
 
   /// Create a store instruction using \p StackReg as the base register
   /// and \p Offset as the displacement.
-  virtual bool createSaveToStack(MCInst &Inst, const MCPhysReg &StackReg,
+  virtual void createSaveToStack(MCInst &Inst, const MCPhysReg &StackReg,
                                  int Offset, const MCPhysReg &SrcReg,
                                  int Size) const {
     llvm_unreachable("not implemented");
-    return false;
   }
 
-  virtual bool createLoad(MCInst &Inst, const MCPhysReg &BaseReg, int64_t Scale,
+  virtual void createLoad(MCInst &Inst, const MCPhysReg &BaseReg, int64_t Scale,
                           const MCPhysReg &IndexReg, int64_t Offset,
                           const MCExpr *OffsetExpr,
                           const MCPhysReg &AddrSegmentReg,
                           const MCPhysReg &DstReg, int Size) const {
     llvm_unreachable("not implemented");
-    return false;
   }
 
   virtual InstructionListType createLoadImmediate(const MCPhysReg Dest,
@@ -1636,32 +1624,27 @@ class MCPlusBuilder {
 
   /// Create a fragment of code (sequence of instructions) that load a 32-bit
   /// address from memory, zero-extends it to 64 and jump to it (indirect jump).
-  virtual bool
+  virtual void
   createIJmp32Frag(SmallVectorImpl<MCInst> &Insts, const MCOperand &BaseReg,
                    const MCOperand &Scale, const MCOperand &IndexReg,
                    const MCOperand &Offset, const MCOperand &TmpReg) const {
     llvm_unreachable("not implemented");
-    return false;
   }
 
   /// Create a load instruction using \p StackReg as the base register
   /// and \p Offset as the displacement.
-  virtual bool createRestoreFromStack(MCInst &Inst, const MCPhysReg &StackReg,
+  virtual void createRestoreFromStack(MCInst &Inst, const MCPhysReg &StackReg,
                                       int Offset, const MCPhysReg &DstReg,
                                       int Size) const {
     llvm_unreachable("not implemented");
-    return false;
   }
 
   /// Creates a call frame pseudo instruction. A single operand identifies which
   /// MCCFIInstruction this MCInst is referring to.
-  ///
-  /// Returns true on success.
-  virtual bool createCFI(MCInst &Inst, int64_t Offset) const {
+  virtual void createCFI(MCInst &Inst, int64_t Offset) const {
     Inst.clear();
     Inst.setOpcode(TargetOpcode::CFI_INSTRUCTION);
     Inst.addOperand(MCOperand::createImm(Offset));
-    return true;
   }
 
   /// Create an inline version of memcpy(dest, src, 1).
@@ -1699,6 +1682,12 @@ class MCPlusBuilder {
     return Inst.getOpcode() == TargetOpcode::CFI_INSTRUCTION;
   }
 
+  /// Create a conditional branch with a target-specific conditional code \p CC.
+  virtual void createCondBranch(MCInst &Inst, const MCSymbol *Target,
+                                unsigned CC, MCContext *Ctx) const {
+    llvm_unreachable("not implemented");
+  }
+
   /// Reverses the branch condition in Inst and update its taken target to TBB.
   ///
   /// Returns true on success.
diff --git a/bolt/lib/Core/DIEBuilder.cpp b/bolt/lib/Core/DIEBuilder.cpp
index 42287978a8576f..0cf8a5e8c2c3db 100644
--- a/bolt/lib/Core/DIEBuilder.cpp
+++ b/bolt/lib/Core/DIEBuilder.cpp
@@ -377,20 +377,32 @@ getUnitForOffset(DIEBuilder &Builder, DWARFContext &DWCtx,
   return nullptr;
 }
 
-uint32_t DIEBuilder::finalizeDIEs(DWARFUnit &CU, DIE &Die,
-                                  uint32_t &CurOffset) {
+uint32_t DIEBuilder::finalizeDIEs(
+    DWARFUnit &CU, DIE &Die,
+    std::vector<std::optional<BOLTDWARF5AccelTableData *>> &Parents,
+    uint32_t &CurOffset) {
   getState().DWARFDieAddressesParsed.erase(Die.getOffset());
   uint32_t CurSize = 0;
   Die.setOffset(CurOffset);
-  DebugNamesTable.addAccelTableEntry(
-      CU, Die, SkeletonCU ? SkeletonCU->getDWOId() : std::nullopt);
+  std::optional<BOLTDWARF5AccelTableData *> NameEntry =
+      DebugNamesTable.addAccelTableEntry(
+          CU, Die, SkeletonCU ? SkeletonCU->getDWOId() : std::nullopt,
+          Parents.back());
+  // It is possible that an indexed debugging information entry has a parent
+  // that is not indexed (for example, if its parent does not have a name
+  // attribute). In such a case, a parent attribute may point to a nameless
+  // index entry (that is, one that cannot be reached from any entry in the name
+  // table), or it may point to the nearest ancestor that does have an index
+  // entry.
+  if (NameEntry)
+    Parents.push_back(std::move(NameEntry));
   for (DIEValue &Val : Die.values())
     CurSize += Val.sizeOf(CU.getFormParams());
   CurSize += getULEB128Size(Die.getAbbrevNumber());
   CurOffset += CurSize;
 
   for (DIE &Child : Die.children()) {
-    uint32_t ChildSize = finalizeDIEs(CU, Child, CurOffset);
+    uint32_t ChildSize = finalizeDIEs(CU, Child, Parents, CurOffset);
     CurSize += ChildSize;
   }
   // for children end mark.
@@ -400,6 +412,8 @@ uint32_t DIEBuilder::finalizeDIEs(DWARFUnit &CU, DIE &Die,
   }
 
   Die.setSize(CurSize);
+  if (NameEntry)
+    Parents.pop_back();
 
   return CurSize;
 }
@@ -410,7 +424,9 @@ void DIEBuilder::finish() {
     uint32_t HeaderSize = CU.getHeaderSize();
     uint32_t CurOffset = HeaderSize;
     DebugNamesTable.setCurrentUnit(CU, UnitStartOffset);
-    finalizeDIEs(CU, *UnitDIE, CurOffset);
+    std::vector<std::optional<BOLTDWARF5AccelTableData *>> Parents;
+    Parents.push_back(std::nullopt);
+    finalizeDIEs(CU, *UnitDIE, Parents, CurOffset);
 
     DWARFUnitInfo &CurUnitInfo = getUnitInfoByDwarfUnit(CU);
     CurUnitInfo.UnitOffset = UnitStartOffset;
diff --git a/bolt/lib/Core/DebugNames.cpp b/bolt/lib/Core/DebugNames.cpp
index 384e63695dfdc5..23a29f52513c04 100644
--- a/bolt/lib/Core/DebugNames.cpp
+++ b/bolt/lib/Core/DebugNames.cpp
@@ -13,6 +13,7 @@
 #include "llvm/Support/EndianStream.h"
 #include "llvm/Support/LEB128.h"
 #include <cstdint>
+#include <optional>
 
 namespace llvm {
 namespace bolt {
@@ -163,10 +164,16 @@ static uint64_t getNameOffset(BinaryContext &BC, DWARFUnit &Unit,
                                    Index * DwarfOffsetByteSize);
 }
 
-void DWARF5AcceleratorTable::addAccelTableEntry(
-    DWARFUnit &Unit, const DIE &Die, const std::optional<uint64_t> &DWOID) {
+static uint64_t getEntryID(const BOLTDWARF5AccelTableData &Entry) {
+  return reinterpret_cast<uint64_t>(&Entry);
+}
+
+std::optional<BOLTDWARF5AccelTableData *>
+DWARF5AcceleratorTable::addAccelTableEntry(
+    DWARFUnit &Unit, const DIE &Die, const std::optional<uint64_t> &DWOID,
+    std::optional<BOLTDWARF5AccelTableData *> &Parent) {
   if (Unit.getVersion() < 5 || !NeedToCreate)
-    return;
+    return std::nullopt;
   std::string NameToUse = "";
   auto canProcess = [&](const DIE &Die) -> bool {
     switch (Die.getTag()) {
@@ -217,7 +224,7 @@ void DWARF5AcceleratorTable::addAccelTableEntry(
   };
 
   if (!canProcess(Die))
-    return;
+    return std::nullopt;
 
   // Addes a Unit to either CU, LocalTU or ForeignTU list the first time we
   // encounter it.
@@ -227,10 +234,10 @@ void DWARF5AcceleratorTable::addAccelTableEntry(
     addUnit(Unit, DWOID);
   }
 
-  auto addEntry = [&](DIEValue ValName) -> void {
+  auto getName = [&](DIEValue ValName) -> std::optional<std::string> {
     if ((!ValName || ValName.getForm() == dwarf::DW_FORM_string) &&
         NameToUse.empty())
-      return;
+      return std::nullopt;
     std::string Name = "";
     uint64_t NameIndexOffset = 0;
     if (NameToUse.empty()) {
@@ -260,7 +267,16 @@ void DWARF5AcceleratorTable::addAccelTableEntry(
       // This the same hash function used in DWARF5AccelTableData.
       It.HashValue = caseFoldingDjbHash(Name);
     }
+    return Name;
+  };
+
+  auto addEntry =
+      [&](DIEValue ValName) -> std::optional<BOLTDWARF5AccelTableData *> {
+    std::optional<std::string> Name = getName(ValName);
+    if (!Name)
+      return std::nullopt;
 
+    auto &It = Entries[*Name];
     bool IsTU = false;
     uint32_t DieTag = 0;
     uint32_t UnitID = getUnitID(Unit, IsTU, DieTag);
@@ -270,18 +286,53 @@ void DWARF5AcceleratorTable::addAccelTableEntry(
       if (Iter == CUOffsetsToPatch.end())
         BC.errs() << "BOLT-WARNING: [internal-dwarf-warning]: Could not find "
                      "DWO ID in CU offsets for second Unit Index "
-                  << Name << ". For DIE at offset: "
+                  << *Name << ". For DIE at offset: "
                   << Twine::utohexstr(CurrentUnitOffset + Die.getOffset())
                   << ".\n";
       SecondIndex = Iter->second;
     }
+    std::optional<uint64_t> ParentOffset =
+        (Parent ? std::optional<uint64_t>(getEntryID(**Parent)) : std::nullopt);
+    // This will be populated later in writeEntry.
+    // This way only parent entries get tracked.
+    // Keeping memory footprint down.
+    if (ParentOffset)
+      EntryRelativeOffsets.insert({*ParentOffset, 0});
     It.Values.push_back(new (Allocator) BOLTDWARF5AccelTableData(
-        Die.getOffset(), std::nullopt, DieTag, UnitID, IsTU, SecondIndex));
+        Die.getOffset(), ParentOffset, DieTag, UnitID, IsTU, SecondIndex));
+    return It.Values.back();
   };
 
-  addEntry(Die.findAttribute(dwarf::Attribute::DW_AT_name));
-  addEntry(Die.findAttribute(dwarf::Attribute::DW_AT_linkage_name));
-  return;
+  // Minor optimization not to add entry twice for DW_TAG_namespace if it has no
+  // DW_AT_name.
+  if (!(Die.getTag() == dwarf::DW_TAG_namespace &&
+        !Die.findAttribute(dwarf::Attribute::DW_AT_name)))
+    addEntry(Die.findAttribute(dwarf::Attribute::DW_AT_linkage_name));
+  // For the purposes of determining whether a debugging information entry has a
+  // particular attribute (such as DW_AT_name), if debugging information entry A
+  // has a DW_AT_specification or DW_AT_abstract_origin attribute pointing to
+  // another debugging information entry B, any attributes of B are considered
+  // to be part of A.
+  auto processReferencedDie = [&](const dwarf::Attribute &Attr)
+      -> std::optional<BOLTDWARF5AccelTableData *> {
+    const DIEValue Value = Die.findAttribute(Attr);
+    if (!Value)
+      return std::nullopt;
+    const DIEEntry &DIEENtry = Value.getDIEEntry();
+    DIE &EntryDie = DIEENtry.getEntry();
+    addEntry(EntryDie.findAttribute(dwarf::Attribute::DW_AT_linkage_name));
+    return addEntry(EntryDie.findAttribute(dwarf::Attribute::DW_AT_name));
+  };
+
+  if (std::optional<BOLTDWARF5AccelTableData *> Entry =
+          processReferencedDie(dwarf::Attribute::DW_AT_abstract_origin))
+    return *Entry;
+  if (std::optional<BOLTDWARF5AccelTableData *> Entry =
+          processReferencedDie(dwarf::Attribute::DW_AT_specification))
+    return *Entry;
+
+  return addEntry(Die.findAttribute(dwarf::Attribute::DW_AT_name));
+  ;
 }
 
 /// Algorithm from llvm implementation.
@@ -382,6 +433,11 @@ void DWARF5AcceleratorTable::populateAbbrevsMap() {
         if (SecondEntryRet)
           Abbrev.addAttribute(SecondEntryRet->Encoding);
         Abbrev.addAttribute({dwarf::DW_IDX_die_offset, dwarf::DW_FORM_ref4});
+        if (std::optional<uint64_t> Offset = Value->getParentDieOffset())
+          Abbrev.addAttribute({dwarf::DW_IDX_parent, dwarf::DW_FORM_ref4});
+        else
+          Abbrev.addAttribute(
+              {dwarf::DW_IDX_parent, dwarf::DW_FORM_flag_present});
         FoldingSetNodeID ID;
         Abbrev.Profile(ID);
         void *InsertPos;
@@ -401,7 +457,11 @@ void DWARF5AcceleratorTable::populateAbbrevsMap() {
   }
 }
 
-void DWARF5AcceleratorTable::writeEntry(const BOLTDWARF5AccelTableData &Entry) {
+void DWARF5AcceleratorTable::writeEntry(BOLTDWARF5AccelTableData &Entry) {
+  const uint64_t EntryID = getEntryID(Entry);
+  if (EntryRelativeOffsets.find(EntryID) != EntryRelativeOffsets.end())
+    EntryRelativeOffsets[EntryID] = EntriesBuffer->size();
+
   const std::optional<DWARF5AccelTable::UnitIndexAndEncoding> EntryRet =
       getIndexForEntry(Entry);
   // For forgeign type (FTU) units that need to refer to the FTU and to the CU.
@@ -456,6 +516,17 @@ void DWARF5AcceleratorTable::writeEntry(const BOLTDWARF5AccelTableData &Entry) {
                              llvm::endianness::little);
       break;
     }
+    case dwarf::DW_IDX_parent: {
+      assert(
+          (AttrEnc.Form == dwarf::DW_FORM_ref4 && Entry.getParentDieOffset()) ||
+          AttrEnc.Form == dwarf::DW_FORM_flag_present);
+      if (std::optional<uint64_t> ParentOffset = Entry.getParentDieOffset()) {
+        Entry.setPatchOffset(EntriesBuffer->size());
+        support::endian::write(*Entriestream, static_cast<uint32_t>(UINT32_MAX),
+                               llvm::endianness::little);
+      }
+      break;
+    }
     }
   }
 }
@@ -464,13 +535,34 @@ void DWARF5AcceleratorTable::writeEntries() {
   for (auto &Bucket : getBuckets()) {
     for (DWARF5AcceleratorTable::HashData *Hash : Bucket) {
       Hash->EntryOffset = EntriesBuffer->size();
-      for (const BOLTDWARF5AccelTableData *Value : Hash->Values) {
+      for (BOLTDWARF5AccelTableData *Value : Hash->Values) {
         writeEntry(*Value);
       }
       support::endian::write(*Entriestream, static_cast<uint8_t>(0),
                              llvm::endianness::little);
     }
   }
+  // Patching parent offsets.
+  for (auto &Bucket : getBuckets()) {
+    for (DWARF5AcceleratorTable::HashData *Hash : Bucket) {
+      for (BOLTDWARF5AccelTableData *Entry : Hash->Values) {
+        std::optional<uint64_t> ParentOffset = Entry->getParentDieOffset();
+        if (!ParentOffset)
+          continue;
+        if (const auto Iter = EntryRelativeOffsets.find(*ParentOffset);
+            Iter != EntryRelativeOffsets.end()) {
+          const uint64_t PatchOffset = Entry->getPatchOffset();
+          uint32_t *Ptr = reinterpret_cast<uint32_t *>(
+              &EntriesBuffer.get()->data()[PatchOffset]);
+          *Ptr = Iter->second;
+        } else {
+          BC.errs() << "BOLT-WARNING: [internal-dwarf-warning]: Could not find "
+                       "entry with offset "
+                    << *ParentOffset << "\n";
+        }
+      }
+    }
+  }
 }
 
 void DWARF5AcceleratorTable::writeAugmentationString() {
diff --git a/bolt/lib/Passes/BinaryPasses.cpp b/bolt/lib/Passes/BinaryPasses.cpp
index d2850b03e2e13f..bf1c2ddd37dd24 100644
--- a/bolt/lib/Passes/BinaryPasses.cpp
+++ b/bolt/lib/Passes/BinaryPasses.cpp
@@ -1056,10 +1056,9 @@ void Peepholes::addTailcallTraps(BinaryFunction &Function) {
     MCInst *Inst = BB.getLastNonPseudoInstr();
     if (Inst && MIB->isTailCall(*Inst) && MIB->isIndirectBranch(*Inst)) {
       MCInst Trap;
-      if (MIB->createTrap(Trap)) {
-        BB.addInstruction(Trap);
-        ++TailCallTraps;
-      }
+      MIB->createTrap(Trap);
+      BB.addInstruction(Trap);
+      ++TailCallTraps;
     }
   }
 }
diff --git a/bolt/lib/Passes/ShrinkWrapping.cpp b/bolt/lib/Passes/ShrinkWrapping.cpp
index 9a1f9d72623a38..c9706500758d1f 100644
--- a/bolt/lib/Passes/ShrinkWrapping.cpp
+++ b/bolt/lib/Passes/ShrinkWrapping.cpp
@@ -680,16 +680,15 @@ void StackLayoutModifier::performChanges() {
       if (StackPtrReg != BC.MIB->getFramePointer())
         Adjustment = -Adjustment;
       if (IsLoad)
-        Success = BC.MIB->createRestoreFromStack(
-            Inst, StackPtrReg, StackOffset + Adjustment, Reg, Size);
+        BC.MIB->createRestoreFromStack(Inst, StackPtrReg,
+                                       StackOffset + Adjustment, Reg, Size);
       else if (IsStore)
-        Success = BC.MIB->createSaveToStack(
-            Inst, StackPtrReg, StackOffset + Adjustment, Reg, Size);
+        BC.MIB->createSaveToStack(Inst, StackPtrReg, StackOffset + Adjustment,
+                                  Reg, Size);
       LLVM_DEBUG({
         dbgs() << "Adjusted instruction: ";
         Inst.dump();
       });
-      assert(Success);
     }
   }
 }
@@ -1653,19 +1652,13 @@ Expected<MCInst> ShrinkWrapping::createStackAccess(int SPVal, int FPVal,
   if (SPVal != StackPointerTracking::SUPERPOSITION &&
       SPVal != StackPointerTracking::EMPTY) {
     if (FIE.IsLoad) {
-      if (!BC.MIB->createRestoreFromStack(NewInst, BC.MIB->getStackPointer(),
-                                          FIE.StackOffset - SPVal, FIE.RegOrImm,
-                                          FIE.Size)) {
-        return createFatalBOLTError(
-            "createRestoreFromStack: not supported on this platform\n");
-      }
-    } else {
-      if (!BC.MIB->createSaveToStack(NewInst, BC.MIB->getStackPointer(),
+      BC.MIB->createRestoreFromStack(NewInst, BC.MIB->getStackPointer(),
                                      FIE.StackOffset - SPVal, FIE.RegOrImm,
-                                     FIE.Size)) {
-        return createFatalBOLTError(
-            "createSaveToStack: not supported on this platform\n");
-      }
+                                     FIE.Size);
+    } else {
+      BC.MIB->createSaveToStack(NewInst, BC.MIB->getStackPointer(),
+                                FIE.StackOffset - SPVal, FIE.RegOrImm,
+                                FIE.Size);
     }
     if (CreatePushOrPop)
       BC.MIB->changeToPushOrPop(NewInst);
@@ -1675,19 +1668,12 @@ Expected<MCInst> ShrinkWrapping::createStackAccess(int SPVal, int FPVal,
          FPVal != StackPointerTracking::EMPTY);
 
   if (FIE.IsLoad) {
-    if (!BC.MIB->createRestoreFromStack(NewInst, BC.MIB->getFramePointer(),
-                                        FIE.StackOffset - FPVal, FIE.RegOrImm,
-                                        FIE.Size)) {
-      return createFatalBOLTError(
-          "createRestoreFromStack: not supported on this platform\n");
-    }
-  } else {
-    if (!BC.MIB->createSaveToStack(NewInst, BC.MIB->getFramePointer(),
+    BC.MIB->createRestoreFromStack(NewInst, BC.MIB->getFramePointer(),
                                    FIE.StackOffset - FPVal, FIE.RegOrImm,
-                                   FIE.Size)) {
-      return createFatalBOLTError(
-          "createSaveToStack: not supported on this platform\n");
-    }
+                                   FIE.Size);
+  } else {
+    BC.MIB->createSaveToStack(NewInst, BC.MIB->getFramePointer(),
+                              FIE.StackOffset - FPVal, FIE.RegOrImm, FIE.Size);
   }
   return NewInst;
 }
diff --git a/bolt/lib/Profile/StaleProfileMatching.cpp b/bolt/lib/Profile/StaleProfileMatching.cpp
index 631ccaec6ae614..016962ff34d8df 100644
--- a/bolt/lib/Profile/StaleProfileMatching.cpp
+++ b/bolt/lib/Profile/StaleProfileMatching.cpp
@@ -705,6 +705,9 @@ void assignProfile(BinaryFunction &BF,
 
 bool YAMLProfileReader::inferStaleProfile(
     BinaryFunction &BF, const yaml::bolt::BinaryFunctionProfile &YamlBF) {
+  if (!BF.hasCFG())
+    return false;
+
   LLVM_DEBUG(dbgs() << "BOLT-INFO: applying profile inference for "
                     << "\"" << BF.getPrintName() << "\"\n");
 
diff --git a/bolt/lib/Rewrite/DWARFRewriter.cpp b/bolt/lib/Rewrite/DWARFRewriter.cpp
index 85c2397dcc5b25..8dc7b90a6e307e 100644
--- a/bolt/lib/Rewrite/DWARFRewriter.cpp
+++ b/bolt/lib/Rewrite/DWARFRewriter.cpp
@@ -283,10 +283,9 @@ class DIEStreamer : public DwarfStreamer {
   DIEStreamer(DIEBuilder *DIEBldr, DWARFRewriter &Rewriter,
               DWARFLinkerBase::OutputFileType OutFileType,
               raw_pwrite_stream &OutFile,
-              std::function<StringRef(StringRef Input)> Translator,
               DWARFLinkerBase::MessageHandlerTy Warning)
-      : DwarfStreamer(OutFileType, OutFile, Translator, Warning),
-        DIEBldr(DIEBldr), Rewriter(Rewriter){};
+      : DwarfStreamer(OutFileType, OutFile, Warning), DIEBldr(DIEBldr),
+        Rewriter(Rewriter){};
 
   using DwarfStreamer::emitCompileUnitHeader;
 
@@ -469,7 +468,6 @@ createDIEStreamer(const Triple &TheTriple, raw_pwrite_stream &OutFile,
 
   std::unique_ptr<DIEStreamer> Streamer = std::make_unique<DIEStreamer>(
       &DIEBldr, Rewriter, DWARFLinkerBase::OutputFileType::Object, OutFile,
-      [](StringRef Input) -> StringRef { return Input; },
       [&](const Twine &Warning, StringRef Context, const DWARFDie *) {});
   Error Err = Streamer->init(TheTriple, Swift5ReflectionSegmentName);
   if (Err)
diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
index 9efb428a6e1bae..0ae9d3668b93bb 100644
--- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
+++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
@@ -1026,7 +1026,7 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
     return Code;
   }
 
-  bool createTailCall(MCInst &Inst, const MCSymbol *Target,
+  void createTailCall(MCInst &Inst, const MCSymbol *Target,
                       MCContext *Ctx) override {
     return createDirectCall(Inst, Target, Ctx, /*IsTailCall*/ true);
   }
@@ -1036,11 +1036,10 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
     createShortJmp(Seq, Target, Ctx, /*IsTailCall*/ true);
   }
 
-  bool createTrap(MCInst &Inst) const override {
+  void createTrap(MCInst &Inst) const override {
     Inst.clear();
     Inst.setOpcode(AArch64::BRK);
     Inst.addOperand(MCOperand::createImm(1));
-    return true;
   }
 
   bool convertJmpToTailCall(MCInst &Inst) override {
@@ -1068,16 +1067,15 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
            Inst.getOperand(0).getImm() == 0;
   }
 
-  bool createNoop(MCInst &Inst) const override {
+  void createNoop(MCInst &Inst) const override {
     Inst.setOpcode(AArch64::HINT);
     Inst.clear();
     Inst.addOperand(MCOperand::createImm(0));
-    return true;
   }
 
   bool mayStore(const MCInst &Inst) const override { return false; }
 
-  bool createDirectCall(MCInst &Inst, const MCSymbol *Target, MCContext *Ctx,
+  void createDirectCall(MCInst &Inst, const MCSymbol *Target, MCContext *Ctx,
                         bool IsTailCall) override {
     Inst.setOpcode(IsTailCall ? AArch64::B : AArch64::BL);
     Inst.clear();
@@ -1086,7 +1084,6 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
         *Ctx, 0)));
     if (IsTailCall)
       convertJmpToTailCall(Inst);
-    return true;
   }
 
   bool analyzeBranch(InstructionIterator Begin, InstructionIterator End,
@@ -1293,14 +1290,13 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
     return true;
   }
 
-  bool createUncondBranch(MCInst &Inst, const MCSymbol *TBB,
+  void createUncondBranch(MCInst &Inst, const MCSymbol *TBB,
                           MCContext *Ctx) const override {
     Inst.setOpcode(AArch64::B);
     Inst.clear();
     Inst.addOperand(MCOperand::createExpr(getTargetExprFor(
         Inst, MCSymbolRefExpr::create(TBB, MCSymbolRefExpr::VK_None, *Ctx),
         *Ctx, 0)));
-    return true;
   }
 
   bool shouldRecordCodeRelocation(uint64_t RelType) const override {
@@ -1353,14 +1349,13 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
     return StringRef("\0\0\0\0", 4);
   }
 
-  bool createReturn(MCInst &Inst) const override {
+  void createReturn(MCInst &Inst) const override {
     Inst.setOpcode(AArch64::RET);
     Inst.clear();
     Inst.addOperand(MCOperand::createReg(AArch64::LR));
-    return true;
   }
 
-  bool createStackPointerIncrement(
+  void createStackPointerIncrement(
       MCInst &Inst, int Size,
       bool NoFlagsClobber = false /*unused for AArch64*/) const override {
     Inst.setOpcode(AArch64::SUBXri);
@@ -1369,10 +1364,9 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
     Inst.addOperand(MCOperand::createReg(AArch64::SP));
     Inst.addOperand(MCOperand::createImm(Size));
     Inst.addOperand(MCOperand::createImm(0));
-    return true;
   }
 
-  bool createStackPointerDecrement(
+  void createStackPointerDecrement(
       MCInst &Inst, int Size,
       bool NoFlagsClobber = false /*unused for AArch64*/) const override {
     Inst.setOpcode(AArch64::ADDXri);
@@ -1381,12 +1375,12 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
     Inst.addOperand(MCOperand::createReg(AArch64::SP));
     Inst.addOperand(MCOperand::createImm(Size));
     Inst.addOperand(MCOperand::createImm(0));
-    return true;
   }
 
   void createIndirectBranch(MCInst &Inst, MCPhysReg MemBaseReg,
                             int64_t Disp) const {
     Inst.setOpcode(AArch64::BR);
+    Inst.clear();
     Inst.addOperand(MCOperand::createReg(MemBaseReg));
   }
 
diff --git a/bolt/lib/Target/RISCV/RISCVMCPlusBuilder.cpp b/bolt/lib/Target/RISCV/RISCVMCPlusBuilder.cpp
index e19c070b081733..ab9623d5c51b28 100644
--- a/bolt/lib/Target/RISCV/RISCVMCPlusBuilder.cpp
+++ b/bolt/lib/Target/RISCV/RISCVMCPlusBuilder.cpp
@@ -219,46 +219,43 @@ class RISCVMCPlusBuilder : public MCPlusBuilder {
     return true;
   }
 
-  bool createReturn(MCInst &Inst) const override {
+  void createReturn(MCInst &Inst) const override {
     // TODO "c.jr ra" when RVC is enabled
     Inst.setOpcode(RISCV::JALR);
     Inst.clear();
     Inst.addOperand(MCOperand::createReg(RISCV::X0));
     Inst.addOperand(MCOperand::createReg(RISCV::X1));
     Inst.addOperand(MCOperand::createImm(0));
-    return true;
   }
 
-  bool createUncondBranch(MCInst &Inst, const MCSymbol *TBB,
+  void createUncondBranch(MCInst &Inst, const MCSymbol *TBB,
                           MCContext *Ctx) const override {
     Inst.setOpcode(RISCV::JAL);
     Inst.clear();
     Inst.addOperand(MCOperand::createReg(RISCV::X0));
     Inst.addOperand(MCOperand::createExpr(
         MCSymbolRefExpr::create(TBB, MCSymbolRefExpr::VK_None, *Ctx)));
-    return true;
   }
 
   StringRef getTrapFillValue() const override {
     return StringRef("\0\0\0\0", 4);
   }
 
-  bool createCall(unsigned Opcode, MCInst &Inst, const MCSymbol *Target,
+  void createCall(unsigned Opcode, MCInst &Inst, const MCSymbol *Target,
                   MCContext *Ctx) {
     Inst.setOpcode(Opcode);
     Inst.clear();
     Inst.addOperand(MCOperand::createExpr(RISCVMCExpr::create(
         MCSymbolRefExpr::create(Target, MCSymbolRefExpr::VK_None, *Ctx),
         RISCVMCExpr::VK_RISCV_CALL, *Ctx)));
-    return true;
   }
 
-  bool createCall(MCInst &Inst, const MCSymbol *Target,
+  void createCall(MCInst &Inst, const MCSymbol *Target,
                   MCContext *Ctx) override {
     return createCall(RISCV::PseudoCALL, Inst, Target, Ctx);
   }
 
-  bool createTailCall(MCInst &Inst, const MCSymbol *Target,
+  void createTailCall(MCInst &Inst, const MCSymbol *Target,
                       MCContext *Ctx) override {
     return createCall(RISCV::PseudoTAIL, Inst, Target, Ctx);
   }
diff --git a/bolt/lib/Target/X86/X86MCPlusBuilder.cpp b/bolt/lib/Target/X86/X86MCPlusBuilder.cpp
index 387cf481ba304f..de55fbe51764dd 100644
--- a/bolt/lib/Target/X86/X86MCPlusBuilder.cpp
+++ b/bolt/lib/Target/X86/X86MCPlusBuilder.cpp
@@ -2230,7 +2230,7 @@ class X86MCPlusBuilder : public MCPlusBuilder {
     return true;
   }
 
-  bool createStackPointerIncrement(MCInst &Inst, int Size,
+  void createStackPointerIncrement(MCInst &Inst, int Size,
                                    bool NoFlagsClobber) const override {
     if (NoFlagsClobber) {
       Inst.setOpcode(X86::LEA64r);
@@ -2241,17 +2241,16 @@ class X86MCPlusBuilder : public MCPlusBuilder {
       Inst.addOperand(MCOperand::createReg(X86::NoRegister)); // IndexReg
       Inst.addOperand(MCOperand::createImm(-Size));           // Displacement
       Inst.addOperand(MCOperand::createReg(X86::NoRegister)); // AddrSegmentReg
-      return true;
+      return;
     }
     Inst.setOpcode(X86::SUB64ri8);
     Inst.clear();
     Inst.addOperand(MCOperand::createReg(X86::RSP));
     Inst.addOperand(MCOperand::createReg(X86::RSP));
     Inst.addOperand(MCOperand::createImm(Size));
-    return true;
   }
 
-  bool createStackPointerDecrement(MCInst &Inst, int Size,
+  void createStackPointerDecrement(MCInst &Inst, int Size,
                                    bool NoFlagsClobber) const override {
     if (NoFlagsClobber) {
       Inst.setOpcode(X86::LEA64r);
@@ -2262,22 +2261,22 @@ class X86MCPlusBuilder : public MCPlusBuilder {
       Inst.addOperand(MCOperand::createReg(X86::NoRegister)); // IndexReg
       Inst.addOperand(MCOperand::createImm(Size));            // Displacement
       Inst.addOperand(MCOperand::createReg(X86::NoRegister)); // AddrSegmentReg
-      return true;
+      return;
     }
     Inst.setOpcode(X86::ADD64ri8);
     Inst.clear();
     Inst.addOperand(MCOperand::createReg(X86::RSP));
     Inst.addOperand(MCOperand::createReg(X86::RSP));
     Inst.addOperand(MCOperand::createImm(Size));
-    return true;
   }
 
-  bool createSaveToStack(MCInst &Inst, const MCPhysReg &StackReg, int Offset,
+  void createSaveToStack(MCInst &Inst, const MCPhysReg &StackReg, int Offset,
                          const MCPhysReg &SrcReg, int Size) const override {
     unsigned NewOpcode;
     switch (Size) {
     default:
-      return false;
+      llvm_unreachable("Invalid operand size");
+      return;
     case 2:      NewOpcode = X86::MOV16mr; break;
     case 4:      NewOpcode = X86::MOV32mr; break;
     case 8:      NewOpcode = X86::MOV64mr; break;
@@ -2290,10 +2289,9 @@ class X86MCPlusBuilder : public MCPlusBuilder {
     Inst.addOperand(MCOperand::createImm(Offset));          // Displacement
     Inst.addOperand(MCOperand::createReg(X86::NoRegister)); // AddrSegmentReg
     Inst.addOperand(MCOperand::createReg(SrcReg));
-    return true;
   }
 
-  bool createRestoreFromStack(MCInst &Inst, const MCPhysReg &StackReg,
+  void createRestoreFromStack(MCInst &Inst, const MCPhysReg &StackReg,
                               int Offset, const MCPhysReg &DstReg,
                               int Size) const override {
     return createLoad(Inst, StackReg, /*Scale=*/1, /*IndexReg=*/X86::NoRegister,
@@ -2301,14 +2299,15 @@ class X86MCPlusBuilder : public MCPlusBuilder {
                       DstReg, Size);
   }
 
-  bool createLoad(MCInst &Inst, const MCPhysReg &BaseReg, int64_t Scale,
+  void createLoad(MCInst &Inst, const MCPhysReg &BaseReg, int64_t Scale,
                   const MCPhysReg &IndexReg, int64_t Offset,
                   const MCExpr *OffsetExpr, const MCPhysReg &AddrSegmentReg,
                   const MCPhysReg &DstReg, int Size) const override {
     unsigned NewOpcode;
     switch (Size) {
     default:
-      return false;
+      llvm_unreachable("Invalid operand size");
+      return;
     case 2:      NewOpcode = X86::MOV16rm; break;
     case 4:      NewOpcode = X86::MOV32rm; break;
     case 8:      NewOpcode = X86::MOV64rm; break;
@@ -2324,7 +2323,6 @@ class X86MCPlusBuilder : public MCPlusBuilder {
     else
       Inst.addOperand(MCOperand::createImm(Offset)); // Displacement
     Inst.addOperand(MCOperand::createReg(AddrSegmentReg)); // AddrSegmentReg
-    return true;
   }
 
   InstructionListType createLoadImmediate(const MCPhysReg Dest,
@@ -2338,7 +2336,7 @@ class X86MCPlusBuilder : public MCPlusBuilder {
     return Insts;
   }
 
-  bool createIJmp32Frag(SmallVectorImpl<MCInst> &Insts,
+  void createIJmp32Frag(SmallVectorImpl<MCInst> &Insts,
                         const MCOperand &BaseReg, const MCOperand &Scale,
                         const MCOperand &IndexReg, const MCOperand &Offset,
                         const MCOperand &TmpReg) const override {
@@ -2362,17 +2360,16 @@ class X86MCPlusBuilder : public MCPlusBuilder {
 
     Insts.push_back(Load);
     Insts.push_back(IJmp);
-    return true;
   }
 
-  bool createNoop(MCInst &Inst) const override {
+  void createNoop(MCInst &Inst) const override {
     Inst.setOpcode(X86::NOOP);
-    return true;
+    Inst.clear();
   }
 
-  bool createReturn(MCInst &Inst) const override {
+  void createReturn(MCInst &Inst) const override {
     Inst.setOpcode(X86::RET64);
-    return true;
+    Inst.clear();
   }
 
   InstructionListType createInlineMemcpy(bool ReturnEnd) const override {
@@ -2729,23 +2726,31 @@ class X86MCPlusBuilder : public MCPlusBuilder {
     return FoundOne;
   }
 
-  bool createUncondBranch(MCInst &Inst, const MCSymbol *TBB,
+  void createUncondBranch(MCInst &Inst, const MCSymbol *TBB,
                           MCContext *Ctx) const override {
     Inst.setOpcode(X86::JMP_1);
+    Inst.clear();
     Inst.addOperand(MCOperand::createExpr(
         MCSymbolRefExpr::create(TBB, MCSymbolRefExpr::VK_None, *Ctx)));
-    return true;
   }
 
-  bool createCall(MCInst &Inst, const MCSymbol *Target,
+  void createLongUncondBranch(MCInst &Inst, const MCSymbol *Target,
+                              MCContext *Ctx) const override {
+    Inst.setOpcode(X86::JMP_4);
+    Inst.clear();
+    Inst.addOperand(MCOperand::createExpr(
+        MCSymbolRefExpr::create(Target, MCSymbolRefExpr::VK_None, *Ctx)));
+  }
+
+  void createCall(MCInst &Inst, const MCSymbol *Target,
                   MCContext *Ctx) override {
     Inst.setOpcode(X86::CALL64pcrel32);
+    Inst.clear();
     Inst.addOperand(MCOperand::createExpr(
         MCSymbolRefExpr::create(Target, MCSymbolRefExpr::VK_None, *Ctx)));
-    return true;
   }
 
-  bool createTailCall(MCInst &Inst, const MCSymbol *Target,
+  void createTailCall(MCInst &Inst, const MCSymbol *Target,
                       MCContext *Ctx) override {
     return createDirectCall(Inst, Target, Ctx, /*IsTailCall*/ true);
   }
@@ -2757,10 +2762,18 @@ class X86MCPlusBuilder : public MCPlusBuilder {
     createDirectCall(Seq.back(), Target, Ctx, /*IsTailCall*/ true);
   }
 
-  bool createTrap(MCInst &Inst) const override {
+  void createTrap(MCInst &Inst) const override {
     Inst.clear();
     Inst.setOpcode(X86::TRAP);
-    return true;
+  }
+
+  void createCondBranch(MCInst &Inst, const MCSymbol *Target, unsigned CC,
+                        MCContext *Ctx) const override {
+    Inst.setOpcode(X86::JCC_1);
+    Inst.clear();
+    Inst.addOperand(MCOperand::createExpr(
+        MCSymbolRefExpr::create(Target, MCSymbolRefExpr::VK_None, *Ctx)));
+    Inst.addOperand(MCOperand::createImm(CC));
   }
 
   bool reverseBranchCondition(MCInst &Inst, const MCSymbol *TBB,
@@ -2862,7 +2875,7 @@ class X86MCPlusBuilder : public MCPlusBuilder {
     Inst.setOpcode(X86::LFENCE);
   }
 
-  bool createDirectCall(MCInst &Inst, const MCSymbol *Target, MCContext *Ctx,
+  void createDirectCall(MCInst &Inst, const MCSymbol *Target, MCContext *Ctx,
                         bool IsTailCall) override {
     Inst.clear();
     Inst.setOpcode(IsTailCall ? X86::JMP_4 : X86::CALL64pcrel32);
@@ -2870,7 +2883,6 @@ class X86MCPlusBuilder : public MCPlusBuilder {
         MCSymbolRefExpr::create(Target, MCSymbolRefExpr::VK_None, *Ctx)));
     if (IsTailCall)
       setTailCall(Inst);
-    return true;
   }
 
   void createShortJmp(InstructionListType &Seq, const MCSymbol *Target,
@@ -3066,6 +3078,7 @@ class X86MCPlusBuilder : public MCPlusBuilder {
   void createSwap(MCInst &Inst, MCPhysReg Source, MCPhysReg MemBaseReg,
                   int64_t Disp) const {
     Inst.setOpcode(X86::XCHG64rm);
+    Inst.clear();
     Inst.addOperand(MCOperand::createReg(Source));
     Inst.addOperand(MCOperand::createReg(Source));
     Inst.addOperand(MCOperand::createReg(MemBaseReg));      // BaseReg
@@ -3078,6 +3091,7 @@ class X86MCPlusBuilder : public MCPlusBuilder {
   void createIndirectBranch(MCInst &Inst, MCPhysReg MemBaseReg,
                             int64_t Disp) const {
     Inst.setOpcode(X86::JMP64m);
+    Inst.clear();
     Inst.addOperand(MCOperand::createReg(MemBaseReg));      // BaseReg
     Inst.addOperand(MCOperand::createImm(1));               // ScaleAmt
     Inst.addOperand(MCOperand::createReg(X86::NoRegister)); // IndexReg
@@ -3540,9 +3554,10 @@ class X86MCPlusBuilder : public MCPlusBuilder {
   }
 
 private:
-  bool createMove(MCInst &Inst, const MCSymbol *Src, unsigned Reg,
+  void createMove(MCInst &Inst, const MCSymbol *Src, unsigned Reg,
                   MCContext *Ctx) const {
     Inst.setOpcode(X86::MOV64rm);
+    Inst.clear();
     Inst.addOperand(MCOperand::createReg(Reg));
     Inst.addOperand(MCOperand::createReg(X86::RIP));        // BaseReg
     Inst.addOperand(MCOperand::createImm(1));               // ScaleAmt
@@ -3551,13 +3566,12 @@ class X86MCPlusBuilder : public MCPlusBuilder {
         MCSymbolRefExpr::create(Src, MCSymbolRefExpr::VK_None,
                                 *Ctx)));                    // Displacement
     Inst.addOperand(MCOperand::createReg(X86::NoRegister)); // AddrSegmentReg
-
-    return true;
   }
 
-  bool createLea(MCInst &Inst, const MCSymbol *Src, unsigned Reg,
+  void createLea(MCInst &Inst, const MCSymbol *Src, unsigned Reg,
                  MCContext *Ctx) const {
     Inst.setOpcode(X86::LEA64r);
+    Inst.clear();
     Inst.addOperand(MCOperand::createReg(Reg));
     Inst.addOperand(MCOperand::createReg(X86::RIP));        // BaseReg
     Inst.addOperand(MCOperand::createImm(1));               // ScaleAmt
@@ -3566,7 +3580,6 @@ class X86MCPlusBuilder : public MCPlusBuilder {
         MCSymbolRefExpr::create(Src, MCSymbolRefExpr::VK_None,
                                 *Ctx)));                    // Displacement
     Inst.addOperand(MCOperand::createReg(X86::NoRegister)); // AddrSegmentReg
-    return true;
   }
 };
 
diff --git a/bolt/test/X86/dwarf5-debug-names-dw-at-specification.s b/bolt/test/X86/dwarf5-debug-names-dw-at-specification.s
new file mode 100644
index 00000000000000..ec5b425625ce96
--- /dev/null
+++ b/bolt/test/X86/dwarf5-debug-names-dw-at-specification.s
@@ -0,0 +1,694 @@
+# RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %s   -o %tmain.o
+# RUN: %clang %cflags -gdwarf-5 %tmain.o -o %tmain.exe
+# RUN: llvm-bolt %tmain.exe -o %tmain.exe.bolt --update-debug-sections
+# RUN: llvm-dwarfdump --debug-info -r 0 --debug-names %tmain.exe.bolt > %tlog.txt
+# RUN: cat %tlog.txt | FileCheck -check-prefix=BOLT %s
+
+## Tests that BOLT correctly generates entries in .debug_names with DW_AT_specification.
+
+# BOLT: [[OFFSET1:0x[0-9a-f]*]]: Compile Unit
+# BOLT:       Name Index @ 0x0
+# BOLT-NEXT:    Header {
+# BOLT-NEXT:      Length: 0x10F
+# BOLT-NEXT:      Format: DWARF32
+# BOLT-NEXT:      Version: 5
+# BOLT-NEXT:      CU count: 1
+# BOLT-NEXT:      Local TU count: 0
+# BOLT-NEXT:      Foreign TU count: 0
+# BOLT-NEXT:      Bucket count: 9
+# BOLT-NEXT:      Name count: 9
+# BOLT-NEXT:      Abbreviations table size: 0x21
+# BOLT-NEXT:      Augmentation: 'BOLT'
+# BOLT-NEXT:    }
+# BOLT-NEXT:    Compilation Unit offsets [
+# BOLT-NEXT:      CU[0]: [[OFFSET1]]
+# BOLT-NEXT:    ]
+# BOLT-NEXT:    Abbreviations [
+# BOLT-NEXT:      Abbreviation [[ABBREV1:0x[0-9a-f]*]] {
+# BOLT-NEXT:        Tag: DW_TAG_variable
+# BOLT-NEXT:        DW_IDX_die_offset: DW_FORM_ref4
+# BOLT-NEXT:        DW_IDX_parent: DW_FORM_flag_present
+# BOLT-NEXT:      }
+# BOLT-NEXT:      Abbreviation [[ABBREV2:0x[0-9a-f]*]] {
+# BOLT-NEXT:        Tag: DW_TAG_structure_type
+# BOLT-NEXT:        DW_IDX_die_offset: DW_FORM_ref4
+# BOLT-NEXT:        DW_IDX_parent: DW_FORM_flag_present
+# BOLT-NEXT:      }
+# BOLT-NEXT:      Abbreviation [[ABBREV3:0x[0-9a-f]*]] {
+# BOLT-NEXT:        Tag: DW_TAG_base_type
+# BOLT-NEXT:        DW_IDX_die_offset: DW_FORM_ref4
+# BOLT-NEXT:        DW_IDX_parent: DW_FORM_flag_present
+# BOLT-NEXT:      }
+# BOLT-NEXT:      Abbreviation [[ABBREV4:0x[0-9a-f]*]] {
+# BOLT-NEXT:        Tag: DW_TAG_subprogram
+# BOLT-NEXT:        DW_IDX_die_offset: DW_FORM_ref4
+# BOLT-NEXT:        DW_IDX_parent: DW_FORM_flag_present
+# BOLT-NEXT:      }
+# BOLT-NEXT:    ]
+# BOLT-NEXT:    Bucket 0 [
+# BOLT-NEXT:      Name 1 {
+# BOLT-NEXT:        Hash: 0x5D3CA9E0
+# BOLT-NEXT:        String: {{.+}} "_ZN1A15fully_specifiedE"
+# BOLT-NEXT:        Entry @ {{.+}} {
+# BOLT-NEXT:          Abbrev: [[ABBREV1]]
+# BOLT-NEXT:          Tag: DW_TAG_variable
+# BOLT-NEXT:          DW_IDX_die_offset: 0x00000024
+# BOLT-NEXT:          DW_IDX_parent: <parent not indexed>
+# BOLT-NEXT:        }
+# BOLT-NEXT:      }
+# BOLT-NEXT:      Name 2 {
+# BOLT-NEXT:        Hash: 0x7C9DFC37
+# BOLT-NEXT:        String: {{.+}} "smem"
+# BOLT-NEXT:        Entry @ {{.+}} {
+# BOLT-NEXT:          Abbrev: [[ABBREV1]]
+# BOLT-NEXT:          Tag: DW_TAG_variable
+# BOLT-NEXT:          DW_IDX_die_offset: 0x00000057
+# BOLT-NEXT:          DW_IDX_parent: <parent not indexed>
+# BOLT-NEXT:        }
+# BOLT-NEXT:      }
+# BOLT-NEXT:    ]
+# BOLT-NEXT:    Bucket 1 [
+# BOLT-NEXT:      Name 3 {
+# BOLT-NEXT:        Hash: 0x2B606
+# BOLT-NEXT:        String: {{.+}} "A"
+# BOLT-NEXT:        Entry @ {{.+}} {
+# BOLT-NEXT:          Abbrev: [[ABBREV2]]
+# BOLT-NEXT:          Tag: DW_TAG_structure_type
+# BOLT-NEXT:          DW_IDX_die_offset: 0x0000002d
+# BOLT-NEXT:          DW_IDX_parent: <parent not indexed>
+# BOLT-NEXT:        }
+# BOLT-NEXT:      }
+# BOLT-NEXT:    ]
+# BOLT-NEXT:    Bucket 2 [
+# BOLT-NEXT:      Name 4 {
+# BOLT-NEXT:        Hash: 0xB888030
+# BOLT-NEXT:        String: {{.+}} "int"
+# BOLT-NEXT:        Entry @ {{.+}} {
+# BOLT-NEXT:          Abbrev: [[ABBREV3]]
+# BOLT-NEXT:          Tag: DW_TAG_base_type
+# BOLT-NEXT:          DW_IDX_die_offset: 0x00000044
+# BOLT-NEXT:          DW_IDX_parent: <parent not indexed>
+# BOLT-NEXT:        }
+# BOLT-NEXT:      }
+# BOLT-NEXT:    ]
+# BOLT-NEXT:    Bucket 3 [
+# BOLT-NEXT:      EMPTY
+# BOLT-NEXT:    ]
+# BOLT-NEXT:    Bucket 4 [
+# BOLT-NEXT:      EMPTY
+# BOLT-NEXT:    ]
+# BOLT-NEXT:    Bucket 5 [
+# BOLT-NEXT:      EMPTY
+# BOLT-NEXT:    ]
+# BOLT-NEXT:    Bucket 6 [
+# BOLT-NEXT:      EMPTY
+# BOLT-NEXT:    ]
+# BOLT-NEXT:    Bucket 7 [
+# BOLT-NEXT:      Name 5 {
+# BOLT-NEXT:        Hash: 0x65788E1C
+# BOLT-NEXT:        String: {{.+}} "fully_specified"
+# BOLT-NEXT:        Entry @ {{.+}} {
+# BOLT-NEXT:          Abbrev: [[ABBREV1]]
+# BOLT-NEXT:          Tag: DW_TAG_variable
+# BOLT-NEXT:          DW_IDX_die_offset: 0x00000024
+# BOLT-NEXT:          DW_IDX_parent: <parent not indexed>
+# BOLT-NEXT:        }
+# BOLT-NEXT:      }
+# BOLT-NEXT:      Name 6 {
+# BOLT-NEXT:        Hash: 0x7C9A7F6A
+# BOLT-NEXT:        String: {{.+}} "main"
+# BOLT-NEXT:        Entry @ {{.+}} {
+# BOLT-NEXT:          Abbrev: [[ABBREV4]]
+# BOLT-NEXT:          Tag: DW_TAG_subprogram
+# BOLT-NEXT:          DW_IDX_die_offset: 0x00000070
+# BOLT-NEXT:          DW_IDX_parent: <parent not indexed>
+# BOLT-NEXT:        }
+# BOLT-NEXT:      }
+# BOLT-NEXT:    ]
+# BOLT-NEXT:    Bucket 8 [
+# BOLT-NEXT:      Name 7 {
+# BOLT-NEXT:        Hash: 0xCEF4CFB
+# BOLT-NEXT:        String: {{.+}} "__ARRAY_SIZE_TYPE__"
+# BOLT-NEXT:        Entry @ {{.+}} {
+# BOLT-NEXT:          Abbrev: [[ABBREV3]]
+# BOLT-NEXT:          Tag: DW_TAG_base_type
+# BOLT-NEXT:          DW_IDX_die_offset: 0x00000053
+# BOLT-NEXT:          DW_IDX_parent: <parent not indexed>
+# BOLT-NEXT:        }
+# BOLT-NEXT:      }
+# BOLT-NEXT:      Name 8 {
+# BOLT-NEXT:        Hash: 0x48684B69
+# BOLT-NEXT:        String: {{.+}} "_ZN1A4smemE"
+# BOLT-NEXT:        Entry @ {{.+}} {
+# BOLT-NEXT:          Abbrev: [[ABBREV1]]
+# BOLT-NEXT:          Tag: DW_TAG_variable
+# BOLT-NEXT:          DW_IDX_die_offset: 0x00000057
+# BOLT-NEXT:          DW_IDX_parent: <parent not indexed>
+# BOLT-NEXT:        }
+# BOLT-NEXT:      }
+# BOLT-NEXT:      Name 9 {
+# BOLT-NEXT:        Hash: 0x7C952063
+# BOLT-NEXT:        String: {{.+}} "char"
+# BOLT-NEXT:        Entry @ {{.+}} {
+# BOLT-NEXT:          Abbrev: [[ABBREV3]]
+# BOLT-NEXT:          Tag: DW_TAG_base_type
+# BOLT-NEXT:          DW_IDX_die_offset: 0x0000009e
+# BOLT-NEXT:          DW_IDX_parent: <parent not indexed>
+# BOLT-NEXT:        }
+# BOLT-NEXT:      }
+# BOLT-NEXT:    ]
+# BOLT-NEXT:  }
+
+# clang++ main.cpp -O2 -g2 -gdwarf-5 -gpubnames -S
+# struct A {
+#   static int fully_specified;
+#   static int smem[];
+# };
+#
+# int A::fully_specified;
+# int A::smem[] = { 0, 1, 2, 3 };
+# int main(int argc, char *argv[]) {
+#   return 0;
+# }
+	.text
+	.file	"main.cpp"
+	.file	0 "/specification" "main.cpp" md5 0x6c1b1c014d300f2e0efd26584acae1a9
+	.globl	main                            # -- Begin function main
+	.p2align	4, 0x90
+	.type	main,@function
+main:                                   # @main
+.Lfunc_begin0:
+	.cfi_startproc
+# %bb.0:                                # %entry
+	#DEBUG_VALUE: main:argc <- $edi
+	#DEBUG_VALUE: main:argv <- $rsi
+	.loc	0 9 3 prologue_end              # main.cpp:9:3
+	xorl	%eax, %eax
+	retq
+.Ltmp0:
+.Lfunc_end0:
+	.size	main, .Lfunc_end0-main
+	.cfi_endproc
+                                        # -- End function
+	.type	_ZN1A15fully_specifiedE,@object # @_ZN1A15fully_specifiedE
+	.bss
+	.globl	_ZN1A15fully_specifiedE
+	.p2align	2, 0x0
+_ZN1A15fully_specifiedE:
+	.long	0                               # 0x0
+	.size	_ZN1A15fully_specifiedE, 4
+
+	.type	_ZN1A4smemE,@object             # @_ZN1A4smemE
+	.data
+	.globl	_ZN1A4smemE
+	.p2align	4, 0x0
+_ZN1A4smemE:
+	.long	0                               # 0x0
+	.long	1                               # 0x1
+	.long	2                               # 0x2
+	.long	3                               # 0x3
+	.size	_ZN1A4smemE, 16
+
+	.section	.debug_abbrev,"",@progbits
+	.byte	1                               # Abbreviation Code
+	.byte	17                              # DW_TAG_compile_unit
+	.byte	1                               # DW_CHILDREN_yes
+	.byte	37                              # DW_AT_producer
+	.byte	37                              # DW_FORM_strx1
+	.byte	19                              # DW_AT_language
+	.byte	5                               # DW_FORM_data2
+	.byte	3                               # DW_AT_name
+	.byte	37                              # DW_FORM_strx1
+	.byte	114                             # DW_AT_str_offsets_base
+	.byte	23                              # DW_FORM_sec_offset
+	.byte	16                              # DW_AT_stmt_list
+	.byte	23                              # DW_FORM_sec_offset
+	.byte	27                              # DW_AT_comp_dir
+	.byte	37                              # DW_FORM_strx1
+	.byte	17                              # DW_AT_low_pc
+	.byte	27                              # DW_FORM_addrx
+	.byte	18                              # DW_AT_high_pc
+	.byte	6                               # DW_FORM_data4
+	.byte	115                             # DW_AT_addr_base
+	.byte	23                              # DW_FORM_sec_offset
+	.byte	0                               # EOM(1)
+	.byte	0                               # EOM(2)
+	.byte	2                               # Abbreviation Code
+	.byte	52                              # DW_TAG_variable
+	.byte	0                               # DW_CHILDREN_no
+	.byte	71                              # DW_AT_specification
+	.byte	19                              # DW_FORM_ref4
+	.byte	2                               # DW_AT_location
+	.byte	24                              # DW_FORM_exprloc
+	.byte	110                             # DW_AT_linkage_name
+	.byte	37                              # DW_FORM_strx1
+	.byte	0                               # EOM(1)
+	.byte	0                               # EOM(2)
+	.byte	3                               # Abbreviation Code
+	.byte	19                              # DW_TAG_structure_type
+	.byte	1                               # DW_CHILDREN_yes
+	.byte	54                              # DW_AT_calling_convention
+	.byte	11                              # DW_FORM_data1
+	.byte	3                               # DW_AT_name
+	.byte	37                              # DW_FORM_strx1
+	.byte	11                              # DW_AT_byte_size
+	.byte	11                              # DW_FORM_data1
+	.byte	58                              # DW_AT_decl_file
+	.byte	11                              # DW_FORM_data1
+	.byte	59                              # DW_AT_decl_line
+	.byte	11                              # DW_FORM_data1
+	.byte	0                               # EOM(1)
+	.byte	0                               # EOM(2)
+	.byte	4                               # Abbreviation Code
+	.byte	52                              # DW_TAG_variable
+	.byte	0                               # DW_CHILDREN_no
+	.byte	3                               # DW_AT_name
+	.byte	37                              # DW_FORM_strx1
+	.byte	73                              # DW_AT_type
+	.byte	19                              # DW_FORM_ref4
+	.byte	58                              # DW_AT_decl_file
+	.byte	11                              # DW_FORM_data1
+	.byte	59                              # DW_AT_decl_line
+	.byte	11                              # DW_FORM_data1
+	.byte	63                              # DW_AT_external
+	.byte	25                              # DW_FORM_flag_present
+	.byte	60                              # DW_AT_declaration
+	.byte	25                              # DW_FORM_flag_present
+	.byte	0                               # EOM(1)
+	.byte	0                               # EOM(2)
+	.byte	5                               # Abbreviation Code
+	.byte	36                              # DW_TAG_base_type
+	.byte	0                               # DW_CHILDREN_no
+	.byte	3                               # DW_AT_name
+	.byte	37                              # DW_FORM_strx1
+	.byte	62                              # DW_AT_encoding
+	.byte	11                              # DW_FORM_data1
+	.byte	11                              # DW_AT_byte_size
+	.byte	11                              # DW_FORM_data1
+	.byte	0                               # EOM(1)
+	.byte	0                               # EOM(2)
+	.byte	6                               # Abbreviation Code
+	.byte	1                               # DW_TAG_array_type
+	.byte	1                               # DW_CHILDREN_yes
+	.byte	73                              # DW_AT_type
+	.byte	19                              # DW_FORM_ref4
+	.byte	0                               # EOM(1)
+	.byte	0                               # EOM(2)
+	.byte	7                               # Abbreviation Code
+	.byte	33                              # DW_TAG_subrange_type
+	.byte	0                               # DW_CHILDREN_no
+	.byte	73                              # DW_AT_type
+	.byte	19                              # DW_FORM_ref4
+	.byte	0                               # EOM(1)
+	.byte	0                               # EOM(2)
+	.byte	8                               # Abbreviation Code
+	.byte	36                              # DW_TAG_base_type
+	.byte	0                               # DW_CHILDREN_no
+	.byte	3                               # DW_AT_name
+	.byte	37                              # DW_FORM_strx1
+	.byte	11                              # DW_AT_byte_size
+	.byte	11                              # DW_FORM_data1
+	.byte	62                              # DW_AT_encoding
+	.byte	11                              # DW_FORM_data1
+	.byte	0                               # EOM(1)
+	.byte	0                               # EOM(2)
+	.byte	9                               # Abbreviation Code
+	.byte	52                              # DW_TAG_variable
+	.byte	0                               # DW_CHILDREN_no
+	.byte	71                              # DW_AT_specification
+	.byte	19                              # DW_FORM_ref4
+	.byte	73                              # DW_AT_type
+	.byte	19                              # DW_FORM_ref4
+	.byte	2                               # DW_AT_location
+	.byte	24                              # DW_FORM_exprloc
+	.byte	110                             # DW_AT_linkage_name
+	.byte	37                              # DW_FORM_strx1
+	.byte	0                               # EOM(1)
+	.byte	0                               # EOM(2)
+	.byte	10                              # Abbreviation Code
+	.byte	33                              # DW_TAG_subrange_type
+	.byte	0                               # DW_CHILDREN_no
+	.byte	73                              # DW_AT_type
+	.byte	19                              # DW_FORM_ref4
+	.byte	55                              # DW_AT_count
+	.byte	11                              # DW_FORM_data1
+	.byte	0                               # EOM(1)
+	.byte	0                               # EOM(2)
+	.byte	11                              # Abbreviation Code
+	.byte	46                              # DW_TAG_subprogram
+	.byte	1                               # DW_CHILDREN_yes
+	.byte	17                              # DW_AT_low_pc
+	.byte	27                              # DW_FORM_addrx
+	.byte	18                              # DW_AT_high_pc
+	.byte	6                               # DW_FORM_data4
+	.byte	64                              # DW_AT_frame_base
+	.byte	24                              # DW_FORM_exprloc
+	.byte	122                             # DW_AT_call_all_calls
+	.byte	25                              # DW_FORM_flag_present
+	.byte	3                               # DW_AT_name
+	.byte	37                              # DW_FORM_strx1
+	.byte	58                              # DW_AT_decl_file
+	.byte	11                              # DW_FORM_data1
+	.byte	59                              # DW_AT_decl_line
+	.byte	11                              # DW_FORM_data1
+	.byte	73                              # DW_AT_type
+	.byte	19                              # DW_FORM_ref4
+	.byte	63                              # DW_AT_external
+	.byte	25                              # DW_FORM_flag_present
+	.byte	0                               # EOM(1)
+	.byte	0                               # EOM(2)
+	.byte	12                              # Abbreviation Code
+	.byte	5                               # DW_TAG_formal_parameter
+	.byte	0                               # DW_CHILDREN_no
+	.byte	2                               # DW_AT_location
+	.byte	24                              # DW_FORM_exprloc
+	.byte	3                               # DW_AT_name
+	.byte	37                              # DW_FORM_strx1
+	.byte	58                              # DW_AT_decl_file
+	.byte	11                              # DW_FORM_data1
+	.byte	59                              # DW_AT_decl_line
+	.byte	11                              # DW_FORM_data1
+	.byte	73                              # DW_AT_type
+	.byte	19                              # DW_FORM_ref4
+	.byte	0                               # EOM(1)
+	.byte	0                               # EOM(2)
+	.byte	13                              # Abbreviation Code
+	.byte	15                              # DW_TAG_pointer_type
+	.byte	0                               # DW_CHILDREN_no
+	.byte	73                              # DW_AT_type
+	.byte	19                              # DW_FORM_ref4
+	.byte	0                               # EOM(1)
+	.byte	0                               # EOM(2)
+	.byte	0                               # EOM(3)
+	.section	.debug_info,"",@progbits
+.Lcu_begin0:
+	.long	.Ldebug_info_end0-.Ldebug_info_start0 # Length of Unit
+.Ldebug_info_start0:
+	.short	5                               # DWARF version number
+	.byte	1                               # DWARF Unit Type
+	.byte	8                               # Address Size (in bytes)
+	.long	.debug_abbrev                   # Offset Into Abbrev. Section
+	.byte	1                               # Abbrev [1] 0xc:0x96 DW_TAG_compile_unit
+	.byte	0                               # DW_AT_producer
+	.short	33                              # DW_AT_language
+	.byte	1                               # DW_AT_name
+	.long	.Lstr_offsets_base0             # DW_AT_str_offsets_base
+	.long	.Lline_table_start0             # DW_AT_stmt_list
+	.byte	2                               # DW_AT_comp_dir
+	.byte	2                               # DW_AT_low_pc
+	.long	.Lfunc_end0-.Lfunc_begin0       # DW_AT_high_pc
+	.long	.Laddr_table_base0              # DW_AT_addr_base
+	.byte	2                               # Abbrev [2] 0x23:0x9 DW_TAG_variable
+	.long	50                              # DW_AT_specification
+	.byte	2                               # DW_AT_location
+	.byte	161
+	.byte	0
+	.byte	8                               # DW_AT_linkage_name
+	.byte	3                               # Abbrev [3] 0x2c:0x17 DW_TAG_structure_type
+	.byte	5                               # DW_AT_calling_convention
+	.byte	7                               # DW_AT_name
+	.byte	1                               # DW_AT_byte_size
+	.byte	0                               # DW_AT_decl_file
+	.byte	1                               # DW_AT_decl_line
+	.byte	4                               # Abbrev [4] 0x32:0x8 DW_TAG_variable
+	.byte	3                               # DW_AT_name
+	.long	67                              # DW_AT_type
+	.byte	0                               # DW_AT_decl_file
+	.byte	2                               # DW_AT_decl_line
+                                        # DW_AT_external
+                                        # DW_AT_declaration
+	.byte	4                               # Abbrev [4] 0x3a:0x8 DW_TAG_variable
+	.byte	5                               # DW_AT_name
+	.long	71                              # DW_AT_type
+	.byte	0                               # DW_AT_decl_file
+	.byte	3                               # DW_AT_decl_line
+                                        # DW_AT_external
+                                        # DW_AT_declaration
+	.byte	0                               # End Of Children Mark
+	.byte	5                               # Abbrev [5] 0x43:0x4 DW_TAG_base_type
+	.byte	4                               # DW_AT_name
+	.byte	5                               # DW_AT_encoding
+	.byte	4                               # DW_AT_byte_size
+	.byte	6                               # Abbrev [6] 0x47:0xb DW_TAG_array_type
+	.long	67                              # DW_AT_type
+	.byte	7                               # Abbrev [7] 0x4c:0x5 DW_TAG_subrange_type
+	.long	82                              # DW_AT_type
+	.byte	0                               # End Of Children Mark
+	.byte	8                               # Abbrev [8] 0x52:0x4 DW_TAG_base_type
+	.byte	6                               # DW_AT_name
+	.byte	8                               # DW_AT_byte_size
+	.byte	7                               # DW_AT_encoding
+	.byte	9                               # Abbrev [9] 0x56:0xd DW_TAG_variable
+	.long	58                              # DW_AT_specification
+	.long	99                              # DW_AT_type
+	.byte	2                               # DW_AT_location
+	.byte	161
+	.byte	1
+	.byte	9                               # DW_AT_linkage_name
+	.byte	6                               # Abbrev [6] 0x63:0xc DW_TAG_array_type
+	.long	67                              # DW_AT_type
+	.byte	10                              # Abbrev [10] 0x68:0x6 DW_TAG_subrange_type
+	.long	82                              # DW_AT_type
+	.byte	4                               # DW_AT_count
+	.byte	0                               # End Of Children Mark
+	.byte	11                              # Abbrev [11] 0x6f:0x24 DW_TAG_subprogram
+	.byte	2                               # DW_AT_low_pc
+	.long	.Lfunc_end0-.Lfunc_begin0       # DW_AT_high_pc
+	.byte	1                               # DW_AT_frame_base
+	.byte	87
+                                        # DW_AT_call_all_calls
+	.byte	10                              # DW_AT_name
+	.byte	0                               # DW_AT_decl_file
+	.byte	8                               # DW_AT_decl_line
+	.long	67                              # DW_AT_type
+                                        # DW_AT_external
+	.byte	12                              # Abbrev [12] 0x7e:0xa DW_TAG_formal_parameter
+	.byte	1                               # DW_AT_location
+	.byte	85
+	.byte	11                              # DW_AT_name
+	.byte	0                               # DW_AT_decl_file
+	.byte	8                               # DW_AT_decl_line
+	.long	67                              # DW_AT_type
+	.byte	12                              # Abbrev [12] 0x88:0xa DW_TAG_formal_parameter
+	.byte	1                               # DW_AT_location
+	.byte	84
+	.byte	12                              # DW_AT_name
+	.byte	0                               # DW_AT_decl_file
+	.byte	8                               # DW_AT_decl_line
+	.long	147                             # DW_AT_type
+	.byte	0                               # End Of Children Mark
+	.byte	13                              # Abbrev [13] 0x93:0x5 DW_TAG_pointer_type
+	.long	152                             # DW_AT_type
+	.byte	13                              # Abbrev [13] 0x98:0x5 DW_TAG_pointer_type
+	.long	157                             # DW_AT_type
+	.byte	5                               # Abbrev [5] 0x9d:0x4 DW_TAG_base_type
+	.byte	13                              # DW_AT_name
+	.byte	6                               # DW_AT_encoding
+	.byte	1                               # DW_AT_byte_size
+	.byte	0                               # End Of Children Mark
+.Ldebug_info_end0:
+	.section	.debug_str_offsets,"",@progbits
+	.long	60                              # Length of String Offsets Set
+	.short	5
+	.short	0
+.Lstr_offsets_base0:
+	.section	.debug_str,"MS",@progbits,1
+.Linfo_string0:
+	.asciz	"clang version 19.0.0git (git@github.com:llvm/llvm-project.git ced1fac8a32e35b63733bda27c7f5b9a2b635403)" # string offset=0
+.Linfo_string1:
+	.asciz	"main.cpp"                      # string offset=104
+.Linfo_string2:
+	.asciz	"/specification" # string offset=113
+.Linfo_string3:
+	.asciz	"A"                             # string offset=165
+.Linfo_string4:
+	.asciz	"fully_specified"               # string offset=167
+.Linfo_string5:
+	.asciz	"int"                           # string offset=183
+.Linfo_string6:
+	.asciz	"smem"                          # string offset=187
+.Linfo_string7:
+	.asciz	"__ARRAY_SIZE_TYPE__"           # string offset=192
+.Linfo_string8:
+	.asciz	"_ZN1A15fully_specifiedE"       # string offset=212
+.Linfo_string9:
+	.asciz	"_ZN1A4smemE"                   # string offset=236
+.Linfo_string10:
+	.asciz	"main"                          # string offset=248
+.Linfo_string11:
+	.asciz	"argc"                          # string offset=253
+.Linfo_string12:
+	.asciz	"argv"                          # string offset=258
+.Linfo_string13:
+	.asciz	"char"                          # string offset=263
+	.section	.debug_str_offsets,"",@progbits
+	.long	.Linfo_string0
+	.long	.Linfo_string1
+	.long	.Linfo_string2
+	.long	.Linfo_string4
+	.long	.Linfo_string5
+	.long	.Linfo_string6
+	.long	.Linfo_string7
+	.long	.Linfo_string3
+	.long	.Linfo_string8
+	.long	.Linfo_string9
+	.long	.Linfo_string10
+	.long	.Linfo_string11
+	.long	.Linfo_string12
+	.long	.Linfo_string13
+	.section	.debug_addr,"",@progbits
+	.long	.Ldebug_addr_end0-.Ldebug_addr_start0 # Length of contribution
+.Ldebug_addr_start0:
+	.short	5                               # DWARF version number
+	.byte	8                               # Address size
+	.byte	0                               # Segment selector size
+.Laddr_table_base0:
+	.quad	_ZN1A15fully_specifiedE
+	.quad	_ZN1A4smemE
+	.quad	.Lfunc_begin0
+.Ldebug_addr_end0:
+	.section	.debug_names,"",@progbits
+	.long	.Lnames_end0-.Lnames_start0     # Header: unit length
+.Lnames_start0:
+	.short	5                               # Header: version
+	.short	0                               # Header: padding
+	.long	1                               # Header: compilation unit count
+	.long	0                               # Header: local type unit count
+	.long	0                               # Header: foreign type unit count
+	.long	9                               # Header: bucket count
+	.long	9                               # Header: name count
+	.long	.Lnames_abbrev_end0-.Lnames_abbrev_start0 # Header: abbreviation table size
+	.long	8                               # Header: augmentation string size
+	.ascii	"LLVM0700"                      # Header: augmentation string
+	.long	.Lcu_begin0                     # Compilation unit 0
+	.long	1                               # Bucket 0
+	.long	3                               # Bucket 1
+	.long	4                               # Bucket 2
+	.long	0                               # Bucket 3
+	.long	0                               # Bucket 4
+	.long	0                               # Bucket 5
+	.long	0                               # Bucket 6
+	.long	5                               # Bucket 7
+	.long	7                               # Bucket 8
+	.long	1564256736                      # Hash in Bucket 0
+	.long	2090728503                      # Hash in Bucket 0
+	.long	177670                          # Hash in Bucket 1
+	.long	193495088                       # Hash in Bucket 2
+	.long	1702399516                      # Hash in Bucket 7
+	.long	2090499946                      # Hash in Bucket 7
+	.long	217009403                       # Hash in Bucket 8
+	.long	1214794601                      # Hash in Bucket 8
+	.long	2090147939                      # Hash in Bucket 8
+	.long	.Linfo_string8                  # String in Bucket 0: _ZN1A15fully_specifiedE
+	.long	.Linfo_string6                  # String in Bucket 0: smem
+	.long	.Linfo_string3                  # String in Bucket 1: A
+	.long	.Linfo_string5                  # String in Bucket 2: int
+	.long	.Linfo_string4                  # String in Bucket 7: fully_specified
+	.long	.Linfo_string10                 # String in Bucket 7: main
+	.long	.Linfo_string7                  # String in Bucket 8: __ARRAY_SIZE_TYPE__
+	.long	.Linfo_string9                  # String in Bucket 8: _ZN1A4smemE
+	.long	.Linfo_string13                 # String in Bucket 8: char
+	.long	.Lnames4-.Lnames_entries0       # Offset in Bucket 0
+	.long	.Lnames5-.Lnames_entries0       # Offset in Bucket 0
+	.long	.Lnames0-.Lnames_entries0       # Offset in Bucket 1
+	.long	.Lnames1-.Lnames_entries0       # Offset in Bucket 2
+	.long	.Lnames3-.Lnames_entries0       # Offset in Bucket 7
+	.long	.Lnames7-.Lnames_entries0       # Offset in Bucket 7
+	.long	.Lnames2-.Lnames_entries0       # Offset in Bucket 8
+	.long	.Lnames6-.Lnames_entries0       # Offset in Bucket 8
+	.long	.Lnames8-.Lnames_entries0       # Offset in Bucket 8
+.Lnames_abbrev_start0:
+	.byte	1                               # Abbrev code
+	.byte	52                              # DW_TAG_variable
+	.byte	3                               # DW_IDX_die_offset
+	.byte	19                              # DW_FORM_ref4
+	.byte	4                               # DW_IDX_parent
+	.byte	25                              # DW_FORM_flag_present
+	.byte	0                               # End of abbrev
+	.byte	0                               # End of abbrev
+	.byte	2                               # Abbrev code
+	.byte	19                              # DW_TAG_structure_type
+	.byte	3                               # DW_IDX_die_offset
+	.byte	19                              # DW_FORM_ref4
+	.byte	4                               # DW_IDX_parent
+	.byte	25                              # DW_FORM_flag_present
+	.byte	0                               # End of abbrev
+	.byte	0                               # End of abbrev
+	.byte	3                               # Abbrev code
+	.byte	36                              # DW_TAG_base_type
+	.byte	3                               # DW_IDX_die_offset
+	.byte	19                              # DW_FORM_ref4
+	.byte	4                               # DW_IDX_parent
+	.byte	25                              # DW_FORM_flag_present
+	.byte	0                               # End of abbrev
+	.byte	0                               # End of abbrev
+	.byte	4                               # Abbrev code
+	.byte	46                              # DW_TAG_subprogram
+	.byte	3                               # DW_IDX_die_offset
+	.byte	19                              # DW_FORM_ref4
+	.byte	4                               # DW_IDX_parent
+	.byte	25                              # DW_FORM_flag_present
+	.byte	0                               # End of abbrev
+	.byte	0                               # End of abbrev
+	.byte	0                               # End of abbrev list
+.Lnames_abbrev_end0:
+.Lnames_entries0:
+.Lnames4:
+.L3:
+	.byte	1                               # Abbreviation code
+	.long	35                              # DW_IDX_die_offset
+	.byte	0                               # DW_IDX_parent
+                                        # End of list: _ZN1A15fully_specifiedE
+.Lnames5:
+.L4:
+	.byte	1                               # Abbreviation code
+	.long	86                              # DW_IDX_die_offset
+	.byte	0                               # DW_IDX_parent
+                                        # End of list: smem
+.Lnames0:
+.L6:
+	.byte	2                               # Abbreviation code
+	.long	44                              # DW_IDX_die_offset
+	.byte	0                               # DW_IDX_parent
+                                        # End of list: A
+.Lnames1:
+.L5:
+	.byte	3                               # Abbreviation code
+	.long	67                              # DW_IDX_die_offset
+	.byte	0                               # DW_IDX_parent
+                                        # End of list: int
+.Lnames3:
+	.byte	1                               # Abbreviation code
+	.long	35                              # DW_IDX_die_offset
+	.byte	0                               # DW_IDX_parent
+                                        # End of list: fully_specified
+.Lnames7:
+.L0:
+	.byte	4                               # Abbreviation code
+	.long	111                             # DW_IDX_die_offset
+	.byte	0                               # DW_IDX_parent
+                                        # End of list: main
+.Lnames2:
+.L2:
+	.byte	3                               # Abbreviation code
+	.long	82                              # DW_IDX_die_offset
+	.byte	0                               # DW_IDX_parent
+                                        # End of list: __ARRAY_SIZE_TYPE__
+.Lnames6:
+	.byte	1                               # Abbreviation code
+	.long	86                              # DW_IDX_die_offset
+	.byte	0                               # DW_IDX_parent
+                                        # End of list: _ZN1A4smemE
+.Lnames8:
+.L1:
+	.byte	3                               # Abbreviation code
+	.long	157                             # DW_IDX_die_offset
+	.byte	0                               # DW_IDX_parent
+                                        # End of list: char
+	.p2align	2, 0x0
+.Lnames_end0:
+	.ident	"clang version 19.0.0git (git@github.com:llvm/llvm-project.git ced1fac8a32e35b63733bda27c7f5b9a2b635403)"
+	.section	".note.GNU-stack","",@progbits
+	.addrsig
+	.section	.debug_line,"",@progbits
+.Lline_table_start0:
diff --git a/bolt/test/X86/dwarf5-debug-names-generate-debug-names.test b/bolt/test/X86/dwarf5-debug-names-generate-debug-names.test
index b8789a6ad2300a..b7d8fda63315fa 100644
--- a/bolt/test/X86/dwarf5-debug-names-generate-debug-names.test
+++ b/bolt/test/X86/dwarf5-debug-names-generate-debug-names.test
@@ -14,15 +14,15 @@
 ; BOLT: [[OFFSET2:0x[0-9a-f]*]]: Compile Unit
 ; BOLT:       Name Index @ 0x0 {
 ; BOLT-NEXT:   Header {
-; BOLT-NEXT:     Length: 0x103
+; BOLT-NEXT:     Length: 0x155
 ; BOLT-NEXT:     Format: DWARF32
 ; BOLT-NEXT:     Version: 5
 ; BOLT-NEXT:     CU count: 2
 ; BOLT-NEXT:     Local TU count: 0
 ; BOLT-NEXT:     Foreign TU count: 0
-; BOLT-NEXT:     Bucket count: 8
-; BOLT-NEXT:     Name count: 8
-; BOLT-NEXT:     Abbreviations table size: 0x19
+; BOLT-NEXT:     Bucket count: 10
+; BOLT-NEXT:     Name count: 10
+; BOLT-NEXT:     Abbreviations table size: 0x29
 ; BOLT-NEXT:     Augmentation: 'BOLT'
 ; BOLT-NEXT:   }
 ; BOLT-NEXT:   Compilation Unit offsets [
@@ -31,124 +31,183 @@
 ; BOLT-NEXT:   ]
 ; BOLT-NEXT:   Abbreviations [
 ; BOLT-NEXT:     Abbreviation [[ABBREV1:0x[0-9a-f]*]] {
-; BOLT-NEXT:       Tag: DW_TAG_base_type
+; BOLT-NEXT:       Tag: DW_TAG_subprogram
 ; BOLT-NEXT:       DW_IDX_compile_unit: DW_FORM_data1
 ; BOLT-NEXT:       DW_IDX_die_offset: DW_FORM_ref4
+; BOLT-NEXT:       DW_IDX_parent: DW_FORM_flag_present
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:     Abbreviation [[ABBREV2:0x[0-9a-f]*]] {
-; BOLT-NEXT:       Tag: DW_TAG_subprogram
-; BOLT-NEXT:       DW_IDX_compile_unit: DW_FORM_data1
-; BOLT-NEXT:       DW_IDX_die_offset: DW_FORM_ref4
+; BOLT-NEXT:          Tag: DW_TAG_inlined_subroutine
+; BOLT-NEXT:          DW_IDX_compile_unit: DW_FORM_data1
+; BOLT-NEXT:          DW_IDX_die_offset: DW_FORM_ref4
+; BOLT-NEXT:          DW_IDX_parent: DW_FORM_ref4
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:     Abbreviation [[ABBREV3:0x[0-9a-f]*]] {
 ; BOLT-NEXT:       Tag: DW_TAG_variable
 ; BOLT-NEXT:       DW_IDX_compile_unit: DW_FORM_data1
 ; BOLT-NEXT:       DW_IDX_die_offset: DW_FORM_ref4
+; BOLT-NEXT:       DW_IDX_parent: DW_FORM_flag_present
+; BOLT-NEXT:     }
+; BOLT-NEXT:     Abbreviation [[ABBREV4:0x[0-9a-f]*]] {
+; BOLT-NEXT:       Tag: DW_TAG_base_type
+; BOLT-NEXT:       DW_IDX_compile_unit: DW_FORM_data1
+; BOLT-NEXT:       DW_IDX_die_offset: DW_FORM_ref4
+; BOLT-NEXT:       DW_IDX_parent: DW_FORM_flag_present
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:   ]
 ; BOLT-NEXT:   Bucket 0 [
-; BOLT-NEXT:     Name 1 {
-; BOLT-NEXT:       Hash: 0xB888030
-; BOLT-NEXT:       String: {{.+}} "int"
-; BOLT-NEXT:       Entry @ {{.+}} {
-; BOLT-NEXT:         Abbrev: [[ABBREV1]]
-; BOLT-NEXT:         Tag: DW_TAG_base_type
-; BOLT-NEXT:         DW_IDX_compile_unit: 0x01
-; BOLT-NEXT:         DW_IDX_die_offset: 0x00000033
+; BOLT-NEXT:       Name 1 {
+; BOLT-NEXT:         Hash: 0x2124BB36
+; BOLT-NEXT:         String: {{.+}} "useFoo"
+; BOLT-NEXT:         Entry @ {{.+}} {
+; BOLT-NEXT:           Abbrev: [[ABBREV1]]
+; BOLT-NEXT:           Tag: DW_TAG_subprogram
+; BOLT-NEXT:           DW_IDX_compile_unit: 0x01
+; BOLT-NEXT:           DW_IDX_die_offset: 0x00000037
+; BOLT-NEXT:           DW_IDX_parent: <parent not indexed>
+; BOLT-NEXT:         }
+; BOLT-NEXT:         Entry @ {{.+}} {
+; BOLT-NEXT:           Abbrev: [[ABBREV2]]
+; BOLT-NEXT:           Tag: DW_TAG_inlined_subroutine
+; BOLT-NEXT:           DW_IDX_compile_unit: 0x01
+; BOLT-NEXT:           DW_IDX_die_offset: 0x00000081
+; BOLT-NEXT:           DW_IDX_parent: Entry @ 0x14b
+; BOLT-NEXT:         }
 ; BOLT-NEXT:       }
-; BOLT-NEXT:       Entry @ {{.+}} {
-; BOLT-NEXT:         Abbrev: [[ABBREV1]]
-; BOLT-NEXT:         Tag: DW_TAG_base_type
-; BOLT-NEXT:         DW_IDX_compile_unit: 0x00
-; BOLT-NEXT:         DW_IDX_die_offset: 0x0000007f
+; BOLT-NEXT:     ]
+; BOLT-NEXT:     Bucket 1 [
+; BOLT-NEXT:       EMPTY
+; BOLT-NEXT:     ]
+; BOLT-NEXT:     Bucket 2 [
+; BOLT-NEXT:       Name 2 {
+; BOLT-NEXT:         Hash: 0xFDE4B5D2
+; BOLT-NEXT:         String: {{.+}} "fooVar"
+; BOLT-NEXT:         Entry @ {{.+}} {
+; BOLT-NEXT:           Abbrev: [[ABBREV3]]
+; BOLT-NEXT:           Tag: DW_TAG_variable
+; BOLT-NEXT:           DW_IDX_compile_unit: 0x01
+; BOLT-NEXT:           DW_IDX_die_offset: 0x00000028
+; BOLT-NEXT:           DW_IDX_parent: <parent not indexed>
+; BOLT-NEXT:         }
 ; BOLT-NEXT:       }
-; BOLT-NEXT:     }
-; BOLT-NEXT:   ]
-; BOLT-NEXT:   Bucket 1 [
-; BOLT-NEXT:     Name 2 {
-; BOLT-NEXT:       Hash: 0xB887389
-; BOLT-NEXT:       String: {{.+}} "foo"
-; BOLT-NEXT:       Entry @ {{.+}} {
-; BOLT-NEXT:         Abbrev: [[ABBREV2]]
-; BOLT-NEXT:         Tag: DW_TAG_subprogram
-; BOLT-NEXT:         DW_IDX_compile_unit: 0x01
-; BOLT-NEXT:         DW_IDX_die_offset: 0x0000005e
+; BOLT-NEXT:     ]
+; BOLT-NEXT:     Bucket 3 [
+; BOLT-NEXT:       EMPTY
+; BOLT-NEXT:     ]
+; BOLT-NEXT:     Bucket 4 [
+; BOLT-NEXT:       EMPTY
+; BOLT-NEXT:     ]
+; BOLT-NEXT:     Bucket 5 [
+; BOLT-NEXT:       EMPTY
+; BOLT-NEXT:     ]
+; BOLT-NEXT:     Bucket 6 [
+; BOLT-NEXT:       Name 3 {
+; BOLT-NEXT:         Hash: 0xB88B3D2
+; BOLT-NEXT:         String: {{.+}} "use"
+; BOLT-NEXT:         Entry @ {{.+}} {
+; BOLT-NEXT:           Abbrev: [[ABBREV1]]
+; BOLT-NEXT:           Tag: DW_TAG_subprogram
+; BOLT-NEXT:           DW_IDX_compile_unit: 0x00
+; BOLT-NEXT:           DW_IDX_die_offset: 0x00000028
+; BOLT-NEXT:           DW_IDX_parent: <parent not indexed>
+; BOLT-NEXT:         }
 ; BOLT-NEXT:       }
-; BOLT-NEXT:     }
-; BOLT-NEXT:     Name 3 {
-; BOLT-NEXT:       Hash: 0x8C06E589
-; BOLT-NEXT:       String: {{.+}} "_Z3usePiS_"
-; BOLT-NEXT:       Entry @ {{.+}} {
-; BOLT-NEXT:         Abbrev: [[ABBREV2]]
-; BOLT-NEXT:         Tag: DW_TAG_subprogram
-; BOLT-NEXT:         DW_IDX_compile_unit: 0x00
-; BOLT-NEXT:         DW_IDX_die_offset: 0x00000028
+; BOLT-NEXT:       Name 4 {
+; BOLT-NEXT:         Hash: 0x69A7307E
+; BOLT-NEXT:         String: {{.+}} "_Z6useFooPi"
+; BOLT-NEXT:         Entry @ {{.+}} {
+; BOLT-NEXT:           Abbrev: [[ABBREV1]]
+; BOLT-NEXT:           Tag: DW_TAG_subprogram
+; BOLT-NEXT:           DW_IDX_compile_unit: 0x01
+; BOLT-NEXT:           DW_IDX_die_offset: 0x00000037
+; BOLT-NEXT:           DW_IDX_parent: <parent not indexed>
+; BOLT-NEXT:         }
+; BOLT-NEXT:         Entry @ {{.+}} {
+; BOLT-NEXT:           Abbrev: [[ABBREV2]]
+; BOLT-NEXT:           Tag: DW_TAG_inlined_subroutine
+; BOLT-NEXT:           DW_IDX_compile_unit: 0x01
+; BOLT-NEXT:           DW_IDX_die_offset: 0x00000081
+; BOLT-NEXT:           DW_IDX_parent: Entry @ 0x14b
+; BOLT-NEXT:         }
 ; BOLT-NEXT:       }
-; BOLT-NEXT:     }
-; BOLT-NEXT:   ]
-; BOLT-NEXT:   Bucket 2 [
-; BOLT-NEXT:     Name 4 {
-; BOLT-NEXT:       Hash: 0xB88B3D2
-; BOLT-NEXT:       String: {{.+}} "use"
-; BOLT-NEXT:       Entry @ {{.+}} {
-; BOLT-NEXT:         Abbrev: [[ABBREV2]]
-; BOLT-NEXT:         Tag: DW_TAG_subprogram
-; BOLT-NEXT:         DW_IDX_compile_unit: 0x00
-; BOLT-NEXT:         DW_IDX_die_offset: 0x00000028
+; BOLT-NEXT:       Name 5 {
+; BOLT-NEXT:         Hash: 0x7C9A7F6A
+; BOLT-NEXT:         String: {{.+}} "main"
+; BOLT-NEXT:         Entry @ {{.+}} {
+; BOLT-NEXT:           Abbrev: [[ABBREV1]]
+; BOLT-NEXT:           Tag: DW_TAG_subprogram
+; BOLT-NEXT:           DW_IDX_compile_unit: 0x00
+; BOLT-NEXT:           DW_IDX_die_offset: 0x00000049
+; BOLT-NEXT:           DW_IDX_parent: <parent not indexed>
+; BOLT-NEXT:         }
 ; BOLT-NEXT:       }
-; BOLT-NEXT:     }
-; BOLT-NEXT:     Name 5 {
-; BOLT-NEXT:       Hash: 0x7C9A7F6A
-; BOLT-NEXT:       String: {{.+}} "main"
-; BOLT-NEXT:       Entry @ {{.+}} {
-; BOLT-NEXT:         Abbrev: [[ABBREV2]]
-; BOLT-NEXT:         Tag: DW_TAG_subprogram
-; BOLT-NEXT:         DW_IDX_compile_unit: 0x00
-; BOLT-NEXT:         DW_IDX_die_offset: 0x00000049
+; BOLT-NEXT:       Name 6 {
+; BOLT-NEXT:         Hash: 0xB5063CFE
+; BOLT-NEXT:         String: {{.+}} "_Z3fooi"
+; BOLT-NEXT:         Entry @ {{.+}} {
+; BOLT-NEXT:           Abbrev: [[ABBREV1]]
+; BOLT-NEXT:           Tag: DW_TAG_subprogram
+; BOLT-NEXT:           DW_IDX_compile_unit: 0x01
+; BOLT-NEXT:           DW_IDX_die_offset: 0x0000005e
+; BOLT-NEXT:           DW_IDX_parent: <parent not indexed>
+; BOLT-NEXT:         }
 ; BOLT-NEXT:       }
-; BOLT-NEXT:     }
-; BOLT-NEXT:     Name 6 {
-; BOLT-NEXT:       Hash: 0xFDE4B5D2
-; BOLT-NEXT:       String: {{.+}} "fooVar"
-; BOLT-NEXT:       Entry @ {{.+}} {
-; BOLT-NEXT:         Abbrev: [[ABBREV3]]
-; BOLT-NEXT:         Tag: DW_TAG_variable
-; BOLT-NEXT:         DW_IDX_compile_unit: 0x01
-; BOLT-NEXT:         DW_IDX_die_offset: 0x00000028
+; BOLT-NEXT:     ]
+; BOLT-NEXT:     Bucket 7 [
+; BOLT-NEXT:       Name 7 {
+; BOLT-NEXT:         Hash: 0x8C06E589
+; BOLT-NEXT:         String: {{.+}} "_Z3usePiS_"
+; BOLT-NEXT:         Entry @ {{.+}} {
+; BOLT-NEXT:           Abbrev: [[ABBREV1]]
+; BOLT-NEXT:           Tag: DW_TAG_subprogram
+; BOLT-NEXT:           DW_IDX_compile_unit: 0x00
+; BOLT-NEXT:           DW_IDX_die_offset: 0x00000028
+; BOLT-NEXT:           DW_IDX_parent: <parent not indexed>
+; BOLT-NEXT:         }
 ; BOLT-NEXT:       }
-; BOLT-NEXT:     }
-; BOLT-NEXT:   ]
-; BOLT-NEXT:   Bucket 3 [
-; BOLT-NEXT:     Name 7 {
-; BOLT-NEXT:       Hash: 0x7C952063
-; BOLT-NEXT:       String: {{.+}} "char"
-; BOLT-NEXT:       Entry @ {{.+}} {
-; BOLT-NEXT:         Abbrev: [[ABBREV1]]
-; BOLT-NEXT:         Tag: DW_TAG_base_type
-; BOLT-NEXT:         DW_IDX_compile_unit: 0x00
-; BOLT-NEXT:         DW_IDX_die_offset: 0x00000092
+; BOLT-NEXT:     ]
+; BOLT-NEXT:     Bucket 8 [
+; BOLT-NEXT:       Name 8 {
+; BOLT-NEXT:         Hash: 0xB888030
+; BOLT-NEXT:         String: {{.+}} "int"
+; BOLT-NEXT:         Entry @ {{.+}} {
+; BOLT-NEXT:           Abbrev: [[ABBREV4]]
+; BOLT-NEXT:           Tag: DW_TAG_base_type
+; BOLT-NEXT:           DW_IDX_compile_unit: 0x01
+; BOLT-NEXT:           DW_IDX_die_offset: 0x00000033
+; BOLT-NEXT:           DW_IDX_parent: <parent not indexed>
+; BOLT-NEXT:         }
+; BOLT-NEXT:         Entry @ 0x{{.+}} {
+; BOLT-NEXT:           Abbrev: [[ABBREV4]]
+; BOLT-NEXT:           Tag: DW_TAG_base_type
+; BOLT-NEXT:           DW_IDX_compile_unit: 0x00
+; BOLT-NEXT:           DW_IDX_die_offset: 0x0000007f
+; BOLT-NEXT:           DW_IDX_parent: <parent not indexed>
+; BOLT-NEXT:         }
 ; BOLT-NEXT:       }
-; BOLT-NEXT:     }
-; BOLT-NEXT:   ]
-; BOLT-NEXT:   Bucket 4 [
-; BOLT-NEXT:     EMPTY
-; BOLT-NEXT:   ]
-; BOLT-NEXT:   Bucket 5 [
-; BOLT-NEXT:     EMPTY
-; BOLT-NEXT:   ]
-; BOLT-NEXT:   Bucket 6 [
-; BOLT-NEXT:     Name 8 {
-; BOLT-NEXT:       Hash: 0xB5063CFE
-; BOLT-NEXT:       String: {{.+}} "_Z3fooi"
-; BOLT-NEXT:       Entry @ {{.+}} {
-; BOLT-NEXT:         Abbrev: [[ABBREV2]]
-; BOLT-NEXT:         Tag: DW_TAG_subprogram
-; BOLT-NEXT:         DW_IDX_compile_unit: 0x01
-; BOLT-NEXT:         DW_IDX_die_offset: 0x0000005e
+; BOLT-NEXT:     ]
+; BOLT-NEXT:     Bucket 9 [
+; BOLT-NEXT:       Name 9 {
+; BOLT-NEXT:         Hash: 0xB887389
+; BOLT-NEXT:         String: {{.+}} "foo"
+; BOLT-NEXT:         Entry @ 0x14b {
+; BOLT-NEXT:           Abbrev: [[ABBREV1]]
+; BOLT-NEXT:           Tag: DW_TAG_subprogram
+; BOLT-NEXT:           DW_IDX_compile_unit: 0x01
+; BOLT-NEXT:           DW_IDX_die_offset: 0x0000005e
+; BOLT-NEXT:           DW_IDX_parent: <parent not indexed>
+; BOLT-NEXT:         }
 ; BOLT-NEXT:       }
-; BOLT-NEXT:     }
-; BOLT-NEXT:   ]
-; BOLT-NEXT:   Bucket 7 [
-; BOLT-NEXT:     EMPTY
-; BOLT-NEXT:   ]
-; BOLT-NEXT: }
+; BOLT-NEXT:       Name 10 {
+; BOLT-NEXT:         Hash: 0x7C952063
+; BOLT-NEXT:         String: {{.+}} "char"
+; BOLT-NEXT:         Entry @ {{.+}} {
+; BOLT-NEXT:           Abbrev: [[ABBREV4]]
+; BOLT-NEXT:           Tag: DW_TAG_base_type
+; BOLT-NEXT:           DW_IDX_compile_unit: 0x00
+; BOLT-NEXT:           DW_IDX_die_offset: 0x00000092
+; BOLT-NEXT:           DW_IDX_parent: <parent not indexed>
+; BOLT-NEXT:         }
+; BOLT-NEXT:       }
+; BOLT-NEXT:     ]
+; BOLT-NEXT:   }
diff --git a/bolt/test/X86/dwarf5-debug-names.test b/bolt/test/X86/dwarf5-debug-names.test
index de0e88d7a36ae5..ef11ee0c479b7f 100644
--- a/bolt/test/X86/dwarf5-debug-names.test
+++ b/bolt/test/X86/dwarf5-debug-names.test
@@ -11,7 +11,7 @@
 ; BOLT: [[OFFSET2:0x[0-9a-f]*]]: Compile Unit
 ; BOLT:       Name Index @ 0x0 {
 ; BOLT-NEXT:   Header {
-; BOLT-NEXT:     Length: 0x1C2
+; BOLT-NEXT:     Length: 0x1D4
 ; BOLT-NEXT:     Format: DWARF32
 ; BOLT-NEXT:     Version: 5
 ; BOLT-NEXT:     CU count: 2
@@ -19,7 +19,7 @@
 ; BOLT-NEXT:     Foreign TU count: 0
 ; BOLT-NEXT:     Bucket count: 14
 ; BOLT-NEXT:     Name count: 15
-; BOLT-NEXT:     Abbreviations table size: 0x29
+; BOLT-NEXT:     Abbreviations table size: 0x3D
 ; BOLT-NEXT:     Augmentation: 'BOLT'
 ; BOLT-NEXT:   }
 ; BOLT-NEXT:   Compilation Unit offsets [
@@ -31,27 +31,38 @@
 ; BOLT-NEXT:       Tag: DW_TAG_structure_type
 ; BOLT-NEXT:       DW_IDX_compile_unit: DW_FORM_data1
 ; BOLT-NEXT:       DW_IDX_die_offset: DW_FORM_ref4
+; BOLT-NEXT:       DW_IDX_parent: DW_FORM_flag_present
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:     Abbreviation [[ABBREV2:0x[0-9a-f]*]] {
 ; BOLT-NEXT:       Tag: DW_TAG_namespace
 ; BOLT-NEXT:       DW_IDX_compile_unit: DW_FORM_data1
 ; BOLT-NEXT:       DW_IDX_die_offset: DW_FORM_ref4
+; BOLT-NEXT:       DW_IDX_parent: DW_FORM_flag_present
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:     Abbreviation [[ABBREV3:0x[0-9a-f]*]] {
 ; BOLT-NEXT:       Tag: DW_TAG_subprogram
 ; BOLT-NEXT:       DW_IDX_compile_unit: DW_FORM_data1
 ; BOLT-NEXT:       DW_IDX_die_offset: DW_FORM_ref4
+; BOLT-NEXT:       DW_IDX_parent: DW_FORM_flag_present
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:     Abbreviation [[ABBREV4:0x[0-9a-f]*]] {
 ; BOLT-NEXT:       Tag: DW_TAG_base_type
 ; BOLT-NEXT:       DW_IDX_compile_unit: DW_FORM_data1
 ; BOLT-NEXT:       DW_IDX_die_offset: DW_FORM_ref4
+; BOLT-NEXT:       DW_IDX_parent: DW_FORM_flag_present
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:     Abbreviation [[ABBREV5:0x[0-9a-f]*]] {
 ; BOLT-NEXT:       Tag: DW_TAG_variable
 ; BOLT-NEXT:       DW_IDX_compile_unit: DW_FORM_data1
 ; BOLT-NEXT:       DW_IDX_die_offset: DW_FORM_ref4
+; BOLT-NEXT:       DW_IDX_parent: DW_FORM_flag_present
 ; BOLT-NEXT:     }
+; BOLT-NEXT:       Abbreviation [[ABBREV6:0x[0-9a-f]*]] {
+; BOLT-NEXT:         Tag: DW_TAG_structure_type
+; BOLT-NEXT:         DW_IDX_compile_unit: DW_FORM_data1
+; BOLT-NEXT:         DW_IDX_die_offset: DW_FORM_ref4
+; BOLT-NEXT:         DW_IDX_parent: DW_FORM_ref4
+; BOLT-NEXT:       }
 ; BOLT-NEXT:   ]
 ; BOLT-NEXT:   Bucket 0 [
 ; BOLT-NEXT:     Name 1 {
@@ -62,6 +73,7 @@
 ; BOLT-NEXT:         Tag: DW_TAG_structure_type
 ; BOLT-NEXT:         DW_IDX_compile_unit: 0x00
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x0000002f
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:   ]
@@ -74,6 +86,7 @@
 ; BOLT-NEXT:         Tag: DW_TAG_structure_type
 ; BOLT-NEXT:         DW_IDX_compile_unit: 0x00
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x000000eb
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:   ]
@@ -81,17 +94,12 @@
 ; BOLT-NEXT:     Name 3 {
 ; BOLT-NEXT:       Hash: 0x8CFC710C
 ; BOLT-NEXT:       String: {{.+}} "(anonymous namespace)"
-; BOLT-NEXT:       Entry @ {{.+}} {
-; BOLT-NEXT:         Abbrev: [[ABBREV2]]
-; BOLT-NEXT:         Tag: DW_TAG_namespace
-; BOLT-NEXT:         DW_IDX_compile_unit: 0x00
-; BOLT-NEXT:         DW_IDX_die_offset: 0x00000061
-; BOLT-NEXT:       }
-; BOLT-NEXT:       Entry @ {{.+}} {
+; BOLT-NEXT:       Entry @ [[ENTRY:0x[0-9a-f]*]] {
 ; BOLT-NEXT:         Abbrev: [[ABBREV2]]
 ; BOLT-NEXT:         Tag: DW_TAG_namespace
 ; BOLT-NEXT:         DW_IDX_compile_unit: 0x00
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x00000061
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:     Name 4 {
@@ -102,6 +110,7 @@
 ; BOLT-NEXT:         Tag: DW_TAG_structure_type
 ; BOLT-NEXT:         DW_IDX_compile_unit: 0x01
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x0000005a
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:   ]
@@ -120,6 +129,7 @@
 ; BOLT-NEXT:         Tag: DW_TAG_structure_type
 ; BOLT-NEXT:         DW_IDX_compile_unit: 0x00
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x000000c9
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:     Name 6 {
@@ -130,6 +140,7 @@
 ; BOLT-NEXT:         Tag: DW_TAG_subprogram
 ; BOLT-NEXT:         DW_IDX_compile_unit: 0x01
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x00000033
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:     Name 7 {
@@ -140,12 +151,14 @@
 ; BOLT-NEXT:         Tag: DW_TAG_base_type
 ; BOLT-NEXT:         DW_IDX_compile_unit: 0x01
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x0000009f
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:       Entry @ {{.+}} {
 ; BOLT-NEXT:         Abbrev: [[ABBREV4]]
 ; BOLT-NEXT:         Tag: DW_TAG_base_type
 ; BOLT-NEXT:         DW_IDX_compile_unit: 0x00
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x000000c5
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:   ]
@@ -158,6 +171,7 @@
 ; BOLT-NEXT:         Tag: DW_TAG_structure_type
 ; BOLT-NEXT:         DW_IDX_compile_unit: 0x00
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x0000003f
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:     Name 9 {
@@ -168,6 +182,7 @@
 ; BOLT-NEXT:         Tag: DW_TAG_variable
 ; BOLT-NEXT:         DW_IDX_compile_unit: 0x01
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x00000024
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:   ]
@@ -180,6 +195,7 @@
 ; BOLT-NEXT:         Tag: DW_TAG_subprogram
 ; BOLT-NEXT:         DW_IDX_compile_unit: 0x01
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x00000033
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:   ]
@@ -192,6 +208,7 @@
 ; BOLT-NEXT:         Tag: DW_TAG_variable
 ; BOLT-NEXT:         DW_IDX_compile_unit: 0x00
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x00000024
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:   ]
@@ -207,12 +224,14 @@
 ; BOLT-NEXT:         Tag: DW_TAG_base_type
 ; BOLT-NEXT:         DW_IDX_compile_unit: 0x01
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x0000002f
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:       Entry @ {{.+}} {
 ; BOLT-NEXT:         Abbrev: [[ABBREV4]]
 ; BOLT-NEXT:         Tag: DW_TAG_base_type
 ; BOLT-NEXT:         DW_IDX_compile_unit: 0x00
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x0000005d
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:     Name 13 {
@@ -223,12 +242,14 @@
 ; BOLT-NEXT:         Tag: DW_TAG_structure_type
 ; BOLT-NEXT:         DW_IDX_compile_unit: 0x01
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x00000078
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:       Entry @ {{.+}} {
 ; BOLT-NEXT:         Abbrev: [[ABBREV1]]
 ; BOLT-NEXT:         Tag: DW_TAG_structure_type
 ; BOLT-NEXT:         DW_IDX_compile_unit: 0x00
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x00000104
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:     Name 14 {
@@ -239,6 +260,7 @@
 ; BOLT-NEXT:         Tag: DW_TAG_subprogram
 ; BOLT-NEXT:         DW_IDX_compile_unit: 0x00
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x00000073
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:   ]
@@ -250,10 +272,11 @@
 ; BOLT-NEXT:       Hash: 0x59796A
 ; BOLT-NEXT:       String: {{.+}} "t1"
 ; BOLT-NEXT:       Entry @ {{.+}} {
-; BOLT-NEXT:         Abbrev: [[ABBREV1]]
+; BOLT-NEXT:         Abbrev: [[ABBREV6]]
 ; BOLT-NEXT:         Tag: DW_TAG_structure_type
 ; BOLT-NEXT:         DW_IDX_compile_unit: 0x00
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x00000062
+; BOLT-NEXT:         DW_IDX_parent: Entry @ [[ENTRY]]
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:   ]
diff --git a/bolt/test/X86/dwarf5-df-debug-names-generate-debug-names.test b/bolt/test/X86/dwarf5-df-debug-names-generate-debug-names.test
index 3be327b780e525..b0b6707a4ea5cb 100644
--- a/bolt/test/X86/dwarf5-df-debug-names-generate-debug-names.test
+++ b/bolt/test/X86/dwarf5-df-debug-names-generate-debug-names.test
@@ -17,7 +17,7 @@
 ; BOLT: [[OFFSET2:0x[0-9a-f]*]]: Compile Unit
 ; BOLT:       Name Index @ 0x0 {
 ; BOLT-NEXT:   Header {
-; BOLT-NEXT:     Length: 0x148
+; BOLT-NEXT:     Length: 0x14E
 ; BOLT-NEXT:     Format: DWARF32
 ; BOLT-NEXT:     Version: 5
 ; BOLT-NEXT:     CU count: 2
@@ -25,7 +25,7 @@
 ; BOLT-NEXT:     Foreign TU count: 0
 ; BOLT-NEXT:     Bucket count: 11
 ; BOLT-NEXT:     Name count: 11
-; BOLT-NEXT:     Abbreviations table size: 0x19
+; BOLT-NEXT:     Abbreviations table size: 0x1F
 ; BOLT-NEXT:     Augmentation: 'BOLT'
 ; BOLT-NEXT:   }
 ; BOLT-NEXT:   Compilation Unit offsets [
@@ -37,16 +37,19 @@
 ; BOLT-NEXT:       Tag: DW_TAG_variable
 ; BOLT-NEXT:       DW_IDX_compile_unit: DW_FORM_data1
 ; BOLT-NEXT:       DW_IDX_die_offset: DW_FORM_ref4
+; BOLT-NEXT:       DW_IDX_parent: DW_FORM_flag_present
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:     Abbreviation [[ABBREV2:0x[0-9a-f]*]] {
 ; BOLT-NEXT:       Tag: DW_TAG_base_type
 ; BOLT-NEXT:       DW_IDX_compile_unit: DW_FORM_data1
 ; BOLT-NEXT:       DW_IDX_die_offset: DW_FORM_ref4
+; BOLT-NEXT:       DW_IDX_parent: DW_FORM_flag_present
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:     Abbreviation [[ABBREV3:0x[0-9a-f]*]] {
 ; BOLT-NEXT:       Tag: DW_TAG_subprogram
 ; BOLT-NEXT:       DW_IDX_compile_unit: DW_FORM_data1
 ; BOLT-NEXT:       DW_IDX_die_offset: DW_FORM_ref4
+; BOLT-NEXT:       DW_IDX_parent: DW_FORM_flag_present
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:   ]
 ; BOLT-NEXT:   Bucket 0 [
@@ -58,6 +61,7 @@
 ; BOLT-NEXT:         Tag: DW_TAG_variable
 ; BOLT-NEXT:         DW_IDX_compile_unit: 0x00
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x00000029
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:     Name 2 {
@@ -68,6 +72,7 @@
 ; BOLT-NEXT:         Tag: DW_TAG_base_type
 ; BOLT-NEXT:         DW_IDX_compile_unit: 0x00
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x0000008c
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:   ]
@@ -80,6 +85,7 @@
 ; BOLT-NEXT:         Tag: DW_TAG_variable
 ; BOLT-NEXT:         DW_IDX_compile_unit: 0x01
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x00000029
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:     Name 4 {
@@ -90,6 +96,7 @@
 ; BOLT-NEXT:         Tag: DW_TAG_variable
 ; BOLT-NEXT:         DW_IDX_compile_unit: 0x01
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x0000001a
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:   ]
@@ -102,6 +109,7 @@
 ; BOLT-NEXT:         Tag: DW_TAG_subprogram
 ; BOLT-NEXT:         DW_IDX_compile_unit: 0x00
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x00000034
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:   ]
@@ -114,6 +122,7 @@
 ; BOLT-NEXT:         Tag: DW_TAG_subprogram
 ; BOLT-NEXT:         DW_IDX_compile_unit: 0x01
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x00000034
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:   ]
@@ -129,6 +138,7 @@
 ; BOLT-NEXT:         Tag: DW_TAG_subprogram
 ; BOLT-NEXT:         DW_IDX_compile_unit: 0x00
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x00000034
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:   ]
@@ -141,12 +151,14 @@
 ; BOLT-NEXT:         Tag: DW_TAG_base_type
 ; BOLT-NEXT:         DW_IDX_compile_unit: 0x00
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x00000025
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:       Entry @ {{.+}} {
 ; BOLT-NEXT:         Abbrev: [[ABBREV2]]
 ; BOLT-NEXT:         Tag: DW_TAG_base_type
 ; BOLT-NEXT:         DW_IDX_compile_unit: 0x01
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x00000025
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:   ]
@@ -159,6 +171,7 @@
 ; BOLT-NEXT:         Tag: DW_TAG_subprogram
 ; BOLT-NEXT:         DW_IDX_compile_unit: 0x01
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x00000034
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:     Name 10 {
@@ -169,6 +182,7 @@
 ; BOLT-NEXT:         Tag: DW_TAG_subprogram
 ; BOLT-NEXT:         DW_IDX_compile_unit: 0x00
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x00000057
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:   ]
@@ -187,6 +201,7 @@
 ; BOLT-NEXT:         Tag: DW_TAG_variable
 ; BOLT-NEXT:         DW_IDX_compile_unit: 0x00
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x0000001a
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:   ]
diff --git a/bolt/test/X86/dwarf5-df-debug-names.test b/bolt/test/X86/dwarf5-df-debug-names.test
index 0352d0ff720822..d53f2bd13620fc 100644
--- a/bolt/test/X86/dwarf5-df-debug-names.test
+++ b/bolt/test/X86/dwarf5-df-debug-names.test
@@ -14,136 +14,148 @@
 
 ; BOLT: [[OFFSET:0x[0-9a-f]*]]: Compile Unit
 ; BOLT: [[OFFSET1:0x[0-9a-f]*]]: Compile Unit
-: BOLT:       Name Index @ 0x0 {
-: BOLT-NEXT:   Header {
-: BOLT-NEXT:     Length: 0xF4
-: BOLT-NEXT:     Format: DWARF32
-: BOLT-NEXT:     Version: 5
-: BOLT-NEXT:     CU count: 2
-: BOLT-NEXT:     Local TU count: 0
-: BOLT-NEXT:     Foreign TU count: 0
-: BOLT-NEXT:     Bucket count: 7
-: BOLT-NEXT:     Name count: 7
-: BOLT-NEXT:     Abbreviations table size: 0x21
-: BOLT-NEXT:     Augmentation: 'BOLT'
-: BOLT-NEXT:   }
-: BOLT-NEXT:   Compilation Unit offsets [
-: BOLT-NEXT:     CU[0]: [[OFFSET]]
-: BOLT-NEXT:     CU[1]: [[OFFSET1]]
-: BOLT-NEXT:   ]
-: BOLT-NEXT:   Abbreviations [
-: BOLT-NEXT:     Abbreviation [[ABBREV1:0x[0-9a-f]*]] {
-: BOLT-NEXT:       Tag: DW_TAG_structure_type
-: BOLT-NEXT:       DW_IDX_compile_unit: DW_FORM_data1
-: BOLT-NEXT:       DW_IDX_die_offset: DW_FORM_ref4
-: BOLT-NEXT:     }
-: BOLT-NEXT:     Abbreviation [[ABBREV2:0x[0-9a-f]*]] {
-: BOLT-NEXT:       Tag: DW_TAG_base_type
-: BOLT-NEXT:       DW_IDX_compile_unit: DW_FORM_data1
-: BOLT-NEXT:       DW_IDX_die_offset: DW_FORM_ref4
-: BOLT-NEXT:     }
-: BOLT-NEXT:     Abbreviation [[ABBREV3:0x[0-9a-f]*]] {
-: BOLT-NEXT:       Tag: DW_TAG_variable
-: BOLT-NEXT:       DW_IDX_compile_unit: DW_FORM_data1
-: BOLT-NEXT:       DW_IDX_die_offset: DW_FORM_ref4
-: BOLT-NEXT:     }
-: BOLT-NEXT:     Abbreviation [[ABBREV4:0x[0-9a-f]*]] {
-: BOLT-NEXT:       Tag: DW_TAG_subprogram
-: BOLT-NEXT:       DW_IDX_compile_unit: DW_FORM_data1
-: BOLT-NEXT:       DW_IDX_die_offset: DW_FORM_ref4
-: BOLT-NEXT:     }
-: BOLT-NEXT:   ]
-: BOLT-NEXT:   Bucket 0 [
-: BOLT-NEXT:     EMPTY
-: BOLT-NEXT:   ]
-: BOLT-NEXT:   Bucket 1 [
-: BOLT-NEXT:     Name 1 {
-: BOLT-NEXT:       Hash: 0x7C96E4DB
-: BOLT-NEXT:       String: {{.+}} "Foo2"
-: BOLT-NEXT:       Entry @ {{.+}} {
-: BOLT-NEXT:         Abbrev: [[ABBREV1]]
-: BOLT-NEXT:         Tag: DW_TAG_structure_type
-: BOLT-NEXT:         DW_IDX_compile_unit: 0x00
-: BOLT-NEXT:         DW_IDX_die_offset: 0x00000068
-: BOLT-NEXT:       }
-: BOLT-NEXT:     }
-: BOLT-NEXT:   ]
-: BOLT-NEXT:   Bucket 2 [
-: BOLT-NEXT:     Name 2 {
-: BOLT-NEXT:       Hash: 0xBA564846
-: BOLT-NEXT:       String: {{.+}} "Foo2Int"
-: BOLT-NEXT:       Entry @ {{.+}} {
-: BOLT-NEXT:         Abbrev: [[ABBREV1]]
-: BOLT-NEXT:         Tag: DW_TAG_structure_type
-: BOLT-NEXT:         DW_IDX_compile_unit: 0x01
-: BOLT-NEXT:         DW_IDX_die_offset: 0x00000025
-: BOLT-NEXT:       }
-: BOLT-NEXT:     }
-: BOLT-NEXT:   ]
-: BOLT-NEXT:   Bucket 3 [
-: BOLT-NEXT:     Name 3 {
-: BOLT-NEXT:       Hash: 0xB888030
-: BOLT-NEXT:       String: {{.+}} "int"
-: BOLT-NEXT:       Entry @ {{.+}} {
-: BOLT-NEXT:         Abbrev: [[ABBREV2]]
-: BOLT-NEXT:         Tag: DW_TAG_base_type
-: BOLT-NEXT:         DW_IDX_compile_unit: 0x01
-: BOLT-NEXT:         DW_IDX_die_offset: 0x00000043
-: BOLT-NEXT:       }
-: BOLT-NEXT:       Entry @ {{.+}} {
-: BOLT-NEXT:         Abbrev: [[ABBREV2]]
-: BOLT-NEXT:         Tag: DW_TAG_base_type
-: BOLT-NEXT:         DW_IDX_compile_unit: 0x00
-: BOLT-NEXT:         DW_IDX_die_offset: 0x00000056
-: BOLT-NEXT:       }
-: BOLT-NEXT:     }
-: BOLT-NEXT:     Name 4 {
-: BOLT-NEXT:       Hash: 0xF73809C
-: BOLT-NEXT:       String: {{.+}} "Foo2a"
-: BOLT-NEXT:       Entry @ {{.+}} {
-: BOLT-NEXT:         Abbrev: [[ABBREV1]]
-: BOLT-NEXT:         Tag: DW_TAG_structure_type
-: BOLT-NEXT:         DW_IDX_compile_unit: 0x00
-: BOLT-NEXT:         DW_IDX_die_offset: 0x00000078
-: BOLT-NEXT:       }
-: BOLT-NEXT:     }
-: BOLT-NEXT:     Name 5 {
-: BOLT-NEXT:       Hash: 0x7C96CB76
-: BOLT-NEXT:       String: {{.+}} "fint"
-: BOLT-NEXT:       Entry @ {{.+}} {
-: BOLT-NEXT:         Abbrev: [[ABBREV3]]
-: BOLT-NEXT:         Tag: DW_TAG_variable
-: BOLT-NEXT:         DW_IDX_compile_unit: 0x01
-: BOLT-NEXT:         DW_IDX_die_offset: 0x0000001a
-: BOLT-NEXT:       }
-: BOLT-NEXT:     }
-: BOLT-NEXT:     Name 6 {
-: BOLT-NEXT:       Hash: 0x7C9A7F6A
-: BOLT-NEXT:       String: {{.+}} "main"
-: BOLT-NEXT:       Entry @ {{.+}} {
-: BOLT-NEXT:         Abbrev: [[ABBREV4]]
-: BOLT-NEXT:         Tag: DW_TAG_subprogram
-: BOLT-NEXT:         DW_IDX_compile_unit: 0x00
-: BOLT-NEXT:         DW_IDX_die_offset: 0x0000001a
-: BOLT-NEXT:       }
-: BOLT-NEXT:     }
-: BOLT-NEXT:   ]
-: BOLT-NEXT:   Bucket 4 [
-: BOLT-NEXT:     EMPTY
-: BOLT-NEXT:   ]
-: BOLT-NEXT:   Bucket 5 [
-: BOLT-NEXT:     Name 7 {
-: BOLT-NEXT:       Hash: 0x7C952063
-: BOLT-NEXT:       String: {{.+}} "char"
-: BOLT-NEXT:       Entry @ {{.+}} {
-: BOLT-NEXT:         Abbrev: [[ABBREV2]]
-: BOLT-NEXT:         Tag: DW_TAG_base_type
-: BOLT-NEXT:         DW_IDX_compile_unit: 0x00
-: BOLT-NEXT:         DW_IDX_die_offset: 0x00000064
-: BOLT-NEXT:       }
-: BOLT-NEXT:     }
-: BOLT-NEXT:   ]
-: BOLT-NEXT:   Bucket 6 [
-: BOLT-NEXT:     EMPTY
-: BOLT-NEXT:   ]
-: BOLT-NEXT: }
+; BOLT:       Name Index @ 0x0 {
+; BOLT-NEXT:   Header {
+; BOLT-NEXT:     Length: 0xFC
+; BOLT-NEXT:     Format: DWARF32
+; BOLT-NEXT:     Version: 5
+; BOLT-NEXT:     CU count: 2
+; BOLT-NEXT:     Local TU count: 0
+; BOLT-NEXT:     Foreign TU count: 0
+; BOLT-NEXT:     Bucket count: 7
+; BOLT-NEXT:     Name count: 7
+; BOLT-NEXT:     Abbreviations table size: 0x29
+; BOLT-NEXT:     Augmentation: 'BOLT'
+; BOLT-NEXT:   }
+; BOLT-NEXT:   Compilation Unit offsets [
+; BOLT-NEXT:     CU[0]: [[OFFSET]]
+; BOLT-NEXT:     CU[1]: [[OFFSET1]]
+; BOLT-NEXT:   ]
+; BOLT-NEXT:   Abbreviations [
+; BOLT-NEXT:     Abbreviation [[ABBREV1:0x[0-9a-f]*]] {
+; BOLT-NEXT:       Tag: DW_TAG_structure_type
+; BOLT-NEXT:       DW_IDX_compile_unit: DW_FORM_data1
+; BOLT-NEXT:       DW_IDX_die_offset: DW_FORM_ref4
+; BOLT-NEXT:       DW_IDX_parent: DW_FORM_flag_present
+; BOLT-NEXT:     }
+; BOLT-NEXT:     Abbreviation [[ABBREV2:0x[0-9a-f]*]] {
+; BOLT-NEXT:       Tag: DW_TAG_base_type
+; BOLT-NEXT:       DW_IDX_compile_unit: DW_FORM_data1
+; BOLT-NEXT:       DW_IDX_die_offset: DW_FORM_ref4
+; BOLT-NEXT:       DW_IDX_parent: DW_FORM_flag_present
+; BOLT-NEXT:     }
+; BOLT-NEXT:     Abbreviation [[ABBREV3:0x[0-9a-f]*]] {
+; BOLT-NEXT:       Tag: DW_TAG_variable
+; BOLT-NEXT:       DW_IDX_compile_unit: DW_FORM_data1
+; BOLT-NEXT:       DW_IDX_die_offset: DW_FORM_ref4
+; BOLT-NEXT:       DW_IDX_parent: DW_FORM_flag_present
+; BOLT-NEXT:     }
+; BOLT-NEXT:     Abbreviation [[ABBREV4:0x[0-9a-f]*]] {
+; BOLT-NEXT:       Tag: DW_TAG_subprogram
+; BOLT-NEXT:       DW_IDX_compile_unit: DW_FORM_data1
+; BOLT-NEXT:       DW_IDX_die_offset: DW_FORM_ref4
+; BOLT-NEXT:       DW_IDX_parent: DW_FORM_flag_present
+; BOLT-NEXT:     }
+; BOLT-NEXT:   ]
+; BOLT-NEXT:   Bucket 0 [
+; BOLT-NEXT:     EMPTY
+; BOLT-NEXT:   ]
+; BOLT-NEXT:   Bucket 1 [
+; BOLT-NEXT:     Name 1 {
+; BOLT-NEXT:       Hash: 0x7C96E4DB
+; BOLT-NEXT:       String: {{.+}} "Foo2"
+; BOLT-NEXT:       Entry @ {{.+}} {
+; BOLT-NEXT:         Abbrev: [[ABBREV1]]
+; BOLT-NEXT:         Tag: DW_TAG_structure_type
+; BOLT-NEXT:         DW_IDX_compile_unit: 0x00
+; BOLT-NEXT:         DW_IDX_die_offset: 0x00000068
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
+; BOLT-NEXT:       }
+; BOLT-NEXT:     }
+; BOLT-NEXT:   ]
+; BOLT-NEXT:   Bucket 2 [
+; BOLT-NEXT:     Name 2 {
+; BOLT-NEXT:       Hash: 0xBA564846
+; BOLT-NEXT:       String: {{.+}} "Foo2Int"
+; BOLT-NEXT:       Entry @ {{.+}} {
+; BOLT-NEXT:         Abbrev: [[ABBREV1]]
+; BOLT-NEXT:         Tag: DW_TAG_structure_type
+; BOLT-NEXT:         DW_IDX_compile_unit: 0x01
+; BOLT-NEXT:         DW_IDX_die_offset: 0x00000025
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
+; BOLT-NEXT:       }
+; BOLT-NEXT:     }
+; BOLT-NEXT:   ]
+; BOLT-NEXT:   Bucket 3 [
+; BOLT-NEXT:     Name 3 {
+; BOLT-NEXT:       Hash: 0xB888030
+; BOLT-NEXT:       String: {{.+}} "int"
+; BOLT-NEXT:       Entry @ {{.+}} {
+; BOLT-NEXT:         Abbrev: [[ABBREV2]]
+; BOLT-NEXT:         Tag: DW_TAG_base_type
+; BOLT-NEXT:         DW_IDX_compile_unit: 0x01
+; BOLT-NEXT:         DW_IDX_die_offset: 0x00000043
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
+; BOLT-NEXT:       }
+; BOLT-NEXT:       Entry @ {{.+}} {
+; BOLT-NEXT:         Abbrev: [[ABBREV2]]
+; BOLT-NEXT:         Tag: DW_TAG_base_type
+; BOLT-NEXT:         DW_IDX_compile_unit: 0x00
+; BOLT-NEXT:         DW_IDX_die_offset: 0x00000056
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
+; BOLT-NEXT:       }
+; BOLT-NEXT:     }
+; BOLT-NEXT:     Name 4 {
+; BOLT-NEXT:       Hash: 0xF73809C
+; BOLT-NEXT:       String: {{.+}} "Foo2a"
+; BOLT-NEXT:       Entry @ {{.+}} {
+; BOLT-NEXT:         Abbrev: [[ABBREV1]]
+; BOLT-NEXT:         Tag: DW_TAG_structure_type
+; BOLT-NEXT:         DW_IDX_compile_unit: 0x00
+; BOLT-NEXT:         DW_IDX_die_offset: 0x00000078
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
+; BOLT-NEXT:       }
+; BOLT-NEXT:     }
+; BOLT-NEXT:     Name 5 {
+; BOLT-NEXT:       Hash: 0x7C96CB76
+; BOLT-NEXT:       String: {{.+}} "fint"
+; BOLT-NEXT:       Entry @ {{.+}} {
+; BOLT-NEXT:         Abbrev: [[ABBREV3]]
+; BOLT-NEXT:         Tag: DW_TAG_variable
+; BOLT-NEXT:         DW_IDX_compile_unit: 0x01
+; BOLT-NEXT:         DW_IDX_die_offset: 0x0000001a
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
+; BOLT-NEXT:       }
+; BOLT-NEXT:     }
+; BOLT-NEXT:     Name 6 {
+; BOLT-NEXT:       Hash: 0x7C9A7F6A
+; BOLT-NEXT:       String: {{.+}} "main"
+; BOLT-NEXT:       Entry @ {{.+}} {
+; BOLT-NEXT:         Abbrev: [[ABBREV4]]
+; BOLT-NEXT:         Tag: DW_TAG_subprogram
+; BOLT-NEXT:         DW_IDX_compile_unit: 0x00
+; BOLT-NEXT:         DW_IDX_die_offset: 0x0000001a
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
+; BOLT-NEXT:       }
+; BOLT-NEXT:     }
+; BOLT-NEXT:   ]
+; BOLT-NEXT:   Bucket 4 [
+; BOLT-NEXT:     EMPTY
+; BOLT-NEXT:   ]
+; BOLT-NEXT:   Bucket 5 [
+; BOLT-NEXT:     Name 7 {
+; BOLT-NEXT:       Hash: 0x7C952063
+; BOLT-NEXT:       String: {{.+}} "char"
+; BOLT-NEXT:       Entry @ {{.+}} {
+; BOLT-NEXT:         Abbrev: [[ABBREV2]]
+; BOLT-NEXT:         Tag: DW_TAG_base_type
+; BOLT-NEXT:         DW_IDX_compile_unit: 0x00
+; BOLT-NEXT:         DW_IDX_die_offset: 0x00000064
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
+; BOLT-NEXT:       }
+; BOLT-NEXT:     }
+; BOLT-NEXT:   ]
+; BOLT-NEXT:   Bucket 6 [
+; BOLT-NEXT:     EMPTY
+; BOLT-NEXT:   ]
+; BOLT-NEXT: }
diff --git a/bolt/test/X86/dwarf5-df-main-debug-names-ftu-ltu-mix.test b/bolt/test/X86/dwarf5-df-main-debug-names-ftu-ltu-mix.test
index 8a8a4b118b8c03..3a0260fd0bfb91 100644
--- a/bolt/test/X86/dwarf5-df-main-debug-names-ftu-ltu-mix.test
+++ b/bolt/test/X86/dwarf5-df-main-debug-names-ftu-ltu-mix.test
@@ -31,6 +31,7 @@
 ; BOLT-NEXT:       Tag: DW_TAG_variable
 ; BOLT-NEXT:       DW_IDX_compile_unit: 0x02
 ; BOLT-NEXT:       DW_IDX_die_offset: 0x0000001e
+; BOLT-NEXT:       DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:   }
 ; BOLT:        Name 6 {
@@ -42,6 +43,7 @@
 ; BOLT-NEXT:       DW_IDX_type_unit: 0x02
 ; BOLT-NEXT:       DW_IDX_compile_unit: 0x00
 ; BOLT-NEXT:       DW_IDX_die_offset: 0x00000021
+; BOLT-NEXT:       DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:   }
 ; BOLT:        Name 7 {
@@ -52,5 +54,6 @@
 ; BOLT-NEXT:       Tag: DW_TAG_structure_type
 ; BOLT-NEXT:       DW_IDX_type_unit: 0x00
 ; BOLT-NEXT:       DW_IDX_die_offset: 0x00000023
+; BOLT-NEXT:       DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:   }
diff --git a/bolt/test/X86/dwarf5-df-one-cu-debug-names.test b/bolt/test/X86/dwarf5-df-one-cu-debug-names.test
index 246ce7efb3bad6..bf1cad81040bd9 100644
--- a/bolt/test/X86/dwarf5-df-one-cu-debug-names.test
+++ b/bolt/test/X86/dwarf5-df-one-cu-debug-names.test
@@ -13,7 +13,7 @@
 ; BOLT: [[OFFSET:0x[0-9a-f]*]]: Compile Unit
 ; BOLT:       Name Index @ 0x0 {
 ; BOLT-NEXT:   Header {
-; BOLT-NEXT:     Length: 0xA9
+; BOLT-NEXT:     Length: 0xAF
 ; BOLT-NEXT:     Format: DWARF32
 ; BOLT-NEXT:     Version: 5
 ; BOLT-NEXT:     CU count: 1
@@ -21,7 +21,7 @@
 ; BOLT-NEXT:     Foreign TU count: 0
 ; BOLT-NEXT:     Bucket count: 5
 ; BOLT-NEXT:     Name count: 5
-; BOLT-NEXT:     Abbreviations table size: 0x13
+; BOLT-NEXT:     Abbreviations table size: 0x19
 ; BOLT-NEXT:     Augmentation: 'BOLT'
 ; BOLT-NEXT:   }
 ; BOLT-NEXT:   Compilation Unit offsets [
@@ -31,14 +31,17 @@
 ; BOLT-NEXT:     Abbreviation [[ABBREV1:0x[0-9a-f]*]] {
 ; BOLT-NEXT:       Tag: DW_TAG_structure_type
 ; BOLT-NEXT:       DW_IDX_die_offset: DW_FORM_ref4
+; BOLT-NEXT:       DW_IDX_parent: DW_FORM_flag_present
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:     Abbreviation [[ABBREV2:0x[0-9a-f]*]] {
 ; BOLT-NEXT:       Tag: DW_TAG_subprogram
 ; BOLT-NEXT:       DW_IDX_die_offset: DW_FORM_ref4
+; BOLT-NEXT:       DW_IDX_parent: DW_FORM_flag_present
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:     Abbreviation [[ABBREV3:0x[0-9a-f]*]] {
 ; BOLT-NEXT:       Tag: DW_TAG_base_type
 ; BOLT-NEXT:       DW_IDX_die_offset: DW_FORM_ref4
+; BOLT-NEXT:       DW_IDX_parent: DW_FORM_flag_present
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:   ]
 ; BOLT-NEXT:   Bucket 0 [
@@ -52,6 +55,7 @@
 ; BOLT-NEXT:         Abbrev: [[ABBREV1]]
 ; BOLT-NEXT:         Tag: DW_TAG_structure_type
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x00000068
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:     Name 2 {
@@ -61,6 +65,7 @@
 ; BOLT-NEXT:         Abbrev: [[ABBREV2]]
 ; BOLT-NEXT:         Tag: DW_TAG_subprogram
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x0000001a
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:   ]
@@ -75,6 +80,7 @@
 ; BOLT-NEXT:         Abbrev: [[ABBREV3]]
 ; BOLT-NEXT:         Tag: DW_TAG_base_type
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x00000056
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:   ]
@@ -86,6 +92,7 @@
 ; BOLT-NEXT:         Abbrev: [[ABBREV1]]
 ; BOLT-NEXT:         Tag: DW_TAG_structure_type
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x00000078
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:     Name 5 {
@@ -95,6 +102,7 @@
 ; BOLT-NEXT:         Abbrev: [[ABBREV3]]
 ; BOLT-NEXT:         Tag: DW_TAG_base_type
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x00000064
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:   ]
diff --git a/bolt/test/X86/dwarf5-df-types-debug-names.test b/bolt/test/X86/dwarf5-df-types-debug-names.test
index fdb962b80c7512..f5a2c9c10353e9 100644
--- a/bolt/test/X86/dwarf5-df-types-debug-names.test
+++ b/bolt/test/X86/dwarf5-df-types-debug-names.test
@@ -25,7 +25,7 @@
 
 ; BOLT:       Name Index @ 0x0 {
 ; BOLT-NEXT:   Header {
-; BOLT-NEXT:     Length: 0x174
+; BOLT-NEXT:     Length: 0x17E
 ; BOLT-NEXT:     Format: DWARF32
 ; BOLT-NEXT:     Version: 5
 ; BOLT-NEXT:     CU count: 2
@@ -33,7 +33,7 @@
 ; BOLT-NEXT:     Foreign TU count: 4
 ; BOLT-NEXT:     Bucket count: 9
 ; BOLT-NEXT:     Name count: 9
-; BOLT-NEXT:     Abbreviations table size: 0x2D
+; BOLT-NEXT:     Abbreviations table size: 0x37
 ; BOLT-NEXT:     Augmentation: 'BOLT'
 ; BOLT-NEXT:   }
 ; BOLT-NEXT:   Compilation Unit offsets [
@@ -52,27 +52,32 @@
 ; BOLT-NEXT:       DW_IDX_type_unit: DW_FORM_data1
 ; BOLT-NEXT:       DW_IDX_compile_unit: DW_FORM_data1
 ; BOLT-NEXT:       DW_IDX_die_offset: DW_FORM_ref4
+; BOLT-NEXT:       DW_IDX_parent: DW_FORM_flag_present
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:     Abbreviation [[ABBREV1:0x[0-9a-f]*]] {
 ; BOLT-NEXT:       Tag: DW_TAG_subprogram
 ; BOLT-NEXT:       DW_IDX_compile_unit: DW_FORM_data1
 ; BOLT-NEXT:       DW_IDX_die_offset: DW_FORM_ref4
+; BOLT-NEXT:       DW_IDX_parent: DW_FORM_flag_present
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:     Abbreviation [[ABBREV2:0x[0-9a-f]*]] {
 ; BOLT-NEXT:       Tag: DW_TAG_variable
 ; BOLT-NEXT:       DW_IDX_compile_unit: DW_FORM_data1
 ; BOLT-NEXT:       DW_IDX_die_offset: DW_FORM_ref4
+; BOLT-NEXT:         DW_IDX_parent: DW_FORM_flag_present
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:     Abbreviation [[ABBREV3:0x[0-9a-f]*]] {
 ; BOLT-NEXT:       Tag: DW_TAG_base_type
 ; BOLT-NEXT:       DW_IDX_compile_unit: DW_FORM_data1
 ; BOLT-NEXT:       DW_IDX_die_offset: DW_FORM_ref4
+; BOLT-NEXT:       DW_IDX_parent: DW_FORM_flag_present
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:     Abbreviation [[ABBREV4:0x[0-9a-f]*]] {
 ; BOLT-NEXT:       Tag: DW_TAG_base_type
 ; BOLT-NEXT:       DW_IDX_type_unit: DW_FORM_data1
 ; BOLT-NEXT:       DW_IDX_compile_unit: DW_FORM_data1
 ; BOLT-NEXT:       DW_IDX_die_offset: DW_FORM_ref4
+; BOLT-NEXT:       DW_IDX_parent: DW_FORM_flag_present
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:   ]
 ; BOLT-NEXT:   Bucket 0 [
@@ -88,6 +93,7 @@
 ; BOLT-NEXT:         DW_IDX_type_unit: 0x00
 ; BOLT-NEXT:         DW_IDX_compile_unit: 0x00
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x00000021
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:     Name 2 {
@@ -98,6 +104,7 @@
 ; BOLT-NEXT:         Tag: DW_TAG_subprogram
 ; BOLT-NEXT:         DW_IDX_compile_unit: 0x01
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x00000029
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:     Name 3 {
@@ -108,6 +115,7 @@
 ; BOLT-NEXT:         Tag: DW_TAG_variable
 ; BOLT-NEXT:         DW_IDX_compile_unit: 0x01
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x0000001a
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:   ]
@@ -120,6 +128,7 @@
 ; BOLT-NEXT:         Tag: DW_TAG_base_type
 ; BOLT-NEXT:         DW_IDX_compile_unit: 0x01
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x00000025
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:       Entry @ {{.+}} {
 ; BOLT-NEXT:         Abbrev: 0x5
@@ -127,12 +136,14 @@
 ; BOLT-NEXT:         DW_IDX_type_unit: 0x02
 ; BOLT-NEXT:         DW_IDX_compile_unit: 0x01
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x0000003f
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:       Entry @ {{.+}} {
 ; BOLT-NEXT:         Abbrev: [[ABBREV3]]
 ; BOLT-NEXT:         Tag: DW_TAG_base_type
 ; BOLT-NEXT:         DW_IDX_compile_unit: 0x00
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x00000056
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:   ]
@@ -145,6 +156,7 @@
 ; BOLT-NEXT:         Tag: DW_TAG_subprogram
 ; BOLT-NEXT:         DW_IDX_compile_unit: 0x01
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x00000029
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:     Name 6 {
@@ -156,6 +168,7 @@
 ; BOLT-NEXT:         DW_IDX_type_unit: 0x01
 ; BOLT-NEXT:         DW_IDX_compile_unit: 0x00
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x00000021
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:       Entry @ {{.+}} {
 ; BOLT-NEXT:         Abbrev: [[ABBREV]]
@@ -163,6 +176,7 @@
 ; BOLT-NEXT:         DW_IDX_type_unit: 0x03
 ; BOLT-NEXT:         DW_IDX_compile_unit: 0x01
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x00000021
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:     Name 7 {
@@ -174,6 +188,7 @@
 ; BOLT-NEXT:         DW_IDX_type_unit: 0x02
 ; BOLT-NEXT:         DW_IDX_compile_unit: 0x01
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x00000021
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:   ]
@@ -195,6 +210,7 @@
 ; BOLT-NEXT:         Tag: DW_TAG_subprogram
 ; BOLT-NEXT:         DW_IDX_compile_unit: 0x00
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x0000001a
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:   ]
@@ -208,6 +224,7 @@
 ; BOLT-NEXT:         DW_IDX_type_unit: 0x00
 ; BOLT-NEXT:         DW_IDX_compile_unit: 0x00
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x00000036
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:       Entry @ {{.+}} {
 ; BOLT-NEXT:         Abbrev: 0x5
@@ -215,6 +232,7 @@
 ; BOLT-NEXT:         DW_IDX_type_unit: 0x01
 ; BOLT-NEXT:         DW_IDX_compile_unit: 0x00
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x00000048
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:       Entry @ {{.+}} {
 ; BOLT-NEXT:         Abbrev: 0x5
@@ -222,12 +240,14 @@
 ; BOLT-NEXT:         DW_IDX_type_unit: 0x03
 ; BOLT-NEXT:         DW_IDX_compile_unit: 0x01
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x00000048
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:       Entry @ {{.+}} {
 ; BOLT-NEXT:         Abbrev: [[ABBREV3]]
 ; BOLT-NEXT:         Tag: DW_TAG_base_type
 ; BOLT-NEXT:         DW_IDX_compile_unit: 0x00
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x00000064
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:   ]
diff --git a/bolt/test/X86/dwarf5-df-types-one-cu-debug-names.test b/bolt/test/X86/dwarf5-df-types-one-cu-debug-names.test
index 1c77583f50883f..2a489d86f5d7de 100644
--- a/bolt/test/X86/dwarf5-df-types-one-cu-debug-names.test
+++ b/bolt/test/X86/dwarf5-df-types-one-cu-debug-names.test
@@ -17,7 +17,7 @@
 ; BOLT: [[OFFSET:0x[0-9a-f]*]]: Compile Unit
 ; BOLT:       Name Index @ 0x0 {
 ; BOLT-NEXT:   Header {
-; BOLT-NEXT:     Length: 0xD1
+; BOLT-NEXT:     Length: 0xD9
 ; BOLT-NEXT:     Format: DWARF32
 ; BOLT-NEXT:     Version: 5
 ; BOLT-NEXT:     CU count: 1
@@ -25,7 +25,7 @@
 ; BOLT-NEXT:     Foreign TU count: 2
 ; BOLT-NEXT:     Bucket count: 5
 ; BOLT-NEXT:     Name count: 5
-; BOLT-NEXT:     Abbreviations table size: 0x1D
+; BOLT-NEXT:     Abbreviations table size: 0x25
 ; BOLT-NEXT:     Augmentation: 'BOLT'
 ; BOLT-NEXT:   }
 ; BOLT-NEXT:   Compilation Unit offsets [
@@ -40,19 +40,23 @@
 ; BOLT-NEXT:       Tag: DW_TAG_structure_type
 ; BOLT-NEXT:       DW_IDX_type_unit: DW_FORM_data1
 ; BOLT-NEXT:       DW_IDX_die_offset: DW_FORM_ref4
+; BOLT-NEXT:       DW_IDX_parent: DW_FORM_flag_present
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:     Abbreviation [[ABBREV2:0x[0-9a-f]*]] {
 ; BOLT-NEXT:       Tag: DW_TAG_subprogram
 ; BOLT-NEXT:       DW_IDX_die_offset: DW_FORM_ref4
+; BOLT-NEXT:       DW_IDX_parent: DW_FORM_flag_present
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:     Abbreviation [[ABBREV3:0x[0-9a-f]*]] {
 ; BOLT-NEXT:       Tag: DW_TAG_base_type
 ; BOLT-NEXT:       DW_IDX_die_offset: DW_FORM_ref4
+; BOLT-NEXT:       DW_IDX_parent: DW_FORM_flag_present
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:     Abbreviation [[ABBREV4:0x[0-9a-f]*]] {
 ; BOLT-NEXT:       Tag: DW_TAG_base_type
 ; BOLT-NEXT:       DW_IDX_type_unit: DW_FORM_data1
 ; BOLT-NEXT:       DW_IDX_die_offset: DW_FORM_ref4
+; BOLT-NEXT:       DW_IDX_parent: DW_FORM_flag_present
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:   ]
 ; BOLT-NEXT:   Bucket 0 [
@@ -67,6 +71,7 @@
 ; BOLT-NEXT:         Tag: DW_TAG_structure_type
 ; BOLT-NEXT:         DW_IDX_type_unit: 0x00
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x00000021
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:     Name 2 {
@@ -76,6 +81,7 @@
 ; BOLT-NEXT:         Abbrev: [[ABBREV2]]
 ; BOLT-NEXT:         Tag: DW_TAG_subprogram
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x0000001a
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:   ]
@@ -90,6 +96,7 @@
 ; BOLT-NEXT:         Abbrev: [[ABBREV3]]
 ; BOLT-NEXT:         Tag: DW_TAG_base_type
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x00000056
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:   ]
@@ -102,6 +109,7 @@
 ; BOLT-NEXT:         Tag: DW_TAG_structure_type
 ; BOLT-NEXT:         DW_IDX_type_unit: 0x01
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x00000021
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:     Name 5 {
@@ -112,17 +120,20 @@
 ; BOLT-NEXT:         Tag: DW_TAG_base_type
 ; BOLT-NEXT:         DW_IDX_type_unit: 0x00
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x00000036
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:       Entry @ {{.+}} {
 ; BOLT-NEXT:         Abbrev: [[ABBREV4]]
 ; BOLT-NEXT:         Tag: DW_TAG_base_type
 ; BOLT-NEXT:         DW_IDX_type_unit: 0x01
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x00000048
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:       Entry @ {{.+}} {
 ; BOLT-NEXT:         Abbrev: [[ABBREV3]]
 ; BOLT-NEXT:         Tag: DW_TAG_base_type
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x00000064
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:   ]
diff --git a/bolt/test/X86/dwarf5-one-cu-debug-names.test b/bolt/test/X86/dwarf5-one-cu-debug-names.test
index e7754521d61cbe..74d6b3041c9335 100644
--- a/bolt/test/X86/dwarf5-one-cu-debug-names.test
+++ b/bolt/test/X86/dwarf5-one-cu-debug-names.test
@@ -9,7 +9,7 @@
 ; BOLT: [[OFFSET1:0x[0-9a-f]*]]: Compile Unit
 ; BOLT:       Name Index @ 0x0 {
 ; BOLT-NEXT:   Header {
-; BOLT-NEXT:     Length: 0x13E
+; BOLT-NEXT:     Length: 0x14F
 ; BOLT-NEXT:     Format: DWARF32
 ; BOLT-NEXT:     Version: 5
 ; BOLT-NEXT:     CU count: 1
@@ -17,7 +17,7 @@
 ; BOLT-NEXT:     Foreign TU count: 0
 ; BOLT-NEXT:     Bucket count: 11
 ; BOLT-NEXT:     Name count: 11
-; BOLT-NEXT:     Abbreviations table size: 0x1F
+; BOLT-NEXT:     Abbreviations table size: 0x31
 ; BOLT-NEXT:     Augmentation: 'BOLT'
 ; BOLT-NEXT:   }
 ; BOLT-NEXT:   Compilation Unit offsets [
@@ -27,22 +27,32 @@
 ; BOLT-NEXT:     Abbreviation [[ABBREV1:0x[0-9a-f]*]] {
 ; BOLT-NEXT:       Tag: DW_TAG_structure_type
 ; BOLT-NEXT:       DW_IDX_die_offset: DW_FORM_ref4
+; BOLT-NEXT:       DW_IDX_parent: DW_FORM_flag_present
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:     Abbreviation [[ABBREV2:0x[0-9a-f]*]] {
 ; BOLT-NEXT:       Tag: DW_TAG_base_type
 ; BOLT-NEXT:       DW_IDX_die_offset: DW_FORM_ref4
+; BOLT-NEXT:       DW_IDX_parent: DW_FORM_flag_present
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:     Abbreviation [[ABBREV3:0x[0-9a-f]*]] {
+; BOLT-NEXT:        Tag: DW_TAG_structure_type
+; BOLT-NEXT:        DW_IDX_die_offset: DW_FORM_ref4
+; BOLT-NEXT:        DW_IDX_parent: DW_FORM_ref4
+; BOLT-NEXT:     }
+; BOLT-NEXT:     Abbreviation [[ABBREV4:0x[0-9a-f]*]] {
 ; BOLT-NEXT:       Tag: DW_TAG_variable
 ; BOLT-NEXT:       DW_IDX_die_offset: DW_FORM_ref4
+; BOLT-NEXT:       DW_IDX_parent: DW_FORM_flag_present
 ; BOLT-NEXT:     }
-; BOLT-NEXT:     Abbreviation [[ABBREV4:0x[0-9a-f]*]] {
+; BOLT-NEXT:     Abbreviation [[ABBREV5:0x[0-9a-f]*]] {
 ; BOLT-NEXT:       Tag: DW_TAG_subprogram
 ; BOLT-NEXT:       DW_IDX_die_offset: DW_FORM_ref4
+; BOLT-NEXT:       DW_IDX_parent: DW_FORM_flag_present
 ; BOLT-NEXT:     }
-; BOLT-NEXT:     Abbreviation [[ABBREV5:0x[0-9a-f]*]] {
+; BOLT-NEXT:     Abbreviation [[ABBREV6:0x[0-9a-f]*]] {
 ; BOLT-NEXT:       Tag: DW_TAG_namespace
 ; BOLT-NEXT:       DW_IDX_die_offset: DW_FORM_ref4
+; BOLT-NEXT:       DW_IDX_parent: DW_FORM_flag_present
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:   ]
 ; BOLT-NEXT:   Bucket 0 [
@@ -53,6 +63,7 @@
 ; BOLT-NEXT:         Abbrev: [[ABBREV1]]
 ; BOLT-NEXT:         Tag: DW_TAG_structure_type
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x00000104
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:     Name 2 {
@@ -62,6 +73,7 @@
 ; BOLT-NEXT:         Abbrev: [[ABBREV2]]
 ; BOLT-NEXT:         Tag: DW_TAG_base_type
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x000000c5
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:   ]
@@ -73,6 +85,7 @@
 ; BOLT-NEXT:         Abbrev: [[ABBREV1]]
 ; BOLT-NEXT:         Tag: DW_TAG_structure_type
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x000000c9
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:     Name 4 {
@@ -82,6 +95,7 @@
 ; BOLT-NEXT:         Abbrev: [[ABBREV1]]
 ; BOLT-NEXT:         Tag: DW_TAG_structure_type
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x0000003f
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:   ]
@@ -93,6 +107,7 @@
 ; BOLT-NEXT:         Abbrev: [[ABBREV1]]
 ; BOLT-NEXT:         Tag: DW_TAG_structure_type
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x000000eb
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:   ]
@@ -107,18 +122,20 @@
 ; BOLT-NEXT:       Hash: 0x59796A
 ; BOLT-NEXT:       String: {{.+}} "t1"
 ; BOLT-NEXT:       Entry @ {{.+}} {
-; BOLT-NEXT:         Abbrev: [[ABBREV1]]
+; BOLT-NEXT:         Abbrev: [[ABBREV3]]
 ; BOLT-NEXT:         Tag: DW_TAG_structure_type
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x00000062
+; BOLT-NEXT:         DW_IDX_parent: Entry @ 0x14d
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:     Name 7 {
 ; BOLT-NEXT:       Hash: 0x5979AC
 ; BOLT-NEXT:       String: {{.+}} "v1"
 ; BOLT-NEXT:       Entry @ {{.+}} {
-; BOLT-NEXT:         Abbrev: [[ABBREV3]]
+; BOLT-NEXT:         Abbrev: [[ABBREV4]]
 ; BOLT-NEXT:         Tag: DW_TAG_variable
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x00000024
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:   ]
@@ -130,6 +147,7 @@
 ; BOLT-NEXT:         Abbrev: [[ABBREV2]]
 ; BOLT-NEXT:         Tag: DW_TAG_base_type
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x0000005d
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:   ]
@@ -141,15 +159,17 @@
 ; BOLT-NEXT:         Abbrev: [[ABBREV1]]
 ; BOLT-NEXT:         Tag: DW_TAG_structure_type
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x0000002f
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:     Name 10 {
 ; BOLT-NEXT:       Hash: 0x7C9A7F6A
 ; BOLT-NEXT:       String: {{.+}} "main"
 ; BOLT-NEXT:       Entry @ {{.+}} {
-; BOLT-NEXT:         Abbrev: [[ABBREV4]]
+; BOLT-NEXT:         Abbrev: [[ABBREV5]]
 ; BOLT-NEXT:         Tag: DW_TAG_subprogram
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x00000073
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:   ]
@@ -157,15 +177,11 @@
 ; BOLT-NEXT:     Name 11 {
 ; BOLT-NEXT:       Hash: 0x8CFC710C
 ; BOLT-NEXT:       String: {{.+}} "(anonymous namespace)"
-; BOLT-NEXT:       Entry @ {{.+}} {
-; BOLT-NEXT:         Abbrev: [[ABBREV5]]
-; BOLT-NEXT:         Tag: DW_TAG_namespace
-; BOLT-NEXT:         DW_IDX_die_offset: 0x00000061
-; BOLT-NEXT:       }
-; BOLT-NEXT:       Entry @ {{.+}} {
-; BOLT-NEXT:         Abbrev: [[ABBREV5]]
+; BOLT-NEXT:       Entry @ 0x14d {
+; BOLT-NEXT:         Abbrev: [[ABBREV6]]
 ; BOLT-NEXT:         Tag: DW_TAG_namespace
 ; BOLT-NEXT:         DW_IDX_die_offset: 0x00000061
+; BOLT-NEXT:         DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:       }
 ; BOLT-NEXT:     }
 ; BOLT-NEXT:   ]
diff --git a/bolt/test/X86/dwarf5-types-debug-names.test b/bolt/test/X86/dwarf5-types-debug-names.test
index 6a26477d4cdb55..94624298e289de 100644
--- a/bolt/test/X86/dwarf5-types-debug-names.test
+++ b/bolt/test/X86/dwarf5-types-debug-names.test
@@ -14,7 +14,7 @@
 
 ; BOLT: Name Index @ 0x0 {
 ; BOLT:   Header {
-; BOLT:     Length: 0xE1
+; BOLT:     Length: 0xE9
 ; BOLT:     Format: DWARF32
 ; BOLT:     Version: 5
 ; BOLT:     CU count: 2
@@ -22,7 +22,7 @@
 ; BOLT:     Foreign TU count: 0
 ; BOLT:     Bucket count: 6
 ; BOLT:     Name count: 6
-; BOLT:     Abbreviations table size: 0x21
+; BOLT:     Abbreviations table size: 0x29
 ; BOLT:     Augmentation: 'BOLT'
 ; BOLT:   }
 ; BOLT:   Compilation Unit offsets [
@@ -37,21 +37,25 @@
 ; BOLT:       Tag: DW_TAG_structure_type
 ; BOLT:       DW_IDX_type_unit: DW_FORM_data1
 ; BOLT:       DW_IDX_die_offset: DW_FORM_ref4
+; BOLT:       DW_IDX_parent: DW_FORM_flag_present
 ; BOLT:     }
 ; BOLT:     Abbreviation [[ABBREV1:0x[0-9a-f]*]] {
 ; BOLT:       Tag: DW_TAG_subprogram
 ; BOLT:       DW_IDX_compile_unit: DW_FORM_data1
 ; BOLT:       DW_IDX_die_offset: DW_FORM_ref4
+; BOLT:       DW_IDX_parent: DW_FORM_flag_present
 ; BOLT:     }
 ; BOLT:     Abbreviation [[ABBREV2:0x[0-9a-f]*]] {
 ; BOLT:       Tag: DW_TAG_base_type
 ; BOLT:       DW_IDX_compile_unit: DW_FORM_data1
 ; BOLT:       DW_IDX_die_offset: DW_FORM_ref4
+; BOLT:       DW_IDX_parent: DW_FORM_flag_present
 ; BOLT:     }
 ; BOLT:     Abbreviation [[ABBREV3:0x[0-9a-f]*]] {
 ; BOLT:       Tag: DW_TAG_base_type
 ; BOLT:       DW_IDX_type_unit: DW_FORM_data1
 ; BOLT:       DW_IDX_die_offset: DW_FORM_ref4
+; BOLT:       DW_IDX_parent: DW_FORM_flag_present
 ; BOLT:     }
 ; BOLT:   ]
 ; BOLT:   Bucket 0 [
@@ -63,6 +67,7 @@
 ; BOLT:         Tag: DW_TAG_structure_type
 ; BOLT:         DW_IDX_type_unit: 0x00
 ; BOLT:         DW_IDX_die_offset: 0x00000023
+; BOLT:         DW_IDX_parent: <parent not indexed>
 ; BOLT:       }
 ; BOLT:     }
 ; BOLT:   ]
@@ -75,6 +80,7 @@
 ; BOLT:         Tag: DW_TAG_subprogram
 ; BOLT:         DW_IDX_compile_unit: 0x01
 ; BOLT:         DW_IDX_die_offset: 0x00000024
+; BOLT:         DW_IDX_parent: <parent not indexed>
 ; BOLT:       }
 ; BOLT:     }
 ; BOLT:   ]
@@ -87,6 +93,7 @@
 ; BOLT:         Tag: DW_TAG_base_type
 ; BOLT:         DW_IDX_compile_unit: 0x01
 ; BOLT:         DW_IDX_die_offset: 0x00000040
+; BOLT:         DW_IDX_parent: <parent not indexed>
 ; BOLT:       }
 ; BOLT:     }
 ; BOLT:   ]
@@ -99,6 +106,7 @@
 ; BOLT:         Tag: DW_TAG_subprogram
 ; BOLT:         DW_IDX_compile_unit: 0x01
 ; BOLT:         DW_IDX_die_offset: 0x00000024
+; BOLT:         DW_IDX_parent: <parent not indexed>
 ; BOLT:       }
 ; BOLT:     }
 ; BOLT:   ]
@@ -111,6 +119,7 @@
 ; BOLT:         Tag: DW_TAG_subprogram
 ; BOLT:         DW_IDX_compile_unit: 0x00
 ; BOLT:         DW_IDX_die_offset: 0x00000024
+; BOLT:         DW_IDX_parent: <parent not indexed>
 ; BOLT:       }
 ; BOLT:     }
 ; BOLT:   ]
@@ -123,6 +132,7 @@
 ; BOLT:         Tag: DW_TAG_base_type
 ; BOLT:         DW_IDX_type_unit: 0x00
 ; BOLT:         DW_IDX_die_offset: 0x00000038
+; BOLT:         DW_IDX_parent: <parent not indexed>
 ; BOLT:       }
 ; BOLT:     }
 ; BOLT:   ]
diff --git a/bolt/test/X86/dwarf5-types-one-cu-debug-names.test b/bolt/test/X86/dwarf5-types-one-cu-debug-names.test
index 00a1319a557869..02c0ef81351f1f 100644
--- a/bolt/test/X86/dwarf5-types-one-cu-debug-names.test
+++ b/bolt/test/X86/dwarf5-types-one-cu-debug-names.test
@@ -11,7 +11,7 @@
 
 ; BOLT:Name Index @ 0x0 {
 ; BOLT-NEXT:  Header {
-; BOLT-NEXT:    Length: 0xA3
+; BOLT-NEXT:    Length: 0xAB
 ; BOLT-NEXT:    Format: DWARF32
 ; BOLT-NEXT:    Version: 5
 ; BOLT-NEXT:    CU count: 1
@@ -19,7 +19,7 @@
 ; BOLT-NEXT:    Foreign TU count: 0
 ; BOLT-NEXT:    Bucket count: 4
 ; BOLT-NEXT:    Name count: 4
-; BOLT-NEXT:    Abbreviations table size: 0x1D
+; BOLT-NEXT:    Abbreviations table size: 0x25
 ; BOLT-NEXT:    Augmentation: 'BOLT'
 ; BOLT-NEXT:  }
 ; BOLT-NEXT:  Compilation Unit offsets [
@@ -32,20 +32,24 @@
 ; BOLT-NEXT:    Abbreviation [[ABBREV:0x[0-9a-f]*]] {
 ; BOLT-NEXT:      Tag: DW_TAG_base_type
 ; BOLT-NEXT:      DW_IDX_die_offset: DW_FORM_ref4
+; BOLT-NEXT:      DW_IDX_parent: DW_FORM_flag_present
 ; BOLT-NEXT:    }
 ; BOLT-NEXT:    Abbreviation [[ABBREV1:0x[0-9a-f]*]] {
 ; BOLT-NEXT:      Tag: DW_TAG_structure_type
 ; BOLT-NEXT:      DW_IDX_type_unit: DW_FORM_data1
 ; BOLT-NEXT:      DW_IDX_die_offset: DW_FORM_ref4
+; BOLT-NEXT:      DW_IDX_parent: DW_FORM_flag_present
 ; BOLT-NEXT:    }
 ; BOLT-NEXT:    Abbreviation [[ABBREV2:0x[0-9a-f]*]] {
 ; BOLT-NEXT:      Tag: DW_TAG_subprogram
 ; BOLT-NEXT:      DW_IDX_die_offset: DW_FORM_ref4
+; BOLT-NEXT:      DW_IDX_parent: DW_FORM_flag_present
 ; BOLT-NEXT:    }
 ; BOLT-NEXT:    Abbreviation [[ABBREV3:0x[0-9a-f]*]] {
 ; BOLT-NEXT:      Tag: DW_TAG_base_type
 ; BOLT-NEXT:      DW_IDX_type_unit: DW_FORM_data1
 ; BOLT-NEXT:      DW_IDX_die_offset: DW_FORM_ref4
+; BOLT-NEXT:      DW_IDX_parent: DW_FORM_flag_present
 ; BOLT-NEXT:    }
 ; BOLT-NEXT:  ]
 ; BOLT-NEXT:  Bucket 0 [
@@ -56,6 +60,7 @@
 ; BOLT-NEXT:        Abbrev: [[ABBREV]]
 ; BOLT-NEXT:        Tag: DW_TAG_base_type
 ; BOLT-NEXT:        DW_IDX_die_offset: 0x0000003f
+; BOLT-NEXT:        DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:      }
 ; BOLT-NEXT:    }
 ; BOLT-NEXT:    Name 2 {
@@ -66,6 +71,7 @@
 ; BOLT-NEXT:        Tag: DW_TAG_structure_type
 ; BOLT-NEXT:        DW_IDX_type_unit: 0x00
 ; BOLT-NEXT:        DW_IDX_die_offset: 0x00000023
+; BOLT-NEXT:        DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:      }
 ; BOLT-NEXT:    }
 ; BOLT-NEXT:  ]
@@ -80,6 +86,7 @@
 ; BOLT-NEXT:        Abbrev: [[ABBREV2]]
 ; BOLT-NEXT:        Tag: DW_TAG_subprogram
 ; BOLT-NEXT:        DW_IDX_die_offset: 0x00000024
+; BOLT-NEXT:        DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:      }
 ; BOLT-NEXT:    }
 ; BOLT-NEXT:  ]
@@ -92,6 +99,7 @@
 ; BOLT-NEXT:        Tag: DW_TAG_base_type
 ; BOLT-NEXT:        DW_IDX_type_unit: 0x00
 ; BOLT-NEXT:        DW_IDX_die_offset: 0x00000038
+; BOLT-NEXT:        DW_IDX_parent: <parent not indexed>
 ; BOLT-NEXT:      }
 ; BOLT-NEXT:    }
 ; BOLT-NEXT:  ]
diff --git a/bolt/test/X86/reader-stale-yaml.test b/bolt/test/X86/reader-stale-yaml.test
index 533ed917c1aeb0..f4a8865b1f9a46 100644
--- a/bolt/test/X86/reader-stale-yaml.test
+++ b/bolt/test/X86/reader-stale-yaml.test
@@ -14,6 +14,10 @@ RUN:   --profile-ignore-hash=1 --profile-use-dfs=0 --debug-only=bolt-prof 2>&1 |
 RUN: llvm-bolt %t.exe -o %t.null --b %p/Inputs/blarge_profile_stale.yaml \
 RUN:   --print-cfg --print-only=SolveCubic --infer-stale-profile=1 \
 RUN:   --profile-ignore-hash=1 --profile-use-dfs=0 --debug-only=bolt-prof 2>&1 | FileCheck %s -check-prefix=CHECK2
+# Testing skipped function
+RUN: llvm-bolt %t.exe -o %t.null --b %p/Inputs/blarge_profile_stale.yaml \
+RUN:   --print-cfg --print-only=usqrt --infer-stale-profile=1 --skip-funcs=usqrt \
+RUN:   --profile-ignore-hash=1 --profile-use-dfs=0
 
 CHECK0: BOLT-INFO: 2 out of 7 functions in the binary (28.6%) have non-empty execution profile
 CHECK0: BOLT-WARNING: 2 (100.0% of all profiled) functions have invalid (possibly stale) profile
diff --git a/bolt/unittests/Core/MCPlusBuilder.cpp b/bolt/unittests/Core/MCPlusBuilder.cpp
index 63448039c53e67..daf9f392b82281 100644
--- a/bolt/unittests/Core/MCPlusBuilder.cpp
+++ b/bolt/unittests/Core/MCPlusBuilder.cpp
@@ -134,9 +134,8 @@ TEST_P(MCPlusBuilderTester, ReplaceRegWithImm) {
 
 TEST_P(MCPlusBuilderTester, Annotation) {
   MCInst Inst;
-  bool Success = BC->MIB->createTailCall(Inst, BC->Ctx->createNamedTempSymbol(),
-                                         BC->Ctx.get());
-  ASSERT_TRUE(Success);
+  BC->MIB->createTailCall(Inst, BC->Ctx->createNamedTempSymbol(),
+                          BC->Ctx.get());
   MCSymbol *LPSymbol = BC->Ctx->createNamedTempSymbol("LP");
   uint64_t Value = INT32_MIN;
   // Test encodeAnnotationImm using this indirect way
@@ -151,9 +150,8 @@ TEST_P(MCPlusBuilderTester, Annotation) {
   // Large int64 should trigger an out of range assertion
   Value = 0x1FF'FFFF'FFFF'FFFFULL;
   Inst.clear();
-  Success = BC->MIB->createTailCall(Inst, BC->Ctx->createNamedTempSymbol(),
-                                    BC->Ctx.get());
-  ASSERT_TRUE(Success);
+  BC->MIB->createTailCall(Inst, BC->Ctx->createNamedTempSymbol(),
+                          BC->Ctx.get());
   ASSERT_DEATH(BC->MIB->addEHInfo(Inst, MCPlus::MCLandingPad(LPSymbol, Value)),
                "annotation value out of range");
 }
diff --git a/clang-tools-extra/clang-tidy/bugprone/BugproneTidyModule.cpp b/clang-tools-extra/clang-tidy/bugprone/BugproneTidyModule.cpp
index e518a64abc52eb..2931325d8b5798 100644
--- a/clang-tools-extra/clang-tidy/bugprone/BugproneTidyModule.cpp
+++ b/clang-tools-extra/clang-tidy/bugprone/BugproneTidyModule.cpp
@@ -73,6 +73,7 @@
 #include "SuspiciousReallocUsageCheck.h"
 #include "SuspiciousSemicolonCheck.h"
 #include "SuspiciousStringCompareCheck.h"
+#include "SuspiciousStringviewDataUsageCheck.h"
 #include "SwappedArgumentsCheck.h"
 #include "SwitchMissingDefaultCaseCheck.h"
 #include "TerminatingContinueCheck.h"
@@ -218,6 +219,8 @@ class BugproneModule : public ClangTidyModule {
         "bugprone-suspicious-semicolon");
     CheckFactories.registerCheck<SuspiciousStringCompareCheck>(
         "bugprone-suspicious-string-compare");
+    CheckFactories.registerCheck<SuspiciousStringviewDataUsageCheck>(
+        "bugprone-suspicious-stringview-data-usage");
     CheckFactories.registerCheck<SwappedArgumentsCheck>(
         "bugprone-swapped-arguments");
     CheckFactories.registerCheck<TerminatingContinueCheck>(
diff --git a/clang-tools-extra/clang-tidy/bugprone/CMakeLists.txt b/clang-tools-extra/clang-tidy/bugprone/CMakeLists.txt
index 638fba03a43587..081ba67efe1538 100644
--- a/clang-tools-extra/clang-tidy/bugprone/CMakeLists.txt
+++ b/clang-tools-extra/clang-tidy/bugprone/CMakeLists.txt
@@ -26,6 +26,7 @@ add_clang_library(clangTidyBugproneModule
   ImplicitWideningOfMultiplicationResultCheck.cpp
   InaccurateEraseCheck.cpp
   IncorrectEnableIfCheck.cpp
+  SuspiciousStringviewDataUsageCheck.cpp
   SwitchMissingDefaultCaseCheck.cpp
   IncDecInConditionsCheck.cpp
   IncorrectRoundingsCheck.cpp
diff --git a/clang-tools-extra/clang-tidy/bugprone/SuspiciousStringviewDataUsageCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/SuspiciousStringviewDataUsageCheck.cpp
new file mode 100644
index 00000000000000..8f4b0c5e0dceda
--- /dev/null
+++ b/clang-tools-extra/clang-tidy/bugprone/SuspiciousStringviewDataUsageCheck.cpp
@@ -0,0 +1,97 @@
+//===--- SuspiciousStringviewDataUsageCheck.cpp - clang-tidy --------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "SuspiciousStringviewDataUsageCheck.h"
+#include "../utils/Matchers.h"
+#include "../utils/OptionsUtils.h"
+#include "clang/AST/ASTContext.h"
+#include "clang/ASTMatchers/ASTMatchFinder.h"
+
+using namespace clang::ast_matchers;
+
+namespace clang::tidy::bugprone {
+
+SuspiciousStringviewDataUsageCheck::SuspiciousStringviewDataUsageCheck(
+    StringRef Name, ClangTidyContext *Context)
+    : ClangTidyCheck(Name, Context),
+      StringViewTypes(utils::options::parseStringList(Options.get(
+          "StringViewTypes", "::std::basic_string_view;::llvm::StringRef"))),
+      AllowedCallees(
+          utils::options::parseStringList(Options.get("AllowedCallees", ""))) {}
+
+void SuspiciousStringviewDataUsageCheck::storeOptions(
+    ClangTidyOptions::OptionMap &Opts) {
+  Options.store(Opts, "StringViewTypes",
+                utils::options::serializeStringList(StringViewTypes));
+  Options.store(Opts, "AllowedCallees",
+                utils::options::serializeStringList(AllowedCallees));
+}
+
+bool SuspiciousStringviewDataUsageCheck::isLanguageVersionSupported(
+    const LangOptions &LangOpts) const {
+  return LangOpts.CPlusPlus;
+}
+
+std::optional<TraversalKind>
+SuspiciousStringviewDataUsageCheck::getCheckTraversalKind() const {
+  return TK_AsIs;
+}
+
+void SuspiciousStringviewDataUsageCheck::registerMatchers(MatchFinder *Finder) {
+
+  auto AncestorCall = anyOf(
+      cxxConstructExpr(), callExpr(unless(cxxOperatorCallExpr())), lambdaExpr(),
+      initListExpr(
+          hasType(qualType(hasCanonicalType(hasDeclaration(recordDecl()))))));
+
+  auto DataMethod =
+      cxxMethodDecl(hasName("data"),
+                    ofClass(matchers::matchesAnyListedName(StringViewTypes)));
+
+  auto SizeCall = cxxMemberCallExpr(
+      callee(cxxMethodDecl(hasAnyName("size", "length"))),
+      on(ignoringParenImpCasts(
+          matchers::isStatementIdenticalToBoundNode("self"))));
+
+  auto DescendantSizeCall = expr(hasDescendant(
+      expr(SizeCall, hasAncestor(expr(AncestorCall).bind("ancestor-size")),
+           hasAncestor(expr(equalsBoundNode("parent"),
+                            equalsBoundNode("ancestor-size"))))));
+
+  Finder->addMatcher(
+      cxxMemberCallExpr(
+          on(ignoringParenImpCasts(expr().bind("self"))), callee(DataMethod),
+          expr().bind("data-call"),
+          hasParent(expr(anyOf(
+              invocation(
+                  expr().bind("parent"), unless(cxxOperatorCallExpr()),
+                  hasAnyArgument(
+                      ignoringParenImpCasts(equalsBoundNode("data-call"))),
+                  unless(hasAnyArgument(ignoringParenImpCasts(SizeCall))),
+                  unless(hasAnyArgument(DescendantSizeCall)),
+                  hasDeclaration(namedDecl(
+                      unless(matchers::matchesAnyListedName(AllowedCallees))))),
+              initListExpr(expr().bind("parent"),
+                           hasType(qualType(hasCanonicalType(hasDeclaration(
+                               recordDecl(unless(matchers::matchesAnyListedName(
+                                   AllowedCallees))))))),
+                           unless(DescendantSizeCall)))))),
+      this);
+}
+
+void SuspiciousStringviewDataUsageCheck::check(
+    const MatchFinder::MatchResult &Result) {
+  const auto *DataCallExpr =
+      Result.Nodes.getNodeAs<CXXMemberCallExpr>("data-call");
+  diag(DataCallExpr->getExprLoc(),
+       "result of a `data()` call may not be null terminated, provide size "
+       "information to the callee to prevent potential issues")
+      << DataCallExpr->getCallee()->getSourceRange();
+}
+
+} // namespace clang::tidy::bugprone
diff --git a/clang-tools-extra/clang-tidy/bugprone/SuspiciousStringviewDataUsageCheck.h b/clang-tools-extra/clang-tidy/bugprone/SuspiciousStringviewDataUsageCheck.h
new file mode 100644
index 00000000000000..31eca0a48722fe
--- /dev/null
+++ b/clang-tools-extra/clang-tidy/bugprone/SuspiciousStringviewDataUsageCheck.h
@@ -0,0 +1,38 @@
+//===--- SuspiciousStringviewDataUsageCheck.h - clang-tidy -------//C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_SUSPICIOUSSTRINGVIEWDATAUSAGECHECK_H
+#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_SUSPICIOUSSTRINGVIEWDATAUSAGECHECK_H
+
+#include "../ClangTidyCheck.h"
+
+namespace clang::tidy::bugprone {
+
+/// Identifies suspicious usages of std::string_view::data() that could lead to
+/// reading out-of-bounds data due to inadequate or incorrect string null
+/// termination.
+///
+/// For the user-facing documentation see:
+/// http://clang.llvm.org/extra/clang-tidy/checks/bugprone/suspicious-stringview-data-usage.html
+class SuspiciousStringviewDataUsageCheck : public ClangTidyCheck {
+public:
+  SuspiciousStringviewDataUsageCheck(StringRef Name, ClangTidyContext *Context);
+  void registerMatchers(ast_matchers::MatchFinder *Finder) override;
+  void check(const ast_matchers::MatchFinder::MatchResult &Result) override;
+  void storeOptions(ClangTidyOptions::OptionMap &Opts) override;
+  bool isLanguageVersionSupported(const LangOptions &LangOpts) const override;
+  std::optional<TraversalKind> getCheckTraversalKind() const override;
+
+private:
+  std::vector<llvm::StringRef> StringViewTypes;
+  std::vector<llvm::StringRef> AllowedCallees;
+};
+
+} // namespace clang::tidy::bugprone
+
+#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_SUSPICIOUSSTRINGVIEWDATAUSAGECHECK_H
diff --git a/clang-tools-extra/clang-tidy/bugprone/UnusedReturnValueCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/UnusedReturnValueCheck.cpp
index 243fe47c2036b6..73373147e96fc9 100644
--- a/clang-tools-extra/clang-tidy/bugprone/UnusedReturnValueCheck.cpp
+++ b/clang-tools-extra/clang-tidy/bugprone/UnusedReturnValueCheck.cpp
@@ -31,9 +31,16 @@ AST_MATCHER_P(FunctionDecl, isInstantiatedFrom, Matcher<FunctionDecl>,
                               Finder, Builder);
 }
 
-AST_MATCHER_P(CXXMethodDecl, isOperatorOverloading,
-              llvm::SmallVector<OverloadedOperatorKind>, Kinds) {
-  return llvm::is_contained(Kinds, Node.getOverloadedOperator());
+constexpr std::initializer_list<OverloadedOperatorKind>
+    AssignmentOverloadedOperatorKinds = {
+        OO_Equal,      OO_PlusEqual,     OO_MinusEqual,          OO_StarEqual,
+        OO_SlashEqual, OO_PercentEqual,  OO_CaretEqual,          OO_AmpEqual,
+        OO_PipeEqual,  OO_LessLessEqual, OO_GreaterGreaterEqual, OO_PlusPlus,
+        OO_MinusMinus};
+
+AST_MATCHER(FunctionDecl, isAssignmentOverloadedOperator) {
+  return llvm::is_contained(AssignmentOverloadedOperatorKinds,
+                            Node.getOverloadedOperator());
 }
 } // namespace
 
@@ -164,22 +171,18 @@ void UnusedReturnValueCheck::storeOptions(ClangTidyOptions::OptionMap &Opts) {
 }
 
 void UnusedReturnValueCheck::registerMatchers(MatchFinder *Finder) {
-  auto MatchedDirectCallExpr = expr(
-      callExpr(
-          callee(functionDecl(
-              // Don't match void overloads of checked functions.
-              unless(returns(voidType())),
-              // Don't match copy or move assignment operator.
-              unless(cxxMethodDecl(isOperatorOverloading(
-                  {OO_Equal, OO_PlusEqual, OO_MinusEqual, OO_StarEqual,
-                   OO_SlashEqual, OO_PercentEqual, OO_CaretEqual, OO_AmpEqual,
-                   OO_PipeEqual, OO_LessLessEqual, OO_GreaterGreaterEqual}))),
-              anyOf(
-                  isInstantiatedFrom(
-                      matchers::matchesAnyListedName(CheckedFunctions)),
-                  returns(hasCanonicalType(hasDeclaration(namedDecl(
-                      matchers::matchesAnyListedName(CheckedReturnTypes)))))))))
-          .bind("match"));
+  auto MatchedDirectCallExpr =
+      expr(callExpr(callee(functionDecl(
+                        // Don't match copy or move assignment operator.
+                        unless(isAssignmentOverloadedOperator()),
+                        // Don't match void overloads of checked functions.
+                        unless(returns(voidType())),
+                        anyOf(isInstantiatedFrom(matchers::matchesAnyListedName(
+                                  CheckedFunctions)),
+                              returns(hasCanonicalType(hasDeclaration(
+                                  namedDecl(matchers::matchesAnyListedName(
+                                      CheckedReturnTypes)))))))))
+               .bind("match"));
 
   auto CheckCastToVoid =
       AllowCastToVoid ? castExpr(unless(hasCastKind(CK_ToVoid))) : castExpr();
diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/OwningMemoryCheck.cpp b/clang-tools-extra/clang-tidy/cppcoreguidelines/OwningMemoryCheck.cpp
index 9215b833573afd..9b4d2ef99e5bf1 100644
--- a/clang-tools-extra/clang-tidy/cppcoreguidelines/OwningMemoryCheck.cpp
+++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/OwningMemoryCheck.cpp
@@ -19,6 +19,17 @@ using namespace clang::ast_matchers::internal;
 
 namespace clang::tidy::cppcoreguidelines {
 
+namespace {
+AST_MATCHER_P(LambdaExpr, hasCallOperator, Matcher<CXXMethodDecl>,
+              InnerMatcher) {
+  return InnerMatcher.matches(*Node.getCallOperator(), Finder, Builder);
+}
+
+AST_MATCHER_P(LambdaExpr, hasLambdaBody, Matcher<Stmt>, InnerMatcher) {
+  return InnerMatcher.matches(*Node.getBody(), Finder, Builder);
+}
+} // namespace
+
 void OwningMemoryCheck::storeOptions(ClangTidyOptions::OptionMap &Opts) {
   Options.store(Opts, "LegacyResourceProducers", LegacyResourceProducers);
   Options.store(Opts, "LegacyResourceConsumers", LegacyResourceConsumers);
@@ -55,6 +66,8 @@ void OwningMemoryCheck::registerMatchers(MatchFinder *Finder) {
             CreatesLegacyOwner, LegacyOwnerCast);
 
   const auto ConsideredOwner = eachOf(IsOwnerType, CreatesOwner);
+  const auto ScopeDeclaration = anyOf(translationUnitDecl(), namespaceDecl(),
+                                      recordDecl(), functionDecl());
 
   // Find delete expressions that delete non-owners.
   Finder->addMatcher(
@@ -144,13 +157,51 @@ void OwningMemoryCheck::registerMatchers(MatchFinder *Finder) {
                              .bind("bad_owner_creation_parameter"))),
                      this);
 
+  auto IsNotInSubLambda = stmt(
+      hasAncestor(
+          stmt(anyOf(equalsBoundNode("body"), lambdaExpr())).bind("scope")),
+      hasAncestor(stmt(equalsBoundNode("scope"), equalsBoundNode("body"))));
+
   // Matching on functions, that return an owner/resource, but don't declare
   // their return type as owner.
   Finder->addMatcher(
-      functionDecl(hasDescendant(returnStmt(hasReturnValue(ConsideredOwner))
-                                     .bind("bad_owner_return")),
-                   unless(returns(qualType(hasDeclaration(OwnerDecl)))))
-          .bind("function_decl"),
+      functionDecl(
+          decl().bind("function_decl"),
+          hasBody(
+              stmt(stmt().bind("body"),
+                   hasDescendant(
+                       returnStmt(hasReturnValue(ConsideredOwner),
+                                  // Ignore sub-lambda expressions
+                                  IsNotInSubLambda,
+                                  // Ignore sub-functions
+                                  hasAncestor(functionDecl().bind("context")),
+                                  hasAncestor(functionDecl(
+                                      equalsBoundNode("context"),
+                                      equalsBoundNode("function_decl"))))
+                           .bind("bad_owner_return")))),
+          returns(qualType(unless(hasDeclaration(OwnerDecl))).bind("result"))),
+      this);
+
+  // Matching on lambdas, that return an owner/resource, but don't declare
+  // their return type as owner.
+  Finder->addMatcher(
+      lambdaExpr(
+          hasAncestor(decl(ScopeDeclaration).bind("scope-decl")),
+          hasLambdaBody(
+              stmt(stmt().bind("body"),
+                   hasDescendant(
+                       returnStmt(
+                           hasReturnValue(ConsideredOwner),
+                           // Ignore sub-lambdas
+                           IsNotInSubLambda,
+                           // Ignore sub-functions
+                           hasAncestor(decl(ScopeDeclaration).bind("context")),
+                           hasAncestor(decl(equalsBoundNode("context"),
+                                            equalsBoundNode("scope-decl"))))
+                           .bind("bad_owner_return")))),
+          hasCallOperator(returns(
+              qualType(unless(hasDeclaration(OwnerDecl))).bind("result"))))
+          .bind("lambda"),
       this);
 
   // Match on classes that have an owner as member, but don't declare a
@@ -329,7 +380,7 @@ bool OwningMemoryCheck::handleReturnValues(const BoundNodes &Nodes) {
   // Function return statements, that are owners/resources, but the function
   // declaration does not declare its return value as owner.
   const auto *BadReturnType = Nodes.getNodeAs<ReturnStmt>("bad_owner_return");
-  const auto *Function = Nodes.getNodeAs<FunctionDecl>("function_decl");
+  const auto *ResultType = Nodes.getNodeAs<QualType>("result");
 
   // Function return values, that should be owners but aren't.
   if (BadReturnType) {
@@ -338,8 +389,9 @@ bool OwningMemoryCheck::handleReturnValues(const BoundNodes &Nodes) {
     diag(BadReturnType->getBeginLoc(),
          "returning a newly created resource of "
          "type %0 or 'gsl::owner<>' from a "
-         "function whose return type is not 'gsl::owner<>'")
-        << Function->getReturnType() << BadReturnType->getSourceRange();
+         "%select{function|lambda}1 whose return type is not 'gsl::owner<>'")
+        << *ResultType << (Nodes.getNodeAs<Expr>("lambda") != nullptr)
+        << BadReturnType->getSourceRange();
 
     // FIXME: Rewrite the return type as 'gsl::owner<OriginalType>'
     return true;
diff --git a/clang-tools-extra/clang-tidy/readability/ImplicitBoolConversionCheck.cpp b/clang-tools-extra/clang-tidy/readability/ImplicitBoolConversionCheck.cpp
index 4f02950e7794cb..74152c6034510b 100644
--- a/clang-tools-extra/clang-tidy/readability/ImplicitBoolConversionCheck.cpp
+++ b/clang-tools-extra/clang-tidy/readability/ImplicitBoolConversionCheck.cpp
@@ -81,8 +81,7 @@ void fixGenericExprCastToBool(DiagnosticBuilder &Diag,
 
   const Expr *SubExpr = Cast->getSubExpr();
 
-  bool NeedInnerParens =
-      SubExpr != nullptr && utils::fixit::areParensNeededForStatement(*SubExpr);
+  bool NeedInnerParens = utils::fixit::areParensNeededForStatement(*SubExpr);
   bool NeedOuterParens =
       Parent != nullptr && utils::fixit::areParensNeededForStatement(*Parent);
 
diff --git a/clang-tools-extra/clangd/CodeComplete.cpp b/clang-tools-extra/clangd/CodeComplete.cpp
index 036eb9808ea082..9e321dce4c5041 100644
--- a/clang-tools-extra/clangd/CodeComplete.cpp
+++ b/clang-tools-extra/clangd/CodeComplete.cpp
@@ -619,7 +619,8 @@ struct CodeCompletionBuilder {
     }
     // 'CompletionItemKind::Interface' matches template type aliases.
     if (Completion.Kind == CompletionItemKind::Interface ||
-        Completion.Kind == CompletionItemKind::Class) {
+        Completion.Kind == CompletionItemKind::Class ||
+        Completion.Kind == CompletionItemKind::Variable) {
       if (Snippet->front() != '<')
         return *Snippet; // Not an arg snippet?
 
diff --git a/clang-tools-extra/clangd/unittests/CodeCompleteTests.cpp b/clang-tools-extra/clangd/unittests/CodeCompleteTests.cpp
index 5721feecd58ea8..49337bddf98d5d 100644
--- a/clang-tools-extra/clangd/unittests/CodeCompleteTests.cpp
+++ b/clang-tools-extra/clangd/unittests/CodeCompleteTests.cpp
@@ -2648,12 +2648,15 @@ TEST(CompletionTest, CompletionFunctionArgsDisabled) {
       class foo_class{};
       template <class T>
       using foo_alias = T**;
+      template <class T>
+      T foo_var = T{};
       void f() { foo_^ })cpp",
         {}, Opts);
     EXPECT_THAT(
         Results.Completions,
         UnorderedElementsAre(AllOf(named("foo_class"), snippetSuffix("<$0>")),
-                             AllOf(named("foo_alias"), snippetSuffix("<$0>"))));
+                             AllOf(named("foo_alias"), snippetSuffix("<$0>")),
+                             AllOf(named("foo_var"), snippetSuffix("<$0>"))));
   }
   {
     auto Results = completions(
diff --git a/clang-tools-extra/clangd/unittests/tweaks/ExtractVariableTests.cpp b/clang-tools-extra/clangd/unittests/tweaks/ExtractVariableTests.cpp
index 42dd612eeeec46..656b62c9a1f4e1 100644
--- a/clang-tools-extra/clangd/unittests/tweaks/ExtractVariableTests.cpp
+++ b/clang-tools-extra/clangd/unittests/tweaks/ExtractVariableTests.cpp
@@ -72,6 +72,22 @@ TEST_F(ExtractVariableTest, Test) {
     )cpp";
   EXPECT_UNAVAILABLE(NoCrashCasesC);
 
+  ExtraArgs = {"-xc"};
+  const char *NoCrashDesignator = R"cpp(
+    struct A {
+      struct {
+        int x;
+      };
+    };
+    struct B {
+      int y;
+    };
+    void foo(struct B *b) {
+      struct A a = {.x=b[[->]]y};
+    }
+  )cpp";
+  EXPECT_AVAILABLE(NoCrashDesignator);
+
   ExtraArgs = {"-xobjective-c"};
   const char *AvailableObjC = R"cpp(
     __attribute__((objc_root_class))
diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst
index 44680f79de6f54..a604e9276668ae 100644
--- a/clang-tools-extra/docs/ReleaseNotes.rst
+++ b/clang-tools-extra/docs/ReleaseNotes.rst
@@ -110,6 +110,13 @@ New checks
   Detects error-prone Curiously Recurring Template Pattern usage, when the CRTP
   can be constructed outside itself and the derived class.
 
+- New :doc:`bugprone-suspicious-stringview-data-usage
+  <clang-tidy/checks/bugprone/suspicious-stringview-data-usage>` check.
+
+  Identifies suspicious usages of ``std::string_view::data()`` that could lead
+  to reading out-of-bounds data due to inadequate or incorrect string null
+  termination.
+
 - New :doc:`modernize-use-designated-initializers
   <clang-tidy/checks/modernize/use-designated-initializers>` check.
 
@@ -165,6 +172,10 @@ Changes in existing checks
   giving false positives for deleted functions and fix false negative when some
   parameters are forwarded, but other aren't.
 
+- Improved :doc:`cppcoreguidelines-owning-memory
+  <clang-tidy/checks/cppcoreguidelines/owning-memory>` check to properly handle
+  return type in lambdas and in nested functions.
+
 - Cleaned up :doc:`cppcoreguidelines-prefer-member-initializer
   <clang-tidy/checks/cppcoreguidelines/prefer-member-initializer>`
   by removing enforcement of rule `C.48
diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/suspicious-stringview-data-usage.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/suspicious-stringview-data-usage.rst
new file mode 100644
index 00000000000000..9b38d836018103
--- /dev/null
+++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/suspicious-stringview-data-usage.rst
@@ -0,0 +1,58 @@
+.. title:: clang-tidy - bugprone-suspicious-stringview-data-usage
+
+bugprone-suspicious-stringview-data-usage
+=========================================
+
+Identifies suspicious usages of ``std::string_view::data()`` that could lead to
+reading out-of-bounds data due to inadequate or incorrect string null
+termination.
+
+It warns when the result of ``data()`` is passed to a constructor or function
+without also passing the corresponding result of ``size()`` or ``length()``
+member function. Such usage can lead to unintended behavior, particularly when
+assuming the data pointed to by ``data()`` is null-terminated.
+
+The absence of a ``c_str()`` method in ``std::string_view`` often leads
+developers to use ``data()`` as a substitute, especially when interfacing with
+C APIs that expect null-terminated strings. However, since ``data()`` does not
+guarantee null termination, this can result in unintended behavior if the API
+relies on proper null termination for correct string interpretation.
+
+In today's programming landscape, this scenario can occur when implicitly
+converting an ``std::string_view`` to an ``std::string``. Since the constructor
+in ``std::string`` designed for string-view-like objects is ``explicit``,
+attempting to pass an ``std::string_view`` to a function expecting an
+``std::string`` will result in a compilation error. As a workaround, developers
+may be tempted to utilize the ``.data()`` method to achieve compilation,
+introducing potential risks.
+
+For instance:
+
+.. code-block:: c++
+
+  void printString(const std::string& str) {
+    std::cout << "String: " << str << std::endl;
+  }
+
+  void something(std::string_view sv) {
+    printString(sv.data());
+  }
+
+In this example, directly passing ``sv`` to the ``printString`` function would
+lead to a compilation error due to the explicit nature of the ``std::string``
+constructor. Consequently, developers might opt for ``sv.data()`` to resolve the
+compilation error, albeit introducing potential hazards as discussed.
+
+.. option:: StringViewTypes
+
+  Option allows users to specify custom string view-like types for analysis. It
+  accepts a semicolon-separated list of type names or regular expressions
+  matching these types. Default value is:
+  `::std::basic_string_view;::llvm::StringRef`.
+
+.. option:: AllowedCallees
+
+  Specifies methods, functions, or classes where the result of ``.data()`` is
+  passed to. Allows to exclude such calls from the analysis. Accepts a
+  semicolon-separated list of names or regular expressions matching these
+  entities. Default value is: empty string.
diff --git a/clang-tools-extra/docs/clang-tidy/checks/list.rst b/clang-tools-extra/docs/clang-tidy/checks/list.rst
index d03e7af688f007..79e81dd174e4f3 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/list.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/list.rst
@@ -139,6 +139,7 @@ Clang-Tidy Checks
    :doc:`bugprone-suspicious-realloc-usage <bugprone/suspicious-realloc-usage>`,
    :doc:`bugprone-suspicious-semicolon <bugprone/suspicious-semicolon>`, "Yes"
    :doc:`bugprone-suspicious-string-compare <bugprone/suspicious-string-compare>`, "Yes"
+   :doc:`bugprone-suspicious-stringview-data-usage <bugprone/suspicious-stringview-data-usage>`,
    :doc:`bugprone-swapped-arguments <bugprone/swapped-arguments>`, "Yes"
    :doc:`bugprone-switch-missing-default-case <bugprone/switch-missing-default-case>`,
    :doc:`bugprone-terminating-continue <bugprone/terminating-continue>`, "Yes"
diff --git a/clang-tools-extra/test/clang-tidy/checkers/Inputs/Headers/string b/clang-tools-extra/test/clang-tidy/checkers/Inputs/Headers/string
index f2e4159a224513..28e2b4a231e52e 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/Inputs/Headers/string
+++ b/clang-tools-extra/test/clang-tidy/checkers/Inputs/Headers/string
@@ -24,6 +24,7 @@ struct basic_string {
   basic_string();
   basic_string(const C *p, const A &a = A());
   basic_string(const C *p, size_type count);
+  basic_string(const C *b, const C *e);
 
   ~basic_string();
 
@@ -85,6 +86,12 @@ struct basic_string_view {
   const C *str;
   constexpr basic_string_view(const C* s) : str(s) {}
 
+  const C *data() const;
+
+  bool empty() const;
+  size_type size() const;
+  size_type length() const;
+
   size_type find(_Type v, size_type pos = 0) const;
   size_type find(C ch, size_type pos = 0) const;
   size_type find(const C* s, size_type pos, size_type count) const;
diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/suspicious-stringview-data-usage.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/suspicious-stringview-data-usage.cpp
new file mode 100644
index 00000000000000..8072d60a2f47a1
--- /dev/null
+++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/suspicious-stringview-data-usage.cpp
@@ -0,0 +1,56 @@
+// RUN: %check_clang_tidy -std=c++17-or-later %s bugprone-suspicious-stringview-data-usage %t -- -- -isystem %clang_tidy_headers
+#include <string>
+
+struct View {
+   const char* str;
+};
+
+struct Pair {
+   const char* begin;
+   const char* end;
+};
+
+struct ViewWithSize {
+   const char* str;
+   std::string_view::size_type size;
+};
+
+void something(const char*);
+void something(const char*, unsigned);
+void something(const char*, unsigned, const char*);
+void something_str(std::string, unsigned);
+
+void invalid(std::string_view sv, std::string_view sv2) {
+  std::string s(sv.data());
+// CHECK-MESSAGES: :[[@LINE-1]]:20: warning: result of a `data()` call may not be null terminated, provide size information to the callee to prevent potential issues
+  std::string si{sv.data()};
+// CHECK-MESSAGES: :[[@LINE-1]]:21: warning: result of a `data()` call may not be null terminated, provide size information to the callee to prevent potential issues
+  std::string_view s2(sv.data());
+// CHECK-MESSAGES: :[[@LINE-1]]:26: warning: result of a `data()` call may not be null terminated, provide size information to the callee to prevent potential issues
+  something(sv.data());
+// CHECK-MESSAGES: :[[@LINE-1]]:16: warning: result of a `data()` call may not be null terminated, provide size information to the callee to prevent potential issues
+  something(sv.data(), sv.size(), sv2.data());
+// CHECK-MESSAGES: :[[@LINE-1]]:39: warning: result of a `data()` call may not be null terminated, provide size information to the callee to prevent potential issues
+  something_str(sv.data(), sv.size());
+// CHECK-MESSAGES: :[[@LINE-1]]:20: warning: result of a `data()` call may not be null terminated, provide size information to the callee to prevent potential issues
+  View view{sv.data()};
+// CHECK-MESSAGES: :[[@LINE-1]]:16: warning: result of a `data()` call may not be null terminated, provide size information to the callee to prevent potential issues
+}
+
+void valid(std::string_view sv) {
+  std::string s1(sv.data(), sv.data() + sv.size());
+  std::string s2(sv.data(), sv.data() + sv.length());
+  std::string s3(sv.data(), sv.size() + sv.data());
+  std::string s4(sv.data(), sv.length() + sv.data());
+  std::string s5(sv.data(), sv.size());
+  std::string s6(sv.data(), sv.length());
+  something(sv.data(), sv.size());
+  something(sv.data(), sv.length());
+  ViewWithSize view1{sv.data(), sv.size()};
+  ViewWithSize view2{sv.data(), sv.length()};
+  Pair view3{sv.data(), sv.data() + sv.size()};
+  Pair view4{sv.data(), sv.data() + sv.length()};
+  Pair view5{sv.data(), sv.size() + sv.data()};
+  Pair view6{sv.data(), sv.length() + sv.data()};
+  const char* str{sv.data()};
+}
diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/unused-return-value-avoid-assignment.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/unused-return-value-avoid-assignment.cpp
index b4a41004adf894..564c07a724ccde 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/bugprone/unused-return-value-avoid-assignment.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/unused-return-value-avoid-assignment.cpp
@@ -3,23 +3,37 @@
 // RUN:  {bugprone-unused-return-value.CheckedFunctions: "::*"}}' \
 // RUN: --
 
-struct S {
-  S(){};
-  S(S const &);
-  S(S &&);
-  S &operator=(S const &);
-  S &operator=(S &&);
-  S &operator+=(S);
+struct S1 {
+  S1(){};
+  S1(S1 const &);
+  S1(S1 &&);
+  S1 &operator=(S1 const &);
+  S1 &operator=(S1 &&);
+  S1 &operator+=(S1);
+  S1 &operator++();
+  S1 &operator++(int);
+  S1 &operator--();
+  S1 &operator--(int);
 };
 
-S returnValue();
-S const &returnRef();
+struct S2 {
+  S2(){};
+  S2(S2 const &);
+  S2(S2 &&);
+};
+
+S2 &operator-=(S2&, int);
+S2 &operator++(S2 &);
+S2 &operator++(S2 &, int);
+
+S1 returnValue();
+S1 const &returnRef();
 
 void bar() {
   returnValue();
   // CHECK-MESSAGES: [[@LINE-1]]:3: warning: the value returned by this function should not be disregarded; neglecting it may lead to errors
 
-  S a{};
+  S1 a{};
   a = returnValue();
   a.operator=(returnValue());
 
@@ -27,4 +41,15 @@ void bar() {
   a.operator=(returnRef());
 
   a += returnRef();
+
+  a++;
+  ++a;
+  a--;
+  --a;
+
+  S2 b{};
+
+  b -= 1;
+  b++;
+  ++b;
 }
diff --git a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/owning-memory.cpp b/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/owning-memory.cpp
index eb8ad1b8b87925..574efe7bd91478 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/owning-memory.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/owning-memory.cpp
@@ -395,3 +395,109 @@ namespace PR63994 {
     // CHECK-NOTES: [[@LINE-1]]:5: warning: returning a newly created resource of type 'A *' or 'gsl::owner<>' from a function whose return type is not 'gsl::owner<>'
   }
 }
+
+namespace PR59389 {
+  struct S {
+    S();
+    S(int);
+
+    int value = 1;
+  };
+
+  void testLambdaInFunctionNegative() {
+    const auto MakeS = []() -> ::gsl::owner<S*> {
+      return ::gsl::owner<S*>{new S{}};
+    };
+  }
+
+  void testLambdaInFunctionPositive() {
+    const auto MakeS = []() -> S* {
+      return ::gsl::owner<S*>{new S{}};
+      // CHECK-NOTES: [[@LINE-1]]:7: warning: returning a newly created resource of type 'S *' or 'gsl::owner<>' from a lambda whose return type is not 'gsl::owner<>'
+    };
+  }
+
+  void testFunctionInFunctionNegative() {
+    struct C {
+      ::gsl::owner<S*> test() {
+        return ::gsl::owner<S*>{new S{}};
+      }
+    };
+  }
+
+  void testFunctionInFunctionPositive() {
+    struct C {
+      S* test() {
+        return ::gsl::owner<S*>{new S{}};
+        // CHECK-NOTES: [[@LINE-1]]:9: warning: returning a newly created resource of type 'S *' or 'gsl::owner<>' from a function whose return type is not 'gsl::owner<>'
+      }
+    };
+  }
+
+  ::gsl::owner<S*> testReverseLambdaNegative() {
+    const auto MakeI = [] -> int { return 5; };
+    return ::gsl::owner<S*>{new S(MakeI())};
+  }
+
+  S* testReverseLambdaPositive() {
+    const auto MakeI = [] -> int { return 5; };
+    return ::gsl::owner<S*>{new S(MakeI())};
+    // CHECK-NOTES: [[@LINE-1]]:5: warning: returning a newly created resource of type 'S *' or 'gsl::owner<>' from a function whose return type is not 'gsl::owner<>'
+  }
+
+  ::gsl::owner<S*> testReverseFunctionNegative() {
+    struct C {
+      int test() { return 5; }
+    };
+    return ::gsl::owner<S*>{new S(C().test())};
+  }
+
+  S* testReverseFunctionPositive() {
+    struct C {
+      int test() { return 5; }
+    };
+    return ::gsl::owner<S*>{new S(C().test())};
+    // CHECK-NOTES: [[@LINE-1]]:5: warning: returning a newly created resource of type 'S *' or 'gsl::owner<>' from a function whose return type is not 'gsl::owner<>'
+  }
+
+  void testLambdaInLambdaNegative() {
+    const auto MakeS = []() -> ::gsl::owner<S*> {
+      const auto MakeI = []() -> int { return 5; };
+      return ::gsl::owner<S*>{new S(MakeI())};
+    };
+  }
+
+  void testLambdaInLambdaPositive() {
+    const auto MakeS = []() -> S* {
+      const auto MakeI = []() -> int { return 5; };
+      return ::gsl::owner<S*>{new S(MakeI())};
+      // CHECK-NOTES: [[@LINE-1]]:7: warning: returning a newly created resource of type 'S *' or 'gsl::owner<>' from a lambda whose return type is not 'gsl::owner<>'
+    };
+  }
+
+  void testLambdaInLambdaWithDoubleReturns() {
+    const auto MakeS = []() -> S* {
+      const auto MakeS2 = []() -> S* {
+        return ::gsl::owner<S*>{new S(1)};
+        // CHECK-NOTES: [[@LINE-1]]:9: warning: returning a newly created resource of type 'S *' or 'gsl::owner<>' from a lambda whose return type is not 'gsl::owner<>' [cppcoreguidelines-owning-memory]
+      };
+      return ::gsl::owner<S*>{new S(2)};
+      // CHECK-NOTES: [[@LINE-1]]:7: warning: returning a newly created resource of type 'S *' or 'gsl::owner<>' from a lambda whose return type is not 'gsl::owner<>'
+    };
+  }
+
+  void testReverseLambdaInLambdaNegative() {
+    const auto MakeI = []() -> int {
+      const auto MakeS = []() -> ::gsl::owner<S*> { return new S(); };
+      return 5;
+    };
+  }
+
+  void testReverseLambdaInLambdaPositive() {
+    const auto MakeI = []() -> int {
+      const auto MakeS = []() -> S* { return new S(); };
+      // CHECK-NOTES: [[@LINE-1]]:39: warning: returning a newly created resource of type 'S *' or 'gsl::owner<>' from a lambda whose return type is not 'gsl::owner<>'
+      return 5;
+    };
+  }
+}
diff --git a/clang/bindings/python/clang/cindex.py b/clang/bindings/python/clang/cindex.py
index 44a34ca196274c..302d99dccd77b5 100644
--- a/clang/bindings/python/clang/cindex.py
+++ b/clang/bindings/python/clang/cindex.py
@@ -1091,6 +1091,29 @@ def __repr__(self):
 # Represents an @available(...) check.
 CursorKind.OBJC_AVAILABILITY_CHECK_EXPR = CursorKind(148)
 
+# Fixed point literal.
+CursorKind.FIXED_POINT_LITERAL = CursorKind(149)
+
+# OpenMP 5.0 [2.1.4, Array Shaping].
+CursorKind.OMP_ARRAY_SHAPING_EXPR = CursorKind(150)
+
+# OpenMP 5.0 [2.1.6 Iterators].
+CursorKind.OMP_ITERATOR_EXPR = CursorKind(151)
+
+# OpenCL's addrspace_cast<> expression.
+CursorKind.CXX_ADDRSPACE_CAST_EXPR = CursorKind(152)
+
+# Expression that references a C++20 concept.
+CursorKind.CONCEPT_SPECIALIZATION_EXPR = CursorKind(153)
+
+# Expression that references a C++20 requires expression.
+CursorKind.REQUIRES_EXPR = CursorKind(154)
+
+# Expression that references a C++20 parenthesized list aggregate initializer.
+CursorKind.CXX_PAREN_LIST_INIT_EXPR = CursorKind(155)
+
+# Represents a C++26 pack indexing expression.
+CursorKind.PACK_INDEXING_EXPR = CursorKind(156)
 
 # A statement whose specific kind is not exposed via this interface.
 #
@@ -1312,6 +1335,114 @@ def __repr__(self):
 # OpenMP teams distribute directive.
 CursorKind.OMP_TEAMS_DISTRIBUTE_DIRECTIVE = CursorKind(271)
 
+# OpenMP teams distribute simd directive.
+CursorKind.OMP_TEAMS_DISTRIBUTE_DIRECTIVE = CursorKind(272)
+
+# OpenMP teams distribute parallel for simd directive.
+CursorKind.OMP_TEAMS_DISTRIBUTE_PARALLEL_FOR_SIMD_DIRECTIVE = CursorKind(273)
+
+# OpenMP teams distribute parallel for directive.
+CursorKind.OMP_TEAMS_DISTRIBUTE_PARALLEL_FOR_DIRECTIVE = CursorKind(274)
+
+# OpenMP target teams directive.
+CursorKind.OMP_TARGET_TEAMS_DIRECTIVE = CursorKind(275)
+
+# OpenMP target teams distribute directive.
+CursorKind.OMP_TARGET_TEAMS_DISTRIBUTE_DIRECTIVE = CursorKind(276)
+
+# OpenMP target teams distribute parallel for directive.
+CursorKind.OMP_TARGET_TEAMS_DISTRIBUTE_PARALLEL_FOR_DIRECTIVE = CursorKind(277)
+
+# OpenMP target teams distribute parallel for simd directive.
+CursorKind.OMP_TARGET_TEAMS_DISTRIBUTE_PARALLEL_FOR_SIMD_DIRECTIVE = CursorKind(278)
+
+# OpenMP target teams distribute simd directive.
+CursorKind.OMP_TARGET_TEAMS_DISTRIBUTE_SIMD_DIRECTIVE = CursorKind(279)
+
+# C++2a std::bit_cast expression.
+CursorKind.BUILTIN_BIT_CAST_EXPR = CursorKind(280)
+
+# OpenMP master taskloop directive.
+CursorKind.OMP_MASTER_TASK_LOOP_DIRECTIVE = CursorKind(281)
+
+# OpenMP parallel master taskloop directive.
+CursorKind.OMP_PARALLEL_MASTER_TASK_LOOP_DIRECTIVE = CursorKind(282)
+
+# OpenMP master taskloop simd directive.
+CursorKind.OMP_MASTER_TASK_LOOP_SIMD_DIRECTIVE = CursorKind(283)
+
+# OpenMP parallel master taskloop simd directive.
+CursorKind.OMP_PARALLEL_MASTER_TASK_LOOP_SIMD_DIRECTIVE = CursorKind(284)
+
+# OpenMP parallel master directive.
+CursorKind.OMP_PARALLEL_MASTER_DIRECTIVE = CursorKind(285)
+
+# OpenMP depobj directive.
+CursorKind.OMP_DEPOBJ_DIRECTIVE = CursorKind(286)
+
+# OpenMP scan directive.
+CursorKind.OMP_SCAN_DIRECTIVE = CursorKind(287)
+
+# OpenMP tile directive.
+CursorKind.OMP_TILE_DIRECTIVE = CursorKind(288)
+
+# OpenMP canonical loop.
+CursorKind.OMP_CANONICAL_LOOP = CursorKind(289)
+
+# OpenMP interop directive.
+CursorKind.OMP_INTEROP_DIRECTIVE = CursorKind(290)
+
+# OpenMP dispatch directive.
+CursorKind.OMP_DISPATCH_DIRECTIVE = CursorKind(291)
+
+# OpenMP masked directive.
+CursorKind.OMP_MASKED_DIRECTIVE = CursorKind(292)
+
+# OpenMP unroll directive.
+CursorKind.OMP_UNROLL_DIRECTIVE = CursorKind(293)
+
+# OpenMP metadirective directive.
+CursorKind.OMP_META_DIRECTIVE = CursorKind(294)
+
+# OpenMP loop directive.
+CursorKind.OMP_GENERIC_LOOP_DIRECTIVE = CursorKind(295)
+
+# OpenMP teams loop directive.
+CursorKind.OMP_TEAMS_GENERIC_LOOP_DIRECTIVE = CursorKind(296)
+
+# OpenMP target teams loop directive.
+CursorKind.OMP_TARGET_TEAMS_GENERIC_LOOP_DIRECTIVE = CursorKind(297)
+
+# OpenMP parallel loop directive.
+CursorKind.OMP_PARALLEL_GENERIC_LOOP_DIRECTIVE = CursorKind(298)
+
+# OpenMP target parallel loop directive.
+CursorKind.OMP_TARGET_PARALLEL_GENERIC_LOOP_DIRECTIVE = CursorKind(299)
+
+# OpenMP parallel masked directive.
+CursorKind.OMP_PARALLEL_MASKED_DIRECTIVE = CursorKind(300)
+
+# OpenMP masked taskloop directive.
+CursorKind.OMP_MASKED_TASK_LOOP_DIRECTIVE = CursorKind(301)
+
+# OpenMP masked taskloop simd directive.
+CursorKind.OMP_MASKED_TASK_LOOP_SIMD_DIRECTIVE = CursorKind(302)
+
+# OpenMP parallel masked taskloop directive.
+CursorKind.OMP_PARALLEL_MASKED_TASK_LOOP_DIRECTIVE = CursorKind(303)
+
+# OpenMP parallel masked taskloop simd directive.
+CursorKind.OMP_PARALLEL_MASKED_TASK_LOOP_SIMD_DIRECTIVE = CursorKind(304)
+
+# OpenMP error directive.
+CursorKind.OMP_ERROR_DIRECTIVE = CursorKind(305)
+
+# OpenMP scope directive.
+CursorKind.OMP_SCOPE_DIRECTIVE = CursorKind(306)
+
+# OpenACC Compute Construct.
+CursorKind.OPEN_ACC_COMPUTE_DIRECTIVE = CursorKind(320)
+
 ###
 # Other Kinds
 
@@ -1349,6 +1480,24 @@ def __repr__(self):
 
 CursorKind.DLLEXPORT_ATTR = CursorKind(418)
 CursorKind.DLLIMPORT_ATTR = CursorKind(419)
+CursorKind.NS_RETURNS_RETAINED = CursorKind(420)
+CursorKind.NS_RETURNS_NOT_RETAINED = CursorKind(421)
+CursorKind.NS_RETURNS_AUTORELEASED = CursorKind(422)
+CursorKind.NS_CONSUMES_SELF = CursorKind(423)
+CursorKind.NS_CONSUMED = CursorKind(424)
+CursorKind.OBJC_EXCEPTION = CursorKind(425)
+CursorKind.OBJC_NSOBJECT = CursorKind(426)
+CursorKind.OBJC_INDEPENDENT_CLASS = CursorKind(427)
+CursorKind.OBJC_PRECISE_LIFETIME = CursorKind(428)
+CursorKind.OBJC_RETURNS_INNER_POINTER = CursorKind(429)
+CursorKind.OBJC_REQUIRES_SUPER = CursorKind(430)
+CursorKind.OBJC_ROOT_CLASS = CursorKind(431)
+CursorKind.OBJC_SUBCLASSING_RESTRICTED = CursorKind(432)
+CursorKind.OBJC_EXPLICIT_PROTOCOL_IMPL = CursorKind(433)
+CursorKind.OBJC_DESIGNATED_INITIALIZER = CursorKind(434)
+CursorKind.OBJC_RUNTIME_VISIBLE = CursorKind(435)
+CursorKind.OBJC_BOXABLE = CursorKind(436)
+CursorKind.FLAG_ENUM = CursorKind(437)
 CursorKind.CONVERGENT_ATTR = CursorKind(438)
 CursorKind.WARN_UNUSED_ATTR = CursorKind(439)
 CursorKind.WARN_UNUSED_RESULT_ATTR = CursorKind(440)
@@ -1395,6 +1544,11 @@ class TemplateArgumentKind(BaseEnumeration):
 TemplateArgumentKind.DECLARATION = TemplateArgumentKind(2)
 TemplateArgumentKind.NULLPTR = TemplateArgumentKind(3)
 TemplateArgumentKind.INTEGRAL = TemplateArgumentKind(4)
+TemplateArgumentKind.TEMPLATE = TemplateArgumentKind(5)
+TemplateArgumentKind.TEMPLATE_EXPANSION = TemplateArgumentKind(6)
+TemplateArgumentKind.EXPRESSION = TemplateArgumentKind(7)
+TemplateArgumentKind.PACK = TemplateArgumentKind(8)
+TemplateArgumentKind.INVALID = TemplateArgumentKind(9)
 
 ### Exception Specification Kinds ###
 class ExceptionSpecificationKind(BaseEnumeration):
@@ -2240,9 +2394,26 @@ def __repr__(self):
 TypeKind.OCLQUEUE = TypeKind(159)
 TypeKind.OCLRESERVEID = TypeKind(160)
 
+TypeKind.OBJCOBJECT = TypeKind(161)
+TypeKind.OBJCCLASS = TypeKind(162)
+TypeKind.ATTRIBUTED = TypeKind(163)
+
+TypeKind.OCLINTELSUBGROUPAVCMCEPAYLOAD = TypeKind(164)
+TypeKind.OCLINTELSUBGROUPAVCIMEPAYLOAD = TypeKind(165)
+TypeKind.OCLINTELSUBGROUPAVCREFPAYLOAD = TypeKind(166)
+TypeKind.OCLINTELSUBGROUPAVCSICPAYLOAD = TypeKind(167)
+TypeKind.OCLINTELSUBGROUPAVCMCERESULT = TypeKind(168)
+TypeKind.OCLINTELSUBGROUPAVCIMERESULT = TypeKind(169)
+TypeKind.OCLINTELSUBGROUPAVCREFRESULT = TypeKind(170)
+TypeKind.OCLINTELSUBGROUPAVCSICRESULT = TypeKind(171)
+TypeKind.OCLINTELSUBGROUPAVCIMERESULTSINGLEREFERENCESTREAMOUT = TypeKind(172)
+TypeKind.OCLINTELSUBGROUPAVCIMERESULTSDUALREFERENCESTREAMOUT = TypeKind(173)
+TypeKind.OCLINTELSUBGROUPAVCIMERESULTSSINGLEREFERENCESTREAMIN = TypeKind(174)
+TypeKind.OCLINTELSUBGROUPAVCIMEDUALREFERENCESTREAMIN = TypeKind(175)
+
 TypeKind.EXTVECTOR = TypeKind(176)
 TypeKind.ATOMIC = TypeKind(177)
-
+TypeKind.BTFTAGATTRIBUTED = TypeKind(178)
 
 class RefQualifierKind(BaseEnumeration):
     """Describes a specific ref-qualifier of a type."""
diff --git a/clang/cmake/caches/Fuchsia-stage2.cmake b/clang/cmake/caches/Fuchsia-stage2.cmake
index db7430b3344c3e..d5546e20873b3c 100644
--- a/clang/cmake/caches/Fuchsia-stage2.cmake
+++ b/clang/cmake/caches/Fuchsia-stage2.cmake
@@ -11,6 +11,7 @@ set(LLVM_ENABLE_RUNTIMES "compiler-rt;libcxx;libcxxabi;libunwind" CACHE STRING "
 
 set(LLVM_ENABLE_BACKTRACES OFF CACHE BOOL "")
 set(LLVM_ENABLE_DIA_SDK OFF CACHE BOOL "")
+set(LLVM_ENABLE_FATLTO ON CACHE BOOL "")
 set(LLVM_ENABLE_HTTPLIB ON CACHE BOOL "")
 set(LLVM_ENABLE_LIBCXX ON CACHE BOOL "")
 set(LLVM_ENABLE_LIBEDIT OFF CACHE BOOL "")
diff --git a/clang/docs/ClangFormatStyleOptions.rst b/clang/docs/ClangFormatStyleOptions.rst
index 5b00a8f4c00fb8..be021dfc5c084c 100644
--- a/clang/docs/ClangFormatStyleOptions.rst
+++ b/clang/docs/ClangFormatStyleOptions.rst
@@ -6158,6 +6158,70 @@ the configuration (without a prefix: ``Auto``).
 **TabWidth** (``Unsigned``) :versionbadge:`clang-format 3.7` :ref:`¶ <TabWidth>`
   The number of columns used for tab stops.
 
+.. _TableGenBreakInsideDAGArg:
+
+**TableGenBreakInsideDAGArg** (``DAGArgStyle``) :versionbadge:`clang-format 19` :ref:`¶ <TableGenBreakInsideDAGArg>`
+  The styles of the line break inside the DAGArg in TableGen.
+
+  Possible values:
+
+  * ``DAS_DontBreak`` (in configuration: ``DontBreak``)
+    Never break inside DAGArg.
+
+    .. code-block:: c++
+
+      let DAGArgIns = (ins i32:$src1, i32:$src2);
+
+  * ``DAS_BreakElements`` (in configuration: ``BreakElements``)
+    Break inside DAGArg after each list element but for the last.
+    This aligns to the first element.
+
+    .. code-block:: c++
+
+      let DAGArgIns = (ins i32:$src1,
+                           i32:$src2);
+
+  * ``DAS_BreakAll`` (in configuration: ``BreakAll``)
+    Break inside DAGArg after the operator and the all elements.
+
+    .. code-block:: c++
+
+      let DAGArgIns = (ins
+          i32:$src1,
+          i32:$src2
+      );
+
+
+
+.. _TableGenBreakingDAGArgOperators:
+
+**TableGenBreakingDAGArgOperators** (``List of Strings``) :versionbadge:`clang-format 19` :ref:`¶ <TableGenBreakingDAGArgOperators>`
+  Works only when TableGenBreakInsideDAGArg is not DontBreak.
+  The string list needs to consist of identifiers in TableGen.
+  If any identifier is specified, this limits the line breaks by
+  TableGenBreakInsideDAGArg option only on DAGArg values beginning with
+  the specified identifiers.
+
+  For example the configuration,
+
+  .. code-block:: yaml
+
+    TableGenBreakInsideDAGArg: BreakAll
+    TableGenBreakingDAGArgOperators: ['ins', 'outs']
+
+  makes the line break only occurs inside DAGArgs beginning with the
+  specified identifiers 'ins' and 'outs'.
+
+
+  .. code-block:: c++
+
+    let DAGArgIns = (ins
+        i32:$src1,
+        i32:$src2
+    );
+    let DAGArgOtherID = (other i32:$other1, i32:$other2);
+    let DAGArgBang = (!cast<SomeType>("Some") i32:$src1, i32:$src2)
+
 .. _TypeNames:
 
 **TypeNames** (``List of Strings``) :versionbadge:`clang-format 17` :ref:`¶ <TypeNames>`
diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst
index 9347703e96e21a..13d7261d83d7f1 100644
--- a/clang/docs/LanguageExtensions.rst
+++ b/clang/docs/LanguageExtensions.rst
@@ -13,6 +13,7 @@ Clang Language Extensions
    BlockLanguageSpec
    Block-ABI-Apple
    AutomaticReferenceCounting
+   PointerAuthentication
    MatrixTypes
 
 Introduction
@@ -3444,7 +3445,7 @@ Query for this feature with ``__has_builtin(__builtin_debugtrap)``.
 Query for this feature with ``__has_builtin(__builtin_trap)``.
 
 ``__builtin_arm_trap``
-------------------
+----------------------
 
 ``__builtin_arm_trap`` is an AArch64 extension to ``__builtin_trap`` which also accepts a compile-time constant value, encoded directly into the trap instruction for later inspection.
 
@@ -4318,6 +4319,10 @@ reordering of memory accesses and side effect instructions. Other instructions
 like simple arithmetic may be reordered around the intrinsic. If you expect to
 have no reordering at all, use inline assembly instead.
 
+Pointer Authentication
+^^^^^^^^^^^^^^^^^^^^^^
+See :doc:`PointerAuthentication`.
+
 X86/X86-64 Language Extensions
 ------------------------------
 
diff --git a/clang/docs/PointerAuthentication.rst b/clang/docs/PointerAuthentication.rst
new file mode 100644
index 00000000000000..19b3384293aede
--- /dev/null
+++ b/clang/docs/PointerAuthentication.rst
@@ -0,0 +1,485 @@
+Pointer Authentication
+======================
+
+.. contents::
+   :local:
+
+Introduction
+------------
+
+Pointer authentication is a technology which offers strong probabilistic
+protection against exploiting a broad class of memory bugs to take control of
+program execution.  When adopted consistently in a language ABI, it provides
+a form of relatively fine-grained control flow integrity (CFI) check that
+resists both return-oriented programming (ROP) and jump-oriented programming
+(JOP) attacks.
+
+While pointer authentication can be implemented purely in software, direct
+hardware support (e.g. as provided by Armv8.3 PAuth) can dramatically improve
+performance and code size.  Similarly, while pointer authentication
+can be implemented on any architecture, taking advantage of the (typically)
+excess addressing range of a target with 64-bit pointers minimizes the impact
+on memory performance and can allow interoperation with existing code (by
+disabling pointer authentication dynamically).  This document will generally
+attempt to present the pointer authentication feature independent of any
+hardware implementation or ABI.  Considerations that are
+implementation-specific are clearly identified throughout.
+
+Note that there are several different terms in use:
+
+- **Pointer authentication** is a target-independent language technology.
+
+- **PAuth** (sometimes referred to as **PAC**, for Pointer Authentication
+  Codes) is an AArch64 architecture extension that provides hardware support
+  for pointer authentication.  Additional extensions either modify some of the
+  PAuth instruction behavior (notably FPAC), or provide new instruction
+  variants (PAuth_LR).
+
+- **Armv8.3** is an AArch64 architecture revision that makes PAuth mandatory.
+
+- **arm64e** is a specific ABI (not yet fully stable) for implementing pointer
+  authentication using PAuth on certain Apple operating systems.
+
+This document serves four purposes:
+
+- It describes the basic ideas of pointer authentication.
+
+- It documents several language extensions that are useful on targets using
+  pointer authentication.
+
+- It will eventually present a theory of operation for the security mitigation,
+  describing the basic requirements for correctness, various weaknesses in the
+  mechanism, and ways in which programmers can strengthen its protections
+  (including recommendations for language implementors).
+
+- It will eventually document the language ABIs currently used for C, C++,
+  Objective-C, and Swift on arm64e, although these are not yet stable on any
+  target.
+
+Basic Concepts
+--------------
+
+The simple address of an object or function is a **raw pointer**.  A raw
+pointer can be **signed** to produce a **signed pointer**.  A signed pointer
+can be then **authenticated** in order to verify that it was **validly signed**
+and extract the original raw pointer.  These terms reflect the most likely
+implementation technique: computing and storing a cryptographic signature along
+with the pointer.
+
+An **abstract signing key** is a name which refers to a secret key which is
+used to sign and authenticate pointers.  The concrete key value for a
+particular name is consistent throughout a process.
+
+A **discriminator** is an arbitrary value used to **diversify** signed pointers
+so that one validly-signed pointer cannot simply be copied over another.
+A discriminator is simply opaque data of some implementation-defined size that
+is included in the signature as a salt (see `Discriminators`_ for details.)
+
+Nearly all aspects of pointer authentication use just these two primary
+operations:
+
+- ``sign(raw_pointer, key, discriminator)`` produces a signed pointer given
+  a raw pointer, an abstract signing key, and a discriminator.
+
+- ``auth(signed_pointer, key, discriminator)`` produces a raw pointer given
+  a signed pointer, an abstract signing key, and a discriminator.
+
+``auth(sign(raw_pointer, key, discriminator), key, discriminator)`` must
+succeed and produce ``raw_pointer``.  ``auth`` applied to a value that was
+ultimately produced in any other way is expected to fail, which halts the
+program either:
+
+- immediately, on implementations that enforce ``auth`` success (e.g., when
+  using compiler-generated ``auth`` failure checks, or Armv8.3 with the FPAC
+  extension), or
+
+- when the resulting pointer value is used, on implementations that don't.
+
+However, regardless of the implementation's handling of ``auth`` failures, it
+is permitted for ``auth`` to fail to detect that a signed pointer was not
+produced in this way, in which case it may return anything; this is what makes
+pointer authentication a probabilistic mitigation rather than a perfect one.
+
+There are two secondary operations which are required only to implement certain
+intrinsics in ``<ptrauth.h>``:
+
+- ``strip(signed_pointer, key)`` produces a raw pointer given a signed pointer
+  and a key without verifying its validity, unlike ``auth``.  This is useful
+  for certain kinds of tooling, such as crash backtraces; it should generally
+  not be used in the basic language ABI except in very careful ways.
+
+- ``sign_generic(value)`` produces a cryptographic signature for arbitrary
+  data, not necessarily a pointer.  This is useful for efficiently verifying
+  that non-pointer data has not been tampered with.
+
+Whenever any of these operations is called for, the key value must be known
+statically.  This is because the layout of a signed pointer may vary according
+to the signing key.  (For example, in Armv8.3, the layout of a signed pointer
+depends on whether Top Byte Ignore (TBI) is enabled, which can be set
+independently for I and D keys.)
+
+.. admonition:: Note for API designers and language implementors
+
+  These are the *primitive* operations of pointer authentication, provided for
+  clarity of description.  They are not suitable either as high-level
+  interfaces or as primitives in a compiler IR because they expose raw
+  pointers.  Raw pointers require special attention in the language
+  implementation to avoid the accidental creation of exploitable code
+  sequences.
+
+The following details are all implementation-defined:
+
+- the nature of a signed pointer
+- the size of a discriminator
+- the number and nature of the signing keys
+- the implementation of the ``sign``, ``auth``, ``strip``, and ``sign_generic``
+  operations
+
+While the use of the terms "sign" and "signed pointer" suggest the use of
+a cryptographic signature, other implementations may be possible.  See
+`Alternative implementations`_ for an exploration of implementation options.
+
+.. admonition:: Implementation example: Armv8.3
+
+  Readers may find it helpful to know how these terms map to Armv8.3 PAuth:
+
+  - A signed pointer is a pointer with a signature stored in the
+    otherwise-unused high bits.  The kernel configures the address width based
+    on the system's addressing needs, and enables TBI for I or D keys as
+    needed.  The bits above the address bits and below the TBI bits (if
+    enabled) are unused.  The signature width then depends on this addressing
+    configuration.
+
+  - A discriminator is a 64-bit integer.  Constant discriminators are 16-bit
+    integers.  Blending a constant discriminator into an address consists of
+    replacing the top 16 bits of the pointer containing the address with the
+    constant.  Pointers used for blending purposes should only have address
+    bits, since higher bits will be at least partially overwritten with the
+    constant discriminator.
+
+  - There are five 128-bit signing-key registers, each of which can only be
+    directly read or set by privileged code.  Of these, four are used for
+    signing pointers, and the fifth is used only for ``sign_generic``.  The key
+    data is simply a pepper added to the hash, not an encryption key, and so
+    can be initialized using random data.
+
+  - ``sign`` computes a cryptographic hash of the pointer, discriminator, and
+    signing key, and stores it in the high bits as the signature. ``auth``
+    removes the signature, computes the same hash, and compares the result with
+    the stored signature.  ``strip`` removes the signature without
+    authenticating it.  While ``aut*`` instructions do not themselves trap on
+    failure in Armv8.3 PAuth, they do with the later optional FPAC extension.
+    An implementation can also choose to emulate this trapping behavior by
+    emitting additional instructions around ``aut*``.
+
+  - ``sign_generic`` corresponds to the ``pacga`` instruction, which takes two
+    64-bit values and produces a 64-bit cryptographic hash. Implementations of
+    this instruction are not required to produce meaningful data in all bits of
+    the result.
+
+Discriminators
+~~~~~~~~~~~~~~
+
+A discriminator is arbitrary extra data which alters the signature calculated
+for a pointer.  When two pointers are signed differently --- either with
+different keys or with different discriminators --- an attacker cannot simply
+replace one pointer with the other.
+
+To use standard cryptographic terminology, a discriminator acts as a
+`salt <https://en.wikipedia.org/wiki/Salt_(cryptography)>`_ in the signing of a
+pointer, and the key data acts as a
+`pepper <https://en.wikipedia.org/wiki/Pepper_(cryptography)>`_.  That is,
+both the discriminator and key data are ultimately just added as inputs to the
+signing algorithm along with the pointer, but they serve significantly
+different roles.  The key data is a common secret added to every signature,
+whereas the discriminator is a value that can be derived from
+the context in which a specific pointer is signed.  However, unlike a password
+salt, it's important that discriminators be *independently* derived from the
+circumstances of the signing; they should never simply be stored alongside
+a pointer.  Discriminators are then re-derived in authentication operations.
+
+The intrinsic interface in ``<ptrauth.h>`` allows an arbitrary discriminator
+value to be provided, but can only be used when running normal code.  The
+discriminators used by language ABIs must be restricted to make it feasible for
+the loader to sign pointers stored in global memory without needing excessive
+amounts of metadata.  Under these restrictions, a discriminator may consist of
+either or both of the following:
+
+- The address at which the pointer is stored in memory.  A pointer signed with
+  a discriminator which incorporates its storage address is said to have
+  **address diversity**.  In general, using address diversity means that
+  a pointer cannot be reliably copied by an attacker to or from a different
+  memory location.  However, an attacker may still be able to attack a larger
+  call sequence if they can alter the address through which the pointer is
+  accessed.  Furthermore, some situations cannot use address diversity because
+  of language or other restrictions.
+
+- A constant integer, called a **constant discriminator**. A pointer signed
+  with a non-zero constant discriminator is said to have **constant
+  diversity**.  If the discriminator is specific to a single declaration, it is
+  said to have **declaration diversity**; if the discriminator is specific to
+  a type of value, it is said to have **type diversity**.  For example, C++
+  v-tables on arm64e sign their component functions using a hash of their
+  method names and signatures, which provides declaration diversity; similarly,
+  C++ member function pointers sign their invocation functions using a hash of
+  the member pointer type, which provides type diversity.
+
+The implementation may need to restrict constant discriminators to be
+significantly smaller than the full size of a discriminator.  For example, on
+arm64e, constant discriminators are only 16-bit values.  This is believed to
+not significantly weaken the mitigation, since collisions remain uncommon.
+
+The algorithm for blending a constant discriminator with a storage address is
+implementation-defined.
+
+.. _Signing schemas:
+
+Signing Schemas
+~~~~~~~~~~~~~~~
+
+Correct use of pointer authentication requires the signing code and the
+authenticating code to agree about the **signing schema** for the pointer:
+
+- the abstract signing key with which the pointer should be signed and
+- an algorithm for computing the discriminator.
+
+As described in the section above on `Discriminators`_, in most situations, the
+discriminator is produced by taking a constant discriminator and optionally
+blending it with the storage address of the pointer.  In these situations, the
+signing schema breaks down even more simply:
+
+- the abstract signing key,
+- a constant discriminator, and
+- whether to use address diversity.
+
+It is important that the signing schema be independently derived at all signing
+and authentication sites.  Preferably, the schema should be hard-coded
+everywhere it is needed, but at the very least, it must not be derived by
+inspecting information stored along with the pointer.
+
+Language Features
+-----------------
+
+There is currently one main pointer authentication language feature:
+
+- The language provides the ``<ptrauth.h>`` intrinsic interface for manually
+  signing and authenticating pointers in code.  These can be used in
+  circumstances where very specific behavior is required.
+
+
+Language Extensions
+~~~~~~~~~~~~~~~~~~~
+
+Feature Testing
+^^^^^^^^^^^^^^^
+
+Whether the current target uses pointer authentication can be tested for with
+a number of different tests.
+
+- ``__has_feature(ptrauth_intrinsics)`` is true if ``<ptrauth.h>`` provides its
+  normal interface.  This may be true even on targets where pointer
+  authentication is not enabled by default.
+
+``<ptrauth.h>``
+~~~~~~~~~~~~~~~
+
+This header defines the following types and operations:
+
+``ptrauth_key``
+^^^^^^^^^^^^^^^
+
+This ``enum`` is the type of abstract signing keys.  In addition to defining
+the set of implementation-specific signing keys (for example, Armv8.3 defines
+``ptrauth_key_asia``), it also defines some portable aliases for those keys.
+For example, ``ptrauth_key_function_pointer`` is the key generally used for
+C function pointers, which will generally be suitable for other
+function-signing schemas.
+
+In all the operation descriptions below, key values must be constant values
+corresponding to one of the implementation-specific abstract signing keys from
+this ``enum``.
+
+``ptrauth_extra_data_t``
+^^^^^^^^^^^^^^^^^^^^^^^^
+
+This is a ``typedef`` of a standard integer type of the correct size to hold
+a discriminator value.
+
+In the signing and authentication operation descriptions below, discriminator
+values must have either pointer type or integer type. If the discriminator is
+an integer, it will be coerced to ``ptrauth_extra_data_t``.
+
+``ptrauth_blend_discriminator``
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. code-block:: c
+
+  ptrauth_blend_discriminator(pointer, integer)
+
+Produce a discriminator value which blends information from the given pointer
+and the given integer.
+
+Implementations may ignore some bits from each value, which is to say, the
+blending algorithm may be chosen for speed and convenience over theoretical
+strength as a hash-combining algorithm.  For example, arm64e simply overwrites
+the high 16 bits of the pointer with the low 16 bits of the integer, which can
+be done in a single instruction with an immediate integer.
+
+``pointer`` must have pointer type, and ``integer`` must have integer type. The
+result has type ``ptrauth_extra_data_t``.
+
+``ptrauth_strip``
+^^^^^^^^^^^^^^^^^
+
+.. code-block:: c
+
+  ptrauth_strip(signedPointer, key)
+
+Given that ``signedPointer`` matches the layout for signed pointers signed with
+the given key, extract the raw pointer from it.  This operation does not trap
+and cannot fail, even if the pointer is not validly signed.
+
+``ptrauth_sign_unauthenticated``
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. code-block:: c
+
+  ptrauth_sign_unauthenticated(pointer, key, discriminator)
+
+Produce a signed pointer for the given raw pointer without applying any
+authentication or extra treatment.  This operation is not required to have the
+same behavior on a null pointer that the language implementation would.
+
+This is a treacherous operation that can easily result in signing oracles.
+Programs should use it seldom and carefully.
+
+``ptrauth_auth_and_resign``
+^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. code-block:: c
+
+  ptrauth_auth_and_resign(pointer, oldKey, oldDiscriminator, newKey, newDiscriminator)
+
+Authenticate that ``pointer`` is signed with ``oldKey`` and
+``oldDiscriminator`` and then resign the raw-pointer result of that
+authentication with ``newKey`` and ``newDiscriminator``.
+
+``pointer`` must have pointer type.  The result will have the same type as
+``pointer``.  This operation is not required to have the same behavior on
+a null pointer that the language implementation would.
+
+The code sequence produced for this operation must not be directly attackable.
+However, if the discriminator values are not constant integers, their
+computations may still be attackable.  In the future, Clang should be enhanced
+to guaranteed non-attackability if these expressions are safely-derived.
+
+``ptrauth_auth_data``
+^^^^^^^^^^^^^^^^^^^^^
+
+.. code-block:: c
+
+  ptrauth_auth_data(pointer, key, discriminator)
+
+Authenticate that ``pointer`` is signed with ``key`` and ``discriminator`` and
+remove the signature.
+
+``pointer`` must have object pointer type.  The result will have the same type
+as ``pointer``.  This operation is not required to have the same behavior on
+a null pointer that the language implementation would.
+
+In the future when Clang makes safe derivation guarantees, the result of
+this operation should be considered safely-derived.
+
+``ptrauth_sign_generic_data``
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. code-block:: c
+
+  ptrauth_sign_generic_data(value1, value2)
+
+Computes a signature for the given pair of values, incorporating a secret
+signing key.
+
+This operation can be used to verify that arbitrary data has not been tampered
+with by computing a signature for the data, storing that signature, and then
+repeating this process and verifying that it yields the same result.  This can
+be reasonably done in any number of ways; for example, a library could compute
+an ordinary checksum of the data and just sign the result in order to get the
+tamper-resistance advantages of the secret signing key (since otherwise an
+attacker could reliably overwrite both the data and the checksum).
+
+``value1`` and ``value2`` must be either pointers or integers.  If the integers
+are larger than ``uintptr_t`` then data not representable in ``uintptr_t`` may
+be discarded.
+
+The result will have type ``ptrauth_generic_signature_t``, which is an integer
+type.  Implementations are not required to make all bits of the result equally
+significant; in particular, some implementations are known to not leave
+meaningful data in the low bits.
+
+
+
+Alternative Implementations
+---------------------------
+
+Signature Storage
+~~~~~~~~~~~~~~~~~
+
+It is not critical for the security of pointer authentication that the
+signature be stored "together" with the pointer, as it is in Armv8.3. An
+implementation could just as well store the signature in a separate word, so
+that the ``sizeof`` a signed pointer would be larger than the ``sizeof`` a raw
+pointer.
+
+Storing the signature in the high bits, as Armv8.3 does, has several trade-offs:
+
+- Disadvantage: there are substantially fewer bits available for the signature,
+  weakening the mitigation by making it much easier for an attacker to simply
+  guess the correct signature.
+
+- Disadvantage: future growth of the address space will necessarily further
+  weaken the mitigation.
+
+- Advantage: memory layouts don't change, so it's possible for
+  pointer-authentication-enabled code (for example, in a system library) to
+  efficiently interoperate with existing code, as long as pointer
+  authentication can be disabled dynamically.
+
+- Advantage: the size of a signed pointer doesn't grow, which might
+  significantly increase memory requirements, code size, and register pressure.
+
+- Advantage: the size of a signed pointer is the same as a raw pointer, so
+  generic APIs which work in types like `void *` (such as `dlsym`) can still
+  return signed pointers.  This means that clients of these APIs will not
+  require insecure code in order to correctly receive a function pointer.
+
+Hashing vs. Encrypting Pointers
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Armv8.3 implements ``sign`` by computing a cryptographic hash and storing that
+in the spare bits of the pointer.  This means that there are relatively few
+possible values for the valid signed pointer, since the bits corresponding to
+the raw pointer are known.  Together with an ``auth`` oracle, this can make it
+computationally feasible to discover the correct signature with brute force.
+(The implementation should of course endeavor not to introduce ``auth``
+oracles, but this can be difficult, and attackers can be devious.)
+
+If the implementation can instead *encrypt* the pointer during ``sign`` and
+*decrypt* it during ``auth``, this brute-force attack becomes far less
+feasible, even with an ``auth`` oracle.  However, there are several problems
+with this idea:
+
+- It's unclear whether this kind of encryption is even possible without
+  increasing the storage size of a signed pointer.  If the storage size can be
+  increased, brute-force atacks can be equally well mitigated by simply storing
+  a larger signature.
+
+- It would likely be impossible to implement a ``strip`` operation, which might
+  make debuggers and other out-of-process tools far more difficult to write, as
+  well as generally making primitive debugging more challenging.
+
+- Implementations can benefit from being able to extract the raw pointer
+  immediately from a signed pointer.  An Armv8.3 processor executing an
+  ``auth``-and-load instruction can perform the load and ``auth`` in parallel;
+  a processor which instead encrypted the pointer would be forced to perform
+  these operations serially.
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 71c1edc8f67b17..0ce3cbe3266efd 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -47,6 +47,12 @@ C++ Specific Potentially Breaking Changes
 
 ABI Changes in This Version
 ---------------------------
+- Fixed Microsoft name mangling of implicitly defined variables used for thread
+  safe static initialization of static local variables. This change resolves
+  incompatibilities with code compiled by MSVC but might introduce
+  incompatibilities with code compiled by earlier versions of Clang when an
+  inline member function that contains a static local variable with a dynamic
+  initializer is declared with ``__declspec(dllimport)``. (#GH83616).
 
 AST Dumping Potentially Breaking Changes
 ----------------------------------------
@@ -191,6 +197,9 @@ Removed Compiler Flags
 -------------------------
 
 - The ``-freroll-loops`` flag has been removed. It had no effect since Clang 13.
+- ``-m[no-]unaligned-access`` is removed for RISC-V and LoongArch.
+  ``-m[no-]strict-align``, also supported by GCC, should be used instead.
+  (`#85350 <https://github.com/llvm/llvm-project/pull/85350>`_.)
 
 Attribute Changes in Clang
 --------------------------
@@ -241,6 +250,10 @@ Improvements to Clang's diagnostics
   such as attempting to call ``free`` on an unallocated object. Fixes
   `#79443 <https://github.com/llvm/llvm-project/issues/79443>`_.
 
+- Clang no longer warns when the ``bitand`` operator is used with boolean
+  operands, distinguishing it from potential typographical errors or unintended
+  bitwise operations. Fixes #GH77601.
+
 Improvements to Clang's time-trace
 ----------------------------------
 
@@ -271,6 +284,9 @@ Bug Fixes in This Version
   for variables created through copy initialization having side-effects in C++17 and later.
   Fixes (#GH64356) (#GH79518).
 
+- Fix value of predefined macro ``__FUNCTION__`` in MSVC compatibility mode.
+  Fixes (#GH66114).
+
 - Clang now emits errors for explicit specializations/instatiations of lambda call
   operator.
   Fixes (#GH83267).
@@ -285,6 +301,9 @@ Bug Fixes in This Version
   by the C standard. This significantly improves codegen of `*` and `/` especially.
   Fixes (`#31205 <https://github.com/llvm/llvm-project/issues/31205>`_).
 
+- Fixes an assertion failure on invalid code when trying to define member
+  functions in lambdas.
+
 Bug Fixes to Compiler Builtins
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
@@ -378,6 +397,11 @@ Bug Fixes to C++ Support
   Fixes (#GH80997)
 - Fix an issue where missing set friend declaration in template class instantiation.
   Fixes (#GH84368).
+- Fixed a crash while checking constraints of a trailing requires-expression of a lambda, that the
+  expression references to an entity declared outside of the lambda. (#GH64808)
+- Clang's __builtin_bit_cast will now produce a constant value for records with empty bases. See:
+  (#GH82383)
+- Fix a crash when instantiating a lambda that captures ``this`` outside of its context. Fixes (#GH85343).
 
 Bug Fixes to AST Handling
 ^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -425,6 +449,8 @@ Arm and AArch64 Support
   like ``target_version`` or ``target_clones``.
 - Support has been added for the following processors (-mcpu identifiers in parenthesis):
     * Arm Cortex-A78AE (cortex-a78ae).
+    * Arm Cortex-A520AE (cortex-a520ae).
+    * Arm Cortex-A720AE (cortex-a720ae).
 
 Android Support
 ^^^^^^^^^^^^^^^
@@ -432,6 +458,26 @@ Android Support
 Windows Support
 ^^^^^^^^^^^^^^^
 
+- Clang-cl now supports function targets with intrinsic headers. This allows
+  for runtime feature detection of intrinsics. Previously under clang-cl
+  ``immintrin.h`` and similar intrinsic headers would only include the intrinsics
+  if building with that feature enabled at compile time, e.g. ``avxintrin.h``
+  would only be included if AVX was enabled at compile time. This was done to work
+  around include times from MSVC STL including ``intrin.h`` under clang-cl.
+  Clang-cl now provides ``intrin0.h`` for MSVC STL and therefore all intrinsic
+  features without requiring enablement at compile time.
+  Fixes: (`#53520 <https://github.com/llvm/llvm-project/issues/53520>`_)
+
+- Improved compile times with MSVC STL. MSVC provides ``intrin0.h`` which is a
+  header that only includes intrinsics that are used by MSVC STL to avoid the
+  use of ``intrin.h``. MSVC STL when compiled under clang uses ``intrin.h``
+  instead. Clang-cl now provides ``intrin0.h`` for the same compiler throughput
+  purposes as MSVC. Clang-cl also provides ``yvals_core.h`` to redefine
+  ``_STL_INTRIN_HEADER`` to expand to ``intrin0.h`` instead of ``intrin.h``.
+  This also means that if all intrinsic features are enabled at compile time
+  including STL headers will no longer slow down compile times since ``intrin.h``
+  is not included from MSVC STL.
+
 LoongArch Support
 ^^^^^^^^^^^^^^^^^
 
@@ -530,6 +576,8 @@ Python Binding Changes
 ----------------------
 
 - Exposed `CXRewriter` API as `class Rewriter`.
+- Add some missing kinds from Index.h (CursorKind: 149-156, 272-320, 420-437.
+  TemplateArgumentKind: 5-9. TypeKind: 161-175 and 178).
 
 OpenMP Support
 --------------
diff --git a/clang/docs/tools/clang-formatted-files.txt b/clang/docs/tools/clang-formatted-files.txt
index 40ab76fa26a9e3..70687c23b15e61 100644
--- a/clang/docs/tools/clang-formatted-files.txt
+++ b/clang/docs/tools/clang-formatted-files.txt
@@ -122,6 +122,7 @@ clang/include/clang/Analysis/MacroExpansionContext.h
 clang/include/clang/Analysis/Analyses/CalledOnceCheck.h
 clang/include/clang/Analysis/Analyses/CFGReachabilityAnalysis.h
 clang/include/clang/Analysis/Analyses/ExprMutationAnalyzer.h
+clang/include/clang/Analysis/FlowSensitive/AdornedCFG.h
 clang/include/clang/Analysis/FlowSensitive/ControlFlowContext.h
 clang/include/clang/Analysis/FlowSensitive/DataflowAnalysis.h
 clang/include/clang/Analysis/FlowSensitive/DataflowAnalysisContext.h
@@ -306,7 +307,7 @@ clang/include/clang-c/Index.h
 clang/lib/Analysis/CalledOnceCheck.cpp
 clang/lib/Analysis/CloneDetection.cpp
 clang/lib/Analysis/CodeInjector.cpp
-clang/lib/Analysis/FlowSensitive/ControlFlowContext.cpp
+clang/lib/Analysis/FlowSensitive/AdornedCFG.cpp
 clang/lib/Analysis/FlowSensitive/DataflowAnalysisContext.cpp
 clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp
 clang/lib/Analysis/FlowSensitive/DebugSupport.cpp
diff --git a/clang/include/clang-c/Index.h b/clang/include/clang-c/Index.h
index 3f3620609b6ddc..60db3cf0966c02 100644
--- a/clang/include/clang-c/Index.h
+++ b/clang/include/clang-c/Index.h
@@ -1675,7 +1675,7 @@ enum CXCursorKind {
   CXCursor_ConceptSpecializationExpr = 153,
 
   /**
-   * Expression that references a C++20 concept.
+   * Expression that references a C++20 requires expression.
    */
   CXCursor_RequiresExpr = 154,
 
diff --git a/clang/include/clang/AST/ASTContext.h b/clang/include/clang/AST/ASTContext.h
index ff6b64c7f72d57..002f36ecbbaa3f 100644
--- a/clang/include/clang/AST/ASTContext.h
+++ b/clang/include/clang/AST/ASTContext.h
@@ -250,6 +250,8 @@ class ASTContext : public RefCountedBase<ASTContext> {
       DependentBitIntTypes;
   llvm::FoldingSet<BTFTagAttributedType> BTFTagAttributedTypes;
 
+  mutable llvm::FoldingSet<CountAttributedType> CountAttributedTypes;
+
   mutable llvm::FoldingSet<QualifiedTemplateName> QualifiedTemplateNames;
   mutable llvm::FoldingSet<DependentTemplateName> DependentTemplateNames;
   mutable llvm::FoldingSet<SubstTemplateTemplateParmStorage>
@@ -1341,6 +1343,11 @@ class ASTContext : public RefCountedBase<ASTContext> {
     return CanQualType::CreateUnsafe(getPointerType((QualType) T));
   }
 
+  QualType
+  getCountAttributedType(QualType T, Expr *CountExpr, bool CountInBytes,
+                         bool OrNull,
+                         ArrayRef<TypeCoupledDeclRefInfo> DependentDecls) const;
+
   /// Return the uniqued reference to a type adjusted from the original
   /// type to a new type.
   QualType getAdjustedType(QualType Orig, QualType New) const;
diff --git a/clang/include/clang/AST/Availability.h b/clang/include/clang/AST/Availability.h
index ae3acbeffe7f18..26ae622e5b4496 100644
--- a/clang/include/clang/AST/Availability.h
+++ b/clang/include/clang/AST/Availability.h
@@ -67,6 +67,7 @@ struct AvailabilityInfo {
   VersionTuple Introduced;
   VersionTuple Deprecated;
   VersionTuple Obsoleted;
+  bool Unavailable = false;
   bool UnconditionallyDeprecated = false;
   bool UnconditionallyUnavailable = false;
 
@@ -75,6 +76,15 @@ struct AvailabilityInfo {
   /// Determine if this AvailabilityInfo represents the default availability.
   bool isDefault() const { return *this == AvailabilityInfo(); }
 
+  /// Check if the symbol has been obsoleted.
+  bool isObsoleted() const { return !Obsoleted.empty(); }
+
+  /// Check if the symbol is unavailable unconditionally or
+  /// on the active platform and os version.
+  bool isUnavailable() const {
+    return Unavailable || isUnconditionallyUnavailable();
+  }
+
   /// Check if the symbol is unconditionally deprecated.
   ///
   /// i.e. \code __attribute__((deprecated)) \endcode
@@ -88,9 +98,10 @@ struct AvailabilityInfo {
   }
 
   AvailabilityInfo(StringRef Domain, VersionTuple I, VersionTuple D,
-                   VersionTuple O, bool UD, bool UU)
+                   VersionTuple O, bool U, bool UD, bool UU)
       : Domain(Domain), Introduced(I), Deprecated(D), Obsoleted(O),
-        UnconditionallyDeprecated(UD), UnconditionallyUnavailable(UU) {}
+        Unavailable(U), UnconditionallyDeprecated(UD),
+        UnconditionallyUnavailable(UU) {}
 
   friend bool operator==(const AvailabilityInfo &Lhs,
                          const AvailabilityInfo &Rhs);
@@ -102,10 +113,10 @@ struct AvailabilityInfo {
 inline bool operator==(const AvailabilityInfo &Lhs,
                        const AvailabilityInfo &Rhs) {
   return std::tie(Lhs.Introduced, Lhs.Deprecated, Lhs.Obsoleted,
-                  Lhs.UnconditionallyDeprecated,
+                  Lhs.Unavailable, Lhs.UnconditionallyDeprecated,
                   Lhs.UnconditionallyUnavailable) ==
          std::tie(Rhs.Introduced, Rhs.Deprecated, Rhs.Obsoleted,
-                  Rhs.UnconditionallyDeprecated,
+                  Rhs.Unavailable, Rhs.UnconditionallyDeprecated,
                   Rhs.UnconditionallyUnavailable);
 }
 
diff --git a/clang/include/clang/AST/Expr.h b/clang/include/clang/AST/Expr.h
index 446bec4081e869..f9313f87be3800 100644
--- a/clang/include/clang/AST/Expr.h
+++ b/clang/include/clang/AST/Expr.h
@@ -2045,7 +2045,8 @@ class PredefinedExpr final
   }
 
   static std::string ComputeName(PredefinedIdentKind IK,
-                                 const Decl *CurrentDecl);
+                                 const Decl *CurrentDecl,
+                                 bool ForceElaboratedPrinting = false);
 
   SourceLocation getBeginLoc() const { return getLocation(); }
   SourceLocation getEndLoc() const { return getLocation(); }
diff --git a/clang/include/clang/AST/PropertiesBase.td b/clang/include/clang/AST/PropertiesBase.td
index 0270c086d06b6a..6df1d93a7ba2eb 100644
--- a/clang/include/clang/AST/PropertiesBase.td
+++ b/clang/include/clang/AST/PropertiesBase.td
@@ -143,6 +143,7 @@ def UInt32 : CountPropertyType<"uint32_t">;
 def UInt64 : CountPropertyType<"uint64_t">;
 def UnaryTypeTransformKind : EnumPropertyType<"UnaryTransformType::UTTKind">;
 def VectorKind : EnumPropertyType<"VectorKind">;
+def TypeCoupledDeclRefInfo : PropertyType;
 
 def ExceptionSpecInfo : PropertyType<"FunctionProtoType::ExceptionSpecInfo"> {
   let BufferElementTypes = [ QualType ];
diff --git a/clang/include/clang/AST/RecursiveASTVisitor.h b/clang/include/clang/AST/RecursiveASTVisitor.h
index 5080551ada4fc6..4a1ff222ecadcd 100644
--- a/clang/include/clang/AST/RecursiveASTVisitor.h
+++ b/clang/include/clang/AST/RecursiveASTVisitor.h
@@ -1110,6 +1110,12 @@ DEF_TRAVERSE_TYPE(InjectedClassNameType, {})
 DEF_TRAVERSE_TYPE(AttributedType,
                   { TRY_TO(TraverseType(T->getModifiedType())); })
 
+DEF_TRAVERSE_TYPE(CountAttributedType, {
+  if (T->getCountExpr())
+    TRY_TO(TraverseStmt(T->getCountExpr()));
+  TRY_TO(TraverseType(T->desugar()));
+})
+
 DEF_TRAVERSE_TYPE(BTFTagAttributedType,
                   { TRY_TO(TraverseType(T->getWrappedType())); })
 
@@ -1401,6 +1407,9 @@ DEF_TRAVERSE_TYPELOC(MacroQualifiedType,
 DEF_TRAVERSE_TYPELOC(AttributedType,
                      { TRY_TO(TraverseTypeLoc(TL.getModifiedLoc())); })
 
+DEF_TRAVERSE_TYPELOC(CountAttributedType,
+                     { TRY_TO(TraverseTypeLoc(TL.getInnerLoc())); })
+
 DEF_TRAVERSE_TYPELOC(BTFTagAttributedType,
                      { TRY_TO(TraverseTypeLoc(TL.getWrappedLoc())); })
 
diff --git a/clang/include/clang/AST/Type.h b/clang/include/clang/AST/Type.h
index 1942b0e67f65a3..b3bf23227a47f2 100644
--- a/clang/include/clang/AST/Type.h
+++ b/clang/include/clang/AST/Type.h
@@ -61,6 +61,7 @@ class BTFTypeTagAttr;
 class ExtQuals;
 class QualType;
 class ConceptDecl;
+class ValueDecl;
 class TagDecl;
 class TemplateParameterList;
 class Type;
@@ -2000,6 +2001,21 @@ class alignas(TypeAlignment) Type : public ExtQualsTypeCommonBase {
     unsigned NumExpansions;
   };
 
+  class CountAttributedTypeBitfields {
+    friend class CountAttributedType;
+
+    LLVM_PREFERRED_TYPE(TypeBitfields)
+    unsigned : NumTypeBits;
+
+    static constexpr unsigned NumCoupledDeclsBits = 4;
+    unsigned NumCoupledDecls : NumCoupledDeclsBits;
+    LLVM_PREFERRED_TYPE(bool)
+    unsigned CountInBytes : 1;
+    LLVM_PREFERRED_TYPE(bool)
+    unsigned OrNull : 1;
+  };
+  static_assert(sizeof(CountAttributedTypeBitfields) <= sizeof(unsigned));
+
   union {
     TypeBitfields TypeBits;
     ArrayTypeBitfields ArrayTypeBits;
@@ -2022,6 +2038,7 @@ class alignas(TypeAlignment) Type : public ExtQualsTypeCommonBase {
     DependentTemplateSpecializationTypeBitfields
       DependentTemplateSpecializationTypeBits;
     PackExpansionTypeBitfields PackExpansionTypeBits;
+    CountAttributedTypeBitfields CountAttributedTypeBits;
   };
 
 private:
@@ -2244,6 +2261,8 @@ class alignas(TypeAlignment) Type : public ExtQualsTypeCommonBase {
   bool isFloatingType() const;     // C99 6.2.5p11 (real floating + complex)
   bool isHalfType() const;         // OpenCL 6.1.1.1, NEON (IEEE 754-2008 half)
   bool isFloat16Type() const;      // C11 extension ISO/IEC TS 18661
+  bool isFloat32Type() const;
+  bool isDoubleType() const;
   bool isBFloat16Type() const;
   bool isFloat128Type() const;
   bool isIbm128Type() const;
@@ -2262,6 +2281,7 @@ class alignas(TypeAlignment) Type : public ExtQualsTypeCommonBase {
   bool isFunctionProtoType() const { return getAs<FunctionProtoType>(); }
   bool isPointerType() const;
   bool isAnyPointerType() const;   // Any C pointer or ObjC object pointer
+  bool isCountAttributedType() const;
   bool isBlockPointerType() const;
   bool isVoidPointerType() const;
   bool isReferenceType() const;
@@ -2722,6 +2742,14 @@ template <> const TemplateSpecializationType *Type::getAs() const;
 /// until it reaches an AttributedType or a non-sugared type.
 template <> const AttributedType *Type::getAs() const;
 
+/// This will check for a BoundsAttributedType by removing any existing
+/// sugar until it reaches an BoundsAttributedType or a non-sugared type.
+template <> const BoundsAttributedType *Type::getAs() const;
+
+/// This will check for a CountAttributedType by removing any existing
+/// sugar until it reaches an CountAttributedType or a non-sugared type.
+template <> const CountAttributedType *Type::getAs() const;
+
 // We can do canonical leaf types faster, because we don't have to
 // worry about preserving child type decoration.
 #define TYPE(Class, Base)
@@ -2920,6 +2948,136 @@ class PointerType : public Type, public llvm::FoldingSetNode {
   static bool classof(const Type *T) { return T->getTypeClass() == Pointer; }
 };
 
+/// [BoundsSafety] Represents information of declarations referenced by the
+/// arguments of the `counted_by` attribute and the likes.
+class TypeCoupledDeclRefInfo {
+public:
+  using BaseTy = llvm::PointerIntPair<ValueDecl *, 1, unsigned>;
+
+private:
+  enum {
+    DerefShift = 0,
+    DerefMask = 1,
+  };
+  BaseTy Data;
+
+public:
+  /// \p D is to a declaration referenced by the argument of attribute. \p Deref
+  /// indicates whether \p D is referenced as a dereferenced form, e.g., \p
+  /// Deref is true for `*n` in `int *__counted_by(*n)`.
+  TypeCoupledDeclRefInfo(ValueDecl *D = nullptr, bool Deref = false);
+
+  bool isDeref() const;
+  ValueDecl *getDecl() const;
+  unsigned getInt() const;
+  void *getOpaqueValue() const;
+  bool operator==(const TypeCoupledDeclRefInfo &Other) const;
+  void setFromOpaqueValue(void *V);
+};
+
+/// [BoundsSafety] Represents a parent type class for CountAttributedType and
+/// similar sugar types that will be introduced to represent a type with a
+/// bounds attribute.
+///
+/// Provides a common interface to navigate declarations referred to by the
+/// bounds expression.
+
+class BoundsAttributedType : public Type, public llvm::FoldingSetNode {
+  QualType WrappedTy;
+
+protected:
+  ArrayRef<TypeCoupledDeclRefInfo> Decls; // stored in trailing objects
+
+  BoundsAttributedType(TypeClass TC, QualType Wrapped, QualType Canon);
+
+public:
+  bool isSugared() const { return true; }
+  QualType desugar() const { return WrappedTy; }
+
+  using decl_iterator = const TypeCoupledDeclRefInfo *;
+  using decl_range = llvm::iterator_range<decl_iterator>;
+
+  decl_iterator dependent_decl_begin() const { return Decls.begin(); }
+  decl_iterator dependent_decl_end() const { return Decls.end(); }
+
+  unsigned getNumCoupledDecls() const { return Decls.size(); }
+
+  decl_range dependent_decls() const {
+    return decl_range(dependent_decl_begin(), dependent_decl_end());
+  }
+
+  ArrayRef<TypeCoupledDeclRefInfo> getCoupledDecls() const {
+    return {dependent_decl_begin(), dependent_decl_end()};
+  }
+
+  bool referencesFieldDecls() const;
+
+  static bool classof(const Type *T) {
+    // Currently, only `class CountAttributedType` inherits
+    // `BoundsAttributedType` but the subclass will grow as we add more bounds
+    // annotations.
+    switch (T->getTypeClass()) {
+    case CountAttributed:
+      return true;
+    default:
+      return false;
+    }
+  }
+};
+
+/// Represents a sugar type with `__counted_by` or `__sized_by` annotations,
+/// including their `_or_null` variants.
+class CountAttributedType final
+    : public BoundsAttributedType,
+      public llvm::TrailingObjects<CountAttributedType,
+                                   TypeCoupledDeclRefInfo> {
+  friend class ASTContext;
+
+  Expr *CountExpr;
+  /// \p CountExpr represents the argument of __counted_by or the likes. \p
+  /// CountInBytes indicates that \p CountExpr is a byte count (i.e.,
+  /// __sized_by(_or_null)) \p OrNull means it's an or_null variant (i.e.,
+  /// __counted_by_or_null or __sized_by_or_null) \p CoupledDecls contains the
+  /// list of declarations referenced by \p CountExpr, which the type depends on
+  /// for the bounds information.
+  CountAttributedType(QualType Wrapped, QualType Canon, Expr *CountExpr,
+                      bool CountInBytes, bool OrNull,
+                      ArrayRef<TypeCoupledDeclRefInfo> CoupledDecls);
+
+  unsigned numTrailingObjects(OverloadToken<TypeCoupledDeclRefInfo>) const {
+    return CountAttributedTypeBits.NumCoupledDecls;
+  }
+
+public:
+  enum DynamicCountPointerKind {
+    CountedBy = 0,
+    SizedBy,
+    CountedByOrNull,
+    SizedByOrNull,
+  };
+
+  Expr *getCountExpr() const { return CountExpr; }
+  bool isCountInBytes() const { return CountAttributedTypeBits.CountInBytes; }
+  bool isOrNull() const { return CountAttributedTypeBits.OrNull; }
+
+  DynamicCountPointerKind getKind() const {
+    if (isOrNull())
+      return isCountInBytes() ? SizedByOrNull : CountedByOrNull;
+    return isCountInBytes() ? SizedBy : CountedBy;
+  }
+
+  void Profile(llvm::FoldingSetNodeID &ID) {
+    Profile(ID, desugar(), CountExpr, isCountInBytes(), isOrNull());
+  }
+
+  static void Profile(llvm::FoldingSetNodeID &ID, QualType WrappedTy,
+                      Expr *CountExpr, bool CountInBytes, bool Nullable);
+
+  static bool classof(const Type *T) {
+    return T->getTypeClass() == CountAttributed;
+  }
+};
+
 /// Represents a type which was implicitly adjusted by the semantic
 /// engine for arbitrary reasons.  For example, array and function types can
 /// decay, and function types can have their calling conventions adjusted.
@@ -7452,6 +7610,14 @@ inline bool Type::isFloat16Type() const {
   return isSpecificBuiltinType(BuiltinType::Float16);
 }
 
+inline bool Type::isFloat32Type() const {
+  return isSpecificBuiltinType(BuiltinType::Float);
+}
+
+inline bool Type::isDoubleType() const {
+  return isSpecificBuiltinType(BuiltinType::Double);
+}
+
 inline bool Type::isBFloat16Type() const {
   return isSpecificBuiltinType(BuiltinType::BFloat16);
 }
diff --git a/clang/include/clang/AST/TypeLoc.h b/clang/include/clang/AST/TypeLoc.h
index 32028e877fc8dc..b09eb3539a4bad 100644
--- a/clang/include/clang/AST/TypeLoc.h
+++ b/clang/include/clang/AST/TypeLoc.h
@@ -1120,6 +1120,32 @@ class ObjCInterfaceTypeLoc : public ConcreteTypeLoc<ObjCObjectTypeLoc,
   }
 };
 
+struct BoundsAttributedLocInfo {};
+class BoundsAttributedTypeLoc
+    : public ConcreteTypeLoc<UnqualTypeLoc, BoundsAttributedTypeLoc,
+                             BoundsAttributedType, BoundsAttributedLocInfo> {
+public:
+  TypeLoc getInnerLoc() const { return getInnerTypeLoc(); }
+  QualType getInnerType() const { return getTypePtr()->desugar(); }
+  void initializeLocal(ASTContext &Context, SourceLocation Loc) {
+    // nothing to do
+  }
+  // LocalData is empty and TypeLocBuilder doesn't handle DataSize 1.
+  unsigned getLocalDataSize() const { return 0; }
+};
+
+class CountAttributedTypeLoc final
+    : public InheritingConcreteTypeLoc<BoundsAttributedTypeLoc,
+                                       CountAttributedTypeLoc,
+                                       CountAttributedType> {
+public:
+  Expr *getCountExpr() const { return getTypePtr()->getCountExpr(); }
+  bool isCountInBytes() const { return getTypePtr()->isCountInBytes(); }
+  bool isOrNull() const { return getTypePtr()->isOrNull(); }
+
+  SourceRange getLocalSourceRange() const;
+};
+
 struct MacroQualifiedLocInfo {
   SourceLocation ExpansionLoc;
 };
diff --git a/clang/include/clang/AST/TypeProperties.td b/clang/include/clang/AST/TypeProperties.td
index 0ba172a4035fdb..f3906cb7f86fbf 100644
--- a/clang/include/clang/AST/TypeProperties.td
+++ b/clang/include/clang/AST/TypeProperties.td
@@ -25,6 +25,25 @@ let Class = PointerType in {
   def : Creator<[{ return ctx.getPointerType(pointeeType); }]>;
 }
 
+let Class = CountAttributedType in {
+  def : Property<"WrappedTy", QualType> {
+    let Read = [{ node->desugar() }];
+  }
+  def : Property<"CountExpr", ExprRef> {
+    let Read = [{ node->getCountExpr() }];
+  }
+  def : Property<"CountInBytes", Bool> {
+    let Read = [{ node->isCountInBytes() }];
+  }
+  def : Property<"OrNull", Bool> {
+    let Read = [{ node->isOrNull() }];
+  }
+  def : Property<"CoupledDecls", Array<TypeCoupledDeclRefInfo>> {
+    let Read = [{ node->getCoupledDecls() }];
+  }
+  def : Creator<[{ return ctx.getCountAttributedType(WrappedTy, CountExpr, CountInBytes, OrNull, CoupledDecls); }]>;
+}
+
 let Class = AdjustedType in {
   def : Property<"originalType", QualType> {
     let Read = [{ node->getOriginalType() }];
diff --git a/clang/include/clang/Analysis/FlowSensitive/AdornedCFG.h b/clang/include/clang/Analysis/FlowSensitive/AdornedCFG.h
new file mode 100644
index 00000000000000..420f13ce11bfde
--- /dev/null
+++ b/clang/include/clang/Analysis/FlowSensitive/AdornedCFG.h
@@ -0,0 +1,96 @@
+//===-- AdornedCFG.h ------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+//  This file defines an AdornedCFG class that is used by dataflow analyses that
+//  run over Control-Flow Graphs (CFGs).
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_ANALYSIS_FLOWSENSITIVE_ADORNEDCFG_H
+#define LLVM_CLANG_ANALYSIS_FLOWSENSITIVE_ADORNEDCFG_H
+
+#include "clang/AST/ASTContext.h"
+#include "clang/AST/Decl.h"
+#include "clang/AST/Stmt.h"
+#include "clang/Analysis/CFG.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/Support/Error.h"
+#include <memory>
+#include <utility>
+
+namespace clang {
+namespace dataflow {
+
+/// Holds CFG with additional information derived from it that is needed to
+/// perform dataflow analysis.
+class AdornedCFG {
+public:
+  /// Builds an `AdornedCFG` from a `FunctionDecl`.
+  /// `Func.doesThisDeclarationHaveABody()` must be true, and
+  /// `Func.isTemplated()` must be false.
+  static llvm::Expected<AdornedCFG> build(const FunctionDecl &Func);
+
+  /// Builds an `AdornedCFG` from an AST node. `D` is the function in which
+  /// `S` resides. `D.isTemplated()` must be false.
+  static llvm::Expected<AdornedCFG> build(const Decl &D, Stmt &S,
+                                          ASTContext &C);
+
+  /// Returns the `Decl` containing the statement used to construct the CFG, if
+  /// available.
+  const Decl &getDecl() const { return ContainingDecl; }
+
+  /// Returns the CFG that is stored in this context.
+  const CFG &getCFG() const { return *Cfg; }
+
+  /// Returns a mapping from statements to basic blocks that contain them.
+  const llvm::DenseMap<const Stmt *, const CFGBlock *> &getStmtToBlock() const {
+    return StmtToBlock;
+  }
+
+  /// Returns whether `B` is reachable from the entry block.
+  bool isBlockReachable(const CFGBlock &B) const {
+    return BlockReachable[B.getBlockID()];
+  }
+
+  /// Returns whether `B` contains an expression that is consumed in a
+  /// different block than `B` (i.e. the parent of the expression is in a
+  /// different block).
+  /// This happens if there is control flow within a full-expression (triggered
+  /// by `&&`, `||`, or the conditional operator). Note that the operands of
+  /// these operators are not the only expressions that can be consumed in a
+  /// different block. For example, in the function call
+  /// `f(&i, cond() ? 1 : 0)`, `&i` is in a different block than the `CallExpr`.
+  bool containsExprConsumedInDifferentBlock(const CFGBlock &B) const {
+    return ContainsExprConsumedInDifferentBlock.contains(&B);
+  }
+
+private:
+  AdornedCFG(
+      const Decl &D, std::unique_ptr<CFG> Cfg,
+      llvm::DenseMap<const Stmt *, const CFGBlock *> StmtToBlock,
+      llvm::BitVector BlockReachable,
+      llvm::DenseSet<const CFGBlock *> ContainsExprConsumedInDifferentBlock)
+      : ContainingDecl(D), Cfg(std::move(Cfg)),
+        StmtToBlock(std::move(StmtToBlock)),
+        BlockReachable(std::move(BlockReachable)),
+        ContainsExprConsumedInDifferentBlock(
+            std::move(ContainsExprConsumedInDifferentBlock)) {}
+
+  /// The `Decl` containing the statement used to construct the CFG.
+  const Decl &ContainingDecl;
+  std::unique_ptr<CFG> Cfg;
+  llvm::DenseMap<const Stmt *, const CFGBlock *> StmtToBlock;
+  llvm::BitVector BlockReachable;
+  llvm::DenseSet<const CFGBlock *> ContainsExprConsumedInDifferentBlock;
+};
+
+} // namespace dataflow
+} // namespace clang
+
+#endif // LLVM_CLANG_ANALYSIS_FLOWSENSITIVE_ADORNEDCFG_H
diff --git a/clang/include/clang/Analysis/FlowSensitive/ControlFlowContext.h b/clang/include/clang/Analysis/FlowSensitive/ControlFlowContext.h
index 9a0a00f3c01343..3972962d0b2daa 100644
--- a/clang/include/clang/Analysis/FlowSensitive/ControlFlowContext.h
+++ b/clang/include/clang/Analysis/FlowSensitive/ControlFlowContext.h
@@ -6,89 +6,20 @@
 //
 //===----------------------------------------------------------------------===//
 //
-//  This file defines a ControlFlowContext class that is used by dataflow
-//  analyses that run over Control-Flow Graphs (CFGs).
+//  This file defines a deprecated alias for AdornedCFG.
 //
 //===----------------------------------------------------------------------===//
 
 #ifndef LLVM_CLANG_ANALYSIS_FLOWSENSITIVE_CONTROLFLOWCONTEXT_H
 #define LLVM_CLANG_ANALYSIS_FLOWSENSITIVE_CONTROLFLOWCONTEXT_H
 
-#include "clang/AST/ASTContext.h"
-#include "clang/AST/Decl.h"
-#include "clang/AST/Stmt.h"
-#include "clang/Analysis/CFG.h"
-#include "llvm/ADT/BitVector.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/Support/Error.h"
-#include <memory>
-#include <utility>
+#include "clang/Analysis/FlowSensitive/AdornedCFG.h"
 
 namespace clang {
 namespace dataflow {
 
-/// Holds CFG and other derived context that is needed to perform dataflow
-/// analysis.
-class ControlFlowContext {
-public:
-  /// Builds a ControlFlowContext from a `FunctionDecl`.
-  /// `Func.doesThisDeclarationHaveABody()` must be true, and
-  /// `Func.isTemplated()` must be false.
-  static llvm::Expected<ControlFlowContext> build(const FunctionDecl &Func);
-
-  /// Builds a ControlFlowContext from an AST node. `D` is the function in which
-  /// `S` resides. `D.isTemplated()` must be false.
-  static llvm::Expected<ControlFlowContext> build(const Decl &D, Stmt &S,
-                                                  ASTContext &C);
-
-  /// Returns the `Decl` containing the statement used to construct the CFG, if
-  /// available.
-  const Decl &getDecl() const { return ContainingDecl; }
-
-  /// Returns the CFG that is stored in this context.
-  const CFG &getCFG() const { return *Cfg; }
-
-  /// Returns a mapping from statements to basic blocks that contain them.
-  const llvm::DenseMap<const Stmt *, const CFGBlock *> &getStmtToBlock() const {
-    return StmtToBlock;
-  }
-
-  /// Returns whether `B` is reachable from the entry block.
-  bool isBlockReachable(const CFGBlock &B) const {
-    return BlockReachable[B.getBlockID()];
-  }
-
-  /// Returns whether `B` contains an expression that is consumed in a
-  /// different block than `B` (i.e. the parent of the expression is in a
-  /// different block).
-  /// This happens if there is control flow within a full-expression (triggered
-  /// by `&&`, `||`, or the conditional operator). Note that the operands of
-  /// these operators are not the only expressions that can be consumed in a
-  /// different block. For example, in the function call
-  /// `f(&i, cond() ? 1 : 0)`, `&i` is in a different block than the `CallExpr`.
-  bool containsExprConsumedInDifferentBlock(const CFGBlock &B) const {
-    return ContainsExprConsumedInDifferentBlock.contains(&B);
-  }
-
-private:
-  ControlFlowContext(
-      const Decl &D, std::unique_ptr<CFG> Cfg,
-      llvm::DenseMap<const Stmt *, const CFGBlock *> StmtToBlock,
-      llvm::BitVector BlockReachable,
-      llvm::DenseSet<const CFGBlock *> ContainsExprConsumedInDifferentBlock)
-      : ContainingDecl(D), Cfg(std::move(Cfg)),
-        StmtToBlock(std::move(StmtToBlock)),
-        BlockReachable(std::move(BlockReachable)),
-        ContainsExprConsumedInDifferentBlock(
-            std::move(ContainsExprConsumedInDifferentBlock)) {}
-
-  /// The `Decl` containing the statement used to construct the CFG.
-  const Decl &ContainingDecl;
-  std::unique_ptr<CFG> Cfg;
-  llvm::DenseMap<const Stmt *, const CFGBlock *> StmtToBlock;
-  llvm::BitVector BlockReachable;
-  llvm::DenseSet<const CFGBlock *> ContainsExprConsumedInDifferentBlock;
-};
+// This is a deprecated alias. Use `AdornedCFG` instead.
+using ControlFlowContext = AdornedCFG;
 
 } // namespace dataflow
 } // namespace clang
diff --git a/clang/include/clang/Analysis/FlowSensitive/DataflowAnalysis.h b/clang/include/clang/Analysis/FlowSensitive/DataflowAnalysis.h
index 3c84704d0d6ceb..67eccdd030dcdd 100644
--- a/clang/include/clang/Analysis/FlowSensitive/DataflowAnalysis.h
+++ b/clang/include/clang/Analysis/FlowSensitive/DataflowAnalysis.h
@@ -22,7 +22,7 @@
 
 #include "clang/AST/ASTContext.h"
 #include "clang/Analysis/CFG.h"
-#include "clang/Analysis/FlowSensitive/ControlFlowContext.h"
+#include "clang/Analysis/FlowSensitive/AdornedCFG.h"
 #include "clang/Analysis/FlowSensitive/DataflowEnvironment.h"
 #include "clang/Analysis/FlowSensitive/DataflowLattice.h"
 #include "clang/Analysis/FlowSensitive/MatchSwitch.h"
@@ -195,8 +195,7 @@ template <typename AnalysisT>
 llvm::Expected<std::vector<
     std::optional<DataflowAnalysisState<typename AnalysisT::Lattice>>>>
 runDataflowAnalysis(
-    const ControlFlowContext &CFCtx, AnalysisT &Analysis,
-    const Environment &InitEnv,
+    const AdornedCFG &ACFG, AnalysisT &Analysis, const Environment &InitEnv,
     std::function<void(const CFGElement &, const DataflowAnalysisState<
                                                typename AnalysisT::Lattice> &)>
         PostVisitCFG = nullptr,
@@ -218,7 +217,7 @@ runDataflowAnalysis(
   }
 
   auto TypeErasedBlockStates = runTypeErasedDataflowAnalysis(
-      CFCtx, Analysis, InitEnv, PostVisitCFGClosure, MaxBlockVisits);
+      ACFG, Analysis, InitEnv, PostVisitCFGClosure, MaxBlockVisits);
   if (!TypeErasedBlockStates)
     return TypeErasedBlockStates.takeError();
 
@@ -280,8 +279,7 @@ llvm::Expected<llvm::SmallVector<Diagnostic>> diagnoseFunction(
         Diagnoser,
     std::int64_t MaxSATIterations = 1'000'000'000,
     std::int32_t MaxBlockVisits = 20'000) {
-  llvm::Expected<ControlFlowContext> Context =
-      ControlFlowContext::build(FuncDecl);
+  llvm::Expected<AdornedCFG> Context = AdornedCFG::build(FuncDecl);
   if (!Context)
     return Context.takeError();
 
diff --git a/clang/include/clang/Analysis/FlowSensitive/DataflowAnalysisContext.h b/clang/include/clang/Analysis/FlowSensitive/DataflowAnalysisContext.h
index 98bdf037880ab0..909a91059438ca 100644
--- a/clang/include/clang/Analysis/FlowSensitive/DataflowAnalysisContext.h
+++ b/clang/include/clang/Analysis/FlowSensitive/DataflowAnalysisContext.h
@@ -18,8 +18,8 @@
 #include "clang/AST/Decl.h"
 #include "clang/AST/Expr.h"
 #include "clang/AST/TypeOrdering.h"
+#include "clang/Analysis/FlowSensitive/AdornedCFG.h"
 #include "clang/Analysis/FlowSensitive/Arena.h"
-#include "clang/Analysis/FlowSensitive/ControlFlowContext.h"
 #include "clang/Analysis/FlowSensitive/Solver.h"
 #include "clang/Analysis/FlowSensitive/StorageLocation.h"
 #include "clang/Analysis/FlowSensitive/Value.h"
@@ -183,9 +183,9 @@ class DataflowAnalysisContext {
   LLVM_DUMP_METHOD void dumpFlowCondition(Atom Token,
                                           llvm::raw_ostream &OS = llvm::dbgs());
 
-  /// Returns the `ControlFlowContext` registered for `F`, if any. Otherwise,
+  /// Returns the `AdornedCFG` registered for `F`, if any. Otherwise,
   /// returns null.
-  const ControlFlowContext *getControlFlowContext(const FunctionDecl *F);
+  const AdornedCFG *getAdornedCFG(const FunctionDecl *F);
 
   const Options &getOptions() { return Opts; }
 
@@ -296,7 +296,7 @@ class DataflowAnalysisContext {
   llvm::DenseMap<Atom, const Formula *> FlowConditionConstraints;
   const Formula *Invariant = nullptr;
 
-  llvm::DenseMap<const FunctionDecl *, ControlFlowContext> FunctionContexts;
+  llvm::DenseMap<const FunctionDecl *, AdornedCFG> FunctionContexts;
 
   // Fields modeled by environments covered by this context.
   FieldSet ModeledFields;
diff --git a/clang/include/clang/Analysis/FlowSensitive/Logger.h b/clang/include/clang/Analysis/FlowSensitive/Logger.h
index f4bd39f6ed49ef..f2e64810d00506 100644
--- a/clang/include/clang/Analysis/FlowSensitive/Logger.h
+++ b/clang/include/clang/Analysis/FlowSensitive/Logger.h
@@ -15,7 +15,7 @@
 
 namespace clang::dataflow {
 // Forward declarations so we can use Logger anywhere in the framework.
-class ControlFlowContext;
+class AdornedCFG;
 class TypeErasedDataflowAnalysis;
 struct TypeErasedDataflowAnalysisState;
 
@@ -40,8 +40,8 @@ class Logger {
 
   /// Called by the framework as we start analyzing a new function or statement.
   /// Forms a pair with endAnalysis().
-  virtual void beginAnalysis(const ControlFlowContext &,
-                             TypeErasedDataflowAnalysis &) {}
+  virtual void beginAnalysis(const AdornedCFG &, TypeErasedDataflowAnalysis &) {
+  }
   virtual void endAnalysis() {}
 
   // At any time during the analysis, we're computing the state for some target
diff --git a/clang/include/clang/Analysis/FlowSensitive/RecordOps.h b/clang/include/clang/Analysis/FlowSensitive/RecordOps.h
index 783e53e980aa2c..8fad45fc11d81e 100644
--- a/clang/include/clang/Analysis/FlowSensitive/RecordOps.h
+++ b/clang/include/clang/Analysis/FlowSensitive/RecordOps.h
@@ -31,7 +31,11 @@ namespace dataflow {
 ///
 /// Requirements:
 ///
-///  `Src` and `Dst` must have the same canonical unqualified type.
+///  Either:
+///    - `Src` and `Dest` must have the same canonical unqualified type, or
+///    - The type of `Src` must be derived from `Dest`, or
+///    - The type of `Dest` must be derived from `Src` (in this case, any fields
+///      that are only present in `Dest` are not overwritten).
 void copyRecord(RecordStorageLocation &Src, RecordStorageLocation &Dst,
                 Environment &Env);
 
diff --git a/clang/include/clang/Analysis/FlowSensitive/Transfer.h b/clang/include/clang/Analysis/FlowSensitive/Transfer.h
index 7713df747cb76e..ed148250d8eb29 100644
--- a/clang/include/clang/Analysis/FlowSensitive/Transfer.h
+++ b/clang/include/clang/Analysis/FlowSensitive/Transfer.h
@@ -29,12 +29,12 @@ class StmtToEnvMap {
   // `CurState` is the pending state currently associated with this block. These
   // are supplied separately as the pending state for the current block may not
   // yet be represented in `BlockToState`.
-  StmtToEnvMap(const ControlFlowContext &CFCtx,
+  StmtToEnvMap(const AdornedCFG &ACFG,
                llvm::ArrayRef<std::optional<TypeErasedDataflowAnalysisState>>
                    BlockToState,
                unsigned CurBlockID,
                const TypeErasedDataflowAnalysisState &CurState)
-      : CFCtx(CFCtx), BlockToState(BlockToState), CurBlockID(CurBlockID),
+      : ACFG(ACFG), BlockToState(BlockToState), CurBlockID(CurBlockID),
         CurState(CurState) {}
 
   /// Returns the environment of the basic block that contains `S`.
@@ -42,7 +42,7 @@ class StmtToEnvMap {
   const Environment *getEnvironment(const Stmt &S) const;
 
 private:
-  const ControlFlowContext &CFCtx;
+  const AdornedCFG &ACFG;
   llvm::ArrayRef<std::optional<TypeErasedDataflowAnalysisState>> BlockToState;
   unsigned CurBlockID;
   const TypeErasedDataflowAnalysisState &CurState;
diff --git a/clang/include/clang/Analysis/FlowSensitive/TypeErasedDataflowAnalysis.h b/clang/include/clang/Analysis/FlowSensitive/TypeErasedDataflowAnalysis.h
index a0ca7440230b04..b3722bf3ec80a8 100644
--- a/clang/include/clang/Analysis/FlowSensitive/TypeErasedDataflowAnalysis.h
+++ b/clang/include/clang/Analysis/FlowSensitive/TypeErasedDataflowAnalysis.h
@@ -21,7 +21,7 @@
 #include "clang/AST/ASTContext.h"
 #include "clang/AST/Stmt.h"
 #include "clang/Analysis/CFG.h"
-#include "clang/Analysis/FlowSensitive/ControlFlowContext.h"
+#include "clang/Analysis/FlowSensitive/AdornedCFG.h"
 #include "clang/Analysis/FlowSensitive/DataflowAnalysisContext.h"
 #include "clang/Analysis/FlowSensitive/DataflowEnvironment.h"
 #include "clang/Analysis/FlowSensitive/DataflowLattice.h"
@@ -146,7 +146,7 @@ struct TypeErasedDataflowAnalysisState {
 /// from converging.
 llvm::Expected<std::vector<std::optional<TypeErasedDataflowAnalysisState>>>
 runTypeErasedDataflowAnalysis(
-    const ControlFlowContext &CFCtx, TypeErasedDataflowAnalysis &Analysis,
+    const AdornedCFG &ACFG, TypeErasedDataflowAnalysis &Analysis,
     const Environment &InitEnv,
     std::function<void(const CFGElement &,
                        const TypeErasedDataflowAnalysisState &)>
diff --git a/clang/include/clang/Basic/AllDiagnostics.h b/clang/include/clang/Basic/AllDiagnostics.h
index cc6aa631534a5d..e64634cc138f7f 100644
--- a/clang/include/clang/Basic/AllDiagnostics.h
+++ b/clang/include/clang/Basic/AllDiagnostics.h
@@ -20,6 +20,7 @@
 #include "clang/Basic/DiagnosticCrossTU.h"
 #include "clang/Basic/DiagnosticDriver.h"
 #include "clang/Basic/DiagnosticFrontend.h"
+#include "clang/Basic/DiagnosticInstallAPI.h"
 #include "clang/Basic/DiagnosticLex.h"
 #include "clang/Basic/DiagnosticParse.h"
 #include "clang/Basic/DiagnosticSema.h"
diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td
index 67d87eca16ede8..3e03e55612645b 100644
--- a/clang/include/clang/Basic/Attr.td
+++ b/clang/include/clang/Basic/Attr.td
@@ -2193,6 +2193,15 @@ def TypeNullUnspecified : TypeAttr {
   let Documentation = [TypeNullUnspecifiedDocs];
 }
 
+def CountedBy : DeclOrTypeAttr {
+  let Spellings = [Clang<"counted_by">];
+  let Subjects = SubjectList<[Field], ErrorDiag>;
+  let Args = [ExprArgument<"Count">, IntArgument<"NestedLevel">];
+  let ParseArgumentsAsUnevaluated = 1;
+  let Documentation = [CountedByDocs];
+  let LangOpts = [COnly];
+}
+
 // This is a marker used to indicate that an __unsafe_unretained qualifier was
 // ignored because ARC is not enabled. The usual representation for this
 // qualifier is as an ObjCOwnership attribute with Kind == "none".
@@ -4487,21 +4496,3 @@ def CodeAlign: StmtAttr {
     static constexpr int MaximumAlignment = 4096;
   }];
 }
-
-def CountedBy : InheritableAttr {
-  let Spellings = [Clang<"counted_by">];
-  let Subjects = SubjectList<[Field]>;
-  let Args = [IdentifierArgument<"CountedByField">];
-  let Documentation = [CountedByDocs];
-  let LangOpts = [COnly];
-  // FIXME: This is ugly. Let using a DeclArgument would be nice, but a Decl
-  // isn't yet available due to the fact that we're still parsing the
-  // structure. Maybe that code could be changed sometime in the future.
-  code AdditionalMembers = [{
-    private:
-      SourceRange CountedByFieldLoc;
-    public:
-      SourceRange getCountedByFieldLoc() const { return CountedByFieldLoc; }
-      void setCountedByFieldLoc(SourceRange Loc) { CountedByFieldLoc = Loc; }
-  }];
-}
diff --git a/clang/include/clang/Basic/Builtins.td b/clang/include/clang/Basic/Builtins.td
index 9c703377ca8d3e..491c9d8954130f 100644
--- a/clang/include/clang/Basic/Builtins.td
+++ b/clang/include/clang/Basic/Builtins.td
@@ -4354,6 +4354,43 @@ def CoroSuspend : CoroLangBuiltin {
   let Prototype = "char(_Constant bool)";
 }
 
+// Pointer authentication builtins.
+def PtrauthStrip : Builtin {
+  let Spellings = ["__builtin_ptrauth_strip"];
+  let Attributes = [CustomTypeChecking, NoThrow, Const];
+  let Prototype = "void*(void*,int)";
+}
+
+def PtrauthBlendDiscriminator : Builtin {
+  let Spellings = ["__builtin_ptrauth_blend_discriminator"];
+  let Attributes = [CustomTypeChecking, NoThrow, Const];
+  let Prototype = "size_t(void*,int)";
+}
+
+def PtrauthSignUnauthenticated : Builtin {
+  let Spellings = ["__builtin_ptrauth_sign_unauthenticated"];
+  let Attributes = [CustomTypeChecking, NoThrow, Const];
+  let Prototype = "void*(void*,int,void*)";
+}
+
+def PtrauthSignGenericData : Builtin {
+  let Spellings = ["__builtin_ptrauth_sign_generic_data"];
+  let Attributes = [CustomTypeChecking, NoThrow, Const];
+  let Prototype = "size_t(void*,void*)";
+}
+
+def PtrauthAuthAndResign : Builtin {
+  let Spellings = ["__builtin_ptrauth_auth_and_resign"];
+  let Attributes = [CustomTypeChecking, NoThrow];
+  let Prototype = "void*(void*,int,void*,int,void*)";
+}
+
+def PtrauthAuth : Builtin {
+  let Spellings = ["__builtin_ptrauth_auth"];
+  let Attributes = [CustomTypeChecking, NoThrow];
+  let Prototype = "void*(void*,int,void*)";
+}
+
 // OpenCL v2.0 s6.13.16, s9.17.3.5 - Pipe functions.
 // We need the generic prototype, since the packet type could be anything.
 def ReadPipe : OCLPipeLangBuiltin {
@@ -4554,6 +4591,12 @@ def HLSLWaveActiveCountBits : LangBuiltin<"HLSL_LANG"> {
   let Prototype = "unsigned int(bool)";
 }
 
+def HLSLClamp : LangBuiltin<"HLSL_LANG"> {
+  let Spellings = ["__builtin_hlsl_elementwise_clamp"];
+  let Attributes = [NoThrow, Const];
+  let Prototype = "void(...)";
+}
+
 def HLSLCreateHandle : LangBuiltin<"HLSL_LANG"> {
   let Spellings = ["__builtin_hlsl_create_handle"];
   let Attributes = [NoThrow, Const];
@@ -4572,6 +4615,12 @@ def HLSLFrac : LangBuiltin<"HLSL_LANG"> {
   let Prototype = "void(...)";
 }
 
+def HLSLIsinf : LangBuiltin<"HLSL_LANG"> {
+  let Spellings = ["__builtin_hlsl_elementwise_isinf"];
+  let Attributes = [NoThrow, Const];
+  let Prototype = "void(...)";
+}
+
 def HLSLLerp : LangBuiltin<"HLSL_LANG"> {
   let Spellings = ["__builtin_hlsl_lerp"];
   let Attributes = [NoThrow, Const];
@@ -4590,6 +4639,12 @@ def HLSLRcp : LangBuiltin<"HLSL_LANG"> {
   let Prototype = "void(...)";
 }
 
+def HLSLRSqrt : LangBuiltin<"HLSL_LANG"> {
+  let Spellings = ["__builtin_hlsl_elementwise_rsqrt"];
+  let Attributes = [NoThrow, Const];
+  let Prototype = "void(...)";
+}
+
 // Builtins for XRay.
 def XRayCustomEvent : Builtin {
   let Spellings = ["__xray_customevent"];
diff --git a/clang/include/clang/Basic/CMakeLists.txt b/clang/include/clang/Basic/CMakeLists.txt
index 7785fb430c069b..7d53c751c13ac4 100644
--- a/clang/include/clang/Basic/CMakeLists.txt
+++ b/clang/include/clang/Basic/CMakeLists.txt
@@ -12,6 +12,7 @@ clang_diag_gen(Common)
 clang_diag_gen(CrossTU)
 clang_diag_gen(Driver)
 clang_diag_gen(Frontend)
+clang_diag_gen(InstallAPI)
 clang_diag_gen(Lex)
 clang_diag_gen(Parse)
 clang_diag_gen(Refactoring)
diff --git a/clang/include/clang/Basic/Diagnostic.td b/clang/include/clang/Basic/Diagnostic.td
index 8d66e265fbaef0..0b8b3af939ba0e 100644
--- a/clang/include/clang/Basic/Diagnostic.td
+++ b/clang/include/clang/Basic/Diagnostic.td
@@ -162,6 +162,7 @@ include "DiagnosticCommonKinds.td"
 include "DiagnosticCrossTUKinds.td"
 include "DiagnosticDriverKinds.td"
 include "DiagnosticFrontendKinds.td"
+include "DiagnosticInstallAPIKinds.td"
 include "DiagnosticLexKinds.td"
 include "DiagnosticParseKinds.td"
 include "DiagnosticRefactoringKinds.td"
diff --git a/clang/include/clang/Basic/DiagnosticCommonKinds.td b/clang/include/clang/Basic/DiagnosticCommonKinds.td
index 08bb1d81ba29f1..43e132e5665850 100644
--- a/clang/include/clang/Basic/DiagnosticCommonKinds.td
+++ b/clang/include/clang/Basic/DiagnosticCommonKinds.td
@@ -356,6 +356,8 @@ def warn_target_unrecognized_env : Warning<
 def warn_knl_knm_isa_support_removed : Warning<
   "KNL, KNM related Intel Xeon Phi CPU's specific ISA's supports will be removed in LLVM 19.">,
   InGroup<DiagGroup<"knl-knm-isa-support-removed">>;
+def err_target_unsupported_abi_with_fpu : Error<
+  "'%0' ABI is not supported with FPU">;
 
 // Source manager
 def err_cannot_open_file : Error<"cannot open file '%0': %1">, DefaultFatal;
diff --git a/clang/include/clang/Basic/DiagnosticFrontendKinds.td b/clang/include/clang/Basic/DiagnosticFrontendKinds.td
index 794a0a82be6d74..ba23cf84c5e343 100644
--- a/clang/include/clang/Basic/DiagnosticFrontendKinds.td
+++ b/clang/include/clang/Basic/DiagnosticFrontendKinds.td
@@ -281,6 +281,8 @@ def err_builtin_needs_feature : Error<"%0 needs target feature %1">;
 def err_function_needs_feature : Error<
   "always_inline function %1 requires target feature '%2', but would "
   "be inlined into function %0 that is compiled without support for '%2'">;
+
+let CategoryName = "Codegen ABI Check" in {
 def err_function_always_inline_attribute_mismatch : Error<
   "always_inline function %1 and its caller %0 have mismatching %2 attributes">;
 def err_function_always_inline_new_za : Error<
@@ -292,6 +294,10 @@ def warn_avx_calling_convention
       InGroup<DiagGroup<"psabi">>;
 def err_avx_calling_convention : Error<warn_avx_calling_convention.Summary>;
 
+def err_target_unsupported_type_for_abi
+    : Error<"%0 requires %1 type support, but ABI '%2' does not support it">;
+}
+
 def err_alias_to_undefined : Error<
   "%select{alias|ifunc}0 must point to a defined "
   "%select{variable or |}1function">;
diff --git a/clang/include/clang/Basic/DiagnosticGroups.td b/clang/include/clang/Basic/DiagnosticGroups.td
index 3f14167d6b8469..dee555f783cc45 100644
--- a/clang/include/clang/Basic/DiagnosticGroups.td
+++ b/clang/include/clang/Basic/DiagnosticGroups.td
@@ -875,6 +875,7 @@ def ZeroLengthArray : DiagGroup<"zero-length-array">;
 def GNUZeroLineDirective : DiagGroup<"gnu-zero-line-directive">;
 def GNUZeroVariadicMacroArguments : DiagGroup<"gnu-zero-variadic-macro-arguments">;
 def MisleadingIndentation : DiagGroup<"misleading-indentation">;
+def PtrAuthNullPointers : DiagGroup<"ptrauth-null-pointers">;
 
 // This covers both the deprecated case (in C++98)
 // and the extension case (in C++11 onwards).
@@ -1507,3 +1508,7 @@ def ReadOnlyPlacementChecks : DiagGroup<"read-only-types">;
 // Warnings and fixes to support the "safe buffers" programming model.
 def UnsafeBufferUsageInContainer : DiagGroup<"unsafe-buffer-usage-in-container">;
 def UnsafeBufferUsage : DiagGroup<"unsafe-buffer-usage", [UnsafeBufferUsageInContainer]>;
+
+// Warnings and notes InstallAPI verification.
+def InstallAPIViolation : DiagGroup<"installapi-violation">;
+
diff --git a/clang/include/clang/Basic/DiagnosticIDs.h b/clang/include/clang/Basic/DiagnosticIDs.h
index 0cdda42793f6f0..5ff782c7f8c7e8 100644
--- a/clang/include/clang/Basic/DiagnosticIDs.h
+++ b/clang/include/clang/Basic/DiagnosticIDs.h
@@ -42,6 +42,7 @@ namespace clang {
       DIAG_SIZE_SEMA          = 4500,
       DIAG_SIZE_ANALYSIS      =  100,
       DIAG_SIZE_REFACTORING   = 1000,
+      DIAG_SIZE_INSTALLAPI    =  100,
     };
     // Start position for diagnostics.
     enum {
@@ -57,7 +58,8 @@ namespace clang {
       DIAG_START_SEMA          = DIAG_START_CROSSTU       + static_cast<int>(DIAG_SIZE_CROSSTU),
       DIAG_START_ANALYSIS      = DIAG_START_SEMA          + static_cast<int>(DIAG_SIZE_SEMA),
       DIAG_START_REFACTORING   = DIAG_START_ANALYSIS      + static_cast<int>(DIAG_SIZE_ANALYSIS),
-      DIAG_UPPER_LIMIT         = DIAG_START_REFACTORING   + static_cast<int>(DIAG_SIZE_REFACTORING)
+      DIAG_START_INSTALLAPI    = DIAG_START_REFACTORING   + static_cast<int>(DIAG_SIZE_REFACTORING),
+      DIAG_UPPER_LIMIT         = DIAG_START_INSTALLAPI    + static_cast<int>(DIAG_SIZE_INSTALLAPI)
     };
 
     class CustomDiagInfo;
@@ -276,6 +278,9 @@ class DiagnosticIDs : public RefCountedBase<DiagnosticIDs> {
   /// category.
   static bool isARCDiagnostic(unsigned DiagID);
 
+  /// Return true if a given diagnostic is a codegen-time ABI check.
+  static bool isCodegenABICheckDiagnostic(unsigned DiagID);
+
   /// Enumeration describing how the emission of a diagnostic should
   /// be treated when it occurs during C++ template argument deduction.
   enum SFINAEResponse {
diff --git a/clang/include/clang/Basic/DiagnosticInstallAPI.h b/clang/include/clang/Basic/DiagnosticInstallAPI.h
new file mode 100644
index 00000000000000..a76f6e087a2b0a
--- /dev/null
+++ b/clang/include/clang/Basic/DiagnosticInstallAPI.h
@@ -0,0 +1,26 @@
+//===--- DiagnosticInstallAPI.h - Diagnostics for InstallAPI-----*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_BASIC_DIAGNOSTICINSTALLAPI_H
+#define LLVM_CLANG_BASIC_DIAGNOSTICINSTALLAPI_H
+
+#include "clang/Basic/Diagnostic.h"
+namespace clang {
+namespace diag {
+enum {
+#define DIAG(ENUM, FLAGS, DEFAULT_MAPPING, DESC, GROUP, SFINAE, NOWERROR,      \
+             SHOWINSYSHEADER, SHOWINSYSMACRO, DEFERRABLE, CATEGORY)            \
+  ENUM,
+#define INSTALLAPISTART
+#include "clang/Basic/DiagnosticInstallAPIKinds.inc"
+#undef DIAG
+  NUM_BUILTIN_INSTALLAPI_DIAGNOSTICS
+};
+} // namespace diag
+} // namespace clang
+#endif // LLVM_CLANG_BASIC_DIAGNOSTICINSTALLAPI_H
diff --git a/clang/include/clang/Basic/DiagnosticInstallAPIKinds.td b/clang/include/clang/Basic/DiagnosticInstallAPIKinds.td
new file mode 100644
index 00000000000000..f99a5fca64cb46
--- /dev/null
+++ b/clang/include/clang/Basic/DiagnosticInstallAPIKinds.td
@@ -0,0 +1,43 @@
+//==--- DiagnosticInstallAPIKinds.td - installapi diagnostics -------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// InstallAPI Diagnostics
+//===----------------------------------------------------------------------===//
+
+let Component = "InstallAPI" in {
+let CategoryName = "Command line" in {
+def err_cannot_write_file : Error<"cannot write file '%0': %1">;
+def err_no_install_name : Error<"no install name specified: add -install_name <path>">;
+def err_no_output_file: Error<"no output file specified">;
+} // end of command line category.
+
+let CategoryName = "Verification" in {
+def warn_target: Warning<"violations found for %0">, InGroup<InstallAPIViolation>;
+def err_library_missing_symbol : Error<"declaration has external linkage, but dynamic library doesn't have symbol '%0'">;
+def warn_library_missing_symbol : Warning<"declaration has external linkage, but dynamic library doesn't have symbol '%0'">, InGroup<InstallAPIViolation>;
+def err_library_hidden_symbol : Error<"declaration has external linkage, but symbol has internal linkage in dynamic library '%0'">;
+def warn_library_hidden_symbol : Warning<"declaration has external linkage, but symbol has internal linkage in dynamic library '%0'">, InGroup<InstallAPIViolation>;
+def warn_header_hidden_symbol : Warning<"symbol exported in dynamic library, but marked hidden in declaration '%0'">, InGroup<InstallAPIViolation>;
+def err_header_hidden_symbol : Error<"symbol exported in dynamic library, but marked hidden in declaration '%0'">;
+def err_header_symbol_missing : Error<"no declaration found for exported symbol '%0' in dynamic library">;
+def warn_header_availability_mismatch : Warning<"declaration '%0' is marked %select{available|unavailable}1,"
+  " but symbol is %select{not |}2exported in dynamic library">, InGroup<InstallAPIViolation>;
+def err_header_availability_mismatch : Error<"declaration '%0' is marked %select{available|unavailable}1,"
+  " but symbol is %select{not |}2exported in dynamic library">;
+def warn_dylib_symbol_flags_mismatch : Warning<"dynamic library symbol '%0' is "
+  "%select{weak defined|thread local}1, but its declaration is not">, InGroup<InstallAPIViolation>;
+def warn_header_symbol_flags_mismatch : Warning<"declaration '%0' is "
+  "%select{weak defined|thread local}1, but symbol is not in dynamic library">, InGroup<InstallAPIViolation>;
+def err_dylib_symbol_flags_mismatch : Error<"dynamic library symbol '%0' is "
+  "%select{weak defined|thread local}1, but its declaration is not">;
+def err_header_symbol_flags_mismatch : Error<"declaration '%0' is "
+  "%select{weak defined|thread local}1, but symbol is not in dynamic library">;
+} // end of Verification category.
+
+} // end of InstallAPI component
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index d7ab1635cf12bc..2646942a53e30d 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -911,6 +911,22 @@ def warn_fortify_scanf_overflow : Warning<
 def err_function_start_invalid_type: Error<
   "argument must be a function">;
 
+def err_ptrauth_disabled :
+  Error<"this target does not support pointer authentication">;
+def err_ptrauth_invalid_key :
+  Error<"%0 does not identify a valid pointer authentication key for "
+        "the current target">;
+def err_ptrauth_value_bad_type :
+  Error<"%select{signed value|extra discriminator|blended pointer|blended "
+        "integer}0 must have %select{pointer|integer|pointer or integer}1 "
+        "type; type here is %2">;
+def warn_ptrauth_sign_null_pointer :
+  Warning<"signing a null pointer will yield a non-null pointer">,
+  InGroup<PtrAuthNullPointers>;
+def warn_ptrauth_auth_null_pointer :
+  Warning<"authenticating a null pointer will almost certainly trap">,
+  InGroup<PtrAuthNullPointers>;
+
 /// main()
 // static main() is not an error in C, just in C++.
 def warn_static_main : Warning<"'main' should not be declared static">,
@@ -6503,12 +6519,18 @@ def err_flexible_array_count_not_in_same_struct : Error<
   "'counted_by' field %0 isn't within the same struct as the flexible array">;
 def err_counted_by_attr_not_on_flexible_array_member : Error<
   "'counted_by' only applies to C99 flexible array members">;
-def err_counted_by_attr_refers_to_flexible_array : Error<
-  "'counted_by' cannot refer to the flexible array %0">;
+def err_counted_by_attr_refer_to_itself : Error<
+  "'counted_by' cannot refer to the flexible array member %0">;
 def err_counted_by_must_be_in_structure : Error<
   "field %0 in 'counted_by' not inside structure">;
-def err_flexible_array_counted_by_attr_field_not_integer : Error<
-  "field %0 in 'counted_by' must be a non-boolean integer type">;
+def err_counted_by_attr_argument_not_integer : Error<
+  "'counted_by' requires a non-boolean integer type argument">;
+def err_counted_by_attr_only_support_simple_decl_reference : Error<
+  "'counted_by' argument must be a simple declaration reference">;
+def err_counted_by_attr_in_union : Error<
+  "'counted_by' cannot be applied to a union member">;
+def err_counted_by_attr_refer_to_union : Error<
+  "'counted_by' argument cannot refer to a union member">;
 def note_flexible_array_counted_by_attr_field : Note<
   "field %0 declared here">;
 
diff --git a/clang/include/clang/Basic/Features.def b/clang/include/clang/Basic/Features.def
index 5fad5fc3623cb6..eeed5f4751f2f4 100644
--- a/clang/include/clang/Basic/Features.def
+++ b/clang/include/clang/Basic/Features.def
@@ -101,6 +101,7 @@ FEATURE(memory_sanitizer,
 FEATURE(thread_sanitizer, LangOpts.Sanitize.has(SanitizerKind::Thread))
 FEATURE(dataflow_sanitizer, LangOpts.Sanitize.has(SanitizerKind::DataFlow))
 FEATURE(scudo, LangOpts.Sanitize.hasOneOf(SanitizerKind::Scudo))
+FEATURE(ptrauth_intrinsics, LangOpts.PointerAuthIntrinsics)
 FEATURE(swiftasynccc,
   PP.getTargetInfo().checkCallingConvention(CC_SwiftAsync) ==
   clang::TargetInfo::CCCR_OK)
diff --git a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def
index 472fd9f093a718..8ef6700ecdc78e 100644
--- a/clang/include/clang/Basic/LangOptions.def
+++ b/clang/include/clang/Basic/LangOptions.def
@@ -161,6 +161,8 @@ LANGOPT(DllExportInlines  , 1, 1, "dllexported classes dllexport inline methods"
 LANGOPT(RelaxedTemplateTemplateArgs, 1, 0, "C++17 relaxed matching of template template arguments")
 LANGOPT(ExperimentalLibrary, 1, 0, "enable unstable and experimental library features")
 
+LANGOPT(PointerAuthIntrinsics, 1, 0, "pointer authentication intrinsics")
+
 LANGOPT(DoubleSquareBracketAttributes, 1, 0, "'[[]]' attributes extension for all language standard modes")
 
 COMPATIBLE_LANGOPT(RecoveryAST, 1, 1, "Preserve expressions in AST when encountering errors")
diff --git a/clang/include/clang/Basic/TargetInfo.h b/clang/include/clang/Basic/TargetInfo.h
index 7682f84e491c7b..374595edd2ce4a 100644
--- a/clang/include/clang/Basic/TargetInfo.h
+++ b/clang/include/clang/Basic/TargetInfo.h
@@ -24,6 +24,7 @@
 #include "clang/Basic/TargetOptions.h"
 #include "llvm/ADT/APFloat.h"
 #include "llvm/ADT/APInt.h"
+#include "llvm/ADT/APSInt.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/IntrusiveRefCntPtr.h"
 #include "llvm/ADT/SmallSet.h"
@@ -1571,6 +1572,11 @@ class TargetInfo : public TransferrableTargetInfo,
     return getAddressSpaceMap()[(unsigned)AS];
   }
 
+  /// Determine whether the given pointer-authentication key is valid.
+  ///
+  /// The value has been coerced to type 'int'.
+  virtual bool validatePointerAuthKey(const llvm::APSInt &value) const;
+
   /// Map from the address space field in builtin description strings to the
   /// language address space.
   virtual LangAS getOpenCLBuiltinAddressSpace(unsigned AS) const {
diff --git a/clang/include/clang/Basic/TypeNodes.td b/clang/include/clang/Basic/TypeNodes.td
index 6419762417371b..3625f063758915 100644
--- a/clang/include/clang/Basic/TypeNodes.td
+++ b/clang/include/clang/Basic/TypeNodes.td
@@ -108,6 +108,8 @@ def ObjCTypeParamType : TypeNode<Type>, NeverCanonical;
 def ObjCObjectType : TypeNode<Type>;
 def ObjCInterfaceType : TypeNode<ObjCObjectType>, LeafType;
 def ObjCObjectPointerType : TypeNode<Type>;
+def BoundsAttributedType : TypeNode<Type, 1>;
+def CountAttributedType : TypeNode<BoundsAttributedType>, NeverCanonical;
 def PipeType : TypeNode<Type>;
 def AtomicType : TypeNode<Type>;
 def BitIntType : TypeNode<Type>;
diff --git a/clang/include/clang/Driver/Driver.h b/clang/include/clang/Driver/Driver.h
index c4cab360bab3bb..2ffc52bcb7ad3b 100644
--- a/clang/include/clang/Driver/Driver.h
+++ b/clang/include/clang/Driver/Driver.h
@@ -28,8 +28,8 @@
 #include "llvm/Option/ArgList.h"
 #include "llvm/Support/StringSaver.h"
 
-#include <list>
 #include <map>
+#include <set>
 #include <string>
 #include <vector>
 
@@ -615,6 +615,16 @@ class Driver {
   // FIXME: This should be in CompilationInfo.
   std::string GetProgramPath(StringRef Name, const ToolChain &TC) const;
 
+  /// Lookup the path to the Standard library module manifest.
+  ///
+  /// \param C - The compilation.
+  /// \param TC - The tool chain for additional information on
+  /// directories to search.
+  //
+  // FIXME: This should be in CompilationInfo.
+  std::string GetStdModuleManifestPath(const Compilation &C,
+                                       const ToolChain &TC) const;
+
   /// HandleAutocompletions - Handle --autocomplete by searching and printing
   /// possible flags, descriptions, and its arguments.
   void HandleAutocompletions(StringRef PassedFlags) const;
@@ -839,6 +849,13 @@ llvm::Error expandResponseFiles(SmallVectorImpl<const char *> &Args,
                                 bool ClangCLMode, llvm::BumpPtrAllocator &Alloc,
                                 llvm::vfs::FileSystem *FS = nullptr);
 
+/// Apply a space separated list of edits to the input argument lists.
+/// See applyOneOverrideOption.
+void applyOverrideOptions(SmallVectorImpl<const char *> &Args,
+                          const char *OverrideOpts,
+                          llvm::StringSet<> &SavedStrings,
+                          raw_ostream *OS = nullptr);
+
 } // end namespace driver
 } // end namespace clang
 
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index a7e43b4d179a4d..9b5125ecfed8b3 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -4106,6 +4106,14 @@ defm strict_return : BoolFOption<"strict-return",
             " of a non-void function as unreachable">,
   PosFlag<SetTrue>>;
 
+let Group = f_Group in {
+  let Visibility = [ClangOption,CC1Option] in {
+    def fptrauth_intrinsics : Flag<["-"], "fptrauth-intrinsics">,
+      HelpText<"Enable pointer authentication intrinsics">;
+  }
+  def fno_ptrauth_intrinsics : Flag<["-"], "fno-ptrauth-intrinsics">;
+}
+
 def fenable_matrix : Flag<["-"], "fenable-matrix">, Group<f_Group>,
     Visibility<[ClangOption, CC1Option]>,
     HelpText<"Enable matrix data type and related builtin functions">,
@@ -4696,21 +4704,17 @@ def mrvv_vector_bits_EQ : Joined<["-"], "mrvv-vector-bits=">, Group<m_Group>,
     " (RISC-V only)")>;
 
 def munaligned_access : Flag<["-"], "munaligned-access">, Group<m_Group>,
-  HelpText<"Allow memory accesses to be unaligned (AArch32/AArch64/LoongArch/RISC-V only)">;
+  HelpText<"Allow memory accesses to be unaligned (AArch32/MIPSr6 only)">;
 def mno_unaligned_access : Flag<["-"], "mno-unaligned-access">, Group<m_Group>,
-  HelpText<"Force all memory accesses to be aligned (AArch32/AArch64/LoongArch/RISC-V only)">;
+  HelpText<"Force all memory accesses to be aligned (AArch32/MIPSr6 only)">;
 def munaligned_symbols : Flag<["-"], "munaligned-symbols">, Group<m_Group>,
   HelpText<"Expect external char-aligned symbols to be without ABI alignment (SystemZ only)">;
 def mno_unaligned_symbols : Flag<["-"], "mno-unaligned-symbols">, Group<m_Group>,
   HelpText<"Expect external char-aligned symbols to be without ABI alignment (SystemZ only)">;
-} // let Flags = [TargetSpecific]
-def mstrict_align : Flag<["-"], "mstrict-align">, Alias<mno_unaligned_access>,
-  Flags<[HelpHidden]>, Visibility<[ClangOption, CC1Option]>,
-  HelpText<"Force all memory accesses to be aligned (same as mno-unaligned-access)">;
-def mno_strict_align : Flag<["-"], "mno-strict-align">, Alias<munaligned_access>,
-  Flags<[HelpHidden]>, Visibility<[ClangOption, CC1Option]>,
-  HelpText<"Allow memory accesses to be unaligned (same as munaligned-access)">;
-let Flags = [TargetSpecific] in {
+def mstrict_align : Flag<["-"], "mstrict-align">,
+  HelpText<"Force all memory accesses to be aligned (AArch64/LoongArch/RISC-V only)">;
+def mno_strict_align : Flag<["-"], "mno-strict-align">,
+  HelpText<"Allow memory accesses to be unaligned (AArch64/LoongArch/RISC-V only)">;
 def mno_thumb : Flag<["-"], "mno-thumb">, Group<m_arm_Features_Group>;
 def mrestrict_it: Flag<["-"], "mrestrict-it">, Group<m_arm_Features_Group>,
   HelpText<"Disallow generation of complex IT blocks. It is off by default.">;
@@ -5395,6 +5399,9 @@ def print_resource_dir : Flag<["-", "--"], "print-resource-dir">,
 def print_search_dirs : Flag<["-", "--"], "print-search-dirs">,
   HelpText<"Print the paths used for finding libraries and programs">,
   Visibility<[ClangOption, CLOption]>;
+def print_std_module_manifest_path : Flag<["-", "--"], "print-library-module-manifest-path">,
+  HelpText<"Print the path for the C++ Standard library module manifest">,
+  Visibility<[ClangOption, CLOption]>;
 def print_targets : Flag<["-", "--"], "print-targets">,
   HelpText<"Print the registered targets">,
   Visibility<[ClangOption, CLOption]>;
@@ -6434,11 +6441,6 @@ def flang_deprecated_no_hlfir : Flag<["-"], "flang-deprecated-no-hlfir">,
   Flags<[HelpHidden]>, Visibility<[FlangOption, FC1Option]>,
   HelpText<"Do not use HLFIR lowering (deprecated)">;
 
-def flang_experimental_polymorphism : Flag<["-"], "flang-experimental-polymorphism">,
-  Flags<[HelpHidden]>, Visibility<[FlangOption, FC1Option]>,
-  HelpText<"Enable Fortran 2003 polymorphism (experimental)">;
-
-
 //===----------------------------------------------------------------------===//
 // FLangOption + CoreOption + NoXarchOption
 //===----------------------------------------------------------------------===//
diff --git a/clang/include/clang/Driver/ToolChain.h b/clang/include/clang/Driver/ToolChain.h
index fbe2e8fe8e88d8..a4f9cad98aa8b1 100644
--- a/clang/include/clang/Driver/ToolChain.h
+++ b/clang/include/clang/Driver/ToolChain.h
@@ -580,7 +580,7 @@ class ToolChain {
 
   // Return the DWARF version to emit, in the absence of arguments
   // to the contrary.
-  virtual unsigned GetDefaultDwarfVersion() const;
+  virtual unsigned GetDefaultDwarfVersion() const { return 5; }
 
   // Some toolchains may have different restrictions on the DWARF version and
   // may need to adjust it. E.g. NVPTX may need to enforce DWARF2 even when host
diff --git a/clang/include/clang/Format/.clang-format b/clang/include/clang/Format/.clang-format
index f95602cab0f7fc..d7331b3c8cf02e 100644
--- a/clang/include/clang/Format/.clang-format
+++ b/clang/include/clang/Format/.clang-format
@@ -1,6 +1 @@
-BasedOnStyle: LLVM
-InsertBraces: true
-InsertNewlineAtEOF: true
-LineEnding: LF
-RemoveBracesLLVM: true
-RemoveParentheses: ReturnStatement
+BasedOnStyle: clang-format
diff --git a/clang/include/clang/Format/Format.h b/clang/include/clang/Format/Format.h
index 590297fd89a398..7ad2579bf7773b 100644
--- a/clang/include/clang/Format/Format.h
+++ b/clang/include/clang/Format/Format.h
@@ -4728,6 +4728,60 @@ struct FormatStyle {
   /// \version 8
   std::vector<std::string> StatementMacros;
 
+  /// Works only when TableGenBreakInsideDAGArg is not DontBreak.
+  /// The string list needs to consist of identifiers in TableGen.
+  /// If any identifier is specified, this limits the line breaks by
+  /// TableGenBreakInsideDAGArg option only on DAGArg values beginning with
+  /// the specified identifiers.
+  ///
+  /// For example the configuration,
+  /// \code{.yaml}
+  ///   TableGenBreakInsideDAGArg: BreakAll
+  ///   TableGenBreakingDAGArgOperators: ['ins', 'outs']
+  /// \endcode
+  ///
+  /// makes the line break only occurs inside DAGArgs beginning with the
+  /// specified identifiers 'ins' and 'outs'.
+  ///
+  /// \code
+  ///   let DAGArgIns = (ins
+  ///       i32:$src1,
+  ///       i32:$src2
+  ///   );
+  ///   let DAGArgOtherID = (other i32:$other1, i32:$other2);
+  ///   let DAGArgBang = (!cast<SomeType>("Some") i32:$src1, i32:$src2)
+  /// \endcode
+  /// \version 19
+  std::vector<std::string> TableGenBreakingDAGArgOperators;
+
+  /// Different ways to control the format inside TableGen DAGArg.
+  enum DAGArgStyle : int8_t {
+    /// Never break inside DAGArg.
+    /// \code
+    ///   let DAGArgIns = (ins i32:$src1, i32:$src2);
+    /// \endcode
+    DAS_DontBreak,
+    /// Break inside DAGArg after each list element but for the last.
+    /// This aligns to the first element.
+    /// \code
+    ///   let DAGArgIns = (ins i32:$src1,
+    ///                        i32:$src2);
+    /// \endcode
+    DAS_BreakElements,
+    /// Break inside DAGArg after the operator and the all elements.
+    /// \code
+    ///   let DAGArgIns = (ins
+    ///       i32:$src1,
+    ///       i32:$src2
+    ///   );
+    /// \endcode
+    DAS_BreakAll,
+  };
+
+  /// The styles of the line break inside the DAGArg in TableGen.
+  /// \version 19
+  DAGArgStyle TableGenBreakInsideDAGArg;
+
   /// The number of columns used for tab stops.
   /// \version 3.7
   unsigned TabWidth;
@@ -4980,9 +5034,12 @@ struct FormatStyle {
            SpacesInSquareBrackets == R.SpacesInSquareBrackets &&
            Standard == R.Standard &&
            StatementAttributeLikeMacros == R.StatementAttributeLikeMacros &&
-           StatementMacros == R.StatementMacros && TabWidth == R.TabWidth &&
-           TypeNames == R.TypeNames && TypenameMacros == R.TypenameMacros &&
-           UseTab == R.UseTab &&
+           StatementMacros == R.StatementMacros &&
+           TableGenBreakingDAGArgOperators ==
+               R.TableGenBreakingDAGArgOperators &&
+           TableGenBreakInsideDAGArg == R.TableGenBreakInsideDAGArg &&
+           TabWidth == R.TabWidth && TypeNames == R.TypeNames &&
+           TypenameMacros == R.TypenameMacros && UseTab == R.UseTab &&
            VerilogBreakBetweenInstancePorts ==
                R.VerilogBreakBetweenInstancePorts &&
            WhitespaceSensitiveMacros == R.WhitespaceSensitiveMacros;
diff --git a/clang/include/clang/InstallAPI/Context.h b/clang/include/clang/InstallAPI/Context.h
index bdb576d7d85fb6..54e517544b8edf 100644
--- a/clang/include/clang/InstallAPI/Context.h
+++ b/clang/include/clang/InstallAPI/Context.h
@@ -11,6 +11,7 @@
 
 #include "clang/Basic/Diagnostic.h"
 #include "clang/Basic/FileManager.h"
+#include "clang/InstallAPI/DylibVerifier.h"
 #include "clang/InstallAPI/HeaderFile.h"
 #include "clang/InstallAPI/MachO.h"
 #include "llvm/ADT/DenseMap.h"
@@ -45,6 +46,9 @@ struct InstallAPIContext {
   /// DiagnosticsEngine for all error reporting.
   DiagnosticsEngine *Diags = nullptr;
 
+  /// Verifier when binary dylib is passed as input.
+  std::unique_ptr<DylibVerifier> Verifier = nullptr;
+
   /// File Path of output location.
   llvm::StringRef OutputLoc{};
 
diff --git a/clang/include/clang/InstallAPI/DylibVerifier.h b/clang/include/clang/InstallAPI/DylibVerifier.h
new file mode 100644
index 00000000000000..8269715c7f2345
--- /dev/null
+++ b/clang/include/clang/InstallAPI/DylibVerifier.h
@@ -0,0 +1,149 @@
+//===- InstallAPI/DylibVerifier.h -------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_INSTALLAPI_DYLIBVERIFIER_H
+#define LLVM_CLANG_INSTALLAPI_DYLIBVERIFIER_H
+
+#include "clang/Basic/Diagnostic.h"
+#include "clang/InstallAPI/MachO.h"
+
+namespace clang {
+namespace installapi {
+struct FrontendAttrs;
+
+/// A list of InstallAPI verification modes.
+enum class VerificationMode {
+  Invalid,
+  ErrorsOnly,
+  ErrorsAndWarnings,
+  Pedantic,
+};
+
+/// Service responsible to tracking state of verification across the
+/// lifetime of InstallAPI.
+/// As declarations are collected during AST traversal, they are
+/// compared as symbols against what is available in the binary dylib.
+class DylibVerifier {
+private:
+  struct SymbolContext;
+
+public:
+  enum class Result { NoVerify, Ignore, Valid, Invalid };
+  struct VerifierContext {
+    // Current target being verified against the AST.
+    llvm::MachO::Target Target;
+
+    // Target specific API from binary.
+    RecordsSlice *DylibSlice = nullptr;
+
+    // Query state of verification after AST has been traversed.
+    Result FrontendState = Result::Ignore;
+
+    // First error for AST traversal, which is tied to the target triple.
+    bool DiscoveredFirstError = false;
+
+    // Determines what kind of banner to print a violation for.
+    bool PrintArch = false;
+
+    // Engine for reporting violations.
+    DiagnosticsEngine *Diag = nullptr;
+
+    // Handle diagnostics reporting for target level violations.
+    void emitDiag(llvm::function_ref<void()> Report);
+
+    VerifierContext() = default;
+    VerifierContext(DiagnosticsEngine *Diag) : Diag(Diag) {}
+  };
+
+  DylibVerifier() = default;
+
+  DylibVerifier(llvm::MachO::Records &&Dylib, DiagnosticsEngine *Diag,
+                VerificationMode Mode, bool Demangle)
+      : Dylib(std::move(Dylib)), Mode(Mode), Demangle(Demangle),
+        Exports(std::make_unique<SymbolSet>()), Ctx(VerifierContext{Diag}) {}
+
+  Result verify(GlobalRecord *R, const FrontendAttrs *FA);
+  Result verify(ObjCInterfaceRecord *R, const FrontendAttrs *FA);
+  Result verify(ObjCIVarRecord *R, const FrontendAttrs *FA,
+                const StringRef SuperClass);
+
+  /// Initialize target for verification.
+  void setTarget(const Target &T);
+
+  /// Release ownership over exports.
+  std::unique_ptr<SymbolSet> getExports() { return std::move(Exports); }
+
+  /// Get result of verification.
+  Result getState() const { return Ctx.FrontendState; }
+
+  /// Set different source managers to the same diagnostics engine.
+  void setSourceManager(SourceManager &SourceMgr) const {
+    if (!Ctx.Diag)
+      return;
+    Ctx.Diag->setSourceManager(&SourceMgr);
+  }
+
+private:
+  /// Determine whether to compare declaration to symbol in binary.
+  bool canVerify();
+
+  /// Shared implementation for verifying exported symbols.
+  Result verifyImpl(Record *R, SymbolContext &SymCtx);
+
+  /// Check if declaration is marked as obsolete, they are
+  // expected to result in a symbol mismatch.
+  bool shouldIgnoreObsolete(const Record *R, SymbolContext &SymCtx,
+                            const Record *DR);
+
+  /// Compare the visibility declarations to the linkage of symbol found in
+  /// dylib.
+  Result compareVisibility(const Record *R, SymbolContext &SymCtx,
+                           const Record *DR);
+
+  /// An ObjCInterfaceRecord can represent up to three symbols. When verifying,
+  // account for this granularity.
+  bool compareObjCInterfaceSymbols(const Record *R, SymbolContext &SymCtx,
+                                   const ObjCInterfaceRecord *DR);
+
+  /// Validate availability annotations against dylib.
+  Result compareAvailability(const Record *R, SymbolContext &SymCtx,
+                             const Record *DR);
+
+  /// Compare and validate matching symbol flags.
+  bool compareSymbolFlags(const Record *R, SymbolContext &SymCtx,
+                          const Record *DR);
+
+  /// Update result state on each call to `verify`.
+  void updateState(Result State);
+
+  /// Add verified exported symbol.
+  void addSymbol(const Record *R, SymbolContext &SymCtx,
+                 TargetList &&Targets = {});
+
+  /// Find matching dylib slice for target triple that is being parsed.
+  void assignSlice(const Target &T);
+
+  // Symbols in dylib.
+  llvm::MachO::Records Dylib;
+
+  // Controls what class of violations to report.
+  VerificationMode Mode = VerificationMode::Invalid;
+
+  // Attempt to demangle when reporting violations.
+  bool Demangle = false;
+
+  // Valid symbols in final text file.
+  std::unique_ptr<SymbolSet> Exports = std::make_unique<SymbolSet>();
+
+  // Track current state of verification while traversing AST.
+  VerifierContext Ctx;
+};
+
+} // namespace installapi
+} // namespace clang
+#endif // LLVM_CLANG_INSTALLAPI_DYLIBVERIFIER_H
diff --git a/clang/include/clang/InstallAPI/Frontend.h b/clang/include/clang/InstallAPI/Frontend.h
index 873cb50d60a542..5cccd891c58093 100644
--- a/clang/include/clang/InstallAPI/Frontend.h
+++ b/clang/include/clang/InstallAPI/Frontend.h
@@ -14,10 +14,10 @@
 #define LLVM_CLANG_INSTALLAPI_FRONTEND_H
 
 #include "clang/AST/ASTConsumer.h"
-#include "clang/AST/Availability.h"
 #include "clang/Frontend/CompilerInstance.h"
 #include "clang/Frontend/FrontendActions.h"
 #include "clang/InstallAPI/Context.h"
+#include "clang/InstallAPI/DylibVerifier.h"
 #include "clang/InstallAPI/Visitor.h"
 #include "llvm/ADT/Twine.h"
 #include "llvm/Support/MemoryBuffer.h"
@@ -35,6 +35,8 @@ class InstallAPIAction : public ASTFrontendAction {
 
   std::unique_ptr<ASTConsumer> CreateASTConsumer(CompilerInstance &CI,
                                                  StringRef InFile) override {
+    Ctx.Diags->getClient()->BeginSourceFile(CI.getLangOpts());
+    Ctx.Verifier->setSourceManager(CI.getSourceManager());
     return std::make_unique<InstallAPIVisitor>(
         CI.getASTContext(), Ctx, CI.getSourceManager(), CI.getPreprocessor());
   }
diff --git a/clang/include/clang/InstallAPI/FrontendRecords.h b/clang/include/clang/InstallAPI/FrontendRecords.h
index 1f5bc37798befd..59271e81e230c2 100644
--- a/clang/include/clang/InstallAPI/FrontendRecords.h
+++ b/clang/include/clang/InstallAPI/FrontendRecords.h
@@ -43,13 +43,13 @@ class FrontendRecordsSlice : public llvm::MachO::RecordsSlice {
   /// \param Flags The flags that describe attributes of the symbol.
   /// \param Inlined Whether declaration is inlined, only applicable to
   /// functions.
-  /// \return The non-owning pointer to added record in slice.
-  GlobalRecord *addGlobal(StringRef Name, RecordLinkage Linkage,
-                          GlobalRecord::Kind GV,
-                          const clang::AvailabilityInfo Avail, const Decl *D,
-                          const HeaderType Access,
-                          SymbolFlags Flags = SymbolFlags::None,
-                          bool Inlined = false);
+  /// \return The non-owning pointer to added record in slice with it's frontend
+  /// attributes.
+  std::pair<GlobalRecord *, FrontendAttrs *>
+  addGlobal(StringRef Name, RecordLinkage Linkage, GlobalRecord::Kind GV,
+            const clang::AvailabilityInfo Avail, const Decl *D,
+            const HeaderType Access, SymbolFlags Flags = SymbolFlags::None,
+            bool Inlined = false);
 
   /// Add ObjC Class record with attributes from AST.
   ///
@@ -60,11 +60,12 @@ class FrontendRecordsSlice : public llvm::MachO::RecordsSlice {
   /// \param D The pointer to the declaration from traversing AST.
   /// \param Access The intended access level of symbol.
   /// \param IsEHType Whether declaration has an exception attribute.
-  /// \return The non-owning pointer to added record in slice.
-  ObjCInterfaceRecord *addObjCInterface(StringRef Name, RecordLinkage Linkage,
-                                        const clang::AvailabilityInfo Avail,
-                                        const Decl *D, HeaderType Access,
-                                        bool IsEHType);
+  /// \return The non-owning pointer to added record in slice with it's frontend
+  /// attributes.
+  std::pair<ObjCInterfaceRecord *, FrontendAttrs *>
+  addObjCInterface(StringRef Name, RecordLinkage Linkage,
+                   const clang::AvailabilityInfo Avail, const Decl *D,
+                   HeaderType Access, bool IsEHType);
 
   /// Add ObjC Category record with attributes from AST.
   ///
@@ -75,11 +76,12 @@ class FrontendRecordsSlice : public llvm::MachO::RecordsSlice {
   /// to the active target triple.
   /// \param D The pointer to the declaration from traversing AST.
   /// \param Access The intended access level of symbol.
-  /// \return The non-owning pointer to added record in slice.
-  ObjCCategoryRecord *addObjCCategory(StringRef ClassToExtend,
-                                      StringRef CategoryName,
-                                      const clang::AvailabilityInfo Avail,
-                                      const Decl *D, HeaderType Access);
+  /// \return The non-owning pointer to added record in slice with it's frontend
+  /// attributes.
+  std::pair<ObjCCategoryRecord *, FrontendAttrs *>
+  addObjCCategory(StringRef ClassToExtend, StringRef CategoryName,
+                  const clang::AvailabilityInfo Avail, const Decl *D,
+                  HeaderType Access);
 
   /// Add ObjC IVar record with attributes from AST.
   ///
@@ -91,12 +93,13 @@ class FrontendRecordsSlice : public llvm::MachO::RecordsSlice {
   /// \param D The pointer to the declaration from traversing AST.
   /// \param Access The intended access level of symbol.
   /// \param AC The access control tied to the ivar declaration.
-  /// \return The non-owning pointer to added record in slice.
-  ObjCIVarRecord *addObjCIVar(ObjCContainerRecord *Container,
-                              StringRef IvarName, RecordLinkage Linkage,
-                              const clang::AvailabilityInfo Avail,
-                              const Decl *D, HeaderType Access,
-                              const clang::ObjCIvarDecl::AccessControl AC);
+  /// \return The non-owning pointer to added record in slice with it's frontend
+  /// attributes.
+  std::pair<ObjCIVarRecord *, FrontendAttrs *>
+  addObjCIVar(ObjCContainerRecord *Container, StringRef IvarName,
+              RecordLinkage Linkage, const clang::AvailabilityInfo Avail,
+              const Decl *D, HeaderType Access,
+              const clang::ObjCIvarDecl::AccessControl AC);
 
 private:
   /// Mapping of records stored in slice to their frontend attributes.
diff --git a/clang/include/clang/InstallAPI/InstallAPIDiagnostic.h b/clang/include/clang/InstallAPI/InstallAPIDiagnostic.h
new file mode 100644
index 00000000000000..547fb0bcf9a893
--- /dev/null
+++ b/clang/include/clang/InstallAPI/InstallAPIDiagnostic.h
@@ -0,0 +1,14 @@
+//===--- InstallAPIDiagnostic.h - Diagnostics for InstallAPI ----*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_INSTALLAPI_INSTALLAPIDIAGNOSTIC_H
+#define LLVM_CLANG_INSTALLAPI_INSTALLAPIDIAGNOSTIC_H
+
+#include "clang/Basic/DiagnosticInstallAPI.h"
+
+#endif
diff --git a/clang/include/clang/InstallAPI/MachO.h b/clang/include/clang/InstallAPI/MachO.h
index 55e5591389ce1f..a77766511fa3e5 100644
--- a/clang/include/clang/InstallAPI/MachO.h
+++ b/clang/include/clang/InstallAPI/MachO.h
@@ -18,6 +18,7 @@
 #include "llvm/TextAPI/PackedVersion.h"
 #include "llvm/TextAPI/Platform.h"
 #include "llvm/TextAPI/RecordVisitor.h"
+#include "llvm/TextAPI/Symbol.h"
 #include "llvm/TextAPI/Target.h"
 #include "llvm/TextAPI/TextAPIWriter.h"
 #include "llvm/TextAPI/Utils.h"
@@ -31,10 +32,13 @@ using ObjCInterfaceRecord = llvm::MachO::ObjCInterfaceRecord;
 using ObjCCategoryRecord = llvm::MachO::ObjCCategoryRecord;
 using ObjCIVarRecord = llvm::MachO::ObjCIVarRecord;
 using Records = llvm::MachO::Records;
+using RecordsSlice = llvm::MachO::RecordsSlice;
 using BinaryAttrs = llvm::MachO::RecordsSlice::BinaryAttrs;
 using SymbolSet = llvm::MachO::SymbolSet;
+using SimpleSymbol = llvm::MachO::SimpleSymbol;
 using FileType = llvm::MachO::FileType;
 using PackedVersion = llvm::MachO::PackedVersion;
 using Target = llvm::MachO::Target;
+using TargetList = llvm::MachO::TargetList;
 
 #endif // LLVM_CLANG_INSTALLAPI_MACHO_H
diff --git a/clang/include/clang/Parse/Parser.h b/clang/include/clang/Parse/Parser.h
index 64e031d5094c74..bba8ef4ff01739 100644
--- a/clang/include/clang/Parse/Parser.h
+++ b/clang/include/clang/Parse/Parser.h
@@ -3073,6 +3073,11 @@ class Parser : public CodeCompletionHandler {
                                  SourceLocation ScopeLoc,
                                  ParsedAttr::Form Form);
 
+  void ParseBoundsAttribute(IdentifierInfo &AttrName,
+                            SourceLocation AttrNameLoc, ParsedAttributes &Attrs,
+                            IdentifierInfo *ScopeName, SourceLocation ScopeLoc,
+                            ParsedAttr::Form Form);
+
   void ParseTypeofSpecifier(DeclSpec &DS);
   SourceLocation ParseDecltypeSpecifier(DeclSpec &DS);
   void AnnotateExistingDecltypeSpecifier(const DeclSpec &DS,
diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h
index 69d5709e5e2f06..5ecd2f9eb2881f 100644
--- a/clang/include/clang/Sema/Sema.h
+++ b/clang/include/clang/Sema/Sema.h
@@ -3060,6 +3060,8 @@ class Sema final {
                                     TemplateIdAnnotation *TemplateId,
                                     bool IsMemberSpecialization);
 
+  bool checkConstantPointerAuthKey(Expr *keyExpr, unsigned &key);
+
   void DiagnoseFunctionSpecifiers(const DeclSpec &DS);
   NamedDecl *getShadowedDeclaration(const TypedefNameDecl *D,
                                     const LookupResult &R);
@@ -3929,8 +3931,6 @@ class Sema final {
                                             bool BestCase,
                                             MSInheritanceModel Model);
 
-  bool CheckCountedByAttr(Scope *Scope, const FieldDecl *FD);
-
   EnforceTCBAttr *mergeEnforceTCBAttr(Decl *D, const EnforceTCBAttr &AL);
   EnforceTCBLeafAttr *mergeEnforceTCBLeafAttr(Decl *D,
                                               const EnforceTCBLeafAttr &AL);
@@ -5243,6 +5243,7 @@ class Sema final {
     enum ExpressionKind {
       EK_Decltype,
       EK_TemplateArgument,
+      EK_BoundsAttrArgument,
       EK_Other
     } ExprContext;
 
@@ -5360,6 +5361,12 @@ class Sema final {
     return ExprEvalContexts.back();
   };
 
+  bool isBoundsAttrContext() const {
+    return ExprEvalContexts.back().ExprContext ==
+           ExpressionEvaluationContextRecord::ExpressionKind::
+               EK_BoundsAttrArgument;
+  }
+
   /// Increment when we find a reference; decrement when we find an ignored
   /// assignment.  Ultimately the value is 0 if every reference is an ignored
   /// assignment.
@@ -11745,6 +11752,8 @@ class Sema final {
   QualType BuildMatrixType(QualType T, Expr *NumRows, Expr *NumColumns,
                            SourceLocation AttrLoc);
 
+  QualType BuildCountAttributedArrayType(QualType WrappedTy, Expr *CountExpr);
+
   QualType BuildAddressSpaceAttr(QualType &T, LangAS ASIdx, Expr *AddrSpace,
                                  SourceLocation AttrLoc);
 
diff --git a/clang/include/clang/Serialization/ASTRecordReader.h b/clang/include/clang/Serialization/ASTRecordReader.h
index 80a1359fd16aa0..5d3e95cb5d630f 100644
--- a/clang/include/clang/Serialization/ASTRecordReader.h
+++ b/clang/include/clang/Serialization/ASTRecordReader.h
@@ -221,6 +221,8 @@ class ASTRecordReader
     return Reader->ReadSelector(*F, Record, Idx);
   }
 
+  TypeCoupledDeclRefInfo readTypeCoupledDeclRefInfo();
+
   /// Read a declaration name, advancing Idx.
   // DeclarationName readDeclarationName(); (inherited)
   DeclarationNameLoc readDeclarationNameLoc(DeclarationName Name);
diff --git a/clang/include/clang/Serialization/ASTRecordWriter.h b/clang/include/clang/Serialization/ASTRecordWriter.h
index 9a64735c9fa55d..e007d4a70843a1 100644
--- a/clang/include/clang/Serialization/ASTRecordWriter.h
+++ b/clang/include/clang/Serialization/ASTRecordWriter.h
@@ -141,6 +141,11 @@ class ASTRecordWriter
     AddSourceLocation(Loc);
   }
 
+  void writeTypeCoupledDeclRefInfo(TypeCoupledDeclRefInfo Info) {
+    writeDeclRef(Info.getDecl());
+    writeBool(Info.isDeref());
+  }
+
   /// Emit a source range.
   void AddSourceRange(SourceRange Range, LocSeq *Seq = nullptr) {
     return Writer->AddSourceRange(Range, *Record, Seq);
diff --git a/clang/include/clang/Serialization/TypeBitCodes.def b/clang/include/clang/Serialization/TypeBitCodes.def
index adbd5ec5d4b544..3c82dfed9497d5 100644
--- a/clang/include/clang/Serialization/TypeBitCodes.def
+++ b/clang/include/clang/Serialization/TypeBitCodes.def
@@ -65,6 +65,6 @@ TYPE_BIT_CODE(DependentSizedMatrix, DEPENDENT_SIZE_MATRIX, 53)
 TYPE_BIT_CODE(Using, USING, 54)
 TYPE_BIT_CODE(BTFTagAttributed, BTFTAG_ATTRIBUTED, 55)
 TYPE_BIT_CODE(PackIndexing, PACK_INDEXING, 56)
-
+TYPE_BIT_CODE(CountAttributed, COUNT_ATTRIBUTED, 57)
 
 #undef TYPE_BIT_CODE
diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp
index 5a8fae76a43a4d..fcf801adeaf5ef 100644
--- a/clang/lib/AST/ASTContext.cpp
+++ b/clang/lib/AST/ASTContext.cpp
@@ -2348,6 +2348,9 @@ TypeInfo ASTContext::getTypeInfoImpl(const Type *T) const {
     return getTypeInfo(
                   cast<AttributedType>(T)->getEquivalentType().getTypePtr());
 
+  case Type::CountAttributed:
+    return getTypeInfo(cast<CountAttributedType>(T)->desugar().getTypePtr());
+
   case Type::BTFTagAttributed:
     return getTypeInfo(
         cast<BTFTagAttributedType>(T)->getWrappedType().getTypePtr());
@@ -3122,6 +3125,32 @@ QualType ASTContext::removePtrSizeAddrSpace(QualType T) const {
   return T;
 }
 
+QualType ASTContext::getCountAttributedType(
+    QualType WrappedTy, Expr *CountExpr, bool CountInBytes, bool OrNull,
+    ArrayRef<TypeCoupledDeclRefInfo> DependentDecls) const {
+  assert(WrappedTy->isPointerType() || WrappedTy->isArrayType());
+
+  llvm::FoldingSetNodeID ID;
+  CountAttributedType::Profile(ID, WrappedTy, CountExpr, CountInBytes, OrNull);
+
+  void *InsertPos = nullptr;
+  CountAttributedType *CATy =
+      CountAttributedTypes.FindNodeOrInsertPos(ID, InsertPos);
+  if (CATy)
+    return QualType(CATy, 0);
+
+  QualType CanonTy = getCanonicalType(WrappedTy);
+  size_t Size = CountAttributedType::totalSizeToAlloc<TypeCoupledDeclRefInfo>(
+      DependentDecls.size());
+  CATy = (CountAttributedType *)Allocate(Size, TypeAlignment);
+  new (CATy) CountAttributedType(WrappedTy, CanonTy, CountExpr, CountInBytes,
+                                 OrNull, DependentDecls);
+  Types.push_back(CATy);
+  CountAttributedTypes.InsertNode(CATy, InsertPos);
+
+  return QualType(CATy, 0);
+}
+
 const FunctionType *ASTContext::adjustFunctionType(const FunctionType *T,
                                                    FunctionType::ExtInfo Info) {
   if (T->getExtInfo() == Info)
@@ -13234,6 +13263,32 @@ static QualType getCommonSugarTypeNode(ASTContext &Ctx, const Type *X,
       return QualType();
     return Ctx.getUsingType(CD, Ctx.getQualifiedType(Underlying));
   }
+  case Type::CountAttributed: {
+    const auto *DX = cast<CountAttributedType>(X),
+               *DY = cast<CountAttributedType>(Y);
+    if (DX->isCountInBytes() != DY->isCountInBytes())
+      return QualType();
+    if (DX->isOrNull() != DY->isOrNull())
+      return QualType();
+    Expr *CEX = DX->getCountExpr();
+    Expr *CEY = DY->getCountExpr();
+    llvm::ArrayRef<clang::TypeCoupledDeclRefInfo> CDX = DX->getCoupledDecls();
+    if (Ctx.hasSameExpr(CEX, CEY))
+      return Ctx.getCountAttributedType(Ctx.getQualifiedType(Underlying), CEX,
+                                        DX->isCountInBytes(), DX->isOrNull(),
+                                        CDX);
+    if (!CEX->isIntegerConstantExpr(Ctx) || !CEY->isIntegerConstantExpr(Ctx))
+      return QualType();
+    // Two declarations with the same integer constant may still differ in their
+    // expression pointers, so we need to evaluate them.
+    llvm::APSInt VX = *CEX->getIntegerConstantExpr(Ctx);
+    llvm::APSInt VY = *CEY->getIntegerConstantExpr(Ctx);
+    if (VX != VY)
+      return QualType();
+    return Ctx.getCountAttributedType(Ctx.getQualifiedType(Underlying), CEX,
+                                      DX->isCountInBytes(), DX->isOrNull(),
+                                      CDX);
+  }
   }
   llvm_unreachable("Unhandled Type Class");
 }
diff --git a/clang/lib/AST/ASTImporter.cpp b/clang/lib/AST/ASTImporter.cpp
index 023aaa7f0572b4..786695f00fadcc 100644
--- a/clang/lib/AST/ASTImporter.cpp
+++ b/clang/lib/AST/ASTImporter.cpp
@@ -1527,6 +1527,28 @@ ExpectedType ASTNodeImporter::VisitAttributedType(const AttributedType *T) {
       *ToModifiedTypeOrErr, *ToEquivalentTypeOrErr);
 }
 
+ExpectedType
+ASTNodeImporter::VisitCountAttributedType(const CountAttributedType *T) {
+  ExpectedType ToWrappedTypeOrErr = import(T->desugar());
+  if (!ToWrappedTypeOrErr)
+    return ToWrappedTypeOrErr.takeError();
+
+  Error Err = Error::success();
+  Expr *CountExpr = importChecked(Err, T->getCountExpr());
+
+  SmallVector<TypeCoupledDeclRefInfo, 1> CoupledDecls;
+  for (auto TI : T->dependent_decls()) {
+    Expected<ValueDecl *> ToDeclOrErr = import(TI.getDecl());
+    if (!ToDeclOrErr)
+      return ToDeclOrErr.takeError();
+    CoupledDecls.emplace_back(*ToDeclOrErr, TI.isDeref());
+  }
+
+  return Importer.getToContext().getCountAttributedType(
+      *ToWrappedTypeOrErr, CountExpr, T->isCountInBytes(), T->isOrNull(),
+      ArrayRef(CoupledDecls.data(), CoupledDecls.size()));
+}
+
 ExpectedType ASTNodeImporter::VisitTemplateTypeParmType(
     const TemplateTypeParmType *T) {
   Expected<TemplateTypeParmDecl *> ToDeclOrErr = import(T->getDecl());
@@ -9305,16 +9327,6 @@ Expected<Attr *> ASTImporter::Import(const Attr *FromAttr) {
                   From->args_size());
     break;
   }
-  case attr::CountedBy: {
-    AI.cloneAttr(FromAttr);
-    const auto *CBA = cast<CountedByAttr>(FromAttr);
-    Expected<SourceRange> SR = Import(CBA->getCountedByFieldLoc()).get();
-    if (!SR)
-      return SR.takeError();
-    AI.castAttrAs<CountedByAttr>()->setCountedByFieldLoc(SR.get());
-    break;
-  }
-
   default: {
     // The default branch works for attributes that have no arguments to import.
     // FIXME: Handle every attribute type that has arguments of type to import
diff --git a/clang/lib/AST/ASTStructuralEquivalence.cpp b/clang/lib/AST/ASTStructuralEquivalence.cpp
index fe6e03ce174e59..226e0aa38ece70 100644
--- a/clang/lib/AST/ASTStructuralEquivalence.cpp
+++ b/clang/lib/AST/ASTStructuralEquivalence.cpp
@@ -1069,6 +1069,13 @@ static bool IsStructurallyEquivalent(StructuralEquivalenceContext &Context,
       return false;
     break;
 
+  case Type::CountAttributed:
+    if (!IsStructurallyEquivalent(Context,
+                                  cast<CountAttributedType>(T1)->desugar(),
+                                  cast<CountAttributedType>(T2)->desugar()))
+      return false;
+    break;
+
   case Type::BTFTagAttributed:
     if (!IsStructurallyEquivalent(
             Context, cast<BTFTagAttributedType>(T1)->getWrappedType(),
diff --git a/clang/lib/AST/Availability.cpp b/clang/lib/AST/Availability.cpp
index d0054e37e4dcd2..238359a2dedfcf 100644
--- a/clang/lib/AST/Availability.cpp
+++ b/clang/lib/AST/Availability.cpp
@@ -28,9 +28,9 @@ AvailabilityInfo AvailabilityInfo::createFromDecl(const Decl *Decl) {
     for (const auto *A : RD->specific_attrs<AvailabilityAttr>()) {
       if (A->getPlatform()->getName() != PlatformName)
         continue;
-      Availability =
-          AvailabilityInfo(A->getPlatform()->getName(), A->getIntroduced(),
-                           A->getDeprecated(), A->getObsoleted(), false, false);
+      Availability = AvailabilityInfo(
+          A->getPlatform()->getName(), A->getIntroduced(), A->getDeprecated(),
+          A->getObsoleted(), A->getUnavailable(), false, false);
       break;
     }
 
diff --git a/clang/lib/AST/DeclCXX.cpp b/clang/lib/AST/DeclCXX.cpp
index 1c3dcf63465c68..645ec2f7563bca 100644
--- a/clang/lib/AST/DeclCXX.cpp
+++ b/clang/lib/AST/DeclCXX.cpp
@@ -1567,10 +1567,9 @@ bool CXXRecordDecl::isGenericLambda() const {
 
 #ifndef NDEBUG
 static bool allLookupResultsAreTheSame(const DeclContext::lookup_result &R) {
-  for (auto *D : R)
-    if (!declaresSameEntity(D, R.front()))
-      return false;
-  return true;
+  return llvm::all_of(R, [&](NamedDecl *D) {
+    return D->isInvalidDecl() || declaresSameEntity(D, R.front());
+  });
 }
 #endif
 
diff --git a/clang/lib/AST/DeclPrinter.cpp b/clang/lib/AST/DeclPrinter.cpp
index b701581b2474a9..edbcdfe4d55bc9 100644
--- a/clang/lib/AST/DeclPrinter.cpp
+++ b/clang/lib/AST/DeclPrinter.cpp
@@ -679,6 +679,16 @@ static void printExplicitSpecifier(ExplicitSpecifier ES, llvm::raw_ostream &Out,
   Out << Proto;
 }
 
+static void MaybePrintTagKeywordIfSupressingScopes(PrintingPolicy &Policy,
+                                                   QualType T,
+                                                   llvm::raw_ostream &Out) {
+  StringRef prefix = T->isClassType()       ? "class "
+                     : T->isStructureType() ? "struct "
+                     : T->isUnionType()     ? "union "
+                                            : "";
+  Out << prefix;
+}
+
 void DeclPrinter::VisitFunctionDecl(FunctionDecl *D) {
   if (!D->getDescribedFunctionTemplate() &&
       !D->isFunctionTemplateSpecialization())
@@ -855,6 +865,10 @@ void DeclPrinter::VisitFunctionDecl(FunctionDecl *D) {
         Out << Proto << " -> ";
         Proto.clear();
       }
+      if (!Policy.SuppressTagKeyword && Policy.SuppressScope &&
+          !Policy.SuppressUnwrittenScope)
+        MaybePrintTagKeywordIfSupressingScopes(Policy, AFT->getReturnType(),
+                                               Out);
       AFT->getReturnType().print(Out, Policy, Proto);
       Proto.clear();
     }
@@ -1022,6 +1036,9 @@ void DeclPrinter::VisitVarDecl(VarDecl *D) {
              ? D->getIdentifier()->deuglifiedName()
              : D->getName();
 
+  if (!Policy.SuppressTagKeyword && Policy.SuppressScope &&
+      !Policy.SuppressUnwrittenScope)
+    MaybePrintTagKeywordIfSupressingScopes(Policy, T, Out);
   printDeclType(T, Name);
 
   // Print the attributes that should be placed right before the end of the
diff --git a/clang/lib/AST/Expr.cpp b/clang/lib/AST/Expr.cpp
index f5ad402e3bd73e..6221ebd5c9b4e9 100644
--- a/clang/lib/AST/Expr.cpp
+++ b/clang/lib/AST/Expr.cpp
@@ -676,7 +676,8 @@ StringRef PredefinedExpr::getIdentKindName(PredefinedIdentKind IK) {
 // FIXME: Maybe this should use DeclPrinter with a special "print predefined
 // expr" policy instead.
 std::string PredefinedExpr::ComputeName(PredefinedIdentKind IK,
-                                        const Decl *CurrentDecl) {
+                                        const Decl *CurrentDecl,
+                                        bool ForceElaboratedPrinting) {
   ASTContext &Context = CurrentDecl->getASTContext();
 
   if (IK == PredefinedIdentKind::FuncDName) {
@@ -724,10 +725,21 @@ std::string PredefinedExpr::ComputeName(PredefinedIdentKind IK,
     return std::string(Out.str());
   }
   if (const FunctionDecl *FD = dyn_cast<FunctionDecl>(CurrentDecl)) {
-    if (IK != PredefinedIdentKind::PrettyFunction &&
+    const auto &LO = Context.getLangOpts();
+    bool IsFuncOrFunctionInNonMSVCCompatEnv =
+        ((IK == PredefinedIdentKind::Func ||
+          IK == PredefinedIdentKind ::Function) &&
+         !LO.MSVCCompat);
+    bool IsLFunctionInMSVCCommpatEnv =
+        IK == PredefinedIdentKind::LFunction && LO.MSVCCompat;
+    bool IsFuncOrFunctionOrLFunctionOrFuncDName =
+        IK != PredefinedIdentKind::PrettyFunction &&
         IK != PredefinedIdentKind::PrettyFunctionNoVirtual &&
         IK != PredefinedIdentKind::FuncSig &&
-        IK != PredefinedIdentKind::LFuncSig)
+        IK != PredefinedIdentKind::LFuncSig;
+    if ((ForceElaboratedPrinting &&
+         (IsFuncOrFunctionInNonMSVCCompatEnv || IsLFunctionInMSVCCommpatEnv)) ||
+        (!ForceElaboratedPrinting && IsFuncOrFunctionOrLFunctionOrFuncDName))
       return FD->getNameAsString();
 
     SmallString<256> Name;
@@ -755,6 +767,8 @@ std::string PredefinedExpr::ComputeName(PredefinedIdentKind IK,
     PrintingPolicy Policy(Context.getLangOpts());
     PrettyCallbacks PrettyCB(Context.getLangOpts());
     Policy.Callbacks = &PrettyCB;
+    if (IK == PredefinedIdentKind::Function && ForceElaboratedPrinting)
+      Policy.SuppressTagKeyword = !LO.MSVCCompat;
     std::string Proto;
     llvm::raw_string_ostream POut(Proto);
 
@@ -782,6 +796,12 @@ std::string PredefinedExpr::ComputeName(PredefinedIdentKind IK,
 
     FD->printQualifiedName(POut, Policy);
 
+    if (IK == PredefinedIdentKind::Function) {
+      POut.flush();
+      Out << Proto;
+      return std::string(Name);
+    }
+
     POut << "(";
     if (FT) {
       for (unsigned i = 0, e = Decl->getNumParams(); i != e; ++i) {
@@ -4604,8 +4624,17 @@ SourceRange DesignatedInitExpr::getDesignatorsSourceRange() const {
 SourceLocation DesignatedInitExpr::getBeginLoc() const {
   auto *DIE = const_cast<DesignatedInitExpr *>(this);
   Designator &First = *DIE->getDesignator(0);
-  if (First.isFieldDesignator())
-    return GNUSyntax ? First.getFieldLoc() : First.getDotLoc();
+  if (First.isFieldDesignator()) {
+    // Skip past implicit designators for anonymous structs/unions, since
+    // these do not have valid source locations.
+    for (unsigned int i = 0; i < DIE->size(); i++) {
+      Designator &Des = *DIE->getDesignator(i);
+      SourceLocation retval = GNUSyntax ? Des.getFieldLoc() : Des.getDotLoc();
+      if (!retval.isValid())
+        continue;
+      return retval;
+    }
+  }
   return First.getLBracketLoc();
 }
 
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index b154a196e11c7d..592d43597dc1b4 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -5594,6 +5594,9 @@ static EvalStmtResult EvaluateStmt(StmtResult &Result, EvalInfo &Info,
         if (Assumption->isValueDependent())
           return ESR_Failed;
 
+        if (Assumption->HasSideEffects(Info.getCtx()))
+          continue;
+
         bool Value;
         if (!EvaluateAsBooleanCondition(Assumption, Value, Info))
           return ESR_Failed;
@@ -7344,9 +7347,6 @@ class BufferToAPValueConverter {
       for (size_t I = 0, E = CXXRD->getNumBases(); I != E; ++I) {
         const CXXBaseSpecifier &BS = CXXRD->bases_begin()[I];
         CXXRecordDecl *BaseDecl = BS.getType()->getAsCXXRecordDecl();
-        if (BaseDecl->isEmpty() ||
-            Info.Ctx.getASTRecordLayout(BaseDecl).getNonVirtualSize().isZero())
-          continue;
 
         std::optional<APValue> SubObj = visitType(
             BS.getType(), Layout.getBaseClassOffset(BaseDecl) + Offset);
@@ -9253,7 +9253,8 @@ bool PointerExprEvaluator::VisitCastExpr(const CastExpr *E) {
            Info.getLangOpts().CPlusPlus26)) {
         // Permitted.
       } else {
-        if (SubExpr->getType()->isVoidPointerType()) {
+        if (SubExpr->getType()->isVoidPointerType() &&
+            Info.getLangOpts().CPlusPlus) {
           if (HasValidResult)
             CCEDiag(E, diag::note_constexpr_invalid_void_star_cast)
                 << SubExpr->getType() << Info.getLangOpts().CPlusPlus26
@@ -11858,8 +11859,8 @@ static QualType getObjectType(APValue::LValueBase B) {
 static const Expr *ignorePointerCastsAndParens(const Expr *E) {
   assert(E->isPRValue() && E->getType()->hasPointerRepresentation());
 
-  auto *NoParens = E->IgnoreParens();
-  auto *Cast = dyn_cast<CastExpr>(NoParens);
+  const Expr *NoParens = E->IgnoreParens();
+  const auto *Cast = dyn_cast<CastExpr>(NoParens);
   if (Cast == nullptr)
     return NoParens;
 
@@ -11870,7 +11871,7 @@ static const Expr *ignorePointerCastsAndParens(const Expr *E) {
       CastKind != CK_AddressSpaceConversion)
     return NoParens;
 
-  auto *SubExpr = Cast->getSubExpr();
+  const auto *SubExpr = Cast->getSubExpr();
   if (!SubExpr->getType()->hasPointerRepresentation() || !SubExpr->isPRValue())
     return NoParens;
   return ignorePointerCastsAndParens(SubExpr);
@@ -12942,6 +12943,10 @@ static bool isOnePastTheEndOfCompleteObject(const ASTContext &Ctx,
   if (Ty->isIncompleteType())
     return true;
 
+  // Can't be past the end of an invalid object.
+  if (LV.getLValueDesignator().Invalid)
+    return false;
+
   // We're a past-the-end pointer if we point to the byte after the object,
   // no matter what our type or path is.
   auto Size = Ctx.getTypeSizeInChars(Ty);
diff --git a/clang/lib/AST/Interp/ByteCodeExprGen.cpp b/clang/lib/AST/Interp/ByteCodeExprGen.cpp
index 86304a54473cea..73831eefba4568 100644
--- a/clang/lib/AST/Interp/ByteCodeExprGen.cpp
+++ b/clang/lib/AST/Interp/ByteCodeExprGen.cpp
@@ -82,15 +82,27 @@ bool ByteCodeExprGen<Emitter>::VisitCastExpr(const CastExpr *CE) {
     if (DiscardResult)
       return this->discard(SubExpr);
 
-    if (SubExpr->getType()->isAnyComplexType())
-      return this->delegate(SubExpr);
+    std::optional<PrimType> SubExprT = classify(SubExpr->getType());
+    // Prepare storage for the result.
+    if (!Initializing && !SubExprT) {
+      std::optional<unsigned> LocalIndex =
+          allocateLocal(SubExpr, /*IsExtended=*/false);
+      if (!LocalIndex)
+        return false;
+      if (!this->emitGetPtrLocal(*LocalIndex, CE))
+        return false;
+    }
 
     if (!this->visit(SubExpr))
       return false;
 
-    if (std::optional<PrimType> SubExprT = classify(SubExpr->getType()))
+    if (SubExprT)
       return this->emitLoadPop(*SubExprT, CE);
-    return false;
+
+    // If the subexpr type is not primitive, we need to perform a copy here.
+    // This happens for example in C when dereferencing a pointer of struct
+    // type.
+    return this->emitMemcpy(CE);
   }
 
   case CK_UncheckedDerivedToBase:
@@ -296,14 +308,11 @@ bool ByteCodeExprGen<Emitter>::VisitCastExpr(const CastExpr *CE) {
     // Location for the SubExpr.
     // Since SubExpr is of complex type, visiting it results in a pointer
     // anyway, so we just create a temporary pointer variable.
-    std::optional<unsigned> SubExprOffset = allocateLocalPrimitive(
+    unsigned SubExprOffset = allocateLocalPrimitive(
         SubExpr, PT_Ptr, /*IsConst=*/true, /*IsExtended=*/false);
-    if (!SubExprOffset)
-      return false;
-
     if (!this->visit(SubExpr))
       return false;
-    if (!this->emitSetLocal(PT_Ptr, *SubExprOffset, CE))
+    if (!this->emitSetLocal(PT_Ptr, SubExprOffset, CE))
       return false;
 
     PrimType SourceElemT = classifyComplexElementType(SubExpr->getType());
@@ -312,7 +321,7 @@ bool ByteCodeExprGen<Emitter>::VisitCastExpr(const CastExpr *CE) {
     PrimType DestElemT = classifyPrim(DestElemType);
     // Cast both elements individually.
     for (unsigned I = 0; I != 2; ++I) {
-      if (!this->emitGetLocal(PT_Ptr, *SubExprOffset, CE))
+      if (!this->emitGetLocal(PT_Ptr, SubExprOffset, CE))
         return false;
       if (!this->emitArrayElemPop(SourceElemT, I, CE))
         return false;
@@ -392,6 +401,17 @@ bool ByteCodeExprGen<Emitter>::VisitBinaryOperator(const BinaryOperator *BO) {
   const Expr *LHS = BO->getLHS();
   const Expr *RHS = BO->getRHS();
 
+  // Handle comma operators. Just discard the LHS
+  // and delegate to RHS.
+  if (BO->isCommaOp()) {
+    if (!this->discard(LHS))
+      return false;
+    if (RHS->getType()->isVoidType())
+      return this->discard(RHS);
+
+    return this->delegate(RHS);
+  }
+
   if (BO->getType()->isAnyComplexType())
     return this->VisitComplexBinOp(BO);
   if ((LHS->getType()->isAnyComplexType() ||
@@ -407,16 +427,6 @@ bool ByteCodeExprGen<Emitter>::VisitBinaryOperator(const BinaryOperator *BO) {
   std::optional<PrimType> RT = classify(RHS->getType());
   std::optional<PrimType> T = classify(BO->getType());
 
-  // Deal with operations which have composite or void types.
-  if (BO->isCommaOp()) {
-    if (!this->discard(LHS))
-      return false;
-    if (RHS->getType()->isVoidType())
-      return this->discard(RHS);
-
-    return this->delegate(RHS);
-  }
-
   // Special case for C++'s three-way/spaceship operator <=>, which
   // returns a std::{strong,weak,partial}_ordering (which is a class, so doesn't
   // have a PrimType).
@@ -676,11 +686,17 @@ bool ByteCodeExprGen<Emitter>::VisitComplexBinOp(const BinaryOperator *E) {
     if (!this->emitSetLocal(PT_Ptr, ResultOffset, E))
       return false;
   }
+  QualType LHSType = LHS->getType();
+  if (const auto *AT = LHSType->getAs<AtomicType>())
+    LHSType = AT->getValueType();
+  QualType RHSType = RHS->getType();
+  if (const auto *AT = RHSType->getAs<AtomicType>())
+    RHSType = AT->getValueType();
 
   // Evaluate LHS and save value to LHSOffset.
   bool LHSIsComplex;
   unsigned LHSOffset;
-  if (LHS->getType()->isAnyComplexType()) {
+  if (LHSType->isAnyComplexType()) {
     LHSIsComplex = true;
     LHSOffset = this->allocateLocalPrimitive(LHS, PT_Ptr, true, false);
     if (!this->visit(LHS))
@@ -689,7 +705,7 @@ bool ByteCodeExprGen<Emitter>::VisitComplexBinOp(const BinaryOperator *E) {
       return false;
   } else {
     LHSIsComplex = false;
-    PrimType LHST = classifyPrim(LHS->getType());
+    PrimType LHST = classifyPrim(LHSType);
     LHSOffset = this->allocateLocalPrimitive(LHS, LHST, true, false);
     if (!this->visit(LHS))
       return false;
@@ -700,7 +716,7 @@ bool ByteCodeExprGen<Emitter>::VisitComplexBinOp(const BinaryOperator *E) {
   // Same with RHS.
   bool RHSIsComplex;
   unsigned RHSOffset;
-  if (RHS->getType()->isAnyComplexType()) {
+  if (RHSType->isAnyComplexType()) {
     RHSIsComplex = true;
     RHSOffset = this->allocateLocalPrimitive(RHS, PT_Ptr, true, false);
     if (!this->visit(RHS))
@@ -709,7 +725,7 @@ bool ByteCodeExprGen<Emitter>::VisitComplexBinOp(const BinaryOperator *E) {
       return false;
   } else {
     RHSIsComplex = false;
-    PrimType RHST = classifyPrim(RHS->getType());
+    PrimType RHST = classifyPrim(RHSType);
     RHSOffset = this->allocateLocalPrimitive(RHS, RHST, true, false);
     if (!this->visit(RHS))
       return false;
@@ -1081,9 +1097,9 @@ template <class Emitter>
 bool ByteCodeExprGen<Emitter>::VisitUnaryExprOrTypeTraitExpr(
     const UnaryExprOrTypeTraitExpr *E) {
   UnaryExprOrTypeTrait Kind = E->getKind();
-  ASTContext &ASTCtx = Ctx.getASTContext();
+  const ASTContext &ASTCtx = Ctx.getASTContext();
 
-  if (Kind == UETT_SizeOf) {
+  if (Kind == UETT_SizeOf || Kind == UETT_DataSizeOf) {
     QualType ArgType = E->getTypeOfArgument();
 
     // C++ [expr.sizeof]p2: "When applied to a reference or a reference type,
@@ -1098,7 +1114,10 @@ bool ByteCodeExprGen<Emitter>::VisitUnaryExprOrTypeTraitExpr(
       if (ArgType->isDependentType() || !ArgType->isConstantSizeType())
         return false;
 
-      Size = ASTCtx.getTypeSizeInChars(ArgType);
+      if (Kind == UETT_SizeOf)
+        Size = ASTCtx.getTypeSizeInChars(ArgType);
+      else
+        Size = ASTCtx.getTypeInfoDataSizeInChars(ArgType).Width;
     }
 
     if (DiscardResult)
@@ -1233,22 +1252,19 @@ bool ByteCodeExprGen<Emitter>::VisitOpaqueValueExpr(const OpaqueValueExpr *E) {
   // At this point we either have the evaluated source expression or a pointer
   // to an object on the stack. We want to create a local variable that stores
   // this value.
-  std::optional<unsigned> LocalIndex =
-      allocateLocalPrimitive(E, SubExprT, /*IsConst=*/true);
-  if (!LocalIndex)
-    return false;
-  if (!this->emitSetLocal(SubExprT, *LocalIndex, E))
+  unsigned LocalIndex = allocateLocalPrimitive(E, SubExprT, /*IsConst=*/true);
+  if (!this->emitSetLocal(SubExprT, LocalIndex, E))
     return false;
 
   // Here the local variable is created but the value is removed from the stack,
   // so we put it back if the caller needs it.
   if (!DiscardResult) {
-    if (!this->emitGetLocal(SubExprT, *LocalIndex, E))
+    if (!this->emitGetLocal(SubExprT, LocalIndex, E))
       return false;
   }
 
   // This is cleaned up when the local variable is destroyed.
-  OpaqueExprs.insert({E, *LocalIndex});
+  OpaqueExprs.insert({E, LocalIndex});
 
   return true;
 }
@@ -1642,14 +1658,13 @@ bool ByteCodeExprGen<Emitter>::VisitMaterializeTemporaryExpr(
 
   // For everyhing else, use local variables.
   if (SubExprT) {
-    if (std::optional<unsigned> LocalIndex = allocateLocalPrimitive(
-            SubExpr, *SubExprT, /*IsConst=*/true, /*IsExtended=*/true)) {
-      if (!this->visit(SubExpr))
-        return false;
-      if (!this->emitSetLocal(*SubExprT, *LocalIndex, E))
-        return false;
-      return this->emitGetPtrLocal(*LocalIndex, E);
-    }
+    unsigned LocalIndex = allocateLocalPrimitive(
+        SubExpr, *SubExprT, /*IsConst=*/true, /*IsExtended=*/true);
+    if (!this->visit(SubExpr))
+      return false;
+    if (!this->emitSetLocal(*SubExprT, LocalIndex, E))
+      return false;
+    return this->emitGetPtrLocal(LocalIndex, E);
   } else {
     const Expr *Inner = E->getSubExpr()->skipRValueSubobjectAdjustments();
 
@@ -1744,6 +1759,14 @@ bool ByteCodeExprGen<Emitter>::VisitTypeTraitExpr(const TypeTraitExpr *E) {
   return this->emitConst(E->getValue(), E);
 }
 
+template <class Emitter>
+bool ByteCodeExprGen<Emitter>::VisitArrayTypeTraitExpr(
+    const ArrayTypeTraitExpr *E) {
+  if (DiscardResult)
+    return true;
+  return this->emitConst(E->getValue(), E);
+}
+
 template <class Emitter>
 bool ByteCodeExprGen<Emitter>::VisitLambdaExpr(const LambdaExpr *E) {
   if (DiscardResult)
@@ -2215,6 +2238,12 @@ bool ByteCodeExprGen<Emitter>::VisitPseudoObjectExpr(
   return true;
 }
 
+template <class Emitter>
+bool ByteCodeExprGen<Emitter>::VisitPackIndexingExpr(
+    const PackIndexingExpr *E) {
+  return this->delegate(E->getSelectedExpr());
+}
+
 template <class Emitter> bool ByteCodeExprGen<Emitter>::discard(const Expr *E) {
   if (E->containsErrors())
     return false;
@@ -2227,7 +2256,7 @@ template <class Emitter> bool ByteCodeExprGen<Emitter>::discard(const Expr *E) {
 template <class Emitter>
 bool ByteCodeExprGen<Emitter>::delegate(const Expr *E) {
   if (E->containsErrors())
-    return false;
+    return this->emitError(E);
 
   // We're basically doing:
   // OptionScope<Emitter> Scope(this, DicardResult, Initializing);
@@ -2237,7 +2266,7 @@ bool ByteCodeExprGen<Emitter>::delegate(const Expr *E) {
 
 template <class Emitter> bool ByteCodeExprGen<Emitter>::visit(const Expr *E) {
   if (E->containsErrors())
-    return false;
+    return this->emitError(E);
 
   if (E->getType()->isVoidType())
     return this->discard(E);
@@ -2266,7 +2295,7 @@ bool ByteCodeExprGen<Emitter>::visitInitializer(const Expr *E) {
   assert(!classify(E->getType()));
 
   if (E->containsErrors())
-    return false;
+    return this->emitError(E);
 
   OptionScope<Emitter> Scope(this, /*NewDiscardResult=*/false,
                              /*NewInitializing=*/true);
@@ -2660,18 +2689,12 @@ bool ByteCodeExprGen<Emitter>::visitVarDecl(const VarDecl *VD) {
     if (P.getGlobal(VD))
       return true;
 
-    // Ignore external declarations. We will instead emit a dummy
-    // pointer when we see a DeclRefExpr for them.
-    if (VD->hasExternalStorage())
-      return true;
-
     std::optional<unsigned> GlobalIndex = P.createGlobal(VD, Init);
 
     if (!GlobalIndex)
       return false;
 
-    assert(Init);
-    {
+    if (Init) {
       DeclScope<Emitter> LocalScope(this, VD);
 
       if (VarT) {
@@ -2681,6 +2704,7 @@ bool ByteCodeExprGen<Emitter>::visitVarDecl(const VarDecl *VD) {
       }
       return this->visitGlobalInitializer(Init, *GlobalIndex);
     }
+    return true;
   } else {
     VariableScope<Emitter> LocalScope(this);
     if (VarT) {
@@ -2893,11 +2917,7 @@ template <class Emitter>
 bool ByteCodeExprGen<Emitter>::VisitCXXDefaultInitExpr(
     const CXXDefaultInitExpr *E) {
   SourceLocScope<Emitter> SLS(this, E);
-  if (Initializing)
-    return this->visitInitializer(E->getExpr());
-
-  assert(classify(E->getType()));
-  return this->visit(E->getExpr());
+  return this->delegate(E->getExpr());
 }
 
 template <class Emitter>
@@ -3247,53 +3267,20 @@ bool ByteCodeExprGen<Emitter>::VisitDeclRefExpr(const DeclRefExpr *E) {
   // pointer to the actual value) instead of a pointer to the pointer to the
   // value.
   bool IsReference = D->getType()->isReferenceType();
-  // Complex values are copied in the AST via a simply assignment or
-  // ltor cast. But we represent them as two-element arrays, which means
-  // we pass them around as pointers. So, to assignm from them, we will
-  // have to copy both (primitive) elements instead.
-  bool IsComplex = D->getType()->isAnyComplexType();
 
   // Check for local/global variables and parameters.
   if (auto It = Locals.find(D); It != Locals.end()) {
     const unsigned Offset = It->second.Offset;
-    // FIXME: Fix the code duplication here with the code in the global case.
-    if (Initializing && IsComplex) {
-      PrimType ElemT = classifyComplexElementType(D->getType());
-      for (unsigned I = 0; I != 2; ++I) {
-        if (!this->emitGetPtrLocal(Offset, E))
-          return false;
-        if (!this->emitArrayElemPop(ElemT, I, E))
-          return false;
-        if (!this->emitInitElem(ElemT, I, E))
-          return false;
-      }
-      return true;
-    }
-
     if (IsReference)
       return this->emitGetLocal(PT_Ptr, Offset, E);
     return this->emitGetPtrLocal(Offset, E);
   } else if (auto GlobalIndex = P.getGlobal(D)) {
-    if (Initializing && IsComplex) {
-      PrimType ElemT = classifyComplexElementType(D->getType());
-      for (unsigned I = 0; I != 2; ++I) {
-        if (!this->emitGetPtrGlobal(*GlobalIndex, E))
-          return false;
-        if (!this->emitArrayElemPop(ElemT, I, E))
-          return false;
-        if (!this->emitInitElem(ElemT, I, E))
-          return false;
-      }
-      return true;
-    }
-
     if (IsReference)
       return this->emitGetGlobalPtr(*GlobalIndex, E);
 
     return this->emitGetPtrGlobal(*GlobalIndex, E);
   } else if (const auto *PVD = dyn_cast<ParmVarDecl>(D)) {
     if (auto It = this->Params.find(PVD); It != this->Params.end()) {
-      // FIXME: _Complex initializing case?
       if (IsReference || !It->second.IsPtr)
         return this->emitGetParamPtr(It->second.Offset, E);
 
@@ -3316,15 +3303,13 @@ bool ByteCodeExprGen<Emitter>::VisitDeclRefExpr(const DeclRefExpr *E) {
   if (Ctx.getLangOpts().CPlusPlus) {
     if (const auto *VD = dyn_cast<VarDecl>(D)) {
       // Visit local const variables like normal.
-      if (VD->isLocalVarDecl() && VD->getType().isConstQualified()) {
+      if ((VD->isLocalVarDecl() || VD->isStaticDataMember()) &&
+          VD->getType().isConstQualified()) {
         if (!this->visitVarDecl(VD))
           return false;
         // Retry.
         return this->VisitDeclRefExpr(E);
       }
-
-      if (VD->hasExternalStorage())
-        return this->emitInvalidDeclRef(E, E);
     }
   } else {
     if (const auto *VD = dyn_cast<VarDecl>(D);
@@ -3352,6 +3337,8 @@ template <class Emitter>
 unsigned
 ByteCodeExprGen<Emitter>::collectBaseOffset(const RecordType *BaseType,
                                             const RecordType *DerivedType) {
+  assert(BaseType);
+  assert(DerivedType);
   const auto *FinalDecl = cast<CXXRecordDecl>(BaseType->getDecl());
   const RecordDecl *CurDecl = DerivedType->getDecl();
   const Record *CurRecord = getRecord(CurDecl);
diff --git a/clang/lib/AST/Interp/ByteCodeExprGen.h b/clang/lib/AST/Interp/ByteCodeExprGen.h
index 5ad2e74d7c2693..db0d73ce23f7c4 100644
--- a/clang/lib/AST/Interp/ByteCodeExprGen.h
+++ b/clang/lib/AST/Interp/ByteCodeExprGen.h
@@ -99,6 +99,7 @@ class ByteCodeExprGen : public ConstStmtVisitor<ByteCodeExprGen<Emitter>, bool>,
   bool VisitCXXBindTemporaryExpr(const CXXBindTemporaryExpr *E);
   bool VisitCompoundLiteralExpr(const CompoundLiteralExpr *E);
   bool VisitTypeTraitExpr(const TypeTraitExpr *E);
+  bool VisitArrayTypeTraitExpr(const ArrayTypeTraitExpr *E);
   bool VisitLambdaExpr(const LambdaExpr *E);
   bool VisitPredefinedExpr(const PredefinedExpr *E);
   bool VisitCXXThrowExpr(const CXXThrowExpr *E);
@@ -119,6 +120,7 @@ class ByteCodeExprGen : public ConstStmtVisitor<ByteCodeExprGen<Emitter>, bool>,
   bool VisitConceptSpecializationExpr(const ConceptSpecializationExpr *E);
   bool VisitCXXRewrittenBinaryOperator(const CXXRewrittenBinaryOperator *E);
   bool VisitPseudoObjectExpr(const PseudoObjectExpr *E);
+  bool VisitPackIndexingExpr(const PackIndexingExpr *E);
 
 protected:
   bool visitExpr(const Expr *E) override;
diff --git a/clang/lib/AST/Interp/ByteCodeStmtGen.cpp b/clang/lib/AST/Interp/ByteCodeStmtGen.cpp
index 6da3860f98d8c0..675063e7489886 100644
--- a/clang/lib/AST/Interp/ByteCodeStmtGen.cpp
+++ b/clang/lib/AST/Interp/ByteCodeStmtGen.cpp
@@ -273,15 +273,18 @@ bool ByteCodeStmtGen<Emitter>::visitStmt(const Stmt *S) {
     return visitCaseStmt(cast<CaseStmt>(S));
   case Stmt::DefaultStmtClass:
     return visitDefaultStmt(cast<DefaultStmt>(S));
-  case Stmt::GCCAsmStmtClass:
-  case Stmt::MSAsmStmtClass:
-    return visitAsmStmt(cast<AsmStmt>(S));
   case Stmt::AttributedStmtClass:
     return visitAttributedStmt(cast<AttributedStmt>(S));
   case Stmt::CXXTryStmtClass:
     return visitCXXTryStmt(cast<CXXTryStmt>(S));
   case Stmt::NullStmtClass:
     return true;
+  // Always invalid statements.
+  case Stmt::GCCAsmStmtClass:
+  case Stmt::MSAsmStmtClass:
+  case Stmt::GotoStmtClass:
+  case Stmt::LabelStmtClass:
+    return this->emitInvalid(S);
   default: {
     if (auto *Exp = dyn_cast<Expr>(S))
       return this->discard(Exp);
@@ -420,6 +423,11 @@ bool ByteCodeStmtGen<Emitter>::visitWhileStmt(const WhileStmt *S) {
   LoopScope<Emitter> LS(this, EndLabel, CondLabel);
 
   this->emitLabel(CondLabel);
+
+  if (const DeclStmt *CondDecl = S->getConditionVariableDeclStmt())
+    if (!visitDeclStmt(CondDecl))
+      return false;
+
   if (!this->visitBool(Cond))
     return false;
   if (!this->jumpFalse(EndLabel))
@@ -484,6 +492,10 @@ bool ByteCodeStmtGen<Emitter>::visitForStmt(const ForStmt *S) {
   if (Init && !this->visitStmt(Init))
     return false;
   this->emitLabel(CondLabel);
+
+  if (const DeclStmt *CondDecl = S->getConditionVariableDeclStmt())
+    if (!visitDeclStmt(CondDecl))
+      return false;
   if (Cond) {
     if (!this->visitBool(Cond))
       return false;
@@ -582,17 +594,21 @@ bool ByteCodeStmtGen<Emitter>::visitContinueStmt(const ContinueStmt *S) {
 template <class Emitter>
 bool ByteCodeStmtGen<Emitter>::visitSwitchStmt(const SwitchStmt *S) {
   const Expr *Cond = S->getCond();
-  PrimType CondT = this->classifyPrim(Cond->getType());
 
   LabelTy EndLabel = this->getLabel();
   OptLabelTy DefaultLabel = std::nullopt;
-  unsigned CondVar = this->allocateLocalPrimitive(Cond, CondT, true, false);
 
   if (const auto *CondInit = S->getInit())
     if (!visitStmt(CondInit))
       return false;
 
+  if (const DeclStmt *CondDecl = S->getConditionVariableDeclStmt())
+    if (!visitDeclStmt(CondDecl))
+      return false;
+
   // Initialize condition variable.
+  PrimType CondT = this->classifyPrim(Cond->getType());
+  unsigned CondVar = this->allocateLocalPrimitive(Cond, CondT, true, false);
   if (!this->visit(Cond))
     return false;
   if (!this->emitSetLocal(CondT, CondVar, S))
@@ -657,11 +673,6 @@ bool ByteCodeStmtGen<Emitter>::visitDefaultStmt(const DefaultStmt *S) {
   return this->visitStmt(S->getSubStmt());
 }
 
-template <class Emitter>
-bool ByteCodeStmtGen<Emitter>::visitAsmStmt(const AsmStmt *S) {
-  return this->emitInvalid(S);
-}
-
 template <class Emitter>
 bool ByteCodeStmtGen<Emitter>::visitAttributedStmt(const AttributedStmt *S) {
   // Ignore all attributes.
diff --git a/clang/lib/AST/Interp/ByteCodeStmtGen.h b/clang/lib/AST/Interp/ByteCodeStmtGen.h
index 64e03587ab2112..ab7a591fb798ee 100644
--- a/clang/lib/AST/Interp/ByteCodeStmtGen.h
+++ b/clang/lib/AST/Interp/ByteCodeStmtGen.h
@@ -63,7 +63,6 @@ class ByteCodeStmtGen final : public ByteCodeExprGen<Emitter> {
   bool visitSwitchStmt(const SwitchStmt *S);
   bool visitCaseStmt(const CaseStmt *S);
   bool visitDefaultStmt(const DefaultStmt *S);
-  bool visitAsmStmt(const AsmStmt *S);
   bool visitAttributedStmt(const AttributedStmt *S);
   bool visitCXXTryStmt(const CXXTryStmt *S);
 
diff --git a/clang/lib/AST/Interp/Descriptor.cpp b/clang/lib/AST/Interp/Descriptor.cpp
index ce7ed9cec3db3f..a4ccc0236d292c 100644
--- a/clang/lib/AST/Interp/Descriptor.cpp
+++ b/clang/lib/AST/Interp/Descriptor.cpp
@@ -233,9 +233,10 @@ static BlockMoveFn getMoveArrayPrim(PrimType Type) {
 Descriptor::Descriptor(const DeclTy &D, PrimType Type, MetadataSize MD,
                        bool IsConst, bool IsTemporary, bool IsMutable)
     : Source(D), ElemSize(primSize(Type)), Size(ElemSize),
-      MDSize(MD.value_or(0)), AllocSize(align(Size + MDSize)), IsConst(IsConst),
-      IsMutable(IsMutable), IsTemporary(IsTemporary), CtorFn(getCtorPrim(Type)),
-      DtorFn(getDtorPrim(Type)), MoveFn(getMovePrim(Type)) {
+      MDSize(MD.value_or(0)), AllocSize(align(Size + MDSize)), PrimT(Type),
+      IsConst(IsConst), IsMutable(IsMutable), IsTemporary(IsTemporary),
+      CtorFn(getCtorPrim(Type)), DtorFn(getDtorPrim(Type)),
+      MoveFn(getMovePrim(Type)) {
   assert(AllocSize >= Size);
   assert(Source && "Missing source");
 }
@@ -246,7 +247,7 @@ Descriptor::Descriptor(const DeclTy &D, PrimType Type, MetadataSize MD,
                        bool IsMutable)
     : Source(D), ElemSize(primSize(Type)), Size(ElemSize * NumElems),
       MDSize(MD.value_or(0)),
-      AllocSize(align(MDSize) + align(Size) + sizeof(InitMapPtr)),
+      AllocSize(align(MDSize) + align(Size) + sizeof(InitMapPtr)), PrimT(Type),
       IsConst(IsConst), IsMutable(IsMutable), IsTemporary(IsTemporary),
       IsArray(true), CtorFn(getCtorArrayPrim(Type)),
       DtorFn(getDtorArrayPrim(Type)), MoveFn(getMoveArrayPrim(Type)) {
@@ -300,10 +301,19 @@ Descriptor::Descriptor(const DeclTy &D, const Record *R, MetadataSize MD,
   assert(Source && "Missing source");
 }
 
-Descriptor::Descriptor(const DeclTy &D, MetadataSize MD)
-    : Source(D), ElemSize(1), Size(ElemSize), MDSize(MD.value_or(0)),
-      AllocSize(Size + MDSize), ElemRecord(nullptr), IsConst(true),
-      IsMutable(false), IsTemporary(false), IsDummy(true) {
+/// Dummy.
+Descriptor::Descriptor(const DeclTy &D)
+    : Source(D), ElemSize(1), Size(1), MDSize(0), AllocSize(MDSize),
+      ElemRecord(nullptr), IsConst(true), IsMutable(false), IsTemporary(false),
+      IsDummy(true) {
+  assert(Source && "Missing source");
+}
+
+/// Dummy array.
+Descriptor::Descriptor(const DeclTy &D, UnknownSize)
+    : Source(D), ElemSize(1), Size(UnknownSizeMark), MDSize(0),
+      AllocSize(MDSize), ElemRecord(nullptr), IsConst(true), IsMutable(false),
+      IsTemporary(false), IsArray(true), IsDummy(true) {
   assert(Source && "Missing source");
 }
 
diff --git a/clang/lib/AST/Interp/Descriptor.h b/clang/lib/AST/Interp/Descriptor.h
index 0f64d678f3ef6b..4e257361ad146b 100644
--- a/clang/lib/AST/Interp/Descriptor.h
+++ b/clang/lib/AST/Interp/Descriptor.h
@@ -112,6 +112,10 @@ struct Descriptor final {
   const Record *const ElemRecord = nullptr;
   /// Descriptor of the array element.
   const Descriptor *const ElemDesc = nullptr;
+  /// The primitive type this descriptor was created for,
+  /// or the primitive element type in case this is
+  /// a primitive array.
+  const std::optional<PrimType> PrimT = std::nullopt;
   /// Flag indicating if the block is mutable.
   const bool IsConst = false;
   /// Flag indicating if a field is mutable.
@@ -152,7 +156,11 @@ struct Descriptor final {
   Descriptor(const DeclTy &D, const Record *R, MetadataSize MD, bool IsConst,
              bool IsTemporary, bool IsMutable);
 
-  Descriptor(const DeclTy &D, MetadataSize MD);
+  /// Allocates a dummy descriptor.
+  Descriptor(const DeclTy &D);
+
+  /// Allocates a dummy array descriptor.
+  Descriptor(const DeclTy &D, UnknownSize);
 
   QualType getType() const;
   QualType getElemQualType() const;
@@ -183,6 +191,11 @@ struct Descriptor final {
     return Size;
   }
 
+  PrimType getPrimType() const {
+    assert(isPrimitiveArray() || isPrimitive());
+    return *PrimT;
+  }
+
   /// Returns the allocated size, including metadata.
   unsigned getAllocSize() const { return AllocSize; }
   /// returns the size of an element when the structure is viewed as an array.
diff --git a/clang/lib/AST/Interp/Disasm.cpp b/clang/lib/AST/Interp/Disasm.cpp
index 315ddb293044b7..01ef1c24744a58 100644
--- a/clang/lib/AST/Interp/Disasm.cpp
+++ b/clang/lib/AST/Interp/Disasm.cpp
@@ -10,9 +10,13 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "Boolean.h"
 #include "Floating.h"
 #include "Function.h"
+#include "FunctionPointer.h"
+#include "Integral.h"
 #include "IntegralAP.h"
+#include "InterpFrame.h"
 #include "Opcode.h"
 #include "PrimType.h"
 #include "Program.h"
@@ -86,6 +90,40 @@ LLVM_DUMP_METHOD void Function::dump(llvm::raw_ostream &OS) const {
 
 LLVM_DUMP_METHOD void Program::dump() const { dump(llvm::errs()); }
 
+static const char *primTypeToString(PrimType T) {
+  switch (T) {
+  case PT_Sint8:
+    return "Sint8";
+  case PT_Uint8:
+    return "Uint8";
+  case PT_Sint16:
+    return "Sint16";
+  case PT_Uint16:
+    return "Uint16";
+  case PT_Sint32:
+    return "Sint32";
+  case PT_Uint32:
+    return "Uint32";
+  case PT_Sint64:
+    return "Sint64";
+  case PT_Uint64:
+    return "Uint64";
+  case PT_IntAP:
+    return "IntAP";
+  case PT_IntAPS:
+    return "IntAPS";
+  case PT_Bool:
+    return "Bool";
+  case PT_Float:
+    return "Float";
+  case PT_Ptr:
+    return "Ptr";
+  case PT_FnPtr:
+    return "FnPtr";
+  }
+  llvm_unreachable("Unhandled PrimType");
+}
+
 LLVM_DUMP_METHOD void Program::dump(llvm::raw_ostream &OS) const {
   {
     ColorScope SC(OS, true, {llvm::raw_ostream::BRIGHT_RED, true});
@@ -100,9 +138,10 @@ LLVM_DUMP_METHOD void Program::dump(llvm::raw_ostream &OS) const {
   unsigned GI = 0;
   for (const Global *G : Globals) {
     const Descriptor *Desc = G->block()->getDescriptor();
+    Pointer GP = getPtrGlobal(GI);
+
     OS << GI << ": " << (void *)G->block() << " ";
     {
-      Pointer GP = getPtrGlobal(GI);
       ColorScope SC(OS, true,
                     GP.isInitialized()
                         ? TerminalColor{llvm::raw_ostream::GREEN, false}
@@ -111,6 +150,15 @@ LLVM_DUMP_METHOD void Program::dump(llvm::raw_ostream &OS) const {
     }
     Desc->dump(OS);
     OS << "\n";
+    if (Desc->isPrimitive() && !Desc->isDummy()) {
+      OS << "   ";
+      {
+        ColorScope SC(OS, true, {llvm::raw_ostream::BRIGHT_CYAN, false});
+        OS << primTypeToString(Desc->getPrimType()) << " ";
+      }
+      TYPE_SWITCH(Desc->getPrimType(), { GP.deref<T>().print(OS); });
+      OS << "\n";
+    }
     ++GI;
   }
 
@@ -136,7 +184,7 @@ LLVM_DUMP_METHOD void Descriptor::dump(llvm::raw_ostream &OS) const {
   {
     ColorScope SC(OS, true, {llvm::raw_ostream::BLUE, true});
     if (const auto *ND = dyn_cast_if_present<NamedDecl>(asDecl()))
-      OS << ND->getName();
+      ND->printQualifiedName(OS);
     else if (asExpr())
       OS << "expr (TODO)";
   }
@@ -159,3 +207,29 @@ LLVM_DUMP_METHOD void Descriptor::dump(llvm::raw_ostream &OS) const {
   if (isDummy())
     OS << " dummy";
 }
+
+LLVM_DUMP_METHOD void InterpFrame::dump(llvm::raw_ostream &OS,
+                                        unsigned Indent) const {
+  unsigned Spaces = Indent * 2;
+  {
+    ColorScope SC(OS, true, {llvm::raw_ostream::BLUE, true});
+    OS.indent(Spaces);
+    if (getCallee())
+      describe(OS);
+    else
+      OS << "Frame (Depth: " << getDepth() << ")";
+    OS << "\n";
+  }
+  OS.indent(Spaces) << "Function: " << getFunction();
+  if (const Function *F = getFunction()) {
+    OS << " (" << F->getName() << ")";
+  }
+  OS << "\n";
+  OS.indent(Spaces) << "This: " << getThis() << "\n";
+  OS.indent(Spaces) << "RVO: " << getRVOPtr() << "\n";
+
+  while (const InterpFrame *F = this->Caller) {
+    F->dump(OS, Indent + 1);
+    F = F->Caller;
+  }
+}
diff --git a/clang/lib/AST/Interp/Interp.cpp b/clang/lib/AST/Interp/Interp.cpp
index 4f3cd6cd21a151..0ce64a572c263f 100644
--- a/clang/lib/AST/Interp/Interp.cpp
+++ b/clang/lib/AST/Interp/Interp.cpp
@@ -254,10 +254,10 @@ bool CheckConstant(InterpState &S, CodePtr OpPC, const Descriptor *Desc) {
     if (VD->isConstexpr())
       return true;
 
+    QualType T = VD->getType();
     if (S.getLangOpts().CPlusPlus && !S.getLangOpts().CPlusPlus11)
-      return false;
+      return T->isSignedIntegerOrEnumerationType() || T->isUnsignedIntegerOrEnumerationType();
 
-    QualType T = VD->getType();
     if (T.isConstQualified())
       return true;
 
@@ -485,7 +485,9 @@ bool CheckCallable(InterpState &S, CodePtr OpPC, const Function *F) {
         // Don't emit anything if the function isn't defined and we're checking
         // for a constant expression. It might be defined at the point we're
         // actually calling it.
-        if (!DiagDecl->isDefined() && S.checkingPotentialConstantExpression())
+        bool IsExtern = DiagDecl->getStorageClass() == SC_Extern;
+        if (!DiagDecl->isDefined() && !IsExtern &&
+            S.checkingPotentialConstantExpression())
           return false;
 
         // If the declaration is defined _and_ declared 'constexpr', the below
diff --git a/clang/lib/AST/Interp/Interp.h b/clang/lib/AST/Interp/Interp.h
index db80e2d59753f8..405993eb827036 100644
--- a/clang/lib/AST/Interp/Interp.h
+++ b/clang/lib/AST/Interp/Interp.h
@@ -122,6 +122,9 @@ bool CheckNonNullArgs(InterpState &S, CodePtr OpPC, const Function *F,
 bool SetThreeWayComparisonField(InterpState &S, CodePtr OpPC,
                                 const Pointer &Ptr, const APSInt &IntValue);
 
+/// Copy the contents of Src into Dest.
+bool DoMemcpy(InterpState &S, CodePtr OpPC, const Pointer &Src, Pointer &Dest);
+
 /// Checks if the shift operation is legal.
 template <typename LT, typename RT>
 bool CheckShift(InterpState &S, CodePtr OpPC, const LT &LHS, const RT &RHS,
@@ -165,7 +168,8 @@ bool CheckDivRem(InterpState &S, CodePtr OpPC, const T &LHS, const T &RHS) {
     const auto *Op = cast<BinaryOperator>(S.Current->getExpr(OpPC));
     S.FFDiag(Op, diag::note_expr_divide_by_zero)
         << Op->getRHS()->getSourceRange();
-    return false;
+    if constexpr (!std::is_same_v<T, Floating>)
+      return false;
   }
 
   if (LHS.isSigned() && LHS.isMin() && RHS.isNegative() && RHS.isMinusOne()) {
@@ -1487,6 +1491,16 @@ bool InitElemPop(InterpState &S, CodePtr OpPC, uint32_t Idx) {
   return true;
 }
 
+inline bool Memcpy(InterpState &S, CodePtr OpPC) {
+  const Pointer &Src = S.Stk.pop<Pointer>();
+  Pointer &Dest = S.Stk.peek<Pointer>();
+
+  if (!CheckLoad(S, OpPC, Src))
+    return false;
+
+  return DoMemcpy(S, OpPC, Src, Dest);
+}
+
 //===----------------------------------------------------------------------===//
 // AddOffset, SubOffset
 //===----------------------------------------------------------------------===//
@@ -1933,8 +1947,15 @@ inline bool ArrayElemPtr(InterpState &S, CodePtr OpPC) {
   const T &Offset = S.Stk.pop<T>();
   const Pointer &Ptr = S.Stk.peek<Pointer>();
 
-  if (Ptr.isDummy())
-    return true;
+  if (!Ptr.isZero()) {
+    if (!CheckArray(S, OpPC, Ptr))
+      return false;
+
+    if (Ptr.isDummy()) {
+      S.Stk.push<Pointer>(Ptr);
+      return true;
+    }
+  }
 
   if (!OffsetHelper<T, ArithOp::Add>(S, OpPC, Offset, Ptr))
     return false;
@@ -1947,9 +1968,14 @@ inline bool ArrayElemPtrPop(InterpState &S, CodePtr OpPC) {
   const T &Offset = S.Stk.pop<T>();
   const Pointer &Ptr = S.Stk.pop<Pointer>();
 
-  if (Ptr.isDummy()) {
-    S.Stk.push<Pointer>(Ptr);
-    return true;
+  if (!Ptr.isZero()) {
+    if (!CheckArray(S, OpPC, Ptr))
+      return false;
+
+    if (Ptr.isDummy()) {
+      S.Stk.push<Pointer>(Ptr);
+      return true;
+    }
   }
 
   if (!OffsetHelper<T, ArithOp::Add>(S, OpPC, Offset, Ptr))
@@ -2201,6 +2227,9 @@ inline bool Invalid(InterpState &S, CodePtr OpPC) {
   return false;
 }
 
+/// Do nothing and just abort execution.
+inline bool Error(InterpState &S, CodePtr OpPC) { return false; }
+
 /// Same here, but only for casts.
 inline bool InvalidCast(InterpState &S, CodePtr OpPC, CastKind Kind) {
   const SourceLocation &Loc = S.Current->getLocation(OpPC);
diff --git a/clang/lib/AST/Interp/InterpBuiltin.cpp b/clang/lib/AST/Interp/InterpBuiltin.cpp
index c500b9d502d707..1bf5d55314f1f2 100644
--- a/clang/lib/AST/Interp/InterpBuiltin.cpp
+++ b/clang/lib/AST/Interp/InterpBuiltin.cpp
@@ -119,6 +119,36 @@ static bool retPrimValue(InterpState &S, CodePtr OpPC, APValue &Result,
 #undef RET_CASE
 }
 
+static bool interp__builtin_is_constant_evaluated(InterpState &S, CodePtr OpPC,
+                                                  const InterpFrame *Frame,
+                                                  const CallExpr *Call) {
+  // The current frame is the one for __builtin_is_constant_evaluated.
+  // The one above that, potentially the one for std::is_constant_evaluated().
+  if (S.inConstantContext() && !S.checkingPotentialConstantExpression() &&
+      Frame->Caller && S.getEvalStatus().Diag) {
+    auto isStdCall = [](const FunctionDecl *F) -> bool {
+      return F && F->isInStdNamespace() && F->getIdentifier() &&
+             F->getIdentifier()->isStr("is_constant_evaluated");
+    };
+    const InterpFrame *Caller = Frame->Caller;
+
+    if (Caller->Caller && isStdCall(Caller->getCallee())) {
+      const Expr *E = Caller->Caller->getExpr(Caller->getRetPC());
+      S.report(E->getExprLoc(),
+               diag::warn_is_constant_evaluated_always_true_constexpr)
+          << "std::is_constant_evaluated";
+    } else {
+      const Expr *E = Frame->Caller->getExpr(Frame->getRetPC());
+      S.report(E->getExprLoc(),
+               diag::warn_is_constant_evaluated_always_true_constexpr)
+          << "__builtin_is_constant_evaluated";
+    }
+  }
+
+  S.Stk.push<Boolean>(Boolean::from(S.inConstantContext()));
+  return true;
+}
+
 static bool interp__builtin_strcmp(InterpState &S, CodePtr OpPC,
                                    const InterpFrame *Frame,
                                    const CallExpr *Call) {
@@ -533,11 +563,12 @@ static bool interp__builtin_rotate(InterpState &S, CodePtr OpPC,
                                    const InterpFrame *Frame,
                                    const Function *Func, const CallExpr *Call,
                                    bool Right) {
-  PrimType ArgT = *S.getContext().classify(Call->getArg(0)->getType());
-  assert(ArgT == *S.getContext().classify(Call->getArg(1)->getType()));
+  PrimType AmountT = *S.getContext().classify(Call->getArg(1)->getType());
+  PrimType ValueT = *S.getContext().classify(Call->getArg(0)->getType());
 
-  APSInt Amount = peekToAPSInt(S.Stk, ArgT);
-  APSInt Value = peekToAPSInt(S.Stk, ArgT, align(primSize(ArgT)) * 2);
+  APSInt Amount = peekToAPSInt(S.Stk, AmountT);
+  APSInt Value = peekToAPSInt(
+      S.Stk, ValueT, align(primSize(AmountT)) + align(primSize(ValueT)));
 
   APSInt Result;
   if (Right)
@@ -605,10 +636,9 @@ static bool interp__builtin_eh_return_data_regno(InterpState &S, CodePtr OpPC,
   return true;
 }
 
-static bool interp__builtin_launder(InterpState &S, CodePtr OpPC,
-                                    const InterpFrame *Frame,
-                                    const Function *Func,
-                                    const CallExpr *Call) {
+/// Just takes the first Argument to the call and puts it on the stack.
+static bool noopPointer(InterpState &S, CodePtr OpPC, const InterpFrame *Frame,
+                        const Function *Func, const CallExpr *Call) {
   const Pointer &Arg = S.Stk.peek<Pointer>();
   S.Stk.push<Pointer>(Arg);
   return true;
@@ -917,14 +947,15 @@ static bool interp__builtin_complex(InterpState &S, CodePtr OpPC,
 
 bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const Function *F,
                       const CallExpr *Call) {
-  InterpFrame *Frame = S.Current;
+  const InterpFrame *Frame = S.Current;
   APValue Dummy;
 
   std::optional<PrimType> ReturnT = S.getContext().classify(Call);
 
   switch (F->getBuiltinID()) {
   case Builtin::BI__builtin_is_constant_evaluated:
-    S.Stk.push<Boolean>(Boolean::from(S.inConstantContext()));
+    if (!interp__builtin_is_constant_evaluated(S, OpPC, Frame, Call))
+      return false;
     break;
   case Builtin::BI__builtin_assume:
   case Builtin::BI__assume:
@@ -1143,7 +1174,9 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const Function *F,
     break;
 
   case Builtin::BI__builtin_launder:
-    if (!interp__builtin_launder(S, OpPC, Frame, F, Call))
+  case Builtin::BI__builtin___CFStringMakeConstantString:
+  case Builtin::BI__builtin___NSStringMakeConstantString:
+    if (!noopPointer(S, OpPC, Frame, F, Call))
       return false;
     break;
 
@@ -1326,5 +1359,50 @@ bool SetThreeWayComparisonField(InterpState &S, CodePtr OpPC,
   return true;
 }
 
+bool DoMemcpy(InterpState &S, CodePtr OpPC, const Pointer &Src, Pointer &Dest) {
+  assert(Src.isLive() && Dest.isLive());
+
+  [[maybe_unused]] const Descriptor *SrcDesc = Src.getFieldDesc();
+  const Descriptor *DestDesc = Dest.getFieldDesc();
+
+  assert(!DestDesc->isPrimitive() && !SrcDesc->isPrimitive());
+
+  if (DestDesc->isPrimitiveArray()) {
+    assert(SrcDesc->isPrimitiveArray());
+    assert(SrcDesc->getNumElems() == DestDesc->getNumElems());
+    PrimType ET = DestDesc->getPrimType();
+    for (unsigned I = 0, N = DestDesc->getNumElems(); I != N; ++I) {
+      Pointer DestElem = Dest.atIndex(I);
+      TYPE_SWITCH(ET, {
+        DestElem.deref<T>() = Src.atIndex(I).deref<T>();
+        DestElem.initialize();
+      });
+    }
+    return true;
+  }
+
+  if (DestDesc->isRecord()) {
+    assert(SrcDesc->isRecord());
+    assert(SrcDesc->ElemRecord == DestDesc->ElemRecord);
+    const Record *R = DestDesc->ElemRecord;
+    for (const Record::Field &F : R->fields()) {
+      Pointer DestField = Dest.atField(F.Offset);
+      if (std::optional<PrimType> FT = S.Ctx.classify(F.Decl->getType())) {
+        TYPE_SWITCH(*FT, {
+          DestField.deref<T>() = Src.atField(F.Offset).deref<T>();
+          DestField.initialize();
+        });
+      } else {
+        return Invalid(S, OpPC);
+      }
+    }
+    return true;
+  }
+
+  // FIXME: Composite types.
+
+  return Invalid(S, OpPC);
+}
+
 } // namespace interp
 } // namespace clang
diff --git a/clang/lib/AST/Interp/InterpFrame.cpp b/clang/lib/AST/Interp/InterpFrame.cpp
index f69ff06b5e81b5..12e2e6ff9155b9 100644
--- a/clang/lib/AST/Interp/InterpFrame.cpp
+++ b/clang/lib/AST/Interp/InterpFrame.cpp
@@ -190,6 +190,8 @@ SourceRange InterpFrame::getCallRange() const {
 }
 
 const FunctionDecl *InterpFrame::getCallee() const {
+  if (!Func)
+    return nullptr;
   return Func->getDecl();
 }
 
diff --git a/clang/lib/AST/Interp/InterpFrame.h b/clang/lib/AST/Interp/InterpFrame.h
index 322d5dcfa698ae..1f80a0a5d2c492 100644
--- a/clang/lib/AST/Interp/InterpFrame.h
+++ b/clang/lib/AST/Interp/InterpFrame.h
@@ -123,6 +123,9 @@ class InterpFrame final : public Frame {
 
   unsigned getDepth() const { return Depth; }
 
+  void dump() const { dump(llvm::errs(), 0); }
+  void dump(llvm::raw_ostream &OS, unsigned Indent = 0) const;
+
 private:
   /// Returns an original argument from the stack.
   template <typename T> const T &stackRef(unsigned Offset) const {
diff --git a/clang/lib/AST/Interp/Opcodes.td b/clang/lib/AST/Interp/Opcodes.td
index 9b99aa0ccb558a..cc1310f4c0d52a 100644
--- a/clang/lib/AST/Interp/Opcodes.td
+++ b/clang/lib/AST/Interp/Opcodes.td
@@ -706,6 +706,7 @@ def Dup : Opcode {
 
 // [] -> []
 def Invalid : Opcode {}
+def Error : Opcode {}
 def InvalidCast : Opcode {
   let Args = [ArgCastKind];
 }
@@ -720,3 +721,5 @@ def CheckNonNullArg : Opcode {
   let Types = [PtrTypeClass];
   let HasGroup = 1;
 }
+
+def Memcpy : Opcode;
diff --git a/clang/lib/AST/Interp/Pointer.h b/clang/lib/AST/Interp/Pointer.h
index 34ecdb967960d5..fffb4aba492fc8 100644
--- a/clang/lib/AST/Interp/Pointer.h
+++ b/clang/lib/AST/Interp/Pointer.h
@@ -285,6 +285,11 @@ class Pointer {
   bool inPrimitiveArray() const { return getFieldDesc()->isPrimitiveArray(); }
   /// Checks if the structure is an array of unknown size.
   bool isUnknownSizeArray() const {
+    // If this points inside a dummy block, return true.
+    // FIXME: This might change in the future. If it does, we need
+    // to set the proper Ctor/Dtor functions for dummy Descriptors.
+    if (Base != 0 && Base != sizeof(InlineDescriptor) && isDummy())
+      return true;
     return getFieldDesc()->isUnknownSizeArray();
   }
   /// Checks if the pointer points to an array.
diff --git a/clang/lib/AST/Interp/Program.cpp b/clang/lib/AST/Interp/Program.cpp
index 86e18ede638114..da6f72c62115dd 100644
--- a/clang/lib/AST/Interp/Program.cpp
+++ b/clang/lib/AST/Interp/Program.cpp
@@ -145,11 +145,20 @@ std::optional<unsigned> Program::getOrCreateGlobal(const ValueDecl *VD,
 
 std::optional<unsigned> Program::getOrCreateDummy(const ValueDecl *VD) {
   // Dedup blocks since they are immutable and pointers cannot be compared.
-  if (auto It = DummyParams.find(VD); It != DummyParams.end())
+  if (auto It = DummyVariables.find(VD); It != DummyVariables.end())
     return It->second;
 
   // Create dummy descriptor.
-  Descriptor *Desc = allocateDescriptor(VD, std::nullopt);
+  // We create desriptors of 'array of unknown size' if the type is an array
+  // type _and_ the size isn't known (it's not a ConstantArrayType). If the size
+  // is known however, we create a regular dummy pointer.
+  Descriptor *Desc;
+  if (const auto *AT = VD->getType()->getAsArrayTypeUnsafe();
+      AT && !isa<ConstantArrayType>(AT))
+    Desc = allocateDescriptor(VD, Descriptor::UnknownSize{});
+  else
+    Desc = allocateDescriptor(VD);
+
   // Allocate a block for storage.
   unsigned I = Globals.size();
 
@@ -158,7 +167,7 @@ std::optional<unsigned> Program::getOrCreateDummy(const ValueDecl *VD) {
   G->block()->invokeCtor();
 
   Globals.push_back(G);
-  DummyParams[VD] = I;
+  DummyVariables[VD] = I;
   return I;
 }
 
diff --git a/clang/lib/AST/Interp/Program.h b/clang/lib/AST/Interp/Program.h
index 50bdb575e805cf..36b5a1faa513a9 100644
--- a/clang/lib/AST/Interp/Program.h
+++ b/clang/lib/AST/Interp/Program.h
@@ -208,7 +208,7 @@ class Program final {
   llvm::DenseMap<const RecordDecl *, Record *> Records;
 
   /// Dummy parameter to generate pointers from.
-  llvm::DenseMap<const ValueDecl *, unsigned> DummyParams;
+  llvm::DenseMap<const ValueDecl *, unsigned> DummyVariables;
 
   /// Creates a new descriptor.
   template <typename... Ts>
diff --git a/clang/lib/AST/ItaniumMangle.cpp b/clang/lib/AST/ItaniumMangle.cpp
index 1b6141580a81d2..f619d657ae9f50 100644
--- a/clang/lib/AST/ItaniumMangle.cpp
+++ b/clang/lib/AST/ItaniumMangle.cpp
@@ -2431,6 +2431,7 @@ bool CXXNameMangler::mangleUnresolvedTypeOrSimpleId(QualType Ty,
   case Type::MacroQualified:
   case Type::BitInt:
   case Type::DependentBitInt:
+  case Type::CountAttributed:
     llvm_unreachable("type is illegal as a nested name specifier");
 
   case Type::SubstTemplateTypeParmPack:
diff --git a/clang/lib/AST/MicrosoftMangle.cpp b/clang/lib/AST/MicrosoftMangle.cpp
index b272a546573a31..aa26bb7ed46f48 100644
--- a/clang/lib/AST/MicrosoftMangle.cpp
+++ b/clang/lib/AST/MicrosoftMangle.cpp
@@ -390,6 +390,7 @@ class MicrosoftCXXNameMangler {
                           const FunctionDecl *D = nullptr,
                           bool ForceThisQuals = false,
                           bool MangleExceptionSpec = true);
+  void mangleSourceName(StringRef Name);
   void mangleNestedName(GlobalDecl GD);
 
 private:
@@ -408,7 +409,6 @@ class MicrosoftCXXNameMangler {
     mangleUnqualifiedName(GD, cast<NamedDecl>(GD.getDecl())->getDeclName());
   }
   void mangleUnqualifiedName(GlobalDecl GD, DeclarationName Name);
-  void mangleSourceName(StringRef Name);
   void mangleOperatorName(OverloadedOperatorKind OO, SourceLocation Loc);
   void mangleCXXDtorType(CXXDtorType T);
   void mangleQualifiers(Qualifiers Quals, bool IsMember);
@@ -3920,7 +3920,8 @@ void MicrosoftMangleContextImpl::mangleThreadSafeStaticGuardVariable(
   msvc_hashing_ostream MHO(Out);
   MicrosoftCXXNameMangler Mangler(*this, MHO);
 
-  Mangler.getStream() << "?$TSS" << GuardNum << '@';
+  Mangler.getStream() << "?";
+  Mangler.mangleSourceName("$TSS" + llvm::utostr(GuardNum));
   Mangler.mangleNestedName(VD);
   Mangler.getStream() << "@4HA";
 }
diff --git a/clang/lib/AST/Type.cpp b/clang/lib/AST/Type.cpp
index 22666184c56ccf..c6fe90ba5e2927 100644
--- a/clang/lib/AST/Type.cpp
+++ b/clang/lib/AST/Type.cpp
@@ -385,6 +385,26 @@ void DependentBitIntType::Profile(llvm::FoldingSetNodeID &ID,
   NumBitsExpr->Profile(ID, Context, true);
 }
 
+bool BoundsAttributedType::referencesFieldDecls() const {
+  return llvm::any_of(dependent_decls(),
+                      [](const TypeCoupledDeclRefInfo &Info) {
+                        return isa<FieldDecl>(Info.getDecl());
+                      });
+}
+
+void CountAttributedType::Profile(llvm::FoldingSetNodeID &ID,
+                                  QualType WrappedTy, Expr *CountExpr,
+                                  bool CountInBytes, bool OrNull) {
+  ID.AddPointer(WrappedTy.getAsOpaquePtr());
+  ID.AddBoolean(CountInBytes);
+  ID.AddBoolean(OrNull);
+  // We profile it as a pointer as the StmtProfiler considers parameter
+  // expressions on function declaration and function definition as the
+  // same, resulting in count expression being evaluated with ParamDecl
+  // not in the function scope.
+  ID.AddPointer(CountExpr);
+}
+
 /// getArrayElementTypeNoTypeQual - If this is an array type, return the
 /// element type of the array, potentially with type qualifiers missing.
 /// This method should never be used when type qualifiers are meaningful.
@@ -559,6 +579,14 @@ template <> const AttributedType *Type::getAs() const {
   return getAsSugar<AttributedType>(this);
 }
 
+template <> const BoundsAttributedType *Type::getAs() const {
+  return getAsSugar<BoundsAttributedType>(this);
+}
+
+template <> const CountAttributedType *Type::getAs() const {
+  return getAsSugar<CountAttributedType>(this);
+}
+
 /// getUnqualifiedDesugaredType - Pull any qualifiers and syntactic
 /// sugar off the given type.  This should produce an object of the
 /// same dynamic type as the canonical type.
@@ -641,6 +669,10 @@ bool Type::isScopedEnumeralType() const {
   return false;
 }
 
+bool Type::isCountAttributedType() const {
+  return getAs<CountAttributedType>();
+}
+
 const ComplexType *Type::getAsComplexIntegerType() const {
   if (const auto *Complex = getAs<ComplexType>())
     if (Complex->getElementType()->isIntegerType())
@@ -3709,6 +3741,43 @@ void FunctionProtoType::Profile(llvm::FoldingSetNodeID &ID,
           getExtProtoInfo(), Ctx, isCanonicalUnqualified());
 }
 
+TypeCoupledDeclRefInfo::TypeCoupledDeclRefInfo(ValueDecl *D, bool Deref)
+    : Data(D, Deref << DerefShift) {}
+
+bool TypeCoupledDeclRefInfo::isDeref() const {
+  return Data.getInt() & DerefMask;
+}
+ValueDecl *TypeCoupledDeclRefInfo::getDecl() const { return Data.getPointer(); }
+unsigned TypeCoupledDeclRefInfo::getInt() const { return Data.getInt(); }
+void *TypeCoupledDeclRefInfo::getOpaqueValue() const {
+  return Data.getOpaqueValue();
+}
+bool TypeCoupledDeclRefInfo::operator==(
+    const TypeCoupledDeclRefInfo &Other) const {
+  return getOpaqueValue() == Other.getOpaqueValue();
+}
+void TypeCoupledDeclRefInfo::setFromOpaqueValue(void *V) {
+  Data.setFromOpaqueValue(V);
+}
+
+BoundsAttributedType::BoundsAttributedType(TypeClass TC, QualType Wrapped,
+                                           QualType Canon)
+    : Type(TC, Canon, Wrapped->getDependence()), WrappedTy(Wrapped) {}
+
+CountAttributedType::CountAttributedType(
+    QualType Wrapped, QualType Canon, Expr *CountExpr, bool CountInBytes,
+    bool OrNull, ArrayRef<TypeCoupledDeclRefInfo> CoupledDecls)
+    : BoundsAttributedType(CountAttributed, Wrapped, Canon),
+      CountExpr(CountExpr) {
+  CountAttributedTypeBits.NumCoupledDecls = CoupledDecls.size();
+  CountAttributedTypeBits.CountInBytes = CountInBytes;
+  CountAttributedTypeBits.OrNull = OrNull;
+  auto *DeclSlot = getTrailingObjects<TypeCoupledDeclRefInfo>();
+  Decls = llvm::ArrayRef(DeclSlot, CoupledDecls.size());
+  for (unsigned i = 0; i != CoupledDecls.size(); ++i)
+    DeclSlot[i] = CoupledDecls[i];
+}
+
 TypedefType::TypedefType(TypeClass tc, const TypedefNameDecl *D,
                          QualType Underlying, QualType can)
     : Type(tc, can, toSemanticDependence(can->getDependence())),
diff --git a/clang/lib/AST/TypeLoc.cpp b/clang/lib/AST/TypeLoc.cpp
index b0acb182308755..21e152f6aea8a0 100644
--- a/clang/lib/AST/TypeLoc.cpp
+++ b/clang/lib/AST/TypeLoc.cpp
@@ -516,6 +516,10 @@ SourceRange AttributedTypeLoc::getLocalSourceRange() const {
   return getAttr() ? getAttr()->getRange() : SourceRange();
 }
 
+SourceRange CountAttributedTypeLoc::getLocalSourceRange() const {
+  return getCountExpr() ? getCountExpr()->getSourceRange() : SourceRange();
+}
+
 SourceRange BTFTagAttributedTypeLoc::getLocalSourceRange() const {
   return getAttr() ? getAttr()->getRange() : SourceRange();
 }
diff --git a/clang/lib/AST/TypePrinter.cpp b/clang/lib/AST/TypePrinter.cpp
index 7dcc4348f8e036..7032ff2f18468c 100644
--- a/clang/lib/AST/TypePrinter.cpp
+++ b/clang/lib/AST/TypePrinter.cpp
@@ -286,6 +286,7 @@ bool TypePrinter::canPrefixQualifiers(const Type *T,
     case Type::PackExpansion:
     case Type::SubstTemplateTypeParm:
     case Type::MacroQualified:
+    case Type::CountAttributed:
       CanPrefixQualifiers = false;
       break;
 
@@ -1635,6 +1636,17 @@ void TypePrinter::printElaboratedBefore(const ElaboratedType *T,
     if (T->getKeyword() != ElaboratedTypeKeyword::None)
       OS << " ";
     NestedNameSpecifier *Qualifier = T->getQualifier();
+    if (!Policy.SuppressTagKeyword && Policy.SuppressScope &&
+        !Policy.SuppressUnwrittenScope) {
+      bool OldTagKeyword = Policy.SuppressTagKeyword;
+      bool OldSupressScope = Policy.SuppressScope;
+      Policy.SuppressTagKeyword = true;
+      Policy.SuppressScope = false;
+      printBefore(T->getNamedType(), OS);
+      Policy.SuppressTagKeyword = OldTagKeyword;
+      Policy.SuppressScope = OldSupressScope;
+      return;
+    }
     if (Qualifier)
       Qualifier->print(OS, Policy);
   }
@@ -1717,6 +1729,36 @@ void TypePrinter::printPackExpansionAfter(const PackExpansionType *T,
   OS << "...";
 }
 
+static void printCountAttributedImpl(const CountAttributedType *T,
+                                     raw_ostream &OS,
+                                     const PrintingPolicy &Policy) {
+  if (T->isCountInBytes() && T->isOrNull())
+    OS << " __sized_by_or_null(";
+  else if (T->isCountInBytes())
+    OS << " __sized_by(";
+  else if (T->isOrNull())
+    OS << " __counted_by_or_null(";
+  else
+    OS << " __counted_by(";
+  if (T->getCountExpr())
+    T->getCountExpr()->printPretty(OS, nullptr, Policy);
+  OS << ')';
+}
+
+void TypePrinter::printCountAttributedBefore(const CountAttributedType *T,
+                                             raw_ostream &OS) {
+  printBefore(T->desugar(), OS);
+  if (!T->desugar()->isArrayType())
+    printCountAttributedImpl(T, OS, Policy);
+}
+
+void TypePrinter::printCountAttributedAfter(const CountAttributedType *T,
+                                            raw_ostream &OS) {
+  printAfter(T->desugar(), OS);
+  if (T->desugar()->isArrayType())
+    printCountAttributedImpl(T, OS, Policy);
+}
+
 void TypePrinter::printAttributedBefore(const AttributedType *T,
                                         raw_ostream &OS) {
   // FIXME: Generate this with TableGen.
@@ -1847,6 +1889,7 @@ void TypePrinter::printAttributedAfter(const AttributedType *T,
     // AttributedType nodes for them.
     break;
 
+  case attr::CountedBy:
   case attr::LifetimeBound:
   case attr::TypeNonNull:
   case attr::TypeNullable:
@@ -2260,10 +2303,15 @@ printTo(raw_ostream &OS, ArrayRef<TA> Args, const PrintingPolicy &Policy,
     } else {
       if (!FirstArg)
         OS << Comma;
-      // Tries to print the argument with location info if exists.
-      printArgument(Arg, Policy, ArgOS,
-                    TemplateParameterList::shouldIncludeTypeForArgument(
-                        Policy, TPL, ParmIndex));
+      if (!Policy.SuppressTagKeyword &&
+          Argument.getKind() == TemplateArgument::Type &&
+          isa<TagType>(Argument.getAsType()))
+        OS << Argument.getAsType().getAsString();
+      else
+        // Tries to print the argument with location info if exists.
+        printArgument(Arg, Policy, ArgOS,
+                      TemplateParameterList::shouldIncludeTypeForArgument(
+                          Policy, TPL, ParmIndex));
     }
     StringRef ArgString = ArgOS.str();
 
diff --git a/clang/lib/Analysis/FlowSensitive/ControlFlowContext.cpp b/clang/lib/Analysis/FlowSensitive/AdornedCFG.cpp
similarity index 89%
rename from clang/lib/Analysis/FlowSensitive/ControlFlowContext.cpp
rename to clang/lib/Analysis/FlowSensitive/AdornedCFG.cpp
index 7c9f8fbb0a7009..3813b8c3ee8a23 100644
--- a/clang/lib/Analysis/FlowSensitive/ControlFlowContext.cpp
+++ b/clang/lib/Analysis/FlowSensitive/AdornedCFG.cpp
@@ -1,4 +1,4 @@
-//===- ControlFlowContext.cpp ---------------------------------------------===//
+//===- AdornedCFG.cpp ---------------------------------------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,12 +6,12 @@
 //
 //===----------------------------------------------------------------------===//
 //
-//  This file defines a ControlFlowContext class that is used by dataflow
-//  analyses that run over Control-Flow Graphs (CFGs).
+//  This file defines an `AdornedCFG` class that is used by dataflow analyses
+//  that run over Control-Flow Graphs (CFGs).
 //
 //===----------------------------------------------------------------------===//
 
-#include "clang/Analysis/FlowSensitive/ControlFlowContext.h"
+#include "clang/Analysis/FlowSensitive/AdornedCFG.h"
 #include "clang/AST/ASTContext.h"
 #include "clang/AST/Decl.h"
 #include "clang/AST/Stmt.h"
@@ -126,8 +126,7 @@ buildContainsExprConsumedInDifferentBlock(
   return Result;
 }
 
-llvm::Expected<ControlFlowContext>
-ControlFlowContext::build(const FunctionDecl &Func) {
+llvm::Expected<AdornedCFG> AdornedCFG::build(const FunctionDecl &Func) {
   if (!Func.doesThisDeclarationHaveABody())
     return llvm::createStringError(
         std::make_error_code(std::errc::invalid_argument),
@@ -136,8 +135,8 @@ ControlFlowContext::build(const FunctionDecl &Func) {
   return build(Func, *Func.getBody(), Func.getASTContext());
 }
 
-llvm::Expected<ControlFlowContext>
-ControlFlowContext::build(const Decl &D, Stmt &S, ASTContext &C) {
+llvm::Expected<AdornedCFG> AdornedCFG::build(const Decl &D, Stmt &S,
+                                             ASTContext &C) {
   if (D.isTemplated())
     return llvm::createStringError(
         std::make_error_code(std::errc::invalid_argument),
@@ -175,9 +174,9 @@ ControlFlowContext::build(const Decl &D, Stmt &S, ASTContext &C) {
   llvm::DenseSet<const CFGBlock *> ContainsExprConsumedInDifferentBlock =
       buildContainsExprConsumedInDifferentBlock(*Cfg, StmtToBlock);
 
-  return ControlFlowContext(D, std::move(Cfg), std::move(StmtToBlock),
-                            std::move(BlockReachable),
-                            std::move(ContainsExprConsumedInDifferentBlock));
+  return AdornedCFG(D, std::move(Cfg), std::move(StmtToBlock),
+                    std::move(BlockReachable),
+                    std::move(ContainsExprConsumedInDifferentBlock));
 }
 
 } // namespace dataflow
diff --git a/clang/lib/Analysis/FlowSensitive/CMakeLists.txt b/clang/lib/Analysis/FlowSensitive/CMakeLists.txt
index 5af4ecfc9efa5d..a3b5d9adc24bda 100644
--- a/clang/lib/Analysis/FlowSensitive/CMakeLists.txt
+++ b/clang/lib/Analysis/FlowSensitive/CMakeLists.txt
@@ -1,6 +1,6 @@
 add_clang_library(clangAnalysisFlowSensitive
+  AdornedCFG.cpp
   Arena.cpp
-  ControlFlowContext.cpp
   DataflowAnalysisContext.cpp
   DataflowEnvironment.cpp
   Formula.cpp
diff --git a/clang/lib/Analysis/FlowSensitive/DataflowAnalysisContext.cpp b/clang/lib/Analysis/FlowSensitive/DataflowAnalysisContext.cpp
index f4c4af022f51f6..d520539dd25355 100644
--- a/clang/lib/Analysis/FlowSensitive/DataflowAnalysisContext.cpp
+++ b/clang/lib/Analysis/FlowSensitive/DataflowAnalysisContext.cpp
@@ -288,8 +288,8 @@ void DataflowAnalysisContext::dumpFlowCondition(Atom Token,
   }
 }
 
-const ControlFlowContext *
-DataflowAnalysisContext::getControlFlowContext(const FunctionDecl *F) {
+const AdornedCFG *
+DataflowAnalysisContext::getAdornedCFG(const FunctionDecl *F) {
   // Canonicalize the key:
   F = F->getDefinition();
   if (F == nullptr)
@@ -299,10 +299,10 @@ DataflowAnalysisContext::getControlFlowContext(const FunctionDecl *F) {
     return &It->second;
 
   if (F->doesThisDeclarationHaveABody()) {
-    auto CFCtx = ControlFlowContext::build(*F);
+    auto ACFG = AdornedCFG::build(*F);
     // FIXME: Handle errors.
-    assert(CFCtx);
-    auto Result = FunctionContexts.insert({F, std::move(*CFCtx)});
+    assert(ACFG);
+    auto Result = FunctionContexts.insert({F, std::move(*ACFG)});
     return &Result.first->second;
   }
 
diff --git a/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp b/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp
index 1d2bd9a9b08af3..cc1ebd511191a9 100644
--- a/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp
+++ b/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp
@@ -771,6 +771,7 @@ static bool isOriginalRecordConstructor(const Expr &RecordPRValue) {
     return !Init->isSemanticForm() || !Init->isTransparent();
   return isa<CXXConstructExpr>(RecordPRValue) || isa<CallExpr>(RecordPRValue) ||
          isa<LambdaExpr>(RecordPRValue) ||
+         isa<CXXDefaultArgExpr>(RecordPRValue) ||
          isa<CXXDefaultInitExpr>(RecordPRValue) ||
          // The framework currently does not propagate the objects created in
          // the two branches of a `ConditionalOperator` because there is no way
diff --git a/clang/lib/Analysis/FlowSensitive/HTMLLogger.cpp b/clang/lib/Analysis/FlowSensitive/HTMLLogger.cpp
index 6afd66d9dc6ac5..397a8d87e114d7 100644
--- a/clang/lib/Analysis/FlowSensitive/HTMLLogger.cpp
+++ b/clang/lib/Analysis/FlowSensitive/HTMLLogger.cpp
@@ -54,7 +54,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "clang/Analysis/FlowSensitive/ControlFlowContext.h"
+#include "clang/Analysis/FlowSensitive/AdornedCFG.h"
 #include "clang/Analysis/FlowSensitive/DebugSupport.h"
 #include "clang/Analysis/FlowSensitive/Logger.h"
 #include "clang/Analysis/FlowSensitive/TypeErasedDataflowAnalysis.h"
@@ -162,7 +162,7 @@ class HTMLLogger : public Logger {
   llvm::raw_string_ostream JStringStream{JSON};
   llvm::json::OStream JOS{JStringStream, /*Indent=*/2};
 
-  const ControlFlowContext *CFC;
+  const AdornedCFG *ACFG;
   // Timeline of iterations of CFG block visitation.
   std::vector<Iteration> Iters;
   // Indexes  in `Iters` of the iterations for each block.
@@ -176,15 +176,15 @@ class HTMLLogger : public Logger {
 
 public:
   explicit HTMLLogger(StreamFactory Streams) : Streams(std::move(Streams)) {}
-  void beginAnalysis(const ControlFlowContext &CFC,
+  void beginAnalysis(const AdornedCFG &ACFG,
                      TypeErasedDataflowAnalysis &A) override {
     OS = Streams();
-    this->CFC = &CFC;
+    this->ACFG = &ACFG;
     *OS << llvm::StringRef(HTMLLogger_html).split("<?INJECT?>").first;
 
-    BlockConverged.resize(CFC.getCFG().getNumBlockIDs());
+    BlockConverged.resize(ACFG.getCFG().getNumBlockIDs());
 
-    const auto &D = CFC.getDecl();
+    const auto &D = ACFG.getDecl();
     const auto &SM = A.getASTContext().getSourceManager();
     *OS << "<title>";
     if (const auto *ND = dyn_cast<NamedDecl>(&D))
@@ -345,7 +345,7 @@ class HTMLLogger : public Logger {
   // tokens are associated with, and even which BB element (so that clicking
   // can select the right element).
   void writeCode() {
-    const auto &AST = CFC->getDecl().getASTContext();
+    const auto &AST = ACFG->getDecl().getASTContext();
     bool Invalid = false;
 
     // Extract the source code from the original file.
@@ -353,7 +353,7 @@ class HTMLLogger : public Logger {
     // indentation to worry about), but we need the boundaries of particular
     // AST nodes and the printer doesn't provide this.
     auto Range = clang::Lexer::makeFileCharRange(
-        CharSourceRange::getTokenRange(CFC->getDecl().getSourceRange()),
+        CharSourceRange::getTokenRange(ACFG->getDecl().getSourceRange()),
         AST.getSourceManager(), AST.getLangOpts());
     if (Range.isInvalid())
       return;
@@ -419,7 +419,7 @@ class HTMLLogger : public Logger {
     // Construct one TokenInfo per character in a flat array.
     // This is inefficient (chars in a token all have the same info) but simple.
     std::vector<TokenInfo> State(Code.size());
-    for (const auto *Block : CFC->getCFG()) {
+    for (const auto *Block : ACFG->getCFG()) {
       unsigned EltIndex = 0;
       for (const auto& Elt : *Block) {
         ++EltIndex;
@@ -480,7 +480,7 @@ class HTMLLogger : public Logger {
   // out to `dot` to turn it into an SVG.
   void writeCFG() {
     *OS << "<template data-copy='cfg'>\n";
-    if (auto SVG = renderSVG(buildCFGDot(CFC->getCFG())))
+    if (auto SVG = renderSVG(buildCFGDot(ACFG->getCFG())))
       *OS << *SVG;
     else
       *OS << "Can't draw CFG: " << toString(SVG.takeError());
diff --git a/clang/lib/Analysis/FlowSensitive/Logger.cpp b/clang/lib/Analysis/FlowSensitive/Logger.cpp
index 8c401df62e4459..8f40768171c94e 100644
--- a/clang/lib/Analysis/FlowSensitive/Logger.cpp
+++ b/clang/lib/Analysis/FlowSensitive/Logger.cpp
@@ -7,7 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "clang/Analysis/FlowSensitive/Logger.h"
-#include "clang/Analysis/FlowSensitive/ControlFlowContext.h"
+#include "clang/Analysis/FlowSensitive/AdornedCFG.h"
 #include "clang/Analysis/FlowSensitive/TypeErasedDataflowAnalysis.h"
 #include "llvm/Support/WithColor.h"
 
@@ -33,17 +33,17 @@ struct TextualLogger final : Logger {
   TextualLogger(llvm::raw_ostream &OS)
       : OS(OS), ShowColors(llvm::WithColor::defaultAutoDetectFunction()(OS)) {}
 
-  virtual void beginAnalysis(const ControlFlowContext &CFG,
+  virtual void beginAnalysis(const AdornedCFG &ACFG,
                              TypeErasedDataflowAnalysis &Analysis) override {
     {
       llvm::WithColor Header(OS, llvm::raw_ostream::Colors::RED, /*Bold=*/true);
       OS << "=== Beginning data flow analysis ===\n";
     }
-    auto &D = CFG.getDecl();
+    auto &D = ACFG.getDecl();
     D.print(OS);
     OS << "\n";
     D.dump(OS);
-    CurrentCFG = &CFG.getCFG();
+    CurrentCFG = &ACFG.getCFG();
     CurrentCFG->print(OS, Analysis.getASTContext().getLangOpts(), ShowColors);
     CurrentAnalysis = &Analysis;
   }
diff --git a/clang/lib/Analysis/FlowSensitive/Models/UncheckedOptionalAccessModel.cpp b/clang/lib/Analysis/FlowSensitive/Models/UncheckedOptionalAccessModel.cpp
index 1d31b22b6d25ff..dbf4878622eba9 100644
--- a/clang/lib/Analysis/FlowSensitive/Models/UncheckedOptionalAccessModel.cpp
+++ b/clang/lib/Analysis/FlowSensitive/Models/UncheckedOptionalAccessModel.cpp
@@ -64,39 +64,125 @@ static bool hasOptionalClassName(const CXXRecordDecl &RD) {
   return false;
 }
 
+static const CXXRecordDecl *getOptionalBaseClass(const CXXRecordDecl *RD) {
+  if (RD == nullptr)
+    return nullptr;
+  if (hasOptionalClassName(*RD))
+    return RD;
+
+  if (!RD->hasDefinition())
+    return nullptr;
+
+  for (const CXXBaseSpecifier &Base : RD->bases())
+    if (const CXXRecordDecl *BaseClass =
+            getOptionalBaseClass(Base.getType()->getAsCXXRecordDecl()))
+      return BaseClass;
+
+  return nullptr;
+}
+
 namespace {
 
 using namespace ::clang::ast_matchers;
 using LatticeTransferState = TransferState<NoopLattice>;
 
-AST_MATCHER(CXXRecordDecl, hasOptionalClassNameMatcher) {
-  return hasOptionalClassName(Node);
+AST_MATCHER(CXXRecordDecl, optionalClass) { return hasOptionalClassName(Node); }
+
+AST_MATCHER(CXXRecordDecl, optionalOrDerivedClass) {
+  return getOptionalBaseClass(&Node) != nullptr;
 }
 
-DeclarationMatcher optionalClass() {
-  return classTemplateSpecializationDecl(
-      hasOptionalClassNameMatcher(),
-      hasTemplateArgument(0, refersToType(type().bind("T"))));
+auto desugarsToOptionalType() {
+  return hasUnqualifiedDesugaredType(
+      recordType(hasDeclaration(cxxRecordDecl(optionalClass()))));
 }
 
-auto optionalOrAliasType() {
+auto desugarsToOptionalOrDerivedType() {
   return hasUnqualifiedDesugaredType(
-      recordType(hasDeclaration(optionalClass())));
+      recordType(hasDeclaration(cxxRecordDecl(optionalOrDerivedClass()))));
+}
+
+auto hasOptionalType() { return hasType(desugarsToOptionalType()); }
+
+/// Matches any of the spellings of the optional types and sugar, aliases,
+/// derived classes, etc.
+auto hasOptionalOrDerivedType() {
+  return hasType(desugarsToOptionalOrDerivedType());
+}
+
+QualType getPublicType(const Expr *E) {
+  auto *Cast = dyn_cast<ImplicitCastExpr>(E->IgnoreParens());
+  if (Cast == nullptr || Cast->getCastKind() != CK_UncheckedDerivedToBase) {
+    QualType Ty = E->getType();
+    if (Ty->isPointerType())
+      return Ty->getPointeeType();
+    return Ty;
+  }
+
+  // Is the derived type that we're casting from the type of `*this`? In this
+  // special case, we can upcast to the base class even if the base is
+  // non-public.
+  bool CastingFromThis = isa<CXXThisExpr>(Cast->getSubExpr());
+
+  // Find the least-derived type in the path (i.e. the last entry in the list)
+  // that we can access.
+  const CXXBaseSpecifier *PublicBase = nullptr;
+  for (const CXXBaseSpecifier *Base : Cast->path()) {
+    if (Base->getAccessSpecifier() != AS_public && !CastingFromThis)
+      break;
+    PublicBase = Base;
+    CastingFromThis = false;
+  }
+
+  if (PublicBase != nullptr)
+    return PublicBase->getType();
+
+  // We didn't find any public type that we could cast to. There may be more
+  // casts in `getSubExpr()`, so recurse. (If there aren't any more casts, this
+  // will return the type of `getSubExpr()`.)
+  return getPublicType(Cast->getSubExpr());
 }
 
-/// Matches any of the spellings of the optional types and sugar, aliases, etc.
-auto hasOptionalType() { return hasType(optionalOrAliasType()); }
+// Returns the least-derived type for the receiver of `MCE` that
+// `MCE.getImplicitObjectArgument()->IgnoreParentImpCasts()` can be downcast to.
+// Effectively, we upcast until we reach a non-public base class, unless that
+// base is a base of `*this`.
+//
+// This is needed to correctly match methods called on types derived from
+// `std::optional`.
+//
+// Say we have a `struct Derived : public std::optional<int> {} d;` For a call
+// `d.has_value()`, the `getImplicitObjectArgument()` looks like this:
+//
+//   ImplicitCastExpr 'const std::__optional_storage_base<int>' lvalue
+//   |            <UncheckedDerivedToBase (optional -> __optional_storage_base)>
+//   `-DeclRefExpr 'Derived' lvalue Var 'd' 'Derived'
+//
+// The type of the implicit object argument is `__optional_storage_base`
+// (since this is the internal type that `has_value()` is declared on). If we
+// call `IgnoreParenImpCasts()` on the implicit object argument, we get the
+// `DeclRefExpr`, which has type `Derived`. Neither of these types is
+// `optional`, and hence neither is sufficient for querying whether we are
+// calling a method on `optional`.
+//
+// Instead, starting with the most derived type, we need to follow the chain of
+// casts
+QualType getPublicReceiverType(const CXXMemberCallExpr &MCE) {
+  return getPublicType(MCE.getImplicitObjectArgument());
+}
+
+AST_MATCHER_P(CXXMemberCallExpr, publicReceiverType,
+              ast_matchers::internal::Matcher<QualType>, InnerMatcher) {
+  return InnerMatcher.matches(getPublicReceiverType(Node), Finder, Builder);
+}
 
 auto isOptionalMemberCallWithNameMatcher(
     ast_matchers::internal::Matcher<NamedDecl> matcher,
     const std::optional<StatementMatcher> &Ignorable = std::nullopt) {
-  auto Exception = unless(Ignorable ? expr(anyOf(*Ignorable, cxxThisExpr()))
-                                    : cxxThisExpr());
-  return cxxMemberCallExpr(
-      on(expr(Exception,
-              anyOf(hasOptionalType(),
-                    hasType(pointerType(pointee(optionalOrAliasType())))))),
-      callee(cxxMethodDecl(matcher)));
+  return cxxMemberCallExpr(Ignorable ? on(expr(unless(*Ignorable)))
+                                     : anything(),
+                           publicReceiverType(desugarsToOptionalType()),
+                           callee(cxxMethodDecl(matcher)));
 }
 
 auto isOptionalOperatorCallWithName(
@@ -129,49 +215,51 @@ auto inPlaceClass() {
 
 auto isOptionalNulloptConstructor() {
   return cxxConstructExpr(
-      hasOptionalType(),
       hasDeclaration(cxxConstructorDecl(parameterCountIs(1),
-                                        hasParameter(0, hasNulloptType()))));
+                                        hasParameter(0, hasNulloptType()))),
+      hasOptionalOrDerivedType());
 }
 
 auto isOptionalInPlaceConstructor() {
-  return cxxConstructExpr(hasOptionalType(),
-                          hasArgument(0, hasType(inPlaceClass())));
+  return cxxConstructExpr(hasArgument(0, hasType(inPlaceClass())),
+                          hasOptionalOrDerivedType());
 }
 
 auto isOptionalValueOrConversionConstructor() {
   return cxxConstructExpr(
-      hasOptionalType(),
       unless(hasDeclaration(
           cxxConstructorDecl(anyOf(isCopyConstructor(), isMoveConstructor())))),
-      argumentCountIs(1), hasArgument(0, unless(hasNulloptType())));
+      argumentCountIs(1), hasArgument(0, unless(hasNulloptType())),
+      hasOptionalOrDerivedType());
 }
 
 auto isOptionalValueOrConversionAssignment() {
   return cxxOperatorCallExpr(
       hasOverloadedOperatorName("="),
-      callee(cxxMethodDecl(ofClass(optionalClass()))),
+      callee(cxxMethodDecl(ofClass(optionalOrDerivedClass()))),
       unless(hasDeclaration(cxxMethodDecl(
           anyOf(isCopyAssignmentOperator(), isMoveAssignmentOperator())))),
       argumentCountIs(2), hasArgument(1, unless(hasNulloptType())));
 }
 
 auto isOptionalNulloptAssignment() {
-  return cxxOperatorCallExpr(hasOverloadedOperatorName("="),
-                             callee(cxxMethodDecl(ofClass(optionalClass()))),
-                             argumentCountIs(2),
-                             hasArgument(1, hasNulloptType()));
+  return cxxOperatorCallExpr(
+      hasOverloadedOperatorName("="),
+      callee(cxxMethodDecl(ofClass(optionalOrDerivedClass()))),
+      argumentCountIs(2), hasArgument(1, hasNulloptType()));
 }
 
 auto isStdSwapCall() {
   return callExpr(callee(functionDecl(hasName("std::swap"))),
-                  argumentCountIs(2), hasArgument(0, hasOptionalType()),
-                  hasArgument(1, hasOptionalType()));
+                  argumentCountIs(2),
+                  hasArgument(0, hasOptionalOrDerivedType()),
+                  hasArgument(1, hasOptionalOrDerivedType()));
 }
 
 auto isStdForwardCall() {
   return callExpr(callee(functionDecl(hasName("std::forward"))),
-                  argumentCountIs(1), hasArgument(0, hasOptionalType()));
+                  argumentCountIs(1),
+                  hasArgument(0, hasOptionalOrDerivedType()));
 }
 
 constexpr llvm::StringLiteral ValueOrCallID = "ValueOrCall";
@@ -212,8 +300,9 @@ auto isValueOrNotEqX() {
 }
 
 auto isCallReturningOptional() {
-  return callExpr(hasType(qualType(anyOf(
-      optionalOrAliasType(), referenceType(pointee(optionalOrAliasType()))))));
+  return callExpr(hasType(qualType(
+      anyOf(desugarsToOptionalOrDerivedType(),
+            referenceType(pointee(desugarsToOptionalOrDerivedType()))))));
 }
 
 template <typename L, typename R>
@@ -275,12 +364,9 @@ BoolValue *getHasValue(Environment &Env, RecordStorageLocation *OptionalLoc) {
   return HasValueVal;
 }
 
-/// Returns true if and only if `Type` is an optional type.
-bool isOptionalType(QualType Type) {
-  if (!Type->isRecordType())
-    return false;
-  const CXXRecordDecl *D = Type->getAsCXXRecordDecl();
-  return D != nullptr && hasOptionalClassName(*D);
+QualType valueTypeFromOptionalDecl(const CXXRecordDecl &RD) {
+  auto &CTSD = cast<ClassTemplateSpecializationDecl>(RD);
+  return CTSD.getTemplateArgs()[0].getAsType();
 }
 
 /// Returns the number of optional wrappers in `Type`.
@@ -288,15 +374,13 @@ bool isOptionalType(QualType Type) {
 /// For example, if `Type` is `optional<optional<int>>`, the result of this
 /// function will be 2.
 int countOptionalWrappers(const ASTContext &ASTCtx, QualType Type) {
-  if (!isOptionalType(Type))
+  const CXXRecordDecl *Optional =
+      getOptionalBaseClass(Type->getAsCXXRecordDecl());
+  if (Optional == nullptr)
     return 0;
   return 1 + countOptionalWrappers(
                  ASTCtx,
-                 cast<ClassTemplateSpecializationDecl>(Type->getAsRecordDecl())
-                     ->getTemplateArgs()
-                     .get(0)
-                     .getAsType()
-                     .getDesugaredType(ASTCtx));
+                 valueTypeFromOptionalDecl(*Optional).getDesugaredType(ASTCtx));
 }
 
 StorageLocation *getLocBehindPossiblePointer(const Expr &E,
@@ -843,13 +927,7 @@ auto buildDiagnoseMatchSwitch(
 
 ast_matchers::DeclarationMatcher
 UncheckedOptionalAccessModel::optionalClassDecl() {
-  return optionalClass();
-}
-
-static QualType valueTypeFromOptionalType(QualType OptionalTy) {
-  auto *CTSD =
-      cast<ClassTemplateSpecializationDecl>(OptionalTy->getAsCXXRecordDecl());
-  return CTSD->getTemplateArgs()[0].getAsType();
+  return cxxRecordDecl(optionalClass());
 }
 
 UncheckedOptionalAccessModel::UncheckedOptionalAccessModel(ASTContext &Ctx,
@@ -858,9 +936,11 @@ UncheckedOptionalAccessModel::UncheckedOptionalAccessModel(ASTContext &Ctx,
       TransferMatchSwitch(buildTransferMatchSwitch()) {
   Env.getDataflowAnalysisContext().setSyntheticFieldCallback(
       [&Ctx](QualType Ty) -> llvm::StringMap<QualType> {
-        if (!isOptionalType(Ty))
+        const CXXRecordDecl *Optional =
+            getOptionalBaseClass(Ty->getAsCXXRecordDecl());
+        if (Optional == nullptr)
           return {};
-        return {{"value", valueTypeFromOptionalType(Ty)},
+        return {{"value", valueTypeFromOptionalDecl(*Optional)},
                 {"has_value", Ctx.BoolTy}};
       });
 }
diff --git a/clang/lib/Analysis/FlowSensitive/RecordOps.cpp b/clang/lib/Analysis/FlowSensitive/RecordOps.cpp
index da4dd6dc078515..2f0b0e5c5640c3 100644
--- a/clang/lib/Analysis/FlowSensitive/RecordOps.cpp
+++ b/clang/lib/Analysis/FlowSensitive/RecordOps.cpp
@@ -14,18 +14,52 @@
 
 #define DEBUG_TYPE "dataflow"
 
-void clang::dataflow::copyRecord(RecordStorageLocation &Src,
-                                 RecordStorageLocation &Dst, Environment &Env) {
+namespace clang::dataflow {
+
+static void copyField(const ValueDecl &Field, StorageLocation *SrcFieldLoc,
+                      StorageLocation *DstFieldLoc, RecordStorageLocation &Dst,
+                      Environment &Env) {
+  assert(Field.getType()->isReferenceType() ||
+         (SrcFieldLoc != nullptr && DstFieldLoc != nullptr));
+
+  if (Field.getType()->isRecordType()) {
+    copyRecord(cast<RecordStorageLocation>(*SrcFieldLoc),
+               cast<RecordStorageLocation>(*DstFieldLoc), Env);
+  } else if (Field.getType()->isReferenceType()) {
+    Dst.setChild(Field, SrcFieldLoc);
+  } else {
+    if (Value *Val = Env.getValue(*SrcFieldLoc))
+      Env.setValue(*DstFieldLoc, *Val);
+    else
+      Env.clearValue(*DstFieldLoc);
+  }
+}
+
+static void copySyntheticField(QualType FieldType, StorageLocation &SrcFieldLoc,
+                               StorageLocation &DstFieldLoc, Environment &Env) {
+  if (FieldType->isRecordType()) {
+    copyRecord(cast<RecordStorageLocation>(SrcFieldLoc),
+               cast<RecordStorageLocation>(DstFieldLoc), Env);
+  } else {
+    if (Value *Val = Env.getValue(SrcFieldLoc))
+      Env.setValue(DstFieldLoc, *Val);
+    else
+      Env.clearValue(DstFieldLoc);
+  }
+}
+
+void copyRecord(RecordStorageLocation &Src, RecordStorageLocation &Dst,
+                Environment &Env) {
   auto SrcType = Src.getType().getCanonicalType().getUnqualifiedType();
   auto DstType = Dst.getType().getCanonicalType().getUnqualifiedType();
 
   auto SrcDecl = SrcType->getAsCXXRecordDecl();
   auto DstDecl = DstType->getAsCXXRecordDecl();
 
-  bool compatibleTypes =
+  [[maybe_unused]] bool compatibleTypes =
       SrcType == DstType ||
-      (SrcDecl && DstDecl && SrcDecl->isDerivedFrom(DstDecl));
-  (void)compatibleTypes;
+      (SrcDecl != nullptr && DstDecl != nullptr &&
+       (SrcDecl->isDerivedFrom(DstDecl) || DstDecl->isDerivedFrom(SrcDecl)));
 
   LLVM_DEBUG({
     if (!compatibleTypes) {
@@ -35,45 +69,27 @@ void clang::dataflow::copyRecord(RecordStorageLocation &Src,
   });
   assert(compatibleTypes);
 
-  for (auto [Field, DstFieldLoc] : Dst.children()) {
-    StorageLocation *SrcFieldLoc = Src.getChild(*Field);
-
-    assert(Field->getType()->isReferenceType() ||
-           (SrcFieldLoc != nullptr && DstFieldLoc != nullptr));
-
-    if (Field->getType()->isRecordType()) {
-      copyRecord(cast<RecordStorageLocation>(*SrcFieldLoc),
-                 cast<RecordStorageLocation>(*DstFieldLoc), Env);
-    } else if (Field->getType()->isReferenceType()) {
-      Dst.setChild(*Field, SrcFieldLoc);
-    } else {
-      if (Value *Val = Env.getValue(*SrcFieldLoc))
-        Env.setValue(*DstFieldLoc, *Val);
-      else
-        Env.clearValue(*DstFieldLoc);
-    }
-  }
-
-  for (const auto &[Name, SynthFieldLoc] : Src.synthetic_fields()) {
-    if (SynthFieldLoc->getType()->isRecordType()) {
-      copyRecord(*cast<RecordStorageLocation>(SynthFieldLoc),
-                 cast<RecordStorageLocation>(Dst.getSyntheticField(Name)), Env);
-    } else {
-      if (Value *Val = Env.getValue(*SynthFieldLoc))
-        Env.setValue(Dst.getSyntheticField(Name), *Val);
-      else
-        Env.clearValue(Dst.getSyntheticField(Name));
-    }
+  if (SrcType == DstType || (SrcDecl != nullptr && DstDecl != nullptr &&
+                             SrcDecl->isDerivedFrom(DstDecl))) {
+    for (auto [Field, DstFieldLoc] : Dst.children())
+      copyField(*Field, Src.getChild(*Field), DstFieldLoc, Dst, Env);
+    for (const auto &[Name, DstFieldLoc] : Dst.synthetic_fields())
+      copySyntheticField(DstFieldLoc->getType(), Src.getSyntheticField(Name),
+                         *DstFieldLoc, Env);
+  } else {
+    for (auto [Field, SrcFieldLoc] : Src.children())
+      copyField(*Field, SrcFieldLoc, Dst.getChild(*Field), Dst, Env);
+    for (const auto &[Name, SrcFieldLoc] : Src.synthetic_fields())
+      copySyntheticField(SrcFieldLoc->getType(), *SrcFieldLoc,
+                         Dst.getSyntheticField(Name), Env);
   }
 
   RecordValue *DstVal = &Env.create<RecordValue>(Dst);
   Env.setValue(Dst, *DstVal);
 }
 
-bool clang::dataflow::recordsEqual(const RecordStorageLocation &Loc1,
-                                   const Environment &Env1,
-                                   const RecordStorageLocation &Loc2,
-                                   const Environment &Env2) {
+bool recordsEqual(const RecordStorageLocation &Loc1, const Environment &Env1,
+                  const RecordStorageLocation &Loc2, const Environment &Env2) {
   LLVM_DEBUG({
     if (Loc2.getType().getCanonicalType().getUnqualifiedType() !=
         Loc1.getType().getCanonicalType().getUnqualifiedType()) {
@@ -116,3 +132,5 @@ bool clang::dataflow::recordsEqual(const RecordStorageLocation &Loc1,
 
   return true;
 }
+
+} // namespace clang::dataflow
diff --git a/clang/lib/Analysis/FlowSensitive/Transfer.cpp b/clang/lib/Analysis/FlowSensitive/Transfer.cpp
index 04aa2831df0558..960e9688ffb725 100644
--- a/clang/lib/Analysis/FlowSensitive/Transfer.cpp
+++ b/clang/lib/Analysis/FlowSensitive/Transfer.cpp
@@ -20,7 +20,7 @@
 #include "clang/AST/OperationKinds.h"
 #include "clang/AST/Stmt.h"
 #include "clang/AST/StmtVisitor.h"
-#include "clang/Analysis/FlowSensitive/ControlFlowContext.h"
+#include "clang/Analysis/FlowSensitive/AdornedCFG.h"
 #include "clang/Analysis/FlowSensitive/DataflowEnvironment.h"
 #include "clang/Analysis/FlowSensitive/NoopAnalysis.h"
 #include "clang/Analysis/FlowSensitive/RecordOps.h"
@@ -38,9 +38,9 @@ namespace clang {
 namespace dataflow {
 
 const Environment *StmtToEnvMap::getEnvironment(const Stmt &S) const {
-  auto BlockIt = CFCtx.getStmtToBlock().find(&ignoreCFGOmittedNodes(S));
-  assert(BlockIt != CFCtx.getStmtToBlock().end());
-  if (!CFCtx.isBlockReachable(*BlockIt->getSecond()))
+  auto BlockIt = ACFG.getStmtToBlock().find(&ignoreCFGOmittedNodes(S));
+  assert(BlockIt != ACFG.getStmtToBlock().end());
+  if (!ACFG.isBlockReachable(*BlockIt->getSecond()))
     return nullptr;
   if (BlockIt->getSecond()->getBlockID() == CurBlockID)
     return &CurState.Env;
@@ -450,6 +450,25 @@ class TransferVisitor : public ConstStmtVisitor<TransferVisitor> {
     Env.setStorageLocation(*S, *MemberLoc);
   }
 
+  void VisitCXXDefaultArgExpr(const CXXDefaultArgExpr *S) {
+    const Expr *ArgExpr = S->getExpr();
+    assert(ArgExpr != nullptr);
+    propagateValueOrStorageLocation(*ArgExpr, *S, Env);
+
+    // If this is a prvalue of record type, we consider it to be an "original
+    // record constructor", which we always require to have a `RecordValue`.
+    // So make sure we have a value if we didn't propagate one above.
+    if (S->isPRValue() && S->getType()->isRecordType()) {
+      if (Env.getValue(*S) == nullptr) {
+        Value *Val = Env.createValue(S->getType());
+        // We're guaranteed to always be able to create a value for record
+        // types.
+        assert(Val != nullptr);
+        Env.setValue(*S, *Val);
+      }
+    }
+  }
+
   void VisitCXXDefaultInitExpr(const CXXDefaultInitExpr *S) {
     const Expr *InitExpr = S->getExpr();
     assert(InitExpr != nullptr);
@@ -525,15 +544,6 @@ class TransferVisitor : public ConstStmtVisitor<TransferVisitor> {
       if (LocSrc == nullptr || LocDst == nullptr)
         return;
 
-      // The assignment operators are different from the type of the destination
-      // in this model (i.e. in one of their base classes). This must be very
-      // rare and we just bail.
-      if (Method->getFunctionObjectParameterType()
-              .getCanonicalType()
-              .getUnqualifiedType() !=
-          LocDst->getType().getCanonicalType().getUnqualifiedType())
-        return;
-
       copyRecord(*LocSrc, *LocDst, Env);
 
       // If the expr is a glvalue, we can reasonably assume the operator is
@@ -836,27 +846,26 @@ class TransferVisitor : public ConstStmtVisitor<TransferVisitor> {
           Env.canDescend(Options.ContextSensitiveOpts->Depth, F)))
       return;
 
-    const ControlFlowContext *CFCtx =
-        Env.getDataflowAnalysisContext().getControlFlowContext(F);
-    if (!CFCtx)
+    const AdornedCFG *ACFG = Env.getDataflowAnalysisContext().getAdornedCFG(F);
+    if (!ACFG)
       return;
 
     // FIXME: We don't support context-sensitive analysis of recursion, so
     // we should return early here if `F` is the same as the `FunctionDecl`
     // holding `S` itself.
 
-    auto ExitBlock = CFCtx->getCFG().getExit().getBlockID();
+    auto ExitBlock = ACFG->getCFG().getExit().getBlockID();
 
     auto CalleeEnv = Env.pushCall(S);
 
     // FIXME: Use the same analysis as the caller for the callee. Note,
     // though, that doing so would require support for changing the analysis's
     // ASTContext.
-    auto Analysis = NoopAnalysis(CFCtx->getDecl().getASTContext(),
+    auto Analysis = NoopAnalysis(ACFG->getDecl().getASTContext(),
                                  DataflowAnalysisOptions{Options});
 
     auto BlockToOutputState =
-        dataflow::runDataflowAnalysis(*CFCtx, Analysis, CalleeEnv);
+        dataflow::runDataflowAnalysis(*ACFG, Analysis, CalleeEnv);
     assert(BlockToOutputState);
     assert(ExitBlock < BlockToOutputState->size());
 
diff --git a/clang/lib/Analysis/FlowSensitive/TypeErasedDataflowAnalysis.cpp b/clang/lib/Analysis/FlowSensitive/TypeErasedDataflowAnalysis.cpp
index 939247c047c66e..595f70f819ddb5 100644
--- a/clang/lib/Analysis/FlowSensitive/TypeErasedDataflowAnalysis.cpp
+++ b/clang/lib/Analysis/FlowSensitive/TypeErasedDataflowAnalysis.cpp
@@ -11,7 +11,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include <algorithm>
 #include <optional>
 #include <system_error>
 #include <utility>
@@ -33,7 +32,6 @@
 #include "clang/Analysis/FlowSensitive/Value.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallBitVector.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/Error.h"
 
@@ -64,106 +62,44 @@ static bool isBackedgeNode(const CFGBlock &B) {
 
 namespace {
 
-// The return type of the visit functions in TerminatorVisitor. The first
-// element represents the terminator expression (that is the conditional
-// expression in case of a path split in the CFG). The second element
-// represents whether the condition was true or false.
-using TerminatorVisitorRetTy = std::pair<const Expr *, bool>;
-
-/// Extends the flow condition of an environment based on a terminator
-/// statement.
+/// Extracts the terminator's condition expression.
 class TerminatorVisitor
-    : public ConstStmtVisitor<TerminatorVisitor, TerminatorVisitorRetTy> {
+    : public ConstStmtVisitor<TerminatorVisitor, const Expr *> {
 public:
-  TerminatorVisitor(Environment &Env, int BlockSuccIdx)
-      : Env(Env), BlockSuccIdx(BlockSuccIdx) {}
-
-  TerminatorVisitorRetTy VisitIfStmt(const IfStmt *S) {
-    auto *Cond = S->getCond();
-    assert(Cond != nullptr);
-    return extendFlowCondition(*Cond);
-  }
-
-  TerminatorVisitorRetTy VisitWhileStmt(const WhileStmt *S) {
-    auto *Cond = S->getCond();
-    assert(Cond != nullptr);
-    return extendFlowCondition(*Cond);
-  }
-
-  TerminatorVisitorRetTy VisitDoStmt(const DoStmt *S) {
-    auto *Cond = S->getCond();
-    assert(Cond != nullptr);
-    return extendFlowCondition(*Cond);
-  }
-
-  TerminatorVisitorRetTy VisitForStmt(const ForStmt *S) {
-    auto *Cond = S->getCond();
-    if (Cond != nullptr)
-      return extendFlowCondition(*Cond);
-    return {nullptr, false};
-  }
-
-  TerminatorVisitorRetTy VisitCXXForRangeStmt(const CXXForRangeStmt *) {
+  TerminatorVisitor() = default;
+  const Expr *VisitIfStmt(const IfStmt *S) { return S->getCond(); }
+  const Expr *VisitWhileStmt(const WhileStmt *S) { return S->getCond(); }
+  const Expr *VisitDoStmt(const DoStmt *S) { return S->getCond(); }
+  const Expr *VisitForStmt(const ForStmt *S) { return S->getCond(); }
+  const Expr *VisitCXXForRangeStmt(const CXXForRangeStmt *) {
     // Don't do anything special for CXXForRangeStmt, because the condition
     // (being implicitly generated) isn't visible from the loop body.
-    return {nullptr, false};
+    return nullptr;
   }
-
-  TerminatorVisitorRetTy VisitBinaryOperator(const BinaryOperator *S) {
+  const Expr *VisitBinaryOperator(const BinaryOperator *S) {
     assert(S->getOpcode() == BO_LAnd || S->getOpcode() == BO_LOr);
-    auto *LHS = S->getLHS();
-    assert(LHS != nullptr);
-    return extendFlowCondition(*LHS);
+    return S->getLHS();
   }
-
-  TerminatorVisitorRetTy
-  VisitConditionalOperator(const ConditionalOperator *S) {
-    auto *Cond = S->getCond();
-    assert(Cond != nullptr);
-    return extendFlowCondition(*Cond);
+  const Expr *VisitConditionalOperator(const ConditionalOperator *S) {
+    return S->getCond();
   }
-
-private:
-  TerminatorVisitorRetTy extendFlowCondition(const Expr &Cond) {
-    auto *Val = Env.get<BoolValue>(Cond);
-    // In transferCFGBlock(), we ensure that we always have a `Value` for the
-    // terminator condition, so assert this.
-    // We consciously assert ourselves instead of asserting via `cast()` so
-    // that we get a more meaningful line number if the assertion fails.
-    assert(Val != nullptr);
-
-    bool ConditionValue = true;
-    // The condition must be inverted for the successor that encompasses the
-    // "else" branch, if such exists.
-    if (BlockSuccIdx == 1) {
-      Val = &Env.makeNot(*Val);
-      ConditionValue = false;
-    }
-
-    Env.assume(Val->formula());
-    return {&Cond, ConditionValue};
-  }
-
-  Environment &Env;
-  int BlockSuccIdx;
 };
 
 /// Holds data structures required for running dataflow analysis.
 struct AnalysisContext {
-  AnalysisContext(const ControlFlowContext &CFCtx,
-                  TypeErasedDataflowAnalysis &Analysis,
+  AnalysisContext(const AdornedCFG &ACFG, TypeErasedDataflowAnalysis &Analysis,
                   const Environment &InitEnv,
                   llvm::ArrayRef<std::optional<TypeErasedDataflowAnalysisState>>
                       BlockStates)
-      : CFCtx(CFCtx), Analysis(Analysis), InitEnv(InitEnv),
+      : ACFG(ACFG), Analysis(Analysis), InitEnv(InitEnv),
         Log(*InitEnv.getDataflowAnalysisContext().getOptions().Log),
         BlockStates(BlockStates) {
-    Log.beginAnalysis(CFCtx, Analysis);
+    Log.beginAnalysis(ACFG, Analysis);
   }
   ~AnalysisContext() { Log.endAnalysis(); }
 
   /// Contains the CFG being analyzed.
-  const ControlFlowContext &CFCtx;
+  const AdornedCFG &ACFG;
   /// The analysis to be run.
   TypeErasedDataflowAnalysis &Analysis;
   /// Initial state to start the analysis.
@@ -176,19 +112,19 @@ struct AnalysisContext {
 
 class PrettyStackTraceAnalysis : public llvm::PrettyStackTraceEntry {
 public:
-  PrettyStackTraceAnalysis(const ControlFlowContext &CFCtx, const char *Message)
-      : CFCtx(CFCtx), Message(Message) {}
+  PrettyStackTraceAnalysis(const AdornedCFG &ACFG, const char *Message)
+      : ACFG(ACFG), Message(Message) {}
 
   void print(raw_ostream &OS) const override {
     OS << Message << "\n";
     OS << "Decl:\n";
-    CFCtx.getDecl().dump(OS);
+    ACFG.getDecl().dump(OS);
     OS << "CFG:\n";
-    CFCtx.getCFG().print(OS, LangOptions(), false);
+    ACFG.getCFG().print(OS, LangOptions(), false);
   }
 
 private:
-  const ControlFlowContext &CFCtx;
+  const AdornedCFG &ACFG;
   const char *Message;
 };
 
@@ -264,9 +200,13 @@ class JoinedStateBuilder {
     return Result;
   }
 };
-
 } // namespace
 
+static const Expr *getTerminatorCondition(const Stmt *TerminatorStmt) {
+  return TerminatorStmt == nullptr ? nullptr
+                                   : TerminatorVisitor().Visit(TerminatorStmt);
+}
+
 /// Computes the input state for a given basic block by joining the output
 /// states of its predecessors.
 ///
@@ -303,7 +243,7 @@ computeBlockInputState(const CFGBlock &Block, AnalysisContext &AC) {
     // See `NoreturnDestructorTest` for concrete examples.
     if (Block.succ_begin()->getReachableBlock() != nullptr &&
         Block.succ_begin()->getReachableBlock()->hasNoReturnElement()) {
-      auto &StmtToBlock = AC.CFCtx.getStmtToBlock();
+      auto &StmtToBlock = AC.ACFG.getStmtToBlock();
       auto StmtBlock = StmtToBlock.find(Block.getTerminatorStmt());
       assert(StmtBlock != StmtToBlock.end());
       llvm::erase(Preds, StmtBlock->getSecond());
@@ -319,7 +259,7 @@ computeBlockInputState(const CFGBlock &Block, AnalysisContext &AC) {
   // all predecessors have expression state consumed in a different block.
   Environment::ExprJoinBehavior JoinBehavior = Environment::DiscardExprState;
   for (const CFGBlock *Pred : Preds) {
-    if (Pred && AC.CFCtx.containsExprConsumedInDifferentBlock(*Pred)) {
+    if (Pred && AC.ACFG.containsExprConsumedInDifferentBlock(*Pred)) {
       JoinBehavior = Environment::KeepExprState;
       break;
     }
@@ -338,25 +278,32 @@ computeBlockInputState(const CFGBlock &Block, AnalysisContext &AC) {
     if (!MaybePredState)
       continue;
 
+    const TypeErasedDataflowAnalysisState &PredState = *MaybePredState;
+    const Expr *Cond = getTerminatorCondition(Pred->getTerminatorStmt());
+    if (Cond == nullptr) {
+      Builder.addUnowned(PredState);
+      continue;
+    }
+
+    bool BranchVal = blockIndexInPredecessor(*Pred, Block) == 0;
+
+    // `transferBranch` may need to mutate the environment to describe the
+    // dynamic effect of the terminator for a given branch.  Copy now.
+    TypeErasedDataflowAnalysisState Copy = MaybePredState->fork();
     if (AC.Analysis.builtinOptions()) {
-      if (const Stmt *PredTerminatorStmt = Pred->getTerminatorStmt()) {
-        // We have a terminator: we need to mutate an environment to describe
-        // when the terminator is taken. Copy now.
-        TypeErasedDataflowAnalysisState Copy = MaybePredState->fork();
-
-        auto [Cond, CondValue] =
-            TerminatorVisitor(Copy.Env, blockIndexInPredecessor(*Pred, Block))
-                .Visit(PredTerminatorStmt);
-        if (Cond != nullptr)
-          // FIXME: Call transferBranchTypeErased even if BuiltinTransferOpts
-          // are not set.
-          AC.Analysis.transferBranchTypeErased(CondValue, Cond, Copy.Lattice,
-                                               Copy.Env);
-        Builder.addOwned(std::move(Copy));
-        continue;
-      }
+      auto *CondVal = Copy.Env.get<BoolValue>(*Cond);
+      // In transferCFGBlock(), we ensure that we always have a `Value`
+      // for the terminator condition, so assert this. We consciously
+      // assert ourselves instead of asserting via `cast()` so that we get
+      // a more meaningful line number if the assertion fails.
+      assert(CondVal != nullptr);
+      BoolValue *AssertedVal =
+          BranchVal ? CondVal : &Copy.Env.makeNot(*CondVal);
+      Copy.Env.assume(AssertedVal->formula());
     }
-    Builder.addUnowned(*MaybePredState);
+    AC.Analysis.transferBranchTypeErased(BranchVal, Cond, Copy.Lattice,
+                                         Copy.Env);
+    Builder.addOwned(std::move(Copy));
   }
   return std::move(Builder).take();
 }
@@ -368,7 +315,7 @@ builtinTransferStatement(unsigned CurBlockID, const CFGStmt &Elt,
                          AnalysisContext &AC) {
   const Stmt *S = Elt.getStmt();
   assert(S != nullptr);
-  transfer(StmtToEnvMap(AC.CFCtx, AC.BlockStates, CurBlockID, InputState), *S,
+  transfer(StmtToEnvMap(AC.ACFG, AC.BlockStates, CurBlockID, InputState), *S,
            InputState.Env);
 }
 
@@ -511,9 +458,8 @@ transferCFGBlock(const CFGBlock &Block, AnalysisContext &AC,
       // takes a `CFGElement` as input, but some expressions only show up as a
       // terminator condition, but not as a `CFGElement`. The condition of an if
       // statement is one such example.
-      transfer(
-          StmtToEnvMap(AC.CFCtx, AC.BlockStates, Block.getBlockID(), State),
-          *TerminatorCond, State.Env);
+      transfer(StmtToEnvMap(AC.ACFG, AC.BlockStates, Block.getBlockID(), State),
+               *TerminatorCond, State.Env);
 
     // If the transfer function didn't produce a value, create an atom so that
     // we have *some* value for the condition expression. This ensures that
@@ -528,13 +474,13 @@ transferCFGBlock(const CFGBlock &Block, AnalysisContext &AC,
 
 llvm::Expected<std::vector<std::optional<TypeErasedDataflowAnalysisState>>>
 runTypeErasedDataflowAnalysis(
-    const ControlFlowContext &CFCtx, TypeErasedDataflowAnalysis &Analysis,
+    const AdornedCFG &ACFG, TypeErasedDataflowAnalysis &Analysis,
     const Environment &InitEnv,
     std::function<void(const CFGElement &,
                        const TypeErasedDataflowAnalysisState &)>
         PostVisitCFG,
     std::int32_t MaxBlockVisits) {
-  PrettyStackTraceAnalysis CrashInfo(CFCtx, "runTypeErasedDataflowAnalysis");
+  PrettyStackTraceAnalysis CrashInfo(ACFG, "runTypeErasedDataflowAnalysis");
 
   std::optional<Environment> MaybeStartingEnv;
   if (InitEnv.callStackSize() == 1) {
@@ -544,7 +490,7 @@ runTypeErasedDataflowAnalysis(
   const Environment &StartingEnv =
       MaybeStartingEnv ? *MaybeStartingEnv : InitEnv;
 
-  const clang::CFG &CFG = CFCtx.getCFG();
+  const clang::CFG &CFG = ACFG.getCFG();
   PostOrderCFGView POV(&CFG);
   ForwardDataflowWorklist Worklist(CFG, &POV);
 
@@ -557,7 +503,7 @@ runTypeErasedDataflowAnalysis(
                                      StartingEnv.fork()};
   Worklist.enqueueSuccessors(&Entry);
 
-  AnalysisContext AC(CFCtx, Analysis, StartingEnv, BlockStates);
+  AnalysisContext AC(ACFG, Analysis, StartingEnv, BlockStates);
   std::int32_t BlockVisits = 0;
   while (const CFGBlock *Block = Worklist.dequeue()) {
     LLVM_DEBUG(llvm::dbgs()
@@ -615,7 +561,7 @@ runTypeErasedDataflowAnalysis(
   // state set to `std::nullopt` at this point) to also analyze dead code.
 
   if (PostVisitCFG) {
-    for (const CFGBlock *Block : CFCtx.getCFG()) {
+    for (const CFGBlock *Block : ACFG.getCFG()) {
       // Skip blocks that were not evaluated.
       if (!BlockStates[Block->getBlockID()])
         continue;
diff --git a/clang/lib/Basic/DiagnosticIDs.cpp b/clang/lib/Basic/DiagnosticIDs.cpp
index b353a6627f298b..cd42573968b212 100644
--- a/clang/lib/Basic/DiagnosticIDs.cpp
+++ b/clang/lib/Basic/DiagnosticIDs.cpp
@@ -49,6 +49,7 @@ struct StaticDiagInfoDescriptionStringTable {
 #include "clang/Basic/DiagnosticSemaKinds.inc"
 #include "clang/Basic/DiagnosticAnalysisKinds.inc"
 #include "clang/Basic/DiagnosticRefactoringKinds.inc"
+#include "clang/Basic/DiagnosticInstallAPIKinds.inc"
   // clang-format on
 #undef DIAG
 };
@@ -70,7 +71,8 @@ const StaticDiagInfoDescriptionStringTable StaticDiagInfoDescriptions = {
 #include "clang/Basic/DiagnosticSemaKinds.inc"
 #include "clang/Basic/DiagnosticAnalysisKinds.inc"
 #include "clang/Basic/DiagnosticRefactoringKinds.inc"
-  // clang-format on
+#include "clang/Basic/DiagnosticInstallAPIKinds.inc"
+// clang-format on
 #undef DIAG
 };
 
@@ -95,7 +97,8 @@ const uint32_t StaticDiagInfoDescriptionOffsets[] = {
 #include "clang/Basic/DiagnosticSemaKinds.inc"
 #include "clang/Basic/DiagnosticAnalysisKinds.inc"
 #include "clang/Basic/DiagnosticRefactoringKinds.inc"
-  // clang-format on
+#include "clang/Basic/DiagnosticInstallAPIKinds.inc"
+// clang-format on
 #undef DIAG
 };
 
@@ -173,6 +176,7 @@ VALIDATE_DIAG_SIZE(CROSSTU)
 VALIDATE_DIAG_SIZE(SEMA)
 VALIDATE_DIAG_SIZE(ANALYSIS)
 VALIDATE_DIAG_SIZE(REFACTORING)
+VALIDATE_DIAG_SIZE(INSTALLAPI)
 #undef VALIDATE_DIAG_SIZE
 #undef STRINGIFY_NAME
 
@@ -204,6 +208,7 @@ const StaticDiagInfoRec StaticDiagInfo[] = {
 #include "clang/Basic/DiagnosticSemaKinds.inc"
 #include "clang/Basic/DiagnosticAnalysisKinds.inc"
 #include "clang/Basic/DiagnosticRefactoringKinds.inc"
+#include "clang/Basic/DiagnosticInstallAPIKinds.inc"
 // clang-format on
 #undef DIAG
 };
@@ -246,6 +251,7 @@ CATEGORY(CROSSTU, COMMENT)
 CATEGORY(SEMA, CROSSTU)
 CATEGORY(ANALYSIS, SEMA)
 CATEGORY(REFACTORING, ANALYSIS)
+CATEGORY(INSTALLAPI, REFACTORING)
 #undef CATEGORY
 
   // Avoid out of bounds reads.
@@ -855,6 +861,9 @@ bool DiagnosticIDs::isUnrecoverable(unsigned DiagID) const {
   if (isARCDiagnostic(DiagID))
     return false;
 
+  if (isCodegenABICheckDiagnostic(DiagID))
+    return false;
+
   return true;
 }
 
@@ -862,3 +871,8 @@ bool DiagnosticIDs::isARCDiagnostic(unsigned DiagID) {
   unsigned cat = getCategoryNumberForDiag(DiagID);
   return DiagnosticIDs::getCategoryNameFromID(cat).starts_with("ARC ");
 }
+
+bool DiagnosticIDs::isCodegenABICheckDiagnostic(unsigned DiagID) {
+  unsigned cat = getCategoryNumberForDiag(DiagID);
+  return DiagnosticIDs::getCategoryNameFromID(cat) == "Codegen ABI Check";
+}
diff --git a/clang/lib/Basic/TargetInfo.cpp b/clang/lib/Basic/TargetInfo.cpp
index 96b3ad9ba2f273..5d9055174c089a 100644
--- a/clang/lib/Basic/TargetInfo.cpp
+++ b/clang/lib/Basic/TargetInfo.cpp
@@ -925,6 +925,10 @@ bool TargetInfo::validateInputConstraint(
   return true;
 }
 
+bool TargetInfo::validatePointerAuthKey(const llvm::APSInt &value) const {
+  return false;
+}
+
 void TargetInfo::CheckFixedPointBits() const {
   // Check that the number of fractional and integral bits (and maybe sign) can
   // fit into the bits given for a fixed point type.
diff --git a/clang/lib/Basic/Targets/AArch64.cpp b/clang/lib/Basic/Targets/AArch64.cpp
index 5abb060073c517..1c3199bd76eed6 100644
--- a/clang/lib/Basic/Targets/AArch64.cpp
+++ b/clang/lib/Basic/Targets/AArch64.cpp
@@ -11,9 +11,11 @@
 //===----------------------------------------------------------------------===//
 
 #include "AArch64.h"
+#include "clang/Basic/Diagnostic.h"
 #include "clang/Basic/LangOptions.h"
 #include "clang/Basic/TargetBuiltins.h"
 #include "clang/Basic/TargetInfo.h"
+#include "llvm/ADT/APSInt.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/StringSwitch.h"
@@ -199,13 +201,23 @@ AArch64TargetInfo::AArch64TargetInfo(const llvm::Triple &Triple,
 StringRef AArch64TargetInfo::getABI() const { return ABI; }
 
 bool AArch64TargetInfo::setABI(const std::string &Name) {
-  if (Name != "aapcs" && Name != "darwinpcs")
+  if (Name != "aapcs" && Name != "aapcs-soft" && Name != "darwinpcs")
     return false;
 
   ABI = Name;
   return true;
 }
 
+bool AArch64TargetInfo::validateTarget(DiagnosticsEngine &Diags) const {
+  if (hasFeature("fp") && ABI == "aapcs-soft") {
+    // aapcs-soft is not allowed for targets with an FPU, to avoid there being
+    // two incomatible ABIs.
+    Diags.Report(diag::err_target_unsupported_abi_with_fpu) << ABI;
+    return false;
+  }
+  return true;
+}
+
 bool AArch64TargetInfo::validateBranchProtection(StringRef Spec, StringRef,
                                                  BranchProtectionInfo &BPI,
                                                  StringRef &Err) const {
@@ -680,7 +692,8 @@ bool AArch64TargetInfo::hasFeature(StringRef Feature) const {
   return llvm::StringSwitch<bool>(Feature)
       .Cases("aarch64", "arm64", "arm", true)
       .Case("fmv", HasFMV)
-      .Cases("neon", "fp", "simd", FPU & NeonMode)
+      .Case("fp", FPU & FPUMode)
+      .Cases("neon", "simd", FPU & NeonMode)
       .Case("jscvt", HasJSCVT)
       .Case("fcma", HasFCMA)
       .Case("rng", HasRandGen)
@@ -1450,6 +1463,11 @@ int AArch64TargetInfo::getEHDataRegisterNumber(unsigned RegNo) const {
   return -1;
 }
 
+bool AArch64TargetInfo::validatePointerAuthKey(
+    const llvm::APSInt &value) const {
+  return 0 <= value && value <= 3;
+}
+
 bool AArch64TargetInfo::hasInt128Type() const { return true; }
 
 AArch64leTargetInfo::AArch64leTargetInfo(const llvm::Triple &Triple,
diff --git a/clang/lib/Basic/Targets/AArch64.h b/clang/lib/Basic/Targets/AArch64.h
index c1ba156860a122..542894c66412dc 100644
--- a/clang/lib/Basic/Targets/AArch64.h
+++ b/clang/lib/Basic/Targets/AArch64.h
@@ -195,10 +195,14 @@ class LLVM_LIBRARY_VISIBILITY AArch64TargetInfo : public TargetInfo {
 
   int getEHDataRegisterNumber(unsigned RegNo) const override;
 
+  bool validatePointerAuthKey(const llvm::APSInt &value) const override;
+
   const char *getBFloat16Mangling() const override { return "u6__bf16"; };
   bool hasInt128Type() const override;
 
   bool hasBitIntType() const override { return true; }
+
+  bool validateTarget(DiagnosticsEngine &Diags) const override;
 };
 
 class LLVM_LIBRARY_VISIBILITY AArch64leTargetInfo : public AArch64TargetInfo {
diff --git a/clang/lib/Basic/Targets/BPF.cpp b/clang/lib/Basic/Targets/BPF.cpp
index 26a54f631fcfc4..b5ba11a3bdca9d 100644
--- a/clang/lib/Basic/Targets/BPF.cpp
+++ b/clang/lib/Basic/Targets/BPF.cpp
@@ -36,7 +36,7 @@ void BPFTargetInfo::getTargetDefines(const LangOptions &Opts,
     return;
   }
 
-  Builder.defineMacro("__BPF_FEATURE_ARENA_CAST");
+  Builder.defineMacro("__BPF_FEATURE_ADDR_SPACE_CAST");
 
   if (CPU.empty() || CPU == "generic" || CPU == "v1") {
     Builder.defineMacro("__BPF_CPU_VERSION__", "1");
@@ -45,6 +45,7 @@ void BPFTargetInfo::getTargetDefines(const LangOptions &Opts,
 
   std::string CpuVerNumStr = CPU.substr(1);
   Builder.defineMacro("__BPF_CPU_VERSION__", CpuVerNumStr);
+  Builder.defineMacro("__BPF_FEATURE_MAY_GOTO");
 
   int CpuVerNum = std::stoi(CpuVerNumStr);
   if (CpuVerNum >= 2)
diff --git a/clang/lib/CodeGen/CGBlocks.cpp b/clang/lib/CodeGen/CGBlocks.cpp
index 0cbace7b7f7bbd..ad0b50d799618e 100644
--- a/clang/lib/CodeGen/CGBlocks.cpp
+++ b/clang/lib/CodeGen/CGBlocks.cpp
@@ -1540,7 +1540,10 @@ llvm::Function *CodeGenFunction::GenerateBlockFunction(
   llvm::BasicBlock *resume = Builder.GetInsertBlock();
 
   // Go back to the entry.
-  ++entry_ptr;
+  if (entry_ptr->getNextNonDebugInstruction())
+    entry_ptr = entry_ptr->getNextNonDebugInstruction()->getIterator();
+  else
+    entry_ptr = entry->end();
   Builder.SetInsertPoint(entry, entry_ptr);
 
   // Emit debug information for all the DeclRefExprs.
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 528a13fb275124..77cb269d43c5a8 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -858,7 +858,7 @@ static unsigned CountCountedByAttrs(const RecordDecl *RD) {
 
   for (const Decl *D : RD->decls()) {
     if (const auto *FD = dyn_cast<FieldDecl>(D);
-        FD && FD->hasAttr<CountedByAttr>()) {
+        FD && FD->getType()->isCountAttributedType()) {
       return ++Num;
     }
 
@@ -956,7 +956,7 @@ CodeGenFunction::emitFlexibleArrayMemberSize(const Expr *E, unsigned Type,
     //         };
     //    };
     //
-    // We don't konw which 'count' to use in this scenario:
+    // We don't know which 'count' to use in this scenario:
     //
     //     size_t get_size(struct union_of_fams *p) {
     //         return __builtin_dynamic_object_size(p, 1);
@@ -975,7 +975,7 @@ CodeGenFunction::emitFlexibleArrayMemberSize(const Expr *E, unsigned Type,
       FindFlexibleArrayMemberField(Ctx, OuterRD, FAMName, Offset);
   Offset = Ctx.toCharUnitsFromBits(Offset).getQuantity();
 
-  if (!FAMDecl || !FAMDecl->hasAttr<CountedByAttr>())
+  if (!FAMDecl || !FAMDecl->getType()->isCountAttributedType())
     // No flexible array member found or it doesn't have the "counted_by"
     // attribute.
     return nullptr;
@@ -5208,6 +5208,73 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
   case Builtin::BI__iso_volatile_store64:
     return RValue::get(EmitISOVolatileStore(*this, E));
 
+  case Builtin::BI__builtin_ptrauth_auth:
+  case Builtin::BI__builtin_ptrauth_auth_and_resign:
+  case Builtin::BI__builtin_ptrauth_blend_discriminator:
+  case Builtin::BI__builtin_ptrauth_sign_generic_data:
+  case Builtin::BI__builtin_ptrauth_sign_unauthenticated:
+  case Builtin::BI__builtin_ptrauth_strip: {
+    // Emit the arguments.
+    SmallVector<llvm::Value *, 5> Args;
+    for (auto argExpr : E->arguments())
+      Args.push_back(EmitScalarExpr(argExpr));
+
+    // Cast the value to intptr_t, saving its original type.
+    llvm::Type *OrigValueType = Args[0]->getType();
+    if (OrigValueType->isPointerTy())
+      Args[0] = Builder.CreatePtrToInt(Args[0], IntPtrTy);
+
+    switch (BuiltinID) {
+    case Builtin::BI__builtin_ptrauth_auth_and_resign:
+      if (Args[4]->getType()->isPointerTy())
+        Args[4] = Builder.CreatePtrToInt(Args[4], IntPtrTy);
+      LLVM_FALLTHROUGH;
+
+    case Builtin::BI__builtin_ptrauth_auth:
+    case Builtin::BI__builtin_ptrauth_sign_unauthenticated:
+      if (Args[2]->getType()->isPointerTy())
+        Args[2] = Builder.CreatePtrToInt(Args[2], IntPtrTy);
+      break;
+
+    case Builtin::BI__builtin_ptrauth_sign_generic_data:
+      if (Args[1]->getType()->isPointerTy())
+        Args[1] = Builder.CreatePtrToInt(Args[1], IntPtrTy);
+      break;
+
+    case Builtin::BI__builtin_ptrauth_blend_discriminator:
+    case Builtin::BI__builtin_ptrauth_strip:
+      break;
+    }
+
+    // Call the intrinsic.
+    auto IntrinsicID = [&]() -> unsigned {
+      switch (BuiltinID) {
+      case Builtin::BI__builtin_ptrauth_auth:
+        return llvm::Intrinsic::ptrauth_auth;
+      case Builtin::BI__builtin_ptrauth_auth_and_resign:
+        return llvm::Intrinsic::ptrauth_resign;
+      case Builtin::BI__builtin_ptrauth_blend_discriminator:
+        return llvm::Intrinsic::ptrauth_blend;
+      case Builtin::BI__builtin_ptrauth_sign_generic_data:
+        return llvm::Intrinsic::ptrauth_sign_generic;
+      case Builtin::BI__builtin_ptrauth_sign_unauthenticated:
+        return llvm::Intrinsic::ptrauth_sign;
+      case Builtin::BI__builtin_ptrauth_strip:
+        return llvm::Intrinsic::ptrauth_strip;
+      }
+      llvm_unreachable("bad ptrauth intrinsic");
+    }();
+    auto Intrinsic = CGM.getIntrinsic(IntrinsicID);
+    llvm::Value *Result = EmitRuntimeCall(Intrinsic, Args);
+
+    if (BuiltinID != Builtin::BI__builtin_ptrauth_sign_generic_data &&
+        BuiltinID != Builtin::BI__builtin_ptrauth_blend_discriminator &&
+        OrigValueType->isPointerTy()) {
+      Result = Builder.CreateIntToPtr(Result, OrigValueType);
+    }
+    return RValue::get(Result);
+  }
+
   case Builtin::BI__exception_code:
   case Builtin::BI_exception_code:
     return RValue::get(EmitSEHExceptionCode());
@@ -17969,6 +18036,17 @@ llvm::Value *CodeGenFunction::EmitScalarOrConstFoldImmArg(unsigned ICEArguments,
   return Arg;
 }
 
+Intrinsic::ID getDotProductIntrinsic(QualType QT) {
+  if (QT->hasSignedIntegerRepresentation())
+    return Intrinsic::dx_sdot;
+  if (QT->hasUnsignedIntegerRepresentation())
+    return Intrinsic::dx_udot;
+
+  assert(QT->hasFloatingRepresentation());
+  return Intrinsic::dx_dot;
+  ;
+}
+
 Value *CodeGenFunction::EmitHLSLBuiltinExpr(unsigned BuiltinID,
                                             const CallExpr *E) {
   if (!getLangOpts().HLSL)
@@ -17981,6 +18059,21 @@ Value *CodeGenFunction::EmitHLSLBuiltinExpr(unsigned BuiltinID,
         /*ReturnType=*/llvm::Type::getInt1Ty(getLLVMContext()),
         Intrinsic::dx_any, ArrayRef<Value *>{Op0}, nullptr, "dx.any");
   }
+  case Builtin::BI__builtin_hlsl_elementwise_clamp: {
+    Value *OpX = EmitScalarExpr(E->getArg(0));
+    Value *OpMin = EmitScalarExpr(E->getArg(1));
+    Value *OpMax = EmitScalarExpr(E->getArg(2));
+
+    QualType Ty = E->getArg(0)->getType();
+    bool IsUnsigned = false;
+    if (auto *VecTy = Ty->getAs<VectorType>())
+      Ty = VecTy->getElementType();
+    IsUnsigned = Ty->isUnsignedIntegerType();
+    return Builder.CreateIntrinsic(
+        /*ReturnType=*/OpX->getType(),
+        IsUnsigned ? Intrinsic::dx_uclamp : Intrinsic::dx_clamp,
+        ArrayRef<Value *>{OpX, OpMin, OpMax}, nullptr, "dx.clamp");
+  }
   case Builtin::BI__builtin_hlsl_dot: {
     Value *Op0 = EmitScalarExpr(E->getArg(0));
     Value *Op1 = EmitScalarExpr(E->getArg(1));
@@ -18014,45 +18107,19 @@ Value *CodeGenFunction::EmitHLSLBuiltinExpr(unsigned BuiltinID,
            "Dot product requires vectors to be of the same size.");
 
     return Builder.CreateIntrinsic(
-        /*ReturnType=*/T0->getScalarType(), Intrinsic::dx_dot,
+        /*ReturnType=*/T0->getScalarType(),
+        getDotProductIntrinsic(E->getArg(0)->getType()),
         ArrayRef<Value *>{Op0, Op1}, nullptr, "dx.dot");
   } break;
   case Builtin::BI__builtin_hlsl_lerp: {
     Value *X = EmitScalarExpr(E->getArg(0));
     Value *Y = EmitScalarExpr(E->getArg(1));
     Value *S = EmitScalarExpr(E->getArg(2));
-    llvm::Type *Xty = X->getType();
-    llvm::Type *Yty = Y->getType();
-    llvm::Type *Sty = S->getType();
-    if (!Xty->isVectorTy() && !Yty->isVectorTy() && !Sty->isVectorTy()) {
-      if (Xty->isFloatingPointTy()) {
-        auto V = Builder.CreateFSub(Y, X);
-        V = Builder.CreateFMul(S, V);
-        return Builder.CreateFAdd(X, V, "dx.lerp");
-      }
-      llvm_unreachable("Scalar Lerp is only supported on floats.");
-    }
-    // A VectorSplat should have happened
-    assert(Xty->isVectorTy() && Yty->isVectorTy() && Sty->isVectorTy() &&
-           "Lerp of vector and scalar is not supported.");
-
-    [[maybe_unused]] auto *XVecTy =
-        E->getArg(0)->getType()->getAs<VectorType>();
-    [[maybe_unused]] auto *YVecTy =
-        E->getArg(1)->getType()->getAs<VectorType>();
-    [[maybe_unused]] auto *SVecTy =
-        E->getArg(2)->getType()->getAs<VectorType>();
-    // A HLSLVectorTruncation should have happend
-    assert(XVecTy->getNumElements() == YVecTy->getNumElements() &&
-           XVecTy->getNumElements() == SVecTy->getNumElements() &&
-           "Lerp requires vectors to be of the same size.");
-    assert(XVecTy->getElementType()->isRealFloatingType() &&
-           XVecTy->getElementType() == YVecTy->getElementType() &&
-           XVecTy->getElementType() == SVecTy->getElementType() &&
-           "Lerp requires float vectors to be of the same type.");
+    if (!E->getArg(0)->getType()->hasFloatingRepresentation())
+      llvm_unreachable("lerp operand must have a float representation");
     return Builder.CreateIntrinsic(
-        /*ReturnType=*/Xty, Intrinsic::dx_lerp, ArrayRef<Value *>{X, Y, S},
-        nullptr, "dx.lerp");
+        /*ReturnType=*/X->getType(), Intrinsic::dx_lerp,
+        ArrayRef<Value *>{X, Y, S}, nullptr, "dx.lerp");
   }
   case Builtin::BI__builtin_hlsl_elementwise_frac: {
     Value *Op0 = EmitScalarExpr(E->getArg(0));
@@ -18062,6 +18129,20 @@ Value *CodeGenFunction::EmitHLSLBuiltinExpr(unsigned BuiltinID,
         /*ReturnType=*/Op0->getType(), Intrinsic::dx_frac,
         ArrayRef<Value *>{Op0}, nullptr, "dx.frac");
   }
+  case Builtin::BI__builtin_hlsl_elementwise_isinf: {
+    Value *Op0 = EmitScalarExpr(E->getArg(0));
+    llvm::Type *Xty = Op0->getType();
+    llvm::Type *retType = llvm::Type::getInt1Ty(this->getLLVMContext());
+    if (Xty->isVectorTy()) {
+      auto *XVecTy = E->getArg(0)->getType()->getAs<VectorType>();
+      retType = llvm::VectorType::get(
+          retType, ElementCount::getFixed(XVecTy->getNumElements()));
+    }
+    if (!E->getArg(0)->getType()->hasFloatingRepresentation())
+      llvm_unreachable("isinf operand must have a float representation");
+    return Builder.CreateIntrinsic(retType, Intrinsic::dx_isinf,
+                                   ArrayRef<Value *>{Op0}, nullptr, "dx.isinf");
+  }
   case Builtin::BI__builtin_hlsl_mad: {
     Value *M = EmitScalarExpr(E->getArg(0));
     Value *A = EmitScalarExpr(E->getArg(1));
@@ -18089,6 +18170,14 @@ Value *CodeGenFunction::EmitHLSLBuiltinExpr(unsigned BuiltinID,
         /*ReturnType=*/Op0->getType(), Intrinsic::dx_rcp,
         ArrayRef<Value *>{Op0}, nullptr, "dx.rcp");
   }
+  case Builtin::BI__builtin_hlsl_elementwise_rsqrt: {
+    Value *Op0 = EmitScalarExpr(E->getArg(0));
+    if (!E->getArg(0)->getType()->hasFloatingRepresentation())
+      llvm_unreachable("rsqrt operand must have a float representation");
+    return Builder.CreateIntrinsic(
+        /*ReturnType=*/Op0->getType(), Intrinsic::dx_rsqrt,
+        ArrayRef<Value *>{Op0}, nullptr, "dx.rsqrt");
+  }
   }
   return nullptr;
 }
diff --git a/clang/lib/CodeGen/CGCUDANV.cpp b/clang/lib/CodeGen/CGCUDANV.cpp
index 49f93451db7bbb..d3f2573fd5e38a 100644
--- a/clang/lib/CodeGen/CGCUDANV.cpp
+++ b/clang/lib/CodeGen/CGCUDANV.cpp
@@ -491,7 +491,8 @@ static void replaceManagedVar(llvm::GlobalVariable *Var,
       // variable with instructions.
       for (auto &&Op : WorkItem) {
         auto *CE = cast<llvm::ConstantExpr>(Op);
-        auto *NewInst = CE->getAsInstruction(I);
+        auto *NewInst = CE->getAsInstruction();
+        NewInst->insertBefore(*I->getParent(), I->getIterator());
         NewInst->replaceUsesOfWith(OldV, NewV);
         OldV = CE;
         NewV = NewInst;
diff --git a/clang/lib/CodeGen/CGDebugInfo.cpp b/clang/lib/CodeGen/CGDebugInfo.cpp
index c2c01439f2dc99..07ecaa81c47d84 100644
--- a/clang/lib/CodeGen/CGDebugInfo.cpp
+++ b/clang/lib/CodeGen/CGDebugInfo.cpp
@@ -3658,6 +3658,7 @@ llvm::DIType *CGDebugInfo::CreateTypeNode(QualType Ty, llvm::DIFile *Unit) {
   case Type::TemplateSpecialization:
     return CreateType(cast<TemplateSpecializationType>(Ty), Unit);
 
+  case Type::CountAttributed:
   case Type::Auto:
   case Type::Attributed:
   case Type::BTFTagAttributed:
diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp
index 59a7fe8925001c..85f5d739cef457 100644
--- a/clang/lib/CodeGen/CGExpr.cpp
+++ b/clang/lib/CodeGen/CGExpr.cpp
@@ -1136,38 +1136,19 @@ llvm::Value *CodeGenFunction::EmitCountedByFieldExpr(
 }
 
 const FieldDecl *CodeGenFunction::FindCountedByField(const FieldDecl *FD) {
-  if (!FD || !FD->hasAttr<CountedByAttr>())
+  if (!FD)
     return nullptr;
 
-  const auto *CBA = FD->getAttr<CountedByAttr>();
-  if (!CBA)
+  const auto *CAT = FD->getType()->getAs<CountAttributedType>();
+  if (!CAT)
     return nullptr;
 
-  auto GetNonAnonStructOrUnion =
-      [](const RecordDecl *RD) -> const RecordDecl * {
-    while (RD && RD->isAnonymousStructOrUnion()) {
-      const auto *R = dyn_cast<RecordDecl>(RD->getDeclContext());
-      if (!R)
-        return nullptr;
-      RD = R;
-    }
-    return RD;
-  };
-  const RecordDecl *EnclosingRD = GetNonAnonStructOrUnion(FD->getParent());
-  if (!EnclosingRD)
-    return nullptr;
-
-  DeclarationName DName(CBA->getCountedByField());
-  DeclContext::lookup_result Lookup = EnclosingRD->lookup(DName);
-
-  if (Lookup.empty())
-    return nullptr;
-
-  const NamedDecl *ND = Lookup.front();
-  if (const auto *IFD = dyn_cast<IndirectFieldDecl>(ND))
-    ND = IFD->getAnonField();
+  const auto *CountDRE = cast<DeclRefExpr>(CAT->getCountExpr());
+  const auto *CountDecl = CountDRE->getDecl();
+  if (const auto *IFD = dyn_cast<IndirectFieldDecl>(CountDecl))
+    CountDecl = IFD->getAnonField();
 
-  return dyn_cast<FieldDecl>(ND);
+  return dyn_cast<FieldDecl>(CountDecl);
 }
 
 void CodeGenFunction::EmitBoundsCheck(const Expr *E, const Expr *Base,
@@ -4259,7 +4240,7 @@ LValue CodeGenFunction::EmitArraySubscriptExpr(const ArraySubscriptExpr *E,
       if (const auto *ME = dyn_cast<MemberExpr>(Array);
           ME &&
           ME->isFlexibleArrayMemberLike(getContext(), StrictFlexArraysLevel) &&
-          ME->getMemberDecl()->hasAttr<CountedByAttr>()) {
+          ME->getMemberDecl()->getType()->isCountAttributedType()) {
         const FieldDecl *FAMDecl = dyn_cast<FieldDecl>(ME->getMemberDecl());
         if (const FieldDecl *CountFD = FindCountedByField(FAMDecl)) {
           if (std::optional<int64_t> Diff =
diff --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp b/clang/lib/CodeGen/CGStmtOpenMP.cpp
index 452ce6983f6ac1..f37ac549d10a82 100644
--- a/clang/lib/CodeGen/CGStmtOpenMP.cpp
+++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp
@@ -4749,10 +4749,10 @@ void CodeGenFunction::EmitOMPTaskBasedDirective(
           if (CGF.CGM.getCodeGenOpts().hasReducedDebugInfo())
             (void)DI->EmitDeclareOfAutoVariable(SharedVar, ContextValue,
                                                 CGF.Builder, false);
-          llvm::Instruction &Last = CGF.Builder.GetInsertBlock()->back();
           // Get the call dbg.declare instruction we just created and update
           // its DIExpression to add offset to base address.
-          if (auto DDI = dyn_cast<llvm::DbgVariableIntrinsic>(&Last)) {
+          auto UpdateExpr = [](llvm::LLVMContext &Ctx, auto *Declare,
+                               unsigned Offset) {
             SmallVector<uint64_t, 8> Ops;
             // Add offset to the base address if non zero.
             if (Offset) {
@@ -4760,9 +4760,21 @@ void CodeGenFunction::EmitOMPTaskBasedDirective(
               Ops.push_back(Offset);
             }
             Ops.push_back(llvm::dwarf::DW_OP_deref);
-            auto &Ctx = DDI->getContext();
-            llvm::DIExpression *DIExpr = llvm::DIExpression::get(Ctx, Ops);
-            Last.setOperand(2, llvm::MetadataAsValue::get(Ctx, DIExpr));
+            Declare->setExpression(llvm::DIExpression::get(Ctx, Ops));
+          };
+          llvm::Instruction &Last = CGF.Builder.GetInsertBlock()->back();
+          if (auto DDI = dyn_cast<llvm::DbgVariableIntrinsic>(&Last))
+            UpdateExpr(DDI->getContext(), DDI, Offset);
+          // If we're emitting using the new debug info format into a block
+          // without a terminator, the record will be "trailing".
+          assert(!Last.isTerminator() && "unexpected terminator");
+          if (auto *Marker =
+                  CGF.Builder.GetInsertBlock()->getTrailingDbgRecords()) {
+            for (llvm::DbgVariableRecord &DVR : llvm::reverse(
+                     llvm::filterDbgVars(Marker->getDbgRecordRange()))) {
+              UpdateExpr(Last.getContext(), &DVR, Offset);
+              break;
+            }
           }
         }
       }
diff --git a/clang/lib/CodeGen/CGVTT.cpp b/clang/lib/CodeGen/CGVTT.cpp
index 1d3f14f1c5344d..d2376b14dd5826 100644
--- a/clang/lib/CodeGen/CGVTT.cpp
+++ b/clang/lib/CodeGen/CGVTT.cpp
@@ -77,9 +77,18 @@ CodeGenVTables::EmitVTTDefinition(llvm::GlobalVariable *VTT,
        llvm::ConstantInt::get(CGM.Int32Ty, AddressPoint.AddressPointIndex),
      };
 
+     // Add inrange attribute to indicate that only the VTableIndex can be
+     // accessed.
+     unsigned ComponentSize =
+         CGM.getDataLayout().getTypeAllocSize(getVTableComponentType());
+     unsigned VTableSize = CGM.getDataLayout().getTypeAllocSize(
+         cast<llvm::StructType>(VTable->getValueType())
+             ->getElementType(AddressPoint.VTableIndex));
+     unsigned Offset = ComponentSize * AddressPoint.AddressPointIndex;
+     llvm::ConstantRange InRange(llvm::APInt(32, -Offset, true),
+                                 llvm::APInt(32, VTableSize - Offset, true));
      llvm::Constant *Init = llvm::ConstantExpr::getGetElementPtr(
-         VTable->getValueType(), VTable, Idxs, /*InBounds=*/true,
-         /*InRangeIndex=*/1);
+         VTable->getValueType(), VTable, Idxs, /*InBounds=*/true, InRange);
 
      VTTComponents.push_back(Init);
   }
diff --git a/clang/lib/CodeGen/CodeGenFunction.cpp b/clang/lib/CodeGen/CodeGenFunction.cpp
index 4a3ff49b0007a3..fad26c43da3d34 100644
--- a/clang/lib/CodeGen/CodeGenFunction.cpp
+++ b/clang/lib/CodeGen/CodeGenFunction.cpp
@@ -1361,6 +1361,8 @@ void CodeGenFunction::GenerateCode(GlobalDecl GD, llvm::Function *Fn,
   FunctionArgList Args;
   QualType ResTy = BuildFunctionArgList(GD, Args);
 
+  CGM.getTargetCodeGenInfo().checkFunctionABI(CGM, FD);
+
   if (FD->isInlineBuiltinDeclaration()) {
     // When generating code for a builtin with an inline declaration, use a
     // mangled name to hold the actual body, while keeping an external
@@ -2407,6 +2409,7 @@ void CodeGenFunction::EmitVariablyModifiedType(QualType type) {
     case Type::BTFTagAttributed:
     case Type::SubstTemplateTypeParm:
     case Type::MacroQualified:
+    case Type::CountAttributed:
       // Keep walking after single level desugaring.
       type = type.getSingleStepDesugaredType(getContext());
       break;
diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp
index 8ceecff28cbc63..bb26bfcddaeb78 100644
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -145,6 +145,8 @@ createTargetCodeGenInfo(CodeGenModule &CGM) {
       Kind = AArch64ABIKind::DarwinPCS;
     else if (Triple.isOSWindows())
       return createWindowsAArch64TargetCodeGenInfo(CGM, AArch64ABIKind::Win64);
+    else if (Target.getABI() == "aapcs-soft")
+      Kind = AArch64ABIKind::AAPCSSoft;
 
     return createAArch64TargetCodeGenInfo(CGM, Kind);
   }
diff --git a/clang/lib/CodeGen/ItaniumCXXABI.cpp b/clang/lib/CodeGen/ItaniumCXXABI.cpp
index 60b45ee78d9316..bdd53a192f828a 100644
--- a/clang/lib/CodeGen/ItaniumCXXABI.cpp
+++ b/clang/lib/CodeGen/ItaniumCXXABI.cpp
@@ -1885,19 +1885,27 @@ ItaniumCXXABI::getVTableAddressPoint(BaseSubobject Base,
 
   // Find the appropriate vtable within the vtable group, and the address point
   // within that vtable.
+  const VTableLayout &Layout =
+      CGM.getItaniumVTableContext().getVTableLayout(VTableClass);
   VTableLayout::AddressPointLocation AddressPoint =
-      CGM.getItaniumVTableContext()
-          .getVTableLayout(VTableClass)
-          .getAddressPoint(Base);
+      Layout.getAddressPoint(Base);
   llvm::Value *Indices[] = {
     llvm::ConstantInt::get(CGM.Int32Ty, 0),
     llvm::ConstantInt::get(CGM.Int32Ty, AddressPoint.VTableIndex),
     llvm::ConstantInt::get(CGM.Int32Ty, AddressPoint.AddressPointIndex),
   };
 
-  return llvm::ConstantExpr::getGetElementPtr(VTable->getValueType(), VTable,
-                                              Indices, /*InBounds=*/true,
-                                              /*InRangeIndex=*/1);
+  // Add inrange attribute to indicate that only the VTableIndex can be
+  // accessed.
+  unsigned ComponentSize =
+      CGM.getDataLayout().getTypeAllocSize(CGM.getVTableComponentType());
+  unsigned VTableSize =
+      ComponentSize * Layout.getVTableSize(AddressPoint.VTableIndex);
+  unsigned Offset = ComponentSize * AddressPoint.AddressPointIndex;
+  llvm::ConstantRange InRange(llvm::APInt(32, -Offset, true),
+                              llvm::APInt(32, VTableSize - Offset, true));
+  return llvm::ConstantExpr::getGetElementPtr(
+      VTable->getValueType(), VTable, Indices, /*InBounds=*/true, InRange);
 }
 
 // Check whether all the non-inline virtual methods for the class have the
diff --git a/clang/lib/CodeGen/ModuleBuilder.cpp b/clang/lib/CodeGen/ModuleBuilder.cpp
index 3594f4c66e6774..df85295cfb2e29 100644
--- a/clang/lib/CodeGen/ModuleBuilder.cpp
+++ b/clang/lib/CodeGen/ModuleBuilder.cpp
@@ -180,7 +180,7 @@ namespace {
 
     bool HandleTopLevelDecl(DeclGroupRef DG) override {
       // FIXME: Why not return false and abort parsing?
-      if (Diags.hasErrorOccurred())
+      if (Diags.hasUnrecoverableErrorOccurred())
         return true;
 
       HandlingTopLevelDeclRAII HandlingDecl(*this);
@@ -206,7 +206,7 @@ namespace {
     }
 
     void HandleInlineFunctionDefinition(FunctionDecl *D) override {
-      if (Diags.hasErrorOccurred())
+      if (Diags.hasUnrecoverableErrorOccurred())
         return;
 
       assert(D->doesThisDeclarationHaveABody());
@@ -233,7 +233,7 @@ namespace {
     /// client hack on the type, which can occur at any point in the file
     /// (because these can be defined in declspecs).
     void HandleTagDeclDefinition(TagDecl *D) override {
-      if (Diags.hasErrorOccurred())
+      if (Diags.hasUnrecoverableErrorOccurred())
         return;
 
       // Don't allow re-entrant calls to CodeGen triggered by PCH
@@ -269,7 +269,7 @@ namespace {
     }
 
     void HandleTagDeclRequiredDefinition(const TagDecl *D) override {
-      if (Diags.hasErrorOccurred())
+      if (Diags.hasUnrecoverableErrorOccurred())
         return;
 
       // Don't allow re-entrant calls to CodeGen triggered by PCH
@@ -283,7 +283,7 @@ namespace {
 
     void HandleTranslationUnit(ASTContext &Ctx) override {
       // Release the Builder when there is no error.
-      if (!Diags.hasErrorOccurred() && Builder)
+      if (!Diags.hasUnrecoverableErrorOccurred() && Builder)
         Builder->Release();
 
       // If there are errors before or when releasing the Builder, reset
@@ -297,14 +297,14 @@ namespace {
     }
 
     void AssignInheritanceModel(CXXRecordDecl *RD) override {
-      if (Diags.hasErrorOccurred())
+      if (Diags.hasUnrecoverableErrorOccurred())
         return;
 
       Builder->RefreshTypeCacheForClass(RD);
     }
 
     void CompleteTentativeDefinition(VarDecl *D) override {
-      if (Diags.hasErrorOccurred())
+      if (Diags.hasUnrecoverableErrorOccurred())
         return;
 
       Builder->EmitTentativeDefinition(D);
@@ -315,7 +315,7 @@ namespace {
     }
 
     void HandleVTable(CXXRecordDecl *RD) override {
-      if (Diags.hasErrorOccurred())
+      if (Diags.hasUnrecoverableErrorOccurred())
         return;
 
       Builder->EmitVTable(RD);
diff --git a/clang/lib/CodeGen/TargetInfo.h b/clang/lib/CodeGen/TargetInfo.h
index 7682f197041c74..6893b50a3cfe90 100644
--- a/clang/lib/CodeGen/TargetInfo.h
+++ b/clang/lib/CodeGen/TargetInfo.h
@@ -84,6 +84,11 @@ class TargetCodeGenInfo {
   /// Provides a convenient hook to handle extra target-specific globals.
   virtual void emitTargetGlobals(CodeGen::CodeGenModule &CGM) const {}
 
+  /// Any further codegen related checks that need to be done on a function
+  /// signature in a target specific manner.
+  virtual void checkFunctionABI(CodeGenModule &CGM,
+                                const FunctionDecl *Decl) const {}
+
   /// Any further codegen related checks that need to be done on a function call
   /// in a target specific manner.
   virtual void checkFunctionCallABI(CodeGenModule &CGM, SourceLocation CallLoc,
@@ -416,6 +421,7 @@ enum class AArch64ABIKind {
   AAPCS = 0,
   DarwinPCS,
   Win64,
+  AAPCSSoft,
 };
 
 std::unique_ptr<TargetCodeGenInfo>
diff --git a/clang/lib/CodeGen/Targets/AArch64.cpp b/clang/lib/CodeGen/Targets/AArch64.cpp
index 85117366de0ee8..4c32f510101f04 100644
--- a/clang/lib/CodeGen/Targets/AArch64.cpp
+++ b/clang/lib/CodeGen/Targets/AArch64.cpp
@@ -27,6 +27,8 @@ class AArch64ABIInfo : public ABIInfo {
   AArch64ABIInfo(CodeGenTypes &CGT, AArch64ABIKind Kind)
       : ABIInfo(CGT), Kind(Kind) {}
 
+  bool isSoftFloat() const { return Kind == AArch64ABIKind::AAPCSSoft; }
+
 private:
   AArch64ABIKind getABIKind() const { return Kind; }
   bool isDarwinPCS() const { return Kind == AArch64ABIKind::DarwinPCS; }
@@ -55,8 +57,8 @@ class AArch64ABIInfo : public ABIInfo {
   Address EmitDarwinVAArg(Address VAListAddr, QualType Ty,
                           CodeGenFunction &CGF) const;
 
-  Address EmitAAPCSVAArg(Address VAListAddr, QualType Ty,
-                         CodeGenFunction &CGF) const;
+  Address EmitAAPCSVAArg(Address VAListAddr, QualType Ty, CodeGenFunction &CGF,
+                         AArch64ABIKind Kind) const;
 
   Address EmitVAArg(CodeGenFunction &CGF, Address VAListAddr,
                     QualType Ty) const override {
@@ -67,7 +69,7 @@ class AArch64ABIInfo : public ABIInfo {
 
     return Kind == AArch64ABIKind::Win64 ? EmitMSVAArg(CGF, VAListAddr, Ty)
            : isDarwinPCS()               ? EmitDarwinVAArg(VAListAddr, Ty, CGF)
-                                         : EmitAAPCSVAArg(VAListAddr, Ty, CGF);
+                           : EmitAAPCSVAArg(VAListAddr, Ty, CGF, Kind);
   }
 
   Address EmitMSVAArg(CodeGenFunction &CGF, Address VAListAddr,
@@ -163,6 +165,9 @@ class AArch64TargetCodeGenInfo : public TargetCodeGenInfo {
     return TargetCodeGenInfo::isScalarizableAsmOperand(CGF, Ty);
   }
 
+  void checkFunctionABI(CodeGenModule &CGM,
+                        const FunctionDecl *Decl) const override;
+
   void checkFunctionCallABI(CodeGenModule &CGM, SourceLocation CallLoc,
                             const FunctionDecl *Caller,
                             const FunctionDecl *Callee,
@@ -494,6 +499,11 @@ bool AArch64SwiftABIInfo::isLegalVectorType(CharUnits VectorSize,
 }
 
 bool AArch64ABIInfo::isHomogeneousAggregateBaseType(QualType Ty) const {
+  // For the soft-float ABI variant, no types are considered to be homogeneous
+  // aggregates.
+  if (Kind == AArch64ABIKind::AAPCSSoft)
+    return false;
+
   // Homogeneous aggregates for AAPCS64 must have base types of a floating
   // point type or a short-vector type. This is the same as the 32-bit ABI,
   // but with the difference that any floating-point type is allowed,
@@ -525,7 +535,8 @@ bool AArch64ABIInfo::isZeroLengthBitfieldPermittedInHomogeneousAggregate()
 }
 
 Address AArch64ABIInfo::EmitAAPCSVAArg(Address VAListAddr, QualType Ty,
-                                       CodeGenFunction &CGF) const {
+                                       CodeGenFunction &CGF,
+                                       AArch64ABIKind Kind) const {
   ABIArgInfo AI = classifyArgumentType(Ty, /*IsVariadic=*/true,
                                        CGF.CurFnInfo->getCallingConvention());
   // Empty records are ignored for parameter passing purposes.
@@ -550,7 +561,8 @@ Address AArch64ABIInfo::EmitAAPCSVAArg(Address VAListAddr, QualType Ty,
     BaseTy = ArrTy->getElementType();
     NumRegs = ArrTy->getNumElements();
   }
-  bool IsFPR = BaseTy->isFloatingPointTy() || BaseTy->isVectorTy();
+  bool IsFPR = Kind != AArch64ABIKind::AAPCSSoft &&
+               (BaseTy->isFloatingPointTy() || BaseTy->isVectorTy());
 
   // The AArch64 va_list type and handling is specified in the Procedure Call
   // Standard, section B.4:
@@ -841,6 +853,34 @@ static bool isStreamingCompatible(const FunctionDecl *F) {
   return false;
 }
 
+void AArch64TargetCodeGenInfo::checkFunctionABI(
+    CodeGenModule &CGM, const FunctionDecl *FuncDecl) const {
+  const AArch64ABIInfo &ABIInfo = getABIInfo<AArch64ABIInfo>();
+  const TargetInfo &TI = ABIInfo.getContext().getTargetInfo();
+
+  // If we are using a hard-float ABI, but do not have floating point
+  // registers, then report an error for any function arguments or returns
+  // which would be passed in floating-pint registers.
+  auto CheckType = [&CGM, &TI, &ABIInfo](const QualType &Ty,
+                                         const NamedDecl *D) {
+    const Type *HABase = nullptr;
+    uint64_t HAMembers = 0;
+    if (Ty->isFloatingType() || Ty->isVectorType() ||
+        ABIInfo.isHomogeneousAggregate(Ty, HABase, HAMembers)) {
+      CGM.getDiags().Report(D->getLocation(),
+                            diag::err_target_unsupported_type_for_abi)
+          << D->getDeclName() << Ty << TI.getABI();
+    }
+  };
+
+  if (!TI.hasFeature("fp") && !ABIInfo.isSoftFloat()) {
+    CheckType(FuncDecl->getReturnType(), FuncDecl);
+    for (ParmVarDecl *PVD : FuncDecl->parameters()) {
+      CheckType(PVD->getType(), PVD);
+    }
+  }
+}
+
 void AArch64TargetCodeGenInfo::checkFunctionCallABI(
     CodeGenModule &CGM, SourceLocation CallLoc, const FunctionDecl *Caller,
     const FunctionDecl *Callee, const CallArgList &Args) const {
diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp
index 190782a79a2456..1daf588142b3b4 100644
--- a/clang/lib/Driver/Driver.cpp
+++ b/clang/lib/Driver/Driver.cpp
@@ -87,6 +87,7 @@
 #include "llvm/Support/Process.h"
 #include "llvm/Support/Program.h"
 #include "llvm/Support/RISCVISAInfo.h"
+#include "llvm/Support/Regex.h"
 #include "llvm/Support/StringSaver.h"
 #include "llvm/Support/VirtualFileSystem.h"
 #include "llvm/Support/raw_ostream.h"
@@ -2196,6 +2197,12 @@ bool Driver::HandleImmediateArgs(const Compilation &C) {
     return false;
   }
 
+  if (C.getArgs().hasArg(options::OPT_print_std_module_manifest_path)) {
+    llvm::outs() << GetStdModuleManifestPath(C, C.getDefaultToolChain())
+                 << '\n';
+    return false;
+  }
+
   if (C.getArgs().hasArg(options::OPT_print_runtime_dir)) {
     if (std::optional<std::string> RuntimePath = TC.getRuntimePath())
       llvm::outs() << *RuntimePath << '\n';
@@ -4640,7 +4647,8 @@ Action *Driver::BuildOffloadingActions(Compilation &C,
 
   // All kinds exit now in device-only mode except for non-RDC mode HIP.
   if (offloadDeviceOnly() &&
-      (!C.isOffloadingHostKind(Action::OFK_HIP) ||
+      (getFinalPhase(Args) == phases::Preprocess ||
+       !C.isOffloadingHostKind(Action::OFK_HIP) ||
        !Args.hasFlag(options::OPT_gpu_bundle_output,
                      options::OPT_no_gpu_bundle_output, true) ||
        Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc, false)))
@@ -6185,6 +6193,44 @@ std::string Driver::GetProgramPath(StringRef Name, const ToolChain &TC) const {
   return std::string(Name);
 }
 
+std::string Driver::GetStdModuleManifestPath(const Compilation &C,
+                                             const ToolChain &TC) const {
+  std::string error = "<NOT PRESENT>";
+
+  switch (TC.GetCXXStdlibType(C.getArgs())) {
+  case ToolChain::CST_Libcxx: {
+    std::string lib = GetFilePath("libc++.so", TC);
+
+    // Note when there are multiple flavours of libc++ the module json needs to
+    // look at the command-line arguments for the proper json.
+    // These flavours do not exist at the moment, but there are plans to
+    // provide a variant that is built with sanitizer instrumentation enabled.
+
+    // For example
+    //  StringRef modules = [&] {
+    //    const SanitizerArgs &Sanitize = TC.getSanitizerArgs(C.getArgs());
+    //    if (Sanitize.needsAsanRt())
+    //      return "modules-asan.json";
+    //    return "modules.json";
+    //  }();
+
+    SmallString<128> path(lib.begin(), lib.end());
+    llvm::sys::path::remove_filename(path);
+    llvm::sys::path::append(path, "modules.json");
+    if (TC.getVFS().exists(path))
+      return static_cast<std::string>(path);
+
+    return error;
+  }
+
+  case ToolChain::CST_Libstdcxx:
+    // libstdc++ does not provide Standard library modules yet.
+    return error;
+  }
+
+  return error;
+}
+
 std::string Driver::GetTemporaryPath(StringRef Prefix, StringRef Suffix) const {
   SmallString<128> Path;
   std::error_code EC = llvm::sys::fs::createTemporaryFile(Prefix, Suffix, Path);
@@ -6677,3 +6723,131 @@ llvm::Error driver::expandResponseFiles(SmallVectorImpl<const char *> &Args,
 
   return llvm::Error::success();
 }
+
+static const char *GetStableCStr(llvm::StringSet<> &SavedStrings, StringRef S) {
+  return SavedStrings.insert(S).first->getKeyData();
+}
+
+/// Apply a list of edits to the input argument lists.
+///
+/// The input string is a space separated list of edits to perform,
+/// they are applied in order to the input argument lists. Edits
+/// should be one of the following forms:
+///
+///  '#': Silence information about the changes to the command line arguments.
+///
+///  '^': Add FOO as a new argument at the beginning of the command line.
+///
+///  '+': Add FOO as a new argument at the end of the command line.
+///
+///  's/XXX/YYY/': Substitute the regular expression XXX with YYY in the command
+///  line.
+///
+///  'xOPTION': Removes all instances of the literal argument OPTION.
+///
+///  'XOPTION': Removes all instances of the literal argument OPTION,
+///  and the following argument.
+///
+///  'Ox': Removes all flags matching 'O' or 'O[sz0-9]' and adds 'Ox'
+///  at the end of the command line.
+///
+/// \param OS - The stream to write edit information to.
+/// \param Args - The vector of command line arguments.
+/// \param Edit - The override command to perform.
+/// \param SavedStrings - Set to use for storing string representations.
+static void applyOneOverrideOption(raw_ostream &OS,
+                                   SmallVectorImpl<const char *> &Args,
+                                   StringRef Edit,
+                                   llvm::StringSet<> &SavedStrings) {
+  // This does not need to be efficient.
+
+  if (Edit[0] == '^') {
+    const char *Str = GetStableCStr(SavedStrings, Edit.substr(1));
+    OS << "### Adding argument " << Str << " at beginning\n";
+    Args.insert(Args.begin() + 1, Str);
+  } else if (Edit[0] == '+') {
+    const char *Str = GetStableCStr(SavedStrings, Edit.substr(1));
+    OS << "### Adding argument " << Str << " at end\n";
+    Args.push_back(Str);
+  } else if (Edit[0] == 's' && Edit[1] == '/' && Edit.ends_with("/") &&
+             Edit.slice(2, Edit.size() - 1).contains('/')) {
+    StringRef MatchPattern = Edit.substr(2).split('/').first;
+    StringRef ReplPattern = Edit.substr(2).split('/').second;
+    ReplPattern = ReplPattern.slice(0, ReplPattern.size() - 1);
+
+    for (unsigned i = 1, e = Args.size(); i != e; ++i) {
+      // Ignore end-of-line response file markers
+      if (Args[i] == nullptr)
+        continue;
+      std::string Repl = llvm::Regex(MatchPattern).sub(ReplPattern, Args[i]);
+
+      if (Repl != Args[i]) {
+        OS << "### Replacing '" << Args[i] << "' with '" << Repl << "'\n";
+        Args[i] = GetStableCStr(SavedStrings, Repl);
+      }
+    }
+  } else if (Edit[0] == 'x' || Edit[0] == 'X') {
+    auto Option = Edit.substr(1);
+    for (unsigned i = 1; i < Args.size();) {
+      if (Option == Args[i]) {
+        OS << "### Deleting argument " << Args[i] << '\n';
+        Args.erase(Args.begin() + i);
+        if (Edit[0] == 'X') {
+          if (i < Args.size()) {
+            OS << "### Deleting argument " << Args[i] << '\n';
+            Args.erase(Args.begin() + i);
+          } else
+            OS << "### Invalid X edit, end of command line!\n";
+        }
+      } else
+        ++i;
+    }
+  } else if (Edit[0] == 'O') {
+    for (unsigned i = 1; i < Args.size();) {
+      const char *A = Args[i];
+      // Ignore end-of-line response file markers
+      if (A == nullptr)
+        continue;
+      if (A[0] == '-' && A[1] == 'O' &&
+          (A[2] == '\0' || (A[3] == '\0' && (A[2] == 's' || A[2] == 'z' ||
+                                             ('0' <= A[2] && A[2] <= '9'))))) {
+        OS << "### Deleting argument " << Args[i] << '\n';
+        Args.erase(Args.begin() + i);
+      } else
+        ++i;
+    }
+    OS << "### Adding argument " << Edit << " at end\n";
+    Args.push_back(GetStableCStr(SavedStrings, '-' + Edit.str()));
+  } else {
+    OS << "### Unrecognized edit: " << Edit << "\n";
+  }
+}
+
+void driver::applyOverrideOptions(SmallVectorImpl<const char *> &Args,
+                                  const char *OverrideStr,
+                                  llvm::StringSet<> &SavedStrings,
+                                  raw_ostream *OS) {
+  if (!OS)
+    OS = &llvm::nulls();
+
+  if (OverrideStr[0] == '#') {
+    ++OverrideStr;
+    OS = &llvm::nulls();
+  }
+
+  *OS << "### CCC_OVERRIDE_OPTIONS: " << OverrideStr << "\n";
+
+  // This does not need to be efficient.
+
+  const char *S = OverrideStr;
+  while (*S) {
+    const char *End = ::strchr(S, ' ');
+    if (!End)
+      End = S + strlen(S);
+    if (End != S)
+      applyOneOverrideOption(*OS, Args, std::string(S, End), SavedStrings);
+    S = End;
+    if (*S != '\0')
+      ++S;
+  }
+}
diff --git a/clang/lib/Driver/SanitizerArgs.cpp b/clang/lib/Driver/SanitizerArgs.cpp
index 56d497eb4c32b8..8bfe9f02a091d1 100644
--- a/clang/lib/Driver/SanitizerArgs.cpp
+++ b/clang/lib/Driver/SanitizerArgs.cpp
@@ -487,6 +487,14 @@ SanitizerArgs::SanitizerArgs(const ToolChain &TC,
         Add &= ~NotAllowedWithExecuteOnly;
       if (CfiCrossDso)
         Add &= ~SanitizerKind::CFIMFCall;
+      // -fsanitize=undefined does not expand to signed-integer-overflow in
+      // -fwrapv (implied by -fno-strict-overflow) mode.
+      if (Add & SanitizerKind::UndefinedGroup) {
+        bool S = Args.hasFlagNoClaim(options::OPT_fno_strict_overflow,
+                                     options::OPT_fstrict_overflow, false);
+        if (Args.hasFlagNoClaim(options::OPT_fwrapv, options::OPT_fno_wrapv, S))
+          Add &= ~SanitizerKind::SignedIntegerOverflow;
+      }
       Add &= Supported;
 
       if (Add & SanitizerKind::Fuzzer)
diff --git a/clang/lib/Driver/ToolChain.cpp b/clang/lib/Driver/ToolChain.cpp
index 08b1fd01b3c0ac..03450fc0f57b93 100644
--- a/clang/lib/Driver/ToolChain.cpp
+++ b/clang/lib/Driver/ToolChain.cpp
@@ -453,12 +453,6 @@ ToolChain::getDefaultUnwindTableLevel(const ArgList &Args) const {
   return UnwindTableLevel::None;
 }
 
-unsigned ToolChain::GetDefaultDwarfVersion() const {
-  // TODO: Remove the RISC-V special case when R_RISCV_SET_ULEB128 linker
-  // support becomes more widely available.
-  return getTriple().isRISCV() ? 4 : 5;
-}
-
 Tool *ToolChain::getClang() const {
   if (!Clang)
     Clang.reset(new tools::Clang(*this, useIntegratedBackend()));
diff --git a/clang/lib/Driver/ToolChains/Arch/AArch64.cpp b/clang/lib/Driver/ToolChains/Arch/AArch64.cpp
index aa3b80cb16e55a..3e6e29584df3ac 100644
--- a/clang/lib/Driver/ToolChains/Arch/AArch64.cpp
+++ b/clang/lib/Driver/ToolChains/Arch/AArch64.cpp
@@ -321,9 +321,11 @@ void aarch64::getAArch64TargetFeatures(const Driver &D,
     }
   }
 
-  if (Arg *A = Args.getLastArg(options::OPT_mno_unaligned_access,
-                               options::OPT_munaligned_access)) {
-    if (A->getOption().matches(options::OPT_mno_unaligned_access))
+  if (Arg *A = Args.getLastArg(
+          options::OPT_mstrict_align, options::OPT_mno_strict_align,
+          options::OPT_mno_unaligned_access, options::OPT_munaligned_access)) {
+    if (A->getOption().matches(options::OPT_mstrict_align) ||
+        A->getOption().matches(options::OPT_mno_unaligned_access))
       Features.push_back("+strict-align");
   } else if (Triple.isOSOpenBSD())
     Features.push_back("+strict-align");
diff --git a/clang/lib/Driver/ToolChains/Arch/ARM.cpp b/clang/lib/Driver/ToolChains/Arch/ARM.cpp
index ba158b92bb44ba..a68368c4758651 100644
--- a/clang/lib/Driver/ToolChains/Arch/ARM.cpp
+++ b/clang/lib/Driver/ToolChains/Arch/ARM.cpp
@@ -868,12 +868,16 @@ llvm::ARM::FPUKind arm::getARMTargetFeatures(const Driver &D,
     }
   }
 
-  // Kernel code has more strict alignment requirements.
-  if (KernelOrKext) {
-    Features.push_back("+strict-align");
-  } else if (Arg *A = Args.getLastArg(options::OPT_mno_unaligned_access,
-                                      options::OPT_munaligned_access)) {
-    if (A->getOption().matches(options::OPT_munaligned_access)) {
+  if (Arg *A = Args.getLastArg(options::OPT_mno_unaligned_access,
+                                      options::OPT_munaligned_access,
+                                      options::OPT_mstrict_align,
+                                      options::OPT_mno_strict_align)) {
+    // Kernel code has more strict alignment requirements.
+    if (KernelOrKext ||
+        A->getOption().matches(options::OPT_mno_unaligned_access) ||
+        A->getOption().matches(options::OPT_mstrict_align)) {
+      Features.push_back("+strict-align");
+    } else {
       // No v6M core supports unaligned memory access (v6M ARM ARM A3.2).
       if (Triple.getSubArch() == llvm::Triple::SubArchType::ARMSubArch_v6m)
         D.Diag(diag::err_target_unsupported_unaligned) << "v6m";
@@ -881,8 +885,7 @@ llvm::ARM::FPUKind arm::getARMTargetFeatures(const Driver &D,
       // access either.
       else if (Triple.getSubArch() == llvm::Triple::SubArchType::ARMSubArch_v8m_baseline)
         D.Diag(diag::err_target_unsupported_unaligned) << "v8m.base";
-    } else
-      Features.push_back("+strict-align");
+    }
   } else {
     // Assume pre-ARMv6 doesn't support unaligned accesses.
     //
diff --git a/clang/lib/Driver/ToolChains/Arch/LoongArch.cpp b/clang/lib/Driver/ToolChains/Arch/LoongArch.cpp
index 31153a67ad2840..d23f9b36efb9ac 100644
--- a/clang/lib/Driver/ToolChains/Arch/LoongArch.cpp
+++ b/clang/lib/Driver/ToolChains/Arch/LoongArch.cpp
@@ -165,10 +165,9 @@ void loongarch::getLoongArchTargetFeatures(const Driver &D,
     }
   }
 
-  // Select the `ual` feature determined by -m[no-]unaligned-access
-  // or the alias -m[no-]strict-align.
-  AddTargetFeature(Args, Features, options::OPT_munaligned_access,
-                   options::OPT_mno_unaligned_access, "ual");
+  // Select the `ual` feature determined by -m[no-]strict-align.
+  AddTargetFeature(Args, Features, options::OPT_mno_strict_align,
+                   options::OPT_mstrict_align, "ual");
 
   // Accept but warn about these TargetSpecific options.
   if (Arg *A = Args.getLastArgNoClaim(options::OPT_mabi_EQ))
diff --git a/clang/lib/Driver/ToolChains/Arch/Mips.cpp b/clang/lib/Driver/ToolChains/Arch/Mips.cpp
index fe9d112b8800b1..74a8874a3ea2b7 100644
--- a/clang/lib/Driver/ToolChains/Arch/Mips.cpp
+++ b/clang/lib/Driver/ToolChains/Arch/Mips.cpp
@@ -341,6 +341,15 @@ void mips::getMIPSTargetFeatures(const Driver &D, const llvm::Triple &Triple,
                    "dspr2");
   AddTargetFeature(Args, Features, options::OPT_mmsa, options::OPT_mno_msa,
                    "msa");
+  if (Arg *A = Args.getLastArg(
+          options::OPT_mstrict_align, options::OPT_mno_strict_align,
+          options::OPT_mno_unaligned_access, options::OPT_munaligned_access)) {
+    if (A->getOption().matches(options::OPT_mstrict_align) ||
+        A->getOption().matches(options::OPT_mno_unaligned_access))
+      Features.push_back(Args.MakeArgString("+strict-align"));
+    else
+      Features.push_back(Args.MakeArgString("-strict-align"));
+  }
 
   // Add the last -mfp32/-mfpxx/-mfp64, if none are given and the ABI is O32
   // pass -mfpxx, or if none are given and fp64a is default, pass fp64 and
diff --git a/clang/lib/Driver/ToolChains/Arch/RISCV.cpp b/clang/lib/Driver/ToolChains/Arch/RISCV.cpp
index a46b44f9ad2b2d..5165bccc6d7e30 100644
--- a/clang/lib/Driver/ToolChains/Arch/RISCV.cpp
+++ b/clang/lib/Driver/ToolChains/Arch/RISCV.cpp
@@ -167,9 +167,9 @@ void riscv::getRISCVTargetFeatures(const Driver &D, const llvm::Triple &Triple,
     Features.push_back("-relax");
   }
 
-  // -mno-unaligned-access is default, unless -munaligned-access is specified.
-  AddTargetFeature(Args, Features, options::OPT_munaligned_access,
-                   options::OPT_mno_unaligned_access, "fast-unaligned-access");
+  // -mstrict-align is default, unless -mno-strict-align is specified.
+  AddTargetFeature(Args, Features, options::OPT_mno_strict_align,
+                   options::OPT_mstrict_align, "fast-unaligned-access");
 
   // Now add any that the user explicitly requested on the command line,
   // which may override the defaults.
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index 3a7a1cf99c79ac..bcdf2737bc7ae0 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -7199,6 +7199,10 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
   // -fno-common is the default, set -fcommon only when that flag is set.
   Args.addOptInFlag(CmdArgs, options::OPT_fcommon, options::OPT_fno_common);
 
+  if (Args.hasFlag(options::OPT_fptrauth_intrinsics,
+                   options::OPT_fno_ptrauth_intrinsics, false))
+    CmdArgs.push_back("-fptrauth-intrinsics");
+
   // -fsigned-bitfields is default, and clang doesn't yet support
   // -funsigned-bitfields.
   if (!Args.hasFlag(options::OPT_fsigned_bitfields,
@@ -8477,6 +8481,14 @@ void ClangAs::ConstructJob(Compilation &C, const JobAction &JA,
   case llvm::Triple::riscv64:
     AddRISCVTargetArgs(Args, CmdArgs);
     break;
+
+  case llvm::Triple::hexagon:
+    if (Args.hasFlag(options::OPT_mdefault_build_attributes,
+                     options::OPT_mno_default_build_attributes, true)) {
+      CmdArgs.push_back("-mllvm");
+      CmdArgs.push_back("-hexagon-add-build-attributes");
+    }
+    break;
   }
 
   // Consume all the warning flags. Usually this would be handled more
diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp
index 100e71245394d8..4478865313636d 100644
--- a/clang/lib/Driver/ToolChains/CommonArgs.cpp
+++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp
@@ -740,7 +740,8 @@ bool tools::isTLSDESCEnabled(const ToolChain &TC,
     SupportedArgument = V == "desc" || V == "trad";
     EnableTLSDESC = V == "desc";
   } else if (Triple.isX86()) {
-    SupportedArgument = V == "gnu";
+    SupportedArgument = V == "gnu" || V == "gnu2";
+    EnableTLSDESC = V == "gnu2";
   } else {
     Unsupported = true;
   }
@@ -2829,7 +2830,7 @@ void tools::addHIPRuntimeLibArgs(const ToolChain &TC, Compilation &C,
                                  llvm::opt::ArgStringList &CmdArgs) {
   if ((C.getActiveOffloadKinds() & Action::OFK_HIP) &&
       !Args.hasArg(options::OPT_nostdlib) &&
-      !Args.hasArg(options::OPT_no_hip_rt)) {
+      !Args.hasArg(options::OPT_no_hip_rt) && !Args.hasArg(options::OPT_r)) {
     TC.AddHIPRuntimeLibArgs(Args, CmdArgs);
   } else {
     // Claim "no HIP libraries" arguments if any
diff --git a/clang/lib/Driver/ToolChains/Cuda.cpp b/clang/lib/Driver/ToolChains/Cuda.cpp
index c6007d3cfab864..5f0b516e1a1a08 100644
--- a/clang/lib/Driver/ToolChains/Cuda.cpp
+++ b/clang/lib/Driver/ToolChains/Cuda.cpp
@@ -750,10 +750,12 @@ NVPTXToolChain::TranslateArgs(const llvm::opt::DerivedArgList &Args,
     if (!llvm::is_contained(*DAL, A))
       DAL->append(A);
 
-  // TODO: We should accept 'generic' as a valid architecture.
   if (!DAL->hasArg(options::OPT_march_EQ) && OffloadKind != Action::OFK_None) {
     DAL->AddJoinedArg(nullptr, Opts.getOption(options::OPT_march_EQ),
                       CudaArchToString(CudaArch::CudaDefault));
+  } else if (DAL->getLastArgValue(options::OPT_march_EQ) == "generic" &&
+             OffloadKind == Action::OFK_None) {
+    DAL->eraseArg(options::OPT_march_EQ);
   } else if (DAL->getLastArgValue(options::OPT_march_EQ) == "native") {
     auto GPUsOrErr = getSystemGPUArchs(Args);
     if (!GPUsOrErr) {
diff --git a/clang/lib/Driver/ToolChains/Flang.cpp b/clang/lib/Driver/ToolChains/Flang.cpp
index 6168b42dc78292..70daa699e3a949 100644
--- a/clang/lib/Driver/ToolChains/Flang.cpp
+++ b/clang/lib/Driver/ToolChains/Flang.cpp
@@ -148,7 +148,6 @@ void Flang::addCodegenOptions(const ArgList &Args,
 
   Args.addAllArgs(CmdArgs, {options::OPT_flang_experimental_hlfir,
                             options::OPT_flang_deprecated_no_hlfir,
-                            options::OPT_flang_experimental_polymorphism,
                             options::OPT_fno_ppc_native_vec_elem_order,
                             options::OPT_fppc_native_vec_elem_order});
 }
diff --git a/clang/lib/Driver/ToolChains/WebAssembly.cpp b/clang/lib/Driver/ToolChains/WebAssembly.cpp
index b8c2573d6265fb..b7c6efab83e806 100644
--- a/clang/lib/Driver/ToolChains/WebAssembly.cpp
+++ b/clang/lib/Driver/ToolChains/WebAssembly.cpp
@@ -44,8 +44,15 @@ std::string wasm::Linker::getLinkerPath(const ArgList &Args) const {
           llvm::sys::fs::can_execute(UseLinker))
         return std::string(UseLinker);
 
-      // Accept 'lld', and 'ld' as aliases for the default linker
-      if (UseLinker != "lld" && UseLinker != "ld")
+      // Interpret 'lld' as explicitly requesting `wasm-ld`, so look for that
+      // linker. Note that for `wasm32-wasip2` this overrides the default linker
+      // of `wasm-component-ld`.
+      if (UseLinker == "lld") {
+        return ToolChain.GetProgramPath("wasm-ld");
+      }
+
+      // Allow 'ld' as an alias for the default linker
+      if (UseLinker != "ld")
         ToolChain.getDriver().Diag(diag::err_drv_invalid_linker_name)
             << A->getAsString(Args);
     }
@@ -73,6 +80,16 @@ void wasm::Linker::ConstructJob(Compilation &C, const JobAction &JA,
   if (Args.hasArg(options::OPT_s))
     CmdArgs.push_back("--strip-all");
 
+  // On `wasip2` the default linker is `wasm-component-ld` which wraps the
+  // execution of `wasm-ld`. Find `wasm-ld` and pass it as an argument of where
+  // to find it to avoid it needing to hunt and rediscover or search `PATH` for
+  // where it is.
+  if (llvm::sys::path::stem(Linker).ends_with_insensitive(
+          "wasm-component-ld")) {
+    CmdArgs.push_back("--wasm-ld-path");
+    CmdArgs.push_back(Args.MakeArgString(ToolChain.GetProgramPath("wasm-ld")));
+  }
+
   Args.addAllArgs(CmdArgs, {options::OPT_L, options::OPT_u});
 
   ToolChain.AddFilePathLibArgs(Args, CmdArgs);
@@ -221,6 +238,12 @@ WebAssembly::WebAssembly(const Driver &D, const llvm::Triple &Triple,
   }
 }
 
+const char *WebAssembly::getDefaultLinker() const {
+  if (getOS() == "wasip2")
+    return "wasm-component-ld";
+  return "wasm-ld";
+}
+
 bool WebAssembly::IsMathErrnoDefault() const { return false; }
 
 bool WebAssembly::IsObjCNonFragileABIDefault() const { return true; }
diff --git a/clang/lib/Driver/ToolChains/WebAssembly.h b/clang/lib/Driver/ToolChains/WebAssembly.h
index ae60f464c10818..76e0ca39bd748d 100644
--- a/clang/lib/Driver/ToolChains/WebAssembly.h
+++ b/clang/lib/Driver/ToolChains/WebAssembly.h
@@ -67,7 +67,7 @@ class LLVM_LIBRARY_VISIBILITY WebAssembly final : public ToolChain {
                            llvm::opt::ArgStringList &CmdArgs) const override;
   SanitizerMask getSupportedSanitizers() const override;
 
-  const char *getDefaultLinker() const override { return "wasm-ld"; }
+  const char *getDefaultLinker() const override;
 
   CXXStdlibType GetDefaultCXXStdlibType() const override {
     return ToolChain::CST_Libcxx;
diff --git a/clang/lib/Format/.clang-format b/clang/lib/Format/.clang-format
index f95602cab0f7fc..d7331b3c8cf02e 100644
--- a/clang/lib/Format/.clang-format
+++ b/clang/lib/Format/.clang-format
@@ -1,6 +1 @@
-BasedOnStyle: LLVM
-InsertBraces: true
-InsertNewlineAtEOF: true
-LineEnding: LF
-RemoveBracesLLVM: true
-RemoveParentheses: ReturnStatement
+BasedOnStyle: clang-format
diff --git a/clang/lib/Format/BreakableToken.h b/clang/lib/Format/BreakableToken.h
index e7c0680641e294..8b9360a3335ef4 100644
--- a/clang/lib/Format/BreakableToken.h
+++ b/clang/lib/Format/BreakableToken.h
@@ -18,11 +18,8 @@
 #define LLVM_CLANG_LIB_FORMAT_BREAKABLETOKEN_H
 
 #include "Encoding.h"
-#include "TokenAnnotator.h"
 #include "WhitespaceManager.h"
 #include "llvm/ADT/StringSet.h"
-#include "llvm/Support/Regex.h"
-#include <utility>
 
 namespace clang {
 namespace format {
diff --git a/clang/lib/Format/ContinuationIndenter.cpp b/clang/lib/Format/ContinuationIndenter.cpp
index df44e6994c4784..700bce35c86839 100644
--- a/clang/lib/Format/ContinuationIndenter.cpp
+++ b/clang/lib/Format/ContinuationIndenter.cpp
@@ -822,6 +822,7 @@ void ContinuationIndenter::addTokenOnCurrentLine(LineState &State, bool DryRun,
       !CurrentState.IsCSharpGenericTypeConstraint && Previous.opensScope() &&
       Previous.isNot(TT_ObjCMethodExpr) && Previous.isNot(TT_RequiresClause) &&
       Previous.isNot(TT_TableGenDAGArgOpener) &&
+      Previous.isNot(TT_TableGenDAGArgOpenerToBreak) &&
       !(Current.MacroParent && Previous.MacroParent) &&
       (Current.isNot(TT_LineComment) ||
        Previous.isOneOf(BK_BracedInit, TT_VerilogMultiLineListLParen))) {
@@ -1444,7 +1445,9 @@ unsigned ContinuationIndenter::getNewLineColumn(const LineState &State) {
       Style.BreakInheritanceList == FormatStyle::BILS_AfterColon) {
     return CurrentState.Indent;
   }
-  if (Previous.is(tok::r_paren) && !Current.isBinaryOperator() &&
+  if (Previous.is(tok::r_paren) &&
+      Previous.isNot(TT_TableGenDAGArgOperatorToBreak) &&
+      !Current.isBinaryOperator() &&
       !Current.isOneOf(tok::colon, tok::comment)) {
     return ContinuationIndent;
   }
@@ -1705,7 +1708,8 @@ void ContinuationIndenter::moveStatePastFakeLParens(LineState &State,
         (Style.AlignAfterOpenBracket != FormatStyle::BAS_DontAlign ||
          PrecedenceLevel != prec::Comma || Current.NestingLevel == 0) &&
         (!Style.isTableGen() ||
-         (Previous && Previous->is(TT_TableGenDAGArgListComma)))) {
+         (Previous && Previous->isOneOf(TT_TableGenDAGArgListComma,
+                                        TT_TableGenDAGArgListCommaToBreak)))) {
       NewParenState.Indent = std::max(
           std::max(State.Column, NewParenState.Indent), CurrentState.LastSpace);
     }
@@ -1842,6 +1846,17 @@ void ContinuationIndenter::moveStatePastScopeOpener(LineState &State,
         Style.ContinuationIndentWidth +
         std::max(CurrentState.LastSpace, CurrentState.StartOfFunctionCall);
 
+    if (Style.isTableGen() && Current.is(TT_TableGenDAGArgOpenerToBreak) &&
+        Style.TableGenBreakInsideDAGArg == FormatStyle::DAS_BreakElements) {
+      // For the case the next token is a TableGen DAGArg operator identifier
+      // that is not marked to have a line break after it.
+      // In this case the option DAS_BreakElements requires to align the
+      // DAGArg elements to the operator.
+      const FormatToken *Next = Current.Next;
+      if (Next && Next->is(TT_TableGenDAGArgOperatorID))
+        NewIndent = State.Column + Next->TokenText.size() + 2;
+    }
+
     // Ensure that different different brackets force relative alignment, e.g.:
     // void SomeFunction(vector<  // break
     //                       int> v);
diff --git a/clang/lib/Format/ContinuationIndenter.h b/clang/lib/Format/ContinuationIndenter.h
index 2598947bb624c5..18441e10a12492 100644
--- a/clang/lib/Format/ContinuationIndenter.h
+++ b/clang/lib/Format/ContinuationIndenter.h
@@ -17,11 +17,6 @@
 
 #include "Encoding.h"
 #include "FormatToken.h"
-#include "clang/Format/Format.h"
-#include "llvm/Support/Regex.h"
-#include <map>
-#include <optional>
-#include <tuple>
 
 namespace clang {
 class SourceManager;
diff --git a/clang/lib/Format/Encoding.h b/clang/lib/Format/Encoding.h
index a0d664121b2bba..12f9043bb95ad8 100644
--- a/clang/lib/Format/Encoding.h
+++ b/clang/lib/Format/Encoding.h
@@ -16,7 +16,6 @@
 #define LLVM_CLANG_LIB_FORMAT_ENCODING_H
 
 #include "clang/Basic/LLVM.h"
-#include "llvm/ADT/StringRef.h"
 #include "llvm/Support/ConvertUTF.h"
 #include "llvm/Support/Unicode.h"
 
diff --git a/clang/lib/Format/Format.cpp b/clang/lib/Format/Format.cpp
index e64ba7eebc1ce8..63ec3a88978dd9 100644
--- a/clang/lib/Format/Format.cpp
+++ b/clang/lib/Format/Format.cpp
@@ -13,44 +13,16 @@
 //===----------------------------------------------------------------------===//
 
 #include "clang/Format/Format.h"
-#include "AffectedRangeManager.h"
-#include "BreakableToken.h"
-#include "ContinuationIndenter.h"
 #include "DefinitionBlockSeparator.h"
-#include "FormatInternal.h"
-#include "FormatToken.h"
-#include "FormatTokenLexer.h"
 #include "IntegerLiteralSeparatorFixer.h"
 #include "NamespaceEndCommentsFixer.h"
 #include "ObjCPropertyAttributeOrderFixer.h"
 #include "QualifierAlignmentFixer.h"
 #include "SortJavaScriptImports.h"
-#include "TokenAnalyzer.h"
-#include "TokenAnnotator.h"
 #include "UnwrappedLineFormatter.h"
-#include "UnwrappedLineParser.h"
 #include "UsingDeclarationsSorter.h"
-#include "WhitespaceManager.h"
-#include "clang/Basic/Diagnostic.h"
-#include "clang/Basic/DiagnosticOptions.h"
-#include "clang/Basic/SourceManager.h"
-#include "clang/Lex/Lexer.h"
 #include "clang/Tooling/Inclusions/HeaderIncludes.h"
-#include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/Sequence.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/Support/Allocator.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/Path.h"
-#include "llvm/Support/Regex.h"
-#include "llvm/Support/VirtualFileSystem.h"
-#include "llvm/Support/YAMLTraits.h"
-#include <algorithm>
-#include <memory>
-#include <mutex>
-#include <optional>
-#include <string>
-#include <unordered_map>
 
 #define DEBUG_TYPE "format-formatter"
 
@@ -307,6 +279,14 @@ struct ScalarEnumerationTraits<FormatStyle::BreakTemplateDeclarationsStyle> {
   }
 };
 
+template <> struct ScalarEnumerationTraits<FormatStyle::DAGArgStyle> {
+  static void enumeration(IO &IO, FormatStyle::DAGArgStyle &Value) {
+    IO.enumCase(Value, "DontBreak", FormatStyle::DAS_DontBreak);
+    IO.enumCase(Value, "BreakElements", FormatStyle::DAS_BreakElements);
+    IO.enumCase(Value, "BreakAll", FormatStyle::DAS_BreakAll);
+  }
+};
+
 template <>
 struct ScalarEnumerationTraits<FormatStyle::DefinitionReturnTypeBreakingStyle> {
   static void
@@ -1120,6 +1100,10 @@ template <> struct MappingTraits<FormatStyle> {
     IO.mapOptional("StatementAttributeLikeMacros",
                    Style.StatementAttributeLikeMacros);
     IO.mapOptional("StatementMacros", Style.StatementMacros);
+    IO.mapOptional("TableGenBreakingDAGArgOperators",
+                   Style.TableGenBreakingDAGArgOperators);
+    IO.mapOptional("TableGenBreakInsideDAGArg",
+                   Style.TableGenBreakInsideDAGArg);
     IO.mapOptional("TabWidth", Style.TabWidth);
     IO.mapOptional("TypeNames", Style.TypeNames);
     IO.mapOptional("TypenameMacros", Style.TypenameMacros);
@@ -1580,6 +1564,8 @@ FormatStyle getLLVMStyle(FormatStyle::LanguageKind Language) {
   LLVMStyle.StatementAttributeLikeMacros.push_back("Q_EMIT");
   LLVMStyle.StatementMacros.push_back("Q_UNUSED");
   LLVMStyle.StatementMacros.push_back("QT_REQUIRE_VERSION");
+  LLVMStyle.TableGenBreakingDAGArgOperators = {};
+  LLVMStyle.TableGenBreakInsideDAGArg = FormatStyle::DAS_DontBreak;
   LLVMStyle.TabWidth = 8;
   LLVMStyle.UseTab = FormatStyle::UT_Never;
   LLVMStyle.VerilogBreakBetweenInstancePorts = true;
diff --git a/clang/lib/Format/FormatInternal.h b/clang/lib/Format/FormatInternal.h
index 9043ce32e9e32a..60c5bf6b786b04 100644
--- a/clang/lib/Format/FormatInternal.h
+++ b/clang/lib/Format/FormatInternal.h
@@ -15,7 +15,9 @@
 #ifndef LLVM_CLANG_LIB_FORMAT_FORMATINTERNAL_H
 #define LLVM_CLANG_LIB_FORMAT_FORMATINTERNAL_H
 
-#include "BreakableToken.h"
+#include "clang/Basic/LLVM.h"
+#include "clang/Format/Format.h"
+#include "clang/Tooling/Core/Replacement.h"
 #include <utility>
 
 namespace clang {
diff --git a/clang/lib/Format/FormatToken.h b/clang/lib/Format/FormatToken.h
index f4566e4a335138..06f567059c3576 100644
--- a/clang/lib/Format/FormatToken.h
+++ b/clang/lib/Format/FormatToken.h
@@ -19,8 +19,6 @@
 #include "clang/Basic/OperatorPrecedence.h"
 #include "clang/Format/Format.h"
 #include "clang/Lex/Lexer.h"
-#include <memory>
-#include <optional>
 #include <unordered_set>
 
 namespace clang {
@@ -155,7 +153,11 @@ namespace format {
   TYPE(TableGenDAGArgCloser)                                                   \
   TYPE(TableGenDAGArgListColon)                                                \
   TYPE(TableGenDAGArgListComma)                                                \
+  TYPE(TableGenDAGArgListCommaToBreak)                                         \
   TYPE(TableGenDAGArgOpener)                                                   \
+  TYPE(TableGenDAGArgOpenerToBreak)                                            \
+  TYPE(TableGenDAGArgOperatorID)                                               \
+  TYPE(TableGenDAGArgOperatorToBreak)                                          \
   TYPE(TableGenListCloser)                                                     \
   TYPE(TableGenListOpener)                                                     \
   TYPE(TableGenMultiLineString)                                                \
diff --git a/clang/lib/Format/FormatTokenLexer.h b/clang/lib/Format/FormatTokenLexer.h
index 65dd733bd53352..277cc0a2dfde66 100644
--- a/clang/lib/Format/FormatTokenLexer.h
+++ b/clang/lib/Format/FormatTokenLexer.h
@@ -17,14 +17,9 @@
 
 #include "Encoding.h"
 #include "FormatToken.h"
-#include "clang/Basic/LangOptions.h"
-#include "clang/Basic/SourceLocation.h"
-#include "clang/Basic/SourceManager.h"
-#include "clang/Format/Format.h"
 #include "llvm/ADT/MapVector.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/StringSet.h"
-#include "llvm/Support/Regex.h"
 
 #include <stack>
 
diff --git a/clang/lib/Format/FormatTokenSource.h b/clang/lib/Format/FormatTokenSource.h
index 7819244eb7d19a..cce19f527a9236 100644
--- a/clang/lib/Format/FormatTokenSource.h
+++ b/clang/lib/Format/FormatTokenSource.h
@@ -15,10 +15,7 @@
 #ifndef LLVM_CLANG_LIB_FORMAT_FORMATTOKENSOURCE_H
 #define LLVM_CLANG_LIB_FORMAT_FORMATTOKENSOURCE_H
 
-#include "FormatToken.h"
 #include "UnwrappedLineParser.h"
-#include "llvm/ADT/DenseMap.h"
-#include <cstddef>
 
 #define DEBUG_TYPE "format-token-source"
 
diff --git a/clang/lib/Format/Macros.h b/clang/lib/Format/Macros.h
index d2f7fe502364cd..fb12d22299dead 100644
--- a/clang/lib/Format/Macros.h
+++ b/clang/lib/Format/Macros.h
@@ -39,15 +39,9 @@
 #define CLANG_LIB_FORMAT_MACROS_H
 
 #include <list>
-#include <map>
-#include <string>
-#include <vector>
 
 #include "FormatToken.h"
-#include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/StringRef.h"
 
 namespace clang {
 namespace format {
diff --git a/clang/lib/Format/SortJavaScriptImports.h b/clang/lib/Format/SortJavaScriptImports.h
index 7336db9537b0d5..b55b149aab4cd2 100644
--- a/clang/lib/Format/SortJavaScriptImports.h
+++ b/clang/lib/Format/SortJavaScriptImports.h
@@ -14,10 +14,7 @@
 #ifndef LLVM_CLANG_LIB_FORMAT_SORTJAVASCRIPTIMPORTS_H
 #define LLVM_CLANG_LIB_FORMAT_SORTJAVASCRIPTIMPORTS_H
 
-#include "clang/Basic/LLVM.h"
 #include "clang/Format/Format.h"
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/StringRef.h"
 
 namespace clang {
 namespace format {
diff --git a/clang/lib/Format/TokenAnalyzer.h b/clang/lib/Format/TokenAnalyzer.h
index 4086dab1c94c3a..b7494c395c8a29 100644
--- a/clang/lib/Format/TokenAnalyzer.h
+++ b/clang/lib/Format/TokenAnalyzer.h
@@ -17,19 +17,8 @@
 #define LLVM_CLANG_LIB_FORMAT_TOKENANALYZER_H
 
 #include "AffectedRangeManager.h"
-#include "Encoding.h"
-#include "FormatToken.h"
 #include "FormatTokenLexer.h"
 #include "TokenAnnotator.h"
-#include "UnwrappedLineParser.h"
-#include "clang/Basic/Diagnostic.h"
-#include "clang/Basic/DiagnosticOptions.h"
-#include "clang/Basic/FileManager.h"
-#include "clang/Basic/SourceManager.h"
-#include "clang/Format/Format.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/Support/Debug.h"
-#include <memory>
 
 namespace clang {
 namespace format {
diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp
index d7b84e309e0964..94d2266555f6b9 100644
--- a/clang/lib/Format/TokenAnnotator.cpp
+++ b/clang/lib/Format/TokenAnnotator.cpp
@@ -989,16 +989,59 @@ class AnnotatingParser {
     return false;
   }
 
+  // Judge if the token is a operator ID to insert line break in DAGArg.
+  // That is, TableGenBreakingDAGArgOperators is empty (by the definition of the
+  // option) or the token is in the list.
+  bool isTableGenDAGArgBreakingOperator(const FormatToken &Tok) {
+    auto &Opes = Style.TableGenBreakingDAGArgOperators;
+    // If the list is empty, all operators are breaking operators.
+    if (Opes.empty())
+      return true;
+    // Otherwise, the operator is limited to normal identifiers.
+    if (Tok.isNot(tok::identifier) ||
+        Tok.isOneOf(TT_TableGenBangOperator, TT_TableGenCondOperator)) {
+      return false;
+    }
+    // The case next is colon, it is not a operator of identifier.
+    if (!Tok.Next || Tok.Next->is(tok::colon))
+      return false;
+    return std::find(Opes.begin(), Opes.end(), Tok.TokenText.str()) !=
+           Opes.end();
+  }
+
   // SimpleValue6 ::=  "(" DagArg [DagArgList] ")"
   // This parses SimpleValue 6's inside part of "(" ")"
   bool parseTableGenDAGArgAndList(FormatToken *Opener) {
+    FormatToken *FirstTok = CurrentToken;
     if (!parseTableGenDAGArg())
       return false;
+    bool BreakInside = false;
+    if (Style.TableGenBreakInsideDAGArg != FormatStyle::DAS_DontBreak) {
+      // Specialized detection for DAGArgOperator, that determines the way of
+      // line break for this DAGArg elements.
+      if (isTableGenDAGArgBreakingOperator(*FirstTok)) {
+        // Special case for identifier DAGArg operator.
+        BreakInside = true;
+        Opener->setType(TT_TableGenDAGArgOpenerToBreak);
+        if (FirstTok->isOneOf(TT_TableGenBangOperator,
+                              TT_TableGenCondOperator)) {
+          // Special case for bang/cond operators. Set the whole operator as
+          // the DAGArg operator. Always break after it.
+          CurrentToken->Previous->setType(TT_TableGenDAGArgOperatorToBreak);
+        } else if (FirstTok->is(tok::identifier)) {
+          if (Style.TableGenBreakInsideDAGArg == FormatStyle::DAS_BreakAll)
+            FirstTok->setType(TT_TableGenDAGArgOperatorToBreak);
+          else
+            FirstTok->setType(TT_TableGenDAGArgOperatorID);
+        }
+      }
+    }
     // Parse the [DagArgList] part
     bool FirstDAGArgListElm = true;
     while (CurrentToken) {
       if (!FirstDAGArgListElm && CurrentToken->is(tok::comma)) {
-        CurrentToken->setType(TT_TableGenDAGArgListComma);
+        CurrentToken->setType(BreakInside ? TT_TableGenDAGArgListCommaToBreak
+                                          : TT_TableGenDAGArgListComma);
         skipToNextNonComment();
       }
       if (CurrentToken && CurrentToken->is(tok::r_paren)) {
@@ -3596,6 +3639,13 @@ static bool isFunctionDeclarationName(bool IsCpp, const FormatToken &Current,
   if (!Current.Tok.getIdentifierInfo())
     return false;
 
+  const auto &Previous = *Current.Previous;
+
+  if (const auto *PrevPrev = Previous.Previous;
+      PrevPrev && PrevPrev->is(TT_ObjCDecl)) {
+    return false;
+  }
+
   auto skipOperatorName =
       [IsCpp](const FormatToken *Next) -> const FormatToken * {
     for (; Next; Next = Next->Next) {
@@ -3635,18 +3685,17 @@ static bool isFunctionDeclarationName(bool IsCpp, const FormatToken &Current,
   // Find parentheses of parameter list.
   const FormatToken *Next = Current.Next;
   if (Current.is(tok::kw_operator)) {
-    const auto *Previous = Current.Previous;
-    if (Previous->Tok.getIdentifierInfo() &&
-        !Previous->isOneOf(tok::kw_return, tok::kw_co_return)) {
+    if (Previous.Tok.getIdentifierInfo() &&
+        !Previous.isOneOf(tok::kw_return, tok::kw_co_return)) {
       return true;
     }
-    if (Previous->is(tok::r_paren) && Previous->is(TT_TypeDeclarationParen)) {
-      assert(Previous->MatchingParen);
-      assert(Previous->MatchingParen->is(tok::l_paren));
-      assert(Previous->MatchingParen->is(TT_TypeDeclarationParen));
+    if (Previous.is(tok::r_paren) && Previous.is(TT_TypeDeclarationParen)) {
+      assert(Previous.MatchingParen);
+      assert(Previous.MatchingParen->is(tok::l_paren));
+      assert(Previous.MatchingParen->is(TT_TypeDeclarationParen));
       return true;
     }
-    if (!Previous->isPointerOrReference() && Previous->isNot(TT_TemplateCloser))
+    if (!Previous.isPointerOrReference() && Previous.isNot(TT_TemplateCloser))
       return false;
     Next = skipOperatorName(Next);
   } else {
@@ -4268,6 +4317,10 @@ bool TokenAnnotator::spaceRequiredBetween(const AnnotatedLine &Line,
     return Left.is(tok::hash);
   if (Left.isOneOf(tok::hashhash, tok::hash))
     return Right.is(tok::hash);
+  if (Left.is(BK_Block) && Right.is(tok::r_brace) &&
+      Right.MatchingParen == &Left && Line.Children.empty()) {
+    return Style.SpaceInEmptyBlock;
+  }
   if ((Left.is(tok::l_paren) && Right.is(tok::r_paren)) ||
       (Left.is(tok::l_brace) && Left.isNot(BK_Block) &&
        Right.is(tok::r_brace) && Right.isNot(BK_Block))) {
@@ -4610,7 +4663,7 @@ bool TokenAnnotator::spaceRequiredBetween(const AnnotatedLine &Line,
       return Style.SpaceBeforeParensOptions.AfterFunctionDefinitionName ||
              spaceRequiredBeforeParens(Right);
     }
-    if (!Left.Previous || Left.Previous->isNot(tok::period)) {
+    if (!Left.Previous || !Left.Previous->isOneOf(tok::period, tok::arrow)) {
       if (Left.isOneOf(tok::kw_try, Keywords.kw___except, tok::kw_catch)) {
         return Style.SpaceBeforeParensOptions.AfterControlStatements ||
                spaceRequiredBeforeParens(Right);
@@ -5083,6 +5136,11 @@ bool TokenAnnotator::spaceRequiredBefore(const AnnotatedLine &Line,
     }
     if (Right.is(TT_TableGenCondOperatorColon))
       return false;
+    if (Left.isOneOf(TT_TableGenDAGArgOperatorID,
+                     TT_TableGenDAGArgOperatorToBreak) &&
+        Right.isNot(TT_TableGenDAGArgCloser)) {
+      return true;
+    }
     // Do not insert bang operators and consequent openers.
     if (Right.isOneOf(tok::l_paren, tok::less) &&
         Left.isOneOf(TT_TableGenBangOperator, TT_TableGenCondOperator)) {
@@ -5459,6 +5517,18 @@ bool TokenAnnotator::mustBreakBefore(const AnnotatedLine &Line,
     //       case2:0);
     if (Left.is(TT_TableGenCondOperatorComma))
       return true;
+    if (Left.is(TT_TableGenDAGArgOperatorToBreak) &&
+        Right.isNot(TT_TableGenDAGArgCloser)) {
+      return true;
+    }
+    if (Left.is(TT_TableGenDAGArgListCommaToBreak))
+      return true;
+    if (Right.is(TT_TableGenDAGArgCloser) && Right.MatchingParen &&
+        Right.MatchingParen->is(TT_TableGenDAGArgOpenerToBreak) &&
+        &Left != Right.MatchingParen->Next) {
+      // Check to avoid empty DAGArg such as (ins).
+      return Style.TableGenBreakInsideDAGArg == FormatStyle::DAS_BreakAll;
+    }
   }
 
   if (Line.startsWith(tok::kw_asm) && Right.is(TT_InlineASMColon) &&
@@ -5873,6 +5943,8 @@ bool TokenAnnotator::canBreakBefore(const AnnotatedLine &Line,
     // Avoid to break around paste operator.
     if (Left.is(tok::hash) || Right.is(tok::hash))
       return false;
+    if (Left.isOneOf(TT_TableGenBangOperator, TT_TableGenCondOperator))
+      return false;
   }
 
   if (Left.is(tok::at))
diff --git a/clang/lib/Format/TokenAnnotator.h b/clang/lib/Format/TokenAnnotator.h
index a631e5f52bc60a..25a24dccb1b834 100644
--- a/clang/lib/Format/TokenAnnotator.h
+++ b/clang/lib/Format/TokenAnnotator.h
@@ -16,7 +16,6 @@
 #define LLVM_CLANG_LIB_FORMAT_TOKENANNOTATOR_H
 
 #include "UnwrappedLineParser.h"
-#include "clang/Format/Format.h"
 
 namespace clang {
 namespace format {
diff --git a/clang/lib/Format/UnwrappedLineFormatter.h b/clang/lib/Format/UnwrappedLineFormatter.h
index ee6d31de8c4223..9b8acf427a2a00 100644
--- a/clang/lib/Format/UnwrappedLineFormatter.h
+++ b/clang/lib/Format/UnwrappedLineFormatter.h
@@ -16,8 +16,6 @@
 #define LLVM_CLANG_LIB_FORMAT_UNWRAPPEDLINEFORMATTER_H
 
 #include "ContinuationIndenter.h"
-#include "clang/Format/Format.h"
-#include <map>
 
 namespace clang {
 namespace format {
diff --git a/clang/lib/Format/UnwrappedLineParser.cpp b/clang/lib/Format/UnwrappedLineParser.cpp
index e3f0af176b567d..98ae1c8f62bbc2 100644
--- a/clang/lib/Format/UnwrappedLineParser.cpp
+++ b/clang/lib/Format/UnwrappedLineParser.cpp
@@ -1222,7 +1222,6 @@ void UnwrappedLineParser::parsePPUnknown() {
 static bool tokenCanStartNewLine(const FormatToken &Tok) {
   // Semicolon can be a null-statement, l_square can be a start of a macro or
   // a C++11 attribute, but this doesn't seem to be common.
-  assert(Tok.isNot(TT_AttributeSquare));
   return !Tok.isOneOf(tok::semi, tok::l_brace,
                       // Tokens that can only be used as binary operators and a
                       // part of overloaded operator names.
@@ -3710,14 +3709,19 @@ bool UnwrappedLineParser::parseEnum() {
   if (Style.Language == FormatStyle::LK_Proto && FormatTok->is(tok::equal))
     return false;
 
-  // Eat up enum class ...
-  if (FormatTok->isOneOf(tok::kw_class, tok::kw_struct))
-    nextToken();
+  if (IsCpp) {
+    // Eat up enum class ...
+    if (FormatTok->isOneOf(tok::kw_class, tok::kw_struct))
+      nextToken();
+    while (FormatTok->is(tok::l_square))
+      if (!handleCppAttributes())
+        return false;
+  }
 
   while (FormatTok->Tok.getIdentifierInfo() ||
          FormatTok->isOneOf(tok::colon, tok::coloncolon, tok::less,
                             tok::greater, tok::comma, tok::question,
-                            tok::l_square, tok::r_square)) {
+                            tok::l_square)) {
     if (Style.isVerilog()) {
       FormatTok->setFinalizedType(TT_VerilogDimensionedTypeName);
       nextToken();
@@ -3730,7 +3734,6 @@ bool UnwrappedLineParser::parseEnum() {
     // We can have macros or attributes in between 'enum' and the enum name.
     if (FormatTok->is(tok::l_paren))
       parseParens();
-    assert(FormatTok->isNot(TT_AttributeSquare));
     if (FormatTok->is(tok::identifier)) {
       nextToken();
       // If there are two identifiers in a row, this is likely an elaborate
diff --git a/clang/lib/Format/UnwrappedLineParser.h b/clang/lib/Format/UnwrappedLineParser.h
index 619fbb217882b3..e2cf28c0c065dc 100644
--- a/clang/lib/Format/UnwrappedLineParser.h
+++ b/clang/lib/Format/UnwrappedLineParser.h
@@ -15,17 +15,8 @@
 #ifndef LLVM_CLANG_LIB_FORMAT_UNWRAPPEDLINEPARSER_H
 #define LLVM_CLANG_LIB_FORMAT_UNWRAPPEDLINEPARSER_H
 
-#include "Encoding.h"
-#include "FormatToken.h"
 #include "Macros.h"
-#include "clang/Basic/IdentifierTable.h"
-#include "clang/Format/Format.h"
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/BitVector.h"
-#include "llvm/Support/Regex.h"
-#include <list>
 #include <stack>
-#include <vector>
 
 namespace clang {
 namespace format {
diff --git a/clang/lib/Format/WhitespaceManager.h b/clang/lib/Format/WhitespaceManager.h
index 9942e0f35738fc..0ebc6cf8377cdb 100644
--- a/clang/lib/Format/WhitespaceManager.h
+++ b/clang/lib/Format/WhitespaceManager.h
@@ -17,11 +17,6 @@
 
 #include "TokenAnnotator.h"
 #include "clang/Basic/SourceManager.h"
-#include "clang/Format/Format.h"
-#include "llvm/ADT/SmallVector.h"
-#include <algorithm>
-#include <string>
-#include <tuple>
 
 namespace clang {
 namespace format {
diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp
index 451bdb9386f587..0df6a82ccd8933 100644
--- a/clang/lib/Frontend/CompilerInvocation.cpp
+++ b/clang/lib/Frontend/CompilerInvocation.cpp
@@ -3191,6 +3191,22 @@ static bool ParseHeaderSearchArgs(HeaderSearchOptions &Opts, ArgList &Args,
   bool IsIndexHeaderMap = false;
   bool IsSysrootSpecified =
       Args.hasArg(OPT__sysroot_EQ) || Args.hasArg(OPT_isysroot);
+
+  // Expand a leading `=` to the sysroot if one was passed (and it's not a
+  // framework flag).
+  auto PrefixHeaderPath = [IsSysrootSpecified,
+                           &Opts](const llvm::opt::Arg *A,
+                                  bool IsFramework = false) -> std::string {
+    assert(A->getNumValues() && "Unexpected empty search path flag!");
+    if (IsSysrootSpecified && !IsFramework && A->getValue()[0] == '=') {
+      SmallString<32> Buffer;
+      llvm::sys::path::append(Buffer, Opts.Sysroot,
+                              llvm::StringRef(A->getValue()).substr(1));
+      return std::string(Buffer);
+    }
+    return A->getValue();
+  };
+
   for (const auto *A : Args.filtered(OPT_I, OPT_F, OPT_index_header_map)) {
     if (A->getOption().matches(OPT_index_header_map)) {
       // -index-header-map applies to the next -I or -F.
@@ -3202,16 +3218,7 @@ static bool ParseHeaderSearchArgs(HeaderSearchOptions &Opts, ArgList &Args,
         IsIndexHeaderMap ? frontend::IndexHeaderMap : frontend::Angled;
 
     bool IsFramework = A->getOption().matches(OPT_F);
-    std::string Path = A->getValue();
-
-    if (IsSysrootSpecified && !IsFramework && A->getValue()[0] == '=') {
-      SmallString<32> Buffer;
-      llvm::sys::path::append(Buffer, Opts.Sysroot,
-                              llvm::StringRef(A->getValue()).substr(1));
-      Path = std::string(Buffer);
-    }
-
-    Opts.AddPath(Path, Group, IsFramework,
+    Opts.AddPath(PrefixHeaderPath(A, IsFramework), Group, IsFramework,
                  /*IgnoreSysroot*/ true);
     IsIndexHeaderMap = false;
   }
@@ -3229,12 +3236,18 @@ static bool ParseHeaderSearchArgs(HeaderSearchOptions &Opts, ArgList &Args,
   }
 
   for (const auto *A : Args.filtered(OPT_idirafter))
-    Opts.AddPath(A->getValue(), frontend::After, false, true);
+    Opts.AddPath(PrefixHeaderPath(A), frontend::After, false, true);
   for (const auto *A : Args.filtered(OPT_iquote))
-    Opts.AddPath(A->getValue(), frontend::Quoted, false, true);
-  for (const auto *A : Args.filtered(OPT_isystem, OPT_iwithsysroot))
-    Opts.AddPath(A->getValue(), frontend::System, false,
-                 !A->getOption().matches(OPT_iwithsysroot));
+    Opts.AddPath(PrefixHeaderPath(A), frontend::Quoted, false, true);
+
+  for (const auto *A : Args.filtered(OPT_isystem, OPT_iwithsysroot)) {
+    if (A->getOption().matches(OPT_iwithsysroot)) {
+      Opts.AddPath(A->getValue(), frontend::System, false,
+                   /*IgnoreSysRoot=*/false);
+      continue;
+    }
+    Opts.AddPath(PrefixHeaderPath(A), frontend::System, false, true);
+  }
   for (const auto *A : Args.filtered(OPT_iframework))
     Opts.AddPath(A->getValue(), frontend::System, true, true);
   for (const auto *A : Args.filtered(OPT_iframeworkwithsysroot))
@@ -3293,6 +3306,17 @@ static void ParseAPINotesArgs(APINotesOptions &Opts, ArgList &Args,
     Opts.ModuleSearchPaths.push_back(A->getValue());
 }
 
+static void GeneratePointerAuthArgs(const LangOptions &Opts,
+                                    ArgumentConsumer Consumer) {
+  if (Opts.PointerAuthIntrinsics)
+    GenerateArg(Consumer, OPT_fptrauth_intrinsics);
+}
+
+static void ParsePointerAuthArgs(LangOptions &Opts, ArgList &Args,
+                                 DiagnosticsEngine &Diags) {
+  Opts.PointerAuthIntrinsics = Args.hasArg(OPT_fptrauth_intrinsics);
+}
+
 /// Check if input file kind and language standard are compatible.
 static bool IsInputCompatibleWithStandard(InputKind IK,
                                           const LangStandard &S) {
@@ -4014,6 +4038,7 @@ bool CompilerInvocation::ParseLangArgs(LangOptions &Opts, ArgList &Args,
 
       if (TT.getArch() == llvm::Triple::UnknownArch ||
           !(TT.getArch() == llvm::Triple::aarch64 || TT.isPPC() ||
+            TT.getArch() == llvm::Triple::systemz ||
             TT.getArch() == llvm::Triple::nvptx ||
             TT.getArch() == llvm::Triple::nvptx64 ||
             TT.getArch() == llvm::Triple::amdgcn ||
@@ -4612,6 +4637,8 @@ bool CompilerInvocation::CreateFromArgsImpl(
                         Res.getFileSystemOpts().WorkingDir);
   ParseAPINotesArgs(Res.getAPINotesOpts(), Args, Diags);
 
+  ParsePointerAuthArgs(LangOpts, Args, Diags);
+
   ParseLangArgs(LangOpts, Args, DashX, T, Res.getPreprocessorOpts().Includes,
                 Diags);
   if (Res.getFrontendOpts().ProgramAction == frontend::RewriteObjC)
@@ -4842,6 +4869,7 @@ void CompilerInvocationBase::generateCC1CommandLine(
   GenerateTargetArgs(getTargetOpts(), Consumer);
   GenerateHeaderSearchArgs(getHeaderSearchOpts(), Consumer);
   GenerateAPINotesArgs(getAPINotesOpts(), Consumer);
+  GeneratePointerAuthArgs(getLangOpts(), Consumer);
   GenerateLangArgs(getLangOpts(), Consumer, T, getFrontendOpts().DashX);
   GenerateCodeGenArgs(getCodeGenOpts(), Consumer, T,
                       getFrontendOpts().OutputFile, &getLangOpts());
diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt
index b9a966be73ee97..97104ccd8db59c 100644
--- a/clang/lib/Headers/CMakeLists.txt
+++ b/clang/lib/Headers/CMakeLists.txt
@@ -214,6 +214,7 @@ set(x86_files
   popcntintrin.h
   prfchiintrin.h
   prfchwintrin.h
+  ptrauth.h
   ptwriteintrin.h
   raointintrin.h
   rdpruintrin.h
@@ -253,8 +254,10 @@ set(x86_files
   )
 
 set(windows_only_files
+  intrin0.h
   intrin.h
   vadefs.h
+  yvals_core.h
 )
 
 set(utility_files
diff --git a/clang/lib/Headers/avxintrin.h b/clang/lib/Headers/avxintrin.h
index ecd9bf18a41ca0..a8882e82e171af 100644
--- a/clang/lib/Headers/avxintrin.h
+++ b/clang/lib/Headers/avxintrin.h
@@ -2201,6 +2201,10 @@ _mm256_cvtpd_ps(__m256d __a)
 
 /// Converts a vector of [8 x float] into a vector of [8 x i32].
 ///
+///    If a converted value does not fit in a 32-bit integer, raises a
+///    floating-point invalid exception. If the exception is masked, returns
+///    the most negative integer.
+///
 /// \headerfile <x86intrin.h>
 ///
 /// This intrinsic corresponds to the <c> VCVTPS2DQ </c> instruction.
@@ -2230,9 +2234,13 @@ _mm256_cvtps_pd(__m128 __a)
   return (__m256d)__builtin_convertvector((__v4sf)__a, __v4df);
 }
 
-/// Converts a 256-bit vector of [4 x double] into a 128-bit vector of [4
-///    x i32], truncating the result by rounding towards zero when it is
-///    inexact.
+/// Converts a 256-bit vector of [4 x double] into four signed truncated
+///    (rounded toward zero) 32-bit integers returned in a 128-bit vector of
+///    [4 x i32].
+///
+///    If a converted value does not fit in a 32-bit integer, raises a
+///    floating-point invalid exception. If the exception is masked, returns
+///    the most negative integer.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -2247,9 +2255,12 @@ _mm256_cvttpd_epi32(__m256d __a)
   return (__m128i)__builtin_ia32_cvttpd2dq256((__v4df) __a);
 }
 
-/// Converts a 256-bit vector of [4 x double] into a 128-bit vector of [4
-///    x i32]. When a conversion is inexact, the value returned is rounded
-///    according to the rounding control bits in the MXCSR register.
+/// Converts a 256-bit vector of [4 x double] into a 128-bit vector of
+///    [4 x i32].
+///
+///    If a converted value does not fit in a 32-bit integer, raises a
+///    floating-point invalid exception. If the exception is masked, returns
+///    the most negative integer.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -2264,8 +2275,12 @@ _mm256_cvtpd_epi32(__m256d __a)
   return (__m128i)__builtin_ia32_cvtpd2dq256((__v4df) __a);
 }
 
-/// Converts a vector of [8 x float] into a vector of [8 x i32],
-///    truncating the result by rounding towards zero when it is inexact.
+/// Converts a vector of [8 x float] into eight signed truncated (rounded
+///    toward zero) 32-bit integers returned in a vector of [8 x i32].
+///
+///    If a converted value does not fit in a 32-bit integer, raises a
+///    floating-point invalid exception. If the exception is masked, returns
+///    the most negative integer.
 ///
 /// \headerfile <x86intrin.h>
 ///
diff --git a/clang/lib/Headers/bmiintrin.h b/clang/lib/Headers/bmiintrin.h
index d8e57c0cb49404..78bffe68e221a9 100644
--- a/clang/lib/Headers/bmiintrin.h
+++ b/clang/lib/Headers/bmiintrin.h
@@ -161,8 +161,7 @@ _mm_tzcnt_64(unsigned long long __X)
 
 #undef __RELAXED_FN_ATTRS
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__BMI__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__BMI__)
 
 /* Define the default attributes for the functions in this file. */
 #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("bmi")))
@@ -610,7 +609,6 @@ __blsr_u64(unsigned long long __X)
 
 #undef __DEFAULT_FN_ATTRS
 
-#endif /* !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules)   \
-          || defined(__BMI__) */
+#endif /* !defined(__SCE__) || __has_feature(modules) || defined(__BMI__) */
 
 #endif /* __BMIINTRIN_H */
diff --git a/clang/lib/Headers/emmintrin.h b/clang/lib/Headers/emmintrin.h
index 984f0cf917e99b..f0c2db752195ad 100644
--- a/clang/lib/Headers/emmintrin.h
+++ b/clang/lib/Headers/emmintrin.h
@@ -1294,6 +1294,10 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtepi32_pd(__m128i __a) {
 ///    returned in the lower 64 bits of a 128-bit vector of [4 x i32]. The upper
 ///    64 bits of the result vector are set to zero.
 ///
+///    If a converted value does not fit in a 32-bit integer, raises a
+///    floating-point invalid exception. If the exception is masked, returns
+///    the most negative integer.
+///
 /// \headerfile <x86intrin.h>
 ///
 /// This intrinsic corresponds to the <c> VCVTPD2DQ / CVTPD2DQ </c> instruction.
@@ -1309,6 +1313,10 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtpd_epi32(__m128d __a) {
 /// Converts the low-order element of a 128-bit vector of [2 x double]
 ///    into a 32-bit signed integer value.
 ///
+///    If the converted value does not fit in a 32-bit integer, raises a
+///    floating-point invalid exception. If the exception is masked, returns
+///    the most negative integer.
+///
 /// \headerfile <x86intrin.h>
 ///
 /// This intrinsic corresponds to the <c> VCVTSD2SI / CVTSD2SI </c> instruction.
@@ -1394,12 +1402,13 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtss_sd(__m128d __a,
 }
 
 /// Converts the two double-precision floating-point elements of a
-///    128-bit vector of [2 x double] into two signed 32-bit integer values,
-///    returned in the lower 64 bits of a 128-bit vector of [4 x i32].
+///    128-bit vector of [2 x double] into two signed truncated (rounded
+///    toward zero) 32-bit integer values, returned in the lower 64 bits
+///    of a 128-bit vector of [4 x i32].
 ///
-///    If the result of either conversion is inexact, the result is truncated
-///    (rounded towards zero) regardless of the current MXCSR setting. The upper
-///    64 bits of the result vector are set to zero.
+///    If a converted value does not fit in a 32-bit integer, raises a
+///    floating-point invalid exception. If the exception is masked, returns
+///    the most negative integer.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -1415,7 +1424,11 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvttpd_epi32(__m128d __a) {
 }
 
 /// Converts the low-order element of a [2 x double] vector into a 32-bit
-///    signed integer value, truncating the result when it is inexact.
+///    signed truncated (rounded toward zero) integer value.
+///
+///    If the converted value does not fit in a 32-bit integer, raises a
+///    floating-point invalid exception. If the exception is masked, returns
+///    the most negative integer.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -1434,6 +1447,10 @@ static __inline__ int __DEFAULT_FN_ATTRS _mm_cvttsd_si32(__m128d __a) {
 ///    128-bit vector of [2 x double] into two signed 32-bit integer values,
 ///    returned in a 64-bit vector of [2 x i32].
 ///
+///    If a converted value does not fit in a 32-bit integer, raises a
+///    floating-point invalid exception. If the exception is masked, returns
+///    the most negative integer.
+///
 /// \headerfile <x86intrin.h>
 ///
 /// This intrinsic corresponds to the <c> CVTPD2PI </c> instruction.
@@ -1446,11 +1463,12 @@ static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvtpd_pi32(__m128d __a) {
 }
 
 /// Converts the two double-precision floating-point elements of a
-///    128-bit vector of [2 x double] into two signed 32-bit integer values,
-///    returned in a 64-bit vector of [2 x i32].
+///    128-bit vector of [2 x double] into two signed truncated (rounded toward
+///    zero) 32-bit integer values, returned in a 64-bit vector of [2 x i32].
 ///
-///    If the result of either conversion is inexact, the result is truncated
-///    (rounded towards zero) regardless of the current MXCSR setting.
+///    If a converted value does not fit in a 32-bit integer, raises a
+///    floating-point invalid exception. If the exception is masked, returns
+///    the most negative integer.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -3216,7 +3234,11 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtsi64_sd(__m128d __a,
 }
 
 /// Converts the first (lower) element of a vector of [2 x double] into a
-///    64-bit signed integer value, according to the current rounding mode.
+///    64-bit signed integer value.
+///
+///    If the converted value does not fit in a 64-bit integer, raises a
+///    floating-point invalid exception. If the exception is masked, returns
+///    the most negative integer.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -3231,7 +3253,11 @@ static __inline__ long long __DEFAULT_FN_ATTRS _mm_cvtsd_si64(__m128d __a) {
 }
 
 /// Converts the first (lower) element of a vector of [2 x double] into a
-///    64-bit signed integer value, truncating the result when it is inexact.
+///    64-bit signed truncated (rounded toward zero) integer value.
+///
+///    If a converted value does not fit in a 64-bit integer, raises a
+///    floating-point invalid exception. If the exception is masked, returns
+///    the most negative integer.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -3262,6 +3288,10 @@ static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtepi32_ps(__m128i __a) {
 
 /// Converts a vector of [4 x float] into a vector of [4 x i32].
 ///
+///    If a converted value does not fit in a 32-bit integer, raises a
+///    floating-point invalid exception. If the exception is masked, returns
+///    the most negative integer.
+///
 /// \headerfile <x86intrin.h>
 ///
 /// This intrinsic corresponds to the <c> VCVTPS2DQ / CVTPS2DQ </c> instruction.
@@ -3274,8 +3304,12 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtps_epi32(__m128 __a) {
   return (__m128i)__builtin_ia32_cvtps2dq((__v4sf)__a);
 }
 
-/// Converts a vector of [4 x float] into a vector of [4 x i32],
-///    truncating the result when it is inexact.
+/// Converts a vector of [4 x float] into four signed truncated (rounded toward
+///    zero) 32-bit integers, returned in a vector of [4 x i32].
+///
+///    If a converted value does not fit in a 32-bit integer, raises a
+///    floating-point invalid exception. If the exception is masked, returns
+///    the most negative integer.
 ///
 /// \headerfile <x86intrin.h>
 ///
diff --git a/clang/lib/Headers/hlsl/hlsl_intrinsics.h b/clang/lib/Headers/hlsl/hlsl_intrinsics.h
index 45f8544392584e..5e703772b7ee4f 100644
--- a/clang/lib/Headers/hlsl/hlsl_intrinsics.h
+++ b/clang/lib/Headers/hlsl/hlsl_intrinsics.h
@@ -252,6 +252,116 @@ double3 ceil(double3);
 _HLSL_BUILTIN_ALIAS(__builtin_elementwise_ceil)
 double4 ceil(double4);
 
+//===----------------------------------------------------------------------===//
+// clamp builtins
+//===----------------------------------------------------------------------===//
+
+/// \fn T clamp(T X, T Min, T Max)
+/// \brief Clamps the specified value \a X to the specified
+/// minimum ( \a Min) and maximum ( \a Max) range.
+/// \param X A value to clamp.
+/// \param Min The specified minimum range.
+/// \param Max The specified maximum range.
+///
+/// Returns The clamped value for the \a X parameter.
+/// For values of -INF or INF, clamp will behave as expected.
+/// However for values of NaN, the results are undefined.
+
+_HLSL_16BIT_AVAILABILITY(shadermodel, 6.2)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_clamp)
+half clamp(half, half, half);
+_HLSL_16BIT_AVAILABILITY(shadermodel, 6.2)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_clamp)
+half2 clamp(half2, half2, half2);
+_HLSL_16BIT_AVAILABILITY(shadermodel, 6.2)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_clamp)
+half3 clamp(half3, half3, half3);
+_HLSL_16BIT_AVAILABILITY(shadermodel, 6.2)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_clamp)
+half4 clamp(half4, half4, half4);
+
+#ifdef __HLSL_ENABLE_16_BIT
+_HLSL_AVAILABILITY(shadermodel, 6.2)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_clamp)
+int16_t clamp(int16_t, int16_t, int16_t);
+_HLSL_AVAILABILITY(shadermodel, 6.2)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_clamp)
+int16_t2 clamp(int16_t2, int16_t2, int16_t2);
+_HLSL_AVAILABILITY(shadermodel, 6.2)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_clamp)
+int16_t3 clamp(int16_t3, int16_t3, int16_t3);
+_HLSL_AVAILABILITY(shadermodel, 6.2)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_clamp)
+int16_t4 clamp(int16_t4, int16_t4, int16_t4);
+
+_HLSL_AVAILABILITY(shadermodel, 6.2)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_clamp)
+uint16_t clamp(uint16_t, uint16_t, uint16_t);
+_HLSL_AVAILABILITY(shadermodel, 6.2)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_clamp)
+uint16_t2 clamp(uint16_t2, uint16_t2, uint16_t2);
+_HLSL_AVAILABILITY(shadermodel, 6.2)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_clamp)
+uint16_t3 clamp(uint16_t3, uint16_t3, uint16_t3);
+_HLSL_AVAILABILITY(shadermodel, 6.2)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_clamp)
+uint16_t4 clamp(uint16_t4, uint16_t4, uint16_t4);
+#endif
+
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_clamp)
+int clamp(int, int, int);
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_clamp)
+int2 clamp(int2, int2, int2);
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_clamp)
+int3 clamp(int3, int3, int3);
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_clamp)
+int4 clamp(int4, int4, int4);
+
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_clamp)
+uint clamp(uint, uint, uint);
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_clamp)
+uint2 clamp(uint2, uint2, uint2);
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_clamp)
+uint3 clamp(uint3, uint3, uint3);
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_clamp)
+uint4 clamp(uint4, uint4, uint4);
+
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_clamp)
+int64_t clamp(int64_t, int64_t, int64_t);
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_clamp)
+int64_t2 clamp(int64_t2, int64_t2, int64_t2);
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_clamp)
+int64_t3 clamp(int64_t3, int64_t3, int64_t3);
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_clamp)
+int64_t4 clamp(int64_t4, int64_t4, int64_t4);
+
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_clamp)
+uint64_t clamp(uint64_t, uint64_t, uint64_t);
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_clamp)
+uint64_t2 clamp(uint64_t2, uint64_t2, uint64_t2);
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_clamp)
+uint64_t3 clamp(uint64_t3, uint64_t3, uint64_t3);
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_clamp)
+uint64_t4 clamp(uint64_t4, uint64_t4, uint64_t4);
+
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_clamp)
+float clamp(float, float, float);
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_clamp)
+float2 clamp(float2, float2, float2);
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_clamp)
+float3 clamp(float3, float3, float3);
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_clamp)
+float4 clamp(float4, float4, float4);
+
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_clamp)
+double clamp(double, double, double);
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_clamp)
+double2 clamp(double2, double2, double2);
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_clamp)
+double3 clamp(double3, double3, double3);
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_clamp)
+double4 clamp(double4, double4, double4);
+
 //===----------------------------------------------------------------------===//
 // cos builtins
 //===----------------------------------------------------------------------===//
@@ -525,6 +635,39 @@ float3 frac(float3);
 _HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_frac)
 float4 frac(float4);
 
+//===----------------------------------------------------------------------===//
+// isinf builtins
+//===----------------------------------------------------------------------===//
+
+/// \fn T isinf(T x)
+/// \brief Determines if the specified value \a x  is infinite.
+/// \param x The specified input value.
+///
+/// Returns a value of the same size as the input, with a value set
+/// to True if the x parameter is +INF or -INF. Otherwise, False.
+
+_HLSL_16BIT_AVAILABILITY(shadermodel, 6.2)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_isinf)
+bool isinf(half);
+_HLSL_16BIT_AVAILABILITY(shadermodel, 6.2)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_isinf)
+bool2 isinf(half2);
+_HLSL_16BIT_AVAILABILITY(shadermodel, 6.2)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_isinf)
+bool3 isinf(half3);
+_HLSL_16BIT_AVAILABILITY(shadermodel, 6.2)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_isinf)
+bool4 isinf(half4);
+
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_isinf)
+bool isinf(float);
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_isinf)
+bool2 isinf(float2);
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_isinf)
+bool3 isinf(float3);
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_isinf)
+bool4 isinf(float4);
+
 //===----------------------------------------------------------------------===//
 // lerp builtins
 //===----------------------------------------------------------------------===//
@@ -1153,6 +1296,39 @@ double3 rcp(double3);
 _HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_rcp)
 double4 rcp(double4);
 
+//===----------------------------------------------------------------------===//
+// rsqrt builtins
+//===----------------------------------------------------------------------===//
+
+/// \fn T rsqrt(T x)
+/// \brief Returns the reciprocal of the square root of the specified value.
+/// ie 1 / sqrt( \a x).
+/// \param x The specified input value.
+///
+/// This function uses the following formula: 1 / sqrt(x).
+
+_HLSL_16BIT_AVAILABILITY(shadermodel, 6.2)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_rsqrt)
+half rsqrt(half);
+_HLSL_16BIT_AVAILABILITY(shadermodel, 6.2)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_rsqrt)
+half2 rsqrt(half2);
+_HLSL_16BIT_AVAILABILITY(shadermodel, 6.2)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_rsqrt)
+half3 rsqrt(half3);
+_HLSL_16BIT_AVAILABILITY(shadermodel, 6.2)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_rsqrt)
+half4 rsqrt(half4);
+
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_rsqrt)
+float rsqrt(float);
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_rsqrt)
+float2 rsqrt(float2);
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_rsqrt)
+float3 rsqrt(float3);
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_rsqrt)
+float4 rsqrt(float4);
+
 //===----------------------------------------------------------------------===//
 // round builtins
 //===----------------------------------------------------------------------===//
diff --git a/clang/lib/Headers/immintrin.h b/clang/lib/Headers/immintrin.h
index 27800f7a8202c1..508696d3725b9a 100644
--- a/clang/lib/Headers/immintrin.h
+++ b/clang/lib/Headers/immintrin.h
@@ -16,281 +16,239 @@
 
 #include <x86gprintrin.h>
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__MMX__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__MMX__)
 #include <mmintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__SSE__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__SSE__)
 #include <xmmintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__SSE2__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__SSE2__)
 #include <emmintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__SSE3__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__SSE3__)
 #include <pmmintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__SSSE3__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__SSSE3__)
 #include <tmmintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+#if !defined(__SCE__) || __has_feature(modules) ||                             \
     (defined(__SSE4_2__) || defined(__SSE4_1__))
 #include <smmintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+#if !defined(__SCE__) || __has_feature(modules) ||                             \
     (defined(__AES__) || defined(__PCLMUL__))
 #include <wmmintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__CLFLUSHOPT__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__CLFLUSHOPT__)
 #include <clflushoptintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__CLWB__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__CLWB__)
 #include <clwbintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__AVX__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX__)
 #include <avxintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__AVX2__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX2__)
 #include <avx2intrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__F16C__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__F16C__)
 #include <f16cintrin.h>
 #endif
 
 /* No feature check desired due to internal checks */
 #include <bmiintrin.h>
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__BMI2__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__BMI2__)
 #include <bmi2intrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__LZCNT__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__LZCNT__)
 #include <lzcntintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__POPCNT__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__POPCNT__)
 #include <popcntintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__FMA__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__FMA__)
 #include <fmaintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__AVX512F__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512F__)
 #include <avx512fintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__AVX512VL__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512VL__)
 #include <avx512vlintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__AVX512BW__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512BW__)
 #include <avx512bwintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__AVX512BITALG__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512BITALG__)
 #include <avx512bitalgintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__AVX512CD__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512CD__)
 #include <avx512cdintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__AVX512VPOPCNTDQ__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512VPOPCNTDQ__)
 #include <avx512vpopcntdqintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+#if !defined(__SCE__) || __has_feature(modules) ||                             \
     (defined(__AVX512VL__) && defined(__AVX512VPOPCNTDQ__))
 #include <avx512vpopcntdqvlintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__AVX512VNNI__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512VNNI__)
 #include <avx512vnniintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+#if !defined(__SCE__) || __has_feature(modules) ||                             \
     (defined(__AVX512VL__) && defined(__AVX512VNNI__))
 #include <avx512vlvnniintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__AVXVNNI__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__AVXVNNI__)
 #include <avxvnniintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__AVX512DQ__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512DQ__)
 #include <avx512dqintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+#if !defined(__SCE__) || __has_feature(modules) ||                             \
     (defined(__AVX512VL__) && defined(__AVX512BITALG__))
 #include <avx512vlbitalgintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+#if !defined(__SCE__) || __has_feature(modules) ||                             \
     (defined(__AVX512VL__) && defined(__AVX512BW__))
 #include <avx512vlbwintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+#if !defined(__SCE__) || __has_feature(modules) ||                             \
     (defined(__AVX512VL__) && defined(__AVX512CD__))
 #include <avx512vlcdintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+#if !defined(__SCE__) || __has_feature(modules) ||                             \
     (defined(__AVX512VL__) && defined(__AVX512DQ__))
 #include <avx512vldqintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__AVX512ER__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512ER__)
 #include <avx512erintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__AVX512IFMA__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512IFMA__)
 #include <avx512ifmaintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+#if !defined(__SCE__) || __has_feature(modules) ||                             \
     (defined(__AVX512IFMA__) && defined(__AVX512VL__))
 #include <avx512ifmavlintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__AVXIFMA__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__AVXIFMA__)
 #include <avxifmaintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__AVX512VBMI__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512VBMI__)
 #include <avx512vbmiintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+#if !defined(__SCE__) || __has_feature(modules) ||                             \
     (defined(__AVX512VBMI__) && defined(__AVX512VL__))
 #include <avx512vbmivlintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__AVX512VBMI2__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512VBMI2__)
 #include <avx512vbmi2intrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+#if !defined(__SCE__) || __has_feature(modules) ||                             \
     (defined(__AVX512VBMI2__) && defined(__AVX512VL__))
 #include <avx512vlvbmi2intrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__AVX512PF__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512PF__)
 #include <avx512pfintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__AVX512FP16__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512FP16__)
 #include <avx512fp16intrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+#if !defined(__SCE__) || __has_feature(modules) ||                             \
     (defined(__AVX512VL__) && defined(__AVX512FP16__))
 #include <avx512vlfp16intrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__AVX512BF16__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512BF16__)
 #include <avx512bf16intrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+#if !defined(__SCE__) || __has_feature(modules) ||                             \
     (defined(__AVX512VL__) && defined(__AVX512BF16__))
 #include <avx512vlbf16intrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__PKU__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__PKU__)
 #include <pkuintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__VPCLMULQDQ__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__VPCLMULQDQ__)
 #include <vpclmulqdqintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__VAES__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__VAES__)
 #include <vaesintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__GFNI__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__GFNI__)
 #include <gfniintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__AVXVNNIINT8__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__AVXVNNIINT8__)
 #include <avxvnniint8intrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__AVXNECONVERT__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__AVXNECONVERT__)
 #include <avxneconvertintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__SHA512__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__SHA512__)
 #include <sha512intrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__SM3__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__SM3__)
 #include <sm3intrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__SM4__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__SM4__)
 #include <sm4intrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__AVXVNNIINT16__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__AVXVNNIINT16__)
 #include <avxvnniint16intrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__RDPID__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__RDPID__)
 /// Reads the value of the IA32_TSC_AUX MSR (0xc0000103).
 ///
 /// \headerfile <immintrin.h>
@@ -304,8 +262,7 @@ _rdpid_u32(void) {
 }
 #endif // __RDPID__
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__RDRND__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__RDRND__)
 /// Returns a 16-bit hardware-generated random value.
 ///
 /// \headerfile <immintrin.h>
@@ -367,8 +324,7 @@ _rdrand64_step(unsigned long long *__p)
 }
 #endif /* __RDRND__ */
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__FSGSBASE__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__FSGSBASE__)
 #ifdef __x86_64__
 /// Reads the FS base register.
 ///
@@ -481,8 +437,7 @@ _writegsbase_u64(unsigned long long __V)
 #endif
 #endif /* __FSGSBASE__ */
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__MOVBE__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__MOVBE__)
 
 /* The structs used below are to force the load/store to be unaligned. This
  * is accomplished with the __packed__ attribute. The __may_alias__ prevents
@@ -598,139 +553,118 @@ _storebe_i64(void * __P, long long __D) {
 #endif
 #endif /* __MOVBE */
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__RTM__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__RTM__)
 #include <rtmintrin.h>
 #include <xtestintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__SHA__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__SHA__)
 #include <shaintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__FXSR__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__FXSR__)
 #include <fxsrintrin.h>
 #endif
 
 /* No feature check desired due to internal MSC_VER checks */
 #include <xsaveintrin.h>
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__XSAVEOPT__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__XSAVEOPT__)
 #include <xsaveoptintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__XSAVEC__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__XSAVEC__)
 #include <xsavecintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__XSAVES__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__XSAVES__)
 #include <xsavesintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__SHSTK__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__SHSTK__)
 #include <cetintrin.h>
 #endif
 
 /* Intrinsics inside adcintrin.h are available at all times. */
 #include <adcintrin.h>
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__ADX__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__ADX__)
 #include <adxintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__RDSEED__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__RDSEED__)
 #include <rdseedintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__WBNOINVD__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__WBNOINVD__)
 #include <wbnoinvdintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__CLDEMOTE__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__CLDEMOTE__)
 #include <cldemoteintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__WAITPKG__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__WAITPKG__)
 #include <waitpkgintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__MOVDIRI__) || defined(__MOVDIR64B__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__MOVDIRI__) ||     \
+    defined(__MOVDIR64B__)
 #include <movdirintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__PCONFIG__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__PCONFIG__)
 #include <pconfigintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__SGX__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__SGX__)
 #include <sgxintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__PTWRITE__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__PTWRITE__)
 #include <ptwriteintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__INVPCID__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__INVPCID__)
 #include <invpcidintrin.h>
 #endif
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__AMX_FP16__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__AMX_FP16__)
 #include <amxfp16intrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__KL__) || defined(__WIDEKL__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__KL__) ||          \
+    defined(__WIDEKL__)
 #include <keylockerintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__AMX_TILE__) || defined(__AMX_INT8__) || defined(__AMX_BF16__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__AMX_TILE__) ||    \
+    defined(__AMX_INT8__) || defined(__AMX_BF16__)
 #include <amxintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__AMX_COMPLEX__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__AMX_COMPLEX__)
 #include <amxcomplexintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+#if !defined(__SCE__) || __has_feature(modules) ||                             \
     defined(__AVX512VP2INTERSECT__)
 #include <avx512vp2intersectintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+#if !defined(__SCE__) || __has_feature(modules) ||                             \
     (defined(__AVX512VL__) && defined(__AVX512VP2INTERSECT__))
 #include <avx512vlvp2intersectintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__ENQCMD__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__ENQCMD__)
 #include <enqcmdintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__SERIALIZE__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__SERIALIZE__)
 #include <serializeintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__TSXLDTRK__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__TSXLDTRK__)
 #include <tsxldtrkintrin.h>
 #endif
 
diff --git a/clang/lib/Headers/intrin.h b/clang/lib/Headers/intrin.h
index a6395143db54c2..fd27955fbe002d 100644
--- a/clang/lib/Headers/intrin.h
+++ b/clang/lib/Headers/intrin.h
@@ -15,6 +15,8 @@
 #ifndef __INTRIN_H
 #define __INTRIN_H
 
+#include <intrin0.h>
+
 /* First include the standard intrinsics. */
 #if defined(__i386__) || defined(__x86_64__)
 #include <x86intrin.h>
@@ -131,8 +133,6 @@ void __writefsqword(unsigned long, unsigned __int64);
 void __writefsword(unsigned long, unsigned short);
 void __writemsr(unsigned long, unsigned __int64);
 void *_AddressOfReturnAddress(void);
-unsigned char _BitScanForward(unsigned long *_Index, unsigned long _Mask);
-unsigned char _BitScanReverse(unsigned long *_Index, unsigned long _Mask);
 unsigned char _bittest(long const *, long);
 unsigned char _bittestandcomplement(long *, long);
 unsigned char _bittestandreset(long *, long);
@@ -151,7 +151,6 @@ long _InterlockedExchangeAdd_HLERelease(long volatile *, long);
 __int64 _InterlockedExchangeAdd64_HLEAcquire(__int64 volatile *, __int64);
 __int64 _InterlockedExchangeAdd64_HLERelease(__int64 volatile *, __int64);
 void _ReadBarrier(void);
-void _ReadWriteBarrier(void);
 unsigned int _rorx_u32(unsigned int, const unsigned int);
 int _sarx_i32(int, unsigned int);
 #if __STDC_HOSTED__
@@ -182,12 +181,6 @@ unsigned char __readgsbyte(unsigned long);
 unsigned long __readgsdword(unsigned long);
 unsigned __int64 __readgsqword(unsigned long);
 unsigned short __readgsword(unsigned long);
-unsigned __int64 __shiftleft128(unsigned __int64 _LowPart,
-                                unsigned __int64 _HighPart,
-                                unsigned char _Shift);
-unsigned __int64 __shiftright128(unsigned __int64 _LowPart,
-                                 unsigned __int64 _HighPart,
-                                 unsigned char _Shift);
 void __stosq(unsigned __int64 *, unsigned __int64, size_t);
 unsigned char __vmx_on(unsigned __int64 *);
 unsigned char __vmx_vmclear(unsigned __int64 *);
@@ -236,212 +229,10 @@ unsigned __int64 _shlx_u64(unsigned __int64, unsigned int);
 unsigned __int64 _shrx_u64(unsigned __int64, unsigned int);
 __int64 __mulh(__int64, __int64);
 unsigned __int64 __umulh(unsigned __int64, unsigned __int64);
-__int64 _mul128(__int64, __int64, __int64*);
-unsigned __int64 _umul128(unsigned __int64,
-                          unsigned __int64,
-                          unsigned __int64*);
+__int64 _mul128(__int64, __int64, __int64 *);
 
 #endif /* __x86_64__ */
 
-#if defined(__x86_64__) || defined(__arm__) || defined(__aarch64__)
-
-unsigned char _BitScanForward64(unsigned long *_Index, unsigned __int64 _Mask);
-unsigned char _BitScanReverse64(unsigned long *_Index, unsigned __int64 _Mask);
-
-#endif
-
-#if defined(__i386__) || defined(__x86_64__) || defined(__arm__) || defined(__aarch64__)
-__int64 _InterlockedDecrement64(__int64 volatile *_Addend);
-__int64 _InterlockedExchange64(__int64 volatile *_Target, __int64 _Value);
-__int64 _InterlockedExchangeAdd64(__int64 volatile *_Addend, __int64 _Value);
-__int64 _InterlockedExchangeSub64(__int64 volatile *_Subend, __int64 _Value);
-__int64 _InterlockedIncrement64(__int64 volatile *_Addend);
-__int64 _InterlockedOr64(__int64 volatile *_Value, __int64 _Mask);
-__int64 _InterlockedXor64(__int64 volatile *_Value, __int64 _Mask);
-__int64 _InterlockedAnd64(__int64 volatile *_Value, __int64 _Mask);
-
-#endif
-
-/*----------------------------------------------------------------------------*\
-|* Interlocked Exchange Add
-\*----------------------------------------------------------------------------*/
-#if defined(__arm__) || defined(__aarch64__)
-char _InterlockedExchangeAdd8_acq(char volatile *_Addend, char _Value);
-char _InterlockedExchangeAdd8_nf(char volatile *_Addend, char _Value);
-char _InterlockedExchangeAdd8_rel(char volatile *_Addend, char _Value);
-short _InterlockedExchangeAdd16_acq(short volatile *_Addend, short _Value);
-short _InterlockedExchangeAdd16_nf(short volatile *_Addend, short _Value);
-short _InterlockedExchangeAdd16_rel(short volatile *_Addend, short _Value);
-long _InterlockedExchangeAdd_acq(long volatile *_Addend, long _Value);
-long _InterlockedExchangeAdd_nf(long volatile *_Addend, long _Value);
-long _InterlockedExchangeAdd_rel(long volatile *_Addend, long _Value);
-__int64 _InterlockedExchangeAdd64_acq(__int64 volatile *_Addend, __int64 _Value);
-__int64 _InterlockedExchangeAdd64_nf(__int64 volatile *_Addend, __int64 _Value);
-__int64 _InterlockedExchangeAdd64_rel(__int64 volatile *_Addend, __int64 _Value);
-#endif
-/*----------------------------------------------------------------------------*\
-|* Interlocked Increment
-\*----------------------------------------------------------------------------*/
-#if defined(__arm__) || defined(__aarch64__)
-short _InterlockedIncrement16_acq(short volatile *_Value);
-short _InterlockedIncrement16_nf(short volatile *_Value);
-short _InterlockedIncrement16_rel(short volatile *_Value);
-long _InterlockedIncrement_acq(long volatile *_Value);
-long _InterlockedIncrement_nf(long volatile *_Value);
-long _InterlockedIncrement_rel(long volatile *_Value);
-__int64 _InterlockedIncrement64_acq(__int64 volatile *_Value);
-__int64 _InterlockedIncrement64_nf(__int64 volatile *_Value);
-__int64 _InterlockedIncrement64_rel(__int64 volatile *_Value);
-#endif
-/*----------------------------------------------------------------------------*\
-|* Interlocked Decrement
-\*----------------------------------------------------------------------------*/
-#if defined(__arm__) || defined(__aarch64__)
-short _InterlockedDecrement16_acq(short volatile *_Value);
-short _InterlockedDecrement16_nf(short volatile *_Value);
-short _InterlockedDecrement16_rel(short volatile *_Value);
-long _InterlockedDecrement_acq(long volatile *_Value);
-long _InterlockedDecrement_nf(long volatile *_Value);
-long _InterlockedDecrement_rel(long volatile *_Value);
-__int64 _InterlockedDecrement64_acq(__int64 volatile *_Value);
-__int64 _InterlockedDecrement64_nf(__int64 volatile *_Value);
-__int64 _InterlockedDecrement64_rel(__int64 volatile *_Value);
-#endif
-/*----------------------------------------------------------------------------*\
-|* Interlocked And
-\*----------------------------------------------------------------------------*/
-#if defined(__arm__) || defined(__aarch64__)
-char _InterlockedAnd8_acq(char volatile *_Value, char _Mask);
-char _InterlockedAnd8_nf(char volatile *_Value, char _Mask);
-char _InterlockedAnd8_rel(char volatile *_Value, char _Mask);
-short _InterlockedAnd16_acq(short volatile *_Value, short _Mask);
-short _InterlockedAnd16_nf(short volatile *_Value, short _Mask);
-short _InterlockedAnd16_rel(short volatile *_Value, short _Mask);
-long _InterlockedAnd_acq(long volatile *_Value, long _Mask);
-long _InterlockedAnd_nf(long volatile *_Value, long _Mask);
-long _InterlockedAnd_rel(long volatile *_Value, long _Mask);
-__int64 _InterlockedAnd64_acq(__int64 volatile *_Value, __int64 _Mask);
-__int64 _InterlockedAnd64_nf(__int64 volatile *_Value, __int64 _Mask);
-__int64 _InterlockedAnd64_rel(__int64 volatile *_Value, __int64 _Mask);
-#endif
-/*----------------------------------------------------------------------------*\
-|* Bit Counting and Testing
-\*----------------------------------------------------------------------------*/
-#if defined(__arm__) || defined(__aarch64__)
-unsigned char _interlockedbittestandset_acq(long volatile *_BitBase,
-                                            long _BitPos);
-unsigned char _interlockedbittestandset_nf(long volatile *_BitBase,
-                                           long _BitPos);
-unsigned char _interlockedbittestandset_rel(long volatile *_BitBase,
-                                            long _BitPos);
-unsigned char _interlockedbittestandreset_acq(long volatile *_BitBase,
-                                              long _BitPos);
-unsigned char _interlockedbittestandreset_nf(long volatile *_BitBase,
-                                             long _BitPos);
-unsigned char _interlockedbittestandreset_rel(long volatile *_BitBase,
-                                              long _BitPos);
-#endif
-/*----------------------------------------------------------------------------*\
-|* Interlocked Or
-\*----------------------------------------------------------------------------*/
-#if defined(__arm__) || defined(__aarch64__)
-char _InterlockedOr8_acq(char volatile *_Value, char _Mask);
-char _InterlockedOr8_nf(char volatile *_Value, char _Mask);
-char _InterlockedOr8_rel(char volatile *_Value, char _Mask);
-short _InterlockedOr16_acq(short volatile *_Value, short _Mask);
-short _InterlockedOr16_nf(short volatile *_Value, short _Mask);
-short _InterlockedOr16_rel(short volatile *_Value, short _Mask);
-long _InterlockedOr_acq(long volatile *_Value, long _Mask);
-long _InterlockedOr_nf(long volatile *_Value, long _Mask);
-long _InterlockedOr_rel(long volatile *_Value, long _Mask);
-__int64 _InterlockedOr64_acq(__int64 volatile *_Value, __int64 _Mask);
-__int64 _InterlockedOr64_nf(__int64 volatile *_Value, __int64 _Mask);
-__int64 _InterlockedOr64_rel(__int64 volatile *_Value, __int64 _Mask);
-#endif
-/*----------------------------------------------------------------------------*\
-|* Interlocked Xor
-\*----------------------------------------------------------------------------*/
-#if defined(__arm__) || defined(__aarch64__)
-char _InterlockedXor8_acq(char volatile *_Value, char _Mask);
-char _InterlockedXor8_nf(char volatile *_Value, char _Mask);
-char _InterlockedXor8_rel(char volatile *_Value, char _Mask);
-short _InterlockedXor16_acq(short volatile *_Value, short _Mask);
-short _InterlockedXor16_nf(short volatile *_Value, short _Mask);
-short _InterlockedXor16_rel(short volatile *_Value, short _Mask);
-long _InterlockedXor_acq(long volatile *_Value, long _Mask);
-long _InterlockedXor_nf(long volatile *_Value, long _Mask);
-long _InterlockedXor_rel(long volatile *_Value, long _Mask);
-__int64 _InterlockedXor64_acq(__int64 volatile *_Value, __int64 _Mask);
-__int64 _InterlockedXor64_nf(__int64 volatile *_Value, __int64 _Mask);
-__int64 _InterlockedXor64_rel(__int64 volatile *_Value, __int64 _Mask);
-#endif
-/*----------------------------------------------------------------------------*\
-|* Interlocked Exchange
-\*----------------------------------------------------------------------------*/
-#if defined(__arm__) || defined(__aarch64__)
-char _InterlockedExchange8_acq(char volatile *_Target, char _Value);
-char _InterlockedExchange8_nf(char volatile *_Target, char _Value);
-char _InterlockedExchange8_rel(char volatile *_Target, char _Value);
-short _InterlockedExchange16_acq(short volatile *_Target, short _Value);
-short _InterlockedExchange16_nf(short volatile *_Target, short _Value);
-short _InterlockedExchange16_rel(short volatile *_Target, short _Value);
-long _InterlockedExchange_acq(long volatile *_Target, long _Value);
-long _InterlockedExchange_nf(long volatile *_Target, long _Value);
-long _InterlockedExchange_rel(long volatile *_Target, long _Value);
-__int64 _InterlockedExchange64_acq(__int64 volatile *_Target, __int64 _Value);
-__int64 _InterlockedExchange64_nf(__int64 volatile *_Target, __int64 _Value);
-__int64 _InterlockedExchange64_rel(__int64 volatile *_Target, __int64 _Value);
-#endif
-/*----------------------------------------------------------------------------*\
-|* Interlocked Compare Exchange
-\*----------------------------------------------------------------------------*/
-#if defined(__arm__) || defined(__aarch64__)
-char _InterlockedCompareExchange8_acq(char volatile *_Destination,
-                             char _Exchange, char _Comparand);
-char _InterlockedCompareExchange8_nf(char volatile *_Destination,
-                             char _Exchange, char _Comparand);
-char _InterlockedCompareExchange8_rel(char volatile *_Destination,
-                             char _Exchange, char _Comparand);
-short _InterlockedCompareExchange16_acq(short volatile *_Destination,
-                              short _Exchange, short _Comparand);
-short _InterlockedCompareExchange16_nf(short volatile *_Destination,
-                              short _Exchange, short _Comparand);
-short _InterlockedCompareExchange16_rel(short volatile *_Destination,
-                              short _Exchange, short _Comparand);
-long _InterlockedCompareExchange_acq(long volatile *_Destination,
-                              long _Exchange, long _Comparand);
-long _InterlockedCompareExchange_nf(long volatile *_Destination,
-                              long _Exchange, long _Comparand);
-long _InterlockedCompareExchange_rel(long volatile *_Destination,
-                              long _Exchange, long _Comparand);
-__int64 _InterlockedCompareExchange64_acq(__int64 volatile *_Destination,
-                              __int64 _Exchange, __int64 _Comparand);
-__int64 _InterlockedCompareExchange64_nf(__int64 volatile *_Destination,
-                              __int64 _Exchange, __int64 _Comparand);
-__int64 _InterlockedCompareExchange64_rel(__int64 volatile *_Destination,
-                              __int64 _Exchange, __int64 _Comparand);
-#endif
-#if defined(__x86_64__) || defined(__aarch64__)
-unsigned char _InterlockedCompareExchange128(__int64 volatile *_Destination,
-                                             __int64 _ExchangeHigh,
-                                             __int64 _ExchangeLow,
-                                             __int64 *_ComparandResult);
-#endif
-#if defined(__aarch64__)
-unsigned char _InterlockedCompareExchange128_acq(__int64 volatile *_Destination,
-                                                 __int64 _ExchangeHigh,
-                                                 __int64 _ExchangeLow,
-                                                 __int64 *_ComparandResult);
-unsigned char _InterlockedCompareExchange128_nf(__int64 volatile *_Destination,
-                                                __int64 _ExchangeHigh,
-                                                __int64 _ExchangeLow,
-                                                __int64 *_ComparandResult);
-unsigned char _InterlockedCompareExchange128_rel(__int64 volatile *_Destination,
-                                                 __int64 _ExchangeHigh,
-                                                 __int64 _ExchangeLow,
-                                                 __int64 *_ComparandResult);
-#endif
-
 /*----------------------------------------------------------------------------*\
 |* movs, stos
 \*----------------------------------------------------------------------------*/
@@ -583,8 +374,6 @@ unsigned int _CountLeadingOnes(unsigned long);
 unsigned int _CountLeadingOnes64(unsigned __int64);
 unsigned int _CountLeadingSigns(long);
 unsigned int _CountLeadingSigns64(__int64);
-unsigned int _CountLeadingZeros(unsigned long);
-unsigned int _CountLeadingZeros64(unsigned _int64);
 unsigned int _CountOneBits(unsigned long);
 unsigned int _CountOneBits64(unsigned __int64);
 
diff --git a/clang/lib/Headers/intrin0.h b/clang/lib/Headers/intrin0.h
new file mode 100644
index 00000000000000..31f362ec84d5c5
--- /dev/null
+++ b/clang/lib/Headers/intrin0.h
@@ -0,0 +1,247 @@
+/* ===-------- intrin.h ---------------------------------------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+/* Only include this if we're compiling for the windows platform. */
+#ifndef _MSC_VER
+#include_next <intrin0.h>
+#else
+
+#ifndef __INTRIN0_H
+#define __INTRIN0_H
+
+#ifdef __x86_64__
+#include <adcintrin.h>
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+unsigned char _BitScanForward(unsigned long *_Index, unsigned long _Mask);
+unsigned char _BitScanReverse(unsigned long *_Index, unsigned long _Mask);
+void _ReadWriteBarrier(void);
+
+#if defined(__aarch64__)
+unsigned int _CountLeadingZeros(unsigned long);
+unsigned int _CountLeadingZeros64(unsigned _int64);
+unsigned char _InterlockedCompareExchange128_acq(__int64 volatile *_Destination,
+                                                 __int64 _ExchangeHigh,
+                                                 __int64 _ExchangeLow,
+                                                 __int64 *_ComparandResult);
+unsigned char _InterlockedCompareExchange128_nf(__int64 volatile *_Destination,
+                                                __int64 _ExchangeHigh,
+                                                __int64 _ExchangeLow,
+                                                __int64 *_ComparandResult);
+unsigned char _InterlockedCompareExchange128_rel(__int64 volatile *_Destination,
+                                                 __int64 _ExchangeHigh,
+                                                 __int64 _ExchangeLow,
+                                                 __int64 *_ComparandResult);
+#endif
+
+#ifdef __x86_64__
+unsigned __int64 _umul128(unsigned __int64, unsigned __int64,
+                          unsigned __int64 *);
+unsigned __int64 __shiftleft128(unsigned __int64 _LowPart,
+                                unsigned __int64 _HighPart,
+                                unsigned char _Shift);
+unsigned __int64 __shiftright128(unsigned __int64 _LowPart,
+                                 unsigned __int64 _HighPart,
+                                 unsigned char _Shift);
+#endif
+
+#if defined(__x86_64__) || defined(__i386__)
+void _mm_pause(void);
+#endif
+
+#if defined(__x86_64__) || defined(__aarch64__)
+unsigned char _InterlockedCompareExchange128(__int64 volatile *_Destination,
+                                             __int64 _ExchangeHigh,
+                                             __int64 _ExchangeLow,
+                                             __int64 *_ComparandResult);
+#endif
+
+#if defined(__x86_64__) || defined(__arm__) || defined(__aarch64__)
+unsigned char _BitScanForward64(unsigned long *_Index, unsigned __int64 _Mask);
+unsigned char _BitScanReverse64(unsigned long *_Index, unsigned __int64 _Mask);
+#endif
+
+#if defined(__i386__) || defined(__x86_64__) || defined(__arm__) ||            \
+    defined(__aarch64__)
+__int64 _InterlockedDecrement64(__int64 volatile *_Addend);
+__int64 _InterlockedExchange64(__int64 volatile *_Target, __int64 _Value);
+__int64 _InterlockedExchangeAdd64(__int64 volatile *_Addend, __int64 _Value);
+__int64 _InterlockedExchangeSub64(__int64 volatile *_Subend, __int64 _Value);
+__int64 _InterlockedIncrement64(__int64 volatile *_Addend);
+__int64 _InterlockedOr64(__int64 volatile *_Value, __int64 _Mask);
+__int64 _InterlockedXor64(__int64 volatile *_Value, __int64 _Mask);
+__int64 _InterlockedAnd64(__int64 volatile *_Value, __int64 _Mask);
+#endif
+
+#if defined(__arm__) || defined(__aarch64__)
+/*----------------------------------------------------------------------------*\
+|* Interlocked Exchange Add
+\*----------------------------------------------------------------------------*/
+char _InterlockedExchangeAdd8_acq(char volatile *_Addend, char _Value);
+char _InterlockedExchangeAdd8_nf(char volatile *_Addend, char _Value);
+char _InterlockedExchangeAdd8_rel(char volatile *_Addend, char _Value);
+short _InterlockedExchangeAdd16_acq(short volatile *_Addend, short _Value);
+short _InterlockedExchangeAdd16_nf(short volatile *_Addend, short _Value);
+short _InterlockedExchangeAdd16_rel(short volatile *_Addend, short _Value);
+long _InterlockedExchangeAdd_acq(long volatile *_Addend, long _Value);
+long _InterlockedExchangeAdd_nf(long volatile *_Addend, long _Value);
+long _InterlockedExchangeAdd_rel(long volatile *_Addend, long _Value);
+__int64 _InterlockedExchangeAdd64_acq(__int64 volatile *_Addend,
+                                      __int64 _Value);
+__int64 _InterlockedExchangeAdd64_nf(__int64 volatile *_Addend, __int64 _Value);
+__int64 _InterlockedExchangeAdd64_rel(__int64 volatile *_Addend,
+                                      __int64 _Value);
+
+/*----------------------------------------------------------------------------*\
+|* Interlocked Increment
+\*----------------------------------------------------------------------------*/
+short _InterlockedIncrement16_acq(short volatile *_Value);
+short _InterlockedIncrement16_nf(short volatile *_Value);
+short _InterlockedIncrement16_rel(short volatile *_Value);
+long _InterlockedIncrement_acq(long volatile *_Value);
+long _InterlockedIncrement_nf(long volatile *_Value);
+long _InterlockedIncrement_rel(long volatile *_Value);
+__int64 _InterlockedIncrement64_acq(__int64 volatile *_Value);
+__int64 _InterlockedIncrement64_nf(__int64 volatile *_Value);
+__int64 _InterlockedIncrement64_rel(__int64 volatile *_Value);
+
+/*----------------------------------------------------------------------------*\
+|* Interlocked Decrement
+\*----------------------------------------------------------------------------*/
+short _InterlockedDecrement16_acq(short volatile *_Value);
+short _InterlockedDecrement16_nf(short volatile *_Value);
+short _InterlockedDecrement16_rel(short volatile *_Value);
+long _InterlockedDecrement_acq(long volatile *_Value);
+long _InterlockedDecrement_nf(long volatile *_Value);
+long _InterlockedDecrement_rel(long volatile *_Value);
+__int64 _InterlockedDecrement64_acq(__int64 volatile *_Value);
+__int64 _InterlockedDecrement64_nf(__int64 volatile *_Value);
+__int64 _InterlockedDecrement64_rel(__int64 volatile *_Value);
+
+/*----------------------------------------------------------------------------*\
+|* Interlocked And
+\*----------------------------------------------------------------------------*/
+char _InterlockedAnd8_acq(char volatile *_Value, char _Mask);
+char _InterlockedAnd8_nf(char volatile *_Value, char _Mask);
+char _InterlockedAnd8_rel(char volatile *_Value, char _Mask);
+short _InterlockedAnd16_acq(short volatile *_Value, short _Mask);
+short _InterlockedAnd16_nf(short volatile *_Value, short _Mask);
+short _InterlockedAnd16_rel(short volatile *_Value, short _Mask);
+long _InterlockedAnd_acq(long volatile *_Value, long _Mask);
+long _InterlockedAnd_nf(long volatile *_Value, long _Mask);
+long _InterlockedAnd_rel(long volatile *_Value, long _Mask);
+__int64 _InterlockedAnd64_acq(__int64 volatile *_Value, __int64 _Mask);
+__int64 _InterlockedAnd64_nf(__int64 volatile *_Value, __int64 _Mask);
+__int64 _InterlockedAnd64_rel(__int64 volatile *_Value, __int64 _Mask);
+
+/*----------------------------------------------------------------------------*\
+|* Bit Counting and Testing
+\*----------------------------------------------------------------------------*/
+unsigned char _interlockedbittestandset_acq(long volatile *_BitBase,
+                                            long _BitPos);
+unsigned char _interlockedbittestandset_nf(long volatile *_BitBase,
+                                           long _BitPos);
+unsigned char _interlockedbittestandset_rel(long volatile *_BitBase,
+                                            long _BitPos);
+unsigned char _interlockedbittestandreset_acq(long volatile *_BitBase,
+                                              long _BitPos);
+unsigned char _interlockedbittestandreset_nf(long volatile *_BitBase,
+                                             long _BitPos);
+unsigned char _interlockedbittestandreset_rel(long volatile *_BitBase,
+                                              long _BitPos);
+
+/*----------------------------------------------------------------------------*\
+|* Interlocked Or
+\*----------------------------------------------------------------------------*/
+char _InterlockedOr8_acq(char volatile *_Value, char _Mask);
+char _InterlockedOr8_nf(char volatile *_Value, char _Mask);
+char _InterlockedOr8_rel(char volatile *_Value, char _Mask);
+short _InterlockedOr16_acq(short volatile *_Value, short _Mask);
+short _InterlockedOr16_nf(short volatile *_Value, short _Mask);
+short _InterlockedOr16_rel(short volatile *_Value, short _Mask);
+long _InterlockedOr_acq(long volatile *_Value, long _Mask);
+long _InterlockedOr_nf(long volatile *_Value, long _Mask);
+long _InterlockedOr_rel(long volatile *_Value, long _Mask);
+__int64 _InterlockedOr64_acq(__int64 volatile *_Value, __int64 _Mask);
+__int64 _InterlockedOr64_nf(__int64 volatile *_Value, __int64 _Mask);
+__int64 _InterlockedOr64_rel(__int64 volatile *_Value, __int64 _Mask);
+
+/*----------------------------------------------------------------------------*\
+|* Interlocked Xor
+\*----------------------------------------------------------------------------*/
+char _InterlockedXor8_acq(char volatile *_Value, char _Mask);
+char _InterlockedXor8_nf(char volatile *_Value, char _Mask);
+char _InterlockedXor8_rel(char volatile *_Value, char _Mask);
+short _InterlockedXor16_acq(short volatile *_Value, short _Mask);
+short _InterlockedXor16_nf(short volatile *_Value, short _Mask);
+short _InterlockedXor16_rel(short volatile *_Value, short _Mask);
+long _InterlockedXor_acq(long volatile *_Value, long _Mask);
+long _InterlockedXor_nf(long volatile *_Value, long _Mask);
+long _InterlockedXor_rel(long volatile *_Value, long _Mask);
+__int64 _InterlockedXor64_acq(__int64 volatile *_Value, __int64 _Mask);
+__int64 _InterlockedXor64_nf(__int64 volatile *_Value, __int64 _Mask);
+__int64 _InterlockedXor64_rel(__int64 volatile *_Value, __int64 _Mask);
+
+/*----------------------------------------------------------------------------*\
+|* Interlocked Exchange
+\*----------------------------------------------------------------------------*/
+char _InterlockedExchange8_acq(char volatile *_Target, char _Value);
+char _InterlockedExchange8_nf(char volatile *_Target, char _Value);
+char _InterlockedExchange8_rel(char volatile *_Target, char _Value);
+short _InterlockedExchange16_acq(short volatile *_Target, short _Value);
+short _InterlockedExchange16_nf(short volatile *_Target, short _Value);
+short _InterlockedExchange16_rel(short volatile *_Target, short _Value);
+long _InterlockedExchange_acq(long volatile *_Target, long _Value);
+long _InterlockedExchange_nf(long volatile *_Target, long _Value);
+long _InterlockedExchange_rel(long volatile *_Target, long _Value);
+__int64 _InterlockedExchange64_acq(__int64 volatile *_Target, __int64 _Value);
+__int64 _InterlockedExchange64_nf(__int64 volatile *_Target, __int64 _Value);
+__int64 _InterlockedExchange64_rel(__int64 volatile *_Target, __int64 _Value);
+
+/*----------------------------------------------------------------------------*\
+|* Interlocked Compare Exchange
+\*----------------------------------------------------------------------------*/
+char _InterlockedCompareExchange8_acq(char volatile *_Destination,
+                                      char _Exchange, char _Comparand);
+char _InterlockedCompareExchange8_nf(char volatile *_Destination,
+                                     char _Exchange, char _Comparand);
+char _InterlockedCompareExchange8_rel(char volatile *_Destination,
+                                      char _Exchange, char _Comparand);
+short _InterlockedCompareExchange16_acq(short volatile *_Destination,
+                                        short _Exchange, short _Comparand);
+short _InterlockedCompareExchange16_nf(short volatile *_Destination,
+                                       short _Exchange, short _Comparand);
+short _InterlockedCompareExchange16_rel(short volatile *_Destination,
+                                        short _Exchange, short _Comparand);
+long _InterlockedCompareExchange_acq(long volatile *_Destination,
+                                     long _Exchange, long _Comparand);
+long _InterlockedCompareExchange_nf(long volatile *_Destination, long _Exchange,
+                                    long _Comparand);
+long _InterlockedCompareExchange_rel(long volatile *_Destination,
+                                     long _Exchange, long _Comparand);
+__int64 _InterlockedCompareExchange64_acq(__int64 volatile *_Destination,
+                                          __int64 _Exchange,
+                                          __int64 _Comparand);
+__int64 _InterlockedCompareExchange64_nf(__int64 volatile *_Destination,
+                                         __int64 _Exchange, __int64 _Comparand);
+__int64 _InterlockedCompareExchange64_rel(__int64 volatile *_Destination,
+                                          __int64 _Exchange,
+                                          __int64 _Comparand);
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __INTRIN0_H */
+#endif /* _MSC_VER */
diff --git a/clang/lib/Headers/keylockerintrin.h b/clang/lib/Headers/keylockerintrin.h
index 1994ac42070ad3..f76e91b4d4b306 100644
--- a/clang/lib/Headers/keylockerintrin.h
+++ b/clang/lib/Headers/keylockerintrin.h
@@ -28,8 +28,7 @@
 #ifndef _KEYLOCKERINTRIN_H
 #define _KEYLOCKERINTRIN_H
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__KL__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__KL__)
 
 /* Define the default attributes for the functions in this file. */
 #define __DEFAULT_FN_ATTRS \
@@ -327,11 +326,9 @@ _mm_aesdec256kl_u8(__m128i* __odata, __m128i __idata, const void *__h) {
 
 #undef __DEFAULT_FN_ATTRS
 
-#endif /* !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) \
-          || defined(__KL__) */
+#endif /* !defined(__SCE__ || __has_feature(modules) || defined(__KL__) */
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__WIDEKL__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__WIDEKL__)
 
 /* Define the default attributes for the functions in this file. */
 #define __DEFAULT_FN_ATTRS \
@@ -524,7 +521,7 @@ _mm_aesdecwide256kl_u8(__m128i __odata[8], const __m128i __idata[8], const void*
 
 #undef __DEFAULT_FN_ATTRS
 
-#endif /* !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) \
-          || defined(__WIDEKL__) */
+#endif /* !defined(__SCE__) || __has_feature(modules) || defined(__WIDEKL__)   \
+        */
 
 #endif /* _KEYLOCKERINTRIN_H */
diff --git a/clang/lib/Headers/module.modulemap b/clang/lib/Headers/module.modulemap
index 56a13f69bc0559..8741968fa7f364 100644
--- a/clang/lib/Headers/module.modulemap
+++ b/clang/lib/Headers/module.modulemap
@@ -315,3 +315,8 @@ module opencl_c {
   header "opencl-c.h"
   header "opencl-c-base.h"
 }
+
+module ptrauth {
+  header "ptrauth.h"
+  export *
+}
diff --git a/clang/lib/Headers/ptrauth.h b/clang/lib/Headers/ptrauth.h
new file mode 100644
index 00000000000000..56c3c3636c9bcd
--- /dev/null
+++ b/clang/lib/Headers/ptrauth.h
@@ -0,0 +1,185 @@
+/*===---- ptrauth.h - Pointer authentication -------------------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __PTRAUTH_H
+#define __PTRAUTH_H
+
+typedef enum {
+  ptrauth_key_asia = 0,
+  ptrauth_key_asib = 1,
+  ptrauth_key_asda = 2,
+  ptrauth_key_asdb = 3,
+} ptrauth_key;
+
+/* An integer type of the appropriate size for a discriminator argument. */
+typedef __UINTPTR_TYPE__ ptrauth_extra_data_t;
+
+/* An integer type of the appropriate size for a generic signature. */
+typedef __UINTPTR_TYPE__ ptrauth_generic_signature_t;
+
+/* A signed pointer value embeds the original pointer together with
+   a signature that attests to the validity of that pointer.  Because
+   this signature must use only "spare" bits of the pointer, a
+   signature's validity is probabilistic in practice: it is unlikely
+   but still plausible that an invalidly-derived signature will
+   somehow equal the correct signature and therefore successfully
+   authenticate.  Nonetheless, this scheme provides a strong degree
+   of protection against certain kinds of attacks. */
+
+/* Authenticating a pointer that was not signed with the given key
+   and extra-data value will (likely) fail by trapping. */
+
+#if __has_feature(ptrauth_intrinsics)
+
+/* Strip the signature from a value without authenticating it.
+
+   If the value is a function pointer, the result will not be a
+   legal function pointer because of the missing signature, and
+   attempting to call it will result in an authentication failure.
+
+   The value must be an expression of pointer type.
+   The key must be a constant expression of type ptrauth_key.
+   The result will have the same type as the original value. */
+#define ptrauth_strip(__value, __key) __builtin_ptrauth_strip(__value, __key)
+
+/* Blend a constant discriminator into the given pointer-like value
+   to form a new discriminator.  Not all bits of the inputs are
+   guaranteed to contribute to the result.
+
+   On arm64e, the integer must fall within the range of a uint16_t;
+   other bits may be ignored.
+
+   The first argument must be an expression of pointer type.
+   The second argument must be an expression of integer type.
+   The result will have type uintptr_t. */
+#define ptrauth_blend_discriminator(__pointer, __integer)                      \
+  __builtin_ptrauth_blend_discriminator(__pointer, __integer)
+
+/* Add a signature to the given pointer value using a specific key,
+   using the given extra data as a salt to the signing process.
+
+   This operation does not authenticate the original value and is
+   therefore potentially insecure if an attacker could possibly
+   control that value.
+
+   The value must be an expression of pointer type.
+   The key must be a constant expression of type ptrauth_key.
+   The extra data must be an expression of pointer or integer type;
+   if an integer, it will be coerced to ptrauth_extra_data_t.
+   The result will have the same type as the original value. */
+#define ptrauth_sign_unauthenticated(__value, __key, __data)                   \
+  __builtin_ptrauth_sign_unauthenticated(__value, __key, __data)
+
+/* Authenticate a pointer using one scheme and resign it using another.
+
+   If the result is subsequently authenticated using the new scheme, that
+   authentication is guaranteed to fail if and only if the initial
+   authentication failed.
+
+   The value must be an expression of pointer type.
+   The key must be a constant expression of type ptrauth_key.
+   The extra data must be an expression of pointer or integer type;
+   if an integer, it will be coerced to ptrauth_extra_data_t.
+   The result will have the same type as the original value.
+
+   This operation is guaranteed to not leave the intermediate value
+   available for attack before it is re-signed.
+
+   Do not pass a null pointer to this function. A null pointer
+   will not successfully authenticate.
+
+   This operation traps if the authentication fails. */
+#define ptrauth_auth_and_resign(__value, __old_key, __old_data, __new_key,     \
+                                __new_data)                                    \
+  __builtin_ptrauth_auth_and_resign(__value, __old_key, __old_data, __new_key, \
+                                    __new_data)
+
+/* Authenticate a data pointer.
+
+   The value must be an expression of non-function pointer type.
+   The key must be a constant expression of type ptrauth_key.
+   The extra data must be an expression of pointer or integer type;
+   if an integer, it will be coerced to ptrauth_extra_data_t.
+   The result will have the same type as the original value.
+
+   This operation traps if the authentication fails. */
+#define ptrauth_auth_data(__value, __old_key, __old_data)                      \
+  __builtin_ptrauth_auth(__value, __old_key, __old_data)
+
+/* Compute a signature for the given pair of pointer-sized values.
+   The order of the arguments is significant.
+
+   Like a pointer signature, the resulting signature depends on
+   private key data and therefore should not be reliably reproducible
+   by attackers.  That means that this can be used to validate the
+   integrity of arbitrary data by storing a signature for that data
+   alongside it, then checking that the signature is still valid later.
+   Data which exceeds two pointers in size can be signed by either
+   computing a tree of generic signatures or just signing an ordinary
+   cryptographic hash of the data.
+
+   The result has type ptrauth_generic_signature_t.  However, it may
+   not have as many bits of entropy as that type's width would suggest;
+   some implementations are known to compute a compressed signature as
+   if the arguments were a pointer and a discriminator.
+
+   The arguments must be either pointers or integers; if integers, they
+   will be coerce to uintptr_t. */
+#define ptrauth_sign_generic_data(__value, __data)                             \
+  __builtin_ptrauth_sign_generic_data(__value, __data)
+
+#else
+
+#define ptrauth_strip(__value, __key)                                          \
+  ({                                                                           \
+    (void)__key;                                                               \
+    __value;                                                                   \
+  })
+
+#define ptrauth_blend_discriminator(__pointer, __integer)                      \
+  ({                                                                           \
+    (void)__pointer;                                                           \
+    (void)__integer;                                                           \
+    ((ptrauth_extra_data_t)0);                                                 \
+  })
+
+#define ptrauth_sign_unauthenticated(__value, __key, __data)                   \
+  ({                                                                           \
+    (void)__key;                                                               \
+    (void)__data;                                                              \
+    __value;                                                                   \
+  })
+
+#define ptrauth_auth_and_resign(__value, __old_key, __old_data, __new_key,     \
+                                __new_data)                                    \
+  ({                                                                           \
+    (void)__old_key;                                                           \
+    (void)__old_data;                                                          \
+    (void)__new_key;                                                           \
+    (void)__new_data;                                                          \
+    __value;                                                                   \
+  })
+
+#define ptrauth_auth_data(__value, __old_key, __old_data)                      \
+  ({                                                                           \
+    (void)__old_key;                                                           \
+    (void)__old_data;                                                          \
+    __value;                                                                   \
+  })
+
+#define ptrauth_sign_generic_data(__value, __data)                             \
+  ({                                                                           \
+    (void)__value;                                                             \
+    (void)__data;                                                              \
+    ((ptrauth_generic_signature_t)0);                                          \
+  })
+
+#endif /* __has_feature(ptrauth_intrinsics) */
+
+#endif /* __PTRAUTH_H */
diff --git a/clang/lib/Headers/x86gprintrin.h b/clang/lib/Headers/x86gprintrin.h
index ed141879fbc744..3d5cc606d7e63d 100644
--- a/clang/lib/Headers/x86gprintrin.h
+++ b/clang/lib/Headers/x86gprintrin.h
@@ -10,38 +10,31 @@
 #ifndef __X86GPRINTRIN_H
 #define __X86GPRINTRIN_H
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__HRESET__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__HRESET__)
 #include <hresetintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__UINTR__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__UINTR__)
 #include <uintrintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__USERMSR__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__USERMSR__)
 #include <usermsrintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__CRC32__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__CRC32__)
 #include <crc32intrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__PRFCHI__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__PRFCHI__)
 #include <prfchiintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__RAOINT__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__RAOINT__)
 #include <raointintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__CMPCCXADD__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__CMPCCXADD__)
 #include <cmpccxaddintrin.h>
 #endif
 
diff --git a/clang/lib/Headers/x86intrin.h b/clang/lib/Headers/x86intrin.h
index 450fd008dab95b..c20bfbb8fe46e2 100644
--- a/clang/lib/Headers/x86intrin.h
+++ b/clang/lib/Headers/x86intrin.h
@@ -14,53 +14,43 @@
 
 #include <immintrin.h>
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__3dNOW__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__3dNOW__)
 #include <mm3dnow.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__PRFCHW__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__PRFCHW__)
 #include <prfchwintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__SSE4A__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__SSE4A__)
 #include <ammintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__FMA4__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__FMA4__)
 #include <fma4intrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__XOP__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__XOP__)
 #include <xopintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__TBM__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__TBM__)
 #include <tbmintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__LWP__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__LWP__)
 #include <lwpintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__MWAITX__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__MWAITX__)
 #include <mwaitxintrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__CLZERO__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__CLZERO__)
 #include <clzerointrin.h>
 #endif
 
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__RDPRU__)
+#if !defined(__SCE__) || __has_feature(modules) || defined(__RDPRU__)
 #include <rdpruintrin.h>
 #endif
 
diff --git a/clang/lib/Headers/xmmintrin.h b/clang/lib/Headers/xmmintrin.h
index 8e386a72cde789..b2c68c3b7be976 100644
--- a/clang/lib/Headers/xmmintrin.h
+++ b/clang/lib/Headers/xmmintrin.h
@@ -1333,6 +1333,10 @@ _mm_ucomineq_ss(__m128 __a, __m128 __b)
 /// Converts a float value contained in the lower 32 bits of a vector of
 ///    [4 x float] into a 32-bit integer.
 ///
+///    If the converted value does not fit in a 32-bit integer, raises a
+///    floating-point invalid exception. If the exception is masked, returns
+///    the most negative integer.
+///
 /// \headerfile <x86intrin.h>
 ///
 /// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
@@ -1351,6 +1355,10 @@ _mm_cvtss_si32(__m128 __a)
 /// Converts a float value contained in the lower 32 bits of a vector of
 ///    [4 x float] into a 32-bit integer.
 ///
+///    If the converted value does not fit in a 32-bit integer, raises a
+///    floating-point invalid exception. If the exception is masked, returns
+///    the most negative integer.
+///
 /// \headerfile <x86intrin.h>
 ///
 /// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
@@ -1371,6 +1379,10 @@ _mm_cvt_ss2si(__m128 __a)
 /// Converts a float value contained in the lower 32 bits of a vector of
 ///    [4 x float] into a 64-bit integer.
 ///
+///    If the converted value does not fit in a 32-bit integer, raises a
+///    floating-point invalid exception. If the exception is masked, returns
+///    the most negative integer.
+///
 /// \headerfile <x86intrin.h>
 ///
 /// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
@@ -1391,6 +1403,10 @@ _mm_cvtss_si64(__m128 __a)
 /// Converts two low-order float values in a 128-bit vector of
 ///    [4 x float] into a 64-bit vector of [2 x i32].
 ///
+///    If a converted value does not fit in a 32-bit integer, raises a
+///    floating-point invalid exception. If the exception is masked, returns
+///    the most negative integer.
+///
 /// \headerfile <x86intrin.h>
 ///
 /// This intrinsic corresponds to the <c> CVTPS2PI </c> instruction.
@@ -1407,6 +1423,10 @@ _mm_cvtps_pi32(__m128 __a)
 /// Converts two low-order float values in a 128-bit vector of
 ///    [4 x float] into a 64-bit vector of [2 x i32].
 ///
+///    If a converted value does not fit in a 32-bit integer, raises a
+///    floating-point invalid exception. If the exception is masked, returns
+///    the most negative integer.
+///
 /// \headerfile <x86intrin.h>
 ///
 /// This intrinsic corresponds to the <c> CVTPS2PI </c> instruction.
@@ -1420,9 +1440,12 @@ _mm_cvt_ps2pi(__m128 __a)
   return _mm_cvtps_pi32(__a);
 }
 
-/// Converts a float value contained in the lower 32 bits of a vector of
-///    [4 x float] into a 32-bit integer, truncating the result when it is
-///    inexact.
+/// Converts the lower (first) element of a vector of [4 x float] into a signed
+///    truncated (rounded toward zero) 32-bit integer.
+///
+///    If the converted value does not fit in a 32-bit integer, raises a
+///    floating-point invalid exception. If the exception is masked, returns
+///    the most negative integer.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -1439,9 +1462,12 @@ _mm_cvttss_si32(__m128 __a)
   return __builtin_ia32_cvttss2si((__v4sf)__a);
 }
 
-/// Converts a float value contained in the lower 32 bits of a vector of
-///    [4 x float] into a 32-bit integer, truncating the result when it is
-///    inexact.
+/// Converts the lower (first) element of a vector of [4 x float] into a signed
+///    truncated (rounded toward zero) 32-bit integer.
+///
+///    If the converted value does not fit in a 32-bit integer, raises a
+///    floating-point invalid exception. If the exception is masked, returns
+///    the most negative integer.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -1459,9 +1485,12 @@ _mm_cvtt_ss2si(__m128 __a)
 }
 
 #ifdef __x86_64__
-/// Converts a float value contained in the lower 32 bits of a vector of
-///    [4 x float] into a 64-bit integer, truncating the result when it is
-///    inexact.
+/// Converts the lower (first) element of a vector of [4 x float] into a signed
+///    truncated (rounded toward zero) 64-bit integer.
+///
+///    If the converted value does not fit in a 64-bit integer, raises a
+///    floating-point invalid exception. If the exception is masked, returns
+///    the most negative integer.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -1479,9 +1508,13 @@ _mm_cvttss_si64(__m128 __a)
 }
 #endif
 
-/// Converts two low-order float values in a 128-bit vector of
-///    [4 x float] into a 64-bit vector of [2 x i32], truncating the result
-///    when it is inexact.
+/// Converts the lower (first) two elements of a 128-bit vector of [4 x float]
+///    into two signed truncated (rounded toward zero) 32-bit integers,
+///    returned in a 64-bit vector of [2 x i32].
+///
+///    If a converted value does not fit in a 32-bit integer, raises a
+///    floating-point invalid exception. If the exception is masked, returns
+///    the most negative integer.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -1497,9 +1530,13 @@ _mm_cvttps_pi32(__m128 __a)
   return (__m64)__builtin_ia32_cvttps2pi((__v4sf)__a);
 }
 
-/// Converts two low-order float values in a 128-bit vector of [4 x
-///    float] into a 64-bit vector of [2 x i32], truncating the result when it
-///    is inexact.
+/// Converts the lower (first) two elements of a 128-bit vector of [4 x float]
+///    into two signed truncated (rounded toward zero) 64-bit integers,
+///    returned in a 64-bit vector of [2 x i32].
+///
+///    If a converted value does not fit in a 32-bit integer, raises a
+///    floating-point invalid exception. If the exception is masked, returns
+///    the most negative integer.
 ///
 /// \headerfile <x86intrin.h>
 ///
diff --git a/clang/lib/Headers/yvals_core.h b/clang/lib/Headers/yvals_core.h
new file mode 100644
index 00000000000000..5ee194a3e5f5f6
--- /dev/null
+++ b/clang/lib/Headers/yvals_core.h
@@ -0,0 +1,25 @@
+//===----- yvals_core.h - Internal MSVC STL core header -------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// Only include this if we are aiming for MSVC compatibility.
+#ifndef _MSC_VER
+#include_next <yvals_core.h>
+#else
+
+#ifndef __clang_yvals_core_h
+#define __clang_yvals_core_h
+
+#include_next <yvals_core.h>
+
+#ifdef _STL_INTRIN_HEADER
+#undef _STL_INTRIN_HEADER
+#define _STL_INTRIN_HEADER <intrin0.h>
+#endif
+
+#endif
+#endif
diff --git a/clang/lib/InstallAPI/CMakeLists.txt b/clang/lib/InstallAPI/CMakeLists.txt
index dc90d6370de418..894db699578f20 100644
--- a/clang/lib/InstallAPI/CMakeLists.txt
+++ b/clang/lib/InstallAPI/CMakeLists.txt
@@ -1,10 +1,12 @@
 set(LLVM_LINK_COMPONENTS
   Support
   TextAPI
+  Demangle
   Core
   )
 
 add_clang_library(clangInstallAPI
+  DylibVerifier.cpp
   FileList.cpp
   Frontend.cpp
   HeaderFile.cpp
diff --git a/clang/lib/InstallAPI/DylibVerifier.cpp b/clang/lib/InstallAPI/DylibVerifier.cpp
new file mode 100644
index 00000000000000..700763b3fee0db
--- /dev/null
+++ b/clang/lib/InstallAPI/DylibVerifier.cpp
@@ -0,0 +1,528 @@
+#include "clang/InstallAPI/DylibVerifier.h"
+#include "clang/InstallAPI/FrontendRecords.h"
+#include "clang/InstallAPI/InstallAPIDiagnostic.h"
+#include "llvm/Demangle/Demangle.h"
+
+using namespace llvm::MachO;
+
+namespace clang {
+namespace installapi {
+
+/// Metadata stored about a mapping of a declaration to a symbol.
+struct DylibVerifier::SymbolContext {
+  // Name to use for printing in diagnostics.
+  std::string PrettyPrintName{""};
+
+  // Name to use for all querying and verification
+  // purposes.
+  std::string SymbolName{""};
+
+  // Kind to map symbol type against record.
+  EncodeKind Kind = EncodeKind::GlobalSymbol;
+
+  // Frontend Attributes tied to the AST.
+  const FrontendAttrs *FA = nullptr;
+
+  // The ObjCInterface symbol type, if applicable.
+  ObjCIFSymbolKind ObjCIFKind = ObjCIFSymbolKind::None;
+
+  // Whether Decl is inlined.
+  bool Inlined = false;
+};
+
+static std::string
+getAnnotatedName(const Record *R, EncodeKind Kind, StringRef Name,
+                 bool ValidSourceLoc = true,
+                 ObjCIFSymbolKind ObjCIF = ObjCIFSymbolKind::None) {
+  assert(!Name.empty() && "Need symbol name for printing");
+
+  std::string Annotation;
+  if (R->isWeakDefined())
+    Annotation += "(weak-def) ";
+  if (R->isWeakReferenced())
+    Annotation += "(weak-ref) ";
+  if (R->isThreadLocalValue())
+    Annotation += "(tlv) ";
+
+  // Check if symbol represents only part of a @interface declaration.
+  const bool IsAnnotatedObjCClass = ((ObjCIF != ObjCIFSymbolKind::None) &&
+                                     (ObjCIF <= ObjCIFSymbolKind::EHType));
+
+  if (IsAnnotatedObjCClass) {
+    if (ObjCIF == ObjCIFSymbolKind::EHType)
+      Annotation += "Exception Type of ";
+    if (ObjCIF == ObjCIFSymbolKind::MetaClass)
+      Annotation += "Metaclass of ";
+    if (ObjCIF == ObjCIFSymbolKind::Class)
+      Annotation += "Class of ";
+  }
+
+  // Only print symbol type prefix or leading "_" if there is no source location
+  // tied to it. This can only ever happen when the location has to come from
+  // debug info.
+  if (ValidSourceLoc) {
+    if ((Kind == EncodeKind::GlobalSymbol) && Name.starts_with("_"))
+      return Annotation + Name.drop_front(1).str();
+    return Annotation + Name.str();
+  }
+
+  if (IsAnnotatedObjCClass)
+    return Annotation + Name.str();
+
+  switch (Kind) {
+  case EncodeKind::GlobalSymbol:
+    return Annotation + Name.str();
+  case EncodeKind::ObjectiveCInstanceVariable:
+    return Annotation + "(ObjC IVar) " + Name.str();
+  case EncodeKind::ObjectiveCClass:
+    return Annotation + "(ObjC Class) " + Name.str();
+  case EncodeKind::ObjectiveCClassEHType:
+    return Annotation + "(ObjC Class EH) " + Name.str();
+  }
+
+  llvm_unreachable("unexpected case for EncodeKind");
+}
+
+static std::string demangle(StringRef Name) {
+  // InstallAPI currently only supports itanium manglings.
+  if (!(Name.starts_with("_Z") || Name.starts_with("__Z") ||
+        Name.starts_with("___Z")))
+    return Name.str();
+  char *Result = llvm::itaniumDemangle(Name);
+  if (!Result)
+    return Name.str();
+
+  std::string Demangled(Result);
+  free(Result);
+  return Demangled;
+}
+
+static DylibVerifier::Result updateResult(const DylibVerifier::Result Prev,
+                                          const DylibVerifier::Result Curr) {
+  if (Prev == Curr)
+    return Prev;
+
+  // Never update from invalid or noverify state.
+  if ((Prev == DylibVerifier::Result::Invalid) ||
+      (Prev == DylibVerifier::Result::NoVerify))
+    return Prev;
+
+  // Don't let an ignored verification remove a valid one.
+  if (Prev == DylibVerifier::Result::Valid &&
+      Curr == DylibVerifier::Result::Ignore)
+    return Prev;
+
+  return Curr;
+}
+// __private_extern__ is a deprecated specifier that clang does not
+// respect in all contexts, it should just be considered hidden for InstallAPI.
+static bool shouldIgnorePrivateExternAttr(const Decl *D) {
+  if (const FunctionDecl *FD = cast<FunctionDecl>(D))
+    return FD->getStorageClass() == StorageClass::SC_PrivateExtern;
+  if (const VarDecl *VD = cast<VarDecl>(D))
+    return VD->getStorageClass() == StorageClass::SC_PrivateExtern;
+
+  return false;
+}
+
+Record *findRecordFromSlice(const RecordsSlice *Slice, StringRef Name,
+                            EncodeKind Kind) {
+  switch (Kind) {
+  case EncodeKind::GlobalSymbol:
+    return Slice->findGlobal(Name);
+  case EncodeKind::ObjectiveCInstanceVariable:
+    return Slice->findObjCIVar(Name.contains('.'), Name);
+  case EncodeKind::ObjectiveCClass:
+  case EncodeKind::ObjectiveCClassEHType:
+    return Slice->findObjCInterface(Name);
+  }
+  llvm_unreachable("unexpected end when finding record");
+}
+
+void DylibVerifier::updateState(Result State) {
+  Ctx.FrontendState = updateResult(Ctx.FrontendState, State);
+}
+
+void DylibVerifier::addSymbol(const Record *R, SymbolContext &SymCtx,
+                              TargetList &&Targets) {
+  if (Targets.empty())
+    Targets = {Ctx.Target};
+
+  Exports->addGlobal(SymCtx.Kind, SymCtx.SymbolName, R->getFlags(), Targets);
+}
+
+bool DylibVerifier::shouldIgnoreObsolete(const Record *R, SymbolContext &SymCtx,
+                                         const Record *DR) {
+  return SymCtx.FA->Avail.isObsoleted();
+}
+
+bool DylibVerifier::compareObjCInterfaceSymbols(const Record *R,
+                                                SymbolContext &SymCtx,
+                                                const ObjCInterfaceRecord *DR) {
+  const bool IsDeclVersionComplete =
+      ((SymCtx.ObjCIFKind & ObjCIFSymbolKind::Class) ==
+       ObjCIFSymbolKind::Class) &&
+      ((SymCtx.ObjCIFKind & ObjCIFSymbolKind::MetaClass) ==
+       ObjCIFSymbolKind::MetaClass);
+
+  const bool IsDylibVersionComplete = DR->isCompleteInterface();
+
+  // The common case, a complete ObjCInterface.
+  if (IsDeclVersionComplete && IsDylibVersionComplete)
+    return true;
+
+  auto PrintDiagnostic = [&](auto SymLinkage, const Record *Record,
+                             StringRef SymName, bool PrintAsWarning = false) {
+    if (SymLinkage == RecordLinkage::Unknown)
+      Ctx.emitDiag([&]() {
+        Ctx.Diag->Report(SymCtx.FA->D->getLocation(),
+                         PrintAsWarning ? diag::warn_library_missing_symbol
+                                        : diag::err_library_missing_symbol)
+            << SymName;
+      });
+    else
+      Ctx.emitDiag([&]() {
+        Ctx.Diag->Report(SymCtx.FA->D->getLocation(),
+                         PrintAsWarning ? diag::warn_library_hidden_symbol
+                                        : diag::err_library_hidden_symbol)
+            << SymName;
+      });
+  };
+
+  if (IsDeclVersionComplete) {
+    // The decl represents a complete ObjCInterface, but the symbols in the
+    // dylib do not. Determine which symbol is missing. To keep older projects
+    // building, treat this as a warning.
+    if (!DR->isExportedSymbol(ObjCIFSymbolKind::Class))
+      PrintDiagnostic(DR->getLinkageForSymbol(ObjCIFSymbolKind::Class), R,
+                      getAnnotatedName(R, SymCtx.Kind, SymCtx.PrettyPrintName,
+                                       /*ValidSourceLoc=*/true,
+                                       ObjCIFSymbolKind::Class),
+                      /*PrintAsWarning=*/true);
+
+    if (!DR->isExportedSymbol(ObjCIFSymbolKind::MetaClass))
+      PrintDiagnostic(DR->getLinkageForSymbol(ObjCIFSymbolKind::MetaClass), R,
+                      getAnnotatedName(R, SymCtx.Kind, SymCtx.PrettyPrintName,
+                                       /*ValidSourceLoc=*/true,
+                                       ObjCIFSymbolKind::MetaClass),
+                      /*PrintAsWarning=*/true);
+    return true;
+  }
+
+  if (DR->isExportedSymbol(SymCtx.ObjCIFKind)) {
+    if (!IsDylibVersionComplete) {
+      // Both the declaration and dylib have a non-complete interface.
+      SymCtx.Kind = EncodeKind::GlobalSymbol;
+      SymCtx.SymbolName = R->getName();
+    }
+    return true;
+  }
+
+  // At this point that means there was not a matching class symbol
+  // to represent the one discovered as a declaration.
+  PrintDiagnostic(DR->getLinkageForSymbol(SymCtx.ObjCIFKind), R,
+                  SymCtx.PrettyPrintName);
+  return false;
+}
+
+DylibVerifier::Result DylibVerifier::compareVisibility(const Record *R,
+                                                       SymbolContext &SymCtx,
+                                                       const Record *DR) {
+
+  if (R->isExported()) {
+    if (!DR) {
+      Ctx.emitDiag([&]() {
+        Ctx.Diag->Report(SymCtx.FA->D->getLocation(),
+                         diag::err_library_missing_symbol)
+            << SymCtx.PrettyPrintName;
+      });
+      return Result::Invalid;
+    }
+    if (DR->isInternal()) {
+      Ctx.emitDiag([&]() {
+        Ctx.Diag->Report(SymCtx.FA->D->getLocation(),
+                         diag::err_library_hidden_symbol)
+            << SymCtx.PrettyPrintName;
+      });
+      return Result::Invalid;
+    }
+  }
+
+  // Emit a diagnostic for hidden declarations with external symbols, except
+  // when theres an inlined attribute.
+  if ((R->isInternal() && !SymCtx.Inlined) && DR && DR->isExported()) {
+
+    if (Mode == VerificationMode::ErrorsOnly)
+      return Result::Ignore;
+
+    if (shouldIgnorePrivateExternAttr(SymCtx.FA->D))
+      return Result::Ignore;
+
+    unsigned ID;
+    Result Outcome;
+    if (Mode == VerificationMode::ErrorsAndWarnings) {
+      ID = diag::warn_header_hidden_symbol;
+      Outcome = Result::Ignore;
+    } else {
+      ID = diag::err_header_hidden_symbol;
+      Outcome = Result::Invalid;
+    }
+    Ctx.emitDiag([&]() {
+      Ctx.Diag->Report(SymCtx.FA->D->getLocation(), ID)
+          << SymCtx.PrettyPrintName;
+    });
+    return Outcome;
+  }
+
+  if (R->isInternal())
+    return Result::Ignore;
+
+  return Result::Valid;
+}
+
+DylibVerifier::Result DylibVerifier::compareAvailability(const Record *R,
+                                                         SymbolContext &SymCtx,
+                                                         const Record *DR) {
+  if (!SymCtx.FA->Avail.isUnavailable())
+    return Result::Valid;
+
+  const bool IsDeclAvailable = SymCtx.FA->Avail.isUnavailable();
+
+  switch (Mode) {
+  case VerificationMode::ErrorsAndWarnings:
+    Ctx.emitDiag([&]() {
+      Ctx.Diag->Report(SymCtx.FA->D->getLocation(),
+                       diag::warn_header_availability_mismatch)
+          << SymCtx.PrettyPrintName << IsDeclAvailable << IsDeclAvailable;
+    });
+    return Result::Ignore;
+  case VerificationMode::Pedantic:
+    Ctx.emitDiag([&]() {
+      Ctx.Diag->Report(SymCtx.FA->D->getLocation(),
+                       diag::err_header_availability_mismatch)
+          << SymCtx.PrettyPrintName << IsDeclAvailable << IsDeclAvailable;
+    });
+    return Result::Invalid;
+  case VerificationMode::ErrorsOnly:
+    return Result::Ignore;
+  case VerificationMode::Invalid:
+    llvm_unreachable("Unexpected verification mode symbol verification");
+  }
+  llvm_unreachable("Unexpected verification mode symbol verification");
+}
+
+bool DylibVerifier::compareSymbolFlags(const Record *R, SymbolContext &SymCtx,
+                                       const Record *DR) {
+  std::string DisplayName =
+      Demangle ? demangle(DR->getName()) : DR->getName().str();
+
+  if (DR->isThreadLocalValue() && !R->isThreadLocalValue()) {
+    Ctx.emitDiag([&]() {
+      Ctx.Diag->Report(SymCtx.FA->D->getLocation(),
+                       diag::err_dylib_symbol_flags_mismatch)
+          << getAnnotatedName(DR, SymCtx.Kind, DisplayName)
+          << DR->isThreadLocalValue();
+    });
+    return false;
+  }
+  if (!DR->isThreadLocalValue() && R->isThreadLocalValue()) {
+    Ctx.emitDiag([&]() {
+      SymCtx.FA->D->getLocation(),
+          Ctx.Diag->Report(diag::err_header_symbol_flags_mismatch)
+              << SymCtx.PrettyPrintName << R->isThreadLocalValue();
+    });
+    return false;
+  }
+
+  if (DR->isWeakDefined() && !R->isWeakDefined()) {
+    Ctx.emitDiag([&]() {
+      Ctx.Diag->Report(SymCtx.FA->D->getLocation(),
+                       diag::err_dylib_symbol_flags_mismatch)
+          << getAnnotatedName(DR, SymCtx.Kind, DisplayName)
+          << R->isWeakDefined();
+    });
+    return false;
+  }
+  if (!DR->isWeakDefined() && R->isWeakDefined()) {
+    Ctx.emitDiag([&]() {
+      Ctx.Diag->Report(SymCtx.FA->D->getLocation(),
+                       diag::err_header_symbol_flags_mismatch)
+          << SymCtx.PrettyPrintName << R->isWeakDefined();
+    });
+    return false;
+  }
+
+  return true;
+}
+
+DylibVerifier::Result DylibVerifier::verifyImpl(Record *R,
+                                                SymbolContext &SymCtx) {
+  R->setVerify();
+  if (!canVerify()) {
+    // Accumulate symbols when not in verifying against dylib.
+    if (R->isExported() && !SymCtx.FA->Avail.isUnavailable() &&
+        !SymCtx.FA->Avail.isObsoleted()) {
+      addSymbol(R, SymCtx);
+    }
+    return Ctx.FrontendState;
+  }
+
+  Record *DR =
+      findRecordFromSlice(Ctx.DylibSlice, SymCtx.SymbolName, SymCtx.Kind);
+  if (DR)
+    DR->setVerify();
+
+  if (shouldIgnoreObsolete(R, SymCtx, DR)) {
+    updateState(Result::Ignore);
+    return Ctx.FrontendState;
+  }
+
+  // Unavailable declarations don't need matching symbols.
+  if (SymCtx.FA->Avail.isUnavailable() && (!DR || DR->isInternal())) {
+    updateState(Result::Valid);
+    return Ctx.FrontendState;
+  }
+
+  Result VisibilityCheck = compareVisibility(R, SymCtx, DR);
+  if (VisibilityCheck != Result::Valid) {
+    updateState(VisibilityCheck);
+    return Ctx.FrontendState;
+  }
+
+  // All missing symbol cases to diagnose have been handled now.
+  if (!DR) {
+    updateState(Result::Ignore);
+    return Ctx.FrontendState;
+  }
+
+  // Check for mismatching ObjC interfaces.
+  if (SymCtx.ObjCIFKind != ObjCIFSymbolKind::None) {
+    if (!compareObjCInterfaceSymbols(
+            R, SymCtx, Ctx.DylibSlice->findObjCInterface(DR->getName()))) {
+      updateState(Result::Invalid);
+      return Ctx.FrontendState;
+    }
+  }
+
+  Result AvailabilityCheck = compareAvailability(R, SymCtx, DR);
+  if (AvailabilityCheck != Result::Valid) {
+    updateState(AvailabilityCheck);
+    return Ctx.FrontendState;
+  }
+
+  if (!compareSymbolFlags(R, SymCtx, DR)) {
+    updateState(Result::Invalid);
+    return Ctx.FrontendState;
+  }
+
+  addSymbol(R, SymCtx);
+  updateState(Result::Valid);
+  return Ctx.FrontendState;
+}
+
+bool DylibVerifier::canVerify() {
+  return Ctx.FrontendState != Result::NoVerify;
+}
+
+void DylibVerifier::assignSlice(const Target &T) {
+  assert(T == Ctx.Target && "Active targets should match.");
+  if (Dylib.empty())
+    return;
+
+  // Note: there are no reexport slices with binaries, as opposed to TBD files,
+  // so it can be assumed that the target match is the active top-level library.
+  auto It = find_if(
+      Dylib, [&T](const auto &Slice) { return T == Slice->getTarget(); });
+
+  assert(It != Dylib.end() && "Target slice should always exist.");
+  Ctx.DylibSlice = It->get();
+}
+
+void DylibVerifier::setTarget(const Target &T) {
+  Ctx.Target = T;
+  Ctx.DiscoveredFirstError = false;
+  if (Dylib.empty()) {
+    updateState(Result::NoVerify);
+    return;
+  }
+  updateState(Result::Ignore);
+  assignSlice(T);
+}
+
+DylibVerifier::Result DylibVerifier::verify(ObjCIVarRecord *R,
+                                            const FrontendAttrs *FA,
+                                            const StringRef SuperClass) {
+  if (R->isVerified())
+    return getState();
+
+  std::string FullName =
+      ObjCIVarRecord::createScopedName(SuperClass, R->getName());
+  SymbolContext SymCtx{
+      getAnnotatedName(R, EncodeKind::ObjectiveCInstanceVariable,
+                       Demangle ? demangle(FullName) : FullName),
+      FullName, EncodeKind::ObjectiveCInstanceVariable, FA};
+  return verifyImpl(R, SymCtx);
+}
+
+static ObjCIFSymbolKind assignObjCIFSymbolKind(const ObjCInterfaceRecord *R) {
+  ObjCIFSymbolKind Result = ObjCIFSymbolKind::None;
+  if (R->getLinkageForSymbol(ObjCIFSymbolKind::Class) != RecordLinkage::Unknown)
+    Result |= ObjCIFSymbolKind::Class;
+  if (R->getLinkageForSymbol(ObjCIFSymbolKind::MetaClass) !=
+      RecordLinkage::Unknown)
+    Result |= ObjCIFSymbolKind::MetaClass;
+  if (R->getLinkageForSymbol(ObjCIFSymbolKind::EHType) !=
+      RecordLinkage::Unknown)
+    Result |= ObjCIFSymbolKind::EHType;
+  return Result;
+}
+
+DylibVerifier::Result DylibVerifier::verify(ObjCInterfaceRecord *R,
+                                            const FrontendAttrs *FA) {
+  if (R->isVerified())
+    return getState();
+  SymbolContext SymCtx;
+  SymCtx.SymbolName = R->getName();
+  SymCtx.ObjCIFKind = assignObjCIFSymbolKind(R);
+
+  std::string DisplayName =
+      Demangle ? demangle(SymCtx.SymbolName) : SymCtx.SymbolName;
+  SymCtx.Kind = R->hasExceptionAttribute() ? EncodeKind::ObjectiveCClassEHType
+                                           : EncodeKind::ObjectiveCClass;
+  SymCtx.PrettyPrintName = getAnnotatedName(R, SymCtx.Kind, DisplayName);
+  SymCtx.FA = FA;
+
+  return verifyImpl(R, SymCtx);
+}
+
+DylibVerifier::Result DylibVerifier::verify(GlobalRecord *R,
+                                            const FrontendAttrs *FA) {
+  if (R->isVerified())
+    return getState();
+
+  // Global classifications could be obfusciated with `asm`.
+  SimpleSymbol Sym = parseSymbol(R->getName());
+  SymbolContext SymCtx;
+  SymCtx.SymbolName = Sym.Name;
+  SymCtx.PrettyPrintName =
+      getAnnotatedName(R, Sym.Kind, Demangle ? demangle(Sym.Name) : Sym.Name);
+  SymCtx.Kind = Sym.Kind;
+  SymCtx.FA = FA;
+  SymCtx.Inlined = R->isInlined();
+  return verifyImpl(R, SymCtx);
+}
+
+void DylibVerifier::VerifierContext::emitDiag(
+    llvm::function_ref<void()> Report) {
+  if (!DiscoveredFirstError) {
+    Diag->Report(diag::warn_target)
+        << (PrintArch ? getArchitectureName(Target.Arch)
+                      : getTargetTripleName(Target));
+    DiscoveredFirstError = true;
+  }
+
+  Report();
+}
+
+} // namespace installapi
+} // namespace clang
diff --git a/clang/lib/InstallAPI/Frontend.cpp b/clang/lib/InstallAPI/Frontend.cpp
index 707aeb17dc8906..12cd5fcbc22bf7 100644
--- a/clang/lib/InstallAPI/Frontend.cpp
+++ b/clang/lib/InstallAPI/Frontend.cpp
@@ -16,41 +16,47 @@ using namespace llvm;
 using namespace llvm::MachO;
 
 namespace clang::installapi {
-
-GlobalRecord *FrontendRecordsSlice::addGlobal(
+std::pair<GlobalRecord *, FrontendAttrs *> FrontendRecordsSlice::addGlobal(
     StringRef Name, RecordLinkage Linkage, GlobalRecord::Kind GV,
     const clang::AvailabilityInfo Avail, const Decl *D, const HeaderType Access,
     SymbolFlags Flags, bool Inlined) {
 
-  auto *GR =
+  GlobalRecord *GR =
       llvm::MachO::RecordsSlice::addGlobal(Name, Linkage, GV, Flags, Inlined);
-  FrontendRecords.insert({GR, FrontendAttrs{Avail, D, Access}});
-  return GR;
+  auto Result = FrontendRecords.insert({GR, FrontendAttrs{Avail, D, Access}});
+  return {GR, &(Result.first->second)};
 }
 
-ObjCInterfaceRecord *FrontendRecordsSlice::addObjCInterface(
-    StringRef Name, RecordLinkage Linkage, const clang::AvailabilityInfo Avail,
-    const Decl *D, HeaderType Access, bool IsEHType) {
+std::pair<ObjCInterfaceRecord *, FrontendAttrs *>
+FrontendRecordsSlice::addObjCInterface(StringRef Name, RecordLinkage Linkage,
+                                       const clang::AvailabilityInfo Avail,
+                                       const Decl *D, HeaderType Access,
+                                       bool IsEHType) {
   ObjCIFSymbolKind SymType =
       ObjCIFSymbolKind::Class | ObjCIFSymbolKind::MetaClass;
   if (IsEHType)
     SymType |= ObjCIFSymbolKind::EHType;
-  auto *ObjCR =
+
+  ObjCInterfaceRecord *ObjCR =
       llvm::MachO::RecordsSlice::addObjCInterface(Name, Linkage, SymType);
-  FrontendRecords.insert({ObjCR, FrontendAttrs{Avail, D, Access}});
-  return ObjCR;
+  auto Result =
+      FrontendRecords.insert({ObjCR, FrontendAttrs{Avail, D, Access}});
+  return {ObjCR, &(Result.first->second)};
 }
 
-ObjCCategoryRecord *FrontendRecordsSlice::addObjCCategory(
-    StringRef ClassToExtend, StringRef CategoryName,
-    const clang::AvailabilityInfo Avail, const Decl *D, HeaderType Access) {
-  auto *ObjCR =
+std::pair<ObjCCategoryRecord *, FrontendAttrs *>
+FrontendRecordsSlice::addObjCCategory(StringRef ClassToExtend,
+                                      StringRef CategoryName,
+                                      const clang::AvailabilityInfo Avail,
+                                      const Decl *D, HeaderType Access) {
+  ObjCCategoryRecord *ObjCR =
       llvm::MachO::RecordsSlice::addObjCCategory(ClassToExtend, CategoryName);
-  FrontendRecords.insert({ObjCR, FrontendAttrs{Avail, D, Access}});
-  return ObjCR;
+  auto Result =
+      FrontendRecords.insert({ObjCR, FrontendAttrs{Avail, D, Access}});
+  return {ObjCR, &(Result.first->second)};
 }
 
-ObjCIVarRecord *FrontendRecordsSlice::addObjCIVar(
+std::pair<ObjCIVarRecord *, FrontendAttrs *> FrontendRecordsSlice::addObjCIVar(
     ObjCContainerRecord *Container, StringRef IvarName, RecordLinkage Linkage,
     const clang::AvailabilityInfo Avail, const Decl *D, HeaderType Access,
     const clang::ObjCIvarDecl::AccessControl AC) {
@@ -59,11 +65,12 @@ ObjCIVarRecord *FrontendRecordsSlice::addObjCIVar(
   if ((Linkage == RecordLinkage::Exported) &&
       ((AC == ObjCIvarDecl::Private) || (AC == ObjCIvarDecl::Package)))
     Linkage = RecordLinkage::Internal;
-  auto *ObjCR =
+  ObjCIVarRecord *ObjCR =
       llvm::MachO::RecordsSlice::addObjCIVar(Container, IvarName, Linkage);
-  FrontendRecords.insert({ObjCR, FrontendAttrs{Avail, D, Access}});
+  auto Result =
+      FrontendRecords.insert({ObjCR, FrontendAttrs{Avail, D, Access}});
 
-  return nullptr;
+  return {ObjCR, &(Result.first->second)};
 }
 
 std::optional<HeaderType>
diff --git a/clang/lib/InstallAPI/Visitor.cpp b/clang/lib/InstallAPI/Visitor.cpp
index b4ed5974a05704..452c8f2fb1e489 100644
--- a/clang/lib/InstallAPI/Visitor.cpp
+++ b/clang/lib/InstallAPI/Visitor.cpp
@@ -11,6 +11,7 @@
 #include "clang/AST/ParentMapContext.h"
 #include "clang/AST/VTableBuilder.h"
 #include "clang/Basic/Linkage.h"
+#include "clang/InstallAPI/DylibVerifier.h"
 #include "clang/InstallAPI/FrontendRecords.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/StringRef.h"
@@ -156,7 +157,9 @@ void InstallAPIVisitor::recordObjCInstanceVariables(
     StringRef Name = IV->getName();
     const AvailabilityInfo Avail = AvailabilityInfo::createFromDecl(IV);
     auto AC = IV->getCanonicalAccessControl();
-    Ctx.Slice->addObjCIVar(Record, Name, Linkage, Avail, IV, *Access, AC);
+    auto [ObjCIVR, FA] =
+        Ctx.Slice->addObjCIVar(Record, Name, Linkage, Avail, IV, *Access, AC);
+    Ctx.Verifier->verify(ObjCIVR, FA, SuperClass);
   }
 }
 
@@ -178,15 +181,16 @@ bool InstallAPIVisitor::VisitObjCInterfaceDecl(const ObjCInterfaceDecl *D) {
       (!D->getASTContext().getLangOpts().ObjCRuntime.isFragile() &&
        hasObjCExceptionAttribute(D));
 
-  ObjCInterfaceRecord *Class =
+  auto [Class, FA] =
       Ctx.Slice->addObjCInterface(Name, Linkage, Avail, D, *Access, IsEHType);
+  Ctx.Verifier->verify(Class, FA);
 
   // Get base class.
   StringRef SuperClassName;
   if (const auto *SuperClass = D->getSuperClass())
     SuperClassName = SuperClass->getObjCRuntimeNameAsString();
 
-  recordObjCInstanceVariables(D->getASTContext(), Class, SuperClassName,
+  recordObjCInstanceVariables(D->getASTContext(), Class, Class->getName(),
                               D->ivars());
   return true;
 }
@@ -201,8 +205,8 @@ bool InstallAPIVisitor::VisitObjCCategoryDecl(const ObjCCategoryDecl *D) {
   const ObjCInterfaceDecl *InterfaceD = D->getClassInterface();
   const StringRef InterfaceName = InterfaceD->getName();
 
-  ObjCCategoryRecord *Category = Ctx.Slice->addObjCCategory(
-      InterfaceName, CategoryName, Avail, D, *Access);
+  auto [Category, FA] = Ctx.Slice->addObjCCategory(InterfaceName, CategoryName,
+                                                   Avail, D, *Access);
   recordObjCInstanceVariables(D->getASTContext(), Category, InterfaceName,
                               D->ivars());
   return true;
@@ -236,8 +240,10 @@ bool InstallAPIVisitor::VisitVarDecl(const VarDecl *D) {
   const bool WeakDef = D->hasAttr<WeakAttr>();
   const bool ThreadLocal = D->getTLSKind() != VarDecl::TLS_None;
   const AvailabilityInfo Avail = AvailabilityInfo::createFromDecl(D);
-  Ctx.Slice->addGlobal(getMangledName(D), Linkage, GlobalRecord::Kind::Variable,
-                       Avail, D, *Access, getFlags(WeakDef, ThreadLocal));
+  auto [GR, FA] = Ctx.Slice->addGlobal(getMangledName(D), Linkage,
+                                       GlobalRecord::Kind::Variable, Avail, D,
+                                       *Access, getFlags(WeakDef, ThreadLocal));
+  Ctx.Verifier->verify(GR, FA);
   return true;
 }
 
@@ -287,8 +293,10 @@ bool InstallAPIVisitor::VisitFunctionDecl(const FunctionDecl *D) {
   const RecordLinkage Linkage = (Inlined || !isExported(D))
                                     ? RecordLinkage::Internal
                                     : RecordLinkage::Exported;
-  Ctx.Slice->addGlobal(Name, Linkage, GlobalRecord::Kind::Function, Avail, D,
-                       *Access, getFlags(WeakDef), Inlined);
+  auto [GR, FA] =
+      Ctx.Slice->addGlobal(Name, Linkage, GlobalRecord::Kind::Function, Avail,
+                           D, *Access, getFlags(WeakDef), Inlined);
+  Ctx.Verifier->verify(GR, FA);
   return true;
 }
 
@@ -478,9 +486,10 @@ void InstallAPIVisitor::emitVTableSymbols(const CXXRecordDecl *D,
         VTableLinkage == CXXLinkage::WeakODRLinkage) {
       const std::string Name = getMangledCXXVTableName(D);
       const bool WeakDef = VTableLinkage == CXXLinkage::WeakODRLinkage;
-      Ctx.Slice->addGlobal(Name, RecordLinkage::Exported,
-                           GlobalRecord::Kind::Variable, Avail, D, Access,
-                           getFlags(WeakDef));
+      auto [GR, FA] = Ctx.Slice->addGlobal(Name, RecordLinkage::Exported,
+                                           GlobalRecord::Kind::Variable, Avail,
+                                           D, Access, getFlags(WeakDef));
+      Ctx.Verifier->verify(GR, FA);
       if (!D->getDescribedClassTemplate() && !D->isInvalidDecl()) {
         VTableContextBase *VTable = D->getASTContext().getVTableContext();
         auto AddThunk = [&](GlobalDecl GD) {
@@ -491,9 +500,10 @@ void InstallAPIVisitor::emitVTableSymbols(const CXXRecordDecl *D,
 
           for (const auto &Thunk : *Thunks) {
             const std::string Name = getMangledCXXThunk(GD, Thunk);
-            Ctx.Slice->addGlobal(Name, RecordLinkage::Exported,
-                                 GlobalRecord::Kind::Function, Avail,
-                                 GD.getDecl(), Access);
+            auto [GR, FA] = Ctx.Slice->addGlobal(Name, RecordLinkage::Exported,
+                                                 GlobalRecord::Kind::Function,
+                                                 Avail, GD.getDecl(), Access);
+            Ctx.Verifier->verify(GR, FA);
           }
         };
 
@@ -519,12 +529,16 @@ void InstallAPIVisitor::emitVTableSymbols(const CXXRecordDecl *D,
 
   if (hasRTTI(D)) {
     std::string Name = getMangledCXXRTTI(D);
-    Ctx.Slice->addGlobal(Name, RecordLinkage::Exported,
-                         GlobalRecord::Kind::Variable, Avail, D, Access);
+    auto [GR, FA] =
+        Ctx.Slice->addGlobal(Name, RecordLinkage::Exported,
+                             GlobalRecord::Kind::Variable, Avail, D, Access);
+    Ctx.Verifier->verify(GR, FA);
 
     Name = getMangledCXXRTTIName(D);
-    Ctx.Slice->addGlobal(Name, RecordLinkage::Exported,
-                         GlobalRecord::Kind::Variable, Avail, D, Access);
+    auto [NamedGR, NamedFA] =
+        Ctx.Slice->addGlobal(Name, RecordLinkage::Exported,
+                             GlobalRecord::Kind::Variable, Avail, D, Access);
+    Ctx.Verifier->verify(NamedGR, NamedFA);
   }
 
   for (const auto &It : D->bases()) {
@@ -615,15 +629,17 @@ bool InstallAPIVisitor::VisitCXXRecordDecl(const CXXRecordDecl *D) {
         continue;
 
       std::string Name = getMangledCtorDtor(M, Ctor_Base);
-      Ctx.Slice->addGlobal(Name, RecordLinkage::Exported,
-                           GlobalRecord::Kind::Function, Avail, D, *Access,
-                           getFlags(WeakDef));
+      auto [GR, FA] = Ctx.Slice->addGlobal(Name, RecordLinkage::Exported,
+                                           GlobalRecord::Kind::Function, Avail,
+                                           D, *Access, getFlags(WeakDef));
+      Ctx.Verifier->verify(GR, FA);
 
       if (!D->isAbstract()) {
         std::string Name = getMangledCtorDtor(M, Ctor_Complete);
-        Ctx.Slice->addGlobal(Name, RecordLinkage::Exported,
-                             GlobalRecord::Kind::Function, Avail, D, *Access,
-                             getFlags(WeakDef));
+        auto [GR, FA] = Ctx.Slice->addGlobal(
+            Name, RecordLinkage::Exported, GlobalRecord::Kind::Function, Avail,
+            D, *Access, getFlags(WeakDef));
+        Ctx.Verifier->verify(GR, FA);
       }
 
       continue;
@@ -635,20 +651,23 @@ bool InstallAPIVisitor::VisitCXXRecordDecl(const CXXRecordDecl *D) {
         continue;
 
       std::string Name = getMangledCtorDtor(M, Dtor_Base);
-      Ctx.Slice->addGlobal(Name, RecordLinkage::Exported,
-                           GlobalRecord::Kind::Function, Avail, D, *Access,
-                           getFlags(WeakDef));
+      auto [GR, FA] = Ctx.Slice->addGlobal(Name, RecordLinkage::Exported,
+                                           GlobalRecord::Kind::Function, Avail,
+                                           D, *Access, getFlags(WeakDef));
+      Ctx.Verifier->verify(GR, FA);
 
       Name = getMangledCtorDtor(M, Dtor_Complete);
-      Ctx.Slice->addGlobal(Name, RecordLinkage::Exported,
-                           GlobalRecord::Kind::Function, Avail, D, *Access,
-                           getFlags(WeakDef));
+      auto [CompleteGR, CompleteFA] = Ctx.Slice->addGlobal(
+          Name, RecordLinkage::Exported, GlobalRecord::Kind::Function, Avail, D,
+          *Access, getFlags(WeakDef));
+      Ctx.Verifier->verify(CompleteGR, CompleteFA);
 
       if (Dtor->isVirtual()) {
         Name = getMangledCtorDtor(M, Dtor_Deleting);
-        Ctx.Slice->addGlobal(Name, RecordLinkage::Exported,
-                             GlobalRecord::Kind::Function, Avail, D, *Access,
-                             getFlags(WeakDef));
+        auto [VirtualGR, VirtualFA] = Ctx.Slice->addGlobal(
+            Name, RecordLinkage::Exported, GlobalRecord::Kind::Function, Avail,
+            D, *Access, getFlags(WeakDef));
+        Ctx.Verifier->verify(VirtualGR, VirtualFA);
       }
 
       continue;
@@ -661,9 +680,10 @@ bool InstallAPIVisitor::VisitCXXRecordDecl(const CXXRecordDecl *D) {
       continue;
 
     std::string Name = getMangledName(M);
-    Ctx.Slice->addGlobal(Name, RecordLinkage::Exported,
-                         GlobalRecord::Kind::Function, Avail, D, *Access,
-                         getFlags(WeakDef));
+    auto [GR, FA] = Ctx.Slice->addGlobal(Name, RecordLinkage::Exported,
+                                         GlobalRecord::Kind::Function, Avail, M,
+                                         *Access, getFlags(WeakDef));
+    Ctx.Verifier->verify(GR, FA);
   }
 
   if (auto *Templ = dyn_cast<ClassTemplateSpecializationDecl>(D)) {
@@ -694,9 +714,10 @@ bool InstallAPIVisitor::VisitCXXRecordDecl(const CXXRecordDecl *D) {
     const AvailabilityInfo Avail = AvailabilityInfo::createFromDecl(Var);
     const bool WeakDef = Var->hasAttr<WeakAttr>() || KeepInlineAsWeak;
 
-    Ctx.Slice->addGlobal(Name, RecordLinkage::Exported,
-                         GlobalRecord::Kind::Variable, Avail, D, *Access,
-                         getFlags(WeakDef));
+    auto [GR, FA] = Ctx.Slice->addGlobal(Name, RecordLinkage::Exported,
+                                         GlobalRecord::Kind::Variable, Avail, D,
+                                         *Access, getFlags(WeakDef));
+    Ctx.Verifier->verify(GR, FA);
   }
 
   return true;
diff --git a/clang/lib/Parse/ParseDecl.cpp b/clang/lib/Parse/ParseDecl.cpp
index dd179414a14191..0aa14b0510746b 100644
--- a/clang/lib/Parse/ParseDecl.cpp
+++ b/clang/lib/Parse/ParseDecl.cpp
@@ -629,6 +629,10 @@ void Parser::ParseGNUAttributeArgs(
     ParseAttributeWithTypeArg(*AttrName, AttrNameLoc, Attrs, ScopeName,
                               ScopeLoc, Form);
     return;
+  } else if (AttrKind == ParsedAttr::AT_CountedBy) {
+    ParseBoundsAttribute(*AttrName, AttrNameLoc, Attrs, ScopeName, ScopeLoc,
+                         Form);
+    return;
   }
 
   // These may refer to the function arguments, but need to be parsed early to
@@ -3250,6 +3254,54 @@ void Parser::ParseAlignmentSpecifier(ParsedAttributes &Attrs,
   }
 }
 
+/// Bounds attributes (e.g., counted_by):
+///   AttrName '(' expression ')'
+void Parser::ParseBoundsAttribute(IdentifierInfo &AttrName,
+                                  SourceLocation AttrNameLoc,
+                                  ParsedAttributes &Attrs,
+                                  IdentifierInfo *ScopeName,
+                                  SourceLocation ScopeLoc,
+                                  ParsedAttr::Form Form) {
+  assert(Tok.is(tok::l_paren) && "Attribute arg list not starting with '('");
+
+  BalancedDelimiterTracker Parens(*this, tok::l_paren);
+  Parens.consumeOpen();
+
+  if (Tok.is(tok::r_paren)) {
+    Diag(Tok.getLocation(), diag::err_argument_required_after_attribute);
+    Parens.consumeClose();
+    return;
+  }
+
+  ArgsVector ArgExprs;
+  // Don't evaluate argument when the attribute is ignored.
+  using ExpressionKind =
+      Sema::ExpressionEvaluationContextRecord::ExpressionKind;
+  EnterExpressionEvaluationContext EC(
+      Actions, Sema::ExpressionEvaluationContext::PotentiallyEvaluated, nullptr,
+      ExpressionKind::EK_BoundsAttrArgument);
+
+  ExprResult ArgExpr(
+      Actions.CorrectDelayedTyposInExpr(ParseAssignmentExpression()));
+
+  if (ArgExpr.isInvalid()) {
+    Parens.skipToEnd();
+    return;
+  }
+
+  ArgExprs.push_back(ArgExpr.get());
+  Parens.consumeClose();
+
+  ASTContext &Ctx = Actions.getASTContext();
+
+  ArgExprs.push_back(IntegerLiteral::Create(
+      Ctx, llvm::APInt(Ctx.getTypeSize(Ctx.getSizeType()), 0),
+      Ctx.getSizeType(), SourceLocation()));
+
+  Attrs.addNew(&AttrName, SourceRange(AttrNameLoc, Parens.getCloseLocation()),
+               ScopeName, ScopeLoc, ArgExprs.data(), ArgExprs.size(), Form);
+}
+
 ExprResult Parser::ParseExtIntegerArgument() {
   assert(Tok.isOneOf(tok::kw__ExtInt, tok::kw__BitInt) &&
          "Not an extended int type");
@@ -4679,6 +4731,39 @@ void Parser::ParseDeclarationSpecifiers(
   }
 }
 
+static void DiagnoseCountAttributedTypeInUnnamedAnon(ParsingDeclSpec &DS,
+                                                     Parser &P) {
+
+  if (DS.getTypeSpecType() != DeclSpec::TST_struct)
+    return;
+
+  auto *RD = dyn_cast<RecordDecl>(DS.getRepAsDecl());
+  // We're only interested in unnamed, non-anonymous struct
+  if (!RD || !RD->getName().empty() || RD->isAnonymousStructOrUnion())
+    return;
+
+  for (auto *I : RD->decls()) {
+    auto *VD = dyn_cast<ValueDecl>(I);
+    if (!VD)
+      continue;
+
+    auto *CAT = VD->getType()->getAs<CountAttributedType>();
+    if (!CAT)
+      continue;
+
+    for (const auto &DD : CAT->dependent_decls()) {
+      if (!RD->containsDecl(DD.getDecl())) {
+        P.Diag(VD->getBeginLoc(),
+               diag::err_flexible_array_count_not_in_same_struct)
+            << DD.getDecl();
+        P.Diag(DD.getDecl()->getBeginLoc(),
+               diag::note_flexible_array_counted_by_attr_field)
+            << DD.getDecl();
+      }
+    }
+  }
+}
+
 /// ParseStructDeclaration - Parse a struct declaration without the terminating
 /// semicolon.
 ///
@@ -4759,6 +4844,11 @@ void Parser::ParseStructDeclaration(
     } else
       DeclaratorInfo.D.SetIdentifier(nullptr, Tok.getLocation());
 
+    // Here, we now know that the unnamed struct is not an anonymous struct.
+    // Report an error if a counted_by attribute refers to a field in a
+    // different named struct.
+    DiagnoseCountAttributedTypeInUnnamedAnon(DS, *this);
+
     if (TryConsumeToken(tok::colon)) {
       ExprResult Res(ParseConstantExpression());
       if (Res.isInvalid())
diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index a5f42b630c3fa2..ef3ab16ba29b41 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -1971,6 +1971,191 @@ static bool SemaOpenCLBuiltinToAddr(Sema &S, unsigned BuiltinID,
   return false;
 }
 
+namespace {
+enum PointerAuthOpKind {
+  PAO_Strip,
+  PAO_Sign,
+  PAO_Auth,
+  PAO_SignGeneric,
+  PAO_Discriminator,
+  PAO_BlendPointer,
+  PAO_BlendInteger
+};
+}
+
+static bool checkPointerAuthEnabled(Sema &S, Expr *E) {
+  if (S.getLangOpts().PointerAuthIntrinsics)
+    return false;
+
+  S.Diag(E->getExprLoc(), diag::err_ptrauth_disabled) << E->getSourceRange();
+  return true;
+}
+
+static bool checkPointerAuthKey(Sema &S, Expr *&Arg) {
+  // Convert it to type 'int'.
+  if (convertArgumentToType(S, Arg, S.Context.IntTy))
+    return true;
+
+  // Value-dependent expressions are okay; wait for template instantiation.
+  if (Arg->isValueDependent())
+    return false;
+
+  unsigned KeyValue;
+  return S.checkConstantPointerAuthKey(Arg, KeyValue);
+}
+
+bool Sema::checkConstantPointerAuthKey(Expr *Arg, unsigned &Result) {
+  // Attempt to constant-evaluate the expression.
+  std::optional<llvm::APSInt> KeyValue = Arg->getIntegerConstantExpr(Context);
+  if (!KeyValue) {
+    Diag(Arg->getExprLoc(), diag::err_expr_not_ice)
+        << 0 << Arg->getSourceRange();
+    return true;
+  }
+
+  // Ask the target to validate the key parameter.
+  if (!Context.getTargetInfo().validatePointerAuthKey(*KeyValue)) {
+    llvm::SmallString<32> Value;
+    {
+      llvm::raw_svector_ostream Str(Value);
+      Str << *KeyValue;
+    }
+
+    Diag(Arg->getExprLoc(), diag::err_ptrauth_invalid_key)
+        << Value << Arg->getSourceRange();
+    return true;
+  }
+
+  Result = KeyValue->getZExtValue();
+  return false;
+}
+
+static bool checkPointerAuthValue(Sema &S, Expr *&Arg,
+                                  PointerAuthOpKind OpKind) {
+  if (Arg->hasPlaceholderType()) {
+    ExprResult R = S.CheckPlaceholderExpr(Arg);
+    if (R.isInvalid())
+      return true;
+    Arg = R.get();
+  }
+
+  auto AllowsPointer = [](PointerAuthOpKind OpKind) {
+    return OpKind != PAO_BlendInteger;
+  };
+  auto AllowsInteger = [](PointerAuthOpKind OpKind) {
+    return OpKind == PAO_Discriminator || OpKind == PAO_BlendInteger ||
+           OpKind == PAO_SignGeneric;
+  };
+
+  // Require the value to have the right range of type.
+  QualType ExpectedTy;
+  if (AllowsPointer(OpKind) && Arg->getType()->isPointerType()) {
+    ExpectedTy = Arg->getType().getUnqualifiedType();
+  } else if (AllowsPointer(OpKind) && Arg->getType()->isNullPtrType()) {
+    ExpectedTy = S.Context.VoidPtrTy;
+  } else if (AllowsInteger(OpKind) &&
+             Arg->getType()->isIntegralOrUnscopedEnumerationType()) {
+    ExpectedTy = S.Context.getUIntPtrType();
+
+  } else {
+    // Diagnose the failures.
+    S.Diag(Arg->getExprLoc(), diag::err_ptrauth_value_bad_type)
+        << unsigned(OpKind == PAO_Discriminator  ? 1
+                    : OpKind == PAO_BlendPointer ? 2
+                    : OpKind == PAO_BlendInteger ? 3
+                                                 : 0)
+        << unsigned(AllowsInteger(OpKind) ? (AllowsPointer(OpKind) ? 2 : 1) : 0)
+        << Arg->getType() << Arg->getSourceRange();
+    return true;
+  }
+
+  // Convert to that type.  This should just be an lvalue-to-rvalue
+  // conversion.
+  if (convertArgumentToType(S, Arg, ExpectedTy))
+    return true;
+
+  // Warn about null pointers for non-generic sign and auth operations.
+  if ((OpKind == PAO_Sign || OpKind == PAO_Auth) &&
+      Arg->isNullPointerConstant(S.Context, Expr::NPC_ValueDependentIsNull)) {
+    S.Diag(Arg->getExprLoc(), OpKind == PAO_Sign
+                                  ? diag::warn_ptrauth_sign_null_pointer
+                                  : diag::warn_ptrauth_auth_null_pointer)
+        << Arg->getSourceRange();
+  }
+
+  return false;
+}
+
+static ExprResult SemaPointerAuthStrip(Sema &S, CallExpr *Call) {
+  if (checkArgCount(S, Call, 2))
+    return ExprError();
+  if (checkPointerAuthEnabled(S, Call))
+    return ExprError();
+  if (checkPointerAuthValue(S, Call->getArgs()[0], PAO_Strip) ||
+      checkPointerAuthKey(S, Call->getArgs()[1]))
+    return ExprError();
+
+  Call->setType(Call->getArgs()[0]->getType());
+  return Call;
+}
+
+static ExprResult SemaPointerAuthBlendDiscriminator(Sema &S, CallExpr *Call) {
+  if (checkArgCount(S, Call, 2))
+    return ExprError();
+  if (checkPointerAuthEnabled(S, Call))
+    return ExprError();
+  if (checkPointerAuthValue(S, Call->getArgs()[0], PAO_BlendPointer) ||
+      checkPointerAuthValue(S, Call->getArgs()[1], PAO_BlendInteger))
+    return ExprError();
+
+  Call->setType(S.Context.getUIntPtrType());
+  return Call;
+}
+
+static ExprResult SemaPointerAuthSignGenericData(Sema &S, CallExpr *Call) {
+  if (checkArgCount(S, Call, 2))
+    return ExprError();
+  if (checkPointerAuthEnabled(S, Call))
+    return ExprError();
+  if (checkPointerAuthValue(S, Call->getArgs()[0], PAO_SignGeneric) ||
+      checkPointerAuthValue(S, Call->getArgs()[1], PAO_Discriminator))
+    return ExprError();
+
+  Call->setType(S.Context.getUIntPtrType());
+  return Call;
+}
+
+static ExprResult SemaPointerAuthSignOrAuth(Sema &S, CallExpr *Call,
+                                            PointerAuthOpKind OpKind) {
+  if (checkArgCount(S, Call, 3))
+    return ExprError();
+  if (checkPointerAuthEnabled(S, Call))
+    return ExprError();
+  if (checkPointerAuthValue(S, Call->getArgs()[0], OpKind) ||
+      checkPointerAuthKey(S, Call->getArgs()[1]) ||
+      checkPointerAuthValue(S, Call->getArgs()[2], PAO_Discriminator))
+    return ExprError();
+
+  Call->setType(Call->getArgs()[0]->getType());
+  return Call;
+}
+
+static ExprResult SemaPointerAuthAuthAndResign(Sema &S, CallExpr *Call) {
+  if (checkArgCount(S, Call, 5))
+    return ExprError();
+  if (checkPointerAuthEnabled(S, Call))
+    return ExprError();
+  if (checkPointerAuthValue(S, Call->getArgs()[0], PAO_Auth) ||
+      checkPointerAuthKey(S, Call->getArgs()[1]) ||
+      checkPointerAuthValue(S, Call->getArgs()[2], PAO_Discriminator) ||
+      checkPointerAuthKey(S, Call->getArgs()[3]) ||
+      checkPointerAuthValue(S, Call->getArgs()[4], PAO_Discriminator))
+    return ExprError();
+
+  Call->setType(Call->getArgs()[0]->getType());
+  return Call;
+}
+
 static ExprResult SemaBuiltinLaunder(Sema &S, CallExpr *TheCall) {
   if (checkArgCount(S, TheCall, 1))
     return ExprError();
@@ -2683,6 +2868,18 @@ Sema::CheckBuiltinFunctionCall(FunctionDecl *FDecl, unsigned BuiltinID,
     }
     break;
   }
+  case Builtin::BI__builtin_ptrauth_strip:
+    return SemaPointerAuthStrip(*this, TheCall);
+  case Builtin::BI__builtin_ptrauth_blend_discriminator:
+    return SemaPointerAuthBlendDiscriminator(*this, TheCall);
+  case Builtin::BI__builtin_ptrauth_sign_unauthenticated:
+    return SemaPointerAuthSignOrAuth(*this, TheCall, PAO_Sign);
+  case Builtin::BI__builtin_ptrauth_auth:
+    return SemaPointerAuthSignOrAuth(*this, TheCall, PAO_Auth);
+  case Builtin::BI__builtin_ptrauth_sign_generic_data:
+    return SemaPointerAuthSignGenericData(*this, TheCall);
+  case Builtin::BI__builtin_ptrauth_auth_and_resign:
+    return SemaPointerAuthAuthAndResign(*this, TheCall);
   // OpenCL v2.0, s6.13.16 - Pipe functions
   case Builtin::BIread_pipe:
   case Builtin::BIwrite_pipe:
@@ -4992,6 +5189,7 @@ static bool isPPC_64Builtin(unsigned BuiltinID) {
   case PPC::BI__builtin_ppc_fetch_and_andlp:
   case PPC::BI__builtin_ppc_fetch_and_orlp:
   case PPC::BI__builtin_ppc_fetch_and_swaplp:
+  case PPC::BI__builtin_ppc_rldimi:
     return true;
   }
   return false;
@@ -5093,8 +5291,10 @@ bool Sema::CheckPPCBuiltinFunctionCall(const TargetInfo &TI, unsigned BuiltinID,
   case PPC::BI__builtin_ppc_rlwnm:
     return SemaValueIsRunOfOnes(TheCall, 2);
   case PPC::BI__builtin_ppc_rlwimi:
+    return SemaBuiltinConstantArgRange(TheCall, 2, 0, 31) ||
+           SemaValueIsRunOfOnes(TheCall, 3);
   case PPC::BI__builtin_ppc_rldimi:
-    return SemaBuiltinConstantArg(TheCall, 2, Result) ||
+    return SemaBuiltinConstantArgRange(TheCall, 2, 0, 63) ||
            SemaValueIsRunOfOnes(TheCall, 3);
   case PPC::BI__builtin_ppc_addex: {
     if (SemaBuiltinConstantArgRange(TheCall, 2, 0, 3))
@@ -5233,10 +5433,6 @@ bool CheckVectorElementCallArgs(Sema *S, CallExpr *TheCall) {
                            TheCall->getArg(1)->getEndLoc());
         retValue = true;
       }
-
-      if (!retValue)
-        TheCall->setType(VecTyA->getElementType());
-
       return retValue;
     }
   }
@@ -5250,11 +5446,12 @@ bool CheckVectorElementCallArgs(Sema *S, CallExpr *TheCall) {
   return true;
 }
 
-bool CheckAllArgsHaveFloatRepresentation(Sema *S, CallExpr *TheCall) {
-  QualType ExpectedType = S->Context.FloatTy;
+bool CheckArgsTypesAreCorrect(
+    Sema *S, CallExpr *TheCall, QualType ExpectedType,
+    llvm::function_ref<bool(clang::QualType PassedType)> Check) {
   for (unsigned i = 0; i < TheCall->getNumArgs(); ++i) {
     QualType PassedType = TheCall->getArg(i)->getType();
-    if (!PassedType->hasFloatingRepresentation()) {
+    if (Check(PassedType)) {
       if (auto *VecTyA = PassedType->getAs<VectorType>())
         ExpectedType = S->Context.getVectorType(
             ExpectedType, VecTyA->getNumElements(), VecTyA->getVectorKind());
@@ -5267,6 +5464,45 @@ bool CheckAllArgsHaveFloatRepresentation(Sema *S, CallExpr *TheCall) {
   return false;
 }
 
+bool CheckAllArgsHaveFloatRepresentation(Sema *S, CallExpr *TheCall) {
+  auto checkAllFloatTypes = [](clang::QualType PassedType) -> bool {
+    return !PassedType->hasFloatingRepresentation();
+  };
+  return CheckArgsTypesAreCorrect(S, TheCall, S->Context.FloatTy,
+                                  checkAllFloatTypes);
+}
+
+bool CheckFloatOrHalfRepresentations(Sema *S, CallExpr *TheCall) {
+  auto checkFloatorHalf = [](clang::QualType PassedType) -> bool {
+    clang::QualType BaseType =
+        PassedType->isVectorType()
+            ? PassedType->getAs<clang::VectorType>()->getElementType()
+            : PassedType;
+    return !BaseType->isHalfType() && !BaseType->isFloat32Type();
+  };
+  return CheckArgsTypesAreCorrect(S, TheCall, S->Context.FloatTy,
+                                  checkFloatorHalf);
+}
+
+bool CheckNoDoubleVectors(Sema *S, CallExpr *TheCall) {
+  auto checkDoubleVector = [](clang::QualType PassedType) -> bool {
+    if (const auto *VecTy = PassedType->getAs<VectorType>())
+      return VecTy->getElementType()->isDoubleType();
+    return false;
+  };
+  return CheckArgsTypesAreCorrect(S, TheCall, S->Context.FloatTy,
+                                  checkDoubleVector);
+}
+
+void SetElementTypeAsReturnType(Sema *S, CallExpr *TheCall,
+                                QualType ReturnType) {
+  auto *VecTyA = TheCall->getArg(0)->getType()->getAs<VectorType>();
+  if (VecTyA)
+    ReturnType = S->Context.getVectorType(ReturnType, VecTyA->getNumElements(),
+                                          VectorKind::Generic);
+  TheCall->setType(ReturnType);
+}
+
 // Note: returning true in this case results in CheckBuiltinFunctionCall
 // returning an ExprError
 bool Sema::CheckHLSLBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) {
@@ -5276,6 +5512,17 @@ bool Sema::CheckHLSLBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) {
       return true;
     break;
   }
+  case Builtin::BI__builtin_hlsl_elementwise_clamp: {
+    if (checkArgCount(*this, TheCall, 3))
+      return true;
+    if (CheckVectorElementCallArgs(this, TheCall))
+      return true;
+    if (SemaBuiltinElementwiseTernaryMath(
+            TheCall, /*CheckForFloatArgs*/
+            TheCall->getArg(0)->getType()->hasFloatingRepresentation()))
+      return true;
+    break;
+  }
   case Builtin::BI__builtin_hlsl_dot: {
     if (checkArgCount(*this, TheCall, 2))
       return true;
@@ -5283,14 +5530,31 @@ bool Sema::CheckHLSLBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) {
       return true;
     if (SemaBuiltinVectorToScalarMath(TheCall))
       return true;
+    if (CheckNoDoubleVectors(this, TheCall))
+      return true;
     break;
   }
-  case Builtin::BI__builtin_hlsl_elementwise_rcp:
+  case Builtin::BI__builtin_hlsl_elementwise_rcp: {
+    if (CheckAllArgsHaveFloatRepresentation(this, TheCall))
+      return true;
+    if (PrepareBuiltinElementwiseMathOneArgCall(TheCall))
+      return true;
+    break;
+  }
+  case Builtin::BI__builtin_hlsl_elementwise_rsqrt:
   case Builtin::BI__builtin_hlsl_elementwise_frac: {
+    if (CheckFloatOrHalfRepresentations(this, TheCall))
+      return true;
     if (PrepareBuiltinElementwiseMathOneArgCall(TheCall))
       return true;
-    if (CheckAllArgsHaveFloatRepresentation(this, TheCall))
+    break;
+  }
+  case Builtin::BI__builtin_hlsl_elementwise_isinf: {
+    if (CheckFloatOrHalfRepresentations(this, TheCall))
       return true;
+    if (PrepareBuiltinElementwiseMathOneArgCall(TheCall))
+      return true;
+    SetElementTypeAsReturnType(this, TheCall, this->Context.BoolTy);
     break;
   }
   case Builtin::BI__builtin_hlsl_lerp: {
@@ -5300,7 +5564,7 @@ bool Sema::CheckHLSLBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) {
       return true;
     if (SemaBuiltinElementwiseTernaryMath(TheCall))
       return true;
-    if (CheckAllArgsHaveFloatRepresentation(this, TheCall))
+    if (CheckFloatOrHalfRepresentations(this, TheCall))
       return true;
     break;
   }
@@ -5309,7 +5573,9 @@ bool Sema::CheckHLSLBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) {
       return true;
     if (CheckVectorElementCallArgs(this, TheCall))
       return true;
-    if (SemaBuiltinElementwiseTernaryMath(TheCall, /*CheckForFloatArgs*/ false))
+    if (SemaBuiltinElementwiseTernaryMath(
+            TheCall, /*CheckForFloatArgs*/
+            TheCall->getArg(0)->getType()->hasFloatingRepresentation()))
       return true;
   }
   }
@@ -16351,12 +16617,26 @@ static void AnalyzeImplicitConversions(
         BO->getRHS()->isKnownToHaveBooleanValue() &&
         BO->getLHS()->HasSideEffects(S.Context) &&
         BO->getRHS()->HasSideEffects(S.Context)) {
-      S.Diag(BO->getBeginLoc(), diag::warn_bitwise_instead_of_logical)
-          << (BO->getOpcode() == BO_And ? "&" : "|") << OrigE->getSourceRange()
-          << FixItHint::CreateReplacement(
-                 BO->getOperatorLoc(),
-                 (BO->getOpcode() == BO_And ? "&&" : "||"));
-      S.Diag(BO->getBeginLoc(), diag::note_cast_operand_to_int);
+      SourceManager &SM = S.getSourceManager();
+      const LangOptions &LO = S.getLangOpts();
+      SourceLocation BLoc = BO->getOperatorLoc();
+      SourceLocation ELoc = Lexer::getLocForEndOfToken(BLoc, 0, SM, LO);
+      StringRef SR = clang::Lexer::getSourceText(
+          clang::CharSourceRange::getTokenRange(BLoc, ELoc), SM, LO);
+      // To reduce false positives, only issue the diagnostic if the operator
+      // is explicitly spelled as a punctuator. This suppresses the diagnostic
+      // when using 'bitand' or 'bitor' either as keywords in C++ or as macros
+      // in C, along with other macro spellings the user might invent.
+      if (SR.str() == "&" || SR.str() == "|") {
+
+        S.Diag(BO->getBeginLoc(), diag::warn_bitwise_instead_of_logical)
+            << (BO->getOpcode() == BO_And ? "&" : "|")
+            << OrigE->getSourceRange()
+            << FixItHint::CreateReplacement(
+                   BO->getOperatorLoc(),
+                   (BO->getOpcode() == BO_And ? "&&" : "||"));
+        S.Diag(BO->getBeginLoc(), diag::note_cast_operand_to_int);
+      }
     }
 
   // For conditional operators, we analyze the arguments as if they
diff --git a/clang/lib/Sema/SemaConcept.cpp b/clang/lib/Sema/SemaConcept.cpp
index a8e387e35fb4c9..1c546e9f5894f0 100644
--- a/clang/lib/Sema/SemaConcept.cpp
+++ b/clang/lib/Sema/SemaConcept.cpp
@@ -692,11 +692,15 @@ bool Sema::CheckFunctionConstraints(const FunctionDecl *FD,
   // A lambda conversion operator has the same constraints as the call operator
   // and constraints checking relies on whether we are in a lambda call operator
   // (and may refer to its parameters), so check the call operator instead.
+  // Note that the declarations outside of the lambda should also be
+  // considered. Turning on the 'ForOverloadResolution' flag results in the
+  // LocalInstantiationScope not looking into its parents, but we can still
+  // access Decls from the parents while building a lambda RAII scope later.
   if (const auto *MD = dyn_cast<CXXConversionDecl>(FD);
       MD && isLambdaConversionOperator(const_cast<CXXConversionDecl *>(MD)))
     return CheckFunctionConstraints(MD->getParent()->getLambdaCallOperator(),
                                     Satisfaction, UsageLoc,
-                                    ForOverloadResolution);
+                                    /*ShouldAddDeclsFromParentScope=*/true);
 
   DeclContext *CtxToSave = const_cast<FunctionDecl *>(FD);
 
diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp
index 5850cd0ab6b9aa..8ceb79555fb5e7 100644
--- a/clang/lib/Sema/SemaDecl.cpp
+++ b/clang/lib/Sema/SemaDecl.cpp
@@ -2271,12 +2271,6 @@ void Sema::ActOnPopScope(SourceLocation Loc, Scope *S) {
       }
       ShadowingDecls.erase(ShadowI);
     }
-
-    if (!getLangOpts().CPlusPlus && S->isClassScope()) {
-      if (auto *FD = dyn_cast<FieldDecl>(TmpD);
-          FD && FD->hasAttr<CountedByAttr>())
-        CheckCountedByAttr(S, FD);
-    }
   }
 
   llvm::sort(DeclDiags,
diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
index ec00fdf3f88d9e..0a62c656d824ff 100644
--- a/clang/lib/Sema/SemaDeclAttr.cpp
+++ b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -8556,133 +8556,114 @@ static void handleZeroCallUsedRegsAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
   D->addAttr(ZeroCallUsedRegsAttr::Create(S.Context, Kind, AL));
 }
 
-static void handleCountedByAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
-  if (!AL.isArgIdent(0)) {
-    S.Diag(AL.getLoc(), diag::err_attribute_argument_type)
-        << AL << AANT_ArgumentIdentifier;
-    return;
+static const RecordDecl *GetEnclosingNamedOrTopAnonRecord(const FieldDecl *FD) {
+  const auto *RD = FD->getParent();
+  // An unnamed struct is anonymous struct only if it's not instantiated.
+  // However, the struct may not be fully processed yet to determine
+  // whether it's anonymous or not. In that case, this function treats it as
+  // an anonymous struct and tries to find a named parent.
+  while (RD && (RD->isAnonymousStructOrUnion() ||
+                (!RD->isCompleteDefinition() && RD->getName().empty()))) {
+    const auto *Parent = dyn_cast<RecordDecl>(RD->getParent());
+    if (!Parent)
+      break;
+    RD = Parent;
   }
-
-  IdentifierLoc *IL = AL.getArgAsIdent(0);
-  CountedByAttr *CBA =
-      ::new (S.Context) CountedByAttr(S.Context, AL, IL->Ident);
-  CBA->setCountedByFieldLoc(IL->Loc);
-  D->addAttr(CBA);
+  return RD;
 }
 
-static const FieldDecl *
-FindFieldInTopLevelOrAnonymousStruct(const RecordDecl *RD,
-                                     const IdentifierInfo *FieldName) {
-  for (const Decl *D : RD->decls()) {
-    if (const auto *FD = dyn_cast<FieldDecl>(D))
-      if (FD->getName() == FieldName->getName())
-        return FD;
-
-    if (const auto *R = dyn_cast<RecordDecl>(D))
-      if (const FieldDecl *FD =
-              FindFieldInTopLevelOrAnonymousStruct(R, FieldName))
-        return FD;
+static bool
+CheckCountExpr(Sema &S, FieldDecl *FD, Expr *E,
+               llvm::SmallVectorImpl<TypeCoupledDeclRefInfo> &Decls) {
+  if (FD->getParent()->isUnion()) {
+    S.Diag(FD->getBeginLoc(), diag::err_counted_by_attr_in_union)
+        << FD->getSourceRange();
+    return true;
   }
 
-  return nullptr;
-}
+  if (!E->getType()->isIntegerType() || E->getType()->isBooleanType()) {
+    S.Diag(E->getBeginLoc(), diag::err_counted_by_attr_argument_not_integer)
+        << E->getSourceRange();
+    return true;
+  }
 
-bool Sema::CheckCountedByAttr(Scope *S, const FieldDecl *FD) {
   LangOptions::StrictFlexArraysLevelKind StrictFlexArraysLevel =
       LangOptions::StrictFlexArraysLevelKind::IncompleteOnly;
-  if (!Decl::isFlexibleArrayMemberLike(Context, FD, FD->getType(),
+
+  if (!Decl::isFlexibleArrayMemberLike(S.getASTContext(), FD, FD->getType(),
                                        StrictFlexArraysLevel, true)) {
     // The "counted_by" attribute must be on a flexible array member.
     SourceRange SR = FD->getLocation();
-    Diag(SR.getBegin(), diag::err_counted_by_attr_not_on_flexible_array_member)
+    S.Diag(SR.getBegin(),
+           diag::err_counted_by_attr_not_on_flexible_array_member)
         << SR;
     return true;
   }
 
-  const auto *CBA = FD->getAttr<CountedByAttr>();
-  const IdentifierInfo *FieldName = CBA->getCountedByField();
-
-  auto GetNonAnonStructOrUnion = [](const RecordDecl *RD) {
-    while (RD && !RD->getDeclName())
-      if (const auto *R = dyn_cast<RecordDecl>(RD->getDeclContext()))
-        RD = R;
-      else
-        break;
-
-    return RD;
-  };
-
-  const RecordDecl *EnclosingRD = GetNonAnonStructOrUnion(FD->getParent());
-  const FieldDecl *CountFD =
-      FindFieldInTopLevelOrAnonymousStruct(EnclosingRD, FieldName);
+  auto *DRE = dyn_cast<DeclRefExpr>(E);
+  if (!DRE) {
+    S.Diag(E->getBeginLoc(),
+           diag::err_counted_by_attr_only_support_simple_decl_reference)
+        << E->getSourceRange();
+    return true;
+  }
 
+  auto *CountDecl = DRE->getDecl();
+  FieldDecl *CountFD = dyn_cast<FieldDecl>(CountDecl);
+  if (auto *IFD = dyn_cast<IndirectFieldDecl>(CountDecl)) {
+    CountFD = IFD->getAnonField();
+  }
   if (!CountFD) {
-    DeclarationNameInfo NameInfo(FieldName,
-                                 CBA->getCountedByFieldLoc().getBegin());
-    LookupResult MemResult(*this, NameInfo, Sema::LookupMemberName);
-    LookupName(MemResult, S);
-
-    if (!MemResult.empty()) {
-      SourceRange SR = CBA->getCountedByFieldLoc();
-      Diag(SR.getBegin(), diag::err_flexible_array_count_not_in_same_struct)
-          << CBA->getCountedByField() << SR;
-
-      if (auto *ND = MemResult.getAsSingle<NamedDecl>()) {
-        SR = ND->getLocation();
-        Diag(SR.getBegin(), diag::note_flexible_array_counted_by_attr_field)
-            << ND << SR;
-      }
+    S.Diag(E->getBeginLoc(), diag::err_counted_by_must_be_in_structure)
+        << CountDecl << E->getSourceRange();
 
-      return true;
-    } else {
-      // The "counted_by" field needs to exist in the struct.
-      LookupResult OrdResult(*this, NameInfo, Sema::LookupOrdinaryName);
-      LookupName(OrdResult, S);
-
-      if (!OrdResult.empty()) {
-        SourceRange SR = FD->getLocation();
-        Diag(SR.getBegin(), diag::err_counted_by_must_be_in_structure)
-            << FieldName << SR;
-
-        if (auto *ND = OrdResult.getAsSingle<NamedDecl>()) {
-          SR = ND->getLocation();
-          Diag(SR.getBegin(), diag::note_flexible_array_counted_by_attr_field)
-              << ND << SR;
-        }
+    S.Diag(CountDecl->getBeginLoc(),
+           diag::note_flexible_array_counted_by_attr_field)
+        << CountDecl << CountDecl->getSourceRange();
+    return true;
+  }
 
-        return true;
-      }
+  if (FD->getParent() != CountFD->getParent()) {
+    if (CountFD->getParent()->isUnion()) {
+      S.Diag(CountFD->getBeginLoc(), diag::err_counted_by_attr_refer_to_union)
+          << CountFD->getSourceRange();
+      return true;
+    }
+    // Whether CountRD is an anonymous struct is not determined at this
+    // point. Thus, an additional diagnostic in case it's not anonymous struct
+    // is done later in `Parser::ParseStructDeclaration`.
+    auto *RD = GetEnclosingNamedOrTopAnonRecord(FD);
+    auto *CountRD = GetEnclosingNamedOrTopAnonRecord(CountFD);
+
+    if (RD != CountRD) {
+      S.Diag(E->getBeginLoc(),
+             diag::err_flexible_array_count_not_in_same_struct)
+          << CountFD << E->getSourceRange();
+      S.Diag(CountFD->getBeginLoc(),
+             diag::note_flexible_array_counted_by_attr_field)
+          << CountFD << CountFD->getSourceRange();
+      return true;
     }
-
-    CXXScopeSpec SS;
-    DeclFilterCCC<FieldDecl> Filter(FieldName);
-    return DiagnoseEmptyLookup(S, SS, MemResult, Filter, nullptr, std::nullopt,
-                               const_cast<DeclContext *>(FD->getDeclContext()));
   }
 
-  if (CountFD->hasAttr<CountedByAttr>()) {
-    // The "counted_by" field can't point to the flexible array member.
-    SourceRange SR = CBA->getCountedByFieldLoc();
-    Diag(SR.getBegin(), diag::err_counted_by_attr_refers_to_flexible_array)
-        << CBA->getCountedByField() << SR;
-    return true;
-  }
+  Decls.push_back(TypeCoupledDeclRefInfo(CountFD, /*IsDref*/ false));
+  return false;
+}
 
-  if (!CountFD->getType()->isIntegerType() ||
-      CountFD->getType()->isBooleanType()) {
-    // The "counted_by" field must have an integer type.
-    SourceRange SR = CBA->getCountedByFieldLoc();
-    Diag(SR.getBegin(),
-         diag::err_flexible_array_counted_by_attr_field_not_integer)
-        << CBA->getCountedByField() << SR;
+static void handleCountedByAttrField(Sema &S, Decl *D, const ParsedAttr &AL) {
+  auto *FD = dyn_cast<FieldDecl>(D);
+  assert(FD);
 
-    SR = CountFD->getLocation();
-    Diag(SR.getBegin(), diag::note_flexible_array_counted_by_attr_field)
-        << CountFD << SR;
-    return true;
-  }
+  auto *CountExpr = AL.getArgAsExpr(0);
+  if (!CountExpr)
+    return;
 
-  return false;
+  llvm::SmallVector<TypeCoupledDeclRefInfo, 1> Decls;
+  if (CheckCountExpr(S, FD, CountExpr, Decls))
+    return;
+
+  QualType CAT = S.BuildCountAttributedArrayType(FD->getType(), CountExpr);
+  FD->setType(CAT);
 }
 
 static void handleFunctionReturnThunksAttr(Sema &S, Decl *D,
@@ -9704,7 +9685,7 @@ ProcessDeclAttribute(Sema &S, Scope *scope, Decl *D, const ParsedAttr &AL,
     break;
 
   case ParsedAttr::AT_CountedBy:
-    handleCountedByAttr(S, D, AL);
+    handleCountedByAttrField(S, D, AL);
     break;
 
   // Microsoft attributes:
diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp
index 8725b09f8546cf..5f03b981428251 100644
--- a/clang/lib/Sema/SemaExpr.cpp
+++ b/clang/lib/Sema/SemaExpr.cpp
@@ -2737,6 +2737,24 @@ Sema::ActOnIdExpression(Scope *S, CXXScopeSpec &SS,
     return ActOnDependentIdExpression(SS, TemplateKWLoc, NameInfo,
                                       IsAddressOfOperand, TemplateArgs);
 
+  // BoundsSafety: This specially handles arguments of bounds attributes
+  // appertains to a type of C struct field such that the name lookup
+  // within a struct finds the member name, which is not the case for other
+  // contexts in C.
+  if (isBoundsAttrContext() && !getLangOpts().CPlusPlus && S->isClassScope()) {
+    // See if this is reference to a field of struct.
+    LookupResult R(*this, NameInfo, LookupMemberName);
+    // LookupParsedName handles a name lookup from within anonymous struct.
+    if (LookupParsedName(R, S, &SS)) {
+      if (auto *VD = dyn_cast<ValueDecl>(R.getFoundDecl())) {
+        QualType type = VD->getType().getNonReferenceType();
+        // This will eventually be translated into MemberExpr upon
+        // the use of instantiated struct fields.
+        return BuildDeclRefExpr(VD, type, VK_PRValue, NameLoc);
+      }
+    }
+  }
+
   // Perform the required lookup.
   LookupResult R(*this, NameInfo,
                  (Id.getKind() == UnqualifiedIdKind::IK_ImplicitSelfParam)
@@ -2893,7 +2911,8 @@ Sema::ActOnIdExpression(Scope *S, CXXScopeSpec &SS,
   // to get this right here so that we don't end up making a
   // spuriously dependent expression if we're inside a dependent
   // instance method.
-  if (!R.empty() && (*R.begin())->isCXXClassMember()) {
+  if (getLangOpts().CPlusPlus && !R.empty() &&
+      (*R.begin())->isCXXClassMember()) {
     bool MightBeImplicitMember;
     if (!IsAddressOfOperand)
       MightBeImplicitMember = true;
@@ -3549,7 +3568,8 @@ ExprResult Sema::BuildDeclarationNameExpr(
   case Decl::Field:
   case Decl::IndirectField:
   case Decl::ObjCIvar:
-    assert(getLangOpts().CPlusPlus && "building reference to field in C?");
+    assert((getLangOpts().CPlusPlus || isBoundsAttrContext()) &&
+           "building reference to field in C?");
 
     // These can't have reference type in well-formed programs, but
     // for internal consistency we do this anyway.
@@ -3740,7 +3760,10 @@ ExprResult Sema::BuildPredefinedExpr(SourceLocation Loc,
   else {
     // Pre-defined identifiers are of type char[x], where x is the length of
     // the string.
-    auto Str = PredefinedExpr::ComputeName(IK, currentDecl);
+    bool ForceElaboratedPrinting =
+        IK == PredefinedIdentKind::Function && getLangOpts().MSVCCompat;
+    auto Str =
+        PredefinedExpr::ComputeName(IK, currentDecl, ForceElaboratedPrinting);
     unsigned Length = Str.length();
 
     llvm::APInt LengthI(32, Length + 1);
@@ -4706,6 +4729,7 @@ static void captureVariablyModifiedType(ASTContext &Context, QualType T,
     case Type::BTFTagAttributed:
     case Type::SubstTemplateTypeParm:
     case Type::MacroQualified:
+    case Type::CountAttributed:
       // Keep walking after single level desugaring.
       T = T.getSingleStepDesugaredType(Context);
       break;
diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp
index 0cc0cbacb37548..e9ad7bbde0f9b5 100644
--- a/clang/lib/Sema/SemaOpenMP.cpp
+++ b/clang/lib/Sema/SemaOpenMP.cpp
@@ -14366,7 +14366,8 @@ StmtResult Sema::ActOnOpenMPTargetParallelForSimdDirective(
   // The point of exit cannot be a branch out of the structured block.
   // longjmp() and throw() must not violate the entry/exit criteria.
   CS->getCapturedDecl()->setNothrow();
-  for (int ThisCaptureLevel = getOpenMPCaptureLevels(OMPD_target_parallel_for);
+  for (int ThisCaptureLevel =
+           getOpenMPCaptureLevels(OMPD_target_parallel_for_simd);
        ThisCaptureLevel > 1; --ThisCaptureLevel) {
     CS = cast<CapturedStmt>(CS->getCapturedStmt());
     // 1.2.2 OpenMP Language Terminology
diff --git a/clang/lib/Sema/SemaType.cpp b/clang/lib/Sema/SemaType.cpp
index 9aaacaa0771f2f..7b14323b0674b4 100644
--- a/clang/lib/Sema/SemaType.cpp
+++ b/clang/lib/Sema/SemaType.cpp
@@ -6503,6 +6503,9 @@ namespace {
     void VisitAttributedTypeLoc(AttributedTypeLoc TL) {
       fillAttributedTypeLoc(TL, State);
     }
+    void VisitCountAttributedTypeLoc(CountAttributedTypeLoc TL) {
+      // nothing
+    }
     void VisitBTFTagAttributedTypeLoc(BTFTagAttributedTypeLoc TL) {
       // nothing
     }
@@ -9754,6 +9757,26 @@ QualType Sema::BuildTypeofExprType(Expr *E, TypeOfKind Kind) {
   return Context.getTypeOfExprType(E, Kind);
 }
 
+static void
+BuildTypeCoupledDecls(Expr *E,
+                      llvm::SmallVectorImpl<TypeCoupledDeclRefInfo> &Decls) {
+  // Currently, 'counted_by' only allows direct DeclRefExpr to FieldDecl.
+  auto *CountDecl = cast<DeclRefExpr>(E)->getDecl();
+  Decls.push_back(TypeCoupledDeclRefInfo(CountDecl, /*IsDref*/ false));
+}
+
+QualType Sema::BuildCountAttributedArrayType(QualType WrappedTy,
+                                             Expr *CountExpr) {
+  assert(WrappedTy->isIncompleteArrayType());
+
+  llvm::SmallVector<TypeCoupledDeclRefInfo, 1> Decls;
+  BuildTypeCoupledDecls(CountExpr, Decls);
+  /// When the resulting expression is invalid, we still create the AST using
+  /// the original count expression for the sake of AST dump.
+  return Context.getCountAttributedType(
+      WrappedTy, CountExpr, /*CountInBytes*/ false, /*OrNull*/ false, Decls);
+}
+
 /// getDecltypeForExpr - Given an expr, will return the decltype for
 /// that expression, according to the rules in C++11
 /// [dcl.type.simple]p4 and C++11 [expr.lambda.prim]p18.
diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h
index 2d22692f3ab750..80a10647ca5d33 100644
--- a/clang/lib/Sema/TreeTransform.h
+++ b/clang/lib/Sema/TreeTransform.h
@@ -7269,6 +7269,34 @@ QualType TreeTransform<Derived>::TransformAttributedType(TypeLocBuilder &TLB,
       });
 }
 
+template <typename Derived>
+QualType TreeTransform<Derived>::TransformCountAttributedType(
+    TypeLocBuilder &TLB, CountAttributedTypeLoc TL) {
+  const CountAttributedType *OldTy = TL.getTypePtr();
+  QualType InnerTy = getDerived().TransformType(TLB, TL.getInnerLoc());
+  if (InnerTy.isNull())
+    return QualType();
+
+  Expr *OldCount = TL.getCountExpr();
+  Expr *NewCount = nullptr;
+  if (OldCount) {
+    ExprResult CountResult = getDerived().TransformExpr(OldCount);
+    if (CountResult.isInvalid())
+      return QualType();
+    NewCount = CountResult.get();
+  }
+
+  QualType Result = TL.getType();
+  if (getDerived().AlwaysRebuild() || InnerTy != OldTy->desugar() ||
+      OldCount != NewCount) {
+    // Currently, CountAttributedType can only wrap incomplete array types.
+    Result = SemaRef.BuildCountAttributedArrayType(InnerTy, NewCount);
+  }
+
+  TLB.push<CountAttributedTypeLoc>(Result);
+  return Result;
+}
+
 template <typename Derived>
 QualType TreeTransform<Derived>::TransformBTFTagAttributedType(
     TypeLocBuilder &TLB, BTFTagAttributedTypeLoc TL) {
@@ -13714,6 +13742,16 @@ TreeTransform<Derived>::TransformLambdaExpr(LambdaExpr *E) {
 
     // Capturing 'this' is trivial.
     if (C->capturesThis()) {
+      // If this is a lambda that is part of a default member initialiser
+      // and which we're instantiating outside the class that 'this' is
+      // supposed to refer to, adjust the type of 'this' accordingly.
+      //
+      // Otherwise, leave the type of 'this' as-is.
+      Sema::CXXThisScopeRAII ThisScope(
+          getSema(),
+          dyn_cast_if_present<CXXRecordDecl>(
+              getSema().getFunctionLevelDeclContext()),
+          Qualifiers());
       getSema().CheckCXXThisCapture(C->getLocation(), C->isExplicit(),
                                     /*BuildAndDiagnose*/ true, nullptr,
                                     C->getCaptureKind() == LCK_StarThis);
diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp
index ede9f6e93469b7..28e8d27fef08c6 100644
--- a/clang/lib/Serialization/ASTReader.cpp
+++ b/clang/lib/Serialization/ASTReader.cpp
@@ -6995,6 +6995,10 @@ void TypeLocReader::VisitAttributedTypeLoc(AttributedTypeLoc TL) {
   TL.setAttr(ReadAttr());
 }
 
+void TypeLocReader::VisitCountAttributedTypeLoc(CountAttributedTypeLoc TL) {
+  // Nothing to do
+}
+
 void TypeLocReader::VisitBTFTagAttributedTypeLoc(BTFTagAttributedTypeLoc TL) {
   // Nothing to do.
 }
@@ -9116,6 +9120,10 @@ DeclarationNameInfo ASTRecordReader::readDeclarationNameInfo() {
   return NameInfo;
 }
 
+TypeCoupledDeclRefInfo ASTRecordReader::readTypeCoupledDeclRefInfo() {
+  return TypeCoupledDeclRefInfo(readDeclAs<ValueDecl>(), readBool());
+}
+
 void ASTRecordReader::readQualifierInfo(QualifierInfo &Info) {
   Info.QualifierLoc = readNestedNameSpecifierLoc();
   unsigned NumTPLists = readInt();
diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp
index 3653d94c6e0739..221409d011a38c 100644
--- a/clang/lib/Serialization/ASTWriter.cpp
+++ b/clang/lib/Serialization/ASTWriter.cpp
@@ -514,6 +514,10 @@ void TypeLocWriter::VisitAttributedTypeLoc(AttributedTypeLoc TL) {
   Record.AddAttr(TL.getAttr());
 }
 
+void TypeLocWriter::VisitCountAttributedTypeLoc(CountAttributedTypeLoc TL) {
+  // Nothing to do
+}
+
 void TypeLocWriter::VisitBTFTagAttributedTypeLoc(BTFTagAttributedTypeLoc TL) {
   // Nothing to do.
 }
diff --git a/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp b/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp
index 29eb932584027d..f82288f1099e88 100644
--- a/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp
@@ -83,6 +83,8 @@ class StateUpdateReporter {
     AssumedUpperBound = UpperBoundVal;
   }
 
+  bool assumedNonNegative() { return AssumedNonNegative; }
+
   const NoteTag *createNoteTag(CheckerContext &C) const;
 
 private:
@@ -402,7 +404,8 @@ static bool tryDividePair(std::optional<int64_t> &Val1,
 }
 
 static Messages getExceedsMsgs(ASTContext &ACtx, const SubRegion *Region,
-                               NonLoc Offset, NonLoc Extent, SVal Location) {
+                               NonLoc Offset, NonLoc Extent, SVal Location,
+                               bool AlsoMentionUnderflow) {
   std::string RegName = getRegionName(Region);
   const auto *EReg = Location.getAsRegion()->getAs<ElementRegion>();
   assert(EReg && "this checker only handles element access");
@@ -414,6 +417,7 @@ static Messages getExceedsMsgs(ASTContext &ACtx, const SubRegion *Region,
   int64_t ElemSize = ACtx.getTypeSizeInChars(ElemType).getQuantity();
 
   bool UseByteOffsets = !tryDividePair(OffsetN, ExtentN, ElemSize);
+  const char *OffsetOrIndex = UseByteOffsets ? "byte offset" : "index";
 
   SmallString<256> Buf;
   llvm::raw_svector_ostream Out(Buf);
@@ -421,10 +425,12 @@ static Messages getExceedsMsgs(ASTContext &ACtx, const SubRegion *Region,
   if (!ExtentN && !UseByteOffsets)
     Out << "'" << ElemType.getAsString() << "' element in ";
   Out << RegName << " at ";
-  if (OffsetN) {
-    Out << (UseByteOffsets ? "byte offset " : "index ") << *OffsetN;
+  if (AlsoMentionUnderflow) {
+    Out << "a negative or overflowing " << OffsetOrIndex;
+  } else if (OffsetN) {
+    Out << OffsetOrIndex << " " << *OffsetN;
   } else {
-    Out << "an overflowing " << (UseByteOffsets ? "byte offset" : "index");
+    Out << "an overflowing " << OffsetOrIndex;
   }
   if (ExtentN) {
     Out << ", while it holds only ";
@@ -441,17 +447,20 @@ static Messages getExceedsMsgs(ASTContext &ACtx, const SubRegion *Region,
       Out << "s";
   }
 
-  return {
-      formatv("Out of bound access to memory after the end of {0}", RegName),
-      std::string(Buf)};
+  return {formatv("Out of bound access to memory {0} {1}",
+                  AlsoMentionUnderflow ? "around" : "after the end of",
+                  RegName),
+          std::string(Buf)};
 }
 
-static Messages getTaintMsgs(const SubRegion *Region, const char *OffsetName) {
+static Messages getTaintMsgs(const SubRegion *Region, const char *OffsetName,
+                             bool AlsoMentionUnderflow) {
   std::string RegName = getRegionName(Region);
   return {formatv("Potential out of bound access to {0} with tainted {1}",
                   RegName, OffsetName),
-          formatv("Access of {0} with a tainted {1} that may be too large",
-                  RegName, OffsetName)};
+          formatv("Access of {0} with a tainted {1} that may be {2}too large",
+                  RegName, OffsetName,
+                  AlsoMentionUnderflow ? "negative or " : "")};
 }
 
 const NoteTag *StateUpdateReporter::createNoteTag(CheckerContext &C) const {
@@ -600,6 +609,13 @@ void ArrayBoundCheckerV2::performCheck(const Expr *E, CheckerContext &C) const {
   // CHECK UPPER BOUND
   DefinedOrUnknownSVal Size = getDynamicExtent(State, Reg, SVB);
   if (auto KnownSize = Size.getAs<NonLoc>()) {
+    // In a situation where both overflow and overflow are possible (but the
+    // index is either tainted or known to be invalid), the logic of this
+    // checker will first assume that the offset is non-negative, and then
+    // (with this additional assumption) it will detect an overflow error.
+    // In this situation the warning message should mention both possibilities.
+    bool AlsoMentionUnderflow = SUR.assumedNonNegative();
+
     auto [WithinUpperBound, ExceedsUpperBound] =
         compareValueToThreshold(State, ByteOffset, *KnownSize, SVB);
 
@@ -615,8 +631,9 @@ void ArrayBoundCheckerV2::performCheck(const Expr *E, CheckerContext &C) const {
           return;
         }
 
-        Messages Msgs = getExceedsMsgs(C.getASTContext(), Reg, ByteOffset,
-                                       *KnownSize, Location);
+        Messages Msgs =
+            getExceedsMsgs(C.getASTContext(), Reg, ByteOffset, *KnownSize,
+                           Location, AlsoMentionUnderflow);
         reportOOB(C, ExceedsUpperBound, Msgs, ByteOffset, KnownSize);
         return;
       }
@@ -632,7 +649,7 @@ void ArrayBoundCheckerV2::performCheck(const Expr *E, CheckerContext &C) const {
           if (isTainted(State, ASE->getIdx(), C.getLocationContext()))
             OffsetName = "index";
 
-        Messages Msgs = getTaintMsgs(Reg, OffsetName);
+        Messages Msgs = getTaintMsgs(Reg, OffsetName, AlsoMentionUnderflow);
         reportOOB(C, ExceedsUpperBound, Msgs, ByteOffset, KnownSize,
                   /*IsTaintBug=*/true);
         return;
diff --git a/clang/lib/StaticAnalyzer/Checkers/BlockInCriticalSectionChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/BlockInCriticalSectionChecker.cpp
index 66e080adb1382b..e4373915410fb2 100644
--- a/clang/lib/StaticAnalyzer/Checkers/BlockInCriticalSectionChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/BlockInCriticalSectionChecker.cpp
@@ -20,48 +20,178 @@
 #include "clang/StaticAnalyzer/Core/PathSensitive/CallDescription.h"
 #include "clang/StaticAnalyzer/Core/PathSensitive/CallEvent.h"
 #include "clang/StaticAnalyzer/Core/PathSensitive/CheckerContext.h"
+#include "clang/StaticAnalyzer/Core/PathSensitive/ProgramStateTrait.h"
+#include "clang/StaticAnalyzer/Core/PathSensitive/ProgramState_Fwd.h"
+#include "clang/StaticAnalyzer/Core/PathSensitive/SVals.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringExtras.h"
+
+#include <iterator>
+#include <utility>
+#include <variant>
 
 using namespace clang;
 using namespace ento;
 
 namespace {
+
+struct CritSectionMarker {
+  const Expr *LockExpr{};
+  const MemRegion *LockReg{};
+
+  void Profile(llvm::FoldingSetNodeID &ID) const {
+    ID.Add(LockExpr);
+    ID.Add(LockReg);
+  }
+
+  [[nodiscard]] constexpr bool
+  operator==(const CritSectionMarker &Other) const noexcept {
+    return LockExpr == Other.LockExpr && LockReg == Other.LockReg;
+  }
+  [[nodiscard]] constexpr bool
+  operator!=(const CritSectionMarker &Other) const noexcept {
+    return !(*this == Other);
+  }
+};
+
+class CallDescriptionBasedMatcher {
+  CallDescription LockFn;
+  CallDescription UnlockFn;
+
+public:
+  CallDescriptionBasedMatcher(CallDescription &&LockFn,
+                              CallDescription &&UnlockFn)
+      : LockFn(std::move(LockFn)), UnlockFn(std::move(UnlockFn)) {}
+  [[nodiscard]] bool matches(const CallEvent &Call, bool IsLock) const {
+    if (IsLock) {
+      return LockFn.matches(Call);
+    }
+    return UnlockFn.matches(Call);
+  }
+};
+
+class FirstArgMutexDescriptor : public CallDescriptionBasedMatcher {
+public:
+  FirstArgMutexDescriptor(CallDescription &&LockFn, CallDescription &&UnlockFn)
+      : CallDescriptionBasedMatcher(std::move(LockFn), std::move(UnlockFn)) {}
+
+  [[nodiscard]] const MemRegion *getRegion(const CallEvent &Call, bool) const {
+    return Call.getArgSVal(0).getAsRegion();
+  }
+};
+
+class MemberMutexDescriptor : public CallDescriptionBasedMatcher {
+public:
+  MemberMutexDescriptor(CallDescription &&LockFn, CallDescription &&UnlockFn)
+      : CallDescriptionBasedMatcher(std::move(LockFn), std::move(UnlockFn)) {}
+
+  [[nodiscard]] const MemRegion *getRegion(const CallEvent &Call, bool) const {
+    return cast<CXXMemberCall>(Call).getCXXThisVal().getAsRegion();
+  }
+};
+
+class RAIIMutexDescriptor {
+  mutable const IdentifierInfo *Guard{};
+  mutable bool IdentifierInfoInitialized{};
+  mutable llvm::SmallString<32> GuardName{};
+
+  void initIdentifierInfo(const CallEvent &Call) const {
+    if (!IdentifierInfoInitialized) {
+      // In case of checking C code, or when the corresponding headers are not
+      // included, we might end up query the identifier table every time when
+      // this function is called instead of early returning it. To avoid this, a
+      // bool variable (IdentifierInfoInitialized) is used and the function will
+      // be run only once.
+      Guard = &Call.getCalleeAnalysisDeclContext()->getASTContext().Idents.get(
+          GuardName);
+      IdentifierInfoInitialized = true;
+    }
+  }
+
+  template <typename T> bool matchesImpl(const CallEvent &Call) const {
+    const T *C = dyn_cast<T>(&Call);
+    if (!C)
+      return false;
+    const IdentifierInfo *II =
+        cast<CXXRecordDecl>(C->getDecl()->getParent())->getIdentifier();
+    return II == Guard;
+  }
+
+public:
+  RAIIMutexDescriptor(StringRef GuardName) : GuardName(GuardName) {}
+  [[nodiscard]] bool matches(const CallEvent &Call, bool IsLock) const {
+    initIdentifierInfo(Call);
+    if (IsLock) {
+      return matchesImpl<CXXConstructorCall>(Call);
+    }
+    return matchesImpl<CXXDestructorCall>(Call);
+  }
+  [[nodiscard]] const MemRegion *getRegion(const CallEvent &Call,
+                                           bool IsLock) const {
+    const MemRegion *LockRegion = nullptr;
+    if (IsLock) {
+      if (std::optional<SVal> Object = Call.getReturnValueUnderConstruction()) {
+        LockRegion = Object->getAsRegion();
+      }
+    } else {
+      LockRegion = cast<CXXDestructorCall>(Call).getCXXThisVal().getAsRegion();
+    }
+    return LockRegion;
+  }
+};
+
+using MutexDescriptor =
+    std::variant<FirstArgMutexDescriptor, MemberMutexDescriptor,
+                 RAIIMutexDescriptor>;
+
 class BlockInCriticalSectionChecker : public Checker<check::PostCall> {
-  mutable IdentifierInfo *IILockGuard = nullptr;
-  mutable IdentifierInfo *IIUniqueLock = nullptr;
-  mutable bool IdentifierInfoInitialized = false;
-
-  const CallDescription LockFn{{"lock"}};
-  const CallDescription UnlockFn{{"unlock"}};
-  const CallDescription SleepFn{{"sleep"}};
-  const CallDescription GetcFn{{"getc"}};
-  const CallDescription FgetsFn{{"fgets"}};
-  const CallDescription ReadFn{{"read"}};
-  const CallDescription RecvFn{{"recv"}};
-  const CallDescription PthreadLockFn{{"pthread_mutex_lock"}};
-  const CallDescription PthreadTryLockFn{{"pthread_mutex_trylock"}};
-  const CallDescription PthreadUnlockFn{{"pthread_mutex_unlock"}};
-  const CallDescription MtxLock{{"mtx_lock"}};
-  const CallDescription MtxTimedLock{{"mtx_timedlock"}};
-  const CallDescription MtxTryLock{{"mtx_trylock"}};
-  const CallDescription MtxUnlock{{"mtx_unlock"}};
-
-  const llvm::StringLiteral ClassLockGuard{"lock_guard"};
-  const llvm::StringLiteral ClassUniqueLock{"unique_lock"};
+private:
+  const std::array<MutexDescriptor, 8> MutexDescriptors{
+      MemberMutexDescriptor(
+          CallDescription(/*QualifiedName=*/{"std", "mutex", "lock"},
+                          /*RequiredArgs=*/0),
+          CallDescription({"std", "mutex", "unlock"}, 0)),
+      FirstArgMutexDescriptor(CallDescription({"pthread_mutex_lock"}, 1),
+                              CallDescription({"pthread_mutex_unlock"}, 1)),
+      FirstArgMutexDescriptor(CallDescription({"mtx_lock"}, 1),
+                              CallDescription({"mtx_unlock"}, 1)),
+      FirstArgMutexDescriptor(CallDescription({"pthread_mutex_trylock"}, 1),
+                              CallDescription({"pthread_mutex_unlock"}, 1)),
+      FirstArgMutexDescriptor(CallDescription({"mtx_trylock"}, 1),
+                              CallDescription({"mtx_unlock"}, 1)),
+      FirstArgMutexDescriptor(CallDescription({"mtx_timedlock"}, 1),
+                              CallDescription({"mtx_unlock"}, 1)),
+      RAIIMutexDescriptor("lock_guard"),
+      RAIIMutexDescriptor("unique_lock")};
+
+  const std::array<CallDescription, 5> BlockingFunctions{
+      ArrayRef{StringRef{"sleep"}}, ArrayRef{StringRef{"getc"}},
+      ArrayRef{StringRef{"fgets"}}, ArrayRef{StringRef{"read"}},
+      ArrayRef{StringRef{"recv"}}};
 
   const BugType BlockInCritSectionBugType{
       this, "Call to blocking function in critical section", "Blocking Error"};
 
-  void initIdentifierInfo(ASTContext &Ctx) const;
+  void reportBlockInCritSection(const CallEvent &call, CheckerContext &C) const;
 
-  void reportBlockInCritSection(SymbolRef FileDescSym,
-                                const CallEvent &call,
-                                CheckerContext &C) const;
+  [[nodiscard]] const NoteTag *createCritSectionNote(CritSectionMarker M,
+                                                     CheckerContext &C) const;
 
-public:
-  bool isBlockingFunction(const CallEvent &Call) const;
-  bool isLockFunction(const CallEvent &Call) const;
-  bool isUnlockFunction(const CallEvent &Call) const;
+  [[nodiscard]] std::optional<MutexDescriptor>
+  checkDescriptorMatch(const CallEvent &Call, CheckerContext &C,
+                       bool IsLock) const;
+
+  void handleLock(const MutexDescriptor &Mutex, const CallEvent &Call,
+                  CheckerContext &C) const;
 
+  void handleUnlock(const MutexDescriptor &Mutex, const CallEvent &Call,
+                    CheckerContext &C) const;
+
+  [[nodiscard]] bool isBlockingInCritSection(const CallEvent &Call,
+                                             CheckerContext &C) const;
+
+public:
   /// Process unlock.
   /// Process lock.
   /// Process blocking functions (sleep, getc, fgets, read, recv)
@@ -70,73 +200,118 @@ class BlockInCriticalSectionChecker : public Checker<check::PostCall> {
 
 } // end anonymous namespace
 
-REGISTER_TRAIT_WITH_PROGRAMSTATE(MutexCounter, unsigned)
-
-void BlockInCriticalSectionChecker::initIdentifierInfo(ASTContext &Ctx) const {
-  if (!IdentifierInfoInitialized) {
-    /* In case of checking C code, or when the corresponding headers are not
-     * included, we might end up query the identifier table every time when this
-     * function is called instead of early returning it. To avoid this, a bool
-     * variable (IdentifierInfoInitialized) is used and the function will be run
-     * only once. */
-    IILockGuard  = &Ctx.Idents.get(ClassLockGuard);
-    IIUniqueLock = &Ctx.Idents.get(ClassUniqueLock);
-    IdentifierInfoInitialized = true;
-  }
+REGISTER_LIST_WITH_PROGRAMSTATE(ActiveCritSections, CritSectionMarker)
+
+namespace std {
+// Iterator traits for ImmutableList data structure
+// that enable the use of STL algorithms.
+// TODO: Move these to llvm::ImmutableList when overhauling immutable data
+// structures for proper iterator concept support.
+template <>
+struct iterator_traits<
+    typename llvm::ImmutableList<CritSectionMarker>::iterator> {
+  using iterator_category = std::forward_iterator_tag;
+  using value_type = CritSectionMarker;
+  using difference_type = std::ptrdiff_t;
+  using reference = CritSectionMarker &;
+  using pointer = CritSectionMarker *;
+};
+} // namespace std
+
+std::optional<MutexDescriptor>
+BlockInCriticalSectionChecker::checkDescriptorMatch(const CallEvent &Call,
+                                                    CheckerContext &C,
+                                                    bool IsLock) const {
+  const auto Descriptor =
+      llvm::find_if(MutexDescriptors, [&Call, IsLock](auto &&Descriptor) {
+        return std::visit(
+            [&Call, IsLock](auto &&DescriptorImpl) {
+              return DescriptorImpl.matches(Call, IsLock);
+            },
+            Descriptor);
+      });
+  if (Descriptor != MutexDescriptors.end())
+    return *Descriptor;
+  return std::nullopt;
 }
 
-bool BlockInCriticalSectionChecker::isBlockingFunction(const CallEvent &Call) const {
-  return matchesAny(Call, SleepFn, GetcFn, FgetsFn, ReadFn, RecvFn);
+static const MemRegion *getRegion(const CallEvent &Call,
+                                  const MutexDescriptor &Descriptor,
+                                  bool IsLock) {
+  return std::visit(
+      [&Call, IsLock](auto &&Descriptor) {
+        return Descriptor.getRegion(Call, IsLock);
+      },
+      Descriptor);
 }
 
-bool BlockInCriticalSectionChecker::isLockFunction(const CallEvent &Call) const {
-  if (const auto *Ctor = dyn_cast<CXXConstructorCall>(&Call)) {
-    auto IdentifierInfo = Ctor->getDecl()->getParent()->getIdentifier();
-    if (IdentifierInfo == IILockGuard || IdentifierInfo == IIUniqueLock)
-      return true;
-  }
+void BlockInCriticalSectionChecker::handleLock(
+    const MutexDescriptor &LockDescriptor, const CallEvent &Call,
+    CheckerContext &C) const {
+  const MemRegion *MutexRegion =
+      getRegion(Call, LockDescriptor, /*IsLock=*/true);
+  if (!MutexRegion)
+    return;
 
-  return matchesAny(Call, LockFn, PthreadLockFn, PthreadTryLockFn, MtxLock,
-                    MtxTimedLock, MtxTryLock);
+  const CritSectionMarker MarkToAdd{Call.getOriginExpr(), MutexRegion};
+  ProgramStateRef StateWithLockEvent =
+      C.getState()->add<ActiveCritSections>(MarkToAdd);
+  C.addTransition(StateWithLockEvent, createCritSectionNote(MarkToAdd, C));
 }
 
-bool BlockInCriticalSectionChecker::isUnlockFunction(const CallEvent &Call) const {
-  if (const auto *Dtor = dyn_cast<CXXDestructorCall>(&Call)) {
-    const auto *DRecordDecl = cast<CXXRecordDecl>(Dtor->getDecl()->getParent());
-    auto IdentifierInfo = DRecordDecl->getIdentifier();
-    if (IdentifierInfo == IILockGuard || IdentifierInfo == IIUniqueLock)
-      return true;
+void BlockInCriticalSectionChecker::handleUnlock(
+    const MutexDescriptor &UnlockDescriptor, const CallEvent &Call,
+    CheckerContext &C) const {
+  const MemRegion *MutexRegion =
+      getRegion(Call, UnlockDescriptor, /*IsLock=*/false);
+  if (!MutexRegion)
+    return;
+
+  ProgramStateRef State = C.getState();
+  const auto ActiveSections = State->get<ActiveCritSections>();
+  const auto MostRecentLock =
+      llvm::find_if(ActiveSections, [MutexRegion](auto &&Marker) {
+        return Marker.LockReg == MutexRegion;
+      });
+  if (MostRecentLock == ActiveSections.end())
+    return;
+
+  // Build a new ImmutableList without this element.
+  auto &Factory = State->get_context<ActiveCritSections>();
+  llvm::ImmutableList<CritSectionMarker> NewList = Factory.getEmptyList();
+  for (auto It = ActiveSections.begin(), End = ActiveSections.end(); It != End;
+       ++It) {
+    if (It != MostRecentLock)
+      NewList = Factory.add(*It, NewList);
   }
 
-  return matchesAny(Call, UnlockFn, PthreadUnlockFn, MtxUnlock);
+  State = State->set<ActiveCritSections>(NewList);
+  C.addTransition(State);
+}
+
+bool BlockInCriticalSectionChecker::isBlockingInCritSection(
+    const CallEvent &Call, CheckerContext &C) const {
+  return llvm::any_of(BlockingFunctions,
+                      [&Call](auto &&Fn) { return Fn.matches(Call); }) &&
+         !C.getState()->get<ActiveCritSections>().isEmpty();
 }
 
 void BlockInCriticalSectionChecker::checkPostCall(const CallEvent &Call,
                                                   CheckerContext &C) const {
-  initIdentifierInfo(C.getASTContext());
-
-  if (!isBlockingFunction(Call)
-      && !isLockFunction(Call)
-      && !isUnlockFunction(Call))
-    return;
-
-  ProgramStateRef State = C.getState();
-  unsigned mutexCount = State->get<MutexCounter>();
-  if (isUnlockFunction(Call) && mutexCount > 0) {
-    State = State->set<MutexCounter>(--mutexCount);
-    C.addTransition(State);
-  } else if (isLockFunction(Call)) {
-    State = State->set<MutexCounter>(++mutexCount);
-    C.addTransition(State);
-  } else if (mutexCount > 0) {
-    SymbolRef BlockDesc = Call.getReturnValue().getAsSymbol();
-    reportBlockInCritSection(BlockDesc, Call, C);
+  if (isBlockingInCritSection(Call, C)) {
+    reportBlockInCritSection(Call, C);
+  } else if (std::optional<MutexDescriptor> LockDesc =
+                 checkDescriptorMatch(Call, C, /*IsLock=*/true)) {
+    handleLock(*LockDesc, Call, C);
+  } else if (std::optional<MutexDescriptor> UnlockDesc =
+                 checkDescriptorMatch(Call, C, /*IsLock=*/false)) {
+    handleUnlock(*UnlockDesc, Call, C);
   }
 }
 
 void BlockInCriticalSectionChecker::reportBlockInCritSection(
-    SymbolRef BlockDescSym, const CallEvent &Call, CheckerContext &C) const {
-  ExplodedNode *ErrNode = C.generateNonFatalErrorNode();
+    const CallEvent &Call, CheckerContext &C) const {
+  ExplodedNode *ErrNode = C.generateNonFatalErrorNode(C.getState());
   if (!ErrNode)
     return;
 
@@ -147,14 +322,63 @@ void BlockInCriticalSectionChecker::reportBlockInCritSection(
   auto R = std::make_unique<PathSensitiveBugReport>(BlockInCritSectionBugType,
                                                     os.str(), ErrNode);
   R->addRange(Call.getSourceRange());
-  R->markInteresting(BlockDescSym);
+  R->markInteresting(Call.getReturnValue());
   C.emitReport(std::move(R));
 }
 
+const NoteTag *
+BlockInCriticalSectionChecker::createCritSectionNote(CritSectionMarker M,
+                                                     CheckerContext &C) const {
+  const BugType *BT = &this->BlockInCritSectionBugType;
+  return C.getNoteTag([M, BT](PathSensitiveBugReport &BR,
+                              llvm::raw_ostream &OS) {
+    if (&BR.getBugType() != BT)
+      return;
+
+    // Get the lock events for the mutex of the current line's lock event.
+    const auto CritSectionBegins =
+        BR.getErrorNode()->getState()->get<ActiveCritSections>();
+    llvm::SmallVector<CritSectionMarker, 4> LocksForMutex;
+    llvm::copy_if(
+        CritSectionBegins, std::back_inserter(LocksForMutex),
+        [M](const auto &Marker) { return Marker.LockReg == M.LockReg; });
+    if (LocksForMutex.empty())
+      return;
+
+    // As the ImmutableList builds the locks by prepending them, we
+    // reverse the list to get the correct order.
+    std::reverse(LocksForMutex.begin(), LocksForMutex.end());
+
+    // Find the index of the lock expression in the list of all locks for a
+    // given mutex (in acquisition order).
+    const auto Position =
+        llvm::find_if(std::as_const(LocksForMutex), [M](const auto &Marker) {
+          return Marker.LockExpr == M.LockExpr;
+        });
+    if (Position == LocksForMutex.end())
+      return;
+
+    // If there is only one lock event, we don't need to specify how many times
+    // the critical section was entered.
+    if (LocksForMutex.size() == 1) {
+      OS << "Entering critical section here";
+      return;
+    }
+
+    const auto IndexOfLock =
+        std::distance(std::as_const(LocksForMutex).begin(), Position);
+
+    const auto OrdinalOfLock = IndexOfLock + 1;
+    OS << "Entering critical section for the " << OrdinalOfLock
+       << llvm::getOrdinalSuffix(OrdinalOfLock) << " time here";
+  });
+}
+
 void ento::registerBlockInCriticalSectionChecker(CheckerManager &mgr) {
   mgr.registerChecker<BlockInCriticalSectionChecker>();
 }
 
-bool ento::shouldRegisterBlockInCriticalSectionChecker(const CheckerManager &mgr) {
+bool ento::shouldRegisterBlockInCriticalSectionChecker(
+    const CheckerManager &mgr) {
   return true;
 }
diff --git a/clang/lib/Tooling/Inclusions/Stdlib/CSpecialSymbolMap.inc b/clang/lib/Tooling/Inclusions/Stdlib/CSpecialSymbolMap.inc
new file mode 100644
index 00000000000000..a515f69ea6a8cf
--- /dev/null
+++ b/clang/lib/Tooling/Inclusions/Stdlib/CSpecialSymbolMap.inc
@@ -0,0 +1,14 @@
+//===-- StdSpecialSymbolMap.inc -----------------------------------*- C -*-===//
+//
+// This is a hand-curated list for C symbols that cannot be parsed/extracted
+// via the include-mapping tool (gen_std.py).
+//
+//===----------------------------------------------------------------------===//
+
+SYMBOL(size_t, None, <stddef.h>)
+SYMBOL(size_t, None, <stdio.h>)
+SYMBOL(size_t, None, <stdlib.h>)
+SYMBOL(size_t, None, <string.h>)
+SYMBOL(size_t, None, <time.h>)
+SYMBOL(size_t, None, <uchar.h>)
+SYMBOL(size_t, None, <wchar.h>)
diff --git a/clang/lib/Tooling/Inclusions/Stdlib/StandardLibrary.cpp b/clang/lib/Tooling/Inclusions/Stdlib/StandardLibrary.cpp
index adf1b230ff0318..0832bcf66145fa 100644
--- a/clang/lib/Tooling/Inclusions/Stdlib/StandardLibrary.cpp
+++ b/clang/lib/Tooling/Inclusions/Stdlib/StandardLibrary.cpp
@@ -55,11 +55,12 @@ static const SymbolHeaderMapping *getMappingPerLang(Lang L) {
 }
 
 static int countSymbols(Lang Language) {
-  ArrayRef<const char*> Symbols;
+  ArrayRef<const char *> Symbols;
 #define SYMBOL(Name, NS, Header) #NS #Name,
   switch (Language) {
   case Lang::C: {
     static constexpr const char *CSymbols[] = {
+#include "CSpecialSymbolMap.inc"
 #include "CSymbolMap.inc"
     };
     Symbols = CSymbols;
@@ -147,6 +148,7 @@ static int initialize(Lang Language) {
   switch (Language) {
   case Lang::C: {
     static constexpr Symbol CSymbols[] = {
+#include "CSpecialSymbolMap.inc"
 #include "CSymbolMap.inc"
     };
     for (const Symbol &S : CSymbols)
diff --git a/clang/test/AST/Interp/arrays.cpp b/clang/test/AST/Interp/arrays.cpp
index 4b112d7fdddfdb..2443992f75fb7d 100644
--- a/clang/test/AST/Interp/arrays.cpp
+++ b/clang/test/AST/Interp/arrays.cpp
@@ -429,18 +429,13 @@ namespace Incomplete {
                           // both-note {{array-to-pointer decay of array member without known bound}}
 
   /// These are from test/SemaCXX/constant-expression-cxx11.cpp
-  /// and are the only tests using the 'indexing of array without known bound' diagnostic.
-  /// We currently diagnose them differently.
-  extern int arr[]; // expected-note 3{{declared here}}
+  extern int arr[];
   constexpr int *c = &arr[1]; // both-error  {{must be initialized by a constant expression}} \
-                              // ref-note {{indexing of array without known bound}} \
-                              // expected-note {{read of non-constexpr variable 'arr'}}
+                              // both-note {{indexing of array without known bound}}
   constexpr int *d = &arr[1]; // both-error  {{must be initialized by a constant expression}} \
-                              // ref-note {{indexing of array without known bound}} \
-                              // expected-note {{read of non-constexpr variable 'arr'}}
+                              // both-note {{indexing of array without known bound}}
   constexpr int *e = arr + 1; // both-error  {{must be initialized by a constant expression}} \
-                              // ref-note {{indexing of array without known bound}} \
-                              // expected-note {{read of non-constexpr variable 'arr'}}
+                              // both-note {{indexing of array without known bound}}
 }
 
 namespace GH69115 {
diff --git a/clang/test/AST/Interp/builtin-functions.cpp b/clang/test/AST/Interp/builtin-functions.cpp
index 08fca8428cf5e8..2e9d1a831dcf6e 100644
--- a/clang/test/AST/Interp/builtin-functions.cpp
+++ b/clang/test/AST/Interp/builtin-functions.cpp
@@ -510,3 +510,14 @@ namespace bswap {
   int h4 = __builtin_bswap32(0x1234) == 0x34120000 ? 1 : f();
   int h5 = __builtin_bswap64(0x1234) == 0x3412000000000000 ? 1 : f();
 }
+
+#define CFSTR __builtin___CFStringMakeConstantString
+void test7(void) {
+  const void *X;
+#if !defined(_AIX)
+  X = CFSTR("\242"); // both-warning {{input conversion stopped}}
+  X = CFSTR("\0"); // no-warning
+  X = CFSTR(242); // both-error {{cannot initialize a parameter of type 'const char *' with an rvalue of type 'int'}}
+  X = CFSTR("foo", "bar"); // both-error {{too many arguments to function call}}
+#endif
+}
diff --git a/clang/test/AST/Interp/builtins.cpp b/clang/test/AST/Interp/builtins.cpp
index 06a22b16b2dcbb..9095d1bf8d6a3a 100644
--- a/clang/test/AST/Interp/builtins.cpp
+++ b/clang/test/AST/Interp/builtins.cpp
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 -fexperimental-new-constant-interpreter %s -verify -fms-extensions
-// RUN: %clang_cc1 -fexperimental-new-constant-interpreter %s -fms-extensions -S -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -fexperimental-new-constant-interpreter %s -Wno-constant-evaluated -verify -fms-extensions
+// RUN: %clang_cc1 -fexperimental-new-constant-interpreter %s -Wno-constant-evaluated -fms-extensions -S -emit-llvm -o - | FileCheck %s
 // RUN: %clang_cc1 -verify=ref %s -Wno-constant-evaluated -fms-extensions
 // RUN: %clang_cc1 -verify=ref %s -Wno-constant-evaluated %s -fms-extensions -S -emit-llvm -o - | FileCheck %s
 
diff --git a/clang/test/AST/Interp/c.c b/clang/test/AST/Interp/c.c
index 8de6139efbea09..10e23839f2ba2a 100644
--- a/clang/test/AST/Interp/c.c
+++ b/clang/test/AST/Interp/c.c
@@ -66,11 +66,11 @@ _Static_assert((&a - 100) != 0, ""); // pedantic-ref-warning {{is a GNU extensio
                                      // pedantic-ref-note {{-100 of non-array}} \
                                      // pedantic-expected-note {{-100 of non-array}}
 /// extern variable of a composite type.
-/// FIXME: The 'cast from void*' note is missing in the new interpreter.
+/// FIXME: The 'this conversion is not allowed' note is missing in the new interpreter.
 extern struct Test50S Test50;
 _Static_assert(&Test50 != (void*)0, ""); // all-warning {{always true}} \
                                          // pedantic-ref-warning {{is a GNU extension}} \
-                                         // pedantic-ref-note {{cast from 'void *' is not allowed}} \
+                                         // pedantic-ref-note {{this conversion is not allowed in a constant expression}} \
                                          // pedantic-expected-warning {{is a GNU extension}}
 
 struct y {int x,y;};
@@ -201,3 +201,11 @@ void localCompoundLiteral(void) {
        // pedantic-ref-warning {{use of an empty initializer}}
   };
 }
+
+/// struct copy
+struct StrA {int a; };
+const struct StrA sa = { 12 };
+const struct StrA * const sb = &sa;
+const struct StrA sc = *sb;
+_Static_assert(sc.a == 12, ""); // pedantic-ref-warning {{GNU extension}} \
+                                // pedantic-expected-warning {{GNU extension}}
diff --git a/clang/test/AST/Interp/c23.c b/clang/test/AST/Interp/c23.c
new file mode 100644
index 00000000000000..cf1bf4d4e7d905
--- /dev/null
+++ b/clang/test/AST/Interp/c23.c
@@ -0,0 +1,11 @@
+// RUN: %clang_cc1 -std=c23 -fexperimental-new-constant-interpreter -verify=expected,both %s
+// RUN: %clang_cc1 -std=c23 -verify=ref,both %s
+
+
+
+const _Bool inf1 =  (1.0/0.0 == __builtin_inf());
+constexpr _Bool inf2 = (1.0/0.0 == __builtin_inf()); // both-error {{must be initialized by a constant expression}} \
+                                                     // both-note {{division by zero}}
+constexpr _Bool inf3 = __builtin_inf() == __builtin_inf();
+
+
diff --git a/clang/test/AST/Interp/complex.c b/clang/test/AST/Interp/complex.c
index b5f30b87baa79a..a39f83160ef8ca 100644
--- a/clang/test/AST/Interp/complex.c
+++ b/clang/test/AST/Interp/complex.c
@@ -19,3 +19,12 @@ const _Complex float FC = {0.0f, 0.0f};
 _Static_assert(!FC, "");
 const _Complex float FI = {0, 0};
 _Static_assert(!FI, "");
+
+
+/// Make sure we're stripping the _Atomic part from the
+/// complex type.
+void testComplexFloat(_Atomic(_Complex float) *fp) {
+  _Atomic(_Complex float) x = 2.0f;
+  _Complex float f = *fp;
+  *fp = f;
+}
diff --git a/clang/test/AST/Interp/complex.cpp b/clang/test/AST/Interp/complex.cpp
index 6a42afc68d26c7..09cb620d7b7c39 100644
--- a/clang/test/AST/Interp/complex.cpp
+++ b/clang/test/AST/Interp/complex.cpp
@@ -9,6 +9,9 @@ static_assert(&__imag z1 == &__real z1 + 1, "");
 static_assert((*(&__imag z1)) == __imag z1, "");
 static_assert((*(&__real z1)) == __real z1, "");
 
+constexpr _Complex int Comma1 = {1, 2};
+constexpr _Complex int Comma2 = (0, Comma1);
+static_assert(Comma1 == Comma1, "");
 
 constexpr double setter() {
   _Complex float d = {1.0, 2.0};
@@ -313,3 +316,53 @@ namespace Cmp {
   static_assert((0.0 + 0.0j) > (0.0 + 0.0j)); // both-error {{invalid operands to binary expression}}
   static_assert((0.0 + 0.0j) ^ (0.0 + 0.0j)); // both-error {{invalid operands to binary expression}}
 }
+
+/// From test/SemaCXX/constant-expression-cxx11.cpp
+///
+/// Some of the diagnostics we emit are different than the one of the
+/// current interpreter.
+///
+/// FIXME: For the '&test3 + 1' test, we are _not_ creating an explicit pointer variable
+/// anywhere and so the &test3+1 is the same as __imag(test3) for us.
+namespace ComplexConstexpr {
+  constexpr _Complex float test1 = {};
+  constexpr _Complex float test2 = {1};
+  constexpr _Complex double test3 = {1,2};
+  constexpr _Complex int test4 = {4};
+  constexpr _Complex int test5 = 4;
+  constexpr _Complex int test6 = {5,6};
+  typedef _Complex float fcomplex;
+  constexpr fcomplex test7 = fcomplex();
+
+  constexpr const double &t2r = __real test3;
+  constexpr const double &t2i = __imag test3;
+  static_assert(&t2r + 1 == &t2i, "");
+  static_assert(t2r == 1.0, "");
+  static_assert(t2i == 2.0, "");
+  constexpr const double *t2p = &t2r;
+  static_assert(t2p[-1] == 0.0, ""); // both-error {{constant expr}} \
+                                     // both-note {{cannot refer to element -1 of array of 2 elements}}
+  static_assert(t2p[0] == 1.0, "");
+  static_assert(t2p[1] == 2.0, "");
+  static_assert(t2p[2] == 0.0, ""); // both-error {{constant expr}} \
+                                    // both-note {{one-past-the-end pointer}}
+  static_assert(t2p[3] == 0.0, ""); // both-error {{constant expr}} \
+                                    // both-note {{cannot refer to element 3 of array of 2 elements}}
+  constexpr _Complex float *p = 0;
+  constexpr float pr = __real *p; // both-error {{constant expr}} \
+                                  // ref-note {{cannot access real component of null}} \
+                                  // expected-note {{read of dereferenced null pointer}}
+  constexpr float pi = __imag *p; // both-error {{constant expr}} \
+                                  // ref-note {{cannot access imaginary component of null}} \
+                                  // expected-note {{cannot perform pointer arithmetic on null pointer}}
+  constexpr const _Complex double *q = &test3 + 1;
+  constexpr double qr = __real *q; // ref-error {{constant expr}} \
+                                   // ref-note {{cannot access real component of pointer past the end}}
+  constexpr double qi = __imag *q; // both-error {{constant expr}} \
+                                   // ref-note {{cannot access imaginary component of pointer past the end}} \
+                                   // expected-note {{read of dereferenced one-past-the-end pointer}}
+
+  static_assert(__real test6 == 5, "");
+  static_assert(__imag test6 == 6, "");
+  static_assert(&__imag test6 == &__real test6 + 1, "");
+}
diff --git a/clang/test/AST/Interp/cxx23.cpp b/clang/test/AST/Interp/cxx23.cpp
index 127b58915127cf..042e29613aa753 100644
--- a/clang/test/AST/Interp/cxx23.cpp
+++ b/clang/test/AST/Interp/cxx23.cpp
@@ -1,6 +1,6 @@
-// RUN: %clang_cc1 -std=c++20 -fsyntax-only -fcxx-exceptions -verify=ref20,all,all-20 %s
+// RUN: %clang_cc1 -std=c++20 -fsyntax-only -fcxx-exceptions -verify=ref20,all,all20 %s
 // RUN: %clang_cc1 -std=c++23 -fsyntax-only -fcxx-exceptions -verify=ref23,all %s
-// RUN: %clang_cc1 -std=c++20 -fsyntax-only -fcxx-exceptions -verify=expected20,all,all-20 %s -fexperimental-new-constant-interpreter
+// RUN: %clang_cc1 -std=c++20 -fsyntax-only -fcxx-exceptions -verify=expected20,all,all20 %s -fexperimental-new-constant-interpreter
 // RUN: %clang_cc1 -std=c++23 -fsyntax-only -fcxx-exceptions -verify=expected23,all %s -fexperimental-new-constant-interpreter
 
 /// FIXME: The new interpreter is missing all the 'control flows through...' diagnostics.
@@ -108,9 +108,9 @@ namespace StaticOperators {
   static_assert(f2() == 3);
 
   struct S1 {
-    constexpr S1() { // all-20-error {{never produces a constant expression}}
+    constexpr S1() { // all20-error {{never produces a constant expression}}
       throw; // all-note {{not valid in a constant expression}} \
-             // all-20-note {{not valid in a constant expression}}
+             // all20-note {{not valid in a constant expression}}
     }
     static constexpr int operator()() { return 3; } // ref20-warning {{C++23 extension}} \
                                                     // expected20-warning {{C++23 extension}}
@@ -121,3 +121,28 @@ namespace StaticOperators {
 
 
 }
+
+int test_in_lambdas() {
+  auto c = [](int n) constexpr {
+    if (n == 0)
+      return 0;
+    else
+      goto test; // all-note {{subexpression not valid in a constant expression}} \
+                 // all20-warning {{use of this statement in a constexpr function is a C++23 extension}}
+  test:
+    return 1;
+  };
+  c(0);
+  constexpr auto A = c(1); // all-error {{must be initialized by a constant expression}} \
+                           // all-note {{in call to}}
+  return 0;
+}
+
+/// PackIndexExpr.
+template <auto... p>
+struct check_ice {
+    enum e {
+        x = p...[0] // all-warning {{is a C++2c extension}}
+    };
+};
+static_assert(check_ice<42>::x == 42);
diff --git a/clang/test/AST/Interp/cxx98.cpp b/clang/test/AST/Interp/cxx98.cpp
index 73e45372066334..ba6bcd97d920dd 100644
--- a/clang/test/AST/Interp/cxx98.cpp
+++ b/clang/test/AST/Interp/cxx98.cpp
@@ -18,13 +18,12 @@ template struct C<cval>;
 
 /// FIXME: This example does not get properly diagnosed in the new interpreter.
 extern const int recurse1;
-const int recurse2 = recurse1; // both-note {{declared here}}
+const int recurse2 = recurse1; // ref-note {{declared here}}
 const int recurse1 = 1;
 int array1[recurse1];
 int array2[recurse2]; // ref-warning 2{{variable length array}} \
                       // ref-note {{initializer of 'recurse2' is not a constant expression}} \
                       // expected-warning {{variable length array}} \
-                      // expected-note {{read of non-const variable 'recurse2'}} \
                       // expected-error {{variable length array}}
 
 int NCI; // both-note {{declared here}}
@@ -39,3 +38,10 @@ struct V {
                          // both-error {{constructor cannot have a return type}}
 };
 _Static_assert(V().c[0], ""); // both-error {{is not an integral constant expression}}
+
+struct C0 {
+  template<typename U> static U Data; // both-warning {{C++14 extension}}
+  template<typename U> static const U Data<U*> = U();
+};
+const int c0_test = C0::Data<int*>;
+_Static_assert(c0_test == 0, "");
diff --git a/clang/test/AST/Interp/if.cpp b/clang/test/AST/Interp/if.cpp
index 86ae8de6f73ebb..37289d69d32554 100644
--- a/clang/test/AST/Interp/if.cpp
+++ b/clang/test/AST/Interp/if.cpp
@@ -1,8 +1,5 @@
-// RUN: %clang_cc1 -std=c++23 -fsyntax-only -fexperimental-new-constant-interpreter %s -verify
-// RUN: %clang_cc1 -std=c++23 -fsyntax-only %s -verify=ref
-
-// expected-no-diagnostics
-// ref-no-diagnostics
+// RUN: %clang_cc1 -std=c++23 -fsyntax-only -fexperimental-new-constant-interpreter %s -verify=expected,both
+// RUN: %clang_cc1 -std=c++23 -fsyntax-only %s -verify=ref,both
 
 namespace ConstEval {
   constexpr int f() {
@@ -51,3 +48,13 @@ namespace InitDecl {
   }
   static_assert(attrs() == 1, "");
 };
+
+/// The faulty if statement creates a RecoveryExpr with contains-errors,
+/// but the execution will never reach that.
+constexpr char g(char const (&x)[2]) {
+    return 'x';
+  if (auto [a, b] = x) // both-error {{an array type is not allowed here}} \
+                       // both-warning {{ISO C++17 does not permit structured binding declaration in a condition}}
+    ;
+}
+static_assert(g("x") == 'x');
diff --git a/clang/test/AST/Interp/literals.cpp b/clang/test/AST/Interp/literals.cpp
index 7ae8499b1156dd..277438d2e63114 100644
--- a/clang/test/AST/Interp/literals.cpp
+++ b/clang/test/AST/Interp/literals.cpp
@@ -910,6 +910,18 @@ namespace TypeTraits {
   struct U {};
   static_assert(S3<U>{}.foo(), "");
   static_assert(!S3<T>{}.foo(), "");
+
+  typedef int Int;
+  typedef Int IntAr[10];
+  typedef const IntAr ConstIntAr;
+  typedef ConstIntAr ConstIntArAr[4];
+
+  static_assert(__array_rank(IntAr) == 1, "");
+  static_assert(__array_rank(ConstIntArAr) == 2, "");
+
+  static_assert(__array_extent(IntAr, 0) == 10, "");
+  static_assert(__array_extent(ConstIntArAr, 0) == 4, "");
+  static_assert(__array_extent(ConstIntArAr, 1) == 10, "");
 }
 
 #if __cplusplus >= 201402L
@@ -1113,6 +1125,9 @@ namespace InvalidDeclRefs {
   int b03 = 3; // both-note {{declared here}}
   static_assert(b03, ""); // both-error {{not an integral constant expression}} \
                           // both-note {{read of non-const variable}}
+
+  extern int var;
+  constexpr int *varp = &var; // Ok.
 }
 
 namespace NonConstReads {
@@ -1182,8 +1197,16 @@ namespace incdecbool {
 }
 
 #if __cplusplus >= 201402L
+/// NOTE: The diagnostics of the two interpreters are a little
+/// different here, but they both make sense.
 constexpr int externvar1() { // both-error {{never produces a constant expression}}
-  extern char arr[]; // both-note {{declared here}}
-  return arr[0]; // both-note {{read of non-constexpr variable 'arr'}}
+  extern char arr[]; // ref-note {{declared here}}
+   return arr[0]; // ref-note {{read of non-constexpr variable 'arr'}} \
+                  // expected-note {{array-to-pointer decay of array member without known bound is not supported}}
 }
 #endif
+
+namespace Extern {
+  constexpr extern char Oops = 1;
+  static_assert(Oops == 1, "");
+}
diff --git a/clang/test/AST/Interp/ms.cpp b/clang/test/AST/Interp/ms.cpp
new file mode 100644
index 00000000000000..99716e90c7a1db
--- /dev/null
+++ b/clang/test/AST/Interp/ms.cpp
@@ -0,0 +1,8 @@
+// RUN: %clang_cc1 -verify=ref,both %s -fms-extensions
+// RUN: %clang_cc1 -verify=expected,both %s -fexperimental-new-constant-interpreter -fms-extensions
+
+// ref-no-diagnostics
+// expected-no-diagnostics
+
+/// Used to assert because the two parameters to _rotl do not have the same type.
+static_assert(_rotl(0x01, 5) == 32);
diff --git a/clang/test/AST/Interp/records.cpp b/clang/test/AST/Interp/records.cpp
index 7ddc56c9b5dfc4..0f76e0cfe99277 100644
--- a/clang/test/AST/Interp/records.cpp
+++ b/clang/test/AST/Interp/records.cpp
@@ -1238,3 +1238,50 @@ namespace InvalidCtorInitializer {
   // no crash on evaluating the constexpr ctor.
   constexpr int Z = X().Y; // both-error {{constexpr variable 'Z' must be initialized by a constant expression}}
 }
+
+extern int f(); // both-note {{here}}
+struct HasNonConstExprMemInit {
+  int x = f(); // both-note {{non-constexpr function}}
+  constexpr HasNonConstExprMemInit() {} // both-error {{never produces a constant expression}}
+};
+
+namespace {
+  template <class Tp, Tp v>
+  struct integral_constant {
+    static const Tp value = v;
+  };
+
+  template <class Tp, Tp v>
+  const Tp integral_constant<Tp, v>::value;
+
+  typedef integral_constant<bool, true> true_type;
+  typedef integral_constant<bool, false> false_type;
+
+  /// This might look innocent, but we get an evaluateAsInitializer call for the
+  /// static bool member before evaluating the first static_assert, but we do NOT
+  /// get such a call for the second one. So the second one needs to lazily visit
+  /// the data member itself.
+  static_assert(true_type::value, "");
+  static_assert(true_type::value, "");
+}
+
+#if __cplusplus >= 202002L
+namespace {
+  /// Used to crash because the CXXDefaultInitExpr is of compound type.
+  struct A {
+    int &x;
+    constexpr ~A() { --x; }
+  };
+  struct B {
+    int &x;
+    const A &a = A{x};
+  };
+  constexpr int a() {
+    int x = 1;
+    int f = B{x}.x;
+    B{x}; // both-warning {{expression result unused}}
+
+    return 1;
+  }
+}
+#endif
diff --git a/clang/test/Analysis/block-in-critical-section.cpp b/clang/test/Analysis/block-in-critical-section.cpp
index fcf6188fc033ec..87c26b9f1b5209 100644
--- a/clang/test/Analysis/block-in-critical-section.cpp
+++ b/clang/test/Analysis/block-in-critical-section.cpp
@@ -1,4 +1,8 @@
-// RUN: %clang_analyze_cc1 -analyzer-checker=alpha.unix.BlockInCriticalSection -std=c++11 -verify %s
+// RUN: %clang_analyze_cc1 \
+// RUN:   -analyzer-checker=alpha.unix.BlockInCriticalSection \
+// RUN:   -std=c++11 \
+// RUN:   -analyzer-output text \
+// RUN:   -verify %s
 
 void sleep(int x) {}
 
@@ -21,108 +25,242 @@ template<typename T>
 struct not_real_lock {
   not_real_lock<T>(std::mutex) {}
 };
-}
+} // namespace std
+
+struct FILE;
+int getc(FILE *stream);
+char* fgets(char *str, FILE *stream);
+using ssize_t = long long;
+using size_t = unsigned long long;
+ssize_t read(int fd, void *buf, size_t count);
+ssize_t recv(int sockfd, void *buf, size_t len, int flags);
 
-void getc() {}
-void fgets() {}
-void read() {}
-void recv() {}
+struct pthread_mutex_t;
+void pthread_mutex_lock(pthread_mutex_t *mutex);
+void pthread_mutex_trylock(pthread_mutex_t *mutex);
+void pthread_mutex_unlock(pthread_mutex_t *mutex);
 
-void pthread_mutex_lock() {}
-void pthread_mutex_trylock() {}
-void pthread_mutex_unlock() {}
+struct mtx_t;
+void mtx_lock(mtx_t *mutex);
+void mtx_timedlock(mtx_t *mutex);
+void mtx_trylock(mtx_t *mutex);
+void mtx_unlock(mtx_t *mutex);
 
-void mtx_lock() {}
-void mtx_timedlock() {}
-void mtx_trylock() {}
-void mtx_unlock() {}
+// global params for dummy function calls
+FILE *stream;
+char *str;
+int fd;
+void *buf;
+size_t count;
+int sockfd;
+size_t len;
+int flags;
 
 void testBlockInCriticalSectionWithStdMutex() {
   std::mutex m;
-  m.lock();
+  m.lock(); // expected-note 5{{Entering critical section here}}
   sleep(3); // expected-warning {{Call to blocking function 'sleep' inside of critical section}}
-  getc(); // expected-warning {{Call to blocking function 'getc' inside of critical section}}
-  fgets(); // expected-warning {{Call to blocking function 'fgets' inside of critical section}}
-  read(); // expected-warning {{Call to blocking function 'read' inside of critical section}}
-  recv(); // expected-warning {{Call to blocking function 'recv' inside of critical section}}
+            // expected-note@-1 {{Call to blocking function 'sleep' inside of critical section}}
+  getc(stream); // expected-warning {{Call to blocking function 'getc' inside of critical section}}
+          // expected-note@-1 {{Call to blocking function 'getc' inside of critical section}}
+  fgets(str, stream); // expected-warning {{Call to blocking function 'fgets' inside of critical section}}
+           // expected-note@-1 {{Call to blocking function 'fgets' inside of critical section}}
+  read(fd, buf, count); // expected-warning {{Call to blocking function 'read' inside of critical section}}
+          // expected-note@-1 {{Call to blocking function 'read' inside of critical section}}
+  recv(sockfd, buf, count, flags); // expected-warning {{Call to blocking function 'recv' inside of critical section}}
+          // expected-note@-1 {{Call to blocking function 'recv' inside of critical section}}
   m.unlock();
 }
 
-void testBlockInCriticalSectionWithPthreadMutex() {
-  pthread_mutex_lock();
+void testBlockInCriticalSectionWithPthreadMutex(pthread_mutex_t *mutex) {
+  pthread_mutex_lock(mutex); // expected-note 5{{Entering critical section here}}
   sleep(3); // expected-warning {{Call to blocking function 'sleep' inside of critical section}}
-  getc(); // expected-warning {{Call to blocking function 'getc' inside of critical section}}
-  fgets(); // expected-warning {{Call to blocking function 'fgets' inside of critical section}}
-  read(); // expected-warning {{Call to blocking function 'read' inside of critical section}}
-  recv(); // expected-warning {{Call to blocking function 'recv' inside of critical section}}
-  pthread_mutex_unlock();
+            // expected-note@-1 {{Call to blocking function 'sleep' inside of critical section}}
+  getc(stream); // expected-warning {{Call to blocking function 'getc' inside of critical section}}
+          // expected-note@-1 {{Call to blocking function 'getc' inside of critical section}}
+  fgets(str, stream); // expected-warning {{Call to blocking function 'fgets' inside of critical section}}
+           // expected-note@-1 {{Call to blocking function 'fgets' inside of critical section}}
+  read(fd, buf, count); // expected-warning {{Call to blocking function 'read' inside of critical section}}
+          // expected-note@-1 {{Call to blocking function 'read' inside of critical section}}
+  recv(sockfd, buf, count, flags); // expected-warning {{Call to blocking function 'recv' inside of critical section}}
+          // expected-note@-1 {{Call to blocking function 'recv' inside of critical section}}
+  pthread_mutex_unlock(mutex);
 
-  pthread_mutex_trylock();
+  pthread_mutex_trylock(mutex); // expected-note 5{{Entering critical section here}}
   sleep(3); // expected-warning {{Call to blocking function 'sleep' inside of critical section}}
-  getc(); // expected-warning {{Call to blocking function 'getc' inside of critical section}}
-  fgets(); // expected-warning {{Call to blocking function 'fgets' inside of critical section}}
-  read(); // expected-warning {{Call to blocking function 'read' inside of critical section}}
-  recv(); // expected-warning {{Call to blocking function 'recv' inside of critical section}}
-  pthread_mutex_unlock();
+            // expected-note@-1 {{Call to blocking function 'sleep' inside of critical section}}
+  getc(stream); // expected-warning {{Call to blocking function 'getc' inside of critical section}}
+          // expected-note@-1 {{Call to blocking function 'getc' inside of critical section}}
+  fgets(str, stream); // expected-warning {{Call to blocking function 'fgets' inside of critical section}}
+           // expected-note@-1 {{Call to blocking function 'fgets' inside of critical section}}
+  read(fd, buf, count); // expected-warning {{Call to blocking function 'read' inside of critical section}}
+          // expected-note@-1 {{Call to blocking function 'read' inside of critical section}}
+  recv(sockfd, buf, count, flags); // expected-warning {{Call to blocking function 'recv' inside of critical section}}
+          // expected-note@-1 {{Call to blocking function 'recv' inside of critical section}}
+  pthread_mutex_unlock(mutex);
 }
 
-void testBlockInCriticalSectionC11Locks() {
-  mtx_lock();
+void testBlockInCriticalSectionC11Locks(mtx_t *mutex) {
+  mtx_lock(mutex); // expected-note 5{{Entering critical section here}}
   sleep(3); // expected-warning {{Call to blocking function 'sleep' inside of critical section}}
-  getc(); // expected-warning {{Call to blocking function 'getc' inside of critical section}}
-  fgets(); // expected-warning {{Call to blocking function 'fgets' inside of critical section}}
-  read(); // expected-warning {{Call to blocking function 'read' inside of critical section}}
-  recv(); // expected-warning {{Call to blocking function 'recv' inside of critical section}}
-  mtx_unlock();
+            // expected-note@-1 {{Call to blocking function 'sleep' inside of critical section}}
+  getc(stream); // expected-warning {{Call to blocking function 'getc' inside of critical section}}
+          // expected-note@-1 {{Call to blocking function 'getc' inside of critical section}}
+  fgets(str, stream); // expected-warning {{Call to blocking function 'fgets' inside of critical section}}
+           // expected-note@-1 {{Call to blocking function 'fgets' inside of critical section}}
+  read(fd, buf, count); // expected-warning {{Call to blocking function 'read' inside of critical section}}
+          // expected-note@-1 {{Call to blocking function 'read' inside of critical section}}
+  recv(sockfd, buf, count, flags); // expected-warning {{Call to blocking function 'recv' inside of critical section}}
+          // expected-note@-1 {{Call to blocking function 'recv' inside of critical section}}
+  mtx_unlock(mutex);
 
-  mtx_timedlock();
+  mtx_timedlock(mutex); // expected-note 5{{Entering critical section here}}
   sleep(3); // expected-warning {{Call to blocking function 'sleep' inside of critical section}}
-  getc(); // expected-warning {{Call to blocking function 'getc' inside of critical section}}
-  fgets(); // expected-warning {{Call to blocking function 'fgets' inside of critical section}}
-  read(); // expected-warning {{Call to blocking function 'read' inside of critical section}}
-  recv(); // expected-warning {{Call to blocking function 'recv' inside of critical section}}
-  mtx_unlock();
+            // expected-note@-1 {{Call to blocking function 'sleep' inside of critical section}}
+  getc(stream); // expected-warning {{Call to blocking function 'getc' inside of critical section}}
+          // expected-note@-1 {{Call to blocking function 'getc' inside of critical section}}
+  fgets(str, stream); // expected-warning {{Call to blocking function 'fgets' inside of critical section}}
+           // expected-note@-1 {{Call to blocking function 'fgets' inside of critical section}}
+  read(fd, buf, count); // expected-warning {{Call to blocking function 'read' inside of critical section}}
+          // expected-note@-1 {{Call to blocking function 'read' inside of critical section}}
+  recv(sockfd, buf, count, flags); // expected-warning {{Call to blocking function 'recv' inside of critical section}}
+          // expected-note@-1 {{Call to blocking function 'recv' inside of critical section}}
+  mtx_unlock(mutex);
 
-  mtx_trylock();
+  mtx_trylock(mutex); // expected-note 5{{Entering critical section here}}
   sleep(3); // expected-warning {{Call to blocking function 'sleep' inside of critical section}}
-  getc(); // expected-warning {{Call to blocking function 'getc' inside of critical section}}
-  fgets(); // expected-warning {{Call to blocking function 'fgets' inside of critical section}}
-  read(); // expected-warning {{Call to blocking function 'read' inside of critical section}}
-  recv(); // expected-warning {{Call to blocking function 'recv' inside of critical section}}
-  mtx_unlock();
+            // expected-note@-1 {{Call to blocking function 'sleep' inside of critical section}}
+  getc(stream); // expected-warning {{Call to blocking function 'getc' inside of critical section}}
+          // expected-note@-1 {{Call to blocking function 'getc' inside of critical section}}
+  fgets(str, stream); // expected-warning {{Call to blocking function 'fgets' inside of critical section}}
+           // expected-note@-1 {{Call to blocking function 'fgets' inside of critical section}}
+  read(fd, buf, count); // expected-warning {{Call to blocking function 'read' inside of critical section}}
+          // expected-note@-1 {{Call to blocking function 'read' inside of critical section}}
+  recv(sockfd, buf, count, flags); // expected-warning {{Call to blocking function 'recv' inside of critical section}}
+          // expected-note@-1 {{Call to blocking function 'recv' inside of critical section}}
+  mtx_unlock(mutex);
+}
+
+void testMultipleBlockingCalls() {
+  std::mutex m;
+  m.lock(); // expected-note 1{{Entering critical section here}}
+  sleep(1); // expected-warning {{Call to blocking function 'sleep' inside of critical section}}
+            // expected-note@-1 {{Call to blocking function 'sleep' inside of critical section}}
+  m.unlock();
+  sleep(2); // no-warning
 }
 
-void testBlockInCriticalSectionWithNestedMutexes() {
+void testMultipleMutexesMultipleBlockingCalls() {
   std::mutex m, n, k;
-  m.lock();
-  n.lock();
-  k.lock();
-  sleep(3); // expected-warning {{Call to blocking function 'sleep' inside of critical section}}
+  m.lock(); // expected-note 2{{Entering critical section here}}
+  n.lock(); // expected-note 2{{Entering critical section here}}
+  k.lock(); // expected-note 1{{Entering critical section here}}
+  sleep(1); // expected-warning {{Call to blocking function 'sleep' inside of critical section}}
+            // expected-note@-1 {{Call to blocking function 'sleep' inside of critical section}}
   k.unlock();
-  sleep(5); // expected-warning {{Call to blocking function 'sleep' inside of critical section}}
+  sleep(2); // expected-warning {{Call to blocking function 'sleep' inside of critical section}}
+            // expected-note@-1 {{Call to blocking function 'sleep' inside of critical section}}
+}
+
+
+void testRecursiveAcquisition() {
+  std::mutex m;
+  m.lock(); // expected-note {{Entering critical section for the 1st time here}}
+  m.lock(); // expected-note {{Entering critical section for the 2nd time here}}
+  sleep(1); // expected-warning {{Call to blocking function 'sleep' inside of critical section}}
+            // expected-note@-1 {{Call to blocking function 'sleep' inside of critical section}}
+  m.unlock();
+  m.unlock();
+}
+
+void testRecursiveAcquisitionWithMultipleBlockingCalls() {
+  std::mutex m;
+  m.lock(); // expected-note 1{{Entering critical section for the 1st time here}}
+            // expected-note@-1 {{Entering critical section here}}
+  m.lock(); // expected-note 1{{Entering critical section for the 2nd time here}}
+  sleep(1); // expected-warning {{Call to blocking function 'sleep' inside of critical section}}
+            // expected-note@-1 {{Call to blocking function 'sleep' inside of critical section}}
+  m.unlock();
+  // this next 'sleep' call is only in the critical section of the first lock
+  sleep(2); // expected-warning {{Call to blocking function 'sleep' inside of critical section}}
+            // expected-note@-1 {{Call to blocking function 'sleep' inside of critical section}}
+  m.unlock();
+}
+
+void testRecursiveAcquisitionWithMultipleMutexes() {
+  std::mutex m, n;
+  m.lock(); // expected-note 1{{Entering critical section here}}
+  n.lock(); // expected-note 2{{Entering critical section here}}
+  sleep(1); // expected-warning {{Call to blocking function 'sleep' inside of critical section}}
+            // expected-note@-1 {{Call to blocking function 'sleep' inside of critical section}}
+  m.unlock();
+  // this next 'sleep' call is only in the critical section of mutex 'n'
+  sleep(2); // expected-warning {{Call to blocking function 'sleep' inside of critical section}}
+            // expected-note@-1 {{Call to blocking function 'sleep' inside of critical section}}
+  n.unlock();
+}
+
+
+void testNestedMutexes() {
+  std::mutex m, n, k;
+  m.lock(); // expected-note 3{{Entering critical section here}}
+  n.lock(); // expected-note 2{{Entering critical section here}}
+  k.lock(); // expected-note 1{{Entering critical section here}}
+  sleep(1); // expected-warning {{Call to blocking function 'sleep' inside of critical section}}
+            // expected-note@-1 {{Call to blocking function 'sleep' inside of critical section}}
+  k.unlock();
+  sleep(2); // expected-warning {{Call to blocking function 'sleep' inside of critical section}}
+            // expected-note@-1 {{Call to blocking function 'sleep' inside of critical section}}
   n.unlock();
   sleep(3); // expected-warning {{Call to blocking function 'sleep' inside of critical section}}
+            // expected-note@-1 {{Call to blocking function 'sleep' inside of critical section}}
+  m.unlock();
+  sleep(4); // no-warning
+}
+
+void testNonOverlappingMutexes() {
+  std::mutex m;
+  m.lock(); // There should be no warning here
+  m.unlock();
+  m.lock(); // expected-note {{Entering critical section here}}
+  sleep(1); // expected-warning {{Call to blocking function 'sleep' inside of critical section}}
+            // expected-note@-1 {{Call to blocking function 'sleep' inside of critical section}}
+  m.unlock();
+}
+
+void testMixedMutexLocksWithIntermittentUnlock() {
+  std::mutex m, n, k;
+  m.lock(); // expected-note {{Entering critical section here}}
+  n.lock(); // the problem is not is this lock's critical section
+  n.unlock();
+  k.lock(); // same as for n.lock()
+  k.unlock();
+  sleep(1); // expected-warning {{Call to blocking function 'sleep' inside of critical section}}
+            // expected-note@-1 {{Call to blocking function 'sleep' inside of critical section}}
   m.unlock();
-  sleep(3); // no-warning
 }
 
 void f() {
   sleep(1000); // expected-warning {{Call to blocking function 'sleep' inside of critical section}}
+               // expected-note@-1 {{Call to blocking function 'sleep' inside of critical section}}
 }
 
 void testBlockInCriticalSectionInterProcedural() {
   std::mutex m;
-  m.lock();
-  f();
+  m.lock(); // expected-note {{Entering critical section here}}
+  f(); // expected-note {{Calling 'f'}}
   m.unlock();
 }
 
+void unknown_function_that_may_lock(std::mutex &);
 void testBlockInCriticalSectionUnexpectedUnlock() {
   std::mutex m;
+  unknown_function_that_may_lock(m);
   m.unlock();
   sleep(1); // no-warning
-  m.lock();
-  sleep(1); // expected-warning {{Call to blocking function 'sleep' inside of critical section}}
+  m.lock(); // expected-note {{Entering critical section here}}
+  sleep(2); // expected-warning {{Call to blocking function 'sleep' inside of critical section}}
+            // expected-note@-1 {{Call to blocking function 'sleep' inside of critical section}}
 }
 
 void testBlockInCriticalSectionLockGuard() {
@@ -130,12 +268,13 @@ void testBlockInCriticalSectionLockGuard() {
   std::not_real_lock<std::mutex> not_real_lock(g_mutex);
   sleep(1); // no-warning
 
-  std::lock_guard<std::mutex> lock(g_mutex);
+  std::lock_guard<std::mutex> lock(g_mutex); // expected-note {{Entering critical section here}}
   sleep(1); // expected-warning {{Call to blocking function 'sleep' inside of critical section}}
+            // expected-note@-1 {{Call to blocking function 'sleep' inside of critical section}}
 }
 
 void testBlockInCriticalSectionLockGuardNested() {
-  testBlockInCriticalSectionLockGuard();
+  testBlockInCriticalSectionLockGuard(); // expected-note {{Calling 'testBlockInCriticalSectionLockGuard'}}
   sleep(1); // no-warning
 }
 
@@ -144,11 +283,12 @@ void testBlockInCriticalSectionUniqueLock() {
   std::not_real_lock<std::mutex> not_real_lock(g_mutex);
   sleep(1); // no-warning
 
-  std::unique_lock<std::mutex> lock(g_mutex);
+  std::unique_lock<std::mutex> lock(g_mutex); // expected-note {{Entering critical section here}}
   sleep(1); // expected-warning {{Call to blocking function 'sleep' inside of critical section}}
+            // expected-note@-1 {{Call to blocking function 'sleep' inside of critical section}}
 }
 
 void testBlockInCriticalSectionUniqueLockNested() {
-  testBlockInCriticalSectionUniqueLock();
+  testBlockInCriticalSectionUniqueLock(); // expected-note {{Calling 'testBlockInCriticalSectionUniqueLock'}}
   sleep(1); // no-warning
 }
diff --git a/clang/test/Analysis/eval-predefined-exprs.cpp b/clang/test/Analysis/eval-predefined-exprs.cpp
index 1eec4476a065f3..b26091869cd03f 100644
--- a/clang/test/Analysis/eval-predefined-exprs.cpp
+++ b/clang/test/Analysis/eval-predefined-exprs.cpp
@@ -52,12 +52,13 @@ void foo() {
 
 struct A {
   A() {
-    clang_analyzer_dump(__func__);
-    clang_analyzer_dump(__FUNCTION__);
-    clang_analyzer_dump(__PRETTY_FUNCTION__);
-    // expected-warning@-3 {{&Element{"A",0 S64b,char}}}
-    // expected-warning@-3 {{&Element{"A",0 S64b,char}}}
-    // expected-warning@-3 {{&Element{"A::A()",0 S64b,char}}}
+    clang_analyzer_dump(__func__);     // expected-warning {{&Element{"A",0 S64b,char}}}
+#ifdef ANALYZER_MS
+    clang_analyzer_dump(__FUNCTION__); // expected-warning {{&Element{"A::A",0 S64b,char}}}
+#else
+    clang_analyzer_dump(__FUNCTION__); // expected-warning {{&Element{"A",0 S64b,char}}}
+#endif
+    clang_analyzer_dump(__PRETTY_FUNCTION__);  // expected-warning {{&Element{"A::A()",0 S64b,char}}}
 
 #ifdef ANALYZER_MS
     clang_analyzer_dump(__FUNCDNAME__);
@@ -71,12 +72,13 @@ struct A {
 #endif
   }
   ~A() {
-    clang_analyzer_dump(__func__);
-    clang_analyzer_dump(__FUNCTION__);
-    clang_analyzer_dump(__PRETTY_FUNCTION__);
-    // expected-warning@-3 {{&Element{"~A",0 S64b,char}}}
-    // expected-warning@-3 {{&Element{"~A",0 S64b,char}}}
-    // expected-warning@-3 {{&Element{"A::~A()",0 S64b,char}}}
+    clang_analyzer_dump(__func__);          // expected-warning {{&Element{"~A",0 S64b,char}}}
+#ifdef ANALYZER_MS
+    clang_analyzer_dump(__FUNCTION__);      // expected-warning {{&Element{"A::~A",0 S64b,char}}}
+#else
+    clang_analyzer_dump(__FUNCTION__);      // expected-warning {{&Element{"~A",0 S64b,char}}}
+#endif
+    clang_analyzer_dump(__PRETTY_FUNCTION__); // expected-warning {{&Element{"A::~A()",0 S64b,char}}}
 
 #ifdef ANALYZER_MS
     clang_analyzer_dump(__FUNCDNAME__);
diff --git a/clang/test/Analysis/out-of-bounds-diagnostics.c b/clang/test/Analysis/out-of-bounds-diagnostics.c
index 0c3c67c6a546ad..92f983d8b15612 100644
--- a/clang/test/Analysis/out-of-bounds-diagnostics.c
+++ b/clang/test/Analysis/out-of-bounds-diagnostics.c
@@ -24,6 +24,33 @@ void taintedIndex(void) {
   scanf("%d", &index);
   // expected-note@-1 {{Taint originated here}}
   // expected-note@-2 {{Taint propagated to the 2nd argument}}
+  TenElements[index] = 5;
+  // expected-warning@-1 {{Potential out of bound access to 'TenElements' with tainted index}}
+  // expected-note@-2 {{Access of 'TenElements' with a tainted index that may be negative or too large}}
+}
+
+void taintedIndexNonneg(void) {
+  int index;
+  scanf("%d", &index);
+  // expected-note@-1 {{Taint originated here}}
+  // expected-note@-2 {{Taint propagated to the 2nd argument}}
+
+  // expected-note@+2 {{Assuming 'index' is >= 0}}
+  // expected-note@+1 {{Taking false branch}}
+  if (index < 0)
+    return;
+
+  TenElements[index] = 5;
+  // expected-warning@-1 {{Potential out of bound access to 'TenElements' with tainted index}}
+  // expected-note@-2 {{Access of 'TenElements' with a tainted index that may be too large}}
+}
+
+void taintedIndexUnsigned(void) {
+  unsigned index;
+  scanf("%u", &index);
+  // expected-note@-1 {{Taint originated here}}
+  // expected-note@-2 {{Taint propagated to the 2nd argument}}
+
   TenElements[index] = 5;
   // expected-warning@-1 {{Potential out of bound access to 'TenElements' with tainted index}}
   // expected-note@-2 {{Access of 'TenElements' with a tainted index that may be too large}}
@@ -59,7 +86,7 @@ void taintedOffset(void) {
   int *p = TenElements + index;
   p[0] = 5;
   // expected-warning@-1 {{Potential out of bound access to 'TenElements' with tainted offset}}
-  // expected-note@-2 {{Access of 'TenElements' with a tainted offset that may be too large}}
+  // expected-note@-2 {{Access of 'TenElements' with a tainted offset that may be negative or too large}}
 }
 
 void arrayOverflow(void) {
@@ -109,6 +136,33 @@ int *potentialAfterTheEndPtr(int idx) {
   // &TenElements[idx].
 }
 
+int overflowOrUnderflow(int arg) {
+  // expected-note@+2 {{Assuming 'arg' is < 0}}
+  // expected-note@+1 {{Taking false branch}}
+  if (arg >= 0)
+    return 0;
+
+  return TenElements[arg - 1];
+  // expected-warning@-1 {{Out of bound access to memory around 'TenElements'}}
+  // expected-note@-2 {{Access of 'TenElements' at a negative or overflowing index, while it holds only 10 'int' elements}}
+}
+
+char TwoElements[2] = {11, 22};
+char overflowOrUnderflowConcrete(int arg) {
+  // expected-note@#cond {{Assuming 'arg' is < 3}}
+  // expected-note@#cond {{Left side of '||' is false}}
+  // expected-note@#cond {{Assuming 'arg' is not equal to 0}}
+  // expected-note@#cond {{Left side of '||' is false}}
+  // expected-note@#cond {{Assuming 'arg' is not equal to 1}}
+  // expected-note@#cond {{Taking false branch}}
+  if (arg >= 3 || arg == 0 || arg == 1) // #cond
+    return 0;
+
+  return TwoElements[arg];
+  // expected-warning@-1 {{Out of bound access to memory around 'TwoElements'}}
+  // expected-note@-2 {{Access of 'TwoElements' at a negative or overflowing index, while it holds only 2 'char' elements}}
+}
+
 int scalar;
 int scalarOverflow(void) {
   return (&scalar)[1];
diff --git a/clang/test/Analysis/taint-diagnostic-visitor.c b/clang/test/Analysis/taint-diagnostic-visitor.c
index 020e9579ac535c..2ba7d9938fc3d8 100644
--- a/clang/test/Analysis/taint-diagnostic-visitor.c
+++ b/clang/test/Analysis/taint-diagnostic-visitor.c
@@ -30,7 +30,7 @@ int taintDiagnosticOutOfBound(void) {
   scanf("%d", &index); // expected-note {{Taint originated here}}
                        // expected-note@-1 {{Taint propagated to the 2nd argument}}
   return Array[index]; // expected-warning {{Potential out of bound access to 'Array' with tainted index}}
-                       // expected-note@-1 {{Access of 'Array' with a tainted index that may be too large}}
+                       // expected-note@-1 {{Access of 'Array' with a tainted index}}
 }
 
 int taintDiagnosticDivZero(int operand) {
diff --git a/clang/test/CodeGen/PowerPC/builtins-ppc-xlcompat-error.c b/clang/test/CodeGen/PowerPC/builtins-ppc-xlcompat-error.c
index 5f57d7575c859a..272e0222dc9e41 100644
--- a/clang/test/CodeGen/PowerPC/builtins-ppc-xlcompat-error.c
+++ b/clang/test/CodeGen/PowerPC/builtins-ppc-xlcompat-error.c
@@ -24,13 +24,16 @@ void test_trap(void) {
   __tw(ia, ib, 0); //expected-error {{argument value 0 is outside the valid range [1, 31]}}
 }
 
+#ifdef __PPC64__
 void test_builtin_ppc_rldimi() {
   unsigned int shift;
   unsigned long long mask;
   unsigned long long res = __builtin_ppc_rldimi(ull, ull, shift, 7); // expected-error {{argument to '__builtin_ppc_rldimi' must be a constant integer}}
   res = __builtin_ppc_rldimi(ull, ull, 63, mask);                    // expected-error {{argument to '__builtin_ppc_rldimi' must be a constant integer}}
   res = __builtin_ppc_rldimi(ull, ull, 63, 0xFFFF000000000F00);      // expected-error {{argument 3 value should represent a contiguous bit field}}
+  res = __builtin_ppc_rldimi(ull, ull, 64, 0xFFFF000000000000);      // expected-error {{argument value 64 is outside the valid range [0, 63]}}
 }
+#endif
 
 void test_builtin_ppc_rlwimi() {
   unsigned int shift;
@@ -83,6 +86,10 @@ void testalignx(const void *pointer, unsigned int alignment) {
 }
 
 #ifndef __PPC64__
+unsigned long long testrldimi32() {
+  return __rldimi(ull, ui, 3, 0x7ffff8ULL); //expected-error {{this builtin is only available on 64-bit targets}}
+}
+
 long long testbpermd(long long bit_selector, long long source) {
   return __bpermd(bit_selector, source); //expected-error {{this builtin is only available on 64-bit targets}}
 }
diff --git a/clang/test/CodeGen/PowerPC/builtins-ppc-xlcompat-rotate.c b/clang/test/CodeGen/PowerPC/builtins-ppc-xlcompat-rotate.c
index b218547c00d931..4773d6cb1a0cfd 100644
--- a/clang/test/CodeGen/PowerPC/builtins-ppc-xlcompat-rotate.c
+++ b/clang/test/CodeGen/PowerPC/builtins-ppc-xlcompat-rotate.c
@@ -1,8 +1,10 @@
 // REQUIRES: powerpc-registered-target
 // RUN: %clang_cc1 -triple powerpc64-unknown-linux-gnu \
-// RUN:   -emit-llvm %s -o - -target-cpu pwr7 | FileCheck %s
+// RUN:   -emit-llvm %s -o - -target-cpu pwr7 | FileCheck %s \
+// RUN:   -check-prefixes=PPC64,CHECK
 // RUN: %clang_cc1 -triple powerpc64le-unknown-linux-gnu \
-// RUN:   -emit-llvm %s -o - -target-cpu pwr8 | FileCheck %s
+// RUN:   -emit-llvm %s -o - -target-cpu pwr8 | FileCheck %s \
+// RUN:   -check-prefixes=PPC64,CHECK
 // RUN: %clang_cc1 -triple powerpc-unknown-aix \
 // RUN:   -emit-llvm %s -o - -target-cpu pwr7 | FileCheck %s
 // RUN: %clang_cc1 -triple powerpc64-unknown-aix \
@@ -11,18 +13,20 @@
 extern unsigned int ui;
 extern unsigned long long ull;
 
+#ifdef __PPC64__
 void test_builtin_ppc_rldimi() {
-  // CHECK-LABEL: test_builtin_ppc_rldimi
-  // CHECK:       %res = alloca i64, align 8
-  // CHECK-NEXT:  [[RA:%[0-9]+]] = load i64, ptr @ull, align 8
-  // CHECK-NEXT:  [[RB:%[0-9]+]] = load i64, ptr @ull, align 8
-  // CHECK-NEXT:  [[RC:%[0-9]+]] = call i64 @llvm.ppc.rldimi(i64 [[RA]], i64 [[RB]], i32 63, i64 72057593769492480)
-  // CHECK-NEXT:  store i64 [[RC]], ptr %res, align 8
-  // CHECK-NEXT:  ret void
+  // PPC64-LABEL: test_builtin_ppc_rldimi
+  // PPC64:       %res = alloca i64, align 8
+  // PPC64-NEXT:  [[RA:%[0-9]+]] = load i64, ptr @ull, align 8
+  // PPC64-NEXT:  [[RB:%[0-9]+]] = load i64, ptr @ull, align 8
+  // PPC64-NEXT:  [[RC:%[0-9]+]] = call i64 @llvm.ppc.rldimi(i64 [[RA]], i64 [[RB]], i32 63, i64 72057593769492480)
+  // PPC64-NEXT:  store i64 [[RC]], ptr %res, align 8
+  // PPC64-NEXT:  ret void
 
   /*shift = 63, mask = 0x00FFFFFFF0000000 = 72057593769492480, ~mask = 0xFF0000000FFFFFFF = -72057593769492481*/
   unsigned long long res = __builtin_ppc_rldimi(ull, ull, 63, 0x00FFFFFFF0000000);
 }
+#endif
 
 void test_builtin_ppc_rlwimi() {
   // CHECK-LABEL: test_builtin_ppc_rlwimi
diff --git a/clang/test/CodeGen/aarch64-soft-float-abi-errors.c b/clang/test/CodeGen/aarch64-soft-float-abi-errors.c
new file mode 100644
index 00000000000000..3e5ab9e92a1d8c
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-soft-float-abi-errors.c
@@ -0,0 +1,99 @@
+// REQUIRES: aarch64-registered-target
+
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +fp-armv8 -S -target-abi aapcs      -verify=fp-hard %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature -fp-armv8 -S -target-abi aapcs-soft -verify=nofp-soft %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature -fp-armv8 -S -target-abi aapcs      -verify=nofp-hard %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature -fp-armv8 -S -target-abi aapcs -O1  -verify=nofp-hard,nofp-hard-opt -emit-llvm %s
+// No run line needed for soft-float ABI with an FPU because that is rejected by the driver
+
+// With the hard-float ABI and a target with an FPU, FP arguments are passed in
+// FP registers, no diagnostics needed.
+// fp-hard-no-diagnostics
+
+// With the soft-float ABI, FP arguments are passed in integer registers, no
+// diagnostics needed.
+// nofp-soft-no-diagnostics
+
+// With the hard-float ABI but no FPU, FP arguments cannot be passed in an
+// ABI-compatible way, so we report errors for these cases:
+
+struct HFA {
+  float x, y;
+};
+
+struct non_HFA {
+  float x;
+  int y;
+};
+
+// Floating-point arguments are returns are rejected
+void test_fp16_arg(__fp16 a) {}
+// nofp-hard-error@-1 {{'a' requires '__fp16' type support, but ABI 'aapcs' does not support it}}
+__fp16 test_fp16_ret(void) { return 3.141; }
+// nofp-hard-error@-1 {{'test_fp16_ret' requires '__fp16' type support, but ABI 'aapcs' does not support it}}
+void test_float_arg(float a) {}
+// nofp-hard-error@-1 {{'a' requires 'float' type support, but ABI 'aapcs' does not support it}}
+float test_float_ret(void) { return 3.141f; }
+// nofp-hard-error@-1 {{'test_float_ret' requires 'float' type support, but ABI 'aapcs' does not support it}}
+void test_double_arg(double a) {}
+// nofp-hard-error@-1 {{'a' requires 'double' type support, but ABI 'aapcs' does not support it}}
+double test_double_ret(void) { return 3.141; }
+// nofp-hard-error@-1 {{'test_double_ret' requires 'double' type support, but ABI 'aapcs' does not support it}}
+void test_long_double_arg(long double a) {}
+// nofp-hard-error@-1 {{'a' requires 'long double' type support, but ABI 'aapcs' does not support it}}
+long double test_long_double_ret(void) { return 3.141L; }
+// nofp-hard-error@-1 {{'test_long_double_ret' requires 'long double' type support, but ABI 'aapcs' does not support it}}
+
+// HFAs would be passed in floating-point registers, so are rejected.
+void test_hfa_arg(struct HFA a) {}
+// nofp-hard-error@-1 {{'a' requires 'struct HFA' type support, but ABI 'aapcs' does not support it}}
+struct HFA test_hfa_ret(void) { return (struct HFA){}; }
+// nofp-hard-error@-1 {{'test_hfa_ret' requires 'struct HFA' type support, but ABI 'aapcs' does not support it}}
+
+// Note: vector types cannot be created at all for targets without an FPU, so
+// it is not possible to create a function which passes/returns them when using
+// either the default or soft-float ABI. This is tested elsewhere.
+
+// This struct contains a floating-point type, but is not an HFA, so can be
+// passed/returned without affecting the ABI.
+struct non_HFA test_non_hfa_ret(void) { return (struct non_HFA){}; }
+void test_non_hfa_arg(struct non_HFA a) {}
+
+// This inline function does not get code-generated because there is no use of
+// it in this file, so we we don't emit an error for it, matching GCC's
+// behaviour.
+inline void test_float_arg_inline(float a) {}
+
+// This inline function is used, so we emit the error if we generate code for
+// it. The code isn't generated at -O0, so no error is emitted there.
+inline void test_float_arg_inline_used(float a) {}
+// nofp-hard-opt-error@-1 {{'a' requires 'float' type support, but ABI 'aapcs' does not support it}}
+void use_inline() { test_float_arg_inline_used(1.0f); }
+
+// The always_inline attribute causes an inline function to always be
+// code-genned, even at -O0, so we always emit the error.
+__attribute((always_inline))
+inline void test_float_arg_always_inline_used(float a) {}
+// nofp-hard-error@-1 {{'a' requires 'float' type support, but ABI 'aapcs' does not support it}}
+void use_always_inline() { test_float_arg_always_inline_used(1.0f); }
+
+// Floating-point expressions, global variables and local variables do not
+// affect the ABI, so are allowed. GCC does reject some uses of floating point
+// types like this, but it does so after optimisation, which we can't
+// accurately match in clang.
+int test_expr_float(int a) { return a + 1.0f; }
+int test_expr_double(int a) { return a + 1.0; }
+
+float global_float = 2.0f * 3.5f;
+float global_double = 2.0 * 3.5;
+
+int test_var_float(int a) {
+  float f = a;
+  f *= 6.0;
+  return (int)f;
+}
+int test_var_double(int a) {
+  double d = a;
+  d *= 6.0;
+  return (int)d;
+}
diff --git a/clang/test/CodeGen/aarch64-soft-float-abi.c b/clang/test/CodeGen/aarch64-soft-float-abi.c
new file mode 100644
index 00000000000000..143377e218a197
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-soft-float-abi.c
@@ -0,0 +1,58 @@
+// RUN: %clang_cc1 -triple aarch64 -target-feature +fp-armv8 -target-abi aapcs -emit-llvm -o - %s | FileCheck %s --check-prefixes=CHECK,HARD
+// RUN: %clang_cc1 -triple aarch64 -target-feature -fp-armv8 -target-abi aapcs-soft -emit-llvm -o - %s | FileCheck %s --check-prefixes=CHECK,SOFT
+
+// See also llvm/test/CodeGen/AArch64/soft-float-abi.ll, which checks the LLVM
+// backend parts of the soft-float ABI.
+
+// The va_list type does not change between the ABIs
+// CHECK: %struct.__va_list = type { ptr, ptr, ptr, i32, i32 }
+
+// Floats are passed in integer registers, this will be handled by the backend.
+// CHECK: define dso_local half @test0(half noundef %a)
+// CHECK: define dso_local bfloat @test1(bfloat noundef %a)
+// CHECK: define dso_local float @test2(float noundef %a)
+// CHECK: define dso_local double @test3(double noundef %a)
+// CHECK: define dso_local fp128 @test4(fp128 noundef %a)
+__fp16 test0(__fp16 a) { return a; }
+__bf16 test1(__bf16 a) { return a; }
+float test2(float a) { return a; }
+double test3(double a) { return a; }
+long double test4(long double a) { return a; }
+
+// No types are considered to be HFAs or HVAs by the soft-float PCS, so these
+// are converted to integer types.
+struct A {
+  float x;
+};
+// SOFT: define dso_local i32 @test10(i64 %a.coerce)
+// HARD: define dso_local %struct.A @test10([1 x float] alignstack(8) %a.coerce)
+struct A test10(struct A a) { return a; }
+
+struct B {
+  double x;
+  double y;
+};
+// SOFT: define dso_local [2 x i64] @test11([2 x i64] %a.coerce)
+// HARD: define dso_local %struct.B @test11([2 x double] alignstack(8) %a.coerce)
+struct B test11(struct B a) { return a; }
+
+#include <stdarg.h>
+
+// The layout of the va_list struct is unchanged between the ABIs, but for
+// aapcs-soft, floating-point arguments will be retreived from the GPR save
+// area, as if they were an integer type of the same size.
+// CHECK-LABEL: define dso_local double @test20(i32 noundef %a, ...)
+// CHECK: %vl = alloca %struct.__va_list, align 8
+// SOFT: %gr_offs_p = getelementptr inbounds %struct.__va_list, ptr %vl, i32 0, i32 3
+// SOFT: %reg_top_p = getelementptr inbounds %struct.__va_list, ptr %vl, i32 0, i32 1
+// HARD: %vr_offs_p = getelementptr inbounds %struct.__va_list, ptr %vl, i32 0, i32 4
+// HARD: %reg_top_p = getelementptr inbounds %struct.__va_list, ptr %vl, i32 0, i32 2
+double test20(int a, ...) {
+  va_list vl;
+  va_start(vl, a);
+  return va_arg(vl, double);
+}
+
+// Vector types are only available for targets with the correct hardware, and
+// their calling-convention is left undefined by the soft-float ABI, so they
+// aren't tested here.
diff --git a/clang/test/CodeGen/attr-target-clones-aarch64.c b/clang/test/CodeGen/attr-target-clones-aarch64.c
index 276a7b87b7a1b4..94095f9aa3e1f4 100644
--- a/clang/test/CodeGen/attr-target-clones-aarch64.c
+++ b/clang/test/CodeGen/attr-target-clones-aarch64.c
@@ -23,8 +23,6 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default"))
 
 
 
-
-
 //.
 // CHECK: @__aarch64_cpu_features = external dso_local global { i64 }
 // CHECK: @ftc.ifunc = weak_odr alias i32 (), ptr @ftc
@@ -177,15 +175,7 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default"))
 // CHECK:       resolver_return:
 // CHECK-NEXT:    ret ptr @ftc_dup2._McrcMdotprod
 // CHECK:       resolver_else:
-// CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
-// CHECK-NEXT:    [[TMP5:%.*]] = and i64 [[TMP4]], 256
-// CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 256
-// CHECK-NEXT:    [[TMP7:%.*]] = and i1 true, [[TMP6]]
-// CHECK-NEXT:    br i1 [[TMP7]], label [[RESOLVER_RETURN1:%.*]], label [[RESOLVER_ELSE2:%.*]]
-// CHECK:       resolver_return1:
 // CHECK-NEXT:    ret ptr @ftc_dup2._Mfp
-// CHECK:       resolver_else2:
-// CHECK-NEXT:    ret ptr @ftc_dup2.default
 //
 //
 // CHECK: Function Attrs: noinline nounwind optnone
diff --git a/clang/test/CodeGen/ptrauth-intrinsics.c b/clang/test/CodeGen/ptrauth-intrinsics.c
new file mode 100644
index 00000000000000..17f28dddb38015
--- /dev/null
+++ b/clang/test/CodeGen/ptrauth-intrinsics.c
@@ -0,0 +1,73 @@
+// RUN: %clang_cc1 -triple arm64-apple-ios -fptrauth-intrinsics -emit-llvm %s  -o - | FileCheck %s
+
+void (*fnptr)(void);
+long int_discriminator;
+void *ptr_discriminator;
+long signature;
+
+// CHECK-LABEL: define void @test_auth()
+void test_auth() {
+  // CHECK:      [[PTR:%.*]] = load ptr, ptr @fnptr,
+  // CHECK-NEXT: [[DISC0:%.*]] = load ptr, ptr @ptr_discriminator,
+  // CHECK-NEXT: [[T0:%.*]] = ptrtoint ptr [[PTR]] to i64
+  // CHECK-NEXT: [[DISC:%.*]] = ptrtoint ptr [[DISC0]] to i64
+  // CHECK-NEXT: [[T1:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[T0]], i32 0, i64 [[DISC]])
+  // CHECK-NEXT: [[RESULT:%.*]] = inttoptr  i64 [[T1]] to ptr
+  // CHECK-NEXT: store ptr [[RESULT]], ptr @fnptr,
+  fnptr = __builtin_ptrauth_auth(fnptr, 0, ptr_discriminator);
+}
+
+// CHECK-LABEL: define void @test_strip()
+void test_strip() {
+  // CHECK:      [[PTR:%.*]] = load ptr, ptr @fnptr,
+  // CHECK-NEXT: [[T0:%.*]] = ptrtoint ptr [[PTR]] to i64
+  // CHECK-NEXT: [[T1:%.*]] = call i64 @llvm.ptrauth.strip(i64 [[T0]], i32 0)
+  // CHECK-NEXT: [[RESULT:%.*]] = inttoptr  i64 [[T1]] to ptr
+  // CHECK-NEXT: store ptr [[RESULT]], ptr @fnptr,
+  fnptr = __builtin_ptrauth_strip(fnptr, 0);
+}
+
+// CHECK-LABEL: define void @test_sign_unauthenticated()
+void test_sign_unauthenticated() {
+  // CHECK:      [[PTR:%.*]] = load ptr, ptr @fnptr,
+  // CHECK-NEXT: [[DISC0:%.*]] = load ptr, ptr @ptr_discriminator,
+  // CHECK-NEXT: [[T0:%.*]] = ptrtoint ptr [[PTR]] to i64
+  // CHECK-NEXT: [[DISC:%.*]] = ptrtoint ptr [[DISC0]] to i64
+  // CHECK-NEXT: [[T1:%.*]] = call i64 @llvm.ptrauth.sign(i64 [[T0]], i32 0, i64 [[DISC]])
+  // CHECK-NEXT: [[RESULT:%.*]] = inttoptr  i64 [[T1]] to ptr
+  // CHECK-NEXT: store ptr [[RESULT]], ptr @fnptr,
+  fnptr = __builtin_ptrauth_sign_unauthenticated(fnptr, 0, ptr_discriminator);
+}
+
+// CHECK-LABEL: define void @test_auth_and_resign()
+void test_auth_and_resign() {
+  // CHECK:      [[PTR:%.*]] = load ptr, ptr @fnptr,
+  // CHECK-NEXT: [[DISC0:%.*]] = load ptr, ptr @ptr_discriminator,
+  // CHECK-NEXT: [[T0:%.*]] = ptrtoint ptr [[PTR]] to i64
+  // CHECK-NEXT: [[DISC:%.*]] = ptrtoint ptr [[DISC0]] to i64
+  // CHECK-NEXT: [[T1:%.*]] = call i64 @llvm.ptrauth.resign(i64 [[T0]], i32 0, i64 [[DISC]], i32 3, i64 15)
+  // CHECK-NEXT: [[RESULT:%.*]] = inttoptr  i64 [[T1]] to ptr
+  // CHECK-NEXT: store ptr [[RESULT]], ptr @fnptr,
+  fnptr = __builtin_ptrauth_auth_and_resign(fnptr, 0, ptr_discriminator, 3, 15);
+}
+
+// CHECK-LABEL: define void @test_blend_discriminator()
+void test_blend_discriminator() {
+  // CHECK:      [[PTR:%.*]] = load ptr, ptr @fnptr,
+  // CHECK-NEXT: [[DISC:%.*]] = load i64, ptr @int_discriminator,
+  // CHECK-NEXT: [[T0:%.*]] = ptrtoint ptr [[PTR]] to i64
+  // CHECK-NEXT: [[RESULT:%.*]] = call i64 @llvm.ptrauth.blend(i64 [[T0]], i64 [[DISC]])
+  // CHECK-NEXT: store i64 [[RESULT]], ptr @int_discriminator,
+  int_discriminator = __builtin_ptrauth_blend_discriminator(fnptr, int_discriminator);
+}
+
+// CHECK-LABEL: define void @test_sign_generic_data()
+void test_sign_generic_data() {
+  // CHECK:      [[PTR:%.*]] = load ptr, ptr @fnptr,
+  // CHECK-NEXT: [[DISC0:%.*]] = load ptr, ptr @ptr_discriminator,
+  // CHECK-NEXT: [[T0:%.*]] = ptrtoint ptr [[PTR]] to i64
+  // CHECK-NEXT: [[DISC:%.*]] = ptrtoint ptr [[DISC0]] to i64
+  // CHECK-NEXT: [[RESULT:%.*]] = call i64 @llvm.ptrauth.sign.generic(i64 [[T0]], i64 [[DISC]])
+  // CHECK-NEXT: store i64 [[RESULT]], ptr @signature,
+  signature = __builtin_ptrauth_sign_generic_data(fnptr, ptr_discriminator);
+}
diff --git a/clang/test/CodeGen/target-avx-abi-diag.c b/clang/test/CodeGen/target-avx-abi-diag.c
index 84be9e252db5cc..dfbbc3213ca6b2 100644
--- a/clang/test/CodeGen/target-avx-abi-diag.c
+++ b/clang/test/CodeGen/target-avx-abi-diag.c
@@ -2,13 +2,9 @@
 // RUN: %clang_cc1 %s -triple=x86_64-linux-gnu -target-feature +avx -verify=no512 -o - -S
 // RUN: %clang_cc1 %s -triple=x86_64-linux-gnu -target-feature +avx512f -verify=both -o - -S
 // RUN: %clang_cc1 %s -triple=x86_64-linux-gnu -target-feature +avx512f -target-feature +evex512 -verify=both -o - -S
-// RUN: %clang_cc1 %s -triple=x86_64-linux-gnu -target-feature +avx512f -target-feature -evex512 -verify=avx512-256 -DAVX512_ERR=1 -o - -S
-// RUN: %clang_cc1 %s -triple=x86_64-linux-gnu -target-feature +avx512f -target-feature -evex512 -verify=avx512-256 -DAVX512_ERR=2 -DNOEVEX512 -o - -S
-// RUN: %clang_cc1 %s -triple=x86_64-linux-gnu -target-feature +avx512f -target-feature -evex512 -verify=avx512-256 -DAVX512_ERR=3 -DNOEVEX512 -o - -S
+// RUN: %clang_cc1 %s -triple=x86_64-linux-gnu -target-feature +avx512f -target-feature -evex512 -verify=avx512-256 -DNOEVEX512 -o - -S
 // RUN: %clang_cc1 %s -triple=x86_64-linux-gnu -target-feature +avx10.1-512 -verify=both -o - -S
-// RUN: %clang_cc1 %s -triple=x86_64-linux-gnu -target-feature +avx10.1-256 -verify=avx512-256 -DAVX512_ERR=1 -o - -S
-// RUN: %clang_cc1 %s -triple=x86_64-linux-gnu -target-feature +avx10.1-256 -verify=avx512-256 -DAVX512_ERR=2 -o - -S
-// RUN: %clang_cc1 %s -triple=x86_64-linux-gnu -target-feature +avx10.1-256 -verify=avx512-256 -DAVX512_ERR=3 -o - -S
+// RUN: %clang_cc1 %s -triple=x86_64-linux-gnu -target-feature +avx10.1-256 -verify=avx512-256 -o - -S
 // REQUIRES: x86-registered-target
 
 // both-no-diagnostics
@@ -25,7 +21,6 @@ void takesAvx512_no_target(avx512fType t);
 void variadic(int i, ...);
 __attribute__((target("avx512f"))) void variadic_err(int i, ...);
 
-#if !defined(AVX512_ERR) || AVX512_ERR == 1
 // If neither side has an attribute, warn.
 void call_warn(void) {
   avx256Type t1;
@@ -39,9 +34,7 @@ void call_warn(void) {
   // avx512-256-error@+1 {{AVX vector argument of type 'avx512fType' (vector of 32 'short' values) without 'evex512' enabled changes the ABI}}
   variadic(3, t2); // no512-warning {{AVX vector argument of type 'avx512fType' (vector of 32 'short' values) without 'avx512f' enabled changes the ABI}}
 }
-#endif
 
-#if !defined(AVX512_ERR) || AVX512_ERR == 2
 // If only 1 side has an attribute, error.
 void call_errors(void) {
   avx256Type t1;
@@ -54,16 +47,21 @@ void call_errors(void) {
   // avx512-256-error@+1 {{AVX vector argument of type 'avx512fType' (vector of 32 'short' values) without 'evex512' enabled changes the ABI}}
   variadic_err(3, t2); // no512-error {{AVX vector argument of type 'avx512fType' (vector of 32 'short' values) without 'avx512f' enabled changes the ABI}}
 }
-#if defined(__AVX10_1__)
-// avx512-256-warning@*:* {{invalid feature combination: +avx512f +avx10.1-256; will be promoted to avx10.1-512}}
-// avx512-256-warning@*:* {{invalid feature combination: +avx512f +avx10.1-256; will be promoted to avx10.1-512}}
-// avx512-256-warning@*:* {{invalid feature combination: +avx512f +avx10.1-256; will be promoted to avx10.1-512}}
-// avx512-256-warning@*:* {{invalid feature combination: +avx512f +avx10.1-256; will be promoted to avx10.1-512}}
-// avx512-256-warning@*:* {{invalid feature combination: +avx512f +avx10.1-256; will be promoted to avx10.1-512}}
-#endif
-#endif
 
-#if !defined(AVX512_ERR) || AVX512_ERR == 3
+// Check that these errors are treated as non-fatal, so we can report them for
+// all functions, not just the first.
+void call_errors_2(void) {
+  avx256Type t1;
+  takesAvx256(t1); // no256-error {{AVX vector argument of type 'avx256Type' (vector of 16 'short' values) without 'avx' enabled changes the ABI}}
+  avx512fType t2;
+  // avx512-256-error@+1 {{AVX vector argument of type 'avx512fType' (vector of 32 'short' values) without 'evex512' enabled changes the ABI}}
+  takesAvx512(t2); // no512-error {{AVX vector argument of type 'avx512fType' (vector of 32 'short' values) without 'avx512f' enabled changes the ABI}}
+
+  variadic_err(1, t1); // no256-error {{AVX vector argument of type 'avx256Type' (vector of 16 'short' values) without 'avx' enabled changes the ABI}}
+  // avx512-256-error@+1 {{AVX vector argument of type 'avx512fType' (vector of 32 'short' values) without 'evex512' enabled changes the ABI}}
+  variadic_err(3, t2); // no512-error {{AVX vector argument of type 'avx512fType' (vector of 32 'short' values) without 'avx512f' enabled changes the ABI}}
+}
+
 __attribute__((target("avx"))) void call_avx256_ok(void) {
   avx256Type t;
   takesAvx256(t);
@@ -93,5 +91,11 @@ __attribute__((target("avx512f"))) void call_avx512_ok2(void) {
 // avx512-256-warning@*:* {{invalid feature combination: +avx512f +avx10.1-256; will be promoted to avx10.1-512}}
 // avx512-256-warning@*:* {{invalid feature combination: +avx512f +avx10.1-256; will be promoted to avx10.1-512}}
 // avx512-256-warning@*:* {{invalid feature combination: +avx512f +avx10.1-256; will be promoted to avx10.1-512}}
-#endif
+// avx512-256-warning@*:* {{invalid feature combination: +avx512f +avx10.1-256; will be promoted to avx10.1-512}}
+// avx512-256-warning@*:* {{invalid feature combination: +avx512f +avx10.1-256; will be promoted to avx10.1-512}}
+// avx512-256-warning@*:* {{invalid feature combination: +avx512f +avx10.1-256; will be promoted to avx10.1-512}}
+// avx512-256-warning@*:* {{invalid feature combination: +avx512f +avx10.1-256; will be promoted to avx10.1-512}}
+// avx512-256-warning@*:* {{invalid feature combination: +avx512f +avx10.1-256; will be promoted to avx10.1-512}}
+// avx512-256-warning@*:* {{invalid feature combination: +avx512f +avx10.1-256; will be promoted to avx10.1-512}}
+// avx512-256-warning@*:* {{invalid feature combination: +avx512f +avx10.1-256; will be promoted to avx10.1-512}}
 #endif
diff --git a/clang/test/CodeGenCXX/RelativeVTablesABI/diamond-virtual-inheritance.cpp b/clang/test/CodeGenCXX/RelativeVTablesABI/diamond-virtual-inheritance.cpp
index f6cf834b503251..f03bb747b2a7b1 100644
--- a/clang/test/CodeGenCXX/RelativeVTablesABI/diamond-virtual-inheritance.cpp
+++ b/clang/test/CodeGenCXX/RelativeVTablesABI/diamond-virtual-inheritance.cpp
@@ -9,19 +9,19 @@
 // CHECK: @_ZTV1B.local = private unnamed_addr constant { [4 x i32], [4 x i32] } { [4 x i32] [i32 8, i32 0, i32 trunc (i64 sub (i64 ptrtoint (ptr @_ZTI1B.rtti_proxy to i64), i64 ptrtoint (ptr getelementptr inbounds ({ [4 x i32], [4 x i32] }, ptr @_ZTV1B.local, i32 0, i32 0, i32 3) to i64)) to i32), i32 trunc (i64 sub (i64 ptrtoint (ptr dso_local_equivalent @_ZN1B4barBEv to i64), i64 ptrtoint (ptr getelementptr inbounds ({ [4 x i32], [4 x i32] }, ptr @_ZTV1B.local, i32 0, i32 0, i32 3) to i64)) to i32)], [4 x i32] [i32 0, i32 -8, i32 trunc (i64 sub (i64 ptrtoint (ptr @_ZTI1B.rtti_proxy to i64), i64 ptrtoint (ptr getelementptr inbounds ({ [4 x i32], [4 x i32] }, ptr @_ZTV1B.local, i32 0, i32 1, i32 3) to i64)) to i32), i32 trunc (i64 sub (i64 ptrtoint (ptr dso_local_equivalent @_ZN1A3fooEv to i64), i64 ptrtoint (ptr getelementptr inbounds ({ [4 x i32], [4 x i32] }, ptr @_ZTV1B.local, i32 0, i32 1, i32 3) to i64)) to i32)] }, align 4
 
 // VTT for B
-// CHECK: @_ZTT1B ={{.*}} unnamed_addr constant [2 x ptr] [ptr getelementptr inbounds ({ [4 x i32], [4 x i32] }, ptr @_ZTV1B.local, i32 0, inrange i32 0, i32 3), ptr getelementptr inbounds ({ [4 x i32], [4 x i32] }, ptr @_ZTV1B.local, i32 0, inrange i32 1, i32 3)], align 8
+// CHECK: @_ZTT1B ={{.*}} unnamed_addr constant [2 x ptr] [ptr getelementptr inbounds inrange(-12, 4) ({ [4 x i32], [4 x i32] }, ptr @_ZTV1B.local, i32 0, i32 0, i32 3), ptr getelementptr inbounds inrange(-12, 4) ({ [4 x i32], [4 x i32] }, ptr @_ZTV1B.local, i32 0, i32 1, i32 3)], align 8
 
 // VTable for C
 // CHECK: @_ZTV1C.local = private unnamed_addr constant { [4 x i32], [4 x i32] } { [4 x i32] [i32 8, i32 0, i32 trunc (i64 sub (i64 ptrtoint (ptr @_ZTI1C.rtti_proxy to i64), i64 ptrtoint (ptr getelementptr inbounds ({ [4 x i32], [4 x i32] }, ptr @_ZTV1C.local, i32 0, i32 0, i32 3) to i64)) to i32), i32 trunc (i64 sub (i64 ptrtoint (ptr dso_local_equivalent @_ZN1C4barCEv to i64), i64 ptrtoint (ptr getelementptr inbounds ({ [4 x i32], [4 x i32] }, ptr @_ZTV1C.local, i32 0, i32 0, i32 3) to i64)) to i32)], [4 x i32] [i32 0, i32 -8, i32 trunc (i64 sub (i64 ptrtoint (ptr @_ZTI1C.rtti_proxy to i64), i64 ptrtoint (ptr getelementptr inbounds ({ [4 x i32], [4 x i32] }, ptr @_ZTV1C.local, i32 0, i32 1, i32 3) to i64)) to i32), i32 trunc (i64 sub (i64 ptrtoint (ptr dso_local_equivalent @_ZN1A3fooEv to i64), i64 ptrtoint (ptr getelementptr inbounds ({ [4 x i32], [4 x i32] }, ptr @_ZTV1C.local, i32 0, i32 1, i32 3) to i64)) to i32)] }, align 4
 
 // VTT for C
-// CHECK: @_ZTT1C ={{.*}} unnamed_addr constant [2 x ptr] [ptr getelementptr inbounds ({ [4 x i32], [4 x i32] }, ptr @_ZTV1C.local, i32 0, inrange i32 0, i32 3), ptr getelementptr inbounds ({ [4 x i32], [4 x i32] }, ptr @_ZTV1C.local, i32 0, inrange i32 1, i32 3)], align 8
+// CHECK: @_ZTT1C ={{.*}} unnamed_addr constant [2 x ptr] [ptr getelementptr inbounds inrange(-12, 4) ({ [4 x i32], [4 x i32] }, ptr @_ZTV1C.local, i32 0, i32 0, i32 3), ptr getelementptr inbounds inrange(-12, 4) ({ [4 x i32], [4 x i32] }, ptr @_ZTV1C.local, i32 0, i32 1, i32 3)], align 8
 
 // VTable for D
 // CHECK: @_ZTV1D.local = private unnamed_addr constant { [5 x i32], [4 x i32], [4 x i32] } { [5 x i32] [i32 16, i32 0, i32 trunc (i64 sub (i64 ptrtoint (ptr @_ZTI1D.rtti_proxy to i64), i64 ptrtoint (ptr getelementptr inbounds ({ [5 x i32], [4 x i32], [4 x i32] }, ptr @_ZTV1D.local, i32 0, i32 0, i32 3) to i64)) to i32), i32 trunc (i64 sub (i64 ptrtoint (ptr dso_local_equivalent @_ZN1B4barBEv to i64), i64 ptrtoint (ptr getelementptr inbounds ({ [5 x i32], [4 x i32], [4 x i32] }, ptr @_ZTV1D.local, i32 0, i32 0, i32 3) to i64)) to i32), i32 trunc (i64 sub (i64 ptrtoint (ptr dso_local_equivalent @_ZN1D3bazEv to i64), i64 ptrtoint (ptr getelementptr inbounds ({ [5 x i32], [4 x i32], [4 x i32] }, ptr @_ZTV1D.local, i32 0, i32 0, i32 3) to i64)) to i32)], [4 x i32] [i32 8, i32 -8, i32 trunc (i64 sub (i64 ptrtoint (ptr @_ZTI1D.rtti_proxy to i64), i64 ptrtoint (ptr getelementptr inbounds ({ [5 x i32], [4 x i32], [4 x i32] }, ptr @_ZTV1D.local, i32 0, i32 1, i32 3) to i64)) to i32), i32 trunc (i64 sub (i64 ptrtoint (ptr dso_local_equivalent @_ZN1C4barCEv to i64), i64 ptrtoint (ptr getelementptr inbounds ({ [5 x i32], [4 x i32], [4 x i32] }, ptr @_ZTV1D.local, i32 0, i32 1, i32 3) to i64)) to i32)], [4 x i32] [i32 0, i32 -16, i32 trunc (i64 sub (i64 ptrtoint (ptr @_ZTI1D.rtti_proxy to i64), i64 ptrtoint (ptr getelementptr inbounds ({ [5 x i32], [4 x i32], [4 x i32] }, ptr @_ZTV1D.local, i32 0, i32 2, i32 3) to i64)) to i32), i32 trunc (i64 sub (i64 ptrtoint (ptr dso_local_equivalent @_ZN1A3fooEv to i64), i64 ptrtoint (ptr getelementptr inbounds ({ [5 x i32], [4 x i32], [4 x i32] }, ptr @_ZTV1D.local, i32 0, i32 2, i32 3) to i64)) to i32)] }, align 4
 
 // VTT for D
-// CHECK: @_ZTT1D ={{.*}} unnamed_addr constant [7 x ptr] [ptr getelementptr inbounds ({ [5 x i32], [4 x i32], [4 x i32] }, ptr @_ZTV1D.local, i32 0, inrange i32 0, i32 3), ptr getelementptr inbounds ({ [4 x i32], [4 x i32] }, ptr @_ZTC1D0_1B.local, i32 0, inrange i32 0, i32 3), ptr getelementptr inbounds ({ [4 x i32], [4 x i32] }, ptr @_ZTC1D0_1B.local, i32 0, inrange i32 1, i32 3), ptr getelementptr inbounds ({ [4 x i32], [4 x i32] }, ptr @_ZTC1D8_1C.local, i32 0, inrange i32 0, i32 3), ptr getelementptr inbounds ({ [4 x i32], [4 x i32] }, ptr @_ZTC1D8_1C.local, i32 0, inrange i32 1, i32 3), ptr getelementptr inbounds ({ [5 x i32], [4 x i32], [4 x i32] }, ptr @_ZTV1D.local, i32 0, inrange i32 2, i32 3), ptr getelementptr inbounds ({ [5 x i32], [4 x i32], [4 x i32] }, ptr @_ZTV1D.local, i32 0, inrange i32 1, i32 3)], align 8
+// CHECK: @_ZTT1D ={{.*}} unnamed_addr constant [7 x ptr] [ptr getelementptr inbounds inrange(-12, 8) ({ [5 x i32], [4 x i32], [4 x i32] }, ptr @_ZTV1D.local, i32 0, i32 0, i32 3), ptr getelementptr inbounds inrange(-12, 4) ({ [4 x i32], [4 x i32] }, ptr @_ZTC1D0_1B.local, i32 0, i32 0, i32 3), ptr getelementptr inbounds inrange(-12, 4) ({ [4 x i32], [4 x i32] }, ptr @_ZTC1D0_1B.local, i32 0, i32 1, i32 3), ptr getelementptr inbounds inrange(-12, 4) ({ [4 x i32], [4 x i32] }, ptr @_ZTC1D8_1C.local, i32 0, i32 0, i32 3), ptr getelementptr inbounds inrange(-12, 4) ({ [4 x i32], [4 x i32] }, ptr @_ZTC1D8_1C.local, i32 0, i32 1, i32 3), ptr getelementptr inbounds inrange(-12, 4) ({ [5 x i32], [4 x i32], [4 x i32] }, ptr @_ZTV1D.local, i32 0, i32 2, i32 3), ptr getelementptr inbounds inrange(-12, 4) ({ [5 x i32], [4 x i32], [4 x i32] }, ptr @_ZTV1D.local, i32 0, i32 1, i32 3)], align 8
 
 // Construction vtable for B-in-D
 // CHECK: @_ZTC1D0_1B.local = private unnamed_addr constant { [4 x i32], [4 x i32] } { [4 x i32] [i32 16, i32 0, i32 trunc (i64 sub (i64 ptrtoint (ptr @_ZTI1B.rtti_proxy to i64), i64 ptrtoint (ptr getelementptr inbounds ({ [4 x i32], [4 x i32] }, ptr @_ZTC1D0_1B.local, i32 0, i32 0, i32 3) to i64)) to i32), i32 trunc (i64 sub (i64 ptrtoint (ptr dso_local_equivalent @_ZN1B4barBEv to i64), i64 ptrtoint (ptr getelementptr inbounds ({ [4 x i32], [4 x i32] }, ptr @_ZTC1D0_1B.local, i32 0, i32 0, i32 3) to i64)) to i32)], [4 x i32] [i32 0, i32 -16, i32 trunc (i64 sub (i64 ptrtoint (ptr @_ZTI1B.rtti_proxy to i64), i64 ptrtoint (ptr getelementptr inbounds ({ [4 x i32], [4 x i32] }, ptr @_ZTC1D0_1B.local, i32 0, i32 1, i32 3) to i64)) to i32), i32 trunc (i64 sub (i64 ptrtoint (ptr dso_local_equivalent @_ZN1A3fooEv to i64), i64 ptrtoint (ptr getelementptr inbounds ({ [4 x i32], [4 x i32] }, ptr @_ZTC1D0_1B.local, i32 0, i32 1, i32 3) to i64)) to i32)] }, align 4
diff --git a/clang/test/CodeGenCXX/auto-var-init.cpp b/clang/test/CodeGenCXX/auto-var-init.cpp
index e5a9d015f22f27..991eb73fe45c58 100644
--- a/clang/test/CodeGenCXX/auto-var-init.cpp
+++ b/clang/test/CodeGenCXX/auto-var-init.cpp
@@ -1345,7 +1345,7 @@ TEST_UNINIT(base, base);
 // PATTERN-O0: call void @llvm.memcpy{{.*}} @__const.test_base_uninit.uninit{{.+}}), !annotation [[AUTO_INIT]]
 // ZERO-LABEL: @test_base_uninit()
 // ZERO-O0: call void @llvm.memset{{.*}}, i8 0,{{.+}}), !annotation [[AUTO_INIT]]
-// ZERO-O1: store ptr getelementptr inbounds ({ [4 x ptr] }, ptr @_ZTV4base, i64 0, inrange i32 0, i64 2), {{.*}}, align 8
+// ZERO-O1: store ptr getelementptr inbounds inrange(-16, 16) ({ [4 x ptr] }, ptr @_ZTV4base, i64 0, i32 0, i64 2), {{.*}}, align 8
 // ZERO-O1-NOT: !annotation
 
 TEST_BRACES(base, base);
@@ -1366,7 +1366,7 @@ TEST_UNINIT(derived, derived);
 // ZERO-LABEL: @test_derived_uninit()
 // ZERO-O0: call void @llvm.memset{{.*}}, i8 0, {{.+}}), !annotation [[AUTO_INIT]]
 // ZERO-O1: store i64 0, {{.*}} align 8, !annotation [[AUTO_INIT]]
-// ZERO-O1: store ptr getelementptr inbounds ({ [4 x ptr] }, ptr @_ZTV7derived, i64 0, inrange i32 0, i64 2), {{.*}} align 8
+// ZERO-O1: store ptr getelementptr inbounds inrange(-16, 16) ({ [4 x ptr] }, ptr @_ZTV7derived, i64 0, i32 0, i64 2), {{.*}} align 8
 
 TEST_BRACES(derived, derived);
 // CHECK-LABEL: @test_derived_braces()
diff --git a/clang/test/CodeGenCXX/const-init-cxx11.cpp b/clang/test/CodeGenCXX/const-init-cxx11.cpp
index 3a12fe444f137b..7c92af0def5279 100644
--- a/clang/test/CodeGenCXX/const-init-cxx11.cpp
+++ b/clang/test/CodeGenCXX/const-init-cxx11.cpp
@@ -344,13 +344,13 @@ namespace VirtualMembers {
     constexpr E() : B(3), c{'b','y','e'} {}
     char c[3];
   };
-  // CHECK: @_ZN14VirtualMembers1eE ={{.*}} global { ptr, double, i32, ptr, double, [5 x i8], i16, ptr, double, [5 x i8], [3 x i8] } { ptr getelementptr inbounds ({ [3 x ptr], [4 x ptr], [4 x ptr] }, ptr @_ZTVN14VirtualMembers1EE, i32 0, inrange i32 0, i32 2), double 1.000000e+00, i32 64, ptr getelementptr inbounds ({ [3 x ptr], [4 x ptr], [4 x ptr] }, ptr @_ZTVN14VirtualMembers1EE, i32 0, inrange i32 1, i32 2), double 2.000000e+00, [5 x i8] c"hello", i16 5, ptr getelementptr inbounds ({ [3 x ptr], [4 x ptr], [4 x ptr] }, ptr @_ZTVN14VirtualMembers1EE, i32 0, inrange i32 2, i32 2), double 3.000000e+00, [5 x i8] c"world", [3 x i8] c"bye" }
+  // CHECK: @_ZN14VirtualMembers1eE ={{.*}} global { ptr, double, i32, ptr, double, [5 x i8], i16, ptr, double, [5 x i8], [3 x i8] } { ptr getelementptr inbounds inrange(-16, 8) ({ [3 x ptr], [4 x ptr], [4 x ptr] }, ptr @_ZTVN14VirtualMembers1EE, i32 0, i32 0, i32 2), double 1.000000e+00, i32 64, ptr getelementptr inbounds inrange(-16, 16) ({ [3 x ptr], [4 x ptr], [4 x ptr] }, ptr @_ZTVN14VirtualMembers1EE, i32 0, i32 1, i32 2), double 2.000000e+00, [5 x i8] c"hello", i16 5, ptr getelementptr inbounds inrange(-16, 16) ({ [3 x ptr], [4 x ptr], [4 x ptr] }, ptr @_ZTVN14VirtualMembers1EE, i32 0, i32 2, i32 2), double 3.000000e+00, [5 x i8] c"world", [3 x i8] c"bye" }
   E e;
 
   struct nsMemoryImpl {
     virtual void f();
   };
-  // CHECK: @_ZN14VirtualMembersL13sGlobalMemoryE = internal global %"struct.VirtualMembers::nsMemoryImpl" { ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTVN14VirtualMembers12nsMemoryImplE, i32 0, inrange i32 0, i32 2) }
+  // CHECK: @_ZN14VirtualMembersL13sGlobalMemoryE = internal global %"struct.VirtualMembers::nsMemoryImpl" { ptr getelementptr inbounds inrange(-16, 8) ({ [3 x ptr] }, ptr @_ZTVN14VirtualMembers12nsMemoryImplE, i32 0, i32 0, i32 2) }
   __attribute__((used))
   static nsMemoryImpl sGlobalMemory;
 
@@ -361,7 +361,7 @@ namespace VirtualMembers {
 
     T t;
   };
-  // CHECK: @_ZN14VirtualMembers1tE ={{.*}} global { ptr, i32 } { ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTVN14VirtualMembers13TemplateClassIiEE, i32 0, inrange i32 0, i32 2), i32 42 }
+  // CHECK: @_ZN14VirtualMembers1tE ={{.*}} global { ptr, i32 } { ptr getelementptr inbounds inrange(-16, 8) ({ [3 x ptr] }, ptr @_ZTVN14VirtualMembers13TemplateClassIiEE, i32 0, i32 0, i32 2), i32 42 }
   TemplateClass<int> t;
 }
 
diff --git a/clang/test/CodeGenCXX/constructor-init.cpp b/clang/test/CodeGenCXX/constructor-init.cpp
index 3d473e67ea0d8e..f191599f360e7b 100644
--- a/clang/test/CodeGenCXX/constructor-init.cpp
+++ b/clang/test/CodeGenCXX/constructor-init.cpp
@@ -94,20 +94,20 @@ namespace InitVTable {
   };
 
   // CHECK-LABEL: define{{.*}} void @_ZN10InitVTable1BC2Ev(ptr {{[^,]*}} %this) unnamed_addr
-  // CHECK: store ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTVN10InitVTable1BE, i32 0, inrange i32 0, i32 2), ptr [[THIS:%.*]],
+  // CHECK: store ptr getelementptr inbounds inrange(-16, 8) ({ [3 x ptr] }, ptr @_ZTVN10InitVTable1BE, i32 0, i32 0, i32 2), ptr [[THIS:%.*]],
   // CHECK:      [[VTBL:%.*]] = load ptr, ptr {{%.*}}
   // CHECK-NEXT: [[FNP:%.*]] = getelementptr inbounds ptr, ptr [[VTBL]], i64 0
   // CHECK-NEXT: [[FN:%.*]] = load ptr, ptr [[FNP]]
   // CHECK-NEXT: [[ARG:%.*]] = call noundef i32 [[FN]](ptr {{[^,]*}} [[THIS]])
   // CHECK-NEXT: call void @_ZN10InitVTable1AC2Ei(ptr {{[^,]*}} {{%.*}}, i32 noundef [[ARG]])
-  // CHECK-NEXT: store ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTVN10InitVTable1BE, i32 0, inrange i32 0, i32 2), ptr [[THIS]]
+  // CHECK-NEXT: store ptr getelementptr inbounds inrange(-16, 8) ({ [3 x ptr] }, ptr @_ZTVN10InitVTable1BE, i32 0, i32 0, i32 2), ptr [[THIS]]
   // CHECK-NEXT: ret void
   B::B() : A(foo()) {}
 
   // CHECK-LABEL: define{{.*}} void @_ZN10InitVTable1BC2Ei(ptr {{[^,]*}} %this, i32 noundef %x) unnamed_addr
   // CHECK:      [[ARG:%.*]] = add nsw i32 {{%.*}}, 5
   // CHECK-NEXT: call void @_ZN10InitVTable1AC2Ei(ptr {{[^,]*}} {{%.*}}, i32 noundef [[ARG]])
-  // CHECK-NEXT: store ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTVN10InitVTable1BE, i32 0, inrange i32 0, i32 2), ptr {{%.*}}
+  // CHECK-NEXT: store ptr getelementptr inbounds inrange(-16, 8) ({ [3 x ptr] }, ptr @_ZTVN10InitVTable1BE, i32 0, i32 0, i32 2), ptr {{%.*}}
   // CHECK-NEXT: ret void
   B::B(int x) : A(x + 5) {}
 }
diff --git a/clang/test/CodeGenCXX/copy-constructor-synthesis-2.cpp b/clang/test/CodeGenCXX/copy-constructor-synthesis-2.cpp
index 043dff44f37c26..4f96a3ae670774 100644
--- a/clang/test/CodeGenCXX/copy-constructor-synthesis-2.cpp
+++ b/clang/test/CodeGenCXX/copy-constructor-synthesis-2.cpp
@@ -24,4 +24,4 @@ struct A { virtual void a(); };
 A x(A& y) { return y; }
 
 // CHECK: define linkonce_odr {{.*}} @_ZN1AC1ERKS_(ptr {{.*}}%this, ptr noundef nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) %0) unnamed_addr
-// CHECK: store ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTV1A, i32 0, inrange i32 0, i32 2)
+// CHECK: store ptr getelementptr inbounds inrange(-16, 8) ({ [3 x ptr] }, ptr @_ZTV1A, i32 0, i32 0, i32 2)
diff --git a/clang/test/CodeGenCXX/copy-constructor-synthesis.cpp b/clang/test/CodeGenCXX/copy-constructor-synthesis.cpp
index 3548897ec4ba0f..e4f6995aec3cfd 100644
--- a/clang/test/CodeGenCXX/copy-constructor-synthesis.cpp
+++ b/clang/test/CodeGenCXX/copy-constructor-synthesis.cpp
@@ -163,7 +163,7 @@ void f(B b1) {
 
 // CHECK-LABEL:    define linkonce_odr void @_ZN12rdar138169401AC2ERKS0_(
 // CHECK:      [[THIS:%.*]] = load ptr, ptr
-// CHECK-NEXT: store ptr getelementptr inbounds ({ [4 x ptr] }, ptr @_ZTVN12rdar138169401AE, i32 0, inrange i32 0, i32 2), ptr [[THIS]]
+// CHECK-NEXT: store ptr getelementptr inbounds inrange(-16, 16) ({ [4 x ptr] }, ptr @_ZTVN12rdar138169401AE, i32 0, i32 0, i32 2), ptr [[THIS]]
 // CHECK-NEXT: [[T0:%.*]] = getelementptr inbounds [[A]], ptr [[THIS]], i32 0, i32 1
 // CHECK-NEXT: [[OTHER:%.*]] = load ptr, ptr
 // CHECK-NEXT: [[T2:%.*]] = getelementptr inbounds [[A]], ptr [[OTHER]], i32 0, i32 1
diff --git a/clang/test/CodeGenCXX/dynamic-cast-exact.cpp b/clang/test/CodeGenCXX/dynamic-cast-exact.cpp
index bd283e85101b4b..86e1965b4ba68b 100644
--- a/clang/test/CodeGenCXX/dynamic-cast-exact.cpp
+++ b/clang/test/CodeGenCXX/dynamic-cast-exact.cpp
@@ -23,7 +23,7 @@ B *exact_single(A *a) {
 
   // CHECK: [[LABEL_NOTNULL]]:
   // CHECK: %[[VPTR:.*]] = load ptr, ptr %[[PTR]]
-  // CHECK: %[[MATCH:.*]] = icmp eq ptr %[[VPTR]], getelementptr inbounds ({ [4 x ptr], [4 x ptr] }, ptr @_ZTV1B, i32 0, inrange i32 1, i32 2)
+  // CHECK: %[[MATCH:.*]] = icmp eq ptr %[[VPTR]], getelementptr inbounds inrange(-16, 16) ({ [4 x ptr], [4 x ptr] }, ptr @_ZTV1B, i32 0, i32 1, i32 2)
   // CHECK: %[[RESULT:.*]] = getelementptr inbounds i8, ptr %[[PTR]], i64 -8
   // CHECK: br i1 %[[MATCH]], label %[[LABEL_END:.*]], label %[[LABEL_FAILED]]
 
@@ -42,7 +42,7 @@ B &exact_ref(A &a) {
 
   // CHECK: [[LABEL_NOTNULL]]:
   // CHECK: %[[VPTR:.*]] = load ptr, ptr %[[PTR]]
-  // CHECK: %[[MATCH:.*]] = icmp eq ptr %[[VPTR]], getelementptr inbounds ({ [4 x ptr], [4 x ptr] }, ptr @_ZTV1B, i32 0, inrange i32 1, i32 2)
+  // CHECK: %[[MATCH:.*]] = icmp eq ptr %[[VPTR]], getelementptr inbounds inrange(-16, 16) ({ [4 x ptr], [4 x ptr] }, ptr @_ZTV1B, i32 0, i32 1, i32 2)
   // CHECK: %[[RESULT:.*]] = getelementptr inbounds i8, ptr %[[PTR]], i64 -8
   // CHECK: br i1 %[[MATCH]], label %[[LABEL_END:.*]], label %[[LABEL_FAILED]]
 
@@ -66,7 +66,7 @@ H *exact_multi(A *a) {
   // CHECK: %[[OFFSET_TO_TOP:.*]] = load i64, ptr %[[OFFSET_TO_TOP_SLOT]]
   // CHECK: %[[RESULT:.*]] = getelementptr inbounds i8, ptr %[[PTR]], i64 %[[OFFSET_TO_TOP]]
   // CHECK: %[[DERIVED_VPTR:.*]] = load ptr, ptr %[[RESULT]]
-  // CHECK: %[[MATCH:.*]] = icmp eq ptr %[[DERIVED_VPTR]], getelementptr inbounds ({ [5 x ptr], [4 x ptr], [4 x ptr], [6 x ptr], [6 x ptr] }, ptr @_ZTV1H, i32 0, inrange i32 0, i32 3)
+  // CHECK: %[[MATCH:.*]] = icmp eq ptr %[[DERIVED_VPTR]], getelementptr inbounds inrange(-24, 16) ({ [5 x ptr], [4 x ptr], [4 x ptr], [6 x ptr], [6 x ptr] }, ptr @_ZTV1H, i32 0, i32 0, i32 3)
   // CHECK: br i1 %[[MATCH]], label %[[LABEL_END:.*]], label %[[LABEL_FAILED]]
 
   // CHECK: [[LABEL_FAILED]]:
diff --git a/clang/test/CodeGenCXX/mangle-ms-back-references.cpp b/clang/test/CodeGenCXX/mangle-ms-back-references.cpp
index cb95c100b3d22e..b27a9c5acacb77 100644
--- a/clang/test/CodeGenCXX/mangle-ms-back-references.cpp
+++ b/clang/test/CodeGenCXX/mangle-ms-back-references.cpp
@@ -83,3 +83,20 @@ class H;
 
 void ManyParams(T01 &, T02 &, T03 &, T04 &, T05 &, T06 &, T07 &, T08 &, T09 &, T10 &, H<T11> &, H<T11> &) {}
 // CHECK: "?ManyParams@@YAXAAVT01@@AAVT02@@AAVT03@@AAVT04@@AAVT05@@AAVT06@@AAVT07@@AAVT08@@AAVT09@@AAVT10@@AAV?$H@VT11@@@@AAV?$H@VT11@@@@@Z"
+
+namespace NS {
+// The name "TSS0" for the name of the class below has been specifically
+// chosen to ensure that back reference lookup does not match against the
+// implicitly generated "$TSS0" name of the thread safe static initialization
+// variable.
+struct __declspec(dllexport) TSS0 {
+  static TSS0& get();
+  __forceinline static TSS0& singleton() {
+    static TSS0& lsv = get();
+    return lsv;
+  }
+};
+}
+// CHECK: "?singleton@TSS0@NS@@SAAAU12@XZ"
+// CHECK: "?lsv@?1??singleton@TSS0@NS@@SAAAU23@XZ@4AAU23@A"
+// CHECK: "?$TSS0@?1??singleton@TSS0@NS@@SAAAU23@XZ@4HA"
diff --git a/clang/test/CodeGenCXX/microsoft-interface.cpp b/clang/test/CodeGenCXX/microsoft-interface.cpp
index d70caa15ed3c18..09b30bc96c855a 100644
--- a/clang/test/CodeGenCXX/microsoft-interface.cpp
+++ b/clang/test/CodeGenCXX/microsoft-interface.cpp
@@ -31,10 +31,10 @@ int fn() {
 
 // CHECK-LABEL: define linkonce_odr dso_local x86_thiscallcc void @_ZN1SC2Ev(ptr {{[^,]*}} %this)
 // CHECK:   call x86_thiscallcc void @_ZN1IC2Ev(ptr {{[^,]*}} %{{[.0-9A-Z_a-z]+}})
-// CHECK:   store ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTV1S, i32 0, inrange i32 0, i32 2), ptr %{{[.0-9A-Z_a-z]+}}
+// CHECK:   store ptr getelementptr inbounds inrange(-8, 4) ({ [3 x ptr] }, ptr @_ZTV1S, i32 0, i32 0, i32 2), ptr %{{[.0-9A-Z_a-z]+}}
 
 // CHECK-LABEL: define linkonce_odr dso_local x86_thiscallcc void @_ZN1IC2Ev(ptr {{[^,]*}} %this)
-// CHECK:   store ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTV1I, i32 0, inrange i32 0, i32 2), ptr %{{[.0-9A-Z_a-z]+}}
+// CHECK:   store ptr getelementptr inbounds inrange(-8, 4) ({ [3 x ptr] }, ptr @_ZTV1I, i32 0, i32 0, i32 2), ptr %{{[.0-9A-Z_a-z]+}}
 
 // CHECK-LABEL: define linkonce_odr dso_local x86_thiscallcc noundef i32 @_ZN1I4testEv(ptr {{[^,]*}} %this)
 // CHECK:   ret i32 1
diff --git a/clang/test/CodeGenCXX/skip-vtable-pointer-initialization.cpp b/clang/test/CodeGenCXX/skip-vtable-pointer-initialization.cpp
index eb8f21b57aa7b6..c001ce9b755d14 100644
--- a/clang/test/CodeGenCXX/skip-vtable-pointer-initialization.cpp
+++ b/clang/test/CodeGenCXX/skip-vtable-pointer-initialization.cpp
@@ -27,7 +27,7 @@ struct A {
 };
 
 // CHECK-LABEL: define{{.*}} void @_ZN5Test21AD2Ev
-// CHECK: store ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTVN5Test21AE, i32 0, inrange i32 0, i32 2), ptr
+// CHECK: store ptr getelementptr inbounds inrange(-16, 8) ({ [3 x ptr] }, ptr @_ZTVN5Test21AE, i32 0, i32 0, i32 2), ptr
 A::~A() {
   f();
 }
@@ -50,7 +50,7 @@ struct A {
 };
 
 // CHECK-LABEL: define{{.*}} void @_ZN5Test31AD2Ev
-// CHECK-NOT: store ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTVN5Test31AE, i32 0, inrange i32 0, i32 2), ptr
+// CHECK-NOT: store ptr getelementptr inbounds inrange(-16, 8) ({ [3 x ptr] }, ptr @_ZTVN5Test31AE, i32 0, i32 0, i32 2), ptr
 A::~A() {
   
 }
@@ -76,7 +76,7 @@ struct A {
 };
 
 // CHECK-LABEL: define{{.*}} void @_ZN5Test41AD2Ev
-// CHECK: store ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTVN5Test41AE, i32 0, inrange i32 0, i32 2), ptr
+// CHECK: store ptr getelementptr inbounds inrange(-16, 8) ({ [3 x ptr] }, ptr @_ZTVN5Test41AE, i32 0, i32 0, i32 2), ptr
 A::~A()
 {
 }
@@ -100,7 +100,7 @@ struct A {
 };
 
 // CHECK-LABEL: define{{.*}} void @_ZN5Test51AD2Ev
-// CHECK: store ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTVN5Test51AE, i32 0, inrange i32 0, i32 2), ptr
+// CHECK: store ptr getelementptr inbounds inrange(-16, 8) ({ [3 x ptr] }, ptr @_ZTVN5Test51AE, i32 0, i32 0, i32 2), ptr
 A::~A()
 {
 }
@@ -128,7 +128,7 @@ struct A {
 };
 
 // CHECK-LABEL: define{{.*}} void @_ZN5Test61AD2Ev
-// CHECK: store ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTVN5Test61AE, i32 0, inrange i32 0, i32 2), ptr
+// CHECK: store ptr getelementptr inbounds inrange(-16, 8) ({ [3 x ptr] }, ptr @_ZTVN5Test61AE, i32 0, i32 0, i32 2), ptr
 A::~A()
 {
 }
@@ -154,7 +154,7 @@ struct A {
 };
 
 // CHECK-LABEL: define{{.*}} void @_ZN5Test71AD2Ev
-// CHECK: store ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTVN5Test71AE, i32 0, inrange i32 0, i32 2), ptr
+// CHECK: store ptr getelementptr inbounds inrange(-16, 8) ({ [3 x ptr] }, ptr @_ZTVN5Test71AE, i32 0, i32 0, i32 2), ptr
 A::~A()
 {
 }
@@ -180,7 +180,7 @@ struct A {
 };
 
 // CHECK-LABEL: define{{.*}} void @_ZN5Test81AD2Ev
-// CHECK: store ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTVN5Test81AE, i32 0, inrange i32 0, i32 2), ptr
+// CHECK: store ptr getelementptr inbounds inrange(-16, 8) ({ [3 x ptr] }, ptr @_ZTVN5Test81AE, i32 0, i32 0, i32 2), ptr
 A::~A()
 {
 }
diff --git a/clang/test/CodeGenCXX/static-init.cpp b/clang/test/CodeGenCXX/static-init.cpp
index a44f78116dbbee..51080c895ec65a 100644
--- a/clang/test/CodeGenCXX/static-init.cpp
+++ b/clang/test/CodeGenCXX/static-init.cpp
@@ -14,7 +14,7 @@
 // CHECK: @_ZGVZ2h2vE1i = linkonce_odr global i64 0, comdat, align 8{{$}}
 // CHECK: @_ZZN5test1L6getvarEiE3var = internal constant [4 x i32] [i32 1, i32 0, i32 2, i32 4], align 16
 // CHECK98: @_ZZN5test414useStaticLocalEvE3obj = linkonce_odr global %"struct.test4::HasVTable" zeroinitializer, comdat, align 8
-// CHECK11: @_ZZN5test414useStaticLocalEvE3obj = linkonce_odr global %"struct.test4::HasVTable" { ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTVN5test49HasVTableE, i32 0, inrange i32 0, i32 2) }, comdat, align 8
+// CHECK11: @_ZZN5test414useStaticLocalEvE3obj = linkonce_odr global %"struct.test4::HasVTable" { ptr getelementptr inbounds inrange(-16, 8) ({ [3 x ptr] }, ptr @_ZTVN5test49HasVTableE, i32 0, i32 0, i32 2) }, comdat, align 8
 
 struct A {
   A();
diff --git a/clang/test/CodeGenCXX/strict-vtable-pointers.cpp b/clang/test/CodeGenCXX/strict-vtable-pointers.cpp
index 34e50b39a5daa4..6dffa50a5430d0 100644
--- a/clang/test/CodeGenCXX/strict-vtable-pointers.cpp
+++ b/clang/test/CodeGenCXX/strict-vtable-pointers.cpp
@@ -154,10 +154,10 @@ struct DynamicDerivedMultiple;
 // CHECK-CTORS: call void @_ZN12DynamicBase2C2Ev(
 // CHECK-CTORS-NOT: @llvm.launder.invariant.group.p0
 
-// CHECK-CTORS: store {{.*}} @_ZTV22DynamicDerivedMultiple, i32 0, inrange i32 0, i32 2), {{.*}} %[[THIS0]]
+// CHECK-CTORS: store ptr getelementptr inbounds inrange(-16, 16) ({ [4 x ptr], [3 x ptr] }, ptr @_ZTV22DynamicDerivedMultiple, i32 0, i32 0, i32 2), ptr %[[THIS0]]
 // CHECK-CTORS: %[[THIS_ADD:.*]] = getelementptr inbounds i8, ptr %[[THIS0]], i64 16
 
-// CHECK-CTORS: store {{.*}} @_ZTV22DynamicDerivedMultiple, i32 0, inrange i32 1, i32 2), {{.*}} %[[THIS_ADD]]
+// CHECK-CTORS: store ptr getelementptr inbounds inrange(-16, 8) ({ [4 x ptr], [3 x ptr] }, ptr @_ZTV22DynamicDerivedMultiple, i32 0, i32 1, i32 2), ptr %[[THIS_ADD]]
 // CHECK-CTORS-LABEL: {{^}}}
 
 struct DynamicFromStatic;
diff --git a/clang/test/CodeGenCXX/visibility.cpp b/clang/test/CodeGenCXX/visibility.cpp
index e307b8747c8477..e1061f3dbd18f8 100644
--- a/clang/test/CodeGenCXX/visibility.cpp
+++ b/clang/test/CodeGenCXX/visibility.cpp
@@ -207,7 +207,7 @@ namespace test27 {
 // CHECK-HIDDEN: @_ZGVZN6test681fC1EvE4test = linkonce_odr hidden global
 
 // CHECK-HIDDEN: @_ZTVN6test701DE = linkonce_odr hidden unnamed_addr constant { [3 x ptr] } { [3 x ptr] [ptr null, ptr null, ptr @_ZTIN6test701DE] }, align 8
-// CHECK-HIDDEN: @_ZTTN6test701DE = linkonce_odr hidden unnamed_addr constant [1 x ptr] [ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTVN6test701DE, i32 0, inrange i32 0, i32 3)], align 8
+// CHECK-HIDDEN: @_ZTTN6test701DE = linkonce_odr hidden unnamed_addr constant [1 x ptr] [ptr getelementptr inbounds inrange(-24, 0) ({ [3 x ptr] }, ptr @_ZTVN6test701DE, i32 0, i32 0, i32 3)], align 8
 
 // CHECK: @_ZZN6Test193fooIiEEvvE1a = linkonce_odr global
 // CHECK-HIDDEN: @_ZZN6Test193fooIiEEvvE1a = linkonce_odr hidden global
diff --git a/clang/test/CodeGenCXX/vtable-assume-load-address-space.cpp b/clang/test/CodeGenCXX/vtable-assume-load-address-space.cpp
index 251d12bbb62f30..d765fe94d9b084 100644
--- a/clang/test/CodeGenCXX/vtable-assume-load-address-space.cpp
+++ b/clang/test/CodeGenCXX/vtable-assume-load-address-space.cpp
@@ -25,7 +25,7 @@ void g(A *a) { a->foo(); }
 // CHECK1-LABEL: define{{.*}} void @_ZN5test14fooAEv()
 // CHECK1: call void @_ZN5test11AC1Ev(ptr
 // CHECK1: %[[VTABLE:.*]] = load ptr addrspace(1), ptr %{{.*}}
-// CHECK1: %[[CMP:.*]] = icmp eq ptr addrspace(1) %[[VTABLE]], getelementptr inbounds ({ [3 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTVN5test11AE, i32 0, inrange i32 0, i32 2)
+// CHECK1: %[[CMP:.*]] = icmp eq ptr addrspace(1) %[[VTABLE]], getelementptr inbounds inrange(-16, 8) ({ [3 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTVN5test11AE, i32 0, i32 0, i32 2)
 // CHECK1: call void @llvm.assume(i1 %[[CMP]])
 // CHECK1-LABEL: {{^}}}
 
@@ -37,7 +37,7 @@ void fooA() {
 // CHECK1-LABEL: define{{.*}} void @_ZN5test14fooBEv()
 // CHECK1: call void @_ZN5test11BC1Ev(ptr {{[^,]*}} %{{.*}})
 // CHECK1: %[[VTABLE:.*]] = load ptr addrspace(1), ptr %{{.*}}
-// CHECK1: %[[CMP:.*]] = icmp eq ptr addrspace(1) %[[VTABLE]], getelementptr inbounds ({ [3 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTVN5test11BE, i32 0, inrange i32 0, i32 2)
+// CHECK1: %[[CMP:.*]] = icmp eq ptr addrspace(1) %[[VTABLE]], getelementptr inbounds inrange(-16, 8) ({ [3 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTVN5test11BE, i32 0, i32 0, i32 2)
 // CHECK1: call void @llvm.assume(i1 %[[CMP]])
 // CHECK1-LABEL: {{^}}}
 
@@ -71,12 +71,12 @@ void h(B *b) { b->bar(); }
 // CHECK2-LABEL: define{{.*}} void @_ZN5test24testEv()
 // CHECK2: call void @_ZN5test21CC1Ev(ptr
 // CHECK2: %[[VTABLE:.*]] = load ptr addrspace(1), ptr {{.*}}
-// CHECK2: %[[CMP:.*]] = icmp eq ptr addrspace(1) %[[VTABLE]], getelementptr inbounds ({ [3 x ptr addrspace(1)], [3 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTVN5test21CE, i32 0, inrange i32 0, i32 2)
+// CHECK2: %[[CMP:.*]] = icmp eq ptr addrspace(1) %[[VTABLE]], getelementptr inbounds inrange(-16, 8) ({ [3 x ptr addrspace(1)], [3 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTVN5test21CE, i32 0, i32 0, i32 2)
 // CHECK2: call void @llvm.assume(i1 %[[CMP]])
 
 // CHECK2: %[[ADD_PTR:.*]] = getelementptr inbounds i8, ptr %{{.*}}, i64 8
 // CHECK2: %[[VTABLE2:.*]] = load ptr addrspace(1), ptr %[[ADD_PTR]]
-// CHECK2: %[[CMP2:.*]] = icmp eq ptr addrspace(1) %[[VTABLE2]], getelementptr inbounds ({ [3 x ptr addrspace(1)], [3 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTVN5test21CE, i32 0, inrange i32 1, i32 2)
+// CHECK2: %[[CMP2:.*]] = icmp eq ptr addrspace(1) %[[VTABLE2]], getelementptr inbounds inrange(-16, 8) ({ [3 x ptr addrspace(1)], [3 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTVN5test21CE, i32 0, i32 1, i32 2)
 // CHECK2: call void @llvm.assume(i1 %[[CMP2]])
 
 // CHECK2: call void @_ZN5test21gEPNS_1AE(
@@ -107,7 +107,7 @@ void g(B *a) { a->foo(); }
 
 // CHECK3-LABEL: define{{.*}} void @_ZN5test34testEv()
 // CHECK3: call void @_ZN5test31CC1Ev(ptr
-// CHECK3: %[[CMP:.*]] = icmp eq ptr addrspace(1) %{{.*}}, getelementptr inbounds ({ [4 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTVN5test31CE, i32 0, inrange i32 0, i32 3)
+// CHECK3: %[[CMP:.*]] = icmp eq ptr addrspace(1) %{{.*}}, getelementptr inbounds inrange(-24, 8) ({ [4 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTVN5test31CE, i32 0, i32 0, i32 3)
 // CHECK3: call void @llvm.assume(i1 %[[CMP]])
 // CHECK3-LABLEL: }
 void test() {
@@ -136,11 +136,11 @@ void g(C *c) { c->foo(); }
 // CHECK4-LABEL: define{{.*}} void @_ZN5test44testEv()
 // CHECK4: call void @_ZN5test41CC1Ev(ptr
 // CHECK4: %[[VTABLE:.*]] = load ptr addrspace(1), ptr %{{.*}}
-// CHECK4: %[[CMP:.*]] = icmp eq ptr addrspace(1) %[[VTABLE]], getelementptr inbounds ({ [5 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTVN5test41CE, i32 0, inrange i32 0, i32 4)
+// CHECK4: %[[CMP:.*]] = icmp eq ptr addrspace(1) %[[VTABLE]], getelementptr inbounds inrange(-32, 8) ({ [5 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTVN5test41CE, i32 0, i32 0, i32 4)
 // CHECK4: call void @llvm.assume(i1 %[[CMP]]
 
 // CHECK4: %[[VTABLE2:.*]] = load ptr addrspace(1), ptr %{{.*}}
-// CHECK4: %[[CMP2:.*]] = icmp eq ptr addrspace(1) %[[VTABLE2]], getelementptr inbounds ({ [5 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTVN5test41CE, i32 0, inrange i32 0, i32 4)
+// CHECK4: %[[CMP2:.*]] = icmp eq ptr addrspace(1) %[[VTABLE2]], getelementptr inbounds inrange(-32, 8) ({ [5 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTVN5test41CE, i32 0, i32 0, i32 4)
 // CHECK4: call void @llvm.assume(i1 %[[CMP2]])
 // CHECK4-LABEL: {{^}}}
 
diff --git a/clang/test/CodeGenCXX/vtable-assume-load.cpp b/clang/test/CodeGenCXX/vtable-assume-load.cpp
index bd9e4fc62520d8..6ce07d0db1b15a 100644
--- a/clang/test/CodeGenCXX/vtable-assume-load.cpp
+++ b/clang/test/CodeGenCXX/vtable-assume-load.cpp
@@ -27,7 +27,7 @@ void g(A *a) { a->foo(); }
 // CHECK1-LABEL: define{{.*}} void @_ZN5test14fooAEv()
 // CHECK1: call void @_ZN5test11AC1Ev(ptr
 // CHECK1: %[[VTABLE:.*]] = load ptr, ptr %{{.*}}
-// CHECK1: %[[CMP:.*]] = icmp eq ptr %[[VTABLE]], getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTVN5test11AE, i32 0, inrange i32 0, i32 2)
+// CHECK1: %[[CMP:.*]] = icmp eq ptr %[[VTABLE]], getelementptr inbounds inrange(-16, 8) ({ [3 x ptr] }, ptr @_ZTVN5test11AE, i32 0, i32 0, i32 2)
 // CHECK1: call void @llvm.assume(i1 %[[CMP]])
 // CHECK1-LABEL: {{^}}}
 
@@ -39,7 +39,7 @@ void fooA() {
 // CHECK1-LABEL: define{{.*}} void @_ZN5test14fooBEv()
 // CHECK1: call void @_ZN5test11BC1Ev(ptr {{[^,]*}} %{{.*}})
 // CHECK1: %[[VTABLE:.*]] = load ptr, ptr %{{.*}}
-// CHECK1: %[[CMP:.*]] = icmp eq ptr %[[VTABLE]], getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTVN5test11BE, i32 0, inrange i32 0, i32 2)
+// CHECK1: %[[CMP:.*]] = icmp eq ptr %[[VTABLE]], getelementptr inbounds inrange(-16, 8) ({ [3 x ptr] }, ptr @_ZTVN5test11BE, i32 0, i32 0, i32 2)
 // CHECK1: call void @llvm.assume(i1 %[[CMP]])
 // CHECK1-LABEL: {{^}}}
 
@@ -73,12 +73,12 @@ void h(B *b) { b->bar(); }
 // CHECK2-LABEL: define{{.*}} void @_ZN5test24testEv()
 // CHECK2: call void @_ZN5test21CC1Ev(ptr
 // CHECK2: %[[VTABLE:.*]] = load ptr, ptr {{.*}}
-// CHECK2: %[[CMP:.*]] = icmp eq ptr %[[VTABLE]], getelementptr inbounds ({ [3 x ptr], [3 x ptr] }, ptr @_ZTVN5test21CE, i32 0, inrange i32 0, i32 2)
+// CHECK2: %[[CMP:.*]] = icmp eq ptr %[[VTABLE]], getelementptr inbounds inrange(-16, 8) ({ [3 x ptr], [3 x ptr] }, ptr @_ZTVN5test21CE, i32 0, i32 0, i32 2)
 // CHECK2: call void @llvm.assume(i1 %[[CMP]])
 
 // CHECK2: %[[ADD_PTR:.*]] = getelementptr inbounds i8, ptr %{{.*}}, i64 8
 // CHECK2: %[[VTABLE2:.*]] = load ptr, ptr %[[ADD_PTR]]
-// CHECK2: %[[CMP2:.*]] = icmp eq ptr %[[VTABLE2]], getelementptr inbounds ({ [3 x ptr], [3 x ptr] }, ptr @_ZTVN5test21CE, i32 0, inrange i32 1, i32 2)
+// CHECK2: %[[CMP2:.*]] = icmp eq ptr %[[VTABLE2]], getelementptr inbounds inrange(-16, 8) ({ [3 x ptr], [3 x ptr] }, ptr @_ZTVN5test21CE, i32 0, i32 1, i32 2)
 // CHECK2: call void @llvm.assume(i1 %[[CMP2]])
 
 // CHECK2: call void @_ZN5test21gEPNS_1AE(
@@ -109,7 +109,7 @@ void g(B *a) { a->foo(); }
 
 // CHECK3-LABEL: define{{.*}} void @_ZN5test34testEv()
 // CHECK3: call void @_ZN5test31CC1Ev(ptr
-// CHECK3: %[[CMP:.*]] = icmp eq ptr %{{.*}}, getelementptr inbounds ({ [4 x ptr] }, ptr @_ZTVN5test31CE, i32 0, inrange i32 0, i32 3)
+// CHECK3: %[[CMP:.*]] = icmp eq ptr %{{.*}}, getelementptr inbounds inrange(-24, 8) ({ [4 x ptr] }, ptr @_ZTVN5test31CE, i32 0, i32 0, i32 3)
 // CHECK3: call void @llvm.assume(i1 %[[CMP]])
 // CHECK3-LABLEL: }
 void test() {
@@ -138,11 +138,11 @@ void g(C *c) { c->foo(); }
 // CHECK4-LABEL: define{{.*}} void @_ZN5test44testEv()
 // CHECK4: call void @_ZN5test41CC1Ev(ptr
 // CHECK4: %[[VTABLE:.*]] = load ptr, ptr %{{.*}}
-// CHECK4: %[[CMP:.*]] = icmp eq ptr %[[VTABLE]], getelementptr inbounds ({ [5 x ptr] }, ptr @_ZTVN5test41CE, i32 0, inrange i32 0, i32 4)
+// CHECK4: %[[CMP:.*]] = icmp eq ptr %[[VTABLE]], getelementptr inbounds inrange(-32, 8) ({ [5 x ptr] }, ptr @_ZTVN5test41CE, i32 0, i32 0, i32 4)
 // CHECK4: call void @llvm.assume(i1 %[[CMP]]
 
 // CHECK4: %[[VTABLE2:.*]] = load ptr, ptr %{{.*}}
-// CHECK4: %[[CMP2:.*]] = icmp eq ptr %[[VTABLE2]], getelementptr inbounds ({ [5 x ptr] }, ptr @_ZTVN5test41CE, i32 0, inrange i32 0, i32 4)
+// CHECK4: %[[CMP2:.*]] = icmp eq ptr %[[VTABLE2]], getelementptr inbounds inrange(-32, 8) ({ [5 x ptr] }, ptr @_ZTVN5test41CE, i32 0, i32 0, i32 4)
 // CHECK4: call void @llvm.assume(i1 %[[CMP2]])
 // CHECK4-LABEL: {{^}}}
 
diff --git a/clang/test/CodeGenCXX/vtable-pointer-initialization-address-space.cpp b/clang/test/CodeGenCXX/vtable-pointer-initialization-address-space.cpp
index 247864862fecf8..a3f12f0ebfc87b 100644
--- a/clang/test/CodeGenCXX/vtable-pointer-initialization-address-space.cpp
+++ b/clang/test/CodeGenCXX/vtable-pointer-initialization-address-space.cpp
@@ -21,13 +21,13 @@ struct A : Base {
 
 // CHECK-LABEL: define{{.*}} void @_ZN1AC2Ev(ptr {{[^,]*}} %this) unnamed_addr
 // CHECK: call void @_ZN4BaseC2Ev(
-// CHECK: store ptr addrspace(1) getelementptr inbounds ({ [3 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTV1A, i32 0, inrange i32 0, i32 2)
+// CHECK: store ptr addrspace(1) getelementptr inbounds inrange(-16, 8) ({ [3 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTV1A, i32 0, i32 0, i32 2)
 // CHECK: call void @_ZN5FieldC1Ev(
 // CHECK: ret void
 A::A() { }
 
 // CHECK-LABEL: define{{.*}} void @_ZN1AD2Ev(ptr {{[^,]*}} %this) unnamed_addr
-// CHECK: store ptr addrspace(1) getelementptr inbounds ({ [3 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTV1A, i32 0, inrange i32 0, i32 2)
+// CHECK: store ptr addrspace(1) getelementptr inbounds inrange(-16, 8) ({ [3 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTV1A, i32 0, i32 0, i32 2)
 // CHECK: call void @_ZN5FieldD1Ev(
 // CHECK: call void @_ZN4BaseD2Ev(
 // CHECK: ret void
@@ -49,12 +49,12 @@ void f() { B b; }
 
 // CHECK-LABEL: define linkonce_odr void @_ZN1BC2Ev(ptr {{[^,]*}} %this) unnamed_addr
 // CHECK: call void @_ZN4BaseC2Ev(
-// CHECK: store ptr addrspace(1) getelementptr inbounds ({ [3 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTV1B, i32 0, inrange i32 0, i32 2)
+// CHECK: store ptr addrspace(1) getelementptr inbounds inrange(-16, 8) ({ [3 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTV1B, i32 0, i32 0, i32 2)
 // CHECK: call void @_ZN5FieldC1Ev
 // CHECK: ret void
 
 // CHECK-LABEL: define linkonce_odr void @_ZN1BD2Ev(ptr {{[^,]*}} %this) unnamed_addr
-// CHECK: store ptr addrspace(1) getelementptr inbounds ({ [3 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTV1B, i32 0, inrange i32 0, i32 2)
+// CHECK: store ptr addrspace(1) getelementptr inbounds inrange(-16, 8) ({ [3 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTV1B, i32 0, i32 0, i32 2)
 // CHECK: call void @_ZN5FieldD1Ev(
 // CHECK: call void @_ZN4BaseD2Ev(
 // CHECK: ret void
diff --git a/clang/test/CodeGenCXX/vtable-pointer-initialization.cpp b/clang/test/CodeGenCXX/vtable-pointer-initialization.cpp
index 00eda6233d940b..16b180705fa6cb 100644
--- a/clang/test/CodeGenCXX/vtable-pointer-initialization.cpp
+++ b/clang/test/CodeGenCXX/vtable-pointer-initialization.cpp
@@ -21,13 +21,13 @@ struct A : Base {
 
 // CHECK-LABEL: define{{.*}} void @_ZN1AC2Ev(ptr {{[^,]*}} %this) unnamed_addr
 // CHECK: call void @_ZN4BaseC2Ev(
-// CHECK: store ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTV1A, i32 0, inrange i32 0, i32 2)
+// CHECK: store ptr getelementptr inbounds inrange(-16, 8) ({ [3 x ptr] }, ptr @_ZTV1A, i32 0, i32 0, i32 2)
 // CHECK: call void @_ZN5FieldC1Ev(
 // CHECK: ret void
 A::A() { }
 
 // CHECK-LABEL: define{{.*}} void @_ZN1AD2Ev(ptr {{[^,]*}} %this) unnamed_addr
-// CHECK: store ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTV1A, i32 0, inrange i32 0, i32 2)
+// CHECK: store ptr getelementptr inbounds inrange(-16, 8) ({ [3 x ptr] }, ptr @_ZTV1A, i32 0, i32 0, i32 2)
 // CHECK: call void @_ZN5FieldD1Ev(
 // CHECK: call void @_ZN4BaseD2Ev(
 // CHECK: ret void
@@ -49,12 +49,12 @@ void f() { B b; }
 
 // CHECK-LABEL: define linkonce_odr void @_ZN1BC2Ev(ptr {{[^,]*}} %this) unnamed_addr
 // CHECK: call void @_ZN4BaseC2Ev(
-// CHECK: store ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTV1B, i32 0, inrange i32 0, i32 2)
+// CHECK: store ptr getelementptr inbounds inrange(-16, 8) ({ [3 x ptr] }, ptr @_ZTV1B, i32 0, i32 0, i32 2)
 // CHECK: call void @_ZN5FieldC1Ev
 // CHECK: ret void
 
 // CHECK-LABEL: define linkonce_odr void @_ZN1BD2Ev(ptr {{[^,]*}} %this) unnamed_addr
-// CHECK: store ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTV1B, i32 0, inrange i32 0, i32 2)
+// CHECK: store ptr getelementptr inbounds inrange(-16, 8) ({ [3 x ptr] }, ptr @_ZTV1B, i32 0, i32 0, i32 2)
 // CHECK: call void @_ZN5FieldD1Ev(
 // CHECK: call void @_ZN4BaseD2Ev(
 // CHECK: ret void
diff --git a/clang/test/CodeGenCXX/vtt-address-space.cpp b/clang/test/CodeGenCXX/vtt-address-space.cpp
index e567ae49811a47..24f4e2a755da04 100644
--- a/clang/test/CodeGenCXX/vtt-address-space.cpp
+++ b/clang/test/CodeGenCXX/vtt-address-space.cpp
@@ -18,7 +18,7 @@ namespace Test {
   D d;
 }
 
-// CHECK: linkonce_odr unnamed_addr addrspace(1) constant [13 x ptr addrspace(1)] [ptr addrspace(1) getelementptr inbounds ({ [5 x ptr addrspace(1)], [7 x ptr addrspace(1)], [4 x ptr addrspace(1)], [3 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTVN4Test1DE, i32 0, inrange i32 0, i32 5), ptr addrspace(1) getelementptr inbounds ({ [3 x ptr addrspace(1)], [4 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTCN4Test1DE0_NS_2C1E, i32 0, inrange i32 0, i32 3), ptr addrspace(1) getelementptr inbounds ({ [3 x ptr addrspace(1)], [4 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTCN4Test1DE0_NS_2C1E, i32 0, inrange i32 1, i32 3), ptr addrspace(1) getelementptr inbounds ({ [7 x ptr addrspace(1)], [3 x ptr addrspace(1)], [4 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTCN4Test1DE16_NS_2C2E, i32 0, inrange i32 0, i32 6), ptr addrspace(1) getelementptr inbounds ({ [7 x ptr addrspace(1)], [3 x ptr addrspace(1)], [4 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTCN4Test1DE16_NS_2C2E, i32 0, inrange i32 0, i32 6), ptr addrspace(1) getelementptr inbounds ({ [7 x ptr addrspace(1)], [3 x ptr addrspace(1)], [4 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTCN4Test1DE16_NS_2C2E, i32 0, inrange i32 1, i32 3), ptr addrspace(1) getelementptr inbounds ({ [7 x ptr addrspace(1)], [3 x ptr addrspace(1)], [4 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTCN4Test1DE16_NS_2C2E, i32 0, inrange i32 2, i32 3), ptr addrspace(1) getelementptr inbounds ({ [5 x ptr addrspace(1)], [7 x ptr addrspace(1)], [4 x ptr addrspace(1)], [3 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTVN4Test1DE, i32 0, inrange i32 2, i32 3), ptr addrspace(1) getelementptr inbounds ({ [5 x ptr addrspace(1)], [7 x ptr addrspace(1)], [4 x ptr addrspace(1)], [3 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTVN4Test1DE, i32 0, inrange i32 1, i32 6), ptr addrspace(1) getelementptr inbounds ({ [5 x ptr addrspace(1)], [7 x ptr addrspace(1)], [4 x ptr addrspace(1)], [3 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTVN4Test1DE, i32 0, inrange i32 1, i32 6), ptr addrspace(1) getelementptr inbounds ({ [5 x ptr addrspace(1)], [7 x ptr addrspace(1)], [4 x ptr addrspace(1)], [3 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTVN4Test1DE, i32 0, inrange i32 3, i32 3), ptr addrspace(1) getelementptr inbounds ({ [3 x ptr addrspace(1)], [4 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTCN4Test1DE64_NS_2V2E, i32 0, inrange i32 0, i32 3), ptr addrspace(1) getelementptr inbounds ({ [3 x ptr addrspace(1)], [4 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTCN4Test1DE64_NS_2V2E, i32 0, inrange i32 1, i32 3)], comdat, align 8
+// CHECK: linkonce_odr unnamed_addr addrspace(1) constant [13 x ptr addrspace(1)] [ptr addrspace(1) getelementptr inbounds inrange(-40, 0) ({ [5 x ptr addrspace(1)], [7 x ptr addrspace(1)], [4 x ptr addrspace(1)], [3 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTVN4Test1DE, i32 0, i32 0, i32 5), ptr addrspace(1) getelementptr inbounds inrange(-24, 0) ({ [3 x ptr addrspace(1)], [4 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTCN4Test1DE0_NS_2C1E, i32 0, i32 0, i32 3), ptr addrspace(1) getelementptr inbounds inrange(-24, 8) ({ [3 x ptr addrspace(1)], [4 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTCN4Test1DE0_NS_2C1E, i32 0, i32 1, i32 3), ptr addrspace(1) getelementptr inbounds inrange(-48, 8) ({ [7 x ptr addrspace(1)], [3 x ptr addrspace(1)], [4 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTCN4Test1DE16_NS_2C2E, i32 0, i32 0, i32 6), ptr addrspace(1) getelementptr inbounds inrange(-48, 8) ({ [7 x ptr addrspace(1)], [3 x ptr addrspace(1)], [4 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTCN4Test1DE16_NS_2C2E, i32 0, i32 0, i32 6), ptr addrspace(1) getelementptr inbounds inrange(-24, 0) ({ [7 x ptr addrspace(1)], [3 x ptr addrspace(1)], [4 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTCN4Test1DE16_NS_2C2E, i32 0, i32 1, i32 3), ptr addrspace(1) getelementptr inbounds inrange(-24, 8) ({ [7 x ptr addrspace(1)], [3 x ptr addrspace(1)], [4 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTCN4Test1DE16_NS_2C2E, i32 0, i32 2, i32 3), ptr addrspace(1) getelementptr inbounds inrange(-24, 8) ({ [5 x ptr addrspace(1)], [7 x ptr addrspace(1)], [4 x ptr addrspace(1)], [3 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTVN4Test1DE, i32 0, i32 2, i32 3), ptr addrspace(1) getelementptr inbounds inrange(-48, 8) ({ [5 x ptr addrspace(1)], [7 x ptr addrspace(1)], [4 x ptr addrspace(1)], [3 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTVN4Test1DE, i32 0, i32 1, i32 6), ptr addrspace(1) getelementptr inbounds inrange(-48, 8) ({ [5 x ptr addrspace(1)], [7 x ptr addrspace(1)], [4 x ptr addrspace(1)], [3 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTVN4Test1DE, i32 0, i32 1, i32 6), ptr addrspace(1) getelementptr inbounds inrange(-24, 0) ({ [5 x ptr addrspace(1)], [7 x ptr addrspace(1)], [4 x ptr addrspace(1)], [3 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTVN4Test1DE, i32 0, i32 3, i32 3), ptr addrspace(1) getelementptr inbounds inrange(-24, 0) ({ [3 x ptr addrspace(1)], [4 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTCN4Test1DE64_NS_2V2E, i32 0, i32 0, i32 3), ptr addrspace(1) getelementptr inbounds inrange(-24, 8) ({ [3 x ptr addrspace(1)], [4 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTCN4Test1DE64_NS_2V2E, i32 0, i32 1, i32 3)], comdat, align 8
 // CHECK: call void @_ZN4Test2V2C2Ev(ptr noundef nonnull align 8 dereferenceable(20) %2, ptr addrspace(1) noundef getelementptr inbounds ([13 x ptr addrspace(1)], ptr addrspace(1) @_ZTTN4Test1DE, i64 0, i64 11))
 // CHECK: call void @_ZN4Test2C1C2Ev(ptr noundef nonnull align 8 dereferenceable(12) %this1, ptr addrspace(1) noundef getelementptr inbounds ([13 x ptr addrspace(1)], ptr addrspace(1) @_ZTTN4Test1DE, i64 0, i64 1))
 // CHECK: call void @_ZN4Test2C2C2Ev(ptr noundef nonnull align 8 dereferenceable(12) %3, ptr addrspace(1) noundef getelementptr inbounds ([13 x ptr addrspace(1)], ptr addrspace(1) @_ZTTN4Test1DE, i64 0, i64 3))
diff --git a/clang/test/CodeGenCXX/vtt-layout-address-space.cpp b/clang/test/CodeGenCXX/vtt-layout-address-space.cpp
index 2f80c9ec5f9fe1..409efb9be0215e 100644
--- a/clang/test/CodeGenCXX/vtt-layout-address-space.cpp
+++ b/clang/test/CodeGenCXX/vtt-layout-address-space.cpp
@@ -78,12 +78,12 @@ namespace Test6 {
   }
 }
 
-// CHECK: @_ZTTN5Test11BE ={{.*}} unnamed_addr addrspace(1) constant [1 x ptr addrspace(1)] [ptr addrspace(1) getelementptr inbounds ({ [4 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTVN5Test11BE, i32 0, inrange i32 0, i32 3)]
+// CHECK: @_ZTTN5Test11BE ={{.*}} unnamed_addr addrspace(1) constant [1 x ptr addrspace(1)] [ptr addrspace(1) getelementptr inbounds inrange(-24, 8) ({ [4 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTVN5Test11BE, i32 0, i32 0, i32 3)]
 // CHECK: @_ZTVN5Test51AE ={{.*}} unnamed_addr addrspace(1) constant { [4 x ptr addrspace(1)] } { [4 x ptr addrspace(1)] [ptr addrspace(1) null, ptr addrspace(1) @_ZTIN5Test51AE, ptr addrspace(1) addrspacecast (ptr @__cxa_pure_virtual to ptr addrspace(1)), ptr addrspace(1) addrspacecast (ptr @_ZN5Test51A6anchorEv to ptr addrspace(1))] }
 // CHECK: @_ZTVN5Test61AE ={{.*}} unnamed_addr addrspace(1) constant { [4 x ptr addrspace(1)] } { [4 x ptr addrspace(1)] [ptr addrspace(1) null, ptr addrspace(1) @_ZTIN5Test61AE, ptr addrspace(1) addrspacecast (ptr @__cxa_deleted_virtual to ptr addrspace(1)), ptr addrspace(1) addrspacecast (ptr @_ZN5Test61A6anchorEv to ptr addrspace(1))] }
-// CHECK: @_ZTTN5Test21CE = linkonce_odr unnamed_addr addrspace(1) constant [2 x ptr addrspace(1)] [ptr addrspace(1) getelementptr inbounds ({ [5 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTVN5Test21CE, i32 0, inrange i32 0, i32 4), ptr addrspace(1) getelementptr inbounds ({ [5 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTVN5Test21CE, i32 0, inrange i32 0, i32 4)]
-// CHECK: @_ZTTN5Test31DE = linkonce_odr unnamed_addr addrspace(1) constant [13 x ptr addrspace(1)] [ptr addrspace(1) getelementptr inbounds ({ [5 x ptr addrspace(1)], [7 x ptr addrspace(1)], [4 x ptr addrspace(1)], [3 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTVN5Test31DE, i32 0, inrange i32 0, i32 5), ptr addrspace(1) getelementptr inbounds ({ [3 x ptr addrspace(1)], [4 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTCN5Test31DE0_NS_2C1E, i32 0, inrange i32 0, i32 3), ptr addrspace(1) getelementptr inbounds ({ [3 x ptr addrspace(1)], [4 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTCN5Test31DE0_NS_2C1E, i32 0, inrange i32 1, i32 3), ptr addrspace(1) getelementptr inbounds ({ [7 x ptr addrspace(1)], [3 x ptr addrspace(1)], [4 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTCN5Test31DE16_NS_2C2E, i32 0, inrange i32 0, i32 6), ptr addrspace(1) getelementptr inbounds ({ [7 x ptr addrspace(1)], [3 x ptr addrspace(1)], [4 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTCN5Test31DE16_NS_2C2E, i32 0, inrange i32 0, i32 6), ptr addrspace(1) getelementptr inbounds ({ [7 x ptr addrspace(1)], [3 x ptr addrspace(1)], [4 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTCN5Test31DE16_NS_2C2E, i32 0, inrange i32 1, i32 3), ptr addrspace(1) getelementptr inbounds ({ [7 x ptr addrspace(1)], [3 x ptr addrspace(1)], [4 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTCN5Test31DE16_NS_2C2E, i32 0, inrange i32 2, i32 3), ptr addrspace(1) getelementptr inbounds ({ [5 x ptr addrspace(1)], [7 x ptr addrspace(1)], [4 x ptr addrspace(1)], [3 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTVN5Test31DE, i32 0, inrange i32 2, i32 3), ptr addrspace(1) getelementptr inbounds ({ [5 x ptr addrspace(1)], [7 x ptr addrspace(1)], [4 x ptr addrspace(1)], [3 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTVN5Test31DE, i32 0, inrange i32 1, i32 6), ptr addrspace(1) getelementptr inbounds ({ [5 x ptr addrspace(1)], [7 x ptr addrspace(1)], [4 x ptr addrspace(1)], [3 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTVN5Test31DE, i32 0, inrange i32 1, i32 6), ptr addrspace(1) getelementptr inbounds ({ [5 x ptr addrspace(1)], [7 x ptr addrspace(1)], [4 x ptr addrspace(1)], [3 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTVN5Test31DE, i32 0, inrange i32 3, i32 3), ptr addrspace(1) getelementptr inbounds ({ [3 x ptr addrspace(1)], [4 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTCN5Test31DE64_NS_2V2E, i32 0, inrange i32 0, i32 3), ptr addrspace(1) getelementptr inbounds ({ [3 x ptr addrspace(1)], [4 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTCN5Test31DE64_NS_2V2E, i32 0, inrange i32 1, i32 3)]
+// CHECK: @_ZTTN5Test21CE = linkonce_odr unnamed_addr addrspace(1) constant [2 x ptr addrspace(1)] [ptr addrspace(1) getelementptr inbounds inrange(-32, 8) ({ [5 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTVN5Test21CE, i32 0, i32 0, i32 4), ptr addrspace(1) getelementptr inbounds inrange(-32, 8) ({ [5 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTVN5Test21CE, i32 0, i32 0, i32 4)]
+// CHECK: @_ZTTN5Test31DE = linkonce_odr unnamed_addr addrspace(1) constant [13 x ptr addrspace(1)] [ptr addrspace(1) getelementptr inbounds inrange(-40, 0) ({ [5 x ptr addrspace(1)], [7 x ptr addrspace(1)], [4 x ptr addrspace(1)], [3 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTVN5Test31DE, i32 0, i32 0, i32 5), ptr addrspace(1) getelementptr inbounds inrange(-24, 0) ({ [3 x ptr addrspace(1)], [4 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTCN5Test31DE0_NS_2C1E, i32 0, i32 0, i32 3), ptr addrspace(1) getelementptr inbounds inrange(-24, 8) ({ [3 x ptr addrspace(1)], [4 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTCN5Test31DE0_NS_2C1E, i32 0, i32 1, i32 3), ptr addrspace(1) getelementptr inbounds inrange(-48, 8) ({ [7 x ptr addrspace(1)], [3 x ptr addrspace(1)], [4 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTCN5Test31DE16_NS_2C2E, i32 0, i32 0, i32 6), ptr addrspace(1) getelementptr inbounds inrange(-48, 8) ({ [7 x ptr addrspace(1)], [3 x ptr addrspace(1)], [4 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTCN5Test31DE16_NS_2C2E, i32 0, i32 0, i32 6), ptr addrspace(1) getelementptr inbounds inrange(-24, 0) ({ [7 x ptr addrspace(1)], [3 x ptr addrspace(1)], [4 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTCN5Test31DE16_NS_2C2E, i32 0, i32 1, i32 3), ptr addrspace(1) getelementptr inbounds inrange(-24, 8) ({ [7 x ptr addrspace(1)], [3 x ptr addrspace(1)], [4 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTCN5Test31DE16_NS_2C2E, i32 0, i32 2, i32 3), ptr addrspace(1) getelementptr inbounds inrange(-24, 8) ({ [5 x ptr addrspace(1)], [7 x ptr addrspace(1)], [4 x ptr addrspace(1)], [3 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTVN5Test31DE, i32 0, i32 2, i32 3), ptr addrspace(1) getelementptr inbounds inrange(-48, 8) ({ [5 x ptr addrspace(1)], [7 x ptr addrspace(1)], [4 x ptr addrspace(1)], [3 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTVN5Test31DE, i32 0, i32 1, i32 6), ptr addrspace(1) getelementptr inbounds inrange(-48, 8) ({ [5 x ptr addrspace(1)], [7 x ptr addrspace(1)], [4 x ptr addrspace(1)], [3 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTVN5Test31DE, i32 0, i32 1, i32 6), ptr addrspace(1) getelementptr inbounds inrange(-24, 0) ({ [5 x ptr addrspace(1)], [7 x ptr addrspace(1)], [4 x ptr addrspace(1)], [3 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTVN5Test31DE, i32 0, i32 3, i32 3), ptr addrspace(1) getelementptr inbounds inrange(-24, 0) ({ [3 x ptr addrspace(1)], [4 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTCN5Test31DE64_NS_2V2E, i32 0, i32 0, i32 3), ptr addrspace(1) getelementptr inbounds inrange(-24, 8) ({ [3 x ptr addrspace(1)], [4 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTCN5Test31DE64_NS_2V2E, i32 0, i32 1, i32 3)]
 // CHECK: @_ZTVN5Test41DE = linkonce_odr unnamed_addr addrspace(1) constant { [6 x ptr addrspace(1)], [8 x ptr addrspace(1)], [3 x ptr addrspace(1)], [4 x ptr addrspace(1)], [4 x ptr addrspace(1)] } { [6 x ptr addrspace(1)] [ptr addrspace(1) inttoptr (i64 72 to ptr addrspace(1)), ptr addrspace(1) inttoptr (i64 16 to ptr addrspace(1)), ptr addrspace(1) inttoptr (i64 56 to ptr addrspace(1)), ptr addrspace(1) inttoptr (i64 40 to ptr addrspace(1)), ptr addrspace(1) null, ptr addrspace(1) @_ZTIN5Test41DE], [8 x ptr addrspace(1)] [ptr addrspace(1) inttoptr (i64 40 to ptr addrspace(1)), ptr addrspace(1) inttoptr (i64 24 to ptr addrspace(1)), ptr addrspace(1) inttoptr (i64 56 to ptr addrspace(1)), ptr addrspace(1) null, ptr addrspace(1) null, ptr addrspace(1) inttoptr (i64 -16 to ptr addrspace(1)), ptr addrspace(1) @_ZTIN5Test41DE, ptr addrspace(1) addrspacecast (ptr @_ZN5Test42V31gEv to ptr addrspace(1))], [3 x ptr addrspace(1)] [ptr addrspace(1) inttoptr (i64 16 to ptr addrspace(1)), ptr addrspace(1) inttoptr (i64 -40 to ptr addrspace(1)), ptr addrspace(1) @_ZTIN5Test41DE], [4 x ptr addrspace(1)] [ptr addrspace(1) null, ptr addrspace(1) inttoptr (i64 -56 to ptr addrspace(1)), ptr addrspace(1) @_ZTIN5Test41DE, ptr addrspace(1) addrspacecast (ptr @_ZN5Test42A21fEv to ptr addrspace(1))], [4 x ptr addrspace(1)] [ptr addrspace(1) inttoptr (i64 -16 to ptr addrspace(1)), ptr addrspace(1) inttoptr (i64 -32 to ptr addrspace(1)), ptr addrspace(1) inttoptr (i64 -72 to ptr addrspace(1)), ptr addrspace(1) @_ZTIN5Test41DE] }
-// CHECK: @_ZTTN5Test41DE = linkonce_odr unnamed_addr addrspace(1) constant [19 x ptr addrspace(1)] [ptr addrspace(1) getelementptr inbounds ({ [6 x ptr addrspace(1)], [8 x ptr addrspace(1)], [3 x ptr addrspace(1)], [4 x ptr addrspace(1)], [4 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTVN5Test41DE, i32 0, inrange i32 0, i32 6), ptr addrspace(1) getelementptr inbounds ({ [4 x ptr addrspace(1)], [3 x ptr addrspace(1)], [4 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTCN5Test41DE0_NS_2C1E, i32 0, inrange i32 0, i32 4), ptr addrspace(1) getelementptr inbounds ({ [4 x ptr addrspace(1)], [3 x ptr addrspace(1)], [4 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTCN5Test41DE0_NS_2C1E, i32 0, inrange i32 1, i32 3), ptr addrspace(1) getelementptr inbounds ({ [4 x ptr addrspace(1)], [3 x ptr addrspace(1)], [4 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTCN5Test41DE0_NS_2C1E, i32 0, inrange i32 2, i32 3), ptr addrspace(1) getelementptr inbounds ({ [8 x ptr addrspace(1)], [4 x ptr addrspace(1)], [3 x ptr addrspace(1)], [4 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTCN5Test41DE16_NS_2C2E, i32 0, inrange i32 0, i32 7), ptr addrspace(1) getelementptr inbounds ({ [8 x ptr addrspace(1)], [4 x ptr addrspace(1)], [3 x ptr addrspace(1)], [4 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTCN5Test41DE16_NS_2C2E, i32 0, inrange i32 0, i32 7), ptr addrspace(1) getelementptr inbounds ({ [8 x ptr addrspace(1)], [4 x ptr addrspace(1)], [3 x ptr addrspace(1)], [4 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTCN5Test41DE16_NS_2C2E, i32 0, inrange i32 1, i32 4), ptr addrspace(1) getelementptr inbounds ({ [8 x ptr addrspace(1)], [4 x ptr addrspace(1)], [3 x ptr addrspace(1)], [4 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTCN5Test41DE16_NS_2C2E, i32 0, inrange i32 2, i32 3), ptr addrspace(1) getelementptr inbounds ({ [8 x ptr addrspace(1)], [4 x ptr addrspace(1)], [3 x ptr addrspace(1)], [4 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTCN5Test41DE16_NS_2C2E, i32 0, inrange i32 3, i32 3), ptr addrspace(1) getelementptr inbounds ({ [6 x ptr addrspace(1)], [8 x ptr addrspace(1)], [3 x ptr addrspace(1)], [4 x ptr addrspace(1)], [4 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTVN5Test41DE, i32 0, inrange i32 2, i32 3), ptr addrspace(1) getelementptr inbounds ({ [6 x ptr addrspace(1)], [8 x ptr addrspace(1)], [3 x ptr addrspace(1)], [4 x ptr addrspace(1)], [4 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTVN5Test41DE, i32 0, inrange i32 3, i32 3), ptr addrspace(1) getelementptr inbounds ({ [6 x ptr addrspace(1)], [8 x ptr addrspace(1)], [3 x ptr addrspace(1)], [4 x ptr addrspace(1)], [4 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTVN5Test41DE, i32 0, inrange i32 1, i32 7), ptr addrspace(1) getelementptr inbounds ({ [6 x ptr addrspace(1)], [8 x ptr addrspace(1)], [3 x ptr addrspace(1)], [4 x ptr addrspace(1)], [4 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTVN5Test41DE, i32 0, inrange i32 1, i32 7), ptr addrspace(1) getelementptr inbounds ({ [6 x ptr addrspace(1)], [8 x ptr addrspace(1)], [3 x ptr addrspace(1)], [4 x ptr addrspace(1)], [4 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTVN5Test41DE, i32 0, inrange i32 4, i32 4), ptr addrspace(1) getelementptr inbounds ({ [3 x ptr addrspace(1)], [4 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTCN5Test41DE40_NS_2V1E, i32 0, inrange i32 0, i32 3), ptr addrspace(1) getelementptr inbounds ({ [3 x ptr addrspace(1)], [4 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTCN5Test41DE40_NS_2V1E, i32 0, inrange i32 1, i32 3), ptr addrspace(1) getelementptr inbounds ({ [4 x ptr addrspace(1)], [3 x ptr addrspace(1)], [4 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTCN5Test41DE72_NS_2V2E, i32 0, inrange i32 0, i32 4), ptr addrspace(1) getelementptr inbounds ({ [4 x ptr addrspace(1)], [3 x ptr addrspace(1)], [4 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTCN5Test41DE72_NS_2V2E, i32 0, inrange i32 1, i32 3), ptr addrspace(1) getelementptr inbounds ({ [4 x ptr addrspace(1)], [3 x ptr addrspace(1)], [4 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTCN5Test41DE72_NS_2V2E, i32 0, inrange i32 2, i32 3)]
+// CHECK: @_ZTTN5Test41DE = linkonce_odr unnamed_addr addrspace(1) constant [19 x ptr addrspace(1)] [ptr addrspace(1) getelementptr inbounds inrange(-48, 0) ({ [6 x ptr addrspace(1)], [8 x ptr addrspace(1)], [3 x ptr addrspace(1)], [4 x ptr addrspace(1)], [4 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTVN5Test41DE, i32 0, i32 0, i32 6), ptr addrspace(1) getelementptr inbounds inrange(-32, 0) ({ [4 x ptr addrspace(1)], [3 x ptr addrspace(1)], [4 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTCN5Test41DE0_NS_2C1E, i32 0, i32 0, i32 4), ptr addrspace(1) getelementptr inbounds inrange(-24, 0) ({ [4 x ptr addrspace(1)], [3 x ptr addrspace(1)], [4 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTCN5Test41DE0_NS_2C1E, i32 0, i32 1, i32 3), ptr addrspace(1) getelementptr inbounds inrange(-24, 8) ({ [4 x ptr addrspace(1)], [3 x ptr addrspace(1)], [4 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTCN5Test41DE0_NS_2C1E, i32 0, i32 2, i32 3), ptr addrspace(1) getelementptr inbounds inrange(-56, 8) ({ [8 x ptr addrspace(1)], [4 x ptr addrspace(1)], [3 x ptr addrspace(1)], [4 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTCN5Test41DE16_NS_2C2E, i32 0, i32 0, i32 7), ptr addrspace(1) getelementptr inbounds inrange(-56, 8) ({ [8 x ptr addrspace(1)], [4 x ptr addrspace(1)], [3 x ptr addrspace(1)], [4 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTCN5Test41DE16_NS_2C2E, i32 0, i32 0, i32 7), ptr addrspace(1) getelementptr inbounds inrange(-32, 0) ({ [8 x ptr addrspace(1)], [4 x ptr addrspace(1)], [3 x ptr addrspace(1)], [4 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTCN5Test41DE16_NS_2C2E, i32 0, i32 1, i32 4), ptr addrspace(1) getelementptr inbounds inrange(-24, 0) ({ [8 x ptr addrspace(1)], [4 x ptr addrspace(1)], [3 x ptr addrspace(1)], [4 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTCN5Test41DE16_NS_2C2E, i32 0, i32 2, i32 3), ptr addrspace(1) getelementptr inbounds inrange(-24, 8) ({ [8 x ptr addrspace(1)], [4 x ptr addrspace(1)], [3 x ptr addrspace(1)], [4 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTCN5Test41DE16_NS_2C2E, i32 0, i32 3, i32 3), ptr addrspace(1) getelementptr inbounds inrange(-24, 0) ({ [6 x ptr addrspace(1)], [8 x ptr addrspace(1)], [3 x ptr addrspace(1)], [4 x ptr addrspace(1)], [4 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTVN5Test41DE, i32 0, i32 2, i32 3), ptr addrspace(1) getelementptr inbounds inrange(-24, 8) ({ [6 x ptr addrspace(1)], [8 x ptr addrspace(1)], [3 x ptr addrspace(1)], [4 x ptr addrspace(1)], [4 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTVN5Test41DE, i32 0, i32 3, i32 3), ptr addrspace(1) getelementptr inbounds inrange(-56, 8) ({ [6 x ptr addrspace(1)], [8 x ptr addrspace(1)], [3 x ptr addrspace(1)], [4 x ptr addrspace(1)], [4 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTVN5Test41DE, i32 0, i32 1, i32 7), ptr addrspace(1) getelementptr inbounds inrange(-56, 8) ({ [6 x ptr addrspace(1)], [8 x ptr addrspace(1)], [3 x ptr addrspace(1)], [4 x ptr addrspace(1)], [4 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTVN5Test41DE, i32 0, i32 1, i32 7), ptr addrspace(1) getelementptr inbounds inrange(-32, 0) ({ [6 x ptr addrspace(1)], [8 x ptr addrspace(1)], [3 x ptr addrspace(1)], [4 x ptr addrspace(1)], [4 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTVN5Test41DE, i32 0, i32 4, i32 4), ptr addrspace(1) getelementptr inbounds inrange(-24, 0) ({ [3 x ptr addrspace(1)], [4 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTCN5Test41DE40_NS_2V1E, i32 0, i32 0, i32 3), ptr addrspace(1) getelementptr inbounds inrange(-24, 8) ({ [3 x ptr addrspace(1)], [4 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTCN5Test41DE40_NS_2V1E, i32 0, i32 1, i32 3), ptr addrspace(1) getelementptr inbounds inrange(-32, 0) ({ [4 x ptr addrspace(1)], [3 x ptr addrspace(1)], [4 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTCN5Test41DE72_NS_2V2E, i32 0, i32 0, i32 4), ptr addrspace(1) getelementptr inbounds inrange(-24, 0) ({ [4 x ptr addrspace(1)], [3 x ptr addrspace(1)], [4 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTCN5Test41DE72_NS_2V2E, i32 0, i32 1, i32 3), ptr addrspace(1) getelementptr inbounds inrange(-24, 8) ({ [4 x ptr addrspace(1)], [3 x ptr addrspace(1)], [4 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTCN5Test41DE72_NS_2V2E, i32 0, i32 2, i32 3)]
 // CHECK: declare void @__cxa_pure_virtual() unnamed_addr
 // CHECK: declare void @__cxa_deleted_virtual() unnamed_addr
diff --git a/clang/test/CodeGenCXX/vtt-layout.cpp b/clang/test/CodeGenCXX/vtt-layout.cpp
index 595c64c6a33bd1..573a8de3094a12 100644
--- a/clang/test/CodeGenCXX/vtt-layout.cpp
+++ b/clang/test/CodeGenCXX/vtt-layout.cpp
@@ -78,12 +78,12 @@ namespace Test6 {
   }
 }
 
-// CHECK: @_ZTTN5Test11BE ={{.*}} unnamed_addr constant [1 x ptr] [ptr getelementptr inbounds ({ [4 x ptr] }, ptr @_ZTVN5Test11BE, i32 0, inrange i32 0, i32 3)]
+// CHECK: @_ZTTN5Test11BE ={{.*}} unnamed_addr constant [1 x ptr] [ptr getelementptr inbounds inrange(-24, 8) ({ [4 x ptr] }, ptr @_ZTVN5Test11BE, i32 0, i32 0, i32 3)]
 // CHECK: @_ZTVN5Test51AE ={{.*}} unnamed_addr constant { [4 x ptr] } { [4 x ptr] [ptr null, ptr @_ZTIN5Test51AE, ptr @__cxa_pure_virtual, ptr @_ZN5Test51A6anchorEv] }
 // CHECK: @_ZTVN5Test61AE ={{.*}} unnamed_addr constant { [4 x ptr] } { [4 x ptr] [ptr null, ptr @_ZTIN5Test61AE, ptr @__cxa_deleted_virtual, ptr @_ZN5Test61A6anchorEv] }
-// CHECK: @_ZTTN5Test21CE = linkonce_odr unnamed_addr constant [2 x ptr] [ptr getelementptr inbounds ({ [5 x ptr] }, ptr @_ZTVN5Test21CE, i32 0, inrange i32 0, i32 4), ptr getelementptr inbounds ({ [5 x ptr] }, ptr @_ZTVN5Test21CE, i32 0, inrange i32 0, i32 4)]
-// CHECK: @_ZTTN5Test31DE = linkonce_odr unnamed_addr constant [13 x ptr] [ptr getelementptr inbounds ({ [5 x ptr], [7 x ptr], [4 x ptr], [3 x ptr] }, ptr @_ZTVN5Test31DE, i32 0, inrange i32 0, i32 5), ptr getelementptr inbounds ({ [3 x ptr], [4 x ptr] }, ptr @_ZTCN5Test31DE0_NS_2C1E, i32 0, inrange i32 0, i32 3), ptr getelementptr inbounds ({ [3 x ptr], [4 x ptr] }, ptr @_ZTCN5Test31DE0_NS_2C1E, i32 0, inrange i32 1, i32 3), ptr getelementptr inbounds ({ [7 x ptr], [3 x ptr], [4 x ptr] }, ptr @_ZTCN5Test31DE16_NS_2C2E, i32 0, inrange i32 0, i32 6), ptr getelementptr inbounds ({ [7 x ptr], [3 x ptr], [4 x ptr] }, ptr @_ZTCN5Test31DE16_NS_2C2E, i32 0, inrange i32 0, i32 6), ptr getelementptr inbounds ({ [7 x ptr], [3 x ptr], [4 x ptr] }, ptr @_ZTCN5Test31DE16_NS_2C2E, i32 0, inrange i32 1, i32 3), ptr getelementptr inbounds ({ [7 x ptr], [3 x ptr], [4 x ptr] }, ptr @_ZTCN5Test31DE16_NS_2C2E, i32 0, inrange i32 2, i32 3), ptr getelementptr inbounds ({ [5 x ptr], [7 x ptr], [4 x ptr], [3 x ptr] }, ptr @_ZTVN5Test31DE, i32 0, inrange i32 2, i32 3), ptr getelementptr inbounds ({ [5 x ptr], [7 x ptr], [4 x ptr], [3 x ptr] }, ptr @_ZTVN5Test31DE, i32 0, inrange i32 1, i32 6), ptr getelementptr inbounds ({ [5 x ptr], [7 x ptr], [4 x ptr], [3 x ptr] }, ptr @_ZTVN5Test31DE, i32 0, inrange i32 1, i32 6), ptr getelementptr inbounds ({ [5 x ptr], [7 x ptr], [4 x ptr], [3 x ptr] }, ptr @_ZTVN5Test31DE, i32 0, inrange i32 3, i32 3), ptr getelementptr inbounds ({ [3 x ptr], [4 x ptr] }, ptr @_ZTCN5Test31DE64_NS_2V2E, i32 0, inrange i32 0, i32 3), ptr getelementptr inbounds ({ [3 x ptr], [4 x ptr] }, ptr @_ZTCN5Test31DE64_NS_2V2E, i32 0, inrange i32 1, i32 3)]
+// CHECK: @_ZTTN5Test21CE = linkonce_odr unnamed_addr constant [2 x ptr] [ptr getelementptr inbounds inrange(-32, 8) ({ [5 x ptr] }, ptr @_ZTVN5Test21CE, i32 0, i32 0, i32 4), ptr getelementptr inbounds inrange(-32, 8) ({ [5 x ptr] }, ptr @_ZTVN5Test21CE, i32 0, i32 0, i32 4)]
+// CHECK: @_ZTTN5Test31DE = linkonce_odr unnamed_addr constant [13 x ptr] [ptr getelementptr inbounds inrange(-40, 0) ({ [5 x ptr], [7 x ptr], [4 x ptr], [3 x ptr] }, ptr @_ZTVN5Test31DE, i32 0, i32 0, i32 5), ptr getelementptr inbounds inrange(-24, 0) ({ [3 x ptr], [4 x ptr] }, ptr @_ZTCN5Test31DE0_NS_2C1E, i32 0, i32 0, i32 3), ptr getelementptr inbounds inrange(-24, 8) ({ [3 x ptr], [4 x ptr] }, ptr @_ZTCN5Test31DE0_NS_2C1E, i32 0, i32 1, i32 3), ptr getelementptr inbounds inrange(-48, 8) ({ [7 x ptr], [3 x ptr], [4 x ptr] }, ptr @_ZTCN5Test31DE16_NS_2C2E, i32 0, i32 0, i32 6), ptr getelementptr inbounds inrange(-48, 8) ({ [7 x ptr], [3 x ptr], [4 x ptr] }, ptr @_ZTCN5Test31DE16_NS_2C2E, i32 0, i32 0, i32 6), ptr getelementptr inbounds inrange(-24, 0) ({ [7 x ptr], [3 x ptr], [4 x ptr] }, ptr @_ZTCN5Test31DE16_NS_2C2E, i32 0, i32 1, i32 3), ptr getelementptr inbounds inrange(-24, 8) ({ [7 x ptr], [3 x ptr], [4 x ptr] }, ptr @_ZTCN5Test31DE16_NS_2C2E, i32 0, i32 2, i32 3), ptr getelementptr inbounds inrange(-24, 8) ({ [5 x ptr], [7 x ptr], [4 x ptr], [3 x ptr] }, ptr @_ZTVN5Test31DE, i32 0, i32 2, i32 3), ptr getelementptr inbounds inrange(-48, 8) ({ [5 x ptr], [7 x ptr], [4 x ptr], [3 x ptr] }, ptr @_ZTVN5Test31DE, i32 0, i32 1, i32 6), ptr getelementptr inbounds inrange(-48, 8) ({ [5 x ptr], [7 x ptr], [4 x ptr], [3 x ptr] }, ptr @_ZTVN5Test31DE, i32 0, i32 1, i32 6), ptr getelementptr inbounds inrange(-24, 0) ({ [5 x ptr], [7 x ptr], [4 x ptr], [3 x ptr] }, ptr @_ZTVN5Test31DE, i32 0, i32 3, i32 3), ptr getelementptr inbounds inrange(-24, 0) ({ [3 x ptr], [4 x ptr] }, ptr @_ZTCN5Test31DE64_NS_2V2E, i32 0, i32 0, i32 3), ptr getelementptr inbounds inrange(-24, 8) ({ [3 x ptr], [4 x ptr] }, ptr @_ZTCN5Test31DE64_NS_2V2E, i32 0, i32 1, i32 3)]
 // CHECK: @_ZTVN5Test41DE = linkonce_odr unnamed_addr constant { [6 x ptr], [8 x ptr], [3 x ptr], [4 x ptr], [4 x ptr] } { [6 x ptr] [ptr inttoptr (i64 72 to ptr), ptr inttoptr (i64 16 to ptr), ptr inttoptr (i64 56 to ptr), ptr inttoptr (i64 40 to ptr), ptr null, ptr @_ZTIN5Test41DE], [8 x ptr] [ptr inttoptr (i64 40 to ptr), ptr inttoptr (i64 24 to ptr), ptr inttoptr (i64 56 to ptr), ptr null, ptr null, ptr inttoptr (i64 -16 to ptr), ptr @_ZTIN5Test41DE, ptr @_ZN5Test42V31gEv], [3 x ptr] [ptr inttoptr (i64 16 to ptr), ptr inttoptr (i64 -40 to ptr), ptr @_ZTIN5Test41DE], [4 x ptr] [ptr null, ptr inttoptr (i64 -56 to ptr), ptr @_ZTIN5Test41DE, ptr @_ZN5Test42A21fEv], [4 x ptr] [ptr inttoptr (i64 -16 to ptr), ptr inttoptr (i64 -32 to ptr), ptr inttoptr (i64 -72 to ptr), ptr @_ZTIN5Test41DE] }
-// CHECK: @_ZTTN5Test41DE = linkonce_odr unnamed_addr constant [19 x ptr] [ptr getelementptr inbounds ({ [6 x ptr], [8 x ptr], [3 x ptr], [4 x ptr], [4 x ptr] }, ptr @_ZTVN5Test41DE, i32 0, inrange i32 0, i32 6), ptr getelementptr inbounds ({ [4 x ptr], [3 x ptr], [4 x ptr] }, ptr @_ZTCN5Test41DE0_NS_2C1E, i32 0, inrange i32 0, i32 4), ptr getelementptr inbounds ({ [4 x ptr], [3 x ptr], [4 x ptr] }, ptr @_ZTCN5Test41DE0_NS_2C1E, i32 0, inrange i32 1, i32 3), ptr getelementptr inbounds ({ [4 x ptr], [3 x ptr], [4 x ptr] }, ptr @_ZTCN5Test41DE0_NS_2C1E, i32 0, inrange i32 2, i32 3), ptr getelementptr inbounds ({ [8 x ptr], [4 x ptr], [3 x ptr], [4 x ptr] }, ptr @_ZTCN5Test41DE16_NS_2C2E, i32 0, inrange i32 0, i32 7), ptr getelementptr inbounds ({ [8 x ptr], [4 x ptr], [3 x ptr], [4 x ptr] }, ptr @_ZTCN5Test41DE16_NS_2C2E, i32 0, inrange i32 0, i32 7), ptr getelementptr inbounds ({ [8 x ptr], [4 x ptr], [3 x ptr], [4 x ptr] }, ptr @_ZTCN5Test41DE16_NS_2C2E, i32 0, inrange i32 1, i32 4), ptr getelementptr inbounds ({ [8 x ptr], [4 x ptr], [3 x ptr], [4 x ptr] }, ptr @_ZTCN5Test41DE16_NS_2C2E, i32 0, inrange i32 2, i32 3), ptr getelementptr inbounds ({ [8 x ptr], [4 x ptr], [3 x ptr], [4 x ptr] }, ptr @_ZTCN5Test41DE16_NS_2C2E, i32 0, inrange i32 3, i32 3), ptr getelementptr inbounds ({ [6 x ptr], [8 x ptr], [3 x ptr], [4 x ptr], [4 x ptr] }, ptr @_ZTVN5Test41DE, i32 0, inrange i32 2, i32 3), ptr getelementptr inbounds ({ [6 x ptr], [8 x ptr], [3 x ptr], [4 x ptr], [4 x ptr] }, ptr @_ZTVN5Test41DE, i32 0, inrange i32 3, i32 3), ptr getelementptr inbounds ({ [6 x ptr], [8 x ptr], [3 x ptr], [4 x ptr], [4 x ptr] }, ptr @_ZTVN5Test41DE, i32 0, inrange i32 1, i32 7), ptr getelementptr inbounds ({ [6 x ptr], [8 x ptr], [3 x ptr], [4 x ptr], [4 x ptr] }, ptr @_ZTVN5Test41DE, i32 0, inrange i32 1, i32 7), ptr getelementptr inbounds ({ [6 x ptr], [8 x ptr], [3 x ptr], [4 x ptr], [4 x ptr] }, ptr @_ZTVN5Test41DE, i32 0, inrange i32 4, i32 4), ptr getelementptr inbounds ({ [3 x ptr], [4 x ptr] }, ptr @_ZTCN5Test41DE40_NS_2V1E, i32 0, inrange i32 0, i32 3), ptr getelementptr inbounds ({ [3 x ptr], [4 x ptr] }, ptr @_ZTCN5Test41DE40_NS_2V1E, i32 0, inrange i32 1, i32 3), ptr getelementptr inbounds ({ [4 x ptr], [3 x ptr], [4 x ptr] }, ptr @_ZTCN5Test41DE72_NS_2V2E, i32 0, inrange i32 0, i32 4), ptr getelementptr inbounds ({ [4 x ptr], [3 x ptr], [4 x ptr] }, ptr @_ZTCN5Test41DE72_NS_2V2E, i32 0, inrange i32 1, i32 3), ptr getelementptr inbounds ({ [4 x ptr], [3 x ptr], [4 x ptr] }, ptr @_ZTCN5Test41DE72_NS_2V2E, i32 0, inrange i32 2, i32 3)]
+// CHECK: @_ZTTN5Test41DE = linkonce_odr unnamed_addr constant [19 x ptr] [ptr getelementptr inbounds inrange(-48, 0) ({ [6 x ptr], [8 x ptr], [3 x ptr], [4 x ptr], [4 x ptr] }, ptr @_ZTVN5Test41DE, i32 0, i32 0, i32 6), ptr getelementptr inbounds inrange(-32, 0) ({ [4 x ptr], [3 x ptr], [4 x ptr] }, ptr @_ZTCN5Test41DE0_NS_2C1E, i32 0, i32 0, i32 4), ptr getelementptr inbounds inrange(-24, 0) ({ [4 x ptr], [3 x ptr], [4 x ptr] }, ptr @_ZTCN5Test41DE0_NS_2C1E, i32 0, i32 1, i32 3), ptr getelementptr inbounds inrange(-24, 8) ({ [4 x ptr], [3 x ptr], [4 x ptr] }, ptr @_ZTCN5Test41DE0_NS_2C1E, i32 0, i32 2, i32 3), ptr getelementptr inbounds inrange(-56, 8) ({ [8 x ptr], [4 x ptr], [3 x ptr], [4 x ptr] }, ptr @_ZTCN5Test41DE16_NS_2C2E, i32 0, i32 0, i32 7), ptr getelementptr inbounds inrange(-56, 8) ({ [8 x ptr], [4 x ptr], [3 x ptr], [4 x ptr] }, ptr @_ZTCN5Test41DE16_NS_2C2E, i32 0, i32 0, i32 7), ptr getelementptr inbounds inrange(-32, 0) ({ [8 x ptr], [4 x ptr], [3 x ptr], [4 x ptr] }, ptr @_ZTCN5Test41DE16_NS_2C2E, i32 0, i32 1, i32 4), ptr getelementptr inbounds inrange(-24, 0) ({ [8 x ptr], [4 x ptr], [3 x ptr], [4 x ptr] }, ptr @_ZTCN5Test41DE16_NS_2C2E, i32 0, i32 2, i32 3), ptr getelementptr inbounds inrange(-24, 8) ({ [8 x ptr], [4 x ptr], [3 x ptr], [4 x ptr] }, ptr @_ZTCN5Test41DE16_NS_2C2E, i32 0, i32 3, i32 3), ptr getelementptr inbounds inrange(-24, 0) ({ [6 x ptr], [8 x ptr], [3 x ptr], [4 x ptr], [4 x ptr] }, ptr @_ZTVN5Test41DE, i32 0, i32 2, i32 3), ptr getelementptr inbounds inrange(-24, 8) ({ [6 x ptr], [8 x ptr], [3 x ptr], [4 x ptr], [4 x ptr] }, ptr @_ZTVN5Test41DE, i32 0, i32 3, i32 3), ptr getelementptr inbounds inrange(-56, 8) ({ [6 x ptr], [8 x ptr], [3 x ptr], [4 x ptr], [4 x ptr] }, ptr @_ZTVN5Test41DE, i32 0, i32 1, i32 7), ptr getelementptr inbounds inrange(-56, 8) ({ [6 x ptr], [8 x ptr], [3 x ptr], [4 x ptr], [4 x ptr] }, ptr @_ZTVN5Test41DE, i32 0, i32 1, i32 7), ptr getelementptr inbounds inrange(-32, 0) ({ [6 x ptr], [8 x ptr], [3 x ptr], [4 x ptr], [4 x ptr] }, ptr @_ZTVN5Test41DE, i32 0, i32 4, i32 4), ptr getelementptr inbounds inrange(-24, 0) ({ [3 x ptr], [4 x ptr] }, ptr @_ZTCN5Test41DE40_NS_2V1E, i32 0, i32 0, i32 3), ptr getelementptr inbounds inrange(-24, 8) ({ [3 x ptr], [4 x ptr] }, ptr @_ZTCN5Test41DE40_NS_2V1E, i32 0, i32 1, i32 3), ptr getelementptr inbounds inrange(-32, 0) ({ [4 x ptr], [3 x ptr], [4 x ptr] }, ptr @_ZTCN5Test41DE72_NS_2V2E, i32 0, i32 0, i32 4), ptr getelementptr inbounds inrange(-24, 0) ({ [4 x ptr], [3 x ptr], [4 x ptr] }, ptr @_ZTCN5Test41DE72_NS_2V2E, i32 0, i32 1, i32 3), ptr getelementptr inbounds inrange(-24, 8) ({ [4 x ptr], [3 x ptr], [4 x ptr] }, ptr @_ZTCN5Test41DE72_NS_2V2E, i32 0, i32 2, i32 3)]
 // CHECK: declare void @__cxa_pure_virtual() unnamed_addr
 // CHECK: declare void @__cxa_deleted_virtual() unnamed_addr
diff --git a/clang/test/CodeGenHLSL/builtins/clamp-builtin.hlsl b/clang/test/CodeGenHLSL/builtins/clamp-builtin.hlsl
new file mode 100644
index 00000000000000..e3ef26429e7e40
--- /dev/null
+++ b/clang/test/CodeGenHLSL/builtins/clamp-builtin.hlsl
@@ -0,0 +1,8 @@
+// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s
+
+// CHECK-LABEL: builtin_test_clamp_int4
+// CHECK: %dx.clamp = call <4 x i32> @llvm.dx.clamp.v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2)
+// CHECK: ret <4 x i32> %dx.clamp
+int4 builtin_test_clamp_int4(int4 p0, int4 p1, int4 p2) {
+  return __builtin_hlsl_elementwise_clamp(p0, p1, p2);
+}
diff --git a/clang/test/CodeGenHLSL/builtins/clamp.hlsl b/clang/test/CodeGenHLSL/builtins/clamp.hlsl
new file mode 100644
index 00000000000000..029e48ffe25865
--- /dev/null
+++ b/clang/test/CodeGenHLSL/builtins/clamp.hlsl
@@ -0,0 +1,134 @@
+// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \
+// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
+// RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ 
+// RUN:   --check-prefixes=CHECK,NATIVE_HALF
+// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \
+// RUN:   dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \
+// RUN:   -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF
+
+#ifdef __HLSL_ENABLE_16_BIT
+// NATIVE_HALF: define noundef i16 @
+// NATIVE_HALF: call i16 @llvm.dx.clamp.i16(
+int16_t test_clamp_short(int16_t p0, int16_t p1) { return clamp(p0, p1,p1); }
+// NATIVE_HALF: define noundef <2 x i16> @
+// NATIVE_HALF: call <2 x i16> @llvm.dx.clamp.v2i16(
+int16_t2 test_clamp_short2(int16_t2 p0, int16_t2 p1) { return clamp(p0, p1,p1); }
+// NATIVE_HALF: define noundef <3 x i16> @
+// NATIVE_HALF: call <3 x i16> @llvm.dx.clamp.v3i16
+int16_t3 test_clamp_short3(int16_t3 p0, int16_t3 p1) { return clamp(p0, p1,p1); }
+// NATIVE_HALF: define noundef <4 x i16> @
+// NATIVE_HALF: call <4 x i16> @llvm.dx.clamp.v4i16
+int16_t4 test_clamp_short4(int16_t4 p0, int16_t4 p1) { return clamp(p0, p1,p1); }
+
+// NATIVE_HALF: define noundef i16 @
+// NATIVE_HALF: call i16 @llvm.dx.uclamp.i16(
+uint16_t test_clamp_ushort(uint16_t p0, uint16_t p1) { return clamp(p0, p1,p1); }
+// NATIVE_HALF: define noundef <2 x i16> @
+// NATIVE_HALF: call <2 x i16> @llvm.dx.uclamp.v2i16
+uint16_t2 test_clamp_ushort2(uint16_t2 p0, uint16_t2 p1) { return clamp(p0, p1,p1); }
+// NATIVE_HALF: define noundef <3 x i16> @
+// NATIVE_HALF: call <3 x i16> @llvm.dx.uclamp.v3i16
+uint16_t3 test_clamp_ushort3(uint16_t3 p0, uint16_t3 p1) { return clamp(p0, p1,p1); }
+// NATIVE_HALF: define noundef <4 x i16> @
+// NATIVE_HALF: call <4 x i16> @llvm.dx.uclamp.v4i16
+uint16_t4 test_clamp_ushort4(uint16_t4 p0, uint16_t4 p1) { return clamp(p0, p1,p1); }
+#endif
+
+// CHECK: define noundef i32 @
+// CHECK: call i32 @llvm.dx.clamp.i32(
+int test_clamp_int(int p0, int p1) { return clamp(p0, p1,p1); }
+// CHECK: define noundef <2 x i32> @
+// CHECK: call <2 x i32> @llvm.dx.clamp.v2i32
+int2 test_clamp_int2(int2 p0, int2 p1) { return clamp(p0, p1,p1); }
+// CHECK: define noundef <3 x i32> @
+// CHECK: call <3 x i32> @llvm.dx.clamp.v3i32
+int3 test_clamp_int3(int3 p0, int3 p1) { return clamp(p0, p1,p1); }
+// CHECK: define noundef <4 x i32> @
+// CHECK: call <4 x i32> @llvm.dx.clamp.v4i32
+int4 test_clamp_int4(int4 p0, int4 p1) { return clamp(p0, p1,p1); }
+
+// CHECK: define noundef i32 @
+// CHECK: call i32 @llvm.dx.uclamp.i32(
+int test_clamp_uint(uint p0, uint p1) { return clamp(p0, p1,p1); }
+// CHECK: define noundef <2 x i32> @
+// CHECK: call <2 x i32> @llvm.dx.uclamp.v2i32
+uint2 test_clamp_uint2(uint2 p0, uint2 p1) { return clamp(p0, p1,p1); }
+// CHECK: define noundef <3 x i32> @
+// CHECK: call <3 x i32> @llvm.dx.uclamp.v3i32
+uint3 test_clamp_uint3(uint3 p0, uint3 p1) { return clamp(p0, p1,p1); }
+// CHECK: define noundef <4 x i32> @
+// CHECK: call <4 x i32> @llvm.dx.uclamp.v4i32
+uint4 test_clamp_uint4(uint4 p0, uint4 p1) { return clamp(p0, p1,p1); }
+
+// CHECK: define noundef i64 @
+// CHECK: call i64 @llvm.dx.clamp.i64(
+int64_t test_clamp_long(int64_t p0, int64_t p1) { return clamp(p0, p1,p1); }
+// CHECK: define noundef <2 x i64> @
+// CHECK: call <2 x i64> @llvm.dx.clamp.v2i64
+int64_t2 test_clamp_long2(int64_t2 p0, int64_t2 p1) { return clamp(p0, p1,p1); }
+// CHECK: define noundef <3 x i64> @
+// CHECK: call <3 x i64> @llvm.dx.clamp.v3i64
+int64_t3 test_clamp_long3(int64_t3 p0, int64_t3 p1) { return clamp(p0, p1,p1); }
+// CHECK: define noundef <4 x i64> @
+// CHECK: call <4 x i64> @llvm.dx.clamp.v4i64
+int64_t4 test_clamp_long4(int64_t4 p0, int64_t4 p1) { return clamp(p0, p1,p1); }
+
+// CHECK: define noundef i64 @
+// CHECK: call i64 @llvm.dx.uclamp.i64(
+uint64_t test_clamp_long(uint64_t p0, uint64_t p1) { return clamp(p0, p1,p1); }
+// CHECK: define noundef <2 x i64> @
+// CHECK: call <2 x i64> @llvm.dx.uclamp.v2i64
+uint64_t2 test_clamp_long2(uint64_t2 p0, uint64_t2 p1) { return clamp(p0, p1,p1); }
+// CHECK: define noundef <3 x i64> @
+// CHECK: call <3 x i64> @llvm.dx.uclamp.v3i64
+uint64_t3 test_clamp_long3(uint64_t3 p0, uint64_t3 p1) { return clamp(p0, p1,p1); }
+// CHECK: define noundef <4 x i64> @
+// CHECK: call <4 x i64> @llvm.dx.uclamp.v4i64
+uint64_t4 test_clamp_long4(uint64_t4 p0, uint64_t4 p1) { return clamp(p0, p1,p1); }
+
+// NATIVE_HALF: define noundef half @
+// NATIVE_HALF: call half @llvm.dx.clamp.f16(
+// NO_HALF: define noundef float @"?test_clamp_half
+// NO_HALF: call float @llvm.dx.clamp.f32(
+half test_clamp_half(half p0, half p1) { return clamp(p0, p1,p1); }
+// NATIVE_HALF: define noundef <2 x half> @
+// NATIVE_HALF: call <2 x half> @llvm.dx.clamp.v2f16
+// NO_HALF: define noundef <2 x float> @"?test_clamp_half2
+// NO_HALF: call <2 x float> @llvm.dx.clamp.v2f32(
+half2 test_clamp_half2(half2 p0, half2 p1) { return clamp(p0, p1,p1); }
+// NATIVE_HALF: define noundef <3 x half> @
+// NATIVE_HALF: call <3 x half> @llvm.dx.clamp.v3f16
+// NO_HALF: define noundef <3 x float> @"?test_clamp_half3
+// NO_HALF: call <3 x float> @llvm.dx.clamp.v3f32(
+half3 test_clamp_half3(half3 p0, half3 p1) { return clamp(p0, p1,p1); }
+// NATIVE_HALF: define noundef <4 x half> @
+// NATIVE_HALF: call <4 x half> @llvm.dx.clamp.v4f16
+// NO_HALF: define noundef <4 x float> @"?test_clamp_half4
+// NO_HALF: call <4 x float> @llvm.dx.clamp.v4f32(
+half4 test_clamp_half4(half4 p0, half4 p1) { return clamp(p0, p1,p1); }
+
+// CHECK: define noundef float @"?test_clamp_float
+// CHECK: call float @llvm.dx.clamp.f32(
+float test_clamp_float(float p0, float p1) { return clamp(p0, p1,p1); }
+// CHECK: define noundef <2 x float> @"?test_clamp_float2
+// CHECK: call <2 x float> @llvm.dx.clamp.v2f32
+float2 test_clamp_float2(float2 p0, float2 p1) { return clamp(p0, p1,p1); }
+// CHECK: define noundef <3 x float> @"?test_clamp_float3
+// CHECK: call <3 x float> @llvm.dx.clamp.v3f32
+float3 test_clamp_float3(float3 p0, float3 p1) { return clamp(p0, p1,p1); }
+// CHECK: define noundef <4 x float> @"?test_clamp_float4
+// CHECK: call <4 x float> @llvm.dx.clamp.v4f32
+float4 test_clamp_float4(float4 p0, float4 p1) { return clamp(p0, p1,p1); }
+
+// CHECK: define noundef double @
+// CHECK: call double @llvm.dx.clamp.f64(
+double test_clamp_double(double p0, double p1) { return clamp(p0, p1,p1); }
+// CHECK: define noundef <2 x double> @
+// CHECK: call <2 x double> @llvm.dx.clamp.v2f64
+double2 test_clamp_double2(double2 p0, double2 p1) { return clamp(p0, p1,p1); }
+// CHECK: define noundef <3 x double> @
+// CHECK: call <3 x double> @llvm.dx.clamp.v3f64
+double3 test_clamp_double3(double3 p0, double3 p1) { return clamp(p0, p1,p1); }
+// CHECK: define noundef <4 x double> @
+// CHECK: call <4 x double> @llvm.dx.clamp.v4f64
+double4 test_clamp_double4(double4 p0, double4 p1) { return clamp(p0, p1,p1); }
diff --git a/clang/test/CodeGenHLSL/builtins/dot.hlsl b/clang/test/CodeGenHLSL/builtins/dot.hlsl
index c064d118caf3e7..0f993193c00cce 100644
--- a/clang/test/CodeGenHLSL/builtins/dot.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/dot.hlsl
@@ -11,15 +11,15 @@
 // NATIVE_HALF: ret i16 %dx.dot
 int16_t test_dot_short(int16_t p0, int16_t p1) { return dot(p0, p1); }
 
-// NATIVE_HALF: %dx.dot = call i16 @llvm.dx.dot.v2i16(<2 x i16> %0, <2 x i16> %1)
+// NATIVE_HALF: %dx.dot = call i16 @llvm.dx.sdot.v2i16(<2 x i16> %0, <2 x i16> %1)
 // NATIVE_HALF: ret i16 %dx.dot
 int16_t test_dot_short2(int16_t2 p0, int16_t2 p1) { return dot(p0, p1); }
 
-// NATIVE_HALF: %dx.dot = call i16 @llvm.dx.dot.v3i16(<3 x i16> %0, <3 x i16> %1)
+// NATIVE_HALF: %dx.dot = call i16 @llvm.dx.sdot.v3i16(<3 x i16> %0, <3 x i16> %1)
 // NATIVE_HALF: ret i16 %dx.dot
 int16_t test_dot_short3(int16_t3 p0, int16_t3 p1) { return dot(p0, p1); }
 
-// NATIVE_HALF: %dx.dot = call i16 @llvm.dx.dot.v4i16(<4 x i16> %0, <4 x i16> %1)
+// NATIVE_HALF: %dx.dot = call i16 @llvm.dx.sdot.v4i16(<4 x i16> %0, <4 x i16> %1)
 // NATIVE_HALF: ret i16 %dx.dot
 int16_t test_dot_short4(int16_t4 p0, int16_t4 p1) { return dot(p0, p1); }
 
@@ -27,15 +27,15 @@ int16_t test_dot_short4(int16_t4 p0, int16_t4 p1) { return dot(p0, p1); }
 // NATIVE_HALF: ret i16 %dx.dot
 uint16_t test_dot_ushort(uint16_t p0, uint16_t p1) { return dot(p0, p1); }
 
-// NATIVE_HALF: %dx.dot = call i16 @llvm.dx.dot.v2i16(<2 x i16> %0, <2 x i16> %1)
+// NATIVE_HALF: %dx.dot = call i16 @llvm.dx.udot.v2i16(<2 x i16> %0, <2 x i16> %1)
 // NATIVE_HALF: ret i16 %dx.dot
 uint16_t test_dot_ushort2(uint16_t2 p0, uint16_t2 p1) { return dot(p0, p1); }
 
-// NATIVE_HALF: %dx.dot = call i16 @llvm.dx.dot.v3i16(<3 x i16> %0, <3 x i16> %1)
+// NATIVE_HALF: %dx.dot = call i16 @llvm.dx.udot.v3i16(<3 x i16> %0, <3 x i16> %1)
 // NATIVE_HALF: ret i16 %dx.dot
 uint16_t test_dot_ushort3(uint16_t3 p0, uint16_t3 p1) { return dot(p0, p1); }
 
-// NATIVE_HALF: %dx.dot = call i16 @llvm.dx.dot.v4i16(<4 x i16> %0, <4 x i16> %1)
+// NATIVE_HALF: %dx.dot = call i16 @llvm.dx.udot.v4i16(<4 x i16> %0, <4 x i16> %1)
 // NATIVE_HALF: ret i16 %dx.dot
 uint16_t test_dot_ushort4(uint16_t4 p0, uint16_t4 p1) { return dot(p0, p1); }
 #endif
@@ -44,15 +44,15 @@ uint16_t test_dot_ushort4(uint16_t4 p0, uint16_t4 p1) { return dot(p0, p1); }
 // CHECK: ret i32 %dx.dot
 int test_dot_int(int p0, int p1) { return dot(p0, p1); }
 
-// CHECK: %dx.dot = call i32 @llvm.dx.dot.v2i32(<2 x i32> %0, <2 x i32> %1)
+// CHECK: %dx.dot = call i32 @llvm.dx.sdot.v2i32(<2 x i32> %0, <2 x i32> %1)
 // CHECK: ret i32 %dx.dot
 int test_dot_int2(int2 p0, int2 p1) { return dot(p0, p1); }
 
-// CHECK: %dx.dot = call i32 @llvm.dx.dot.v3i32(<3 x i32> %0, <3 x i32> %1)
+// CHECK: %dx.dot = call i32 @llvm.dx.sdot.v3i32(<3 x i32> %0, <3 x i32> %1)
 // CHECK: ret i32 %dx.dot
 int test_dot_int3(int3 p0, int3 p1) { return dot(p0, p1); }
 
-// CHECK: %dx.dot = call i32 @llvm.dx.dot.v4i32(<4 x i32> %0, <4 x i32> %1)
+// CHECK: %dx.dot = call i32 @llvm.dx.sdot.v4i32(<4 x i32> %0, <4 x i32> %1)
 // CHECK: ret i32 %dx.dot
 int test_dot_int4(int4 p0, int4 p1) { return dot(p0, p1); }
 
@@ -60,15 +60,15 @@ int test_dot_int4(int4 p0, int4 p1) { return dot(p0, p1); }
 // CHECK: ret i32 %dx.dot
 uint test_dot_uint(uint p0, uint p1) { return dot(p0, p1); }
 
-// CHECK: %dx.dot = call i32 @llvm.dx.dot.v2i32(<2 x i32> %0, <2 x i32> %1)
+// CHECK: %dx.dot = call i32 @llvm.dx.udot.v2i32(<2 x i32> %0, <2 x i32> %1)
 // CHECK: ret i32 %dx.dot
 uint test_dot_uint2(uint2 p0, uint2 p1) { return dot(p0, p1); }
 
-// CHECK: %dx.dot = call i32 @llvm.dx.dot.v3i32(<3 x i32> %0, <3 x i32> %1)
+// CHECK: %dx.dot = call i32 @llvm.dx.udot.v3i32(<3 x i32> %0, <3 x i32> %1)
 // CHECK: ret i32 %dx.dot
 uint test_dot_uint3(uint3 p0, uint3 p1) { return dot(p0, p1); }
 
-// CHECK: %dx.dot = call i32 @llvm.dx.dot.v4i32(<4 x i32> %0, <4 x i32> %1)
+// CHECK: %dx.dot = call i32 @llvm.dx.udot.v4i32(<4 x i32> %0, <4 x i32> %1)
 // CHECK: ret i32 %dx.dot
 uint test_dot_uint4(uint4 p0, uint4 p1) { return dot(p0, p1); }
 
@@ -76,15 +76,15 @@ uint test_dot_uint4(uint4 p0, uint4 p1) { return dot(p0, p1); }
 // CHECK: ret i64 %dx.dot
 int64_t test_dot_long(int64_t p0, int64_t p1) { return dot(p0, p1); }
 
-// CHECK: %dx.dot = call i64 @llvm.dx.dot.v2i64(<2 x i64> %0, <2 x i64> %1)
+// CHECK: %dx.dot = call i64 @llvm.dx.sdot.v2i64(<2 x i64> %0, <2 x i64> %1)
 // CHECK: ret i64 %dx.dot
 int64_t test_dot_long2(int64_t2 p0, int64_t2 p1) { return dot(p0, p1); }
 
-// CHECK: %dx.dot = call i64 @llvm.dx.dot.v3i64(<3 x i64> %0, <3 x i64> %1)
+// CHECK: %dx.dot = call i64 @llvm.dx.sdot.v3i64(<3 x i64> %0, <3 x i64> %1)
 // CHECK: ret i64 %dx.dot
 int64_t test_dot_long3(int64_t3 p0, int64_t3 p1) { return dot(p0, p1); }
 
-// CHECK: %dx.dot = call i64 @llvm.dx.dot.v4i64(<4 x i64> %0, <4 x i64> %1)
+// CHECK: %dx.dot = call i64 @llvm.dx.sdot.v4i64(<4 x i64> %0, <4 x i64> %1)
 // CHECK: ret i64 %dx.dot
 int64_t test_dot_long4(int64_t4 p0, int64_t4 p1) { return dot(p0, p1); }
 
@@ -92,15 +92,15 @@ int64_t test_dot_long4(int64_t4 p0, int64_t4 p1) { return dot(p0, p1); }
 // CHECK: ret i64 %dx.dot
 uint64_t test_dot_ulong(uint64_t p0, uint64_t p1) { return dot(p0, p1); }
 
-// CHECK: %dx.dot = call i64 @llvm.dx.dot.v2i64(<2 x i64> %0, <2 x i64> %1)
+// CHECK: %dx.dot = call i64 @llvm.dx.udot.v2i64(<2 x i64> %0, <2 x i64> %1)
 // CHECK: ret i64 %dx.dot
 uint64_t test_dot_ulong2(uint64_t2 p0, uint64_t2 p1) { return dot(p0, p1); }
 
-// CHECK: %dx.dot = call i64 @llvm.dx.dot.v3i64(<3 x i64> %0, <3 x i64> %1)
+// CHECK: %dx.dot = call i64 @llvm.dx.udot.v3i64(<3 x i64> %0, <3 x i64> %1)
 // CHECK: ret i64 %dx.dot
 uint64_t test_dot_ulong3(uint64_t3 p0, uint64_t3 p1) { return dot(p0, p1); }
 
-// CHECK: %dx.dot = call i64 @llvm.dx.dot.v4i64(<4 x i64> %0, <4 x i64> %1)
+// CHECK: %dx.dot = call i64 @llvm.dx.udot.v4i64(<4 x i64> %0, <4 x i64> %1)
 // CHECK: ret i64 %dx.dot
 uint64_t test_dot_ulong4(uint64_t4 p0, uint64_t4 p1) { return dot(p0, p1); }
 
diff --git a/clang/test/CodeGenHLSL/builtins/isinf.hlsl b/clang/test/CodeGenHLSL/builtins/isinf.hlsl
new file mode 100644
index 00000000000000..df44fc4a91dfd0
--- /dev/null
+++ b/clang/test/CodeGenHLSL/builtins/isinf.hlsl
@@ -0,0 +1,45 @@
+// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
+// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
+// RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ 
+// RUN:   --check-prefixes=CHECK,NATIVE_HALF
+// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
+// RUN:   dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \
+// RUN:   -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF
+
+// CHECK: define noundef i1 @
+// NATIVE_HALF: %dx.isinf = call i1 @llvm.dx.isinf.f16(
+// NO_HALF: %dx.isinf = call i1 @llvm.dx.isinf.f32(
+// CHECK: ret i1 %dx.isinf
+bool test_isinf_half(half p0) { return isinf(p0); }
+// CHECK: define noundef <2 x i1> @
+// NATIVE_HALF: %dx.isinf = call <2 x i1> @llvm.dx.isinf.v2f16
+// NO_HALF: %dx.isinf = call <2 x i1> @llvm.dx.isinf.v2f32(
+// CHECK: ret <2 x i1> %dx.isinf
+bool2 test_isinf_half2(half2 p0) { return isinf(p0); }
+// NATIVE_HALF: define noundef <3 x i1> @
+// NATIVE_HALF: %dx.isinf = call <3 x i1> @llvm.dx.isinf.v3f16
+// NO_HALF: %dx.isinf = call <3 x i1> @llvm.dx.isinf.v3f32(
+// CHECK: ret <3 x i1> %dx.isinf
+bool3 test_isinf_half3(half3 p0) { return isinf(p0); }
+// NATIVE_HALF: define noundef <4 x i1> @
+// NATIVE_HALF: %dx.isinf = call <4 x i1> @llvm.dx.isinf.v4f16
+// NO_HALF: %dx.isinf = call <4 x i1> @llvm.dx.isinf.v4f32(
+// CHECK: ret <4 x i1> %dx.isinf
+bool4 test_isinf_half4(half4 p0) { return isinf(p0); }
+
+// CHECK: define noundef i1 @
+// CHECK: %dx.isinf = call i1 @llvm.dx.isinf.f32(
+// CHECK: ret i1 %dx.isinf
+bool test_isinf_float(float p0) { return isinf(p0); }
+// CHECK: define noundef <2 x i1> @
+// CHECK: %dx.isinf = call <2 x i1> @llvm.dx.isinf.v2f32
+// CHECK: ret <2 x i1> %dx.isinf
+bool2 test_isinf_float2(float2 p0) { return isinf(p0); }
+// CHECK: define noundef <3 x i1> @
+// CHECK: %dx.isinf = call <3 x i1> @llvm.dx.isinf.v3f32
+// CHECK: ret <3 x i1> %dx.isinf
+bool3 test_isinf_float3(float3 p0) { return isinf(p0); }
+// CHECK: define noundef <4 x i1> @
+// CHECK: %dx.isinf = call <4 x i1> @llvm.dx.isinf.v4f32
+// CHECK: ret <4 x i1> %dx.isinf
+bool4 test_isinf_float4(float4 p0) { return isinf(p0); }
diff --git a/clang/test/CodeGenHLSL/builtins/lerp-builtin.hlsl b/clang/test/CodeGenHLSL/builtins/lerp-builtin.hlsl
index 1f16dec68212e4..2fd5a19fc33521 100644
--- a/clang/test/CodeGenHLSL/builtins/lerp-builtin.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/lerp-builtin.hlsl
@@ -1,27 +1,5 @@
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s
 
-
-
-// CHECK-LABEL: builtin_lerp_half_scalar
-// CHECK: %3 = fsub double %conv1, %conv
-// CHECK: %4 = fmul double %conv2, %3
-// CHECK: %dx.lerp = fadd double %conv, %4
-// CHECK: %conv3 = fptrunc double %dx.lerp to half
-// CHECK: ret half %conv3
-half builtin_lerp_half_scalar (half p0) {
-  return __builtin_hlsl_lerp ( p0, p0, p0 );
-}
-
-// CHECK-LABEL: builtin_lerp_float_scalar
-// CHECK: %3 = fsub double %conv1, %conv
-// CHECK: %4 = fmul double %conv2, %3
-// CHECK: %dx.lerp = fadd double %conv, %4
-// CHECK: %conv3 = fptrunc double %dx.lerp to float
-// CHECK: ret float %conv3
-float builtin_lerp_float_scalar ( float p0) {
-  return __builtin_hlsl_lerp ( p0, p0, p0 );
-}
-
 // CHECK-LABEL: builtin_lerp_half_vector
 // CHECK: %dx.lerp = call <3 x half> @llvm.dx.lerp.v3f16(<3 x half> %0, <3 x half> %1, <3 x half> %2)
 // CHECK: ret <3 x half> %dx.lerp
diff --git a/clang/test/CodeGenHLSL/builtins/lerp.hlsl b/clang/test/CodeGenHLSL/builtins/lerp.hlsl
index a6b3d9643d674c..49cd04a10115ae 100644
--- a/clang/test/CodeGenHLSL/builtins/lerp.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/lerp.hlsl
@@ -6,13 +6,10 @@
 // RUN:   dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \
 // RUN:   -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF
 
-// NATIVE_HALF: %3 = fsub half %1, %0
-// NATIVE_HALF: %4 = fmul half %2, %3
-// NATIVE_HALF: %dx.lerp = fadd half %0, %4
+
+// NATIVE_HALF: %dx.lerp = call half @llvm.dx.lerp.f16(half %0, half %1, half %2)
 // NATIVE_HALF: ret half %dx.lerp
-// NO_HALF: %3 = fsub float %1, %0
-// NO_HALF: %4 = fmul float %2, %3
-// NO_HALF: %dx.lerp = fadd float %0, %4
+// NO_HALF: %dx.lerp = call float @llvm.dx.lerp.f32(float %0, float %1, float %2)
 // NO_HALF: ret float %dx.lerp
 half test_lerp_half(half p0) { return lerp(p0, p0, p0); }
 
@@ -20,37 +17,35 @@ half test_lerp_half(half p0) { return lerp(p0, p0, p0); }
 // NATIVE_HALF: ret <2 x half> %dx.lerp
 // NO_HALF: %dx.lerp = call <2 x float> @llvm.dx.lerp.v2f32(<2 x float> %0, <2 x float> %1, <2 x float> %2)
 // NO_HALF: ret <2 x float> %dx.lerp
-half2 test_lerp_half2(half2 p0, half2 p1) { return lerp(p0, p0, p0); }
+half2 test_lerp_half2(half2 p0) { return lerp(p0, p0, p0); }
 
 // NATIVE_HALF: %dx.lerp = call <3 x half> @llvm.dx.lerp.v3f16(<3 x half> %0, <3 x half> %1, <3 x half> %2)
 // NATIVE_HALF: ret <3 x half> %dx.lerp
 // NO_HALF: %dx.lerp = call <3 x float> @llvm.dx.lerp.v3f32(<3 x float> %0, <3 x float> %1, <3 x float> %2)
 // NO_HALF: ret <3 x float> %dx.lerp
-half3 test_lerp_half3(half3 p0, half3 p1) { return lerp(p0, p0, p0); }
+half3 test_lerp_half3(half3 p0) { return lerp(p0, p0, p0); }
 
 // NATIVE_HALF: %dx.lerp = call <4 x half> @llvm.dx.lerp.v4f16(<4 x half> %0, <4 x half> %1, <4 x half> %2)
 // NATIVE_HALF: ret <4 x half> %dx.lerp
 // NO_HALF: %dx.lerp = call <4 x float> @llvm.dx.lerp.v4f32(<4 x float> %0, <4 x float> %1, <4 x float> %2)
 // NO_HALF: ret <4 x float> %dx.lerp
-half4 test_lerp_half4(half4 p0, half4 p1) { return lerp(p0, p0, p0); }
+half4 test_lerp_half4(half4 p0) { return lerp(p0, p0, p0); }
 
-// CHECK: %3 = fsub float %1, %0
-// CHECK: %4 = fmul float %2, %3
-// CHECK: %dx.lerp = fadd float %0, %4
+// CHECK: %dx.lerp = call float @llvm.dx.lerp.f32(float %0, float %1, float %2)
 // CHECK: ret float %dx.lerp
-float test_lerp_float(float p0, float p1) { return lerp(p0, p0, p0); }
+float test_lerp_float(float p0) { return lerp(p0, p0, p0); }
 
 // CHECK: %dx.lerp = call <2 x float> @llvm.dx.lerp.v2f32(<2 x float> %0, <2 x float> %1, <2 x float> %2)
 // CHECK: ret <2 x float> %dx.lerp
-float2 test_lerp_float2(float2 p0, float2 p1) { return lerp(p0, p0, p0); }
+float2 test_lerp_float2(float2 p0) { return lerp(p0, p0, p0); }
 
 // CHECK: %dx.lerp = call <3 x float> @llvm.dx.lerp.v3f32(<3 x float> %0, <3 x float> %1, <3 x float> %2)
 // CHECK: ret <3 x float> %dx.lerp
-float3 test_lerp_float3(float3 p0, float3 p1) { return lerp(p0, p0, p0); }
+float3 test_lerp_float3(float3 p0) { return lerp(p0, p0, p0); }
 
 // CHECK: %dx.lerp = call <4 x float> @llvm.dx.lerp.v4f32(<4 x float> %0, <4 x float> %1, <4 x float> %2)
 // CHECK: ret <4 x float> %dx.lerp
-float4 test_lerp_float4(float4 p0, float4 p1) { return lerp(p0, p0, p0); }
+float4 test_lerp_float4(float4 p0) { return lerp(p0, p0, p0); }
 
 // CHECK: %dx.lerp = call <2 x float> @llvm.dx.lerp.v2f32(<2 x float> %splat.splat, <2 x float> %1, <2 x float> %2)
 // CHECK: ret <2 x float> %dx.lerp
diff --git a/clang/test/CodeGenHLSL/builtins/rsqrt.hlsl b/clang/test/CodeGenHLSL/builtins/rsqrt.hlsl
new file mode 100644
index 00000000000000..c87a8c404b08e1
--- /dev/null
+++ b/clang/test/CodeGenHLSL/builtins/rsqrt.hlsl
@@ -0,0 +1,53 @@
+// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
+// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
+// RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ 
+// RUN:   --check-prefixes=CHECK,NATIVE_HALF
+// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
+// RUN:   dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \
+// RUN:   -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF
+
+// NATIVE_HALF: define noundef half @
+// NATIVE_HALF: %dx.rsqrt = call half @llvm.dx.rsqrt.f16(
+// NATIVE_HALF: ret half %dx.rsqrt
+// NO_HALF: define noundef float @"?test_rsqrt_half@@YA$halff@$halff@@Z"(
+// NO_HALF: %dx.rsqrt = call float @llvm.dx.rsqrt.f32(
+// NO_HALF: ret float %dx.rsqrt
+half test_rsqrt_half(half p0) { return rsqrt(p0); }
+// NATIVE_HALF: define noundef <2 x half> @
+// NATIVE_HALF: %dx.rsqrt = call <2 x half> @llvm.dx.rsqrt.v2f16
+// NATIVE_HALF: ret <2 x half> %dx.rsqrt
+// NO_HALF: define noundef <2 x float> @
+// NO_HALF: %dx.rsqrt = call <2 x float> @llvm.dx.rsqrt.v2f32(
+// NO_HALF: ret <2 x float> %dx.rsqrt
+half2 test_rsqrt_half2(half2 p0) { return rsqrt(p0); }
+// NATIVE_HALF: define noundef <3 x half> @
+// NATIVE_HALF: %dx.rsqrt = call <3 x half> @llvm.dx.rsqrt.v3f16
+// NATIVE_HALF: ret <3 x half> %dx.rsqrt
+// NO_HALF: define noundef <3 x float> @
+// NO_HALF: %dx.rsqrt = call <3 x float> @llvm.dx.rsqrt.v3f32(
+// NO_HALF: ret <3 x float> %dx.rsqrt
+half3 test_rsqrt_half3(half3 p0) { return rsqrt(p0); }
+// NATIVE_HALF: define noundef <4 x half> @
+// NATIVE_HALF: %dx.rsqrt = call <4 x half> @llvm.dx.rsqrt.v4f16
+// NATIVE_HALF: ret <4 x half> %dx.rsqrt
+// NO_HALF: define noundef <4 x float> @
+// NO_HALF: %dx.rsqrt = call <4 x float> @llvm.dx.rsqrt.v4f32(
+// NO_HALF: ret <4 x float> %dx.rsqrt
+half4 test_rsqrt_half4(half4 p0) { return rsqrt(p0); }
+
+// CHECK: define noundef float @
+// CHECK: %dx.rsqrt = call float @llvm.dx.rsqrt.f32(
+// CHECK: ret float %dx.rsqrt
+float test_rsqrt_float(float p0) { return rsqrt(p0); }
+// CHECK: define noundef <2 x float> @
+// CHECK: %dx.rsqrt = call <2 x float> @llvm.dx.rsqrt.v2f32
+// CHECK: ret <2 x float> %dx.rsqrt
+float2 test_rsqrt_float2(float2 p0) { return rsqrt(p0); }
+// CHECK: define noundef <3 x float> @
+// CHECK: %dx.rsqrt = call <3 x float> @llvm.dx.rsqrt.v3f32
+// CHECK: ret <3 x float> %dx.rsqrt
+float3 test_rsqrt_float3(float3 p0) { return rsqrt(p0); }
+// CHECK: define noundef <4 x float> @
+// CHECK: %dx.rsqrt = call <4 x float> @llvm.dx.rsqrt.v4f32
+// CHECK: ret <4 x float> %dx.rsqrt
+float4 test_rsqrt_float4(float4 p0) { return rsqrt(p0); }
diff --git a/clang/test/CodeGenObjC/debug-info-blocks.m b/clang/test/CodeGenObjC/debug-info-blocks.m
index 14b29f222fbe8e..59171da016da1e 100644
--- a/clang/test/CodeGenObjC/debug-info-blocks.m
+++ b/clang/test/CodeGenObjC/debug-info-blocks.m
@@ -5,8 +5,8 @@
 
 // CHECK: define {{.*}}_block_invoke
 // CHECK: store ptr %.block_descriptor, ptr %[[ALLOCA:block.addr]], align
-// CHECK-NEXT: call void @llvm.dbg.declare(metadata ptr %[[ALLOCA]], metadata ![[SELF:[0-9]+]], metadata !{{.*}})
 // CHECK-NEXT: call void @llvm.dbg.declare(metadata ptr %d, metadata ![[D:[0-9]+]], metadata !{{.*}})
+// CHECK-NEXT: call void @llvm.dbg.declare(metadata ptr %[[ALLOCA]], metadata ![[SELF:[0-9]+]], metadata !{{.*}})
 
 // Test that we do emit scope info for the helper functions, and that the
 // parameters to these functions are marked as artificial (so the debugger
diff --git a/clang/test/Driver/aarch64-mcpu.c b/clang/test/Driver/aarch64-mcpu.c
index cacfc691058d13..77ba43122b2453 100644
--- a/clang/test/Driver/aarch64-mcpu.c
+++ b/clang/test/Driver/aarch64-mcpu.c
@@ -56,6 +56,8 @@
 // CORTEX-A715: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "cortex-a715"
 // RUN: %clang --target=aarch64 -mcpu=cortex-a720  -### -c %s 2>&1 | FileCheck -check-prefix=CORTEX-A720 %s
 // CORTEX-A720: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "cortex-a720"
+// RUN: %clang --target=aarch64 -mcpu=cortex-a720ae  -### -c %s 2>&1 | FileCheck -check-prefix=CORTEX-A720AE %s
+// CORTEX-A720AE: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "cortex-a720ae"
 // RUN: %clang --target=aarch64 -mcpu=neoverse-e1  -### -c %s 2>&1 | FileCheck -check-prefix=NEOVERSE-E1 %s
 // NEOVERSE-E1: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "neoverse-e1"
 // RUN: %clang --target=aarch64 -mcpu=neoverse-v1  -### -c %s 2>&1 | FileCheck -check-prefix=NEOVERSE-V1 %s
@@ -70,6 +72,8 @@
 // NEOVERSE-512TVB: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "neoverse-512tvb"
 // RUN: %clang --target=aarch64 -mcpu=cortex-a520 -### -c %s 2>&1 | FileCheck -check-prefix=CORTEX-A520 %s
 // CORTEX-A520: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "cortex-a520"
+// RUN: %clang --target=aarch64 -mcpu=cortex-a520ae -### -c %s 2>&1 | FileCheck -check-prefix=CORTEX-A520AE %s
+// CORTEX-A520AE: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "cortex-a520ae"
 
 // RUN: %clang --target=aarch64 -mcpu=cortex-r82  -### -c %s 2>&1 | FileCheck -check-prefix=CORTEXR82 %s
 // CORTEXR82: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "cortex-r82"
diff --git a/clang/test/Driver/aarch64-soft-float-abi.c b/clang/test/Driver/aarch64-soft-float-abi.c
new file mode 100644
index 00000000000000..0486d94e66072e
--- /dev/null
+++ b/clang/test/Driver/aarch64-soft-float-abi.c
@@ -0,0 +1,26 @@
+// REQUIRES: aarch64-registered-target
+
+// Hard-float, valid
+// RUN: %clang --target=aarch64-none-elf                               -c %s -o /dev/null
+// RUN: %clang --target=aarch64-none-elf                   -mabi=aapcs -c %s -o /dev/null
+// RUN: %clang --target=aarch64-none-elf -march=armv8-r                -c %s -o /dev/null
+// RUN: %clang --target=aarch64-none-elf -march=armv8-r    -mabi=aapcs -c %s -o /dev/null
+// RUN: %clang --target=aarch64-none-elf -march=armv8-r+fp -mabi=aapcs -c %s -o /dev/null
+
+// Soft-float, no FP
+// RUN: %clang --target=aarch64-none-elf -march=armv8-r+nofp -mabi=aapcs-soft -c %s -o /dev/null
+// RUN: %clang --target=aarch64-none-elf -mgeneral-regs-only -mabi=aapcs-soft -c %s -o /dev/null
+
+// Soft-float, FP hardware: Rejected, to avoid having two incompatible ABIs for common targets.
+// RUN: not %clang --target=aarch64-none-elf                        -mabi=aapcs-soft -c %s -o /dev/null 2>&1 | FileCheck %s --check-prefix=INVALID-SOFT
+// RUN: not %clang --target=aarch64-none-elf -march=armv8-r+fp      -mabi=aapcs-soft -c %s -o /dev/null 2>&1 | FileCheck %s --check-prefix=INVALID-SOFT
+// RUN: not %clang --target=aarch64-none-elf -march=armv8-r+nofp+fp -mabi=aapcs-soft -c %s -o /dev/null 2>&1 | FileCheck %s --check-prefix=INVALID-SOFT
+
+// No FP, hard-float. This is accepted by the driver, but functions which
+// require arguments or returns to be passed in FP registers will be rejected
+// (tested elsewhere).
+// RUN: %clang --target=aarch64-none-elf -march=armv8-r+nofp             -c %s -o /dev/null
+// RUN: %clang --target=aarch64-none-elf -march=armv8-r+nofp -mabi=aapcs -c %s -o /dev/null
+// RUN: %clang --target=aarch64-none-elf -mgeneral-regs-only -mabi=aapcs -c %s -o /dev/null
+
+// INVALID-SOFT: error: 'aapcs-soft' ABI is not supported with FPU
diff --git a/clang/test/Driver/apple-kext-mkernel.c b/clang/test/Driver/apple-kext-mkernel.c
index f03ed4a09b47d3..de905e169c139e 100644
--- a/clang/test/Driver/apple-kext-mkernel.c
+++ b/clang/test/Driver/apple-kext-mkernel.c
@@ -13,8 +13,8 @@
 // CHECK-X86-2: "-fno-rtti"
 // CHECK-X86-2-NOT: "-fno-common"
 
-// RUN: not %clang -target x86_64-apple-darwin11 -arch armv7 -mkernel -mstrict-align -### -fsyntax-only %s 2>&1 | FileCheck --check-prefix=CHECK-ARM %s
-// RUN: not %clang -target x86_64-apple-darwin11 -arch armv7 -mkernel -mstrict-align -### -fsyntax-only -fbuiltin -fno-builtin -fcommon -fno-common %s 2>&1 | FileCheck --check-prefix=CHECK-ARM %s
+// RUN: %clang --target=x86_64-apple-darwin11 -arch armv7 -mkernel -mstrict-align -### -fsyntax-only %s 2>&1 | FileCheck --check-prefix=CHECK-ARM %s
+// RUN: %clang --target=x86_64-apple-darwin11 -arch armv7 -mkernel -mstrict-align -### -fsyntax-only -fbuiltin -fno-builtin -fcommon -fno-common %s 2>&1 | FileCheck --check-prefix=CHECK-ARM %s
 
 // CHECK-ARM: "-target-feature" "+long-calls"
 // CHECK-ARM: "-target-feature" "+strict-align"
diff --git a/clang/test/Driver/clang-g-opts.c b/clang/test/Driver/clang-g-opts.c
index b73602a155b009..fdbe0b96420c51 100644
--- a/clang/test/Driver/clang-g-opts.c
+++ b/clang/test/Driver/clang-g-opts.c
@@ -42,8 +42,3 @@
 
 // CHECK-WITH-G-STANDALONE: "-debug-info-kind=standalone"
 // CHECK-WITH-G-STANDALONE: "-dwarf-version=2"
-
-/// TODO: Special case before R_RISCV_SET_ULEB128 linker support becomes more widely available.
-// RUN: %clang -### -S %s -g --target=riscv64-linux-gnu 2>&1 | FileCheck --check-prefix=VERSION4 %s
-// RUN: %clang -### -S %s -g --target=riscv64-unknown-elf 2>&1 | FileCheck --check-prefix=VERSION4 %s
-// VERSION4: "-dwarf-version=4"
diff --git a/clang/test/Driver/cuda-cross-compiling.c b/clang/test/Driver/cuda-cross-compiling.c
index 086840accebe7f..a1719a6fbe042b 100644
--- a/clang/test/Driver/cuda-cross-compiling.c
+++ b/clang/test/Driver/cuda-cross-compiling.c
@@ -80,11 +80,15 @@
 //
 // RUN: not %clang -target nvptx64-nvidia-cuda %s -### 2>&1 \
 // RUN:   | FileCheck -check-prefix=MISSING %s
+// RUN: not %clang -target nvptx64-nvidia-cuda -march=generic %s -### 2>&1 \
+// RUN:   | FileCheck -check-prefix=MISSING %s
 
 // MISSING: error: Must pass in an explicit nvptx64 gpu architecture to 'ptxas'
 // MISSING: error: Must pass in an explicit nvptx64 gpu architecture to 'nvlink'
 
 // RUN: %clang -target nvptx64-nvidia-cuda -flto -c %s -### 2>&1 \
 // RUN:   | FileCheck -check-prefix=GENERIC %s
+// RUN: %clang -target nvptx64-nvidia-cuda -march=sm_52 -march=generic -flto -c %s -### 2>&1 \
+// RUN:   | FileCheck -check-prefix=GENERIC %s
 
 // GENERIC-NOT: -cc1" "-triple" "nvptx64-nvidia-cuda" {{.*}} "-target-cpu"
diff --git a/clang/test/Driver/fsanitize-signed-integer-overflow.c b/clang/test/Driver/fsanitize-signed-integer-overflow.c
new file mode 100644
index 00000000000000..4a9345493b6aa8
--- /dev/null
+++ b/clang/test/Driver/fsanitize-signed-integer-overflow.c
@@ -0,0 +1,28 @@
+/// When -fwrapv (implied by -fno-strict-overflow) is enabled,
+/// -fsanitize=undefined does not expand to signed-integer-overflow.
+/// -fsanitize=signed-integer-overflow is unaffected by -fwrapv.
+
+// RUN: %clang -### --target=x86_64-linux -fwrapv -fsanitize=signed-integer-overflow %s 2>&1 | FileCheck %s
+// CHECK: -fsanitize=signed-integer-overflow
+// CHECK: -fsanitize-recover=signed-integer-overflow
+
+// RUN: %clang -### --target=x86_64-linux -fno-strict-overflow -fsanitize=undefined %s 2>&1 | FileCheck %s --check-prefix=EXCLUDE
+// RUN: %clang -### --target=x86_64-linux -fstrict-overflow -fwrapv -fsanitize=undefined %s 2>&1 | FileCheck %s --check-prefix=EXCLUDE
+// EXCLUDE:     -fsanitize=alignment,array-bounds,
+// EXCLUDE-NOT: signed-integer-overflow,
+// EXCLUDE:      -fsanitize-recover=alignment,array-bounds,
+// EXCLUDE-SAME: signed-integer-overflow
+
+// RUN: %clang -### --target=x86_64-linux -fwrapv -fsanitize=undefined -fsanitize=signed-integer-overflow %s 2>&1 | FileCheck %s --check-prefix=INCLUDE
+// RUN: %clang -### --target=x86_64-linux -fno-strict-overflow -fno-sanitize=signed-integer-overflow -fsanitize=undefined -fsanitize=signed-integer-overflow %s 2>&1 | FileCheck %s --check-prefix=INCLUDE
+// INCLUDE:      -fsanitize=alignment,array-bounds,
+// INCLUDE-SAME: signed-integer-overflow
+// INCLUDE:      -fsanitize-recover=alignment,array-bounds,
+// INCLUDE-SAME: signed-integer-overflow
+
+/// -fsanitize-trap=undefined expands to signed-integer-overflow regardless of -fwrapv.
+// RUN: %clang -### --target=x86_64-linux -fwrapv -fsanitize=undefined -fsanitize=signed-integer-overflow -fsanitize-trap=undefined %s 2>&1 | FileCheck %s --check-prefix=INCLUDE-TRAP
+// INCLUDE-TRAP:      -fsanitize=alignment,array-bounds,
+// INCLUDE-TRAP-SAME: signed-integer-overflow
+// INCLUDE-TRAP:      -fsanitize-trap=alignment,array-bounds,
+// INCLUDE-TRAP-SAME: signed-integer-overflow
diff --git a/clang/test/Driver/hexagon-default-build-attributes.s b/clang/test/Driver/hexagon-default-build-attributes.s
new file mode 100644
index 00000000000000..58f246c2f805da
--- /dev/null
+++ b/clang/test/Driver/hexagon-default-build-attributes.s
@@ -0,0 +1,21 @@
+/// Enabled by default for assembly
+// RUN: %clang --target=hexagon-unknown-elf -### %s 2>&1 \
+// RUN:    | FileCheck %s -check-prefix CHECK-ENABLED
+
+/// Can be forced on or off for assembly.
+// RUN: %clang --target=hexagon-unknown-elf -### %s 2>&1 -mno-default-build-attributes \
+// RUN:    | FileCheck %s -check-prefix CHECK-DISABLED
+// RUN: %clang --target=hexagon-unknown-elf -### %s 2>&1 -mdefault-build-attributes \
+// RUN:    | FileCheck %s -check-prefix CHECK-ENABLED
+
+/// Option ignored C/C++ (since we always emit hardware and ABI build attributes
+/// during codegen).
+// RUN: %clang --target=hexagon-unknown-elf -### -x c %s -mdefault-build-attributes 2>&1 \
+// RUN:    | FileCheck %s -check-prefix CHECK-DISABLED-C
+// RUN: %clang --target=hexagon-unknown-elf -### -x c++ %s -mdefault-build-attributes 2>&1 \
+// RUN:    | FileCheck %s -check-prefix CHECK-DISABLED-C
+
+// CHECK-DISABLED-NOT: "-hexagon-add-build-attributes"
+// CHECK-DISABLED-C-NOT: "-hexagon-add-build-attributes"
+// CHECK-ENABLED: "-hexagon-add-build-attributes"
+// CHECK-DISABLED-C: argument unused during compilation: '-mdefault-build-attributes'
diff --git a/clang/test/Driver/hip-partial-link.hip b/clang/test/Driver/hip-partial-link.hip
index faa185972abc33..c8451ec81ed37e 100644
--- a/clang/test/Driver/hip-partial-link.hip
+++ b/clang/test/Driver/hip-partial-link.hip
@@ -47,7 +47,7 @@
 // OBJ:  D __hip_gpubin_handle_[[ID2]]
 
 // RUN: %clang -v --target=x86_64-unknown-linux-gnu --no-offload-new-driver \
-// RUN:   --hip-link -no-hip-rt -fgpu-rdc --offload-arch=gfx906 \
+// RUN:   --hip-link -fgpu-rdc --offload-arch=gfx906 \
 // RUN:   -fuse-ld=lld -nostdlib -r %t.main.o %t.lib.o -o %t.final.o \
 // RUN:   2>&1 | FileCheck -check-prefix=LINK-O %s
 // LINK-O-NOT: Found undefined HIP {{.*}}symbol
diff --git a/clang/test/Driver/hip-runtime-libs-linux.hip b/clang/test/Driver/hip-runtime-libs-linux.hip
index 142582963c958f..a4cd2733114b69 100644
--- a/clang/test/Driver/hip-runtime-libs-linux.hip
+++ b/clang/test/Driver/hip-runtime-libs-linux.hip
@@ -43,6 +43,11 @@
 // RUN:   --rocm-path=%S/Inputs/rocm %t.o 2>&1 \
 // RUN:   | FileCheck -check-prefixes=NOHIPRT %s
 
+// Test HIP runtime lib is not linked with -r.
+// RUN: %clang -### --hip-link -r --target=x86_64-linux-gnu \
+// RUN:   --rocm-path=%S/Inputs/rocm %t.o 2>&1 \
+// RUN:   | FileCheck -check-prefixes=NOHIPRT %s
+
 // Test HIP runtime lib is linked without hip-link if there is HIP input file.
 // RUN: %clang -### --target=x86_64-linux-gnu -nogpuinc -nogpulib \
 // RUN:   --rocm-path=%S/Inputs/rocm %s 2>&1 \
diff --git a/clang/test/Driver/loongarch-munaligned-access.c b/clang/test/Driver/loongarch-munaligned-access.c
index 44edb2eb17e6ab..a545679949973a 100644
--- a/clang/test/Driver/loongarch-munaligned-access.c
+++ b/clang/test/Driver/loongarch-munaligned-access.c
@@ -1,54 +1,25 @@
 /// Test -m[no-]unaligned-access and -m[no-]strict-align options.
 
-// RUN: %clang --target=loongarch64 -munaligned-access -fsyntax-only %s -### 2>&1 | \
-// RUN:   FileCheck %s --check-prefix=CC1-UNALIGNED
-// RUN: %clang --target=loongarch64 -mno-unaligned-access -fsyntax-only %s -### 2>&1 | \
-// RUN:   FileCheck %s --check-prefix=CC1-NO-UNALIGNED
 // RUN: %clang --target=loongarch64 -mstrict-align -fsyntax-only %s -### 2>&1 | \
 // RUN:   FileCheck %s --check-prefix=CC1-NO-UNALIGNED
 // RUN: %clang --target=loongarch64 -mno-strict-align -fsyntax-only %s -### 2>&1 | \
 // RUN:   FileCheck %s --check-prefix=CC1-UNALIGNED
-// RUN: %clang --target=loongarch64 -munaligned-access -mno-unaligned-access -fsyntax-only %s -### 2>&1 | \
-// RUN:   FileCheck %s --check-prefix=CC1-NO-UNALIGNED
-// RUN: %clang --target=loongarch64 -mno-unaligned-access -munaligned-access -fsyntax-only %s -### 2>&1 | \
-// RUN:   FileCheck %s --check-prefix=CC1-UNALIGNED
 // RUN: %clang --target=loongarch64 -mstrict-align -mno-strict-align -fsyntax-only %s -### 2>&1 | \
 // RUN:   FileCheck %s --check-prefix=CC1-UNALIGNED
 // RUN: %clang --target=loongarch64 -mno-strict-align -mstrict-align -fsyntax-only %s -### 2>&1 | \
 // RUN:   FileCheck %s --check-prefix=CC1-NO-UNALIGNED
-// RUN: %clang --target=loongarch64 -munaligned-access -mstrict-align -fsyntax-only %s -### 2>&1 | \
-// RUN:   FileCheck %s --check-prefix=CC1-NO-UNALIGNED
-// RUN: %clang --target=loongarch64 -mstrict-align -munaligned-access -fsyntax-only %s -### 2>&1 | \
-// RUN:   FileCheck %s --check-prefix=CC1-UNALIGNED
-// RUN: %clang --target=loongarch64 -mno-unaligned-access -mno-strict-align -fsyntax-only %s -### 2>&1 | \
-// RUN:   FileCheck %s --check-prefix=CC1-UNALIGNED
-// RUN: %clang --target=loongarch64 -mno-strict-align -mno-unaligned-access -fsyntax-only %s -### 2>&1 | \
-// RUN:   FileCheck %s --check-prefix=CC1-NO-UNALIGNED
 
-// RUN: %clang --target=loongarch64 -munaligned-access -S -emit-llvm %s -o - | \
-// RUN:   FileCheck %s --check-prefix=IR-UNALIGNED
-// RUN: %clang --target=loongarch64 -mno-unaligned-access -S -emit-llvm %s -o - | \
-// RUN:   FileCheck %s --check-prefix=IR-NO-UNALIGNED
 // RUN: %clang --target=loongarch64 -mstrict-align -S -emit-llvm %s -o - | \
 // RUN:   FileCheck %s --check-prefix=IR-NO-UNALIGNED
 // RUN: %clang --target=loongarch64 -mno-strict-align -S -emit-llvm %s -o - | \
 // RUN:   FileCheck %s --check-prefix=IR-UNALIGNED
-// RUN: %clang --target=loongarch64 -munaligned-access -mno-unaligned-access -S -emit-llvm %s -o - | \
-// RUN:   FileCheck %s --check-prefix=IR-NO-UNALIGNED
-// RUN: %clang --target=loongarch64 -mno-unaligned-access -munaligned-access -S -emit-llvm %s -o - | \
-// RUN:   FileCheck %s --check-prefix=IR-UNALIGNED
 // RUN: %clang --target=loongarch64 -mstrict-align -mno-strict-align -S -emit-llvm %s -o - | \
 // RUN:   FileCheck %s --check-prefix=IR-UNALIGNED
 // RUN: %clang --target=loongarch64 -mno-strict-align -mstrict-align -S -emit-llvm %s -o - | \
 // RUN:   FileCheck %s --check-prefix=IR-NO-UNALIGNED
-// RUN: %clang --target=loongarch64 -munaligned-access -mstrict-align -S -emit-llvm %s -o - | \
-// RUN:   FileCheck %s --check-prefix=IR-NO-UNALIGNED
-// RUN: %clang --target=loongarch64 -mstrict-align -munaligned-access -S -emit-llvm %s -o - | \
-// RUN:   FileCheck %s --check-prefix=IR-UNALIGNED
-// RUN: %clang --target=loongarch64 -mno-unaligned-access -mno-strict-align -S -emit-llvm %s -o - | \
-// RUN:   FileCheck %s --check-prefix=IR-UNALIGNED
-// RUN: %clang --target=loongarch64 -mno-strict-align -mno-unaligned-access -S -emit-llvm %s -o - | \
-// RUN:   FileCheck %s --check-prefix=IR-NO-UNALIGNED
+
+// RUN: not %clang -### --target=loongarch64 -mno-unaligned-access -munaligned-access %s 2>&1 | \
+// RUN:   FileCheck %s --check-prefix=ERR
 
 // CC1-UNALIGNED: "-target-feature" "+ual"
 // CC1-NO-UNALIGNED: "-target-feature" "-ual"
@@ -56,6 +27,9 @@
 // IR-UNALIGNED: attributes #[[#]] ={{.*}}"target-features"="{{(.*,)?}}+ual{{(,.*)?}}"
 // IR-NO-UNALIGNED: attributes #[[#]] ={{.*}}"target-features"="{{(.*,)?}}-ual{{(,.*)?}}"
 
+// ERR: error: unsupported option '-mno-unaligned-access' for target 'loongarch64'
+// ERR: error: unsupported option '-munaligned-access' for target 'loongarch64'
+
 int foo(void) {
   return 3;
 }
diff --git a/clang/test/Driver/mips-features.c b/clang/test/Driver/mips-features.c
index fd06b1400c3123..5e92dccaa02abb 100644
--- a/clang/test/Driver/mips-features.c
+++ b/clang/test/Driver/mips-features.c
@@ -462,3 +462,29 @@
 // RUN:     -mrelax-pic-calls -mno-relax-pic-calls 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-NO-RELAX-PIC-CALLS %s
 // CHECK-NO-RELAX-PIC-CALLS: "-mllvm" "-mips-jalr-reloc=0"
+//
+// -mno-unaligned-access
+// RUN: %clang -target mips-unknown-linux-gnu -### -c %s \
+// RUN:     -munaligned-access -mno-strict-align \
+// RUN:     -mno-unaligned-access 2>&1 \
+// RUN:   | FileCheck --check-prefix=CHECK-STRICT-ALIGN %s
+// CHECK-STRICT-ALIGN: "-target-feature" "+strict-align"
+//
+// -munaligned-access
+// RUN: %clang -target mips-unknown-linux-gnu -### -c %s \
+// RUN:     -mno-unaligned-access -mstrict-align \
+// RUN:     -munaligned-access 2>&1 \
+// RUN:   | FileCheck --check-prefix=CHECK-NO-STRICT-ALIGN %s
+// CHECK-NO-STRICT-ALIGN: "-target-feature" "-strict-align"
+//
+// -mstrict-align
+// RUN: %clang -target mips-unknown-linux-gnu -### -c %s \
+// RUN:     -munaligned-access -mno-strict-align \
+// RUN:     -mstrict-align 2>&1 \
+// RUN:   | FileCheck --check-prefix=CHECK-STRICT-ALIGN %s
+//
+// -mno-strict-align
+// RUN: %clang -target mips-unknown-linux-gnu -### -c %s \
+// RUN:     -mno-unaligned-access -mstrict-align \
+// RUN:     -mno-strict-align 2>&1 \
+// RUN:   | FileCheck --check-prefix=CHECK-NO-STRICT-ALIGN %s
diff --git a/clang/test/Driver/modules-print-library-module-manifest-path.cpp b/clang/test/Driver/modules-print-library-module-manifest-path.cpp
new file mode 100644
index 00000000000000..24797002b80f53
--- /dev/null
+++ b/clang/test/Driver/modules-print-library-module-manifest-path.cpp
@@ -0,0 +1,36 @@
+// Test that -print-library-module-manifest-path finds the correct file.
+
+// RUN: rm -rf %t && split-file %s %t && cd %t
+// RUN: mkdir -p %t/Inputs/usr/lib/x86_64-linux-gnu
+// RUN: touch %t/Inputs/usr/lib/x86_64-linux-gnu/libc++.so
+
+// RUN: %clang -print-library-module-manifest-path \
+// RUN:     -stdlib=libc++ \
+// RUN:     -resource-dir=%t/Inputs/usr/lib/x86_64-linux-gnu \
+// RUN:     --target=x86_64-linux-gnu 2>&1 \
+// RUN:   | FileCheck libcxx-no-module-json.cpp
+
+// RUN: touch %t/Inputs/usr/lib/x86_64-linux-gnu/modules.json
+// RUN: %clang -print-library-module-manifest-path \
+// RUN:     -stdlib=libc++ \
+// RUN:     -resource-dir=%t/Inputs/usr/lib/x86_64-linux-gnu \
+// RUN:     --target=x86_64-linux-gnu 2>&1 \
+// RUN:   | FileCheck libcxx.cpp
+
+// RUN: %clang -print-library-module-manifest-path \
+// RUN:     -stdlib=libstdc++ \
+// RUN:     -resource-dir=%t/Inputs/usr/lib/x86_64-linux-gnu \
+// RUN:     --target=x86_64-linux-gnu 2>&1 \
+// RUN:   | FileCheck libstdcxx.cpp
+
+//--- libcxx-no-module-json.cpp
+
+// CHECK: <NOT PRESENT>
+
+//--- libcxx.cpp
+
+// CHECK: {{.*}}/Inputs/usr/lib/x86_64-linux-gnu{{/|\\}}modules.json
+
+//--- libstdcxx.cpp
+
+// CHECK: <NOT PRESENT>
diff --git a/clang/test/Driver/munaligned-access-unused.c b/clang/test/Driver/munaligned-access-unused.c
index 1d86edb798ef2d..4060b53c42fe5f 100644
--- a/clang/test/Driver/munaligned-access-unused.c
+++ b/clang/test/Driver/munaligned-access-unused.c
@@ -1,8 +1,9 @@
-/// Check -m[no-]unaligned-access and -m[no-]strict-align are warned unused on a target that does not support them.
+/// Check -m[no-]unaligned-access and -m[no-]strict-align are errored on a target that does not support them.
 
 // RUN: not %clang --target=x86_64 -munaligned-access -fsyntax-only %s -### 2>&1 | FileCheck %s -DOPTION=unaligned-access
 // RUN: not %clang --target=x86_64 -mno-unaligned-access -fsyntax-only %s -### 2>&1 | FileCheck %s -DOPTION=no-unaligned-access
-// RUN: not %clang --target=x86_64 -mstrict-align -fsyntax-only %s -### 2>&1 | FileCheck %s -DOPTION=strict-align
-// RUN: not %clang --target=x86_64 -mno-strict-align -fsyntax-only %s -### 2>&1 | FileCheck %s -DOPTION=no-strict-align
+// RUN: not %clang --target=x86_64 -mno-strict-align -mstrict-align -fsyntax-only %s -### 2>&1 | FileCheck %s --check-prefix=ALIGN
 
 // CHECK: error: unsupported option '-m{{(no-)?}}unaligned-access' for target '{{.*}}'
+// ALIGN: error: unsupported option '-mno-strict-align' for target '{{.*}}'
+// ALIGN: error: unsupported option '-mstrict-align' for target '{{.*}}'
diff --git a/clang/test/Driver/riscv-features.c b/clang/test/Driver/riscv-features.c
index fc5fb0f27e3af4..fe74ac773ef8ca 100644
--- a/clang/test/Driver/riscv-features.c
+++ b/clang/test/Driver/riscv-features.c
@@ -33,8 +33,6 @@
 // NO-FORCE-SW-SCS: "-target-feature" "-forced-sw-shadow-stack"
 // DEFAULT-NOT: "-target-feature" "+forced-sw-shadow-stack"
 
-// RUN: %clang --target=riscv32-unknown-elf -### %s -munaligned-access 2>&1 | FileCheck %s -check-prefix=FAST-UNALIGNED-ACCESS
-// RUN: %clang --target=riscv32-unknown-elf -### %s -mno-unaligned-access 2>&1 | FileCheck %s -check-prefix=NO-FAST-UNALIGNED-ACCESS
 // RUN: %clang --target=riscv32-unknown-elf -### %s -mno-strict-align 2>&1 | FileCheck %s -check-prefix=FAST-UNALIGNED-ACCESS
 // RUN: %clang --target=riscv32-unknown-elf -### %s -mstrict-align 2>&1 | FileCheck %s -check-prefix=NO-FAST-UNALIGNED-ACCESS
 
diff --git a/clang/test/Driver/tls-dialect.c b/clang/test/Driver/tls-dialect.c
index f73915b28ec2a3..a808dd81531ce7 100644
--- a/clang/test/Driver/tls-dialect.c
+++ b/clang/test/Driver/tls-dialect.c
@@ -2,6 +2,7 @@
 // RUN: %clang -### --target=riscv64-linux -mtls-dialect=trad %s 2>&1 | FileCheck --check-prefix=NODESC %s
 // RUN: %clang -### --target=riscv64-linux %s 2>&1 | FileCheck --check-prefix=NODESC %s
 // RUN: %clang -### --target=x86_64-linux -mtls-dialect=gnu %s 2>&1 | FileCheck --check-prefix=NODESC %s
+// RUN: %clang -### --target=x86_64-linux -mtls-dialect=gnu2 %s 2>&1 | FileCheck --check-prefix=DESC %s
 
 /// Android supports TLSDESC by default on RISC-V
 /// TLSDESC is not on by default in Linux, even on RISC-V, and is covered above
@@ -18,7 +19,6 @@
 
 /// Unsupported argument
 // RUN: not %clang -### --target=riscv64-linux -mtls-dialect=gnu2 %s 2>&1 | FileCheck --check-prefix=UNSUPPORTED-ARG %s
-// RUN: not %clang -### --target=x86_64-linux -mtls-dialect=gnu2 %s 2>&1 | FileCheck --check-prefix=UNSUPPORTED-ARG %s
 
 // DESC:       "-cc1" {{.*}}"-enable-tlsdesc"
 // NODESC-NOT: "-enable-tlsdesc"
diff --git a/clang/test/Driver/wasm-toolchain.c b/clang/test/Driver/wasm-toolchain.c
index f950283ec42aa0..88590a3ba4c453 100644
--- a/clang/test/Driver/wasm-toolchain.c
+++ b/clang/test/Driver/wasm-toolchain.c
@@ -197,3 +197,27 @@
 // RUN: not %clang -### %s --target=wasm32-unknown-unknown --sysroot=%s/no-sysroot-there -fPIC -mno-mutable-globals %s 2>&1 \
 // RUN:   | FileCheck -check-prefix=PIC_NO_MUTABLE_GLOBALS %s
 // PIC_NO_MUTABLE_GLOBALS: error: invalid argument '-fPIC' not allowed with '-mno-mutable-globals'
+
+// Test that `wasm32-wasip2` invokes the `wasm-component-ld` linker by default
+// instead of `wasm-ld`.
+
+// RUN: %clang -### -O2 --target=wasm32-wasip2 %s --sysroot /foo 2>&1 \
+// RUN:   | FileCheck -check-prefix=LINK_WASIP2 %s
+// LINK_WASIP2: "-cc1" {{.*}} "-o" "[[temp:[^"]*]]"
+// LINK_WASIP2: wasm-component-ld{{.*}}" "-L/foo/lib/wasm32-wasip2" "crt1.o" "[[temp]]" "-lc" "{{.*[/\\]}}libclang_rt.builtins-wasm32.a" "-o" "a.out"
+
+// Test that on `wasm32-wasip2` the `wasm-component-ld` programs is told where
+// to find `wasm-ld` by default.
+
+// RUN: %clang -### -O2 --target=wasm32-wasip2 %s --sysroot /foo 2>&1 \
+// RUN:   | FileCheck -check-prefix=LINK_WASIP2_FIND_WASMLD %s
+// LINK_WASIP2_FIND_WASMLD: "-cc1" {{.*}} "-o" "[[temp:[^"]*]]"
+// LINK_WASIP2_FIND_WASMLD: wasm-component-ld{{.*}}" {{.*}} "--wasm-ld-path" "{{.*}}wasm-ld{{.*}}" {{.*}} "[[temp]]" {{.*}}
+
+// If `wasm32-wasip2` is configured with `wasm-ld` as a linker then don't pass
+// the `--wasm-ld-path` flag.
+
+// RUN: %clang -### -O2 --target=wasm32-wasip2 -fuse-ld=lld %s --sysroot /foo 2>&1 \
+// RUN:   | FileCheck -check-prefix=LINK_WASIP2_USE_WASMLD %s
+// LINK_WASIP2_USE_WASMLD: "-cc1" {{.*}} "-o" "[[temp:[^"]*]]"
+// LINK_WASIP2_USE_WASMLD: wasm-ld{{.*}}" "-m" "wasm32" {{.*}} "[[temp]]" {{.*}}
diff --git a/clang/test/Headers/__clang_hip_math.hip b/clang/test/Headers/__clang_hip_math.hip
index 701f93853ab93c..37099de74fb8ec 100644
--- a/clang/test/Headers/__clang_hip_math.hip
+++ b/clang/test/Headers/__clang_hip_math.hip
@@ -1685,7 +1685,7 @@ extern "C" __device__ double test_j1(double x) {
 // DEFAULT-NEXT:    [[__X1_0_I3:%.*]] = phi float [ [[SUB_I:%.*]], [[FOR_BODY_I]] ], [ [[CALL_I21_I]], [[IF_END4_I]] ]
 // DEFAULT-NEXT:    [[__X0_0_I2:%.*]] = phi float [ [[__X1_0_I3]], [[FOR_BODY_I]] ], [ [[CALL_I_I]], [[IF_END4_I]] ]
 // DEFAULT-NEXT:    [[MUL_I:%.*]] = shl nuw nsw i32 [[__I_0_I4]], 1
-// DEFAULT-NEXT:    [[CONV_I:%.*]] = uitofp i32 [[MUL_I]] to float
+// DEFAULT-NEXT:    [[CONV_I:%.*]] = sitofp i32 [[MUL_I]] to float
 // DEFAULT-NEXT:    [[DIV_I:%.*]] = fdiv contract float [[CONV_I]], [[Y]]
 // DEFAULT-NEXT:    [[MUL8_I:%.*]] = fmul contract float [[__X1_0_I3]], [[DIV_I]]
 // DEFAULT-NEXT:    [[SUB_I]] = fsub contract float [[MUL8_I]], [[__X0_0_I2]]
@@ -1718,7 +1718,7 @@ extern "C" __device__ double test_j1(double x) {
 // FINITEONLY-NEXT:    [[__X1_0_I3:%.*]] = phi float [ [[SUB_I:%.*]], [[FOR_BODY_I]] ], [ [[CALL_I21_I]], [[IF_END4_I]] ]
 // FINITEONLY-NEXT:    [[__X0_0_I2:%.*]] = phi float [ [[__X1_0_I3]], [[FOR_BODY_I]] ], [ [[CALL_I_I]], [[IF_END4_I]] ]
 // FINITEONLY-NEXT:    [[MUL_I:%.*]] = shl nuw nsw i32 [[__I_0_I4]], 1
-// FINITEONLY-NEXT:    [[CONV_I:%.*]] = uitofp i32 [[MUL_I]] to float
+// FINITEONLY-NEXT:    [[CONV_I:%.*]] = sitofp i32 [[MUL_I]] to float
 // FINITEONLY-NEXT:    [[DIV_I:%.*]] = fdiv nnan ninf contract float [[CONV_I]], [[Y]]
 // FINITEONLY-NEXT:    [[MUL8_I:%.*]] = fmul nnan ninf contract float [[__X1_0_I3]], [[DIV_I]]
 // FINITEONLY-NEXT:    [[SUB_I]] = fsub nnan ninf contract float [[MUL8_I]], [[__X0_0_I2]]
@@ -1751,7 +1751,7 @@ extern "C" __device__ double test_j1(double x) {
 // APPROX-NEXT:    [[__X1_0_I3:%.*]] = phi float [ [[SUB_I:%.*]], [[FOR_BODY_I]] ], [ [[CALL_I21_I]], [[IF_END4_I]] ]
 // APPROX-NEXT:    [[__X0_0_I2:%.*]] = phi float [ [[__X1_0_I3]], [[FOR_BODY_I]] ], [ [[CALL_I_I]], [[IF_END4_I]] ]
 // APPROX-NEXT:    [[MUL_I:%.*]] = shl nuw nsw i32 [[__I_0_I4]], 1
-// APPROX-NEXT:    [[CONV_I:%.*]] = uitofp i32 [[MUL_I]] to float
+// APPROX-NEXT:    [[CONV_I:%.*]] = sitofp i32 [[MUL_I]] to float
 // APPROX-NEXT:    [[DIV_I:%.*]] = fdiv contract float [[CONV_I]], [[Y]]
 // APPROX-NEXT:    [[MUL8_I:%.*]] = fmul contract float [[__X1_0_I3]], [[DIV_I]]
 // APPROX-NEXT:    [[SUB_I]] = fsub contract float [[MUL8_I]], [[__X0_0_I2]]
@@ -1788,7 +1788,7 @@ extern "C" __device__ float test_jnf(int x, float y) {
 // DEFAULT-NEXT:    [[__X1_0_I3:%.*]] = phi double [ [[SUB_I:%.*]], [[FOR_BODY_I]] ], [ [[CALL_I21_I]], [[IF_END4_I]] ]
 // DEFAULT-NEXT:    [[__X0_0_I2:%.*]] = phi double [ [[__X1_0_I3]], [[FOR_BODY_I]] ], [ [[CALL_I_I]], [[IF_END4_I]] ]
 // DEFAULT-NEXT:    [[MUL_I:%.*]] = shl nuw nsw i32 [[__I_0_I4]], 1
-// DEFAULT-NEXT:    [[CONV_I:%.*]] = uitofp i32 [[MUL_I]] to double
+// DEFAULT-NEXT:    [[CONV_I:%.*]] = sitofp i32 [[MUL_I]] to double
 // DEFAULT-NEXT:    [[DIV_I:%.*]] = fdiv contract double [[CONV_I]], [[Y]]
 // DEFAULT-NEXT:    [[MUL8_I:%.*]] = fmul contract double [[__X1_0_I3]], [[DIV_I]]
 // DEFAULT-NEXT:    [[SUB_I]] = fsub contract double [[MUL8_I]], [[__X0_0_I2]]
@@ -1821,7 +1821,7 @@ extern "C" __device__ float test_jnf(int x, float y) {
 // FINITEONLY-NEXT:    [[__X1_0_I3:%.*]] = phi double [ [[SUB_I:%.*]], [[FOR_BODY_I]] ], [ [[CALL_I21_I]], [[IF_END4_I]] ]
 // FINITEONLY-NEXT:    [[__X0_0_I2:%.*]] = phi double [ [[__X1_0_I3]], [[FOR_BODY_I]] ], [ [[CALL_I_I]], [[IF_END4_I]] ]
 // FINITEONLY-NEXT:    [[MUL_I:%.*]] = shl nuw nsw i32 [[__I_0_I4]], 1
-// FINITEONLY-NEXT:    [[CONV_I:%.*]] = uitofp i32 [[MUL_I]] to double
+// FINITEONLY-NEXT:    [[CONV_I:%.*]] = sitofp i32 [[MUL_I]] to double
 // FINITEONLY-NEXT:    [[DIV_I:%.*]] = fdiv nnan ninf contract double [[CONV_I]], [[Y]]
 // FINITEONLY-NEXT:    [[MUL8_I:%.*]] = fmul nnan ninf contract double [[__X1_0_I3]], [[DIV_I]]
 // FINITEONLY-NEXT:    [[SUB_I]] = fsub nnan ninf contract double [[MUL8_I]], [[__X0_0_I2]]
@@ -1854,7 +1854,7 @@ extern "C" __device__ float test_jnf(int x, float y) {
 // APPROX-NEXT:    [[__X1_0_I3:%.*]] = phi double [ [[SUB_I:%.*]], [[FOR_BODY_I]] ], [ [[CALL_I21_I]], [[IF_END4_I]] ]
 // APPROX-NEXT:    [[__X0_0_I2:%.*]] = phi double [ [[__X1_0_I3]], [[FOR_BODY_I]] ], [ [[CALL_I_I]], [[IF_END4_I]] ]
 // APPROX-NEXT:    [[MUL_I:%.*]] = shl nuw nsw i32 [[__I_0_I4]], 1
-// APPROX-NEXT:    [[CONV_I:%.*]] = uitofp i32 [[MUL_I]] to double
+// APPROX-NEXT:    [[CONV_I:%.*]] = sitofp i32 [[MUL_I]] to double
 // APPROX-NEXT:    [[DIV_I:%.*]] = fdiv contract double [[CONV_I]], [[Y]]
 // APPROX-NEXT:    [[MUL8_I:%.*]] = fmul contract double [[__X1_0_I3]], [[DIV_I]]
 // APPROX-NEXT:    [[SUB_I]] = fsub contract double [[MUL8_I]], [[__X0_0_I2]]
@@ -4222,7 +4222,7 @@ extern "C" __device__ double test_y1(double x) {
 // DEFAULT-NEXT:    [[__X1_0_I3:%.*]] = phi float [ [[SUB_I:%.*]], [[FOR_BODY_I]] ], [ [[CALL_I21_I]], [[IF_END4_I]] ]
 // DEFAULT-NEXT:    [[__X0_0_I2:%.*]] = phi float [ [[__X1_0_I3]], [[FOR_BODY_I]] ], [ [[CALL_I_I]], [[IF_END4_I]] ]
 // DEFAULT-NEXT:    [[MUL_I:%.*]] = shl nuw nsw i32 [[__I_0_I4]], 1
-// DEFAULT-NEXT:    [[CONV_I:%.*]] = uitofp i32 [[MUL_I]] to float
+// DEFAULT-NEXT:    [[CONV_I:%.*]] = sitofp i32 [[MUL_I]] to float
 // DEFAULT-NEXT:    [[DIV_I:%.*]] = fdiv contract float [[CONV_I]], [[Y]]
 // DEFAULT-NEXT:    [[MUL8_I:%.*]] = fmul contract float [[__X1_0_I3]], [[DIV_I]]
 // DEFAULT-NEXT:    [[SUB_I]] = fsub contract float [[MUL8_I]], [[__X0_0_I2]]
@@ -4255,7 +4255,7 @@ extern "C" __device__ double test_y1(double x) {
 // FINITEONLY-NEXT:    [[__X1_0_I3:%.*]] = phi float [ [[SUB_I:%.*]], [[FOR_BODY_I]] ], [ [[CALL_I21_I]], [[IF_END4_I]] ]
 // FINITEONLY-NEXT:    [[__X0_0_I2:%.*]] = phi float [ [[__X1_0_I3]], [[FOR_BODY_I]] ], [ [[CALL_I_I]], [[IF_END4_I]] ]
 // FINITEONLY-NEXT:    [[MUL_I:%.*]] = shl nuw nsw i32 [[__I_0_I4]], 1
-// FINITEONLY-NEXT:    [[CONV_I:%.*]] = uitofp i32 [[MUL_I]] to float
+// FINITEONLY-NEXT:    [[CONV_I:%.*]] = sitofp i32 [[MUL_I]] to float
 // FINITEONLY-NEXT:    [[DIV_I:%.*]] = fdiv nnan ninf contract float [[CONV_I]], [[Y]]
 // FINITEONLY-NEXT:    [[MUL8_I:%.*]] = fmul nnan ninf contract float [[__X1_0_I3]], [[DIV_I]]
 // FINITEONLY-NEXT:    [[SUB_I]] = fsub nnan ninf contract float [[MUL8_I]], [[__X0_0_I2]]
@@ -4288,7 +4288,7 @@ extern "C" __device__ double test_y1(double x) {
 // APPROX-NEXT:    [[__X1_0_I3:%.*]] = phi float [ [[SUB_I:%.*]], [[FOR_BODY_I]] ], [ [[CALL_I21_I]], [[IF_END4_I]] ]
 // APPROX-NEXT:    [[__X0_0_I2:%.*]] = phi float [ [[__X1_0_I3]], [[FOR_BODY_I]] ], [ [[CALL_I_I]], [[IF_END4_I]] ]
 // APPROX-NEXT:    [[MUL_I:%.*]] = shl nuw nsw i32 [[__I_0_I4]], 1
-// APPROX-NEXT:    [[CONV_I:%.*]] = uitofp i32 [[MUL_I]] to float
+// APPROX-NEXT:    [[CONV_I:%.*]] = sitofp i32 [[MUL_I]] to float
 // APPROX-NEXT:    [[DIV_I:%.*]] = fdiv contract float [[CONV_I]], [[Y]]
 // APPROX-NEXT:    [[MUL8_I:%.*]] = fmul contract float [[__X1_0_I3]], [[DIV_I]]
 // APPROX-NEXT:    [[SUB_I]] = fsub contract float [[MUL8_I]], [[__X0_0_I2]]
@@ -4325,7 +4325,7 @@ extern "C" __device__ float test_ynf(int x, float y) {
 // DEFAULT-NEXT:    [[__X1_0_I3:%.*]] = phi double [ [[SUB_I:%.*]], [[FOR_BODY_I]] ], [ [[CALL_I21_I]], [[IF_END4_I]] ]
 // DEFAULT-NEXT:    [[__X0_0_I2:%.*]] = phi double [ [[__X1_0_I3]], [[FOR_BODY_I]] ], [ [[CALL_I_I]], [[IF_END4_I]] ]
 // DEFAULT-NEXT:    [[MUL_I:%.*]] = shl nuw nsw i32 [[__I_0_I4]], 1
-// DEFAULT-NEXT:    [[CONV_I:%.*]] = uitofp i32 [[MUL_I]] to double
+// DEFAULT-NEXT:    [[CONV_I:%.*]] = sitofp i32 [[MUL_I]] to double
 // DEFAULT-NEXT:    [[DIV_I:%.*]] = fdiv contract double [[CONV_I]], [[Y]]
 // DEFAULT-NEXT:    [[MUL8_I:%.*]] = fmul contract double [[__X1_0_I3]], [[DIV_I]]
 // DEFAULT-NEXT:    [[SUB_I]] = fsub contract double [[MUL8_I]], [[__X0_0_I2]]
@@ -4358,7 +4358,7 @@ extern "C" __device__ float test_ynf(int x, float y) {
 // FINITEONLY-NEXT:    [[__X1_0_I3:%.*]] = phi double [ [[SUB_I:%.*]], [[FOR_BODY_I]] ], [ [[CALL_I21_I]], [[IF_END4_I]] ]
 // FINITEONLY-NEXT:    [[__X0_0_I2:%.*]] = phi double [ [[__X1_0_I3]], [[FOR_BODY_I]] ], [ [[CALL_I_I]], [[IF_END4_I]] ]
 // FINITEONLY-NEXT:    [[MUL_I:%.*]] = shl nuw nsw i32 [[__I_0_I4]], 1
-// FINITEONLY-NEXT:    [[CONV_I:%.*]] = uitofp i32 [[MUL_I]] to double
+// FINITEONLY-NEXT:    [[CONV_I:%.*]] = sitofp i32 [[MUL_I]] to double
 // FINITEONLY-NEXT:    [[DIV_I:%.*]] = fdiv nnan ninf contract double [[CONV_I]], [[Y]]
 // FINITEONLY-NEXT:    [[MUL8_I:%.*]] = fmul nnan ninf contract double [[__X1_0_I3]], [[DIV_I]]
 // FINITEONLY-NEXT:    [[SUB_I]] = fsub nnan ninf contract double [[MUL8_I]], [[__X0_0_I2]]
@@ -4391,7 +4391,7 @@ extern "C" __device__ float test_ynf(int x, float y) {
 // APPROX-NEXT:    [[__X1_0_I3:%.*]] = phi double [ [[SUB_I:%.*]], [[FOR_BODY_I]] ], [ [[CALL_I21_I]], [[IF_END4_I]] ]
 // APPROX-NEXT:    [[__X0_0_I2:%.*]] = phi double [ [[__X1_0_I3]], [[FOR_BODY_I]] ], [ [[CALL_I_I]], [[IF_END4_I]] ]
 // APPROX-NEXT:    [[MUL_I:%.*]] = shl nuw nsw i32 [[__I_0_I4]], 1
-// APPROX-NEXT:    [[CONV_I:%.*]] = uitofp i32 [[MUL_I]] to double
+// APPROX-NEXT:    [[CONV_I:%.*]] = sitofp i32 [[MUL_I]] to double
 // APPROX-NEXT:    [[DIV_I:%.*]] = fdiv contract double [[CONV_I]], [[Y]]
 // APPROX-NEXT:    [[MUL8_I:%.*]] = fmul contract double [[__X1_0_I3]], [[DIV_I]]
 // APPROX-NEXT:    [[SUB_I]] = fsub contract double [[MUL8_I]], [[__X0_0_I2]]
diff --git a/clang/test/InstallAPI/asm.test b/clang/test/InstallAPI/asm.test
new file mode 100644
index 00000000000000..b6af7f643d72f2
--- /dev/null
+++ b/clang/test/InstallAPI/asm.test
@@ -0,0 +1,90 @@
+// RUN: rm -rf %t
+// RUN: split-file %s %t
+// RUN: sed -e "s|DSTROOT|%/t|g" %t/inputs.json.in > %t/inputs.json
+
+// RUN: clang-installapi -target arm64-apple-macos13.1 \
+// RUN: -I%t/usr/include \
+// RUN: -install_name @rpath/lib/libasm.dylib \
+// RUN: %t/inputs.json -o %t/output.tbd 2>&1 | FileCheck %s --allow-empty
+// RUN: llvm-readtapi -compare %t/output.tbd %t/expected.tbd 2>&1 | FileCheck %s --allow-empty
+
+// CHECK-NOT: error: 
+// CHECK-NOT: warning: 
+
+//--- usr/include/asm.h
+#ifndef ASM_H
+#define ASM_H
+
+extern int ivar __asm("_OBJC_IVAR_$_SomeClass._ivar1");
+extern int objcClass1 __asm("_OBJC_CLASS_$_SomeClass");
+extern int objcClass2 __asm("_OBJC_METACLASS_$_SomeClass");
+extern int objcClass3 __asm("_OBJC_EHTYPE_$_SomeClass");
+extern int objcClass4 __asm(".objc_class_name_SomeClass");
+
+__attribute__((visibility("hidden")))
+@interface NSString {
+}
+@end
+
+extern int ivarExtra __asm("_OBJC_IVAR_$_NSString._ivar1");
+#endif // ASM_H
+
+//--- inputs.json.in
+{
+  "headers": [ {
+    "path" : "DSTROOT/usr/include/asm.h",
+    "type" : "public"
+  }],
+  "version": "3"
+}
+
+//--- expected.tbd
+{
+  "main_library": {
+    "compatibility_versions": [
+      {
+        "version": "0"
+      }
+    ],
+    "current_versions": [
+      {
+        "version": "0"
+      }
+    ],
+    "exported_symbols": [
+      {
+        "data": {
+          "objc_class": [
+            "SomeClass"
+          ],
+          "objc_eh_type": [
+            "SomeClass"
+          ],
+          "objc_ivar": [
+            "NSString._ivar1",
+            "SomeClass._ivar1"
+          ]
+        }
+      }
+    ],
+    "flags": [
+      {
+        "attributes": [
+          "not_app_extension_safe"
+        ]
+      }
+    ],
+    "install_names": [
+      {
+        "name": "@rpath/lib/libasm.dylib"
+      }
+    ],
+    "target_info": [
+      {
+        "min_deployment": "13.1",
+        "target": "arm64-macos"
+      }
+    ]
+  },
+  "tapi_tbd_version": 5
+}
diff --git a/clang/test/InstallAPI/availability.test b/clang/test/InstallAPI/availability.test
new file mode 100644
index 00000000000000..55c890b6d882ae
--- /dev/null
+++ b/clang/test/InstallAPI/availability.test
@@ -0,0 +1,631 @@
+; RUN: rm -rf %t
+; RUN: split-file %s %t
+; RUN: sed -e "s|DSTROOT|%/t|g" %t/inputs.json.in > %t/inputs.json
+
+; RUN: yaml2obj %t/Availability.yaml -o %t/System/Library/Frameworks/Availability.framework/Availability
+
+; RUN: clang-installapi \
+; RUN: --target=x86_64-apple-macos13 \
+; RUN: -install_name /System/Library/Frameworks/Availability.framework/Versions/A/Availability \
+; RUN: -current_version 1 -compatibility_version 1 \
+; RUN: -F %t/System/Library/Frameworks \
+; RUN: %t/inputs.json -o %t/output.tbd \
+; RUN: --verify-against=%t/System/Library/Frameworks/Availability.framework/Availability \
+; RUN: --verify-mode=ErrorsOnly --filetype=tbd-v5 2> %t/errors.log
+; RUN: FileCheck -allow-empty -check-prefix=ERRORSONLY -input-file %t/errors.log %s
+
+; RUN: clang-installapi \
+; RUN: --target=x86_64-apple-macos13 \
+; RUN: -install_name /System/Library/Frameworks/Availability.framework/Versions/A/Availability \
+; RUN: -current_version 1 -compatibility_version 1 \
+; RUN: -F %t/System/Library/Frameworks \
+; RUN: %t/inputs.json -o %t/output-warnings.tbd \
+; RUN: --verify-against=%t/System/Library/Frameworks/Availability.framework/Availability \
+; RUN: --verify-mode=ErrorsAndWarnings 2> %t/errors.log
+; RUN: FileCheck -check-prefixes=VIOLATIONS,ERRORSANDWARNINGS -input-file %t/errors.log %s
+
+; RUN: not clang-installapi \
+; RUN: --target=x86_64-apple-macos13 \
+; RUN: -install_name /System/Library/Frameworks/Availability.framework/Versions/A/Availability \
+; RUN: -current_version 1 -compatibility_version 1 \
+; RUN: -F %t/System/Library/Frameworks \
+; RUN: %t/inputs.json -o %t/output-pedantic.tbd \
+; RUN: --verify-against=%t/System/Library/Frameworks/Availability.framework/Availability \
+; RUN: --verify-mode=Pedantic 2> %t/errors.log
+; RUN: FileCheck -check-prefixes=VIOLATIONS,PEDANTIC -input-file %t/errors.log %s
+
+; ERRORSONLY-NOT:        error
+; ERRORSONLY-NOT:        warning
+
+; ERRORSANDWARNINGS-NOT: error
+; VIOLATIONS:            warning: violations found for x86_64-apple-macos
+; VIOLATIONS:            declaration 'publicGlobalVariable' is marked unavailable, but symbol is exported in dynamic library
+; VIOLATIONS-NEXT:       extern int publicGlobalVariable NS_AVAILABLE
+; VIOLATIONS:            declaration 'Foo' is marked unavailable, but symbol is exported in dynamic library
+; VIOLATIONS-NEXT:       @interface Foo : NSObject
+; VIOLATIONS:            declaration 'publicGlobalVariable3' is marked unavailable, but symbol is exported in dynamic library
+; VIOLATIONS-NEXT:       extern int publicGlobalVariable3 __attribute__((unavailable))
+; VIOLATIONS:            declaration 'privateGlobalVariable' is marked unavailable, but symbol is exported in dynamic library
+; VIOLATIONS-NEXT:       extern int privateGlobalVariable;
+
+; ERRORSANDWARNINGS-NOT: warning 
+; PEDANTIC-NOT:          error
+
+;--- inputs.json.in
+{
+  "headers": [ {
+    "path" : "DSTROOT/System/Library/Frameworks/Availability.framework/Headers/Availability.h",
+    "type" : "public"
+  }, 
+  {
+    "path" : "DSTROOT/System/Library/Frameworks/Availability.framework/PrivateHeaders/AvailabilityPrivate.h",
+    "type" : "private"
+  }
+  ],
+  "version": "3"
+}
+
+;--- System/Library/Frameworks/Availability.framework/Headers/AV_Defines.h
+#ifndef AV_DEFINES
+#define AV_DEFINES 
+
+#define NS_AVAILABLE __attribute__((availability(macosx,introduced=NA)))
+
+@interface NSObject 
+@end
+
+#endif //AV_DEFINES
+
+;--- System/Library/Frameworks/Availability.framework/PrivateHeaders/AvailabilityPrivate.h
+#import <Availability/AV_Defines.h>
+// Test private global variable.
+NS_AVAILABLE 
+extern int privateGlobalVariable;
+
+;--- System/Library/Frameworks/Availability.framework/Headers/Availability.h
+#import <Availability/AV_Defines.h>
+extern int publicGlobalVariable NS_AVAILABLE;
+
+// Test public ObjC class
+NS_AVAILABLE
+@interface Foo : NSObject
+@end
+
+// Test unavailable attribute.
+#ifdef __i386__
+#define UNAVAILABLE_I386 __attribute__((unavailable))
+#else
+#define UNAVAILABLE_I386
+#endif
+extern int publicGlobalVariable2 UNAVAILABLE_I386;
+
+extern int publicGlobalVariable3 __attribute__((unavailable))
+__attribute__((availability(macosx, introduced = 10.9)));
+
+// Test obsoleted with exported variable.
+extern int publicGlobalVariable4 __attribute__((availability(
+    macosx, introduced = 10.9, deprecated = 10.10, obsoleted = 10.11)));
+// Test obsoleted with non-existent variable.
+extern int publicGlobalVariable5 __attribute__((availability(
+    macosx, introduced = 10.9, deprecated = 10.10, obsoleted = 10.11)));
+
+#ifdef __i386__
+#define OBSOLETE_I386 __attribute__((availability(macosx, obsoleted = 10.11)))
+#else
+#define OBSOLETE_I386
+#endif
+extern int publicGlobalVariable6 OBSOLETE_I386;
+
+
+/// Created from: 
+// int publicGlobalVariable; int privateGlobalVariable;
+// 
+// @implementation Foo
+// @end
+// 
+// #ifndef __i386__
+// int publicGlobalVariable2;
+// #endif
+// 
+// int publicGlobalVariable3;
+// int publicGlobalVariable4;
+// 
+// #ifndef __i386__
+// int publicGlobalVariable6;
+// #endif
+;--- Availability.yaml
+--- !mach-o
+FileHeader:
+  magic:           0xFEEDFACF
+  cputype:         0x1000007
+  cpusubtype:      0x3
+  filetype:        0x6
+  ncmds:           14
+  sizeofcmds:      1312
+  flags:           0x100085
+  reserved:        0x0
+LoadCommands:
+  - cmd:             LC_SEGMENT_64
+    cmdsize:         232
+    segname:         __TEXT
+    vmaddr:          0
+    vmsize:          8192
+    fileoff:         0
+    filesize:        8192
+    maxprot:         5
+    initprot:        5
+    nsects:          2
+    flags:           0
+    Sections:
+      - sectname:        __text
+        segname:         __TEXT
+        addr:            0x1140
+        size:            0
+        offset:          0x1140
+        align:           0
+        reloff:          0x0
+        nreloc:          0
+        flags:           0x80000000
+        reserved1:       0x0
+        reserved2:       0x0
+        reserved3:       0x0
+        content:         ''
+      - sectname:        __cstring
+        segname:         __TEXT
+        addr:            0x1140
+        size:            4
+        offset:          0x1140
+        align:           0
+        reloff:          0x0
+        nreloc:          0
+        flags:           0x2
+        reserved1:       0x0
+        reserved2:       0x0
+        reserved3:       0x0
+        content:         466F6F00
+  - cmd:             LC_SEGMENT_64
+    cmdsize:         232
+    segname:         __DATA_CONST
+    vmaddr:          8192
+    vmsize:          4096
+    fileoff:         8192
+    filesize:        4096
+    maxprot:         3
+    initprot:        3
+    nsects:          2
+    flags:           16
+    Sections:
+      - sectname:        __objc_classlist
+        segname:         __DATA_CONST
+        addr:            0x2000
+        size:            8
+        offset:          0x2000
+        align:           3
+        reloff:          0x0
+        nreloc:          0
+        flags:           0x10000000
+        reserved1:       0x0
+        reserved2:       0x0
+        reserved3:       0x0
+        content:         B830000000000000
+      - sectname:        __objc_imageinfo
+        segname:         __DATA_CONST
+        addr:            0x2008
+        size:            8
+        offset:          0x2008
+        align:           0
+        reloff:          0x0
+        nreloc:          0
+        flags:           0x0
+        reserved1:       0x0
+        reserved2:       0x0
+        reserved3:       0x0
+        content:         '0000000040000000'
+  - cmd:             LC_SEGMENT_64
+    cmdsize:         312
+    segname:         __DATA
+    vmaddr:          12288
+    vmsize:          4096
+    fileoff:         12288
+    filesize:        4096
+    maxprot:         3
+    initprot:        3
+    nsects:          3
+    flags:           0
+    Sections:
+      - sectname:        __objc_const
+        segname:         __DATA
+        addr:            0x3000
+        size:            144
+        offset:          0x3000
+        align:           3
+        reloff:          0x0
+        nreloc:          0
+        flags:           0x0
+        reserved1:       0x0
+        reserved2:       0x0
+        reserved3:       0x0
+        content:         '010000002800000028000000000000000000000000000000401100000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000800000008000000000000000000000000000000401100000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000'
+      - sectname:        __objc_data
+        segname:         __DATA
+        addr:            0x3090
+        size:            80
+        offset:          0x3090
+        align:           3
+        reloff:          0x0
+        nreloc:          0
+        flags:           0x0
+        reserved1:       0x0
+        reserved2:       0x0
+        reserved3:       0x0
+        content:         '0000000000000000000000000000000000000000000000000000000000000000003000000000000090300000000000000000000000000000000000000000000000000000000000004830000000000000'
+      - sectname:        __common
+        segname:         __DATA
+        addr:            0x30E0
+        size:            24
+        offset:          0x0
+        align:           2
+        reloff:          0x0
+        nreloc:          0
+        flags:           0x1
+        reserved1:       0x0
+        reserved2:       0x0
+        reserved3:       0x0
+  - cmd:             LC_SEGMENT_64
+    cmdsize:         72
+    segname:         __LINKEDIT
+    vmaddr:          16384
+    vmsize:          824
+    fileoff:         16384
+    filesize:        824
+    maxprot:         1
+    initprot:        1
+    nsects:          0
+    flags:           0
+  - cmd:             LC_DYLD_INFO_ONLY
+    cmdsize:         48
+    rebase_off:      16384
+    rebase_size:     16
+    bind_off:        16400
+    bind_size:       104
+    weak_bind_off:   0
+    weak_bind_size:  0
+    lazy_bind_off:   0
+    lazy_bind_size:  0
+    export_off:      16504
+    export_size:     152
+  - cmd:             LC_SYMTAB
+    cmdsize:         24
+    symoff:          16664
+    nsyms:           14
+    stroff:          16888
+    strsize:         320
+  - cmd:             LC_DYSYMTAB
+    cmdsize:         80
+    ilocalsym:       0
+    nlocalsym:       2
+    iextdefsym:      2
+    nextdefsym:      8
+    iundefsym:       10
+    nundefsym:       4
+    tocoff:          0
+    ntoc:            0
+    modtaboff:       0
+    nmodtab:         0
+    extrefsymoff:    0
+    nextrefsyms:     0
+    indirectsymoff:  0
+    nindirectsyms:   0
+    extreloff:       0
+    nextrel:         0
+    locreloff:       0
+    nlocrel:         0
+  - cmd:             LC_ID_DYLIB
+    cmdsize:         112
+    dylib:
+      name:            24
+      timestamp:       0
+      current_version: 65536
+      compatibility_version: 65536
+    Content:         '/System/Library/Frameworks/Availability.framework/Versions/A/Availability'
+    ZeroPadBytes:    7
+  - cmd:             LC_UUID
+    cmdsize:         24
+    uuid:            4C4C4470-5555-3144-A142-4EE44DA08D2F
+  - cmd:             LC_BUILD_VERSION
+    cmdsize:         32
+    platform:        1
+    minos:           851968
+    sdk:             983040
+    ntools:          1
+    Tools:
+      - tool:            4
+        version:         1245184
+  - cmd:             LC_LOAD_DYLIB
+    cmdsize:         56
+    dylib:
+      name:            24
+      timestamp:       0
+      current_version: 14942208
+      compatibility_version: 65536
+    Content:         '/usr/lib/libobjc.A.dylib'
+    ZeroPadBytes:    8
+  - cmd:             LC_LOAD_DYLIB
+    cmdsize:         56
+    dylib:
+      name:            24
+      timestamp:       0
+      current_version: 88473600
+      compatibility_version: 65536
+    Content:         '/usr/lib/libSystem.B.dylib'
+    ZeroPadBytes:    6
+  - cmd:             LC_FUNCTION_STARTS
+    cmdsize:         16
+    dataoff:         16656
+    datasize:        8
+  - cmd:             LC_DATA_IN_CODE
+    cmdsize:         16
+    dataoff:         16664
+    datasize:        0
+LinkEditData:
+  RebaseOpcodes:
+    - Opcode:          REBASE_OPCODE_SET_TYPE_IMM
+      Imm:             1
+    - Opcode:          REBASE_OPCODE_SET_SEGMENT_AND_OFFSET_ULEB
+      Imm:             1
+      ExtraData:       [ 0x0 ]
+    - Opcode:          REBASE_OPCODE_DO_REBASE_IMM_TIMES
+      Imm:             1
+    - Opcode:          REBASE_OPCODE_SET_SEGMENT_AND_OFFSET_ULEB
+      Imm:             2
+      ExtraData:       [ 0x18 ]
+    - Opcode:          REBASE_OPCODE_DO_REBASE_ULEB_TIMES_SKIPPING_ULEB
+      Imm:             0
+      ExtraData:       [ 0x2, 0x40 ]
+    - Opcode:          REBASE_OPCODE_ADD_ADDR_IMM_SCALED
+      Imm:             1
+    - Opcode:          REBASE_OPCODE_DO_REBASE_IMM_TIMES
+      Imm:             2
+    - Opcode:          REBASE_OPCODE_ADD_ADDR_IMM_SCALED
+      Imm:             3
+    - Opcode:          REBASE_OPCODE_DO_REBASE_IMM_TIMES
+      Imm:             1
+    - Opcode:          REBASE_OPCODE_DONE
+      Imm:             0
+  BindOpcodes:
+    - Opcode:          BIND_OPCODE_SET_SYMBOL_TRAILING_FLAGS_IMM
+      Imm:             0
+      Symbol:          '_OBJC_METACLASS_$_NSObject'
+    - Opcode:          BIND_OPCODE_SET_TYPE_IMM
+      Imm:             1
+      Symbol:          ''
+    - Opcode:          BIND_OPCODE_SET_DYLIB_ORDINAL_IMM
+      Imm:             1
+      Symbol:          ''
+    - Opcode:          BIND_OPCODE_SET_SEGMENT_AND_OFFSET_ULEB
+      Imm:             2
+      ULEBExtraData:   [ 0x90 ]
+      Symbol:          ''
+    - Opcode:          BIND_OPCODE_DO_BIND
+      Imm:             0
+      Symbol:          ''
+    - Opcode:          BIND_OPCODE_DO_BIND
+      Imm:             0
+      Symbol:          ''
+    - Opcode:          BIND_OPCODE_SET_SYMBOL_TRAILING_FLAGS_IMM
+      Imm:             0
+      Symbol:          __objc_empty_cache
+    - Opcode:          BIND_OPCODE_SET_TYPE_IMM
+      Imm:             1
+      Symbol:          ''
+    - Opcode:          BIND_OPCODE_DO_BIND
+      Imm:             0
+      Symbol:          ''
+    - Opcode:          BIND_OPCODE_ADD_ADDR_ULEB
+      Imm:             0
+      ULEBExtraData:   [ 0x20 ]
+      Symbol:          ''
+    - Opcode:          BIND_OPCODE_DO_BIND
+      Imm:             0
+      Symbol:          ''
+    - Opcode:          BIND_OPCODE_SET_SYMBOL_TRAILING_FLAGS_IMM
+      Imm:             0
+      Symbol:          '_OBJC_CLASS_$_NSObject'
+    - Opcode:          BIND_OPCODE_SET_TYPE_IMM
+      Imm:             1
+      Symbol:          ''
+    - Opcode:          BIND_OPCODE_ADD_ADDR_ULEB
+      Imm:             0
+      ULEBExtraData:   [ 0xFFFFFFFFFFFFFFF0 ]
+      Symbol:          ''
+    - Opcode:          BIND_OPCODE_DO_BIND
+      Imm:             0
+      Symbol:          ''
+    - Opcode:          BIND_OPCODE_DONE
+      Imm:             0
+      Symbol:          ''
+  ExportTrie:
+    TerminalSize:    0
+    NodeOffset:      0
+    Name:            ''
+    Flags:           0x0
+    Address:         0x0
+    Other:           0x0
+    ImportName:      ''
+    Children:
+      - TerminalSize:    0
+        NodeOffset:      5
+        Name:            _
+        Flags:           0x0
+        Address:         0x0
+        Other:           0x0
+        ImportName:      ''
+        Children:
+          - TerminalSize:    0
+            NodeOffset:      17
+            Name:            OBJC_
+            Flags:           0x0
+            Address:         0x0
+            Other:           0x0
+            ImportName:      ''
+            Children:
+              - TerminalSize:    3
+                NodeOffset:      49
+                Name:            'METACLASS_$_Foo'
+                Flags:           0x0
+                Address:         0x3090
+                Other:           0x0
+                ImportName:      ''
+              - TerminalSize:    3
+                NodeOffset:      54
+                Name:            'CLASS_$_Foo'
+                Flags:           0x0
+                Address:         0x30B8
+                Other:           0x0
+                ImportName:      ''
+          - TerminalSize:    0
+            NodeOffset:      59
+            Name:            p
+            Flags:           0x0
+            Address:         0x0
+            Other:           0x0
+            ImportName:      ''
+            Children:
+              - TerminalSize:    3
+                NodeOffset:      104
+                Name:            rivateGlobalVariable
+                Flags:           0x0
+                Address:         0x30E0
+                Other:           0x0
+                ImportName:      ''
+              - TerminalSize:    3
+                NodeOffset:      109
+                Name:            ublicGlobalVariable
+                Flags:           0x0
+                Address:         0x30E4
+                Other:           0x0
+                ImportName:      ''
+                Children:
+                  - TerminalSize:    3
+                    NodeOffset:      130
+                    Name:            '4'
+                    Flags:           0x0
+                    Address:         0x30F0
+                    Other:           0x0
+                    ImportName:      ''
+                  - TerminalSize:    3
+                    NodeOffset:      135
+                    Name:            '3'
+                    Flags:           0x0
+                    Address:         0x30EC
+                    Other:           0x0
+                    ImportName:      ''
+                  - TerminalSize:    3
+                    NodeOffset:      140
+                    Name:            '2'
+                    Flags:           0x0
+                    Address:         0x30E8
+                    Other:           0x0
+                    ImportName:      ''
+                  - TerminalSize:    3
+                    NodeOffset:      145
+                    Name:            '6'
+                    Flags:           0x0
+                    Address:         0x30F4
+                    Other:           0x0
+                    ImportName:      ''
+  NameList:
+    - n_strx:          2
+      n_type:          0xE
+      n_sect:          5
+      n_desc:          0
+      n_value:         12288
+    - n_strx:          28
+      n_type:          0xE
+      n_sect:          5
+      n_desc:          0
+      n_value:         12360
+    - n_strx:          50
+      n_type:          0xF
+      n_sect:          7
+      n_desc:          0
+      n_value:         12512
+    - n_strx:          73
+      n_type:          0xF
+      n_sect:          7
+      n_desc:          0
+      n_value:         12516
+    - n_strx:          95
+      n_type:          0xF
+      n_sect:          7
+      n_desc:          0
+      n_value:         12520
+    - n_strx:          118
+      n_type:          0xF
+      n_sect:          7
+      n_desc:          0
+      n_value:         12524
+    - n_strx:          141
+      n_type:          0xF
+      n_sect:          7
+      n_desc:          0
+      n_value:         12528
+    - n_strx:          164
+      n_type:          0xF
+      n_sect:          7
+      n_desc:          0
+      n_value:         12532
+    - n_strx:          187
+      n_type:          0xF
+      n_sect:          6
+      n_desc:          0
+      n_value:         12432
+    - n_strx:          209
+      n_type:          0xF
+      n_sect:          6
+      n_desc:          0
+      n_value:         12472
+    - n_strx:          227
+      n_type:          0x1
+      n_sect:          0
+      n_desc:          256
+      n_value:         0
+    - n_strx:          250
+      n_type:          0x1
+      n_sect:          0
+      n_desc:          256
+      n_value:         0
+    - n_strx:          277
+      n_type:          0x1
+      n_sect:          0
+      n_desc:          256
+      n_value:         0
+    - n_strx:          296
+      n_type:          0x1
+      n_sect:          0
+      n_desc:          512
+      n_value:         0
+  StringTable:
+    - ' '
+    - '__OBJC_METACLASS_RO_$_Foo'
+    - '__OBJC_CLASS_RO_$_Foo'
+    - _privateGlobalVariable
+    - _publicGlobalVariable
+    - _publicGlobalVariable2
+    - _publicGlobalVariable3
+    - _publicGlobalVariable4
+    - _publicGlobalVariable6
+    - '_OBJC_METACLASS_$_Foo'
+    - '_OBJC_CLASS_$_Foo'
+    - '_OBJC_CLASS_$_NSObject'
+    - '_OBJC_METACLASS_$_NSObject'
+    - __objc_empty_cache
+    - dyld_stub_binder
+    - ''
+    - ''
+    - ''
+    - ''
+    - ''
+    - ''
+    - ''
+...
diff --git a/clang/test/InstallAPI/basic.test b/clang/test/InstallAPI/basic.test
index 5b41ccd517b0af..096911039d114e 100644
--- a/clang/test/InstallAPI/basic.test
+++ b/clang/test/InstallAPI/basic.test
@@ -2,7 +2,8 @@
 // RUN: split-file %s %t
 /// Check basic arguments are captured.
 // RUN: clang-installapi -x objective-c -target arm64-apple-ios13.0.0 \
-// RUN: -fapplication-extension -current_version 1 -install_name /usr/lib/basic.dylib \
+// RUN: -fapplication-extension -current_version 1 -compatibility_version 1 \
+// RUN: -install_name /usr/lib/basic.dylib \
 // RUN: %t/basic_inputs.json -o %t/basic.tbd 2>&1 | FileCheck %s --allow-empty
 // RUN: llvm-readtapi -compare %t/basic.tbd %t/expected.tbd 2>&1 | FileCheck %s --allow-empty
 
@@ -25,11 +26,6 @@
 //--- expected.tbd
 {
   "main_library": {
-    "compatibility_versions": [
-      {
-        "version": "0"
-      }
-    ],
     "install_names": [
       {
         "name": "/usr/lib/basic.dylib"
diff --git a/clang/test/InstallAPI/diagnostics-cpp.test b/clang/test/InstallAPI/diagnostics-cpp.test
new file mode 100644
index 00000000000000..9319a7a61d4839
--- /dev/null
+++ b/clang/test/InstallAPI/diagnostics-cpp.test
@@ -0,0 +1,462 @@
+// RUN: rm -rf %t
+// RUN: split-file %s %t
+// RUN: sed -e "s|DSTROOT|%/t|g" %t/inputs.json.in > %t/inputs.json
+
+// RUN: yaml2obj %t/Mismatch.yaml -o %t/System/Library/Frameworks/MismatchCpp.framework/MismatchCpp
+
+// RUN: not clang-installapi --target=arm64-apple-macos13 -x objective-c++  \
+// RUN: -F %t/System/Library/Frameworks \
+// RUN: -install_name /System/Library/Frameworks/MismatchCpp.framework/Versions/A/MismatchCpp \
+// RUN: -current_version 1 -compatibility_version 1 %t/inputs.json \
+// RUN: --verify-against=%t/System/Library/Frameworks/MismatchCpp.framework/MismatchCpp \
+// RUN: --verify-mode=Pedantic -o %t/output.tbd --demangle 2> %t/errors.log
+// RUN: FileCheck -input-file %t/errors.log %s 
+
+CHECK: warning: violations found for arm64-apple-macos13
+CHECK: CPP.h:5:7: error: declaration has external linkage, but symbol has internal linkage in dynamic library 'vtable for Bar'
+CHECK-NEXT: class Bar : Foo {
+CHECK-NEXT:       ^
+CHECK-NEXT: CPP.h:5:7: error: declaration has external linkage, but symbol has internal linkage in dynamic library 'typeinfo for Bar'
+CHECK-NEXT: CPP.h:5:7: error: declaration has external linkage, but symbol has internal linkage in dynamic library 'typeinfo name for Bar'
+CHECK-NEXT: CPP.h:6:7: error: dynamic library symbol '(weak-def) Bar::init()' is weak defined, but its declaration is not
+CHECK-NEXT:   int init();
+CHECK-NEXT:       ^
+
+//--- inputs.json.in
+{
+  "headers": [ {
+    "path" : "DSTROOT/System/Library/Frameworks/MismatchCpp.framework/Headers/CPP.h",
+    "type" : "public"
+  }
+  ],
+  "version": "3"
+}
+
+//--- System/Library/Frameworks/MismatchCpp.framework/Headers/CPP.h
+class Foo {
+  virtual int init() = 0;
+};
+
+class Bar : Foo { 
+  int init();
+};
+
+/// Created from: 
+// With LD flags: -exported_symbol,"__ZN3Bar4initEv" -exported_symbol,"__Z3fooIjEiT_"
+// class Foo { virtual int init() = 0;};
+// 
+// class Bar : Foo {int init() { return 1;}};
+// Bar bar;
+// 
+// template <typename T> int foo(T val) { return 1; }
+// template <> int foo(unsigned val) { return 1; }
+
+//--- Mismatch.yaml
+--- !mach-o
+FileHeader:
+  magic:           0xFEEDFACF
+  cputype:         0x100000C
+  cpusubtype:      0x0
+  filetype:        0x6
+  ncmds:           15
+  sizeofcmds:      1224
+  flags:           0x118085
+  reserved:        0x0
+LoadCommands:
+  - cmd:             LC_SEGMENT_64
+    cmdsize:         312
+    segname:         __TEXT
+    vmaddr:          0
+    vmsize:          16384
+    fileoff:         0
+    filesize:        16384
+    maxprot:         5
+    initprot:        5
+    nsects:          3
+    flags:           0
+    Sections:
+      - sectname:        __text
+        segname:         __TEXT
+        addr:            0x10E8
+        size:            16
+        offset:          0x10E8
+        align:           2
+        reloff:          0x0
+        nreloc:          0
+        flags:           0x80000400
+        reserved1:       0x0
+        reserved2:       0x0
+        reserved3:       0x0
+        content:         20008052C0035FD620008052C0035FD6
+      - sectname:        __const
+        segname:         __TEXT
+        addr:            0x10F8
+        size:            10
+        offset:          0x10F8
+        align:           0
+        reloff:          0x0
+        nreloc:          0
+        flags:           0x0
+        reserved1:       0x0
+        reserved2:       0x0
+        reserved3:       0x0
+        content:         334261720033466F6F00
+      - sectname:        __unwind_info
+        segname:         __TEXT
+        addr:            0x1104
+        size:            4152
+        offset:          0x1104
+        align:           2
+        reloff:          0x0
+        nreloc:          0
+        flags:           0x0
+        reserved1:       0x0
+        reserved2:       0x0
+        reserved3:       0x0
+        content:         010000001C000000010000002000000000000000200000000200000000000002E81000003800000038000000F81000000000000038000000030000000C0001001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
+  - cmd:             LC_SEGMENT_64
+    cmdsize:         232
+    segname:         __DATA_CONST
+    vmaddr:          16384
+    vmsize:          16384
+    fileoff:         16384
+    filesize:        16384
+    maxprot:         3
+    initprot:        3
+    nsects:          2
+    flags:           16
+    Sections:
+      - sectname:        __const
+        segname:         __DATA_CONST
+        addr:            0x4000
+        size:            80
+        offset:          0x4000
+        align:           3
+        reloff:          0x0
+        nreloc:          0
+        flags:           0x0
+        reserved1:       0x0
+        reserved2:       0x0
+        reserved3:       0x0
+        content:         00000000000000002840000000000000F0100000000000001000000000000000FD100000000000801000000000000000F810000000000080000000000100000018400000000000000000000000000000
+      - sectname:        __objc_imageinfo
+        segname:         __DATA_CONST
+        addr:            0x4050
+        size:            8
+        offset:          0x4050
+        align:           0
+        reloff:          0x0
+        nreloc:          0
+        flags:           0x0
+        reserved1:       0x0
+        reserved2:       0x0
+        reserved3:       0x0
+        content:         '0000000040000000'
+  - cmd:             LC_SEGMENT_64
+    cmdsize:         152
+    segname:         __DATA
+    vmaddr:          32768
+    vmsize:          16384
+    fileoff:         32768
+    filesize:        16384
+    maxprot:         3
+    initprot:        3
+    nsects:          1
+    flags:           0
+    Sections:
+      - sectname:        __data
+        segname:         __DATA
+        addr:            0x8000
+        size:            8
+        offset:          0x8000
+        align:           3
+        reloff:          0x0
+        nreloc:          0
+        flags:           0x0
+        reserved1:       0x0
+        reserved2:       0x0
+        reserved3:       0x0
+        content:         '1040000000000000'
+  - cmd:             LC_SEGMENT_64
+    cmdsize:         72
+    segname:         __LINKEDIT
+    vmaddr:          49152
+    vmsize:          1104
+    fileoff:         49152
+    filesize:        1104
+    maxprot:         1
+    initprot:        1
+    nsects:          0
+    flags:           0
+  - cmd:             LC_DYLD_INFO_ONLY
+    cmdsize:         48
+    rebase_off:      49152
+    rebase_size:     16
+    bind_off:        49168
+    bind_size:       96
+    weak_bind_off:   49264
+    weak_bind_size:  24
+    lazy_bind_off:   0
+    lazy_bind_size:  0
+    export_off:      49288
+    export_size:     48
+  - cmd:             LC_SYMTAB
+    cmdsize:         24
+    symoff:          49344
+    nsyms:           11
+    stroff:          49520
+    strsize:         192
+  - cmd:             LC_DYSYMTAB
+    cmdsize:         80
+    ilocalsym:       0
+    nlocalsym:       6
+    iextdefsym:      6
+    nextdefsym:      2
+    iundefsym:       8
+    nundefsym:       3
+    tocoff:          0
+    ntoc:            0
+    modtaboff:       0
+    nmodtab:         0
+    extrefsymoff:    0
+    nextrefsyms:     0
+    indirectsymoff:  0
+    nindirectsyms:   0
+    extreloff:       0
+    nextrel:         0
+    locreloff:       0
+    nlocrel:         0
+  - cmd:             LC_ID_DYLIB
+    cmdsize:         96
+    dylib:
+      name:            24
+      timestamp:       0
+      current_version: 65536
+      compatibility_version: 65536
+    Content:         '/System/Library/Frameworks/MismatchCpp.framework/Versions/A/MismatchCpp'
+    ZeroPadBytes:    1
+  - cmd:             LC_UUID
+    cmdsize:         24
+    uuid:            4C4C44F3-5555-3144-A13F-B3FE15787197
+  - cmd:             LC_BUILD_VERSION
+    cmdsize:         32
+    platform:        1
+    minos:           851968
+    sdk:             983040
+    ntools:          1
+    Tools:
+      - tool:            4
+        version:         1245184
+  - cmd:             LC_LOAD_DYLIB
+    cmdsize:         48
+    dylib:
+      name:            24
+      timestamp:       0
+      current_version: 117985024
+      compatibility_version: 65536
+    Content:         '/usr/lib/libc++.1.dylib'
+    ZeroPadBytes:    1
+  - cmd:             LC_LOAD_DYLIB
+    cmdsize:         56
+    dylib:
+      name:            24
+      timestamp:       0
+      current_version: 88473600
+      compatibility_version: 65536
+    Content:         '/usr/lib/libSystem.B.dylib'
+    ZeroPadBytes:    6
+  - cmd:             LC_FUNCTION_STARTS
+    cmdsize:         16
+    dataoff:         49336
+    datasize:        8
+  - cmd:             LC_DATA_IN_CODE
+    cmdsize:         16
+    dataoff:         49344
+    datasize:        0
+  - cmd:             LC_CODE_SIGNATURE
+    cmdsize:         16
+    dataoff:         49712
+    datasize:        544
+LinkEditData:
+  RebaseOpcodes:
+    - Opcode:          REBASE_OPCODE_SET_TYPE_IMM
+      Imm:             1
+    - Opcode:          REBASE_OPCODE_SET_SEGMENT_AND_OFFSET_ULEB
+      Imm:             1
+      ExtraData:       [ 0x8 ]
+    - Opcode:          REBASE_OPCODE_DO_REBASE_IMM_TIMES
+      Imm:             2
+    - Opcode:          REBASE_OPCODE_ADD_ADDR_IMM_SCALED
+      Imm:             1
+    - Opcode:          REBASE_OPCODE_DO_REBASE_ULEB_TIMES_SKIPPING_ULEB
+      Imm:             0
+      ExtraData:       [ 0x3, 0x8 ]
+    - Opcode:          REBASE_OPCODE_SET_SEGMENT_AND_OFFSET_ULEB
+      Imm:             2
+      ExtraData:       [ 0x0 ]
+    - Opcode:          REBASE_OPCODE_DO_REBASE_IMM_TIMES
+      Imm:             1
+    - Opcode:          REBASE_OPCODE_DONE
+      Imm:             0
+  BindOpcodes:
+    - Opcode:          BIND_OPCODE_SET_SYMBOL_TRAILING_FLAGS_IMM
+      Imm:             0
+      Symbol:          __ZTVN10__cxxabiv117__class_type_infoE
+    - Opcode:          BIND_OPCODE_SET_TYPE_IMM
+      Imm:             1
+      Symbol:          ''
+    - Opcode:          BIND_OPCODE_SET_DYLIB_ORDINAL_IMM
+      Imm:             1
+      Symbol:          ''
+    - Opcode:          BIND_OPCODE_SET_SEGMENT_AND_OFFSET_ULEB
+      Imm:             1
+      ULEBExtraData:   [ 0x18 ]
+      Symbol:          ''
+    - Opcode:          BIND_OPCODE_SET_ADDEND_SLEB
+      Imm:             0
+      SLEBExtraData:   [ 16 ]
+      Symbol:          ''
+    - Opcode:          BIND_OPCODE_DO_BIND
+      Imm:             0
+      Symbol:          ''
+    - Opcode:          BIND_OPCODE_SET_SYMBOL_TRAILING_FLAGS_IMM
+      Imm:             0
+      Symbol:          __ZTVN10__cxxabiv121__vmi_class_type_infoE
+    - Opcode:          BIND_OPCODE_SET_TYPE_IMM
+      Imm:             1
+      Symbol:          ''
+    - Opcode:          BIND_OPCODE_ADD_ADDR_ULEB
+      Imm:             0
+      ULEBExtraData:   [ 0x8 ]
+      Symbol:          ''
+    - Opcode:          BIND_OPCODE_DO_BIND
+      Imm:             0
+      Symbol:          ''
+    - Opcode:          BIND_OPCODE_DONE
+      Imm:             0
+      Symbol:          ''
+  WeakBindOpcodes:
+    - Opcode:          BIND_OPCODE_SET_SYMBOL_TRAILING_FLAGS_IMM
+      Imm:             0
+      Symbol:          __ZN3Bar4initEv
+    - Opcode:          BIND_OPCODE_SET_TYPE_IMM
+      Imm:             1
+      Symbol:          ''
+    - Opcode:          BIND_OPCODE_SET_SEGMENT_AND_OFFSET_ULEB
+      Imm:             1
+      ULEBExtraData:   [ 0x10 ]
+      Symbol:          ''
+    - Opcode:          BIND_OPCODE_DO_BIND
+      Imm:             0
+      Symbol:          ''
+    - Opcode:          BIND_OPCODE_DONE
+      Imm:             0
+      Symbol:          ''
+  ExportTrie:
+    TerminalSize:    0
+    NodeOffset:      0
+    Name:            ''
+    Flags:           0x0
+    Address:         0x0
+    Other:           0x0
+    ImportName:      ''
+    Children:
+      - TerminalSize:    0
+        NodeOffset:      7
+        Name:            __Z
+        Flags:           0x0
+        Address:         0x0
+        Other:           0x0
+        ImportName:      ''
+        Children:
+          - TerminalSize:    3
+            NodeOffset:      35
+            Name:            3fooIjEiT_
+            Flags:           0x0
+            Address:         0x10E8
+            Other:           0x0
+            ImportName:      ''
+          - TerminalSize:    3
+            NodeOffset:      40
+            Name:            N3Bar4initEv
+            Flags:           0x4
+            Address:         0x10F0
+            Other:           0x0
+            ImportName:      ''
+  NameList:
+    - n_strx:          32
+      n_type:          0x1E
+      n_sect:          4
+      n_desc:          0
+      n_value:         16384
+    - n_strx:          42
+      n_type:          0x1E
+      n_sect:          4
+      n_desc:          0
+      n_value:         16408
+    - n_strx:          52
+      n_type:          0x1E
+      n_sect:          4
+      n_desc:          0
+      n_value:         16424
+    - n_strx:          62
+      n_type:          0x1E
+      n_sect:          6
+      n_desc:          0
+      n_value:         32768
+    - n_strx:          67
+      n_type:          0x1E
+      n_sect:          2
+      n_desc:          0
+      n_value:         4344
+    - n_strx:          77
+      n_type:          0x1E
+      n_sect:          2
+      n_desc:          0
+      n_value:         4349
+    - n_strx:          2
+      n_type:          0xF
+      n_sect:          1
+      n_desc:          0
+      n_value:         4328
+    - n_strx:          16
+      n_type:          0xF
+      n_sect:          1
+      n_desc:          128
+      n_value:         4336
+    - n_strx:          87
+      n_type:          0x1
+      n_sect:          0
+      n_desc:          256
+      n_value:         0
+    - n_strx:          126
+      n_type:          0x1
+      n_sect:          0
+      n_desc:          256
+      n_value:         0
+    - n_strx:          169
+      n_type:          0x1
+      n_sect:          0
+      n_desc:          512
+      n_value:         0
+  StringTable:
+    - ' '
+    - __Z3fooIjEiT_
+    - __ZN3Bar4initEv
+    - __ZTV3Bar
+    - __ZTI3Foo
+    - __ZTI3Bar
+    - _bar
+    - __ZTS3Bar
+    - __ZTS3Foo
+    - __ZTVN10__cxxabiv117__class_type_infoE
+    - __ZTVN10__cxxabiv121__vmi_class_type_infoE
+    - dyld_stub_binder
+    - ''
+    - ''
+    - ''
+    - ''
+    - ''
+    - ''
+  FunctionStarts:  [ 0x10E8, 0x10F0 ]
+...
diff --git a/clang/test/InstallAPI/driver-invalid-options.test b/clang/test/InstallAPI/driver-invalid-options.test
index a2e008e1eb03e7..69f3b2d66ab8b6 100644
--- a/clang/test/InstallAPI/driver-invalid-options.test
+++ b/clang/test/InstallAPI/driver-invalid-options.test
@@ -1,4 +1,9 @@
 /// Check non-darwin triple is rejected.
-// RUN: not clang-installapi -target x86_64-unknown-unknown %s 2> %t 
+// RUN: not clang-installapi -target x86_64-unknown-unknown %s -o tmp.tbd 2> %t 
 // RUN: FileCheck --check-prefix INVALID_INSTALLAPI -input-file %t %s
 // INVALID_INSTALLAPI: error: unsupported option 'installapi' for target 'x86_64-unknown-unknown'
+
+/// Check that missing install_name is reported.
+// RUN: not clang-installapi -target x86_64-apple-ios-simulator  %s -o tmp.tbd 2> %t 
+// RUN: FileCheck --check-prefix INVALID_INSTALL_NAME -input-file %t %s
+// INVALID_INSTALL_NAME: error: no install name specified: add -install_name <path>
diff --git a/clang/test/InstallAPI/functions.test b/clang/test/InstallAPI/functions.test
index 527965303cb351..5b5fd1308842ed 100644
--- a/clang/test/InstallAPI/functions.test
+++ b/clang/test/InstallAPI/functions.test
@@ -4,7 +4,7 @@
 
 // RUN: clang-installapi -target arm64-apple-macos13.1 \
 // RUN: -I%t/usr/include -I%t/usr/local/include \
-// RUN: -install_name @rpath/lib/libfunctions.dylib \
+// RUN: -install_name @rpath/lib/libfunctions.dylib --filetype=tbd-v4 \
 // RUN: %t/inputs.json -o %t/outputs.tbd 2>&1 | FileCheck %s --allow-empty
 // RUN: llvm-readtapi -compare %t/outputs.tbd %t/expected.tbd 2>&1 | FileCheck %s --allow-empty
 
diff --git a/clang/test/InstallAPI/hiddens.test b/clang/test/InstallAPI/hiddens.test
new file mode 100644
index 00000000000000..b3196ef945cd12
--- /dev/null
+++ b/clang/test/InstallAPI/hiddens.test
@@ -0,0 +1,262 @@
+// RUN: rm -rf %t
+// RUN: split-file %s %t
+// RUN: sed -e "s|DSTROOT|%/t|g" %t/inputs.json.in > %t/inputs.json
+
+// RUN: yaml2obj %t/Hidden.yaml -o %t/System/Library/Frameworks/Hidden.framework/Hidden
+
+// RUN: clang-installapi --target=x86_64-apple-macos13 -x objective-c \
+// RUN: -F %t/System/Library/Frameworks \
+// RUN: -install_name /System/Library/Frameworks/Hidden.framework/Versions/A/Hidden\
+// RUN: -current_version 1 -compatibility_version 1 %t/inputs.json \
+// RUN: --verify-against=%t/System/Library/Frameworks/Hidden.framework/Hidden \
+// RUN: --verify-mode=Pedantic -o %t/output.tbd 2>&1 | FileCheck %s 
+
+// CHECK-NOT: error
+// CHECK:  warning: use of __private_extern__ 
+
+// RUN: llvm-readtapi --compare %t/output.tbd %t/expected.tbd 
+
+//--- inputs.json.in
+{
+  "headers": [ {
+    "path" : "DSTROOT/System/Library/Frameworks/Hidden.framework/Headers/Hidden.h",
+    "type" : "public"
+  }
+  ],
+  "version": "3"
+}
+
+//--- System/Library/Frameworks/Hidden.framework/Headers/Hidden.h
+__private_extern__ int foo(); // Clang doesn't warn on this, but should.
+__private_extern__ int baz;
+__attribute__((visibility("hidden"))) int bar();
+
+/// Created from: 
+/// #import "Hidden.h" int foo(void) { return 1; } int bar(void) { return 1; }
+//--- Hidden.yaml
+--- !mach-o
+FileHeader:
+  magic:           0xFEEDFACF
+  cputype:         0x1000007
+  cpusubtype:      0x3
+  filetype:        0x6
+  ncmds:           12
+  sizeofcmds:      920
+  flags:           0x100085
+  reserved:        0x0
+LoadCommands:
+  - cmd:             LC_SEGMENT_64
+    cmdsize:         312
+    segname:         __TEXT
+    vmaddr:          0
+    vmsize:          8192
+    fileoff:         0
+    filesize:        8192
+    maxprot:         5
+    initprot:        5
+    nsects:          3
+    flags:           0
+    Sections:
+      - sectname:        __text
+        segname:         __TEXT
+        addr:            0xBB8
+        size:            22
+        offset:          0xBB8
+        align:           0
+        reloff:          0x0
+        nreloc:          0
+        flags:           0x80000400
+        reserved1:       0x0
+        reserved2:       0x0
+        reserved3:       0x0
+        content:         554889E5B8010000005DC3554889E5B8010000005DC3
+      - sectname:        __unwind_info
+        segname:         __TEXT
+        addr:            0xBD0
+        size:            4152
+        offset:          0xBD0
+        align:           2
+        reloff:          0x0
+        nreloc:          0
+        flags:           0x0
+        reserved1:       0x0
+        reserved2:       0x0
+        reserved3:       0x0
+        content:         010000001C000000010000002000000000000000200000000200000000000001B80B00003800000038000000CE0B00000000000038000000030000000C0001001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
+      - sectname:        __eh_frame
+        segname:         __TEXT
+        addr:            0x1C08
+        size:            24
+        offset:          0x1C08
+        align:           3
+        reloff:          0x0
+        nreloc:          0
+        flags:           0x6000000B
+        reserved1:       0x0
+        reserved2:       0x0
+        reserved3:       0x0
+        content:         1400000000000000017A520001781001100C070890010000
+  - cmd:             LC_SEGMENT_64
+    cmdsize:         152
+    segname:         __DATA_CONST
+    vmaddr:          8192
+    vmsize:          4096
+    fileoff:         8192
+    filesize:        4096
+    maxprot:         3
+    initprot:        3
+    nsects:          1
+    flags:           16
+    Sections:
+      - sectname:        __objc_imageinfo
+        segname:         __DATA_CONST
+        addr:            0x2000
+        size:            8
+        offset:          0x2000
+        align:           0
+        reloff:          0x0
+        nreloc:          0
+        flags:           0x0
+        reserved1:       0x0
+        reserved2:       0x0
+        reserved3:       0x0
+        content:         '0000000040000000'
+  - cmd:             LC_SEGMENT_64
+    cmdsize:         72
+    segname:         __LINKEDIT
+    vmaddr:          12288
+    vmsize:          104
+    fileoff:         12288
+    filesize:        104
+    maxprot:         1
+    initprot:        1
+    nsects:          0
+    flags:           0
+  - cmd:             LC_DYLD_INFO_ONLY
+    cmdsize:         48
+    rebase_off:      0
+    rebase_size:     0
+    bind_off:        0
+    bind_size:       0
+    weak_bind_off:   0
+    weak_bind_size:  0
+    lazy_bind_off:   0
+    lazy_bind_size:  0
+    export_off:      12288
+    export_size:     16
+  - cmd:             LC_SYMTAB
+    cmdsize:         24
+    symoff:          12312
+    nsyms:           3
+    stroff:          12360
+    strsize:         32
+  - cmd:             LC_DYSYMTAB
+    cmdsize:         80
+    ilocalsym:       0
+    nlocalsym:       1
+    iextdefsym:      1
+    nextdefsym:      1
+    iundefsym:       2
+    nundefsym:       1
+    tocoff:          0
+    ntoc:            0
+    modtaboff:       0
+    nmodtab:         0
+    extrefsymoff:    0
+    nextrefsyms:     0
+    indirectsymoff:  0
+    nindirectsyms:   0
+    extreloff:       0
+    nextrel:         0
+    locreloff:       0
+    nlocrel:         0
+  - cmd:             LC_ID_DYLIB
+    cmdsize:         88
+    dylib:
+      name:            24
+      timestamp:       0
+      current_version: 65536
+      compatibility_version: 65536
+    Content:         '/System/Library/Frameworks/Hidden.framework/Versions/A/Hidden'
+    ZeroPadBytes:    3
+  - cmd:             LC_UUID
+    cmdsize:         24
+    uuid:            4C4C44E7-5555-3144-A133-0271E799C487
+  - cmd:             LC_BUILD_VERSION
+    cmdsize:         32
+    platform:        1
+    minos:           851968
+    sdk:             983040
+    ntools:          1
+    Tools:
+      - tool:            4
+        version:         1245184
+  - cmd:             LC_LOAD_DYLIB
+    cmdsize:         56
+    dylib:
+      name:            24
+      timestamp:       0
+      current_version: 88473600
+      compatibility_version: 65536
+    Content:         '/usr/lib/libSystem.B.dylib'
+    ZeroPadBytes:    6
+  - cmd:             LC_FUNCTION_STARTS
+    cmdsize:         16
+    dataoff:         12304
+    datasize:        8
+  - cmd:             LC_DATA_IN_CODE
+    cmdsize:         16
+    dataoff:         12312
+    datasize:        0
+LinkEditData:
+  ExportTrie:
+    TerminalSize:    0
+    NodeOffset:      0
+    Name:            ''
+    Flags:           0x0
+    Address:         0x0
+    Other:           0x0
+    ImportName:      ''
+    Children:
+      - TerminalSize:    3
+        NodeOffset:      8
+        Name:            _foo
+        Flags:           0x0
+        Address:         0xBB8
+        Other:           0x0
+        ImportName:      ''
+  NameList:
+    - n_strx:          7
+      n_type:          0x1E
+      n_sect:          1
+      n_desc:          0
+      n_value:         3011
+    - n_strx:          2
+      n_type:          0xF
+      n_sect:          1
+      n_desc:          0
+      n_value:         3000
+    - n_strx:          12
+      n_type:          0x1
+      n_sect:          0
+      n_desc:          256
+      n_value:         0
+  StringTable:
+    - ' '
+    - _foo
+    - _bar
+    - dyld_stub_binder
+    - ''
+    - ''
+    - ''
+  FunctionStarts:  [ 0xBB8, 0xBC3 ]
+...
+
+//--- expected.tbd
+--- !tapi-tbd
+tbd-version:     4
+targets:         [ x86_64-macos ]
+flags:           [ not_app_extension_safe ]
+install-name:    '/System/Library/Frameworks/Hidden.framework/Versions/A/Hidden'
+...
+
diff --git a/clang/test/Misc/target-invalid-cpu-note.c b/clang/test/Misc/target-invalid-cpu-note.c
index b65a8fb057ee53..9c91c4157cd6a0 100644
--- a/clang/test/Misc/target-invalid-cpu-note.c
+++ b/clang/test/Misc/target-invalid-cpu-note.c
@@ -5,11 +5,11 @@
 
 // RUN: not %clang_cc1 -triple arm64--- -target-cpu not-a-cpu -fsyntax-only %s 2>&1 | FileCheck %s --check-prefix AARCH64
 // AARCH64: error: unknown target CPU 'not-a-cpu'
-// AARCH64-NEXT: note: valid target CPU values are: cortex-a34, cortex-a35, cortex-a53, cortex-a55, cortex-a510, cortex-a520, cortex-a57, cortex-a65, cortex-a65ae, cortex-a72, cortex-a73, cortex-a75, cortex-a76, cortex-a76ae, cortex-a77, cortex-a78, cortex-a78ae, cortex-a78c, cortex-a710, cortex-a715, cortex-a720, cortex-r82, cortex-x1, cortex-x1c, cortex-x2, cortex-x3, cortex-x4, neoverse-e1, neoverse-n1, neoverse-n2, neoverse-512tvb, neoverse-v1, neoverse-v2, cyclone, apple-a7, apple-a8, apple-a9, apple-a10, apple-a11, apple-a12, apple-a13, apple-a14, apple-a15, apple-a16, apple-a17, apple-m1, apple-m2, apple-m3, apple-s4, apple-s5, exynos-m3, exynos-m4, exynos-m5, falkor, saphira, kryo, thunderx2t99, thunderx3t110, thunderx, thunderxt88, thunderxt81, thunderxt83, tsv110, a64fx, carmel, ampere1, ampere1a, ampere1b, cobalt-100, grace{{$}}
+// AARCH64-NEXT: note: valid target CPU values are: cortex-a34, cortex-a35, cortex-a53, cortex-a55, cortex-a510, cortex-a520, cortex-a520ae, cortex-a57, cortex-a65, cortex-a65ae, cortex-a72, cortex-a73, cortex-a75, cortex-a76, cortex-a76ae, cortex-a77, cortex-a78, cortex-a78ae, cortex-a78c, cortex-a710, cortex-a715, cortex-a720, cortex-a720ae, cortex-r82, cortex-x1, cortex-x1c, cortex-x2, cortex-x3, cortex-x4, neoverse-e1, neoverse-n1, neoverse-n2, neoverse-512tvb, neoverse-v1, neoverse-v2, cyclone, apple-a7, apple-a8, apple-a9, apple-a10, apple-a11, apple-a12, apple-a13, apple-a14, apple-a15, apple-a16, apple-a17, apple-m1, apple-m2, apple-m3, apple-s4, apple-s5, exynos-m3, exynos-m4, exynos-m5, falkor, saphira, kryo, thunderx2t99, thunderx3t110, thunderx, thunderxt88, thunderxt81, thunderxt83, tsv110, a64fx, carmel, ampere1, ampere1a, ampere1b, cobalt-100, grace{{$}}
 
 // RUN: not %clang_cc1 -triple arm64--- -tune-cpu not-a-cpu -fsyntax-only %s 2>&1 | FileCheck %s --check-prefix TUNE_AARCH64
 // TUNE_AARCH64: error: unknown target CPU 'not-a-cpu'
-// TUNE_AARCH64-NEXT: note: valid target CPU values are: cortex-a34, cortex-a35, cortex-a53, cortex-a55, cortex-a510, cortex-a520, cortex-a57, cortex-a65, cortex-a65ae, cortex-a72, cortex-a73, cortex-a75, cortex-a76, cortex-a76ae, cortex-a77, cortex-a78, cortex-a78ae, cortex-a78c, cortex-a710, cortex-a715, cortex-a720, cortex-r82, cortex-x1, cortex-x1c, cortex-x2, cortex-x3, cortex-x4, neoverse-e1, neoverse-n1, neoverse-n2, neoverse-512tvb, neoverse-v1, neoverse-v2, cyclone, apple-a7, apple-a8, apple-a9, apple-a10, apple-a11, apple-a12, apple-a13, apple-a14, apple-a15, apple-a16, apple-a17, apple-m1, apple-m2, apple-m3, apple-s4, apple-s5, exynos-m3, exynos-m4, exynos-m5, falkor, saphira, kryo, thunderx2t99, thunderx3t110, thunderx, thunderxt88, thunderxt81, thunderxt83, tsv110, a64fx, carmel, ampere1, ampere1a, ampere1b, cobalt-100, grace{{$}}
+// TUNE_AARCH64-NEXT: note: valid target CPU values are: cortex-a34, cortex-a35, cortex-a53, cortex-a55, cortex-a510, cortex-a520, cortex-a520ae, cortex-a57, cortex-a65, cortex-a65ae, cortex-a72, cortex-a73, cortex-a75, cortex-a76, cortex-a76ae, cortex-a77, cortex-a78, cortex-a78ae, cortex-a78c, cortex-a710, cortex-a715, cortex-a720, cortex-a720ae, cortex-r82, cortex-x1, cortex-x1c, cortex-x2, cortex-x3, cortex-x4, neoverse-e1, neoverse-n1, neoverse-n2, neoverse-512tvb, neoverse-v1, neoverse-v2, cyclone, apple-a7, apple-a8, apple-a9, apple-a10, apple-a11, apple-a12, apple-a13, apple-a14, apple-a15, apple-a16, apple-a17, apple-m1, apple-m2, apple-m3, apple-s4, apple-s5, exynos-m3, exynos-m4, exynos-m5, falkor, saphira, kryo, thunderx2t99, thunderx3t110, thunderx, thunderxt88, thunderxt81, thunderxt83, tsv110, a64fx, carmel, ampere1, ampere1a, ampere1b, cobalt-100, grace{{$}}
 
 // RUN: not %clang_cc1 -triple i386--- -target-cpu not-a-cpu -fsyntax-only %s 2>&1 | FileCheck %s --check-prefix X86
 // X86: error: unknown target CPU 'not-a-cpu'
diff --git a/clang/test/Modules/bounds-safety-attributed-type.c b/clang/test/Modules/bounds-safety-attributed-type.c
new file mode 100644
index 00000000000000..0aa2f16b321c5f
--- /dev/null
+++ b/clang/test/Modules/bounds-safety-attributed-type.c
@@ -0,0 +1,27 @@
+// RUN: rm -rf %t
+// RUN: %clang_cc1 -fmodules -fmodules-cache-path=%t -verify %s
+// RUN: %clang_cc1 -fmodules -fmodules-cache-path=%t -ast-dump-all %s | FileCheck %s
+// expected-no-diagnostics
+
+#pragma clang module build bounds_safety
+module bounds_safety {}
+#pragma clang module contents
+#pragma clang module begin bounds_safety
+struct Test {
+  int count;
+  int fam[] __attribute__((counted_by(count)));
+};
+#pragma clang module end
+#pragma clang module endbuild
+
+#pragma clang module import bounds_safety
+
+struct Test *p;
+
+// CHECK: |-RecordDecl {{.*}}bounds_safety.map:4:1, line:7:1> line:4:8 imported in bounds_safety <undeserialized declarations> struct Test definition
+// CHECK: | |-FieldDecl {{.*}} imported in bounds_safety referenced count 'int'
+// CHECK: | `-FieldDecl {{.*}} imported in bounds_safety fam 'int[] __counted_by(count)':'int[]'
+
+// CHECK: |-ImportDecl {{.*}}bounds-safety-attributed-type.c:17:22> col:22 implicit bounds_safety
+// CHECK: |-RecordDecl {{.*}} struct Test
+// CHECK: `-VarDecl {{.*}} p 'struct Test *'
diff --git a/clang/test/OpenMP/bug60602.cpp b/clang/test/OpenMP/bug60602.cpp
index 48dc341a08224e..9c92ec27b86eea 100644
--- a/clang/test/OpenMP/bug60602.cpp
+++ b/clang/test/OpenMP/bug60602.cpp
@@ -95,7 +95,7 @@ int kernel_within_loop(int *a, int *b, int N, int num_iters) {
 // CHECK-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [3 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CHECK-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK-NEXT:    store i32 2, ptr [[TMP28]], align 4
+// CHECK-NEXT:    store i32 3, ptr [[TMP28]], align 4
 // CHECK-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK-NEXT:    store i32 3, ptr [[TMP29]], align 4
 // CHECK-NEXT:    [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -181,7 +181,7 @@ int kernel_within_loop(int *a, int *b, int N, int num_iters) {
 // CHECK-NEXT:    [[ADD:%.*]] = add i32 [[TMP71]], 1
 // CHECK-NEXT:    [[TMP72:%.*]] = zext i32 [[ADD]] to i64
 // CHECK-NEXT:    [[TMP73:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 0
-// CHECK-NEXT:    store i32 2, ptr [[TMP73]], align 4
+// CHECK-NEXT:    store i32 3, ptr [[TMP73]], align 4
 // CHECK-NEXT:    [[TMP74:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 1
 // CHECK-NEXT:    store i32 3, ptr [[TMP74]], align 4
 // CHECK-NEXT:    [[TMP75:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 2
diff --git a/clang/test/OpenMP/distribute_codegen.cpp b/clang/test/OpenMP/distribute_codegen.cpp
index 31ec6ff911905e..34d14c89fedaed 100644
--- a/clang/test/OpenMP/distribute_codegen.cpp
+++ b/clang/test/OpenMP/distribute_codegen.cpp
@@ -163,7 +163,7 @@ int fint(void) { return ftemplate<int>(); }
 // CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP18]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP18]], align 4
 // CHECK1-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 4, ptr [[TMP19]], align 4
 // CHECK1-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -354,7 +354,7 @@ int fint(void) { return ftemplate<int>(); }
 // CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP18]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP18]], align 4
 // CHECK1-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 4, ptr [[TMP19]], align 4
 // CHECK1-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -545,7 +545,7 @@ int fint(void) { return ftemplate<int>(); }
 // CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP18]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP18]], align 4
 // CHECK1-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 4, ptr [[TMP19]], align 4
 // CHECK1-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -744,7 +744,7 @@ int fint(void) { return ftemplate<int>(); }
 // CHECK1-NEXT:    [[ADD4:%.*]] = add nsw i32 [[TMP9]], 1
 // CHECK1-NEXT:    [[TMP10:%.*]] = zext i32 [[ADD4]] to i64
 // CHECK1-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP11]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP11]], align 4
 // CHECK1-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP12]], align 4
 // CHECK1-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -911,7 +911,7 @@ int fint(void) { return ftemplate<int>(); }
 // CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP7]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP7]], align 4
 // CHECK1-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP8]], align 4
 // CHECK1-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1084,7 +1084,7 @@ int fint(void) { return ftemplate<int>(); }
 // CHECK3-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP18]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP18]], align 4
 // CHECK3-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 4, ptr [[TMP19]], align 4
 // CHECK3-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1271,7 +1271,7 @@ int fint(void) { return ftemplate<int>(); }
 // CHECK3-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP18]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP18]], align 4
 // CHECK3-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 4, ptr [[TMP19]], align 4
 // CHECK3-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1458,7 +1458,7 @@ int fint(void) { return ftemplate<int>(); }
 // CHECK3-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP18]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP18]], align 4
 // CHECK3-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 4, ptr [[TMP19]], align 4
 // CHECK3-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1653,7 +1653,7 @@ int fint(void) { return ftemplate<int>(); }
 // CHECK3-NEXT:    [[ADD4:%.*]] = add nsw i32 [[TMP9]], 1
 // CHECK3-NEXT:    [[TMP10:%.*]] = zext i32 [[ADD4]] to i64
 // CHECK3-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP11]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP11]], align 4
 // CHECK3-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP12]], align 4
 // CHECK3-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1820,7 +1820,7 @@ int fint(void) { return ftemplate<int>(); }
 // CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP7]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP7]], align 4
 // CHECK3-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP8]], align 4
 // CHECK3-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/distribute_firstprivate_codegen.cpp b/clang/test/OpenMP/distribute_firstprivate_codegen.cpp
index 800a002e439680..567d12119db218 100644
--- a/clang/test/OpenMP/distribute_firstprivate_codegen.cpp
+++ b/clang/test/OpenMP/distribute_firstprivate_codegen.cpp
@@ -542,7 +542,7 @@ int main() {
 // CHECK9-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK9-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 5, ptr [[TMP26]], align 4
 // CHECK9-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -838,7 +838,7 @@ int main() {
 // CHECK9-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP20]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP20]], align 4
 // CHECK9-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 4, ptr [[TMP21]], align 4
 // CHECK9-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1207,7 +1207,7 @@ int main() {
 // CHECK11-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK11-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 5, ptr [[TMP26]], align 4
 // CHECK11-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1501,7 +1501,7 @@ int main() {
 // CHECK11-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP20]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP20]], align 4
 // CHECK11-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 4, ptr [[TMP21]], align 4
 // CHECK11-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/distribute_lastprivate_codegen.cpp b/clang/test/OpenMP/distribute_lastprivate_codegen.cpp
index 772372076e9477..5d1a12d27145e0 100644
--- a/clang/test/OpenMP/distribute_lastprivate_codegen.cpp
+++ b/clang/test/OpenMP/distribute_lastprivate_codegen.cpp
@@ -527,7 +527,7 @@ int main() {
 // CHECK9-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK9-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 5, ptr [[TMP26]], align 4
 // CHECK9-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -841,7 +841,7 @@ int main() {
 // CHECK9-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP20]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP20]], align 4
 // CHECK9-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 4, ptr [[TMP21]], align 4
 // CHECK9-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1229,7 +1229,7 @@ int main() {
 // CHECK11-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK11-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 5, ptr [[TMP26]], align 4
 // CHECK11-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1541,7 +1541,7 @@ int main() {
 // CHECK11-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP20]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP20]], align 4
 // CHECK11-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 4, ptr [[TMP21]], align 4
 // CHECK11-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/distribute_parallel_for_codegen.cpp b/clang/test/OpenMP/distribute_parallel_for_codegen.cpp
index 9cb0a15530651b..a8892a06d4b306 100644
--- a/clang/test/OpenMP/distribute_parallel_for_codegen.cpp
+++ b/clang/test/OpenMP/distribute_parallel_for_codegen.cpp
@@ -4372,7 +4372,7 @@ int main() {
 // CHECK9-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP21]], 1
 // CHECK9-NEXT:    [[TMP22:%.*]] = zext i32 [[ADD]] to i64
 // CHECK9-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK9-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 4, ptr [[TMP24]], align 4
 // CHECK9-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -4447,7 +4447,7 @@ int main() {
 // CHECK9-NEXT:    [[ADD13:%.*]] = add nsw i32 [[TMP59]], 1
 // CHECK9-NEXT:    [[TMP60:%.*]] = zext i32 [[ADD13]] to i64
 // CHECK9-NEXT:    [[TMP61:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP61]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP61]], align 4
 // CHECK9-NEXT:    [[TMP62:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 4, ptr [[TMP62]], align 4
 // CHECK9-NEXT:    [[TMP63:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 2
@@ -4531,7 +4531,7 @@ int main() {
 // CHECK9-NEXT:    [[ADD27:%.*]] = add nsw i32 [[TMP102]], 1
 // CHECK9-NEXT:    [[TMP103:%.*]] = zext i32 [[ADD27]] to i64
 // CHECK9-NEXT:    [[TMP104:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS28]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP104]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP104]], align 4
 // CHECK9-NEXT:    [[TMP105:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS28]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 5, ptr [[TMP105]], align 4
 // CHECK9-NEXT:    [[TMP106:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS28]], i32 0, i32 2
@@ -4606,7 +4606,7 @@ int main() {
 // CHECK9-NEXT:    [[ADD41:%.*]] = add nsw i32 [[TMP140]], 1
 // CHECK9-NEXT:    [[TMP141:%.*]] = zext i32 [[ADD41]] to i64
 // CHECK9-NEXT:    [[TMP142:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS42]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP142]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP142]], align 4
 // CHECK9-NEXT:    [[TMP143:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS42]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 4, ptr [[TMP143]], align 4
 // CHECK9-NEXT:    [[TMP144:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS42]], i32 0, i32 2
@@ -4690,7 +4690,7 @@ int main() {
 // CHECK9-NEXT:    [[ADD56:%.*]] = add nsw i32 [[TMP183]], 1
 // CHECK9-NEXT:    [[TMP184:%.*]] = zext i32 [[ADD56]] to i64
 // CHECK9-NEXT:    [[TMP185:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS57]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP185]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP185]], align 4
 // CHECK9-NEXT:    [[TMP186:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS57]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 5, ptr [[TMP186]], align 4
 // CHECK9-NEXT:    [[TMP187:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS57]], i32 0, i32 2
@@ -4765,7 +4765,7 @@ int main() {
 // CHECK9-NEXT:    [[ADD70:%.*]] = add nsw i32 [[TMP221]], 1
 // CHECK9-NEXT:    [[TMP222:%.*]] = zext i32 [[ADD70]] to i64
 // CHECK9-NEXT:    [[TMP223:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS71]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP223]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP223]], align 4
 // CHECK9-NEXT:    [[TMP224:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS71]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 4, ptr [[TMP224]], align 4
 // CHECK9-NEXT:    [[TMP225:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS71]], i32 0, i32 2
@@ -4849,7 +4849,7 @@ int main() {
 // CHECK9-NEXT:    [[ADD85:%.*]] = add nsw i32 [[TMP264]], 1
 // CHECK9-NEXT:    [[TMP265:%.*]] = zext i32 [[ADD85]] to i64
 // CHECK9-NEXT:    [[TMP266:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS86]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP266]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP266]], align 4
 // CHECK9-NEXT:    [[TMP267:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS86]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 5, ptr [[TMP267]], align 4
 // CHECK9-NEXT:    [[TMP268:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS86]], i32 0, i32 2
@@ -6671,7 +6671,7 @@ int main() {
 // CHECK9-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP21]], 1
 // CHECK9-NEXT:    [[TMP22:%.*]] = zext i32 [[ADD]] to i64
 // CHECK9-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK9-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 4, ptr [[TMP24]], align 4
 // CHECK9-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -6746,7 +6746,7 @@ int main() {
 // CHECK9-NEXT:    [[ADD13:%.*]] = add nsw i32 [[TMP59]], 1
 // CHECK9-NEXT:    [[TMP60:%.*]] = zext i32 [[ADD13]] to i64
 // CHECK9-NEXT:    [[TMP61:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP61]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP61]], align 4
 // CHECK9-NEXT:    [[TMP62:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 4, ptr [[TMP62]], align 4
 // CHECK9-NEXT:    [[TMP63:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 2
@@ -6830,7 +6830,7 @@ int main() {
 // CHECK9-NEXT:    [[ADD27:%.*]] = add nsw i32 [[TMP102]], 1
 // CHECK9-NEXT:    [[TMP103:%.*]] = zext i32 [[ADD27]] to i64
 // CHECK9-NEXT:    [[TMP104:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS28]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP104]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP104]], align 4
 // CHECK9-NEXT:    [[TMP105:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS28]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 5, ptr [[TMP105]], align 4
 // CHECK9-NEXT:    [[TMP106:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS28]], i32 0, i32 2
@@ -6905,7 +6905,7 @@ int main() {
 // CHECK9-NEXT:    [[ADD41:%.*]] = add nsw i32 [[TMP140]], 1
 // CHECK9-NEXT:    [[TMP141:%.*]] = zext i32 [[ADD41]] to i64
 // CHECK9-NEXT:    [[TMP142:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS42]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP142]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP142]], align 4
 // CHECK9-NEXT:    [[TMP143:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS42]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 4, ptr [[TMP143]], align 4
 // CHECK9-NEXT:    [[TMP144:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS42]], i32 0, i32 2
@@ -6989,7 +6989,7 @@ int main() {
 // CHECK9-NEXT:    [[ADD56:%.*]] = add nsw i32 [[TMP183]], 1
 // CHECK9-NEXT:    [[TMP184:%.*]] = zext i32 [[ADD56]] to i64
 // CHECK9-NEXT:    [[TMP185:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS57]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP185]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP185]], align 4
 // CHECK9-NEXT:    [[TMP186:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS57]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 5, ptr [[TMP186]], align 4
 // CHECK9-NEXT:    [[TMP187:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS57]], i32 0, i32 2
@@ -7064,7 +7064,7 @@ int main() {
 // CHECK9-NEXT:    [[ADD70:%.*]] = add nsw i32 [[TMP221]], 1
 // CHECK9-NEXT:    [[TMP222:%.*]] = zext i32 [[ADD70]] to i64
 // CHECK9-NEXT:    [[TMP223:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS71]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP223]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP223]], align 4
 // CHECK9-NEXT:    [[TMP224:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS71]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 4, ptr [[TMP224]], align 4
 // CHECK9-NEXT:    [[TMP225:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS71]], i32 0, i32 2
@@ -7148,7 +7148,7 @@ int main() {
 // CHECK9-NEXT:    [[ADD85:%.*]] = add nsw i32 [[TMP264]], 1
 // CHECK9-NEXT:    [[TMP265:%.*]] = zext i32 [[ADD85]] to i64
 // CHECK9-NEXT:    [[TMP266:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS86]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP266]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP266]], align 4
 // CHECK9-NEXT:    [[TMP267:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS86]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 5, ptr [[TMP267]], align 4
 // CHECK9-NEXT:    [[TMP268:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS86]], i32 0, i32 2
@@ -8986,7 +8986,7 @@ int main() {
 // CHECK11-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP21]], 1
 // CHECK11-NEXT:    [[TMP22:%.*]] = zext i32 [[ADD]] to i64
 // CHECK11-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK11-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 4, ptr [[TMP24]], align 4
 // CHECK11-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -9061,7 +9061,7 @@ int main() {
 // CHECK11-NEXT:    [[ADD13:%.*]] = add nsw i32 [[TMP59]], 1
 // CHECK11-NEXT:    [[TMP60:%.*]] = zext i32 [[ADD13]] to i64
 // CHECK11-NEXT:    [[TMP61:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP61]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP61]], align 4
 // CHECK11-NEXT:    [[TMP62:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 4, ptr [[TMP62]], align 4
 // CHECK11-NEXT:    [[TMP63:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 2
@@ -9145,7 +9145,7 @@ int main() {
 // CHECK11-NEXT:    [[ADD27:%.*]] = add nsw i32 [[TMP102]], 1
 // CHECK11-NEXT:    [[TMP103:%.*]] = zext i32 [[ADD27]] to i64
 // CHECK11-NEXT:    [[TMP104:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS28]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP104]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP104]], align 4
 // CHECK11-NEXT:    [[TMP105:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS28]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 5, ptr [[TMP105]], align 4
 // CHECK11-NEXT:    [[TMP106:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS28]], i32 0, i32 2
@@ -9220,7 +9220,7 @@ int main() {
 // CHECK11-NEXT:    [[ADD41:%.*]] = add nsw i32 [[TMP140]], 1
 // CHECK11-NEXT:    [[TMP141:%.*]] = zext i32 [[ADD41]] to i64
 // CHECK11-NEXT:    [[TMP142:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS42]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP142]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP142]], align 4
 // CHECK11-NEXT:    [[TMP143:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS42]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 4, ptr [[TMP143]], align 4
 // CHECK11-NEXT:    [[TMP144:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS42]], i32 0, i32 2
@@ -9304,7 +9304,7 @@ int main() {
 // CHECK11-NEXT:    [[ADD56:%.*]] = add nsw i32 [[TMP183]], 1
 // CHECK11-NEXT:    [[TMP184:%.*]] = zext i32 [[ADD56]] to i64
 // CHECK11-NEXT:    [[TMP185:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS57]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP185]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP185]], align 4
 // CHECK11-NEXT:    [[TMP186:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS57]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 5, ptr [[TMP186]], align 4
 // CHECK11-NEXT:    [[TMP187:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS57]], i32 0, i32 2
@@ -9379,7 +9379,7 @@ int main() {
 // CHECK11-NEXT:    [[ADD70:%.*]] = add nsw i32 [[TMP221]], 1
 // CHECK11-NEXT:    [[TMP222:%.*]] = zext i32 [[ADD70]] to i64
 // CHECK11-NEXT:    [[TMP223:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS71]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP223]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP223]], align 4
 // CHECK11-NEXT:    [[TMP224:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS71]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 4, ptr [[TMP224]], align 4
 // CHECK11-NEXT:    [[TMP225:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS71]], i32 0, i32 2
@@ -9463,7 +9463,7 @@ int main() {
 // CHECK11-NEXT:    [[ADD85:%.*]] = add nsw i32 [[TMP264]], 1
 // CHECK11-NEXT:    [[TMP265:%.*]] = zext i32 [[ADD85]] to i64
 // CHECK11-NEXT:    [[TMP266:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS86]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP266]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP266]], align 4
 // CHECK11-NEXT:    [[TMP267:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS86]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 5, ptr [[TMP267]], align 4
 // CHECK11-NEXT:    [[TMP268:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS86]], i32 0, i32 2
@@ -11234,7 +11234,7 @@ int main() {
 // CHECK11-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP21]], 1
 // CHECK11-NEXT:    [[TMP22:%.*]] = zext i32 [[ADD]] to i64
 // CHECK11-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK11-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 4, ptr [[TMP24]], align 4
 // CHECK11-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -11309,7 +11309,7 @@ int main() {
 // CHECK11-NEXT:    [[ADD13:%.*]] = add nsw i32 [[TMP59]], 1
 // CHECK11-NEXT:    [[TMP60:%.*]] = zext i32 [[ADD13]] to i64
 // CHECK11-NEXT:    [[TMP61:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP61]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP61]], align 4
 // CHECK11-NEXT:    [[TMP62:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 4, ptr [[TMP62]], align 4
 // CHECK11-NEXT:    [[TMP63:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 2
@@ -11393,7 +11393,7 @@ int main() {
 // CHECK11-NEXT:    [[ADD27:%.*]] = add nsw i32 [[TMP102]], 1
 // CHECK11-NEXT:    [[TMP103:%.*]] = zext i32 [[ADD27]] to i64
 // CHECK11-NEXT:    [[TMP104:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS28]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP104]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP104]], align 4
 // CHECK11-NEXT:    [[TMP105:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS28]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 5, ptr [[TMP105]], align 4
 // CHECK11-NEXT:    [[TMP106:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS28]], i32 0, i32 2
@@ -11468,7 +11468,7 @@ int main() {
 // CHECK11-NEXT:    [[ADD41:%.*]] = add nsw i32 [[TMP140]], 1
 // CHECK11-NEXT:    [[TMP141:%.*]] = zext i32 [[ADD41]] to i64
 // CHECK11-NEXT:    [[TMP142:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS42]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP142]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP142]], align 4
 // CHECK11-NEXT:    [[TMP143:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS42]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 4, ptr [[TMP143]], align 4
 // CHECK11-NEXT:    [[TMP144:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS42]], i32 0, i32 2
@@ -11552,7 +11552,7 @@ int main() {
 // CHECK11-NEXT:    [[ADD56:%.*]] = add nsw i32 [[TMP183]], 1
 // CHECK11-NEXT:    [[TMP184:%.*]] = zext i32 [[ADD56]] to i64
 // CHECK11-NEXT:    [[TMP185:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS57]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP185]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP185]], align 4
 // CHECK11-NEXT:    [[TMP186:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS57]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 5, ptr [[TMP186]], align 4
 // CHECK11-NEXT:    [[TMP187:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS57]], i32 0, i32 2
@@ -11627,7 +11627,7 @@ int main() {
 // CHECK11-NEXT:    [[ADD70:%.*]] = add nsw i32 [[TMP221]], 1
 // CHECK11-NEXT:    [[TMP222:%.*]] = zext i32 [[ADD70]] to i64
 // CHECK11-NEXT:    [[TMP223:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS71]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP223]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP223]], align 4
 // CHECK11-NEXT:    [[TMP224:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS71]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 4, ptr [[TMP224]], align 4
 // CHECK11-NEXT:    [[TMP225:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS71]], i32 0, i32 2
@@ -11711,7 +11711,7 @@ int main() {
 // CHECK11-NEXT:    [[ADD85:%.*]] = add nsw i32 [[TMP264]], 1
 // CHECK11-NEXT:    [[TMP265:%.*]] = zext i32 [[ADD85]] to i64
 // CHECK11-NEXT:    [[TMP266:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS86]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP266]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP266]], align 4
 // CHECK11-NEXT:    [[TMP267:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS86]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 5, ptr [[TMP267]], align 4
 // CHECK11-NEXT:    [[TMP268:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS86]], i32 0, i32 2
diff --git a/clang/test/OpenMP/distribute_parallel_for_firstprivate_codegen.cpp b/clang/test/OpenMP/distribute_parallel_for_firstprivate_codegen.cpp
index 6084a9a6cf9314..faae599bb32afe 100644
--- a/clang/test/OpenMP/distribute_parallel_for_firstprivate_codegen.cpp
+++ b/clang/test/OpenMP/distribute_parallel_for_firstprivate_codegen.cpp
@@ -825,7 +825,7 @@ int main() {
 // CHECK8-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK8-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK8-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK8-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK8-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK8-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK8-NEXT:    store i32 5, ptr [[TMP26]], align 4
 // CHECK8-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1251,7 +1251,7 @@ int main() {
 // CHECK8-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK8-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK8-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK8-NEXT:    store i32 2, ptr [[TMP20]], align 4
+// CHECK8-NEXT:    store i32 3, ptr [[TMP20]], align 4
 // CHECK8-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK8-NEXT:    store i32 4, ptr [[TMP21]], align 4
 // CHECK8-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1744,7 +1744,7 @@ int main() {
 // CHECK10-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK10-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK10-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK10-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK10-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK10-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK10-NEXT:    store i32 5, ptr [[TMP26]], align 4
 // CHECK10-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2164,7 +2164,7 @@ int main() {
 // CHECK10-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK10-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK10-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK10-NEXT:    store i32 2, ptr [[TMP20]], align 4
+// CHECK10-NEXT:    store i32 3, ptr [[TMP20]], align 4
 // CHECK10-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK10-NEXT:    store i32 4, ptr [[TMP21]], align 4
 // CHECK10-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/distribute_parallel_for_if_codegen.cpp b/clang/test/OpenMP/distribute_parallel_for_if_codegen.cpp
index d3b6654e57e2c9..4b3911daefbad1 100644
--- a/clang/test/OpenMP/distribute_parallel_for_if_codegen.cpp
+++ b/clang/test/OpenMP/distribute_parallel_for_if_codegen.cpp
@@ -128,7 +128,7 @@ int main() {
 // CHECK1-NEXT:    [[_TMP1:%.*]] = alloca i32, align 4
 // CHECK1-NEXT:    [[KERNEL_ARGS2:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -161,7 +161,7 @@ int main() {
 // CHECK1-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK1:       omp_offload.cont:
 // CHECK1-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP16]], align 4
 // CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -492,7 +492,7 @@ int main() {
 // CHECK1-NEXT:    [[KERNEL_ARGS6:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK1-NEXT:    store i32 0, ptr [[RETVAL]], align 4
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -525,7 +525,7 @@ int main() {
 // CHECK1-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK1:       omp_offload.cont:
 // CHECK1-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP16]], align 4
 // CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -569,7 +569,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP35:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP37]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP37]], align 4
 // CHECK1-NEXT:    [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP38]], align 4
 // CHECK1-NEXT:    [[TMP39:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 2
@@ -1060,7 +1060,7 @@ int main() {
 // CHECK1-NEXT:    [[KERNEL_ARGS6:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK1-NEXT:    store i32 [[ARG]], ptr [[ARG_ADDR]], align 4
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1093,7 +1093,7 @@ int main() {
 // CHECK1-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK1:       omp_offload.cont:
 // CHECK1-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP16]], align 4
 // CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -1137,7 +1137,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP35:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP37]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP37]], align 4
 // CHECK1-NEXT:    [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP38]], align 4
 // CHECK1-NEXT:    [[TMP39:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 2
diff --git a/clang/test/OpenMP/distribute_parallel_for_lastprivate_codegen.cpp b/clang/test/OpenMP/distribute_parallel_for_lastprivate_codegen.cpp
index 1f069c4070aecf..18a86c5e566639 100644
--- a/clang/test/OpenMP/distribute_parallel_for_lastprivate_codegen.cpp
+++ b/clang/test/OpenMP/distribute_parallel_for_lastprivate_codegen.cpp
@@ -801,7 +801,7 @@ int main() {
 // CHECK9-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK9-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 5, ptr [[TMP26]], align 4
 // CHECK9-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1263,7 +1263,7 @@ int main() {
 // CHECK9-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP20]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP20]], align 4
 // CHECK9-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 4, ptr [[TMP21]], align 4
 // CHECK9-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1793,7 +1793,7 @@ int main() {
 // CHECK11-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK11-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 5, ptr [[TMP26]], align 4
 // CHECK11-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2249,7 +2249,7 @@ int main() {
 // CHECK11-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP20]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP20]], align 4
 // CHECK11-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 4, ptr [[TMP21]], align 4
 // CHECK11-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/distribute_parallel_for_num_threads_codegen.cpp b/clang/test/OpenMP/distribute_parallel_for_num_threads_codegen.cpp
index b6ae783b74d049..dfe79d6a949342 100644
--- a/clang/test/OpenMP/distribute_parallel_for_num_threads_codegen.cpp
+++ b/clang/test/OpenMP/distribute_parallel_for_num_threads_codegen.cpp
@@ -116,7 +116,7 @@ int main() {
 // CHECK1:       invoke.cont:
 // CHECK1-NEXT:    store i8 [[CALL]], ptr [[A]], align 1
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -169,7 +169,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK1-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK1-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -565,7 +565,7 @@ int main() {
 // CHECK1-NEXT:    [[_TMP1:%.*]] = alloca i32, align 4
 // CHECK1-NEXT:    [[KERNEL_ARGS2:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -598,7 +598,7 @@ int main() {
 // CHECK1-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK1:       omp_offload.cont:
 // CHECK1-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP16]], align 4
 // CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -641,7 +641,7 @@ int main() {
 // CHECK1-NEXT:    [[_TMP1:%.*]] = alloca i32, align 4
 // CHECK1-NEXT:    [[KERNEL_ARGS2:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -674,7 +674,7 @@ int main() {
 // CHECK1-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK1:       omp_offload.cont:
 // CHECK1-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP16]], align 4
 // CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -1368,7 +1368,7 @@ int main() {
 // CHECK5:       invoke.cont:
 // CHECK5-NEXT:    store i8 [[CALL]], ptr [[A]], align 1
 // CHECK5-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK5-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK5-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1421,7 +1421,7 @@ int main() {
 // CHECK5-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK5-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK5-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -1817,7 +1817,7 @@ int main() {
 // CHECK5-NEXT:    [[_TMP1:%.*]] = alloca i32, align 4
 // CHECK5-NEXT:    [[KERNEL_ARGS2:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK5-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK5-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK5-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1850,7 +1850,7 @@ int main() {
 // CHECK5-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK5:       omp_offload.cont:
 // CHECK5-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK5-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 0, ptr [[TMP16]], align 4
 // CHECK5-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -1893,7 +1893,7 @@ int main() {
 // CHECK5-NEXT:    [[_TMP1:%.*]] = alloca i32, align 4
 // CHECK5-NEXT:    [[KERNEL_ARGS2:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK5-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK5-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK5-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1926,7 +1926,7 @@ int main() {
 // CHECK5-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK5:       omp_offload.cont:
 // CHECK5-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK5-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 0, ptr [[TMP16]], align 4
 // CHECK5-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -2620,7 +2620,7 @@ int main() {
 // CHECK9:       invoke.cont:
 // CHECK9-NEXT:    store i8 [[CALL]], ptr [[A]], align 1
 // CHECK9-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK9-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK9-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2673,7 +2673,7 @@ int main() {
 // CHECK9-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK9-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK9-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -3069,7 +3069,7 @@ int main() {
 // CHECK9-NEXT:    [[_TMP1:%.*]] = alloca i32, align 4
 // CHECK9-NEXT:    [[KERNEL_ARGS2:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK9-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK9-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK9-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -3102,7 +3102,7 @@ int main() {
 // CHECK9-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK9:       omp_offload.cont:
 // CHECK9-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK9-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 0, ptr [[TMP16]], align 4
 // CHECK9-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -3145,7 +3145,7 @@ int main() {
 // CHECK9-NEXT:    [[_TMP1:%.*]] = alloca i32, align 4
 // CHECK9-NEXT:    [[KERNEL_ARGS2:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK9-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK9-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK9-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -3178,7 +3178,7 @@ int main() {
 // CHECK9-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK9:       omp_offload.cont:
 // CHECK9-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK9-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 0, ptr [[TMP16]], align 4
 // CHECK9-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -3872,7 +3872,7 @@ int main() {
 // CHECK13:       invoke.cont:
 // CHECK13-NEXT:    store i8 [[CALL]], ptr [[A]], align 1
 // CHECK13-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK13-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK13-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK13-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK13-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK13-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -3925,7 +3925,7 @@ int main() {
 // CHECK13-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK13-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK13-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK13-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK13-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK13-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK13-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK13-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -4321,7 +4321,7 @@ int main() {
 // CHECK13-NEXT:    [[_TMP1:%.*]] = alloca i32, align 4
 // CHECK13-NEXT:    [[KERNEL_ARGS2:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK13-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK13-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK13-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK13-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK13-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK13-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -4354,7 +4354,7 @@ int main() {
 // CHECK13-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK13:       omp_offload.cont:
 // CHECK13-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK13-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK13-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK13-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK13-NEXT:    store i32 0, ptr [[TMP16]], align 4
 // CHECK13-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -4397,7 +4397,7 @@ int main() {
 // CHECK13-NEXT:    [[_TMP1:%.*]] = alloca i32, align 4
 // CHECK13-NEXT:    [[KERNEL_ARGS2:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK13-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK13-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK13-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK13-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK13-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK13-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -4430,7 +4430,7 @@ int main() {
 // CHECK13-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK13:       omp_offload.cont:
 // CHECK13-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK13-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK13-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK13-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK13-NEXT:    store i32 0, ptr [[TMP16]], align 4
 // CHECK13-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
diff --git a/clang/test/OpenMP/distribute_parallel_for_private_codegen.cpp b/clang/test/OpenMP/distribute_parallel_for_private_codegen.cpp
index e2b0d64093ebc0..0969a0ca21e7ab 100644
--- a/clang/test/OpenMP/distribute_parallel_for_private_codegen.cpp
+++ b/clang/test/OpenMP/distribute_parallel_for_private_codegen.cpp
@@ -521,7 +521,7 @@ int main() {
 // CHECK9-NEXT:    store ptr [[TEST]], ptr [[VAR]], align 8
 // CHECK9-NEXT:    store ptr undef, ptr [[_TMP1]], align 8
 // CHECK9-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK9-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK9-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -842,7 +842,7 @@ int main() {
 // CHECK9-NEXT:    store ptr [[TEST]], ptr [[VAR]], align 8
 // CHECK9-NEXT:    store ptr undef, ptr [[_TMP1]], align 8
 // CHECK9-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK9-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK9-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1232,7 +1232,7 @@ int main() {
 // CHECK11-NEXT:    store ptr [[TEST]], ptr [[VAR]], align 4
 // CHECK11-NEXT:    store ptr undef, ptr [[_TMP1]], align 4
 // CHECK11-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK11-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK11-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1547,7 +1547,7 @@ int main() {
 // CHECK11-NEXT:    store ptr [[TEST]], ptr [[VAR]], align 4
 // CHECK11-NEXT:    store ptr undef, ptr [[_TMP1]], align 4
 // CHECK11-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK11-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK11-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/distribute_parallel_for_proc_bind_codegen.cpp b/clang/test/OpenMP/distribute_parallel_for_proc_bind_codegen.cpp
index 040f90b9ef78bd..c1203349b7c1bc 100644
--- a/clang/test/OpenMP/distribute_parallel_for_proc_bind_codegen.cpp
+++ b/clang/test/OpenMP/distribute_parallel_for_proc_bind_codegen.cpp
@@ -63,7 +63,7 @@ int main() {
 // CHECK1-NEXT:    [[KERNEL_ARGS2:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK1-NEXT:    store i32 0, ptr [[RETVAL]], align 4
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -96,7 +96,7 @@ int main() {
 // CHECK1-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK1:       omp_offload.cont:
 // CHECK1-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP16]], align 4
 // CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -414,7 +414,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP:%.*]] = alloca i32, align 4
 // CHECK1-NEXT:    [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/distribute_parallel_for_simd_codegen.cpp b/clang/test/OpenMP/distribute_parallel_for_simd_codegen.cpp
index 77336877e2be04..a23d538910edd7 100644
--- a/clang/test/OpenMP/distribute_parallel_for_simd_codegen.cpp
+++ b/clang/test/OpenMP/distribute_parallel_for_simd_codegen.cpp
@@ -4762,7 +4762,7 @@ int main() {
 // CHECK9-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP21]], 1
 // CHECK9-NEXT:    [[TMP22:%.*]] = zext i32 [[ADD]] to i64
 // CHECK9-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK9-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 4, ptr [[TMP24]], align 4
 // CHECK9-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -4837,7 +4837,7 @@ int main() {
 // CHECK9-NEXT:    [[ADD13:%.*]] = add nsw i32 [[TMP59]], 1
 // CHECK9-NEXT:    [[TMP60:%.*]] = zext i32 [[ADD13]] to i64
 // CHECK9-NEXT:    [[TMP61:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP61]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP61]], align 4
 // CHECK9-NEXT:    [[TMP62:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 4, ptr [[TMP62]], align 4
 // CHECK9-NEXT:    [[TMP63:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 2
@@ -4921,7 +4921,7 @@ int main() {
 // CHECK9-NEXT:    [[ADD27:%.*]] = add nsw i32 [[TMP102]], 1
 // CHECK9-NEXT:    [[TMP103:%.*]] = zext i32 [[ADD27]] to i64
 // CHECK9-NEXT:    [[TMP104:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS28]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP104]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP104]], align 4
 // CHECK9-NEXT:    [[TMP105:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS28]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 5, ptr [[TMP105]], align 4
 // CHECK9-NEXT:    [[TMP106:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS28]], i32 0, i32 2
@@ -4996,7 +4996,7 @@ int main() {
 // CHECK9-NEXT:    [[ADD41:%.*]] = add nsw i32 [[TMP140]], 1
 // CHECK9-NEXT:    [[TMP141:%.*]] = zext i32 [[ADD41]] to i64
 // CHECK9-NEXT:    [[TMP142:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS42]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP142]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP142]], align 4
 // CHECK9-NEXT:    [[TMP143:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS42]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 4, ptr [[TMP143]], align 4
 // CHECK9-NEXT:    [[TMP144:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS42]], i32 0, i32 2
@@ -5080,7 +5080,7 @@ int main() {
 // CHECK9-NEXT:    [[ADD56:%.*]] = add nsw i32 [[TMP183]], 1
 // CHECK9-NEXT:    [[TMP184:%.*]] = zext i32 [[ADD56]] to i64
 // CHECK9-NEXT:    [[TMP185:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS57]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP185]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP185]], align 4
 // CHECK9-NEXT:    [[TMP186:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS57]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 5, ptr [[TMP186]], align 4
 // CHECK9-NEXT:    [[TMP187:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS57]], i32 0, i32 2
@@ -5155,7 +5155,7 @@ int main() {
 // CHECK9-NEXT:    [[ADD70:%.*]] = add nsw i32 [[TMP221]], 1
 // CHECK9-NEXT:    [[TMP222:%.*]] = zext i32 [[ADD70]] to i64
 // CHECK9-NEXT:    [[TMP223:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS71]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP223]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP223]], align 4
 // CHECK9-NEXT:    [[TMP224:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS71]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 4, ptr [[TMP224]], align 4
 // CHECK9-NEXT:    [[TMP225:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS71]], i32 0, i32 2
@@ -5239,7 +5239,7 @@ int main() {
 // CHECK9-NEXT:    [[ADD85:%.*]] = add nsw i32 [[TMP264]], 1
 // CHECK9-NEXT:    [[TMP265:%.*]] = zext i32 [[ADD85]] to i64
 // CHECK9-NEXT:    [[TMP266:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS86]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP266]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP266]], align 4
 // CHECK9-NEXT:    [[TMP267:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS86]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 5, ptr [[TMP267]], align 4
 // CHECK9-NEXT:    [[TMP268:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS86]], i32 0, i32 2
@@ -7229,7 +7229,7 @@ int main() {
 // CHECK9-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP21]], 1
 // CHECK9-NEXT:    [[TMP22:%.*]] = zext i32 [[ADD]] to i64
 // CHECK9-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK9-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 4, ptr [[TMP24]], align 4
 // CHECK9-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -7304,7 +7304,7 @@ int main() {
 // CHECK9-NEXT:    [[ADD13:%.*]] = add nsw i32 [[TMP59]], 1
 // CHECK9-NEXT:    [[TMP60:%.*]] = zext i32 [[ADD13]] to i64
 // CHECK9-NEXT:    [[TMP61:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP61]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP61]], align 4
 // CHECK9-NEXT:    [[TMP62:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 4, ptr [[TMP62]], align 4
 // CHECK9-NEXT:    [[TMP63:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 2
@@ -7388,7 +7388,7 @@ int main() {
 // CHECK9-NEXT:    [[ADD27:%.*]] = add nsw i32 [[TMP102]], 1
 // CHECK9-NEXT:    [[TMP103:%.*]] = zext i32 [[ADD27]] to i64
 // CHECK9-NEXT:    [[TMP104:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS28]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP104]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP104]], align 4
 // CHECK9-NEXT:    [[TMP105:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS28]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 5, ptr [[TMP105]], align 4
 // CHECK9-NEXT:    [[TMP106:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS28]], i32 0, i32 2
@@ -7463,7 +7463,7 @@ int main() {
 // CHECK9-NEXT:    [[ADD41:%.*]] = add nsw i32 [[TMP140]], 1
 // CHECK9-NEXT:    [[TMP141:%.*]] = zext i32 [[ADD41]] to i64
 // CHECK9-NEXT:    [[TMP142:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS42]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP142]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP142]], align 4
 // CHECK9-NEXT:    [[TMP143:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS42]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 4, ptr [[TMP143]], align 4
 // CHECK9-NEXT:    [[TMP144:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS42]], i32 0, i32 2
@@ -7547,7 +7547,7 @@ int main() {
 // CHECK9-NEXT:    [[ADD56:%.*]] = add nsw i32 [[TMP183]], 1
 // CHECK9-NEXT:    [[TMP184:%.*]] = zext i32 [[ADD56]] to i64
 // CHECK9-NEXT:    [[TMP185:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS57]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP185]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP185]], align 4
 // CHECK9-NEXT:    [[TMP186:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS57]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 5, ptr [[TMP186]], align 4
 // CHECK9-NEXT:    [[TMP187:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS57]], i32 0, i32 2
@@ -7622,7 +7622,7 @@ int main() {
 // CHECK9-NEXT:    [[ADD70:%.*]] = add nsw i32 [[TMP221]], 1
 // CHECK9-NEXT:    [[TMP222:%.*]] = zext i32 [[ADD70]] to i64
 // CHECK9-NEXT:    [[TMP223:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS71]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP223]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP223]], align 4
 // CHECK9-NEXT:    [[TMP224:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS71]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 4, ptr [[TMP224]], align 4
 // CHECK9-NEXT:    [[TMP225:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS71]], i32 0, i32 2
@@ -7706,7 +7706,7 @@ int main() {
 // CHECK9-NEXT:    [[ADD85:%.*]] = add nsw i32 [[TMP264]], 1
 // CHECK9-NEXT:    [[TMP265:%.*]] = zext i32 [[ADD85]] to i64
 // CHECK9-NEXT:    [[TMP266:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS86]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP266]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP266]], align 4
 // CHECK9-NEXT:    [[TMP267:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS86]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 5, ptr [[TMP267]], align 4
 // CHECK9-NEXT:    [[TMP268:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS86]], i32 0, i32 2
@@ -9697,7 +9697,7 @@ int main() {
 // CHECK11-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP21]], 1
 // CHECK11-NEXT:    [[TMP22:%.*]] = zext i32 [[ADD]] to i64
 // CHECK11-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK11-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 4, ptr [[TMP24]], align 4
 // CHECK11-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -9772,7 +9772,7 @@ int main() {
 // CHECK11-NEXT:    [[ADD13:%.*]] = add nsw i32 [[TMP59]], 1
 // CHECK11-NEXT:    [[TMP60:%.*]] = zext i32 [[ADD13]] to i64
 // CHECK11-NEXT:    [[TMP61:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP61]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP61]], align 4
 // CHECK11-NEXT:    [[TMP62:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 4, ptr [[TMP62]], align 4
 // CHECK11-NEXT:    [[TMP63:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 2
@@ -9856,7 +9856,7 @@ int main() {
 // CHECK11-NEXT:    [[ADD27:%.*]] = add nsw i32 [[TMP102]], 1
 // CHECK11-NEXT:    [[TMP103:%.*]] = zext i32 [[ADD27]] to i64
 // CHECK11-NEXT:    [[TMP104:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS28]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP104]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP104]], align 4
 // CHECK11-NEXT:    [[TMP105:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS28]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 5, ptr [[TMP105]], align 4
 // CHECK11-NEXT:    [[TMP106:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS28]], i32 0, i32 2
@@ -9931,7 +9931,7 @@ int main() {
 // CHECK11-NEXT:    [[ADD41:%.*]] = add nsw i32 [[TMP140]], 1
 // CHECK11-NEXT:    [[TMP141:%.*]] = zext i32 [[ADD41]] to i64
 // CHECK11-NEXT:    [[TMP142:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS42]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP142]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP142]], align 4
 // CHECK11-NEXT:    [[TMP143:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS42]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 4, ptr [[TMP143]], align 4
 // CHECK11-NEXT:    [[TMP144:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS42]], i32 0, i32 2
@@ -10015,7 +10015,7 @@ int main() {
 // CHECK11-NEXT:    [[ADD56:%.*]] = add nsw i32 [[TMP183]], 1
 // CHECK11-NEXT:    [[TMP184:%.*]] = zext i32 [[ADD56]] to i64
 // CHECK11-NEXT:    [[TMP185:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS57]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP185]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP185]], align 4
 // CHECK11-NEXT:    [[TMP186:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS57]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 5, ptr [[TMP186]], align 4
 // CHECK11-NEXT:    [[TMP187:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS57]], i32 0, i32 2
@@ -10090,7 +10090,7 @@ int main() {
 // CHECK11-NEXT:    [[ADD70:%.*]] = add nsw i32 [[TMP221]], 1
 // CHECK11-NEXT:    [[TMP222:%.*]] = zext i32 [[ADD70]] to i64
 // CHECK11-NEXT:    [[TMP223:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS71]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP223]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP223]], align 4
 // CHECK11-NEXT:    [[TMP224:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS71]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 4, ptr [[TMP224]], align 4
 // CHECK11-NEXT:    [[TMP225:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS71]], i32 0, i32 2
@@ -10174,7 +10174,7 @@ int main() {
 // CHECK11-NEXT:    [[ADD85:%.*]] = add nsw i32 [[TMP264]], 1
 // CHECK11-NEXT:    [[TMP265:%.*]] = zext i32 [[ADD85]] to i64
 // CHECK11-NEXT:    [[TMP266:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS86]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP266]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP266]], align 4
 // CHECK11-NEXT:    [[TMP267:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS86]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 5, ptr [[TMP267]], align 4
 // CHECK11-NEXT:    [[TMP268:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS86]], i32 0, i32 2
@@ -12113,7 +12113,7 @@ int main() {
 // CHECK11-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP21]], 1
 // CHECK11-NEXT:    [[TMP22:%.*]] = zext i32 [[ADD]] to i64
 // CHECK11-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK11-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 4, ptr [[TMP24]], align 4
 // CHECK11-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -12188,7 +12188,7 @@ int main() {
 // CHECK11-NEXT:    [[ADD13:%.*]] = add nsw i32 [[TMP59]], 1
 // CHECK11-NEXT:    [[TMP60:%.*]] = zext i32 [[ADD13]] to i64
 // CHECK11-NEXT:    [[TMP61:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP61]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP61]], align 4
 // CHECK11-NEXT:    [[TMP62:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 4, ptr [[TMP62]], align 4
 // CHECK11-NEXT:    [[TMP63:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 2
@@ -12272,7 +12272,7 @@ int main() {
 // CHECK11-NEXT:    [[ADD27:%.*]] = add nsw i32 [[TMP102]], 1
 // CHECK11-NEXT:    [[TMP103:%.*]] = zext i32 [[ADD27]] to i64
 // CHECK11-NEXT:    [[TMP104:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS28]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP104]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP104]], align 4
 // CHECK11-NEXT:    [[TMP105:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS28]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 5, ptr [[TMP105]], align 4
 // CHECK11-NEXT:    [[TMP106:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS28]], i32 0, i32 2
@@ -12347,7 +12347,7 @@ int main() {
 // CHECK11-NEXT:    [[ADD41:%.*]] = add nsw i32 [[TMP140]], 1
 // CHECK11-NEXT:    [[TMP141:%.*]] = zext i32 [[ADD41]] to i64
 // CHECK11-NEXT:    [[TMP142:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS42]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP142]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP142]], align 4
 // CHECK11-NEXT:    [[TMP143:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS42]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 4, ptr [[TMP143]], align 4
 // CHECK11-NEXT:    [[TMP144:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS42]], i32 0, i32 2
@@ -12431,7 +12431,7 @@ int main() {
 // CHECK11-NEXT:    [[ADD56:%.*]] = add nsw i32 [[TMP183]], 1
 // CHECK11-NEXT:    [[TMP184:%.*]] = zext i32 [[ADD56]] to i64
 // CHECK11-NEXT:    [[TMP185:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS57]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP185]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP185]], align 4
 // CHECK11-NEXT:    [[TMP186:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS57]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 5, ptr [[TMP186]], align 4
 // CHECK11-NEXT:    [[TMP187:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS57]], i32 0, i32 2
@@ -12506,7 +12506,7 @@ int main() {
 // CHECK11-NEXT:    [[ADD70:%.*]] = add nsw i32 [[TMP221]], 1
 // CHECK11-NEXT:    [[TMP222:%.*]] = zext i32 [[ADD70]] to i64
 // CHECK11-NEXT:    [[TMP223:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS71]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP223]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP223]], align 4
 // CHECK11-NEXT:    [[TMP224:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS71]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 4, ptr [[TMP224]], align 4
 // CHECK11-NEXT:    [[TMP225:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS71]], i32 0, i32 2
@@ -12590,7 +12590,7 @@ int main() {
 // CHECK11-NEXT:    [[ADD85:%.*]] = add nsw i32 [[TMP264]], 1
 // CHECK11-NEXT:    [[TMP265:%.*]] = zext i32 [[ADD85]] to i64
 // CHECK11-NEXT:    [[TMP266:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS86]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP266]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP266]], align 4
 // CHECK11-NEXT:    [[TMP267:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS86]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 5, ptr [[TMP267]], align 4
 // CHECK11-NEXT:    [[TMP268:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS86]], i32 0, i32 2
diff --git a/clang/test/OpenMP/distribute_parallel_for_simd_firstprivate_codegen.cpp b/clang/test/OpenMP/distribute_parallel_for_simd_firstprivate_codegen.cpp
index f47b64b1e299d5..545ea9f0f43fdf 100644
--- a/clang/test/OpenMP/distribute_parallel_for_simd_firstprivate_codegen.cpp
+++ b/clang/test/OpenMP/distribute_parallel_for_simd_firstprivate_codegen.cpp
@@ -888,7 +888,7 @@ int main() {
 // CHECK8-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK8-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK8-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK8-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK8-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK8-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK8-NEXT:    store i32 5, ptr [[TMP26]], align 4
 // CHECK8-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1328,7 +1328,7 @@ int main() {
 // CHECK8-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK8-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK8-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK8-NEXT:    store i32 2, ptr [[TMP20]], align 4
+// CHECK8-NEXT:    store i32 3, ptr [[TMP20]], align 4
 // CHECK8-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK8-NEXT:    store i32 4, ptr [[TMP21]], align 4
 // CHECK8-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1835,7 +1835,7 @@ int main() {
 // CHECK10-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK10-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK10-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK10-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK10-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK10-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK10-NEXT:    store i32 5, ptr [[TMP26]], align 4
 // CHECK10-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2269,7 +2269,7 @@ int main() {
 // CHECK10-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK10-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK10-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK10-NEXT:    store i32 2, ptr [[TMP20]], align 4
+// CHECK10-NEXT:    store i32 3, ptr [[TMP20]], align 4
 // CHECK10-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK10-NEXT:    store i32 4, ptr [[TMP21]], align 4
 // CHECK10-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/distribute_parallel_for_simd_if_codegen.cpp b/clang/test/OpenMP/distribute_parallel_for_simd_if_codegen.cpp
index b3e964fddcf0d7..4a321b24b8c313 100644
--- a/clang/test/OpenMP/distribute_parallel_for_simd_if_codegen.cpp
+++ b/clang/test/OpenMP/distribute_parallel_for_simd_if_codegen.cpp
@@ -125,7 +125,7 @@ int main() {
 // CHECK1-NEXT:    [[_TMP1:%.*]] = alloca i32, align 4
 // CHECK1-NEXT:    [[KERNEL_ARGS2:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -158,7 +158,7 @@ int main() {
 // CHECK1-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK1:       omp_offload.cont:
 // CHECK1-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP16]], align 4
 // CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -517,7 +517,7 @@ int main() {
 // CHECK1-NEXT:    [[KERNEL_ARGS6:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK1-NEXT:    store i32 0, ptr [[RETVAL]], align 4
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -550,7 +550,7 @@ int main() {
 // CHECK1-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK1:       omp_offload.cont:
 // CHECK1-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP16]], align 4
 // CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -594,7 +594,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP35:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP37]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP37]], align 4
 // CHECK1-NEXT:    [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP38]], align 4
 // CHECK1-NEXT:    [[TMP39:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 2
@@ -1127,7 +1127,7 @@ int main() {
 // CHECK1-NEXT:    [[KERNEL_ARGS6:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK1-NEXT:    store i32 [[ARG]], ptr [[ARG_ADDR]], align 4
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1160,7 +1160,7 @@ int main() {
 // CHECK1-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK1:       omp_offload.cont:
 // CHECK1-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP16]], align 4
 // CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -1204,7 +1204,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP35:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP37]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP37]], align 4
 // CHECK1-NEXT:    [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP38]], align 4
 // CHECK1-NEXT:    [[TMP39:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 2
@@ -1727,7 +1727,7 @@ int main() {
 // CHECK3-NEXT:    [[_TMP1:%.*]] = alloca i32, align 4
 // CHECK3-NEXT:    [[KERNEL_ARGS2:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK3-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK3-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK3-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1760,7 +1760,7 @@ int main() {
 // CHECK3-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK3:       omp_offload.cont:
 // CHECK3-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK3-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 0, ptr [[TMP16]], align 4
 // CHECK3-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -2119,7 +2119,7 @@ int main() {
 // CHECK3-NEXT:    [[KERNEL_ARGS6:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK3-NEXT:    store i32 0, ptr [[RETVAL]], align 4
 // CHECK3-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK3-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK3-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2152,7 +2152,7 @@ int main() {
 // CHECK3-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK3:       omp_offload.cont:
 // CHECK3-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK3-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 0, ptr [[TMP16]], align 4
 // CHECK3-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -2196,7 +2196,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP35:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP37]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP37]], align 4
 // CHECK3-NEXT:    [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP38]], align 4
 // CHECK3-NEXT:    [[TMP39:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 2
@@ -2964,7 +2964,7 @@ int main() {
 // CHECK3-NEXT:    [[KERNEL_ARGS6:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK3-NEXT:    store i32 [[ARG]], ptr [[ARG_ADDR]], align 4
 // CHECK3-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK3-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK3-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2997,7 +2997,7 @@ int main() {
 // CHECK3-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK3:       omp_offload.cont:
 // CHECK3-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK3-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 0, ptr [[TMP16]], align 4
 // CHECK3-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -3041,7 +3041,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP35:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP37]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP37]], align 4
 // CHECK3-NEXT:    [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP38]], align 4
 // CHECK3-NEXT:    [[TMP39:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 2
@@ -4140,7 +4140,7 @@ int main() {
 // CHECK9-NEXT:    [[_TMP1:%.*]] = alloca i32, align 4
 // CHECK9-NEXT:    [[KERNEL_ARGS2:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK9-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK9-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK9-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -4173,7 +4173,7 @@ int main() {
 // CHECK9-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK9:       omp_offload.cont:
 // CHECK9-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK9-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 0, ptr [[TMP16]], align 4
 // CHECK9-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -4532,7 +4532,7 @@ int main() {
 // CHECK9-NEXT:    [[KERNEL_ARGS6:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK9-NEXT:    store i32 0, ptr [[RETVAL]], align 4
 // CHECK9-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK9-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK9-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -4565,7 +4565,7 @@ int main() {
 // CHECK9-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK9:       omp_offload.cont:
 // CHECK9-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK9-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 0, ptr [[TMP16]], align 4
 // CHECK9-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -4609,7 +4609,7 @@ int main() {
 // CHECK9-NEXT:    [[TMP35:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP37]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP37]], align 4
 // CHECK9-NEXT:    [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 1, ptr [[TMP38]], align 4
 // CHECK9-NEXT:    [[TMP39:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 2
@@ -5142,7 +5142,7 @@ int main() {
 // CHECK9-NEXT:    [[KERNEL_ARGS6:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK9-NEXT:    store i32 [[ARG]], ptr [[ARG_ADDR]], align 4
 // CHECK9-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK9-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK9-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -5175,7 +5175,7 @@ int main() {
 // CHECK9-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK9:       omp_offload.cont:
 // CHECK9-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK9-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 0, ptr [[TMP16]], align 4
 // CHECK9-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -5219,7 +5219,7 @@ int main() {
 // CHECK9-NEXT:    [[TMP35:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP37]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP37]], align 4
 // CHECK9-NEXT:    [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 1, ptr [[TMP38]], align 4
 // CHECK9-NEXT:    [[TMP39:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 2
@@ -5742,7 +5742,7 @@ int main() {
 // CHECK11-NEXT:    [[_TMP1:%.*]] = alloca i32, align 4
 // CHECK11-NEXT:    [[KERNEL_ARGS2:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK11-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK11-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK11-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -5775,7 +5775,7 @@ int main() {
 // CHECK11-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK11:       omp_offload.cont:
 // CHECK11-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK11-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 0, ptr [[TMP16]], align 4
 // CHECK11-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -6134,7 +6134,7 @@ int main() {
 // CHECK11-NEXT:    [[KERNEL_ARGS6:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK11-NEXT:    store i32 0, ptr [[RETVAL]], align 4
 // CHECK11-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK11-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK11-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -6167,7 +6167,7 @@ int main() {
 // CHECK11-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK11:       omp_offload.cont:
 // CHECK11-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK11-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 0, ptr [[TMP16]], align 4
 // CHECK11-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -6211,7 +6211,7 @@ int main() {
 // CHECK11-NEXT:    [[TMP35:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP37]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP37]], align 4
 // CHECK11-NEXT:    [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 1, ptr [[TMP38]], align 4
 // CHECK11-NEXT:    [[TMP39:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 2
@@ -6979,7 +6979,7 @@ int main() {
 // CHECK11-NEXT:    [[KERNEL_ARGS6:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK11-NEXT:    store i32 [[ARG]], ptr [[ARG_ADDR]], align 4
 // CHECK11-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK11-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK11-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -7012,7 +7012,7 @@ int main() {
 // CHECK11-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK11:       omp_offload.cont:
 // CHECK11-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK11-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 0, ptr [[TMP16]], align 4
 // CHECK11-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -7056,7 +7056,7 @@ int main() {
 // CHECK11-NEXT:    [[TMP35:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP37]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP37]], align 4
 // CHECK11-NEXT:    [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 1, ptr [[TMP38]], align 4
 // CHECK11-NEXT:    [[TMP39:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 2
diff --git a/clang/test/OpenMP/distribute_parallel_for_simd_lastprivate_codegen.cpp b/clang/test/OpenMP/distribute_parallel_for_simd_lastprivate_codegen.cpp
index 7656fb7bc2c519..675f1313460e65 100644
--- a/clang/test/OpenMP/distribute_parallel_for_simd_lastprivate_codegen.cpp
+++ b/clang/test/OpenMP/distribute_parallel_for_simd_lastprivate_codegen.cpp
@@ -868,7 +868,7 @@ int main() {
 // CHECK9-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK9-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 5, ptr [[TMP26]], align 4
 // CHECK9-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1344,7 +1344,7 @@ int main() {
 // CHECK9-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP20]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP20]], align 4
 // CHECK9-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 4, ptr [[TMP21]], align 4
 // CHECK9-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1888,7 +1888,7 @@ int main() {
 // CHECK11-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK11-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 5, ptr [[TMP26]], align 4
 // CHECK11-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2358,7 +2358,7 @@ int main() {
 // CHECK11-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP20]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP20]], align 4
 // CHECK11-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 4, ptr [[TMP21]], align 4
 // CHECK11-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/distribute_parallel_for_simd_num_threads_codegen.cpp b/clang/test/OpenMP/distribute_parallel_for_simd_num_threads_codegen.cpp
index 9f4fffbea63d16..5c1ca41b1c62d1 100644
--- a/clang/test/OpenMP/distribute_parallel_for_simd_num_threads_codegen.cpp
+++ b/clang/test/OpenMP/distribute_parallel_for_simd_num_threads_codegen.cpp
@@ -116,7 +116,7 @@ int main() {
 // CHECK1:       invoke.cont:
 // CHECK1-NEXT:    store i8 [[CALL]], ptr [[A]], align 1
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -169,7 +169,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK1-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK1-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -593,7 +593,7 @@ int main() {
 // CHECK1-NEXT:    [[_TMP1:%.*]] = alloca i32, align 4
 // CHECK1-NEXT:    [[KERNEL_ARGS2:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -626,7 +626,7 @@ int main() {
 // CHECK1-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK1:       omp_offload.cont:
 // CHECK1-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP16]], align 4
 // CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -669,7 +669,7 @@ int main() {
 // CHECK1-NEXT:    [[_TMP1:%.*]] = alloca i32, align 4
 // CHECK1-NEXT:    [[KERNEL_ARGS2:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -702,7 +702,7 @@ int main() {
 // CHECK1-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK1:       omp_offload.cont:
 // CHECK1-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP16]], align 4
 // CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -1791,7 +1791,7 @@ int main() {
 // CHECK5:       invoke.cont:
 // CHECK5-NEXT:    store i8 [[CALL]], ptr [[A]], align 1
 // CHECK5-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK5-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK5-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1844,7 +1844,7 @@ int main() {
 // CHECK5-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK5-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK5-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -2268,7 +2268,7 @@ int main() {
 // CHECK5-NEXT:    [[_TMP1:%.*]] = alloca i32, align 4
 // CHECK5-NEXT:    [[KERNEL_ARGS2:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK5-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK5-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK5-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2301,7 +2301,7 @@ int main() {
 // CHECK5-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK5:       omp_offload.cont:
 // CHECK5-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK5-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 0, ptr [[TMP16]], align 4
 // CHECK5-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -2344,7 +2344,7 @@ int main() {
 // CHECK5-NEXT:    [[_TMP1:%.*]] = alloca i32, align 4
 // CHECK5-NEXT:    [[KERNEL_ARGS2:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK5-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK5-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK5-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2377,7 +2377,7 @@ int main() {
 // CHECK5-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK5:       omp_offload.cont:
 // CHECK5-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK5-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 0, ptr [[TMP16]], align 4
 // CHECK5-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -3127,7 +3127,7 @@ int main() {
 // CHECK9:       invoke.cont:
 // CHECK9-NEXT:    store i8 [[CALL]], ptr [[A]], align 1
 // CHECK9-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK9-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK9-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -3180,7 +3180,7 @@ int main() {
 // CHECK9-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK9-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK9-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -3604,7 +3604,7 @@ int main() {
 // CHECK9-NEXT:    [[_TMP1:%.*]] = alloca i32, align 4
 // CHECK9-NEXT:    [[KERNEL_ARGS2:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK9-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK9-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK9-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -3637,7 +3637,7 @@ int main() {
 // CHECK9-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK9:       omp_offload.cont:
 // CHECK9-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK9-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 0, ptr [[TMP16]], align 4
 // CHECK9-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -3680,7 +3680,7 @@ int main() {
 // CHECK9-NEXT:    [[_TMP1:%.*]] = alloca i32, align 4
 // CHECK9-NEXT:    [[KERNEL_ARGS2:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK9-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK9-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK9-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -3713,7 +3713,7 @@ int main() {
 // CHECK9-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK9:       omp_offload.cont:
 // CHECK9-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK9-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 0, ptr [[TMP16]], align 4
 // CHECK9-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -4802,7 +4802,7 @@ int main() {
 // CHECK13:       invoke.cont:
 // CHECK13-NEXT:    store i8 [[CALL]], ptr [[A]], align 1
 // CHECK13-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK13-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK13-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK13-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK13-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK13-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -4855,7 +4855,7 @@ int main() {
 // CHECK13-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK13-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK13-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK13-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK13-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK13-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK13-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK13-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -5279,7 +5279,7 @@ int main() {
 // CHECK13-NEXT:    [[_TMP1:%.*]] = alloca i32, align 4
 // CHECK13-NEXT:    [[KERNEL_ARGS2:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK13-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK13-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK13-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK13-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK13-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK13-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -5312,7 +5312,7 @@ int main() {
 // CHECK13-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK13:       omp_offload.cont:
 // CHECK13-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK13-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK13-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK13-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK13-NEXT:    store i32 0, ptr [[TMP16]], align 4
 // CHECK13-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -5355,7 +5355,7 @@ int main() {
 // CHECK13-NEXT:    [[_TMP1:%.*]] = alloca i32, align 4
 // CHECK13-NEXT:    [[KERNEL_ARGS2:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK13-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK13-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK13-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK13-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK13-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK13-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -5388,7 +5388,7 @@ int main() {
 // CHECK13-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK13:       omp_offload.cont:
 // CHECK13-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK13-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK13-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK13-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK13-NEXT:    store i32 0, ptr [[TMP16]], align 4
 // CHECK13-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
diff --git a/clang/test/OpenMP/distribute_parallel_for_simd_private_codegen.cpp b/clang/test/OpenMP/distribute_parallel_for_simd_private_codegen.cpp
index 61f35d8b456cea..a74af2d7c39320 100644
--- a/clang/test/OpenMP/distribute_parallel_for_simd_private_codegen.cpp
+++ b/clang/test/OpenMP/distribute_parallel_for_simd_private_codegen.cpp
@@ -575,7 +575,7 @@ int main() {
 // CHECK9-NEXT:    store ptr [[TEST]], ptr [[VAR]], align 8
 // CHECK9-NEXT:    store ptr undef, ptr [[_TMP1]], align 8
 // CHECK9-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK9-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK9-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -910,7 +910,7 @@ int main() {
 // CHECK9-NEXT:    store ptr [[TEST]], ptr [[VAR]], align 8
 // CHECK9-NEXT:    store ptr undef, ptr [[_TMP1]], align 8
 // CHECK9-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK9-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK9-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1314,7 +1314,7 @@ int main() {
 // CHECK11-NEXT:    store ptr [[TEST]], ptr [[VAR]], align 4
 // CHECK11-NEXT:    store ptr undef, ptr [[_TMP1]], align 4
 // CHECK11-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK11-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK11-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1643,7 +1643,7 @@ int main() {
 // CHECK11-NEXT:    store ptr [[TEST]], ptr [[VAR]], align 4
 // CHECK11-NEXT:    store ptr undef, ptr [[_TMP1]], align 4
 // CHECK11-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK11-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK11-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/distribute_parallel_for_simd_proc_bind_codegen.cpp b/clang/test/OpenMP/distribute_parallel_for_simd_proc_bind_codegen.cpp
index 334965f59a826d..777d4a25ea9f34 100644
--- a/clang/test/OpenMP/distribute_parallel_for_simd_proc_bind_codegen.cpp
+++ b/clang/test/OpenMP/distribute_parallel_for_simd_proc_bind_codegen.cpp
@@ -63,7 +63,7 @@ int main() {
 // CHECK1-NEXT:    [[KERNEL_ARGS2:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK1-NEXT:    store i32 0, ptr [[RETVAL]], align 4
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -96,7 +96,7 @@ int main() {
 // CHECK1-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK1:       omp_offload.cont:
 // CHECK1-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP16]], align 4
 // CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -442,7 +442,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP:%.*]] = alloca i32, align 4
 // CHECK1-NEXT:    [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/distribute_private_codegen.cpp b/clang/test/OpenMP/distribute_private_codegen.cpp
index 8a47b15a24d242..b6e796fb8027c0 100644
--- a/clang/test/OpenMP/distribute_private_codegen.cpp
+++ b/clang/test/OpenMP/distribute_private_codegen.cpp
@@ -351,7 +351,7 @@ int main() {
 // CHECK9-NEXT:    store ptr [[TEST]], ptr [[VAR]], align 8
 // CHECK9-NEXT:    store ptr undef, ptr [[_TMP1]], align 8
 // CHECK9-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK9-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK9-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -384,7 +384,7 @@ int main() {
 // CHECK9-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK9:       omp_offload.cont:
 // CHECK9-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS3]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK9-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS3]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 0, ptr [[TMP16]], align 4
 // CHECK9-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS3]], i32 0, i32 2
@@ -668,7 +668,7 @@ int main() {
 // CHECK9-NEXT:    store ptr [[TEST]], ptr [[VAR]], align 8
 // CHECK9-NEXT:    store ptr undef, ptr [[_TMP1]], align 8
 // CHECK9-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK9-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK9-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -957,7 +957,7 @@ int main() {
 // CHECK11-NEXT:    store ptr [[TEST]], ptr [[VAR]], align 4
 // CHECK11-NEXT:    store ptr undef, ptr [[_TMP1]], align 4
 // CHECK11-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK11-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK11-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -990,7 +990,7 @@ int main() {
 // CHECK11-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK11:       omp_offload.cont:
 // CHECK11-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS3]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK11-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS3]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 0, ptr [[TMP16]], align 4
 // CHECK11-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS3]], i32 0, i32 2
@@ -1272,7 +1272,7 @@ int main() {
 // CHECK11-NEXT:    store ptr [[TEST]], ptr [[VAR]], align 4
 // CHECK11-NEXT:    store ptr undef, ptr [[_TMP1]], align 4
 // CHECK11-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK11-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK11-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/distribute_simd_codegen.cpp b/clang/test/OpenMP/distribute_simd_codegen.cpp
index 16d909e329514f..9a192502125f96 100644
--- a/clang/test/OpenMP/distribute_simd_codegen.cpp
+++ b/clang/test/OpenMP/distribute_simd_codegen.cpp
@@ -191,7 +191,7 @@ int fint(void) { return ftemplate<int>(); }
 // CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP18]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP18]], align 4
 // CHECK1-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 4, ptr [[TMP19]], align 4
 // CHECK1-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -391,7 +391,7 @@ int fint(void) { return ftemplate<int>(); }
 // CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP18]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP18]], align 4
 // CHECK1-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 4, ptr [[TMP19]], align 4
 // CHECK1-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -589,7 +589,7 @@ int fint(void) { return ftemplate<int>(); }
 // CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP18]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP18]], align 4
 // CHECK1-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 4, ptr [[TMP19]], align 4
 // CHECK1-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -806,7 +806,7 @@ int fint(void) { return ftemplate<int>(); }
 // CHECK1-NEXT:    [[ADD4:%.*]] = add nsw i32 [[TMP14]], 1
 // CHECK1-NEXT:    [[TMP15:%.*]] = zext i32 [[ADD4]] to i64
 // CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP16]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP16]], align 4
 // CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 2, ptr [[TMP17]], align 4
 // CHECK1-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -996,7 +996,7 @@ int fint(void) { return ftemplate<int>(); }
 // CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP7]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP7]], align 4
 // CHECK1-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP8]], align 4
 // CHECK1-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1176,7 +1176,7 @@ int fint(void) { return ftemplate<int>(); }
 // CHECK3-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP18]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP18]], align 4
 // CHECK3-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 4, ptr [[TMP19]], align 4
 // CHECK3-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1372,7 +1372,7 @@ int fint(void) { return ftemplate<int>(); }
 // CHECK3-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP18]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP18]], align 4
 // CHECK3-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 4, ptr [[TMP19]], align 4
 // CHECK3-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1566,7 +1566,7 @@ int fint(void) { return ftemplate<int>(); }
 // CHECK3-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP18]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP18]], align 4
 // CHECK3-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 4, ptr [[TMP19]], align 4
 // CHECK3-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1779,7 +1779,7 @@ int fint(void) { return ftemplate<int>(); }
 // CHECK3-NEXT:    [[ADD4:%.*]] = add nsw i32 [[TMP14]], 1
 // CHECK3-NEXT:    [[TMP15:%.*]] = zext i32 [[ADD4]] to i64
 // CHECK3-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP16]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP16]], align 4
 // CHECK3-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 2, ptr [[TMP17]], align 4
 // CHECK3-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1969,7 +1969,7 @@ int fint(void) { return ftemplate<int>(); }
 // CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP7]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP7]], align 4
 // CHECK3-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP8]], align 4
 // CHECK3-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2149,7 +2149,7 @@ int fint(void) { return ftemplate<int>(); }
 // CHECK5-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP18]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP18]], align 4
 // CHECK5-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 4, ptr [[TMP19]], align 4
 // CHECK5-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2349,7 +2349,7 @@ int fint(void) { return ftemplate<int>(); }
 // CHECK5-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP18]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP18]], align 4
 // CHECK5-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 4, ptr [[TMP19]], align 4
 // CHECK5-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2547,7 +2547,7 @@ int fint(void) { return ftemplate<int>(); }
 // CHECK5-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP18]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP18]], align 4
 // CHECK5-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 4, ptr [[TMP19]], align 4
 // CHECK5-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2764,7 +2764,7 @@ int fint(void) { return ftemplate<int>(); }
 // CHECK5-NEXT:    [[ADD4:%.*]] = add nsw i32 [[TMP14]], 1
 // CHECK5-NEXT:    [[TMP15:%.*]] = zext i32 [[ADD4]] to i64
 // CHECK5-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP16]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP16]], align 4
 // CHECK5-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 2, ptr [[TMP17]], align 4
 // CHECK5-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2985,7 +2985,7 @@ int fint(void) { return ftemplate<int>(); }
 // CHECK5-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP7]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP7]], align 4
 // CHECK5-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 1, ptr [[TMP8]], align 4
 // CHECK5-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -3165,7 +3165,7 @@ int fint(void) { return ftemplate<int>(); }
 // CHECK7-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK7-NEXT:    store i32 2, ptr [[TMP18]], align 4
+// CHECK7-NEXT:    store i32 3, ptr [[TMP18]], align 4
 // CHECK7-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK7-NEXT:    store i32 4, ptr [[TMP19]], align 4
 // CHECK7-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -3361,7 +3361,7 @@ int fint(void) { return ftemplate<int>(); }
 // CHECK7-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK7-NEXT:    store i32 2, ptr [[TMP18]], align 4
+// CHECK7-NEXT:    store i32 3, ptr [[TMP18]], align 4
 // CHECK7-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK7-NEXT:    store i32 4, ptr [[TMP19]], align 4
 // CHECK7-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -3555,7 +3555,7 @@ int fint(void) { return ftemplate<int>(); }
 // CHECK7-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK7-NEXT:    store i32 2, ptr [[TMP18]], align 4
+// CHECK7-NEXT:    store i32 3, ptr [[TMP18]], align 4
 // CHECK7-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK7-NEXT:    store i32 4, ptr [[TMP19]], align 4
 // CHECK7-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -3768,7 +3768,7 @@ int fint(void) { return ftemplate<int>(); }
 // CHECK7-NEXT:    [[ADD4:%.*]] = add nsw i32 [[TMP14]], 1
 // CHECK7-NEXT:    [[TMP15:%.*]] = zext i32 [[ADD4]] to i64
 // CHECK7-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK7-NEXT:    store i32 2, ptr [[TMP16]], align 4
+// CHECK7-NEXT:    store i32 3, ptr [[TMP16]], align 4
 // CHECK7-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK7-NEXT:    store i32 2, ptr [[TMP17]], align 4
 // CHECK7-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -3989,7 +3989,7 @@ int fint(void) { return ftemplate<int>(); }
 // CHECK7-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK7-NEXT:    store i32 2, ptr [[TMP7]], align 4
+// CHECK7-NEXT:    store i32 3, ptr [[TMP7]], align 4
 // CHECK7-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK7-NEXT:    store i32 1, ptr [[TMP8]], align 4
 // CHECK7-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/distribute_simd_firstprivate_codegen.cpp b/clang/test/OpenMP/distribute_simd_firstprivate_codegen.cpp
index e388393c37f18d..1b5950ce11d7fa 100644
--- a/clang/test/OpenMP/distribute_simd_firstprivate_codegen.cpp
+++ b/clang/test/OpenMP/distribute_simd_firstprivate_codegen.cpp
@@ -595,7 +595,7 @@ int main() {
 // CHECK9-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK9-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 5, ptr [[TMP26]], align 4
 // CHECK9-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -898,7 +898,7 @@ int main() {
 // CHECK9-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP20]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP20]], align 4
 // CHECK9-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 4, ptr [[TMP21]], align 4
 // CHECK9-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1274,7 +1274,7 @@ int main() {
 // CHECK11-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK11-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 5, ptr [[TMP26]], align 4
 // CHECK11-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1575,7 +1575,7 @@ int main() {
 // CHECK11-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP20]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP20]], align 4
 // CHECK11-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 4, ptr [[TMP21]], align 4
 // CHECK11-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/distribute_simd_lastprivate_codegen.cpp b/clang/test/OpenMP/distribute_simd_lastprivate_codegen.cpp
index 5720c3aaaaff52..92b73b220bcc21 100644
--- a/clang/test/OpenMP/distribute_simd_lastprivate_codegen.cpp
+++ b/clang/test/OpenMP/distribute_simd_lastprivate_codegen.cpp
@@ -582,7 +582,7 @@ int main() {
 // CHECK9-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK9-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 5, ptr [[TMP26]], align 4
 // CHECK9-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -903,7 +903,7 @@ int main() {
 // CHECK9-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP20]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP20]], align 4
 // CHECK9-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 4, ptr [[TMP21]], align 4
 // CHECK9-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1298,7 +1298,7 @@ int main() {
 // CHECK11-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK11-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 5, ptr [[TMP26]], align 4
 // CHECK11-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1617,7 +1617,7 @@ int main() {
 // CHECK11-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP20]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP20]], align 4
 // CHECK11-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 4, ptr [[TMP21]], align 4
 // CHECK11-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/distribute_simd_private_codegen.cpp b/clang/test/OpenMP/distribute_simd_private_codegen.cpp
index eac5bce7d5da80..93b2bd8b12a8ee 100644
--- a/clang/test/OpenMP/distribute_simd_private_codegen.cpp
+++ b/clang/test/OpenMP/distribute_simd_private_codegen.cpp
@@ -396,7 +396,7 @@ int main() {
 // CHECK9-NEXT:    store ptr [[TEST]], ptr [[VAR]], align 8
 // CHECK9-NEXT:    store ptr undef, ptr [[_TMP1]], align 8
 // CHECK9-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK9-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK9-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -440,7 +440,7 @@ int main() {
 // CHECK9-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS3]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP22]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP22]], align 4
 // CHECK9-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS3]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 1, ptr [[TMP23]], align 4
 // CHECK9-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS3]], i32 0, i32 2
@@ -742,7 +742,7 @@ int main() {
 // CHECK9-NEXT:    store ptr [[TEST]], ptr [[VAR]], align 8
 // CHECK9-NEXT:    store ptr undef, ptr [[_TMP1]], align 8
 // CHECK9-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK9-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK9-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1042,7 +1042,7 @@ int main() {
 // CHECK11-NEXT:    store ptr [[TEST]], ptr [[VAR]], align 4
 // CHECK11-NEXT:    store ptr undef, ptr [[_TMP1]], align 4
 // CHECK11-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK11-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK11-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1086,7 +1086,7 @@ int main() {
 // CHECK11-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS3]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP22]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP22]], align 4
 // CHECK11-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS3]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 1, ptr [[TMP23]], align 4
 // CHECK11-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS3]], i32 0, i32 2
@@ -1386,7 +1386,7 @@ int main() {
 // CHECK11-NEXT:    store ptr [[TEST]], ptr [[VAR]], align 4
 // CHECK11-NEXT:    store ptr undef, ptr [[_TMP1]], align 4
 // CHECK11-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK11-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK11-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/distribute_simd_reduction_codegen.cpp b/clang/test/OpenMP/distribute_simd_reduction_codegen.cpp
index 31d276e984eee2..b6cc1adfc76beb 100644
--- a/clang/test/OpenMP/distribute_simd_reduction_codegen.cpp
+++ b/clang/test/OpenMP/distribute_simd_reduction_codegen.cpp
@@ -105,7 +105,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP7]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP7]], align 4
 // CHECK1-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP8]], align 4
 // CHECK1-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -255,7 +255,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP7]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP7]], align 4
 // CHECK1-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP8]], align 4
 // CHECK1-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -402,7 +402,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP7]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP7]], align 4
 // CHECK3-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP8]], align 4
 // CHECK3-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -552,7 +552,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP7]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP7]], align 4
 // CHECK3-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP8]], align 4
 // CHECK3-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/map_struct_ordering.cpp b/clang/test/OpenMP/map_struct_ordering.cpp
index dd0bb8e9d6e7cd..e422010f3dcdfa 100644
--- a/clang/test/OpenMP/map_struct_ordering.cpp
+++ b/clang/test/OpenMP/map_struct_ordering.cpp
@@ -97,7 +97,7 @@ int map_struct() {
 // CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS4]], i32 0, i32 0
 // CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS5]], i32 0, i32 0
 // CHECK-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK-NEXT:    store i32 2, ptr [[TMP24]], align 4
+// CHECK-NEXT:    store i32 3, ptr [[TMP24]], align 4
 // CHECK-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK-NEXT:    store i32 1, ptr [[TMP25]], align 4
 // CHECK-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/nvptx_lambda_capturing.cpp b/clang/test/OpenMP/nvptx_lambda_capturing.cpp
index 5d7da793e73267..86035e17ef3b60 100644
--- a/clang/test/OpenMP/nvptx_lambda_capturing.cpp
+++ b/clang/test/OpenMP/nvptx_lambda_capturing.cpp
@@ -191,7 +191,7 @@ int main(int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP64:%.*]] = getelementptr inbounds [11 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP65:%.*]] = getelementptr inbounds [11 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP66:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP66]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP66]], align 4
 // CHECK1-NEXT:    [[TMP67:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 11, ptr [[TMP67]], align 4
 // CHECK1-NEXT:    [[TMP68:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -317,7 +317,7 @@ int main(int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP136:%.*]] = getelementptr inbounds [11 x ptr], ptr [[DOTOFFLOAD_BASEPTRS7]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP137:%.*]] = getelementptr inbounds [11 x ptr], ptr [[DOTOFFLOAD_PTRS8]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP138:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP138]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP138]], align 4
 // CHECK1-NEXT:    [[TMP139:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 11, ptr [[TMP139]], align 4
 // CHECK1-NEXT:    [[TMP140:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 2
@@ -541,7 +541,7 @@ int main(int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP16]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP16]], align 4
 // CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 3, ptr [[TMP17]], align 4
 // CHECK1-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -599,7 +599,7 @@ int main(int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS3]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS4]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP46]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP46]], align 4
 // CHECK1-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 3, ptr [[TMP47]], align 4
 // CHECK1-NEXT:    [[TMP48:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 2
@@ -744,7 +744,7 @@ int main(int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP12]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP12]], align 4
 // CHECK1-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 2, ptr [[TMP13]], align 4
 // CHECK1-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/reduction_implicit_map.cpp b/clang/test/OpenMP/reduction_implicit_map.cpp
index 7305c56289e01b..0f67cdc56ddcf8 100644
--- a/clang/test/OpenMP/reduction_implicit_map.cpp
+++ b/clang/test/OpenMP/reduction_implicit_map.cpp
@@ -318,7 +318,7 @@ int main()
 // CHECK1-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -359,7 +359,7 @@ int main()
 // CHECK1-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS2]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK1-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK1-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 2
@@ -890,7 +890,7 @@ int main()
 // CHECK2-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP26]], 1
 // CHECK2-NEXT:    [[TMP27:%.*]] = zext i32 [[ADD]] to i64
 // CHECK2-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK2-NEXT:    store i32 2, ptr [[TMP28]], align 4
+// CHECK2-NEXT:    store i32 3, ptr [[TMP28]], align 4
 // CHECK2-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK2-NEXT:    store i32 3, ptr [[TMP29]], align 4
 // CHECK2-NEXT:    [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -971,7 +971,7 @@ int main()
 // CHECK2-NEXT:    [[ADD17:%.*]] = add nsw i32 [[TMP69]], 1
 // CHECK2-NEXT:    [[TMP70:%.*]] = zext i32 [[ADD17]] to i64
 // CHECK2-NEXT:    [[TMP71:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS18]], i32 0, i32 0
-// CHECK2-NEXT:    store i32 2, ptr [[TMP71]], align 4
+// CHECK2-NEXT:    store i32 3, ptr [[TMP71]], align 4
 // CHECK2-NEXT:    [[TMP72:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS18]], i32 0, i32 1
 // CHECK2-NEXT:    store i32 3, ptr [[TMP72]], align 4
 // CHECK2-NEXT:    [[TMP73:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS18]], i32 0, i32 2
@@ -1022,7 +1022,7 @@ int main()
 // CHECK2-NEXT:    [[TMP94:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS23]], i32 0, i32 0
 // CHECK2-NEXT:    [[TMP95:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS24]], i32 0, i32 0
 // CHECK2-NEXT:    [[TMP96:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS26]], i32 0, i32 0
-// CHECK2-NEXT:    store i32 2, ptr [[TMP96]], align 4
+// CHECK2-NEXT:    store i32 3, ptr [[TMP96]], align 4
 // CHECK2-NEXT:    [[TMP97:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS26]], i32 0, i32 1
 // CHECK2-NEXT:    store i32 2, ptr [[TMP97]], align 4
 // CHECK2-NEXT:    [[TMP98:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS26]], i32 0, i32 2
@@ -1073,7 +1073,7 @@ int main()
 // CHECK2-NEXT:    [[TMP119:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS31]], i32 0, i32 0
 // CHECK2-NEXT:    [[TMP120:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS32]], i32 0, i32 0
 // CHECK2-NEXT:    [[TMP121:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS34]], i32 0, i32 0
-// CHECK2-NEXT:    store i32 2, ptr [[TMP121]], align 4
+// CHECK2-NEXT:    store i32 3, ptr [[TMP121]], align 4
 // CHECK2-NEXT:    [[TMP122:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS34]], i32 0, i32 1
 // CHECK2-NEXT:    store i32 2, ptr [[TMP122]], align 4
 // CHECK2-NEXT:    [[TMP123:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS34]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_codegen_global_capture.cpp b/clang/test/OpenMP/target_codegen_global_capture.cpp
index 0fb52c3fa6c55c..d2c800a56a104a 100644
--- a/clang/test/OpenMP/target_codegen_global_capture.cpp
+++ b/clang/test/OpenMP/target_codegen_global_capture.cpp
@@ -288,7 +288,7 @@ int tbar2(short a, short b, short c, short d){
 // CHECK1-NEXT:    [[TMP48:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP49:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP50:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP50]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP50]], align 4
 // CHECK1-NEXT:    [[TMP51:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 9, ptr [[TMP51]], align 4
 // CHECK1-NEXT:    [[TMP52:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -615,7 +615,7 @@ int tbar2(short a, short b, short c, short d){
 // CHECK1-NEXT:    [[TMP52:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP53:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP54:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP54]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP54]], align 4
 // CHECK1-NEXT:    [[TMP55:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 9, ptr [[TMP55]], align 4
 // CHECK1-NEXT:    [[TMP56:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -938,7 +938,7 @@ int tbar2(short a, short b, short c, short d){
 // CHECK1-NEXT:    [[TMP52:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP53:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP54:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP54]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP54]], align 4
 // CHECK1-NEXT:    [[TMP55:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 9, ptr [[TMP55]], align 4
 // CHECK1-NEXT:    [[TMP56:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1184,7 +1184,7 @@ int tbar2(short a, short b, short c, short d){
 // CHECK3-NEXT:    [[TMP42:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP44]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP44]], align 4
 // CHECK3-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 9, ptr [[TMP45]], align 4
 // CHECK3-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1511,7 +1511,7 @@ int tbar2(short a, short b, short c, short d){
 // CHECK3-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP48:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP48]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP48]], align 4
 // CHECK3-NEXT:    [[TMP49:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 9, ptr [[TMP49]], align 4
 // CHECK3-NEXT:    [[TMP50:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1834,7 +1834,7 @@ int tbar2(short a, short b, short c, short d){
 // CHECK3-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP48:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP48]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP48]], align 4
 // CHECK3-NEXT:    [[TMP49:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 9, ptr [[TMP49]], align 4
 // CHECK3-NEXT:    [[TMP50:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_data_use_device_ptr_inheritance_codegen.cpp b/clang/test/OpenMP/target_data_use_device_ptr_inheritance_codegen.cpp
index 3db18643d3f033..20d3bd2ce11c32 100644
--- a/clang/test/OpenMP/target_data_use_device_ptr_inheritance_codegen.cpp
+++ b/clang/test/OpenMP/target_data_use_device_ptr_inheritance_codegen.cpp
@@ -91,7 +91,7 @@ void foo() {
 // CHECK-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
 // CHECK-NEXT:    [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
 // CHECK-NEXT:    call void @_ZN1AC2Ev(ptr noundef nonnull align 8 dereferenceable(16) [[THIS1]]) #[[ATTR1]]
-// CHECK-NEXT:    store ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTV1B, i32 0, inrange i32 0, i32 2), ptr [[THIS1]], align 8
+// CHECK-NEXT:    store ptr getelementptr inbounds inrange(-16, 8) ({ [3 x ptr] }, ptr @_ZTV1B, i32 0, i32 0, i32 2), ptr [[THIS1]], align 8
 // CHECK-NEXT:    ret void
 // CHECK-LABEL: define {{[^@]+}}@_ZN1AC2Ev
 // CHECK-SAME: (ptr noundef nonnull align 8 dereferenceable(16) [[THIS:%.*]]) unnamed_addr #[[ATTR2]] comdat align 2 {
@@ -99,7 +99,7 @@ void foo() {
 // CHECK-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
 // CHECK-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
 // CHECK-NEXT:    [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
-// CHECK-NEXT:    store ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTV1A, i32 0, inrange i32 0, i32 2), ptr [[THIS1]], align 8
+// CHECK-NEXT:    store ptr getelementptr inbounds inrange(-16, 8) ({ [3 x ptr] }, ptr @_ZTV1A, i32 0, i32 0, i32 2), ptr [[THIS1]], align 8
 // CHECK-NEXT:    [[PTR:%.*]] = getelementptr inbounds [[CLASS_A:%.*]], ptr [[THIS1]], i32 0, i32 1
 // CHECK-NEXT:    store ptr null, ptr [[PTR]], align 8
 // CHECK-NEXT:    ret void
@@ -168,7 +168,7 @@ void foo() {
 // CHECK1-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
 // CHECK1-NEXT:    [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
 // CHECK1-NEXT:    call void @_ZN1AC2Ev(ptr noundef nonnull align 8 dereferenceable(16) [[THIS1]]) #[[ATTR1]]
-// CHECK1-NEXT:    store ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTV1B, i32 0, inrange i32 0, i32 2), ptr [[THIS1]], align 8
+// CHECK1-NEXT:    store ptr getelementptr inbounds inrange(-16, 8) ({ [3 x ptr] }, ptr @_ZTV1B, i32 0, i32 0, i32 2), ptr [[THIS1]], align 8
 // CHECK1-NEXT:    ret void
 //
 //
@@ -178,7 +178,7 @@ void foo() {
 // CHECK1-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
 // CHECK1-NEXT:    [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
-// CHECK1-NEXT:    store ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTV1A, i32 0, inrange i32 0, i32 2), ptr [[THIS1]], align 8
+// CHECK1-NEXT:    store ptr getelementptr inbounds inrange(-16, 8) ({ [3 x ptr] }, ptr @_ZTV1A, i32 0, i32 0, i32 2), ptr [[THIS1]], align 8
 // CHECK1-NEXT:    [[PTR:%.*]] = getelementptr inbounds [[CLASS_A:%.*]], ptr [[THIS1]], i32 0, i32 1
 // CHECK1-NEXT:    store ptr null, ptr [[PTR]], align 8
 // CHECK1-NEXT:    ret void
@@ -249,7 +249,7 @@ void foo() {
 // CHECK2-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 4
 // CHECK2-NEXT:    [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4
 // CHECK2-NEXT:    call void @_ZN1AC2Ev(ptr noundef nonnull align 4 dereferenceable(8) [[THIS1]]) #[[ATTR1]]
-// CHECK2-NEXT:    store ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTV1B, i32 0, inrange i32 0, i32 2), ptr [[THIS1]], align 4
+// CHECK2-NEXT:    store ptr getelementptr inbounds inrange(-8, 4) ({ [3 x ptr] }, ptr @_ZTV1B, i32 0, i32 0, i32 2), ptr [[THIS1]], align 4
 // CHECK2-NEXT:    ret void
 //
 //
@@ -259,7 +259,7 @@ void foo() {
 // CHECK2-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 4
 // CHECK2-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 4
 // CHECK2-NEXT:    [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4
-// CHECK2-NEXT:    store ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTV1A, i32 0, inrange i32 0, i32 2), ptr [[THIS1]], align 4
+// CHECK2-NEXT:    store ptr getelementptr inbounds inrange(-8, 4) ({ [3 x ptr] }, ptr @_ZTV1A, i32 0, i32 0, i32 2), ptr [[THIS1]], align 4
 // CHECK2-NEXT:    [[PTR:%.*]] = getelementptr inbounds [[CLASS_A:%.*]], ptr [[THIS1]], i32 0, i32 1
 // CHECK2-NEXT:    store ptr null, ptr [[PTR]], align 4
 // CHECK2-NEXT:    ret void
diff --git a/clang/test/OpenMP/target_firstprivate_codegen.cpp b/clang/test/OpenMP/target_firstprivate_codegen.cpp
index 6314940730470e..02de48e6aa3956 100644
--- a/clang/test/OpenMP/target_firstprivate_codegen.cpp
+++ b/clang/test/OpenMP/target_firstprivate_codegen.cpp
@@ -6159,7 +6159,7 @@ int bar(int n, double *ptr) {
 // CHECK0-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK0-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK0-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK0-NEXT:    store i32 2, ptr [[TMP24]], align 4
+// CHECK0-NEXT:    store i32 3, ptr [[TMP24]], align 4
 // CHECK0-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK0-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK0-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -6260,7 +6260,7 @@ int bar(int n, double *ptr) {
 // CHECK0-NEXT:    [[TMP74:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_PTRS3]], i32 0, i32 0
 // CHECK0-NEXT:    [[TMP75:%.*]] = getelementptr inbounds [9 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CHECK0-NEXT:    [[TMP76:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0
-// CHECK0-NEXT:    store i32 2, ptr [[TMP76]], align 4
+// CHECK0-NEXT:    store i32 3, ptr [[TMP76]], align 4
 // CHECK0-NEXT:    [[TMP77:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1
 // CHECK0-NEXT:    store i32 9, ptr [[TMP77]], align 4
 // CHECK0-NEXT:    [[TMP78:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2
@@ -6308,7 +6308,7 @@ int bar(int n, double *ptr) {
 // CHECK0-NEXT:    [[TMP98:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS8]], i32 0, i32 0
 // CHECK0-NEXT:    [[TMP99:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS9]], i32 0, i32 0
 // CHECK0-NEXT:    [[TMP100:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS11]], i32 0, i32 0
-// CHECK0-NEXT:    store i32 2, ptr [[TMP100]], align 4
+// CHECK0-NEXT:    store i32 3, ptr [[TMP100]], align 4
 // CHECK0-NEXT:    [[TMP101:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS11]], i32 0, i32 1
 // CHECK0-NEXT:    store i32 2, ptr [[TMP101]], align 4
 // CHECK0-NEXT:    [[TMP102:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS11]], i32 0, i32 2
@@ -6564,7 +6564,7 @@ int bar(int n, double *ptr) {
 // CHECK0-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK0-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [5 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CHECK0-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK0-NEXT:    store i32 2, ptr [[TMP28]], align 4
+// CHECK0-NEXT:    store i32 3, ptr [[TMP28]], align 4
 // CHECK0-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK0-NEXT:    store i32 5, ptr [[TMP29]], align 4
 // CHECK0-NEXT:    [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -6651,7 +6651,7 @@ int bar(int n, double *ptr) {
 // CHECK0-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK0-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK0-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK0-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK0-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK0-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK0-NEXT:    store i32 3, ptr [[TMP16]], align 4
 // CHECK0-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -6718,7 +6718,7 @@ int bar(int n, double *ptr) {
 // CHECK0-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK0-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK0-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK0-NEXT:    store i32 2, ptr [[TMP10]], align 4
+// CHECK0-NEXT:    store i32 3, ptr [[TMP10]], align 4
 // CHECK0-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK0-NEXT:    store i32 2, ptr [[TMP11]], align 4
 // CHECK0-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -6930,7 +6930,7 @@ int bar(int n, double *ptr) {
 // CHECK1-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP24]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP24]], align 4
 // CHECK1-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK1-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -7031,7 +7031,7 @@ int bar(int n, double *ptr) {
 // CHECK1-NEXT:    [[TMP74:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_PTRS3]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP75:%.*]] = getelementptr inbounds [9 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP76:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP76]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP76]], align 4
 // CHECK1-NEXT:    [[TMP77:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 9, ptr [[TMP77]], align 4
 // CHECK1-NEXT:    [[TMP78:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2
@@ -7079,7 +7079,7 @@ int bar(int n, double *ptr) {
 // CHECK1-NEXT:    [[TMP98:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS8]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP99:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS9]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP100:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS11]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP100]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP100]], align 4
 // CHECK1-NEXT:    [[TMP101:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS11]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 2, ptr [[TMP101]], align 4
 // CHECK1-NEXT:    [[TMP102:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS11]], i32 0, i32 2
@@ -7335,7 +7335,7 @@ int bar(int n, double *ptr) {
 // CHECK1-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [5 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP28]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP28]], align 4
 // CHECK1-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 5, ptr [[TMP29]], align 4
 // CHECK1-NEXT:    [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -7422,7 +7422,7 @@ int bar(int n, double *ptr) {
 // CHECK1-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 3, ptr [[TMP16]], align 4
 // CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -7489,7 +7489,7 @@ int bar(int n, double *ptr) {
 // CHECK1-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP10]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP10]], align 4
 // CHECK1-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 2, ptr [[TMP11]], align 4
 // CHECK1-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -7699,7 +7699,7 @@ int bar(int n, double *ptr) {
 // CHECK2-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK2-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK2-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK2-NEXT:    store i32 2, ptr [[TMP22]], align 4
+// CHECK2-NEXT:    store i32 3, ptr [[TMP22]], align 4
 // CHECK2-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK2-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK2-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -7802,7 +7802,7 @@ int bar(int n, double *ptr) {
 // CHECK2-NEXT:    [[TMP74:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_PTRS3]], i32 0, i32 0
 // CHECK2-NEXT:    [[TMP75:%.*]] = getelementptr inbounds [9 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CHECK2-NEXT:    [[TMP76:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0
-// CHECK2-NEXT:    store i32 2, ptr [[TMP76]], align 4
+// CHECK2-NEXT:    store i32 3, ptr [[TMP76]], align 4
 // CHECK2-NEXT:    [[TMP77:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1
 // CHECK2-NEXT:    store i32 9, ptr [[TMP77]], align 4
 // CHECK2-NEXT:    [[TMP78:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2
@@ -7850,7 +7850,7 @@ int bar(int n, double *ptr) {
 // CHECK2-NEXT:    [[TMP98:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS8]], i32 0, i32 0
 // CHECK2-NEXT:    [[TMP99:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS9]], i32 0, i32 0
 // CHECK2-NEXT:    [[TMP100:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS11]], i32 0, i32 0
-// CHECK2-NEXT:    store i32 2, ptr [[TMP100]], align 4
+// CHECK2-NEXT:    store i32 3, ptr [[TMP100]], align 4
 // CHECK2-NEXT:    [[TMP101:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS11]], i32 0, i32 1
 // CHECK2-NEXT:    store i32 2, ptr [[TMP101]], align 4
 // CHECK2-NEXT:    [[TMP102:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS11]], i32 0, i32 2
@@ -8106,7 +8106,7 @@ int bar(int n, double *ptr) {
 // CHECK2-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK2-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [5 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CHECK2-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK2-NEXT:    store i32 2, ptr [[TMP28]], align 4
+// CHECK2-NEXT:    store i32 3, ptr [[TMP28]], align 4
 // CHECK2-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK2-NEXT:    store i32 5, ptr [[TMP29]], align 4
 // CHECK2-NEXT:    [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -8193,7 +8193,7 @@ int bar(int n, double *ptr) {
 // CHECK2-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK2-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK2-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK2-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK2-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK2-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK2-NEXT:    store i32 3, ptr [[TMP16]], align 4
 // CHECK2-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -8260,7 +8260,7 @@ int bar(int n, double *ptr) {
 // CHECK2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK2-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK2-NEXT:    store i32 2, ptr [[TMP10]], align 4
+// CHECK2-NEXT:    store i32 3, ptr [[TMP10]], align 4
 // CHECK2-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK2-NEXT:    store i32 2, ptr [[TMP11]], align 4
 // CHECK2-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -8470,7 +8470,7 @@ int bar(int n, double *ptr) {
 // CHECK3-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP22]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP22]], align 4
 // CHECK3-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK3-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -8573,7 +8573,7 @@ int bar(int n, double *ptr) {
 // CHECK3-NEXT:    [[TMP74:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_PTRS3]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP75:%.*]] = getelementptr inbounds [9 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP76:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP76]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP76]], align 4
 // CHECK3-NEXT:    [[TMP77:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 9, ptr [[TMP77]], align 4
 // CHECK3-NEXT:    [[TMP78:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2
@@ -8621,7 +8621,7 @@ int bar(int n, double *ptr) {
 // CHECK3-NEXT:    [[TMP98:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS8]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP99:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS9]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP100:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS11]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP100]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP100]], align 4
 // CHECK3-NEXT:    [[TMP101:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS11]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 2, ptr [[TMP101]], align 4
 // CHECK3-NEXT:    [[TMP102:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS11]], i32 0, i32 2
@@ -8877,7 +8877,7 @@ int bar(int n, double *ptr) {
 // CHECK3-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [5 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP28]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP28]], align 4
 // CHECK3-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 5, ptr [[TMP29]], align 4
 // CHECK3-NEXT:    [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -8964,7 +8964,7 @@ int bar(int n, double *ptr) {
 // CHECK3-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK3-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 3, ptr [[TMP16]], align 4
 // CHECK3-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -9031,7 +9031,7 @@ int bar(int n, double *ptr) {
 // CHECK3-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP10]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP10]], align 4
 // CHECK3-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 2, ptr [[TMP11]], align 4
 // CHECK3-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_has_device_addr_codegen.cpp b/clang/test/OpenMP/target_has_device_addr_codegen.cpp
index e6a0e7bb38d646..ba1b618ed8bdd7 100644
--- a/clang/test/OpenMP/target_has_device_addr_codegen.cpp
+++ b/clang/test/OpenMP/target_has_device_addr_codegen.cpp
@@ -328,7 +328,7 @@ void use_template() {
 // CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK-NEXT:    store i32 2, ptr [[TMP6]], align 4
+// CHECK-NEXT:    store i32 3, ptr [[TMP6]], align 4
 // CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK-NEXT:    store i32 1, ptr [[TMP7]], align 4
 // CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -372,7 +372,7 @@ void use_template() {
 // CHECK-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0
 // CHECK-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS2]], i32 0, i32 0
 // CHECK-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 0
-// CHECK-NEXT:    store i32 2, ptr [[TMP28]], align 4
+// CHECK-NEXT:    store i32 3, ptr [[TMP28]], align 4
 // CHECK-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 1
 // CHECK-NEXT:    store i32 1, ptr [[TMP29]], align 4
 // CHECK-NEXT:    [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 2
@@ -413,7 +413,7 @@ void use_template() {
 // CHECK-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS7]], i32 0, i32 0
 // CHECK-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS8]], i32 0, i32 0
 // CHECK-NEXT:    [[TMP48:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 0
-// CHECK-NEXT:    store i32 2, ptr [[TMP48]], align 4
+// CHECK-NEXT:    store i32 3, ptr [[TMP48]], align 4
 // CHECK-NEXT:    [[TMP49:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 1
 // CHECK-NEXT:    store i32 1, ptr [[TMP49]], align 4
 // CHECK-NEXT:    [[TMP50:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 2
@@ -457,7 +457,7 @@ void use_template() {
 // CHECK-NEXT:    [[TMP68:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS14]], i32 0, i32 0
 // CHECK-NEXT:    [[TMP69:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS15]], i32 0, i32 0
 // CHECK-NEXT:    [[TMP70:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS17]], i32 0, i32 0
-// CHECK-NEXT:    store i32 2, ptr [[TMP70]], align 4
+// CHECK-NEXT:    store i32 3, ptr [[TMP70]], align 4
 // CHECK-NEXT:    [[TMP71:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS17]], i32 0, i32 1
 // CHECK-NEXT:    store i32 1, ptr [[TMP71]], align 4
 // CHECK-NEXT:    [[TMP72:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS17]], i32 0, i32 2
@@ -498,7 +498,7 @@ void use_template() {
 // CHECK-NEXT:    [[TMP88:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS20]], i32 0, i32 0
 // CHECK-NEXT:    [[TMP89:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS21]], i32 0, i32 0
 // CHECK-NEXT:    [[TMP90:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 0
-// CHECK-NEXT:    store i32 2, ptr [[TMP90]], align 4
+// CHECK-NEXT:    store i32 3, ptr [[TMP90]], align 4
 // CHECK-NEXT:    [[TMP91:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 1
 // CHECK-NEXT:    store i32 1, ptr [[TMP91]], align 4
 // CHECK-NEXT:    [[TMP92:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 2
@@ -539,7 +539,7 @@ void use_template() {
 // CHECK-NEXT:    [[TMP108:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS26]], i32 0, i32 0
 // CHECK-NEXT:    [[TMP109:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS27]], i32 0, i32 0
 // CHECK-NEXT:    [[TMP110:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS29]], i32 0, i32 0
-// CHECK-NEXT:    store i32 2, ptr [[TMP110]], align 4
+// CHECK-NEXT:    store i32 3, ptr [[TMP110]], align 4
 // CHECK-NEXT:    [[TMP111:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS29]], i32 0, i32 1
 // CHECK-NEXT:    store i32 1, ptr [[TMP111]], align 4
 // CHECK-NEXT:    [[TMP112:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS29]], i32 0, i32 2
@@ -705,7 +705,7 @@ void use_template() {
 // CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK-NEXT:    store i32 2, ptr [[TMP6]], align 4
+// CHECK-NEXT:    store i32 3, ptr [[TMP6]], align 4
 // CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK-NEXT:    store i32 1, ptr [[TMP7]], align 4
 // CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -749,7 +749,7 @@ void use_template() {
 // CHECK-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0
 // CHECK-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS2]], i32 0, i32 0
 // CHECK-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 0
-// CHECK-NEXT:    store i32 2, ptr [[TMP28]], align 4
+// CHECK-NEXT:    store i32 3, ptr [[TMP28]], align 4
 // CHECK-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 1
 // CHECK-NEXT:    store i32 1, ptr [[TMP29]], align 4
 // CHECK-NEXT:    [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 2
@@ -790,7 +790,7 @@ void use_template() {
 // CHECK-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS7]], i32 0, i32 0
 // CHECK-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS8]], i32 0, i32 0
 // CHECK-NEXT:    [[TMP48:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 0
-// CHECK-NEXT:    store i32 2, ptr [[TMP48]], align 4
+// CHECK-NEXT:    store i32 3, ptr [[TMP48]], align 4
 // CHECK-NEXT:    [[TMP49:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 1
 // CHECK-NEXT:    store i32 1, ptr [[TMP49]], align 4
 // CHECK-NEXT:    [[TMP50:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 2
@@ -831,7 +831,7 @@ void use_template() {
 // CHECK-NEXT:    [[TMP66:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS13]], i32 0, i32 0
 // CHECK-NEXT:    [[TMP67:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS14]], i32 0, i32 0
 // CHECK-NEXT:    [[TMP68:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 0
-// CHECK-NEXT:    store i32 2, ptr [[TMP68]], align 4
+// CHECK-NEXT:    store i32 3, ptr [[TMP68]], align 4
 // CHECK-NEXT:    [[TMP69:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 1
 // CHECK-NEXT:    store i32 1, ptr [[TMP69]], align 4
 // CHECK-NEXT:    [[TMP70:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 2
@@ -911,7 +911,7 @@ void use_template() {
 // CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK-NEXT:    store i32 2, ptr [[TMP6]], align 4
+// CHECK-NEXT:    store i32 3, ptr [[TMP6]], align 4
 // CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK-NEXT:    store i32 1, ptr [[TMP7]], align 4
 // CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -955,7 +955,7 @@ void use_template() {
 // CHECK-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0
 // CHECK-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS2]], i32 0, i32 0
 // CHECK-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 0
-// CHECK-NEXT:    store i32 2, ptr [[TMP28]], align 4
+// CHECK-NEXT:    store i32 3, ptr [[TMP28]], align 4
 // CHECK-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 1
 // CHECK-NEXT:    store i32 1, ptr [[TMP29]], align 4
 // CHECK-NEXT:    [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 2
@@ -996,7 +996,7 @@ void use_template() {
 // CHECK-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS7]], i32 0, i32 0
 // CHECK-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS8]], i32 0, i32 0
 // CHECK-NEXT:    [[TMP48:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 0
-// CHECK-NEXT:    store i32 2, ptr [[TMP48]], align 4
+// CHECK-NEXT:    store i32 3, ptr [[TMP48]], align 4
 // CHECK-NEXT:    [[TMP49:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 1
 // CHECK-NEXT:    store i32 1, ptr [[TMP49]], align 4
 // CHECK-NEXT:    [[TMP50:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 2
@@ -1037,7 +1037,7 @@ void use_template() {
 // CHECK-NEXT:    [[TMP66:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS13]], i32 0, i32 0
 // CHECK-NEXT:    [[TMP67:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS14]], i32 0, i32 0
 // CHECK-NEXT:    [[TMP68:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 0
-// CHECK-NEXT:    store i32 2, ptr [[TMP68]], align 4
+// CHECK-NEXT:    store i32 3, ptr [[TMP68]], align 4
 // CHECK-NEXT:    [[TMP69:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 1
 // CHECK-NEXT:    store i32 1, ptr [[TMP69]], align 4
 // CHECK-NEXT:    [[TMP70:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 2
@@ -1239,7 +1239,7 @@ void use_template() {
 // CHECK-NEXT:    [[TMP19:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
 // CHECK-NEXT:    [[TMP20:%.*]] = sext i32 [[TMP19]] to i64
 // CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK-NEXT:    store i32 2, ptr [[TMP21]], align 4
+// CHECK-NEXT:    store i32 3, ptr [[TMP21]], align 4
 // CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK-NEXT:    store i32 3, ptr [[TMP22]], align 4
 // CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_has_device_addr_codegen_01.cpp b/clang/test/OpenMP/target_has_device_addr_codegen_01.cpp
index efd0a1ee88fe23..6bbabdf51f6e7d 100644
--- a/clang/test/OpenMP/target_has_device_addr_codegen_01.cpp
+++ b/clang/test/OpenMP/target_has_device_addr_codegen_01.cpp
@@ -102,7 +102,7 @@ int main() {
 // CHECK-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [6 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [6 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK-NEXT:    store i32 2, ptr [[TMP26]], align 4
+// CHECK-NEXT:    store i32 3, ptr [[TMP26]], align 4
 // CHECK-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK-NEXT:    store i32 6, ptr [[TMP27]], align 4
 // CHECK-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -211,7 +211,7 @@ int main() {
 // CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [5 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CHECK-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK-NEXT:    store i32 5, ptr [[TMP26]], align 4
 // CHECK-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_is_device_ptr_codegen.cpp b/clang/test/OpenMP/target_is_device_ptr_codegen.cpp
index 162f18529a489d..190032023aed39 100644
--- a/clang/test/OpenMP/target_is_device_ptr_codegen.cpp
+++ b/clang/test/OpenMP/target_is_device_ptr_codegen.cpp
@@ -1827,7 +1827,7 @@ void bar() {
 // CK10-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CK10-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CK10-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CK10-NEXT:    store i32 2, ptr [[TMP6]], align 4
+// CK10-NEXT:    store i32 3, ptr [[TMP6]], align 4
 // CK10-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CK10-NEXT:    store i32 1, ptr [[TMP7]], align 4
 // CK10-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1869,7 +1869,7 @@ void bar() {
 // CK10-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0
 // CK10-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS2]], i32 0, i32 0
 // CK10-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 0
-// CK10-NEXT:    store i32 2, ptr [[TMP27]], align 4
+// CK10-NEXT:    store i32 3, ptr [[TMP27]], align 4
 // CK10-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 1
 // CK10-NEXT:    store i32 1, ptr [[TMP28]], align 4
 // CK10-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 2
@@ -1911,7 +1911,7 @@ void bar() {
 // CK10-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS7]], i32 0, i32 0
 // CK10-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS8]], i32 0, i32 0
 // CK10-NEXT:    [[TMP48:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 0
-// CK10-NEXT:    store i32 2, ptr [[TMP48]], align 4
+// CK10-NEXT:    store i32 3, ptr [[TMP48]], align 4
 // CK10-NEXT:    [[TMP49:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 1
 // CK10-NEXT:    store i32 1, ptr [[TMP49]], align 4
 // CK10-NEXT:    [[TMP50:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 2
@@ -1956,7 +1956,7 @@ void bar() {
 // CK10-NEXT:    [[TMP69:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS13]], i32 0, i32 0
 // CK10-NEXT:    [[TMP70:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS14]], i32 0, i32 0
 // CK10-NEXT:    [[TMP71:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 0
-// CK10-NEXT:    store i32 2, ptr [[TMP71]], align 4
+// CK10-NEXT:    store i32 3, ptr [[TMP71]], align 4
 // CK10-NEXT:    [[TMP72:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 1
 // CK10-NEXT:    store i32 1, ptr [[TMP72]], align 4
 // CK10-NEXT:    [[TMP73:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 2
@@ -2001,7 +2001,7 @@ void bar() {
 // CK10-NEXT:    [[TMP92:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS20]], i32 0, i32 0
 // CK10-NEXT:    [[TMP93:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS21]], i32 0, i32 0
 // CK10-NEXT:    [[TMP94:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 0
-// CK10-NEXT:    store i32 2, ptr [[TMP94]], align 4
+// CK10-NEXT:    store i32 3, ptr [[TMP94]], align 4
 // CK10-NEXT:    [[TMP95:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 1
 // CK10-NEXT:    store i32 1, ptr [[TMP95]], align 4
 // CK10-NEXT:    [[TMP96:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 2
@@ -2046,7 +2046,7 @@ void bar() {
 // CK10-NEXT:    [[TMP115:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS27]], i32 0, i32 0
 // CK10-NEXT:    [[TMP116:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS28]], i32 0, i32 0
 // CK10-NEXT:    [[TMP117:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 0
-// CK10-NEXT:    store i32 2, ptr [[TMP117]], align 4
+// CK10-NEXT:    store i32 3, ptr [[TMP117]], align 4
 // CK10-NEXT:    [[TMP118:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 1
 // CK10-NEXT:    store i32 1, ptr [[TMP118]], align 4
 // CK10-NEXT:    [[TMP119:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 2
@@ -2101,7 +2101,7 @@ void bar() {
 // CK10-NEXT:    [[TMP144:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS35]], i32 0, i32 0
 // CK10-NEXT:    [[TMP145:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS36]], i32 0, i32 0
 // CK10-NEXT:    [[TMP146:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 0
-// CK10-NEXT:    store i32 2, ptr [[TMP146]], align 4
+// CK10-NEXT:    store i32 3, ptr [[TMP146]], align 4
 // CK10-NEXT:    [[TMP147:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 1
 // CK10-NEXT:    store i32 2, ptr [[TMP147]], align 4
 // CK10-NEXT:    [[TMP148:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 2
@@ -2298,7 +2298,7 @@ void bar() {
 // CK11-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CK11-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CK11-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CK11-NEXT:    store i32 2, ptr [[TMP6]], align 4
+// CK11-NEXT:    store i32 3, ptr [[TMP6]], align 4
 // CK11-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CK11-NEXT:    store i32 1, ptr [[TMP7]], align 4
 // CK11-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2340,7 +2340,7 @@ void bar() {
 // CK11-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0
 // CK11-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS2]], i32 0, i32 0
 // CK11-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 0
-// CK11-NEXT:    store i32 2, ptr [[TMP27]], align 4
+// CK11-NEXT:    store i32 3, ptr [[TMP27]], align 4
 // CK11-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 1
 // CK11-NEXT:    store i32 1, ptr [[TMP28]], align 4
 // CK11-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 2
@@ -2382,7 +2382,7 @@ void bar() {
 // CK11-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS7]], i32 0, i32 0
 // CK11-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS8]], i32 0, i32 0
 // CK11-NEXT:    [[TMP48:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 0
-// CK11-NEXT:    store i32 2, ptr [[TMP48]], align 4
+// CK11-NEXT:    store i32 3, ptr [[TMP48]], align 4
 // CK11-NEXT:    [[TMP49:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 1
 // CK11-NEXT:    store i32 1, ptr [[TMP49]], align 4
 // CK11-NEXT:    [[TMP50:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 2
@@ -2427,7 +2427,7 @@ void bar() {
 // CK11-NEXT:    [[TMP69:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS13]], i32 0, i32 0
 // CK11-NEXT:    [[TMP70:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS14]], i32 0, i32 0
 // CK11-NEXT:    [[TMP71:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 0
-// CK11-NEXT:    store i32 2, ptr [[TMP71]], align 4
+// CK11-NEXT:    store i32 3, ptr [[TMP71]], align 4
 // CK11-NEXT:    [[TMP72:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 1
 // CK11-NEXT:    store i32 1, ptr [[TMP72]], align 4
 // CK11-NEXT:    [[TMP73:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 2
@@ -2472,7 +2472,7 @@ void bar() {
 // CK11-NEXT:    [[TMP92:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS20]], i32 0, i32 0
 // CK11-NEXT:    [[TMP93:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS21]], i32 0, i32 0
 // CK11-NEXT:    [[TMP94:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 0
-// CK11-NEXT:    store i32 2, ptr [[TMP94]], align 4
+// CK11-NEXT:    store i32 3, ptr [[TMP94]], align 4
 // CK11-NEXT:    [[TMP95:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 1
 // CK11-NEXT:    store i32 1, ptr [[TMP95]], align 4
 // CK11-NEXT:    [[TMP96:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 2
@@ -2517,7 +2517,7 @@ void bar() {
 // CK11-NEXT:    [[TMP115:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS27]], i32 0, i32 0
 // CK11-NEXT:    [[TMP116:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS28]], i32 0, i32 0
 // CK11-NEXT:    [[TMP117:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 0
-// CK11-NEXT:    store i32 2, ptr [[TMP117]], align 4
+// CK11-NEXT:    store i32 3, ptr [[TMP117]], align 4
 // CK11-NEXT:    [[TMP118:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 1
 // CK11-NEXT:    store i32 1, ptr [[TMP118]], align 4
 // CK11-NEXT:    [[TMP119:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 2
@@ -2572,7 +2572,7 @@ void bar() {
 // CK11-NEXT:    [[TMP144:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS35]], i32 0, i32 0
 // CK11-NEXT:    [[TMP145:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS36]], i32 0, i32 0
 // CK11-NEXT:    [[TMP146:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 0
-// CK11-NEXT:    store i32 2, ptr [[TMP146]], align 4
+// CK11-NEXT:    store i32 3, ptr [[TMP146]], align 4
 // CK11-NEXT:    [[TMP147:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 1
 // CK11-NEXT:    store i32 2, ptr [[TMP147]], align 4
 // CK11-NEXT:    [[TMP148:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 2
@@ -2769,7 +2769,7 @@ void bar() {
 // CK12-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CK12-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CK12-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CK12-NEXT:    store i32 2, ptr [[TMP6]], align 4
+// CK12-NEXT:    store i32 3, ptr [[TMP6]], align 4
 // CK12-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CK12-NEXT:    store i32 1, ptr [[TMP7]], align 4
 // CK12-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2811,7 +2811,7 @@ void bar() {
 // CK12-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0
 // CK12-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS2]], i32 0, i32 0
 // CK12-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 0
-// CK12-NEXT:    store i32 2, ptr [[TMP27]], align 4
+// CK12-NEXT:    store i32 3, ptr [[TMP27]], align 4
 // CK12-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 1
 // CK12-NEXT:    store i32 1, ptr [[TMP28]], align 4
 // CK12-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 2
@@ -2853,7 +2853,7 @@ void bar() {
 // CK12-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS7]], i32 0, i32 0
 // CK12-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS8]], i32 0, i32 0
 // CK12-NEXT:    [[TMP48:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 0
-// CK12-NEXT:    store i32 2, ptr [[TMP48]], align 4
+// CK12-NEXT:    store i32 3, ptr [[TMP48]], align 4
 // CK12-NEXT:    [[TMP49:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 1
 // CK12-NEXT:    store i32 1, ptr [[TMP49]], align 4
 // CK12-NEXT:    [[TMP50:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 2
@@ -2898,7 +2898,7 @@ void bar() {
 // CK12-NEXT:    [[TMP69:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS13]], i32 0, i32 0
 // CK12-NEXT:    [[TMP70:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS14]], i32 0, i32 0
 // CK12-NEXT:    [[TMP71:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 0
-// CK12-NEXT:    store i32 2, ptr [[TMP71]], align 4
+// CK12-NEXT:    store i32 3, ptr [[TMP71]], align 4
 // CK12-NEXT:    [[TMP72:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 1
 // CK12-NEXT:    store i32 1, ptr [[TMP72]], align 4
 // CK12-NEXT:    [[TMP73:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 2
@@ -2943,7 +2943,7 @@ void bar() {
 // CK12-NEXT:    [[TMP92:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS20]], i32 0, i32 0
 // CK12-NEXT:    [[TMP93:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS21]], i32 0, i32 0
 // CK12-NEXT:    [[TMP94:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 0
-// CK12-NEXT:    store i32 2, ptr [[TMP94]], align 4
+// CK12-NEXT:    store i32 3, ptr [[TMP94]], align 4
 // CK12-NEXT:    [[TMP95:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 1
 // CK12-NEXT:    store i32 1, ptr [[TMP95]], align 4
 // CK12-NEXT:    [[TMP96:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 2
@@ -2988,7 +2988,7 @@ void bar() {
 // CK12-NEXT:    [[TMP115:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS27]], i32 0, i32 0
 // CK12-NEXT:    [[TMP116:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS28]], i32 0, i32 0
 // CK12-NEXT:    [[TMP117:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 0
-// CK12-NEXT:    store i32 2, ptr [[TMP117]], align 4
+// CK12-NEXT:    store i32 3, ptr [[TMP117]], align 4
 // CK12-NEXT:    [[TMP118:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 1
 // CK12-NEXT:    store i32 1, ptr [[TMP118]], align 4
 // CK12-NEXT:    [[TMP119:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 2
@@ -3043,7 +3043,7 @@ void bar() {
 // CK12-NEXT:    [[TMP144:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS35]], i32 0, i32 0
 // CK12-NEXT:    [[TMP145:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS36]], i32 0, i32 0
 // CK12-NEXT:    [[TMP146:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 0
-// CK12-NEXT:    store i32 2, ptr [[TMP146]], align 4
+// CK12-NEXT:    store i32 3, ptr [[TMP146]], align 4
 // CK12-NEXT:    [[TMP147:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 1
 // CK12-NEXT:    store i32 2, ptr [[TMP147]], align 4
 // CK12-NEXT:    [[TMP148:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 2
@@ -3240,7 +3240,7 @@ void bar() {
 // CK13-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CK13-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CK13-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CK13-NEXT:    store i32 2, ptr [[TMP6]], align 4
+// CK13-NEXT:    store i32 3, ptr [[TMP6]], align 4
 // CK13-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CK13-NEXT:    store i32 1, ptr [[TMP7]], align 4
 // CK13-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -3282,7 +3282,7 @@ void bar() {
 // CK13-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0
 // CK13-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS2]], i32 0, i32 0
 // CK13-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 0
-// CK13-NEXT:    store i32 2, ptr [[TMP27]], align 4
+// CK13-NEXT:    store i32 3, ptr [[TMP27]], align 4
 // CK13-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 1
 // CK13-NEXT:    store i32 1, ptr [[TMP28]], align 4
 // CK13-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 2
@@ -3324,7 +3324,7 @@ void bar() {
 // CK13-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS7]], i32 0, i32 0
 // CK13-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS8]], i32 0, i32 0
 // CK13-NEXT:    [[TMP48:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 0
-// CK13-NEXT:    store i32 2, ptr [[TMP48]], align 4
+// CK13-NEXT:    store i32 3, ptr [[TMP48]], align 4
 // CK13-NEXT:    [[TMP49:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 1
 // CK13-NEXT:    store i32 1, ptr [[TMP49]], align 4
 // CK13-NEXT:    [[TMP50:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS10]], i32 0, i32 2
@@ -3369,7 +3369,7 @@ void bar() {
 // CK13-NEXT:    [[TMP69:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS13]], i32 0, i32 0
 // CK13-NEXT:    [[TMP70:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS14]], i32 0, i32 0
 // CK13-NEXT:    [[TMP71:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 0
-// CK13-NEXT:    store i32 2, ptr [[TMP71]], align 4
+// CK13-NEXT:    store i32 3, ptr [[TMP71]], align 4
 // CK13-NEXT:    [[TMP72:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 1
 // CK13-NEXT:    store i32 1, ptr [[TMP72]], align 4
 // CK13-NEXT:    [[TMP73:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 2
@@ -3414,7 +3414,7 @@ void bar() {
 // CK13-NEXT:    [[TMP92:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS20]], i32 0, i32 0
 // CK13-NEXT:    [[TMP93:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS21]], i32 0, i32 0
 // CK13-NEXT:    [[TMP94:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 0
-// CK13-NEXT:    store i32 2, ptr [[TMP94]], align 4
+// CK13-NEXT:    store i32 3, ptr [[TMP94]], align 4
 // CK13-NEXT:    [[TMP95:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 1
 // CK13-NEXT:    store i32 1, ptr [[TMP95]], align 4
 // CK13-NEXT:    [[TMP96:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 2
@@ -3459,7 +3459,7 @@ void bar() {
 // CK13-NEXT:    [[TMP115:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS27]], i32 0, i32 0
 // CK13-NEXT:    [[TMP116:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS28]], i32 0, i32 0
 // CK13-NEXT:    [[TMP117:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 0
-// CK13-NEXT:    store i32 2, ptr [[TMP117]], align 4
+// CK13-NEXT:    store i32 3, ptr [[TMP117]], align 4
 // CK13-NEXT:    [[TMP118:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 1
 // CK13-NEXT:    store i32 1, ptr [[TMP118]], align 4
 // CK13-NEXT:    [[TMP119:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 2
@@ -3514,7 +3514,7 @@ void bar() {
 // CK13-NEXT:    [[TMP144:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS35]], i32 0, i32 0
 // CK13-NEXT:    [[TMP145:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS36]], i32 0, i32 0
 // CK13-NEXT:    [[TMP146:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 0
-// CK13-NEXT:    store i32 2, ptr [[TMP146]], align 4
+// CK13-NEXT:    store i32 3, ptr [[TMP146]], align 4
 // CK13-NEXT:    [[TMP147:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 1
 // CK13-NEXT:    store i32 2, ptr [[TMP147]], align 4
 // CK13-NEXT:    [[TMP148:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 2
@@ -4003,7 +4003,7 @@ void bar() {
 // CK20-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CK20-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CK20-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CK20-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CK20-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CK20-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CK20-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CK20-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -4061,7 +4061,7 @@ void bar() {
 // CK20-NEXT:    [[TMP34:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS3]], i32 0, i32 0
 // CK20-NEXT:    [[TMP35:%.*]] = getelementptr inbounds [2 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CK20-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0
-// CK20-NEXT:    store i32 2, ptr [[TMP36]], align 4
+// CK20-NEXT:    store i32 3, ptr [[TMP36]], align 4
 // CK20-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1
 // CK20-NEXT:    store i32 2, ptr [[TMP37]], align 4
 // CK20-NEXT:    [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2
@@ -4126,7 +4126,7 @@ void bar() {
 // CK20-NEXT:    [[TMP68:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS11]], i32 0, i32 0
 // CK20-NEXT:    [[TMP69:%.*]] = getelementptr inbounds [3 x i64], ptr [[DOTOFFLOAD_SIZES13]], i32 0, i32 0
 // CK20-NEXT:    [[TMP70:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 0
-// CK20-NEXT:    store i32 2, ptr [[TMP70]], align 4
+// CK20-NEXT:    store i32 3, ptr [[TMP70]], align 4
 // CK20-NEXT:    [[TMP71:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 1
 // CK20-NEXT:    store i32 3, ptr [[TMP71]], align 4
 // CK20-NEXT:    [[TMP72:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 2
@@ -4283,7 +4283,7 @@ void bar() {
 // CK21-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CK21-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CK21-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CK21-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CK21-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CK21-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CK21-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CK21-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -4341,7 +4341,7 @@ void bar() {
 // CK21-NEXT:    [[TMP34:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS3]], i32 0, i32 0
 // CK21-NEXT:    [[TMP35:%.*]] = getelementptr inbounds [2 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CK21-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0
-// CK21-NEXT:    store i32 2, ptr [[TMP36]], align 4
+// CK21-NEXT:    store i32 3, ptr [[TMP36]], align 4
 // CK21-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1
 // CK21-NEXT:    store i32 2, ptr [[TMP37]], align 4
 // CK21-NEXT:    [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2
@@ -4406,7 +4406,7 @@ void bar() {
 // CK21-NEXT:    [[TMP68:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS11]], i32 0, i32 0
 // CK21-NEXT:    [[TMP69:%.*]] = getelementptr inbounds [3 x i64], ptr [[DOTOFFLOAD_SIZES13]], i32 0, i32 0
 // CK21-NEXT:    [[TMP70:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 0
-// CK21-NEXT:    store i32 2, ptr [[TMP70]], align 4
+// CK21-NEXT:    store i32 3, ptr [[TMP70]], align 4
 // CK21-NEXT:    [[TMP71:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 1
 // CK21-NEXT:    store i32 3, ptr [[TMP71]], align 4
 // CK21-NEXT:    [[TMP72:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 2
@@ -4563,7 +4563,7 @@ void bar() {
 // CK22-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CK22-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CK22-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CK22-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CK22-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CK22-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CK22-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CK22-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -4621,7 +4621,7 @@ void bar() {
 // CK22-NEXT:    [[TMP34:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS3]], i32 0, i32 0
 // CK22-NEXT:    [[TMP35:%.*]] = getelementptr inbounds [2 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CK22-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0
-// CK22-NEXT:    store i32 2, ptr [[TMP36]], align 4
+// CK22-NEXT:    store i32 3, ptr [[TMP36]], align 4
 // CK22-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1
 // CK22-NEXT:    store i32 2, ptr [[TMP37]], align 4
 // CK22-NEXT:    [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2
@@ -4686,7 +4686,7 @@ void bar() {
 // CK22-NEXT:    [[TMP68:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS11]], i32 0, i32 0
 // CK22-NEXT:    [[TMP69:%.*]] = getelementptr inbounds [3 x i64], ptr [[DOTOFFLOAD_SIZES13]], i32 0, i32 0
 // CK22-NEXT:    [[TMP70:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 0
-// CK22-NEXT:    store i32 2, ptr [[TMP70]], align 4
+// CK22-NEXT:    store i32 3, ptr [[TMP70]], align 4
 // CK22-NEXT:    [[TMP71:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 1
 // CK22-NEXT:    store i32 3, ptr [[TMP71]], align 4
 // CK22-NEXT:    [[TMP72:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 2
@@ -4843,7 +4843,7 @@ void bar() {
 // CK23-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CK23-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CK23-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CK23-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CK23-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CK23-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CK23-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CK23-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -4901,7 +4901,7 @@ void bar() {
 // CK23-NEXT:    [[TMP34:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS3]], i32 0, i32 0
 // CK23-NEXT:    [[TMP35:%.*]] = getelementptr inbounds [2 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CK23-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0
-// CK23-NEXT:    store i32 2, ptr [[TMP36]], align 4
+// CK23-NEXT:    store i32 3, ptr [[TMP36]], align 4
 // CK23-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1
 // CK23-NEXT:    store i32 2, ptr [[TMP37]], align 4
 // CK23-NEXT:    [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2
@@ -4966,7 +4966,7 @@ void bar() {
 // CK23-NEXT:    [[TMP68:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS11]], i32 0, i32 0
 // CK23-NEXT:    [[TMP69:%.*]] = getelementptr inbounds [3 x i64], ptr [[DOTOFFLOAD_SIZES13]], i32 0, i32 0
 // CK23-NEXT:    [[TMP70:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 0
-// CK23-NEXT:    store i32 2, ptr [[TMP70]], align 4
+// CK23-NEXT:    store i32 3, ptr [[TMP70]], align 4
 // CK23-NEXT:    [[TMP71:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 1
 // CK23-NEXT:    store i32 3, ptr [[TMP71]], align 4
 // CK23-NEXT:    [[TMP72:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 2
@@ -5376,7 +5376,7 @@ void bar() {
 // CK30-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CK30-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CK30-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CK30-NEXT:    store i32 2, ptr [[TMP6]], align 4
+// CK30-NEXT:    store i32 3, ptr [[TMP6]], align 4
 // CK30-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CK30-NEXT:    store i32 1, ptr [[TMP7]], align 4
 // CK30-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -5439,7 +5439,7 @@ void bar() {
 // CK31-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CK31-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CK31-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CK31-NEXT:    store i32 2, ptr [[TMP6]], align 4
+// CK31-NEXT:    store i32 3, ptr [[TMP6]], align 4
 // CK31-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CK31-NEXT:    store i32 1, ptr [[TMP7]], align 4
 // CK31-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -5502,7 +5502,7 @@ void bar() {
 // CK32-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CK32-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CK32-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CK32-NEXT:    store i32 2, ptr [[TMP6]], align 4
+// CK32-NEXT:    store i32 3, ptr [[TMP6]], align 4
 // CK32-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CK32-NEXT:    store i32 1, ptr [[TMP7]], align 4
 // CK32-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -5565,7 +5565,7 @@ void bar() {
 // CK33-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CK33-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CK33-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CK33-NEXT:    store i32 2, ptr [[TMP6]], align 4
+// CK33-NEXT:    store i32 3, ptr [[TMP6]], align 4
 // CK33-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CK33-NEXT:    store i32 1, ptr [[TMP7]], align 4
 // CK33-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_map_codegen_03.cpp b/clang/test/OpenMP/target_map_codegen_03.cpp
index cd28e7bf848e06..959018647906f8 100644
--- a/clang/test/OpenMP/target_map_codegen_03.cpp
+++ b/clang/test/OpenMP/target_map_codegen_03.cpp
@@ -96,7 +96,7 @@ void implicit_maps_nested_integer (int a){
 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP8]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP8]], align 4
 // CHECK1-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP9]], align 4
 // CHECK1-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -195,7 +195,7 @@ void implicit_maps_nested_integer (int a){
 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP8]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP8]], align 4
 // CHECK3-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP9]], align 4
 // CHECK3-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_map_codegen_hold.cpp b/clang/test/OpenMP/target_map_codegen_hold.cpp
index 81306ccb4cf554..c6ee673d9598e1 100644
--- a/clang/test/OpenMP/target_map_codegen_hold.cpp
+++ b/clang/test/OpenMP/target_map_codegen_hold.cpp
@@ -245,7 +245,7 @@ void ST::test_present_members() {
 // CHECK-USE-PPC64LE-NEXT:    [[TMP35:%.*]] = getelementptr inbounds [7 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK-USE-PPC64LE-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [7 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CHECK-USE-PPC64LE-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK-USE-PPC64LE-NEXT:    store i32 2, ptr [[TMP37]], align 4
+// CHECK-USE-PPC64LE-NEXT:    store i32 3, ptr [[TMP37]], align 4
 // CHECK-USE-PPC64LE-NEXT:    [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK-USE-PPC64LE-NEXT:    store i32 7, ptr [[TMP38]], align 4
 // CHECK-USE-PPC64LE-NEXT:    [[TMP39:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -286,7 +286,7 @@ void ST::test_present_members() {
 // CHECK-USE-PPC64LE-NEXT:    [[TMP55:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS3]], i32 0, i32 0
 // CHECK-USE-PPC64LE-NEXT:    [[TMP56:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS4]], i32 0, i32 0
 // CHECK-USE-PPC64LE-NEXT:    [[TMP57:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 0
-// CHECK-USE-PPC64LE-NEXT:    store i32 2, ptr [[TMP57]], align 4
+// CHECK-USE-PPC64LE-NEXT:    store i32 3, ptr [[TMP57]], align 4
 // CHECK-USE-PPC64LE-NEXT:    [[TMP58:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 1
 // CHECK-USE-PPC64LE-NEXT:    store i32 1, ptr [[TMP58]], align 4
 // CHECK-USE-PPC64LE-NEXT:    [[TMP59:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 2
@@ -410,7 +410,7 @@ void ST::test_present_members() {
 // CHECK-USE-PPC64LE-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK-USE-PPC64LE-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [3 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CHECK-USE-PPC64LE-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK-USE-PPC64LE-NEXT:    store i32 2, ptr [[TMP18]], align 4
+// CHECK-USE-PPC64LE-NEXT:    store i32 3, ptr [[TMP18]], align 4
 // CHECK-USE-PPC64LE-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK-USE-PPC64LE-NEXT:    store i32 3, ptr [[TMP19]], align 4
 // CHECK-USE-PPC64LE-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -546,7 +546,7 @@ void ST::test_present_members() {
 // CHECK-USE-I386-NEXT:    [[TMP35:%.*]] = getelementptr inbounds [7 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK-USE-I386-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [7 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CHECK-USE-I386-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK-USE-I386-NEXT:    store i32 2, ptr [[TMP37]], align 4
+// CHECK-USE-I386-NEXT:    store i32 3, ptr [[TMP37]], align 4
 // CHECK-USE-I386-NEXT:    [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK-USE-I386-NEXT:    store i32 7, ptr [[TMP38]], align 4
 // CHECK-USE-I386-NEXT:    [[TMP39:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -587,7 +587,7 @@ void ST::test_present_members() {
 // CHECK-USE-I386-NEXT:    [[TMP55:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS3]], i32 0, i32 0
 // CHECK-USE-I386-NEXT:    [[TMP56:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS4]], i32 0, i32 0
 // CHECK-USE-I386-NEXT:    [[TMP57:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 0
-// CHECK-USE-I386-NEXT:    store i32 2, ptr [[TMP57]], align 4
+// CHECK-USE-I386-NEXT:    store i32 3, ptr [[TMP57]], align 4
 // CHECK-USE-I386-NEXT:    [[TMP58:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 1
 // CHECK-USE-I386-NEXT:    store i32 1, ptr [[TMP58]], align 4
 // CHECK-USE-I386-NEXT:    [[TMP59:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 2
@@ -711,7 +711,7 @@ void ST::test_present_members() {
 // CHECK-USE-I386-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK-USE-I386-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [3 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CHECK-USE-I386-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK-USE-I386-NEXT:    store i32 2, ptr [[TMP18]], align 4
+// CHECK-USE-I386-NEXT:    store i32 3, ptr [[TMP18]], align 4
 // CHECK-USE-I386-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK-USE-I386-NEXT:    store i32 3, ptr [[TMP19]], align 4
 // CHECK-USE-I386-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -847,7 +847,7 @@ void ST::test_present_members() {
 // CHECK-NOUSE-PPC64LE-NEXT:    [[TMP35:%.*]] = getelementptr inbounds [7 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK-NOUSE-PPC64LE-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [7 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CHECK-NOUSE-PPC64LE-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK-NOUSE-PPC64LE-NEXT:    store i32 2, ptr [[TMP37]], align 4
+// CHECK-NOUSE-PPC64LE-NEXT:    store i32 3, ptr [[TMP37]], align 4
 // CHECK-NOUSE-PPC64LE-NEXT:    [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK-NOUSE-PPC64LE-NEXT:    store i32 7, ptr [[TMP38]], align 4
 // CHECK-NOUSE-PPC64LE-NEXT:    [[TMP39:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -888,7 +888,7 @@ void ST::test_present_members() {
 // CHECK-NOUSE-PPC64LE-NEXT:    [[TMP55:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS3]], i32 0, i32 0
 // CHECK-NOUSE-PPC64LE-NEXT:    [[TMP56:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS4]], i32 0, i32 0
 // CHECK-NOUSE-PPC64LE-NEXT:    [[TMP57:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 0
-// CHECK-NOUSE-PPC64LE-NEXT:    store i32 2, ptr [[TMP57]], align 4
+// CHECK-NOUSE-PPC64LE-NEXT:    store i32 3, ptr [[TMP57]], align 4
 // CHECK-NOUSE-PPC64LE-NEXT:    [[TMP58:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 1
 // CHECK-NOUSE-PPC64LE-NEXT:    store i32 1, ptr [[TMP58]], align 4
 // CHECK-NOUSE-PPC64LE-NEXT:    [[TMP59:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 2
@@ -978,7 +978,7 @@ void ST::test_present_members() {
 // CHECK-NOUSE-PPC64LE-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK-NOUSE-PPC64LE-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [3 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CHECK-NOUSE-PPC64LE-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK-NOUSE-PPC64LE-NEXT:    store i32 2, ptr [[TMP18]], align 4
+// CHECK-NOUSE-PPC64LE-NEXT:    store i32 3, ptr [[TMP18]], align 4
 // CHECK-NOUSE-PPC64LE-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK-NOUSE-PPC64LE-NEXT:    store i32 3, ptr [[TMP19]], align 4
 // CHECK-NOUSE-PPC64LE-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1103,7 +1103,7 @@ void ST::test_present_members() {
 // CHECK-NOUSE-I386-NEXT:    [[TMP35:%.*]] = getelementptr inbounds [7 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK-NOUSE-I386-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [7 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CHECK-NOUSE-I386-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK-NOUSE-I386-NEXT:    store i32 2, ptr [[TMP37]], align 4
+// CHECK-NOUSE-I386-NEXT:    store i32 3, ptr [[TMP37]], align 4
 // CHECK-NOUSE-I386-NEXT:    [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK-NOUSE-I386-NEXT:    store i32 7, ptr [[TMP38]], align 4
 // CHECK-NOUSE-I386-NEXT:    [[TMP39:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1144,7 +1144,7 @@ void ST::test_present_members() {
 // CHECK-NOUSE-I386-NEXT:    [[TMP55:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS3]], i32 0, i32 0
 // CHECK-NOUSE-I386-NEXT:    [[TMP56:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS4]], i32 0, i32 0
 // CHECK-NOUSE-I386-NEXT:    [[TMP57:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 0
-// CHECK-NOUSE-I386-NEXT:    store i32 2, ptr [[TMP57]], align 4
+// CHECK-NOUSE-I386-NEXT:    store i32 3, ptr [[TMP57]], align 4
 // CHECK-NOUSE-I386-NEXT:    [[TMP58:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 1
 // CHECK-NOUSE-I386-NEXT:    store i32 1, ptr [[TMP58]], align 4
 // CHECK-NOUSE-I386-NEXT:    [[TMP59:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 2
@@ -1234,7 +1234,7 @@ void ST::test_present_members() {
 // CHECK-NOUSE-I386-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK-NOUSE-I386-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [3 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CHECK-NOUSE-I386-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK-NOUSE-I386-NEXT:    store i32 2, ptr [[TMP18]], align 4
+// CHECK-NOUSE-I386-NEXT:    store i32 3, ptr [[TMP18]], align 4
 // CHECK-NOUSE-I386-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK-NOUSE-I386-NEXT:    store i32 3, ptr [[TMP19]], align 4
 // CHECK-NOUSE-I386-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_map_deref_array_codegen.cpp b/clang/test/OpenMP/target_map_deref_array_codegen.cpp
index da4176f20109c1..b854333002299a 100644
--- a/clang/test/OpenMP/target_map_deref_array_codegen.cpp
+++ b/clang/test/OpenMP/target_map_deref_array_codegen.cpp
@@ -91,7 +91,7 @@ void foo(int **t1d)
 // CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK-NEXT:    store i32 2, ptr [[TMP19]], align 4
+// CHECK-NEXT:    store i32 3, ptr [[TMP19]], align 4
 // CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK-NEXT:    store i32 2, ptr [[TMP20]], align 4
 // CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -143,7 +143,7 @@ void foo(int **t1d)
 // CHECK-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS2]], i32 0, i32 0
 // CHECK-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS3]], i32 0, i32 0
 // CHECK-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0
-// CHECK-NEXT:    store i32 2, ptr [[TMP47]], align 4
+// CHECK-NEXT:    store i32 3, ptr [[TMP47]], align 4
 // CHECK-NEXT:    [[TMP48:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1
 // CHECK-NEXT:    store i32 2, ptr [[TMP48]], align 4
 // CHECK-NEXT:    [[TMP49:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2
@@ -224,7 +224,7 @@ void foo(int **t1d)
 // CHECK-NEXT:    [[TMP86:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS12]], i32 0, i32 0
 // CHECK-NEXT:    [[TMP87:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS13]], i32 0, i32 0
 // CHECK-NEXT:    [[TMP88:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK-NEXT:    store i32 2, ptr [[TMP88]], align 4
+// CHECK-NEXT:    store i32 3, ptr [[TMP88]], align 4
 // CHECK-NEXT:    [[TMP89:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK-NEXT:    store i32 4, ptr [[TMP89]], align 4
 // CHECK-NEXT:    [[TMP90:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_map_member_expr_codegen.cpp b/clang/test/OpenMP/target_map_member_expr_codegen.cpp
index 84844cff09b934..9979f381a70676 100644
--- a/clang/test/OpenMP/target_map_member_expr_codegen.cpp
+++ b/clang/test/OpenMP/target_map_member_expr_codegen.cpp
@@ -138,7 +138,7 @@ void foo() {
 // CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK-NEXT:    store i32 2, ptr [[TMP14]], align 4
+// CHECK-NEXT:    store i32 3, ptr [[TMP14]], align 4
 // CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK-NEXT:    store i32 4, ptr [[TMP15]], align 4
 // CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -295,7 +295,7 @@ void foo() {
 // CHECK-NEXT:    [[TMP53:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS9]], i32 0, i32 0
 // CHECK-NEXT:    [[TMP54:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS10]], i32 0, i32 0
 // CHECK-NEXT:    [[TMP55:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK-NEXT:    store i32 2, ptr [[TMP55]], align 4
+// CHECK-NEXT:    store i32 3, ptr [[TMP55]], align 4
 // CHECK-NEXT:    [[TMP56:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK-NEXT:    store i32 2, ptr [[TMP56]], align 4
 // CHECK-NEXT:    [[TMP57:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -379,7 +379,7 @@ void foo() {
 // CHECK-NEXT:    [[TMP100:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS19]], i32 0, i32 0
 // CHECK-NEXT:    [[TMP101:%.*]] = getelementptr inbounds [3 x i64], ptr [[DOTOFFLOAD_SIZES21]], i32 0, i32 0
 // CHECK-NEXT:    [[TMP102:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS22]], i32 0, i32 0
-// CHECK-NEXT:    store i32 2, ptr [[TMP102]], align 4
+// CHECK-NEXT:    store i32 3, ptr [[TMP102]], align 4
 // CHECK-NEXT:    [[TMP103:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS22]], i32 0, i32 1
 // CHECK-NEXT:    store i32 3, ptr [[TMP103]], align 4
 // CHECK-NEXT:    [[TMP104:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS22]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_offload_mandatory_codegen.cpp b/clang/test/OpenMP/target_offload_mandatory_codegen.cpp
index 04360f1ea03bd9..3bf15277aa1c39 100644
--- a/clang/test/OpenMP/target_offload_mandatory_codegen.cpp
+++ b/clang/test/OpenMP/target_offload_mandatory_codegen.cpp
@@ -33,7 +33,7 @@ void host_dev(int device) {
 // MANDATORY-NEXT:  entry:
 // MANDATORY-NEXT:    [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
 // MANDATORY-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// MANDATORY-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// MANDATORY-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // MANDATORY-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // MANDATORY-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // MANDATORY-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -79,7 +79,7 @@ void host_dev(int device) {
 // MANDATORY-NEXT:    br i1 [[TOBOOL]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]]
 // MANDATORY:       omp_if.then:
 // MANDATORY-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// MANDATORY-NEXT:    store i32 2, ptr [[TMP1]], align 4
+// MANDATORY-NEXT:    store i32 3, ptr [[TMP1]], align 4
 // MANDATORY-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // MANDATORY-NEXT:    store i32 0, ptr [[TMP2]], align 4
 // MANDATORY-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -129,7 +129,7 @@ void host_dev(int device) {
 // MANDATORY-NEXT:    [[TMP1:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
 // MANDATORY-NEXT:    [[TMP2:%.*]] = sext i32 [[TMP1]] to i64
 // MANDATORY-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// MANDATORY-NEXT:    store i32 2, ptr [[TMP3]], align 4
+// MANDATORY-NEXT:    store i32 3, ptr [[TMP3]], align 4
 // MANDATORY-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // MANDATORY-NEXT:    store i32 0, ptr [[TMP4]], align 4
 // MANDATORY-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_ompx_dyn_cgroup_mem_codegen.cpp b/clang/test/OpenMP/target_ompx_dyn_cgroup_mem_codegen.cpp
index e8b074a9d5f8be..10c4ac38e6a89b 100644
--- a/clang/test/OpenMP/target_ompx_dyn_cgroup_mem_codegen.cpp
+++ b/clang/test/OpenMP/target_ompx_dyn_cgroup_mem_codegen.cpp
@@ -256,7 +256,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP17:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
 // CHECK1-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP18]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP18]], align 4
 // CHECK1-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 3, ptr [[TMP19]], align 4
 // CHECK1-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -298,7 +298,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS3]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS4]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP38]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP38]], align 4
 // CHECK1-NEXT:    [[TMP39:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP39]], align 4
 // CHECK1-NEXT:    [[TMP40:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 2
@@ -408,7 +408,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP25:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
 // CHECK1-NEXT:    [[TMP26:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP20]], 0
 // CHECK1-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP27]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP27]], align 4
 // CHECK1-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 3, ptr [[TMP28]], align 4
 // CHECK1-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -493,7 +493,7 @@ int bar(int n){
 // CHECK1-NEXT:    store i32 [[N]], ptr [[N_ADDR]], align 4
 // CHECK1-NEXT:    store i32 0, ptr [[A]], align 4
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -561,7 +561,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP34:%.*]] = sext i16 [[TMP33]] to i32
 // CHECK1-NEXT:    [[TMP35:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP34]], 0
 // CHECK1-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP36]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP36]], align 4
 // CHECK1-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 3, ptr [[TMP37]], align 4
 // CHECK1-NEXT:    [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 2
@@ -958,7 +958,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP13:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR1_I]], align 8, !noalias [[META27]]
 // CHECK1-NEXT:    [[TMP14:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR2_I]], align 8, !noalias [[META27]]
 // CHECK1-NEXT:    [[TMP15:%.*]] = load i32, ptr [[TMP9]], align 4
-// CHECK1-NEXT:    store i32 2, ptr [[KERNEL_ARGS_I]], align 4, !noalias [[META27]]
+// CHECK1-NEXT:    store i32 3, ptr [[KERNEL_ARGS_I]], align 4, !noalias [[META27]]
 // CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP16]], align 4, !noalias [[META27]]
 // CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 2
@@ -1138,7 +1138,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP17:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
 // CHECK3-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP18]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP18]], align 4
 // CHECK3-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 3, ptr [[TMP19]], align 4
 // CHECK3-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1180,7 +1180,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS3]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS4]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP38]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP38]], align 4
 // CHECK3-NEXT:    [[TMP39:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP39]], align 4
 // CHECK3-NEXT:    [[TMP40:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 2
@@ -1290,7 +1290,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP25:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
 // CHECK3-NEXT:    [[TMP26:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP20]], 0
 // CHECK3-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP27]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP27]], align 4
 // CHECK3-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 3, ptr [[TMP28]], align 4
 // CHECK3-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1375,7 +1375,7 @@ int bar(int n){
 // CHECK3-NEXT:    store i32 [[N]], ptr [[N_ADDR]], align 4
 // CHECK3-NEXT:    store i32 0, ptr [[A]], align 4
 // CHECK3-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK3-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK3-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1443,7 +1443,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP34:%.*]] = sext i16 [[TMP33]] to i32
 // CHECK3-NEXT:    [[TMP35:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP34]], 0
 // CHECK3-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP36]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP36]], align 4
 // CHECK3-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 3, ptr [[TMP37]], align 4
 // CHECK3-NEXT:    [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 2
@@ -1836,7 +1836,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP13:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR1_I]], align 4, !noalias [[META28]]
 // CHECK3-NEXT:    [[TMP14:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR2_I]], align 4, !noalias [[META28]]
 // CHECK3-NEXT:    [[TMP15:%.*]] = load i32, ptr [[TMP9]], align 4
-// CHECK3-NEXT:    store i32 2, ptr [[KERNEL_ARGS_I]], align 4, !noalias [[META28]]
+// CHECK3-NEXT:    store i32 3, ptr [[KERNEL_ARGS_I]], align 4, !noalias [[META28]]
 // CHECK3-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP16]], align 4, !noalias [[META28]]
 // CHECK3-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_parallel_codegen.cpp b/clang/test/OpenMP/target_parallel_codegen.cpp
index 84b7f1ae4ec045..ad5a9d53bcf652 100644
--- a/clang/test/OpenMP/target_parallel_codegen.cpp
+++ b/clang/test/OpenMP/target_parallel_codegen.cpp
@@ -356,7 +356,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP19]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP19]], align 4
 // CHECK1-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP20]], align 4
 // CHECK1-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -413,7 +413,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS4]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS5]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP47]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP47]], align 4
 // CHECK1-NEXT:    [[TMP48:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 2, ptr [[TMP48]], align 4
 // CHECK1-NEXT:    [[TMP49:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 2
@@ -523,7 +523,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP98:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_PTRS14]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP99:%.*]] = getelementptr inbounds [9 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP100:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP100]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP100]], align 4
 // CHECK1-NEXT:    [[TMP101:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 9, ptr [[TMP101]], align 4
 // CHECK1-NEXT:    [[TMP102:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 2
@@ -614,7 +614,7 @@ int bar(int n){
 // CHECK1-NEXT:    store ptr [[TMP3]], ptr [[DOTTASK_T__ADDR_I]], align 8, !noalias [[META21]]
 // CHECK1-NEXT:    store ptr [[TMP7]], ptr [[__CONTEXT_ADDR_I]], align 8, !noalias [[META21]]
 // CHECK1-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[__CONTEXT_ADDR_I]], align 8, !noalias [[META21]]
-// CHECK1-NEXT:    store i32 2, ptr [[KERNEL_ARGS_I]], align 4, !noalias [[META21]]
+// CHECK1-NEXT:    store i32 3, ptr [[KERNEL_ARGS_I]], align 4, !noalias [[META21]]
 // CHECK1-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP9]], align 4, !noalias [[META21]]
 // CHECK1-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 2
@@ -972,7 +972,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [5 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP29]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP29]], align 4
 // CHECK1-NEXT:    [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 5, ptr [[TMP30]], align 4
 // CHECK1-NEXT:    [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1080,7 +1080,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP21]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP21]], align 4
 // CHECK1-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 4, ptr [[TMP22]], align 4
 // CHECK1-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1168,7 +1168,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP16]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP16]], align 4
 // CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 3, ptr [[TMP17]], align 4
 // CHECK1-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1450,7 +1450,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP17]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP17]], align 4
 // CHECK3-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP18]], align 4
 // CHECK3-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1507,7 +1507,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS4]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS5]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP45]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP45]], align 4
 // CHECK3-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 2, ptr [[TMP46]], align 4
 // CHECK3-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 2
@@ -1619,7 +1619,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP98:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_PTRS14]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP99:%.*]] = getelementptr inbounds [9 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP100:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP100]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP100]], align 4
 // CHECK3-NEXT:    [[TMP101:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 9, ptr [[TMP101]], align 4
 // CHECK3-NEXT:    [[TMP102:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 2
@@ -1710,7 +1710,7 @@ int bar(int n){
 // CHECK3-NEXT:    store ptr [[TMP3]], ptr [[DOTTASK_T__ADDR_I]], align 4, !noalias [[META22]]
 // CHECK3-NEXT:    store ptr [[TMP7]], ptr [[__CONTEXT_ADDR_I]], align 4, !noalias [[META22]]
 // CHECK3-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[__CONTEXT_ADDR_I]], align 4, !noalias [[META22]]
-// CHECK3-NEXT:    store i32 2, ptr [[KERNEL_ARGS_I]], align 4, !noalias [[META22]]
+// CHECK3-NEXT:    store i32 3, ptr [[KERNEL_ARGS_I]], align 4, !noalias [[META22]]
 // CHECK3-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 0, ptr [[TMP9]], align 4, !noalias [[META22]]
 // CHECK3-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 2
@@ -2068,7 +2068,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [5 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP29]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP29]], align 4
 // CHECK3-NEXT:    [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 5, ptr [[TMP30]], align 4
 // CHECK3-NEXT:    [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2176,7 +2176,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP21]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP21]], align 4
 // CHECK3-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 4, ptr [[TMP22]], align 4
 // CHECK3-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2264,7 +2264,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP16]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP16]], align 4
 // CHECK3-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 3, ptr [[TMP17]], align 4
 // CHECK3-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_parallel_for_codegen.cpp b/clang/test/OpenMP/target_parallel_for_codegen.cpp
index 31f6d0b8ad9ae1..73504cf7a8024e 100644
--- a/clang/test/OpenMP/target_parallel_for_codegen.cpp
+++ b/clang/test/OpenMP/target_parallel_for_codegen.cpp
@@ -366,7 +366,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[VLA1:%.*]] = alloca double, i64 [[TMP6]], align 8
 // CHECK1-NEXT:    store i64 [[TMP5]], ptr [[__VLA_EXPR1]], align 8
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP7]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP7]], align 4
 // CHECK1-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP8]], align 4
 // CHECK1-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -487,7 +487,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP71:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS5]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP72:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS6]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP73:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS8]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP73]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP73]], align 4
 // CHECK1-NEXT:    [[TMP74:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS8]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 2, ptr [[TMP74]], align 4
 // CHECK1-NEXT:    [[TMP75:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS8]], i32 0, i32 2
@@ -608,7 +608,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP130:%.*]] = getelementptr inbounds [10 x ptr], ptr [[DOTOFFLOAD_PTRS15]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP131:%.*]] = getelementptr inbounds [10 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP132:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS17]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP132]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP132]], align 4
 // CHECK1-NEXT:    [[TMP133:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS17]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 10, ptr [[TMP133]], align 4
 // CHECK1-NEXT:    [[TMP134:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS17]], i32 0, i32 2
@@ -1047,7 +1047,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP15:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR3_I]], align 8, !noalias [[META24]]
 // CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT_ANON:%.*]], ptr [[TMP9]], i32 0, i32 1
 // CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT_ANON]], ptr [[TMP9]], i32 0, i32 2
-// CHECK1-NEXT:    store i32 2, ptr [[KERNEL_ARGS_I]], align 4, !noalias [[META24]]
+// CHECK1-NEXT:    store i32 3, ptr [[KERNEL_ARGS_I]], align 4, !noalias [[META24]]
 // CHECK1-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 3, ptr [[TMP18]], align 4, !noalias [[META24]]
 // CHECK1-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 2
@@ -1475,7 +1475,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [5 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP29]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP29]], align 4
 // CHECK1-NEXT:    [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 5, ptr [[TMP30]], align 4
 // CHECK1-NEXT:    [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1583,7 +1583,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP21]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP21]], align 4
 // CHECK1-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 4, ptr [[TMP22]], align 4
 // CHECK1-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1671,7 +1671,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP16]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP16]], align 4
 // CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 3, ptr [[TMP17]], align 4
 // CHECK1-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2032,7 +2032,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[VLA1:%.*]] = alloca double, i32 [[TMP4]], align 8
 // CHECK3-NEXT:    store i32 [[TMP3]], ptr [[__VLA_EXPR1]], align 4
 // CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 0, ptr [[TMP6]], align 4
 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2150,7 +2150,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP67:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS5]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP68:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS6]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP69:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS8]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP69]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP69]], align 4
 // CHECK3-NEXT:    [[TMP70:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS8]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 2, ptr [[TMP70]], align 4
 // CHECK3-NEXT:    [[TMP71:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS8]], i32 0, i32 2
@@ -2273,7 +2273,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP128:%.*]] = getelementptr inbounds [10 x ptr], ptr [[DOTOFFLOAD_PTRS15]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP129:%.*]] = getelementptr inbounds [10 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP130:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS17]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP130]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP130]], align 4
 // CHECK3-NEXT:    [[TMP131:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS17]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 10, ptr [[TMP131]], align 4
 // CHECK3-NEXT:    [[TMP132:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS17]], i32 0, i32 2
@@ -2710,7 +2710,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP15:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR3_I]], align 4, !noalias [[META25]]
 // CHECK3-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT_ANON:%.*]], ptr [[TMP9]], i32 0, i32 1
 // CHECK3-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT_ANON]], ptr [[TMP9]], i32 0, i32 2
-// CHECK3-NEXT:    store i32 2, ptr [[KERNEL_ARGS_I]], align 4, !noalias [[META25]]
+// CHECK3-NEXT:    store i32 3, ptr [[KERNEL_ARGS_I]], align 4, !noalias [[META25]]
 // CHECK3-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 3, ptr [[TMP18]], align 4, !noalias [[META25]]
 // CHECK3-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 2
@@ -3138,7 +3138,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [5 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP29]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP29]], align 4
 // CHECK3-NEXT:    [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 5, ptr [[TMP30]], align 4
 // CHECK3-NEXT:    [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -3246,7 +3246,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP21]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP21]], align 4
 // CHECK3-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 4, ptr [[TMP22]], align 4
 // CHECK3-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -3334,7 +3334,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP16]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP16]], align 4
 // CHECK3-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 3, ptr [[TMP17]], align 4
 // CHECK3-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -5256,7 +5256,7 @@ int bar(int n){
 // CHECK17-NEXT:    [[VLA1:%.*]] = alloca double, i64 [[TMP6]], align 8
 // CHECK17-NEXT:    store i64 [[TMP5]], ptr [[__VLA_EXPR1]], align 8
 // CHECK17-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK17-NEXT:    store i32 2, ptr [[TMP7]], align 4
+// CHECK17-NEXT:    store i32 3, ptr [[TMP7]], align 4
 // CHECK17-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK17-NEXT:    store i32 0, ptr [[TMP8]], align 4
 // CHECK17-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -5377,7 +5377,7 @@ int bar(int n){
 // CHECK17-NEXT:    [[TMP71:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS5]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP72:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS6]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP73:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS8]], i32 0, i32 0
-// CHECK17-NEXT:    store i32 2, ptr [[TMP73]], align 4
+// CHECK17-NEXT:    store i32 3, ptr [[TMP73]], align 4
 // CHECK17-NEXT:    [[TMP74:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS8]], i32 0, i32 1
 // CHECK17-NEXT:    store i32 2, ptr [[TMP74]], align 4
 // CHECK17-NEXT:    [[TMP75:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS8]], i32 0, i32 2
@@ -5498,7 +5498,7 @@ int bar(int n){
 // CHECK17-NEXT:    [[TMP130:%.*]] = getelementptr inbounds [10 x ptr], ptr [[DOTOFFLOAD_PTRS15]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP131:%.*]] = getelementptr inbounds [10 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP132:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS17]], i32 0, i32 0
-// CHECK17-NEXT:    store i32 2, ptr [[TMP132]], align 4
+// CHECK17-NEXT:    store i32 3, ptr [[TMP132]], align 4
 // CHECK17-NEXT:    [[TMP133:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS17]], i32 0, i32 1
 // CHECK17-NEXT:    store i32 10, ptr [[TMP133]], align 4
 // CHECK17-NEXT:    [[TMP134:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS17]], i32 0, i32 2
@@ -5937,7 +5937,7 @@ int bar(int n){
 // CHECK17-NEXT:    [[TMP15:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR3_I]], align 8, !noalias [[META24]]
 // CHECK17-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT_ANON:%.*]], ptr [[TMP9]], i32 0, i32 1
 // CHECK17-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT_ANON]], ptr [[TMP9]], i32 0, i32 2
-// CHECK17-NEXT:    store i32 2, ptr [[KERNEL_ARGS_I]], align 4, !noalias [[META24]]
+// CHECK17-NEXT:    store i32 3, ptr [[KERNEL_ARGS_I]], align 4, !noalias [[META24]]
 // CHECK17-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 1
 // CHECK17-NEXT:    store i32 3, ptr [[TMP18]], align 4, !noalias [[META24]]
 // CHECK17-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 2
@@ -6365,7 +6365,7 @@ int bar(int n){
 // CHECK17-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [5 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK17-NEXT:    store i32 2, ptr [[TMP29]], align 4
+// CHECK17-NEXT:    store i32 3, ptr [[TMP29]], align 4
 // CHECK17-NEXT:    [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK17-NEXT:    store i32 5, ptr [[TMP30]], align 4
 // CHECK17-NEXT:    [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -6473,7 +6473,7 @@ int bar(int n){
 // CHECK17-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK17-NEXT:    store i32 2, ptr [[TMP21]], align 4
+// CHECK17-NEXT:    store i32 3, ptr [[TMP21]], align 4
 // CHECK17-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK17-NEXT:    store i32 4, ptr [[TMP22]], align 4
 // CHECK17-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -6561,7 +6561,7 @@ int bar(int n){
 // CHECK17-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK17-NEXT:    store i32 2, ptr [[TMP16]], align 4
+// CHECK17-NEXT:    store i32 3, ptr [[TMP16]], align 4
 // CHECK17-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK17-NEXT:    store i32 3, ptr [[TMP17]], align 4
 // CHECK17-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -6922,7 +6922,7 @@ int bar(int n){
 // CHECK19-NEXT:    [[VLA1:%.*]] = alloca double, i32 [[TMP4]], align 8
 // CHECK19-NEXT:    store i32 [[TMP3]], ptr [[__VLA_EXPR1]], align 4
 // CHECK19-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK19-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK19-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK19-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK19-NEXT:    store i32 0, ptr [[TMP6]], align 4
 // CHECK19-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -7040,7 +7040,7 @@ int bar(int n){
 // CHECK19-NEXT:    [[TMP67:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS5]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP68:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS6]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP69:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS8]], i32 0, i32 0
-// CHECK19-NEXT:    store i32 2, ptr [[TMP69]], align 4
+// CHECK19-NEXT:    store i32 3, ptr [[TMP69]], align 4
 // CHECK19-NEXT:    [[TMP70:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS8]], i32 0, i32 1
 // CHECK19-NEXT:    store i32 2, ptr [[TMP70]], align 4
 // CHECK19-NEXT:    [[TMP71:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS8]], i32 0, i32 2
@@ -7163,7 +7163,7 @@ int bar(int n){
 // CHECK19-NEXT:    [[TMP128:%.*]] = getelementptr inbounds [10 x ptr], ptr [[DOTOFFLOAD_PTRS15]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP129:%.*]] = getelementptr inbounds [10 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP130:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS17]], i32 0, i32 0
-// CHECK19-NEXT:    store i32 2, ptr [[TMP130]], align 4
+// CHECK19-NEXT:    store i32 3, ptr [[TMP130]], align 4
 // CHECK19-NEXT:    [[TMP131:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS17]], i32 0, i32 1
 // CHECK19-NEXT:    store i32 10, ptr [[TMP131]], align 4
 // CHECK19-NEXT:    [[TMP132:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS17]], i32 0, i32 2
@@ -7600,7 +7600,7 @@ int bar(int n){
 // CHECK19-NEXT:    [[TMP15:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR3_I]], align 4, !noalias [[META25]]
 // CHECK19-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT_ANON:%.*]], ptr [[TMP9]], i32 0, i32 1
 // CHECK19-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT_ANON]], ptr [[TMP9]], i32 0, i32 2
-// CHECK19-NEXT:    store i32 2, ptr [[KERNEL_ARGS_I]], align 4, !noalias [[META25]]
+// CHECK19-NEXT:    store i32 3, ptr [[KERNEL_ARGS_I]], align 4, !noalias [[META25]]
 // CHECK19-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 1
 // CHECK19-NEXT:    store i32 3, ptr [[TMP18]], align 4, !noalias [[META25]]
 // CHECK19-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 2
@@ -8028,7 +8028,7 @@ int bar(int n){
 // CHECK19-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [5 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK19-NEXT:    store i32 2, ptr [[TMP29]], align 4
+// CHECK19-NEXT:    store i32 3, ptr [[TMP29]], align 4
 // CHECK19-NEXT:    [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK19-NEXT:    store i32 5, ptr [[TMP30]], align 4
 // CHECK19-NEXT:    [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -8136,7 +8136,7 @@ int bar(int n){
 // CHECK19-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK19-NEXT:    store i32 2, ptr [[TMP21]], align 4
+// CHECK19-NEXT:    store i32 3, ptr [[TMP21]], align 4
 // CHECK19-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK19-NEXT:    store i32 4, ptr [[TMP22]], align 4
 // CHECK19-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -8224,7 +8224,7 @@ int bar(int n){
 // CHECK19-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK19-NEXT:    store i32 2, ptr [[TMP16]], align 4
+// CHECK19-NEXT:    store i32 3, ptr [[TMP16]], align 4
 // CHECK19-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK19-NEXT:    store i32 3, ptr [[TMP17]], align 4
 // CHECK19-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_parallel_for_simd_codegen.cpp b/clang/test/OpenMP/target_parallel_for_simd_codegen.cpp
index 7751ae93d59fa4..8009e06666c821 100644
--- a/clang/test/OpenMP/target_parallel_for_simd_codegen.cpp
+++ b/clang/test/OpenMP/target_parallel_for_simd_codegen.cpp
@@ -395,7 +395,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP30:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP31]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP31]], align 4
 // CHECK1-NEXT:    [[TMP32:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 3, ptr [[TMP32]], align 4
 // CHECK1-NEXT:    [[TMP33:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -452,7 +452,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP57:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS5]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP58:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS6]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP59:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS8]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP59]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP59]], align 4
 // CHECK1-NEXT:    [[TMP60:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS8]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 2, ptr [[TMP60]], align 4
 // CHECK1-NEXT:    [[TMP61:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS8]], i32 0, i32 2
@@ -573,7 +573,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP116:%.*]] = getelementptr inbounds [10 x ptr], ptr [[DOTOFFLOAD_PTRS15]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP117:%.*]] = getelementptr inbounds [10 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP118:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS17]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP118]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP118]], align 4
 // CHECK1-NEXT:    [[TMP119:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS17]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 10, ptr [[TMP119]], align 4
 // CHECK1-NEXT:    [[TMP120:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS17]], i32 0, i32 2
@@ -721,7 +721,7 @@ int bar(int n){
 // CHECK1-NEXT:    store ptr [[TMP3]], ptr [[DOTTASK_T__ADDR_I]], align 8, !noalias [[META25]]
 // CHECK1-NEXT:    store ptr [[TMP7]], ptr [[__CONTEXT_ADDR_I]], align 8, !noalias [[META25]]
 // CHECK1-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[__CONTEXT_ADDR_I]], align 8, !noalias [[META25]]
-// CHECK1-NEXT:    store i32 2, ptr [[KERNEL_ARGS_I]], align 4, !noalias [[META25]]
+// CHECK1-NEXT:    store i32 3, ptr [[KERNEL_ARGS_I]], align 4, !noalias [[META25]]
 // CHECK1-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP9]], align 4, !noalias [[META25]]
 // CHECK1-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 2
@@ -1403,7 +1403,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [5 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP29]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP29]], align 4
 // CHECK1-NEXT:    [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 5, ptr [[TMP30]], align 4
 // CHECK1-NEXT:    [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1511,7 +1511,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP21]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP21]], align 4
 // CHECK1-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 4, ptr [[TMP22]], align 4
 // CHECK1-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1599,7 +1599,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP16]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP16]], align 4
 // CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 3, ptr [[TMP17]], align 4
 // CHECK1-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2013,7 +2013,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP27]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP27]], align 4
 // CHECK3-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 3, ptr [[TMP28]], align 4
 // CHECK3-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2070,7 +2070,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP53:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS5]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP54:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS6]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP55:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS8]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP55]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP55]], align 4
 // CHECK3-NEXT:    [[TMP56:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS8]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 2, ptr [[TMP56]], align 4
 // CHECK3-NEXT:    [[TMP57:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS8]], i32 0, i32 2
@@ -2193,7 +2193,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP114:%.*]] = getelementptr inbounds [10 x ptr], ptr [[DOTOFFLOAD_PTRS15]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP115:%.*]] = getelementptr inbounds [10 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP116:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS17]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP116]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP116]], align 4
 // CHECK3-NEXT:    [[TMP117:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS17]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 10, ptr [[TMP117]], align 4
 // CHECK3-NEXT:    [[TMP118:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS17]], i32 0, i32 2
@@ -2341,7 +2341,7 @@ int bar(int n){
 // CHECK3-NEXT:    store ptr [[TMP3]], ptr [[DOTTASK_T__ADDR_I]], align 4, !noalias [[META26]]
 // CHECK3-NEXT:    store ptr [[TMP7]], ptr [[__CONTEXT_ADDR_I]], align 4, !noalias [[META26]]
 // CHECK3-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[__CONTEXT_ADDR_I]], align 4, !noalias [[META26]]
-// CHECK3-NEXT:    store i32 2, ptr [[KERNEL_ARGS_I]], align 4, !noalias [[META26]]
+// CHECK3-NEXT:    store i32 3, ptr [[KERNEL_ARGS_I]], align 4, !noalias [[META26]]
 // CHECK3-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 0, ptr [[TMP9]], align 4, !noalias [[META26]]
 // CHECK3-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 2
@@ -3021,7 +3021,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [5 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP29]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP29]], align 4
 // CHECK3-NEXT:    [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 5, ptr [[TMP30]], align 4
 // CHECK3-NEXT:    [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -3129,7 +3129,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP21]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP21]], align 4
 // CHECK3-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 4, ptr [[TMP22]], align 4
 // CHECK3-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -3217,7 +3217,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP16]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP16]], align 4
 // CHECK3-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 3, ptr [[TMP17]], align 4
 // CHECK3-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -3637,7 +3637,7 @@ int bar(int n){
 // CHECK5-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP30:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP31]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP31]], align 4
 // CHECK5-NEXT:    [[TMP32:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 3, ptr [[TMP32]], align 4
 // CHECK5-NEXT:    [[TMP33:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -3694,7 +3694,7 @@ int bar(int n){
 // CHECK5-NEXT:    [[TMP57:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS5]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP58:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS6]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP59:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS8]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP59]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP59]], align 4
 // CHECK5-NEXT:    [[TMP60:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS8]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 2, ptr [[TMP60]], align 4
 // CHECK5-NEXT:    [[TMP61:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS8]], i32 0, i32 2
@@ -3815,7 +3815,7 @@ int bar(int n){
 // CHECK5-NEXT:    [[TMP116:%.*]] = getelementptr inbounds [10 x ptr], ptr [[DOTOFFLOAD_PTRS15]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP117:%.*]] = getelementptr inbounds [10 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP118:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS17]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP118]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP118]], align 4
 // CHECK5-NEXT:    [[TMP119:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS17]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 10, ptr [[TMP119]], align 4
 // CHECK5-NEXT:    [[TMP120:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS17]], i32 0, i32 2
@@ -3963,7 +3963,7 @@ int bar(int n){
 // CHECK5-NEXT:    store ptr [[TMP3]], ptr [[DOTTASK_T__ADDR_I]], align 8, !noalias [[META25]]
 // CHECK5-NEXT:    store ptr [[TMP7]], ptr [[__CONTEXT_ADDR_I]], align 8, !noalias [[META25]]
 // CHECK5-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[__CONTEXT_ADDR_I]], align 8, !noalias [[META25]]
-// CHECK5-NEXT:    store i32 2, ptr [[KERNEL_ARGS_I]], align 4, !noalias [[META25]]
+// CHECK5-NEXT:    store i32 3, ptr [[KERNEL_ARGS_I]], align 4, !noalias [[META25]]
 // CHECK5-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 0, ptr [[TMP9]], align 4, !noalias [[META25]]
 // CHECK5-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 2
@@ -4666,7 +4666,7 @@ int bar(int n){
 // CHECK5-NEXT:    [[TMP36:%.*]] = select i1 [[TOBOOL4]], i32 0, i32 1
 // CHECK5-NEXT:    [[TMP37:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP36]], 0
 // CHECK5-NEXT:    [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP38]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP38]], align 4
 // CHECK5-NEXT:    [[TMP39:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 6, ptr [[TMP39]], align 4
 // CHECK5-NEXT:    [[TMP40:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -4774,7 +4774,7 @@ int bar(int n){
 // CHECK5-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP21]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP21]], align 4
 // CHECK5-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 4, ptr [[TMP22]], align 4
 // CHECK5-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -4862,7 +4862,7 @@ int bar(int n){
 // CHECK5-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP16]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP16]], align 4
 // CHECK5-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 3, ptr [[TMP17]], align 4
 // CHECK5-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -5362,7 +5362,7 @@ int bar(int n){
 // CHECK7-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK7-NEXT:    store i32 2, ptr [[TMP27]], align 4
+// CHECK7-NEXT:    store i32 3, ptr [[TMP27]], align 4
 // CHECK7-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK7-NEXT:    store i32 3, ptr [[TMP28]], align 4
 // CHECK7-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -5419,7 +5419,7 @@ int bar(int n){
 // CHECK7-NEXT:    [[TMP53:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS5]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP54:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS6]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP55:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS8]], i32 0, i32 0
-// CHECK7-NEXT:    store i32 2, ptr [[TMP55]], align 4
+// CHECK7-NEXT:    store i32 3, ptr [[TMP55]], align 4
 // CHECK7-NEXT:    [[TMP56:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS8]], i32 0, i32 1
 // CHECK7-NEXT:    store i32 2, ptr [[TMP56]], align 4
 // CHECK7-NEXT:    [[TMP57:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS8]], i32 0, i32 2
@@ -5542,7 +5542,7 @@ int bar(int n){
 // CHECK7-NEXT:    [[TMP114:%.*]] = getelementptr inbounds [10 x ptr], ptr [[DOTOFFLOAD_PTRS15]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP115:%.*]] = getelementptr inbounds [10 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP116:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS17]], i32 0, i32 0
-// CHECK7-NEXT:    store i32 2, ptr [[TMP116]], align 4
+// CHECK7-NEXT:    store i32 3, ptr [[TMP116]], align 4
 // CHECK7-NEXT:    [[TMP117:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS17]], i32 0, i32 1
 // CHECK7-NEXT:    store i32 10, ptr [[TMP117]], align 4
 // CHECK7-NEXT:    [[TMP118:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS17]], i32 0, i32 2
@@ -5690,7 +5690,7 @@ int bar(int n){
 // CHECK7-NEXT:    store ptr [[TMP3]], ptr [[DOTTASK_T__ADDR_I]], align 4, !noalias [[META26]]
 // CHECK7-NEXT:    store ptr [[TMP7]], ptr [[__CONTEXT_ADDR_I]], align 4, !noalias [[META26]]
 // CHECK7-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[__CONTEXT_ADDR_I]], align 4, !noalias [[META26]]
-// CHECK7-NEXT:    store i32 2, ptr [[KERNEL_ARGS_I]], align 4, !noalias [[META26]]
+// CHECK7-NEXT:    store i32 3, ptr [[KERNEL_ARGS_I]], align 4, !noalias [[META26]]
 // CHECK7-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 1
 // CHECK7-NEXT:    store i32 0, ptr [[TMP9]], align 4, !noalias [[META26]]
 // CHECK7-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 2
@@ -6391,7 +6391,7 @@ int bar(int n){
 // CHECK7-NEXT:    [[TMP36:%.*]] = select i1 [[TOBOOL4]], i32 0, i32 1
 // CHECK7-NEXT:    [[TMP37:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP36]], 0
 // CHECK7-NEXT:    [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK7-NEXT:    store i32 2, ptr [[TMP38]], align 4
+// CHECK7-NEXT:    store i32 3, ptr [[TMP38]], align 4
 // CHECK7-NEXT:    [[TMP39:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK7-NEXT:    store i32 6, ptr [[TMP39]], align 4
 // CHECK7-NEXT:    [[TMP40:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -6499,7 +6499,7 @@ int bar(int n){
 // CHECK7-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK7-NEXT:    store i32 2, ptr [[TMP21]], align 4
+// CHECK7-NEXT:    store i32 3, ptr [[TMP21]], align 4
 // CHECK7-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK7-NEXT:    store i32 4, ptr [[TMP22]], align 4
 // CHECK7-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -6587,7 +6587,7 @@ int bar(int n){
 // CHECK7-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK7-NEXT:    store i32 2, ptr [[TMP16]], align 4
+// CHECK7-NEXT:    store i32 3, ptr [[TMP16]], align 4
 // CHECK7-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK7-NEXT:    store i32 3, ptr [[TMP17]], align 4
 // CHECK7-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_parallel_generic_loop_codegen-1.cpp b/clang/test/OpenMP/target_parallel_generic_loop_codegen-1.cpp
index fa3d182b62ad2d..bbefab195b9ae7 100644
--- a/clang/test/OpenMP/target_parallel_generic_loop_codegen-1.cpp
+++ b/clang/test/OpenMP/target_parallel_generic_loop_codegen-1.cpp
@@ -4185,7 +4185,7 @@ int bar(int a){
 // OMP-DEfAULT-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // OMP-DEfAULT-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // OMP-DEfAULT-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// OMP-DEfAULT-NEXT:    store i32 2, ptr [[TMP9]], align 4
+// OMP-DEfAULT-NEXT:    store i32 3, ptr [[TMP9]], align 4
 // OMP-DEfAULT-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // OMP-DEfAULT-NEXT:    store i32 1, ptr [[TMP10]], align 4
 // OMP-DEfAULT-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -4394,7 +4394,7 @@ int bar(int a){
 // OMP-DEfAULT-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // OMP-DEfAULT-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // OMP-DEfAULT-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// OMP-DEfAULT-NEXT:    store i32 2, ptr [[TMP9]], align 4
+// OMP-DEfAULT-NEXT:    store i32 3, ptr [[TMP9]], align 4
 // OMP-DEfAULT-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // OMP-DEfAULT-NEXT:    store i32 1, ptr [[TMP10]], align 4
 // OMP-DEfAULT-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -4565,7 +4565,7 @@ int bar(int a){
 // OMP-DEfAULT-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // OMP-DEfAULT-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // OMP-DEfAULT-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// OMP-DEfAULT-NEXT:    store i32 2, ptr [[TMP9]], align 4
+// OMP-DEfAULT-NEXT:    store i32 3, ptr [[TMP9]], align 4
 // OMP-DEfAULT-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // OMP-DEfAULT-NEXT:    store i32 1, ptr [[TMP10]], align 4
 // OMP-DEfAULT-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -4708,7 +4708,7 @@ int bar(int a){
 // OMP-DEfAULT-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // OMP-DEfAULT-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // OMP-DEfAULT-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// OMP-DEfAULT-NEXT:    store i32 2, ptr [[TMP9]], align 4
+// OMP-DEfAULT-NEXT:    store i32 3, ptr [[TMP9]], align 4
 // OMP-DEfAULT-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // OMP-DEfAULT-NEXT:    store i32 1, ptr [[TMP10]], align 4
 // OMP-DEfAULT-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -4879,7 +4879,7 @@ int bar(int a){
 // OMP-DEfAULT-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // OMP-DEfAULT-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // OMP-DEfAULT-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// OMP-DEfAULT-NEXT:    store i32 2, ptr [[TMP9]], align 4
+// OMP-DEfAULT-NEXT:    store i32 3, ptr [[TMP9]], align 4
 // OMP-DEfAULT-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // OMP-DEfAULT-NEXT:    store i32 1, ptr [[TMP10]], align 4
 // OMP-DEfAULT-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -5022,7 +5022,7 @@ int bar(int a){
 // OMP-DEfAULT-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // OMP-DEfAULT-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // OMP-DEfAULT-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// OMP-DEfAULT-NEXT:    store i32 2, ptr [[TMP9]], align 4
+// OMP-DEfAULT-NEXT:    store i32 3, ptr [[TMP9]], align 4
 // OMP-DEfAULT-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // OMP-DEfAULT-NEXT:    store i32 1, ptr [[TMP10]], align 4
 // OMP-DEfAULT-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -5193,7 +5193,7 @@ int bar(int a){
 // OMP-DEfAULT-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // OMP-DEfAULT-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // OMP-DEfAULT-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// OMP-DEfAULT-NEXT:    store i32 2, ptr [[TMP9]], align 4
+// OMP-DEfAULT-NEXT:    store i32 3, ptr [[TMP9]], align 4
 // OMP-DEfAULT-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // OMP-DEfAULT-NEXT:    store i32 1, ptr [[TMP10]], align 4
 // OMP-DEfAULT-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -5336,7 +5336,7 @@ int bar(int a){
 // OMP-DEfAULT-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // OMP-DEfAULT-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // OMP-DEfAULT-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// OMP-DEfAULT-NEXT:    store i32 2, ptr [[TMP9]], align 4
+// OMP-DEfAULT-NEXT:    store i32 3, ptr [[TMP9]], align 4
 // OMP-DEfAULT-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // OMP-DEfAULT-NEXT:    store i32 1, ptr [[TMP10]], align 4
 // OMP-DEfAULT-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -5486,7 +5486,7 @@ int bar(int a){
 // OMP-DEfAULT-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // OMP-DEfAULT-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // OMP-DEfAULT-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// OMP-DEfAULT-NEXT:    store i32 2, ptr [[TMP8]], align 4
+// OMP-DEfAULT-NEXT:    store i32 3, ptr [[TMP8]], align 4
 // OMP-DEfAULT-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // OMP-DEfAULT-NEXT:    store i32 1, ptr [[TMP9]], align 4
 // OMP-DEfAULT-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -5571,7 +5571,7 @@ int bar(int a){
 // OMP-DEfAULT-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // OMP-DEfAULT-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // OMP-DEfAULT-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// OMP-DEfAULT-NEXT:    store i32 2, ptr [[TMP9]], align 4
+// OMP-DEfAULT-NEXT:    store i32 3, ptr [[TMP9]], align 4
 // OMP-DEfAULT-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // OMP-DEfAULT-NEXT:    store i32 1, ptr [[TMP10]], align 4
 // OMP-DEfAULT-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -5695,7 +5695,7 @@ int bar(int a){
 // OMP-DEfAULT-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // OMP-DEfAULT-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // OMP-DEfAULT-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// OMP-DEfAULT-NEXT:    store i32 2, ptr [[TMP9]], align 4
+// OMP-DEfAULT-NEXT:    store i32 3, ptr [[TMP9]], align 4
 // OMP-DEfAULT-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // OMP-DEfAULT-NEXT:    store i32 1, ptr [[TMP10]], align 4
 // OMP-DEfAULT-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -5760,7 +5760,7 @@ int bar(int a){
 // OMP-DEfAULT-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // OMP-DEfAULT-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // OMP-DEfAULT-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// OMP-DEfAULT-NEXT:    store i32 2, ptr [[TMP9]], align 4
+// OMP-DEfAULT-NEXT:    store i32 3, ptr [[TMP9]], align 4
 // OMP-DEfAULT-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // OMP-DEfAULT-NEXT:    store i32 1, ptr [[TMP10]], align 4
 // OMP-DEfAULT-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_parallel_generic_loop_codegen-2.cpp b/clang/test/OpenMP/target_parallel_generic_loop_codegen-2.cpp
index a7b7278dff8d9d..a92f052085c74c 100644
--- a/clang/test/OpenMP/target_parallel_generic_loop_codegen-2.cpp
+++ b/clang/test/OpenMP/target_parallel_generic_loop_codegen-2.cpp
@@ -88,7 +88,7 @@ int nested(int a){
 // CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK-NEXT:    store i32 2, ptr [[TMP7]], align 4
+// CHECK-NEXT:    store i32 3, ptr [[TMP7]], align 4
 // CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK-NEXT:    store i32 1, ptr [[TMP8]], align 4
 // CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -306,7 +306,7 @@ int nested(int a){
 // CHECK-X86-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK-X86-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK-X86-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK-X86-NEXT:    store i32 2, ptr [[TMP7]], align 4
+// CHECK-X86-NEXT:    store i32 3, ptr [[TMP7]], align 4
 // CHECK-X86-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK-X86-NEXT:    store i32 1, ptr [[TMP8]], align 4
 // CHECK-X86-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_parallel_generic_loop_uses_allocators_codegen.cpp b/clang/test/OpenMP/target_parallel_generic_loop_uses_allocators_codegen.cpp
index 472811c5c56672..0171d97e2f5458 100644
--- a/clang/test/OpenMP/target_parallel_generic_loop_uses_allocators_codegen.cpp
+++ b/clang/test/OpenMP/target_parallel_generic_loop_uses_allocators_codegen.cpp
@@ -90,7 +90,7 @@ void foo() {
 // CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_parallel_if_codegen.cpp b/clang/test/OpenMP/target_parallel_if_codegen.cpp
index 841c49e31ccb9e..6447a76dd467c4 100644
--- a/clang/test/OpenMP/target_parallel_if_codegen.cpp
+++ b/clang/test/OpenMP/target_parallel_if_codegen.cpp
@@ -270,7 +270,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP17:%.*]] = select i1 [[TOBOOL3]], i32 0, i32 1
 // CHECK1-NEXT:    [[TMP18:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP17]], 0
 // CHECK1-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP19]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP19]], align 4
 // CHECK1-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 3, ptr [[TMP20]], align 4
 // CHECK1-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -335,7 +335,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP47:%.*]] = select i1 [[TOBOOL15]], i32 0, i32 1
 // CHECK1-NEXT:    [[TMP48:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP47]], 0
 // CHECK1-NEXT:    [[TMP49:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP49]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP49]], align 4
 // CHECK1-NEXT:    [[TMP50:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 2, ptr [[TMP50]], align 4
 // CHECK1-NEXT:    [[TMP51:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 2
@@ -416,7 +416,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP10:%.*]] = select i1 [[TOBOOL3]], i32 0, i32 1
 // CHECK1-NEXT:    [[TMP11:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP10]], 0
 // CHECK1-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP12]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP12]], align 4
 // CHECK1-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP13]], align 4
 // CHECK1-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -459,7 +459,7 @@ int bar(int n){
 // CHECK1-NEXT:    br i1 [[CMP4]], label [[OMP_IF_THEN5:%.*]], label [[OMP_IF_ELSE9:%.*]]
 // CHECK1:       omp_if.then5:
 // CHECK1-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP28]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP28]], align 4
 // CHECK1-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP29]], align 4
 // CHECK1-NEXT:    [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 2
@@ -532,7 +532,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP7]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP7]], align 4
 // CHECK1-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP8]], align 4
 // CHECK1-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -586,7 +586,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP32:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS2]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP33:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS3]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP34:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP34]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP34]], align 4
 // CHECK1-NEXT:    [[TMP35:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 2, ptr [[TMP35]], align 4
 // CHECK1-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2
@@ -932,7 +932,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP17:%.*]] = select i1 [[TOBOOL3]], i32 0, i32 1
 // CHECK3-NEXT:    [[TMP18:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP17]], 0
 // CHECK3-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP19]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP19]], align 4
 // CHECK3-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 3, ptr [[TMP20]], align 4
 // CHECK3-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -997,7 +997,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP47:%.*]] = select i1 [[TOBOOL15]], i32 0, i32 1
 // CHECK3-NEXT:    [[TMP48:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP47]], 0
 // CHECK3-NEXT:    [[TMP49:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP49]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP49]], align 4
 // CHECK3-NEXT:    [[TMP50:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 2, ptr [[TMP50]], align 4
 // CHECK3-NEXT:    [[TMP51:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS16]], i32 0, i32 2
@@ -1078,7 +1078,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP10:%.*]] = select i1 [[TOBOOL3]], i32 0, i32 1
 // CHECK3-NEXT:    [[TMP11:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP10]], 0
 // CHECK3-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP12]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP12]], align 4
 // CHECK3-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP13]], align 4
 // CHECK3-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1121,7 +1121,7 @@ int bar(int n){
 // CHECK3-NEXT:    br i1 [[CMP4]], label [[OMP_IF_THEN5:%.*]], label [[OMP_IF_ELSE9:%.*]]
 // CHECK3:       omp_if.then5:
 // CHECK3-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP28]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP28]], align 4
 // CHECK3-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 0, ptr [[TMP29]], align 4
 // CHECK3-NEXT:    [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 2
@@ -1194,7 +1194,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP7]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP7]], align 4
 // CHECK3-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP8]], align 4
 // CHECK3-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1248,7 +1248,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP32:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS2]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP33:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS3]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP34:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP34]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP34]], align 4
 // CHECK3-NEXT:    [[TMP35:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 2, ptr [[TMP35]], align 4
 // CHECK3-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_parallel_num_threads_codegen.cpp b/clang/test/OpenMP/target_parallel_num_threads_codegen.cpp
index 0778bca8c3f43e..a14518286dc918 100644
--- a/clang/test/OpenMP/target_parallel_num_threads_codegen.cpp
+++ b/clang/test/OpenMP/target_parallel_num_threads_codegen.cpp
@@ -271,7 +271,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP17:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
 // CHECK1-NEXT:    [[TMP18:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP17]], 0
 // CHECK1-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP19]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP19]], align 4
 // CHECK1-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 3, ptr [[TMP20]], align 4
 // CHECK1-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -313,7 +313,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS3]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP38:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS4]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP39:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP39]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP39]], align 4
 // CHECK1-NEXT:    [[TMP40:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP40]], align 4
 // CHECK1-NEXT:    [[TMP41:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 2
@@ -384,7 +384,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
 // CHECK1-NEXT:    [[TMP9:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP8]], 0
 // CHECK1-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP10]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP10]], align 4
 // CHECK1-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP11]], align 4
 // CHECK1-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -433,7 +433,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP33:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
 // CHECK1-NEXT:    [[TMP34:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP33]], 0
 // CHECK1-NEXT:    [[TMP35:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP35]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP35]], align 4
 // CHECK1-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP36]], align 4
 // CHECK1-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 2
@@ -488,7 +488,7 @@ int bar(int n){
 // CHECK1-NEXT:    store i32 [[N]], ptr [[N_ADDR]], align 4
 // CHECK1-NEXT:    store i32 0, ptr [[A]], align 4
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -556,7 +556,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP34:%.*]] = zext i16 [[TMP33]] to i32
 // CHECK1-NEXT:    [[TMP35:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP34]], 0
 // CHECK1-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP36]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP36]], align 4
 // CHECK1-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 3, ptr [[TMP37]], align 4
 // CHECK1-NEXT:    [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 2
@@ -849,7 +849,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP17:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
 // CHECK3-NEXT:    [[TMP18:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP17]], 0
 // CHECK3-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP19]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP19]], align 4
 // CHECK3-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 3, ptr [[TMP20]], align 4
 // CHECK3-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -891,7 +891,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS3]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP38:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS4]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP39:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP39]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP39]], align 4
 // CHECK3-NEXT:    [[TMP40:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP40]], align 4
 // CHECK3-NEXT:    [[TMP41:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 2
@@ -962,7 +962,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
 // CHECK3-NEXT:    [[TMP9:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP8]], 0
 // CHECK3-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP10]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP10]], align 4
 // CHECK3-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP11]], align 4
 // CHECK3-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1011,7 +1011,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP33:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
 // CHECK3-NEXT:    [[TMP34:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP33]], 0
 // CHECK3-NEXT:    [[TMP35:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP35]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP35]], align 4
 // CHECK3-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP36]], align 4
 // CHECK3-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 2
@@ -1066,7 +1066,7 @@ int bar(int n){
 // CHECK3-NEXT:    store i32 [[N]], ptr [[N_ADDR]], align 4
 // CHECK3-NEXT:    store i32 0, ptr [[A]], align 4
 // CHECK3-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK3-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK3-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1134,7 +1134,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP34:%.*]] = zext i16 [[TMP33]] to i32
 // CHECK3-NEXT:    [[TMP35:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP34]], 0
 // CHECK3-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP36]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP36]], align 4
 // CHECK3-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 3, ptr [[TMP37]], align 4
 // CHECK3-NEXT:    [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_task_affinity_codegen.cpp b/clang/test/OpenMP/target_task_affinity_codegen.cpp
index 472c50235bf9e3..2ef6e5a2f7a3de 100644
--- a/clang/test/OpenMP/target_task_affinity_codegen.cpp
+++ b/clang/test/OpenMP/target_task_affinity_codegen.cpp
@@ -118,7 +118,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS5]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS6]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP27]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP27]], align 4
 // CHECK1-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 2, ptr [[TMP28]], align 4
 // CHECK1-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -341,7 +341,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS5]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS6]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP27]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP27]], align 4
 // CHECK3-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 2, ptr [[TMP28]], align 4
 // CHECK3-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_teams_codegen.cpp b/clang/test/OpenMP/target_teams_codegen.cpp
index e22fcbbcd277bc..24dc2fd2e49f44 100644
--- a/clang/test/OpenMP/target_teams_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_codegen.cpp
@@ -453,7 +453,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP50:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS5]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP51:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS6]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP52:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP52]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP52]], align 4
 // CHECK1-NEXT:    [[TMP53:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP53]], align 4
 // CHECK1-NEXT:    [[TMP54:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -510,7 +510,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP78:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS10]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP79:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS11]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP80:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS13]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP80]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP80]], align 4
 // CHECK1-NEXT:    [[TMP81:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS13]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 2, ptr [[TMP81]], align 4
 // CHECK1-NEXT:    [[TMP82:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS13]], i32 0, i32 2
@@ -568,7 +568,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP105:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS18]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP106:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS19]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP107:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS21]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP107]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP107]], align 4
 // CHECK1-NEXT:    [[TMP108:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS21]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 2, ptr [[TMP108]], align 4
 // CHECK1-NEXT:    [[TMP109:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS21]], i32 0, i32 2
@@ -673,7 +673,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP158:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_PTRS28]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP159:%.*]] = getelementptr inbounds [9 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP160:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP160]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP160]], align 4
 // CHECK1-NEXT:    [[TMP161:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 9, ptr [[TMP161]], align 4
 // CHECK1-NEXT:    [[TMP162:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 2
@@ -723,7 +723,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP180:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS35]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP181:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS36]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP182:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP182]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP182]], align 4
 // CHECK1-NEXT:    [[TMP183:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP183]], align 4
 // CHECK1-NEXT:    [[TMP184:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 2
@@ -767,7 +767,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP202:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS42]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP203:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS43]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP204:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS45]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP204]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP204]], align 4
 // CHECK1-NEXT:    [[TMP205:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS45]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP205]], align 4
 // CHECK1-NEXT:    [[TMP206:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS45]], i32 0, i32 2
@@ -919,7 +919,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP19:%.*]] = load i32, ptr [[TMP17]], align 4
 // CHECK1-NEXT:    [[TMP20:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP18]], 0
 // CHECK1-NEXT:    [[TMP21:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP19]], 0
-// CHECK1-NEXT:    store i32 2, ptr [[KERNEL_ARGS_I]], align 4, !noalias [[META25]]
+// CHECK1-NEXT:    store i32 3, ptr [[KERNEL_ARGS_I]], align 4, !noalias [[META25]]
 // CHECK1-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 3, ptr [[TMP22]], align 4, !noalias [[META25]]
 // CHECK1-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 2
@@ -1317,7 +1317,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP7]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP7]], align 4
 // CHECK1-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP8]], align 4
 // CHECK1-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1482,7 +1482,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [5 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP29]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP29]], align 4
 // CHECK1-NEXT:    [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 5, ptr [[TMP30]], align 4
 // CHECK1-NEXT:    [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1590,7 +1590,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP21]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP21]], align 4
 // CHECK1-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 4, ptr [[TMP22]], align 4
 // CHECK1-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1678,7 +1678,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP16]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP16]], align 4
 // CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 3, ptr [[TMP17]], align 4
 // CHECK1-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2040,7 +2040,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP48:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS5]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP49:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS6]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP50:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP50]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP50]], align 4
 // CHECK3-NEXT:    [[TMP51:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP51]], align 4
 // CHECK3-NEXT:    [[TMP52:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2097,7 +2097,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP76:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS10]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP77:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS11]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP78:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS13]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP78]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP78]], align 4
 // CHECK3-NEXT:    [[TMP79:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS13]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 2, ptr [[TMP79]], align 4
 // CHECK3-NEXT:    [[TMP80:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS13]], i32 0, i32 2
@@ -2155,7 +2155,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP103:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS18]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP104:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS19]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP105:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS21]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP105]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP105]], align 4
 // CHECK3-NEXT:    [[TMP106:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS21]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 2, ptr [[TMP106]], align 4
 // CHECK3-NEXT:    [[TMP107:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS21]], i32 0, i32 2
@@ -2262,7 +2262,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP158:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_PTRS28]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP159:%.*]] = getelementptr inbounds [9 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP160:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP160]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP160]], align 4
 // CHECK3-NEXT:    [[TMP161:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 9, ptr [[TMP161]], align 4
 // CHECK3-NEXT:    [[TMP162:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 2
@@ -2312,7 +2312,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP180:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS35]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP181:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS36]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP182:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP182]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP182]], align 4
 // CHECK3-NEXT:    [[TMP183:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP183]], align 4
 // CHECK3-NEXT:    [[TMP184:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS38]], i32 0, i32 2
@@ -2356,7 +2356,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP202:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS42]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP203:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS43]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP204:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS45]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP204]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP204]], align 4
 // CHECK3-NEXT:    [[TMP205:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS45]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP205]], align 4
 // CHECK3-NEXT:    [[TMP206:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS45]], i32 0, i32 2
@@ -2508,7 +2508,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP19:%.*]] = load i32, ptr [[TMP17]], align 4
 // CHECK3-NEXT:    [[TMP20:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP18]], 0
 // CHECK3-NEXT:    [[TMP21:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP19]], 0
-// CHECK3-NEXT:    store i32 2, ptr [[KERNEL_ARGS_I]], align 4, !noalias [[META26]]
+// CHECK3-NEXT:    store i32 3, ptr [[KERNEL_ARGS_I]], align 4, !noalias [[META26]]
 // CHECK3-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 3, ptr [[TMP22]], align 4, !noalias [[META26]]
 // CHECK3-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 2
@@ -2905,7 +2905,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP6]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP6]], align 4
 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP7]], align 4
 // CHECK3-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -3070,7 +3070,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [5 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP29]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP29]], align 4
 // CHECK3-NEXT:    [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 5, ptr [[TMP30]], align 4
 // CHECK3-NEXT:    [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -3178,7 +3178,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP21]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP21]], align 4
 // CHECK3-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 4, ptr [[TMP22]], align 4
 // CHECK3-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -3266,7 +3266,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP16]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP16]], align 4
 // CHECK3-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 3, ptr [[TMP17]], align 4
 // CHECK3-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_teams_distribute_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_codegen.cpp
index 1ad3d8333ba60f..bd33e52ca3ef08 100644
--- a/clang/test/OpenMP/target_teams_distribute_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_distribute_codegen.cpp
@@ -427,7 +427,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP50:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS5]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP51:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS6]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP52:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP52]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP52]], align 4
 // CHECK1-NEXT:    [[TMP53:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP53]], align 4
 // CHECK1-NEXT:    [[TMP54:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -484,7 +484,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP78:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS10]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP79:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS11]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP80:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP80]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP80]], align 4
 // CHECK1-NEXT:    [[TMP81:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 2, ptr [[TMP81]], align 4
 // CHECK1-NEXT:    [[TMP82:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 2
@@ -605,7 +605,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP137:%.*]] = getelementptr inbounds [10 x ptr], ptr [[DOTOFFLOAD_PTRS23]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP138:%.*]] = getelementptr inbounds [10 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP139:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS26]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP139]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP139]], align 4
 // CHECK1-NEXT:    [[TMP140:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS26]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 10, ptr [[TMP140]], align 4
 // CHECK1-NEXT:    [[TMP141:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS26]], i32 0, i32 2
@@ -812,7 +812,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP19:%.*]] = load i32, ptr [[TMP17]], align 4
 // CHECK1-NEXT:    [[TMP20:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP18]], 0
 // CHECK1-NEXT:    [[TMP21:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP19]], 0
-// CHECK1-NEXT:    store i32 2, ptr [[KERNEL_ARGS_I]], align 4, !noalias [[META21]]
+// CHECK1-NEXT:    store i32 3, ptr [[KERNEL_ARGS_I]], align 4, !noalias [[META21]]
 // CHECK1-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 3, ptr [[TMP22]], align 4, !noalias [[META21]]
 // CHECK1-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 2
@@ -1397,7 +1397,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [5 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP29]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP29]], align 4
 // CHECK1-NEXT:    [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 5, ptr [[TMP30]], align 4
 // CHECK1-NEXT:    [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1534,7 +1534,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[ADD5:%.*]] = add i32 [[TMP30]], 1
 // CHECK1-NEXT:    [[TMP31:%.*]] = zext i32 [[ADD5]] to i64
 // CHECK1-NEXT:    [[TMP32:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP32]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP32]], align 4
 // CHECK1-NEXT:    [[TMP33:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 5, ptr [[TMP33]], align 4
 // CHECK1-NEXT:    [[TMP34:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1623,7 +1623,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP16]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP16]], align 4
 // CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 3, ptr [[TMP17]], align 4
 // CHECK1-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2163,7 +2163,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP48:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS5]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP49:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS6]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP50:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP50]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP50]], align 4
 // CHECK3-NEXT:    [[TMP51:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP51]], align 4
 // CHECK3-NEXT:    [[TMP52:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2220,7 +2220,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP76:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS10]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP77:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS11]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP78:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP78]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP78]], align 4
 // CHECK3-NEXT:    [[TMP79:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 2, ptr [[TMP79]], align 4
 // CHECK3-NEXT:    [[TMP80:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 2
@@ -2343,7 +2343,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP137:%.*]] = getelementptr inbounds [10 x ptr], ptr [[DOTOFFLOAD_PTRS23]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP138:%.*]] = getelementptr inbounds [10 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP139:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS26]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP139]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP139]], align 4
 // CHECK3-NEXT:    [[TMP140:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS26]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 10, ptr [[TMP140]], align 4
 // CHECK3-NEXT:    [[TMP141:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS26]], i32 0, i32 2
@@ -2550,7 +2550,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP19:%.*]] = load i32, ptr [[TMP17]], align 4
 // CHECK3-NEXT:    [[TMP20:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP18]], 0
 // CHECK3-NEXT:    [[TMP21:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP19]], 0
-// CHECK3-NEXT:    store i32 2, ptr [[KERNEL_ARGS_I]], align 4, !noalias [[META22]]
+// CHECK3-NEXT:    store i32 3, ptr [[KERNEL_ARGS_I]], align 4, !noalias [[META22]]
 // CHECK3-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 3, ptr [[TMP22]], align 4, !noalias [[META22]]
 // CHECK3-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 2
@@ -3135,7 +3135,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [5 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP29]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP29]], align 4
 // CHECK3-NEXT:    [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 5, ptr [[TMP30]], align 4
 // CHECK3-NEXT:    [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -3272,7 +3272,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[ADD5:%.*]] = add i32 [[TMP30]], 1
 // CHECK3-NEXT:    [[TMP31:%.*]] = zext i32 [[ADD5]] to i64
 // CHECK3-NEXT:    [[TMP32:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP32]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP32]], align 4
 // CHECK3-NEXT:    [[TMP33:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 5, ptr [[TMP33]], align 4
 // CHECK3-NEXT:    [[TMP34:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -3361,7 +3361,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP16]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP16]], align 4
 // CHECK3-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 3, ptr [[TMP17]], align 4
 // CHECK3-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_teams_distribute_collapse_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_collapse_codegen.cpp
index 2d886109b95b96..52b40ede902ce3 100644
--- a/clang/test/OpenMP/target_teams_distribute_collapse_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_distribute_collapse_codegen.cpp
@@ -124,7 +124,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -285,7 +285,7 @@ int main (int argc, char **argv) {
 // CHECK3-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -514,7 +514,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[TMP35:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_3]], align 8
 // CHECK9-NEXT:    [[ADD:%.*]] = add nsw i64 [[TMP35]], 1
 // CHECK9-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP36]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP36]], align 4
 // CHECK9-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 5, ptr [[TMP37]], align 4
 // CHECK9-NEXT:    [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -750,7 +750,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK9-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK9-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -975,7 +975,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[TMP34:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_3]], align 8
 // CHECK11-NEXT:    [[ADD:%.*]] = add nsw i64 [[TMP34]], 1
 // CHECK11-NEXT:    [[TMP35:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP35]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP35]], align 4
 // CHECK11-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 5, ptr [[TMP36]], align 4
 // CHECK11-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1209,7 +1209,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK11-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK11-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_teams_distribute_dist_schedule_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_dist_schedule_codegen.cpp
index 714ed708e71a85..e898f6cff7e7d0 100644
--- a/clang/test/OpenMP/target_teams_distribute_dist_schedule_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_distribute_dist_schedule_codegen.cpp
@@ -161,7 +161,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -203,7 +203,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS3]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS4]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK1-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK1-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 2
@@ -245,7 +245,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS11]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS12]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP45]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP45]], align 4
 // CHECK1-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP46]], align 4
 // CHECK1-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -573,7 +573,7 @@ int main (int argc, char **argv) {
 // CHECK3-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -615,7 +615,7 @@ int main (int argc, char **argv) {
 // CHECK3-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS3]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS4]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK3-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK3-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 2
@@ -657,7 +657,7 @@ int main (int argc, char **argv) {
 // CHECK3-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS11]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS12]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP45]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP45]], align 4
 // CHECK3-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP46]], align 4
 // CHECK3-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -1030,7 +1030,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP21]], 1
 // CHECK9-NEXT:    [[TMP22:%.*]] = zext i32 [[ADD]] to i64
 // CHECK9-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK9-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 3, ptr [[TMP24]], align 4
 // CHECK9-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1101,7 +1101,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[ADD14:%.*]] = add nsw i32 [[TMP56]], 1
 // CHECK9-NEXT:    [[TMP57:%.*]] = zext i32 [[ADD14]] to i64
 // CHECK9-NEXT:    [[TMP58:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP58]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP58]], align 4
 // CHECK9-NEXT:    [[TMP59:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 3, ptr [[TMP59]], align 4
 // CHECK9-NEXT:    [[TMP60:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -1183,7 +1183,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[ADD30:%.*]] = add nsw i32 [[TMP97]], 1
 // CHECK9-NEXT:    [[TMP98:%.*]] = zext i32 [[ADD30]] to i64
 // CHECK9-NEXT:    [[TMP99:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP99]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP99]], align 4
 // CHECK9-NEXT:    [[TMP100:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 4, ptr [[TMP100]], align 4
 // CHECK9-NEXT:    [[TMP101:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 2
@@ -1619,7 +1619,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK9-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK9-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1660,7 +1660,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS2]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK9-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK9-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2
@@ -1701,7 +1701,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS8]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS9]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP45]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP45]], align 4
 // CHECK9-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 1, ptr [[TMP46]], align 4
 // CHECK9-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 2
@@ -2071,7 +2071,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP21]], 1
 // CHECK11-NEXT:    [[TMP22:%.*]] = zext i32 [[ADD]] to i64
 // CHECK11-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK11-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 3, ptr [[TMP24]], align 4
 // CHECK11-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2143,7 +2143,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[ADD14:%.*]] = add nsw i32 [[TMP57]], 1
 // CHECK11-NEXT:    [[TMP58:%.*]] = zext i32 [[ADD14]] to i64
 // CHECK11-NEXT:    [[TMP59:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP59]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP59]], align 4
 // CHECK11-NEXT:    [[TMP60:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 3, ptr [[TMP60]], align 4
 // CHECK11-NEXT:    [[TMP61:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -2226,7 +2226,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[ADD30:%.*]] = add nsw i32 [[TMP99]], 1
 // CHECK11-NEXT:    [[TMP100:%.*]] = zext i32 [[ADD30]] to i64
 // CHECK11-NEXT:    [[TMP101:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP101]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP101]], align 4
 // CHECK11-NEXT:    [[TMP102:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 4, ptr [[TMP102]], align 4
 // CHECK11-NEXT:    [[TMP103:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 2
@@ -2659,7 +2659,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK11-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK11-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2700,7 +2700,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS2]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK11-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK11-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2
@@ -2741,7 +2741,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS8]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS9]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP45]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP45]], align 4
 // CHECK11-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 1, ptr [[TMP46]], align 4
 // CHECK11-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_teams_distribute_firstprivate_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_firstprivate_codegen.cpp
index 1d6f69079df969..da69e19a890efa 100644
--- a/clang/test/OpenMP/target_teams_distribute_firstprivate_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_distribute_firstprivate_codegen.cpp
@@ -303,7 +303,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP21]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP21]], align 4
 // CHECK1-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 5, ptr [[TMP22]], align 4
 // CHECK1-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -584,7 +584,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP18]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP18]], align 4
 // CHECK1-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 4, ptr [[TMP19]], align 4
 // CHECK1-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1105,7 +1105,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP21]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP21]], align 4
 // CHECK3-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 5, ptr [[TMP22]], align 4
 // CHECK3-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1384,7 +1384,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP18]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP18]], align 4
 // CHECK3-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 4, ptr [[TMP19]], align 4
 // CHECK3-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_teams_distribute_lastprivate_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_lastprivate_codegen.cpp
index ffba1c2221c248..e23435d13e9aec 100644
--- a/clang/test/OpenMP/target_teams_distribute_lastprivate_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_distribute_lastprivate_codegen.cpp
@@ -522,7 +522,7 @@ int main() {
 // CHECK9-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK9-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 5, ptr [[TMP24]], align 4
 // CHECK9-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -837,7 +837,7 @@ int main() {
 // CHECK9-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP18]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP18]], align 4
 // CHECK9-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 4, ptr [[TMP19]], align 4
 // CHECK9-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1223,7 +1223,7 @@ int main() {
 // CHECK11-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK11-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 5, ptr [[TMP24]], align 4
 // CHECK11-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1536,7 +1536,7 @@ int main() {
 // CHECK11-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP18]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP18]], align 4
 // CHECK11-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 4, ptr [[TMP19]], align 4
 // CHECK11-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_teams_distribute_parallel_for_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_parallel_for_codegen.cpp
index 7e06d8c16169f7..0d0d9967472144 100644
--- a/clang/test/OpenMP/target_teams_distribute_parallel_for_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_distribute_parallel_for_codegen.cpp
@@ -653,7 +653,7 @@ int target_teams_fun(int *g){
 // CHECK2-NEXT:    [[TMP29:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP23]], 0
 // CHECK2-NEXT:    [[TMP30:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP24]], 0
 // CHECK2-NEXT:    [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK2-NEXT:    store i32 2, ptr [[TMP31]], align 4
+// CHECK2-NEXT:    store i32 3, ptr [[TMP31]], align 4
 // CHECK2-NEXT:    [[TMP32:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK2-NEXT:    store i32 4, ptr [[TMP32]], align 4
 // CHECK2-NEXT:    [[TMP33:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -720,7 +720,7 @@ int target_teams_fun(int *g){
 // CHECK2-NEXT:    [[ADD17:%.*]] = add nsw i32 [[TMP62]], 1
 // CHECK2-NEXT:    [[TMP63:%.*]] = zext i32 [[ADD17]] to i64
 // CHECK2-NEXT:    [[TMP64:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS18]], i32 0, i32 0
-// CHECK2-NEXT:    store i32 2, ptr [[TMP64]], align 4
+// CHECK2-NEXT:    store i32 3, ptr [[TMP64]], align 4
 // CHECK2-NEXT:    [[TMP65:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS18]], i32 0, i32 1
 // CHECK2-NEXT:    store i32 3, ptr [[TMP65]], align 4
 // CHECK2-NEXT:    [[TMP66:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS18]], i32 0, i32 2
@@ -1295,7 +1295,7 @@ int target_teams_fun(int *g){
 // CHECK4-NEXT:    [[TMP29:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP23]], 0
 // CHECK4-NEXT:    [[TMP30:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP24]], 0
 // CHECK4-NEXT:    [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK4-NEXT:    store i32 2, ptr [[TMP31]], align 4
+// CHECK4-NEXT:    store i32 3, ptr [[TMP31]], align 4
 // CHECK4-NEXT:    [[TMP32:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK4-NEXT:    store i32 4, ptr [[TMP32]], align 4
 // CHECK4-NEXT:    [[TMP33:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1362,7 +1362,7 @@ int target_teams_fun(int *g){
 // CHECK4-NEXT:    [[ADD17:%.*]] = add nsw i32 [[TMP62]], 1
 // CHECK4-NEXT:    [[TMP63:%.*]] = zext i32 [[ADD17]] to i64
 // CHECK4-NEXT:    [[TMP64:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS18]], i32 0, i32 0
-// CHECK4-NEXT:    store i32 2, ptr [[TMP64]], align 4
+// CHECK4-NEXT:    store i32 3, ptr [[TMP64]], align 4
 // CHECK4-NEXT:    [[TMP65:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS18]], i32 0, i32 1
 // CHECK4-NEXT:    store i32 3, ptr [[TMP65]], align 4
 // CHECK4-NEXT:    [[TMP66:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS18]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_teams_distribute_parallel_for_collapse_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_parallel_for_collapse_codegen.cpp
index b812011ea4ca04..da85b23de2e296 100644
--- a/clang/test/OpenMP/target_teams_distribute_parallel_for_collapse_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_distribute_parallel_for_collapse_codegen.cpp
@@ -129,7 +129,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -365,7 +365,7 @@ int main (int argc, char **argv) {
 // CHECK3-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -665,7 +665,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[TMP35:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_3]], align 8
 // CHECK9-NEXT:    [[ADD:%.*]] = add nsw i64 [[TMP35]], 1
 // CHECK9-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP36]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP36]], align 4
 // CHECK9-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 5, ptr [[TMP37]], align 4
 // CHECK9-NEXT:    [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1027,7 +1027,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK9-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK9-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1327,7 +1327,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[TMP34:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_3]], align 8
 // CHECK11-NEXT:    [[ADD:%.*]] = add nsw i64 [[TMP34]], 1
 // CHECK11-NEXT:    [[TMP35:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP35]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP35]], align 4
 // CHECK11-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 5, ptr [[TMP36]], align 4
 // CHECK11-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1691,7 +1691,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK11-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK11-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_teams_distribute_parallel_for_dist_schedule_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_parallel_for_dist_schedule_codegen.cpp
index 0aa75952a3c2b9..e0b722bd804673 100644
--- a/clang/test/OpenMP/target_teams_distribute_parallel_for_dist_schedule_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_distribute_parallel_for_dist_schedule_codegen.cpp
@@ -173,7 +173,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -215,7 +215,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS3]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS4]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK1-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK1-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 2
@@ -257,7 +257,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS11]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS12]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP45]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP45]], align 4
 // CHECK1-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP46]], align 4
 // CHECK1-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -807,7 +807,7 @@ int main (int argc, char **argv) {
 // CHECK3-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -849,7 +849,7 @@ int main (int argc, char **argv) {
 // CHECK3-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS3]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS4]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK3-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK3-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 2
@@ -891,7 +891,7 @@ int main (int argc, char **argv) {
 // CHECK3-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS11]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS12]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP45]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP45]], align 4
 // CHECK3-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP46]], align 4
 // CHECK3-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -1476,7 +1476,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP21]], 1
 // CHECK9-NEXT:    [[TMP22:%.*]] = zext i32 [[ADD]] to i64
 // CHECK9-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK9-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 3, ptr [[TMP24]], align 4
 // CHECK9-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1547,7 +1547,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[ADD14:%.*]] = add nsw i32 [[TMP56]], 1
 // CHECK9-NEXT:    [[TMP57:%.*]] = zext i32 [[ADD14]] to i64
 // CHECK9-NEXT:    [[TMP58:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP58]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP58]], align 4
 // CHECK9-NEXT:    [[TMP59:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 3, ptr [[TMP59]], align 4
 // CHECK9-NEXT:    [[TMP60:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -1629,7 +1629,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[ADD30:%.*]] = add nsw i32 [[TMP97]], 1
 // CHECK9-NEXT:    [[TMP98:%.*]] = zext i32 [[ADD30]] to i64
 // CHECK9-NEXT:    [[TMP99:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP99]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP99]], align 4
 // CHECK9-NEXT:    [[TMP100:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 4, ptr [[TMP100]], align 4
 // CHECK9-NEXT:    [[TMP101:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 2
@@ -2394,7 +2394,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK9-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK9-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2435,7 +2435,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS2]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK9-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK9-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2
@@ -2487,7 +2487,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[TMP49:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS8]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP50:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS9]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP51:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP51]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP51]], align 4
 // CHECK9-NEXT:    [[TMP52:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 2, ptr [[TMP52]], align 4
 // CHECK9-NEXT:    [[TMP53:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 2
@@ -3096,7 +3096,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP21]], 1
 // CHECK11-NEXT:    [[TMP22:%.*]] = zext i32 [[ADD]] to i64
 // CHECK11-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK11-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 3, ptr [[TMP24]], align 4
 // CHECK11-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -3168,7 +3168,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[ADD14:%.*]] = add nsw i32 [[TMP57]], 1
 // CHECK11-NEXT:    [[TMP58:%.*]] = zext i32 [[ADD14]] to i64
 // CHECK11-NEXT:    [[TMP59:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP59]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP59]], align 4
 // CHECK11-NEXT:    [[TMP60:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 3, ptr [[TMP60]], align 4
 // CHECK11-NEXT:    [[TMP61:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -3251,7 +3251,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[ADD30:%.*]] = add nsw i32 [[TMP99]], 1
 // CHECK11-NEXT:    [[TMP100:%.*]] = zext i32 [[ADD30]] to i64
 // CHECK11-NEXT:    [[TMP101:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP101]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP101]], align 4
 // CHECK11-NEXT:    [[TMP102:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 4, ptr [[TMP102]], align 4
 // CHECK11-NEXT:    [[TMP103:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 2
@@ -4001,7 +4001,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK11-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK11-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -4042,7 +4042,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS2]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK11-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK11-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2
@@ -4094,7 +4094,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[TMP49:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS8]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP50:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS9]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP51:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP51]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP51]], align 4
 // CHECK11-NEXT:    [[TMP52:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 2, ptr [[TMP52]], align 4
 // CHECK11-NEXT:    [[TMP53:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_teams_distribute_parallel_for_firstprivate_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_parallel_for_firstprivate_codegen.cpp
index 375279d9636080..bf1fc1d7cbae5f 100644
--- a/clang/test/OpenMP/target_teams_distribute_parallel_for_firstprivate_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_distribute_parallel_for_firstprivate_codegen.cpp
@@ -364,7 +364,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP21]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP21]], align 4
 // CHECK1-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 5, ptr [[TMP22]], align 4
 // CHECK1-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -775,7 +775,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP18]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP18]], align 4
 // CHECK1-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 4, ptr [[TMP19]], align 4
 // CHECK1-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1426,7 +1426,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP21]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP21]], align 4
 // CHECK3-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 5, ptr [[TMP22]], align 4
 // CHECK3-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1831,7 +1831,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP18]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP18]], align 4
 // CHECK3-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 4, ptr [[TMP19]], align 4
 // CHECK3-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_teams_distribute_parallel_for_if_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_parallel_for_if_codegen.cpp
index 9baf13385bef5b..9be3ca8fd7587c 100644
--- a/clang/test/OpenMP/target_teams_distribute_parallel_for_if_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_distribute_parallel_for_if_codegen.cpp
@@ -113,7 +113,7 @@ int main() {
 // CHECK1-NEXT:    [[_TMP1:%.*]] = alloca i32, align 4
 // CHECK1-NEXT:    [[KERNEL_ARGS2:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -146,7 +146,7 @@ int main() {
 // CHECK1-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK1:       omp_offload.cont:
 // CHECK1-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP16]], align 4
 // CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -476,7 +476,7 @@ int main() {
 // CHECK1-NEXT:    [[KERNEL_ARGS6:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK1-NEXT:    store i32 0, ptr [[RETVAL]], align 4
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -535,7 +535,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP25:%.*]] = select i1 [[TOBOOL4]], i32 0, i32 1
 // CHECK1-NEXT:    [[TMP26:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP25]], 0
 // CHECK1-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP27]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP27]], align 4
 // CHECK1-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP28]], align 4
 // CHECK1-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 2
@@ -1035,7 +1035,7 @@ int main() {
 // CHECK1-NEXT:    [[KERNEL_ARGS5:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK1-NEXT:    store i32 [[ARG]], ptr [[ARG_ADDR]], align 4
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1090,7 +1090,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP24:%.*]] = select i1 [[TOBOOL3]], i32 0, i32 1
 // CHECK1-NEXT:    [[TMP25:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP24]], 0
 // CHECK1-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP26]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP26]], align 4
 // CHECK1-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP27]], align 4
 // CHECK1-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_teams_distribute_parallel_for_lastprivate_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_parallel_for_lastprivate_codegen.cpp
index e54c230eea7eef..ae900d2dc53b9c 100644
--- a/clang/test/OpenMP/target_teams_distribute_parallel_for_lastprivate_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_distribute_parallel_for_lastprivate_codegen.cpp
@@ -783,7 +783,7 @@ int main() {
 // CHECK5-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK5-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 5, ptr [[TMP24]], align 4
 // CHECK5-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1252,7 +1252,7 @@ int main() {
 // CHECK5-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP18]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP18]], align 4
 // CHECK5-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 4, ptr [[TMP19]], align 4
 // CHECK5-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1783,7 +1783,7 @@ int main() {
 // CHECK7-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK7-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK7-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK7-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK7-NEXT:    store i32 5, ptr [[TMP24]], align 4
 // CHECK7-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2246,7 +2246,7 @@ int main() {
 // CHECK7-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK7-NEXT:    store i32 2, ptr [[TMP18]], align 4
+// CHECK7-NEXT:    store i32 3, ptr [[TMP18]], align 4
 // CHECK7-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK7-NEXT:    store i32 4, ptr [[TMP19]], align 4
 // CHECK7-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_teams_distribute_parallel_for_order_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_parallel_for_order_codegen.cpp
index feab9cdc948bcb..a64110f581df8e 100644
--- a/clang/test/OpenMP/target_teams_distribute_parallel_for_order_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_distribute_parallel_for_order_codegen.cpp
@@ -27,7 +27,7 @@ void gtid_test() {
 // CHECK1-NEXT:    [[TMP:%.*]] = alloca i32, align 4
 // CHECK1-NEXT:    [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_teams_distribute_parallel_for_private_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_parallel_for_private_codegen.cpp
index 066d926e5abcbf..49e4681c051530 100644
--- a/clang/test/OpenMP/target_teams_distribute_parallel_for_private_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_distribute_parallel_for_private_codegen.cpp
@@ -301,7 +301,7 @@ int main() {
 // CHECK1-NEXT:    [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
 // CHECK1-NEXT:    store i32 0, ptr [[RETVAL]], align 4
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -571,7 +571,7 @@ int main() {
 // CHECK1-NEXT:    store ptr [[TEST]], ptr [[VAR]], align 8
 // CHECK1-NEXT:    store ptr undef, ptr [[_TMP1]], align 8
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1034,7 +1034,7 @@ int main() {
 // CHECK3-NEXT:    [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
 // CHECK3-NEXT:    store i32 0, ptr [[RETVAL]], align 4
 // CHECK3-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK3-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK3-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1298,7 +1298,7 @@ int main() {
 // CHECK3-NEXT:    store ptr [[TEST]], ptr [[VAR]], align 4
 // CHECK3-NEXT:    store ptr undef, ptr [[_TMP1]], align 4
 // CHECK3-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK3-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK3-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_teams_distribute_parallel_for_proc_bind_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_parallel_for_proc_bind_codegen.cpp
index 9f3e50fe20a62a..caac6edfc51dd1 100644
--- a/clang/test/OpenMP/target_teams_distribute_parallel_for_proc_bind_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_distribute_parallel_for_proc_bind_codegen.cpp
@@ -58,7 +58,7 @@ int main() {
 // CHECK1-NEXT:    [[KERNEL_ARGS2:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK1-NEXT:    store i32 0, ptr [[RETVAL]], align 4
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -91,7 +91,7 @@ int main() {
 // CHECK1-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK1:       omp_offload.cont:
 // CHECK1-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP16]], align 4
 // CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -409,7 +409,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP:%.*]] = alloca i32, align 4
 // CHECK1-NEXT:    [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_teams_distribute_parallel_for_reduction_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_parallel_for_reduction_codegen.cpp
index 1036ffd195de00..d5f1b0b9e59dcc 100644
--- a/clang/test/OpenMP/target_teams_distribute_parallel_for_reduction_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_distribute_parallel_for_reduction_codegen.cpp
@@ -109,7 +109,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -400,7 +400,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -688,7 +688,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -975,7 +975,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_teams_distribute_parallel_for_schedule_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_parallel_for_schedule_codegen.cpp
index 3d031e09aa07d6..fb0b2893d95290 100644
--- a/clang/test/OpenMP/target_teams_distribute_parallel_for_schedule_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_distribute_parallel_for_schedule_codegen.cpp
@@ -243,7 +243,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -285,7 +285,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS3]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS4]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK1-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK1-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 2
@@ -327,7 +327,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS11]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS12]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP45]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP45]], align 4
 // CHECK1-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP46]], align 4
 // CHECK1-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -369,7 +369,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP63:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS19]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP64:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS20]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP65:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP65]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP65]], align 4
 // CHECK1-NEXT:    [[TMP66:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP66]], align 4
 // CHECK1-NEXT:    [[TMP67:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 2
@@ -411,7 +411,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP83:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS27]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP84:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS28]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP85:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP85]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP85]], align 4
 // CHECK1-NEXT:    [[TMP86:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP86]], align 4
 // CHECK1-NEXT:    [[TMP87:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 2
@@ -1270,7 +1270,7 @@ int main (int argc, char **argv) {
 // CHECK3-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1312,7 +1312,7 @@ int main (int argc, char **argv) {
 // CHECK3-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS3]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS4]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK3-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK3-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 2
@@ -1354,7 +1354,7 @@ int main (int argc, char **argv) {
 // CHECK3-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS11]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS12]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP45]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP45]], align 4
 // CHECK3-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP46]], align 4
 // CHECK3-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -1396,7 +1396,7 @@ int main (int argc, char **argv) {
 // CHECK3-NEXT:    [[TMP63:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS19]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP64:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS20]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP65:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP65]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP65]], align 4
 // CHECK3-NEXT:    [[TMP66:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP66]], align 4
 // CHECK3-NEXT:    [[TMP67:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 2
@@ -1438,7 +1438,7 @@ int main (int argc, char **argv) {
 // CHECK3-NEXT:    [[TMP83:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS27]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP84:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS28]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP85:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP85]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP85]], align 4
 // CHECK3-NEXT:    [[TMP86:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP86]], align 4
 // CHECK3-NEXT:    [[TMP87:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 2
@@ -2270,7 +2270,7 @@ int main (int argc, char **argv) {
 // CHECK5-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK5-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK5-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2312,7 +2312,7 @@ int main (int argc, char **argv) {
 // CHECK5-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS3]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS4]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK5-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK5-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 2
@@ -2354,7 +2354,7 @@ int main (int argc, char **argv) {
 // CHECK5-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS11]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS12]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP45]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP45]], align 4
 // CHECK5-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 1, ptr [[TMP46]], align 4
 // CHECK5-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -2396,7 +2396,7 @@ int main (int argc, char **argv) {
 // CHECK5-NEXT:    [[TMP63:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS19]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP64:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS20]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP65:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP65]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP65]], align 4
 // CHECK5-NEXT:    [[TMP66:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 1, ptr [[TMP66]], align 4
 // CHECK5-NEXT:    [[TMP67:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 2
@@ -2438,7 +2438,7 @@ int main (int argc, char **argv) {
 // CHECK5-NEXT:    [[TMP83:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS27]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP84:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS28]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP85:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP85]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP85]], align 4
 // CHECK5-NEXT:    [[TMP86:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 1, ptr [[TMP86]], align 4
 // CHECK5-NEXT:    [[TMP87:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 2
@@ -3297,7 +3297,7 @@ int main (int argc, char **argv) {
 // CHECK7-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK7-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK7-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK7-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK7-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK7-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -3339,7 +3339,7 @@ int main (int argc, char **argv) {
 // CHECK7-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS3]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS4]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 0
-// CHECK7-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK7-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK7-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 1
 // CHECK7-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK7-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 2
@@ -3381,7 +3381,7 @@ int main (int argc, char **argv) {
 // CHECK7-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS11]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS12]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK7-NEXT:    store i32 2, ptr [[TMP45]], align 4
+// CHECK7-NEXT:    store i32 3, ptr [[TMP45]], align 4
 // CHECK7-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK7-NEXT:    store i32 1, ptr [[TMP46]], align 4
 // CHECK7-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -3423,7 +3423,7 @@ int main (int argc, char **argv) {
 // CHECK7-NEXT:    [[TMP63:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS19]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP64:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS20]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP65:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 0
-// CHECK7-NEXT:    store i32 2, ptr [[TMP65]], align 4
+// CHECK7-NEXT:    store i32 3, ptr [[TMP65]], align 4
 // CHECK7-NEXT:    [[TMP66:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 1
 // CHECK7-NEXT:    store i32 1, ptr [[TMP66]], align 4
 // CHECK7-NEXT:    [[TMP67:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 2
@@ -3465,7 +3465,7 @@ int main (int argc, char **argv) {
 // CHECK7-NEXT:    [[TMP83:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS27]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP84:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS28]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP85:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 0
-// CHECK7-NEXT:    store i32 2, ptr [[TMP85]], align 4
+// CHECK7-NEXT:    store i32 3, ptr [[TMP85]], align 4
 // CHECK7-NEXT:    [[TMP86:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 1
 // CHECK7-NEXT:    store i32 1, ptr [[TMP86]], align 4
 // CHECK7-NEXT:    [[TMP87:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 2
@@ -4357,7 +4357,7 @@ int main (int argc, char **argv) {
 // CHECK13-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP21]], 1
 // CHECK13-NEXT:    [[TMP22:%.*]] = zext i32 [[ADD]] to i64
 // CHECK13-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK13-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK13-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK13-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK13-NEXT:    store i32 3, ptr [[TMP24]], align 4
 // CHECK13-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -4428,7 +4428,7 @@ int main (int argc, char **argv) {
 // CHECK13-NEXT:    [[ADD14:%.*]] = add nsw i32 [[TMP56]], 1
 // CHECK13-NEXT:    [[TMP57:%.*]] = zext i32 [[ADD14]] to i64
 // CHECK13-NEXT:    [[TMP58:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK13-NEXT:    store i32 2, ptr [[TMP58]], align 4
+// CHECK13-NEXT:    store i32 3, ptr [[TMP58]], align 4
 // CHECK13-NEXT:    [[TMP59:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK13-NEXT:    store i32 3, ptr [[TMP59]], align 4
 // CHECK13-NEXT:    [[TMP60:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -4510,7 +4510,7 @@ int main (int argc, char **argv) {
 // CHECK13-NEXT:    [[ADD30:%.*]] = add nsw i32 [[TMP97]], 1
 // CHECK13-NEXT:    [[TMP98:%.*]] = zext i32 [[ADD30]] to i64
 // CHECK13-NEXT:    [[TMP99:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 0
-// CHECK13-NEXT:    store i32 2, ptr [[TMP99]], align 4
+// CHECK13-NEXT:    store i32 3, ptr [[TMP99]], align 4
 // CHECK13-NEXT:    [[TMP100:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 1
 // CHECK13-NEXT:    store i32 4, ptr [[TMP100]], align 4
 // CHECK13-NEXT:    [[TMP101:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 2
@@ -4581,7 +4581,7 @@ int main (int argc, char **argv) {
 // CHECK13-NEXT:    [[ADD45:%.*]] = add nsw i32 [[TMP132]], 1
 // CHECK13-NEXT:    [[TMP133:%.*]] = zext i32 [[ADD45]] to i64
 // CHECK13-NEXT:    [[TMP134:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS46]], i32 0, i32 0
-// CHECK13-NEXT:    store i32 2, ptr [[TMP134]], align 4
+// CHECK13-NEXT:    store i32 3, ptr [[TMP134]], align 4
 // CHECK13-NEXT:    [[TMP135:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS46]], i32 0, i32 1
 // CHECK13-NEXT:    store i32 3, ptr [[TMP135]], align 4
 // CHECK13-NEXT:    [[TMP136:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS46]], i32 0, i32 2
@@ -4663,7 +4663,7 @@ int main (int argc, char **argv) {
 // CHECK13-NEXT:    [[ADD62:%.*]] = add nsw i32 [[TMP173]], 1
 // CHECK13-NEXT:    [[TMP174:%.*]] = zext i32 [[ADD62]] to i64
 // CHECK13-NEXT:    [[TMP175:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS63]], i32 0, i32 0
-// CHECK13-NEXT:    store i32 2, ptr [[TMP175]], align 4
+// CHECK13-NEXT:    store i32 3, ptr [[TMP175]], align 4
 // CHECK13-NEXT:    [[TMP176:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS63]], i32 0, i32 1
 // CHECK13-NEXT:    store i32 4, ptr [[TMP176]], align 4
 // CHECK13-NEXT:    [[TMP177:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS63]], i32 0, i32 2
@@ -5881,7 +5881,7 @@ int main (int argc, char **argv) {
 // CHECK13-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK13-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK13-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK13-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK13-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK13-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK13-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK13-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -5922,7 +5922,7 @@ int main (int argc, char **argv) {
 // CHECK13-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0
 // CHECK13-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS2]], i32 0, i32 0
 // CHECK13-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0
-// CHECK13-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK13-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK13-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1
 // CHECK13-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK13-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2
@@ -5974,7 +5974,7 @@ int main (int argc, char **argv) {
 // CHECK13-NEXT:    [[TMP49:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS8]], i32 0, i32 0
 // CHECK13-NEXT:    [[TMP50:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS9]], i32 0, i32 0
 // CHECK13-NEXT:    [[TMP51:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 0
-// CHECK13-NEXT:    store i32 2, ptr [[TMP51]], align 4
+// CHECK13-NEXT:    store i32 3, ptr [[TMP51]], align 4
 // CHECK13-NEXT:    [[TMP52:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 1
 // CHECK13-NEXT:    store i32 2, ptr [[TMP52]], align 4
 // CHECK13-NEXT:    [[TMP53:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 2
@@ -6015,7 +6015,7 @@ int main (int argc, char **argv) {
 // CHECK13-NEXT:    [[TMP69:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS15]], i32 0, i32 0
 // CHECK13-NEXT:    [[TMP70:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS16]], i32 0, i32 0
 // CHECK13-NEXT:    [[TMP71:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS19]], i32 0, i32 0
-// CHECK13-NEXT:    store i32 2, ptr [[TMP71]], align 4
+// CHECK13-NEXT:    store i32 3, ptr [[TMP71]], align 4
 // CHECK13-NEXT:    [[TMP72:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS19]], i32 0, i32 1
 // CHECK13-NEXT:    store i32 1, ptr [[TMP72]], align 4
 // CHECK13-NEXT:    [[TMP73:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS19]], i32 0, i32 2
@@ -6067,7 +6067,7 @@ int main (int argc, char **argv) {
 // CHECK13-NEXT:    [[TMP95:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS24]], i32 0, i32 0
 // CHECK13-NEXT:    [[TMP96:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS25]], i32 0, i32 0
 // CHECK13-NEXT:    [[TMP97:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS28]], i32 0, i32 0
-// CHECK13-NEXT:    store i32 2, ptr [[TMP97]], align 4
+// CHECK13-NEXT:    store i32 3, ptr [[TMP97]], align 4
 // CHECK13-NEXT:    [[TMP98:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS28]], i32 0, i32 1
 // CHECK13-NEXT:    store i32 2, ptr [[TMP98]], align 4
 // CHECK13-NEXT:    [[TMP99:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS28]], i32 0, i32 2
@@ -7008,7 +7008,7 @@ int main (int argc, char **argv) {
 // CHECK15-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP21]], 1
 // CHECK15-NEXT:    [[TMP22:%.*]] = zext i32 [[ADD]] to i64
 // CHECK15-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK15-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK15-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK15-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK15-NEXT:    store i32 3, ptr [[TMP24]], align 4
 // CHECK15-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -7080,7 +7080,7 @@ int main (int argc, char **argv) {
 // CHECK15-NEXT:    [[ADD14:%.*]] = add nsw i32 [[TMP57]], 1
 // CHECK15-NEXT:    [[TMP58:%.*]] = zext i32 [[ADD14]] to i64
 // CHECK15-NEXT:    [[TMP59:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK15-NEXT:    store i32 2, ptr [[TMP59]], align 4
+// CHECK15-NEXT:    store i32 3, ptr [[TMP59]], align 4
 // CHECK15-NEXT:    [[TMP60:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK15-NEXT:    store i32 3, ptr [[TMP60]], align 4
 // CHECK15-NEXT:    [[TMP61:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -7163,7 +7163,7 @@ int main (int argc, char **argv) {
 // CHECK15-NEXT:    [[ADD30:%.*]] = add nsw i32 [[TMP99]], 1
 // CHECK15-NEXT:    [[TMP100:%.*]] = zext i32 [[ADD30]] to i64
 // CHECK15-NEXT:    [[TMP101:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 0
-// CHECK15-NEXT:    store i32 2, ptr [[TMP101]], align 4
+// CHECK15-NEXT:    store i32 3, ptr [[TMP101]], align 4
 // CHECK15-NEXT:    [[TMP102:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 1
 // CHECK15-NEXT:    store i32 4, ptr [[TMP102]], align 4
 // CHECK15-NEXT:    [[TMP103:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 2
@@ -7235,7 +7235,7 @@ int main (int argc, char **argv) {
 // CHECK15-NEXT:    [[ADD45:%.*]] = add nsw i32 [[TMP135]], 1
 // CHECK15-NEXT:    [[TMP136:%.*]] = zext i32 [[ADD45]] to i64
 // CHECK15-NEXT:    [[TMP137:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS46]], i32 0, i32 0
-// CHECK15-NEXT:    store i32 2, ptr [[TMP137]], align 4
+// CHECK15-NEXT:    store i32 3, ptr [[TMP137]], align 4
 // CHECK15-NEXT:    [[TMP138:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS46]], i32 0, i32 1
 // CHECK15-NEXT:    store i32 3, ptr [[TMP138]], align 4
 // CHECK15-NEXT:    [[TMP139:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS46]], i32 0, i32 2
@@ -7318,7 +7318,7 @@ int main (int argc, char **argv) {
 // CHECK15-NEXT:    [[ADD62:%.*]] = add nsw i32 [[TMP177]], 1
 // CHECK15-NEXT:    [[TMP178:%.*]] = zext i32 [[ADD62]] to i64
 // CHECK15-NEXT:    [[TMP179:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS63]], i32 0, i32 0
-// CHECK15-NEXT:    store i32 2, ptr [[TMP179]], align 4
+// CHECK15-NEXT:    store i32 3, ptr [[TMP179]], align 4
 // CHECK15-NEXT:    [[TMP180:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS63]], i32 0, i32 1
 // CHECK15-NEXT:    store i32 4, ptr [[TMP180]], align 4
 // CHECK15-NEXT:    [[TMP181:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS63]], i32 0, i32 2
@@ -8511,7 +8511,7 @@ int main (int argc, char **argv) {
 // CHECK15-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK15-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK15-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK15-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK15-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK15-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK15-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK15-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -8552,7 +8552,7 @@ int main (int argc, char **argv) {
 // CHECK15-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0
 // CHECK15-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS2]], i32 0, i32 0
 // CHECK15-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0
-// CHECK15-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK15-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK15-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1
 // CHECK15-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK15-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2
@@ -8604,7 +8604,7 @@ int main (int argc, char **argv) {
 // CHECK15-NEXT:    [[TMP49:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS8]], i32 0, i32 0
 // CHECK15-NEXT:    [[TMP50:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS9]], i32 0, i32 0
 // CHECK15-NEXT:    [[TMP51:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 0
-// CHECK15-NEXT:    store i32 2, ptr [[TMP51]], align 4
+// CHECK15-NEXT:    store i32 3, ptr [[TMP51]], align 4
 // CHECK15-NEXT:    [[TMP52:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 1
 // CHECK15-NEXT:    store i32 2, ptr [[TMP52]], align 4
 // CHECK15-NEXT:    [[TMP53:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 2
@@ -8645,7 +8645,7 @@ int main (int argc, char **argv) {
 // CHECK15-NEXT:    [[TMP69:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS15]], i32 0, i32 0
 // CHECK15-NEXT:    [[TMP70:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS16]], i32 0, i32 0
 // CHECK15-NEXT:    [[TMP71:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS19]], i32 0, i32 0
-// CHECK15-NEXT:    store i32 2, ptr [[TMP71]], align 4
+// CHECK15-NEXT:    store i32 3, ptr [[TMP71]], align 4
 // CHECK15-NEXT:    [[TMP72:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS19]], i32 0, i32 1
 // CHECK15-NEXT:    store i32 1, ptr [[TMP72]], align 4
 // CHECK15-NEXT:    [[TMP73:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS19]], i32 0, i32 2
@@ -8697,7 +8697,7 @@ int main (int argc, char **argv) {
 // CHECK15-NEXT:    [[TMP95:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS24]], i32 0, i32 0
 // CHECK15-NEXT:    [[TMP96:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS25]], i32 0, i32 0
 // CHECK15-NEXT:    [[TMP97:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS28]], i32 0, i32 0
-// CHECK15-NEXT:    store i32 2, ptr [[TMP97]], align 4
+// CHECK15-NEXT:    store i32 3, ptr [[TMP97]], align 4
 // CHECK15-NEXT:    [[TMP98:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS28]], i32 0, i32 1
 // CHECK15-NEXT:    store i32 2, ptr [[TMP98]], align 4
 // CHECK15-NEXT:    [[TMP99:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS28]], i32 0, i32 2
@@ -9611,7 +9611,7 @@ int main (int argc, char **argv) {
 // CHECK17-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP21]], 1
 // CHECK17-NEXT:    [[TMP22:%.*]] = zext i32 [[ADD]] to i64
 // CHECK17-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK17-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK17-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK17-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK17-NEXT:    store i32 3, ptr [[TMP24]], align 4
 // CHECK17-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -9682,7 +9682,7 @@ int main (int argc, char **argv) {
 // CHECK17-NEXT:    [[ADD14:%.*]] = add nsw i32 [[TMP56]], 1
 // CHECK17-NEXT:    [[TMP57:%.*]] = zext i32 [[ADD14]] to i64
 // CHECK17-NEXT:    [[TMP58:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK17-NEXT:    store i32 2, ptr [[TMP58]], align 4
+// CHECK17-NEXT:    store i32 3, ptr [[TMP58]], align 4
 // CHECK17-NEXT:    [[TMP59:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK17-NEXT:    store i32 3, ptr [[TMP59]], align 4
 // CHECK17-NEXT:    [[TMP60:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -9764,7 +9764,7 @@ int main (int argc, char **argv) {
 // CHECK17-NEXT:    [[ADD30:%.*]] = add nsw i32 [[TMP97]], 1
 // CHECK17-NEXT:    [[TMP98:%.*]] = zext i32 [[ADD30]] to i64
 // CHECK17-NEXT:    [[TMP99:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 0
-// CHECK17-NEXT:    store i32 2, ptr [[TMP99]], align 4
+// CHECK17-NEXT:    store i32 3, ptr [[TMP99]], align 4
 // CHECK17-NEXT:    [[TMP100:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 1
 // CHECK17-NEXT:    store i32 4, ptr [[TMP100]], align 4
 // CHECK17-NEXT:    [[TMP101:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 2
@@ -9835,7 +9835,7 @@ int main (int argc, char **argv) {
 // CHECK17-NEXT:    [[ADD45:%.*]] = add nsw i32 [[TMP132]], 1
 // CHECK17-NEXT:    [[TMP133:%.*]] = zext i32 [[ADD45]] to i64
 // CHECK17-NEXT:    [[TMP134:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS46]], i32 0, i32 0
-// CHECK17-NEXT:    store i32 2, ptr [[TMP134]], align 4
+// CHECK17-NEXT:    store i32 3, ptr [[TMP134]], align 4
 // CHECK17-NEXT:    [[TMP135:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS46]], i32 0, i32 1
 // CHECK17-NEXT:    store i32 3, ptr [[TMP135]], align 4
 // CHECK17-NEXT:    [[TMP136:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS46]], i32 0, i32 2
@@ -9917,7 +9917,7 @@ int main (int argc, char **argv) {
 // CHECK17-NEXT:    [[ADD62:%.*]] = add nsw i32 [[TMP173]], 1
 // CHECK17-NEXT:    [[TMP174:%.*]] = zext i32 [[ADD62]] to i64
 // CHECK17-NEXT:    [[TMP175:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS63]], i32 0, i32 0
-// CHECK17-NEXT:    store i32 2, ptr [[TMP175]], align 4
+// CHECK17-NEXT:    store i32 3, ptr [[TMP175]], align 4
 // CHECK17-NEXT:    [[TMP176:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS63]], i32 0, i32 1
 // CHECK17-NEXT:    store i32 4, ptr [[TMP176]], align 4
 // CHECK17-NEXT:    [[TMP177:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS63]], i32 0, i32 2
@@ -11135,7 +11135,7 @@ int main (int argc, char **argv) {
 // CHECK17-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK17-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK17-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK17-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK17-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK17-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -11176,7 +11176,7 @@ int main (int argc, char **argv) {
 // CHECK17-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS2]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0
-// CHECK17-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK17-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK17-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1
 // CHECK17-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK17-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2
@@ -11228,7 +11228,7 @@ int main (int argc, char **argv) {
 // CHECK17-NEXT:    [[TMP49:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS8]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP50:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS9]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP51:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 0
-// CHECK17-NEXT:    store i32 2, ptr [[TMP51]], align 4
+// CHECK17-NEXT:    store i32 3, ptr [[TMP51]], align 4
 // CHECK17-NEXT:    [[TMP52:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 1
 // CHECK17-NEXT:    store i32 2, ptr [[TMP52]], align 4
 // CHECK17-NEXT:    [[TMP53:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 2
@@ -11269,7 +11269,7 @@ int main (int argc, char **argv) {
 // CHECK17-NEXT:    [[TMP69:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS15]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP70:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS16]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP71:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS19]], i32 0, i32 0
-// CHECK17-NEXT:    store i32 2, ptr [[TMP71]], align 4
+// CHECK17-NEXT:    store i32 3, ptr [[TMP71]], align 4
 // CHECK17-NEXT:    [[TMP72:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS19]], i32 0, i32 1
 // CHECK17-NEXT:    store i32 1, ptr [[TMP72]], align 4
 // CHECK17-NEXT:    [[TMP73:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS19]], i32 0, i32 2
@@ -11321,7 +11321,7 @@ int main (int argc, char **argv) {
 // CHECK17-NEXT:    [[TMP95:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS24]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP96:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS25]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP97:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS28]], i32 0, i32 0
-// CHECK17-NEXT:    store i32 2, ptr [[TMP97]], align 4
+// CHECK17-NEXT:    store i32 3, ptr [[TMP97]], align 4
 // CHECK17-NEXT:    [[TMP98:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS28]], i32 0, i32 1
 // CHECK17-NEXT:    store i32 2, ptr [[TMP98]], align 4
 // CHECK17-NEXT:    [[TMP99:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS28]], i32 0, i32 2
@@ -12262,7 +12262,7 @@ int main (int argc, char **argv) {
 // CHECK19-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP21]], 1
 // CHECK19-NEXT:    [[TMP22:%.*]] = zext i32 [[ADD]] to i64
 // CHECK19-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK19-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK19-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK19-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK19-NEXT:    store i32 3, ptr [[TMP24]], align 4
 // CHECK19-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -12334,7 +12334,7 @@ int main (int argc, char **argv) {
 // CHECK19-NEXT:    [[ADD14:%.*]] = add nsw i32 [[TMP57]], 1
 // CHECK19-NEXT:    [[TMP58:%.*]] = zext i32 [[ADD14]] to i64
 // CHECK19-NEXT:    [[TMP59:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK19-NEXT:    store i32 2, ptr [[TMP59]], align 4
+// CHECK19-NEXT:    store i32 3, ptr [[TMP59]], align 4
 // CHECK19-NEXT:    [[TMP60:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK19-NEXT:    store i32 3, ptr [[TMP60]], align 4
 // CHECK19-NEXT:    [[TMP61:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -12417,7 +12417,7 @@ int main (int argc, char **argv) {
 // CHECK19-NEXT:    [[ADD30:%.*]] = add nsw i32 [[TMP99]], 1
 // CHECK19-NEXT:    [[TMP100:%.*]] = zext i32 [[ADD30]] to i64
 // CHECK19-NEXT:    [[TMP101:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 0
-// CHECK19-NEXT:    store i32 2, ptr [[TMP101]], align 4
+// CHECK19-NEXT:    store i32 3, ptr [[TMP101]], align 4
 // CHECK19-NEXT:    [[TMP102:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 1
 // CHECK19-NEXT:    store i32 4, ptr [[TMP102]], align 4
 // CHECK19-NEXT:    [[TMP103:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 2
@@ -12489,7 +12489,7 @@ int main (int argc, char **argv) {
 // CHECK19-NEXT:    [[ADD45:%.*]] = add nsw i32 [[TMP135]], 1
 // CHECK19-NEXT:    [[TMP136:%.*]] = zext i32 [[ADD45]] to i64
 // CHECK19-NEXT:    [[TMP137:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS46]], i32 0, i32 0
-// CHECK19-NEXT:    store i32 2, ptr [[TMP137]], align 4
+// CHECK19-NEXT:    store i32 3, ptr [[TMP137]], align 4
 // CHECK19-NEXT:    [[TMP138:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS46]], i32 0, i32 1
 // CHECK19-NEXT:    store i32 3, ptr [[TMP138]], align 4
 // CHECK19-NEXT:    [[TMP139:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS46]], i32 0, i32 2
@@ -12572,7 +12572,7 @@ int main (int argc, char **argv) {
 // CHECK19-NEXT:    [[ADD62:%.*]] = add nsw i32 [[TMP177]], 1
 // CHECK19-NEXT:    [[TMP178:%.*]] = zext i32 [[ADD62]] to i64
 // CHECK19-NEXT:    [[TMP179:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS63]], i32 0, i32 0
-// CHECK19-NEXT:    store i32 2, ptr [[TMP179]], align 4
+// CHECK19-NEXT:    store i32 3, ptr [[TMP179]], align 4
 // CHECK19-NEXT:    [[TMP180:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS63]], i32 0, i32 1
 // CHECK19-NEXT:    store i32 4, ptr [[TMP180]], align 4
 // CHECK19-NEXT:    [[TMP181:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS63]], i32 0, i32 2
@@ -13765,7 +13765,7 @@ int main (int argc, char **argv) {
 // CHECK19-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK19-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK19-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK19-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK19-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK19-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -13806,7 +13806,7 @@ int main (int argc, char **argv) {
 // CHECK19-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS2]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0
-// CHECK19-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK19-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK19-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1
 // CHECK19-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK19-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2
@@ -13858,7 +13858,7 @@ int main (int argc, char **argv) {
 // CHECK19-NEXT:    [[TMP49:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS8]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP50:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS9]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP51:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 0
-// CHECK19-NEXT:    store i32 2, ptr [[TMP51]], align 4
+// CHECK19-NEXT:    store i32 3, ptr [[TMP51]], align 4
 // CHECK19-NEXT:    [[TMP52:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 1
 // CHECK19-NEXT:    store i32 2, ptr [[TMP52]], align 4
 // CHECK19-NEXT:    [[TMP53:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 2
@@ -13899,7 +13899,7 @@ int main (int argc, char **argv) {
 // CHECK19-NEXT:    [[TMP69:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS15]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP70:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS16]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP71:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS19]], i32 0, i32 0
-// CHECK19-NEXT:    store i32 2, ptr [[TMP71]], align 4
+// CHECK19-NEXT:    store i32 3, ptr [[TMP71]], align 4
 // CHECK19-NEXT:    [[TMP72:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS19]], i32 0, i32 1
 // CHECK19-NEXT:    store i32 1, ptr [[TMP72]], align 4
 // CHECK19-NEXT:    [[TMP73:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS19]], i32 0, i32 2
@@ -13951,7 +13951,7 @@ int main (int argc, char **argv) {
 // CHECK19-NEXT:    [[TMP95:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS24]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP96:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS25]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP97:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS28]], i32 0, i32 0
-// CHECK19-NEXT:    store i32 2, ptr [[TMP97]], align 4
+// CHECK19-NEXT:    store i32 3, ptr [[TMP97]], align 4
 // CHECK19-NEXT:    [[TMP98:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS28]], i32 0, i32 1
 // CHECK19-NEXT:    store i32 2, ptr [[TMP98]], align 4
 // CHECK19-NEXT:    [[TMP99:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS28]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_codegen.cpp
index 109063c623a11d..a4119a07f61f2d 100644
--- a/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_codegen.cpp
@@ -181,7 +181,7 @@ void test_target_teams_atomic() {
 // CHECK1-NEXT:    [[TMP34:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP28]], 0
 // CHECK1-NEXT:    [[TMP35:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP29]], 0
 // CHECK1-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP36]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP36]], align 4
 // CHECK1-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 5, ptr [[TMP37]], align 4
 // CHECK1-NEXT:    [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -248,7 +248,7 @@ void test_target_teams_atomic() {
 // CHECK1-NEXT:    [[ADD17:%.*]] = add nsw i32 [[TMP67]], 1
 // CHECK1-NEXT:    [[TMP68:%.*]] = zext i32 [[ADD17]] to i64
 // CHECK1-NEXT:    [[TMP69:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS18]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP69]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP69]], align 4
 // CHECK1-NEXT:    [[TMP70:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS18]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 3, ptr [[TMP70]], align 4
 // CHECK1-NEXT:    [[TMP71:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS18]], i32 0, i32 2
@@ -821,7 +821,7 @@ void test_target_teams_atomic() {
 // CHECK1-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1116,7 +1116,7 @@ void test_target_teams_atomic() {
 // CHECK3-NEXT:    [[TMP34:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP28]], 0
 // CHECK3-NEXT:    [[TMP35:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP29]], 0
 // CHECK3-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP36]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP36]], align 4
 // CHECK3-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 5, ptr [[TMP37]], align 4
 // CHECK3-NEXT:    [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1183,7 +1183,7 @@ void test_target_teams_atomic() {
 // CHECK3-NEXT:    [[ADD17:%.*]] = add nsw i32 [[TMP67]], 1
 // CHECK3-NEXT:    [[TMP68:%.*]] = zext i32 [[ADD17]] to i64
 // CHECK3-NEXT:    [[TMP69:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS18]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP69]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP69]], align 4
 // CHECK3-NEXT:    [[TMP70:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS18]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 3, ptr [[TMP70]], align 4
 // CHECK3-NEXT:    [[TMP71:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS18]], i32 0, i32 2
@@ -1746,7 +1746,7 @@ void test_target_teams_atomic() {
 // CHECK3-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_collapse_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_collapse_codegen.cpp
index 306ec1db2ab215..53a13e129d3b26 100644
--- a/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_collapse_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_collapse_codegen.cpp
@@ -129,7 +129,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -381,7 +381,7 @@ int main (int argc, char **argv) {
 // CHECK3-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -837,7 +837,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[TMP35:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_3]], align 8
 // CHECK9-NEXT:    [[ADD:%.*]] = add nsw i64 [[TMP35]], 1
 // CHECK9-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP36]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP36]], align 4
 // CHECK9-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 5, ptr [[TMP37]], align 4
 // CHECK9-NEXT:    [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1235,7 +1235,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK9-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK9-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1551,7 +1551,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[TMP34:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_3]], align 8
 // CHECK11-NEXT:    [[ADD:%.*]] = add nsw i64 [[TMP34]], 1
 // CHECK11-NEXT:    [[TMP35:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP35]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP35]], align 4
 // CHECK11-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 5, ptr [[TMP36]], align 4
 // CHECK11-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1951,7 +1951,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK11-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK11-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_dist_schedule_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_dist_schedule_codegen.cpp
index c265c1c616b14a..d449406927e716 100644
--- a/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_dist_schedule_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_dist_schedule_codegen.cpp
@@ -173,7 +173,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -215,7 +215,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS3]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS4]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK1-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK1-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 2
@@ -257,7 +257,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS11]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS12]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP45]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP45]], align 4
 // CHECK1-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP46]], align 4
 // CHECK1-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -849,7 +849,7 @@ int main (int argc, char **argv) {
 // CHECK3-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -891,7 +891,7 @@ int main (int argc, char **argv) {
 // CHECK3-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS3]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS4]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK3-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK3-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 2
@@ -933,7 +933,7 @@ int main (int argc, char **argv) {
 // CHECK3-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS11]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS12]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP45]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP45]], align 4
 // CHECK3-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP46]], align 4
 // CHECK3-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -1807,7 +1807,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP21]], 1
 // CHECK9-NEXT:    [[TMP22:%.*]] = zext i32 [[ADD]] to i64
 // CHECK9-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK9-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 3, ptr [[TMP24]], align 4
 // CHECK9-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1878,7 +1878,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[ADD14:%.*]] = add nsw i32 [[TMP56]], 1
 // CHECK9-NEXT:    [[TMP57:%.*]] = zext i32 [[ADD14]] to i64
 // CHECK9-NEXT:    [[TMP58:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP58]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP58]], align 4
 // CHECK9-NEXT:    [[TMP59:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 3, ptr [[TMP59]], align 4
 // CHECK9-NEXT:    [[TMP60:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -1960,7 +1960,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[ADD30:%.*]] = add nsw i32 [[TMP97]], 1
 // CHECK9-NEXT:    [[TMP98:%.*]] = zext i32 [[ADD30]] to i64
 // CHECK9-NEXT:    [[TMP99:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP99]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP99]], align 4
 // CHECK9-NEXT:    [[TMP100:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 4, ptr [[TMP100]], align 4
 // CHECK9-NEXT:    [[TMP101:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 2
@@ -2797,7 +2797,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK9-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK9-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2838,7 +2838,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS2]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK9-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK9-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2
@@ -2890,7 +2890,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[TMP49:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS8]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP50:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS9]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP51:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP51]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP51]], align 4
 // CHECK9-NEXT:    [[TMP52:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 2, ptr [[TMP52]], align 4
 // CHECK9-NEXT:    [[TMP53:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 2
@@ -3541,7 +3541,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP21]], 1
 // CHECK11-NEXT:    [[TMP22:%.*]] = zext i32 [[ADD]] to i64
 // CHECK11-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK11-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 3, ptr [[TMP24]], align 4
 // CHECK11-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -3613,7 +3613,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[ADD14:%.*]] = add nsw i32 [[TMP57]], 1
 // CHECK11-NEXT:    [[TMP58:%.*]] = zext i32 [[ADD14]] to i64
 // CHECK11-NEXT:    [[TMP59:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP59]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP59]], align 4
 // CHECK11-NEXT:    [[TMP60:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 3, ptr [[TMP60]], align 4
 // CHECK11-NEXT:    [[TMP61:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -3696,7 +3696,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[ADD30:%.*]] = add nsw i32 [[TMP99]], 1
 // CHECK11-NEXT:    [[TMP100:%.*]] = zext i32 [[ADD30]] to i64
 // CHECK11-NEXT:    [[TMP101:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP101]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP101]], align 4
 // CHECK11-NEXT:    [[TMP102:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 4, ptr [[TMP102]], align 4
 // CHECK11-NEXT:    [[TMP103:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 2
@@ -4518,7 +4518,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK11-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK11-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -4559,7 +4559,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS2]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK11-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK11-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2
@@ -4611,7 +4611,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[TMP49:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS8]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP50:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS9]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP51:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP51]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP51]], align 4
 // CHECK11-NEXT:    [[TMP52:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 2, ptr [[TMP52]], align 4
 // CHECK11-NEXT:    [[TMP53:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_firstprivate_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_firstprivate_codegen.cpp
index 37c1f428ef9a30..ea05aecebf3713 100644
--- a/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_firstprivate_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_firstprivate_codegen.cpp
@@ -362,7 +362,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP21]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP21]], align 4
 // CHECK1-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 5, ptr [[TMP22]], align 4
 // CHECK1-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -787,7 +787,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP18]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP18]], align 4
 // CHECK1-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 4, ptr [[TMP19]], align 4
 // CHECK1-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1452,7 +1452,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP21]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP21]], align 4
 // CHECK3-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 5, ptr [[TMP22]], align 4
 // CHECK3-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1871,7 +1871,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP18]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP18]], align 4
 // CHECK3-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 4, ptr [[TMP19]], align 4
 // CHECK3-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_if_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_if_codegen.cpp
index df5dd7ba680523..71f4506fb6348e 100644
--- a/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_if_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_if_codegen.cpp
@@ -130,7 +130,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP7]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP7]], align 4
 // CHECK1-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP8]], align 4
 // CHECK1-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -163,7 +163,7 @@ int main() {
 // CHECK1-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK1:       omp_offload.cont:
 // CHECK1-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP22]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP22]], align 4
 // CHECK1-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP23]], align 4
 // CHECK1-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -536,7 +536,7 @@ int main() {
 // CHECK1-NEXT:    [[KERNEL_ARGS6:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK1-NEXT:    store i32 0, ptr [[RETVAL]], align 4
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -595,7 +595,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP25:%.*]] = select i1 [[TOBOOL4]], i32 0, i32 1
 // CHECK1-NEXT:    [[TMP26:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP25]], 0
 // CHECK1-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP27]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP27]], align 4
 // CHECK1-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP28]], align 4
 // CHECK1-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 2
@@ -1137,7 +1137,7 @@ int main() {
 // CHECK1-NEXT:    [[KERNEL_ARGS5:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK1-NEXT:    store i32 [[ARG]], ptr [[ARG_ADDR]], align 4
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1192,7 +1192,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP24:%.*]] = select i1 [[TOBOOL3]], i32 0, i32 1
 // CHECK1-NEXT:    [[TMP25:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP24]], 0
 // CHECK1-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP26]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP26]], align 4
 // CHECK1-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP27]], align 4
 // CHECK1-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2
@@ -1735,7 +1735,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP7]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP7]], align 4
 // CHECK3-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP8]], align 4
 // CHECK3-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1768,7 +1768,7 @@ int main() {
 // CHECK3-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK3:       omp_offload.cont:
 // CHECK3-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP22]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP22]], align 4
 // CHECK3-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 0, ptr [[TMP23]], align 4
 // CHECK3-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -2141,7 +2141,7 @@ int main() {
 // CHECK3-NEXT:    [[KERNEL_ARGS6:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK3-NEXT:    store i32 0, ptr [[RETVAL]], align 4
 // CHECK3-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK3-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK3-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2200,7 +2200,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP25:%.*]] = select i1 [[TOBOOL4]], i32 0, i32 1
 // CHECK3-NEXT:    [[TMP26:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP25]], 0
 // CHECK3-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP27]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP27]], align 4
 // CHECK3-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP28]], align 4
 // CHECK3-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 2
@@ -2972,7 +2972,7 @@ int main() {
 // CHECK3-NEXT:    [[KERNEL_ARGS5:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK3-NEXT:    store i32 [[ARG]], ptr [[ARG_ADDR]], align 4
 // CHECK3-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK3-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK3-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -3027,7 +3027,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP24:%.*]] = select i1 [[TOBOOL3]], i32 0, i32 1
 // CHECK3-NEXT:    [[TMP25:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP24]], 0
 // CHECK3-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP26]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP26]], align 4
 // CHECK3-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP27]], align 4
 // CHECK3-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2
@@ -4163,7 +4163,7 @@ int main() {
 // CHECK9-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP7]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP7]], align 4
 // CHECK9-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 1, ptr [[TMP8]], align 4
 // CHECK9-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -4196,7 +4196,7 @@ int main() {
 // CHECK9-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK9:       omp_offload.cont:
 // CHECK9-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP22]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP22]], align 4
 // CHECK9-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 0, ptr [[TMP23]], align 4
 // CHECK9-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -4569,7 +4569,7 @@ int main() {
 // CHECK9-NEXT:    [[KERNEL_ARGS6:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK9-NEXT:    store i32 0, ptr [[RETVAL]], align 4
 // CHECK9-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK9-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK9-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -4628,7 +4628,7 @@ int main() {
 // CHECK9-NEXT:    [[TMP25:%.*]] = select i1 [[TOBOOL4]], i32 0, i32 1
 // CHECK9-NEXT:    [[TMP26:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP25]], 0
 // CHECK9-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP27]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP27]], align 4
 // CHECK9-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 1, ptr [[TMP28]], align 4
 // CHECK9-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 2
@@ -5170,7 +5170,7 @@ int main() {
 // CHECK9-NEXT:    [[KERNEL_ARGS5:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK9-NEXT:    store i32 [[ARG]], ptr [[ARG_ADDR]], align 4
 // CHECK9-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK9-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK9-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -5225,7 +5225,7 @@ int main() {
 // CHECK9-NEXT:    [[TMP24:%.*]] = select i1 [[TOBOOL3]], i32 0, i32 1
 // CHECK9-NEXT:    [[TMP25:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP24]], 0
 // CHECK9-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP26]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP26]], align 4
 // CHECK9-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 1, ptr [[TMP27]], align 4
 // CHECK9-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2
@@ -5768,7 +5768,7 @@ int main() {
 // CHECK11-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP7]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP7]], align 4
 // CHECK11-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 1, ptr [[TMP8]], align 4
 // CHECK11-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -5801,7 +5801,7 @@ int main() {
 // CHECK11-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK11:       omp_offload.cont:
 // CHECK11-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP22]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP22]], align 4
 // CHECK11-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 0, ptr [[TMP23]], align 4
 // CHECK11-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -6174,7 +6174,7 @@ int main() {
 // CHECK11-NEXT:    [[KERNEL_ARGS6:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK11-NEXT:    store i32 0, ptr [[RETVAL]], align 4
 // CHECK11-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK11-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK11-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -6233,7 +6233,7 @@ int main() {
 // CHECK11-NEXT:    [[TMP25:%.*]] = select i1 [[TOBOOL4]], i32 0, i32 1
 // CHECK11-NEXT:    [[TMP26:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP25]], 0
 // CHECK11-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP27]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP27]], align 4
 // CHECK11-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 1, ptr [[TMP28]], align 4
 // CHECK11-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 2
@@ -7005,7 +7005,7 @@ int main() {
 // CHECK11-NEXT:    [[KERNEL_ARGS5:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK11-NEXT:    store i32 [[ARG]], ptr [[ARG_ADDR]], align 4
 // CHECK11-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK11-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK11-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -7060,7 +7060,7 @@ int main() {
 // CHECK11-NEXT:    [[TMP24:%.*]] = select i1 [[TOBOOL3]], i32 0, i32 1
 // CHECK11-NEXT:    [[TMP25:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP24]], 0
 // CHECK11-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP26]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP26]], align 4
 // CHECK11-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 1, ptr [[TMP27]], align 4
 // CHECK11-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_lastprivate_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_lastprivate_codegen.cpp
index 3fdc0c48254108..b2b9e2082fa3a8 100644
--- a/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_lastprivate_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_lastprivate_codegen.cpp
@@ -811,7 +811,7 @@ int main() {
 // CHECK5-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK5-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 5, ptr [[TMP24]], align 4
 // CHECK5-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1294,7 +1294,7 @@ int main() {
 // CHECK5-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP18]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP18]], align 4
 // CHECK5-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 4, ptr [[TMP19]], align 4
 // CHECK5-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1839,7 +1839,7 @@ int main() {
 // CHECK7-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK7-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK7-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK7-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK7-NEXT:    store i32 5, ptr [[TMP24]], align 4
 // CHECK7-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2316,7 +2316,7 @@ int main() {
 // CHECK7-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK7-NEXT:    store i32 2, ptr [[TMP18]], align 4
+// CHECK7-NEXT:    store i32 3, ptr [[TMP18]], align 4
 // CHECK7-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK7-NEXT:    store i32 4, ptr [[TMP19]], align 4
 // CHECK7-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_private_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_private_codegen.cpp
index bba97a750adc62..0cc07c86aaea7d 100644
--- a/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_private_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_private_codegen.cpp
@@ -301,7 +301,7 @@ int main() {
 // CHECK1-NEXT:    [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
 // CHECK1-NEXT:    store i32 0, ptr [[RETVAL]], align 4
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -585,7 +585,7 @@ int main() {
 // CHECK1-NEXT:    store ptr [[TEST]], ptr [[VAR]], align 8
 // CHECK1-NEXT:    store ptr undef, ptr [[_TMP1]], align 8
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1062,7 +1062,7 @@ int main() {
 // CHECK3-NEXT:    [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
 // CHECK3-NEXT:    store i32 0, ptr [[RETVAL]], align 4
 // CHECK3-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK3-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK3-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1340,7 +1340,7 @@ int main() {
 // CHECK3-NEXT:    store ptr [[TEST]], ptr [[VAR]], align 4
 // CHECK3-NEXT:    store ptr undef, ptr [[_TMP1]], align 4
 // CHECK3-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK3-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK3-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_proc_bind_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_proc_bind_codegen.cpp
index c347743bd4fdd7..7619bf7502e788 100644
--- a/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_proc_bind_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_proc_bind_codegen.cpp
@@ -58,7 +58,7 @@ int main() {
 // CHECK1-NEXT:    [[KERNEL_ARGS2:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK1-NEXT:    store i32 0, ptr [[RETVAL]], align 4
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -91,7 +91,7 @@ int main() {
 // CHECK1-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK1:       omp_offload.cont:
 // CHECK1-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP16]], align 4
 // CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -437,7 +437,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP:%.*]] = alloca i32, align 4
 // CHECK1-NEXT:    [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_reduction_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_reduction_codegen.cpp
index 52430d51cde1b5..3bff9904850f51 100644
--- a/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_reduction_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_reduction_codegen.cpp
@@ -109,7 +109,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -414,7 +414,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -716,7 +716,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1017,7 +1017,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_schedule_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_schedule_codegen.cpp
index f7c0666d58b661..b92dd14de108d9 100644
--- a/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_schedule_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_schedule_codegen.cpp
@@ -243,7 +243,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -285,7 +285,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS3]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS4]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK1-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK1-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 2
@@ -327,7 +327,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS11]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS12]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP45]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP45]], align 4
 // CHECK1-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP46]], align 4
 // CHECK1-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -369,7 +369,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP63:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS19]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP64:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS20]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP65:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP65]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP65]], align 4
 // CHECK1-NEXT:    [[TMP66:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP66]], align 4
 // CHECK1-NEXT:    [[TMP67:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 2
@@ -411,7 +411,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP83:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS27]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP84:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS28]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP85:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP85]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP85]], align 4
 // CHECK1-NEXT:    [[TMP86:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP86]], align 4
 // CHECK1-NEXT:    [[TMP87:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 2
@@ -1340,7 +1340,7 @@ int main (int argc, char **argv) {
 // CHECK3-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1382,7 +1382,7 @@ int main (int argc, char **argv) {
 // CHECK3-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS3]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS4]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK3-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK3-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 2
@@ -1424,7 +1424,7 @@ int main (int argc, char **argv) {
 // CHECK3-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS11]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS12]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP45]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP45]], align 4
 // CHECK3-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP46]], align 4
 // CHECK3-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -1466,7 +1466,7 @@ int main (int argc, char **argv) {
 // CHECK3-NEXT:    [[TMP63:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS19]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP64:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS20]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP65:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP65]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP65]], align 4
 // CHECK3-NEXT:    [[TMP66:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP66]], align 4
 // CHECK3-NEXT:    [[TMP67:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 2
@@ -1508,7 +1508,7 @@ int main (int argc, char **argv) {
 // CHECK3-NEXT:    [[TMP83:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS27]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP84:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS28]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP85:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP85]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP85]], align 4
 // CHECK3-NEXT:    [[TMP86:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP86]], align 4
 // CHECK3-NEXT:    [[TMP87:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 2
@@ -2410,7 +2410,7 @@ int main (int argc, char **argv) {
 // CHECK5-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK5-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK5-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2452,7 +2452,7 @@ int main (int argc, char **argv) {
 // CHECK5-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS3]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS4]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK5-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK5-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 2
@@ -2494,7 +2494,7 @@ int main (int argc, char **argv) {
 // CHECK5-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS11]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS12]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP45]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP45]], align 4
 // CHECK5-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 1, ptr [[TMP46]], align 4
 // CHECK5-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -2536,7 +2536,7 @@ int main (int argc, char **argv) {
 // CHECK5-NEXT:    [[TMP63:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS19]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP64:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS20]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP65:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP65]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP65]], align 4
 // CHECK5-NEXT:    [[TMP66:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 1, ptr [[TMP66]], align 4
 // CHECK5-NEXT:    [[TMP67:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 2
@@ -2578,7 +2578,7 @@ int main (int argc, char **argv) {
 // CHECK5-NEXT:    [[TMP83:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS27]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP84:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS28]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP85:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP85]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP85]], align 4
 // CHECK5-NEXT:    [[TMP86:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 1, ptr [[TMP86]], align 4
 // CHECK5-NEXT:    [[TMP87:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 2
@@ -3507,7 +3507,7 @@ int main (int argc, char **argv) {
 // CHECK7-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK7-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK7-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK7-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK7-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK7-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -3549,7 +3549,7 @@ int main (int argc, char **argv) {
 // CHECK7-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS3]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS4]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 0
-// CHECK7-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK7-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK7-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 1
 // CHECK7-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK7-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 2
@@ -3591,7 +3591,7 @@ int main (int argc, char **argv) {
 // CHECK7-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS11]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS12]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK7-NEXT:    store i32 2, ptr [[TMP45]], align 4
+// CHECK7-NEXT:    store i32 3, ptr [[TMP45]], align 4
 // CHECK7-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK7-NEXT:    store i32 1, ptr [[TMP46]], align 4
 // CHECK7-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -3633,7 +3633,7 @@ int main (int argc, char **argv) {
 // CHECK7-NEXT:    [[TMP63:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS19]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP64:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS20]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP65:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 0
-// CHECK7-NEXT:    store i32 2, ptr [[TMP65]], align 4
+// CHECK7-NEXT:    store i32 3, ptr [[TMP65]], align 4
 // CHECK7-NEXT:    [[TMP66:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 1
 // CHECK7-NEXT:    store i32 1, ptr [[TMP66]], align 4
 // CHECK7-NEXT:    [[TMP67:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 2
@@ -3675,7 +3675,7 @@ int main (int argc, char **argv) {
 // CHECK7-NEXT:    [[TMP83:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS27]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP84:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS28]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP85:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 0
-// CHECK7-NEXT:    store i32 2, ptr [[TMP85]], align 4
+// CHECK7-NEXT:    store i32 3, ptr [[TMP85]], align 4
 // CHECK7-NEXT:    [[TMP86:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 1
 // CHECK7-NEXT:    store i32 1, ptr [[TMP86]], align 4
 // CHECK7-NEXT:    [[TMP87:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 2
@@ -5022,7 +5022,7 @@ int main (int argc, char **argv) {
 // CHECK13-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP21]], 1
 // CHECK13-NEXT:    [[TMP22:%.*]] = zext i32 [[ADD]] to i64
 // CHECK13-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK13-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK13-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK13-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK13-NEXT:    store i32 3, ptr [[TMP24]], align 4
 // CHECK13-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -5093,7 +5093,7 @@ int main (int argc, char **argv) {
 // CHECK13-NEXT:    [[ADD14:%.*]] = add nsw i32 [[TMP56]], 1
 // CHECK13-NEXT:    [[TMP57:%.*]] = zext i32 [[ADD14]] to i64
 // CHECK13-NEXT:    [[TMP58:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK13-NEXT:    store i32 2, ptr [[TMP58]], align 4
+// CHECK13-NEXT:    store i32 3, ptr [[TMP58]], align 4
 // CHECK13-NEXT:    [[TMP59:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK13-NEXT:    store i32 3, ptr [[TMP59]], align 4
 // CHECK13-NEXT:    [[TMP60:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -5175,7 +5175,7 @@ int main (int argc, char **argv) {
 // CHECK13-NEXT:    [[ADD30:%.*]] = add nsw i32 [[TMP97]], 1
 // CHECK13-NEXT:    [[TMP98:%.*]] = zext i32 [[ADD30]] to i64
 // CHECK13-NEXT:    [[TMP99:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 0
-// CHECK13-NEXT:    store i32 2, ptr [[TMP99]], align 4
+// CHECK13-NEXT:    store i32 3, ptr [[TMP99]], align 4
 // CHECK13-NEXT:    [[TMP100:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 1
 // CHECK13-NEXT:    store i32 4, ptr [[TMP100]], align 4
 // CHECK13-NEXT:    [[TMP101:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 2
@@ -5246,7 +5246,7 @@ int main (int argc, char **argv) {
 // CHECK13-NEXT:    [[ADD45:%.*]] = add nsw i32 [[TMP132]], 1
 // CHECK13-NEXT:    [[TMP133:%.*]] = zext i32 [[ADD45]] to i64
 // CHECK13-NEXT:    [[TMP134:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS46]], i32 0, i32 0
-// CHECK13-NEXT:    store i32 2, ptr [[TMP134]], align 4
+// CHECK13-NEXT:    store i32 3, ptr [[TMP134]], align 4
 // CHECK13-NEXT:    [[TMP135:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS46]], i32 0, i32 1
 // CHECK13-NEXT:    store i32 3, ptr [[TMP135]], align 4
 // CHECK13-NEXT:    [[TMP136:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS46]], i32 0, i32 2
@@ -5328,7 +5328,7 @@ int main (int argc, char **argv) {
 // CHECK13-NEXT:    [[ADD62:%.*]] = add nsw i32 [[TMP173]], 1
 // CHECK13-NEXT:    [[TMP174:%.*]] = zext i32 [[ADD62]] to i64
 // CHECK13-NEXT:    [[TMP175:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS63]], i32 0, i32 0
-// CHECK13-NEXT:    store i32 2, ptr [[TMP175]], align 4
+// CHECK13-NEXT:    store i32 3, ptr [[TMP175]], align 4
 // CHECK13-NEXT:    [[TMP176:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS63]], i32 0, i32 1
 // CHECK13-NEXT:    store i32 4, ptr [[TMP176]], align 4
 // CHECK13-NEXT:    [[TMP177:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS63]], i32 0, i32 2
@@ -6666,7 +6666,7 @@ int main (int argc, char **argv) {
 // CHECK13-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK13-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK13-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK13-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK13-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK13-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK13-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK13-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -6707,7 +6707,7 @@ int main (int argc, char **argv) {
 // CHECK13-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0
 // CHECK13-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS2]], i32 0, i32 0
 // CHECK13-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0
-// CHECK13-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK13-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK13-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1
 // CHECK13-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK13-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2
@@ -6759,7 +6759,7 @@ int main (int argc, char **argv) {
 // CHECK13-NEXT:    [[TMP49:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS8]], i32 0, i32 0
 // CHECK13-NEXT:    [[TMP50:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS9]], i32 0, i32 0
 // CHECK13-NEXT:    [[TMP51:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 0
-// CHECK13-NEXT:    store i32 2, ptr [[TMP51]], align 4
+// CHECK13-NEXT:    store i32 3, ptr [[TMP51]], align 4
 // CHECK13-NEXT:    [[TMP52:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 1
 // CHECK13-NEXT:    store i32 2, ptr [[TMP52]], align 4
 // CHECK13-NEXT:    [[TMP53:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 2
@@ -6800,7 +6800,7 @@ int main (int argc, char **argv) {
 // CHECK13-NEXT:    [[TMP69:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS15]], i32 0, i32 0
 // CHECK13-NEXT:    [[TMP70:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS16]], i32 0, i32 0
 // CHECK13-NEXT:    [[TMP71:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS19]], i32 0, i32 0
-// CHECK13-NEXT:    store i32 2, ptr [[TMP71]], align 4
+// CHECK13-NEXT:    store i32 3, ptr [[TMP71]], align 4
 // CHECK13-NEXT:    [[TMP72:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS19]], i32 0, i32 1
 // CHECK13-NEXT:    store i32 1, ptr [[TMP72]], align 4
 // CHECK13-NEXT:    [[TMP73:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS19]], i32 0, i32 2
@@ -6852,7 +6852,7 @@ int main (int argc, char **argv) {
 // CHECK13-NEXT:    [[TMP95:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS24]], i32 0, i32 0
 // CHECK13-NEXT:    [[TMP96:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS25]], i32 0, i32 0
 // CHECK13-NEXT:    [[TMP97:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS28]], i32 0, i32 0
-// CHECK13-NEXT:    store i32 2, ptr [[TMP97]], align 4
+// CHECK13-NEXT:    store i32 3, ptr [[TMP97]], align 4
 // CHECK13-NEXT:    [[TMP98:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS28]], i32 0, i32 1
 // CHECK13-NEXT:    store i32 2, ptr [[TMP98]], align 4
 // CHECK13-NEXT:    [[TMP99:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS28]], i32 0, i32 2
@@ -7863,7 +7863,7 @@ int main (int argc, char **argv) {
 // CHECK15-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP21]], 1
 // CHECK15-NEXT:    [[TMP22:%.*]] = zext i32 [[ADD]] to i64
 // CHECK15-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK15-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK15-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK15-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK15-NEXT:    store i32 3, ptr [[TMP24]], align 4
 // CHECK15-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -7935,7 +7935,7 @@ int main (int argc, char **argv) {
 // CHECK15-NEXT:    [[ADD14:%.*]] = add nsw i32 [[TMP57]], 1
 // CHECK15-NEXT:    [[TMP58:%.*]] = zext i32 [[ADD14]] to i64
 // CHECK15-NEXT:    [[TMP59:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK15-NEXT:    store i32 2, ptr [[TMP59]], align 4
+// CHECK15-NEXT:    store i32 3, ptr [[TMP59]], align 4
 // CHECK15-NEXT:    [[TMP60:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK15-NEXT:    store i32 3, ptr [[TMP60]], align 4
 // CHECK15-NEXT:    [[TMP61:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -8018,7 +8018,7 @@ int main (int argc, char **argv) {
 // CHECK15-NEXT:    [[ADD30:%.*]] = add nsw i32 [[TMP99]], 1
 // CHECK15-NEXT:    [[TMP100:%.*]] = zext i32 [[ADD30]] to i64
 // CHECK15-NEXT:    [[TMP101:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 0
-// CHECK15-NEXT:    store i32 2, ptr [[TMP101]], align 4
+// CHECK15-NEXT:    store i32 3, ptr [[TMP101]], align 4
 // CHECK15-NEXT:    [[TMP102:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 1
 // CHECK15-NEXT:    store i32 4, ptr [[TMP102]], align 4
 // CHECK15-NEXT:    [[TMP103:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 2
@@ -8090,7 +8090,7 @@ int main (int argc, char **argv) {
 // CHECK15-NEXT:    [[ADD45:%.*]] = add nsw i32 [[TMP135]], 1
 // CHECK15-NEXT:    [[TMP136:%.*]] = zext i32 [[ADD45]] to i64
 // CHECK15-NEXT:    [[TMP137:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS46]], i32 0, i32 0
-// CHECK15-NEXT:    store i32 2, ptr [[TMP137]], align 4
+// CHECK15-NEXT:    store i32 3, ptr [[TMP137]], align 4
 // CHECK15-NEXT:    [[TMP138:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS46]], i32 0, i32 1
 // CHECK15-NEXT:    store i32 3, ptr [[TMP138]], align 4
 // CHECK15-NEXT:    [[TMP139:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS46]], i32 0, i32 2
@@ -8173,7 +8173,7 @@ int main (int argc, char **argv) {
 // CHECK15-NEXT:    [[ADD62:%.*]] = add nsw i32 [[TMP177]], 1
 // CHECK15-NEXT:    [[TMP178:%.*]] = zext i32 [[ADD62]] to i64
 // CHECK15-NEXT:    [[TMP179:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS63]], i32 0, i32 0
-// CHECK15-NEXT:    store i32 2, ptr [[TMP179]], align 4
+// CHECK15-NEXT:    store i32 3, ptr [[TMP179]], align 4
 // CHECK15-NEXT:    [[TMP180:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS63]], i32 0, i32 1
 // CHECK15-NEXT:    store i32 4, ptr [[TMP180]], align 4
 // CHECK15-NEXT:    [[TMP181:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS63]], i32 0, i32 2
@@ -9486,7 +9486,7 @@ int main (int argc, char **argv) {
 // CHECK15-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK15-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK15-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK15-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK15-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK15-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK15-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK15-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -9527,7 +9527,7 @@ int main (int argc, char **argv) {
 // CHECK15-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0
 // CHECK15-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS2]], i32 0, i32 0
 // CHECK15-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0
-// CHECK15-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK15-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK15-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1
 // CHECK15-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK15-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2
@@ -9579,7 +9579,7 @@ int main (int argc, char **argv) {
 // CHECK15-NEXT:    [[TMP49:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS8]], i32 0, i32 0
 // CHECK15-NEXT:    [[TMP50:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS9]], i32 0, i32 0
 // CHECK15-NEXT:    [[TMP51:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 0
-// CHECK15-NEXT:    store i32 2, ptr [[TMP51]], align 4
+// CHECK15-NEXT:    store i32 3, ptr [[TMP51]], align 4
 // CHECK15-NEXT:    [[TMP52:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 1
 // CHECK15-NEXT:    store i32 2, ptr [[TMP52]], align 4
 // CHECK15-NEXT:    [[TMP53:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 2
@@ -9620,7 +9620,7 @@ int main (int argc, char **argv) {
 // CHECK15-NEXT:    [[TMP69:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS15]], i32 0, i32 0
 // CHECK15-NEXT:    [[TMP70:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS16]], i32 0, i32 0
 // CHECK15-NEXT:    [[TMP71:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS19]], i32 0, i32 0
-// CHECK15-NEXT:    store i32 2, ptr [[TMP71]], align 4
+// CHECK15-NEXT:    store i32 3, ptr [[TMP71]], align 4
 // CHECK15-NEXT:    [[TMP72:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS19]], i32 0, i32 1
 // CHECK15-NEXT:    store i32 1, ptr [[TMP72]], align 4
 // CHECK15-NEXT:    [[TMP73:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS19]], i32 0, i32 2
@@ -9672,7 +9672,7 @@ int main (int argc, char **argv) {
 // CHECK15-NEXT:    [[TMP95:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS24]], i32 0, i32 0
 // CHECK15-NEXT:    [[TMP96:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS25]], i32 0, i32 0
 // CHECK15-NEXT:    [[TMP97:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS28]], i32 0, i32 0
-// CHECK15-NEXT:    store i32 2, ptr [[TMP97]], align 4
+// CHECK15-NEXT:    store i32 3, ptr [[TMP97]], align 4
 // CHECK15-NEXT:    [[TMP98:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS28]], i32 0, i32 1
 // CHECK15-NEXT:    store i32 2, ptr [[TMP98]], align 4
 // CHECK15-NEXT:    [[TMP99:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS28]], i32 0, i32 2
@@ -10656,7 +10656,7 @@ int main (int argc, char **argv) {
 // CHECK17-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP21]], 1
 // CHECK17-NEXT:    [[TMP22:%.*]] = zext i32 [[ADD]] to i64
 // CHECK17-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK17-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK17-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK17-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK17-NEXT:    store i32 3, ptr [[TMP24]], align 4
 // CHECK17-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -10727,7 +10727,7 @@ int main (int argc, char **argv) {
 // CHECK17-NEXT:    [[ADD14:%.*]] = add nsw i32 [[TMP56]], 1
 // CHECK17-NEXT:    [[TMP57:%.*]] = zext i32 [[ADD14]] to i64
 // CHECK17-NEXT:    [[TMP58:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK17-NEXT:    store i32 2, ptr [[TMP58]], align 4
+// CHECK17-NEXT:    store i32 3, ptr [[TMP58]], align 4
 // CHECK17-NEXT:    [[TMP59:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK17-NEXT:    store i32 3, ptr [[TMP59]], align 4
 // CHECK17-NEXT:    [[TMP60:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -10809,7 +10809,7 @@ int main (int argc, char **argv) {
 // CHECK17-NEXT:    [[ADD30:%.*]] = add nsw i32 [[TMP97]], 1
 // CHECK17-NEXT:    [[TMP98:%.*]] = zext i32 [[ADD30]] to i64
 // CHECK17-NEXT:    [[TMP99:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 0
-// CHECK17-NEXT:    store i32 2, ptr [[TMP99]], align 4
+// CHECK17-NEXT:    store i32 3, ptr [[TMP99]], align 4
 // CHECK17-NEXT:    [[TMP100:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 1
 // CHECK17-NEXT:    store i32 4, ptr [[TMP100]], align 4
 // CHECK17-NEXT:    [[TMP101:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 2
@@ -10880,7 +10880,7 @@ int main (int argc, char **argv) {
 // CHECK17-NEXT:    [[ADD45:%.*]] = add nsw i32 [[TMP132]], 1
 // CHECK17-NEXT:    [[TMP133:%.*]] = zext i32 [[ADD45]] to i64
 // CHECK17-NEXT:    [[TMP134:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS46]], i32 0, i32 0
-// CHECK17-NEXT:    store i32 2, ptr [[TMP134]], align 4
+// CHECK17-NEXT:    store i32 3, ptr [[TMP134]], align 4
 // CHECK17-NEXT:    [[TMP135:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS46]], i32 0, i32 1
 // CHECK17-NEXT:    store i32 3, ptr [[TMP135]], align 4
 // CHECK17-NEXT:    [[TMP136:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS46]], i32 0, i32 2
@@ -10962,7 +10962,7 @@ int main (int argc, char **argv) {
 // CHECK17-NEXT:    [[ADD62:%.*]] = add nsw i32 [[TMP173]], 1
 // CHECK17-NEXT:    [[TMP174:%.*]] = zext i32 [[ADD62]] to i64
 // CHECK17-NEXT:    [[TMP175:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS63]], i32 0, i32 0
-// CHECK17-NEXT:    store i32 2, ptr [[TMP175]], align 4
+// CHECK17-NEXT:    store i32 3, ptr [[TMP175]], align 4
 // CHECK17-NEXT:    [[TMP176:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS63]], i32 0, i32 1
 // CHECK17-NEXT:    store i32 4, ptr [[TMP176]], align 4
 // CHECK17-NEXT:    [[TMP177:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS63]], i32 0, i32 2
@@ -12300,7 +12300,7 @@ int main (int argc, char **argv) {
 // CHECK17-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK17-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK17-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK17-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK17-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK17-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -12341,7 +12341,7 @@ int main (int argc, char **argv) {
 // CHECK17-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS2]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0
-// CHECK17-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK17-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK17-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1
 // CHECK17-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK17-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2
@@ -12393,7 +12393,7 @@ int main (int argc, char **argv) {
 // CHECK17-NEXT:    [[TMP49:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS8]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP50:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS9]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP51:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 0
-// CHECK17-NEXT:    store i32 2, ptr [[TMP51]], align 4
+// CHECK17-NEXT:    store i32 3, ptr [[TMP51]], align 4
 // CHECK17-NEXT:    [[TMP52:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 1
 // CHECK17-NEXT:    store i32 2, ptr [[TMP52]], align 4
 // CHECK17-NEXT:    [[TMP53:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 2
@@ -12434,7 +12434,7 @@ int main (int argc, char **argv) {
 // CHECK17-NEXT:    [[TMP69:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS15]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP70:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS16]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP71:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS19]], i32 0, i32 0
-// CHECK17-NEXT:    store i32 2, ptr [[TMP71]], align 4
+// CHECK17-NEXT:    store i32 3, ptr [[TMP71]], align 4
 // CHECK17-NEXT:    [[TMP72:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS19]], i32 0, i32 1
 // CHECK17-NEXT:    store i32 1, ptr [[TMP72]], align 4
 // CHECK17-NEXT:    [[TMP73:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS19]], i32 0, i32 2
@@ -12486,7 +12486,7 @@ int main (int argc, char **argv) {
 // CHECK17-NEXT:    [[TMP95:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS24]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP96:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS25]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP97:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS28]], i32 0, i32 0
-// CHECK17-NEXT:    store i32 2, ptr [[TMP97]], align 4
+// CHECK17-NEXT:    store i32 3, ptr [[TMP97]], align 4
 // CHECK17-NEXT:    [[TMP98:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS28]], i32 0, i32 1
 // CHECK17-NEXT:    store i32 2, ptr [[TMP98]], align 4
 // CHECK17-NEXT:    [[TMP99:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS28]], i32 0, i32 2
@@ -13497,7 +13497,7 @@ int main (int argc, char **argv) {
 // CHECK19-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP21]], 1
 // CHECK19-NEXT:    [[TMP22:%.*]] = zext i32 [[ADD]] to i64
 // CHECK19-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK19-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK19-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK19-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK19-NEXT:    store i32 3, ptr [[TMP24]], align 4
 // CHECK19-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -13569,7 +13569,7 @@ int main (int argc, char **argv) {
 // CHECK19-NEXT:    [[ADD14:%.*]] = add nsw i32 [[TMP57]], 1
 // CHECK19-NEXT:    [[TMP58:%.*]] = zext i32 [[ADD14]] to i64
 // CHECK19-NEXT:    [[TMP59:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK19-NEXT:    store i32 2, ptr [[TMP59]], align 4
+// CHECK19-NEXT:    store i32 3, ptr [[TMP59]], align 4
 // CHECK19-NEXT:    [[TMP60:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK19-NEXT:    store i32 3, ptr [[TMP60]], align 4
 // CHECK19-NEXT:    [[TMP61:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -13652,7 +13652,7 @@ int main (int argc, char **argv) {
 // CHECK19-NEXT:    [[ADD30:%.*]] = add nsw i32 [[TMP99]], 1
 // CHECK19-NEXT:    [[TMP100:%.*]] = zext i32 [[ADD30]] to i64
 // CHECK19-NEXT:    [[TMP101:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 0
-// CHECK19-NEXT:    store i32 2, ptr [[TMP101]], align 4
+// CHECK19-NEXT:    store i32 3, ptr [[TMP101]], align 4
 // CHECK19-NEXT:    [[TMP102:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 1
 // CHECK19-NEXT:    store i32 4, ptr [[TMP102]], align 4
 // CHECK19-NEXT:    [[TMP103:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 2
@@ -13724,7 +13724,7 @@ int main (int argc, char **argv) {
 // CHECK19-NEXT:    [[ADD45:%.*]] = add nsw i32 [[TMP135]], 1
 // CHECK19-NEXT:    [[TMP136:%.*]] = zext i32 [[ADD45]] to i64
 // CHECK19-NEXT:    [[TMP137:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS46]], i32 0, i32 0
-// CHECK19-NEXT:    store i32 2, ptr [[TMP137]], align 4
+// CHECK19-NEXT:    store i32 3, ptr [[TMP137]], align 4
 // CHECK19-NEXT:    [[TMP138:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS46]], i32 0, i32 1
 // CHECK19-NEXT:    store i32 3, ptr [[TMP138]], align 4
 // CHECK19-NEXT:    [[TMP139:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS46]], i32 0, i32 2
@@ -13807,7 +13807,7 @@ int main (int argc, char **argv) {
 // CHECK19-NEXT:    [[ADD62:%.*]] = add nsw i32 [[TMP177]], 1
 // CHECK19-NEXT:    [[TMP178:%.*]] = zext i32 [[ADD62]] to i64
 // CHECK19-NEXT:    [[TMP179:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS63]], i32 0, i32 0
-// CHECK19-NEXT:    store i32 2, ptr [[TMP179]], align 4
+// CHECK19-NEXT:    store i32 3, ptr [[TMP179]], align 4
 // CHECK19-NEXT:    [[TMP180:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS63]], i32 0, i32 1
 // CHECK19-NEXT:    store i32 4, ptr [[TMP180]], align 4
 // CHECK19-NEXT:    [[TMP181:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS63]], i32 0, i32 2
@@ -15120,7 +15120,7 @@ int main (int argc, char **argv) {
 // CHECK19-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK19-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK19-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK19-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK19-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK19-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -15161,7 +15161,7 @@ int main (int argc, char **argv) {
 // CHECK19-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS2]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0
-// CHECK19-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK19-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK19-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1
 // CHECK19-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK19-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2
@@ -15213,7 +15213,7 @@ int main (int argc, char **argv) {
 // CHECK19-NEXT:    [[TMP49:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS8]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP50:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS9]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP51:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 0
-// CHECK19-NEXT:    store i32 2, ptr [[TMP51]], align 4
+// CHECK19-NEXT:    store i32 3, ptr [[TMP51]], align 4
 // CHECK19-NEXT:    [[TMP52:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 1
 // CHECK19-NEXT:    store i32 2, ptr [[TMP52]], align 4
 // CHECK19-NEXT:    [[TMP53:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 2
@@ -15254,7 +15254,7 @@ int main (int argc, char **argv) {
 // CHECK19-NEXT:    [[TMP69:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS15]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP70:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS16]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP71:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS19]], i32 0, i32 0
-// CHECK19-NEXT:    store i32 2, ptr [[TMP71]], align 4
+// CHECK19-NEXT:    store i32 3, ptr [[TMP71]], align 4
 // CHECK19-NEXT:    [[TMP72:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS19]], i32 0, i32 1
 // CHECK19-NEXT:    store i32 1, ptr [[TMP72]], align 4
 // CHECK19-NEXT:    [[TMP73:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS19]], i32 0, i32 2
@@ -15306,7 +15306,7 @@ int main (int argc, char **argv) {
 // CHECK19-NEXT:    [[TMP95:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS24]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP96:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS25]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP97:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS28]], i32 0, i32 0
-// CHECK19-NEXT:    store i32 2, ptr [[TMP97]], align 4
+// CHECK19-NEXT:    store i32 3, ptr [[TMP97]], align 4
 // CHECK19-NEXT:    [[TMP98:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS28]], i32 0, i32 1
 // CHECK19-NEXT:    store i32 2, ptr [[TMP98]], align 4
 // CHECK19-NEXT:    [[TMP99:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS28]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_teams_distribute_private_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_private_codegen.cpp
index 5bc78062c8309b..3cb2d2c2f9b43e 100644
--- a/clang/test/OpenMP/target_teams_distribute_private_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_distribute_private_codegen.cpp
@@ -240,7 +240,7 @@ int main() {
 // CHECK1-NEXT:    [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
 // CHECK1-NEXT:    store i32 0, ptr [[RETVAL]], align 4
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -409,7 +409,7 @@ int main() {
 // CHECK1-NEXT:    store ptr [[TEST]], ptr [[VAR]], align 8
 // CHECK1-NEXT:    store ptr undef, ptr [[_TMP1]], align 8
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -768,7 +768,7 @@ int main() {
 // CHECK3-NEXT:    [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
 // CHECK3-NEXT:    store i32 0, ptr [[RETVAL]], align 4
 // CHECK3-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK3-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK3-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -935,7 +935,7 @@ int main() {
 // CHECK3-NEXT:    store ptr [[TEST]], ptr [[VAR]], align 4
 // CHECK3-NEXT:    store ptr undef, ptr [[_TMP1]], align 4
 // CHECK3-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK3-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK3-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_teams_distribute_reduction_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_reduction_codegen.cpp
index 9e9306a5e749f8..47251b00c7a144 100644
--- a/clang/test/OpenMP/target_teams_distribute_reduction_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_distribute_reduction_codegen.cpp
@@ -335,7 +335,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -376,7 +376,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS2]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK1-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK1-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2
@@ -418,7 +418,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS8]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS9]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP45]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP45]], align 4
 // CHECK1-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP46]], align 4
 // CHECK1-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 2
@@ -460,7 +460,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP63:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS15]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP64:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS16]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP65:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS19]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP65]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP65]], align 4
 // CHECK1-NEXT:    [[TMP66:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS19]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP66]], align 4
 // CHECK1-NEXT:    [[TMP67:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS19]], i32 0, i32 2
@@ -502,7 +502,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP83:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS22]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP84:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS23]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP85:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS26]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP85]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP85]], align 4
 // CHECK1-NEXT:    [[TMP86:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS26]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP86]], align 4
 // CHECK1-NEXT:    [[TMP87:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS26]], i32 0, i32 2
@@ -544,7 +544,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP103:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS29]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP104:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS30]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP105:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS33]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP105]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP105]], align 4
 // CHECK1-NEXT:    [[TMP106:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS33]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP106]], align 4
 // CHECK1-NEXT:    [[TMP107:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS33]], i32 0, i32 2
@@ -585,7 +585,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP123:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS36]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP124:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS37]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP125:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS40]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP125]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP125]], align 4
 // CHECK1-NEXT:    [[TMP126:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS40]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP126]], align 4
 // CHECK1-NEXT:    [[TMP127:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS40]], i32 0, i32 2
@@ -626,7 +626,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP143:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS43]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP144:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS44]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP145:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS47]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP145]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP145]], align 4
 // CHECK1-NEXT:    [[TMP146:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS47]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP146]], align 4
 // CHECK1-NEXT:    [[TMP147:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS47]], i32 0, i32 2
@@ -668,7 +668,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP163:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS50]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP164:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS51]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP165:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS54]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP165]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP165]], align 4
 // CHECK1-NEXT:    [[TMP166:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS54]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP166]], align 4
 // CHECK1-NEXT:    [[TMP167:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS54]], i32 0, i32 2
@@ -710,7 +710,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP183:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS57]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP184:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS58]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP185:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS61]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP185]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP185]], align 4
 // CHECK1-NEXT:    [[TMP186:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS61]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP186]], align 4
 // CHECK1-NEXT:    [[TMP187:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS61]], i32 0, i32 2
@@ -2179,7 +2179,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2226,7 +2226,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS2]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP28]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP28]], align 4
 // CHECK1-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 2, ptr [[TMP29]], align 4
 // CHECK1-NEXT:    [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2
@@ -2274,7 +2274,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP49:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS8]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP50:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS9]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP51:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP51]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP51]], align 4
 // CHECK1-NEXT:    [[TMP52:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 2, ptr [[TMP52]], align 4
 // CHECK1-NEXT:    [[TMP53:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 2
@@ -2322,7 +2322,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP72:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS15]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP73:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS16]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP74:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS19]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP74]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP74]], align 4
 // CHECK1-NEXT:    [[TMP75:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS19]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 2, ptr [[TMP75]], align 4
 // CHECK1-NEXT:    [[TMP76:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS19]], i32 0, i32 2
@@ -2370,7 +2370,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP95:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS22]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP96:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS23]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP97:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS26]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP97]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP97]], align 4
 // CHECK1-NEXT:    [[TMP98:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS26]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 2, ptr [[TMP98]], align 4
 // CHECK1-NEXT:    [[TMP99:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS26]], i32 0, i32 2
@@ -2418,7 +2418,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP118:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS29]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP119:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS30]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP120:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS33]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP120]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP120]], align 4
 // CHECK1-NEXT:    [[TMP121:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS33]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 2, ptr [[TMP121]], align 4
 // CHECK1-NEXT:    [[TMP122:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS33]], i32 0, i32 2
@@ -2465,7 +2465,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP141:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS36]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP142:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS37]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP143:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS40]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP143]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP143]], align 4
 // CHECK1-NEXT:    [[TMP144:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS40]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 2, ptr [[TMP144]], align 4
 // CHECK1-NEXT:    [[TMP145:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS40]], i32 0, i32 2
@@ -2512,7 +2512,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP164:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS43]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP165:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS44]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP166:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS47]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP166]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP166]], align 4
 // CHECK1-NEXT:    [[TMP167:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS47]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 2, ptr [[TMP167]], align 4
 // CHECK1-NEXT:    [[TMP168:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS47]], i32 0, i32 2
@@ -2560,7 +2560,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP187:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS50]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP188:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS51]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP189:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS54]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP189]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP189]], align 4
 // CHECK1-NEXT:    [[TMP190:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS54]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 2, ptr [[TMP190]], align 4
 // CHECK1-NEXT:    [[TMP191:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS54]], i32 0, i32 2
@@ -2608,7 +2608,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP210:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS57]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP211:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS58]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP212:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS61]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP212]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP212]], align 4
 // CHECK1-NEXT:    [[TMP213:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS61]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 2, ptr [[TMP213]], align 4
 // CHECK1-NEXT:    [[TMP214:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS61]], i32 0, i32 2
@@ -4163,7 +4163,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -4204,7 +4204,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS2]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK3-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK3-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2
@@ -4246,7 +4246,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS8]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS9]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP45]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP45]], align 4
 // CHECK3-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP46]], align 4
 // CHECK3-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 2
@@ -4288,7 +4288,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP63:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS15]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP64:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS16]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP65:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS19]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP65]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP65]], align 4
 // CHECK3-NEXT:    [[TMP66:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS19]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP66]], align 4
 // CHECK3-NEXT:    [[TMP67:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS19]], i32 0, i32 2
@@ -4330,7 +4330,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP83:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS22]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP84:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS23]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP85:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS26]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP85]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP85]], align 4
 // CHECK3-NEXT:    [[TMP86:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS26]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP86]], align 4
 // CHECK3-NEXT:    [[TMP87:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS26]], i32 0, i32 2
@@ -4372,7 +4372,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP103:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS29]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP104:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS30]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP105:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS33]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP105]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP105]], align 4
 // CHECK3-NEXT:    [[TMP106:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS33]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP106]], align 4
 // CHECK3-NEXT:    [[TMP107:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS33]], i32 0, i32 2
@@ -4413,7 +4413,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP123:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS36]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP124:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS37]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP125:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS40]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP125]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP125]], align 4
 // CHECK3-NEXT:    [[TMP126:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS40]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP126]], align 4
 // CHECK3-NEXT:    [[TMP127:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS40]], i32 0, i32 2
@@ -4454,7 +4454,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP143:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS43]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP144:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS44]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP145:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS47]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP145]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP145]], align 4
 // CHECK3-NEXT:    [[TMP146:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS47]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP146]], align 4
 // CHECK3-NEXT:    [[TMP147:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS47]], i32 0, i32 2
@@ -4496,7 +4496,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP163:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS50]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP164:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS51]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP165:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS54]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP165]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP165]], align 4
 // CHECK3-NEXT:    [[TMP166:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS54]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP166]], align 4
 // CHECK3-NEXT:    [[TMP167:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS54]], i32 0, i32 2
@@ -4538,7 +4538,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP183:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS57]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP184:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS58]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP185:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS61]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP185]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP185]], align 4
 // CHECK3-NEXT:    [[TMP186:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS61]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP186]], align 4
 // CHECK3-NEXT:    [[TMP187:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS61]], i32 0, i32 2
@@ -6007,7 +6007,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -6054,7 +6054,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS2]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP28]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP28]], align 4
 // CHECK3-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 2, ptr [[TMP29]], align 4
 // CHECK3-NEXT:    [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2
@@ -6102,7 +6102,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP49:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS8]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP50:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS9]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP51:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP51]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP51]], align 4
 // CHECK3-NEXT:    [[TMP52:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 2, ptr [[TMP52]], align 4
 // CHECK3-NEXT:    [[TMP53:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 2
@@ -6150,7 +6150,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP72:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS15]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP73:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS16]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP74:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS19]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP74]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP74]], align 4
 // CHECK3-NEXT:    [[TMP75:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS19]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 2, ptr [[TMP75]], align 4
 // CHECK3-NEXT:    [[TMP76:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS19]], i32 0, i32 2
@@ -6198,7 +6198,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP95:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS22]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP96:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS23]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP97:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS26]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP97]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP97]], align 4
 // CHECK3-NEXT:    [[TMP98:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS26]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 2, ptr [[TMP98]], align 4
 // CHECK3-NEXT:    [[TMP99:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS26]], i32 0, i32 2
@@ -6246,7 +6246,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP118:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS29]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP119:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS30]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP120:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS33]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP120]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP120]], align 4
 // CHECK3-NEXT:    [[TMP121:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS33]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 2, ptr [[TMP121]], align 4
 // CHECK3-NEXT:    [[TMP122:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS33]], i32 0, i32 2
@@ -6293,7 +6293,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP141:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS36]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP142:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS37]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP143:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS40]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP143]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP143]], align 4
 // CHECK3-NEXT:    [[TMP144:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS40]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 2, ptr [[TMP144]], align 4
 // CHECK3-NEXT:    [[TMP145:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS40]], i32 0, i32 2
@@ -6340,7 +6340,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP164:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS43]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP165:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS44]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP166:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS47]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP166]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP166]], align 4
 // CHECK3-NEXT:    [[TMP167:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS47]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 2, ptr [[TMP167]], align 4
 // CHECK3-NEXT:    [[TMP168:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS47]], i32 0, i32 2
@@ -6388,7 +6388,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP187:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS50]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP188:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS51]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP189:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS54]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP189]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP189]], align 4
 // CHECK3-NEXT:    [[TMP190:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS54]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 2, ptr [[TMP190]], align 4
 // CHECK3-NEXT:    [[TMP191:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS54]], i32 0, i32 2
@@ -6436,7 +6436,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP210:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS57]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP211:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS58]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP212:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS61]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP212]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP212]], align 4
 // CHECK3-NEXT:    [[TMP213:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS61]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 2, ptr [[TMP213]], align 4
 // CHECK3-NEXT:    [[TMP214:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS61]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_teams_distribute_simd_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_simd_codegen.cpp
index 5091d41eec4f3d..d2a5d1f62a8b9d 100644
--- a/clang/test/OpenMP/target_teams_distribute_simd_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_distribute_simd_codegen.cpp
@@ -424,7 +424,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP50:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS5]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP51:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS6]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP52:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP52]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP52]], align 4
 // CHECK1-NEXT:    [[TMP53:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP53]], align 4
 // CHECK1-NEXT:    [[TMP54:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -481,7 +481,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP78:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS10]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP79:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS11]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP80:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP80]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP80]], align 4
 // CHECK1-NEXT:    [[TMP81:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 2, ptr [[TMP81]], align 4
 // CHECK1-NEXT:    [[TMP82:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 2
@@ -591,7 +591,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP131:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_PTRS21]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP132:%.*]] = getelementptr inbounds [9 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP133:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS24]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP133]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP133]], align 4
 // CHECK1-NEXT:    [[TMP134:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS24]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 9, ptr [[TMP134]], align 4
 // CHECK1-NEXT:    [[TMP135:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS24]], i32 0, i32 2
@@ -803,7 +803,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT_ANON]], ptr [[TMP9]], i32 0, i32 2
 // CHECK1-NEXT:    [[TMP18:%.*]] = load i32, ptr [[TMP16]], align 4
 // CHECK1-NEXT:    [[TMP19:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP18]], 0
-// CHECK1-NEXT:    store i32 2, ptr [[KERNEL_ARGS_I]], align 4, !noalias [[META26]]
+// CHECK1-NEXT:    store i32 3, ptr [[KERNEL_ARGS_I]], align 4, !noalias [[META26]]
 // CHECK1-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 3, ptr [[TMP20]], align 4, !noalias [[META26]]
 // CHECK1-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 2
@@ -1392,7 +1392,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [5 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP29]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP29]], align 4
 // CHECK1-NEXT:    [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 5, ptr [[TMP30]], align 4
 // CHECK1-NEXT:    [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1529,7 +1529,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[ADD5:%.*]] = add i32 [[TMP30]], 1
 // CHECK1-NEXT:    [[TMP31:%.*]] = zext i32 [[ADD5]] to i64
 // CHECK1-NEXT:    [[TMP32:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP32]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP32]], align 4
 // CHECK1-NEXT:    [[TMP33:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 5, ptr [[TMP33]], align 4
 // CHECK1-NEXT:    [[TMP34:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1618,7 +1618,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP16]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP16]], align 4
 // CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 3, ptr [[TMP17]], align 4
 // CHECK1-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2186,7 +2186,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP48:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS5]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP49:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS6]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP50:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP50]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP50]], align 4
 // CHECK3-NEXT:    [[TMP51:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP51]], align 4
 // CHECK3-NEXT:    [[TMP52:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2243,7 +2243,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP76:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS10]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP77:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS11]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP78:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP78]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP78]], align 4
 // CHECK3-NEXT:    [[TMP79:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 2, ptr [[TMP79]], align 4
 // CHECK3-NEXT:    [[TMP80:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 2
@@ -2355,7 +2355,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP131:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_PTRS21]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP132:%.*]] = getelementptr inbounds [9 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP133:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS24]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP133]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP133]], align 4
 // CHECK3-NEXT:    [[TMP134:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS24]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 9, ptr [[TMP134]], align 4
 // CHECK3-NEXT:    [[TMP135:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS24]], i32 0, i32 2
@@ -2567,7 +2567,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT_ANON]], ptr [[TMP9]], i32 0, i32 2
 // CHECK3-NEXT:    [[TMP18:%.*]] = load i32, ptr [[TMP16]], align 4
 // CHECK3-NEXT:    [[TMP19:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP18]], 0
-// CHECK3-NEXT:    store i32 2, ptr [[KERNEL_ARGS_I]], align 4, !noalias [[META27]]
+// CHECK3-NEXT:    store i32 3, ptr [[KERNEL_ARGS_I]], align 4, !noalias [[META27]]
 // CHECK3-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 3, ptr [[TMP20]], align 4, !noalias [[META27]]
 // CHECK3-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 2
@@ -3156,7 +3156,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [5 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP29]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP29]], align 4
 // CHECK3-NEXT:    [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 5, ptr [[TMP30]], align 4
 // CHECK3-NEXT:    [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -3293,7 +3293,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[ADD5:%.*]] = add i32 [[TMP30]], 1
 // CHECK3-NEXT:    [[TMP31:%.*]] = zext i32 [[ADD5]] to i64
 // CHECK3-NEXT:    [[TMP32:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP32]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP32]], align 4
 // CHECK3-NEXT:    [[TMP33:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 5, ptr [[TMP33]], align 4
 // CHECK3-NEXT:    [[TMP34:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -3382,7 +3382,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP16]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP16]], align 4
 // CHECK3-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 3, ptr [[TMP17]], align 4
 // CHECK3-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -3952,7 +3952,7 @@ int bar(int n){
 // CHECK5-NEXT:    [[TMP50:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS5]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP51:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS6]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP52:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP52]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP52]], align 4
 // CHECK5-NEXT:    [[TMP53:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 1, ptr [[TMP53]], align 4
 // CHECK5-NEXT:    [[TMP54:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -4009,7 +4009,7 @@ int bar(int n){
 // CHECK5-NEXT:    [[TMP78:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS10]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP79:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS11]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP80:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP80]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP80]], align 4
 // CHECK5-NEXT:    [[TMP81:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 2, ptr [[TMP81]], align 4
 // CHECK5-NEXT:    [[TMP82:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 2
@@ -4119,7 +4119,7 @@ int bar(int n){
 // CHECK5-NEXT:    [[TMP131:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_PTRS21]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP132:%.*]] = getelementptr inbounds [9 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP133:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS24]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP133]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP133]], align 4
 // CHECK5-NEXT:    [[TMP134:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS24]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 9, ptr [[TMP134]], align 4
 // CHECK5-NEXT:    [[TMP135:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS24]], i32 0, i32 2
@@ -4331,7 +4331,7 @@ int bar(int n){
 // CHECK5-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT_ANON]], ptr [[TMP9]], i32 0, i32 2
 // CHECK5-NEXT:    [[TMP18:%.*]] = load i32, ptr [[TMP16]], align 4
 // CHECK5-NEXT:    [[TMP19:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP18]], 0
-// CHECK5-NEXT:    store i32 2, ptr [[KERNEL_ARGS_I]], align 4, !noalias [[META26]]
+// CHECK5-NEXT:    store i32 3, ptr [[KERNEL_ARGS_I]], align 4, !noalias [[META26]]
 // CHECK5-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 3, ptr [[TMP20]], align 4, !noalias [[META26]]
 // CHECK5-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 2
@@ -4937,7 +4937,7 @@ int bar(int n){
 // CHECK5-NEXT:    [[TMP33:%.*]] = getelementptr inbounds [6 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP34:%.*]] = getelementptr inbounds [6 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP35:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP35]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP35]], align 4
 // CHECK5-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 6, ptr [[TMP36]], align 4
 // CHECK5-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -5074,7 +5074,7 @@ int bar(int n){
 // CHECK5-NEXT:    [[ADD5:%.*]] = add i32 [[TMP30]], 1
 // CHECK5-NEXT:    [[TMP31:%.*]] = zext i32 [[ADD5]] to i64
 // CHECK5-NEXT:    [[TMP32:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP32]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP32]], align 4
 // CHECK5-NEXT:    [[TMP33:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 5, ptr [[TMP33]], align 4
 // CHECK5-NEXT:    [[TMP34:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -5163,7 +5163,7 @@ int bar(int n){
 // CHECK5-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP16]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP16]], align 4
 // CHECK5-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 3, ptr [[TMP17]], align 4
 // CHECK5-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -5783,7 +5783,7 @@ int bar(int n){
 // CHECK7-NEXT:    [[TMP48:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS5]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP49:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS6]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP50:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK7-NEXT:    store i32 2, ptr [[TMP50]], align 4
+// CHECK7-NEXT:    store i32 3, ptr [[TMP50]], align 4
 // CHECK7-NEXT:    [[TMP51:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK7-NEXT:    store i32 1, ptr [[TMP51]], align 4
 // CHECK7-NEXT:    [[TMP52:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -5840,7 +5840,7 @@ int bar(int n){
 // CHECK7-NEXT:    [[TMP76:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS10]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP77:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS11]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP78:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 0
-// CHECK7-NEXT:    store i32 2, ptr [[TMP78]], align 4
+// CHECK7-NEXT:    store i32 3, ptr [[TMP78]], align 4
 // CHECK7-NEXT:    [[TMP79:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 1
 // CHECK7-NEXT:    store i32 2, ptr [[TMP79]], align 4
 // CHECK7-NEXT:    [[TMP80:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 2
@@ -5952,7 +5952,7 @@ int bar(int n){
 // CHECK7-NEXT:    [[TMP131:%.*]] = getelementptr inbounds [9 x ptr], ptr [[DOTOFFLOAD_PTRS21]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP132:%.*]] = getelementptr inbounds [9 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP133:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS24]], i32 0, i32 0
-// CHECK7-NEXT:    store i32 2, ptr [[TMP133]], align 4
+// CHECK7-NEXT:    store i32 3, ptr [[TMP133]], align 4
 // CHECK7-NEXT:    [[TMP134:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS24]], i32 0, i32 1
 // CHECK7-NEXT:    store i32 9, ptr [[TMP134]], align 4
 // CHECK7-NEXT:    [[TMP135:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS24]], i32 0, i32 2
@@ -6164,7 +6164,7 @@ int bar(int n){
 // CHECK7-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT_ANON]], ptr [[TMP9]], i32 0, i32 2
 // CHECK7-NEXT:    [[TMP18:%.*]] = load i32, ptr [[TMP16]], align 4
 // CHECK7-NEXT:    [[TMP19:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP18]], 0
-// CHECK7-NEXT:    store i32 2, ptr [[KERNEL_ARGS_I]], align 4, !noalias [[META27]]
+// CHECK7-NEXT:    store i32 3, ptr [[KERNEL_ARGS_I]], align 4, !noalias [[META27]]
 // CHECK7-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 1
 // CHECK7-NEXT:    store i32 3, ptr [[TMP20]], align 4, !noalias [[META27]]
 // CHECK7-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 2
@@ -6770,7 +6770,7 @@ int bar(int n){
 // CHECK7-NEXT:    [[TMP33:%.*]] = getelementptr inbounds [6 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP34:%.*]] = getelementptr inbounds [6 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP35:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK7-NEXT:    store i32 2, ptr [[TMP35]], align 4
+// CHECK7-NEXT:    store i32 3, ptr [[TMP35]], align 4
 // CHECK7-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK7-NEXT:    store i32 6, ptr [[TMP36]], align 4
 // CHECK7-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -6907,7 +6907,7 @@ int bar(int n){
 // CHECK7-NEXT:    [[ADD5:%.*]] = add i32 [[TMP30]], 1
 // CHECK7-NEXT:    [[TMP31:%.*]] = zext i32 [[ADD5]] to i64
 // CHECK7-NEXT:    [[TMP32:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK7-NEXT:    store i32 2, ptr [[TMP32]], align 4
+// CHECK7-NEXT:    store i32 3, ptr [[TMP32]], align 4
 // CHECK7-NEXT:    [[TMP33:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK7-NEXT:    store i32 5, ptr [[TMP33]], align 4
 // CHECK7-NEXT:    [[TMP34:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -6996,7 +6996,7 @@ int bar(int n){
 // CHECK7-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK7-NEXT:    store i32 2, ptr [[TMP16]], align 4
+// CHECK7-NEXT:    store i32 3, ptr [[TMP16]], align 4
 // CHECK7-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK7-NEXT:    store i32 3, ptr [[TMP17]], align 4
 // CHECK7-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_teams_distribute_simd_collapse_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_simd_collapse_codegen.cpp
index ac7fc7c1acf91e..b052a2200479ac 100644
--- a/clang/test/OpenMP/target_teams_distribute_simd_collapse_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_distribute_simd_collapse_codegen.cpp
@@ -124,7 +124,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -293,7 +293,7 @@ int main (int argc, char **argv) {
 // CHECK3-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -670,7 +670,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[TMP35:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_3]], align 8
 // CHECK9-NEXT:    [[ADD:%.*]] = add nsw i64 [[TMP35]], 1
 // CHECK9-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP36]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP36]], align 4
 // CHECK9-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 5, ptr [[TMP37]], align 4
 // CHECK9-NEXT:    [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -924,7 +924,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK9-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK9-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1157,7 +1157,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[TMP34:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_3]], align 8
 // CHECK11-NEXT:    [[ADD:%.*]] = add nsw i64 [[TMP34]], 1
 // CHECK11-NEXT:    [[TMP35:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP35]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP35]], align 4
 // CHECK11-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 5, ptr [[TMP36]], align 4
 // CHECK11-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1409,7 +1409,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK11-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK11-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_teams_distribute_simd_dist_schedule_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_simd_dist_schedule_codegen.cpp
index 867ec89421402c..7f7e83c4dee396 100644
--- a/clang/test/OpenMP/target_teams_distribute_simd_dist_schedule_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_distribute_simd_dist_schedule_codegen.cpp
@@ -161,7 +161,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -203,7 +203,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS3]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS4]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK1-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK1-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 2
@@ -245,7 +245,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS11]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS12]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP45]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP45]], align 4
 // CHECK1-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP46]], align 4
 // CHECK1-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -594,7 +594,7 @@ int main (int argc, char **argv) {
 // CHECK3-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -636,7 +636,7 @@ int main (int argc, char **argv) {
 // CHECK3-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS3]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS4]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK3-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK3-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 2
@@ -678,7 +678,7 @@ int main (int argc, char **argv) {
 // CHECK3-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS11]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS12]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP45]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP45]], align 4
 // CHECK3-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP46]], align 4
 // CHECK3-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -1319,7 +1319,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP21]], 1
 // CHECK9-NEXT:    [[TMP22:%.*]] = zext i32 [[ADD]] to i64
 // CHECK9-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK9-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 3, ptr [[TMP24]], align 4
 // CHECK9-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1390,7 +1390,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[ADD14:%.*]] = add nsw i32 [[TMP56]], 1
 // CHECK9-NEXT:    [[TMP57:%.*]] = zext i32 [[ADD14]] to i64
 // CHECK9-NEXT:    [[TMP58:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP58]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP58]], align 4
 // CHECK9-NEXT:    [[TMP59:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 3, ptr [[TMP59]], align 4
 // CHECK9-NEXT:    [[TMP60:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -1472,7 +1472,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[ADD30:%.*]] = add nsw i32 [[TMP97]], 1
 // CHECK9-NEXT:    [[TMP98:%.*]] = zext i32 [[ADD30]] to i64
 // CHECK9-NEXT:    [[TMP99:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP99]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP99]], align 4
 // CHECK9-NEXT:    [[TMP100:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 4, ptr [[TMP100]], align 4
 // CHECK9-NEXT:    [[TMP101:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 2
@@ -1944,7 +1944,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK9-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK9-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1985,7 +1985,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS2]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK9-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK9-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2
@@ -2026,7 +2026,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS8]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS9]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP45]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP45]], align 4
 // CHECK9-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 1, ptr [[TMP46]], align 4
 // CHECK9-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 2
@@ -2417,7 +2417,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP21]], 1
 // CHECK11-NEXT:    [[TMP22:%.*]] = zext i32 [[ADD]] to i64
 // CHECK11-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK11-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 3, ptr [[TMP24]], align 4
 // CHECK11-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2489,7 +2489,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[ADD14:%.*]] = add nsw i32 [[TMP57]], 1
 // CHECK11-NEXT:    [[TMP58:%.*]] = zext i32 [[ADD14]] to i64
 // CHECK11-NEXT:    [[TMP59:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP59]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP59]], align 4
 // CHECK11-NEXT:    [[TMP60:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 3, ptr [[TMP60]], align 4
 // CHECK11-NEXT:    [[TMP61:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -2572,7 +2572,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[ADD30:%.*]] = add nsw i32 [[TMP99]], 1
 // CHECK11-NEXT:    [[TMP100:%.*]] = zext i32 [[ADD30]] to i64
 // CHECK11-NEXT:    [[TMP101:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP101]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP101]], align 4
 // CHECK11-NEXT:    [[TMP102:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 4, ptr [[TMP102]], align 4
 // CHECK11-NEXT:    [[TMP103:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 2
@@ -3041,7 +3041,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK11-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK11-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -3082,7 +3082,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS2]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK11-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK11-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2
@@ -3123,7 +3123,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS8]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS9]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP45]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP45]], align 4
 // CHECK11-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 1, ptr [[TMP46]], align 4
 // CHECK11-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_teams_distribute_simd_firstprivate_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_simd_firstprivate_codegen.cpp
index 425bcaf6c3f3d5..d22aeee120d5d5 100644
--- a/clang/test/OpenMP/target_teams_distribute_simd_firstprivate_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_distribute_simd_firstprivate_codegen.cpp
@@ -303,7 +303,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP21]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP21]], align 4
 // CHECK1-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 5, ptr [[TMP22]], align 4
 // CHECK1-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -591,7 +591,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP18]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP18]], align 4
 // CHECK1-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 4, ptr [[TMP19]], align 4
 // CHECK1-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1119,7 +1119,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP21]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP21]], align 4
 // CHECK3-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 5, ptr [[TMP22]], align 4
 // CHECK3-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1405,7 +1405,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP18]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP18]], align 4
 // CHECK3-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 4, ptr [[TMP19]], align 4
 // CHECK3-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_teams_distribute_simd_lastprivate_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_simd_lastprivate_codegen.cpp
index 4c9e83afb3bb3e..93c570ac0604ff 100644
--- a/clang/test/OpenMP/target_teams_distribute_simd_lastprivate_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_distribute_simd_lastprivate_codegen.cpp
@@ -572,7 +572,7 @@ int main() {
 // CHECK9-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK9-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 5, ptr [[TMP24]], align 4
 // CHECK9-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -894,7 +894,7 @@ int main() {
 // CHECK9-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP18]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP18]], align 4
 // CHECK9-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 4, ptr [[TMP19]], align 4
 // CHECK9-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1287,7 +1287,7 @@ int main() {
 // CHECK11-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK11-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 5, ptr [[TMP24]], align 4
 // CHECK11-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1607,7 +1607,7 @@ int main() {
 // CHECK11-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP18]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP18]], align 4
 // CHECK11-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 4, ptr [[TMP19]], align 4
 // CHECK11-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_teams_distribute_simd_private_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_simd_private_codegen.cpp
index 6cfec0010d00cb..4e9b4e1027e686 100644
--- a/clang/test/OpenMP/target_teams_distribute_simd_private_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_distribute_simd_private_codegen.cpp
@@ -240,7 +240,7 @@ int main() {
 // CHECK1-NEXT:    [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
 // CHECK1-NEXT:    store i32 0, ptr [[RETVAL]], align 4
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -416,7 +416,7 @@ int main() {
 // CHECK1-NEXT:    store ptr [[TEST]], ptr [[VAR]], align 8
 // CHECK1-NEXT:    store ptr undef, ptr [[_TMP1]], align 8
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -782,7 +782,7 @@ int main() {
 // CHECK3-NEXT:    [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
 // CHECK3-NEXT:    store i32 0, ptr [[RETVAL]], align 4
 // CHECK3-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK3-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK3-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -956,7 +956,7 @@ int main() {
 // CHECK3-NEXT:    store ptr [[TEST]], ptr [[VAR]], align 4
 // CHECK3-NEXT:    store ptr undef, ptr [[_TMP1]], align 4
 // CHECK3-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK3-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK3-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_teams_distribute_simd_reduction_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_simd_reduction_codegen.cpp
index 1481b4fe85fde9..667450c32f1dd0 100644
--- a/clang/test/OpenMP/target_teams_distribute_simd_reduction_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_distribute_simd_reduction_codegen.cpp
@@ -97,7 +97,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -281,7 +281,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -462,7 +462,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -646,7 +646,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_teams_generic_loop_codegen-1.cpp b/clang/test/OpenMP/target_teams_generic_loop_codegen-1.cpp
index 190ea17e960704..ec806081a3e442 100644
--- a/clang/test/OpenMP/target_teams_generic_loop_codegen-1.cpp
+++ b/clang/test/OpenMP/target_teams_generic_loop_codegen-1.cpp
@@ -635,7 +635,7 @@ int target_teams_fun(int *g){
 // CHECK2-NEXT:    [[TMP27:%.*]] = zext i32 [[ADD]] to i64
 // CHECK2-NEXT:    [[TMP28:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP23]], 0
 // CHECK2-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK2-NEXT:    store i32 2, ptr [[TMP29]], align 4
+// CHECK2-NEXT:    store i32 3, ptr [[TMP29]], align 4
 // CHECK2-NEXT:    [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK2-NEXT:    store i32 4, ptr [[TMP30]], align 4
 // CHECK2-NEXT:    [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -702,7 +702,7 @@ int target_teams_fun(int *g){
 // CHECK2-NEXT:    [[ADD17:%.*]] = add nsw i32 [[TMP60]], 1
 // CHECK2-NEXT:    [[TMP61:%.*]] = zext i32 [[ADD17]] to i64
 // CHECK2-NEXT:    [[TMP62:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS18]], i32 0, i32 0
-// CHECK2-NEXT:    store i32 2, ptr [[TMP62]], align 4
+// CHECK2-NEXT:    store i32 3, ptr [[TMP62]], align 4
 // CHECK2-NEXT:    [[TMP63:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS18]], i32 0, i32 1
 // CHECK2-NEXT:    store i32 3, ptr [[TMP63]], align 4
 // CHECK2-NEXT:    [[TMP64:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS18]], i32 0, i32 2
@@ -1260,7 +1260,7 @@ int target_teams_fun(int *g){
 // CHECK4-NEXT:    [[TMP27:%.*]] = zext i32 [[ADD]] to i64
 // CHECK4-NEXT:    [[TMP28:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP23]], 0
 // CHECK4-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK4-NEXT:    store i32 2, ptr [[TMP29]], align 4
+// CHECK4-NEXT:    store i32 3, ptr [[TMP29]], align 4
 // CHECK4-NEXT:    [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK4-NEXT:    store i32 4, ptr [[TMP30]], align 4
 // CHECK4-NEXT:    [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1327,7 +1327,7 @@ int target_teams_fun(int *g){
 // CHECK4-NEXT:    [[ADD17:%.*]] = add nsw i32 [[TMP60]], 1
 // CHECK4-NEXT:    [[TMP61:%.*]] = zext i32 [[ADD17]] to i64
 // CHECK4-NEXT:    [[TMP62:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS18]], i32 0, i32 0
-// CHECK4-NEXT:    store i32 2, ptr [[TMP62]], align 4
+// CHECK4-NEXT:    store i32 3, ptr [[TMP62]], align 4
 // CHECK4-NEXT:    [[TMP63:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS18]], i32 0, i32 1
 // CHECK4-NEXT:    store i32 3, ptr [[TMP63]], align 4
 // CHECK4-NEXT:    [[TMP64:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS18]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_teams_generic_loop_collapse_codegen.cpp b/clang/test/OpenMP/target_teams_generic_loop_collapse_codegen.cpp
index 82ad43373327af..f4aea067ba22ab 100644
--- a/clang/test/OpenMP/target_teams_generic_loop_collapse_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_generic_loop_collapse_codegen.cpp
@@ -129,7 +129,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -365,7 +365,7 @@ int main (int argc, char **argv) {
 // CHECK3-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -665,7 +665,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[TMP35:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_3]], align 8
 // CHECK9-NEXT:    [[ADD:%.*]] = add nsw i64 [[TMP35]], 1
 // CHECK9-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP36]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP36]], align 4
 // CHECK9-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 5, ptr [[TMP37]], align 4
 // CHECK9-NEXT:    [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1027,7 +1027,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK9-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK9-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1327,7 +1327,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[TMP34:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_3]], align 8
 // CHECK11-NEXT:    [[ADD:%.*]] = add nsw i64 [[TMP34]], 1
 // CHECK11-NEXT:    [[TMP35:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP35]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP35]], align 4
 // CHECK11-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 5, ptr [[TMP36]], align 4
 // CHECK11-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1691,7 +1691,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK11-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK11-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_teams_generic_loop_if_codegen.cpp b/clang/test/OpenMP/target_teams_generic_loop_if_codegen.cpp
index b2ff4c20db7a12..1edcbfe2d77798 100644
--- a/clang/test/OpenMP/target_teams_generic_loop_if_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_generic_loop_if_codegen.cpp
@@ -111,7 +111,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP:%.*]] = alloca i32, align 4
 // CHECK1-NEXT:    [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -437,7 +437,7 @@ int main() {
 // CHECK1-NEXT:    [[KERNEL_ARGS6:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK1-NEXT:    store i32 0, ptr [[RETVAL]], align 4
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -496,7 +496,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP25:%.*]] = select i1 [[TOBOOL4]], i32 0, i32 1
 // CHECK1-NEXT:    [[TMP26:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP25]], 0
 // CHECK1-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP27]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP27]], align 4
 // CHECK1-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP28]], align 4
 // CHECK1-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 2
@@ -991,7 +991,7 @@ int main() {
 // CHECK1-NEXT:    [[KERNEL_ARGS2:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK1-NEXT:    store i32 [[ARG]], ptr [[ARG_ADDR]], align 4
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1029,7 +1029,7 @@ int main() {
 // CHECK1-NEXT:    br i1 [[TOBOOL]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]]
 // CHECK1:       omp_if.then:
 // CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP16]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP16]], align 4
 // CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP17]], align 4
 // CHECK1-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_teams_generic_loop_order_codegen.cpp b/clang/test/OpenMP/target_teams_generic_loop_order_codegen.cpp
index 85f6a85a11bd81..fbf7b004d5efb0 100644
--- a/clang/test/OpenMP/target_teams_generic_loop_order_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_generic_loop_order_codegen.cpp
@@ -27,7 +27,7 @@ void gtid_test() {
 // CHECK1-NEXT:    [[TMP:%.*]] = alloca i32, align 4
 // CHECK1-NEXT:    [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_teams_generic_loop_private_codegen.cpp b/clang/test/OpenMP/target_teams_generic_loop_private_codegen.cpp
index 7503b69b92aa91..99416f76e409c5 100644
--- a/clang/test/OpenMP/target_teams_generic_loop_private_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_generic_loop_private_codegen.cpp
@@ -301,7 +301,7 @@ int main() {
 // CHECK1-NEXT:    [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
 // CHECK1-NEXT:    store i32 0, ptr [[RETVAL]], align 4
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -571,7 +571,7 @@ int main() {
 // CHECK1-NEXT:    store ptr [[TEST]], ptr [[VAR]], align 8
 // CHECK1-NEXT:    store ptr undef, ptr [[_TMP1]], align 8
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1034,7 +1034,7 @@ int main() {
 // CHECK3-NEXT:    [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
 // CHECK3-NEXT:    store i32 0, ptr [[RETVAL]], align 4
 // CHECK3-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK3-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK3-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1298,7 +1298,7 @@ int main() {
 // CHECK3-NEXT:    store ptr [[TEST]], ptr [[VAR]], align 4
 // CHECK3-NEXT:    store ptr undef, ptr [[_TMP1]], align 4
 // CHECK3-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK3-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK3-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_teams_generic_loop_reduction_codegen.cpp b/clang/test/OpenMP/target_teams_generic_loop_reduction_codegen.cpp
index 1036ffd195de00..d5f1b0b9e59dcc 100644
--- a/clang/test/OpenMP/target_teams_generic_loop_reduction_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_generic_loop_reduction_codegen.cpp
@@ -109,7 +109,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -400,7 +400,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -688,7 +688,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -975,7 +975,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_teams_generic_loop_uses_allocators_codegen.cpp b/clang/test/OpenMP/target_teams_generic_loop_uses_allocators_codegen.cpp
index 0dc2c95641e286..5001f9cc891511 100644
--- a/clang/test/OpenMP/target_teams_generic_loop_uses_allocators_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_generic_loop_uses_allocators_codegen.cpp
@@ -291,7 +291,7 @@ void foo() {
 // CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_teams_map_codegen.cpp b/clang/test/OpenMP/target_teams_map_codegen.cpp
index 9ccf74514691ba..974ba36e6bb0d6 100644
--- a/clang/test/OpenMP/target_teams_map_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_map_codegen.cpp
@@ -100,7 +100,7 @@ void mapInt128() {
 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP8]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP8]], align 4
 // CHECK1-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 2, ptr [[TMP9]], align 4
 // CHECK1-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -178,7 +178,7 @@ void mapInt128() {
 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP8]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP8]], align 4
 // CHECK1-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 2, ptr [[TMP9]], align 4
 // CHECK1-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -272,7 +272,7 @@ void mapInt128() {
 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP8]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP8]], align 4
 // CHECK1-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 2, ptr [[TMP9]], align 4
 // CHECK1-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -416,7 +416,7 @@ void mapInt128() {
 // CHECK1-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -494,7 +494,7 @@ void mapInt128() {
 // CHECK1-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -572,7 +572,7 @@ void mapInt128() {
 // CHECK1-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -668,7 +668,7 @@ void mapInt128() {
 // CHECK1-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP11]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP11]], align 4
 // CHECK1-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 3, ptr [[TMP12]], align 4
 // CHECK1-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -721,7 +721,7 @@ void mapInt128() {
 // CHECK1-NEXT:    [[TMP35:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS2]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP37]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP37]], align 4
 // CHECK1-NEXT:    [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 3, ptr [[TMP38]], align 4
 // CHECK1-NEXT:    [[TMP39:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 2
@@ -1031,7 +1031,7 @@ void mapInt128() {
 // CHECK1-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP11]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP11]], align 4
 // CHECK1-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 3, ptr [[TMP12]], align 4
 // CHECK1-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1084,7 +1084,7 @@ void mapInt128() {
 // CHECK1-NEXT:    [[TMP35:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS2]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP37]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP37]], align 4
 // CHECK1-NEXT:    [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 3, ptr [[TMP38]], align 4
 // CHECK1-NEXT:    [[TMP39:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 2
@@ -1327,7 +1327,7 @@ void mapInt128() {
 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP8]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP8]], align 4
 // CHECK3-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 2, ptr [[TMP9]], align 4
 // CHECK3-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1405,7 +1405,7 @@ void mapInt128() {
 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP8]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP8]], align 4
 // CHECK3-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 2, ptr [[TMP9]], align 4
 // CHECK3-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1499,7 +1499,7 @@ void mapInt128() {
 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP8]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP8]], align 4
 // CHECK3-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 2, ptr [[TMP9]], align 4
 // CHECK3-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1643,7 +1643,7 @@ void mapInt128() {
 // CHECK3-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1721,7 +1721,7 @@ void mapInt128() {
 // CHECK3-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1799,7 +1799,7 @@ void mapInt128() {
 // CHECK3-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1895,7 +1895,7 @@ void mapInt128() {
 // CHECK3-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP11]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP11]], align 4
 // CHECK3-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 3, ptr [[TMP12]], align 4
 // CHECK3-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1948,7 +1948,7 @@ void mapInt128() {
 // CHECK3-NEXT:    [[TMP35:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS2]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP37]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP37]], align 4
 // CHECK3-NEXT:    [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 3, ptr [[TMP38]], align 4
 // CHECK3-NEXT:    [[TMP39:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS4]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_teams_num_teams_codegen.cpp b/clang/test/OpenMP/target_teams_num_teams_codegen.cpp
index e5618aefaf3cbd..ba01a411670ce5 100644
--- a/clang/test/OpenMP/target_teams_num_teams_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_num_teams_codegen.cpp
@@ -256,7 +256,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP17:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
 // CHECK1-NEXT:    [[TMP18:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP17]], 0
 // CHECK1-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP19]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP19]], align 4
 // CHECK1-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 3, ptr [[TMP20]], align 4
 // CHECK1-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -298,7 +298,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS3]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP38:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS4]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP39:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP39]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP39]], align 4
 // CHECK1-NEXT:    [[TMP40:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP40]], align 4
 // CHECK1-NEXT:    [[TMP41:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 2
@@ -369,7 +369,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
 // CHECK1-NEXT:    [[TMP9:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP8]], 0
 // CHECK1-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP10]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP10]], align 4
 // CHECK1-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP11]], align 4
 // CHECK1-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -418,7 +418,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP33:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
 // CHECK1-NEXT:    [[TMP34:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP33]], 0
 // CHECK1-NEXT:    [[TMP35:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP35]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP35]], align 4
 // CHECK1-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP36]], align 4
 // CHECK1-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 2
@@ -473,7 +473,7 @@ int bar(int n){
 // CHECK1-NEXT:    store i32 [[N]], ptr [[N_ADDR]], align 4
 // CHECK1-NEXT:    store i32 0, ptr [[A]], align 4
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -541,7 +541,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP34:%.*]] = sext i16 [[TMP33]] to i32
 // CHECK1-NEXT:    [[TMP35:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP34]], 0
 // CHECK1-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP36]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP36]], align 4
 // CHECK1-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 3, ptr [[TMP37]], align 4
 // CHECK1-NEXT:    [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 2
@@ -834,7 +834,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP17:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
 // CHECK3-NEXT:    [[TMP18:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP17]], 0
 // CHECK3-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP19]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP19]], align 4
 // CHECK3-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 3, ptr [[TMP20]], align 4
 // CHECK3-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -876,7 +876,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS3]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP38:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS4]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP39:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP39]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP39]], align 4
 // CHECK3-NEXT:    [[TMP40:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP40]], align 4
 // CHECK3-NEXT:    [[TMP41:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 2
@@ -947,7 +947,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
 // CHECK3-NEXT:    [[TMP9:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP8]], 0
 // CHECK3-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP10]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP10]], align 4
 // CHECK3-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP11]], align 4
 // CHECK3-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -996,7 +996,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP33:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
 // CHECK3-NEXT:    [[TMP34:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP33]], 0
 // CHECK3-NEXT:    [[TMP35:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP35]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP35]], align 4
 // CHECK3-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP36]], align 4
 // CHECK3-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 2
@@ -1051,7 +1051,7 @@ int bar(int n){
 // CHECK3-NEXT:    store i32 [[N]], ptr [[N_ADDR]], align 4
 // CHECK3-NEXT:    store i32 0, ptr [[A]], align 4
 // CHECK3-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK3-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK3-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1119,7 +1119,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP34:%.*]] = sext i16 [[TMP33]] to i32
 // CHECK3-NEXT:    [[TMP35:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP34]], 0
 // CHECK3-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP36]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP36]], align 4
 // CHECK3-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 3, ptr [[TMP37]], align 4
 // CHECK3-NEXT:    [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 2
diff --git a/clang/test/OpenMP/target_teams_thread_limit_codegen.cpp b/clang/test/OpenMP/target_teams_thread_limit_codegen.cpp
index b6593b7e536854..c081f6e52d7afb 100644
--- a/clang/test/OpenMP/target_teams_thread_limit_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_thread_limit_codegen.cpp
@@ -256,7 +256,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP17:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
 // CHECK1-NEXT:    [[TMP18:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP17]], 0
 // CHECK1-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP19]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP19]], align 4
 // CHECK1-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 3, ptr [[TMP20]], align 4
 // CHECK1-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -298,7 +298,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS3]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP38:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS4]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP39:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP39]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP39]], align 4
 // CHECK1-NEXT:    [[TMP40:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP40]], align 4
 // CHECK1-NEXT:    [[TMP41:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 2
@@ -385,7 +385,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP16:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP14]], 0
 // CHECK1-NEXT:    [[TMP17:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP15]], 0
 // CHECK1-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP18]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP18]], align 4
 // CHECK1-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 2, ptr [[TMP19]], align 4
 // CHECK1-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -434,7 +434,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP41:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_3]], align 4
 // CHECK1-NEXT:    [[TMP42:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP41]], 0
 // CHECK1-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS8]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP43]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP43]], align 4
 // CHECK1-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS8]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP44]], align 4
 // CHECK1-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS8]], i32 0, i32 2
@@ -489,7 +489,7 @@ int bar(int n){
 // CHECK1-NEXT:    store i32 [[N]], ptr [[N_ADDR]], align 4
 // CHECK1-NEXT:    store i32 0, ptr [[A]], align 4
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -557,7 +557,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP34:%.*]] = sext i16 [[TMP33]] to i32
 // CHECK1-NEXT:    [[TMP35:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP34]], 0
 // CHECK1-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP36]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP36]], align 4
 // CHECK1-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 3, ptr [[TMP37]], align 4
 // CHECK1-NEXT:    [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 2
@@ -853,7 +853,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP17:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
 // CHECK3-NEXT:    [[TMP18:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP17]], 0
 // CHECK3-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP19]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP19]], align 4
 // CHECK3-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 3, ptr [[TMP20]], align 4
 // CHECK3-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -895,7 +895,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS3]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP38:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS4]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP39:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP39]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP39]], align 4
 // CHECK3-NEXT:    [[TMP40:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP40]], align 4
 // CHECK3-NEXT:    [[TMP41:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 2
@@ -982,7 +982,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP16:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP14]], 0
 // CHECK3-NEXT:    [[TMP17:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP15]], 0
 // CHECK3-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP18]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP18]], align 4
 // CHECK3-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 2, ptr [[TMP19]], align 4
 // CHECK3-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1031,7 +1031,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP41:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_3]], align 4
 // CHECK3-NEXT:    [[TMP42:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP41]], 0
 // CHECK3-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS8]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP43]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP43]], align 4
 // CHECK3-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS8]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP44]], align 4
 // CHECK3-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS8]], i32 0, i32 2
@@ -1086,7 +1086,7 @@ int bar(int n){
 // CHECK3-NEXT:    store i32 [[N]], ptr [[N_ADDR]], align 4
 // CHECK3-NEXT:    store i32 0, ptr [[A]], align 4
 // CHECK3-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK3-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK3-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1154,7 +1154,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP34:%.*]] = sext i16 [[TMP33]] to i32
 // CHECK3-NEXT:    [[TMP35:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP34]], 0
 // CHECK3-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP36]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP36]], align 4
 // CHECK3-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 3, ptr [[TMP37]], align 4
 // CHECK3-NEXT:    [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 2
diff --git a/clang/test/OpenMP/teams_codegen.cpp b/clang/test/OpenMP/teams_codegen.cpp
index 4aab9dae2d6ad6..91702fb47af73f 100644
--- a/clang/test/OpenMP/teams_codegen.cpp
+++ b/clang/test/OpenMP/teams_codegen.cpp
@@ -361,7 +361,7 @@ void foo() {
 // CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP7]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP7]], align 4
 // CHECK1-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP8]], align 4
 // CHECK1-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -405,7 +405,7 @@ void foo() {
 // CHECK1-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS2]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS3]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP29]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP29]], align 4
 // CHECK1-NEXT:    [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP30]], align 4
 // CHECK1-NEXT:    [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2
@@ -460,7 +460,7 @@ void foo() {
 // CHECK1-NEXT:    [[TMP56:%.*]] = load i32, ptr [[LA]], align 4
 // CHECK1-NEXT:    [[TMP57:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP56]], 0
 // CHECK1-NEXT:    [[TMP58:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP58]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP58]], align 4
 // CHECK1-NEXT:    [[TMP59:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 2, ptr [[TMP59]], align 4
 // CHECK1-NEXT:    [[TMP60:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 2
@@ -515,7 +515,7 @@ void foo() {
 // CHECK1-NEXT:    [[TMP85:%.*]] = load i32, ptr [[LA]], align 4
 // CHECK1-NEXT:    [[TMP86:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP85]], 0
 // CHECK1-NEXT:    [[TMP87:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS20]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP87]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP87]], align 4
 // CHECK1-NEXT:    [[TMP88:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS20]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 2, ptr [[TMP88]], align 4
 // CHECK1-NEXT:    [[TMP89:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS20]], i32 0, i32 2
@@ -605,7 +605,7 @@ void foo() {
 // CHECK1-NEXT:    [[TMP134:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[ADD]], 0
 // CHECK1-NEXT:    [[TMP135:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP133]], 0
 // CHECK1-NEXT:    [[TMP136:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS28]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP136]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP136]], align 4
 // CHECK1-NEXT:    [[TMP137:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS28]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 5, ptr [[TMP137]], align 4
 // CHECK1-NEXT:    [[TMP138:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS28]], i32 0, i32 2
@@ -670,7 +670,7 @@ void foo() {
 // CHECK1-NEXT:    [[TMP168:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[ADD36]], 0
 // CHECK1-NEXT:    [[TMP169:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[ADD38]], 0
 // CHECK1-NEXT:    [[TMP170:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS39]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP170]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP170]], align 4
 // CHECK1-NEXT:    [[TMP171:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS39]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 2, ptr [[TMP171]], align 4
 // CHECK1-NEXT:    [[TMP172:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS39]], i32 0, i32 2
@@ -963,7 +963,7 @@ void foo() {
 // CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP7]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP7]], align 4
 // CHECK3-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP8]], align 4
 // CHECK3-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1007,7 +1007,7 @@ void foo() {
 // CHECK3-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS2]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS3]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP29]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP29]], align 4
 // CHECK3-NEXT:    [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP30]], align 4
 // CHECK3-NEXT:    [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2
@@ -1062,7 +1062,7 @@ void foo() {
 // CHECK3-NEXT:    [[TMP56:%.*]] = load i32, ptr [[LA]], align 4
 // CHECK3-NEXT:    [[TMP57:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP56]], 0
 // CHECK3-NEXT:    [[TMP58:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP58]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP58]], align 4
 // CHECK3-NEXT:    [[TMP59:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 2, ptr [[TMP59]], align 4
 // CHECK3-NEXT:    [[TMP60:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 2
@@ -1117,7 +1117,7 @@ void foo() {
 // CHECK3-NEXT:    [[TMP85:%.*]] = load i32, ptr [[LA]], align 4
 // CHECK3-NEXT:    [[TMP86:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP85]], 0
 // CHECK3-NEXT:    [[TMP87:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS20]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP87]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP87]], align 4
 // CHECK3-NEXT:    [[TMP88:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS20]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 2, ptr [[TMP88]], align 4
 // CHECK3-NEXT:    [[TMP89:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS20]], i32 0, i32 2
@@ -1204,7 +1204,7 @@ void foo() {
 // CHECK3-NEXT:    [[TMP132:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[ADD]], 0
 // CHECK3-NEXT:    [[TMP133:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP131]], 0
 // CHECK3-NEXT:    [[TMP134:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS28]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP134]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP134]], align 4
 // CHECK3-NEXT:    [[TMP135:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS28]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 5, ptr [[TMP135]], align 4
 // CHECK3-NEXT:    [[TMP136:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS28]], i32 0, i32 2
@@ -1269,7 +1269,7 @@ void foo() {
 // CHECK3-NEXT:    [[TMP166:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[ADD36]], 0
 // CHECK3-NEXT:    [[TMP167:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[ADD38]], 0
 // CHECK3-NEXT:    [[TMP168:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS39]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP168]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP168]], align 4
 // CHECK3-NEXT:    [[TMP169:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS39]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 2, ptr [[TMP169]], align 4
 // CHECK3-NEXT:    [[TMP170:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS39]], i32 0, i32 2
@@ -1552,7 +1552,7 @@ void foo() {
 // CHECK9-NEXT:    [[TMP16:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP13]], 0
 // CHECK9-NEXT:    [[TMP17:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP15]], 0
 // CHECK9-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP18]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP18]], align 4
 // CHECK9-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 3, ptr [[TMP19]], align 4
 // CHECK9-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1616,7 +1616,7 @@ void foo() {
 // CHECK9-NEXT:    [[TMP50:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP47]], 0
 // CHECK9-NEXT:    [[TMP51:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP49]], 0
 // CHECK9-NEXT:    [[TMP52:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP52]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP52]], align 4
 // CHECK9-NEXT:    [[TMP53:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 3, ptr [[TMP53]], align 4
 // CHECK9-NEXT:    [[TMP54:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 2
@@ -1779,7 +1779,7 @@ void foo() {
 // CHECK11-NEXT:    [[TMP16:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP13]], 0
 // CHECK11-NEXT:    [[TMP17:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP15]], 0
 // CHECK11-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP18]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP18]], align 4
 // CHECK11-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 3, ptr [[TMP19]], align 4
 // CHECK11-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1843,7 +1843,7 @@ void foo() {
 // CHECK11-NEXT:    [[TMP50:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP47]], 0
 // CHECK11-NEXT:    [[TMP51:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP49]], 0
 // CHECK11-NEXT:    [[TMP52:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP52]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP52]], align 4
 // CHECK11-NEXT:    [[TMP53:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 3, ptr [[TMP53]], align 4
 // CHECK11-NEXT:    [[TMP54:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 2
@@ -2006,7 +2006,7 @@ void foo() {
 // CHECK17-NEXT:    [[TMP10:%.*]] = load i32, ptr [[A2]], align 4
 // CHECK17-NEXT:    [[TMP11:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP10]], 0
 // CHECK17-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK17-NEXT:    store i32 2, ptr [[TMP12]], align 4
+// CHECK17-NEXT:    store i32 3, ptr [[TMP12]], align 4
 // CHECK17-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK17-NEXT:    store i32 2, ptr [[TMP13]], align 4
 // CHECK17-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2062,7 +2062,7 @@ void foo() {
 // CHECK17-NEXT:    [[ADD:%.*]] = add nsw i32 [[CONV]], 123
 // CHECK17-NEXT:    [[TMP38:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[ADD]], 0
 // CHECK17-NEXT:    [[TMP39:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS8]], i32 0, i32 0
-// CHECK17-NEXT:    store i32 2, ptr [[TMP39]], align 4
+// CHECK17-NEXT:    store i32 3, ptr [[TMP39]], align 4
 // CHECK17-NEXT:    [[TMP40:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS8]], i32 0, i32 1
 // CHECK17-NEXT:    store i32 2, ptr [[TMP40]], align 4
 // CHECK17-NEXT:    [[TMP41:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS8]], i32 0, i32 2
@@ -2212,7 +2212,7 @@ void foo() {
 // CHECK19-NEXT:    [[TMP10:%.*]] = load i32, ptr [[A2]], align 4
 // CHECK19-NEXT:    [[TMP11:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP10]], 0
 // CHECK19-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK19-NEXT:    store i32 2, ptr [[TMP12]], align 4
+// CHECK19-NEXT:    store i32 3, ptr [[TMP12]], align 4
 // CHECK19-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK19-NEXT:    store i32 2, ptr [[TMP13]], align 4
 // CHECK19-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2268,7 +2268,7 @@ void foo() {
 // CHECK19-NEXT:    [[ADD:%.*]] = add nsw i32 [[CONV]], 123
 // CHECK19-NEXT:    [[TMP38:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[ADD]], 0
 // CHECK19-NEXT:    [[TMP39:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS8]], i32 0, i32 0
-// CHECK19-NEXT:    store i32 2, ptr [[TMP39]], align 4
+// CHECK19-NEXT:    store i32 3, ptr [[TMP39]], align 4
 // CHECK19-NEXT:    [[TMP40:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS8]], i32 0, i32 1
 // CHECK19-NEXT:    store i32 2, ptr [[TMP40]], align 4
 // CHECK19-NEXT:    [[TMP41:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS8]], i32 0, i32 2
diff --git a/clang/test/OpenMP/teams_distribute_codegen.cpp b/clang/test/OpenMP/teams_distribute_codegen.cpp
index 0bfadcf70d9bc3..11b5c9a212985c 100644
--- a/clang/test/OpenMP/teams_distribute_codegen.cpp
+++ b/clang/test/OpenMP/teams_distribute_codegen.cpp
@@ -254,7 +254,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP27:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP21]], 0
 // CHECK1-NEXT:    [[TMP28:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP22]], 0
 // CHECK1-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP29]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP29]], align 4
 // CHECK1-NEXT:    [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 4, ptr [[TMP30]], align 4
 // CHECK1-NEXT:    [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -314,7 +314,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[ADD14:%.*]] = add nsw i32 [[TMP56]], 1
 // CHECK1-NEXT:    [[TMP57:%.*]] = zext i32 [[ADD14]] to i64
 // CHECK1-NEXT:    [[TMP58:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP58]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP58]], align 4
 // CHECK1-NEXT:    [[TMP59:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 2, ptr [[TMP59]], align 4
 // CHECK1-NEXT:    [[TMP60:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -645,7 +645,7 @@ int main (int argc, char **argv) {
 // CHECK3-NEXT:    [[TMP27:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP21]], 0
 // CHECK3-NEXT:    [[TMP28:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP22]], 0
 // CHECK3-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP29]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP29]], align 4
 // CHECK3-NEXT:    [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 4, ptr [[TMP30]], align 4
 // CHECK3-NEXT:    [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -705,7 +705,7 @@ int main (int argc, char **argv) {
 // CHECK3-NEXT:    [[ADD14:%.*]] = add nsw i32 [[TMP56]], 1
 // CHECK3-NEXT:    [[TMP57:%.*]] = zext i32 [[ADD14]] to i64
 // CHECK3-NEXT:    [[TMP58:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP58]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP58]], align 4
 // CHECK3-NEXT:    [[TMP59:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 2, ptr [[TMP59]], align 4
 // CHECK3-NEXT:    [[TMP60:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -1016,7 +1016,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP21]], 1
 // CHECK9-NEXT:    [[TMP22:%.*]] = zext i32 [[ADD]] to i64
 // CHECK9-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK9-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 3, ptr [[TMP24]], align 4
 // CHECK9-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1226,7 +1226,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP21]], 1
 // CHECK11-NEXT:    [[TMP22:%.*]] = zext i32 [[ADD]] to i64
 // CHECK11-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK11-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 3, ptr [[TMP24]], align 4
 // CHECK11-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1403,7 +1403,7 @@ int main (int argc, char **argv) {
 // CHECK17-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK17-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK17-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK17-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK17-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK17-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1548,7 +1548,7 @@ int main (int argc, char **argv) {
 // CHECK19-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK19-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK19-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK19-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK19-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK19-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1730,7 +1730,7 @@ int main (int argc, char **argv) {
 // CHECK25-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP21]], 1
 // CHECK25-NEXT:    [[TMP22:%.*]] = zext i32 [[ADD]] to i64
 // CHECK25-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK25-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK25-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK25-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK25-NEXT:    store i32 3, ptr [[TMP24]], align 4
 // CHECK25-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1929,7 +1929,7 @@ int main (int argc, char **argv) {
 // CHECK25-NEXT:    [[TMP17:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP15]], 0
 // CHECK25-NEXT:    [[TMP18:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP16]], 0
 // CHECK25-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK25-NEXT:    store i32 2, ptr [[TMP19]], align 4
+// CHECK25-NEXT:    store i32 3, ptr [[TMP19]], align 4
 // CHECK25-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK25-NEXT:    store i32 3, ptr [[TMP20]], align 4
 // CHECK25-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2116,7 +2116,7 @@ int main (int argc, char **argv) {
 // CHECK27-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP21]], 1
 // CHECK27-NEXT:    [[TMP22:%.*]] = zext i32 [[ADD]] to i64
 // CHECK27-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK27-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK27-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK27-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK27-NEXT:    store i32 3, ptr [[TMP24]], align 4
 // CHECK27-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2314,7 +2314,7 @@ int main (int argc, char **argv) {
 // CHECK27-NEXT:    [[TMP17:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP15]], 0
 // CHECK27-NEXT:    [[TMP18:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP16]], 0
 // CHECK27-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK27-NEXT:    store i32 2, ptr [[TMP19]], align 4
+// CHECK27-NEXT:    store i32 3, ptr [[TMP19]], align 4
 // CHECK27-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK27-NEXT:    store i32 3, ptr [[TMP20]], align 4
 // CHECK27-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/teams_distribute_collapse_codegen.cpp b/clang/test/OpenMP/teams_distribute_collapse_codegen.cpp
index 178af730a8d4ae..42b355f39e6b9e 100644
--- a/clang/test/OpenMP/teams_distribute_collapse_codegen.cpp
+++ b/clang/test/OpenMP/teams_distribute_collapse_codegen.cpp
@@ -127,7 +127,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -288,7 +288,7 @@ int main (int argc, char **argv) {
 // CHECK3-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -517,7 +517,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[TMP35:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_3]], align 8
 // CHECK9-NEXT:    [[ADD:%.*]] = add nsw i64 [[TMP35]], 1
 // CHECK9-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP36]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP36]], align 4
 // CHECK9-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 5, ptr [[TMP37]], align 4
 // CHECK9-NEXT:    [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -747,7 +747,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK9-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK9-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -972,7 +972,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[TMP34:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_3]], align 8
 // CHECK11-NEXT:    [[ADD:%.*]] = add nsw i64 [[TMP34]], 1
 // CHECK11-NEXT:    [[TMP35:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP35]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP35]], align 4
 // CHECK11-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 5, ptr [[TMP36]], align 4
 // CHECK11-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1200,7 +1200,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK11-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK11-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/teams_distribute_dist_schedule_codegen.cpp b/clang/test/OpenMP/teams_distribute_dist_schedule_codegen.cpp
index f21c7e9be9205a..39c260996fcbd2 100644
--- a/clang/test/OpenMP/teams_distribute_dist_schedule_codegen.cpp
+++ b/clang/test/OpenMP/teams_distribute_dist_schedule_codegen.cpp
@@ -170,7 +170,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -212,7 +212,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS3]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS4]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK1-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK1-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 2
@@ -254,7 +254,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS11]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS12]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP45]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP45]], align 4
 // CHECK1-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP46]], align 4
 // CHECK1-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -582,7 +582,7 @@ int main (int argc, char **argv) {
 // CHECK3-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -624,7 +624,7 @@ int main (int argc, char **argv) {
 // CHECK3-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS3]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS4]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK3-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK3-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 2
@@ -666,7 +666,7 @@ int main (int argc, char **argv) {
 // CHECK3-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS11]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS12]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP45]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP45]], align 4
 // CHECK3-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP46]], align 4
 // CHECK3-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -1037,7 +1037,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP21]], 1
 // CHECK9-NEXT:    [[TMP22:%.*]] = zext i32 [[ADD]] to i64
 // CHECK9-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK9-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 3, ptr [[TMP24]], align 4
 // CHECK9-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1108,7 +1108,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[ADD14:%.*]] = add nsw i32 [[TMP56]], 1
 // CHECK9-NEXT:    [[TMP57:%.*]] = zext i32 [[ADD14]] to i64
 // CHECK9-NEXT:    [[TMP58:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP58]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP58]], align 4
 // CHECK9-NEXT:    [[TMP59:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 3, ptr [[TMP59]], align 4
 // CHECK9-NEXT:    [[TMP60:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -1179,7 +1179,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[ADD29:%.*]] = add nsw i32 [[TMP91]], 1
 // CHECK9-NEXT:    [[TMP92:%.*]] = zext i32 [[ADD29]] to i64
 // CHECK9-NEXT:    [[TMP93:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP93]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP93]], align 4
 // CHECK9-NEXT:    [[TMP94:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 3, ptr [[TMP94]], align 4
 // CHECK9-NEXT:    [[TMP95:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 2
@@ -1607,7 +1607,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK9-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK9-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1648,7 +1648,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS2]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK9-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK9-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2
@@ -1689,7 +1689,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS8]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS9]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP45]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP45]], align 4
 // CHECK9-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 1, ptr [[TMP46]], align 4
 // CHECK9-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 2
@@ -2057,7 +2057,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP21]], 1
 // CHECK11-NEXT:    [[TMP22:%.*]] = zext i32 [[ADD]] to i64
 // CHECK11-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK11-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 3, ptr [[TMP24]], align 4
 // CHECK11-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2129,7 +2129,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[ADD14:%.*]] = add nsw i32 [[TMP57]], 1
 // CHECK11-NEXT:    [[TMP58:%.*]] = zext i32 [[ADD14]] to i64
 // CHECK11-NEXT:    [[TMP59:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP59]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP59]], align 4
 // CHECK11-NEXT:    [[TMP60:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 3, ptr [[TMP60]], align 4
 // CHECK11-NEXT:    [[TMP61:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -2201,7 +2201,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[ADD29:%.*]] = add nsw i32 [[TMP93]], 1
 // CHECK11-NEXT:    [[TMP94:%.*]] = zext i32 [[ADD29]] to i64
 // CHECK11-NEXT:    [[TMP95:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP95]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP95]], align 4
 // CHECK11-NEXT:    [[TMP96:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 3, ptr [[TMP96]], align 4
 // CHECK11-NEXT:    [[TMP97:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 2
@@ -2626,7 +2626,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK11-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK11-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2667,7 +2667,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS2]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK11-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK11-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2
@@ -2708,7 +2708,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS8]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS9]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP45]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP45]], align 4
 // CHECK11-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 1, ptr [[TMP46]], align 4
 // CHECK11-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 2
diff --git a/clang/test/OpenMP/teams_distribute_firstprivate_codegen.cpp b/clang/test/OpenMP/teams_distribute_firstprivate_codegen.cpp
index 57175e72b79f8a..8b3e657428ec64 100644
--- a/clang/test/OpenMP/teams_distribute_firstprivate_codegen.cpp
+++ b/clang/test/OpenMP/teams_distribute_firstprivate_codegen.cpp
@@ -306,7 +306,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP21]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP21]], align 4
 // CHECK1-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 5, ptr [[TMP22]], align 4
 // CHECK1-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -589,7 +589,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP20]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP20]], align 4
 // CHECK1-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 4, ptr [[TMP21]], align 4
 // CHECK1-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1110,7 +1110,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP21]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP21]], align 4
 // CHECK3-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 5, ptr [[TMP22]], align 4
 // CHECK3-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1391,7 +1391,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP20]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP20]], align 4
 // CHECK3-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 4, ptr [[TMP21]], align 4
 // CHECK3-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/teams_distribute_lastprivate_codegen.cpp b/clang/test/OpenMP/teams_distribute_lastprivate_codegen.cpp
index 5ee4a5ce9e29dc..66c952f3281fb5 100644
--- a/clang/test/OpenMP/teams_distribute_lastprivate_codegen.cpp
+++ b/clang/test/OpenMP/teams_distribute_lastprivate_codegen.cpp
@@ -518,7 +518,7 @@ int main() {
 // CHECK9-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK9-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 5, ptr [[TMP26]], align 4
 // CHECK9-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -829,7 +829,7 @@ int main() {
 // CHECK9-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP20]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP20]], align 4
 // CHECK9-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 4, ptr [[TMP21]], align 4
 // CHECK9-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1214,7 +1214,7 @@ int main() {
 // CHECK11-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK11-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 5, ptr [[TMP26]], align 4
 // CHECK11-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1523,7 +1523,7 @@ int main() {
 // CHECK11-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP20]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP20]], align 4
 // CHECK11-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 4, ptr [[TMP21]], align 4
 // CHECK11-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/teams_distribute_parallel_for_codegen.cpp b/clang/test/OpenMP/teams_distribute_parallel_for_codegen.cpp
index a13b565cb38d85..85b20e9b21cfad 100644
--- a/clang/test/OpenMP/teams_distribute_parallel_for_codegen.cpp
+++ b/clang/test/OpenMP/teams_distribute_parallel_for_codegen.cpp
@@ -254,7 +254,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP27:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP21]], 0
 // CHECK1-NEXT:    [[TMP28:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP22]], 0
 // CHECK1-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP29]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP29]], align 4
 // CHECK1-NEXT:    [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 4, ptr [[TMP30]], align 4
 // CHECK1-NEXT:    [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -314,7 +314,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[ADD14:%.*]] = add nsw i32 [[TMP56]], 1
 // CHECK1-NEXT:    [[TMP57:%.*]] = zext i32 [[ADD14]] to i64
 // CHECK1-NEXT:    [[TMP58:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP58]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP58]], align 4
 // CHECK1-NEXT:    [[TMP59:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 2, ptr [[TMP59]], align 4
 // CHECK1-NEXT:    [[TMP60:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -856,7 +856,7 @@ int main (int argc, char **argv) {
 // CHECK3-NEXT:    [[TMP27:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP21]], 0
 // CHECK3-NEXT:    [[TMP28:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP22]], 0
 // CHECK3-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP29]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP29]], align 4
 // CHECK3-NEXT:    [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 4, ptr [[TMP30]], align 4
 // CHECK3-NEXT:    [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -916,7 +916,7 @@ int main (int argc, char **argv) {
 // CHECK3-NEXT:    [[ADD14:%.*]] = add nsw i32 [[TMP56]], 1
 // CHECK3-NEXT:    [[TMP57:%.*]] = zext i32 [[ADD14]] to i64
 // CHECK3-NEXT:    [[TMP58:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP58]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP58]], align 4
 // CHECK3-NEXT:    [[TMP59:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 2, ptr [[TMP59]], align 4
 // CHECK3-NEXT:    [[TMP60:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -1430,7 +1430,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP21]], 1
 // CHECK9-NEXT:    [[TMP22:%.*]] = zext i32 [[ADD]] to i64
 // CHECK9-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK9-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 3, ptr [[TMP24]], align 4
 // CHECK9-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1741,7 +1741,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP21]], 1
 // CHECK11-NEXT:    [[TMP22:%.*]] = zext i32 [[ADD]] to i64
 // CHECK11-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK11-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 3, ptr [[TMP24]], align 4
 // CHECK11-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2015,7 +2015,7 @@ int main (int argc, char **argv) {
 // CHECK17-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK17-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK17-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK17-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK17-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK17-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2233,7 +2233,7 @@ int main (int argc, char **argv) {
 // CHECK19-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK19-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK19-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK19-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK19-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK19-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2484,7 +2484,7 @@ int main (int argc, char **argv) {
 // CHECK25-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP21]], 1
 // CHECK25-NEXT:    [[TMP22:%.*]] = zext i32 [[ADD]] to i64
 // CHECK25-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK25-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK25-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK25-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK25-NEXT:    store i32 3, ptr [[TMP24]], align 4
 // CHECK25-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2784,7 +2784,7 @@ int main (int argc, char **argv) {
 // CHECK25-NEXT:    [[TMP17:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP15]], 0
 // CHECK25-NEXT:    [[TMP18:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP16]], 0
 // CHECK25-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK25-NEXT:    store i32 2, ptr [[TMP19]], align 4
+// CHECK25-NEXT:    store i32 3, ptr [[TMP19]], align 4
 // CHECK25-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK25-NEXT:    store i32 3, ptr [[TMP20]], align 4
 // CHECK25-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -3044,7 +3044,7 @@ int main (int argc, char **argv) {
 // CHECK27-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP21]], 1
 // CHECK27-NEXT:    [[TMP22:%.*]] = zext i32 [[ADD]] to i64
 // CHECK27-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK27-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK27-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK27-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK27-NEXT:    store i32 3, ptr [[TMP24]], align 4
 // CHECK27-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -3339,7 +3339,7 @@ int main (int argc, char **argv) {
 // CHECK27-NEXT:    [[TMP17:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP15]], 0
 // CHECK27-NEXT:    [[TMP18:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP16]], 0
 // CHECK27-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK27-NEXT:    store i32 2, ptr [[TMP19]], align 4
+// CHECK27-NEXT:    store i32 3, ptr [[TMP19]], align 4
 // CHECK27-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK27-NEXT:    store i32 3, ptr [[TMP20]], align 4
 // CHECK27-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/teams_distribute_parallel_for_collapse_codegen.cpp b/clang/test/OpenMP/teams_distribute_parallel_for_collapse_codegen.cpp
index 64ee660b706ccf..f1c74bd6a0d792 100644
--- a/clang/test/OpenMP/teams_distribute_parallel_for_collapse_codegen.cpp
+++ b/clang/test/OpenMP/teams_distribute_parallel_for_collapse_codegen.cpp
@@ -132,7 +132,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -368,7 +368,7 @@ int main (int argc, char **argv) {
 // CHECK3-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -668,7 +668,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[TMP35:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_3]], align 8
 // CHECK9-NEXT:    [[ADD:%.*]] = add nsw i64 [[TMP35]], 1
 // CHECK9-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP36]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP36]], align 4
 // CHECK9-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 5, ptr [[TMP37]], align 4
 // CHECK9-NEXT:    [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1018,7 +1018,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK9-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK9-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1318,7 +1318,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[TMP34:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_3]], align 8
 // CHECK11-NEXT:    [[ADD:%.*]] = add nsw i64 [[TMP34]], 1
 // CHECK11-NEXT:    [[TMP35:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP35]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP35]], align 4
 // CHECK11-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 5, ptr [[TMP36]], align 4
 // CHECK11-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1670,7 +1670,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK11-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK11-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/teams_distribute_parallel_for_copyin_codegen.cpp b/clang/test/OpenMP/teams_distribute_parallel_for_copyin_codegen.cpp
index 37d19f86a9509b..2fda35f3e0009d 100644
--- a/clang/test/OpenMP/teams_distribute_parallel_for_copyin_codegen.cpp
+++ b/clang/test/OpenMP/teams_distribute_parallel_for_copyin_codegen.cpp
@@ -123,7 +123,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP11]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP11]], align 4
 // CHECK1-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 2, ptr [[TMP12]], align 4
 // CHECK1-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -347,7 +347,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP11]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP11]], align 4
 // CHECK1-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 2, ptr [[TMP12]], align 4
 // CHECK1-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -578,7 +578,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP11]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP11]], align 4
 // CHECK3-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 2, ptr [[TMP12]], align 4
 // CHECK3-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -797,7 +797,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP11]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP11]], align 4
 // CHECK3-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 2, ptr [[TMP12]], align 4
 // CHECK3-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/teams_distribute_parallel_for_dist_schedule_codegen.cpp b/clang/test/OpenMP/teams_distribute_parallel_for_dist_schedule_codegen.cpp
index eb3d2fc84568ab..070bb1ce3fed26 100644
--- a/clang/test/OpenMP/teams_distribute_parallel_for_dist_schedule_codegen.cpp
+++ b/clang/test/OpenMP/teams_distribute_parallel_for_dist_schedule_codegen.cpp
@@ -182,7 +182,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -224,7 +224,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS3]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS4]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK1-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK1-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 2
@@ -266,7 +266,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS11]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS12]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP45]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP45]], align 4
 // CHECK1-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP46]], align 4
 // CHECK1-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -816,7 +816,7 @@ int main (int argc, char **argv) {
 // CHECK3-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -858,7 +858,7 @@ int main (int argc, char **argv) {
 // CHECK3-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS3]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS4]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK3-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK3-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 2
@@ -900,7 +900,7 @@ int main (int argc, char **argv) {
 // CHECK3-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS11]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS12]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP45]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP45]], align 4
 // CHECK3-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP46]], align 4
 // CHECK3-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -1484,7 +1484,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP21]], 1
 // CHECK9-NEXT:    [[TMP22:%.*]] = zext i32 [[ADD]] to i64
 // CHECK9-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK9-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 3, ptr [[TMP24]], align 4
 // CHECK9-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1555,7 +1555,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[ADD14:%.*]] = add nsw i32 [[TMP56]], 1
 // CHECK9-NEXT:    [[TMP57:%.*]] = zext i32 [[ADD14]] to i64
 // CHECK9-NEXT:    [[TMP58:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP58]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP58]], align 4
 // CHECK9-NEXT:    [[TMP59:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 3, ptr [[TMP59]], align 4
 // CHECK9-NEXT:    [[TMP60:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -1635,7 +1635,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[ADD29:%.*]] = add nsw i32 [[TMP96]], 1
 // CHECK9-NEXT:    [[TMP97:%.*]] = zext i32 [[ADD29]] to i64
 // CHECK9-NEXT:    [[TMP98:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP98]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP98]], align 4
 // CHECK9-NEXT:    [[TMP99:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 4, ptr [[TMP99]], align 4
 // CHECK9-NEXT:    [[TMP100:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 2
@@ -2384,7 +2384,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK9-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK9-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2425,7 +2425,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS2]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK9-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK9-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2
@@ -2475,7 +2475,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[TMP48:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS8]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP49:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS9]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP50:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP50]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP50]], align 4
 // CHECK9-NEXT:    [[TMP51:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 2, ptr [[TMP51]], align 4
 // CHECK9-NEXT:    [[TMP52:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 2
@@ -3086,7 +3086,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP21]], 1
 // CHECK11-NEXT:    [[TMP22:%.*]] = zext i32 [[ADD]] to i64
 // CHECK11-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK11-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 3, ptr [[TMP24]], align 4
 // CHECK11-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -3158,7 +3158,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[ADD14:%.*]] = add nsw i32 [[TMP57]], 1
 // CHECK11-NEXT:    [[TMP58:%.*]] = zext i32 [[ADD14]] to i64
 // CHECK11-NEXT:    [[TMP59:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP59]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP59]], align 4
 // CHECK11-NEXT:    [[TMP60:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 3, ptr [[TMP60]], align 4
 // CHECK11-NEXT:    [[TMP61:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -3239,7 +3239,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[ADD29:%.*]] = add nsw i32 [[TMP98]], 1
 // CHECK11-NEXT:    [[TMP99:%.*]] = zext i32 [[ADD29]] to i64
 // CHECK11-NEXT:    [[TMP100:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP100]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP100]], align 4
 // CHECK11-NEXT:    [[TMP101:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 4, ptr [[TMP101]], align 4
 // CHECK11-NEXT:    [[TMP102:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 2
@@ -3973,7 +3973,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK11-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK11-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -4014,7 +4014,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS2]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK11-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK11-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2
@@ -4064,7 +4064,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[TMP48:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS8]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP49:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS9]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP50:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP50]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP50]], align 4
 // CHECK11-NEXT:    [[TMP51:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 2, ptr [[TMP51]], align 4
 // CHECK11-NEXT:    [[TMP52:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 2
diff --git a/clang/test/OpenMP/teams_distribute_parallel_for_firstprivate_codegen.cpp b/clang/test/OpenMP/teams_distribute_parallel_for_firstprivate_codegen.cpp
index 904e6a1c1ce7bb..0726fb659e7d15 100644
--- a/clang/test/OpenMP/teams_distribute_parallel_for_firstprivate_codegen.cpp
+++ b/clang/test/OpenMP/teams_distribute_parallel_for_firstprivate_codegen.cpp
@@ -340,7 +340,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP21]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP21]], align 4
 // CHECK1-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 5, ptr [[TMP22]], align 4
 // CHECK1-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -753,7 +753,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP20]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP20]], align 4
 // CHECK1-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 4, ptr [[TMP21]], align 4
 // CHECK1-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1404,7 +1404,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP21]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP21]], align 4
 // CHECK3-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 5, ptr [[TMP22]], align 4
 // CHECK3-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1811,7 +1811,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP20]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP20]], align 4
 // CHECK3-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 4, ptr [[TMP21]], align 4
 // CHECK3-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/teams_distribute_parallel_for_if_codegen.cpp b/clang/test/OpenMP/teams_distribute_parallel_for_if_codegen.cpp
index 49ff6a5d08beed..aed27c47fa1d36 100644
--- a/clang/test/OpenMP/teams_distribute_parallel_for_if_codegen.cpp
+++ b/clang/test/OpenMP/teams_distribute_parallel_for_if_codegen.cpp
@@ -121,7 +121,7 @@ int main() {
 // CHECK1-NEXT:    [[_TMP1:%.*]] = alloca i32, align 4
 // CHECK1-NEXT:    [[KERNEL_ARGS2:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -154,7 +154,7 @@ int main() {
 // CHECK1-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK1:       omp_offload.cont:
 // CHECK1-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP16]], align 4
 // CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -486,7 +486,7 @@ int main() {
 // CHECK1-NEXT:    [[KERNEL_ARGS7:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK1-NEXT:    store i32 0, ptr [[RETVAL]], align 4
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -519,7 +519,7 @@ int main() {
 // CHECK1-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK1:       omp_offload.cont:
 // CHECK1-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP16]], align 4
 // CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -571,7 +571,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP39:%.*]] = select i1 [[TOBOOL5]], i32 0, i32 1
 // CHECK1-NEXT:    [[TMP40:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP39]], 0
 // CHECK1-NEXT:    [[TMP41:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP41]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP41]], align 4
 // CHECK1-NEXT:    [[TMP42:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP42]], align 4
 // CHECK1-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 2
@@ -1073,7 +1073,7 @@ int main() {
 // CHECK1-NEXT:    [[KERNEL_ARGS7:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK1-NEXT:    store i32 [[ARG]], ptr [[ARG_ADDR]], align 4
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1106,7 +1106,7 @@ int main() {
 // CHECK1-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK1:       omp_offload.cont:
 // CHECK1-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP16]], align 4
 // CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -1158,7 +1158,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP39:%.*]] = select i1 [[TOBOOL5]], i32 0, i32 1
 // CHECK1-NEXT:    [[TMP40:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP39]], 0
 // CHECK1-NEXT:    [[TMP41:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP41]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP41]], align 4
 // CHECK1-NEXT:    [[TMP42:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP42]], align 4
 // CHECK1-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 2
diff --git a/clang/test/OpenMP/teams_distribute_parallel_for_lastprivate_codegen.cpp b/clang/test/OpenMP/teams_distribute_parallel_for_lastprivate_codegen.cpp
index 7b8df7367b828f..06fc87d7da4c6c 100644
--- a/clang/test/OpenMP/teams_distribute_parallel_for_lastprivate_codegen.cpp
+++ b/clang/test/OpenMP/teams_distribute_parallel_for_lastprivate_codegen.cpp
@@ -766,7 +766,7 @@ int main() {
 // CHECK9-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK9-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 5, ptr [[TMP26]], align 4
 // CHECK9-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1225,7 +1225,7 @@ int main() {
 // CHECK9-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP20]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP20]], align 4
 // CHECK9-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 4, ptr [[TMP21]], align 4
 // CHECK9-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1752,7 +1752,7 @@ int main() {
 // CHECK11-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK11-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 5, ptr [[TMP26]], align 4
 // CHECK11-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2205,7 +2205,7 @@ int main() {
 // CHECK11-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP20]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP20]], align 4
 // CHECK11-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 4, ptr [[TMP21]], align 4
 // CHECK11-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/teams_distribute_parallel_for_num_threads_codegen.cpp b/clang/test/OpenMP/teams_distribute_parallel_for_num_threads_codegen.cpp
index a7b1f6eb309ee9..6b200403637b6f 100644
--- a/clang/test/OpenMP/teams_distribute_parallel_for_num_threads_codegen.cpp
+++ b/clang/test/OpenMP/teams_distribute_parallel_for_num_threads_codegen.cpp
@@ -97,7 +97,7 @@ int main() {
 // CHECK1:       invoke.cont:
 // CHECK1-NEXT:    store i8 [[CALL]], ptr [[A]], align 1
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -155,7 +155,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP27:%.*]] = zext i8 [[TMP26]] to i32
 // CHECK1-NEXT:    [[TMP28:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP27]], 0
 // CHECK1-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP29]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP29]], align 4
 // CHECK1-NEXT:    [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP30]], align 4
 // CHECK1-NEXT:    [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -556,7 +556,7 @@ int main() {
 // CHECK1-NEXT:    [[_TMP1:%.*]] = alloca i32, align 4
 // CHECK1-NEXT:    [[KERNEL_ARGS2:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -589,7 +589,7 @@ int main() {
 // CHECK1-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK1:       omp_offload.cont:
 // CHECK1-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP16]], align 4
 // CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -634,7 +634,7 @@ int main() {
 // CHECK1-NEXT:    [[_TMP1:%.*]] = alloca i32, align 4
 // CHECK1-NEXT:    [[KERNEL_ARGS2:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -676,7 +676,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP16:%.*]] = zext i8 [[TMP15]] to i32
 // CHECK1-NEXT:    [[TMP17:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP16]], 0
 // CHECK1-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP18]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP18]], align 4
 // CHECK1-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP19]], align 4
 // CHECK1-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -1384,7 +1384,7 @@ int main() {
 // CHECK5:       invoke.cont:
 // CHECK5-NEXT:    store i8 [[CALL]], ptr [[A]], align 1
 // CHECK5-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK5-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK5-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1442,7 +1442,7 @@ int main() {
 // CHECK5-NEXT:    [[TMP27:%.*]] = zext i8 [[TMP26]] to i32
 // CHECK5-NEXT:    [[TMP28:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP27]], 0
 // CHECK5-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP29]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP29]], align 4
 // CHECK5-NEXT:    [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 1, ptr [[TMP30]], align 4
 // CHECK5-NEXT:    [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -1843,7 +1843,7 @@ int main() {
 // CHECK5-NEXT:    [[_TMP1:%.*]] = alloca i32, align 4
 // CHECK5-NEXT:    [[KERNEL_ARGS2:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK5-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK5-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK5-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1876,7 +1876,7 @@ int main() {
 // CHECK5-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK5:       omp_offload.cont:
 // CHECK5-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK5-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 0, ptr [[TMP16]], align 4
 // CHECK5-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -1921,7 +1921,7 @@ int main() {
 // CHECK5-NEXT:    [[_TMP1:%.*]] = alloca i32, align 4
 // CHECK5-NEXT:    [[KERNEL_ARGS2:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK5-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK5-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK5-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1963,7 +1963,7 @@ int main() {
 // CHECK5-NEXT:    [[TMP16:%.*]] = zext i8 [[TMP15]] to i32
 // CHECK5-NEXT:    [[TMP17:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP16]], 0
 // CHECK5-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP18]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP18]], align 4
 // CHECK5-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 0, ptr [[TMP19]], align 4
 // CHECK5-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
diff --git a/clang/test/OpenMP/teams_distribute_parallel_for_private_codegen.cpp b/clang/test/OpenMP/teams_distribute_parallel_for_private_codegen.cpp
index dc430fc55787c1..4d10d16f47ed87 100644
--- a/clang/test/OpenMP/teams_distribute_parallel_for_private_codegen.cpp
+++ b/clang/test/OpenMP/teams_distribute_parallel_for_private_codegen.cpp
@@ -263,7 +263,7 @@ int main() {
 // CHECK1-NEXT:    [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
 // CHECK1-NEXT:    store i32 0, ptr [[RETVAL]], align 4
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -533,7 +533,7 @@ int main() {
 // CHECK1-NEXT:    store ptr [[TEST]], ptr [[VAR]], align 8
 // CHECK1-NEXT:    store ptr undef, ptr [[_TMP1]], align 8
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -996,7 +996,7 @@ int main() {
 // CHECK3-NEXT:    [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
 // CHECK3-NEXT:    store i32 0, ptr [[RETVAL]], align 4
 // CHECK3-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK3-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK3-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1260,7 +1260,7 @@ int main() {
 // CHECK3-NEXT:    store ptr [[TEST]], ptr [[VAR]], align 4
 // CHECK3-NEXT:    store ptr undef, ptr [[_TMP1]], align 4
 // CHECK3-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK3-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK3-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/teams_distribute_parallel_for_proc_bind_codegen.cpp b/clang/test/OpenMP/teams_distribute_parallel_for_proc_bind_codegen.cpp
index 386e772b78970f..5e3962d96121cf 100644
--- a/clang/test/OpenMP/teams_distribute_parallel_for_proc_bind_codegen.cpp
+++ b/clang/test/OpenMP/teams_distribute_parallel_for_proc_bind_codegen.cpp
@@ -60,7 +60,7 @@ int main() {
 // CHECK1-NEXT:    [[KERNEL_ARGS2:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK1-NEXT:    store i32 0, ptr [[RETVAL]], align 4
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -93,7 +93,7 @@ int main() {
 // CHECK1-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK1:       omp_offload.cont:
 // CHECK1-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP16]], align 4
 // CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -411,7 +411,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP:%.*]] = alloca i32, align 4
 // CHECK1-NEXT:    [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/teams_distribute_parallel_for_reduction_codegen.cpp b/clang/test/OpenMP/teams_distribute_parallel_for_reduction_codegen.cpp
index 7ac42579e91b95..2e800745baf2fb 100644
--- a/clang/test/OpenMP/teams_distribute_parallel_for_reduction_codegen.cpp
+++ b/clang/test/OpenMP/teams_distribute_parallel_for_reduction_codegen.cpp
@@ -116,7 +116,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP7]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP7]], align 4
 // CHECK1-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP8]], align 4
 // CHECK1-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -410,7 +410,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP7]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP7]], align 4
 // CHECK1-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP8]], align 4
 // CHECK1-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -701,7 +701,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP7]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP7]], align 4
 // CHECK3-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP8]], align 4
 // CHECK3-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -991,7 +991,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP7]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP7]], align 4
 // CHECK3-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP8]], align 4
 // CHECK3-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/teams_distribute_parallel_for_schedule_codegen.cpp b/clang/test/OpenMP/teams_distribute_parallel_for_schedule_codegen.cpp
index 7518179f477655..2dd2302c0f2cf6 100644
--- a/clang/test/OpenMP/teams_distribute_parallel_for_schedule_codegen.cpp
+++ b/clang/test/OpenMP/teams_distribute_parallel_for_schedule_codegen.cpp
@@ -257,7 +257,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -299,7 +299,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS3]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS4]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK1-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK1-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 2
@@ -341,7 +341,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS11]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS12]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP45]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP45]], align 4
 // CHECK1-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP46]], align 4
 // CHECK1-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -383,7 +383,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP63:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS19]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP64:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS20]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP65:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP65]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP65]], align 4
 // CHECK1-NEXT:    [[TMP66:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP66]], align 4
 // CHECK1-NEXT:    [[TMP67:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 2
@@ -425,7 +425,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP83:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS27]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP84:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS28]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP85:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP85]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP85]], align 4
 // CHECK1-NEXT:    [[TMP86:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP86]], align 4
 // CHECK1-NEXT:    [[TMP87:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 2
@@ -1284,7 +1284,7 @@ int main (int argc, char **argv) {
 // CHECK3-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1326,7 +1326,7 @@ int main (int argc, char **argv) {
 // CHECK3-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS3]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS4]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK3-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK3-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 2
@@ -1368,7 +1368,7 @@ int main (int argc, char **argv) {
 // CHECK3-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS11]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS12]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP45]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP45]], align 4
 // CHECK3-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP46]], align 4
 // CHECK3-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -1410,7 +1410,7 @@ int main (int argc, char **argv) {
 // CHECK3-NEXT:    [[TMP63:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS19]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP64:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS20]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP65:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP65]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP65]], align 4
 // CHECK3-NEXT:    [[TMP66:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP66]], align 4
 // CHECK3-NEXT:    [[TMP67:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 2
@@ -1452,7 +1452,7 @@ int main (int argc, char **argv) {
 // CHECK3-NEXT:    [[TMP83:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS27]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP84:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS28]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP85:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP85]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP85]], align 4
 // CHECK3-NEXT:    [[TMP86:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP86]], align 4
 // CHECK3-NEXT:    [[TMP87:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 2
@@ -2284,7 +2284,7 @@ int main (int argc, char **argv) {
 // CHECK5-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK5-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK5-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2326,7 +2326,7 @@ int main (int argc, char **argv) {
 // CHECK5-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS3]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS4]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK5-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK5-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 2
@@ -2368,7 +2368,7 @@ int main (int argc, char **argv) {
 // CHECK5-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS11]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS12]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP45]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP45]], align 4
 // CHECK5-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 1, ptr [[TMP46]], align 4
 // CHECK5-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -2410,7 +2410,7 @@ int main (int argc, char **argv) {
 // CHECK5-NEXT:    [[TMP63:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS19]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP64:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS20]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP65:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP65]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP65]], align 4
 // CHECK5-NEXT:    [[TMP66:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 1, ptr [[TMP66]], align 4
 // CHECK5-NEXT:    [[TMP67:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 2
@@ -2452,7 +2452,7 @@ int main (int argc, char **argv) {
 // CHECK5-NEXT:    [[TMP83:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS27]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP84:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS28]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP85:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP85]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP85]], align 4
 // CHECK5-NEXT:    [[TMP86:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 1, ptr [[TMP86]], align 4
 // CHECK5-NEXT:    [[TMP87:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 2
@@ -3311,7 +3311,7 @@ int main (int argc, char **argv) {
 // CHECK7-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK7-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK7-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK7-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK7-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK7-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -3353,7 +3353,7 @@ int main (int argc, char **argv) {
 // CHECK7-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS3]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS4]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 0
-// CHECK7-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK7-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK7-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 1
 // CHECK7-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK7-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 2
@@ -3395,7 +3395,7 @@ int main (int argc, char **argv) {
 // CHECK7-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS11]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS12]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK7-NEXT:    store i32 2, ptr [[TMP45]], align 4
+// CHECK7-NEXT:    store i32 3, ptr [[TMP45]], align 4
 // CHECK7-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK7-NEXT:    store i32 1, ptr [[TMP46]], align 4
 // CHECK7-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -3437,7 +3437,7 @@ int main (int argc, char **argv) {
 // CHECK7-NEXT:    [[TMP63:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS19]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP64:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS20]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP65:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 0
-// CHECK7-NEXT:    store i32 2, ptr [[TMP65]], align 4
+// CHECK7-NEXT:    store i32 3, ptr [[TMP65]], align 4
 // CHECK7-NEXT:    [[TMP66:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 1
 // CHECK7-NEXT:    store i32 1, ptr [[TMP66]], align 4
 // CHECK7-NEXT:    [[TMP67:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 2
@@ -3479,7 +3479,7 @@ int main (int argc, char **argv) {
 // CHECK7-NEXT:    [[TMP83:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS27]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP84:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS28]], i32 0, i32 0
 // CHECK7-NEXT:    [[TMP85:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 0
-// CHECK7-NEXT:    store i32 2, ptr [[TMP85]], align 4
+// CHECK7-NEXT:    store i32 3, ptr [[TMP85]], align 4
 // CHECK7-NEXT:    [[TMP86:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 1
 // CHECK7-NEXT:    store i32 1, ptr [[TMP86]], align 4
 // CHECK7-NEXT:    [[TMP87:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 2
@@ -4369,7 +4369,7 @@ int main (int argc, char **argv) {
 // CHECK13-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP21]], 1
 // CHECK13-NEXT:    [[TMP22:%.*]] = zext i32 [[ADD]] to i64
 // CHECK13-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK13-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK13-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK13-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK13-NEXT:    store i32 3, ptr [[TMP24]], align 4
 // CHECK13-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -4440,7 +4440,7 @@ int main (int argc, char **argv) {
 // CHECK13-NEXT:    [[ADD14:%.*]] = add nsw i32 [[TMP56]], 1
 // CHECK13-NEXT:    [[TMP57:%.*]] = zext i32 [[ADD14]] to i64
 // CHECK13-NEXT:    [[TMP58:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK13-NEXT:    store i32 2, ptr [[TMP58]], align 4
+// CHECK13-NEXT:    store i32 3, ptr [[TMP58]], align 4
 // CHECK13-NEXT:    [[TMP59:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK13-NEXT:    store i32 3, ptr [[TMP59]], align 4
 // CHECK13-NEXT:    [[TMP60:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -4520,7 +4520,7 @@ int main (int argc, char **argv) {
 // CHECK13-NEXT:    [[ADD29:%.*]] = add nsw i32 [[TMP96]], 1
 // CHECK13-NEXT:    [[TMP97:%.*]] = zext i32 [[ADD29]] to i64
 // CHECK13-NEXT:    [[TMP98:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 0
-// CHECK13-NEXT:    store i32 2, ptr [[TMP98]], align 4
+// CHECK13-NEXT:    store i32 3, ptr [[TMP98]], align 4
 // CHECK13-NEXT:    [[TMP99:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 1
 // CHECK13-NEXT:    store i32 4, ptr [[TMP99]], align 4
 // CHECK13-NEXT:    [[TMP100:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 2
@@ -4591,7 +4591,7 @@ int main (int argc, char **argv) {
 // CHECK13-NEXT:    [[ADD44:%.*]] = add nsw i32 [[TMP131]], 1
 // CHECK13-NEXT:    [[TMP132:%.*]] = zext i32 [[ADD44]] to i64
 // CHECK13-NEXT:    [[TMP133:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS45]], i32 0, i32 0
-// CHECK13-NEXT:    store i32 2, ptr [[TMP133]], align 4
+// CHECK13-NEXT:    store i32 3, ptr [[TMP133]], align 4
 // CHECK13-NEXT:    [[TMP134:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS45]], i32 0, i32 1
 // CHECK13-NEXT:    store i32 3, ptr [[TMP134]], align 4
 // CHECK13-NEXT:    [[TMP135:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS45]], i32 0, i32 2
@@ -4671,7 +4671,7 @@ int main (int argc, char **argv) {
 // CHECK13-NEXT:    [[ADD60:%.*]] = add nsw i32 [[TMP171]], 1
 // CHECK13-NEXT:    [[TMP172:%.*]] = zext i32 [[ADD60]] to i64
 // CHECK13-NEXT:    [[TMP173:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS61]], i32 0, i32 0
-// CHECK13-NEXT:    store i32 2, ptr [[TMP173]], align 4
+// CHECK13-NEXT:    store i32 3, ptr [[TMP173]], align 4
 // CHECK13-NEXT:    [[TMP174:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS61]], i32 0, i32 1
 // CHECK13-NEXT:    store i32 4, ptr [[TMP174]], align 4
 // CHECK13-NEXT:    [[TMP175:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS61]], i32 0, i32 2
@@ -5863,7 +5863,7 @@ int main (int argc, char **argv) {
 // CHECK13-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK13-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK13-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK13-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK13-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK13-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK13-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK13-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -5904,7 +5904,7 @@ int main (int argc, char **argv) {
 // CHECK13-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0
 // CHECK13-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS2]], i32 0, i32 0
 // CHECK13-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0
-// CHECK13-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK13-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK13-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1
 // CHECK13-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK13-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2
@@ -5954,7 +5954,7 @@ int main (int argc, char **argv) {
 // CHECK13-NEXT:    [[TMP48:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS8]], i32 0, i32 0
 // CHECK13-NEXT:    [[TMP49:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS9]], i32 0, i32 0
 // CHECK13-NEXT:    [[TMP50:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 0
-// CHECK13-NEXT:    store i32 2, ptr [[TMP50]], align 4
+// CHECK13-NEXT:    store i32 3, ptr [[TMP50]], align 4
 // CHECK13-NEXT:    [[TMP51:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 1
 // CHECK13-NEXT:    store i32 2, ptr [[TMP51]], align 4
 // CHECK13-NEXT:    [[TMP52:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 2
@@ -5995,7 +5995,7 @@ int main (int argc, char **argv) {
 // CHECK13-NEXT:    [[TMP68:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS15]], i32 0, i32 0
 // CHECK13-NEXT:    [[TMP69:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS16]], i32 0, i32 0
 // CHECK13-NEXT:    [[TMP70:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS19]], i32 0, i32 0
-// CHECK13-NEXT:    store i32 2, ptr [[TMP70]], align 4
+// CHECK13-NEXT:    store i32 3, ptr [[TMP70]], align 4
 // CHECK13-NEXT:    [[TMP71:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS19]], i32 0, i32 1
 // CHECK13-NEXT:    store i32 1, ptr [[TMP71]], align 4
 // CHECK13-NEXT:    [[TMP72:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS19]], i32 0, i32 2
@@ -6045,7 +6045,7 @@ int main (int argc, char **argv) {
 // CHECK13-NEXT:    [[TMP93:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS23]], i32 0, i32 0
 // CHECK13-NEXT:    [[TMP94:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS24]], i32 0, i32 0
 // CHECK13-NEXT:    [[TMP95:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS27]], i32 0, i32 0
-// CHECK13-NEXT:    store i32 2, ptr [[TMP95]], align 4
+// CHECK13-NEXT:    store i32 3, ptr [[TMP95]], align 4
 // CHECK13-NEXT:    [[TMP96:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS27]], i32 0, i32 1
 // CHECK13-NEXT:    store i32 2, ptr [[TMP96]], align 4
 // CHECK13-NEXT:    [[TMP97:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS27]], i32 0, i32 2
@@ -6990,7 +6990,7 @@ int main (int argc, char **argv) {
 // CHECK15-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP21]], 1
 // CHECK15-NEXT:    [[TMP22:%.*]] = zext i32 [[ADD]] to i64
 // CHECK15-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK15-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK15-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK15-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK15-NEXT:    store i32 3, ptr [[TMP24]], align 4
 // CHECK15-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -7062,7 +7062,7 @@ int main (int argc, char **argv) {
 // CHECK15-NEXT:    [[ADD14:%.*]] = add nsw i32 [[TMP57]], 1
 // CHECK15-NEXT:    [[TMP58:%.*]] = zext i32 [[ADD14]] to i64
 // CHECK15-NEXT:    [[TMP59:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK15-NEXT:    store i32 2, ptr [[TMP59]], align 4
+// CHECK15-NEXT:    store i32 3, ptr [[TMP59]], align 4
 // CHECK15-NEXT:    [[TMP60:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK15-NEXT:    store i32 3, ptr [[TMP60]], align 4
 // CHECK15-NEXT:    [[TMP61:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -7143,7 +7143,7 @@ int main (int argc, char **argv) {
 // CHECK15-NEXT:    [[ADD29:%.*]] = add nsw i32 [[TMP98]], 1
 // CHECK15-NEXT:    [[TMP99:%.*]] = zext i32 [[ADD29]] to i64
 // CHECK15-NEXT:    [[TMP100:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 0
-// CHECK15-NEXT:    store i32 2, ptr [[TMP100]], align 4
+// CHECK15-NEXT:    store i32 3, ptr [[TMP100]], align 4
 // CHECK15-NEXT:    [[TMP101:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 1
 // CHECK15-NEXT:    store i32 4, ptr [[TMP101]], align 4
 // CHECK15-NEXT:    [[TMP102:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 2
@@ -7215,7 +7215,7 @@ int main (int argc, char **argv) {
 // CHECK15-NEXT:    [[ADD44:%.*]] = add nsw i32 [[TMP134]], 1
 // CHECK15-NEXT:    [[TMP135:%.*]] = zext i32 [[ADD44]] to i64
 // CHECK15-NEXT:    [[TMP136:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS45]], i32 0, i32 0
-// CHECK15-NEXT:    store i32 2, ptr [[TMP136]], align 4
+// CHECK15-NEXT:    store i32 3, ptr [[TMP136]], align 4
 // CHECK15-NEXT:    [[TMP137:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS45]], i32 0, i32 1
 // CHECK15-NEXT:    store i32 3, ptr [[TMP137]], align 4
 // CHECK15-NEXT:    [[TMP138:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS45]], i32 0, i32 2
@@ -7296,7 +7296,7 @@ int main (int argc, char **argv) {
 // CHECK15-NEXT:    [[ADD60:%.*]] = add nsw i32 [[TMP175]], 1
 // CHECK15-NEXT:    [[TMP176:%.*]] = zext i32 [[ADD60]] to i64
 // CHECK15-NEXT:    [[TMP177:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS61]], i32 0, i32 0
-// CHECK15-NEXT:    store i32 2, ptr [[TMP177]], align 4
+// CHECK15-NEXT:    store i32 3, ptr [[TMP177]], align 4
 // CHECK15-NEXT:    [[TMP178:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS61]], i32 0, i32 1
 // CHECK15-NEXT:    store i32 4, ptr [[TMP178]], align 4
 // CHECK15-NEXT:    [[TMP179:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS61]], i32 0, i32 2
@@ -8463,7 +8463,7 @@ int main (int argc, char **argv) {
 // CHECK15-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK15-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK15-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK15-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK15-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK15-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK15-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK15-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -8504,7 +8504,7 @@ int main (int argc, char **argv) {
 // CHECK15-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0
 // CHECK15-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS2]], i32 0, i32 0
 // CHECK15-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0
-// CHECK15-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK15-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK15-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1
 // CHECK15-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK15-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2
@@ -8554,7 +8554,7 @@ int main (int argc, char **argv) {
 // CHECK15-NEXT:    [[TMP48:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS8]], i32 0, i32 0
 // CHECK15-NEXT:    [[TMP49:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS9]], i32 0, i32 0
 // CHECK15-NEXT:    [[TMP50:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 0
-// CHECK15-NEXT:    store i32 2, ptr [[TMP50]], align 4
+// CHECK15-NEXT:    store i32 3, ptr [[TMP50]], align 4
 // CHECK15-NEXT:    [[TMP51:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 1
 // CHECK15-NEXT:    store i32 2, ptr [[TMP51]], align 4
 // CHECK15-NEXT:    [[TMP52:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 2
@@ -8595,7 +8595,7 @@ int main (int argc, char **argv) {
 // CHECK15-NEXT:    [[TMP68:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS15]], i32 0, i32 0
 // CHECK15-NEXT:    [[TMP69:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS16]], i32 0, i32 0
 // CHECK15-NEXT:    [[TMP70:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS19]], i32 0, i32 0
-// CHECK15-NEXT:    store i32 2, ptr [[TMP70]], align 4
+// CHECK15-NEXT:    store i32 3, ptr [[TMP70]], align 4
 // CHECK15-NEXT:    [[TMP71:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS19]], i32 0, i32 1
 // CHECK15-NEXT:    store i32 1, ptr [[TMP71]], align 4
 // CHECK15-NEXT:    [[TMP72:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS19]], i32 0, i32 2
@@ -8645,7 +8645,7 @@ int main (int argc, char **argv) {
 // CHECK15-NEXT:    [[TMP93:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS23]], i32 0, i32 0
 // CHECK15-NEXT:    [[TMP94:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS24]], i32 0, i32 0
 // CHECK15-NEXT:    [[TMP95:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS27]], i32 0, i32 0
-// CHECK15-NEXT:    store i32 2, ptr [[TMP95]], align 4
+// CHECK15-NEXT:    store i32 3, ptr [[TMP95]], align 4
 // CHECK15-NEXT:    [[TMP96:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS27]], i32 0, i32 1
 // CHECK15-NEXT:    store i32 2, ptr [[TMP96]], align 4
 // CHECK15-NEXT:    [[TMP97:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS27]], i32 0, i32 2
@@ -9563,7 +9563,7 @@ int main (int argc, char **argv) {
 // CHECK17-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP21]], 1
 // CHECK17-NEXT:    [[TMP22:%.*]] = zext i32 [[ADD]] to i64
 // CHECK17-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK17-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK17-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK17-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK17-NEXT:    store i32 3, ptr [[TMP24]], align 4
 // CHECK17-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -9634,7 +9634,7 @@ int main (int argc, char **argv) {
 // CHECK17-NEXT:    [[ADD14:%.*]] = add nsw i32 [[TMP56]], 1
 // CHECK17-NEXT:    [[TMP57:%.*]] = zext i32 [[ADD14]] to i64
 // CHECK17-NEXT:    [[TMP58:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK17-NEXT:    store i32 2, ptr [[TMP58]], align 4
+// CHECK17-NEXT:    store i32 3, ptr [[TMP58]], align 4
 // CHECK17-NEXT:    [[TMP59:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK17-NEXT:    store i32 3, ptr [[TMP59]], align 4
 // CHECK17-NEXT:    [[TMP60:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -9714,7 +9714,7 @@ int main (int argc, char **argv) {
 // CHECK17-NEXT:    [[ADD29:%.*]] = add nsw i32 [[TMP96]], 1
 // CHECK17-NEXT:    [[TMP97:%.*]] = zext i32 [[ADD29]] to i64
 // CHECK17-NEXT:    [[TMP98:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 0
-// CHECK17-NEXT:    store i32 2, ptr [[TMP98]], align 4
+// CHECK17-NEXT:    store i32 3, ptr [[TMP98]], align 4
 // CHECK17-NEXT:    [[TMP99:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 1
 // CHECK17-NEXT:    store i32 4, ptr [[TMP99]], align 4
 // CHECK17-NEXT:    [[TMP100:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 2
@@ -9785,7 +9785,7 @@ int main (int argc, char **argv) {
 // CHECK17-NEXT:    [[ADD44:%.*]] = add nsw i32 [[TMP131]], 1
 // CHECK17-NEXT:    [[TMP132:%.*]] = zext i32 [[ADD44]] to i64
 // CHECK17-NEXT:    [[TMP133:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS45]], i32 0, i32 0
-// CHECK17-NEXT:    store i32 2, ptr [[TMP133]], align 4
+// CHECK17-NEXT:    store i32 3, ptr [[TMP133]], align 4
 // CHECK17-NEXT:    [[TMP134:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS45]], i32 0, i32 1
 // CHECK17-NEXT:    store i32 3, ptr [[TMP134]], align 4
 // CHECK17-NEXT:    [[TMP135:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS45]], i32 0, i32 2
@@ -9865,7 +9865,7 @@ int main (int argc, char **argv) {
 // CHECK17-NEXT:    [[ADD60:%.*]] = add nsw i32 [[TMP171]], 1
 // CHECK17-NEXT:    [[TMP172:%.*]] = zext i32 [[ADD60]] to i64
 // CHECK17-NEXT:    [[TMP173:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS61]], i32 0, i32 0
-// CHECK17-NEXT:    store i32 2, ptr [[TMP173]], align 4
+// CHECK17-NEXT:    store i32 3, ptr [[TMP173]], align 4
 // CHECK17-NEXT:    [[TMP174:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS61]], i32 0, i32 1
 // CHECK17-NEXT:    store i32 4, ptr [[TMP174]], align 4
 // CHECK17-NEXT:    [[TMP175:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS61]], i32 0, i32 2
@@ -11057,7 +11057,7 @@ int main (int argc, char **argv) {
 // CHECK17-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK17-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK17-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK17-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK17-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK17-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -11098,7 +11098,7 @@ int main (int argc, char **argv) {
 // CHECK17-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS2]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0
-// CHECK17-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK17-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK17-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1
 // CHECK17-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK17-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2
@@ -11148,7 +11148,7 @@ int main (int argc, char **argv) {
 // CHECK17-NEXT:    [[TMP48:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS8]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP49:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS9]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP50:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 0
-// CHECK17-NEXT:    store i32 2, ptr [[TMP50]], align 4
+// CHECK17-NEXT:    store i32 3, ptr [[TMP50]], align 4
 // CHECK17-NEXT:    [[TMP51:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 1
 // CHECK17-NEXT:    store i32 2, ptr [[TMP51]], align 4
 // CHECK17-NEXT:    [[TMP52:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 2
@@ -11189,7 +11189,7 @@ int main (int argc, char **argv) {
 // CHECK17-NEXT:    [[TMP68:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS15]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP69:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS16]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP70:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS19]], i32 0, i32 0
-// CHECK17-NEXT:    store i32 2, ptr [[TMP70]], align 4
+// CHECK17-NEXT:    store i32 3, ptr [[TMP70]], align 4
 // CHECK17-NEXT:    [[TMP71:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS19]], i32 0, i32 1
 // CHECK17-NEXT:    store i32 1, ptr [[TMP71]], align 4
 // CHECK17-NEXT:    [[TMP72:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS19]], i32 0, i32 2
@@ -11239,7 +11239,7 @@ int main (int argc, char **argv) {
 // CHECK17-NEXT:    [[TMP93:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS23]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP94:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS24]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP95:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS27]], i32 0, i32 0
-// CHECK17-NEXT:    store i32 2, ptr [[TMP95]], align 4
+// CHECK17-NEXT:    store i32 3, ptr [[TMP95]], align 4
 // CHECK17-NEXT:    [[TMP96:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS27]], i32 0, i32 1
 // CHECK17-NEXT:    store i32 2, ptr [[TMP96]], align 4
 // CHECK17-NEXT:    [[TMP97:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS27]], i32 0, i32 2
@@ -12184,7 +12184,7 @@ int main (int argc, char **argv) {
 // CHECK19-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP21]], 1
 // CHECK19-NEXT:    [[TMP22:%.*]] = zext i32 [[ADD]] to i64
 // CHECK19-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK19-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK19-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK19-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK19-NEXT:    store i32 3, ptr [[TMP24]], align 4
 // CHECK19-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -12256,7 +12256,7 @@ int main (int argc, char **argv) {
 // CHECK19-NEXT:    [[ADD14:%.*]] = add nsw i32 [[TMP57]], 1
 // CHECK19-NEXT:    [[TMP58:%.*]] = zext i32 [[ADD14]] to i64
 // CHECK19-NEXT:    [[TMP59:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK19-NEXT:    store i32 2, ptr [[TMP59]], align 4
+// CHECK19-NEXT:    store i32 3, ptr [[TMP59]], align 4
 // CHECK19-NEXT:    [[TMP60:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK19-NEXT:    store i32 3, ptr [[TMP60]], align 4
 // CHECK19-NEXT:    [[TMP61:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -12337,7 +12337,7 @@ int main (int argc, char **argv) {
 // CHECK19-NEXT:    [[ADD29:%.*]] = add nsw i32 [[TMP98]], 1
 // CHECK19-NEXT:    [[TMP99:%.*]] = zext i32 [[ADD29]] to i64
 // CHECK19-NEXT:    [[TMP100:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 0
-// CHECK19-NEXT:    store i32 2, ptr [[TMP100]], align 4
+// CHECK19-NEXT:    store i32 3, ptr [[TMP100]], align 4
 // CHECK19-NEXT:    [[TMP101:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 1
 // CHECK19-NEXT:    store i32 4, ptr [[TMP101]], align 4
 // CHECK19-NEXT:    [[TMP102:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 2
@@ -12409,7 +12409,7 @@ int main (int argc, char **argv) {
 // CHECK19-NEXT:    [[ADD44:%.*]] = add nsw i32 [[TMP134]], 1
 // CHECK19-NEXT:    [[TMP135:%.*]] = zext i32 [[ADD44]] to i64
 // CHECK19-NEXT:    [[TMP136:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS45]], i32 0, i32 0
-// CHECK19-NEXT:    store i32 2, ptr [[TMP136]], align 4
+// CHECK19-NEXT:    store i32 3, ptr [[TMP136]], align 4
 // CHECK19-NEXT:    [[TMP137:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS45]], i32 0, i32 1
 // CHECK19-NEXT:    store i32 3, ptr [[TMP137]], align 4
 // CHECK19-NEXT:    [[TMP138:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS45]], i32 0, i32 2
@@ -12490,7 +12490,7 @@ int main (int argc, char **argv) {
 // CHECK19-NEXT:    [[ADD60:%.*]] = add nsw i32 [[TMP175]], 1
 // CHECK19-NEXT:    [[TMP176:%.*]] = zext i32 [[ADD60]] to i64
 // CHECK19-NEXT:    [[TMP177:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS61]], i32 0, i32 0
-// CHECK19-NEXT:    store i32 2, ptr [[TMP177]], align 4
+// CHECK19-NEXT:    store i32 3, ptr [[TMP177]], align 4
 // CHECK19-NEXT:    [[TMP178:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS61]], i32 0, i32 1
 // CHECK19-NEXT:    store i32 4, ptr [[TMP178]], align 4
 // CHECK19-NEXT:    [[TMP179:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS61]], i32 0, i32 2
@@ -13657,7 +13657,7 @@ int main (int argc, char **argv) {
 // CHECK19-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK19-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK19-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK19-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK19-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK19-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -13698,7 +13698,7 @@ int main (int argc, char **argv) {
 // CHECK19-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS2]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0
-// CHECK19-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK19-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK19-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1
 // CHECK19-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK19-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2
@@ -13748,7 +13748,7 @@ int main (int argc, char **argv) {
 // CHECK19-NEXT:    [[TMP48:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS8]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP49:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS9]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP50:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 0
-// CHECK19-NEXT:    store i32 2, ptr [[TMP50]], align 4
+// CHECK19-NEXT:    store i32 3, ptr [[TMP50]], align 4
 // CHECK19-NEXT:    [[TMP51:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 1
 // CHECK19-NEXT:    store i32 2, ptr [[TMP51]], align 4
 // CHECK19-NEXT:    [[TMP52:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 2
@@ -13789,7 +13789,7 @@ int main (int argc, char **argv) {
 // CHECK19-NEXT:    [[TMP68:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS15]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP69:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS16]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP70:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS19]], i32 0, i32 0
-// CHECK19-NEXT:    store i32 2, ptr [[TMP70]], align 4
+// CHECK19-NEXT:    store i32 3, ptr [[TMP70]], align 4
 // CHECK19-NEXT:    [[TMP71:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS19]], i32 0, i32 1
 // CHECK19-NEXT:    store i32 1, ptr [[TMP71]], align 4
 // CHECK19-NEXT:    [[TMP72:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS19]], i32 0, i32 2
@@ -13839,7 +13839,7 @@ int main (int argc, char **argv) {
 // CHECK19-NEXT:    [[TMP93:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS23]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP94:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS24]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP95:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS27]], i32 0, i32 0
-// CHECK19-NEXT:    store i32 2, ptr [[TMP95]], align 4
+// CHECK19-NEXT:    store i32 3, ptr [[TMP95]], align 4
 // CHECK19-NEXT:    [[TMP96:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS27]], i32 0, i32 1
 // CHECK19-NEXT:    store i32 2, ptr [[TMP96]], align 4
 // CHECK19-NEXT:    [[TMP97:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS27]], i32 0, i32 2
diff --git a/clang/test/OpenMP/teams_distribute_parallel_for_simd_codegen.cpp b/clang/test/OpenMP/teams_distribute_parallel_for_simd_codegen.cpp
index 854afe3b9bbc82..0a492152e9e01a 100644
--- a/clang/test/OpenMP/teams_distribute_parallel_for_simd_codegen.cpp
+++ b/clang/test/OpenMP/teams_distribute_parallel_for_simd_codegen.cpp
@@ -262,7 +262,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP27:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP21]], 0
 // CHECK1-NEXT:    [[TMP28:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP22]], 0
 // CHECK1-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP29]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP29]], align 4
 // CHECK1-NEXT:    [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 4, ptr [[TMP30]], align 4
 // CHECK1-NEXT:    [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -331,7 +331,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[ADD14:%.*]] = add nsw i32 [[TMP61]], 1
 // CHECK1-NEXT:    [[TMP62:%.*]] = zext i32 [[ADD14]] to i64
 // CHECK1-NEXT:    [[TMP63:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP63]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP63]], align 4
 // CHECK1-NEXT:    [[TMP64:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 3, ptr [[TMP64]], align 4
 // CHECK1-NEXT:    [[TMP65:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -933,7 +933,7 @@ int main (int argc, char **argv) {
 // CHECK3-NEXT:    [[TMP27:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP21]], 0
 // CHECK3-NEXT:    [[TMP28:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP22]], 0
 // CHECK3-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP29]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP29]], align 4
 // CHECK3-NEXT:    [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 4, ptr [[TMP30]], align 4
 // CHECK3-NEXT:    [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1002,7 +1002,7 @@ int main (int argc, char **argv) {
 // CHECK3-NEXT:    [[ADD14:%.*]] = add nsw i32 [[TMP61]], 1
 // CHECK3-NEXT:    [[TMP62:%.*]] = zext i32 [[ADD14]] to i64
 // CHECK3-NEXT:    [[TMP63:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP63]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP63]], align 4
 // CHECK3-NEXT:    [[TMP64:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 3, ptr [[TMP64]], align 4
 // CHECK3-NEXT:    [[TMP65:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -1853,7 +1853,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP26]], 1
 // CHECK9-NEXT:    [[TMP27:%.*]] = zext i32 [[ADD]] to i64
 // CHECK9-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP28]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP28]], align 4
 // CHECK9-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 4, ptr [[TMP29]], align 4
 // CHECK9-NEXT:    [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2222,7 +2222,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP26]], 1
 // CHECK11-NEXT:    [[TMP27:%.*]] = zext i32 [[ADD]] to i64
 // CHECK11-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP28]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP28]], align 4
 // CHECK11-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 4, ptr [[TMP29]], align 4
 // CHECK11-NEXT:    [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2718,7 +2718,7 @@ int main (int argc, char **argv) {
 // CHECK17-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK17-NEXT:    store i32 2, ptr [[TMP10]], align 4
+// CHECK17-NEXT:    store i32 3, ptr [[TMP10]], align 4
 // CHECK17-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK17-NEXT:    store i32 2, ptr [[TMP11]], align 4
 // CHECK17-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2986,7 +2986,7 @@ int main (int argc, char **argv) {
 // CHECK19-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK19-NEXT:    store i32 2, ptr [[TMP10]], align 4
+// CHECK19-NEXT:    store i32 3, ptr [[TMP10]], align 4
 // CHECK19-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK19-NEXT:    store i32 2, ptr [[TMP11]], align 4
 // CHECK19-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -3412,7 +3412,7 @@ int main (int argc, char **argv) {
 // CHECK25-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP26]], 1
 // CHECK25-NEXT:    [[TMP27:%.*]] = zext i32 [[ADD]] to i64
 // CHECK25-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK25-NEXT:    store i32 2, ptr [[TMP28]], align 4
+// CHECK25-NEXT:    store i32 3, ptr [[TMP28]], align 4
 // CHECK25-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK25-NEXT:    store i32 4, ptr [[TMP29]], align 4
 // CHECK25-NEXT:    [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -3759,7 +3759,7 @@ int main (int argc, char **argv) {
 // CHECK25-NEXT:    [[TMP17:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP15]], 0
 // CHECK25-NEXT:    [[TMP18:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP16]], 0
 // CHECK25-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK25-NEXT:    store i32 2, ptr [[TMP19]], align 4
+// CHECK25-NEXT:    store i32 3, ptr [[TMP19]], align 4
 // CHECK25-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK25-NEXT:    store i32 3, ptr [[TMP20]], align 4
 // CHECK25-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -4044,7 +4044,7 @@ int main (int argc, char **argv) {
 // CHECK27-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP26]], 1
 // CHECK27-NEXT:    [[TMP27:%.*]] = zext i32 [[ADD]] to i64
 // CHECK27-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK27-NEXT:    store i32 2, ptr [[TMP28]], align 4
+// CHECK27-NEXT:    store i32 3, ptr [[TMP28]], align 4
 // CHECK27-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK27-NEXT:    store i32 4, ptr [[TMP29]], align 4
 // CHECK27-NEXT:    [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -4386,7 +4386,7 @@ int main (int argc, char **argv) {
 // CHECK27-NEXT:    [[TMP17:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP15]], 0
 // CHECK27-NEXT:    [[TMP18:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP16]], 0
 // CHECK27-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK27-NEXT:    store i32 2, ptr [[TMP19]], align 4
+// CHECK27-NEXT:    store i32 3, ptr [[TMP19]], align 4
 // CHECK27-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK27-NEXT:    store i32 3, ptr [[TMP20]], align 4
 // CHECK27-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/teams_distribute_parallel_for_simd_collapse_codegen.cpp b/clang/test/OpenMP/teams_distribute_parallel_for_simd_collapse_codegen.cpp
index e7b9978dd43cd8..910805c3107457 100644
--- a/clang/test/OpenMP/teams_distribute_parallel_for_simd_collapse_codegen.cpp
+++ b/clang/test/OpenMP/teams_distribute_parallel_for_simd_collapse_codegen.cpp
@@ -137,7 +137,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -389,7 +389,7 @@ int main (int argc, char **argv) {
 // CHECK3-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -845,7 +845,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[TMP35:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_3]], align 8
 // CHECK9-NEXT:    [[ADD:%.*]] = add nsw i64 [[TMP35]], 1
 // CHECK9-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP36]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP36]], align 4
 // CHECK9-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 5, ptr [[TMP37]], align 4
 // CHECK9-NEXT:    [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1231,7 +1231,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK9-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK9-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1547,7 +1547,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[TMP34:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_3]], align 8
 // CHECK11-NEXT:    [[ADD:%.*]] = add nsw i64 [[TMP34]], 1
 // CHECK11-NEXT:    [[TMP35:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP35]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP35]], align 4
 // CHECK11-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 5, ptr [[TMP36]], align 4
 // CHECK11-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1935,7 +1935,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK11-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK11-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/teams_distribute_parallel_for_simd_dist_schedule_codegen.cpp b/clang/test/OpenMP/teams_distribute_parallel_for_simd_dist_schedule_codegen.cpp
index d3777e81047ca0..aa6d6c67c20f1c 100644
--- a/clang/test/OpenMP/teams_distribute_parallel_for_simd_dist_schedule_codegen.cpp
+++ b/clang/test/OpenMP/teams_distribute_parallel_for_simd_dist_schedule_codegen.cpp
@@ -185,7 +185,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -227,7 +227,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS3]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS4]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK1-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK1-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 2
@@ -269,7 +269,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS11]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS12]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP45]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP45]], align 4
 // CHECK1-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP46]], align 4
 // CHECK1-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -861,7 +861,7 @@ int main (int argc, char **argv) {
 // CHECK3-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -903,7 +903,7 @@ int main (int argc, char **argv) {
 // CHECK3-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS3]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS4]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK3-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK3-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 2
@@ -945,7 +945,7 @@ int main (int argc, char **argv) {
 // CHECK3-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS11]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS12]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP45]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP45]], align 4
 // CHECK3-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP46]], align 4
 // CHECK3-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -1818,7 +1818,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP21]], 1
 // CHECK9-NEXT:    [[TMP22:%.*]] = zext i32 [[ADD]] to i64
 // CHECK9-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK9-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 3, ptr [[TMP24]], align 4
 // CHECK9-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1889,7 +1889,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[ADD14:%.*]] = add nsw i32 [[TMP56]], 1
 // CHECK9-NEXT:    [[TMP57:%.*]] = zext i32 [[ADD14]] to i64
 // CHECK9-NEXT:    [[TMP58:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP58]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP58]], align 4
 // CHECK9-NEXT:    [[TMP59:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 3, ptr [[TMP59]], align 4
 // CHECK9-NEXT:    [[TMP60:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -1969,7 +1969,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[ADD29:%.*]] = add nsw i32 [[TMP96]], 1
 // CHECK9-NEXT:    [[TMP97:%.*]] = zext i32 [[ADD29]] to i64
 // CHECK9-NEXT:    [[TMP98:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP98]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP98]], align 4
 // CHECK9-NEXT:    [[TMP99:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 4, ptr [[TMP99]], align 4
 // CHECK9-NEXT:    [[TMP100:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 2
@@ -2790,7 +2790,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK9-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK9-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2831,7 +2831,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS2]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK9-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK9-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2
@@ -2881,7 +2881,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[TMP48:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS8]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP49:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS9]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP50:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP50]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP50]], align 4
 // CHECK9-NEXT:    [[TMP51:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 2, ptr [[TMP51]], align 4
 // CHECK9-NEXT:    [[TMP52:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 2
@@ -3534,7 +3534,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP21]], 1
 // CHECK11-NEXT:    [[TMP22:%.*]] = zext i32 [[ADD]] to i64
 // CHECK11-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK11-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 3, ptr [[TMP24]], align 4
 // CHECK11-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -3606,7 +3606,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[ADD14:%.*]] = add nsw i32 [[TMP57]], 1
 // CHECK11-NEXT:    [[TMP58:%.*]] = zext i32 [[ADD14]] to i64
 // CHECK11-NEXT:    [[TMP59:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP59]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP59]], align 4
 // CHECK11-NEXT:    [[TMP60:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 3, ptr [[TMP60]], align 4
 // CHECK11-NEXT:    [[TMP61:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -3687,7 +3687,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[ADD29:%.*]] = add nsw i32 [[TMP98]], 1
 // CHECK11-NEXT:    [[TMP99:%.*]] = zext i32 [[ADD29]] to i64
 // CHECK11-NEXT:    [[TMP100:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP100]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP100]], align 4
 // CHECK11-NEXT:    [[TMP101:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 4, ptr [[TMP101]], align 4
 // CHECK11-NEXT:    [[TMP102:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 2
@@ -4493,7 +4493,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK11-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK11-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -4534,7 +4534,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS2]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK11-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK11-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2
@@ -4584,7 +4584,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[TMP48:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS8]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP49:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS9]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP50:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP50]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP50]], align 4
 // CHECK11-NEXT:    [[TMP51:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 2, ptr [[TMP51]], align 4
 // CHECK11-NEXT:    [[TMP52:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 2
diff --git a/clang/test/OpenMP/teams_distribute_parallel_for_simd_firstprivate_codegen.cpp b/clang/test/OpenMP/teams_distribute_parallel_for_simd_firstprivate_codegen.cpp
index 4f87d40e58f683..7d9e2ab89676f6 100644
--- a/clang/test/OpenMP/teams_distribute_parallel_for_simd_firstprivate_codegen.cpp
+++ b/clang/test/OpenMP/teams_distribute_parallel_for_simd_firstprivate_codegen.cpp
@@ -343,7 +343,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP21]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP21]], align 4
 // CHECK1-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 5, ptr [[TMP22]], align 4
 // CHECK1-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -770,7 +770,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP20]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP20]], align 4
 // CHECK1-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 4, ptr [[TMP21]], align 4
 // CHECK1-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1435,7 +1435,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP21]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP21]], align 4
 // CHECK3-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 5, ptr [[TMP22]], align 4
 // CHECK3-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1856,7 +1856,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP20]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP20]], align 4
 // CHECK3-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 4, ptr [[TMP21]], align 4
 // CHECK3-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/teams_distribute_parallel_for_simd_if_codegen.cpp b/clang/test/OpenMP/teams_distribute_parallel_for_simd_if_codegen.cpp
index 7c86b180ec674b..58c1f4155abfbb 100644
--- a/clang/test/OpenMP/teams_distribute_parallel_for_simd_if_codegen.cpp
+++ b/clang/test/OpenMP/teams_distribute_parallel_for_simd_if_codegen.cpp
@@ -118,7 +118,7 @@ int main() {
 // CHECK1-NEXT:    [[_TMP1:%.*]] = alloca i32, align 4
 // CHECK1-NEXT:    [[KERNEL_ARGS2:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -151,7 +151,7 @@ int main() {
 // CHECK1-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK1:       omp_offload.cont:
 // CHECK1-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP16]], align 4
 // CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -511,7 +511,7 @@ int main() {
 // CHECK1-NEXT:    [[KERNEL_ARGS7:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK1-NEXT:    store i32 0, ptr [[RETVAL]], align 4
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -544,7 +544,7 @@ int main() {
 // CHECK1-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK1:       omp_offload.cont:
 // CHECK1-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP16]], align 4
 // CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -596,7 +596,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP39:%.*]] = select i1 [[TOBOOL5]], i32 0, i32 1
 // CHECK1-NEXT:    [[TMP40:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP39]], 0
 // CHECK1-NEXT:    [[TMP41:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP41]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP41]], align 4
 // CHECK1-NEXT:    [[TMP42:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP42]], align 4
 // CHECK1-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 2
@@ -1140,7 +1140,7 @@ int main() {
 // CHECK1-NEXT:    [[KERNEL_ARGS7:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK1-NEXT:    store i32 [[ARG]], ptr [[ARG_ADDR]], align 4
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1173,7 +1173,7 @@ int main() {
 // CHECK1-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK1:       omp_offload.cont:
 // CHECK1-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP16]], align 4
 // CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -1225,7 +1225,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP39:%.*]] = select i1 [[TOBOOL5]], i32 0, i32 1
 // CHECK1-NEXT:    [[TMP40:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP39]], 0
 // CHECK1-NEXT:    [[TMP41:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP41]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP41]], align 4
 // CHECK1-NEXT:    [[TMP42:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP42]], align 4
 // CHECK1-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 2
@@ -1758,7 +1758,7 @@ int main() {
 // CHECK3-NEXT:    [[_TMP1:%.*]] = alloca i32, align 4
 // CHECK3-NEXT:    [[KERNEL_ARGS2:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK3-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK3-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK3-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1791,7 +1791,7 @@ int main() {
 // CHECK3-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK3:       omp_offload.cont:
 // CHECK3-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK3-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 0, ptr [[TMP16]], align 4
 // CHECK3-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -2151,7 +2151,7 @@ int main() {
 // CHECK3-NEXT:    [[KERNEL_ARGS7:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK3-NEXT:    store i32 0, ptr [[RETVAL]], align 4
 // CHECK3-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK3-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK3-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2184,7 +2184,7 @@ int main() {
 // CHECK3-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK3:       omp_offload.cont:
 // CHECK3-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK3-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 0, ptr [[TMP16]], align 4
 // CHECK3-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -2236,7 +2236,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP39:%.*]] = select i1 [[TOBOOL5]], i32 0, i32 1
 // CHECK3-NEXT:    [[TMP40:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP39]], 0
 // CHECK3-NEXT:    [[TMP41:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP41]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP41]], align 4
 // CHECK3-NEXT:    [[TMP42:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP42]], align 4
 // CHECK3-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 2
@@ -3010,7 +3010,7 @@ int main() {
 // CHECK3-NEXT:    [[KERNEL_ARGS7:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK3-NEXT:    store i32 [[ARG]], ptr [[ARG_ADDR]], align 4
 // CHECK3-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK3-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK3-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -3043,7 +3043,7 @@ int main() {
 // CHECK3-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK3:       omp_offload.cont:
 // CHECK3-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK3-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 0, ptr [[TMP16]], align 4
 // CHECK3-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -3095,7 +3095,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP39:%.*]] = select i1 [[TOBOOL5]], i32 0, i32 1
 // CHECK3-NEXT:    [[TMP40:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP39]], 0
 // CHECK3-NEXT:    [[TMP41:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP41]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP41]], align 4
 // CHECK3-NEXT:    [[TMP42:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP42]], align 4
 // CHECK3-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 2
@@ -4219,7 +4219,7 @@ int main() {
 // CHECK9-NEXT:    [[_TMP1:%.*]] = alloca i32, align 4
 // CHECK9-NEXT:    [[KERNEL_ARGS2:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK9-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK9-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK9-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -4252,7 +4252,7 @@ int main() {
 // CHECK9-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK9:       omp_offload.cont:
 // CHECK9-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK9-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 0, ptr [[TMP16]], align 4
 // CHECK9-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -4612,7 +4612,7 @@ int main() {
 // CHECK9-NEXT:    [[KERNEL_ARGS7:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK9-NEXT:    store i32 0, ptr [[RETVAL]], align 4
 // CHECK9-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK9-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK9-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -4645,7 +4645,7 @@ int main() {
 // CHECK9-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK9:       omp_offload.cont:
 // CHECK9-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK9-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 0, ptr [[TMP16]], align 4
 // CHECK9-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -4697,7 +4697,7 @@ int main() {
 // CHECK9-NEXT:    [[TMP39:%.*]] = select i1 [[TOBOOL5]], i32 0, i32 1
 // CHECK9-NEXT:    [[TMP40:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP39]], 0
 // CHECK9-NEXT:    [[TMP41:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP41]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP41]], align 4
 // CHECK9-NEXT:    [[TMP42:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 1, ptr [[TMP42]], align 4
 // CHECK9-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 2
@@ -5241,7 +5241,7 @@ int main() {
 // CHECK9-NEXT:    [[KERNEL_ARGS7:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK9-NEXT:    store i32 [[ARG]], ptr [[ARG_ADDR]], align 4
 // CHECK9-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK9-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK9-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -5274,7 +5274,7 @@ int main() {
 // CHECK9-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK9:       omp_offload.cont:
 // CHECK9-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK9-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 0, ptr [[TMP16]], align 4
 // CHECK9-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -5326,7 +5326,7 @@ int main() {
 // CHECK9-NEXT:    [[TMP39:%.*]] = select i1 [[TOBOOL5]], i32 0, i32 1
 // CHECK9-NEXT:    [[TMP40:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP39]], 0
 // CHECK9-NEXT:    [[TMP41:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP41]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP41]], align 4
 // CHECK9-NEXT:    [[TMP42:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 1, ptr [[TMP42]], align 4
 // CHECK9-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 2
@@ -5859,7 +5859,7 @@ int main() {
 // CHECK11-NEXT:    [[_TMP1:%.*]] = alloca i32, align 4
 // CHECK11-NEXT:    [[KERNEL_ARGS2:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK11-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK11-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK11-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -5892,7 +5892,7 @@ int main() {
 // CHECK11-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK11:       omp_offload.cont:
 // CHECK11-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK11-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 0, ptr [[TMP16]], align 4
 // CHECK11-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -6252,7 +6252,7 @@ int main() {
 // CHECK11-NEXT:    [[KERNEL_ARGS7:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK11-NEXT:    store i32 0, ptr [[RETVAL]], align 4
 // CHECK11-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK11-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK11-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -6285,7 +6285,7 @@ int main() {
 // CHECK11-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK11:       omp_offload.cont:
 // CHECK11-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK11-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 0, ptr [[TMP16]], align 4
 // CHECK11-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -6337,7 +6337,7 @@ int main() {
 // CHECK11-NEXT:    [[TMP39:%.*]] = select i1 [[TOBOOL5]], i32 0, i32 1
 // CHECK11-NEXT:    [[TMP40:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP39]], 0
 // CHECK11-NEXT:    [[TMP41:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP41]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP41]], align 4
 // CHECK11-NEXT:    [[TMP42:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 1, ptr [[TMP42]], align 4
 // CHECK11-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 2
@@ -7111,7 +7111,7 @@ int main() {
 // CHECK11-NEXT:    [[KERNEL_ARGS7:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK11-NEXT:    store i32 [[ARG]], ptr [[ARG_ADDR]], align 4
 // CHECK11-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK11-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK11-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -7144,7 +7144,7 @@ int main() {
 // CHECK11-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK11:       omp_offload.cont:
 // CHECK11-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK11-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 0, ptr [[TMP16]], align 4
 // CHECK11-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -7196,7 +7196,7 @@ int main() {
 // CHECK11-NEXT:    [[TMP39:%.*]] = select i1 [[TOBOOL5]], i32 0, i32 1
 // CHECK11-NEXT:    [[TMP40:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP39]], 0
 // CHECK11-NEXT:    [[TMP41:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP41]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP41]], align 4
 // CHECK11-NEXT:    [[TMP42:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 1, ptr [[TMP42]], align 4
 // CHECK11-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 2
diff --git a/clang/test/OpenMP/teams_distribute_parallel_for_simd_lastprivate_codegen.cpp b/clang/test/OpenMP/teams_distribute_parallel_for_simd_lastprivate_codegen.cpp
index 268298c14801cd..8643cb9bb84a8b 100644
--- a/clang/test/OpenMP/teams_distribute_parallel_for_simd_lastprivate_codegen.cpp
+++ b/clang/test/OpenMP/teams_distribute_parallel_for_simd_lastprivate_codegen.cpp
@@ -839,7 +839,7 @@ int main() {
 // CHECK9-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK9-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 5, ptr [[TMP26]], align 4
 // CHECK9-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1312,7 +1312,7 @@ int main() {
 // CHECK9-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP20]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP20]], align 4
 // CHECK9-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 4, ptr [[TMP21]], align 4
 // CHECK9-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1853,7 +1853,7 @@ int main() {
 // CHECK11-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK11-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 5, ptr [[TMP26]], align 4
 // CHECK11-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2320,7 +2320,7 @@ int main() {
 // CHECK11-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP20]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP20]], align 4
 // CHECK11-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 4, ptr [[TMP21]], align 4
 // CHECK11-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/teams_distribute_parallel_for_simd_num_threads_codegen.cpp b/clang/test/OpenMP/teams_distribute_parallel_for_simd_num_threads_codegen.cpp
index fa8bcf65b8fb63..47d067256fb3a9 100644
--- a/clang/test/OpenMP/teams_distribute_parallel_for_simd_num_threads_codegen.cpp
+++ b/clang/test/OpenMP/teams_distribute_parallel_for_simd_num_threads_codegen.cpp
@@ -99,7 +99,7 @@ int main() {
 // CHECK1:       invoke.cont:
 // CHECK1-NEXT:    store i8 [[CALL]], ptr [[A]], align 1
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -157,7 +157,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP27:%.*]] = zext i8 [[TMP26]] to i32
 // CHECK1-NEXT:    [[TMP28:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP27]], 0
 // CHECK1-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP29]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP29]], align 4
 // CHECK1-NEXT:    [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP30]], align 4
 // CHECK1-NEXT:    [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -586,7 +586,7 @@ int main() {
 // CHECK1-NEXT:    [[_TMP1:%.*]] = alloca i32, align 4
 // CHECK1-NEXT:    [[KERNEL_ARGS2:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -619,7 +619,7 @@ int main() {
 // CHECK1-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK1:       omp_offload.cont:
 // CHECK1-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP16]], align 4
 // CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -664,7 +664,7 @@ int main() {
 // CHECK1-NEXT:    [[_TMP1:%.*]] = alloca i32, align 4
 // CHECK1-NEXT:    [[KERNEL_ARGS2:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -706,7 +706,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP16:%.*]] = zext i8 [[TMP15]] to i32
 // CHECK1-NEXT:    [[TMP17:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP16]], 0
 // CHECK1-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP18]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP18]], align 4
 // CHECK1-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP19]], align 4
 // CHECK1-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -1819,7 +1819,7 @@ int main() {
 // CHECK5:       invoke.cont:
 // CHECK5-NEXT:    store i8 [[CALL]], ptr [[A]], align 1
 // CHECK5-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK5-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK5-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1877,7 +1877,7 @@ int main() {
 // CHECK5-NEXT:    [[TMP27:%.*]] = zext i8 [[TMP26]] to i32
 // CHECK5-NEXT:    [[TMP28:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP27]], 0
 // CHECK5-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP29]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP29]], align 4
 // CHECK5-NEXT:    [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 1, ptr [[TMP30]], align 4
 // CHECK5-NEXT:    [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -2306,7 +2306,7 @@ int main() {
 // CHECK5-NEXT:    [[_TMP1:%.*]] = alloca i32, align 4
 // CHECK5-NEXT:    [[KERNEL_ARGS2:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK5-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK5-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK5-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2339,7 +2339,7 @@ int main() {
 // CHECK5-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK5:       omp_offload.cont:
 // CHECK5-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK5-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 0, ptr [[TMP16]], align 4
 // CHECK5-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -2384,7 +2384,7 @@ int main() {
 // CHECK5-NEXT:    [[_TMP1:%.*]] = alloca i32, align 4
 // CHECK5-NEXT:    [[KERNEL_ARGS2:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK5-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK5-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK5-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2426,7 +2426,7 @@ int main() {
 // CHECK5-NEXT:    [[TMP16:%.*]] = zext i8 [[TMP15]] to i32
 // CHECK5-NEXT:    [[TMP17:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP16]], 0
 // CHECK5-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP18]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP18]], align 4
 // CHECK5-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 0, ptr [[TMP19]], align 4
 // CHECK5-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
diff --git a/clang/test/OpenMP/teams_distribute_parallel_for_simd_private_codegen.cpp b/clang/test/OpenMP/teams_distribute_parallel_for_simd_private_codegen.cpp
index 03bee1dbb63c09..f40acb42f9dae4 100644
--- a/clang/test/OpenMP/teams_distribute_parallel_for_simd_private_codegen.cpp
+++ b/clang/test/OpenMP/teams_distribute_parallel_for_simd_private_codegen.cpp
@@ -265,7 +265,7 @@ int main() {
 // CHECK1-NEXT:    [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
 // CHECK1-NEXT:    store i32 0, ptr [[RETVAL]], align 4
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -549,7 +549,7 @@ int main() {
 // CHECK1-NEXT:    store ptr [[TEST]], ptr [[VAR]], align 8
 // CHECK1-NEXT:    store ptr undef, ptr [[_TMP1]], align 8
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1026,7 +1026,7 @@ int main() {
 // CHECK3-NEXT:    [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
 // CHECK3-NEXT:    store i32 0, ptr [[RETVAL]], align 4
 // CHECK3-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK3-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK3-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1304,7 +1304,7 @@ int main() {
 // CHECK3-NEXT:    store ptr [[TEST]], ptr [[VAR]], align 4
 // CHECK3-NEXT:    store ptr undef, ptr [[_TMP1]], align 4
 // CHECK3-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK3-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK3-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/teams_distribute_parallel_for_simd_proc_bind_codegen.cpp b/clang/test/OpenMP/teams_distribute_parallel_for_simd_proc_bind_codegen.cpp
index 7d35ea305f92b5..c473e8532189b1 100644
--- a/clang/test/OpenMP/teams_distribute_parallel_for_simd_proc_bind_codegen.cpp
+++ b/clang/test/OpenMP/teams_distribute_parallel_for_simd_proc_bind_codegen.cpp
@@ -62,7 +62,7 @@ int main() {
 // CHECK1-NEXT:    [[KERNEL_ARGS2:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
 // CHECK1-NEXT:    store i32 0, ptr [[RETVAL]], align 4
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -95,7 +95,7 @@ int main() {
 // CHECK1-NEXT:    br label [[OMP_OFFLOAD_CONT]]
 // CHECK1:       omp_offload.cont:
 // CHECK1-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP15]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP15]], align 4
 // CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP16]], align 4
 // CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
@@ -441,7 +441,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP:%.*]] = alloca i32, align 4
 // CHECK1-NEXT:    [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/teams_distribute_parallel_for_simd_reduction_codegen.cpp b/clang/test/OpenMP/teams_distribute_parallel_for_simd_reduction_codegen.cpp
index 2dd1db48f0306b..669ca4a6b76ece 100644
--- a/clang/test/OpenMP/teams_distribute_parallel_for_simd_reduction_codegen.cpp
+++ b/clang/test/OpenMP/teams_distribute_parallel_for_simd_reduction_codegen.cpp
@@ -120,7 +120,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP7]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP7]], align 4
 // CHECK1-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP8]], align 4
 // CHECK1-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -428,7 +428,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP7]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP7]], align 4
 // CHECK1-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP8]], align 4
 // CHECK1-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -733,7 +733,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP7]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP7]], align 4
 // CHECK3-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP8]], align 4
 // CHECK3-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1037,7 +1037,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP7]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP7]], align 4
 // CHECK3-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP8]], align 4
 // CHECK3-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/teams_distribute_parallel_for_simd_schedule_codegen.cpp b/clang/test/OpenMP/teams_distribute_parallel_for_simd_schedule_codegen.cpp
index ff70e71e012673..3d40ac12e5ec62 100644
--- a/clang/test/OpenMP/teams_distribute_parallel_for_simd_schedule_codegen.cpp
+++ b/clang/test/OpenMP/teams_distribute_parallel_for_simd_schedule_codegen.cpp
@@ -267,7 +267,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -309,7 +309,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS3]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS4]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK1-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK1-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 2
@@ -351,7 +351,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS11]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS12]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP45]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP45]], align 4
 // CHECK1-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP46]], align 4
 // CHECK1-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -393,7 +393,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP63:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS19]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP64:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS20]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP65:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP65]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP65]], align 4
 // CHECK1-NEXT:    [[TMP66:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP66]], align 4
 // CHECK1-NEXT:    [[TMP67:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 2
@@ -435,7 +435,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP83:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS27]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP84:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS28]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP85:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP85]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP85]], align 4
 // CHECK1-NEXT:    [[TMP86:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP86]], align 4
 // CHECK1-NEXT:    [[TMP87:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 2
@@ -1364,7 +1364,7 @@ int main (int argc, char **argv) {
 // CHECK2-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK2-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK2-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK2-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK2-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1406,7 +1406,7 @@ int main (int argc, char **argv) {
 // CHECK2-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS3]], i32 0, i32 0
 // CHECK2-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS4]], i32 0, i32 0
 // CHECK2-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 0
-// CHECK2-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK2-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK2-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 1
 // CHECK2-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK2-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 2
@@ -1448,7 +1448,7 @@ int main (int argc, char **argv) {
 // CHECK2-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS11]], i32 0, i32 0
 // CHECK2-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS12]], i32 0, i32 0
 // CHECK2-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK2-NEXT:    store i32 2, ptr [[TMP45]], align 4
+// CHECK2-NEXT:    store i32 3, ptr [[TMP45]], align 4
 // CHECK2-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK2-NEXT:    store i32 1, ptr [[TMP46]], align 4
 // CHECK2-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -1490,7 +1490,7 @@ int main (int argc, char **argv) {
 // CHECK2-NEXT:    [[TMP63:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS19]], i32 0, i32 0
 // CHECK2-NEXT:    [[TMP64:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS20]], i32 0, i32 0
 // CHECK2-NEXT:    [[TMP65:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 0
-// CHECK2-NEXT:    store i32 2, ptr [[TMP65]], align 4
+// CHECK2-NEXT:    store i32 3, ptr [[TMP65]], align 4
 // CHECK2-NEXT:    [[TMP66:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 1
 // CHECK2-NEXT:    store i32 1, ptr [[TMP66]], align 4
 // CHECK2-NEXT:    [[TMP67:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 2
@@ -1532,7 +1532,7 @@ int main (int argc, char **argv) {
 // CHECK2-NEXT:    [[TMP83:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS27]], i32 0, i32 0
 // CHECK2-NEXT:    [[TMP84:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS28]], i32 0, i32 0
 // CHECK2-NEXT:    [[TMP85:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 0
-// CHECK2-NEXT:    store i32 2, ptr [[TMP85]], align 4
+// CHECK2-NEXT:    store i32 3, ptr [[TMP85]], align 4
 // CHECK2-NEXT:    [[TMP86:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 1
 // CHECK2-NEXT:    store i32 1, ptr [[TMP86]], align 4
 // CHECK2-NEXT:    [[TMP87:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 2
@@ -2461,7 +2461,7 @@ int main (int argc, char **argv) {
 // CHECK5-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK5-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK5-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2503,7 +2503,7 @@ int main (int argc, char **argv) {
 // CHECK5-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS3]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS4]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK5-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK5-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 2
@@ -2545,7 +2545,7 @@ int main (int argc, char **argv) {
 // CHECK5-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS11]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS12]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP45]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP45]], align 4
 // CHECK5-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 1, ptr [[TMP46]], align 4
 // CHECK5-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -2587,7 +2587,7 @@ int main (int argc, char **argv) {
 // CHECK5-NEXT:    [[TMP63:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS19]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP64:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS20]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP65:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP65]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP65]], align 4
 // CHECK5-NEXT:    [[TMP66:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 1, ptr [[TMP66]], align 4
 // CHECK5-NEXT:    [[TMP67:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 2
@@ -2629,7 +2629,7 @@ int main (int argc, char **argv) {
 // CHECK5-NEXT:    [[TMP83:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS27]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP84:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS28]], i32 0, i32 0
 // CHECK5-NEXT:    [[TMP85:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 0
-// CHECK5-NEXT:    store i32 2, ptr [[TMP85]], align 4
+// CHECK5-NEXT:    store i32 3, ptr [[TMP85]], align 4
 // CHECK5-NEXT:    [[TMP86:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 1
 // CHECK5-NEXT:    store i32 1, ptr [[TMP86]], align 4
 // CHECK5-NEXT:    [[TMP87:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 2
@@ -3531,7 +3531,7 @@ int main (int argc, char **argv) {
 // CHECK6-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK6-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK6-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK6-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK6-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK6-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK6-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK6-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -3573,7 +3573,7 @@ int main (int argc, char **argv) {
 // CHECK6-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS3]], i32 0, i32 0
 // CHECK6-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS4]], i32 0, i32 0
 // CHECK6-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 0
-// CHECK6-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK6-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK6-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 1
 // CHECK6-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK6-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 2
@@ -3615,7 +3615,7 @@ int main (int argc, char **argv) {
 // CHECK6-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS11]], i32 0, i32 0
 // CHECK6-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS12]], i32 0, i32 0
 // CHECK6-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK6-NEXT:    store i32 2, ptr [[TMP45]], align 4
+// CHECK6-NEXT:    store i32 3, ptr [[TMP45]], align 4
 // CHECK6-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK6-NEXT:    store i32 1, ptr [[TMP46]], align 4
 // CHECK6-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -3657,7 +3657,7 @@ int main (int argc, char **argv) {
 // CHECK6-NEXT:    [[TMP63:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS19]], i32 0, i32 0
 // CHECK6-NEXT:    [[TMP64:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS20]], i32 0, i32 0
 // CHECK6-NEXT:    [[TMP65:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 0
-// CHECK6-NEXT:    store i32 2, ptr [[TMP65]], align 4
+// CHECK6-NEXT:    store i32 3, ptr [[TMP65]], align 4
 // CHECK6-NEXT:    [[TMP66:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 1
 // CHECK6-NEXT:    store i32 1, ptr [[TMP66]], align 4
 // CHECK6-NEXT:    [[TMP67:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS23]], i32 0, i32 2
@@ -3699,7 +3699,7 @@ int main (int argc, char **argv) {
 // CHECK6-NEXT:    [[TMP83:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS27]], i32 0, i32 0
 // CHECK6-NEXT:    [[TMP84:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS28]], i32 0, i32 0
 // CHECK6-NEXT:    [[TMP85:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 0
-// CHECK6-NEXT:    store i32 2, ptr [[TMP85]], align 4
+// CHECK6-NEXT:    store i32 3, ptr [[TMP85]], align 4
 // CHECK6-NEXT:    [[TMP86:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 1
 // CHECK6-NEXT:    store i32 1, ptr [[TMP86]], align 4
 // CHECK6-NEXT:    [[TMP87:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS31]], i32 0, i32 2
@@ -5044,7 +5044,7 @@ int main (int argc, char **argv) {
 // CHECK13-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP21]], 1
 // CHECK13-NEXT:    [[TMP22:%.*]] = zext i32 [[ADD]] to i64
 // CHECK13-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK13-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK13-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK13-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK13-NEXT:    store i32 3, ptr [[TMP24]], align 4
 // CHECK13-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -5115,7 +5115,7 @@ int main (int argc, char **argv) {
 // CHECK13-NEXT:    [[ADD14:%.*]] = add nsw i32 [[TMP56]], 1
 // CHECK13-NEXT:    [[TMP57:%.*]] = zext i32 [[ADD14]] to i64
 // CHECK13-NEXT:    [[TMP58:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK13-NEXT:    store i32 2, ptr [[TMP58]], align 4
+// CHECK13-NEXT:    store i32 3, ptr [[TMP58]], align 4
 // CHECK13-NEXT:    [[TMP59:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK13-NEXT:    store i32 3, ptr [[TMP59]], align 4
 // CHECK13-NEXT:    [[TMP60:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -5195,7 +5195,7 @@ int main (int argc, char **argv) {
 // CHECK13-NEXT:    [[ADD29:%.*]] = add nsw i32 [[TMP96]], 1
 // CHECK13-NEXT:    [[TMP97:%.*]] = zext i32 [[ADD29]] to i64
 // CHECK13-NEXT:    [[TMP98:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 0
-// CHECK13-NEXT:    store i32 2, ptr [[TMP98]], align 4
+// CHECK13-NEXT:    store i32 3, ptr [[TMP98]], align 4
 // CHECK13-NEXT:    [[TMP99:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 1
 // CHECK13-NEXT:    store i32 4, ptr [[TMP99]], align 4
 // CHECK13-NEXT:    [[TMP100:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 2
@@ -5266,7 +5266,7 @@ int main (int argc, char **argv) {
 // CHECK13-NEXT:    [[ADD44:%.*]] = add nsw i32 [[TMP131]], 1
 // CHECK13-NEXT:    [[TMP132:%.*]] = zext i32 [[ADD44]] to i64
 // CHECK13-NEXT:    [[TMP133:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS45]], i32 0, i32 0
-// CHECK13-NEXT:    store i32 2, ptr [[TMP133]], align 4
+// CHECK13-NEXT:    store i32 3, ptr [[TMP133]], align 4
 // CHECK13-NEXT:    [[TMP134:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS45]], i32 0, i32 1
 // CHECK13-NEXT:    store i32 3, ptr [[TMP134]], align 4
 // CHECK13-NEXT:    [[TMP135:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS45]], i32 0, i32 2
@@ -5346,7 +5346,7 @@ int main (int argc, char **argv) {
 // CHECK13-NEXT:    [[ADD60:%.*]] = add nsw i32 [[TMP171]], 1
 // CHECK13-NEXT:    [[TMP172:%.*]] = zext i32 [[ADD60]] to i64
 // CHECK13-NEXT:    [[TMP173:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS61]], i32 0, i32 0
-// CHECK13-NEXT:    store i32 2, ptr [[TMP173]], align 4
+// CHECK13-NEXT:    store i32 3, ptr [[TMP173]], align 4
 // CHECK13-NEXT:    [[TMP174:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS61]], i32 0, i32 1
 // CHECK13-NEXT:    store i32 4, ptr [[TMP174]], align 4
 // CHECK13-NEXT:    [[TMP175:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS61]], i32 0, i32 2
@@ -6658,7 +6658,7 @@ int main (int argc, char **argv) {
 // CHECK13-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK13-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK13-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK13-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK13-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK13-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK13-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK13-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -6699,7 +6699,7 @@ int main (int argc, char **argv) {
 // CHECK13-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0
 // CHECK13-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS2]], i32 0, i32 0
 // CHECK13-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0
-// CHECK13-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK13-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK13-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1
 // CHECK13-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK13-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2
@@ -6749,7 +6749,7 @@ int main (int argc, char **argv) {
 // CHECK13-NEXT:    [[TMP48:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS8]], i32 0, i32 0
 // CHECK13-NEXT:    [[TMP49:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS9]], i32 0, i32 0
 // CHECK13-NEXT:    [[TMP50:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 0
-// CHECK13-NEXT:    store i32 2, ptr [[TMP50]], align 4
+// CHECK13-NEXT:    store i32 3, ptr [[TMP50]], align 4
 // CHECK13-NEXT:    [[TMP51:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 1
 // CHECK13-NEXT:    store i32 2, ptr [[TMP51]], align 4
 // CHECK13-NEXT:    [[TMP52:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 2
@@ -6790,7 +6790,7 @@ int main (int argc, char **argv) {
 // CHECK13-NEXT:    [[TMP68:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS15]], i32 0, i32 0
 // CHECK13-NEXT:    [[TMP69:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS16]], i32 0, i32 0
 // CHECK13-NEXT:    [[TMP70:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS19]], i32 0, i32 0
-// CHECK13-NEXT:    store i32 2, ptr [[TMP70]], align 4
+// CHECK13-NEXT:    store i32 3, ptr [[TMP70]], align 4
 // CHECK13-NEXT:    [[TMP71:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS19]], i32 0, i32 1
 // CHECK13-NEXT:    store i32 1, ptr [[TMP71]], align 4
 // CHECK13-NEXT:    [[TMP72:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS19]], i32 0, i32 2
@@ -6840,7 +6840,7 @@ int main (int argc, char **argv) {
 // CHECK13-NEXT:    [[TMP93:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS23]], i32 0, i32 0
 // CHECK13-NEXT:    [[TMP94:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS24]], i32 0, i32 0
 // CHECK13-NEXT:    [[TMP95:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS27]], i32 0, i32 0
-// CHECK13-NEXT:    store i32 2, ptr [[TMP95]], align 4
+// CHECK13-NEXT:    store i32 3, ptr [[TMP95]], align 4
 // CHECK13-NEXT:    [[TMP96:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS27]], i32 0, i32 1
 // CHECK13-NEXT:    store i32 2, ptr [[TMP96]], align 4
 // CHECK13-NEXT:    [[TMP97:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS27]], i32 0, i32 2
@@ -7855,7 +7855,7 @@ int main (int argc, char **argv) {
 // CHECK14-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP21]], 1
 // CHECK14-NEXT:    [[TMP22:%.*]] = zext i32 [[ADD]] to i64
 // CHECK14-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK14-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK14-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK14-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK14-NEXT:    store i32 3, ptr [[TMP24]], align 4
 // CHECK14-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -7926,7 +7926,7 @@ int main (int argc, char **argv) {
 // CHECK14-NEXT:    [[ADD14:%.*]] = add nsw i32 [[TMP56]], 1
 // CHECK14-NEXT:    [[TMP57:%.*]] = zext i32 [[ADD14]] to i64
 // CHECK14-NEXT:    [[TMP58:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK14-NEXT:    store i32 2, ptr [[TMP58]], align 4
+// CHECK14-NEXT:    store i32 3, ptr [[TMP58]], align 4
 // CHECK14-NEXT:    [[TMP59:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK14-NEXT:    store i32 3, ptr [[TMP59]], align 4
 // CHECK14-NEXT:    [[TMP60:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -8006,7 +8006,7 @@ int main (int argc, char **argv) {
 // CHECK14-NEXT:    [[ADD29:%.*]] = add nsw i32 [[TMP96]], 1
 // CHECK14-NEXT:    [[TMP97:%.*]] = zext i32 [[ADD29]] to i64
 // CHECK14-NEXT:    [[TMP98:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 0
-// CHECK14-NEXT:    store i32 2, ptr [[TMP98]], align 4
+// CHECK14-NEXT:    store i32 3, ptr [[TMP98]], align 4
 // CHECK14-NEXT:    [[TMP99:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 1
 // CHECK14-NEXT:    store i32 4, ptr [[TMP99]], align 4
 // CHECK14-NEXT:    [[TMP100:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 2
@@ -8077,7 +8077,7 @@ int main (int argc, char **argv) {
 // CHECK14-NEXT:    [[ADD44:%.*]] = add nsw i32 [[TMP131]], 1
 // CHECK14-NEXT:    [[TMP132:%.*]] = zext i32 [[ADD44]] to i64
 // CHECK14-NEXT:    [[TMP133:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS45]], i32 0, i32 0
-// CHECK14-NEXT:    store i32 2, ptr [[TMP133]], align 4
+// CHECK14-NEXT:    store i32 3, ptr [[TMP133]], align 4
 // CHECK14-NEXT:    [[TMP134:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS45]], i32 0, i32 1
 // CHECK14-NEXT:    store i32 3, ptr [[TMP134]], align 4
 // CHECK14-NEXT:    [[TMP135:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS45]], i32 0, i32 2
@@ -8157,7 +8157,7 @@ int main (int argc, char **argv) {
 // CHECK14-NEXT:    [[ADD60:%.*]] = add nsw i32 [[TMP171]], 1
 // CHECK14-NEXT:    [[TMP172:%.*]] = zext i32 [[ADD60]] to i64
 // CHECK14-NEXT:    [[TMP173:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS61]], i32 0, i32 0
-// CHECK14-NEXT:    store i32 2, ptr [[TMP173]], align 4
+// CHECK14-NEXT:    store i32 3, ptr [[TMP173]], align 4
 // CHECK14-NEXT:    [[TMP174:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS61]], i32 0, i32 1
 // CHECK14-NEXT:    store i32 4, ptr [[TMP174]], align 4
 // CHECK14-NEXT:    [[TMP175:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS61]], i32 0, i32 2
@@ -9469,7 +9469,7 @@ int main (int argc, char **argv) {
 // CHECK14-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK14-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK14-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK14-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK14-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK14-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK14-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK14-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -9510,7 +9510,7 @@ int main (int argc, char **argv) {
 // CHECK14-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0
 // CHECK14-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS2]], i32 0, i32 0
 // CHECK14-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0
-// CHECK14-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK14-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK14-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1
 // CHECK14-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK14-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2
@@ -9560,7 +9560,7 @@ int main (int argc, char **argv) {
 // CHECK14-NEXT:    [[TMP48:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS8]], i32 0, i32 0
 // CHECK14-NEXT:    [[TMP49:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS9]], i32 0, i32 0
 // CHECK14-NEXT:    [[TMP50:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 0
-// CHECK14-NEXT:    store i32 2, ptr [[TMP50]], align 4
+// CHECK14-NEXT:    store i32 3, ptr [[TMP50]], align 4
 // CHECK14-NEXT:    [[TMP51:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 1
 // CHECK14-NEXT:    store i32 2, ptr [[TMP51]], align 4
 // CHECK14-NEXT:    [[TMP52:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 2
@@ -9601,7 +9601,7 @@ int main (int argc, char **argv) {
 // CHECK14-NEXT:    [[TMP68:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS15]], i32 0, i32 0
 // CHECK14-NEXT:    [[TMP69:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS16]], i32 0, i32 0
 // CHECK14-NEXT:    [[TMP70:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS19]], i32 0, i32 0
-// CHECK14-NEXT:    store i32 2, ptr [[TMP70]], align 4
+// CHECK14-NEXT:    store i32 3, ptr [[TMP70]], align 4
 // CHECK14-NEXT:    [[TMP71:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS19]], i32 0, i32 1
 // CHECK14-NEXT:    store i32 1, ptr [[TMP71]], align 4
 // CHECK14-NEXT:    [[TMP72:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS19]], i32 0, i32 2
@@ -9651,7 +9651,7 @@ int main (int argc, char **argv) {
 // CHECK14-NEXT:    [[TMP93:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS23]], i32 0, i32 0
 // CHECK14-NEXT:    [[TMP94:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS24]], i32 0, i32 0
 // CHECK14-NEXT:    [[TMP95:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS27]], i32 0, i32 0
-// CHECK14-NEXT:    store i32 2, ptr [[TMP95]], align 4
+// CHECK14-NEXT:    store i32 3, ptr [[TMP95]], align 4
 // CHECK14-NEXT:    [[TMP96:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS27]], i32 0, i32 1
 // CHECK14-NEXT:    store i32 2, ptr [[TMP96]], align 4
 // CHECK14-NEXT:    [[TMP97:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS27]], i32 0, i32 2
@@ -10666,7 +10666,7 @@ int main (int argc, char **argv) {
 // CHECK17-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP21]], 1
 // CHECK17-NEXT:    [[TMP22:%.*]] = zext i32 [[ADD]] to i64
 // CHECK17-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK17-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK17-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK17-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK17-NEXT:    store i32 3, ptr [[TMP24]], align 4
 // CHECK17-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -10738,7 +10738,7 @@ int main (int argc, char **argv) {
 // CHECK17-NEXT:    [[ADD14:%.*]] = add nsw i32 [[TMP57]], 1
 // CHECK17-NEXT:    [[TMP58:%.*]] = zext i32 [[ADD14]] to i64
 // CHECK17-NEXT:    [[TMP59:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK17-NEXT:    store i32 2, ptr [[TMP59]], align 4
+// CHECK17-NEXT:    store i32 3, ptr [[TMP59]], align 4
 // CHECK17-NEXT:    [[TMP60:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK17-NEXT:    store i32 3, ptr [[TMP60]], align 4
 // CHECK17-NEXT:    [[TMP61:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -10819,7 +10819,7 @@ int main (int argc, char **argv) {
 // CHECK17-NEXT:    [[ADD29:%.*]] = add nsw i32 [[TMP98]], 1
 // CHECK17-NEXT:    [[TMP99:%.*]] = zext i32 [[ADD29]] to i64
 // CHECK17-NEXT:    [[TMP100:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 0
-// CHECK17-NEXT:    store i32 2, ptr [[TMP100]], align 4
+// CHECK17-NEXT:    store i32 3, ptr [[TMP100]], align 4
 // CHECK17-NEXT:    [[TMP101:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 1
 // CHECK17-NEXT:    store i32 4, ptr [[TMP101]], align 4
 // CHECK17-NEXT:    [[TMP102:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 2
@@ -10891,7 +10891,7 @@ int main (int argc, char **argv) {
 // CHECK17-NEXT:    [[ADD44:%.*]] = add nsw i32 [[TMP134]], 1
 // CHECK17-NEXT:    [[TMP135:%.*]] = zext i32 [[ADD44]] to i64
 // CHECK17-NEXT:    [[TMP136:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS45]], i32 0, i32 0
-// CHECK17-NEXT:    store i32 2, ptr [[TMP136]], align 4
+// CHECK17-NEXT:    store i32 3, ptr [[TMP136]], align 4
 // CHECK17-NEXT:    [[TMP137:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS45]], i32 0, i32 1
 // CHECK17-NEXT:    store i32 3, ptr [[TMP137]], align 4
 // CHECK17-NEXT:    [[TMP138:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS45]], i32 0, i32 2
@@ -10972,7 +10972,7 @@ int main (int argc, char **argv) {
 // CHECK17-NEXT:    [[ADD60:%.*]] = add nsw i32 [[TMP175]], 1
 // CHECK17-NEXT:    [[TMP176:%.*]] = zext i32 [[ADD60]] to i64
 // CHECK17-NEXT:    [[TMP177:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS61]], i32 0, i32 0
-// CHECK17-NEXT:    store i32 2, ptr [[TMP177]], align 4
+// CHECK17-NEXT:    store i32 3, ptr [[TMP177]], align 4
 // CHECK17-NEXT:    [[TMP178:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS61]], i32 0, i32 1
 // CHECK17-NEXT:    store i32 4, ptr [[TMP178]], align 4
 // CHECK17-NEXT:    [[TMP179:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS61]], i32 0, i32 2
@@ -12259,7 +12259,7 @@ int main (int argc, char **argv) {
 // CHECK17-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK17-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK17-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK17-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK17-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK17-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -12300,7 +12300,7 @@ int main (int argc, char **argv) {
 // CHECK17-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS2]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0
-// CHECK17-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK17-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK17-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1
 // CHECK17-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK17-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2
@@ -12350,7 +12350,7 @@ int main (int argc, char **argv) {
 // CHECK17-NEXT:    [[TMP48:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS8]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP49:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS9]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP50:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 0
-// CHECK17-NEXT:    store i32 2, ptr [[TMP50]], align 4
+// CHECK17-NEXT:    store i32 3, ptr [[TMP50]], align 4
 // CHECK17-NEXT:    [[TMP51:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 1
 // CHECK17-NEXT:    store i32 2, ptr [[TMP51]], align 4
 // CHECK17-NEXT:    [[TMP52:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 2
@@ -12391,7 +12391,7 @@ int main (int argc, char **argv) {
 // CHECK17-NEXT:    [[TMP68:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS15]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP69:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS16]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP70:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS19]], i32 0, i32 0
-// CHECK17-NEXT:    store i32 2, ptr [[TMP70]], align 4
+// CHECK17-NEXT:    store i32 3, ptr [[TMP70]], align 4
 // CHECK17-NEXT:    [[TMP71:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS19]], i32 0, i32 1
 // CHECK17-NEXT:    store i32 1, ptr [[TMP71]], align 4
 // CHECK17-NEXT:    [[TMP72:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS19]], i32 0, i32 2
@@ -12441,7 +12441,7 @@ int main (int argc, char **argv) {
 // CHECK17-NEXT:    [[TMP93:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS23]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP94:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS24]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP95:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS27]], i32 0, i32 0
-// CHECK17-NEXT:    store i32 2, ptr [[TMP95]], align 4
+// CHECK17-NEXT:    store i32 3, ptr [[TMP95]], align 4
 // CHECK17-NEXT:    [[TMP96:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS27]], i32 0, i32 1
 // CHECK17-NEXT:    store i32 2, ptr [[TMP96]], align 4
 // CHECK17-NEXT:    [[TMP97:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS27]], i32 0, i32 2
@@ -13429,7 +13429,7 @@ int main (int argc, char **argv) {
 // CHECK19-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP21]], 1
 // CHECK19-NEXT:    [[TMP22:%.*]] = zext i32 [[ADD]] to i64
 // CHECK19-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK19-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK19-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK19-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK19-NEXT:    store i32 3, ptr [[TMP24]], align 4
 // CHECK19-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -13501,7 +13501,7 @@ int main (int argc, char **argv) {
 // CHECK19-NEXT:    [[ADD14:%.*]] = add nsw i32 [[TMP57]], 1
 // CHECK19-NEXT:    [[TMP58:%.*]] = zext i32 [[ADD14]] to i64
 // CHECK19-NEXT:    [[TMP59:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK19-NEXT:    store i32 2, ptr [[TMP59]], align 4
+// CHECK19-NEXT:    store i32 3, ptr [[TMP59]], align 4
 // CHECK19-NEXT:    [[TMP60:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK19-NEXT:    store i32 3, ptr [[TMP60]], align 4
 // CHECK19-NEXT:    [[TMP61:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -13582,7 +13582,7 @@ int main (int argc, char **argv) {
 // CHECK19-NEXT:    [[ADD29:%.*]] = add nsw i32 [[TMP98]], 1
 // CHECK19-NEXT:    [[TMP99:%.*]] = zext i32 [[ADD29]] to i64
 // CHECK19-NEXT:    [[TMP100:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 0
-// CHECK19-NEXT:    store i32 2, ptr [[TMP100]], align 4
+// CHECK19-NEXT:    store i32 3, ptr [[TMP100]], align 4
 // CHECK19-NEXT:    [[TMP101:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 1
 // CHECK19-NEXT:    store i32 4, ptr [[TMP101]], align 4
 // CHECK19-NEXT:    [[TMP102:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 2
@@ -13654,7 +13654,7 @@ int main (int argc, char **argv) {
 // CHECK19-NEXT:    [[ADD44:%.*]] = add nsw i32 [[TMP134]], 1
 // CHECK19-NEXT:    [[TMP135:%.*]] = zext i32 [[ADD44]] to i64
 // CHECK19-NEXT:    [[TMP136:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS45]], i32 0, i32 0
-// CHECK19-NEXT:    store i32 2, ptr [[TMP136]], align 4
+// CHECK19-NEXT:    store i32 3, ptr [[TMP136]], align 4
 // CHECK19-NEXT:    [[TMP137:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS45]], i32 0, i32 1
 // CHECK19-NEXT:    store i32 3, ptr [[TMP137]], align 4
 // CHECK19-NEXT:    [[TMP138:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS45]], i32 0, i32 2
@@ -13735,7 +13735,7 @@ int main (int argc, char **argv) {
 // CHECK19-NEXT:    [[ADD60:%.*]] = add nsw i32 [[TMP175]], 1
 // CHECK19-NEXT:    [[TMP176:%.*]] = zext i32 [[ADD60]] to i64
 // CHECK19-NEXT:    [[TMP177:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS61]], i32 0, i32 0
-// CHECK19-NEXT:    store i32 2, ptr [[TMP177]], align 4
+// CHECK19-NEXT:    store i32 3, ptr [[TMP177]], align 4
 // CHECK19-NEXT:    [[TMP178:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS61]], i32 0, i32 1
 // CHECK19-NEXT:    store i32 4, ptr [[TMP178]], align 4
 // CHECK19-NEXT:    [[TMP179:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS61]], i32 0, i32 2
@@ -15022,7 +15022,7 @@ int main (int argc, char **argv) {
 // CHECK19-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK19-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK19-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK19-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK19-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK19-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -15063,7 +15063,7 @@ int main (int argc, char **argv) {
 // CHECK19-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS2]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0
-// CHECK19-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK19-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK19-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1
 // CHECK19-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK19-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2
@@ -15113,7 +15113,7 @@ int main (int argc, char **argv) {
 // CHECK19-NEXT:    [[TMP48:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS8]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP49:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS9]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP50:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 0
-// CHECK19-NEXT:    store i32 2, ptr [[TMP50]], align 4
+// CHECK19-NEXT:    store i32 3, ptr [[TMP50]], align 4
 // CHECK19-NEXT:    [[TMP51:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 1
 // CHECK19-NEXT:    store i32 2, ptr [[TMP51]], align 4
 // CHECK19-NEXT:    [[TMP52:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 2
@@ -15154,7 +15154,7 @@ int main (int argc, char **argv) {
 // CHECK19-NEXT:    [[TMP68:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS15]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP69:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS16]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP70:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS19]], i32 0, i32 0
-// CHECK19-NEXT:    store i32 2, ptr [[TMP70]], align 4
+// CHECK19-NEXT:    store i32 3, ptr [[TMP70]], align 4
 // CHECK19-NEXT:    [[TMP71:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS19]], i32 0, i32 1
 // CHECK19-NEXT:    store i32 1, ptr [[TMP71]], align 4
 // CHECK19-NEXT:    [[TMP72:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS19]], i32 0, i32 2
@@ -15204,7 +15204,7 @@ int main (int argc, char **argv) {
 // CHECK19-NEXT:    [[TMP93:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS23]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP94:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS24]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP95:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS27]], i32 0, i32 0
-// CHECK19-NEXT:    store i32 2, ptr [[TMP95]], align 4
+// CHECK19-NEXT:    store i32 3, ptr [[TMP95]], align 4
 // CHECK19-NEXT:    [[TMP96:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS27]], i32 0, i32 1
 // CHECK19-NEXT:    store i32 2, ptr [[TMP96]], align 4
 // CHECK19-NEXT:    [[TMP97:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS27]], i32 0, i32 2
diff --git a/clang/test/OpenMP/teams_distribute_private_codegen.cpp b/clang/test/OpenMP/teams_distribute_private_codegen.cpp
index bdc001fe5d5d8b..78b42e3194c796 100644
--- a/clang/test/OpenMP/teams_distribute_private_codegen.cpp
+++ b/clang/test/OpenMP/teams_distribute_private_codegen.cpp
@@ -243,7 +243,7 @@ int main() {
 // CHECK1-NEXT:    [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
 // CHECK1-NEXT:    store i32 0, ptr [[RETVAL]], align 4
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -412,7 +412,7 @@ int main() {
 // CHECK1-NEXT:    store ptr [[TEST]], ptr [[VAR]], align 8
 // CHECK1-NEXT:    store ptr undef, ptr [[_TMP1]], align 8
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -771,7 +771,7 @@ int main() {
 // CHECK3-NEXT:    [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
 // CHECK3-NEXT:    store i32 0, ptr [[RETVAL]], align 4
 // CHECK3-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK3-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK3-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -938,7 +938,7 @@ int main() {
 // CHECK3-NEXT:    store ptr [[TEST]], ptr [[VAR]], align 4
 // CHECK3-NEXT:    store ptr undef, ptr [[_TMP1]], align 4
 // CHECK3-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK3-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK3-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/teams_distribute_reduction_codegen.cpp b/clang/test/OpenMP/teams_distribute_reduction_codegen.cpp
index 309cbffcf61625..ff2defa849efb7 100644
--- a/clang/test/OpenMP/teams_distribute_reduction_codegen.cpp
+++ b/clang/test/OpenMP/teams_distribute_reduction_codegen.cpp
@@ -105,7 +105,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP7]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP7]], align 4
 // CHECK1-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP8]], align 4
 // CHECK1-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -285,7 +285,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP7]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP7]], align 4
 // CHECK1-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP8]], align 4
 // CHECK1-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -462,7 +462,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP7]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP7]], align 4
 // CHECK3-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP8]], align 4
 // CHECK3-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -642,7 +642,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP7]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP7]], align 4
 // CHECK3-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP8]], align 4
 // CHECK3-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/teams_distribute_simd_codegen.cpp b/clang/test/OpenMP/teams_distribute_simd_codegen.cpp
index ee5d5cad72fe20..2fab0cff55373a 100644
--- a/clang/test/OpenMP/teams_distribute_simd_codegen.cpp
+++ b/clang/test/OpenMP/teams_distribute_simd_codegen.cpp
@@ -302,7 +302,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP34:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP26]], 0
 // CHECK1-NEXT:    [[TMP35:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP29]], 0
 // CHECK1-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP36]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP36]], align 4
 // CHECK1-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 5, ptr [[TMP37]], align 4
 // CHECK1-NEXT:    [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -362,7 +362,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[ADD14:%.*]] = add nsw i32 [[TMP63]], 1
 // CHECK1-NEXT:    [[TMP64:%.*]] = zext i32 [[ADD14]] to i64
 // CHECK1-NEXT:    [[TMP65:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP65]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP65]], align 4
 // CHECK1-NEXT:    [[TMP66:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 2, ptr [[TMP66]], align 4
 // CHECK1-NEXT:    [[TMP67:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -737,7 +737,7 @@ int main (int argc, char **argv) {
 // CHECK3-NEXT:    [[TMP34:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP26]], 0
 // CHECK3-NEXT:    [[TMP35:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP29]], 0
 // CHECK3-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP36]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP36]], align 4
 // CHECK3-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 5, ptr [[TMP37]], align 4
 // CHECK3-NEXT:    [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -797,7 +797,7 @@ int main (int argc, char **argv) {
 // CHECK3-NEXT:    [[ADD14:%.*]] = add nsw i32 [[TMP63]], 1
 // CHECK3-NEXT:    [[TMP64:%.*]] = zext i32 [[ADD14]] to i64
 // CHECK3-NEXT:    [[TMP65:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP65]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP65]], align 4
 // CHECK3-NEXT:    [[TMP66:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 2, ptr [[TMP66]], align 4
 // CHECK3-NEXT:    [[TMP67:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -1407,7 +1407,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP21]], 1
 // CHECK9-NEXT:    [[TMP22:%.*]] = zext i32 [[ADD]] to i64
 // CHECK9-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK9-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 3, ptr [[TMP24]], align 4
 // CHECK9-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1629,7 +1629,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP21]], 1
 // CHECK11-NEXT:    [[TMP22:%.*]] = zext i32 [[ADD]] to i64
 // CHECK11-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK11-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 3, ptr [[TMP24]], align 4
 // CHECK11-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1993,7 +1993,7 @@ int main (int argc, char **argv) {
 // CHECK17-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [3 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK17-NEXT:    store i32 2, ptr [[TMP18]], align 4
+// CHECK17-NEXT:    store i32 3, ptr [[TMP18]], align 4
 // CHECK17-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK17-NEXT:    store i32 3, ptr [[TMP19]], align 4
 // CHECK17-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2171,7 +2171,7 @@ int main (int argc, char **argv) {
 // CHECK19-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [3 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK19-NEXT:    store i32 2, ptr [[TMP18]], align 4
+// CHECK19-NEXT:    store i32 3, ptr [[TMP18]], align 4
 // CHECK19-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK19-NEXT:    store i32 3, ptr [[TMP19]], align 4
 // CHECK19-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2348,7 +2348,7 @@ int main (int argc, char **argv) {
 // CHECK21-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK21-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [3 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CHECK21-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK21-NEXT:    store i32 2, ptr [[TMP18]], align 4
+// CHECK21-NEXT:    store i32 3, ptr [[TMP18]], align 4
 // CHECK21-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK21-NEXT:    store i32 3, ptr [[TMP19]], align 4
 // CHECK21-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2576,7 +2576,7 @@ int main (int argc, char **argv) {
 // CHECK23-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK23-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [3 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CHECK23-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK23-NEXT:    store i32 2, ptr [[TMP18]], align 4
+// CHECK23-NEXT:    store i32 3, ptr [[TMP18]], align 4
 // CHECK23-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK23-NEXT:    store i32 3, ptr [[TMP19]], align 4
 // CHECK23-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -3130,7 +3130,7 @@ int main (int argc, char **argv) {
 // CHECK33-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP21]], 1
 // CHECK33-NEXT:    [[TMP22:%.*]] = zext i32 [[ADD]] to i64
 // CHECK33-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK33-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK33-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK33-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK33-NEXT:    store i32 3, ptr [[TMP24]], align 4
 // CHECK33-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -3343,7 +3343,7 @@ int main (int argc, char **argv) {
 // CHECK33-NEXT:    [[TMP19:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP15]], 0
 // CHECK33-NEXT:    [[TMP20:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP18]], 0
 // CHECK33-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK33-NEXT:    store i32 2, ptr [[TMP21]], align 4
+// CHECK33-NEXT:    store i32 3, ptr [[TMP21]], align 4
 // CHECK33-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK33-NEXT:    store i32 3, ptr [[TMP22]], align 4
 // CHECK33-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -3537,7 +3537,7 @@ int main (int argc, char **argv) {
 // CHECK35-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP21]], 1
 // CHECK35-NEXT:    [[TMP22:%.*]] = zext i32 [[ADD]] to i64
 // CHECK35-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK35-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK35-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK35-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK35-NEXT:    store i32 3, ptr [[TMP24]], align 4
 // CHECK35-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -3749,7 +3749,7 @@ int main (int argc, char **argv) {
 // CHECK35-NEXT:    [[TMP19:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP15]], 0
 // CHECK35-NEXT:    [[TMP20:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP18]], 0
 // CHECK35-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK35-NEXT:    store i32 2, ptr [[TMP21]], align 4
+// CHECK35-NEXT:    store i32 3, ptr [[TMP21]], align 4
 // CHECK35-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK35-NEXT:    store i32 3, ptr [[TMP22]], align 4
 // CHECK35-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -3952,7 +3952,7 @@ int main (int argc, char **argv) {
 // CHECK37-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP26]], 1
 // CHECK37-NEXT:    [[TMP27:%.*]] = zext i32 [[ADD]] to i64
 // CHECK37-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK37-NEXT:    store i32 2, ptr [[TMP28]], align 4
+// CHECK37-NEXT:    store i32 3, ptr [[TMP28]], align 4
 // CHECK37-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK37-NEXT:    store i32 4, ptr [[TMP29]], align 4
 // CHECK37-NEXT:    [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -4212,7 +4212,7 @@ int main (int argc, char **argv) {
 // CHECK37-NEXT:    [[TMP19:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP15]], 0
 // CHECK37-NEXT:    [[TMP20:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP18]], 0
 // CHECK37-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK37-NEXT:    store i32 2, ptr [[TMP21]], align 4
+// CHECK37-NEXT:    store i32 3, ptr [[TMP21]], align 4
 // CHECK37-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK37-NEXT:    store i32 3, ptr [[TMP22]], align 4
 // CHECK37-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -4416,7 +4416,7 @@ int main (int argc, char **argv) {
 // CHECK39-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP26]], 1
 // CHECK39-NEXT:    [[TMP27:%.*]] = zext i32 [[ADD]] to i64
 // CHECK39-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK39-NEXT:    store i32 2, ptr [[TMP28]], align 4
+// CHECK39-NEXT:    store i32 3, ptr [[TMP28]], align 4
 // CHECK39-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK39-NEXT:    store i32 4, ptr [[TMP29]], align 4
 // CHECK39-NEXT:    [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -4674,7 +4674,7 @@ int main (int argc, char **argv) {
 // CHECK39-NEXT:    [[TMP19:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP15]], 0
 // CHECK39-NEXT:    [[TMP20:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP18]], 0
 // CHECK39-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK39-NEXT:    store i32 2, ptr [[TMP21]], align 4
+// CHECK39-NEXT:    store i32 3, ptr [[TMP21]], align 4
 // CHECK39-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK39-NEXT:    store i32 3, ptr [[TMP22]], align 4
 // CHECK39-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/teams_distribute_simd_collapse_codegen.cpp b/clang/test/OpenMP/teams_distribute_simd_collapse_codegen.cpp
index 292bfb2a296c68..da0e7703e30cdf 100644
--- a/clang/test/OpenMP/teams_distribute_simd_collapse_codegen.cpp
+++ b/clang/test/OpenMP/teams_distribute_simd_collapse_codegen.cpp
@@ -128,7 +128,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -297,7 +297,7 @@ int main (int argc, char **argv) {
 // CHECK3-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -674,7 +674,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[TMP35:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_3]], align 8
 // CHECK9-NEXT:    [[ADD:%.*]] = add nsw i64 [[TMP35]], 1
 // CHECK9-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP36]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP36]], align 4
 // CHECK9-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 5, ptr [[TMP37]], align 4
 // CHECK9-NEXT:    [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -922,7 +922,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK9-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK9-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1155,7 +1155,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[TMP34:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_3]], align 8
 // CHECK11-NEXT:    [[ADD:%.*]] = add nsw i64 [[TMP34]], 1
 // CHECK11-NEXT:    [[TMP35:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP35]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP35]], align 4
 // CHECK11-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 5, ptr [[TMP36]], align 4
 // CHECK11-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1401,7 +1401,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK11-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK11-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/teams_distribute_simd_dist_schedule_codegen.cpp b/clang/test/OpenMP/teams_distribute_simd_dist_schedule_codegen.cpp
index a18b4d2feeec60..1266534b890f8a 100644
--- a/clang/test/OpenMP/teams_distribute_simd_dist_schedule_codegen.cpp
+++ b/clang/test/OpenMP/teams_distribute_simd_dist_schedule_codegen.cpp
@@ -170,7 +170,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -212,7 +212,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS3]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS4]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK1-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK1-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 2
@@ -254,7 +254,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS11]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS12]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP45]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP45]], align 4
 // CHECK1-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP46]], align 4
 // CHECK1-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -603,7 +603,7 @@ int main (int argc, char **argv) {
 // CHECK3-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -645,7 +645,7 @@ int main (int argc, char **argv) {
 // CHECK3-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS3]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS4]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK3-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK3-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS7]], i32 0, i32 2
@@ -687,7 +687,7 @@ int main (int argc, char **argv) {
 // CHECK3-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS11]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS12]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP45]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP45]], align 4
 // CHECK3-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP46]], align 4
 // CHECK3-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -1326,7 +1326,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP21]], 1
 // CHECK9-NEXT:    [[TMP22:%.*]] = zext i32 [[ADD]] to i64
 // CHECK9-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK9-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 3, ptr [[TMP24]], align 4
 // CHECK9-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1397,7 +1397,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[ADD14:%.*]] = add nsw i32 [[TMP56]], 1
 // CHECK9-NEXT:    [[TMP57:%.*]] = zext i32 [[ADD14]] to i64
 // CHECK9-NEXT:    [[TMP58:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP58]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP58]], align 4
 // CHECK9-NEXT:    [[TMP59:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 3, ptr [[TMP59]], align 4
 // CHECK9-NEXT:    [[TMP60:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -1468,7 +1468,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[ADD29:%.*]] = add nsw i32 [[TMP91]], 1
 // CHECK9-NEXT:    [[TMP92:%.*]] = zext i32 [[ADD29]] to i64
 // CHECK9-NEXT:    [[TMP93:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP93]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP93]], align 4
 // CHECK9-NEXT:    [[TMP94:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 3, ptr [[TMP94]], align 4
 // CHECK9-NEXT:    [[TMP95:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 2
@@ -1932,7 +1932,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK9-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK9-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1973,7 +1973,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS2]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK9-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK9-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2
@@ -2014,7 +2014,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS8]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS9]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP45]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP45]], align 4
 // CHECK9-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 1, ptr [[TMP46]], align 4
 // CHECK9-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 2
@@ -2403,7 +2403,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP21]], 1
 // CHECK11-NEXT:    [[TMP22:%.*]] = zext i32 [[ADD]] to i64
 // CHECK11-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK11-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 3, ptr [[TMP24]], align 4
 // CHECK11-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2475,7 +2475,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[ADD14:%.*]] = add nsw i32 [[TMP57]], 1
 // CHECK11-NEXT:    [[TMP58:%.*]] = zext i32 [[ADD14]] to i64
 // CHECK11-NEXT:    [[TMP59:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP59]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP59]], align 4
 // CHECK11-NEXT:    [[TMP60:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 3, ptr [[TMP60]], align 4
 // CHECK11-NEXT:    [[TMP61:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -2547,7 +2547,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[ADD29:%.*]] = add nsw i32 [[TMP93]], 1
 // CHECK11-NEXT:    [[TMP94:%.*]] = zext i32 [[ADD29]] to i64
 // CHECK11-NEXT:    [[TMP95:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP95]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP95]], align 4
 // CHECK11-NEXT:    [[TMP96:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 3, ptr [[TMP96]], align 4
 // CHECK11-NEXT:    [[TMP97:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS30]], i32 0, i32 2
@@ -3008,7 +3008,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK11-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK11-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -3049,7 +3049,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS1]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS2]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK11-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 1, ptr [[TMP26]], align 4
 // CHECK11-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2
@@ -3090,7 +3090,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS8]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS9]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP45]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP45]], align 4
 // CHECK11-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 1, ptr [[TMP46]], align 4
 // CHECK11-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS12]], i32 0, i32 2
diff --git a/clang/test/OpenMP/teams_distribute_simd_firstprivate_codegen.cpp b/clang/test/OpenMP/teams_distribute_simd_firstprivate_codegen.cpp
index dd3daf865512ab..2d3fccd90c0a5d 100644
--- a/clang/test/OpenMP/teams_distribute_simd_firstprivate_codegen.cpp
+++ b/clang/test/OpenMP/teams_distribute_simd_firstprivate_codegen.cpp
@@ -306,7 +306,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP21]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP21]], align 4
 // CHECK1-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 5, ptr [[TMP22]], align 4
 // CHECK1-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -596,7 +596,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP20]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP20]], align 4
 // CHECK1-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 4, ptr [[TMP21]], align 4
 // CHECK1-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1124,7 +1124,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP21]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP21]], align 4
 // CHECK3-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 5, ptr [[TMP22]], align 4
 // CHECK3-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1412,7 +1412,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP20]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP20]], align 4
 // CHECK3-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 4, ptr [[TMP21]], align 4
 // CHECK3-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/teams_distribute_simd_lastprivate_codegen.cpp b/clang/test/OpenMP/teams_distribute_simd_lastprivate_codegen.cpp
index 073960cc5a4601..ec95ab55b1552a 100644
--- a/clang/test/OpenMP/teams_distribute_simd_lastprivate_codegen.cpp
+++ b/clang/test/OpenMP/teams_distribute_simd_lastprivate_codegen.cpp
@@ -568,7 +568,7 @@ int main() {
 // CHECK9-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK9-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 5, ptr [[TMP26]], align 4
 // CHECK9-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -886,7 +886,7 @@ int main() {
 // CHECK9-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP20]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP20]], align 4
 // CHECK9-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 4, ptr [[TMP21]], align 4
 // CHECK9-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1278,7 +1278,7 @@ int main() {
 // CHECK11-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP25]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP25]], align 4
 // CHECK11-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 5, ptr [[TMP26]], align 4
 // CHECK11-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1594,7 +1594,7 @@ int main() {
 // CHECK11-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP20]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP20]], align 4
 // CHECK11-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 4, ptr [[TMP21]], align 4
 // CHECK11-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/teams_distribute_simd_private_codegen.cpp b/clang/test/OpenMP/teams_distribute_simd_private_codegen.cpp
index 52b213c3459151..c839268ada7534 100644
--- a/clang/test/OpenMP/teams_distribute_simd_private_codegen.cpp
+++ b/clang/test/OpenMP/teams_distribute_simd_private_codegen.cpp
@@ -244,7 +244,7 @@ int main() {
 // CHECK1-NEXT:    [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
 // CHECK1-NEXT:    store i32 0, ptr [[RETVAL]], align 4
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -420,7 +420,7 @@ int main() {
 // CHECK1-NEXT:    store ptr [[TEST]], ptr [[VAR]], align 8
 // CHECK1-NEXT:    store ptr undef, ptr [[_TMP1]], align 8
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -786,7 +786,7 @@ int main() {
 // CHECK3-NEXT:    [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
 // CHECK3-NEXT:    store i32 0, ptr [[RETVAL]], align 4
 // CHECK3-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK3-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK3-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -960,7 +960,7 @@ int main() {
 // CHECK3-NEXT:    store ptr [[TEST]], ptr [[VAR]], align 4
 // CHECK3-NEXT:    store ptr undef, ptr [[_TMP1]], align 4
 // CHECK3-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK3-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK3-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/teams_distribute_simd_reduction_codegen.cpp b/clang/test/OpenMP/teams_distribute_simd_reduction_codegen.cpp
index e660009f1cd7f8..27c47f50547eaa 100644
--- a/clang/test/OpenMP/teams_distribute_simd_reduction_codegen.cpp
+++ b/clang/test/OpenMP/teams_distribute_simd_reduction_codegen.cpp
@@ -105,7 +105,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP7]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP7]], align 4
 // CHECK1-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP8]], align 4
 // CHECK1-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -292,7 +292,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP7]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP7]], align 4
 // CHECK1-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP8]], align 4
 // CHECK1-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -476,7 +476,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP7]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP7]], align 4
 // CHECK3-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP8]], align 4
 // CHECK3-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -663,7 +663,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP7]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP7]], align 4
 // CHECK3-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP8]], align 4
 // CHECK3-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/teams_firstprivate_codegen.cpp b/clang/test/OpenMP/teams_firstprivate_codegen.cpp
index c023de8cf010dd..649fae99b21ac3 100644
--- a/clang/test/OpenMP/teams_firstprivate_codegen.cpp
+++ b/clang/test/OpenMP/teams_firstprivate_codegen.cpp
@@ -327,7 +327,7 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) {
 // CHECK9-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP21]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP21]], align 4
 // CHECK9-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 5, ptr [[TMP22]], align 4
 // CHECK9-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -371,7 +371,7 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) {
 // CHECK9-NEXT:    [[TMP41:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS2]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP42:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS3]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP43]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP43]], align 4
 // CHECK9-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 1, ptr [[TMP44]], align 4
 // CHECK9-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2
@@ -663,7 +663,7 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) {
 // CHECK9-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP16]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP16]], align 4
 // CHECK9-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 4, ptr [[TMP17]], align 4
 // CHECK9-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -707,7 +707,7 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) {
 // CHECK9-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS2]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS3]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP38]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP38]], align 4
 // CHECK9-NEXT:    [[TMP39:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 1, ptr [[TMP39]], align 4
 // CHECK9-NEXT:    [[TMP40:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2
@@ -1121,7 +1121,7 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) {
 // CHECK11-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP21]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP21]], align 4
 // CHECK11-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 5, ptr [[TMP22]], align 4
 // CHECK11-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1165,7 +1165,7 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) {
 // CHECK11-NEXT:    [[TMP41:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS2]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP42:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS3]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP43]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP43]], align 4
 // CHECK11-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 1, ptr [[TMP44]], align 4
 // CHECK11-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2
@@ -1457,7 +1457,7 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) {
 // CHECK11-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP16]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP16]], align 4
 // CHECK11-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 4, ptr [[TMP17]], align 4
 // CHECK11-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1501,7 +1501,7 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) {
 // CHECK11-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS2]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS3]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP38]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP38]], align 4
 // CHECK11-NEXT:    [[TMP39:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 1, ptr [[TMP39]], align 4
 // CHECK11-NEXT:    [[TMP40:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2
@@ -1942,7 +1942,7 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) {
 // CHECK17-NEXT:    [[TMP41:%.*]] = getelementptr inbounds [8 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP42:%.*]] = getelementptr inbounds [8 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK17-NEXT:    store i32 2, ptr [[TMP43]], align 4
+// CHECK17-NEXT:    store i32 3, ptr [[TMP43]], align 4
 // CHECK17-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK17-NEXT:    store i32 8, ptr [[TMP44]], align 4
 // CHECK17-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2180,7 +2180,7 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) {
 // CHECK17-NEXT:    [[TMP53:%.*]] = getelementptr inbounds [10 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP54:%.*]] = getelementptr inbounds [10 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP55:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK17-NEXT:    store i32 2, ptr [[TMP55]], align 4
+// CHECK17-NEXT:    store i32 3, ptr [[TMP55]], align 4
 // CHECK17-NEXT:    [[TMP56:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK17-NEXT:    store i32 10, ptr [[TMP56]], align 4
 // CHECK17-NEXT:    [[TMP57:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2405,7 +2405,7 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) {
 // CHECK19-NEXT:    [[TMP39:%.*]] = getelementptr inbounds [8 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP40:%.*]] = getelementptr inbounds [8 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP41:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK19-NEXT:    store i32 2, ptr [[TMP41]], align 4
+// CHECK19-NEXT:    store i32 3, ptr [[TMP41]], align 4
 // CHECK19-NEXT:    [[TMP42:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK19-NEXT:    store i32 8, ptr [[TMP42]], align 4
 // CHECK19-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2641,7 +2641,7 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) {
 // CHECK19-NEXT:    [[TMP51:%.*]] = getelementptr inbounds [10 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP52:%.*]] = getelementptr inbounds [10 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP53:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK19-NEXT:    store i32 2, ptr [[TMP53]], align 4
+// CHECK19-NEXT:    store i32 3, ptr [[TMP53]], align 4
 // CHECK19-NEXT:    [[TMP54:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK19-NEXT:    store i32 10, ptr [[TMP54]], align 4
 // CHECK19-NEXT:    [[TMP55:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/teams_generic_loop_codegen-1.cpp b/clang/test/OpenMP/teams_generic_loop_codegen-1.cpp
index 85a4dfea91a21a..d4a98f07fe24d5 100644
--- a/clang/test/OpenMP/teams_generic_loop_codegen-1.cpp
+++ b/clang/test/OpenMP/teams_generic_loop_codegen-1.cpp
@@ -253,7 +253,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP27:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP21]], 0
 // CHECK1-NEXT:    [[TMP28:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP22]], 0
 // CHECK1-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP29]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP29]], align 4
 // CHECK1-NEXT:    [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 4, ptr [[TMP30]], align 4
 // CHECK1-NEXT:    [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -313,7 +313,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[ADD14:%.*]] = add nsw i32 [[TMP56]], 1
 // CHECK1-NEXT:    [[TMP57:%.*]] = zext i32 [[ADD14]] to i64
 // CHECK1-NEXT:    [[TMP58:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP58]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP58]], align 4
 // CHECK1-NEXT:    [[TMP59:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 2, ptr [[TMP59]], align 4
 // CHECK1-NEXT:    [[TMP60:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -840,7 +840,7 @@ int main (int argc, char **argv) {
 // CHECK3-NEXT:    [[TMP27:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP21]], 0
 // CHECK3-NEXT:    [[TMP28:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP22]], 0
 // CHECK3-NEXT:    [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP29]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP29]], align 4
 // CHECK3-NEXT:    [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 4, ptr [[TMP30]], align 4
 // CHECK3-NEXT:    [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -900,7 +900,7 @@ int main (int argc, char **argv) {
 // CHECK3-NEXT:    [[ADD14:%.*]] = add nsw i32 [[TMP56]], 1
 // CHECK3-NEXT:    [[TMP57:%.*]] = zext i32 [[ADD14]] to i64
 // CHECK3-NEXT:    [[TMP58:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP58]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP58]], align 4
 // CHECK3-NEXT:    [[TMP59:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 2, ptr [[TMP59]], align 4
 // CHECK3-NEXT:    [[TMP60:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
@@ -1399,7 +1399,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP21]], 1
 // CHECK9-NEXT:    [[TMP22:%.*]] = zext i32 [[ADD]] to i64
 // CHECK9-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK9-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 3, ptr [[TMP24]], align 4
 // CHECK9-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1710,7 +1710,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP21]], 1
 // CHECK11-NEXT:    [[TMP22:%.*]] = zext i32 [[ADD]] to i64
 // CHECK11-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK11-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 3, ptr [[TMP24]], align 4
 // CHECK11-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1984,7 +1984,7 @@ int main (int argc, char **argv) {
 // CHECK17-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK17-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK17-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK17-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK17-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK17-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK17-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2202,7 +2202,7 @@ int main (int argc, char **argv) {
 // CHECK19-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK19-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK19-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK19-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK19-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK19-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK19-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2453,7 +2453,7 @@ int main (int argc, char **argv) {
 // CHECK25-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP21]], 1
 // CHECK25-NEXT:    [[TMP22:%.*]] = zext i32 [[ADD]] to i64
 // CHECK25-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK25-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK25-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK25-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK25-NEXT:    store i32 3, ptr [[TMP24]], align 4
 // CHECK25-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -2753,7 +2753,7 @@ int main (int argc, char **argv) {
 // CHECK25-NEXT:    [[TMP17:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP15]], 0
 // CHECK25-NEXT:    [[TMP18:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP16]], 0
 // CHECK25-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK25-NEXT:    store i32 2, ptr [[TMP19]], align 4
+// CHECK25-NEXT:    store i32 3, ptr [[TMP19]], align 4
 // CHECK25-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK25-NEXT:    store i32 3, ptr [[TMP20]], align 4
 // CHECK25-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -3013,7 +3013,7 @@ int main (int argc, char **argv) {
 // CHECK27-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP21]], 1
 // CHECK27-NEXT:    [[TMP22:%.*]] = zext i32 [[ADD]] to i64
 // CHECK27-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK27-NEXT:    store i32 2, ptr [[TMP23]], align 4
+// CHECK27-NEXT:    store i32 3, ptr [[TMP23]], align 4
 // CHECK27-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK27-NEXT:    store i32 3, ptr [[TMP24]], align 4
 // CHECK27-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -3308,7 +3308,7 @@ int main (int argc, char **argv) {
 // CHECK27-NEXT:    [[TMP17:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP15]], 0
 // CHECK27-NEXT:    [[TMP18:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP16]], 0
 // CHECK27-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK27-NEXT:    store i32 2, ptr [[TMP19]], align 4
+// CHECK27-NEXT:    store i32 3, ptr [[TMP19]], align 4
 // CHECK27-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK27-NEXT:    store i32 3, ptr [[TMP20]], align 4
 // CHECK27-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/teams_generic_loop_collapse_codegen.cpp b/clang/test/OpenMP/teams_generic_loop_collapse_codegen.cpp
index 49df83c9c765cf..c0c04986f147e0 100644
--- a/clang/test/OpenMP/teams_generic_loop_collapse_codegen.cpp
+++ b/clang/test/OpenMP/teams_generic_loop_collapse_codegen.cpp
@@ -132,7 +132,7 @@ int main (int argc, char **argv) {
 // CHECK1-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -368,7 +368,7 @@ int main (int argc, char **argv) {
 // CHECK3-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -668,7 +668,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[TMP35:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_3]], align 8
 // CHECK9-NEXT:    [[ADD:%.*]] = add nsw i64 [[TMP35]], 1
 // CHECK9-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP36]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP36]], align 4
 // CHECK9-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 5, ptr [[TMP37]], align 4
 // CHECK9-NEXT:    [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1018,7 +1018,7 @@ int main (int argc, char **argv) {
 // CHECK9-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK9-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK9-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1318,7 +1318,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[TMP34:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_3]], align 8
 // CHECK11-NEXT:    [[ADD:%.*]] = add nsw i64 [[TMP34]], 1
 // CHECK11-NEXT:    [[TMP35:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP35]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP35]], align 4
 // CHECK11-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 5, ptr [[TMP36]], align 4
 // CHECK11-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1670,7 +1670,7 @@ int main (int argc, char **argv) {
 // CHECK11-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK11-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK11-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/teams_generic_loop_private_codegen.cpp b/clang/test/OpenMP/teams_generic_loop_private_codegen.cpp
index 5ef729f044e097..303bce8c648c20 100644
--- a/clang/test/OpenMP/teams_generic_loop_private_codegen.cpp
+++ b/clang/test/OpenMP/teams_generic_loop_private_codegen.cpp
@@ -263,7 +263,7 @@ int main() {
 // CHECK1-NEXT:    [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
 // CHECK1-NEXT:    store i32 0, ptr [[RETVAL]], align 4
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -533,7 +533,7 @@ int main() {
 // CHECK1-NEXT:    store ptr [[TEST]], ptr [[VAR]], align 8
 // CHECK1-NEXT:    store ptr undef, ptr [[_TMP1]], align 8
 // CHECK1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -996,7 +996,7 @@ int main() {
 // CHECK3-NEXT:    [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
 // CHECK3-NEXT:    store i32 0, ptr [[RETVAL]], align 4
 // CHECK3-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK3-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK3-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1260,7 +1260,7 @@ int main() {
 // CHECK3-NEXT:    store ptr [[TEST]], ptr [[VAR]], align 4
 // CHECK3-NEXT:    store ptr undef, ptr [[_TMP1]], align 4
 // CHECK3-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK3-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK3-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/teams_generic_loop_reduction_codegen.cpp b/clang/test/OpenMP/teams_generic_loop_reduction_codegen.cpp
index 4da49eed32efd2..d80c003c010995 100644
--- a/clang/test/OpenMP/teams_generic_loop_reduction_codegen.cpp
+++ b/clang/test/OpenMP/teams_generic_loop_reduction_codegen.cpp
@@ -116,7 +116,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP7]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP7]], align 4
 // CHECK1-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP8]], align 4
 // CHECK1-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -410,7 +410,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP7]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP7]], align 4
 // CHECK1-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP8]], align 4
 // CHECK1-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -701,7 +701,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP7]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP7]], align 4
 // CHECK3-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP8]], align 4
 // CHECK3-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -991,7 +991,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP7]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP7]], align 4
 // CHECK3-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP8]], align 4
 // CHECK3-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/teams_private_codegen.cpp b/clang/test/OpenMP/teams_private_codegen.cpp
index 0126545c5915bc..1d0b2435ff0058 100644
--- a/clang/test/OpenMP/teams_private_codegen.cpp
+++ b/clang/test/OpenMP/teams_private_codegen.cpp
@@ -219,7 +219,7 @@ int main() {
 // CHECK1-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT:    store i32 2, ptr [[TMP6]], align 4
+// CHECK1-NEXT:    store i32 3, ptr [[TMP6]], align 4
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK1-NEXT:    store i32 1, ptr [[TMP7]], align 4
 // CHECK1-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -405,7 +405,7 @@ int main() {
 // CHECK3-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT:    store i32 2, ptr [[TMP6]], align 4
+// CHECK3-NEXT:    store i32 3, ptr [[TMP6]], align 4
 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK3-NEXT:    store i32 1, ptr [[TMP7]], align 4
 // CHECK3-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -557,7 +557,7 @@ int main() {
 // CHECK9-NEXT:    call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], float noundef 2.000000e+00)
 // CHECK9-NEXT:    call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[VAR]], float noundef 3.000000e+00)
 // CHECK9-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK9-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK9-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -724,7 +724,7 @@ int main() {
 // CHECK9-NEXT:    call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef signext 2)
 // CHECK9-NEXT:    call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[VAR]], i32 noundef signext 3)
 // CHECK9-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK9-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK9-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -804,7 +804,7 @@ int main() {
 // CHECK9-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP6]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP6]], align 4
 // CHECK9-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 1, ptr [[TMP7]], align 4
 // CHECK9-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1037,7 +1037,7 @@ int main() {
 // CHECK9-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK9-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK9-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK9-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK9-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK9-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK9-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1147,7 +1147,7 @@ int main() {
 // CHECK11-NEXT:    call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], float noundef 2.000000e+00)
 // CHECK11-NEXT:    call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) [[VAR]], float noundef 3.000000e+00)
 // CHECK11-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK11-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK11-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1314,7 +1314,7 @@ int main() {
 // CHECK11-NEXT:    call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef 2)
 // CHECK11-NEXT:    call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[VAR]], i32 noundef 3)
 // CHECK11-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP0]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP0]], align 4
 // CHECK11-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 0, ptr [[TMP1]], align 4
 // CHECK11-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1394,7 +1394,7 @@ int main() {
 // CHECK11-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP6]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP6]], align 4
 // CHECK11-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 1, ptr [[TMP7]], align 4
 // CHECK11-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1627,7 +1627,7 @@ int main() {
 // CHECK11-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
 // CHECK11-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK11-NEXT:    store i32 2, ptr [[TMP5]], align 4
+// CHECK11-NEXT:    store i32 3, ptr [[TMP5]], align 4
 // CHECK11-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
 // CHECK11-NEXT:    store i32 1, ptr [[TMP6]], align 4
 // CHECK11-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/PCH/Inputs/bounds-safety-attributed-type.h b/clang/test/PCH/Inputs/bounds-safety-attributed-type.h
new file mode 100644
index 00000000000000..9bfd9615c2bfc1
--- /dev/null
+++ b/clang/test/PCH/Inputs/bounds-safety-attributed-type.h
@@ -0,0 +1,4 @@
+struct Test {
+  int count;
+  int fam[] __attribute__((counted_by(count)));
+};
diff --git a/clang/test/PCH/bounds-safety-attributed-type.c b/clang/test/PCH/bounds-safety-attributed-type.c
new file mode 100644
index 00000000000000..c2dc0272bbd605
--- /dev/null
+++ b/clang/test/PCH/bounds-safety-attributed-type.c
@@ -0,0 +1,17 @@
+// RUN: %clang_cc1 -include %S/Inputs/bounds-safety-attributed-type.h -fsyntax-only -verify %s
+
+// Test with pch.
+// RUN: %clang_cc1 -emit-pch -o %t %S/Inputs/bounds-safety-attributed-type.h
+// RUN: %clang_cc1 -include-pch %t -fsyntax-only -verify %s
+// RUN: %clang_cc1 -include-pch %t -ast-print %s | FileCheck %s --check-prefix PRINT
+// RUN: %clang_cc1 -include-pch %t -ast-dump-all %s | FileCheck %s --check-prefix DUMP
+// expected-no-diagnostics
+
+// PRINT: 	   struct Test {
+// PRINT-NEXT:   int count;
+// PRINT-NEXT:   int fam[] __counted_by(count);
+// PRINT-NEXT: };
+
+// DUMP:        RecordDecl {{.*}} imported <undeserialized declarations> struct Test definition
+// DUMP-NEXT:   |-FieldDecl {{.*}} imported referenced count 'int'
+// DUMP-NEXT:   `-FieldDecl {{.*}} imported fam 'int[] __counted_by(count)':'int[]'
diff --git a/clang/test/Preprocessor/aarch64-target-features.c b/clang/test/Preprocessor/aarch64-target-features.c
index 6ec4dcd60cf601..9f8a8bdeeb9cb0 100644
--- a/clang/test/Preprocessor/aarch64-target-features.c
+++ b/clang/test/Preprocessor/aarch64-target-features.c
@@ -342,15 +342,15 @@
 
 // RUN: %clang -target aarch64 -march=armv8-a+fp+simd+crc+crypto -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-MARCH-1 %s
 // RUN: %clang -target aarch64 -march=armv8-a+nofp+nosimd+nocrc+nocrypto+fp+simd+crc+crypto -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-MARCH-1 %s
-// RUN: %clang -target aarch64 -march=armv8-a+nofp+nosimd+nocrc+nocrypto -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-MARCH-2 %s
-// RUN: %clang -target aarch64 -march=armv8-a+fp+simd+crc+crypto+nofp+nosimd+nocrc+nocrypto -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-MARCH-2 %s
+// RUN: %clang -target aarch64 -march=armv8-a+nofp+nosimd+nocrc+nocrypto -mabi=aapcs-soft -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-MARCH-2 %s
+// RUN: %clang -target aarch64 -march=armv8-a+fp+simd+crc+crypto+nofp+nosimd+nocrc+nocrypto -mabi=aapcs-soft -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-MARCH-2 %s
 // RUN: %clang -target aarch64 -march=armv8-a+nosimd -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-MARCH-3 %s
 // CHECK-MARCH-1: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+v8a" "-target-feature" "+aes" "-target-feature" "+crc" "-target-feature" "+crypto" "-target-feature" "+fp-armv8" "-target-feature" "+sha2" "-target-feature" "+neon"
 // CHECK-MARCH-2: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "-fp-armv8"{{.*}} "-target-feature" "-neon"
 // CHECK-MARCH-3: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "-neon"
 
 // While we're checking +nofp, also make sure it stops defining __ARM_FP
-// RUN: %clang -target aarch64-none-linux-gnu -march=armv8-r+nofp -x c -E -dM %s -o - | FileCheck -check-prefix=CHECK-NOFP %s
+// RUN: %clang -target aarch64-none-linux-gnu -march=armv8-r+nofp -mabi=aapcs-soft -x c -E -dM %s -o - | FileCheck -check-prefix=CHECK-NOFP %s
 // CHECK-NOFP-NOT: #define __ARM_FP{{ }}
 
 // Check +sm4:
diff --git a/clang/test/Preprocessor/bpf-predefined-macros.c b/clang/test/Preprocessor/bpf-predefined-macros.c
index fea24d1ea0ff7b..246cbfa2d6ab7d 100644
--- a/clang/test/Preprocessor/bpf-predefined-macros.c
+++ b/clang/test/Preprocessor/bpf-predefined-macros.c
@@ -61,7 +61,7 @@ int r;
 #ifdef __BPF_FEATURE_ST
 int s;
 #endif
-#ifdef __BPF_FEATURE_ARENA_CAST
+#ifdef __BPF_FEATURE_ADDR_SPACE_CAST
 int t;
 #endif
 
diff --git a/clang/test/Preprocessor/ptrauth_feature.c b/clang/test/Preprocessor/ptrauth_feature.c
new file mode 100644
index 00000000000000..e45c6ea90fd11f
--- /dev/null
+++ b/clang/test/Preprocessor/ptrauth_feature.c
@@ -0,0 +1,10 @@
+// RUN: %clang_cc1 %s -E -triple=arm64-- | FileCheck %s --check-prefixes=NOINTRIN
+// RUN: %clang_cc1 %s -E -triple=arm64-- -fptrauth-intrinsics | FileCheck %s --check-prefixes=INTRIN
+
+#if __has_feature(ptrauth_intrinsics)
+// INTRIN: has_ptrauth_intrinsics
+void has_ptrauth_intrinsics() {}
+#else
+// NOINTRIN: no_ptrauth_intrinsics
+void no_ptrauth_intrinsics() {}
+#endif
diff --git a/clang/test/Preprocessor/riscv-target-features.c b/clang/test/Preprocessor/riscv-target-features.c
index b2cad622610bfc..dfc6d18dee504e 100644
--- a/clang/test/Preprocessor/riscv-target-features.c
+++ b/clang/test/Preprocessor/riscv-target-features.c
@@ -1666,9 +1666,9 @@
 // CHECK-MISALIGNED-AVOID: __riscv_misaligned_avoid 1
 
 // RUN: %clang --target=riscv32-unknown-linux-gnu -march=rv32i -E -dM %s \
-// RUN:   -munaligned-access -o - | FileCheck %s --check-prefix=CHECK-MISALIGNED-FAST
+// RUN:   -mno-strict-align -o - | FileCheck %s --check-prefix=CHECK-MISALIGNED-FAST
 // RUN: %clang --target=riscv64-unknown-linux-gnu -march=rv64i -E -dM %s \
-// RUN:   -munaligned-access -o - | FileCheck %s --check-prefix=CHECK-MISALIGNED-FAST
+// RUN:   -mno-strict-align -o - | FileCheck %s --check-prefix=CHECK-MISALIGNED-FAST
 // RUN: %clang --target=riscv64-unknown-linux-gnu -mcpu=sifive-p450 -E -dM %s \
 // RUN:  -o - | FileCheck %s --check-prefix=CHECK-MISALIGNED-FAST
 // CHECK-MISALIGNED-FAST: __riscv_misaligned_fast 1
diff --git a/clang/test/Preprocessor/sysroot-prefix.c b/clang/test/Preprocessor/sysroot-prefix.c
index 08c72f53b44e9f..eff71f5e3d5a36 100644
--- a/clang/test/Preprocessor/sysroot-prefix.c
+++ b/clang/test/Preprocessor/sysroot-prefix.c
@@ -4,6 +4,16 @@
 // RUN: %clang_cc1 -v -isysroot /var/empty -I =null -E %s -o /dev/null 2>&1 | FileCheck -check-prefix CHECK-ISYSROOT_SYSROOT_NULL %s
 // RUN: %clang_cc1 -v -isysroot /var/empty -isysroot /var/empty/root -I =null -E %s -o /dev/null 2>&1 | FileCheck -check-prefix CHECK-ISYSROOT_ISYSROOT_SYSROOT_NULL %s
 // RUN: %clang_cc1 -v -isysroot /var/empty/root -isysroot /var/empty -I =null -E %s -o /dev/null 2>&1 | FileCheck -check-prefix CHECK-ISYSROOT_ISYSROOT_SWAPPED_SYSROOT_NULL %s
+// RUN: %clang_cc1 -v -isystem=/usr/include -isysroot /var/empty -E %s -o /dev/null 2>&1 | FileCheck -check-prefix CHECK-ISYSROOT_ISYSTEM_SYSROOT %s
+// RUN: %clang_cc1 -v -isystem=/usr/include -E %s -o /dev/null 2>&1 | FileCheck -check-prefix CHECK-ISYSROOT_ISYSTEM_NO_SYSROOT %s
+// RUN: %clang_cc1 -v -iquote=/usr/include -isysroot /var/empty  -E %s -o /dev/null 2>&1 | FileCheck -check-prefix CHECK-ISYSROOT_IQUOTE_SYSROOT %s
+// RUN: %clang_cc1 -v -iquote=/usr/include -E %s -o /dev/null 2>&1 | FileCheck -check-prefix CHECK-ISYSROOT_IQUOTE_NO_SYSROOT %s
+// RUN: %clang_cc1 -v -idirafter=/usr/include -isysroot /var/empty  -E %s -o /dev/null 2>&1 | FileCheck -check-prefix CHECK-ISYSROOT_IDIRAFTER_SYSROOT %s
+// RUN: %clang_cc1 -v -idirafter=/usr/include -E %s -o /dev/null 2>&1 | FileCheck -check-prefix CHECK-ISYSROOT_IDIRAFTER_NO_SYSROOT %s
+// RUN: %clang_cc1 -v -isysroot /var/empty -isystem=/usr/include -iwithsysroot /opt/include -isystem=/usr/local/include -E %s -o /dev/null 2>&1 | FileCheck -check-prefix CHECK-INTERLEAVE_I_PATHS %s
+
+// RUN: not %clang_cc1 -v -isysroot /var/empty -E %s -o /dev/null -I 2>&1 | FileCheck -check-prefix CHECK-EMPTY_I_PATH %s
+// RUN: %clang_cc1 -v -isysroot /var/empty/usr -E %s -o /dev/null -I= 2>&1 | FileCheck -check-prefix CHECK-SYSROOT_I_PATH %s
 
 // CHECK-ISYSROOT_NO_SYSROOT: ignoring nonexistent directory "/var/empty/include"
 // CHECK-ISYSROOT_NO_SYSROOT-NOT: ignoring nonexistent directory "/var/empty/var/empty/include"
@@ -23,3 +33,18 @@
 // CHECK-ISYSROOT_ISYSROOT_SWAPPED_SYSROOT_NULL: ignoring nonexistent directory "/var/empty{{.}}null"
 // CHECK-ISYSROOT_ISYSROOT_SWAPPED_SYSROOT_NULL-NOT: ignoring nonexistent directory "=null"
 
+// CHECK-ISYSROOT_ISYSTEM_SYSROOT: ignoring nonexistent directory "/var/empty/usr/include"
+// CHECK-ISYSROOT_ISYSTEM_NO_SYSROOT: ignoring nonexistent directory "=/usr/include"
+
+// CHECK-ISYSROOT_IQUOTE_SYSROOT: ignoring nonexistent directory "/var/empty/usr/include"
+// CHECK-ISYSROOT_IQUOTE_NO_SYSROOT: ignoring nonexistent directory "=/usr/include"
+
+// CHECK-ISYSROOT_IDIRAFTER_SYSROOT: ignoring nonexistent directory "/var/empty/usr/include"
+// CHECK-ISYSROOT_IDIRAFTER_NO_SYSROOT: ignoring nonexistent directory "=/usr/include"
+
+// CHECK-INTERLEAVE_I_PATHS: ignoring nonexistent directory "/var/empty/usr/include"
+// CHECK-INTERLEAVE_I_PATHS: ignoring nonexistent directory "/var/empty/opt/include"
+// CHECK-INTERLEAVE_I_PATHS: ignoring nonexistent directory "/var/empty/usr/local/include"
+
+// CHECK-EMPTY_I_PATH: argument to '-I' is missing
+// CHECK-SYSROOT_I_PATH: ignoring nonexistent directory "/var/empty/usr{{/|\\}}"
diff --git a/clang/test/Sema/PR85343.cpp b/clang/test/Sema/PR85343.cpp
new file mode 100644
index 00000000000000..d90ef19d423455
--- /dev/null
+++ b/clang/test/Sema/PR85343.cpp
@@ -0,0 +1,22 @@
+// RUN: %clang_cc1 -std=c++14 -verify %s
+// expected-no-diagnostics
+
+template <typename c> auto ab() -> c ;
+
+template <typename> struct e {};
+
+template <typename f> struct ac {
+  template <typename h> static e<decltype(ab<h>()(ab<int>))> i;
+  decltype(i<f>) j;
+};
+
+struct d {
+  template <typename f>
+  d(f) { 
+    ac<f> a;
+  }
+};
+struct a {
+  d b = [=](auto) { (void)[this] {}; };
+};
+void b() { new a; }
diff --git a/clang/test/Sema/arm-vector-types-support.c b/clang/test/Sema/arm-vector-types-support.c
index 83a83ddfe78017..ed5f5ba175a94a 100644
--- a/clang/test/Sema/arm-vector-types-support.c
+++ b/clang/test/Sema/arm-vector-types-support.c
@@ -1,4 +1,6 @@
 // RUN: %clang_cc1 %s -triple armv7 -fsyntax-only -verify
+// RUN: %clang_cc1 %s -triple aarch64 -fsyntax-only -verify
+// RUN: %clang_cc1 %s -triple aarch64 -target-feature -fp-armv8 -target-abi aapcs-soft -fsyntax-only -verify
 
 typedef __attribute__((neon_vector_type(2))) int int32x2_t; // expected-error{{'neon_vector_type' attribute is not supported on targets missing 'neon', 'mve', 'sve' or 'sme'; specify an appropriate -march= or -mcpu=}}
 typedef __attribute__((neon_polyvector_type(16))) short poly8x16_t; // expected-error{{'neon_polyvector_type' attribute is not supported on targets missing 'neon' or 'mve'; specify an appropriate -march= or -mcpu=}}
diff --git a/clang/test/Sema/attr-counted-by.c b/clang/test/Sema/attr-counted-by.c
index f14da9c77fa8b4..d5d4ebf5573922 100644
--- a/clang/test/Sema/attr-counted-by.c
+++ b/clang/test/Sema/attr-counted-by.c
@@ -11,16 +11,64 @@ struct not_found {
 
 struct no_found_count_not_in_substruct {
   unsigned long flags;
-  unsigned char count; // expected-note {{field 'count' declared here}}
+  unsigned char count; // expected-note {{'count' declared here}}
   struct A {
     int dummy;
     int array[] __counted_by(count); // expected-error {{'counted_by' field 'count' isn't within the same struct as the flexible array}}
   } a;
 };
 
+struct not_found_count_not_in_unnamed_substruct {
+  unsigned char count; // expected-note {{'count' declared here}}
+  struct {
+    int dummy;
+    int array[] __counted_by(count); // expected-error {{'counted_by' field 'count' isn't within the same struct as the flexible array}}
+  } a;
+};
+
+struct not_found_count_not_in_unnamed_substruct_2 {
+  struct {
+    unsigned char count; // expected-note {{'count' declared here}}
+  };
+  struct {
+    int dummy;
+    int array[] __counted_by(count); // expected-error {{'counted_by' field 'count' isn't within the same struct as the flexible array}}
+  } a;
+};
+
+struct not_found_count_in_other_unnamed_substruct {
+  struct {
+    unsigned char count;
+  } a1;
+
+  struct {
+    int dummy;
+    int array[] __counted_by(count); // expected-error {{use of undeclared identifier 'count'}}
+  };
+};
+
+struct not_found_count_in_other_substruct {
+  struct _a1 {
+    unsigned char count;
+  } a1;
+
+  struct {
+    int dummy;
+    int array[] __counted_by(count); // expected-error {{use of undeclared identifier 'count'}}
+  };
+};
+
+struct not_found_count_in_other_substruct_2 {
+  struct _a2 {
+    unsigned char count;
+  } a2;
+
+  int array[] __counted_by(count); // expected-error {{use of undeclared identifier 'count'}}
+};
+
 struct not_found_suggest {
-  int bork; // expected-note {{'bork' declared here}}
-  struct bar *fam[] __counted_by(blork); // expected-error {{use of undeclared identifier 'blork'; did you mean 'bork'?}}
+  int bork;
+  struct bar *fam[] __counted_by(blork); // expected-error {{use of undeclared identifier 'blork'}}
 };
 
 int global; // expected-note {{'global' declared here}}
@@ -32,17 +80,17 @@ struct found_outside_of_struct {
 
 struct self_referrential {
   int bork;
-  struct bar *self[] __counted_by(self); // expected-error {{'counted_by' cannot refer to the flexible array 'self'}}
+  struct bar *self[] __counted_by(self); // expected-error {{use of undeclared identifier 'self'}}
 };
 
 struct non_int_count {
-  double dbl_count; // expected-note {{field 'dbl_count' declared here}}
-  struct bar *fam[] __counted_by(dbl_count); // expected-error {{field 'dbl_count' in 'counted_by' must be a non-boolean integer type}}
+  double dbl_count;
+  struct bar *fam[] __counted_by(dbl_count); // expected-error {{'counted_by' requires a non-boolean integer type argument}}
 };
 
 struct array_of_ints_count {
-  int integers[2]; // expected-note {{field 'integers' declared here}}
-  struct bar *fam[] __counted_by(integers); // expected-error {{field 'integers' in 'counted_by' must be a non-boolean integer type}}
+  int integers[2];
+  struct bar *fam[] __counted_by(integers); // expected-error {{'counted_by' requires a non-boolean integer type argument}}
 };
 
 struct not_a_fam {
@@ -58,7 +106,7 @@ struct not_a_c99_fam {
 struct annotated_with_anon_struct {
   unsigned long flags;
   struct {
-    unsigned char count; // expected-note {{'count' declared here}}
-    int array[] __counted_by(crount); // expected-error {{use of undeclared identifier 'crount'; did you mean 'count'?}}
+    unsigned char count;
+    int array[] __counted_by(crount); // expected-error {{use of undeclared identifier 'crount'}}
   };
 };
diff --git a/clang/test/Sema/const-eval.c b/clang/test/Sema/const-eval.c
index 2e38d5e23c208a..e358aceaad5a43 100644
--- a/clang/test/Sema/const-eval.c
+++ b/clang/test/Sema/const-eval.c
@@ -134,8 +134,7 @@ void PR21945(void) { int i = (({}), 0l); }
 
 void PR24622(void);
 struct PR24622 {} pr24622;
-EVAL_EXPR(52, &pr24622 == (void *)&PR24622); // expected-error {{not an integer constant expression}}
-                                             // expected-note@-1 {{past the end}}
+EVAL_EXPR(52, &pr24622 == (void *)&PR24622);
 
 // We evaluate these by providing 2s' complement semantics in constant
 // expressions, like we do for integers.
diff --git a/clang/test/Sema/constexpr-void-cast.c b/clang/test/Sema/constexpr-void-cast.c
new file mode 100644
index 00000000000000..91e4027f67fe38
--- /dev/null
+++ b/clang/test/Sema/constexpr-void-cast.c
@@ -0,0 +1,14 @@
+// RUN: %clang_cc1 -x c -fsyntax-only %s -verify=c -std=c11
+// RUN: %clang_cc1 -x c -fsyntax-only %s -pedantic -verify=c-pedantic -std=c11
+//
+// RUN: %clang_cc1 -x c++ -fsyntax-only %s -verify=cxx
+// RUN: %clang_cc1 -x c++ -fsyntax-only %s -pedantic -verify=cxx-pedantic
+
+// c-no-diagnostics
+// cxx-no-diagnostics
+
+void f(void);
+struct S {char c;} s;
+_Static_assert(&s != (void *)&f, ""); // c-pedantic-warning {{not an integer constant expression}} \
+                                      // c-pedantic-note {{this conversion is not allowed in a constant expression}} \
+                                      // cxx-pedantic-warning {{'_Static_assert' is a C11 extension}}
diff --git a/clang/test/Sema/ptrauth-intrinsics-macro.c b/clang/test/Sema/ptrauth-intrinsics-macro.c
new file mode 100644
index 00000000000000..07d63740451452
--- /dev/null
+++ b/clang/test/Sema/ptrauth-intrinsics-macro.c
@@ -0,0 +1,34 @@
+// RUN: %clang_cc1 -triple arm64-apple-ios -Wall -fsyntax-only -verify -fptrauth-intrinsics %s
+// RUN: %clang_cc1 -triple arm64-apple-ios -Wall -fsyntax-only -verify %s
+
+// expected-no-diagnostics
+
+#include <ptrauth.h>
+
+#define VALID_CODE_KEY 0
+#define VALID_DATA_KEY 2
+
+extern int dv;
+
+void test(int *dp, int value) {
+  dp = ptrauth_strip(dp, VALID_DATA_KEY);
+  ptrauth_extra_data_t t0 = ptrauth_blend_discriminator(dp, value);
+  (void)t0;
+  dp = ptrauth_sign_unauthenticated(dp, VALID_DATA_KEY, 0);
+  dp = ptrauth_auth_and_resign(dp, VALID_DATA_KEY, dp, VALID_DATA_KEY, dp);
+  dp = ptrauth_auth_data(dp, VALID_DATA_KEY, 0);
+  int pu0 = 0, pu1 = 0, pu2 = 0, pu3 = 0, pu4 = 0, pu5 = 0, pu6 = 0, pu7 = 0;
+  ptrauth_blend_discriminator(&pu0, value);
+  ptrauth_auth_and_resign(&pu1, VALID_DATA_KEY, dp, VALID_DATA_KEY, dp);
+  ptrauth_auth_and_resign(dp, VALID_DATA_KEY, &pu2, VALID_DATA_KEY, dp);
+  ptrauth_auth_and_resign(dp, VALID_DATA_KEY, dp, VALID_DATA_KEY, &pu3);
+  ptrauth_sign_generic_data(pu4, dp);
+  ptrauth_sign_generic_data(dp, pu5);
+  ptrauth_auth_data(&pu6, VALID_DATA_KEY, value);
+  ptrauth_auth_data(dp, VALID_DATA_KEY, pu7);
+
+
+
+  int t2 = ptrauth_sign_generic_data(dp, 0);
+  (void)t2;
+}
diff --git a/clang/test/Sema/ptrauth.c b/clang/test/Sema/ptrauth.c
new file mode 100644
index 00000000000000..3ad3d70c24e413
--- /dev/null
+++ b/clang/test/Sema/ptrauth.c
@@ -0,0 +1,126 @@
+// RUN: %clang_cc1 -triple arm64-apple-ios -fsyntax-only -verify -fptrauth-intrinsics %s
+
+#if __has_feature(ptrauth_intrinsics)
+#warning Pointer authentication enabled!
+// expected-warning@-1 {{Pointer authentication enabled!}}
+#endif
+
+#if __aarch64__
+#define VALID_CODE_KEY 0
+#define VALID_DATA_KEY 2
+#define INVALID_KEY 200
+#else
+#error Provide these constants if you port this test
+#endif
+
+#define NULL ((void*) 0)
+struct A { int x; } mismatched_type;
+
+extern int dv;
+extern int fv(int);
+
+void test_strip(int *dp, int (*fp)(int)) {
+  __builtin_ptrauth_strip(dp); // expected-error {{too few arguments}}
+  __builtin_ptrauth_strip(dp, VALID_DATA_KEY, dp); // expected-error {{too many arguments}}
+  (void) __builtin_ptrauth_strip(NULL, VALID_DATA_KEY); // no warning
+
+  __builtin_ptrauth_strip(mismatched_type, VALID_DATA_KEY); // expected-error {{signed value must have pointer type; type here is 'struct A'}}
+  __builtin_ptrauth_strip(dp, mismatched_type); // expected-error {{passing 'struct A' to parameter of incompatible type 'int'}}
+
+  int *dr = __builtin_ptrauth_strip(dp, VALID_DATA_KEY);
+  dr = __builtin_ptrauth_strip(dp, INVALID_KEY); // expected-error {{does not identify a valid pointer authentication key for the current target}}
+
+  int (*fr)(int) = __builtin_ptrauth_strip(fp, VALID_CODE_KEY);
+  fr = __builtin_ptrauth_strip(fp, INVALID_KEY); // expected-error {{does not identify a valid pointer authentication key for the current target}}
+
+  float *mismatch = __builtin_ptrauth_strip(dp, VALID_DATA_KEY); // expected-warning {{incompatible pointer types initializing 'float *' with an expression of type 'int *'}}
+}
+
+void test_blend_discriminator(int *dp, int (*fp)(int), int value) {
+  __builtin_ptrauth_blend_discriminator(dp); // expected-error {{too few arguments}}
+  __builtin_ptrauth_blend_discriminator(dp, dp, dp); // expected-error {{too many arguments}}
+  (void) __builtin_ptrauth_blend_discriminator(dp, value); // no warning
+
+  __builtin_ptrauth_blend_discriminator(mismatched_type, value); // expected-error {{blended pointer must have pointer type; type here is 'struct A'}}
+  __builtin_ptrauth_blend_discriminator(dp, mismatched_type); // expected-error {{blended integer must have integer type; type here is 'struct A'}}
+
+  float *mismatch = __builtin_ptrauth_blend_discriminator(dp, value); // expected-error {{incompatible integer to pointer conversion initializing 'float *' with an expression of type}}
+}
+
+void test_sign_unauthenticated(int *dp, int (*fp)(int)) {
+  __builtin_ptrauth_sign_unauthenticated(dp, VALID_DATA_KEY); // expected-error {{too few arguments}}
+  __builtin_ptrauth_sign_unauthenticated(dp, VALID_DATA_KEY, dp, dp); // expected-error {{too many arguments}}
+
+  __builtin_ptrauth_sign_unauthenticated(mismatched_type, VALID_DATA_KEY, 0); // expected-error {{signed value must have pointer type; type here is 'struct A'}}
+  __builtin_ptrauth_sign_unauthenticated(dp, mismatched_type, 0); // expected-error {{passing 'struct A' to parameter of incompatible type 'int'}}
+  __builtin_ptrauth_sign_unauthenticated(dp, VALID_DATA_KEY, mismatched_type); // expected-error {{extra discriminator must have pointer or integer type; type here is 'struct A'}}
+
+  (void) __builtin_ptrauth_sign_unauthenticated(NULL, VALID_DATA_KEY, 0); // expected-warning {{signing a null pointer will yield a non-null pointer}}
+
+  int *dr = __builtin_ptrauth_sign_unauthenticated(dp, VALID_DATA_KEY, 0);
+  dr = __builtin_ptrauth_sign_unauthenticated(dp, INVALID_KEY, 0); // expected-error {{does not identify a valid pointer authentication key for the current target}}
+
+  int (*fr)(int) = __builtin_ptrauth_sign_unauthenticated(fp, VALID_CODE_KEY, 0);
+  fr = __builtin_ptrauth_sign_unauthenticated(fp, INVALID_KEY, 0); // expected-error {{does not identify a valid pointer authentication key for the current target}}
+
+  float *mismatch = __builtin_ptrauth_sign_unauthenticated(dp, VALID_DATA_KEY, 0); // expected-warning {{incompatible pointer types initializing 'float *' with an expression of type 'int *'}}
+}
+
+void test_auth(int *dp, int (*fp)(int)) {
+  __builtin_ptrauth_auth(dp, VALID_DATA_KEY); // expected-error {{too few arguments}}
+  __builtin_ptrauth_auth(dp, VALID_DATA_KEY, dp, dp); // expected-error {{too many arguments}}
+
+  __builtin_ptrauth_auth(mismatched_type, VALID_DATA_KEY, 0); // expected-error {{signed value must have pointer type; type here is 'struct A'}}
+  __builtin_ptrauth_auth(dp, mismatched_type, 0); // expected-error {{passing 'struct A' to parameter of incompatible type 'int'}}
+  __builtin_ptrauth_auth(dp, VALID_DATA_KEY, mismatched_type); // expected-error {{extra discriminator must have pointer or integer type; type here is 'struct A'}}
+
+  (void) __builtin_ptrauth_auth(NULL, VALID_DATA_KEY, 0); // expected-warning {{authenticating a null pointer will almost certainly trap}}
+
+  int *dr = __builtin_ptrauth_auth(dp, VALID_DATA_KEY, 0);
+  dr = __builtin_ptrauth_auth(dp, INVALID_KEY, 0); // expected-error {{does not identify a valid pointer authentication key for the current target}}
+
+  int (*fr)(int) = __builtin_ptrauth_auth(fp, VALID_CODE_KEY, 0);
+  fr = __builtin_ptrauth_auth(fp, INVALID_KEY, 0); // expected-error {{does not identify a valid pointer authentication key for the current target}}
+
+  float *mismatch = __builtin_ptrauth_auth(dp, VALID_DATA_KEY, 0); // expected-warning {{incompatible pointer types initializing 'float *' with an expression of type 'int *'}}
+}
+
+void test_auth_and_resign(int *dp, int (*fp)(int)) {
+  __builtin_ptrauth_auth_and_resign(dp, VALID_DATA_KEY, 0, VALID_DATA_KEY); // expected-error {{too few arguments}}
+  __builtin_ptrauth_auth_and_resign(dp, VALID_DATA_KEY, dp, VALID_DATA_KEY, dp, 0); // expected-error {{too many arguments}}
+
+  __builtin_ptrauth_auth_and_resign(mismatched_type, VALID_DATA_KEY, 0, VALID_DATA_KEY, dp); // expected-error {{signed value must have pointer type; type here is 'struct A'}}
+  __builtin_ptrauth_auth_and_resign(dp, mismatched_type, 0, VALID_DATA_KEY, dp); // expected-error {{passing 'struct A' to parameter of incompatible type 'int'}}
+  __builtin_ptrauth_auth_and_resign(dp, VALID_DATA_KEY, mismatched_type, VALID_DATA_KEY, dp); // expected-error {{extra discriminator must have pointer or integer type; type here is 'struct A'}}
+  __builtin_ptrauth_auth_and_resign(dp, VALID_DATA_KEY, 0, mismatched_type, dp); // expected-error {{passing 'struct A' to parameter of incompatible type 'int'}}
+  __builtin_ptrauth_auth_and_resign(dp, VALID_DATA_KEY, 0, VALID_DATA_KEY, mismatched_type); // expected-error {{extra discriminator must have pointer or integer type; type here is 'struct A'}}
+
+  (void) __builtin_ptrauth_auth_and_resign(NULL, VALID_DATA_KEY, 0, VALID_DATA_KEY, dp); // expected-warning {{authenticating a null pointer will almost certainly trap}}
+
+  int *dr = __builtin_ptrauth_auth_and_resign(dp, VALID_DATA_KEY, 0, VALID_DATA_KEY, dp);
+  dr = __builtin_ptrauth_auth_and_resign(dp, INVALID_KEY, 0, VALID_DATA_KEY, dp); // expected-error {{does not identify a valid pointer authentication key for the current target}}
+  dr = __builtin_ptrauth_auth_and_resign(dp, VALID_DATA_KEY, 0, INVALID_KEY, dp); // expected-error {{does not identify a valid pointer authentication key for the current target}}
+
+  int (*fr)(int) = __builtin_ptrauth_auth_and_resign(fp, VALID_CODE_KEY, 0, VALID_CODE_KEY, dp);
+  fr = __builtin_ptrauth_auth_and_resign(fp, INVALID_KEY, 0, VALID_CODE_KEY, dp); // expected-error {{does not identify a valid pointer authentication key for the current target}}
+  fr = __builtin_ptrauth_auth_and_resign(fp, VALID_CODE_KEY, 0, INVALID_KEY, dp); // expected-error {{does not identify a valid pointer authentication key for the current target}}
+
+  float *mismatch = __builtin_ptrauth_auth_and_resign(dp, VALID_DATA_KEY, 0, VALID_DATA_KEY, dp); // expected-warning {{incompatible pointer types initializing 'float *' with an expression of type 'int *'}}
+}
+
+void test_sign_generic_data(int *dp) {
+  __builtin_ptrauth_sign_generic_data(dp); // expected-error {{too few arguments}}
+  __builtin_ptrauth_sign_generic_data(dp, 0, 0); // expected-error {{too many arguments}}
+
+  __builtin_ptrauth_sign_generic_data(mismatched_type, 0); // expected-error {{signed value must have pointer or integer type; type here is 'struct A'}}
+  __builtin_ptrauth_sign_generic_data(dp, mismatched_type); // expected-error {{extra discriminator must have pointer or integer type; type here is 'struct A'}}
+
+  (void) __builtin_ptrauth_sign_generic_data(NULL, 0); // no warning
+
+  unsigned long dr = __builtin_ptrauth_sign_generic_data(dp, 0);
+  dr = __builtin_ptrauth_sign_generic_data(dp, &dv);
+  dr = __builtin_ptrauth_sign_generic_data(12314, 0);
+  dr = __builtin_ptrauth_sign_generic_data(12314, &dv);
+
+  int *mismatch = __builtin_ptrauth_sign_generic_data(dp, 0); // expected-error {{incompatible integer to pointer conversion initializing 'int *' with an expression of type}}
+}
diff --git a/clang/test/Sema/warn-bitwise-and-bool.c b/clang/test/Sema/warn-bitwise-and-bool.c
index 6bec1be1abdef6..c30498e2bd8d1a 100644
--- a/clang/test/Sema/warn-bitwise-and-bool.c
+++ b/clang/test/Sema/warn-bitwise-and-bool.c
@@ -19,6 +19,8 @@ boolean baz(void) __attribute__((const));
 void sink(boolean);
 
 #define FOO foo()
+#define MY_AND &
+#define My_BITAND bitand
 
 void test(boolean a, boolean b, int *p, volatile int *q, int i) {
   b = a & b;
@@ -44,9 +46,12 @@ void test(boolean a, boolean b, int *p, volatile int *q, int i) {
   b = b & foo();
   b = bar() & (i > 4);
   b = (i == 7) & foo();
+  b = b MY_AND foo();     // OK, no warning expected
+
 #ifdef __cplusplus
-  b = foo() bitand bar(); // expected-warning {{use of bitwise '&' with boolean operands}}
-                          // expected-note@-1 {{cast one or both operands to int to silence this warning}}
+  b = foo() bitand bar(); // Ok, no warning expected
+  b = foo() My_BITAND bar(); // Ok, no warning expected
+                          
 #endif
 
   if (foo() & bar())      // expected-warning {{use of bitwise '&' with boolean operands}}
@@ -60,4 +65,5 @@ void test(boolean a, boolean b, int *p, volatile int *q, int i) {
 
   int n = i + 10;
   b = (n & (n - 1));
+
 }
diff --git a/clang/test/Sema/warn-bitwise-or-bool.c b/clang/test/Sema/warn-bitwise-or-bool.c
index ae86790901aac5..e45ef28da0a70d 100644
--- a/clang/test/Sema/warn-bitwise-or-bool.c
+++ b/clang/test/Sema/warn-bitwise-or-bool.c
@@ -19,6 +19,8 @@ boolean baz(void) __attribute__((const));
 void sink(boolean);
 
 #define FOO foo()
+#define MY_OR |
+#define My_BITOR bitor
 
 void test(boolean a, boolean b, int *p, volatile int *q, int i) {
   b = a | b;
@@ -44,9 +46,11 @@ void test(boolean a, boolean b, int *p, volatile int *q, int i) {
   b = b | foo();
   b = bar() | (i > 4);
   b = (i == 7) | foo();
+  b = b MY_OR foo();     // OK, no warning expected
 #ifdef __cplusplus
-  b = foo() bitor bar();  // expected-warning {{use of bitwise '|' with boolean operands}}
-                          // expected-note@-1 {{cast one or both operands to int to silence this warning}}
+  b = foo() bitor bar();  //Ok, no warning expected
+  b = foo() My_BITOR bar(); // Ok, no warning expected
+                          
 #endif
 
   if (foo() | bar())      // expected-warning {{use of bitwise '|' with boolean operands}}
diff --git a/clang/test/SemaCXX/constexpr-builtin-bit-cast.cpp b/clang/test/SemaCXX/constexpr-builtin-bit-cast.cpp
index c5b8032f40b131..7520b43a194aba 100644
--- a/clang/test/SemaCXX/constexpr-builtin-bit-cast.cpp
+++ b/clang/test/SemaCXX/constexpr-builtin-bit-cast.cpp
@@ -90,7 +90,8 @@ void test_record() {
   struct tuple4 {
     unsigned x, y, z, doublez;
 
-    constexpr bool operator==(tuple4 const &other) const {
+    bool operator==(tuple4 const &other) const = default;
+    constexpr bool operator==(bases const &other) const {
       return x == other.x && y == other.y &&
              z == other.z && doublez == other.doublez;
     }
@@ -99,6 +100,9 @@ void test_record() {
   constexpr tuple4 t4 = bit_cast<tuple4>(b);
   static_assert(t4 == tuple4{1, 2, 3, 4});
   static_assert(round_trip<tuple4>(b));
+
+  constexpr auto b2 = bit_cast<bases>(t4);
+  static_assert(t4 == b2);
 }
 
 void test_partially_initialized() {
diff --git a/clang/test/SemaCXX/cxx23-assume.cpp b/clang/test/SemaCXX/cxx23-assume.cpp
index 2d7c9b174d9019..478da092471aff 100644
--- a/clang/test/SemaCXX/cxx23-assume.cpp
+++ b/clang/test/SemaCXX/cxx23-assume.cpp
@@ -126,3 +126,13 @@ static_assert(f5<D>() == 1); // expected-note 3 {{while checking constraint sati
 static_assert(f5<double>() == 2);
 static_assert(f5<E>() == 1); // expected-note {{while checking constraint satisfaction}} expected-note {{in instantiation of}}
 static_assert(f5<F>() == 2); // expected-note {{while checking constraint satisfaction}} expected-note {{in instantiation of}}
+
+// Do not validate assumptions whose evaluation would have side-effects.
+constexpr int foo() {
+  int a = 0;
+  [[assume(a++)]] [[assume(++a)]]; // expected-warning 2 {{has side effects that will be discarded}} ext-warning 2 {{C++23 extension}}
+  [[assume((a+=1))]]; // expected-warning {{has side effects that will be discarded}} ext-warning {{C++23 extension}}
+  return a;
+}
+
+static_assert(foo() == 0);
diff --git a/clang/test/SemaCXX/datasizeof.cpp b/clang/test/SemaCXX/datasizeof.cpp
index 5baf2ecb24ed7a..43135c00496638 100644
--- a/clang/test/SemaCXX/datasizeof.cpp
+++ b/clang/test/SemaCXX/datasizeof.cpp
@@ -1,4 +1,5 @@
 // RUN: %clang_cc1 -fsyntax-only -triple x86_64-linux-gnu -verify %s
+// RUN: %clang_cc1 -fsyntax-only -triple x86_64-linux-gnu -verify %s -fexperimental-new-constant-interpreter
 
 #if !__has_extension(datasizeof)
 #  error "Expected datasizeof extension"
diff --git a/clang/test/SemaCXX/decomposed-condition.cpp b/clang/test/SemaCXX/decomposed-condition.cpp
index ab011f6ae4ba43..e55bbee3134ca2 100644
--- a/clang/test/SemaCXX/decomposed-condition.cpp
+++ b/clang/test/SemaCXX/decomposed-condition.cpp
@@ -1,4 +1,5 @@
 // RUN: %clang_cc1 -std=c++1z -Wno-binding-in-condition -verify %s
+// RUN: %clang_cc1 -std=c++1z -Wno-binding-in-condition -verify %s -fexperimental-new-constant-interpreter
 
 struct X {
   bool flag;
diff --git a/clang/test/SemaCXX/lambda-expressions.cpp b/clang/test/SemaCXX/lambda-expressions.cpp
index 0516a5da31ae9a..389002ab0e349b 100644
--- a/clang/test/SemaCXX/lambda-expressions.cpp
+++ b/clang/test/SemaCXX/lambda-expressions.cpp
@@ -1,3 +1,4 @@
+// RUN: %clang_cc1 -std=c++11 -Wno-unused-value -fsyntax-only -verify=expected,expected-cxx14,cxx11 -fblocks %s
 // RUN: %clang_cc1 -std=c++14 -Wno-unused-value -fsyntax-only -verify -verify=expected-cxx14 -fblocks %s
 // RUN: %clang_cc1 -std=c++17 -Wno-unused-value -verify -ast-dump -fblocks %s | FileCheck %s
 
@@ -558,8 +559,8 @@ struct B {
   int x;
   A a = [&] { int y = x; };
   A b = [&] { [&] { [&] { int y = x; }; }; };
-  A d = [&](auto param) { int y = x; };
-  A e = [&](auto param) { [&] { [&](auto param2) { int y = x; }; }; };
+  A d = [&](auto param) { int y = x; }; // cxx11-error {{'auto' not allowed in lambda parameter}}
+  A e = [&](auto param) { [&] { [&](auto param2) { int y = x; }; }; }; // cxx11-error 2 {{'auto' not allowed in lambda parameter}}
 };
 
 B<int> b;
@@ -589,6 +590,7 @@ struct S1 {
 void foo1() {
   auto s0 = S1{[name=]() {}}; // expected-error 2 {{expected expression}}
   auto s1 = S1{[name=name]() {}}; // expected-error {{use of undeclared identifier 'name'; did you mean 'name1'?}}
+                                  // cxx11-warning@-1 {{initialized lambda captures are a C++14 extension}}
 }
 }
 
@@ -604,7 +606,7 @@ namespace PR25627_dont_odr_use_local_consts {
 
 namespace ConversionOperatorDoesNotHaveDeducedReturnType {
   auto x = [](int){};
-  auto y = [](auto &v) -> void { v.n = 0; };
+  auto y = [](auto &v) -> void { v.n = 0; }; // cxx11-error {{'auto' not allowed in lambda parameter}} cxx11-note {{candidate function not viable}} cxx11-note {{conversion candidate}}
   using T = decltype(x);
   using U = decltype(y);
   using ExpectedTypeT = void (*)(int);
@@ -624,22 +626,22 @@ namespace ConversionOperatorDoesNotHaveDeducedReturnType {
     template<typename T>
       friend constexpr U::operator ExpectedTypeU<T>() const noexcept;
 #else
-    friend auto T::operator()(int) const;
+    friend auto T::operator()(int) const; // cxx11-error {{'auto' return without trailing return type; deduced return types are a C++14 extension}}
     friend T::operator ExpectedTypeT() const;
 
     template<typename T>
-      friend void U::operator()(T&) const;
+      friend void U::operator()(T&) const; // cxx11-error {{friend declaration of 'operator()' does not match any declaration}}
     // FIXME: This should not match, as above.
     template<typename T>
-      friend U::operator ExpectedTypeU<T>() const;
+      friend U::operator ExpectedTypeU<T>() const; // cxx11-error {{friend declaration of 'operator void (*)(type-parameter-0-0 &)' does not match any declaration}}
 #endif
 
   private:
     int n;
   };
 
-  // Should be OK: lambda's call operator is a friend.
-  void use(X &x) { y(x); }
+  // Should be OK in C++14 and later: lambda's call operator is a friend.
+  void use(X &x) { y(x); } // cxx11-error {{no matching function for call to object}}
 
   // This used to crash in return type deduction for the conversion opreator.
   struct A { int n; void f() { +[](decltype(n)) {}; } };
@@ -733,6 +735,8 @@ void GH67492() {
   auto lambda = (test, []() noexcept(true) {});
 }
 
+// FIXME: This currently causes clang to crash in C++11 mode.
+#if __cplusplus >= 201402L
 namespace GH83267 {
 auto l = [](auto a) { return 1; };
 using type = decltype(l);
@@ -747,3 +751,4 @@ using t = decltype(ll);
 template auto t::operator()<int>(int a) const; // expected-note {{in instantiation}}
 
 }
+#endif
diff --git a/clang/test/SemaCXX/source_location.cpp b/clang/test/SemaCXX/source_location.cpp
index b151fc45fdad62..63157cfacdd98b 100644
--- a/clang/test/SemaCXX/source_location.cpp
+++ b/clang/test/SemaCXX/source_location.cpp
@@ -1,14 +1,14 @@
 // RUN: %clang_cc1 -std=c++1z -fcxx-exceptions -fexceptions -verify %s
 // RUN: %clang_cc1 -std=c++2a -fcxx-exceptions -DUSE_CONSTEVAL -fexceptions -verify %s
 // RUN: %clang_cc1 -std=c++2b -fcxx-exceptions -DUSE_CONSTEVAL -DPAREN_INIT -fexceptions -verify %s
-// RUN: %clang_cc1 -std=c++1z -fcxx-exceptions -fms-extensions -DMS -fexceptions -verify %s
-// RUN: %clang_cc1 -std=c++2a -fcxx-exceptions -fms-extensions -DMS -DUSE_CONSTEVAL -fexceptions -verify %s
+// RUN: %clang_cc1 -std=c++1z -fcxx-exceptions -fms-extensions -DMS -fexceptions -fms-compatibility -verify %s
+// RUN: %clang_cc1 -std=c++2a -fcxx-exceptions -fms-extensions -DMS -DUSE_CONSTEVAL -fexceptions -fms-compatibility -verify %s
 //
 // RUN: %clang_cc1 -std=c++1z -fcxx-exceptions -fexceptions -fexperimental-new-constant-interpreter -DNEW_INTERP -verify %s
 // RUN: %clang_cc1 -std=c++2a -fcxx-exceptions -DUSE_CONSTEVAL -fexceptions -fexperimental-new-constant-interpreter -DNEW_INTERP -verify %s
 // RUN: %clang_cc1 -std=c++2b -fcxx-exceptions -DUSE_CONSTEVAL -DPAREN_INIT -fexceptions -fexperimental-new-constant-interpreter -DNEW_INTERP -verify %s
-// RUN: %clang_cc1 -std=c++1z -fcxx-exceptions -fms-extensions -DMS -fexceptions -fexperimental-new-constant-interpreter -DNEW_INTERP -verify %s
-// RUN: %clang_cc1 -std=c++2a -fcxx-exceptions -fms-extensions -DMS -DUSE_CONSTEVAL -fexceptions -fexperimental-new-constant-interpreter -DNEW_INTERP -verify %s
+// RUN: %clang_cc1 -std=c++1z -fcxx-exceptions -fms-extensions -DMS -fexceptions -fexperimental-new-constant-interpreter -DNEW_INTERP -fms-compatibility -verify %s
+// RUN: %clang_cc1 -std=c++2a -fcxx-exceptions -fms-extensions -DMS -DUSE_CONSTEVAL -fexceptions -fexperimental-new-constant-interpreter -DNEW_INTERP -verify -fms-compatibility %s
 // expected-no-diagnostics
 
 #define assert(...) ((__VA_ARGS__) ? ((void)0) : throw 42)
@@ -463,8 +463,70 @@ void ctor_tests() {
 constexpr SL global_sl = SL::current();
 static_assert(is_equal(global_sl.function(), ""));
 
+template <class T>
+class TestBI {
+public:
+   TestBI() {
+#ifdef MS
+     static_assert(is_equal(__FUNCTION__, "test_func::TestBI<int>::TestBI"));
+#else
+     static_assert(is_equal(__FUNCTION__, "TestBI"));
+#endif
+     static_assert(is_equal(__func__, "TestBI"));
+   }
+};
+
+template <class T>
+class TestClass {
+public:
+   TestClass() {
+#ifdef MS
+      static_assert(is_equal(__FUNCTION__, "test_func::TestClass<class test_func::C>::TestClass"));
+#else
+      static_assert(is_equal(__FUNCTION__, "TestClass"));
+#endif
+      static_assert(is_equal(__func__, "TestClass"));
+   }
+};
+
+template <class T>
+class TestStruct {
+public:
+   TestStruct() {
+#ifdef MS
+      static_assert(is_equal(__FUNCTION__, "test_func::TestStruct<struct test_func::S>::TestStruct"));
+#else
+      static_assert(is_equal(__FUNCTION__, "TestStruct"));
+#endif
+      static_assert(is_equal(__func__, "TestStruct"));
+   }
+};
+
+template <class T>
+class TestEnum {
+public:
+   TestEnum() {
+#ifdef MS
+      static_assert(is_equal(__FUNCTION__, "test_func::TestEnum<enum test_func::E>::TestEnum"));
+#else
+      static_assert(is_equal(__FUNCTION__, "TestEnum"));
+#endif
+      static_assert(is_equal(__func__, "TestEnum"));
+   }
+};
+
+class C {};
+struct S {};
+enum E {};
+
+TestBI<int> t1;
+TestClass<test_func::C> t2;
+TestStruct<test_func::S> t3;
+TestEnum<test_func::E> t4;
+
 } // namespace test_func
 
+
 //===----------------------------------------------------------------------===//
 //                            __builtin_FUNCSIG()
 //===----------------------------------------------------------------------===//
diff --git a/clang/test/SemaCXX/warn-constant-evaluated-constexpr.cpp b/clang/test/SemaCXX/warn-constant-evaluated-constexpr.cpp
index 35dc69cccb3a2e..fa40c13971118d 100644
--- a/clang/test/SemaCXX/warn-constant-evaluated-constexpr.cpp
+++ b/clang/test/SemaCXX/warn-constant-evaluated-constexpr.cpp
@@ -1,4 +1,5 @@
 // RUN: %clang_cc1 -std=c++2a -fsyntax-only -verify %s
+// RUN: %clang_cc1 -std=c++2a -fsyntax-only -verify %s -fexperimental-new-constant-interpreter
 
 namespace std {
 constexpr bool is_constant_evaluated() noexcept {
diff --git a/clang/test/SemaHLSL/BuiltIns/clamp-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/clamp-errors.hlsl
new file mode 100644
index 00000000000000..4c0e5315ce532e
--- /dev/null
+++ b/clang/test/SemaHLSL/BuiltIns/clamp-errors.hlsl
@@ -0,0 +1,91 @@
+// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -verify -verify-ignore-unexpected
+
+float2 test_no_second_arg(float2 p0) {
+  return __builtin_hlsl_elementwise_clamp(p0);
+  // expected-error@-1 {{too few arguments to function call, expected 3, have 1}}
+}
+
+float2 test_no_third_arg(float2 p0) {
+  return __builtin_hlsl_elementwise_clamp(p0, p0);
+  // expected-error@-1 {{too few arguments to function call, expected 3, have 2}}
+}
+
+float2 test_too_many_arg(float2 p0) {
+  return __builtin_hlsl_elementwise_clamp(p0, p0, p0, p0);
+  // expected-error@-1 {{too many arguments to function call, expected 3, have 4}}
+}
+
+float2 test_clamp_no_second_arg(float2 p0) {
+  return clamp(p0);
+  // expected-error@-1 {{no matching function for call to 'clamp'}}
+}
+
+float2 test_clamp_vector_size_mismatch(float3 p0, float2 p1) {
+  return clamp(p0, p0, p1);
+  // expected-warning@-1 {{implicit conversion truncates vector: 'float3' (aka 'vector<float, 3>') to 'float __attribute__((ext_vector_type(2)))' (vector of 2 'float' values)}}
+}
+
+float2 test_clamp_builtin_vector_size_mismatch(float3 p0, float2 p1) {
+  return __builtin_hlsl_elementwise_clamp(p0, p1, p1);
+  // expected-error@-1 {{all arguments to '__builtin_hlsl_elementwise_clamp' must have the same type}}
+}
+
+float test_clamp_scalar_mismatch(float p0, half p1) {
+  return clamp(p1, p0, p1);
+  // expected-error@-1 {{call to 'clamp' is ambiguous}}
+}
+
+float2 test_clamp_element_type_mismatch(half2 p0, float2 p1) {
+  return clamp(p1, p0, p1);
+  // expected-error@-1 {{call to 'clamp' is ambiguous}}
+}
+
+float2 test_builtin_clamp_float2_splat(float p0, float2 p1) {
+  return __builtin_hlsl_elementwise_clamp(p0, p1, p1);
+  // expected-error@-1 {{all arguments to '__builtin_hlsl_elementwise_clamp' must be vectors}}
+}
+
+float3 test_builtin_clamp_float3_splat(float p0, float3 p1) {
+  return __builtin_hlsl_elementwise_clamp(p0, p1, p1);
+  // expected-error@-1 {{all arguments to '__builtin_hlsl_elementwise_clamp' must be vectors}}
+}
+
+float4 test_builtin_clamp_float4_splat(float p0, float4 p1) {
+  return __builtin_hlsl_elementwise_clamp(p0, p1, p1);
+  // expected-error@-1 {{all arguments to '__builtin_hlsl_elementwise_clamp' must be vectors}}
+}
+
+float2 test_clamp_float2_int_splat(float2 p0, int p1) {
+  return __builtin_hlsl_elementwise_clamp(p0, p1, p1);
+  // expected-error@-1 {{all arguments to '__builtin_hlsl_elementwise_clamp' must be vectors}}
+}
+
+float3 test_clamp_float3_int_splat(float3 p0, int p1) {
+  return __builtin_hlsl_elementwise_clamp(p0, p1, p1);
+  // expected-error@-1 {{all arguments to '__builtin_hlsl_elementwise_clamp' must be vectors}}
+}
+
+float2 test_builtin_clamp_int_vect_to_float_vec_promotion(int2 p0, float p1) {
+  return __builtin_hlsl_elementwise_clamp(p0, p1, p1);
+  // expected-error@-1 {{all arguments to '__builtin_hlsl_elementwise_clamp' must be vectors}}
+}
+
+float test_builtin_clamp_bool_type_promotion(bool p0) {
+  return __builtin_hlsl_elementwise_clamp(p0, p0, p0);
+  // expected-error@-1 {{1st argument must be a vector, integer or floating point type (was 'bool')}}
+}
+
+float builtin_bool_to_float_type_promotion(float p0, bool p1) {
+  return __builtin_hlsl_elementwise_clamp(p0, p0, p1);
+  // expected-error@-1 {{3rd argument must be a floating point type (was 'bool')}}
+}
+
+float builtin_bool_to_float_type_promotion2(bool p0, float p1) {
+  return __builtin_hlsl_elementwise_clamp(p1, p0, p1);
+  // expected-error@-1 {{2nd argument must be a floating point type (was 'bool')}}
+}
+
+float builtin_clamp_int_to_float_promotion(float p0, int p1) {
+  return __builtin_hlsl_elementwise_clamp(p0, p0, p1);
+  // expected-error@-1 {{3rd argument must be a floating point type (was 'int')}}
+}
diff --git a/clang/test/SemaHLSL/BuiltIns/dot-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/dot-errors.hlsl
index 59eb9482b9ef92..ba7ffc20484ae0 100644
--- a/clang/test/SemaHLSL/BuiltIns/dot-errors.hlsl
+++ b/clang/test/SemaHLSL/BuiltIns/dot-errors.hlsl
@@ -108,3 +108,12 @@ int test_builtin_dot_bool_type_promotion(bool p0, bool p1) {
   return __builtin_hlsl_dot(p0, p1);
   // expected-error@-1 {{1st argument must be a vector, integer or floating point type (was 'bool')}}
 }
+
+double test_dot_double(double2 p0, double2 p1) {
+  return dot(p0, p1);
+  // expected-error@-1 {{call to 'dot' is ambiguous}}
+}
+double test_dot_double_builtin(double2 p0, double2 p1) {
+  return __builtin_hlsl_dot(p0, p1);
+  // expected-error@-1 {{passing 'double2' (aka 'vector<double, 2>') to parameter of incompatible type '__attribute__((__vector_size__(2 * sizeof(float)))) float' (vector of 2 'float' values)}}
+}
diff --git a/clang/test/SemaHLSL/BuiltIns/frac-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/frac-errors.hlsl
index 06dbdf0a68dfc1..f82b5942fd46b6 100644
--- a/clang/test/SemaHLSL/BuiltIns/frac-errors.hlsl
+++ b/clang/test/SemaHLSL/BuiltIns/frac-errors.hlsl
@@ -13,7 +13,7 @@ float2 test_too_many_arg(float2 p0) {
 
 float builtin_bool_to_float_type_promotion(bool p1) {
   return __builtin_hlsl_elementwise_frac(p1);
-  // expected-error@-1 {{1st argument must be a vector, integer or floating point type (was 'bool')}}
+  // expected-error@-1 {{passing 'bool' to parameter of incompatible type 'float'}}
 }
 
 float builtin_frac_int_to_float_promotion(int p1) {
@@ -25,3 +25,15 @@ float2 builtin_frac_int2_to_float2_promotion(int2 p1) {
   return __builtin_hlsl_elementwise_frac(p1);
   // expected-error@-1 {{passing 'int2' (aka 'vector<int, 2>') to parameter of incompatible type '__attribute__((__vector_size__(2 * sizeof(float)))) float' (vector of 2 'float' values)}}
 }
+
+// builtins are variadic functions and so are subject to DefaultVariadicArgumentPromotion
+half builtin_frac_half_scalar (half p0) {
+  return __builtin_hlsl_elementwise_frac (p0);
+  // expected-error@-1 {{passing 'double' to parameter of incompatible type 'float'}}
+}
+
+float builtin_frac_float_scalar ( float p0) {
+  return __builtin_hlsl_elementwise_frac (p0);
+  // expected-error@-1 {{passing 'double' to parameter of incompatible type 'float'}}
+}
+
diff --git a/clang/test/SemaHLSL/BuiltIns/isinf-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/isinf-errors.hlsl
new file mode 100644
index 00000000000000..7ddfd56638273b
--- /dev/null
+++ b/clang/test/SemaHLSL/BuiltIns/isinf-errors.hlsl
@@ -0,0 +1,38 @@
+
+// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -verify -verify-ignore-unexpected
+
+bool test_too_few_arg() {
+  return __builtin_hlsl_elementwise_isinf();
+  // expected-error@-1 {{too few arguments to function call, expected 1, have 0}}
+}
+
+bool2 test_too_many_arg(float2 p0) {
+  return __builtin_hlsl_elementwise_isinf(p0, p0);
+  // expected-error@-1 {{too many arguments to function call, expected 1, have 2}}
+}
+
+bool builtin_bool_to_float_type_promotion(bool p1) {
+  return __builtin_hlsl_elementwise_isinf(p1);
+  // expected-error@-1 {passing 'bool' to parameter of incompatible type 'float'}}
+}
+
+bool builtin_isinf_int_to_float_promotion(int p1) {
+  return __builtin_hlsl_elementwise_isinf(p1);
+  // expected-error@-1 {{passing 'int' to parameter of incompatible type 'float'}}
+}
+
+bool2 builtin_isinf_int2_to_float2_promotion(int2 p1) {
+  return __builtin_hlsl_elementwise_isinf(p1);
+  // expected-error@-1 {{passing 'int2' (aka 'vector<int, 2>') to parameter of incompatible type '__attribute__((__vector_size__(2 * sizeof(float)))) float' (vector of 2 'float' values)}}
+}
+
+// builtins are variadic functions and so are subject to DefaultVariadicArgumentPromotion
+half builtin_isinf_half_scalar (half p0) {
+  return __builtin_hlsl_elementwise_isinf (p0);
+  // expected-error@-1 {{passing 'double' to parameter of incompatible type 'float'}}
+}
+
+float builtin_isinf_float_scalar ( float p0) {
+  return __builtin_hlsl_elementwise_isinf (p0);
+  // expected-error@-1 {{passing 'double' to parameter of incompatible type 'float'}}
+}
diff --git a/clang/test/SemaHLSL/BuiltIns/lerp-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/lerp-errors.hlsl
index f6ce87e7c33e3e..83751f68357edf 100644
--- a/clang/test/SemaHLSL/BuiltIns/lerp-errors.hlsl
+++ b/clang/test/SemaHLSL/BuiltIns/lerp-errors.hlsl
@@ -92,5 +92,18 @@ float builtin_lerp_int_to_float_promotion(float p0, int p1) {
 
 float4 test_lerp_int4(int4 p0, int4 p1, int4 p2) {
   return __builtin_hlsl_lerp(p0, p1, p2);
-   // expected-error@-1 {{1st argument must be a floating point type (was 'int4' (aka 'vector<int, 4>'))}}
-}
\ No newline at end of file
+  // expected-error@-1 {{1st argument must be a floating point type (was 'int4' (aka 'vector<int, 4>'))}}
+}
+
+// note: DefaultVariadicArgumentPromotion --> DefaultArgumentPromotion has already promoted to double
+// we don't know anymore that the input was half when __builtin_hlsl_lerp is called so we default to float
+// for expected type
+half builtin_lerp_half_scalar (half p0) {
+  return __builtin_hlsl_lerp ( p0, p0, p0 );
+  // expected-error@-1 {{passing 'double' to parameter of incompatible type 'float'}}
+}
+
+float builtin_lerp_float_scalar ( float p0) {
+  return __builtin_hlsl_lerp ( p0, p0, p0 );
+  // expected-error@-1 {{passing 'double' to parameter of incompatible type 'float'}}
+}
diff --git a/clang/test/SemaHLSL/BuiltIns/mad-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/mad-errors.hlsl
index 0b6843591455bd..97ce931bf1b5b5 100644
--- a/clang/test/SemaHLSL/BuiltIns/mad-errors.hlsl
+++ b/clang/test/SemaHLSL/BuiltIns/mad-errors.hlsl
@@ -72,15 +72,15 @@ float2 test_builtin_mad_int_vect_to_float_vec_promotion(int2 p0, float p1) {
 
 float builtin_bool_to_float_type_promotion(float p0, bool p1) {
   return __builtin_hlsl_mad(p0, p0, p1);
-  // expected-error@-1 {{3rd argument must be a vector, integer or floating point type (was 'bool')}}
+  // expected-error@-1 {{3rd argument must be a floating point type (was 'bool')}}
 }
 
 float builtin_bool_to_float_type_promotion2(bool p0, float p1) {
   return __builtin_hlsl_mad(p1, p0, p1);
-  // expected-error@-1 {{2nd argument must be a vector, integer or floating point type (was 'bool')}}
+  // expected-error@-1 {{2nd argument must be a floating point type (was 'bool')}}
 }
 
 float builtin_mad_int_to_float_promotion(float p0, int p1) {
   return __builtin_hlsl_mad(p0, p0, p1);
-  // expected-error@-1 {{arguments are of different types ('double' vs 'int')}}
+  // expected-error@-1 {{3rd argument must be a floating point type (was 'int')}}
 }
diff --git a/clang/test/SemaHLSL/BuiltIns/rcp-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/rcp-errors.hlsl
index dc4501dbd6d15a..fa6fd813f19e64 100644
--- a/clang/test/SemaHLSL/BuiltIns/rcp-errors.hlsl
+++ b/clang/test/SemaHLSL/BuiltIns/rcp-errors.hlsl
@@ -13,7 +13,7 @@ float2 test_too_many_arg(float2 p0) {
 
 float builtin_bool_to_float_type_promotion(bool p1) {
   return __builtin_hlsl_elementwise_rcp(p1);
-  // expected-error@-1 {{1st argument must be a vector, integer or floating point type (was 'bool')}}
+  // expected-error@-1 {passing 'bool' to parameter of incompatible type 'float'}}
 }
 
 float builtin_rcp_int_to_float_promotion(int p1) {
diff --git a/clang/test/SemaHLSL/BuiltIns/rsqrt-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/rsqrt-errors.hlsl
new file mode 100644
index 00000000000000..c027a698c5e58f
--- /dev/null
+++ b/clang/test/SemaHLSL/BuiltIns/rsqrt-errors.hlsl
@@ -0,0 +1,38 @@
+
+// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -verify -verify-ignore-unexpected
+
+float test_too_few_arg() {
+  return __builtin_hlsl_elementwise_rsqrt();
+  // expected-error@-1 {{too few arguments to function call, expected 1, have 0}}
+}
+
+float2 test_too_many_arg(float2 p0) {
+  return __builtin_hlsl_elementwise_rsqrt(p0, p0);
+  // expected-error@-1 {{too many arguments to function call, expected 1, have 2}}
+}
+
+float builtin_bool_to_float_type_promotion(bool p1) {
+  return __builtin_hlsl_elementwise_rsqrt(p1);
+  // expected-error@-1 {{passing 'bool' to parameter of incompatible type 'float'}}
+}
+
+float builtin_rsqrt_int_to_float_promotion(int p1) {
+  return __builtin_hlsl_elementwise_rsqrt(p1);
+  // expected-error@-1 {{passing 'int' to parameter of incompatible type 'float'}}
+}
+
+float2 builtin_rsqrt_int2_to_float2_promotion(int2 p1) {
+  return __builtin_hlsl_elementwise_rsqrt(p1);
+  // expected-error@-1 {{passing 'int2' (aka 'vector<int, 2>') to parameter of incompatible type '__attribute__((__vector_size__(2 * sizeof(float)))) float' (vector of 2 'float' values)}}
+}
+
+// builtins are variadic functions and so are subject to DefaultVariadicArgumentPromotion
+half builtin_rsqrt_half_scalar (half p0) {
+  return __builtin_hlsl_elementwise_rsqrt (p0);
+  // expected-error@-1 {{passing 'double' to parameter of incompatible type 'float'}}
+}
+
+float builtin_rsqrt_float_scalar ( float p0) {
+  return __builtin_hlsl_elementwise_rsqrt (p0);
+  // expected-error@-1 {{passing 'double' to parameter of incompatible type 'float'}}
+}
diff --git a/clang/test/SemaTemplate/concepts.cpp b/clang/test/SemaTemplate/concepts.cpp
index bac209a28da912..b7ea0d003a52d7 100644
--- a/clang/test/SemaTemplate/concepts.cpp
+++ b/clang/test/SemaTemplate/concepts.cpp
@@ -1085,3 +1085,32 @@ template void Struct<void>::bar<>();
 template int Struct<void>::field<1, 2>;
 
 }
+
+namespace GH64808 {
+
+template <class T> struct basic_sender {
+  T func;
+  basic_sender(T) : func(T()) {}
+};
+
+auto a = basic_sender{[](auto... __captures) {
+  return []() // #note-a-1
+    requires((__captures, ...), false) // #note-a-2
+  {};
+}()};
+
+auto b = basic_sender{[](auto... __captures) {
+  return []()
+    requires([](int, double) { return true; }(decltype(__captures)()...))
+  {};
+}(1, 2.33)};
+
+void foo() {
+  a.func();
+  // expected-error@-1{{no matching function for call}}
+  // expected-note@#note-a-1{{constraints not satisfied}}
+  // expected-note@#note-a-2{{evaluated to false}}
+  b.func();
+}
+
+} // namespace GH64808
diff --git a/clang/tools/clang-format/.clang-format b/clang/tools/clang-format/.clang-format
index f95602cab0f7fc..d7331b3c8cf02e 100644
--- a/clang/tools/clang-format/.clang-format
+++ b/clang/tools/clang-format/.clang-format
@@ -1,6 +1 @@
-BasedOnStyle: LLVM
-InsertBraces: true
-InsertNewlineAtEOF: true
-LineEnding: LF
-RemoveBracesLLVM: true
-RemoveParentheses: ReturnStatement
+BasedOnStyle: clang-format
diff --git a/clang/tools/clang-installapi/CMakeLists.txt b/clang/tools/clang-installapi/CMakeLists.txt
index e05f4eac3ad198..b90ffc847b1558 100644
--- a/clang/tools/clang-installapi/CMakeLists.txt
+++ b/clang/tools/clang-installapi/CMakeLists.txt
@@ -2,13 +2,20 @@ set(LLVM_LINK_COMPONENTS
   Support
   TargetParser
   TextAPI
+  TextAPIBinaryReader
   Option
   )
 
+set(LLVM_TARGET_DEFINITIONS InstallAPIOpts.td)
+tablegen(LLVM InstallAPIOpts.inc -gen-opt-parser-defs)
+add_public_tablegen_target(InstallAPIDriverOptions)
+
 add_clang_tool(clang-installapi
   ClangInstallAPI.cpp
   Options.cpp
 
+  DEPENDS
+  InstallAPIDriverOptions
   GENERATE_DRIVER
   )
 
@@ -22,3 +29,4 @@ clang_target_link_libraries(clang-installapi
   clangTooling
   clangSerialization
   )
+
diff --git a/clang/tools/clang-installapi/ClangInstallAPI.cpp b/clang/tools/clang-installapi/ClangInstallAPI.cpp
index 15b0baee88bc34..54e82d78d4d228 100644
--- a/clang/tools/clang-installapi/ClangInstallAPI.cpp
+++ b/clang/tools/clang-installapi/ClangInstallAPI.cpp
@@ -14,12 +14,12 @@
 #include "Options.h"
 #include "clang/Basic/Diagnostic.h"
 #include "clang/Basic/DiagnosticFrontend.h"
-#include "clang/Driver/Driver.h"
 #include "clang/Driver/DriverDiagnostic.h"
 #include "clang/Driver/Tool.h"
 #include "clang/Frontend/TextDiagnosticPrinter.h"
 #include "clang/InstallAPI/Frontend.h"
 #include "clang/InstallAPI/FrontendRecords.h"
+#include "clang/InstallAPI/InstallAPIDiagnostic.h"
 #include "clang/InstallAPI/MachO.h"
 #include "clang/Tooling/Tooling.h"
 #include "llvm/ADT/ArrayRef.h"
@@ -92,22 +92,8 @@ static bool run(ArrayRef<const char *> Args, const char *ProgName) {
   IntrusiveRefCntPtr<clang::FileManager> FM(
       new FileManager(clang::FileSystemOptions(), OverlayFileSystem));
 
-  // Set up driver to parse input arguments.
-  auto DriverArgs = llvm::ArrayRef(Args).slice(1);
-  clang::driver::Driver Driver(ProgName, llvm::sys::getDefaultTargetTriple(),
-                               *Diag, "clang installapi tool");
-  auto TargetAndMode =
-      clang::driver::ToolChain::getTargetAndModeFromProgramName(ProgName);
-  Driver.setTargetAndMode(TargetAndMode);
-  bool HasError = false;
-  llvm::opt::InputArgList ArgList =
-      Driver.ParseArgStrings(DriverArgs, /*UseDriverMode=*/true, HasError);
-  if (HasError)
-    return EXIT_FAILURE;
-  Driver.setCheckInputsExist(false);
-
-  // Capture InstallAPI specific options and diagnose any option errors.
-  Options Opts(*Diag, FM.get(), ArgList);
+  // Capture all options and diagnose any errors.
+  Options Opts(*Diag, FM.get(), Args, ProgName);
   if (Diag->hasErrorOccurred())
     return EXIT_FAILURE;
 
@@ -125,19 +111,21 @@ static bool run(ArrayRef<const char *> Args, const char *ProgName) {
   // Execute and gather AST results.
   // An invocation is ran for each unique target triple and for each header
   // access level.
-  Records FrontendResults;
   for (const auto &[Targ, Trip] : Opts.DriverOpts.Targets) {
+    Ctx.Verifier->setTarget(Targ);
+    Ctx.Slice = std::make_shared<FrontendRecordsSlice>(Trip);
     for (const HeaderType Type :
          {HeaderType::Public, HeaderType::Private, HeaderType::Project}) {
-      Ctx.Slice = std::make_shared<FrontendRecordsSlice>(Trip);
       Ctx.Type = Type;
       if (!runFrontend(ProgName, Opts.DriverOpts.Verbose, Ctx,
                        InMemoryFileSystem.get(), Opts.getClangFrontendArgs()))
         return EXIT_FAILURE;
-      FrontendResults.emplace_back(std::move(Ctx.Slice));
     }
   }
 
+  if (Ctx.Verifier->getState() == DylibVerifier::Result::Invalid)
+    return EXIT_FAILURE;
+
   // After symbols have been collected, prepare to write output.
   auto Out = CI->createOutputFile(Ctx.OutputLoc, /*Binary=*/false,
                                   /*RemoveFileOnSignal=*/false,
@@ -147,13 +135,7 @@ static bool run(ArrayRef<const char *> Args, const char *ProgName) {
     return EXIT_FAILURE;
 
   // Assign attributes for serialization.
-  auto Symbols = std::make_unique<SymbolSet>();
-  for (const auto &FR : FrontendResults) {
-    SymbolConverter Converter(Symbols.get(), FR->getTarget());
-    FR->visit(Converter);
-  }
-
-  InterfaceFile IF(std::move(Symbols));
+  InterfaceFile IF(Ctx.Verifier->getExports());
   for (const auto &TargetInfo : Opts.DriverOpts.Targets) {
     IF.addTarget(TargetInfo.first);
     IF.setFromBinaryAttrs(Ctx.BA, TargetInfo.first);
@@ -161,7 +143,8 @@ static bool run(ArrayRef<const char *> Args, const char *ProgName) {
 
   // Write output file and perform CI cleanup.
   if (auto Err = TextAPIWriter::writeToStream(*Out, IF, Ctx.FT)) {
-    Diag->Report(diag::err_cannot_open_file) << Ctx.OutputLoc;
+    Diag->Report(diag::err_cannot_write_file)
+        << Ctx.OutputLoc << std::move(Err);
     CI->clearOutputFiles(/*EraseFiles=*/true);
     return EXIT_FAILURE;
   }
diff --git a/clang/tools/clang-installapi/InstallAPIOpts.td b/clang/tools/clang-installapi/InstallAPIOpts.td
new file mode 100644
index 00000000000000..87f4c3327e8409
--- /dev/null
+++ b/clang/tools/clang-installapi/InstallAPIOpts.td
@@ -0,0 +1,31 @@
+//===--- InstallAPIOpts.td ------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+//  This file defines the specific options for InstallAPI.
+//
+//===----------------------------------------------------------------------===//
+
+// Include the common option parsing interfaces.
+include "llvm/Option/OptParser.td"
+
+
+/////////
+// Options
+
+// TextAPI options.
+def filetype : Joined<["--"], "filetype=">,
+  HelpText<"Specify the output file type (tbd-v4 or tbd-v5)">;
+
+// Verification options.
+def verify_against : Separate<["-"], "verify-against">,
+  HelpText<"Verify the specified dynamic library/framework against the headers">;
+def verify_against_EQ : Joined<["--"], "verify-against=">, Alias<verify_against>;
+def verify_mode_EQ : Joined<["--"], "verify-mode=">,
+  HelpText<"Specify the severity and extend of the validation. Valid modes are ErrorsOnly, ErrorsAndWarnings, and Pedantic.">;
+def demangle : Flag<["--", "-"], "demangle">,
+  HelpText<"Demangle symbols when printing warnings and errors">;
diff --git a/clang/tools/clang-installapi/Options.cpp b/clang/tools/clang-installapi/Options.cpp
index 701ab81c57c3d0..b8696bb7896d86 100644
--- a/clang/tools/clang-installapi/Options.cpp
+++ b/clang/tools/clang-installapi/Options.cpp
@@ -10,35 +10,85 @@
 #include "clang/Driver/Driver.h"
 #include "clang/Frontend/FrontendDiagnostic.h"
 #include "clang/InstallAPI/FileList.h"
+#include "clang/InstallAPI/InstallAPIDiagnostic.h"
 #include "llvm/Support/Program.h"
 #include "llvm/TargetParser/Host.h"
+#include "llvm/TextAPI/DylibReader.h"
+#include "llvm/TextAPI/TextAPIWriter.h"
 
-using namespace clang::driver;
-using namespace clang::driver::options;
+using namespace llvm;
 using namespace llvm::opt;
 using namespace llvm::MachO;
 
+namespace drv = clang::driver::options;
+
 namespace clang {
 namespace installapi {
 
+/// Create prefix string literals used in InstallAPIOpts.td.
+#define PREFIX(NAME, VALUE)                                                    \
+  static constexpr llvm::StringLiteral NAME##_init[] = VALUE;                  \
+  static constexpr llvm::ArrayRef<llvm::StringLiteral> NAME(                   \
+      NAME##_init, std::size(NAME##_init) - 1);
+#include "InstallAPIOpts.inc"
+#undef PREFIX
+
+static constexpr const llvm::StringLiteral PrefixTable_init[] =
+#define PREFIX_UNION(VALUES) VALUES
+#include "InstallAPIOpts.inc"
+#undef PREFIX_UNION
+    ;
+static constexpr const ArrayRef<StringLiteral>
+    PrefixTable(PrefixTable_init, std::size(PrefixTable_init) - 1);
+
+/// Create table mapping all options defined in InstallAPIOpts.td.
+static constexpr OptTable::Info InfoTable[] = {
+#define OPTION(PREFIX, NAME, ID, KIND, GROUP, ALIAS, ALIASARGS, FLAGS,         \
+               VISIBILITY, PARAM, HELPTEXT, METAVAR, VALUES)                   \
+  {PREFIX, NAME,  HELPTEXT,   METAVAR,     OPT_##ID,    Option::KIND##Class,   \
+   PARAM,  FLAGS, VISIBILITY, OPT_##GROUP, OPT_##ALIAS, ALIASARGS,             \
+   VALUES},
+#include "InstallAPIOpts.inc"
+#undef OPTION
+};
+
+namespace {
+
+/// \brief Create OptTable class for parsing actual command line arguments.
+class DriverOptTable : public opt::PrecomputedOptTable {
+public:
+  DriverOptTable() : PrecomputedOptTable(InfoTable, PrefixTable) {}
+};
+
+} // end anonymous namespace.
+
+static llvm::opt::OptTable *createDriverOptTable() {
+  return new DriverOptTable();
+}
+
 bool Options::processDriverOptions(InputArgList &Args) {
   // Handle inputs.
-  llvm::append_range(DriverOpts.FileLists, Args.getAllArgValues(OPT_INPUT));
+  llvm::append_range(DriverOpts.FileLists,
+                     Args.getAllArgValues(drv::OPT_INPUT));
 
   // Handle output.
   SmallString<PATH_MAX> OutputPath;
-  if (auto *Arg = Args.getLastArg(OPT_o)) {
+  if (auto *Arg = Args.getLastArg(drv::OPT_o)) {
     OutputPath = Arg->getValue();
     if (OutputPath != "-")
       FM->makeAbsolutePath(OutputPath);
     DriverOpts.OutputPath = std::string(OutputPath);
   }
+  if (DriverOpts.OutputPath.empty()) {
+    Diags->Report(diag::err_no_output_file);
+    return false;
+  }
 
   // Do basic error checking first for mixing -target and -arch options.
-  auto *ArgArch = Args.getLastArgNoClaim(OPT_arch);
-  auto *ArgTarget = Args.getLastArgNoClaim(OPT_target);
+  auto *ArgArch = Args.getLastArgNoClaim(drv::OPT_arch);
+  auto *ArgTarget = Args.getLastArgNoClaim(drv::OPT_target);
   auto *ArgTargetVariant =
-      Args.getLastArgNoClaim(OPT_darwin_target_variant_triple);
+      Args.getLastArgNoClaim(drv::OPT_darwin_target_variant_triple);
   if (ArgArch && (ArgTarget || ArgTargetVariant)) {
     Diags->Report(clang::diag::err_drv_argument_not_allowed_with)
         << ArgArch->getAsString(Args)
@@ -46,7 +96,7 @@ bool Options::processDriverOptions(InputArgList &Args) {
     return false;
   }
 
-  auto *ArgMinTargetOS = Args.getLastArgNoClaim(OPT_mtargetos_EQ);
+  auto *ArgMinTargetOS = Args.getLastArgNoClaim(drv::OPT_mtargetos_EQ);
   if ((ArgTarget || ArgTargetVariant) && ArgMinTargetOS) {
     Diags->Report(clang::diag::err_drv_cannot_mix_options)
         << ArgTarget->getAsString(Args) << ArgMinTargetOS->getAsString(Args);
@@ -55,7 +105,7 @@ bool Options::processDriverOptions(InputArgList &Args) {
 
   // Capture target triples first.
   if (ArgTarget) {
-    for (const Arg *A : Args.filtered(OPT_target)) {
+    for (const Arg *A : Args.filtered(drv::OPT_target)) {
       A->claim();
       llvm::Triple TargetTriple(A->getValue());
       Target TAPITarget = Target(TargetTriple);
@@ -69,27 +119,32 @@ bool Options::processDriverOptions(InputArgList &Args) {
     }
   }
 
-  DriverOpts.Verbose = Args.hasArgNoClaim(OPT_v);
+  DriverOpts.Verbose = Args.hasArgNoClaim(drv::OPT_v);
 
   return true;
 }
 
 bool Options::processLinkerOptions(InputArgList &Args) {
-  // TODO: add error handling.
-
-  // Required arguments.
-  if (const Arg *A = Args.getLastArg(options::OPT_install__name))
+  // Handle required arguments.
+  if (const Arg *A = Args.getLastArg(drv::OPT_install__name))
     LinkerOpts.InstallName = A->getValue();
+  if (LinkerOpts.InstallName.empty()) {
+    Diags->Report(diag::err_no_install_name);
+    return false;
+  }
 
   // Defaulted or optional arguments.
-  if (auto *Arg = Args.getLastArg(OPT_current__version))
+  if (auto *Arg = Args.getLastArg(drv::OPT_current__version))
     LinkerOpts.CurrentVersion.parse64(Arg->getValue());
 
-  LinkerOpts.IsDylib = Args.hasArg(OPT_dynamiclib);
+  if (auto *Arg = Args.getLastArg(drv::OPT_compatibility__version))
+    LinkerOpts.CompatVersion.parse64(Arg->getValue());
+
+  LinkerOpts.IsDylib = Args.hasArg(drv::OPT_dynamiclib);
 
-  LinkerOpts.AppExtensionSafe =
-      Args.hasFlag(OPT_fapplication_extension, OPT_fno_application_extension,
-                   /*Default=*/LinkerOpts.AppExtensionSafe);
+  LinkerOpts.AppExtensionSafe = Args.hasFlag(
+      drv::OPT_fapplication_extension, drv::OPT_fno_application_extension,
+      /*Default=*/LinkerOpts.AppExtensionSafe);
 
   if (::getenv("LD_NO_ENCRYPT") != nullptr)
     LinkerOpts.AppExtensionSafe = true;
@@ -102,7 +157,7 @@ bool Options::processLinkerOptions(InputArgList &Args) {
 bool Options::processFrontendOptions(InputArgList &Args) {
   // Do not claim any arguments, as they will be passed along for CC1
   // invocations.
-  if (auto *A = Args.getLastArgNoClaim(OPT_x)) {
+  if (auto *A = Args.getLastArgNoClaim(drv::OPT_x)) {
     FEOpts.LangMode = llvm::StringSwitch<clang::Language>(A->getValue())
                           .Case("c", clang::Language::C)
                           .Case("c++", clang::Language::CXX)
@@ -116,8 +171,8 @@ bool Options::processFrontendOptions(InputArgList &Args) {
       return false;
     }
   }
-  for (auto *A : Args.filtered(OPT_ObjC, OPT_ObjCXX)) {
-    if (A->getOption().matches(OPT_ObjC))
+  for (auto *A : Args.filtered(drv::OPT_ObjC, drv::OPT_ObjCXX)) {
+    if (A->getOption().matches(drv::OPT_ObjC))
       FEOpts.LangMode = clang::Language::ObjC;
     else
       FEOpts.LangMode = clang::Language::ObjCXX;
@@ -126,9 +181,77 @@ bool Options::processFrontendOptions(InputArgList &Args) {
   return true;
 }
 
+std::vector<const char *>
+Options::processAndFilterOutInstallAPIOptions(ArrayRef<const char *> Args) {
+  std::unique_ptr<llvm::opt::OptTable> Table;
+  Table.reset(createDriverOptTable());
+
+  unsigned MissingArgIndex, MissingArgCount;
+  auto ParsedArgs = Table->ParseArgs(Args.slice(1), MissingArgIndex,
+                                     MissingArgCount, Visibility());
+
+  // Capture InstallAPI only driver options.
+  DriverOpts.Demangle = ParsedArgs.hasArg(OPT_demangle);
+
+  if (auto *A = ParsedArgs.getLastArg(OPT_filetype)) {
+    DriverOpts.OutFT = TextAPIWriter::parseFileType(A->getValue());
+    if (DriverOpts.OutFT == FileType::Invalid) {
+      Diags->Report(clang::diag::err_drv_invalid_value)
+          << A->getAsString(ParsedArgs) << A->getValue();
+      return {};
+    }
+  }
+
+  if (const Arg *A = ParsedArgs.getLastArg(OPT_verify_mode_EQ)) {
+    DriverOpts.VerifyMode =
+        StringSwitch<VerificationMode>(A->getValue())
+            .Case("ErrorsOnly", VerificationMode::ErrorsOnly)
+            .Case("ErrorsAndWarnings", VerificationMode::ErrorsAndWarnings)
+            .Case("Pedantic", VerificationMode::Pedantic)
+            .Default(VerificationMode::Invalid);
+
+    if (DriverOpts.VerifyMode == VerificationMode::Invalid) {
+      Diags->Report(clang::diag::err_drv_invalid_value)
+          << A->getAsString(ParsedArgs) << A->getValue();
+      return {};
+    }
+  }
+
+  if (const Arg *A = ParsedArgs.getLastArg(OPT_verify_against))
+    DriverOpts.DylibToVerify = A->getValue();
+
+  /// Any unclaimed arguments should be forwarded to the clang driver.
+  std::vector<const char *> ClangDriverArgs(ParsedArgs.size());
+  for (const Arg *A : ParsedArgs) {
+    if (A->isClaimed())
+      continue;
+    llvm::copy(A->getValues(), std::back_inserter(ClangDriverArgs));
+  }
+  return ClangDriverArgs;
+}
+
 Options::Options(DiagnosticsEngine &Diag, FileManager *FM,
-                 InputArgList &ArgList)
+                 ArrayRef<const char *> Args, const StringRef ProgName)
     : Diags(&Diag), FM(FM) {
+
+  // First process InstallAPI specific options.
+  auto DriverArgs = processAndFilterOutInstallAPIOptions(Args);
+  if (Diags->hasErrorOccurred())
+    return;
+
+  // Set up driver to parse remaining input arguments.
+  clang::driver::Driver Driver(ProgName, llvm::sys::getDefaultTargetTriple(),
+                               *Diags, "clang installapi tool");
+  auto TargetAndMode =
+      clang::driver::ToolChain::getTargetAndModeFromProgramName(ProgName);
+  Driver.setTargetAndMode(TargetAndMode);
+  bool HasError = false;
+  llvm::opt::InputArgList ArgList =
+      Driver.ParseArgStrings(DriverArgs, /*UseDriverMode=*/true, HasError);
+  if (HasError)
+    return;
+  Driver.setCheckInputsExist(false);
+
   if (!processDriverOptions(ArgList))
     return;
 
@@ -138,15 +261,16 @@ Options::Options(DiagnosticsEngine &Diag, FileManager *FM,
   if (!processFrontendOptions(ArgList))
     return;
 
+  /// Force cc1 options that should always be on.
+  FrontendArgs = {"-fsyntax-only", "-Wprivate-extern"};
+
   /// Any unclaimed arguments should be handled by invoking the clang frontend.
   for (const Arg *A : ArgList) {
     if (A->isClaimed())
       continue;
-
     FrontendArgs.emplace_back(A->getSpelling());
     llvm::copy(A->getValues(), std::back_inserter(FrontendArgs));
   }
-  FrontendArgs.push_back("-fsyntax-only");
 }
 
 InstallAPIContext Options::createContext() {
@@ -159,6 +283,7 @@ InstallAPIContext Options::createContext() {
 
   Ctx.BA.InstallName = LinkerOpts.InstallName;
   Ctx.BA.CurrentVersion = LinkerOpts.CurrentVersion;
+  Ctx.BA.CompatVersion = LinkerOpts.CompatVersion;
   Ctx.BA.AppExtensionSafe = LinkerOpts.AppExtensionSafe;
   Ctx.FT = DriverOpts.OutFT;
   Ctx.OutputLoc = DriverOpts.OutputPath;
@@ -168,16 +293,40 @@ InstallAPIContext Options::createContext() {
   for (const std::string &ListPath : DriverOpts.FileLists) {
     auto Buffer = FM->getBufferForFile(ListPath);
     if (auto Err = Buffer.getError()) {
-      Diags->Report(diag::err_cannot_open_file) << ListPath;
+      Diags->Report(diag::err_cannot_open_file) << ListPath << Err.message();
       return Ctx;
     }
     if (auto Err = FileListReader::loadHeaders(std::move(Buffer.get()),
                                                Ctx.InputHeaders)) {
-      Diags->Report(diag::err_cannot_open_file) << ListPath;
+      Diags->Report(diag::err_cannot_open_file) << ListPath << std::move(Err);
       return Ctx;
     }
   }
 
+  // Parse binary dylib and initialize verifier.
+  if (DriverOpts.DylibToVerify.empty()) {
+    Ctx.Verifier = std::make_unique<DylibVerifier>();
+    return Ctx;
+  }
+
+  auto Buffer = FM->getBufferForFile(DriverOpts.DylibToVerify);
+  if (auto Err = Buffer.getError()) {
+    Diags->Report(diag::err_cannot_open_file)
+        << DriverOpts.DylibToVerify << Err.message();
+    return Ctx;
+  }
+
+  DylibReader::ParseOption PO;
+  PO.Undefineds = false;
+  Expected<Records> Slices =
+      DylibReader::readFile((*Buffer)->getMemBufferRef(), PO);
+  if (auto Err = Slices.takeError()) {
+    Diags->Report(diag::err_cannot_open_file) << DriverOpts.DylibToVerify;
+    return Ctx;
+  }
+
+  Ctx.Verifier = std::make_unique<DylibVerifier>(
+      std::move(*Slices), Diags, DriverOpts.VerifyMode, DriverOpts.Demangle);
   return Ctx;
 }
 
diff --git a/clang/tools/clang-installapi/Options.h b/clang/tools/clang-installapi/Options.h
index 06f79b62c531ec..2beeafc86bb086 100644
--- a/clang/tools/clang-installapi/Options.h
+++ b/clang/tools/clang-installapi/Options.h
@@ -11,8 +11,10 @@
 
 #include "clang/Basic/Diagnostic.h"
 #include "clang/Basic/FileManager.h"
+#include "clang/Driver/Driver.h"
 #include "clang/Frontend/FrontendOptions.h"
 #include "clang/InstallAPI/Context.h"
+#include "clang/InstallAPI/DylibVerifier.h"
 #include "clang/InstallAPI/MachO.h"
 #include "llvm/Option/ArgList.h"
 #include "llvm/Option/Option.h"
@@ -32,12 +34,21 @@ struct DriverOptions {
   /// \brief Mappings of target triples & tapi targets to build for.
   std::map<llvm::MachO::Target, llvm::Triple> Targets;
 
+  /// \brief Path to binary dylib for comparing.
+  std::string DylibToVerify;
+
   /// \brief Output path.
   std::string OutputPath;
 
   /// \brief File encoding to print.
   FileType OutFT = FileType::TBD_V5;
 
+  /// \brief Verification mode for comparing symbols.
+  VerificationMode VerifyMode = VerificationMode::Pedantic;
+
+  /// \brief Print demangled symbols when reporting errors.
+  bool Demangle = false;
+
   /// \brief Print verbose output.
   bool Verbose = false;
 };
@@ -49,6 +60,9 @@ struct LinkerOptions {
   /// \brief The current version to use for the dynamic library.
   PackedVersion CurrentVersion;
 
+  /// \brief The compatibility version to use for the dynamic library.
+  PackedVersion CompatVersion;
+
   /// \brief Is application extension safe.
   bool AppExtensionSafe = false;
 
@@ -66,6 +80,8 @@ class Options {
   bool processDriverOptions(llvm::opt::InputArgList &Args);
   bool processLinkerOptions(llvm::opt::InputArgList &Args);
   bool processFrontendOptions(llvm::opt::InputArgList &Args);
+  std::vector<const char *>
+  processAndFilterOutInstallAPIOptions(ArrayRef<const char *> Args);
 
 public:
   /// The various options grouped together.
@@ -80,7 +96,7 @@ class Options {
 
   /// \brief Constructor for options.
   Options(clang::DiagnosticsEngine &Diag, FileManager *FM,
-          llvm::opt::InputArgList &Args);
+          ArrayRef<const char *> Args, const StringRef ProgName);
 
   /// \brief Get CC1 arguments after extracting out the irrelevant
   /// ones.
@@ -92,6 +108,16 @@ class Options {
   std::vector<std::string> FrontendArgs;
 };
 
+enum ID {
+  OPT_INVALID = 0, // This is not an option ID.
+#define OPTION(PREFIX, NAME, ID, KIND, GROUP, ALIAS, ALIASARGS, FLAGS,         \
+               VISIBILITY, PARAM, HELPTEXT, METAVAR, VALUES)                   \
+  OPT_##ID,
+#include "InstallAPIOpts.inc"
+  LastOption
+#undef OPTION
+};
+
 } // namespace installapi
 } // namespace clang
 #endif
diff --git a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
index 535ef42c78c463..c60be2789bd61e 100644
--- a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
+++ b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
@@ -562,6 +562,7 @@ Expected<StringRef> linkDevice(ArrayRef<StringRef> InputFiles,
   case Triple::aarch64_be:
   case Triple::ppc64:
   case Triple::ppc64le:
+  case Triple::systemz:
     return generic::clang(InputFiles, Args);
   default:
     return createStringError(inconvertibleErrorCode(),
diff --git a/clang/tools/diagtool/DiagnosticNames.cpp b/clang/tools/diagtool/DiagnosticNames.cpp
index 852b8c226e8859..eb90f082437b33 100644
--- a/clang/tools/diagtool/DiagnosticNames.cpp
+++ b/clang/tools/diagtool/DiagnosticNames.cpp
@@ -42,6 +42,7 @@ static const DiagnosticRecord BuiltinDiagnosticsByID[] = {
 #include "clang/Basic/DiagnosticSemaKinds.inc"
 #include "clang/Basic/DiagnosticAnalysisKinds.inc"
 #include "clang/Basic/DiagnosticRefactoringKinds.inc"
+#include "clang/Basic/DiagnosticInstallAPIKinds.inc"
 #undef DIAG
 };
 
diff --git a/clang/tools/driver/driver.cpp b/clang/tools/driver/driver.cpp
index 376025e3605be8..83b5bbb71f5212 100644
--- a/clang/tools/driver/driver.cpp
+++ b/clang/tools/driver/driver.cpp
@@ -28,6 +28,7 @@
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringSet.h"
 #include "llvm/Option/ArgList.h"
 #include "llvm/Option/OptTable.h"
 #include "llvm/Option/Option.h"
@@ -41,7 +42,6 @@
 #include "llvm/Support/PrettyStackTrace.h"
 #include "llvm/Support/Process.h"
 #include "llvm/Support/Program.h"
-#include "llvm/Support/Regex.h"
 #include "llvm/Support/Signals.h"
 #include "llvm/Support/StringSaver.h"
 #include "llvm/Support/TargetSelect.h"
@@ -73,136 +73,8 @@ std::string GetExecutablePath(const char *Argv0, bool CanonicalPrefixes) {
   return llvm::sys::fs::getMainExecutable(Argv0, P);
 }
 
-static const char *GetStableCStr(std::set<std::string> &SavedStrings,
-                                 StringRef S) {
-  return SavedStrings.insert(std::string(S)).first->c_str();
-}
-
-/// ApplyOneQAOverride - Apply a list of edits to the input argument lists.
-///
-/// The input string is a space separated list of edits to perform,
-/// they are applied in order to the input argument lists. Edits
-/// should be one of the following forms:
-///
-///  '#': Silence information about the changes to the command line arguments.
-///
-///  '^': Add FOO as a new argument at the beginning of the command line.
-///
-///  '+': Add FOO as a new argument at the end of the command line.
-///
-///  's/XXX/YYY/': Substitute the regular expression XXX with YYY in the command
-///  line.
-///
-///  'xOPTION': Removes all instances of the literal argument OPTION.
-///
-///  'XOPTION': Removes all instances of the literal argument OPTION,
-///  and the following argument.
-///
-///  'Ox': Removes all flags matching 'O' or 'O[sz0-9]' and adds 'Ox'
-///  at the end of the command line.
-///
-/// \param OS - The stream to write edit information to.
-/// \param Args - The vector of command line arguments.
-/// \param Edit - The override command to perform.
-/// \param SavedStrings - Set to use for storing string representations.
-static void ApplyOneQAOverride(raw_ostream &OS,
-                               SmallVectorImpl<const char*> &Args,
-                               StringRef Edit,
-                               std::set<std::string> &SavedStrings) {
-  // This does not need to be efficient.
-
-  if (Edit[0] == '^') {
-    const char *Str =
-      GetStableCStr(SavedStrings, Edit.substr(1));
-    OS << "### Adding argument " << Str << " at beginning\n";
-    Args.insert(Args.begin() + 1, Str);
-  } else if (Edit[0] == '+') {
-    const char *Str =
-      GetStableCStr(SavedStrings, Edit.substr(1));
-    OS << "### Adding argument " << Str << " at end\n";
-    Args.push_back(Str);
-  } else if (Edit[0] == 's' && Edit[1] == '/' && Edit.ends_with("/") &&
-             Edit.slice(2, Edit.size() - 1).contains('/')) {
-    StringRef MatchPattern = Edit.substr(2).split('/').first;
-    StringRef ReplPattern = Edit.substr(2).split('/').second;
-    ReplPattern = ReplPattern.slice(0, ReplPattern.size()-1);
-
-    for (unsigned i = 1, e = Args.size(); i != e; ++i) {
-      // Ignore end-of-line response file markers
-      if (Args[i] == nullptr)
-        continue;
-      std::string Repl = llvm::Regex(MatchPattern).sub(ReplPattern, Args[i]);
-
-      if (Repl != Args[i]) {
-        OS << "### Replacing '" << Args[i] << "' with '" << Repl << "'\n";
-        Args[i] = GetStableCStr(SavedStrings, Repl);
-      }
-    }
-  } else if (Edit[0] == 'x' || Edit[0] == 'X') {
-    auto Option = Edit.substr(1);
-    for (unsigned i = 1; i < Args.size();) {
-      if (Option == Args[i]) {
-        OS << "### Deleting argument " << Args[i] << '\n';
-        Args.erase(Args.begin() + i);
-        if (Edit[0] == 'X') {
-          if (i < Args.size()) {
-            OS << "### Deleting argument " << Args[i] << '\n';
-            Args.erase(Args.begin() + i);
-          } else
-            OS << "### Invalid X edit, end of command line!\n";
-        }
-      } else
-        ++i;
-    }
-  } else if (Edit[0] == 'O') {
-    for (unsigned i = 1; i < Args.size();) {
-      const char *A = Args[i];
-      // Ignore end-of-line response file markers
-      if (A == nullptr)
-        continue;
-      if (A[0] == '-' && A[1] == 'O' &&
-          (A[2] == '\0' ||
-           (A[3] == '\0' && (A[2] == 's' || A[2] == 'z' ||
-                             ('0' <= A[2] && A[2] <= '9'))))) {
-        OS << "### Deleting argument " << Args[i] << '\n';
-        Args.erase(Args.begin() + i);
-      } else
-        ++i;
-    }
-    OS << "### Adding argument " << Edit << " at end\n";
-    Args.push_back(GetStableCStr(SavedStrings, '-' + Edit.str()));
-  } else {
-    OS << "### Unrecognized edit: " << Edit << "\n";
-  }
-}
-
-/// ApplyQAOverride - Apply a space separated list of edits to the
-/// input argument lists. See ApplyOneQAOverride.
-static void ApplyQAOverride(SmallVectorImpl<const char*> &Args,
-                            const char *OverrideStr,
-                            std::set<std::string> &SavedStrings) {
-  raw_ostream *OS = &llvm::errs();
-
-  if (OverrideStr[0] == '#') {
-    ++OverrideStr;
-    OS = &llvm::nulls();
-  }
-
-  *OS << "### CCC_OVERRIDE_OPTIONS: " << OverrideStr << "\n";
-
-  // This does not need to be efficient.
-
-  const char *S = OverrideStr;
-  while (*S) {
-    const char *End = ::strchr(S, ' ');
-    if (!End)
-      End = S + strlen(S);
-    if (End != S)
-      ApplyOneQAOverride(*OS, Args, std::string(S, End), SavedStrings);
-    S = End;
-    if (*S != '\0')
-      ++S;
-  }
+static const char *GetStableCStr(llvm::StringSet<> &SavedStrings, StringRef S) {
+  return SavedStrings.insert(S).first->getKeyData();
 }
 
 extern int cc1_main(ArrayRef<const char *> Argv, const char *Argv0,
@@ -215,7 +87,7 @@ extern int cc1gen_reproducer_main(ArrayRef<const char *> Argv,
 
 static void insertTargetAndModeArgs(const ParsedClangName &NameParts,
                                     SmallVectorImpl<const char *> &ArgVector,
-                                    std::set<std::string> &SavedStrings) {
+                                    llvm::StringSet<> &SavedStrings) {
   // Put target and mode arguments at the start of argument list so that
   // arguments specified in command line could override them. Avoid putting
   // them at index 0, as an option like '-cc1' must remain the first.
@@ -419,12 +291,13 @@ int clang_main(int Argc, char **Argv, const llvm::ToolContext &ToolContext) {
     }
   }
 
-  std::set<std::string> SavedStrings;
+  llvm::StringSet<> SavedStrings;
   // Handle CCC_OVERRIDE_OPTIONS, used for editing a command line behind the
   // scenes.
   if (const char *OverrideStr = ::getenv("CCC_OVERRIDE_OPTIONS")) {
     // FIXME: Driver shouldn't take extra initial argument.
-    ApplyQAOverride(Args, OverrideStr, SavedStrings);
+    driver::applyOverrideOptions(Args, OverrideStr, SavedStrings,
+                                 &llvm::errs());
   }
 
   std::string Path = GetExecutablePath(ToolContext.Path, CanonicalPrefixes);
diff --git a/clang/tools/libclang/CIndex.cpp b/clang/tools/libclang/CIndex.cpp
index 418b152ba4a13f..454ee1e42aed1d 100644
--- a/clang/tools/libclang/CIndex.cpp
+++ b/clang/tools/libclang/CIndex.cpp
@@ -1778,6 +1778,10 @@ bool CursorVisitor::VisitAttributedTypeLoc(AttributedTypeLoc TL) {
   return Visit(TL.getModifiedLoc());
 }
 
+bool CursorVisitor::VisitCountAttributedTypeLoc(CountAttributedTypeLoc TL) {
+  return Visit(TL.getInnerLoc());
+}
+
 bool CursorVisitor::VisitBTFTagAttributedTypeLoc(BTFTagAttributedTypeLoc TL) {
   return Visit(TL.getWrappedLoc());
 }
diff --git a/clang/unittests/AST/DeclPrinterTest.cpp b/clang/unittests/AST/DeclPrinterTest.cpp
index 0e09ab2a7bba88..e024c41e03b484 100644
--- a/clang/unittests/AST/DeclPrinterTest.cpp
+++ b/clang/unittests/AST/DeclPrinterTest.cpp
@@ -358,6 +358,59 @@ TEST(DeclPrinter, TestCXXRecordDecl11) {
     "class A : virtual public Z, private Y {}"));
 }
 
+TEST(DeclPrinter, TestCXXRecordDecl12) {
+  ASSERT_TRUE(
+      PrintedDeclCXX98Matches("struct S { int x; };"
+                              "namespace NS { class C {};}"
+                              "void foo() {using namespace NS; C c;}",
+                              "foo",
+                              "void foo() {\nusing namespace NS;\nclass "
+                              "NS::C c;\n}\n",
+                              [](PrintingPolicy &Policy) {
+                                Policy.SuppressTagKeyword = false;
+                                Policy.SuppressScope = true;
+                                Policy.TerseOutput = false;
+                              }));
+}
+
+TEST(DeclPrinter, TestCXXRecordDecl13) {
+  ASSERT_TRUE(PrintedDeclCXX98Matches(
+      "struct S { int x; };"
+      "S s1;"
+      "S foo() {return s1;}",
+      "foo", "struct S foo() {\nreturn s1;\n}\n", [](PrintingPolicy &Policy) {
+        Policy.SuppressTagKeyword = false;
+        Policy.SuppressScope = true;
+        Policy.TerseOutput = false;
+      }));
+}
+
+TEST(DeclPrinter, TestCXXRecordDecl14) {
+  ASSERT_TRUE(PrintedDeclCXX98Matches(
+      "struct S { int x; };"
+      "S foo(S s1) {return s1;}",
+      "foo", "struct S foo(struct S s1) {\nreturn s1;\n}\n",
+      [](PrintingPolicy &Policy) {
+        Policy.SuppressTagKeyword = false;
+        Policy.SuppressScope = true;
+        Policy.TerseOutput = false;
+      }));
+}
+TEST(DeclPrinter, TestCXXRecordDecl15) {
+  ASSERT_TRUE(PrintedDeclCXX98Matches(
+      "struct S { int x; };"
+      "namespace NS { class C {};}"
+      "S foo(S s1, NS::C c1) {using namespace NS; C c; return s1;}",
+      "foo",
+      "struct S foo(struct S s1, class NS::C c1) {\nusing namespace NS;\nclass "
+      "NS::C c;\nreturn s1;\n}\n",
+      [](PrintingPolicy &Policy) {
+        Policy.SuppressTagKeyword = false;
+        Policy.SuppressScope = true;
+        Policy.TerseOutput = false;
+      }));
+}
+
 TEST(DeclPrinter, TestFunctionDecl1) {
   ASSERT_TRUE(PrintedDeclCXX98Matches(
     "void A();",
diff --git a/clang/unittests/AST/TypePrinterTest.cpp b/clang/unittests/AST/TypePrinterTest.cpp
index f0a6eb7e9fd8c9..494085a2ebca6b 100644
--- a/clang/unittests/AST/TypePrinterTest.cpp
+++ b/clang/unittests/AST/TypePrinterTest.cpp
@@ -155,6 +155,22 @@ TEST(TypePrinter, TemplateIdWithNTTP) {
       }));
 }
 
+TEST(TypePrinter, TemplateArgumentsSubstitution) {
+  constexpr char Code[] = R"cpp(
+       template <typename Y> class X {};
+       typedef X<int> A;
+       int foo() {
+          return sizeof(A);
+       }
+  )cpp";
+  auto Matcher = typedefNameDecl(hasName("A"), hasType(qualType().bind("id")));
+  ASSERT_TRUE(PrintedTypeMatches(Code, {}, Matcher, "X<int>",
+                                 [](PrintingPolicy &Policy) {
+                                   Policy.SuppressTagKeyword = false;
+                                   Policy.SuppressScope = true;
+                                 }));
+}
+
 TEST(TypePrinter, TemplateArgumentsSubstitution_Expressions) {
   /// Tests clang::isSubstitutedDefaultArgument on TemplateArguments
   /// that are of kind TemplateArgument::Expression
diff --git a/clang/unittests/Analysis/FlowSensitive/DeterminismTest.cpp b/clang/unittests/Analysis/FlowSensitive/DeterminismTest.cpp
index 5ba26a08320364..e794bd4943f232 100644
--- a/clang/unittests/Analysis/FlowSensitive/DeterminismTest.cpp
+++ b/clang/unittests/Analysis/FlowSensitive/DeterminismTest.cpp
@@ -8,7 +8,7 @@
 
 #include "TestingSupport.h"
 #include "clang/AST/Decl.h"
-#include "clang/Analysis/FlowSensitive/ControlFlowContext.h"
+#include "clang/Analysis/FlowSensitive/AdornedCFG.h"
 #include "clang/Analysis/FlowSensitive/DataflowAnalysis.h"
 #include "clang/Analysis/FlowSensitive/DataflowAnalysisContext.h"
 #include "clang/Analysis/FlowSensitive/DataflowEnvironment.h"
@@ -34,14 +34,14 @@ std::string analyzeAndPrintExitCondition(llvm::StringRef Code) {
   const auto *Target =
       cast<FunctionDecl>(test::findValueDecl(AST.context(), "target"));
   Environment InitEnv(DACtx, *Target);
-  auto CFCtx = cantFail(ControlFlowContext::build(*Target));
+  auto ACFG = cantFail(AdornedCFG::build(*Target));
 
   NoopAnalysis Analysis(AST.context(), DataflowAnalysisOptions{});
 
-  auto Result = runDataflowAnalysis(CFCtx, Analysis, InitEnv);
+  auto Result = runDataflowAnalysis(ACFG, Analysis, InitEnv);
   EXPECT_FALSE(!Result) << Result.takeError();
 
-  Atom FinalFC = (*Result)[CFCtx.getCFG().getExit().getBlockID()]
+  Atom FinalFC = (*Result)[ACFG.getCFG().getExit().getBlockID()]
                      ->Env.getFlowConditionToken();
   std::string Textual;
   llvm::raw_string_ostream OS(Textual);
diff --git a/clang/unittests/Analysis/FlowSensitive/LoggerTest.cpp b/clang/unittests/Analysis/FlowSensitive/LoggerTest.cpp
index 57920c49a7d3dd..88630119ba8a1a 100644
--- a/clang/unittests/Analysis/FlowSensitive/LoggerTest.cpp
+++ b/clang/unittests/Analysis/FlowSensitive/LoggerTest.cpp
@@ -57,7 +57,7 @@ class TestLogger : public Logger {
 private:
   llvm::raw_string_ostream OS;
 
-  void beginAnalysis(const ControlFlowContext &,
+  void beginAnalysis(const AdornedCFG &,
                      TypeErasedDataflowAnalysis &) override {
     logText("beginAnalysis()");
   }
diff --git a/clang/unittests/Analysis/FlowSensitive/RecordOpsTest.cpp b/clang/unittests/Analysis/FlowSensitive/RecordOpsTest.cpp
index cd6a37d370e854..55baa4e2a53775 100644
--- a/clang/unittests/Analysis/FlowSensitive/RecordOpsTest.cpp
+++ b/clang/unittests/Analysis/FlowSensitive/RecordOpsTest.cpp
@@ -198,7 +198,7 @@ TEST(RecordOpsTest, RecordsEqual) {
       });
 }
 
-TEST(TransferTest, CopyRecordFromDerivedToBase) {
+TEST(TransferTest, CopyRecordBetweenDerivedAndBase) {
   std::string Code = R"(
     struct A {
       int i;
@@ -212,8 +212,23 @@ TEST(TransferTest, CopyRecordFromDerivedToBase) {
       // [[p]]
     }
   )";
+  auto SyntheticFieldCallback = [](QualType Ty) -> llvm::StringMap<QualType> {
+    CXXRecordDecl *ADecl = nullptr;
+    if (Ty.getAsString() == "A")
+      ADecl = Ty->getAsCXXRecordDecl();
+    else if (Ty.getAsString() == "B")
+      ADecl = Ty->getAsCXXRecordDecl()
+                  ->bases_begin()
+                  ->getType()
+                  ->getAsCXXRecordDecl();
+    else
+      return {};
+    QualType IntTy = getFieldNamed(ADecl, "i")->getType();
+    return {{"synth_int", IntTy}};
+  };
+  // Test copying derived to base class.
   runDataflow(
-      Code, /*SyntheticFieldCallback=*/{},
+      Code, SyntheticFieldCallback,
       [](const llvm::StringMap<DataflowAnalysisState<NoopLattice>> &Results,
          ASTContext &ASTCtx) {
         Environment Env = getEnvironmentAtAnnotation(Results, "p").fork();
@@ -224,11 +239,38 @@ TEST(TransferTest, CopyRecordFromDerivedToBase) {
 
         EXPECT_NE(Env.getValue(*A.getChild(*IDecl)),
                   Env.getValue(*B.getChild(*IDecl)));
+        EXPECT_NE(Env.getValue(A.getSyntheticField("synth_int")),
+                  Env.getValue(B.getSyntheticField("synth_int")));
 
         copyRecord(B, A, Env);
 
         EXPECT_EQ(Env.getValue(*A.getChild(*IDecl)),
                   Env.getValue(*B.getChild(*IDecl)));
+        EXPECT_EQ(Env.getValue(A.getSyntheticField("synth_int")),
+                  Env.getValue(B.getSyntheticField("synth_int")));
+      });
+  // Test copying base to derived class.
+  runDataflow(
+      Code, SyntheticFieldCallback,
+      [](const llvm::StringMap<DataflowAnalysisState<NoopLattice>> &Results,
+         ASTContext &ASTCtx) {
+        Environment Env = getEnvironmentAtAnnotation(Results, "p").fork();
+
+        const ValueDecl *IDecl = findValueDecl(ASTCtx, "i");
+        auto &A = getLocForDecl<RecordStorageLocation>(ASTCtx, Env, "a");
+        auto &B = getLocForDecl<RecordStorageLocation>(ASTCtx, Env, "b");
+
+        EXPECT_NE(Env.getValue(*A.getChild(*IDecl)),
+                  Env.getValue(*B.getChild(*IDecl)));
+        EXPECT_NE(Env.getValue(A.getSyntheticField("synth_int")),
+                  Env.getValue(B.getSyntheticField("synth_int")));
+
+        copyRecord(A, B, Env);
+
+        EXPECT_EQ(Env.getValue(*A.getChild(*IDecl)),
+                  Env.getValue(*B.getChild(*IDecl)));
+        EXPECT_EQ(Env.getValue(A.getSyntheticField("synth_int")),
+                  Env.getValue(B.getSyntheticField("synth_int")));
       });
 }
 
diff --git a/clang/unittests/Analysis/FlowSensitive/TestingSupport.h b/clang/unittests/Analysis/FlowSensitive/TestingSupport.h
index b7cf6cc966edb0..e3c7ff685f5724 100644
--- a/clang/unittests/Analysis/FlowSensitive/TestingSupport.h
+++ b/clang/unittests/Analysis/FlowSensitive/TestingSupport.h
@@ -28,7 +28,7 @@
 #include "clang/ASTMatchers/ASTMatchers.h"
 #include "clang/ASTMatchers/ASTMatchersInternal.h"
 #include "clang/Analysis/CFG.h"
-#include "clang/Analysis/FlowSensitive/ControlFlowContext.h"
+#include "clang/Analysis/FlowSensitive/AdornedCFG.h"
 #include "clang/Analysis/FlowSensitive/DataflowAnalysis.h"
 #include "clang/Analysis/FlowSensitive/DataflowAnalysisContext.h"
 #include "clang/Analysis/FlowSensitive/DataflowEnvironment.h"
@@ -100,7 +100,7 @@ struct AnalysisOutputs {
   const FunctionDecl *Target;
   /// Contains the control flow graph built from the body of the `Target`
   /// function and is analyzed.
-  const ControlFlowContext &CFCtx;
+  const AdornedCFG &ACFG;
   /// The analysis to be run.
   TypeErasedDataflowAnalysis &Analysis;
   /// Initial state to start the analysis.
@@ -261,9 +261,10 @@ checkDataflow(AnalysisInputs<AnalysisT> AI,
           llvm::errc::invalid_argument, "Could not find the target function.");
 
     // Build the control flow graph for the target function.
-    auto MaybeCFCtx = ControlFlowContext::build(*Target);
-    if (!MaybeCFCtx) return MaybeCFCtx.takeError();
-    auto &CFCtx = *MaybeCFCtx;
+    auto MaybeACFG = AdornedCFG::build(*Target);
+    if (!MaybeACFG)
+      return MaybeACFG.takeError();
+    auto &ACFG = *MaybeACFG;
 
     // Initialize states for running dataflow analysis.
     DataflowAnalysisContext DACtx(AI.SolverFactory(),
@@ -271,7 +272,7 @@ checkDataflow(AnalysisInputs<AnalysisT> AI,
     Environment InitEnv(DACtx, *Target);
     auto Analysis = AI.MakeAnalysis(Context, InitEnv);
 
-    AnalysisOutputs AO{AnnotatedCode, Context, Target, CFCtx,
+    AnalysisOutputs AO{AnnotatedCode, Context, Target, ACFG,
                        Analysis,      InitEnv, {}};
 
     // Additional test setup.
@@ -283,7 +284,7 @@ checkDataflow(AnalysisInputs<AnalysisT> AI,
     // the post-analysis states for the CFG blocks that have been evaluated.
     llvm::Expected<std::vector<std::optional<TypeErasedDataflowAnalysisState>>>
         MaybeBlockStates =
-            runTypeErasedDataflowAnalysis(CFCtx, Analysis, InitEnv,
+            runTypeErasedDataflowAnalysis(ACFG, Analysis, InitEnv,
                                           TypeErasedPostVisitCFG,
                                           MaxBlockVisitsInAnalysis);
     if (!MaybeBlockStates) return MaybeBlockStates.takeError();
diff --git a/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp b/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp
index a8c282f140b4cd..a243535d387257 100644
--- a/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp
+++ b/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp
@@ -2313,8 +2313,6 @@ TEST(TransferTest, AssignmentOperator_ArgByValue) {
 }
 
 TEST(TransferTest, AssignmentOperatorFromBase) {
-  // This is a crash repro. We don't model the copy this case, so no
-  // expectations on the copied field of the base class are checked.
   std::string Code = R"(
     struct Base {
       int base;
@@ -2326,14 +2324,33 @@ TEST(TransferTest, AssignmentOperatorFromBase) {
     void target(Base B, Derived D) {
       D.base = 1;
       D.derived = 1;
+      // [[before]]
       D = B;
-      // [[p]]
+      // [[after]]
     }
   )";
   runDataflow(
       Code,
       [](const llvm::StringMap<DataflowAnalysisState<NoopLattice>> &Results,
-         ASTContext &ASTCtx) {});
+         ASTContext &ASTCtx) {
+        const Environment &EnvBefore =
+            getEnvironmentAtAnnotation(Results, "before");
+        const Environment &EnvAfter =
+            getEnvironmentAtAnnotation(Results, "after");
+
+        auto &BLoc =
+            getLocForDecl<RecordStorageLocation>(ASTCtx, EnvBefore, "B");
+        auto &DLoc =
+            getLocForDecl<RecordStorageLocation>(ASTCtx, EnvBefore, "D");
+
+        EXPECT_NE(getFieldValue(&BLoc, "base", ASTCtx, EnvBefore),
+                  getFieldValue(&DLoc, "base", ASTCtx, EnvBefore));
+        EXPECT_EQ(getFieldValue(&BLoc, "base", ASTCtx, EnvAfter),
+                  getFieldValue(&DLoc, "base", ASTCtx, EnvAfter));
+
+        EXPECT_EQ(getFieldValue(&DLoc, "derived", ASTCtx, EnvBefore),
+                  getFieldValue(&DLoc, "derived", ASTCtx, EnvAfter));
+      });
 }
 
 TEST(TransferTest, AssignmentOperatorFromCallResult) {
@@ -2924,6 +2941,36 @@ TEST(TransferTest, ResultObjectLocation) {
       });
 }
 
+TEST(TransferTest, ResultObjectLocationForDefaultArgExpr) {
+  std::string Code = R"(
+    struct S {};
+    void funcWithDefaultArg(S s = S());
+    void target() {
+      funcWithDefaultArg();
+      // [[p]]
+    }
+  )";
+
+  using ast_matchers::cxxDefaultArgExpr;
+  using ast_matchers::match;
+  using ast_matchers::selectFirst;
+  runDataflow(
+      Code,
+      [](const llvm::StringMap<DataflowAnalysisState<NoopLattice>> &Results,
+         ASTContext &ASTCtx) {
+        const Environment &Env = getEnvironmentAtAnnotation(Results, "p");
+
+        auto *DefaultArg = selectFirst<CXXDefaultArgExpr>(
+            "default_arg",
+            match(cxxDefaultArgExpr().bind("default_arg"), ASTCtx));
+        ASSERT_NE(DefaultArg, nullptr);
+
+        // The values for default arguments aren't modeled; we merely verify
+        // that we can get a result object location for a default arg.
+        Env.getResultObjectLocation(*DefaultArg);
+      });
+}
+
 TEST(TransferTest, ResultObjectLocationForDefaultInitExpr) {
   std::string Code = R"(
     struct S {};
diff --git a/clang/unittests/Analysis/FlowSensitive/TypeErasedDataflowAnalysisTest.cpp b/clang/unittests/Analysis/FlowSensitive/TypeErasedDataflowAnalysisTest.cpp
index 9d05a0d6ca4010..bea00ab1a1f062 100644
--- a/clang/unittests/Analysis/FlowSensitive/TypeErasedDataflowAnalysisTest.cpp
+++ b/clang/unittests/Analysis/FlowSensitive/TypeErasedDataflowAnalysisTest.cpp
@@ -66,20 +66,20 @@ class DataflowAnalysisTest : public Test {
               AST->getASTContext()));
     assert(Func != nullptr);
 
-    CFCtx = std::make_unique<ControlFlowContext>(
-        llvm::cantFail(ControlFlowContext::build(*Func)));
+    ACFG =
+        std::make_unique<AdornedCFG>(llvm::cantFail(AdornedCFG::build(*Func)));
 
     AnalysisT Analysis = MakeAnalysis(AST->getASTContext());
     DACtx = std::make_unique<DataflowAnalysisContext>(
         std::make_unique<WatchedLiteralsSolver>());
     Environment Env(*DACtx, *Func);
 
-    return runDataflowAnalysis(*CFCtx, Analysis, Env);
+    return runDataflowAnalysis(*ACFG, Analysis, Env);
   }
 
   /// Returns the `CFGBlock` containing `S` (and asserts that it exists).
   const CFGBlock *blockForStmt(const Stmt &S) {
-    const CFGBlock *Block = CFCtx->getStmtToBlock().lookup(&S);
+    const CFGBlock *Block = ACFG->getStmtToBlock().lookup(&S);
     assert(Block != nullptr);
     return Block;
   }
@@ -105,7 +105,7 @@ class DataflowAnalysisTest : public Test {
   }
 
   std::unique_ptr<ASTUnit> AST;
-  std::unique_ptr<ControlFlowContext> CFCtx;
+  std::unique_ptr<AdornedCFG> ACFG;
   std::unique_ptr<DataflowAnalysisContext> DACtx;
 };
 
diff --git a/clang/unittests/Analysis/FlowSensitive/UncheckedOptionalAccessModelTest.cpp b/clang/unittests/Analysis/FlowSensitive/UncheckedOptionalAccessModelTest.cpp
index b6e4973fd7cb2b..9430730004dbd2 100644
--- a/clang/unittests/Analysis/FlowSensitive/UncheckedOptionalAccessModelTest.cpp
+++ b/clang/unittests/Analysis/FlowSensitive/UncheckedOptionalAccessModelTest.cpp
@@ -3383,6 +3383,66 @@ TEST_P(UncheckedOptionalAccessTest, LambdaCaptureStateNotPropagated) {
     }
   )");
 }
+
+TEST_P(UncheckedOptionalAccessTest, ClassDerivedFromOptional) {
+  ExpectDiagnosticsFor(R"(
+    #include "unchecked_optional_access_test.h"
+
+    struct Derived : public $ns::$optional<int> {};
+
+    void target(Derived opt) {
+      *opt;  // [[unsafe]]
+      if (opt.has_value())
+        *opt;
+
+      // The same thing, but with a pointer receiver.
+      Derived *popt = &opt;
+      **popt;  // [[unsafe]]
+      if (popt->has_value())
+        **popt;
+    }
+  )");
+}
+
+TEST_P(UncheckedOptionalAccessTest, ClassTemplateDerivedFromOptional) {
+  ExpectDiagnosticsFor(R"(
+    #include "unchecked_optional_access_test.h"
+
+    template <class T>
+    struct Derived : public $ns::$optional<T> {};
+
+    void target(Derived<int> opt) {
+      *opt;  // [[unsafe]]
+      if (opt.has_value())
+        *opt;
+
+      // The same thing, but with a pointer receiver.
+      Derived<int> *popt = &opt;
+      **popt;  // [[unsafe]]
+      if (popt->has_value())
+        **popt;
+    }
+  )");
+}
+
+TEST_P(UncheckedOptionalAccessTest, ClassDerivedPrivatelyFromOptional) {
+  // Classes that derive privately from optional can themselves still call
+  // member functions of optional. Check that we model the optional correctly
+  // in this situation.
+  ExpectDiagnosticsFor(R"(
+    #include "unchecked_optional_access_test.h"
+
+    struct Derived : private $ns::$optional<int> {
+      void Method() {
+        **this;  // [[unsafe]]
+        if (this->has_value())
+          **this;
+      }
+    };
+  )",
+                       ast_matchers::hasName("Method"));
+}
+
 // FIXME: Add support for:
 // - constructors (copy, move)
 // - assignment operators (default, copy, move)
diff --git a/clang/unittests/Format/.clang-format b/clang/unittests/Format/.clang-format
index f95602cab0f7fc..d7331b3c8cf02e 100644
--- a/clang/unittests/Format/.clang-format
+++ b/clang/unittests/Format/.clang-format
@@ -1,6 +1 @@
-BasedOnStyle: LLVM
-InsertBraces: true
-InsertNewlineAtEOF: true
-LineEnding: LF
-RemoveBracesLLVM: true
-RemoveParentheses: ReturnStatement
+BasedOnStyle: clang-format
diff --git a/clang/unittests/Format/FormatTest.cpp b/clang/unittests/Format/FormatTest.cpp
index fc367a7a5a8981..bea989c8c306db 100644
--- a/clang/unittests/Format/FormatTest.cpp
+++ b/clang/unittests/Format/FormatTest.cpp
@@ -3802,6 +3802,27 @@ TEST_F(FormatTest, FormatsEnum) {
                "  // Comment 2\n"
                "  TWO,\n"
                "};");
+  verifyFormat("enum [[clang::enum_extensibility(open)]] E {\n"
+               "  // Comment 1\n"
+               "  ONE,\n"
+               "  // Comment 2\n"
+               "  TWO\n"
+               "};");
+  verifyFormat("enum [[nodiscard]] [[clang::enum_extensibility(open)]] E {\n"
+               "  // Comment 1\n"
+               "  ONE,\n"
+               "  // Comment 2\n"
+               "  TWO\n"
+               "};");
+  verifyFormat("enum [[clang::enum_extensibility(open)]] E { // foo\n"
+               "  A,\n"
+               "  // bar\n"
+               "  B\n"
+               "};",
+               "enum [[clang::enum_extensibility(open)]] E{// foo\n"
+               "                                           A,\n"
+               "                                           // bar\n"
+               "                                           B};");
 
   // Not enums.
   verifyFormat("enum X f() {\n"
@@ -6938,6 +6959,10 @@ TEST_F(FormatTest, PutEmptyBlocksIntoOneLine) {
                "else if (b) { }\n"
                "else { }",
                Style);
+
+  Style = getLLVMStyle(FormatStyle::LK_CSharp);
+  Style.SpaceInEmptyBlock = true;
+  verifyFormat("Event += () => { };", Style);
 }
 
 TEST_F(FormatTest, FormatBeginBlockEndMacros) {
@@ -11450,6 +11475,11 @@ TEST_F(FormatTest, UnderstandsNewAndDelete) {
                "void new (link p);\n"
                "void delete (link p);");
 
+  verifyFormat("{ p->new(); }\n"
+               "{ p->delete(); }",
+               "{ p->new (); }\n"
+               "{ p->delete (); }");
+
   FormatStyle AfterPlacementOperator = getLLVMStyle();
   AfterPlacementOperator.SpaceBeforeParens = FormatStyle::SBPO_Custom;
   EXPECT_TRUE(
diff --git a/clang/unittests/Format/FormatTestTableGen.cpp b/clang/unittests/Format/FormatTestTableGen.cpp
index 76b871e2e1a522..c96866f0840f00 100644
--- a/clang/unittests/Format/FormatTestTableGen.cpp
+++ b/clang/unittests/Format/FormatTestTableGen.cpp
@@ -332,6 +332,85 @@ TEST_F(FormatTestTableGen, Assert) {
   verifyFormat("assert !le(DefVar1, 0), \"Assert1\";\n");
 }
 
+TEST_F(FormatTestTableGen, DAGArgBreakElements) {
+  FormatStyle Style = getGoogleStyle(FormatStyle::LK_TableGen);
+  Style.ColumnLimit = 60;
+  // By default, the DAGArg does not have a break inside.
+  ASSERT_EQ(Style.TableGenBreakInsideDAGArg, FormatStyle::DAS_DontBreak);
+  verifyFormat("def Def : Parent {\n"
+               "  let dagarg = (ins a:$src1, aa:$src2, aaa:$src3)\n"
+               "}\n",
+               Style);
+  // This option forces to break inside the DAGArg.
+  Style.TableGenBreakInsideDAGArg = FormatStyle::DAS_BreakElements;
+  verifyFormat("def Def : Parent {\n"
+               "  let dagarg = (ins a:$src1,\n"
+               "                    aa:$src2,\n"
+               "                    aaa:$src3);\n"
+               "}\n",
+               Style);
+  verifyFormat("def Def : Parent {\n"
+               "  let dagarg = (other a:$src1,\n"
+               "                      aa:$src2,\n"
+               "                      aaa:$src3);\n"
+               "}\n",
+               Style);
+  // Then, limit the DAGArg operator only to "ins".
+  Style.TableGenBreakingDAGArgOperators = {"ins"};
+  verifyFormat("def Def : Parent {\n"
+               "  let dagarg = (ins a:$src1,\n"
+               "                    aa:$src2,\n"
+               "                    aaa:$src3);\n"
+               "}\n",
+               Style);
+  verifyFormat("def Def : Parent {\n"
+               "  let dagarg = (other a:$src1, aa:$src2, aaa:$src3)\n"
+               "}\n",
+               Style);
+}
+
+TEST_F(FormatTestTableGen, DAGArgBreakAll) {
+  FormatStyle Style = getGoogleStyle(FormatStyle::LK_TableGen);
+  Style.ColumnLimit = 60;
+  // By default, the DAGArg does not have a break inside.
+  verifyFormat("def Def : Parent {\n"
+               "  let dagarg = (ins a:$src1, aa:$src2, aaa:$src3)\n"
+               "}\n",
+               Style);
+  // This option forces to break inside the DAGArg.
+  Style.TableGenBreakInsideDAGArg = FormatStyle::DAS_BreakAll;
+  verifyFormat("def Def : Parent {\n"
+               "  let dagarg = (ins\n"
+               "      a:$src1,\n"
+               "      aa:$src2,\n"
+               "      aaa:$src3\n"
+               "  );\n"
+               "}\n",
+               Style);
+  verifyFormat("def Def : Parent {\n"
+               "  let dagarg = (other\n"
+               "      a:$src1,\n"
+               "      aa:$src2,\n"
+               "      aaa:$src3\n"
+               "  );\n"
+               "}\n",
+               Style);
+  // Then, limit the DAGArg operator only to "ins".
+  Style.TableGenBreakingDAGArgOperators = {"ins"};
+  verifyFormat("def Def : Parent {\n"
+               "  let dagarg = (ins\n"
+               "      a:$src1,\n"
+               "      aa:$src2,\n"
+               "      aaa:$src3\n"
+               "  );\n"
+               "}\n",
+               Style);
+  verifyFormat("def Def : Parent {\n"
+               "  let dagarg = (other a:$src1, aa:$src2, aaa:$src3);\n"
+               "}\n",
+               Style);
+}
+
 TEST_F(FormatTestTableGen, CondOperatorAlignment) {
   FormatStyle Style = getGoogleStyle(FormatStyle::LK_TableGen);
   Style.ColumnLimit = 60;
diff --git a/clang/unittests/Format/TokenAnnotatorTest.cpp b/clang/unittests/Format/TokenAnnotatorTest.cpp
index fcba0e63c78254..1aa855b3419877 100644
--- a/clang/unittests/Format/TokenAnnotatorTest.cpp
+++ b/clang/unittests/Format/TokenAnnotatorTest.cpp
@@ -2354,6 +2354,76 @@ TEST_F(TokenAnnotatorTest, UnderstandTableGenTokens) {
   EXPECT_TOKEN(Tokens[4], tok::less, TT_TemplateOpener);
   EXPECT_TOKEN(Tokens[6], tok::greater, TT_TemplateCloser);
   EXPECT_TOKEN(Tokens[7], tok::l_brace, TT_FunctionLBrace);
+
+  // DAGArg breaking options. They use different token types depending on what
+  // is specified.
+  Style.TableGenBreakInsideDAGArg = FormatStyle::DAS_BreakElements;
+
+  // When TableGenBreakInsideDAGArg is DAS_BreakElements and
+  // TableGenBreakingDAGArgOperators is not specified, it makes all the DAGArg
+  // elements to have line break.
+  Tokens = AnnotateValue("(ins type1:$src1, type2:$src2)");
+  ASSERT_EQ(Tokens.size(), 10u) << Tokens;
+  EXPECT_TOKEN(Tokens[0], tok::l_paren, TT_TableGenDAGArgOpenerToBreak);
+  EXPECT_TOKEN(Tokens[1], tok::identifier,
+               TT_TableGenDAGArgOperatorID); // ins
+  EXPECT_TOKEN(Tokens[5], tok::comma, TT_TableGenDAGArgListCommaToBreak);
+  EXPECT_TOKEN(Tokens[9], tok::r_paren, TT_TableGenDAGArgCloser);
+
+  Tokens = AnnotateValue("(other type1:$src1, type2:$src2)");
+  ASSERT_EQ(Tokens.size(), 10u) << Tokens;
+  EXPECT_TOKEN(Tokens[0], tok::l_paren, TT_TableGenDAGArgOpenerToBreak);
+  EXPECT_TOKEN(Tokens[1], tok::identifier,
+               TT_TableGenDAGArgOperatorID); // other
+  EXPECT_TOKEN(Tokens[5], tok::comma, TT_TableGenDAGArgListCommaToBreak);
+  EXPECT_TOKEN(Tokens[9], tok::r_paren, TT_TableGenDAGArgCloser);
+
+  // For non-identifier operators, breaks after the operator.
+  Tokens = AnnotateValue("(!cast<Type>(\"Name\") type1:$src1, type2:$src2)");
+  ASSERT_EQ(Tokens.size(), 16u) << Tokens;
+  EXPECT_TOKEN(Tokens[0], tok::l_paren, TT_TableGenDAGArgOpenerToBreak);
+  EXPECT_TOKEN(Tokens[7], tok::r_paren, TT_TableGenDAGArgOperatorToBreak);
+  EXPECT_TOKEN(Tokens[11], tok::comma, TT_TableGenDAGArgListCommaToBreak);
+  EXPECT_TOKEN(Tokens[15], tok::r_paren, TT_TableGenDAGArgCloser);
+
+  Style.TableGenBreakInsideDAGArg = FormatStyle::DAS_BreakAll;
+
+  // When TableGenBreakInsideDAGArg is DAS_BreakAll and
+  // TableGenBreakingDAGArgOperators is not specified, it makes all the DAGArg
+  // to have line break inside it.
+  Tokens = AnnotateValue("(ins type1:$src1, type2:$src2)");
+  ASSERT_EQ(Tokens.size(), 10u) << Tokens;
+  EXPECT_TOKEN(Tokens[0], tok::l_paren, TT_TableGenDAGArgOpenerToBreak);
+  EXPECT_TOKEN(Tokens[1], tok::identifier,
+               TT_TableGenDAGArgOperatorToBreak); // ins
+  EXPECT_TOKEN(Tokens[5], tok::comma, TT_TableGenDAGArgListCommaToBreak);
+  EXPECT_TOKEN(Tokens[9], tok::r_paren, TT_TableGenDAGArgCloser);
+
+  Tokens = AnnotateValue("(other type1:$src1, type2:$src2)");
+  ASSERT_EQ(Tokens.size(), 10u) << Tokens;
+  EXPECT_TOKEN(Tokens[0], tok::l_paren, TT_TableGenDAGArgOpenerToBreak);
+  EXPECT_TOKEN(Tokens[1], tok::identifier,
+               TT_TableGenDAGArgOperatorToBreak); // other
+  EXPECT_TOKEN(Tokens[5], tok::comma, TT_TableGenDAGArgListCommaToBreak);
+  EXPECT_TOKEN(Tokens[9], tok::r_paren, TT_TableGenDAGArgCloser);
+
+  // If TableGenBreakingDAGArgOperators is specified, it is limited to the
+  // specified operators.
+  Style.TableGenBreakingDAGArgOperators = {"ins", "outs"};
+  Tokens = AnnotateValue("(ins type1:$src1, type2:$src2)");
+  ASSERT_EQ(Tokens.size(), 10u) << Tokens;
+  EXPECT_TOKEN(Tokens[0], tok::l_paren, TT_TableGenDAGArgOpenerToBreak);
+  EXPECT_TOKEN(Tokens[1], tok::identifier,
+               TT_TableGenDAGArgOperatorToBreak); // ins
+  EXPECT_TOKEN(Tokens[5], tok::comma, TT_TableGenDAGArgListCommaToBreak);
+  EXPECT_TOKEN(Tokens[9], tok::r_paren, TT_TableGenDAGArgCloser);
+
+  Tokens = AnnotateValue("(other type1:$src1, type2:$src2)");
+  ASSERT_EQ(Tokens.size(), 10u) << Tokens;
+  EXPECT_TOKEN(Tokens[0], tok::l_paren, TT_TableGenDAGArgOpener);
+  EXPECT_TOKEN(Tokens[1], tok::identifier, TT_Unknown); // other
+  EXPECT_TOKEN(Tokens[5], tok::comma, TT_TableGenDAGArgListComma);
+  EXPECT_TOKEN(Tokens[9], tok::r_paren, TT_TableGenDAGArgCloser);
 }
 
 TEST_F(TokenAnnotatorTest, UnderstandConstructors) {
@@ -2632,6 +2702,11 @@ TEST_F(TokenAnnotatorTest, StartOfName) {
   EXPECT_TOKEN(Tokens[2], tok::identifier, TT_Unknown);
   EXPECT_TOKEN(Tokens[3], tok::identifier, TT_Unknown);
   EXPECT_TOKEN(Tokens[4], tok::identifier, TT_StartOfName);
+
+  Tokens = annotate("@interface NSCoder (TestCoder)");
+  ASSERT_EQ(Tokens.size(), 7u) << Tokens;
+  EXPECT_TOKEN(Tokens[0], tok::at, TT_ObjCDecl);
+  EXPECT_TOKEN(Tokens[2], tok::identifier, TT_StartOfName);
 }
 
 TEST_F(TokenAnnotatorTest, BraceKind) {
diff --git a/clang/unittests/Tooling/StandardLibraryTest.cpp b/clang/unittests/Tooling/StandardLibraryTest.cpp
index edca31649accfa..e4c109f3d580d1 100644
--- a/clang/unittests/Tooling/StandardLibraryTest.cpp
+++ b/clang/unittests/Tooling/StandardLibraryTest.cpp
@@ -185,6 +185,19 @@ TEST(StdlibTest, RecognizerForC99) {
             stdlib::Symbol::named("", "uint8_t", stdlib::Lang::C));
 }
 
+TEST(StdlibTest, SpecialCMappings) {
+  TestInputs Input("typedef char size_t;");
+  Input.Language = TestLanguage::Lang_C99;
+  TestAST AST(Input);
+
+  auto &SizeT = lookup(AST, "size_t");
+  stdlib::Recognizer Recognizer;
+  auto ActualSym = Recognizer(&SizeT);
+  assert(ActualSym);
+  EXPECT_EQ(ActualSym, stdlib::Symbol::named("", "size_t", stdlib::Lang::C));
+  EXPECT_EQ(ActualSym->header()->name(), "<stddef.h>");
+}
+
 } // namespace
 } // namespace tooling
 } // namespace clang
diff --git a/clang/www/get_started.html b/clang/www/get_started.html
index dda914a46904e8..8e4d36640be735 100755
--- a/clang/www/get_started.html
+++ b/clang/www/get_started.html
@@ -86,13 +86,16 @@ <h3 id="buildNix">On Unix-like Systems</h3>
   </ul>
   </li>
 
-  <li>If you intend to use Clang's C++ support, you may need to tell it how
-      to find your C++ standard library headers. In general, Clang will detect
-      the best version of libstdc++ headers available and use them - it will
-      look both for system installations of libstdc++ as well as installations
-      adjacent to Clang itself. If your configuration fits neither of these
-      scenarios, you can use the <tt>-DGCC_INSTALL_PREFIX</tt> cmake option
-      to tell Clang where the gcc containing the desired libstdc++ is installed.
+  <li>On Linux, you may need GCC runtime libraries (e.g. <tt>crtbeginS.o,
+    libstdc++.so</tt>) and libstdc++ headers. In general, Clang will detect
+    well-known GCC installation paths matching the target triple (configured at
+    build time (see <tt>clang --version</tt>); overriden by
+    <tt>--target=</tt>) and use the largest version. If your configuration fits
+    none of the standard scenarios, you can set <tt>--gcc-install-dir=</tt> to
+    the GCC installation directory (something like
+    <tt>/usr/lib/gcc/$triple/$major</tt>). If your GCC installation is under
+    <tt>/usr/lib/gcc</tt> but uses a different triple, you can set
+    <tt>--gcc-triple=$triple</tt>.
   </li>
   <li>Try it out (assuming you add llvm/build/bin to your path):
   <ul>
diff --git a/compiler-rt/cmake/Modules/CompilerRTCompile.cmake b/compiler-rt/cmake/Modules/CompilerRTCompile.cmake
index 2bf115973a49b3..d5b2e7970f11cc 100644
--- a/compiler-rt/cmake/Modules/CompilerRTCompile.cmake
+++ b/compiler-rt/cmake/Modules/CompilerRTCompile.cmake
@@ -65,8 +65,9 @@ function(clang_compile object_file source)
   cmake_parse_arguments(SOURCE "" "" "CFLAGS;DEPS" ${ARGN})
   get_filename_component(source_rpath ${source} REALPATH)
   if(NOT COMPILER_RT_STANDALONE_BUILD)
-    list(APPEND SOURCE_DEPS clang compiler-rt-headers)
+    list(APPEND SOURCE_DEPS clang)
   endif()
+  list(APPEND SOURCE_DEPS compiler-rt-headers)
   if (TARGET CompilerRTUnitTestCheckCxx)
     list(APPEND SOURCE_DEPS CompilerRTUnitTestCheckCxx)
   endif()
diff --git a/compiler-rt/lib/hwasan/hwasan_checks.h b/compiler-rt/lib/hwasan/hwasan_checks.h
index 0911af30dcb8fc..735d21a5db7746 100644
--- a/compiler-rt/lib/hwasan/hwasan_checks.h
+++ b/compiler-rt/lib/hwasan/hwasan_checks.h
@@ -140,7 +140,6 @@ __attribute__((always_inline, nodebug)) static inline uptr ShortTagSize(
 
 __attribute__((always_inline, nodebug)) static inline bool
 PossiblyShortTagMatches(tag_t mem_tag, uptr ptr, uptr sz) {
-  DCHECK(IsAligned(ptr, kShadowAlignment));
   tag_t ptr_tag = GetTagFromPointer(ptr);
   if (ptr_tag == mem_tag)
     return true;
diff --git a/compiler-rt/lib/interception/interception_win.cpp b/compiler-rt/lib/interception/interception_win.cpp
index 1829358705fe82..a04175ba1e4b56 100644
--- a/compiler-rt/lib/interception/interception_win.cpp
+++ b/compiler-rt/lib/interception/interception_win.cpp
@@ -339,7 +339,7 @@ struct TrampolineMemoryRegion {
   uptr max_size;
 };
 
-UNUSED static const uptr kTrampolineScanLimitRange = 1 << 31;  // 2 gig
+UNUSED static const uptr kTrampolineScanLimitRange = 1ull << 31;  // 2 gig
 static const int kMaxTrampolineRegion = 1024;
 static TrampolineMemoryRegion TrampolineRegions[kMaxTrampolineRegion];
 
diff --git a/compiler-rt/lib/msan/msan.cpp b/compiler-rt/lib/msan/msan.cpp
index 3cdf10c149902c..a2fc27de1901b4 100644
--- a/compiler-rt/lib/msan/msan.cpp
+++ b/compiler-rt/lib/msan/msan.cpp
@@ -467,7 +467,7 @@ void __msan_init() {
   __msan_clear_on_return();
   if (__msan_get_track_origins())
     VPrintf(1, "msan_track_origins\n");
-  if (!InitShadow(__msan_get_track_origins())) {
+  if (!InitShadowWithReExec(__msan_get_track_origins())) {
     Printf("FATAL: MemorySanitizer can not mmap the shadow memory.\n");
     Printf("FATAL: Make sure to compile with -fPIE and to link with -pie.\n");
     Printf("FATAL: Disabling ASLR is known to cause this error.\n");
diff --git a/compiler-rt/lib/msan/msan.h b/compiler-rt/lib/msan/msan.h
index 710447a3e1a357..7fb58be67a02cd 100644
--- a/compiler-rt/lib/msan/msan.h
+++ b/compiler-rt/lib/msan/msan.h
@@ -33,12 +33,18 @@ struct MappingDesc {
   uptr start;
   uptr end;
   enum Type {
-    INVALID, APP, SHADOW, ORIGIN
+    INVALID = 1,
+    ALLOCATOR = 2,
+    APP = 4,
+    SHADOW = 8,
+    ORIGIN = 16,
   } type;
   const char *name;
 };
 
-
+// Note: MappingDesc::ALLOCATOR entries are only used to check for memory
+// layout compatibility. The actual allocation settings are in
+// msan_allocator.cpp, which need to be kept in sync.
 #if SANITIZER_LINUX && defined(__mips64)
 
 // MIPS64 maps:
@@ -84,7 +90,8 @@ const MappingDesc kMemoryLayout[] = {
     {0X0B00000000000, 0X0C00000000000, MappingDesc::SHADOW, "shadow-10-13"},
     {0X0C00000000000, 0X0D00000000000, MappingDesc::INVALID, "invalid"},
     {0X0D00000000000, 0X0E00000000000, MappingDesc::ORIGIN, "origin-10-13"},
-    {0X0E00000000000, 0X1000000000000, MappingDesc::APP, "app-15"},
+    {0x0E00000000000, 0x0E40000000000, MappingDesc::ALLOCATOR, "allocator"},
+    {0X0E40000000000, 0X1000000000000, MappingDesc::APP, "app-15"},
 };
 # define MEM_TO_SHADOW(mem) ((uptr)mem ^ 0xB00000000000ULL)
 # define SHADOW_TO_ORIGIN(shadow) (((uptr)(shadow)) + 0x200000000000ULL)
@@ -106,7 +113,8 @@ const MappingDesc kMemoryLayout[] = {
     {0x510000000000ULL, 0x600000000000ULL, MappingDesc::APP, "app-2"},
     {0x600000000000ULL, 0x610000000000ULL, MappingDesc::ORIGIN, "origin-1"},
     {0x610000000000ULL, 0x700000000000ULL, MappingDesc::INVALID, "invalid"},
-    {0x700000000000ULL, 0x800000000000ULL, MappingDesc::APP, "app-3"}};
+    {0x700000000000ULL, 0x740000000000ULL, MappingDesc::ALLOCATOR, "allocator"},
+    {0x740000000000ULL, 0x800000000000ULL, MappingDesc::APP, "app-3"}};
 #  define MEM_TO_SHADOW(mem) (((uptr)(mem)) ^ 0x500000000000ULL)
 #  define SHADOW_TO_ORIGIN(shadow) (((uptr)(shadow)) + 0x100000000000ULL)
 
@@ -118,7 +126,8 @@ const MappingDesc kMemoryLayout[] = {
     {0x180200000000ULL, 0x1C0000000000ULL, MappingDesc::INVALID, "invalid"},
     {0x1C0000000000ULL, 0x2C0200000000ULL, MappingDesc::ORIGIN, "origin"},
     {0x2C0200000000ULL, 0x300000000000ULL, MappingDesc::INVALID, "invalid"},
-    {0x300000000000ULL, 0x800000000000ULL, MappingDesc::APP, "high memory"}};
+    {0x300000000000ULL, 0x320000000000ULL, MappingDesc::ALLOCATOR, "allocator"},
+    {0x320000000000ULL, 0x800000000000ULL, MappingDesc::APP, "high memory"}};
 
 // Various kernels use different low end ranges but we can combine them into one
 // big range. They also use different high end ranges but we can map them all to
@@ -141,7 +150,8 @@ const MappingDesc kMemoryLayout[] = {
     {0x180000000000ULL, 0x1C0000000000ULL, MappingDesc::INVALID, "invalid"},
     {0x1C0000000000ULL, 0x2C0000000000ULL, MappingDesc::ORIGIN, "origin"},
     {0x2C0000000000ULL, 0x440000000000ULL, MappingDesc::INVALID, "invalid"},
-    {0x440000000000ULL, 0x500000000000ULL, MappingDesc::APP, "high memory"}};
+    {0x440000000000ULL, 0x460000000000ULL, MappingDesc::ALLOCATOR, "allocator"},
+    {0x460000000000ULL, 0x500000000000ULL, MappingDesc::APP, "high memory"}};
 
 #define MEM_TO_SHADOW(mem) \
   ((((uptr)(mem)) & ~0xC00000000000ULL) + 0x080000000000ULL)
@@ -208,7 +218,8 @@ const MappingDesc kMemoryLayout[] = {
     {0x510000000000ULL, 0x600000000000ULL, MappingDesc::APP, "app-2"},
     {0x600000000000ULL, 0x610000000000ULL, MappingDesc::ORIGIN, "origin-1"},
     {0x610000000000ULL, 0x700000000000ULL, MappingDesc::INVALID, "invalid"},
-    {0x700000000000ULL, 0x800000000000ULL, MappingDesc::APP, "app-3"}};
+    {0x700000000000ULL, 0x740000000000ULL, MappingDesc::ALLOCATOR, "allocator"},
+    {0x740000000000ULL, 0x800000000000ULL, MappingDesc::APP, "app-3"}};
 #define MEM_TO_SHADOW(mem) (((uptr)(mem)) ^ 0x500000000000ULL)
 #define SHADOW_TO_ORIGIN(mem) (((uptr)(mem)) + 0x100000000000ULL)
 
@@ -223,20 +234,22 @@ const uptr kMemoryLayoutSize = sizeof(kMemoryLayout) / sizeof(kMemoryLayout[0]);
 #ifndef __clang__
 __attribute__((optimize("unroll-loops")))
 #endif
-inline bool addr_is_type(uptr addr, MappingDesc::Type mapping_type) {
+inline bool
+addr_is_type(uptr addr, int mapping_types) {
 // It is critical for performance that this loop is unrolled (because then it is
 // simplified into just a few constant comparisons).
 #ifdef __clang__
 #pragma unroll
 #endif
   for (unsigned i = 0; i < kMemoryLayoutSize; ++i)
-    if (kMemoryLayout[i].type == mapping_type &&
+    if ((kMemoryLayout[i].type & mapping_types) &&
         addr >= kMemoryLayout[i].start && addr < kMemoryLayout[i].end)
       return true;
   return false;
 }
 
-#define MEM_IS_APP(mem) addr_is_type((uptr)(mem), MappingDesc::APP)
+#define MEM_IS_APP(mem) \
+  (addr_is_type((uptr)(mem), MappingDesc::APP | MappingDesc::ALLOCATOR))
 #define MEM_IS_SHADOW(mem) addr_is_type((uptr)(mem), MappingDesc::SHADOW)
 #define MEM_IS_ORIGIN(mem) addr_is_type((uptr)(mem), MappingDesc::ORIGIN)
 
@@ -250,7 +263,7 @@ extern bool msan_init_is_running;
 extern int msan_report_count;
 
 bool ProtectRange(uptr beg, uptr end);
-bool InitShadow(bool init_origins);
+bool InitShadowWithReExec(bool init_origins);
 char *GetProcSelfMaps();
 void InitializeInterceptors();
 
diff --git a/compiler-rt/lib/msan/msan_allocator.cpp b/compiler-rt/lib/msan/msan_allocator.cpp
index 0b2dd2b2f1883d..b1bc5b9390f75b 100644
--- a/compiler-rt/lib/msan/msan_allocator.cpp
+++ b/compiler-rt/lib/msan/msan_allocator.cpp
@@ -48,6 +48,9 @@ struct MsanMapUnmapCallback {
   }
 };
 
+// Note: to ensure that the allocator is compatible with the application memory
+// layout (especially with high-entropy ASLR), kSpaceBeg and kSpaceSize must be
+// duplicated as MappingDesc::ALLOCATOR in msan.h.
 #if defined(__mips64)
 static const uptr kMaxAllowedMallocSize = 2UL << 30;
 
diff --git a/compiler-rt/lib/msan/msan_linux.cpp b/compiler-rt/lib/msan/msan_linux.cpp
index c7ecb7cad56661..cd2d9f5c720c57 100644
--- a/compiler-rt/lib/msan/msan_linux.cpp
+++ b/compiler-rt/lib/msan/msan_linux.cpp
@@ -20,6 +20,9 @@
 #  include <signal.h>
 #  include <stdio.h>
 #  include <stdlib.h>
+#  if SANITIZER_LINUX
+#    include <sys/personality.h>
+#  endif
 #  include <sys/resource.h>
 #  include <sys/time.h>
 #  include <unistd.h>
@@ -43,11 +46,13 @@ void ReportMapRange(const char *descr, uptr beg, uptr size) {
   }
 }
 
-static bool CheckMemoryRangeAvailability(uptr beg, uptr size) {
+static bool CheckMemoryRangeAvailability(uptr beg, uptr size, bool verbose) {
   if (size > 0) {
     uptr end = beg + size - 1;
     if (!MemoryRangeIsAvailable(beg, end)) {
-      Printf("FATAL: Memory range 0x%zx - 0x%zx is not available.\n", beg, end);
+      if (verbose)
+        Printf("FATAL: Memory range 0x%zx - 0x%zx is not available.\n", beg,
+               end);
       return false;
     }
   }
@@ -86,7 +91,7 @@ static void CheckMemoryLayoutSanity() {
     CHECK(addr_is_type(start, type));
     CHECK(addr_is_type((start + end) / 2, type));
     CHECK(addr_is_type(end - 1, type));
-    if (type == MappingDesc::APP) {
+    if (type == MappingDesc::APP || type == MappingDesc::ALLOCATOR) {
       uptr addr = start;
       CHECK(MEM_IS_SHADOW(MEM_TO_SHADOW(addr)));
       CHECK(MEM_IS_ORIGIN(MEM_TO_ORIGIN(addr)));
@@ -106,7 +111,7 @@ static void CheckMemoryLayoutSanity() {
   }
 }
 
-bool InitShadow(bool init_origins) {
+static bool InitShadow(bool init_origins, bool dry_run) {
   // Let user know mapping parameters first.
   VPrintf(1, "__msan_init %p\n", reinterpret_cast<void *>(&__msan_init));
   for (unsigned i = 0; i < kMemoryLayoutSize; ++i)
@@ -116,8 +121,9 @@ bool InitShadow(bool init_origins) {
   CheckMemoryLayoutSanity();
 
   if (!MEM_IS_APP(&__msan_init)) {
-    Printf("FATAL: Code %p is out of application range. Non-PIE build?\n",
-           reinterpret_cast<void *>(&__msan_init));
+    if (!dry_run)
+      Printf("FATAL: Code %p is out of application range. Non-PIE build?\n",
+             reinterpret_cast<void *>(&__msan_init));
     return false;
   }
 
@@ -138,20 +144,26 @@ bool InitShadow(bool init_origins) {
     bool protect = type == MappingDesc::INVALID ||
                    (!init_origins && type == MappingDesc::ORIGIN);
     CHECK(!(map && protect));
-    if (!map && !protect)
-      CHECK(type == MappingDesc::APP);
+    if (!map && !protect) {
+      CHECK(type == MappingDesc::APP || type == MappingDesc::ALLOCATOR);
+
+      if (dry_run && type == MappingDesc::ALLOCATOR &&
+          !CheckMemoryRangeAvailability(start, size, !dry_run))
+        return false;
+    }
     if (map) {
-      if (!CheckMemoryRangeAvailability(start, size))
+      if (dry_run && !CheckMemoryRangeAvailability(start, size, !dry_run))
         return false;
-      if (!MmapFixedSuperNoReserve(start, size, kMemoryLayout[i].name))
+      if (!dry_run &&
+          !MmapFixedSuperNoReserve(start, size, kMemoryLayout[i].name))
         return false;
-      if (common_flags()->use_madv_dontdump)
+      if (!dry_run && common_flags()->use_madv_dontdump)
         DontDumpShadowMemory(start, size);
     }
     if (protect) {
-      if (!CheckMemoryRangeAvailability(start, size))
+      if (dry_run && !CheckMemoryRangeAvailability(start, size, !dry_run))
         return false;
-      if (!ProtectMemoryRange(start, size, kMemoryLayout[i].name))
+      if (!dry_run && !ProtectMemoryRange(start, size, kMemoryLayout[i].name))
         return false;
     }
   }
@@ -159,6 +171,35 @@ bool InitShadow(bool init_origins) {
   return true;
 }
 
+bool InitShadowWithReExec(bool init_origins) {
+  // Start with dry run: check layout is ok, but don't print warnings because
+  // warning messages will cause tests to fail (even if we successfully re-exec
+  // after the warning).
+  bool success = InitShadow(__msan_get_track_origins(), true);
+  if (!success) {
+#  if SANITIZER_LINUX
+    // Perhaps ASLR entropy is too high. If ASLR is enabled, re-exec without it.
+    int old_personality = personality(0xffffffff);
+    bool aslr_on =
+        (old_personality != -1) && ((old_personality & ADDR_NO_RANDOMIZE) == 0);
+
+    if (aslr_on) {
+      VReport(1,
+              "WARNING: MemorySanitizer: memory layout is incompatible, "
+              "possibly due to high-entropy ASLR.\n"
+              "Re-execing with fixed virtual address space.\n"
+              "N.B. reducing ASLR entropy is preferable.\n");
+      CHECK_NE(personality(old_personality | ADDR_NO_RANDOMIZE), -1);
+      ReExec();
+    }
+#  endif
+  }
+
+  // The earlier dry run didn't actually map or protect anything. Run again in
+  // non-dry run mode.
+  return success && InitShadow(__msan_get_track_origins(), false);
+}
+
 static void MsanAtExit(void) {
   if (flags()->print_stats && (flags()->atexit || msan_report_count > 0))
     ReportStats();
diff --git a/compiler-rt/lib/msan/tests/CMakeLists.txt b/compiler-rt/lib/msan/tests/CMakeLists.txt
index bc58c0b9fabf8f..4f09f1e6a691fa 100644
--- a/compiler-rt/lib/msan/tests/CMakeLists.txt
+++ b/compiler-rt/lib/msan/tests/CMakeLists.txt
@@ -120,9 +120,7 @@ macro(add_msan_tests_for_arch arch kind cflags)
   set(MSAN_TEST_DEPS ${MSAN_TEST_OBJECTS} libcxx_msan_${arch}-build
                      ${MSAN_LOADABLE_SO}
 		     "${MSAN_LIBCXX_DIR}/libc++.a" "${MSAN_LIBCXX_DIR}/libc++abi.a")
-  if(NOT COMPILER_RT_STANDALONE_BUILD)
-    list(APPEND MSAN_TEST_DEPS msan)
-  endif()
+  list(APPEND MSAN_TEST_DEPS msan)
   get_target_flags_for_arch(${arch} TARGET_LINK_FLAGS)
   add_compiler_rt_test(MsanUnitTests "Msan-${arch}${kind}-Test" ${arch}
     OBJECTS ${MSAN_TEST_OBJECTS} "${MSAN_LIBCXX_DIR}/libc++.a" "${MSAN_LIBCXX_DIR}/libc++abi.a"
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_posix_libcdep.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_posix_libcdep.cpp
index 3605d0d666e3fd..ece2d7d63dd619 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_posix_libcdep.cpp
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_posix_libcdep.cpp
@@ -120,6 +120,9 @@ void DisableCoreDumperIfNecessary() {
     // The alternative to using RLIMIT_CORE=1 would be to use prctl() with the
     // PR_SET_DUMPABLE flag, however that also prevents ptrace(), so makes it
     // impossible to attach a debugger.
+    //
+    // Note: we use rlim_max in the Min() call here since that is the upper
+    // limit for what can be set without getting an EINVAL error.
     rlim.rlim_cur = Min<rlim_t>(SANITIZER_LINUX ? 1 : 0, rlim.rlim_max);
     CHECK_EQ(0, setrlimit(RLIMIT_CORE, &rlim));
   }
diff --git a/compiler-rt/test/asan/TestCases/Windows/bitfield_uaf.cpp b/compiler-rt/test/asan/TestCases/Windows/bitfield_uaf.cpp
index 8e1a2ae1491486..12ed505883e277 100644
--- a/compiler-rt/test/asan/TestCases/Windows/bitfield_uaf.cpp
+++ b/compiler-rt/test/asan/TestCases/Windows/bitfield_uaf.cpp
@@ -21,13 +21,13 @@ void make_access(S *s) {
 int main(void) {
   S *s = (S*)malloc(sizeof(S));
   free(s);
-// CHECK: [[ADDR]] is located 0 bytes inside of 4-byte region
-// CHECK-LABEL: freed by thread T0 here:
-// CHECK:   {{#0 .* free }}
-// CHECK:   {{#1 .* main .*bitfield_uaf.cpp}}:[[@LINE-4]]
-// CHECK-LABEL: previously allocated by thread T0 here:
-// CHECK:   {{#0 .* malloc }}
-// CHECK:   {{#1 .* main .*bitfield_uaf.cpp}}:[[@LINE-8]]
+  // CHECK: [[ADDR]] is located 0 bytes inside of 4-byte region
+  // CHECK-LABEL: freed by thread T0 here:
+  // CHECK:   {{#0 .* free }}
+  // CHECK:   {{ #[1-2] .* main .*bitfield_uaf.cpp}}:[[@LINE-4]]
+  // CHECK-LABEL: previously allocated by thread T0 here:
+  // CHECK:   {{#0 .* malloc }}
+  // CHECK:   {{ #[1-2] .* main .*bitfield_uaf.cpp}}:[[@LINE-8]]
   make_access(s);
   return 0;
 }
diff --git a/compiler-rt/test/asan/TestCases/Windows/calloc_left_oob.cpp b/compiler-rt/test/asan/TestCases/Windows/calloc_left_oob.cpp
index b22c359b3dc42f..e96fb6190f5a85 100644
--- a/compiler-rt/test/asan/TestCases/Windows/calloc_left_oob.cpp
+++ b/compiler-rt/test/asan/TestCases/Windows/calloc_left_oob.cpp
@@ -6,12 +6,12 @@
 int main() {
   int *buffer = (int*)calloc(42, sizeof(int));
   buffer[-1] = 42;
-// CHECK: AddressSanitizer: heap-buffer-overflow on address [[ADDR:0x[0-9a-f]+]]
-// CHECK: WRITE of size 4 at [[ADDR]] thread T0
-// CHECK-NEXT: {{#0 .* main .*calloc_left_oob.cpp}}:[[@LINE-3]]
-// CHECK: [[ADDR]] is located 4 bytes before 168-byte region
-// CHECK: allocated by thread T0 here:
-// CHECK-NEXT: {{#0 .* calloc }}
-// CHECK-NEXT: {{#1 .* main .*calloc_left_oob.cpp}}:[[@LINE-8]]
+  // CHECK: AddressSanitizer: heap-buffer-overflow on address [[ADDR:0x[0-9a-f]+]]
+  // CHECK: WRITE of size 4 at [[ADDR]] thread T0
+  // CHECK-NEXT: {{#0 .* main .*calloc_left_oob.cpp}}:[[@LINE-3]]
+  // CHECK: [[ADDR]] is located 4 bytes before 168-byte region
+  // CHECK: allocated by thread T0 here:
+  // CHECK: {{#0 .* calloc }}
+  // CHECK: {{ #[1-2] .* main .*calloc_left_oob.cpp}}:[[@LINE-8]]
   free(buffer);
 }
diff --git a/compiler-rt/test/asan/TestCases/Windows/calloc_right_oob.cpp b/compiler-rt/test/asan/TestCases/Windows/calloc_right_oob.cpp
index 9e12f9cf653e09..fe0fc20e191903 100644
--- a/compiler-rt/test/asan/TestCases/Windows/calloc_right_oob.cpp
+++ b/compiler-rt/test/asan/TestCases/Windows/calloc_right_oob.cpp
@@ -6,12 +6,12 @@
 int main() {
   int *buffer = (int*)calloc(42, sizeof(int));
   buffer[42] = 42;
-// CHECK: AddressSanitizer: heap-buffer-overflow on address [[ADDR:0x[0-9a-f]+]]
-// CHECK: WRITE of size 4 at [[ADDR]] thread T0
-// CHECK-NEXT: {{#0 .* main .*calloc_right_oob.cpp}}:[[@LINE-3]]
-// CHECK: [[ADDR]] is located 0 bytes after 168-byte region
-// CHECK: allocated by thread T0 here:
-// CHECK-NEXT: {{#0 .* calloc }}
-// CHECK-NEXT: {{#1 .* main .*calloc_right_oob.cpp}}:[[@LINE-8]]
+  // CHECK: AddressSanitizer: heap-buffer-overflow on address [[ADDR:0x[0-9a-f]+]]
+  // CHECK: WRITE of size 4 at [[ADDR]] thread T0
+  // CHECK-NEXT: {{#0 .* main .*calloc_right_oob.cpp}}:[[@LINE-3]]
+  // CHECK: [[ADDR]] is located 0 bytes after 168-byte region
+  // CHECK: allocated by thread T0 here:
+  // CHECK-NEXT: {{#0 .* calloc }}
+  // CHECK: {{ #[1-2] .* main .*calloc_right_oob.cpp}}:[[@LINE-8]]
   free(buffer);
 }
diff --git a/compiler-rt/test/asan/TestCases/Windows/calloc_uaf.cpp b/compiler-rt/test/asan/TestCases/Windows/calloc_uaf.cpp
index 6c225d4c676d08..bf13f7d3eb662a 100644
--- a/compiler-rt/test/asan/TestCases/Windows/calloc_uaf.cpp
+++ b/compiler-rt/test/asan/TestCases/Windows/calloc_uaf.cpp
@@ -7,14 +7,14 @@ int main() {
   int *buffer = (int*)calloc(42, sizeof(int));
   free(buffer);
   buffer[0] = 42;
-// CHECK: AddressSanitizer: heap-use-after-free on address [[ADDR:0x[0-9a-f]+]]
-// CHECK: WRITE of size 4 at [[ADDR]] thread T0
-// CHECK-NEXT: {{#0 .* main .*calloc_uaf.cpp}}:[[@LINE-3]]
-// CHECK: [[ADDR]] is located 0 bytes inside of 168-byte region
-// CHECK: freed by thread T0 here:
-// CHECK-NEXT: {{#0 .* free }}
-// CHECK-NEXT: {{#1 .* main .*calloc_uaf.cpp}}:[[@LINE-8]]
-// CHECK: previously allocated by thread T0 here:
-// CHECK-NEXT: {{#0 .* calloc }}
-// CHECK-NEXT: {{#1 .* main .*calloc_uaf.cpp}}:[[@LINE-12]]
+  // CHECK: AddressSanitizer: heap-use-after-free on address [[ADDR:0x[0-9a-f]+]]
+  // CHECK: WRITE of size 4 at [[ADDR]] thread T0
+  // CHECK-NEXT: {{#0 .* main .*calloc_uaf.cpp}}:[[@LINE-3]]
+  // CHECK: [[ADDR]] is located 0 bytes inside of 168-byte region
+  // CHECK: freed by thread T0 here:
+  // CHECK-NEXT: {{#0 .* free }}
+  // CHECK: {{ #[1-2] .* main .*calloc_uaf.cpp}}:[[@LINE-8]]
+  // CHECK: previously allocated by thread T0 here:
+  // CHECK-NEXT: {{#0 .* calloc }}
+  // CHECK: {{ #[1-2] .* main .*calloc_uaf.cpp}}:[[@LINE-12]]
 }
diff --git a/compiler-rt/test/asan/TestCases/Windows/dll_heap_allocation.cpp b/compiler-rt/test/asan/TestCases/Windows/dll_heap_allocation.cpp
index b8c2c1a24d4ce0..3606b1c8163171 100644
--- a/compiler-rt/test/asan/TestCases/Windows/dll_heap_allocation.cpp
+++ b/compiler-rt/test/asan/TestCases/Windows/dll_heap_allocation.cpp
@@ -1,6 +1,6 @@
 // RUN: %clang_cl %LD %s %Fe%t.dll -DHEAP_LIBRARY %MD \
 // RUN:   %if target={{.*-windows-gnu}} %{ -Wl,--out-implib,%t.lib %}
-// RUN: %clang_cl %s %t.lib %Fe%t -fsanitize=address %MT
+// RUN: %clang_cl_asan %s %t.lib %Fe%t
 // RUN: %run %t 2>&1 | FileCheck %s
 
 // Check that ASan does not fail when releasing allocations that occurred within
diff --git a/compiler-rt/test/asan/TestCases/Windows/dll_host.cpp b/compiler-rt/test/asan/TestCases/Windows/dll_host.cpp
index 0757af90d211c3..85b7967e86b510 100644
--- a/compiler-rt/test/asan/TestCases/Windows/dll_host.cpp
+++ b/compiler-rt/test/asan/TestCases/Windows/dll_host.cpp
@@ -5,48 +5,6 @@
 // Just make sure we can compile this.
 // The actual compile&run sequence is to be done by the DLL tests.
 // RUN: %clang_cl_asan -Od %s -Fe%t
-//
-// Get the list of ASan wrappers exported by the main module RTL:
-// note: The mangling decoration (i.e. @4 )is removed because calling convention
-//       differ from 32-bit and 64-bit.
-// RUN: dumpbin /EXPORTS %t | grep -o "__asan_wrap[^ ]*" | sed -e s/@.*// > %t.exports1
-//
-// The exception handlers differ in 32-bit and 64-bit, so we ignore them:
-// RUN: grep '[E]XPORT:' %s | sed -e 's/.*[E]XPORT: //' > %t.exports2
-// EXPORT: __asan_wrap__except_handler3
-// EXPORT: __asan_wrap__except_handler4
-// EXPORT: __asan_wrap___C_specific_handler
-//
-// Get the list of ASan wrappers imported by the DLL RTL:
-// [BEWARE: be really careful with the sed commands, as this test can be run
-//  from different environments with different shells and seds]
-// RUN: grep INTERCEPT_LIBRARY_FUNCTION %p/../../../../lib/asan/asan_win_dll_thunk.cpp \
-// RUN:  | grep -v define | sed -e s/.*(/__asan_wrap_/ -e s/).*//              \
-// RUN:  > %t.imports1
-//
-// Add functions interecepted in asan_malloc.win.cpp and asan_win.cpp.
-// RUN: grep '[I]MPORT:' %s | sed -e 's/.*[I]MPORT: //' > %t.imports2
-// IMPORT: __asan_wrap_HeapAlloc
-// IMPORT: __asan_wrap_HeapFree
-// IMPORT: __asan_wrap_HeapReAlloc
-// IMPORT: __asan_wrap_HeapSize
-// IMPORT: __asan_wrap_CreateThread
-// IMPORT: __asan_wrap_RaiseException
-// IMPORT: __asan_wrap_RtlRaiseException
-// IMPORT: __asan_wrap_SetUnhandledExceptionFilter
-// IMPORT: __asan_wrap_RtlSizeHeap
-// IMPORT: __asan_wrap_RtlAllocateHeap
-// IMPORT: __asan_wrap_RtlReAllocateHeap
-// IMPORT: __asan_wrap_RtlFreeHeap
-//
-// RUN: cat %t.imports1 %t.imports2 | sort | uniq > %t.imports-sorted
-// RUN: cat %t.exports1 %t.exports2 | sort | uniq > %t.exports-sorted
-//
-// Now make sure the DLL thunk imports everything:
-// RUN: echo
-// RUN: echo "=== NOTE === If you see a mismatch below, please update asan_win_dll_thunk.cpp"
-// RUN: diff %t.imports-sorted %t.exports-sorted
-// REQUIRES: asan-static-runtime
 
 #include <stdio.h>
 #include <windows.h>
diff --git a/compiler-rt/test/asan/TestCases/Windows/dll_malloc_left_oob.cpp b/compiler-rt/test/asan/TestCases/Windows/dll_malloc_left_oob.cpp
index 6d550eb966cda3..5ca48fbd683840 100644
--- a/compiler-rt/test/asan/TestCases/Windows/dll_malloc_left_oob.cpp
+++ b/compiler-rt/test/asan/TestCases/Windows/dll_malloc_left_oob.cpp
@@ -7,17 +7,17 @@ extern "C" __declspec(dllexport)
 int test_function() {
   char *buffer = (char*)malloc(42);
   buffer[-1] = 42;
-// CHECK: AddressSanitizer: heap-buffer-overflow on address [[ADDR:0x[0-9a-f]+]]
-// CHECK: WRITE of size 1 at [[ADDR]] thread T0
-// CHECK-NEXT: test_function {{.*}}dll_malloc_left_oob.cpp:[[@LINE-3]]
-// CHECK-NEXT: main {{.*}}dll_host.cpp
-//
-// CHECK: [[ADDR]] is located 1 bytes before 42-byte region
-// CHECK-LABEL: allocated by thread T0 here:
-// CHECK-NEXT:   malloc
-// CHECK-NEXT:   test_function {{.*}}dll_malloc_left_oob.cpp:[[@LINE-10]]
-// CHECK-NEXT:   main {{.*}}dll_host.cpp
-// CHECK-LABEL: SUMMARY
+  // CHECK: AddressSanitizer: heap-buffer-overflow on address [[ADDR:0x[0-9a-f]+]]
+  // CHECK: WRITE of size 1 at [[ADDR]] thread T0
+  // CHECK-NEXT: test_function {{.*}}dll_malloc_left_oob.cpp:[[@LINE-3]]
+  // CHECK-NEXT: main {{.*}}dll_host.cpp
+  //
+  // CHECK: [[ADDR]] is located 1 bytes before 42-byte region
+  // CHECK-LABEL: allocated by thread T0 here:
+  // CHECK-NEXT:   malloc
+  // CHECK:   test_function {{.*}}dll_malloc_left_oob.cpp:[[@LINE-10]]
+  // CHECK-NEXT:   main {{.*}}dll_host.cpp
+  // CHECK-LABEL: SUMMARY
   free(buffer);
   return 0;
 }
diff --git a/compiler-rt/test/asan/TestCases/Windows/dll_malloc_uaf.cpp b/compiler-rt/test/asan/TestCases/Windows/dll_malloc_uaf.cpp
index bc701e92961c99..ae23f86d7ddf49 100644
--- a/compiler-rt/test/asan/TestCases/Windows/dll_malloc_uaf.cpp
+++ b/compiler-rt/test/asan/TestCases/Windows/dll_malloc_uaf.cpp
@@ -9,20 +9,20 @@ int test_function() {
   int *buffer = (int*)malloc(42);
   free(buffer);
   buffer[0] = 42;
-// CHECK: AddressSanitizer: heap-use-after-free on address [[ADDR:0x[0-9a-f]+]]
-// CHECK: WRITE of size 4 at [[ADDR]] thread T0
-// CHECK-NEXT:  test_function {{.*}}dll_malloc_uaf.cpp:[[@LINE-3]]
-// CHECK-NEXT:  main {{.*}}dll_host
-//
-// CHECK: [[ADDR]] is located 0 bytes inside of 42-byte region
-// CHECK-LABEL: freed by thread T0 here:
-// CHECK-NEXT:  free
-// CHECK-NEXT:  test_function {{.*}}dll_malloc_uaf.cpp:[[@LINE-10]]
-// CHECK-NEXT:  main {{.*}}dll_host
-//
-// CHECK-LABEL: previously allocated by thread T0 here:
-// CHECK-NEXT:  malloc
-// CHECK-NEXT:  test_function {{.*}}dll_malloc_uaf.cpp:[[@LINE-16]]
-// CHECK-NEXT:  main {{.*}}dll_host
+  // CHECK: AddressSanitizer: heap-use-after-free on address [[ADDR:0x[0-9a-f]+]]
+  // CHECK: WRITE of size 4 at [[ADDR]] thread T0
+  // CHECK-NEXT:  test_function {{.*}}dll_malloc_uaf.cpp:[[@LINE-3]]
+  // CHECK-NEXT:  main {{.*}}dll_host
+  //
+  // CHECK: [[ADDR]] is located 0 bytes inside of 42-byte region
+  // CHECK-LABEL: freed by thread T0 here:
+  // CHECK-NEXT:  free
+  // CHECK:  test_function {{.*}}dll_malloc_uaf.cpp:[[@LINE-10]]
+  // CHECK-NEXT:  main {{.*}}dll_host
+  //
+  // CHECK-LABEL: previously allocated by thread T0 here:
+  // CHECK-NEXT:  malloc
+  // CHECK:  test_function {{.*}}dll_malloc_uaf.cpp:[[@LINE-16]]
+  // CHECK-NEXT:  main {{.*}}dll_host
   return 0;
 }
diff --git a/compiler-rt/test/asan/TestCases/Windows/double_free.cpp b/compiler-rt/test/asan/TestCases/Windows/double_free.cpp
index 45568e50d0c824..e288b40fac47a3 100644
--- a/compiler-rt/test/asan/TestCases/Windows/double_free.cpp
+++ b/compiler-rt/test/asan/TestCases/Windows/double_free.cpp
@@ -7,15 +7,15 @@ int main() {
   int *x = (int*)malloc(42 * sizeof(int));
   free(x);
   free(x);
-// CHECK: AddressSanitizer: attempting double-free on [[ADDR:0x[0-9a-f]+]]
-// CHECK-NEXT: {{#0 .* free }}
-// CHECK-NEXT: {{#1 .* main .*double_free.cpp}}:[[@LINE-3]]
-// CHECK: [[ADDR]] is located 0 bytes inside of 168-byte region
-// CHECK-LABEL: freed by thread T0 here:
-// CHECK-NEXT: {{#0 .* free }}
-// CHECK-NEXT: {{#1 .* main .*double_free.cpp}}:[[@LINE-8]]
-// CHECK-LABEL: previously allocated by thread T0 here:
-// CHECK-NEXT: {{#0 .* malloc }}
-// CHECK-NEXT: {{#1 .* main .*double_free.cpp}}:[[@LINE-12]]
+  // CHECK: AddressSanitizer: attempting double-free on [[ADDR:0x[0-9a-f]+]]
+  // CHECK-NEXT: {{#0 .* free }}
+  // CHECK: {{ #[1-2] .* main .*double_free.cpp}}:[[@LINE-3]]
+  // CHECK: [[ADDR]] is located 0 bytes inside of 168-byte region
+  // CHECK-LABEL: freed by thread T0 here:
+  // CHECK-NEXT: {{#0 .* free }}
+  // CHECK: {{ #[1-2] .* main .*double_free.cpp}}:[[@LINE-8]]
+  // CHECK-LABEL: previously allocated by thread T0 here:
+  // CHECK-NEXT: {{#0 .* malloc }}
+  // CHECK: {{ #[1-2] .* main .*double_free.cpp}}:[[@LINE-12]]
   return 0;
 }
diff --git a/compiler-rt/test/asan/TestCases/Windows/interface_symbols_windows.cpp b/compiler-rt/test/asan/TestCases/Windows/interface_symbols_windows.cpp
deleted file mode 100644
index 1803911a43a9b0..00000000000000
--- a/compiler-rt/test/asan/TestCases/Windows/interface_symbols_windows.cpp
+++ /dev/null
@@ -1,56 +0,0 @@
-// UNSUPPORTED: target={{.*-windows-gnu}}
-
-// Check that the interface exported by asan static lib matches the list of
-// functions mentioned in sanitizer_interface.inc.
-//
-// Just make sure we can compile this.
-// RUN: %clang_cl_asan -Od %s -Fe%t
-//
-// note: The mangling decoration (i.e. @4 )is removed because calling convention
-//       differ from 32-bit and 64-bit.
-//
-// RUN: dumpbin /EXPORTS %t | sed "s/=.*//"                                    \
-// RUN:   | grep -o "\(__asan_\|__ubsan_\|__sanitizer_\|__sancov_\)[^ ]*"      \
-// RUN:   | grep -v "__asan_wrap"                                              \
-// RUN:   | sed -e s/@.*// > %t.exports
-//
-// [BEWARE: be really careful with the sed commands, as this test can be run
-//  from different environments with different shells and seds]
-//
-// RUN: sed ':a;N;$!ba;s/([\n ]*/(/g'                                          \
-// RUN:  %p/../../../../lib/asan/asan_interface.inc                            \
-// RUN:  %p/../../../../lib/ubsan/ubsan_interface.inc                          \
-// RUN:  %p/../../../../lib/sanitizer_common/sanitizer_common_interface.inc    \
-// RUN:  %p/../../../../lib/sanitizer_common/sanitizer_coverage_interface.inc  \
-// RUN:  | grep -e "^INTERFACE_FUNCTION"                                       \
-// RUN:  | sed -e "s/.*(//" -e "s/).*//" > %t.imports1
-//
-// RUN: sed ':a;N;$!ba;s/([\n ]*/(/g'                                          \
-// RUN:  %p/../../../../lib/asan/asan_interface.inc                            \
-// RUN:  %p/../../../../lib/ubsan/ubsan_interface.inc                          \
-// RUN:  %p/../../../../lib/sanitizer_common/sanitizer_common_interface.inc    \
-// RUN:  %p/../../../../lib/sanitizer_common/sanitizer_coverage_interface.inc  \
-// RUN:  | grep -e "^INTERFACE_WEAK_FUNCTION"                                  \
-// RUN:  | sed -e "s/.*(//" -e "s/).*/__dll/" > %t.imports2
-//
-// Add functions not included in the interface lists:
-// RUN: grep '[I]MPORT:' %s | sed -e 's/.*[I]MPORT: //' > %t.imports3
-// IMPORT: __asan_shadow_memory_dynamic_address
-// IMPORT: __asan_get_shadow_memory_dynamic_address
-// IMPORT: __asan_option_detect_stack_use_after_return
-// IMPORT: __asan_should_detect_stack_use_after_return
-// IMPORT: __asan_set_seh_filter
-// IMPORT: __asan_unhandled_exception_filter
-// IMPORT: __asan_test_only_reported_buggy_pointer
-// IMPORT: __ubsan_vptr_type_cache
-//
-// RUN: cat %t.imports1 %t.imports2 %t.imports3 | sort | uniq > %t.imports-sorted
-// RUN: cat %t.exports | sort | uniq > %t.exports-sorted
-//
-// Now make sure the DLL thunk imports everything:
-// RUN: echo
-// RUN: echo "=== NOTE === If you see a mismatch below, please update interface.inc files."
-// RUN: diff %t.imports-sorted %t.exports-sorted
-// REQUIRES: asan-static-runtime
-
-int main() { return 0; }
diff --git a/compiler-rt/test/asan/TestCases/Windows/malloc_left_oob.cpp b/compiler-rt/test/asan/TestCases/Windows/malloc_left_oob.cpp
index 2ee5fdc7abee2f..7ea95d2b2184a0 100644
--- a/compiler-rt/test/asan/TestCases/Windows/malloc_left_oob.cpp
+++ b/compiler-rt/test/asan/TestCases/Windows/malloc_left_oob.cpp
@@ -6,12 +6,12 @@
 int main() {
   char *buffer = (char*)malloc(42);
   buffer[-1] = 42;
-// CHECK: AddressSanitizer: heap-buffer-overflow on address [[ADDR:0x[0-9a-f]+]]
-// CHECK: WRITE of size 1 at [[ADDR]] thread T0
-// CHECK-NEXT: {{#0 .* main .*malloc_left_oob.cpp}}:[[@LINE-3]]
-// CHECK: [[ADDR]] is located 1 bytes before 42-byte region
-// CHECK: allocated by thread T0 here:
-// CHECK-NEXT: {{#0 .* malloc }}
-// CHECK-NEXT: {{#1 .* main .*malloc_left_oob.cpp}}:[[@LINE-8]]
+  // CHECK: AddressSanitizer: heap-buffer-overflow on address [[ADDR:0x[0-9a-f]+]]
+  // CHECK: WRITE of size 1 at [[ADDR]] thread T0
+  // CHECK-NEXT: {{#0 .* main .*malloc_left_oob.cpp}}:[[@LINE-3]]
+  // CHECK: [[ADDR]] is located 1 bytes before 42-byte region
+  // CHECK: allocated by thread T0 here:
+  // CHECK-NEXT: {{#0 .* malloc }}
+  // CHECK: {{ #[1-2] .* main .*malloc_left_oob.cpp}}:[[@LINE-8]]
   free(buffer);
 }
diff --git a/compiler-rt/test/asan/TestCases/Windows/malloc_right_oob.cpp b/compiler-rt/test/asan/TestCases/Windows/malloc_right_oob.cpp
index dafca74b4812da..1495632456e081 100644
--- a/compiler-rt/test/asan/TestCases/Windows/malloc_right_oob.cpp
+++ b/compiler-rt/test/asan/TestCases/Windows/malloc_right_oob.cpp
@@ -6,12 +6,12 @@
 int main() {
   char *buffer = (char*)malloc(42);
   buffer[42] = 42;
-// CHECK: AddressSanitizer: heap-buffer-overflow on address [[ADDR:0x[0-9a-f]+]]
-// CHECK: WRITE of size 1 at [[ADDR]] thread T0
-// CHECK-NEXT: {{#0 .* main .*malloc_right_oob.cpp}}:[[@LINE-3]]
-// CHECK: [[ADDR]] is located 0 bytes after 42-byte region
-// CHECK: allocated by thread T0 here:
-// CHECK-NEXT: {{#0 .* malloc }}
-// CHECK-NEXT: {{#1 .* main .*malloc_right_oob.cpp}}:[[@LINE-8]]
+  // CHECK: AddressSanitizer: heap-buffer-overflow on address [[ADDR:0x[0-9a-f]+]]
+  // CHECK: WRITE of size 1 at [[ADDR]] thread T0
+  // CHECK-NEXT: {{#0 .* main .*malloc_right_oob.cpp}}:[[@LINE-3]]
+  // CHECK: [[ADDR]] is located 0 bytes after 42-byte region
+  // CHECK: allocated by thread T0 here:
+  // CHECK-NEXT: {{#0 .* malloc }}
+  // CHECK: {{ #[1-2] .* main .*malloc_right_oob.cpp}}:[[@LINE-8]]
   free(buffer);
 }
diff --git a/compiler-rt/test/asan/TestCases/Windows/malloc_uaf.cpp b/compiler-rt/test/asan/TestCases/Windows/malloc_uaf.cpp
index 256582deefe46e..d1eac7e55f6010 100644
--- a/compiler-rt/test/asan/TestCases/Windows/malloc_uaf.cpp
+++ b/compiler-rt/test/asan/TestCases/Windows/malloc_uaf.cpp
@@ -7,14 +7,14 @@ int main() {
   char *buffer = (char*)malloc(42);
   free(buffer);
   buffer[0] = 42;
-// CHECK: AddressSanitizer: heap-use-after-free on address [[ADDR:0x[0-9a-f]+]]
-// CHECK: WRITE of size 1 at [[ADDR]] thread T0
-// CHECK-NEXT: {{#0 .* main .*malloc_uaf.cpp}}:[[@LINE-3]]
-// CHECK: [[ADDR]] is located 0 bytes inside of 42-byte region
-// CHECK: freed by thread T0 here:
-// CHECK-NEXT: {{#0 .* free }}
-// CHECK-NEXT: {{#1 .* main .*malloc_uaf.cpp}}:[[@LINE-8]]
-// CHECK: previously allocated by thread T0 here:
-// CHECK-NEXT: {{#0 .* malloc }}
-// CHECK-NEXT: {{#1 .* main .*malloc_uaf.cpp}}:[[@LINE-12]]
+  // CHECK: AddressSanitizer: heap-use-after-free on address [[ADDR:0x[0-9a-f]+]]
+  // CHECK: WRITE of size 1 at [[ADDR]] thread T0
+  // CHECK-NEXT: {{#0 .* main .*malloc_uaf.cpp}}:[[@LINE-3]]
+  // CHECK: [[ADDR]] is located 0 bytes inside of 42-byte region
+  // CHECK: freed by thread T0 here:
+  // CHECK-NEXT: {{#0 .* free }}
+  // CHECK: {{ #[1-2] .* main .*malloc_uaf.cpp}}:[[@LINE-8]]
+  // CHECK: previously allocated by thread T0 here:
+  // CHECK-NEXT: {{#0 .* malloc }}
+  // CHECK: {{ #[1-2] .* main .*malloc_uaf.cpp}}:[[@LINE-12]]
 }
diff --git a/compiler-rt/test/asan/TestCases/Windows/realloc_left_oob.cpp b/compiler-rt/test/asan/TestCases/Windows/realloc_left_oob.cpp
index 4a5a7ab8ecfdfb..ebde5f159ae383 100644
--- a/compiler-rt/test/asan/TestCases/Windows/realloc_left_oob.cpp
+++ b/compiler-rt/test/asan/TestCases/Windows/realloc_left_oob.cpp
@@ -6,12 +6,12 @@
 int main() {
   char *buffer = (char*)realloc(0, 42);
   buffer[-1] = 42;
-// CHECK: AddressSanitizer: heap-buffer-overflow on address [[ADDR:0x[0-9a-f]+]]
-// CHECK: WRITE of size 1 at [[ADDR]] thread T0
-// CHECK-NEXT: {{#0 .* main .*realloc_left_oob.cpp}}:[[@LINE-3]]
-// CHECK: [[ADDR]] is located 1 bytes before 42-byte region
-// CHECK: allocated by thread T0 here:
-// CHECK-NEXT: {{#0 .* realloc }}
-// CHECK-NEXT: {{#1 .* main .*realloc_left_oob.cpp}}:[[@LINE-8]]
+  // CHECK: AddressSanitizer: heap-buffer-overflow on address [[ADDR:0x[0-9a-f]+]]
+  // CHECK: WRITE of size 1 at [[ADDR]] thread T0
+  // CHECK-NEXT: {{#0 .* main .*realloc_left_oob.cpp}}:[[@LINE-3]]
+  // CHECK: [[ADDR]] is located 1 bytes before 42-byte region
+  // CHECK: allocated by thread T0 here:
+  // CHECK-NEXT: {{#0 .* realloc }}
+  // CHECK: {{ #[1-2] .* main .*realloc_left_oob.cpp}}:[[@LINE-8]]
   free(buffer);
 }
diff --git a/compiler-rt/test/asan/TestCases/Windows/realloc_right_oob.cpp b/compiler-rt/test/asan/TestCases/Windows/realloc_right_oob.cpp
index 8f3109eed2d079..281efed5d30740 100644
--- a/compiler-rt/test/asan/TestCases/Windows/realloc_right_oob.cpp
+++ b/compiler-rt/test/asan/TestCases/Windows/realloc_right_oob.cpp
@@ -6,12 +6,12 @@
 int main() {
   char *buffer = (char*)realloc(0, 42);
   buffer[42] = 42;
-// CHECK: AddressSanitizer: heap-buffer-overflow on address [[ADDR:0x[0-9a-f]+]]
-// CHECK: WRITE of size 1 at [[ADDR]] thread T0
-// CHECK-NEXT: {{#0 .* main .*realloc_right_oob.cpp}}:[[@LINE-3]]
-// CHECK: [[ADDR]] is located 0 bytes after 42-byte region
-// CHECK: allocated by thread T0 here:
-// CHECK-NEXT: {{#0 .* realloc }}
-// CHECK-NEXT: {{#1 .* main .*realloc_right_oob.cpp}}:[[@LINE-8]]
+  // CHECK: AddressSanitizer: heap-buffer-overflow on address [[ADDR:0x[0-9a-f]+]]
+  // CHECK: WRITE of size 1 at [[ADDR]] thread T0
+  // CHECK-NEXT: {{#0 .* main .*realloc_right_oob.cpp}}:[[@LINE-3]]
+  // CHECK: [[ADDR]] is located 0 bytes after 42-byte region
+  // CHECK: allocated by thread T0 here:
+  // CHECK-NEXT: {{#0 .* realloc }}
+  // CHECK: {{ #[1-2] .* main .*realloc_right_oob.cpp}}:[[@LINE-8]]
   free(buffer);
 }
diff --git a/compiler-rt/test/asan/TestCases/Windows/realloc_uaf.cpp b/compiler-rt/test/asan/TestCases/Windows/realloc_uaf.cpp
index 074ac270f13f5c..6ff2217b11a257 100644
--- a/compiler-rt/test/asan/TestCases/Windows/realloc_uaf.cpp
+++ b/compiler-rt/test/asan/TestCases/Windows/realloc_uaf.cpp
@@ -7,14 +7,14 @@ int main() {
   char *buffer = (char*)realloc(0, 42);
   free(buffer);
   buffer[0] = 42;
-// CHECK: AddressSanitizer: heap-use-after-free on address [[ADDR:0x[0-9a-f]+]]
-// CHECK: WRITE of size 1 at [[ADDR]] thread T0
-// CHECK-NEXT: {{#0 .* main .*realloc_uaf.cpp}}:[[@LINE-3]]
-// CHECK: [[ADDR]] is located 0 bytes inside of 42-byte region
-// CHECK: freed by thread T0 here:
-// CHECK-NEXT: {{#0 .* free }}
-// CHECK-NEXT: {{#1 .* main .*realloc_uaf.cpp}}:[[@LINE-8]]
-// CHECK: previously allocated by thread T0 here:
-// CHECK-NEXT: {{#0 .* realloc }}
-// CHECK-NEXT: {{#1 .* main .*realloc_uaf.cpp}}:[[@LINE-12]]
+  // CHECK: AddressSanitizer: heap-use-after-free on address [[ADDR:0x[0-9a-f]+]]
+  // CHECK: WRITE of size 1 at [[ADDR]] thread T0
+  // CHECK-NEXT: {{#0 .* main .*realloc_uaf.cpp}}:[[@LINE-3]]
+  // CHECK: [[ADDR]] is located 0 bytes inside of 42-byte region
+  // CHECK: freed by thread T0 here:
+  // CHECK-NEXT: {{#0 .* free }}
+  // CHECK: {{ #[1-2] .* main .*realloc_uaf.cpp}}:[[@LINE-8]]
+  // CHECK: previously allocated by thread T0 here:
+  // CHECK-NEXT: {{#0 .* realloc }}
+  // CHECK: {{ #[1-2] .* main .*realloc_uaf.cpp}}:[[@LINE-12]]
 }
diff --git a/compiler-rt/test/asan/TestCases/Windows/symbols_path.cpp b/compiler-rt/test/asan/TestCases/Windows/symbols_path.cpp
index d896da482f2ebc..be99c89e7083ef 100644
--- a/compiler-rt/test/asan/TestCases/Windows/symbols_path.cpp
+++ b/compiler-rt/test/asan/TestCases/Windows/symbols_path.cpp
@@ -11,12 +11,12 @@
 int main() {
   char *buffer = (char*)malloc(42);
   buffer[-1] = 42;
-// CHECK: AddressSanitizer: heap-buffer-overflow on address [[ADDR:0x[0-9a-f]+]]
-// CHECK: WRITE of size 1 at [[ADDR]] thread T0
-// CHECK-NEXT: {{#0 .* main .*symbols_path.cpp}}:[[@LINE-3]]
-// CHECK: [[ADDR]] is located 1 bytes before 42-byte region
-// CHECK: allocated by thread T0 here:
-// CHECK-NEXT: {{#0 .* malloc}}
-// CHECK-NEXT: {{#1 .* main .*symbols_path.cpp}}:[[@LINE-8]]
+  // CHECK: AddressSanitizer: heap-buffer-overflow on address [[ADDR:0x[0-9a-f]+]]
+  // CHECK: WRITE of size 1 at [[ADDR]] thread T0
+  // CHECK-NEXT: {{#0 .* main .*symbols_path.cpp}}:[[@LINE-3]]
+  // CHECK: [[ADDR]] is located 1 bytes before 42-byte region
+  // CHECK: allocated by thread T0 here:
+  // CHECK-NEXT: {{#0 .* malloc}}
+  // CHECK: {{ #[1-2] .* main .*symbols_path.cpp}}:[[@LINE-8]]
   free(buffer);
 }
diff --git a/compiler-rt/test/asan/TestCases/Windows/use_after_realloc.cpp b/compiler-rt/test/asan/TestCases/Windows/use_after_realloc.cpp
index dfd25cebb6bfef..4c32c63c38fa1f 100644
--- a/compiler-rt/test/asan/TestCases/Windows/use_after_realloc.cpp
+++ b/compiler-rt/test/asan/TestCases/Windows/use_after_realloc.cpp
@@ -9,15 +9,15 @@ int main() {
   buffer = (char*)realloc(buffer, 64);
   // The 'stale' may now point to a free'd memory.
   stale[0] = 42;
-// CHECK: AddressSanitizer: heap-use-after-free on address [[ADDR:0x[0-9a-f]+]]
-// CHECK: WRITE of size 1 at [[ADDR]] thread T0
-// CHECK-NEXT: {{#0 .* main .*use_after_realloc.cpp}}:[[@LINE-3]]
-// CHECK: [[ADDR]] is located 0 bytes inside of 32-byte region
-// CHECK: freed by thread T0 here:
-// CHECK-NEXT: {{#0 .* realloc }}
-// CHECK-NEXT: {{#1 .* main .*use_after_realloc.cpp}}:[[@LINE-9]]
-// CHECK: previously allocated by thread T0 here:
-// CHECK-NEXT: {{#0 .* realloc }}
-// CHECK-NEXT: {{#1 .* main .*use_after_realloc.cpp}}:[[@LINE-14]]
+  // CHECK: AddressSanitizer: heap-use-after-free on address [[ADDR:0x[0-9a-f]+]]
+  // CHECK: WRITE of size 1 at [[ADDR]] thread T0
+  // CHECK-NEXT: {{#0 .* main .*use_after_realloc.cpp}}:[[@LINE-3]]
+  // CHECK: [[ADDR]] is located 0 bytes inside of 32-byte region
+  // CHECK: freed by thread T0 here:
+  // CHECK-NEXT: {{#0 .* realloc }}
+  // CHECK: {{ #[1-2] .* main .*use_after_realloc.cpp}}:[[@LINE-9]]
+  // CHECK: previously allocated by thread T0 here:
+  // CHECK-NEXT: {{#0 .* realloc }}
+  // CHECK: {{ #[1-2] .* main .*use_after_realloc.cpp}}:[[@LINE-14]]
   free(buffer);
 }
diff --git a/compiler-rt/test/asan/TestCases/calloc-overflow.cpp b/compiler-rt/test/asan/TestCases/calloc-overflow.cpp
index 9a0d41ff826018..b930b65cd8c3b0 100644
--- a/compiler-rt/test/asan/TestCases/calloc-overflow.cpp
+++ b/compiler-rt/test/asan/TestCases/calloc-overflow.cpp
@@ -11,7 +11,7 @@ int main() {
   void *p = calloc(-1, 1000);
   // CHECK: {{ERROR: AddressSanitizer: calloc parameters overflow: count \* size \(.* \* 1000\) cannot be represented in type size_t}}
   // CHECK: {{#0 0x.* in .*calloc}}
-  // CHECK: {{#1 0x.* in main .*calloc-overflow.cpp:}}[[@LINE-3]]
+  // CHECK: {{#[1-3] 0x.* in main .*calloc-overflow.cpp:}}[[@LINE-3]]
   // CHECK: SUMMARY: AddressSanitizer: calloc-overflow
 
   printf("calloc returned: %zu\n", (size_t)p);
diff --git a/compiler-rt/test/asan/TestCases/deep_stack_uaf.cpp b/compiler-rt/test/asan/TestCases/deep_stack_uaf.cpp
index bdf0dbdb4a4bd0..7d8a09220689f5 100644
--- a/compiler-rt/test/asan/TestCases/deep_stack_uaf.cpp
+++ b/compiler-rt/test/asan/TestCases/deep_stack_uaf.cpp
@@ -1,7 +1,7 @@
 // Check that we can store lots of stack frames if asked to.
 
 // RUN: %clangxx_asan -O0 %s -o %t 2>&1
-// RUN: %env_asan_opts=malloc_context_size=120:redzone=512 not %run %t 2>&1 | FileCheck %s
+// RUN: %env_asan_opts=malloc_context_size=125:redzone=512 not %run %t 2>&1 | FileCheck %s
 // REQUIRES: stable-runtime
 #include <stdlib.h>
 #include <stdio.h>
diff --git a/compiler-rt/test/asan/TestCases/double-free.cpp b/compiler-rt/test/asan/TestCases/double-free.cpp
index b01e41005a27a3..7b61df0715afaf 100644
--- a/compiler-rt/test/asan/TestCases/double-free.cpp
+++ b/compiler-rt/test/asan/TestCases/double-free.cpp
@@ -19,10 +19,10 @@ int main(int argc, char **argv) {
   free(x + argc - 1);  // BOOM
   // CHECK: AddressSanitizer: attempting double-free{{.*}}in thread T0
   // CHECK: #0 0x{{.*}} in {{.*}}free
-  // CHECK: #1 0x{{.*}} in main {{.*}}double-free.cpp:[[@LINE-3]]
+  // CHECK: #{{[1-3]}} 0x{{.*}} in main {{.*}}double-free.cpp:[[@LINE-3]]
   // CHECK: freed by thread T0 here:
   // MALLOC-CTX: #0 0x{{.*}} in {{.*}}free
-  // MALLOC-CTX: #1 0x{{.*}} in main {{.*}}double-free.cpp:[[@LINE-7]]
+  // MALLOC-CTX: #{{[1-3]}} 0x{{.*}} in main {{.*}}double-free.cpp:[[@LINE-7]]
   // CHECK: allocated by thread T0 here:
   // MALLOC-CTX: double-free.cpp:[[@LINE-12]]
   // CHECK-RECOVER: AddressSanitizer: attempting double-free{{.*}}in thread T0
diff --git a/compiler-rt/test/asan/TestCases/malloc-size-too-big.cpp b/compiler-rt/test/asan/TestCases/malloc-size-too-big.cpp
index 71d6a3e7148a3e..771640a4ac08d1 100644
--- a/compiler-rt/test/asan/TestCases/malloc-size-too-big.cpp
+++ b/compiler-rt/test/asan/TestCases/malloc-size-too-big.cpp
@@ -18,7 +18,7 @@ int main() {
   void *p = malloc(kMaxAllowedMallocSizePlusOne);
   // CHECK: {{ERROR: AddressSanitizer: requested allocation size .* \(.* after adjustments for alignment, red zones etc\.\) exceeds maximum supported size}}
   // CHECK: {{#0 0x.* in .*malloc}}
-  // CHECK: {{#1 0x.* in main .*malloc-size-too-big.cpp:}}[[@LINE-3]]
+  // CHECK: {{#[1-3] 0x.* in main .*malloc-size-too-big.cpp:}}[[@LINE-3]]
   // CHECK: SUMMARY: AddressSanitizer: allocation-size-too-big
 
   printf("malloc returned: %zu\n", (size_t)p);
diff --git a/compiler-rt/test/asan/TestCases/strncpy-overflow.cpp b/compiler-rt/test/asan/TestCases/strncpy-overflow.cpp
index d852ccebd9a778..ff84052a94987a 100644
--- a/compiler-rt/test/asan/TestCases/strncpy-overflow.cpp
+++ b/compiler-rt/test/asan/TestCases/strncpy-overflow.cpp
@@ -33,6 +33,6 @@ int main(int argc, char **argv) {
   // CHECK: {{0x.* is located 0 bytes after 9-byte region}}
   // CHECK: {{allocated by thread T0 here:}}
   // CHECK: {{    #0 0x.* in .*malloc}}
-  // CHECK: {{    #1 0x.* in main .*strncpy-overflow.cpp:}}[[@LINE-8]]
+  // CHECK: {{    #[1-3] 0x.* in main .*strncpy-overflow.cpp:}}[[@LINE-8]]
   return rval + sink_memory(9, short_buffer);
 }
diff --git a/compiler-rt/test/asan/TestCases/use-after-free-right.cpp b/compiler-rt/test/asan/TestCases/use-after-free-right.cpp
index 10dbe5b9056848..11011e4b4fb1aa 100644
--- a/compiler-rt/test/asan/TestCases/use-after-free-right.cpp
+++ b/compiler-rt/test/asan/TestCases/use-after-free-right.cpp
@@ -19,9 +19,9 @@ int main() {
   // CHECK: {{0x.* is located 0 bytes inside of 1-byte region .0x.*,0x.*}}
   // CHECK: {{freed by thread T0 here:}}
   // CHECK: {{    #0 0x.* in .*free}}
-  // CHECK: {{    #1 0x.* in main .*use-after-free-right.cpp:}}[[@LINE-9]]
+  // CHECK: {{    #[1-3] 0x.* in main .*use-after-free-right.cpp:}}[[@LINE-9]]
 
   // CHECK: {{previously allocated by thread T0 here:}}
   // CHECK: {{    #0 0x.* in .*malloc}}
-  // CHECK: {{    #1 0x.* in main .*use-after-free-right.cpp:}}[[@LINE-14]]
+  // CHECK: {{    #[1-3] 0x.* in main .*use-after-free-right.cpp:}}[[@LINE-14]]
 }
diff --git a/compiler-rt/test/asan/TestCases/use-after-free.cpp b/compiler-rt/test/asan/TestCases/use-after-free.cpp
index 4b20ee25500c06..f19c461960d36a 100644
--- a/compiler-rt/test/asan/TestCases/use-after-free.cpp
+++ b/compiler-rt/test/asan/TestCases/use-after-free.cpp
@@ -16,11 +16,11 @@ int main() {
   // CHECK: {{0x.* is located 5 bytes inside of 10-byte region .0x.*,0x.*}}
   // CHECK: {{freed by thread T0 here:}}
   // CHECK: {{    #0 0x.* in .*free}}
-  // CHECK: {{    #1 0x.* in main .*use-after-free.cpp:}}[[@LINE-9]]
+  // CHECK: {{    #[1-3] 0x.* in main .*use-after-free.cpp:}}[[@LINE-9]]
 
   // CHECK: {{previously allocated by thread T0 here:}}
   // CHECK: {{    #0 0x.* in .*malloc}}
-  // CHECK: {{    #1 0x.* in main .*use-after-free.cpp:}}[[@LINE-14]]
+  // CHECK: {{    #[1-3] 0x.* in main .*use-after-free.cpp:}}[[@LINE-14]]
   // CHECK: Shadow byte legend (one shadow byte represents {{[0-9]+}} application bytes):
   // CHECK: Global redzone:
   // CHECK: ASan internal:
diff --git a/compiler-rt/test/asan_abi/CMakeLists.txt b/compiler-rt/test/asan_abi/CMakeLists.txt
index f28cf6cd1da6ea..1114ed1b82793d 100644
--- a/compiler-rt/test/asan_abi/CMakeLists.txt
+++ b/compiler-rt/test/asan_abi/CMakeLists.txt
@@ -9,10 +9,7 @@ macro(get_bits_for_arch arch bits)
   endif()
 endmacro()
 
-set(ASAN_ABI_TEST_DEPS ${SANITIZER_COMMON_LIT_TEST_DEPS})
-if(NOT COMPILER_RT_STANDALONE_BUILD)
-  list(APPEND ASAN_ABI_TEST_DEPS asan_abi)
-endif()
+set(ASAN_ABI_TEST_DEPS ${SANITIZER_COMMON_LIT_TEST_DEPS} asan_abi)
 
 set(ASAN_ABI_TEST_ARCH ${ASAN_ABI_SUPPORTED_ARCH})
 if(APPLE)
diff --git a/compiler-rt/test/fuzzer/CMakeLists.txt b/compiler-rt/test/fuzzer/CMakeLists.txt
index f0ba087a1b3260..5bcb44757bae46 100644
--- a/compiler-rt/test/fuzzer/CMakeLists.txt
+++ b/compiler-rt/test/fuzzer/CMakeLists.txt
@@ -1,12 +1,17 @@
-set(LIBFUZZER_TEST_DEPS ${SANITIZER_COMMON_LIT_TEST_DEPS})
+set(LIBFUZZER_TEST_DEPS ${SANITIZER_COMMON_LIT_TEST_DEPS} fuzzer)
+if (COMPILER_RT_HAS_UBSAN)
+  list(APPEND LIBFUZZER_TEST_DEPS ubsan)
+endif()
+if (COMPILER_RT_HAS_ASAN)
+  list(APPEND LIBFUZZER_TEST_DEPS asan)
+endif()
+if (COMPILER_RT_HAS_MSAN)
+  list(APPEND LIBFUZZER_TEST_DEPS msan)
+endif()
+if (COMPILER_RT_HAS_DFSAN)
+  list(APPEND LIBFUZZER_TEST_DEPS dfsan)
+endif()
 if (NOT COMPILER_RT_STANDALONE_BUILD)
-  list(APPEND LIBFUZZER_TEST_DEPS fuzzer asan ubsan)
-  if (COMPILER_RT_HAS_MSAN)
-    list(APPEND LIBFUZZER_TEST_DEPS msan)
-  endif()
-  if (COMPILER_RT_HAS_DFSAN)
-    list(APPEND LIBFUZZER_TEST_DEPS dfsan)
-  endif()
   if(NOT APPLE AND COMPILER_RT_HAS_LLD AND "lld" IN_LIST LLVM_ENABLE_PROJECTS)
     list(APPEND LIBFUZZER_TEST_DEPS lld)
   endif()
diff --git a/compiler-rt/test/memprof/CMakeLists.txt b/compiler-rt/test/memprof/CMakeLists.txt
index 8a29919b177024..3f0ba3812485d2 100644
--- a/compiler-rt/test/memprof/CMakeLists.txt
+++ b/compiler-rt/test/memprof/CMakeLists.txt
@@ -11,12 +11,9 @@ macro(get_bits_for_arch arch bits)
   endif()
 endmacro()
 
-set(MEMPROF_TEST_DEPS ${SANITIZER_COMMON_LIT_TEST_DEPS})
-if(NOT COMPILER_RT_STANDALONE_BUILD)
-  list(APPEND MEMPROF_TEST_DEPS memprof)
-  if(COMPILER_RT_HAS_LLD AND TARGET lld)
-    list(APPEND MEMPROF_TEST_DEPS lld)
-  endif()
+set(MEMPROF_TEST_DEPS ${SANITIZER_COMMON_LIT_TEST_DEPS} memprof)
+if(NOT COMPILER_RT_STANDALONE_BUILD AND COMPILER_RT_HAS_LLD AND TARGET lld)
+  list(APPEND MEMPROF_TEST_DEPS lld)
 endif()
 set(MEMPROF_DYNAMIC_TEST_DEPS ${MEMPROF_TEST_DEPS})
 
diff --git a/compiler-rt/test/msan/CMakeLists.txt b/compiler-rt/test/msan/CMakeLists.txt
index 01b832b5ae3f9d..9f784507c4ee1c 100644
--- a/compiler-rt/test/msan/CMakeLists.txt
+++ b/compiler-rt/test/msan/CMakeLists.txt
@@ -1,7 +1,7 @@
 set(MSAN_LIT_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
 
 set(MSAN_TESTSUITES)
-set(MSAN_TEST_DEPS ${SANITIZER_COMMON_LIT_TEST_DEPS})
+set(MSAN_TEST_DEPS ${SANITIZER_COMMON_LIT_TEST_DEPS} msan)
 
 set(MSAN_TEST_ARCH ${MSAN_SUPPORTED_ARCH})
 if(APPLE)
@@ -41,10 +41,6 @@ foreach(arch ${MSAN_TEST_ARCH})
   endif()
 endforeach()
 
-if(NOT COMPILER_RT_STANDALONE_BUILD)
-  list(APPEND MSAN_TEST_DEPS msan)
-endif()
-
 if(COMPILER_RT_INCLUDE_TESTS AND
    COMPILER_RT_LIBCXX_PATH AND
    COMPILER_RT_LIBCXXABI_PATH)
diff --git a/compiler-rt/test/sanitizer_common/TestCases/corelimit.cpp b/compiler-rt/test/sanitizer_common/TestCases/corelimit.cpp
index fed2e1d89cbffa..9b56471905aaf4 100644
--- a/compiler-rt/test/sanitizer_common/TestCases/corelimit.cpp
+++ b/compiler-rt/test/sanitizer_common/TestCases/corelimit.cpp
@@ -10,12 +10,9 @@ int main() {
   getrlimit(RLIMIT_CORE, &lim_core);
   void *p;
   if (sizeof(p) == 8) {
-#ifdef __linux__
-    // See comments in DisableCoreDumperIfNecessary().
-    assert(lim_core.rlim_cur == 1);
-#else
-    assert(lim_core.rlim_cur == 0);
-#endif
+    // rlim_cur will be set to zero or one depending on the target OS and
+    // initial core limits. See comments in DisableCoreDumperIfNecessary().
+    assert(lim_core.rlim_cur <= 1u);
   }
   return 0;
 }
diff --git a/compiler-rt/test/tsan/signal_errno.cpp b/compiler-rt/test/tsan/signal_errno.cpp
index 7e1fd4b0c5a557..99d4b6d84ca4b0 100644
--- a/compiler-rt/test/tsan/signal_errno.cpp
+++ b/compiler-rt/test/tsan/signal_errno.cpp
@@ -18,7 +18,7 @@ static void MyHandler(int, siginfo_t *s, void *c) {
 
 static void* sendsignal(void *p) {
   barrier_wait(&barrier);
-  pthread_kill(mainth, SIGPROF);
+  pthread_kill(mainth, SIGALRM);
   return 0;
 }
 
@@ -37,7 +37,7 @@ int main() {
   mainth = pthread_self();
   struct sigaction act = {};
   act.sa_sigaction = &MyHandler;
-  sigaction(SIGPROF, &act, 0);
+  sigaction(SIGALRM, &act, 0);
   pthread_t th;
   pthread_create(&th, 0, sendsignal, 0);
   loop();
@@ -46,7 +46,7 @@ int main() {
 }
 
 // CHECK: WARNING: ThreadSanitizer: signal handler spoils errno
-// CHECK:   Signal 27 handler invoked at:
+// CHECK:   Signal 14 handler invoked at:
 // CHECK:     #0 MyHandler(int, {{(__)?}}siginfo{{(_t)?}}*, void*) {{.*}}signal_errno.cpp
 // CHECK:     main
 // CHECK: SUMMARY: ThreadSanitizer: signal handler spoils errno{{.*}}MyHandler
diff --git a/compiler-rt/test/tsan/signal_reset.cpp b/compiler-rt/test/tsan/signal_reset.cpp
index 82758d882382f6..d76b7e5f3b5f70 100644
--- a/compiler-rt/test/tsan/signal_reset.cpp
+++ b/compiler-rt/test/tsan/signal_reset.cpp
@@ -28,12 +28,12 @@ static void* reset(void *p) {
   struct sigaction act = {};
   for (int i = 0; i < 1000000; i++) {
     act.sa_handler = &handler;
-    if (sigaction(SIGPROF, &act, 0)) {
+    if (sigaction(SIGALRM, &act, 0)) {
       perror("sigaction");
       exit(1);
     }
     act.sa_handler = SIG_IGN;
-    if (sigaction(SIGPROF, &act, 0)) {
+    if (sigaction(SIGALRM, &act, 0)) {
       perror("sigaction");
       exit(1);
     }
@@ -44,7 +44,7 @@ static void* reset(void *p) {
 int main() {
   struct sigaction act = {};
   act.sa_handler = SIG_IGN;
-  if (sigaction(SIGPROF, &act, 0)) {
+  if (sigaction(SIGALRM, &act, 0)) {
     perror("sigaction");
     exit(1);
   }
@@ -53,7 +53,7 @@ int main() {
   t.it_value.tv_sec = 0;
   t.it_value.tv_usec = 10;
   t.it_interval = t.it_value;
-  if (setitimer(ITIMER_PROF, &t, 0)) {
+  if (setitimer(ITIMER_REAL, &t, 0)) {
     perror("setitimer");
     exit(1);
   }
diff --git a/compiler-rt/test/tsan/signal_sync.cpp b/compiler-rt/test/tsan/signal_sync.cpp
index b529a1859f52a6..878b3f3b88b9f0 100644
--- a/compiler-rt/test/tsan/signal_sync.cpp
+++ b/compiler-rt/test/tsan/signal_sync.cpp
@@ -30,7 +30,7 @@ int main() {
 
   struct sigaction act = {};
   act.sa_handler = &handler;
-  if (sigaction(SIGPROF, &act, 0)) {
+  if (sigaction(SIGVTALRM, &act, 0)) {
     perror("sigaction");
     exit(1);
   }
@@ -39,7 +39,7 @@ int main() {
   t.it_value.tv_sec = 0;
   t.it_value.tv_usec = 10;
   t.it_interval = t.it_value;
-  if (setitimer(ITIMER_PROF, &t, 0)) {
+  if (setitimer(ITIMER_VIRTUAL, &t, 0)) {
     perror("setitimer");
     exit(1);
   }
diff --git a/compiler-rt/test/tsan/signal_thread.cpp b/compiler-rt/test/tsan/signal_thread.cpp
index aa91d1ddeb101c..7bba8159bf38fc 100644
--- a/compiler-rt/test/tsan/signal_thread.cpp
+++ b/compiler-rt/test/tsan/signal_thread.cpp
@@ -24,7 +24,7 @@ static void* thr(void *p) {
 int main() {
   struct sigaction act = {};
   act.sa_handler = &handler;
-  if (sigaction(SIGPROF, &act, 0)) {
+  if (sigaction(SIGVTALRM, &act, 0)) {
     perror("sigaction");
     exit(1);
   }
@@ -33,7 +33,7 @@ int main() {
   t.it_value.tv_sec = 0;
   t.it_value.tv_usec = 10;
   t.it_interval = t.it_value;
-  if (setitimer(ITIMER_PROF, &t, 0)) {
+  if (setitimer(ITIMER_VIRTUAL, &t, 0)) {
     perror("setitimer");
     exit(1);
   }
diff --git a/compiler-rt/test/tsan/signal_thread2.cpp b/compiler-rt/test/tsan/signal_thread2.cpp
index 9bde4f70b39d81..5236628e13b60a 100644
--- a/compiler-rt/test/tsan/signal_thread2.cpp
+++ b/compiler-rt/test/tsan/signal_thread2.cpp
@@ -40,7 +40,7 @@ static void *thr(void *p) {
 int main() {
   struct sigaction act = {};
   act.sa_handler = &handler;
-  if (sigaction(SIGPROF, &act, 0)) {
+  if (sigaction(SIGALRM, &act, 0)) {
     perror("sigaction");
     exit(1);
   }
@@ -49,7 +49,7 @@ int main() {
   t.it_value.tv_sec = 0;
   t.it_value.tv_usec = 10;
   t.it_interval = t.it_value;
-  if (setitimer(ITIMER_PROF, &t, 0)) {
+  if (setitimer(ITIMER_REAL, &t, 0)) {
     perror("setitimer");
     exit(1);
   }
diff --git a/compiler-rt/test/ubsan/CMakeLists.txt b/compiler-rt/test/ubsan/CMakeLists.txt
index b5040f79e607ba..52052c80960c24 100644
--- a/compiler-rt/test/ubsan/CMakeLists.txt
+++ b/compiler-rt/test/ubsan/CMakeLists.txt
@@ -23,9 +23,7 @@ macro(add_ubsan_testsuite test_mode sanitizer arch lld thinlto)
     ${CMAKE_CURRENT_SOURCE_DIR}/lit.site.cfg.py.in
     ${CMAKE_CURRENT_BINARY_DIR}/${CONFIG_NAME}/lit.site.cfg.py)
   list(APPEND UBSAN_TESTSUITES ${CMAKE_CURRENT_BINARY_DIR}/${CONFIG_NAME})
-  if(NOT COMPILER_RT_STANDALONE_BUILD)
-    list(APPEND UBSAN_TEST_DEPS ${sanitizer})
-  endif()
+  list(APPEND UBSAN_TEST_DEPS ${sanitizer})
 endmacro()
 
 macro(add_ubsan_testsuites test_mode sanitizer arch)
diff --git a/compiler-rt/test/xray/CMakeLists.txt b/compiler-rt/test/xray/CMakeLists.txt
index b97659ae08f9bd..0c008b6ea5577b 100644
--- a/compiler-rt/test/xray/CMakeLists.txt
+++ b/compiler-rt/test/xray/CMakeLists.txt
@@ -3,12 +3,11 @@ set(XRAY_LIT_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
 set(XRAY_TESTSUITES)
 set(XRAY_FDR_TESTSUITES)
 
-set(XRAY_TEST_DEPS ${SANITIZER_COMMON_LIT_TEST_DEPS})
-set(XRAY_FDR_TEST_DEPS ${SANITIZER_COMMON_LIT_TEST_DEPS})
+set(XRAY_TEST_DEPS ${SANITIZER_COMMON_LIT_TEST_DEPS} xray)
 
 if(NOT COMPILER_RT_STANDALONE_BUILD AND COMPILER_RT_BUILD_XRAY AND
    COMPILER_RT_HAS_XRAY)
-  list(APPEND XRAY_TEST_DEPS xray llvm-xray)
+  list(APPEND XRAY_TEST_DEPS llvm-xray)
 endif()
 
 set(XRAY_TEST_ARCH ${XRAY_SUPPORTED_ARCH})
diff --git a/cross-project-tests/debuginfo-tests/dexter/dex/debugger/Debuggers.py b/cross-project-tests/debuginfo-tests/dexter/dex/debugger/Debuggers.py
index b251f1a0538ce3..1b0d4d5871cbeb 100644
--- a/cross-project-tests/debuginfo-tests/dexter/dex/debugger/Debuggers.py
+++ b/cross-project-tests/debuginfo-tests/dexter/dex/debugger/Debuggers.py
@@ -28,6 +28,7 @@
 from dex.debugger.visualstudio.VisualStudio2015 import VisualStudio2015
 from dex.debugger.visualstudio.VisualStudio2017 import VisualStudio2017
 from dex.debugger.visualstudio.VisualStudio2019 import VisualStudio2019
+from dex.debugger.visualstudio.VisualStudio2022 import VisualStudio2022
 
 
 def _get_potential_debuggers():  # noqa
@@ -41,6 +42,7 @@ def _get_potential_debuggers():  # noqa
         VisualStudio2015.get_option_name(): VisualStudio2015,
         VisualStudio2017.get_option_name(): VisualStudio2017,
         VisualStudio2019.get_option_name(): VisualStudio2019,
+        VisualStudio2022.get_option_name(): VisualStudio2022,
     }
 
 
diff --git a/cross-project-tests/debuginfo-tests/dexter/dex/debugger/visualstudio/VisualStudio2022.py b/cross-project-tests/debuginfo-tests/dexter/dex/debugger/visualstudio/VisualStudio2022.py
new file mode 100644
index 00000000000000..6fcf8af4acabc1
--- /dev/null
+++ b/cross-project-tests/debuginfo-tests/dexter/dex/debugger/visualstudio/VisualStudio2022.py
@@ -0,0 +1,23 @@
+# DExTer : Debugging Experience Tester
+# ~~~~~~   ~         ~~         ~   ~~
+#
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+"""Specializations for the Visual Studio 2022 interface."""
+
+from dex.debugger.visualstudio.VisualStudio import VisualStudio
+
+
+class VisualStudio2022(VisualStudio):
+    @classmethod
+    def get_name(cls):
+        return "Visual Studio 2022"
+
+    @classmethod
+    def get_option_name(cls):
+        return "vs2022"
+
+    @property
+    def _dte_version(self):
+        return "VisualStudio.DTE.17.0"
diff --git a/cross-project-tests/debuginfo-tests/dexter/feature_tests/subtools/list-debuggers/list-debuggers.test b/cross-project-tests/debuginfo-tests/dexter/feature_tests/subtools/list-debuggers/list-debuggers.test
index bbc9dd501b0049..2bce540ced1813 100644
--- a/cross-project-tests/debuginfo-tests/dexter/feature_tests/subtools/list-debuggers/list-debuggers.test
+++ b/cross-project-tests/debuginfo-tests/dexter/feature_tests/subtools/list-debuggers/list-debuggers.test
@@ -5,3 +5,5 @@ RUN: %dexter_base list-debuggers | FileCheck %s
 CHECK: lldb
 CHECK: vs2015
 CHECK: vs2017
+CHECK: vs2019
+CHECK: vs2022
diff --git a/flang/include/flang/Common/optional.h b/flang/include/flang/Common/optional.h
new file mode 100644
index 00000000000000..b5623b84dbd369
--- /dev/null
+++ b/flang/include/flang/Common/optional.h
@@ -0,0 +1,243 @@
+//===-- include/flang/Common/optional.h -------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Implementation of std::optional borrowed from LLVM's
+// libc/src/__support/CPP/optional.h with modifications (e.g. value_or, emplace
+// methods were added).
+//
+// The implementation defines optional in Fortran::common namespace.
+// This standalone implementation may be used if the target
+// does not support std::optional implementation (e.g. CUDA device env),
+// otherwise, Fortran::common::optional is an alias for std::optional.
+//
+// TODO: using libcu++ is the best option for CUDA, but there is a couple
+// of issues:
+//   * Older CUDA toolkits' libcu++ implementations do not support optional.
+//   * The include paths need to be set up such that all STD header files
+//     are taken from libcu++.
+//   * cuda:: namespace need to be forced for all std:: references.
+//
+//===----------------------------------------------------------------------===//
+#ifndef FORTRAN_COMMON_OPTIONAL_H
+#define FORTRAN_COMMON_OPTIONAL_H
+
+#include "flang/Runtime/api-attrs.h"
+#include <optional>
+#include <type_traits>
+
+#if !defined(STD_OPTIONAL_UNSUPPORTED) && \
+    (defined(__CUDACC__) || defined(__CUDA__)) && defined(__CUDA_ARCH__)
+#define STD_OPTIONAL_UNSUPPORTED 1
+#endif
+
+#define FORTRAN_OPTIONAL_INLINE_WITH_ATTRS inline RT_API_ATTRS
+#define FORTRAN_OPTIONAL_INLINE inline
+#define FORTRAN_OPTIONAL_INLINE_VAR inline
+
+namespace Fortran::common {
+
+#if STD_OPTIONAL_UNSUPPORTED
+// Trivial nullopt_t struct.
+struct nullopt_t {
+  constexpr explicit nullopt_t() = default;
+};
+
+// nullopt that can be used and returned.
+FORTRAN_OPTIONAL_INLINE_VAR constexpr nullopt_t nullopt{};
+
+// This is very simple implementation of the std::optional class. It makes
+// several assumptions that the underlying type is trivially constructible,
+// copyable, or movable.
+template <typename T> class optional {
+  template <typename U, bool = !std::is_trivially_destructible<U>::value>
+  struct OptionalStorage {
+    union {
+      char empty;
+      U stored_value;
+    };
+
+    bool in_use = false;
+
+    FORTRAN_OPTIONAL_INLINE_WITH_ATTRS ~OptionalStorage() { reset(); }
+
+    FORTRAN_OPTIONAL_INLINE_WITH_ATTRS constexpr OptionalStorage() : empty() {}
+
+    template <typename... Args>
+    FORTRAN_OPTIONAL_INLINE_WITH_ATTRS constexpr explicit OptionalStorage(
+        std::in_place_t, Args &&...args)
+        : stored_value(std::forward<Args>(args)...) {}
+
+    FORTRAN_OPTIONAL_INLINE_WITH_ATTRS constexpr void reset() {
+      if (in_use)
+        stored_value.~U();
+      in_use = false;
+    }
+  };
+
+  // The only difference is that this type U doesn't have a nontrivial
+  // destructor.
+  template <typename U> struct OptionalStorage<U, false> {
+    union {
+      char empty;
+      U stored_value;
+    };
+
+    bool in_use = false;
+
+    FORTRAN_OPTIONAL_INLINE_WITH_ATTRS constexpr OptionalStorage() : empty() {}
+
+    template <typename... Args>
+    FORTRAN_OPTIONAL_INLINE_WITH_ATTRS constexpr explicit OptionalStorage(
+        std::in_place_t, Args &&...args)
+        : stored_value(std::forward<Args>(args)...) {}
+
+    FORTRAN_OPTIONAL_INLINE_WITH_ATTRS constexpr void reset() {
+      in_use = false;
+    }
+  };
+
+  OptionalStorage<T> storage;
+
+public:
+  // The default methods do not use RT_API_ATTRS, which causes
+  // warnings in CUDA compilation of form:
+  //   __device__/__host__ annotation is ignored on a function .* that is
+  //   explicitly defaulted on its first declaration
+  FORTRAN_OPTIONAL_INLINE constexpr optional() = default;
+  FORTRAN_OPTIONAL_INLINE_WITH_ATTRS constexpr optional(nullopt_t) {}
+
+  FORTRAN_OPTIONAL_INLINE_WITH_ATTRS constexpr optional(const T &t)
+      : storage(std::in_place, t) {
+    storage.in_use = true;
+  }
+  FORTRAN_OPTIONAL_INLINE constexpr optional(const optional &) = default;
+
+  FORTRAN_OPTIONAL_INLINE_WITH_ATTRS constexpr optional(T &&t)
+      : storage(std::in_place, std::move(t)) {
+    storage.in_use = true;
+  }
+  FORTRAN_OPTIONAL_INLINE constexpr optional(optional &&O) = default;
+
+  template <typename... ArgTypes>
+  FORTRAN_OPTIONAL_INLINE_WITH_ATTRS constexpr optional(
+      std::in_place_t, ArgTypes &&...Args)
+      : storage(std::in_place, std::forward<ArgTypes>(Args)...) {
+    storage.in_use = true;
+  }
+
+  FORTRAN_OPTIONAL_INLINE_WITH_ATTRS constexpr optional &operator=(T &&t) {
+    storage.stored_value = std::move(t);
+    storage.in_use = true;
+    return *this;
+  }
+
+  FORTRAN_OPTIONAL_INLINE constexpr optional &operator=(optional &&) = default;
+
+  FORTRAN_OPTIONAL_INLINE_WITH_ATTRS constexpr optional &operator=(const T &t) {
+    storage.stored_value = t;
+    storage.in_use = true;
+    return *this;
+  }
+
+  FORTRAN_OPTIONAL_INLINE constexpr optional &operator=(
+      const optional &) = default;
+
+  FORTRAN_OPTIONAL_INLINE_WITH_ATTRS constexpr void reset() { storage.reset(); }
+
+  FORTRAN_OPTIONAL_INLINE_WITH_ATTRS constexpr const T &value() const & {
+    return storage.stored_value;
+  }
+
+  FORTRAN_OPTIONAL_INLINE_WITH_ATTRS constexpr T &value() & {
+    return storage.stored_value;
+  }
+
+  FORTRAN_OPTIONAL_INLINE_WITH_ATTRS constexpr explicit operator bool() const {
+    return storage.in_use;
+  }
+  FORTRAN_OPTIONAL_INLINE_WITH_ATTRS constexpr bool has_value() const {
+    return storage.in_use;
+  }
+  FORTRAN_OPTIONAL_INLINE_WITH_ATTRS constexpr const T *operator->() const {
+    return &storage.stored_value;
+  }
+  FORTRAN_OPTIONAL_INLINE_WITH_ATTRS constexpr T *operator->() {
+    return &storage.stored_value;
+  }
+  FORTRAN_OPTIONAL_INLINE_WITH_ATTRS constexpr const T &operator*() const & {
+    return storage.stored_value;
+  }
+  FORTRAN_OPTIONAL_INLINE_WITH_ATTRS constexpr T &operator*() & {
+    return storage.stored_value;
+  }
+
+  FORTRAN_OPTIONAL_INLINE_WITH_ATTRS constexpr T &&value() && {
+    return std::move(storage.stored_value);
+  }
+  FORTRAN_OPTIONAL_INLINE_WITH_ATTRS constexpr T &&operator*() && {
+    return std::move(storage.stored_value);
+  }
+
+  template <typename VT>
+  FORTRAN_OPTIONAL_INLINE_WITH_ATTRS constexpr T value_or(
+      VT &&default_value) const & {
+    return storage.in_use ? storage.stored_value
+                          : static_cast<T>(std::forward<VT>(default_value));
+  }
+
+  template <typename VT>
+  FORTRAN_OPTIONAL_INLINE_WITH_ATTRS constexpr T value_or(
+      VT &&default_value) && {
+    return storage.in_use ? std::move(storage.stored_value)
+                          : static_cast<T>(std::forward<VT>(default_value));
+  }
+
+  template <typename... ArgTypes>
+  FORTRAN_OPTIONAL_INLINE_WITH_ATTRS
+      std::enable_if_t<std::is_constructible_v<T, ArgTypes &&...>, T &>
+      emplace(ArgTypes &&...args) {
+    reset();
+    new (reinterpret_cast<void *>(std::addressof(storage.stored_value)))
+        T(std::forward<ArgTypes>(args)...);
+    storage.in_use = true;
+    return value();
+  }
+
+  template <typename U = T,
+      std::enable_if_t<(std::is_constructible_v<T, U &&> &&
+                           !std::is_same_v<std::decay_t<U>, std::in_place_t> &&
+                           !std::is_same_v<std::decay_t<U>, optional> &&
+                           std::is_convertible_v<U &&, T>),
+          bool> = true>
+  FORTRAN_OPTIONAL_INLINE_WITH_ATTRS constexpr optional(U &&value) {
+    new (reinterpret_cast<void *>(std::addressof(storage.stored_value)))
+        T(std::forward<U>(value));
+    storage.in_use = true;
+  }
+
+  template <typename U = T,
+      std::enable_if_t<(std::is_constructible_v<T, U &&> &&
+                           !std::is_same_v<std::decay_t<U>, std::in_place_t> &&
+                           !std::is_same_v<std::decay_t<U>, optional> &&
+                           !std::is_convertible_v<U &&, T>),
+          bool> = false>
+  FORTRAN_OPTIONAL_INLINE_WITH_ATTRS explicit constexpr optional(U &&value) {
+    new (reinterpret_cast<void *>(std::addressof(storage.stored_value)))
+        T(std::forward<U>(value));
+    storage.in_use = true;
+  }
+};
+#else // !STD_OPTIONAL_UNSUPPORTED
+using std::nullopt;
+using std::nullopt_t;
+using std::optional;
+#endif // !STD_OPTIONAL_UNSUPPORTED
+
+} // namespace Fortran::common
+
+#endif // FORTRAN_COMMON_OPTIONAL_H
diff --git a/flang/include/flang/Common/reference-wrapper.h b/flang/include/flang/Common/reference-wrapper.h
new file mode 100644
index 00000000000000..66f924662d9612
--- /dev/null
+++ b/flang/include/flang/Common/reference-wrapper.h
@@ -0,0 +1,114 @@
+//===-- include/flang/Common/reference-wrapper.h ----------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// clang-format off
+//
+// Implementation of std::reference_wrapper borrowed from libcu++
+// https://github.com/NVIDIA/libcudacxx/blob/f7e6cd07ed5ba826aeac0b742feafddfedc1e400/include/cuda/std/detail/libcxx/include/__functional/reference_wrapper.h#L1
+// with modifications.
+//
+// The original source code is distributed under the Apache License v2.0
+// with LLVM Exceptions.
+//
+// TODO: using libcu++ is the best option for CUDA, but there is a couple
+// of issues:
+//   * The include paths need to be set up such that all STD header files
+//     are taken from libcu++.
+//   * cuda:: namespace need to be forced for all std:: references.
+//
+// clang-format on
+
+#ifndef FORTRAN_COMMON_REFERENCE_WRAPPER_H
+#define FORTRAN_COMMON_REFERENCE_WRAPPER_H
+
+#include "flang/Runtime/api-attrs.h"
+#include <functional>
+#include <type_traits>
+
+#if !defined(STD_REFERENCE_WRAPPER_UNSUPPORTED) && \
+    (defined(__CUDACC__) || defined(__CUDA__)) && defined(__CUDA_ARCH__)
+#define STD_REFERENCE_WRAPPER_UNSUPPORTED 1
+#endif
+
+namespace Fortran::common {
+
+template <class _Tp>
+using __remove_cvref_t = std::remove_cv_t<std::remove_reference_t<_Tp>>;
+template <class _Tp, class _Up>
+struct __is_same_uncvref
+    : std::is_same<__remove_cvref_t<_Tp>, __remove_cvref_t<_Up>> {};
+
+#if STD_REFERENCE_WRAPPER_UNSUPPORTED
+template <class _Tp> class reference_wrapper {
+public:
+  // types
+  typedef _Tp type;
+
+private:
+  type *__f_;
+
+  static RT_API_ATTRS void __fun(_Tp &);
+  static void __fun(_Tp &&) = delete;
+
+public:
+  template <class _Up,
+      class =
+          std::enable_if_t<!__is_same_uncvref<_Up, reference_wrapper>::value,
+              decltype(__fun(std::declval<_Up>()))>>
+  constexpr RT_API_ATTRS reference_wrapper(_Up &&__u) {
+    type &__f = static_cast<_Up &&>(__u);
+    __f_ = std::addressof(__f);
+  }
+
+  // access
+  constexpr RT_API_ATTRS operator type &() const { return *__f_; }
+  constexpr RT_API_ATTRS type &get() const { return *__f_; }
+
+  // invoke
+  template <class... _ArgTypes>
+  constexpr RT_API_ATTRS typename std::invoke_result_t<type &, _ArgTypes...>
+  operator()(_ArgTypes &&...__args) const {
+    return std::invoke(get(), std::forward<_ArgTypes>(__args)...);
+  }
+};
+
+template <class _Tp> reference_wrapper(_Tp &) -> reference_wrapper<_Tp>;
+
+template <class _Tp>
+inline constexpr RT_API_ATTRS reference_wrapper<_Tp> ref(_Tp &__t) {
+  return reference_wrapper<_Tp>(__t);
+}
+
+template <class _Tp>
+inline constexpr RT_API_ATTRS reference_wrapper<_Tp> ref(
+    reference_wrapper<_Tp> __t) {
+  return __t;
+}
+
+template <class _Tp>
+inline constexpr RT_API_ATTRS reference_wrapper<const _Tp> cref(
+    const _Tp &__t) {
+  return reference_wrapper<const _Tp>(__t);
+}
+
+template <class _Tp>
+inline constexpr RT_API_ATTRS reference_wrapper<const _Tp> cref(
+    reference_wrapper<_Tp> __t) {
+  return __t;
+}
+
+template <class _Tp> void ref(const _Tp &&) = delete;
+template <class _Tp> void cref(const _Tp &&) = delete;
+#else // !STD_REFERENCE_WRAPPER_UNSUPPORTED
+using std::cref;
+using std::ref;
+using std::reference_wrapper;
+#endif // !STD_REFERENCE_WRAPPER_UNSUPPORTED
+
+} // namespace Fortran::common
+
+#endif // FORTRAN_COMMON_REFERENCE_WRAPPER_H
diff --git a/flang/include/flang/Common/visit.h b/flang/include/flang/Common/visit.h
index d3136be3f6a1f0..f733b726189c88 100644
--- a/flang/include/flang/Common/visit.h
+++ b/flang/include/flang/Common/visit.h
@@ -17,27 +17,6 @@
 //
 // Define FLANG_USE_STD_VISIT to avoid this code and make common::visit() an
 // alias for ::std::visit().
-//
-//
-// With GCC 9.3.0 on a Haswell x86 Ubuntu system, doing out-of-tree builds:
-// Before:
-//  build:
-//   6948.53user 212.48system 27:32.92elapsed 433%CPU
-//     (0avgtext+0avgdata 6429568maxresident)k
-//   36181912inputs+8943720outputs (3613684major+97908699minor)pagefaults 0swaps
-//  execution of tests:
-//   205.99user 26.05system 1:08.87elapsed 336%CPU
-//     (0avgtext+0avgdata 2671452maxresident)k
-//   244432inputs+355464outputs (422major+8746468minor)pagefaults 0swaps
-// After:
-//  build:
-//   6651.91user 182.57system 25:15.73elapsed 450%CPU
-//     (0avgtext+0avgdata 6209296maxresident)k
-//   17413480inputs+6376360outputs (1567210major+93068230minor)pagefaults 0swaps
-//  execution of tests:
-//   201.42user 25.91system 1:04.68elapsed 351%CPU
-//     (0avgtext+0avgdata 2661424maxresident)k
-//   238840inputs+295912outputs (428major+8489300minor)pagefaults 0swaps
 
 #ifndef FORTRAN_COMMON_VISIT_H_
 #define FORTRAN_COMMON_VISIT_H_
@@ -52,7 +31,22 @@ template <std::size_t LOW, std::size_t HIGH, typename RESULT, typename VISITOR,
     typename... VARIANT>
 inline RESULT Log2VisitHelper(
     VISITOR &&visitor, std::size_t which, VARIANT &&...u) {
-  if constexpr (LOW == HIGH) {
+  if constexpr (LOW + 7 >= HIGH) {
+    switch (which - LOW) {
+#define VISIT_CASE_N(N) \
+  case N: \
+    if constexpr (LOW + N <= HIGH) { \
+      return visitor(std::get<(LOW + N)>(std::forward<VARIANT>(u))...); \
+    }
+      VISIT_CASE_N(1)
+      VISIT_CASE_N(2)
+      VISIT_CASE_N(3)
+      VISIT_CASE_N(4)
+      VISIT_CASE_N(5)
+      VISIT_CASE_N(6)
+      VISIT_CASE_N(7)
+#undef VISIT_CASE_N
+    }
     return visitor(std::get<LOW>(std::forward<VARIANT>(u))...);
   } else {
     static constexpr std::size_t mid{(HIGH + LOW) / 2};
diff --git a/flang/include/flang/Evaluate/characteristics.h b/flang/include/flang/Evaluate/characteristics.h
index f2f37866ecde86..82c31c0c404301 100644
--- a/flang/include/flang/Evaluate/characteristics.h
+++ b/flang/include/flang/Evaluate/characteristics.h
@@ -177,6 +177,14 @@ class TypeAndShape {
   int corank() const { return corank_; }
 
   int Rank() const { return GetRank(shape_); }
+
+  // Can sequence association apply to this argument?
+  bool CanBeSequenceAssociated() const {
+    constexpr Attrs notAssumedOrExplicitShape{
+        ~Attrs{Attr::AssumedSize, Attr::Coarray}};
+    return Rank() > 0 && (attrs() & notAssumedOrExplicitShape).none();
+  }
+
   bool IsCompatibleWith(parser::ContextualMessages &, const TypeAndShape &that,
       const char *thisIs = "pointer", const char *thatIs = "target",
       bool omitShapeConformanceCheck = false,
diff --git a/flang/include/flang/Evaluate/real.h b/flang/include/flang/Evaluate/real.h
index d0da9634651f36..b7af0ff6b431c8 100644
--- a/flang/include/flang/Evaluate/real.h
+++ b/flang/include/flang/Evaluate/real.h
@@ -221,20 +221,33 @@ class Real : public common::RealDetails<PREC> {
     // Normalize a fraction with just its LSB set and then multiply.
     // (Set the LSB, not the MSB, in case the scale factor needs to
     //  be subnormal.)
-    auto adjust{exponentBias + binaryPrecision - 1};
+    constexpr auto adjust{exponentBias + binaryPrecision - 1};
+    constexpr auto maxCoeffExpo{maxExponent + binaryPrecision - 1};
     auto expo{adjust + by.ToInt64()};
-    Real twoPow;
     RealFlags flags;
     int rMask{1};
     if (IsZero()) {
       expo = exponentBias; // ignore by, don't overflow
-    } else if (by > INT{maxExponent}) {
-      expo = maxExponent + binaryPrecision - 1;
-    } else if (by < INT{-adjust}) { // underflow
-      expo = 0;
-      rMask = 0;
-      flags.set(RealFlag::Underflow);
+    } else if (expo > maxCoeffExpo) {
+      if (Exponent() < exponentBias) {
+        // Must implement with two multiplications
+        return SCALE(INT{exponentBias})
+            .value.SCALE(by.SubtractSigned(INT{exponentBias}).value, rounding);
+      } else { // overflow
+        expo = maxCoeffExpo;
+      }
+    } else if (expo < 0) {
+      if (Exponent() > exponentBias) {
+        // Must implement with two multiplications
+        return SCALE(INT{-exponentBias})
+            .value.SCALE(by.AddSigned(INT{exponentBias}).value, rounding);
+      } else { // underflow to zero
+        expo = 0;
+        rMask = 0;
+        flags.set(RealFlag::Underflow);
+      }
     }
+    Real twoPow;
     flags |=
         twoPow.Normalize(false, static_cast<int>(expo), Fraction::MASKR(rMask));
     ValueWithRealFlags<Real> result{Multiply(twoPow, rounding)};
diff --git a/flang/include/flang/Evaluate/tools.h b/flang/include/flang/Evaluate/tools.h
index 53896072675abc..9a32062440abc0 100644
--- a/flang/include/flang/Evaluate/tools.h
+++ b/flang/include/flang/Evaluate/tools.h
@@ -430,6 +430,28 @@ template <typename A> std::optional<CoarrayRef> ExtractCoarrayRef(const A &x) {
   }
 }
 
+struct ExtractSubstringHelper {
+  template <typename T> static std::optional<Substring> visit(T &&) {
+    return std::nullopt;
+  }
+
+  static std::optional<Substring> visit(const Substring &e) { return e; }
+
+  template <typename T>
+  static std::optional<Substring> visit(const Designator<T> &e) {
+    return std::visit([](auto &&s) { return visit(s); }, e.u);
+  }
+
+  template <typename T>
+  static std::optional<Substring> visit(const Expr<T> &e) {
+    return std::visit([](auto &&s) { return visit(s); }, e.u);
+  }
+};
+
+template <typename A> std::optional<Substring> ExtractSubstring(const A &x) {
+  return ExtractSubstringHelper::visit(x);
+}
+
 // If an expression is simply a whole symbol data designator,
 // extract and return that symbol, else null.
 template <typename A> const Symbol *UnwrapWholeSymbolDataRef(const A &x) {
@@ -1196,6 +1218,20 @@ bool CheckForCoindexedObject(parser::ContextualMessages &,
     const std::optional<ActualArgument> &, const std::string &procName,
     const std::string &argName);
 
+/// Check if any of the symbols part of the expression has a cuda data
+/// attribute.
+inline bool HasCUDAAttrs(const Expr<SomeType> &expr) {
+  for (const Symbol &sym : CollectSymbols(expr)) {
+    if (const auto *details =
+            sym.GetUltimate().detailsIf<semantics::ObjectEntityDetails>()) {
+      if (details->cudaDataAttr()) {
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
 } // namespace Fortran::evaluate
 
 namespace Fortran::semantics {
diff --git a/flang/include/flang/ISO_Fortran_binding.h b/flang/include/flang/ISO_Fortran_binding.h
index 3f74a7e56f1755..757d7f2b10cba1 100644
--- a/flang/include/flang/ISO_Fortran_binding.h
+++ b/flang/include/flang/ISO_Fortran_binding.h
@@ -127,7 +127,7 @@ namespace cfi_internal {
 // because a struct cannot be empty.
 extern "C++" template <typename T> struct FlexibleArray : T {
   RT_API_ATTRS T &operator[](int index) { return *(this + index); }
-  const RT_API_ATTRS T &operator[](int index) const { return *(this + index); }
+  RT_API_ATTRS const T &operator[](int index) const { return *(this + index); }
   RT_API_ATTRS operator T *() { return this; }
   RT_API_ATTRS operator const T *() const { return this; }
 };
diff --git a/flang/include/flang/Lower/CallInterface.h b/flang/include/flang/Lower/CallInterface.h
index e77ac4e179ba86..80b05764253778 100644
--- a/flang/include/flang/Lower/CallInterface.h
+++ b/flang/include/flang/Lower/CallInterface.h
@@ -174,6 +174,12 @@ class CallInterface {
     /// May the dummy argument require INTENT(OUT) finalization
     /// on entry to the invoked procedure? Provides conservative answer.
     bool mayRequireIntentoutFinalization() const;
+    /// Is the dummy argument an explicit-shape or assumed-size array that
+    /// must be passed by descriptor? Sequence association imply the actual
+    /// argument shape/rank may differ with the dummy shape/rank (see F'2023
+    /// section 15.5.2.12), so care is needed when creating the descriptor
+    /// for the dummy argument.
+    bool isSequenceAssociatedDescriptor() const;
     /// How entity is passed by.
     PassEntityBy passBy;
     /// What is the entity (SymbolRef for callee/ActualArgument* for caller)
@@ -273,8 +279,6 @@ class CallerInterface : public CallInterface<CallerInterface> {
     actualInputs.resize(getNumFIRArguments());
   }
 
-  using ExprVisitor = std::function<void(evaluate::Expr<evaluate::SomeType>)>;
-
   /// CRTP callbacks
   bool hasAlternateReturns() const;
   std::string getMangledName() const;
@@ -312,12 +316,21 @@ class CallerInterface : public CallInterface<CallerInterface> {
   /// procedure.
   const Fortran::semantics::Symbol *getProcedureSymbol() const;
 
+  /// Return the dummy argument symbol if this is a call to a user
+  /// defined procedure with explicit interface. Returns nullptr if there
+  /// is no user defined explicit interface.
+  const Fortran::semantics::Symbol *
+  getDummySymbol(const PassedEntity &entity) const;
+
   /// Helpers to place the lowered arguments at the right place once they
   /// have been lowered.
   void placeInput(const PassedEntity &passedEntity, mlir::Value arg);
   void placeAddressAndLengthInput(const PassedEntity &passedEntity,
                                   mlir::Value addr, mlir::Value len);
 
+  /// Get lowered FIR argument given the Fortran argument.
+  mlir::Value getInput(const PassedEntity &passedEntity);
+
   /// If this is a call to a procedure pointer or dummy, returns the related
   /// procedure designator. Nullptr otherwise.
   const Fortran::evaluate::ProcedureDesignator *getIfIndirectCall() const;
@@ -333,13 +346,27 @@ class CallerInterface : public CallInterface<CallerInterface> {
   /// the result specification expressions (extents and lengths) ? If needed,
   /// this mapping must be done after argument lowering, and before the call
   /// itself.
-  bool mustMapInterfaceSymbols() const;
+  bool mustMapInterfaceSymbolsForResult() const;
+  /// Must the caller map function interface symbols in order to evaluate
+  /// the specification expressions of a given dummy argument?
+  bool mustMapInterfaceSymbolsForDummyArgument(const PassedEntity &) const;
+
+  /// Visitor for specification expression. Boolean indicate the specification
+  /// expression is for the last extent of an assumed size array.
+  using ExprVisitor =
+      std::function<void(evaluate::Expr<evaluate::SomeType>, bool)>;
 
   /// Walk the result non-deferred extent specification expressions.
-  void walkResultExtents(ExprVisitor) const;
+  void walkResultExtents(const ExprVisitor &) const;
 
   /// Walk the result non-deferred length specification expressions.
-  void walkResultLengths(ExprVisitor) const;
+  void walkResultLengths(const ExprVisitor &) const;
+  /// Walk non-deferred extent specification expressions of a dummy argument.
+  void walkDummyArgumentExtents(const PassedEntity &,
+                                const ExprVisitor &) const;
+  /// Walk non-deferred length specification expressions of a dummy argument.
+  void walkDummyArgumentLengths(const PassedEntity &,
+                                const ExprVisitor &) const;
 
   /// Get the mlir::Value that is passed as argument \p sym of the function
   /// being called. The arguments must have been placed before calling this
@@ -355,6 +382,9 @@ class CallerInterface : public CallInterface<CallerInterface> {
   /// returns the storage type.
   mlir::Type getResultStorageType() const;
 
+  /// Return FIR type of argument.
+  mlir::Type getDummyArgumentType(const PassedEntity &) const;
+
   // Copy of base implementation.
   static constexpr bool hasHostAssociated() { return false; }
   mlir::Type getHostAssociatedTy() const {
diff --git a/flang/include/flang/Lower/ConvertVariable.h b/flang/include/flang/Lower/ConvertVariable.h
index b13bb412f0f3e7..ab30e317d1d9d4 100644
--- a/flang/include/flang/Lower/ConvertVariable.h
+++ b/flang/include/flang/Lower/ConvertVariable.h
@@ -93,9 +93,16 @@ void mapSymbolAttributes(AbstractConverter &, const semantics::SymbolRef &,
 /// Instantiate the variables that appear in the specification expressions
 /// of the result of a function call. The instantiated variables are added
 /// to \p symMap.
-void mapCallInterfaceSymbols(AbstractConverter &,
-                             const Fortran::lower::CallerInterface &caller,
-                             SymMap &symMap);
+void mapCallInterfaceSymbolsForResult(
+    AbstractConverter &, const Fortran::lower::CallerInterface &caller,
+    SymMap &symMap);
+
+/// Instantiate the variables that appear in the specification expressions
+/// of a dummy argument of a procedure call. The instantiated variables are
+/// added to \p symMap.
+void mapCallInterfaceSymbolsForDummyArgument(
+    AbstractConverter &, const Fortran::lower::CallerInterface &caller,
+    SymMap &symMap, const Fortran::semantics::Symbol &dummySymbol);
 
 // TODO: consider saving the initial expression symbol dependence analysis in
 // in the PFT variable and dealing with the dependent symbols instantiation in
diff --git a/flang/include/flang/Lower/LoweringOptions.def b/flang/include/flang/Lower/LoweringOptions.def
index 9de69ac5c80f52..be080a4d29d73d 100644
--- a/flang/include/flang/Lower/LoweringOptions.def
+++ b/flang/include/flang/Lower/LoweringOptions.def
@@ -24,9 +24,6 @@ LOWERINGOPT(Name, Bits, Default)
 /// If true, lower transpose without a runtime call.
 ENUM_LOWERINGOPT(OptimizeTranspose, unsigned, 1, 1)
 
-/// If true, enable polymorphic type lowering feature. On by default.
-ENUM_LOWERINGOPT(PolymorphicTypeImpl, unsigned, 1, 1)
-
 /// If true, lower to High level FIR before lowering to FIR. On by default.
 ENUM_LOWERINGOPT(LowerToHighLevelFIR, unsigned, 1, 1)
 
diff --git a/flang/include/flang/Optimizer/Builder/HLFIRTools.h b/flang/include/flang/Optimizer/Builder/HLFIRTools.h
index ce87941d5382ca..035035601e2f25 100644
--- a/flang/include/flang/Optimizer/Builder/HLFIRTools.h
+++ b/flang/include/flang/Optimizer/Builder/HLFIRTools.h
@@ -434,6 +434,11 @@ std::pair<hlfir::Entity, mlir::Value>
 createTempFromMold(mlir::Location loc, fir::FirOpBuilder &builder,
                    hlfir::Entity mold);
 
+// TODO: this does not support polymorphic molds
+hlfir::Entity createStackTempFromMold(mlir::Location loc,
+                                      fir::FirOpBuilder &builder,
+                                      hlfir::Entity mold);
+
 hlfir::EntityWithAttributes convertCharacterKind(mlir::Location loc,
                                                  fir::FirOpBuilder &builder,
                                                  hlfir::Entity scalarChar,
diff --git a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h
index ca15b4bc34b29e..6927488517e63b 100644
--- a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h
+++ b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h
@@ -208,6 +208,9 @@ struct IntrinsicLibrary {
   void genCFProcPointer(llvm::ArrayRef<fir::ExtendedValue>);
   fir::ExtendedValue genCFunLoc(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>);
   fir::ExtendedValue genCLoc(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>);
+  template <mlir::arith::CmpIPredicate pred>
+  fir::ExtendedValue genCPtrCompare(mlir::Type,
+                                    llvm::ArrayRef<fir::ExtendedValue>);
   mlir::Value genCosd(mlir::Type, llvm::ArrayRef<mlir::Value>);
   void genDateAndTime(llvm::ArrayRef<fir::ExtendedValue>);
   mlir::Value genDim(mlir::Type, llvm::ArrayRef<mlir::Value>);
diff --git a/flang/include/flang/Optimizer/Builder/Runtime/Numeric.h b/flang/include/flang/Optimizer/Builder/Runtime/Numeric.h
index c1a72478e224c7..fec8c9906effed 100644
--- a/flang/include/flang/Optimizer/Builder/Runtime/Numeric.h
+++ b/flang/include/flang/Optimizer/Builder/Runtime/Numeric.h
@@ -30,6 +30,10 @@ mlir::Value genFraction(fir::FirOpBuilder &builder, mlir::Location loc,
 mlir::Value genMod(fir::FirOpBuilder &builder, mlir::Location loc,
                    mlir::Value a, mlir::Value p);
 
+/// Generate call to Modulo intrinsic runtime routine.
+mlir::Value genModulo(fir::FirOpBuilder &builder, mlir::Location loc,
+                      mlir::Value a, mlir::Value p);
+
 /// Generate call to Nearest intrinsic runtime routine.
 mlir::Value genNearest(fir::FirOpBuilder &builder, mlir::Location loc,
                        mlir::Value x, mlir::Value s);
diff --git a/flang/include/flang/Optimizer/CodeGen/CodeGen.h b/flang/include/flang/Optimizer/CodeGen/CodeGen.h
index cbf02ec3912363..26097dabf56c45 100644
--- a/flang/include/flang/Optimizer/CodeGen/CodeGen.h
+++ b/flang/include/flang/Optimizer/CodeGen/CodeGen.h
@@ -87,6 +87,9 @@ void populateFIRToLLVMConversionPatterns(fir::LLVMTypeConverter &converter,
                                          mlir::RewritePatternSet &patterns,
                                          fir::FIRToLLVMPassOptions &options);
 
+/// Populate the pattern set with the PreCGRewrite patterns.
+void populatePreCGRewritePatterns(mlir::RewritePatternSet &patterns);
+
 // declarative passes
 #define GEN_PASS_REGISTRATION
 #include "flang/Optimizer/CodeGen/CGPasses.h.inc"
diff --git a/flang/include/flang/Optimizer/Dialect/FIROps.td b/flang/include/flang/Optimizer/Dialect/FIROps.td
index 65a86d25333b5d..8a7e36e42457f5 100644
--- a/flang/include/flang/Optimizer/Dialect/FIROps.td
+++ b/flang/include/flang/Optimizer/Dialect/FIROps.td
@@ -2454,6 +2454,7 @@ def fir_CUDAKernelLaunch : fir_Op<"cuda_kernel_launch", [CallOpInterface,
     SymbolRefAttr:$callee,
     I32:$grid_x,
     I32:$grid_y,
+    I32:$grid_z,
     I32:$block_x,
     I32:$block_y,
     I32:$block_z,
@@ -2463,9 +2464,9 @@ def fir_CUDAKernelLaunch : fir_Op<"cuda_kernel_launch", [CallOpInterface,
   );
 
   let assemblyFormat = [{
-    $callee `<` `<` `<` $grid_x `,` $grid_y `,` $block_x `,` $block_y `,`
-        $block_z ( `,` $bytes^ ( `,` $stream^ )? )? `>` `>` `>`
-        `` `(` ( $args^ `:` type($args) )? `)` attr-dict
+    $callee `<` `<` `<` $grid_x `,` $grid_y `,` $grid_z `,`$block_x `,`
+        $block_y `,` $block_z ( `,` $bytes^ ( `,` $stream^ )? )? `>` `>` `>`
+        `` `(` $args `)` `:` `(` type($args) `)` attr-dict
   }];
 
   let extraClassDeclaration = [{
@@ -3130,6 +3131,16 @@ def fir_BoxOffsetOp : fir_Op<"box_offset", [NoMemoryEffect]> {
 def fir_CUDAKernelOp : fir_Op<"cuda_kernel", [AttrSizedOperandSegments,
     DeclareOpInterfaceMethods<LoopLikeOpInterface>]> {
 
+  let description = [{
+    Represent the CUDA Fortran kernel directive. The operation is a loop like
+    operation that represents the iteration range of the embedded loop nest.
+
+    When grid or block variadic operands are empty, a `*` only syntax was used
+    in the Fortran code.
+    If the `*` is mixed with values for either grid or block, these are
+    represented by a 0 constant value.
+  }];
+
   let arguments = (ins
     Variadic<I32>:$grid, // empty means `*`
     Variadic<I32>:$block, // empty means `*`
diff --git a/flang/include/flang/Optimizer/Dialect/FIRType.h b/flang/include/flang/Optimizer/Dialect/FIRType.h
index a526b4ddf3b98c..7fcd9c1babf24f 100644
--- a/flang/include/flang/Optimizer/Dialect/FIRType.h
+++ b/flang/include/flang/Optimizer/Dialect/FIRType.h
@@ -237,6 +237,9 @@ inline mlir::Type unwrapSequenceType(mlir::Type t) {
   return t;
 }
 
+/// Return the nested sequence type if any.
+mlir::Type extractSequenceType(mlir::Type ty);
+
 inline mlir::Type unwrapRefType(mlir::Type t) {
   if (auto eleTy = dyn_cast_ptrEleTy(t))
     return eleTy;
diff --git a/flang/include/flang/Optimizer/Transforms/Passes.h b/flang/include/flang/Optimizer/Transforms/Passes.h
index e1d22c8c986da7..a60d39c8df08d8 100644
--- a/flang/include/flang/Optimizer/Transforms/Passes.h
+++ b/flang/include/flang/Optimizer/Transforms/Passes.h
@@ -11,6 +11,7 @@
 
 #include "flang/Optimizer/Dialect/FIROps.h"
 #include "mlir/Dialect/LLVMIR/LLVMAttrs.h"
+#include "mlir/Dialect/OpenMP/OpenMPDialect.h"
 #include "mlir/Pass/Pass.h"
 #include "mlir/Pass/PassRegistry.h"
 #include <memory>
@@ -37,7 +38,8 @@ namespace fir {
 #define GEN_PASS_DECL_ANNOTATECONSTANTOPERANDS
 #define GEN_PASS_DECL_ARRAYVALUECOPY
 #define GEN_PASS_DECL_CHARACTERCONVERSION
-#define GEN_PASS_DECL_CFGCONVERSION
+#define GEN_PASS_DECL_CFGCONVERSIONONFUNC
+#define GEN_PASS_DECL_CFGCONVERSIONONREDUCTION
 #define GEN_PASS_DECL_EXTERNALNAMECONVERSION
 #define GEN_PASS_DECL_MEMREFDATAFLOWOPT
 #define GEN_PASS_DECL_SIMPLIFYINTRINSICS
@@ -53,7 +55,8 @@ std::unique_ptr<mlir::Pass> createAbstractResultOnGlobalOptPass();
 std::unique_ptr<mlir::Pass> createAffineDemotionPass();
 std::unique_ptr<mlir::Pass>
 createArrayValueCopyPass(fir::ArrayValueCopyOptions options = {});
-std::unique_ptr<mlir::Pass> createFirToCfgPass();
+std::unique_ptr<mlir::Pass> createFirToCfgOnFuncPass();
+std::unique_ptr<mlir::Pass> createFirToCfgOnReductionPass();
 std::unique_ptr<mlir::Pass> createCharacterConversionPass();
 std::unique_ptr<mlir::Pass> createExternalNameConversionPass();
 std::unique_ptr<mlir::Pass>
@@ -96,6 +99,9 @@ createFunctionAttrPass(FunctionAttrTypes &functionAttr, bool noInfsFPMath,
                        bool noNaNsFPMath, bool approxFuncFPMath,
                        bool noSignedZerosFPMath, bool unsafeFPMath);
 
+void populateCfgConversionRewrites(mlir::RewritePatternSet &patterns,
+                                   bool forceLoopToExecuteOnce = false);
+
 // declarative passes
 #define GEN_PASS_REGISTRATION
 #include "flang/Optimizer/Transforms/Passes.h.inc"
diff --git a/flang/include/flang/Optimizer/Transforms/Passes.td b/flang/include/flang/Optimizer/Transforms/Passes.td
index 5fb576fd876254..e6ea92d814400f 100644
--- a/flang/include/flang/Optimizer/Transforms/Passes.td
+++ b/flang/include/flang/Optimizer/Transforms/Passes.td
@@ -145,7 +145,8 @@ def CharacterConversion : Pass<"character-conversion"> {
   ];
 }
 
-def CFGConversion : Pass<"cfg-conversion", "::mlir::func::FuncOp"> {
+class CFGConversionBase<string optExt, string operation>
+  : Pass<"cfg-conversion-on-" # optExt # "-opt", operation> {
   let summary = "Convert FIR structured control flow ops to CFG ops.";
   let description = [{
     Transform the `fir.do_loop`, `fir.if`, `fir.iterate_while` and
@@ -154,7 +155,6 @@ def CFGConversion : Pass<"cfg-conversion", "::mlir::func::FuncOp"> {
 
     This pass is required before code gen to the LLVM IR dialect.
   }];
-  let constructor = "::fir::createFirToCfgPass()";
   let dependentDialects = [
     "fir::FIROpsDialect", "mlir::func::FuncDialect"
   ];
@@ -165,6 +165,14 @@ def CFGConversion : Pass<"cfg-conversion", "::mlir::func::FuncOp"> {
   ];
 }
 
+def CFGConversionOnFunc : CFGConversionBase<"func", "mlir::func::FuncOp"> {
+  let constructor = "::fir::createFirToCfgOnFuncPass()";
+}
+
+def CFGConversionOnReduction : CFGConversionBase<"reduce", "mlir::omp::ReductionDeclareOp"> {
+  let constructor = "::fir::createFirToCfgOnReductionPass()";
+}
+
 def ExternalNameConversion : Pass<"external-name-interop", "mlir::ModuleOp"> {
   let summary = "Convert name for external interoperability";
   let description = [{
diff --git a/flang/include/flang/Parser/dump-parse-tree.h b/flang/include/flang/Parser/dump-parse-tree.h
index 048008a8d80c79..b2c3d92909375c 100644
--- a/flang/include/flang/Parser/dump-parse-tree.h
+++ b/flang/include/flang/Parser/dump-parse-tree.h
@@ -233,6 +233,7 @@ class ParseTreeDumper {
   NODE(parser, CriticalStmt)
   NODE(parser, CUDAAttributesStmt)
   NODE(parser, CUFKernelDoConstruct)
+  NODE(CUFKernelDoConstruct, StarOrExpr)
   NODE(CUFKernelDoConstruct, Directive)
   NODE(parser, CycleStmt)
   NODE(parser, DataComponentDefStmt)
diff --git a/flang/include/flang/Parser/parse-tree.h b/flang/include/flang/Parser/parse-tree.h
index f7b72c3af09164..c96abfba491d4b 100644
--- a/flang/include/flang/Parser/parse-tree.h
+++ b/flang/include/flang/Parser/parse-tree.h
@@ -4297,16 +4297,18 @@ struct OpenACCConstruct {
 // CUF-kernel-do-construct ->
 //     !$CUF KERNEL DO [ (scalar-int-constant-expr) ] <<< grid, block [, stream]
 //     >>> do-construct
-// grid -> * | scalar-int-expr | ( scalar-int-expr-list )
-// block -> * | scalar-int-expr | ( scalar-int-expr-list )
+// star-or-expr -> * | scalar-int-expr
+// grid -> * | scalar-int-expr | ( star-or-expr-list )
+// block -> * | scalar-int-expr | ( star-or-expr-list )
 // stream -> 0, scalar-int-expr | STREAM = scalar-int-expr
 struct CUFKernelDoConstruct {
   TUPLE_CLASS_BOILERPLATE(CUFKernelDoConstruct);
+  WRAPPER_CLASS(StarOrExpr, std::optional<ScalarIntExpr>);
   struct Directive {
     TUPLE_CLASS_BOILERPLATE(Directive);
     CharBlock source;
-    std::tuple<std::optional<ScalarIntConstantExpr>, std::list<ScalarIntExpr>,
-        std::list<ScalarIntExpr>, std::optional<ScalarIntExpr>>
+    std::tuple<std::optional<ScalarIntConstantExpr>, std::list<StarOrExpr>,
+        std::list<StarOrExpr>, std::optional<ScalarIntExpr>>
         t;
   };
   std::tuple<Directive, std::optional<DoConstruct>> t;
diff --git a/flang/include/flang/Runtime/descriptor.h b/flang/include/flang/Runtime/descriptor.h
index 7ad548d6c72b44..96d56d9b43a62b 100644
--- a/flang/include/flang/Runtime/descriptor.h
+++ b/flang/include/flang/Runtime/descriptor.h
@@ -102,7 +102,7 @@ class DescriptorAddendum {
       : derivedType_{dt}, len_{0} {}
   RT_API_ATTRS DescriptorAddendum &operator=(const DescriptorAddendum &);
 
-  const RT_API_ATTRS typeInfo::DerivedType *derivedType() const {
+  RT_API_ATTRS const typeInfo::DerivedType *derivedType() const {
     return derivedType_;
   }
   RT_API_ATTRS DescriptorAddendum &set_derivedType(
@@ -204,7 +204,7 @@ class Descriptor {
       ISO::CFI_attribute_t attribute = CFI_attribute_other);
 
   RT_API_ATTRS ISO::CFI_cdesc_t &raw() { return raw_; }
-  const RT_API_ATTRS ISO::CFI_cdesc_t &raw() const { return raw_; }
+  RT_API_ATTRS const ISO::CFI_cdesc_t &raw() const { return raw_; }
   RT_API_ATTRS std::size_t ElementBytes() const { return raw_.elem_len; }
   RT_API_ATTRS int rank() const { return raw_.rank; }
   RT_API_ATTRS TypeCode type() const { return TypeCode{raw_.type}; }
@@ -225,7 +225,7 @@ class Descriptor {
   RT_API_ATTRS Dimension &GetDimension(int dim) {
     return *reinterpret_cast<Dimension *>(&raw_.dim[dim]);
   }
-  const RT_API_ATTRS Dimension &GetDimension(int dim) const {
+  RT_API_ATTRS const Dimension &GetDimension(int dim) const {
     return *reinterpret_cast<const Dimension *>(&raw_.dim[dim]);
   }
 
@@ -345,7 +345,7 @@ class Descriptor {
       return nullptr;
     }
   }
-  const RT_API_ATTRS DescriptorAddendum *Addendum() const {
+  RT_API_ATTRS const DescriptorAddendum *Addendum() const {
     if (raw_.f18Addendum != 0) {
       return reinterpret_cast<const DescriptorAddendum *>(
           &GetDimension(rank()));
@@ -448,7 +448,7 @@ class alignas(Descriptor) StaticDescriptor {
   RT_API_ATTRS Descriptor &descriptor() {
     return *reinterpret_cast<Descriptor *>(storage_);
   }
-  const RT_API_ATTRS Descriptor &descriptor() const {
+  RT_API_ATTRS const Descriptor &descriptor() const {
     return *reinterpret_cast<const Descriptor *>(storage_);
   }
 
diff --git a/flang/include/flang/Runtime/type-code.h b/flang/include/flang/Runtime/type-code.h
index 3757840cfdef96..f7419249c2ba9c 100644
--- a/flang/include/flang/Runtime/type-code.h
+++ b/flang/include/flang/Runtime/type-code.h
@@ -10,6 +10,7 @@
 #define FORTRAN_RUNTIME_TYPE_CODE_H_
 
 #include "flang/Common/Fortran.h"
+#include "flang/Common/optional.h"
 #include "flang/ISO_Fortran_binding_wrapper.h"
 #include <optional>
 #include <utility>
@@ -54,7 +55,7 @@ class TypeCode {
     return IsValid() && !IsDerived();
   }
 
-  RT_API_ATTRS std::optional<std::pair<TypeCategory, int>>
+  RT_API_ATTRS Fortran::common::optional<std::pair<TypeCategory, int>>
   GetCategoryAndKind() const;
 
   RT_API_ATTRS bool operator==(TypeCode that) const {
diff --git a/flang/include/flang/Semantics/openmp-directive-sets.h b/flang/include/flang/Semantics/openmp-directive-sets.h
index a4f27b00152e26..91773ae3ea9a3e 100644
--- a/flang/include/flang/Semantics/openmp-directive-sets.h
+++ b/flang/include/flang/Semantics/openmp-directive-sets.h
@@ -20,32 +20,27 @@ namespace llvm::omp {
 // Directive sets for single directives
 //===----------------------------------------------------------------------===//
 // - top<Directive>Set: The directive appears alone or as the first in a
-//   combined construct.
-// - all<Directive>Set: All standalone or combined uses of the directive.
+//   compound construct.
+// - all<Directive>Set: All standalone or compound uses of the directive.
 
-static const OmpDirectiveSet topParallelSet{
-    Directive::OMPD_parallel,
-    Directive::OMPD_parallel_do,
-    Directive::OMPD_parallel_do_simd,
-    Directive::OMPD_parallel_sections,
-    Directive::OMPD_parallel_workshare,
-};
-
-static const OmpDirectiveSet allParallelSet{
+static const OmpDirectiveSet topDistributeSet{
+    Directive::OMPD_distribute,
     Directive::OMPD_distribute_parallel_do,
     Directive::OMPD_distribute_parallel_do_simd,
-    Directive::OMPD_parallel,
-    Directive::OMPD_parallel_do,
-    Directive::OMPD_parallel_do_simd,
-    Directive::OMPD_parallel_sections,
-    Directive::OMPD_parallel_workshare,
-    Directive::OMPD_target_parallel,
-    Directive::OMPD_target_parallel_do,
-    Directive::OMPD_target_parallel_do_simd,
-    Directive::OMPD_target_teams_distribute_parallel_do,
-    Directive::OMPD_target_teams_distribute_parallel_do_simd,
-    Directive::OMPD_teams_distribute_parallel_do,
-    Directive::OMPD_teams_distribute_parallel_do_simd,
+    Directive::OMPD_distribute_simd,
+};
+
+static const OmpDirectiveSet allDistributeSet{
+    OmpDirectiveSet{
+        llvm::omp::OMPD_target_teams_distribute,
+        llvm::omp::OMPD_target_teams_distribute_parallel_do,
+        llvm::omp::OMPD_target_teams_distribute_parallel_do_simd,
+        llvm::omp::OMPD_target_teams_distribute_simd,
+        llvm::omp::OMPD_teams_distribute,
+        llvm::omp::OMPD_teams_distribute_parallel_do,
+        llvm::omp::OMPD_teams_distribute_parallel_do_simd,
+        llvm::omp::OMPD_teams_distribute_simd,
+    } | topDistributeSet,
 };
 
 static const OmpDirectiveSet topDoSet{
@@ -54,26 +49,69 @@ static const OmpDirectiveSet topDoSet{
 };
 
 static const OmpDirectiveSet allDoSet{
-    Directive::OMPD_distribute_parallel_do,
-    Directive::OMPD_distribute_parallel_do_simd,
+    OmpDirectiveSet{
+        Directive::OMPD_distribute_parallel_do,
+        Directive::OMPD_distribute_parallel_do_simd,
+        Directive::OMPD_parallel_do,
+        Directive::OMPD_parallel_do_simd,
+        Directive::OMPD_target_parallel_do,
+        Directive::OMPD_target_parallel_do_simd,
+        Directive::OMPD_target_teams_distribute_parallel_do,
+        Directive::OMPD_target_teams_distribute_parallel_do_simd,
+        Directive::OMPD_teams_distribute_parallel_do,
+        Directive::OMPD_teams_distribute_parallel_do_simd,
+    } | topDoSet,
+};
+
+static const OmpDirectiveSet topParallelSet{
+    Directive::OMPD_parallel,
     Directive::OMPD_parallel_do,
     Directive::OMPD_parallel_do_simd,
-    Directive::OMPD_do,
-    Directive::OMPD_do_simd,
-    Directive::OMPD_target_parallel_do,
-    Directive::OMPD_target_parallel_do_simd,
-    Directive::OMPD_target_teams_distribute_parallel_do,
-    Directive::OMPD_target_teams_distribute_parallel_do_simd,
-    Directive::OMPD_teams_distribute_parallel_do,
-    Directive::OMPD_teams_distribute_parallel_do_simd,
+    Directive::OMPD_parallel_masked_taskloop,
+    Directive::OMPD_parallel_masked_taskloop_simd,
+    Directive::OMPD_parallel_master_taskloop,
+    Directive::OMPD_parallel_master_taskloop_simd,
+    Directive::OMPD_parallel_sections,
+    Directive::OMPD_parallel_workshare,
 };
 
-static const OmpDirectiveSet topTaskloopSet{
-    Directive::OMPD_taskloop,
-    Directive::OMPD_taskloop_simd,
+static const OmpDirectiveSet allParallelSet{
+    OmpDirectiveSet{
+        Directive::OMPD_distribute_parallel_do,
+        Directive::OMPD_distribute_parallel_do_simd,
+        Directive::OMPD_target_parallel,
+        Directive::OMPD_target_parallel_do,
+        Directive::OMPD_target_parallel_do_simd,
+        Directive::OMPD_target_teams_distribute_parallel_do,
+        Directive::OMPD_target_teams_distribute_parallel_do_simd,
+        Directive::OMPD_teams_distribute_parallel_do,
+        Directive::OMPD_teams_distribute_parallel_do_simd,
+    } | topParallelSet,
+};
+
+static const OmpDirectiveSet topSimdSet{
+    Directive::OMPD_simd,
 };
 
-static const OmpDirectiveSet allTaskloopSet{topTaskloopSet};
+static const OmpDirectiveSet allSimdSet{
+    OmpDirectiveSet{
+        Directive::OMPD_distribute_parallel_do_simd,
+        Directive::OMPD_distribute_simd,
+        Directive::OMPD_do_simd,
+        Directive::OMPD_masked_taskloop_simd,
+        Directive::OMPD_master_taskloop_simd,
+        Directive::OMPD_parallel_do_simd,
+        Directive::OMPD_parallel_masked_taskloop_simd,
+        Directive::OMPD_parallel_master_taskloop_simd,
+        Directive::OMPD_target_parallel_do_simd,
+        Directive::OMPD_target_simd,
+        Directive::OMPD_target_teams_distribute_parallel_do_simd,
+        Directive::OMPD_target_teams_distribute_simd,
+        Directive::OMPD_taskloop_simd,
+        Directive::OMPD_teams_distribute_parallel_do_simd,
+        Directive::OMPD_teams_distribute_simd,
+    } | topSimdSet,
+};
 
 static const OmpDirectiveSet topTargetSet{
     Directive::OMPD_target,
@@ -90,23 +128,22 @@ static const OmpDirectiveSet topTargetSet{
 
 static const OmpDirectiveSet allTargetSet{topTargetSet};
 
-static const OmpDirectiveSet topSimdSet{
-    Directive::OMPD_simd,
+static const OmpDirectiveSet topTaskloopSet{
+    Directive::OMPD_taskloop,
+    Directive::OMPD_taskloop_simd,
 };
 
-static const OmpDirectiveSet allSimdSet{
-    Directive::OMPD_distribute_parallel_do_simd,
-    Directive::OMPD_distribute_simd,
-    Directive::OMPD_do_simd,
-    Directive::OMPD_parallel_do_simd,
-    Directive::OMPD_simd,
-    Directive::OMPD_target_parallel_do_simd,
-    Directive::OMPD_target_simd,
-    Directive::OMPD_target_teams_distribute_parallel_do_simd,
-    Directive::OMPD_target_teams_distribute_simd,
-    Directive::OMPD_taskloop_simd,
-    Directive::OMPD_teams_distribute_parallel_do_simd,
-    Directive::OMPD_teams_distribute_simd,
+static const OmpDirectiveSet allTaskloopSet{
+    OmpDirectiveSet{
+        Directive::OMPD_masked_taskloop,
+        Directive::OMPD_masked_taskloop_simd,
+        Directive::OMPD_master_taskloop,
+        Directive::OMPD_master_taskloop_simd,
+        Directive::OMPD_parallel_masked_taskloop,
+        Directive::OMPD_parallel_masked_taskloop_simd,
+        Directive::OMPD_parallel_master_taskloop,
+        Directive::OMPD_parallel_master_taskloop_simd,
+    } | topTaskloopSet,
 };
 
 static const OmpDirectiveSet topTeamsSet{
@@ -118,178 +155,134 @@ static const OmpDirectiveSet topTeamsSet{
 };
 
 static const OmpDirectiveSet allTeamsSet{
-    llvm::omp::OMPD_target_teams,
-    llvm::omp::OMPD_target_teams_distribute,
-    llvm::omp::OMPD_target_teams_distribute_parallel_do,
-    llvm::omp::OMPD_target_teams_distribute_parallel_do_simd,
-    llvm::omp::OMPD_target_teams_distribute_simd,
-    llvm::omp::OMPD_teams,
-    llvm::omp::OMPD_teams_distribute,
-    llvm::omp::OMPD_teams_distribute_parallel_do,
-    llvm::omp::OMPD_teams_distribute_parallel_do_simd,
-    llvm::omp::OMPD_teams_distribute_simd,
-};
-
-static const OmpDirectiveSet topDistributeSet{
-    Directive::OMPD_distribute,
-    Directive::OMPD_distribute_parallel_do,
-    Directive::OMPD_distribute_parallel_do_simd,
-    Directive::OMPD_distribute_simd,
-};
-
-static const OmpDirectiveSet allDistributeSet{
-    llvm::omp::OMPD_distribute,
-    llvm::omp::OMPD_distribute_parallel_do,
-    llvm::omp::OMPD_distribute_parallel_do_simd,
-    llvm::omp::OMPD_distribute_simd,
-    llvm::omp::OMPD_target_teams_distribute,
-    llvm::omp::OMPD_target_teams_distribute_parallel_do,
-    llvm::omp::OMPD_target_teams_distribute_parallel_do_simd,
-    llvm::omp::OMPD_target_teams_distribute_simd,
-    llvm::omp::OMPD_teams_distribute,
-    llvm::omp::OMPD_teams_distribute_parallel_do,
-    llvm::omp::OMPD_teams_distribute_parallel_do_simd,
-    llvm::omp::OMPD_teams_distribute_simd,
+    OmpDirectiveSet{
+        llvm::omp::OMPD_target_teams,
+        llvm::omp::OMPD_target_teams_distribute,
+        llvm::omp::OMPD_target_teams_distribute_parallel_do,
+        llvm::omp::OMPD_target_teams_distribute_parallel_do_simd,
+        llvm::omp::OMPD_target_teams_distribute_simd,
+    } | topTeamsSet,
 };
 
 //===----------------------------------------------------------------------===//
 // Directive sets for groups of multiple directives
 //===----------------------------------------------------------------------===//
 
+// Composite constructs
+static const OmpDirectiveSet allDistributeParallelDoSet{
+    allDistributeSet & allParallelSet & allDoSet};
+static const OmpDirectiveSet allDistributeParallelDoSimdSet{
+    allDistributeSet & allParallelSet & allDoSet & allSimdSet};
+static const OmpDirectiveSet allDistributeSimdSet{
+    allDistributeSet & allSimdSet};
 static const OmpDirectiveSet allDoSimdSet{allDoSet & allSimdSet};
+static const OmpDirectiveSet allTaskloopSimdSet{allTaskloopSet & allSimdSet};
 
-static const OmpDirectiveSet workShareSet{
-    OmpDirectiveSet{
-        Directive::OMPD_workshare,
-        Directive::OMPD_parallel_workshare,
-        Directive::OMPD_parallel_sections,
-        Directive::OMPD_sections,
-        Directive::OMPD_single,
-    } | allDoSet,
-};
-
-static const OmpDirectiveSet taskGeneratingSet{
-    OmpDirectiveSet{
-        Directive::OMPD_task,
-    } | allTaskloopSet,
-};
-
-static const OmpDirectiveSet nonPartialVarSet{
-    Directive::OMPD_allocate,
-    Directive::OMPD_allocators,
-    Directive::OMPD_threadprivate,
-    Directive::OMPD_declare_target,
+static const OmpDirectiveSet blockConstructSet{
+    Directive::OMPD_master,
+    Directive::OMPD_ordered,
+    Directive::OMPD_parallel,
+    Directive::OMPD_parallel_workshare,
+    Directive::OMPD_single,
+    Directive::OMPD_target,
+    Directive::OMPD_target_data,
+    Directive::OMPD_target_parallel,
+    Directive::OMPD_target_teams,
+    Directive::OMPD_task,
+    Directive::OMPD_taskgroup,
+    Directive::OMPD_teams,
+    Directive::OMPD_workshare,
 };
 
 static const OmpDirectiveSet loopConstructSet{
-    Directive::OMPD_distribute_parallel_do_simd,
+    Directive::OMPD_distribute,
     Directive::OMPD_distribute_parallel_do,
+    Directive::OMPD_distribute_parallel_do_simd,
     Directive::OMPD_distribute_simd,
-    Directive::OMPD_distribute,
-    Directive::OMPD_do_simd,
     Directive::OMPD_do,
-    Directive::OMPD_parallel_do_simd,
+    Directive::OMPD_do_simd,
+    Directive::OMPD_masked_taskloop,
+    Directive::OMPD_masked_taskloop_simd,
+    Directive::OMPD_master_taskloop,
+    Directive::OMPD_master_taskloop_simd,
     Directive::OMPD_parallel_do,
+    Directive::OMPD_parallel_do_simd,
+    Directive::OMPD_parallel_masked_taskloop,
+    Directive::OMPD_parallel_masked_taskloop_simd,
+    Directive::OMPD_parallel_master_taskloop,
+    Directive::OMPD_parallel_master_taskloop_simd,
     Directive::OMPD_simd,
-    Directive::OMPD_target_parallel_do_simd,
     Directive::OMPD_target_parallel_do,
+    Directive::OMPD_target_parallel_do_simd,
     Directive::OMPD_target_simd,
-    Directive::OMPD_target_teams_distribute_parallel_do_simd,
+    Directive::OMPD_target_teams_distribute,
     Directive::OMPD_target_teams_distribute_parallel_do,
+    Directive::OMPD_target_teams_distribute_parallel_do_simd,
     Directive::OMPD_target_teams_distribute_simd,
-    Directive::OMPD_target_teams_distribute,
-    Directive::OMPD_taskloop_simd,
     Directive::OMPD_taskloop,
-    Directive::OMPD_teams_distribute_parallel_do_simd,
+    Directive::OMPD_taskloop_simd,
+    Directive::OMPD_teams_distribute,
     Directive::OMPD_teams_distribute_parallel_do,
+    Directive::OMPD_teams_distribute_parallel_do_simd,
     Directive::OMPD_teams_distribute_simd,
-    Directive::OMPD_teams_distribute,
     Directive::OMPD_tile,
     Directive::OMPD_unroll,
 };
 
-static const OmpDirectiveSet blockConstructSet{
-    Directive::OMPD_master,
-    Directive::OMPD_ordered,
-    Directive::OMPD_parallel_workshare,
-    Directive::OMPD_parallel,
-    Directive::OMPD_single,
-    Directive::OMPD_target_data,
-    Directive::OMPD_target_parallel,
-    Directive::OMPD_target_teams,
-    Directive::OMPD_target,
-    Directive::OMPD_task,
-    Directive::OMPD_taskgroup,
-    Directive::OMPD_teams,
-    Directive::OMPD_workshare,
-};
-
-//===----------------------------------------------------------------------===//
-// Directive sets for allowed/not allowed nested directives
-//===----------------------------------------------------------------------===//
-
-static const OmpDirectiveSet nestedOrderedErrSet{
-    Directive::OMPD_critical,
-    Directive::OMPD_ordered,
-    Directive::OMPD_atomic,
-    Directive::OMPD_task,
-    Directive::OMPD_taskloop,
+static const OmpDirectiveSet nonPartialVarSet{
+    Directive::OMPD_allocate,
+    Directive::OMPD_allocators,
+    Directive::OMPD_threadprivate,
+    Directive::OMPD_declare_target,
 };
 
-static const OmpDirectiveSet nestedWorkshareErrSet{
+static const OmpDirectiveSet taskGeneratingSet{
     OmpDirectiveSet{
         Directive::OMPD_task,
-        Directive::OMPD_taskloop,
-        Directive::OMPD_critical,
-        Directive::OMPD_ordered,
-        Directive::OMPD_atomic,
-        Directive::OMPD_master,
-    } | workShareSet,
+    } | allTaskloopSet,
 };
 
-static const OmpDirectiveSet nestedMasterErrSet{
+static const OmpDirectiveSet workShareSet{
     OmpDirectiveSet{
-        Directive::OMPD_atomic,
-    } | taskGeneratingSet |
-        workShareSet,
+        Directive::OMPD_workshare,
+        Directive::OMPD_parallel_workshare,
+        Directive::OMPD_parallel_sections,
+        Directive::OMPD_sections,
+        Directive::OMPD_single,
+    } | allDoSet,
 };
 
+//===----------------------------------------------------------------------===//
+// Directive sets for allowed/not allowed nested directives
+//===----------------------------------------------------------------------===//
+
 static const OmpDirectiveSet nestedBarrierErrSet{
     OmpDirectiveSet{
-        Directive::OMPD_critical,
-        Directive::OMPD_ordered,
         Directive::OMPD_atomic,
+        Directive::OMPD_critical,
         Directive::OMPD_master,
+        Directive::OMPD_ordered,
     } | taskGeneratingSet |
         workShareSet,
 };
 
-static const OmpDirectiveSet nestedTeamsAllowedSet{
-    Directive::OMPD_parallel,
-    Directive::OMPD_parallel_do,
-    Directive::OMPD_parallel_do_simd,
-    Directive::OMPD_parallel_master,
-    Directive::OMPD_parallel_master_taskloop,
-    Directive::OMPD_parallel_master_taskloop_simd,
-    Directive::OMPD_parallel_sections,
-    Directive::OMPD_parallel_workshare,
-    Directive::OMPD_distribute,
+static const OmpDirectiveSet nestedCancelDoAllowedSet{
     Directive::OMPD_distribute_parallel_do,
-    Directive::OMPD_distribute_parallel_do_simd,
-    Directive::OMPD_distribute_simd,
+    Directive::OMPD_do,
+    Directive::OMPD_parallel_do,
+    Directive::OMPD_target_parallel_do,
+    Directive::OMPD_target_teams_distribute_parallel_do,
+    Directive::OMPD_teams_distribute_parallel_do,
 };
 
-static const OmpDirectiveSet nestedOrderedParallelErrSet{
+static const OmpDirectiveSet nestedCancelParallelAllowedSet{
     Directive::OMPD_parallel,
     Directive::OMPD_target_parallel,
-    Directive::OMPD_parallel_sections,
-    Directive::OMPD_parallel_workshare,
 };
 
-static const OmpDirectiveSet nestedOrderedDoAllowedSet{
-    Directive::OMPD_do,
-    Directive::OMPD_parallel_do,
-    Directive::OMPD_target_parallel_do,
+static const OmpDirectiveSet nestedCancelSectionsAllowedSet{
+    Directive::OMPD_parallel_sections,
+    Directive::OMPD_sections,
 };
 
 static const OmpDirectiveSet nestedCancelTaskgroupAllowedSet{
@@ -297,29 +290,64 @@ static const OmpDirectiveSet nestedCancelTaskgroupAllowedSet{
     Directive::OMPD_taskloop,
 };
 
-static const OmpDirectiveSet nestedCancelSectionsAllowedSet{
-    Directive::OMPD_sections,
-    Directive::OMPD_parallel_sections,
+static const OmpDirectiveSet nestedMasterErrSet{
+    OmpDirectiveSet{
+        Directive::OMPD_atomic,
+    } | taskGeneratingSet |
+        workShareSet,
 };
 
-static const OmpDirectiveSet nestedCancelDoAllowedSet{
+static const OmpDirectiveSet nestedOrderedDoAllowedSet{
     Directive::OMPD_do,
-    Directive::OMPD_distribute_parallel_do,
     Directive::OMPD_parallel_do,
     Directive::OMPD_target_parallel_do,
-    Directive::OMPD_target_teams_distribute_parallel_do,
-    Directive::OMPD_teams_distribute_parallel_do,
 };
 
-static const OmpDirectiveSet nestedCancelParallelAllowedSet{
+static const OmpDirectiveSet nestedOrderedErrSet{
+    Directive::OMPD_atomic,
+    Directive::OMPD_critical,
+    Directive::OMPD_ordered,
+    Directive::OMPD_task,
+    Directive::OMPD_taskloop,
+};
+
+static const OmpDirectiveSet nestedOrderedParallelErrSet{
     Directive::OMPD_parallel,
+    Directive::OMPD_parallel_sections,
+    Directive::OMPD_parallel_workshare,
     Directive::OMPD_target_parallel,
 };
 
 static const OmpDirectiveSet nestedReduceWorkshareAllowedSet{
     Directive::OMPD_do,
-    Directive::OMPD_sections,
     Directive::OMPD_do_simd,
+    Directive::OMPD_sections,
+};
+
+static const OmpDirectiveSet nestedTeamsAllowedSet{
+    Directive::OMPD_distribute,
+    Directive::OMPD_distribute_parallel_do,
+    Directive::OMPD_distribute_parallel_do_simd,
+    Directive::OMPD_distribute_simd,
+    Directive::OMPD_parallel,
+    Directive::OMPD_parallel_do,
+    Directive::OMPD_parallel_do_simd,
+    Directive::OMPD_parallel_master,
+    Directive::OMPD_parallel_master_taskloop,
+    Directive::OMPD_parallel_master_taskloop_simd,
+    Directive::OMPD_parallel_sections,
+    Directive::OMPD_parallel_workshare,
+};
+
+static const OmpDirectiveSet nestedWorkshareErrSet{
+    OmpDirectiveSet{
+        Directive::OMPD_atomic,
+        Directive::OMPD_critical,
+        Directive::OMPD_master,
+        Directive::OMPD_ordered,
+        Directive::OMPD_task,
+        Directive::OMPD_taskloop,
+    } | workShareSet,
 };
 } // namespace llvm::omp
 
diff --git a/flang/include/flang/Semantics/tools.h b/flang/include/flang/Semantics/tools.h
index df66e1adb55023..dc3cd6c894a2c2 100644
--- a/flang/include/flang/Semantics/tools.h
+++ b/flang/include/flang/Semantics/tools.h
@@ -180,7 +180,8 @@ const Symbol *IsFinalizable(const Symbol &,
 const Symbol *IsFinalizable(const DerivedTypeSpec &,
     std::set<const DerivedTypeSpec *> * = nullptr,
     bool withImpureFinalizer = false, std::optional<int> rank = std::nullopt);
-const Symbol *HasImpureFinal(const Symbol &);
+const Symbol *HasImpureFinal(
+    const Symbol &, std::optional<int> rank = std::nullopt);
 // Is this type finalizable or does it contain any polymorphic allocatable
 // ultimate components?
 bool MayRequireFinalization(const DerivedTypeSpec &derived);
diff --git a/flang/include/flang/Tools/CLOptions.inc b/flang/include/flang/Tools/CLOptions.inc
index 68e504d0ccb512..21b36dba9ac704 100644
--- a/flang/include/flang/Tools/CLOptions.inc
+++ b/flang/include/flang/Tools/CLOptions.inc
@@ -123,7 +123,9 @@ static void addCanonicalizerPassWithoutRegionSimplification(
 
 inline void addCfgConversionPass(mlir::PassManager &pm) {
   addNestedPassConditionally<mlir::func::FuncOp>(
-      pm, disableCfgConversion, fir::createFirToCfgPass);
+      pm, disableCfgConversion, fir::createFirToCfgOnFuncPass);
+  addNestedPassConditionally<mlir::omp::ReductionDeclareOp>(
+      pm, disableCfgConversion, fir::createFirToCfgOnReductionPass);
 }
 
 inline void addAVC(
diff --git a/flang/lib/Frontend/CompilerInvocation.cpp b/flang/lib/Frontend/CompilerInvocation.cpp
index 2e3fa1f6e66039..c830c7af2462c9 100644
--- a/flang/lib/Frontend/CompilerInvocation.cpp
+++ b/flang/lib/Frontend/CompilerInvocation.cpp
@@ -1191,11 +1191,6 @@ bool CompilerInvocation::createFromArgs(
     invoc.loweringOpts.setLowerToHighLevelFIR(false);
   }
 
-  if (args.hasArg(
-          clang::driver::options::OPT_flang_experimental_polymorphism)) {
-    invoc.loweringOpts.setPolymorphicTypeImpl(true);
-  }
-
   // -fno-ppc-native-vector-element-order
   if (args.hasArg(clang::driver::options::OPT_fno_ppc_native_vec_elem_order)) {
     invoc.loweringOpts.setNoPPCNativeVecElemOrder(true);
diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp
index a668ba4116faab..c3cb9ba6a47e3d 100644
--- a/flang/lib/Lower/Bridge.cpp
+++ b/flang/lib/Lower/Bridge.cpp
@@ -748,7 +748,8 @@ class FirConverter : public Fortran::lower::AbstractConverter {
 
   void copyVar(mlir::Location loc, mlir::Value dst,
                mlir::Value src) override final {
-    copyVarHLFIR(loc, dst, src);
+    copyVarHLFIR(loc, Fortran::lower::SymbolBox::Intrinsic{dst},
+                 Fortran::lower::SymbolBox::Intrinsic{src});
   }
 
   void copyHostAssociateVar(
@@ -1009,10 +1010,7 @@ class FirConverter : public Fortran::lower::AbstractConverter {
       // `omp.private`'s `alloc` block. If this is the case, we return this
       // `SymbolBox::Intrinsic` value.
       if (Fortran::lower::SymbolBox v = symMap->lookupSymbol(sym))
-        return v.match(
-            [&](const Fortran::lower::SymbolBox::Intrinsic &)
-                -> Fortran::lower::SymbolBox { return v; },
-            [](const auto &) -> Fortran::lower::SymbolBox { return {}; });
+        return v;
 
       return {};
     }
@@ -1060,15 +1058,16 @@ class FirConverter : public Fortran::lower::AbstractConverter {
                const Fortran::lower::SymbolBox &rhs_sb) {
     mlir::Location loc = genLocation(sym.name());
     if (lowerToHighLevelFIR())
-      copyVarHLFIR(loc, lhs_sb.getAddr(), rhs_sb.getAddr());
+      copyVarHLFIR(loc, lhs_sb, rhs_sb);
     else
       copyVarFIR(loc, sym, lhs_sb, rhs_sb);
   }
 
-  void copyVarHLFIR(mlir::Location loc, mlir::Value dst, mlir::Value src) {
+  void copyVarHLFIR(mlir::Location loc, Fortran::lower::SymbolBox dst,
+                    Fortran::lower::SymbolBox src) {
     assert(lowerToHighLevelFIR());
-    hlfir::Entity lhs{dst};
-    hlfir::Entity rhs{src};
+    hlfir::Entity lhs{dst.getAddr()};
+    hlfir::Entity rhs{src.getAddr()};
     // Temporary_lhs is set to true in hlfir.assign below to avoid user
     // assignment to be used and finalization to be called on the LHS.
     // This may or may not be correct but mimics the current behaviour
@@ -1082,7 +1081,22 @@ class FirConverter : public Fortran::lower::AbstractConverter {
           /*keepLhsLengthInAllocatableAssignment=*/false,
           /*temporary_lhs=*/true);
     };
-    if (lhs.isAllocatable()) {
+
+    bool isBoxAllocatable = dst.match(
+        [](const fir::MutableBoxValue &box) { return box.isAllocatable(); },
+        [](const fir::FortranVariableOpInterface &box) {
+          return fir::FortranVariableOpInterface(box).isAllocatable();
+        },
+        [](const auto &box) { return false; });
+
+    bool isBoxPointer = dst.match(
+        [](const fir::MutableBoxValue &box) { return box.isPointer(); },
+        [](const fir::FortranVariableOpInterface &box) {
+          return fir::FortranVariableOpInterface(box).isPointer();
+        },
+        [](const auto &box) { return false; });
+
+    if (isBoxAllocatable) {
       // Deep copy allocatable if it is allocated.
       // Note that when allocated, the RHS is already allocated with the LHS
       // shape for copy on entry in createHostAssociateVarClone.
@@ -1097,7 +1111,7 @@ class FirConverter : public Fortran::lower::AbstractConverter {
             copyData(lhs, rhs);
           })
           .end();
-    } else if (lhs.isPointer()) {
+    } else if (isBoxPointer) {
       // Set LHS target to the target of RHS (do not copy the RHS
       // target data into the LHS target storage).
       auto loadVal = builder->create<fir::LoadOp>(loc, rhs);
@@ -2508,19 +2522,51 @@ class FirConverter : public Fortran::lower::AbstractConverter {
     if (nestedLoops > 1)
       n = builder->getIntegerAttr(builder->getI64Type(), nestedLoops);
 
-    const std::list<Fortran::parser::ScalarIntExpr> &grid = std::get<1>(dir.t);
-    const std::list<Fortran::parser::ScalarIntExpr> &block = std::get<2>(dir.t);
+    const std::list<Fortran::parser::CUFKernelDoConstruct::StarOrExpr> &grid =
+        std::get<1>(dir.t);
+    const std::list<Fortran::parser::CUFKernelDoConstruct::StarOrExpr> &block =
+        std::get<2>(dir.t);
     const std::optional<Fortran::parser::ScalarIntExpr> &stream =
         std::get<3>(dir.t);
 
+    auto isOnlyStars =
+        [&](const std::list<Fortran::parser::CUFKernelDoConstruct::StarOrExpr>
+                &list) -> bool {
+      for (const Fortran::parser::CUFKernelDoConstruct::StarOrExpr &expr :
+           list) {
+        if (expr.v)
+          return false;
+      }
+      return true;
+    };
+
+    mlir::Value zero =
+        builder->createIntegerConstant(loc, builder->getI32Type(), 0);
+
     llvm::SmallVector<mlir::Value> gridValues;
-    for (const Fortran::parser::ScalarIntExpr &expr : grid)
-      gridValues.push_back(fir::getBase(
-          genExprValue(*Fortran::semantics::GetExpr(expr), stmtCtx)));
+    if (!isOnlyStars(grid)) {
+      for (const Fortran::parser::CUFKernelDoConstruct::StarOrExpr &expr :
+           grid) {
+        if (expr.v) {
+          gridValues.push_back(fir::getBase(
+              genExprValue(*Fortran::semantics::GetExpr(*expr.v), stmtCtx)));
+        } else {
+          gridValues.push_back(zero);
+        }
+      }
+    }
     llvm::SmallVector<mlir::Value> blockValues;
-    for (const Fortran::parser::ScalarIntExpr &expr : block)
-      blockValues.push_back(fir::getBase(
-          genExprValue(*Fortran::semantics::GetExpr(expr), stmtCtx)));
+    if (!isOnlyStars(block)) {
+      for (const Fortran::parser::CUFKernelDoConstruct::StarOrExpr &expr :
+           block) {
+        if (expr.v) {
+          blockValues.push_back(fir::getBase(
+              genExprValue(*Fortran::semantics::GetExpr(*expr.v), stmtCtx)));
+        } else {
+          blockValues.push_back(zero);
+        }
+      }
+    }
     mlir::Value streamValue;
     if (stream)
       streamValue = fir::getBase(
@@ -3665,6 +3711,11 @@ class FirConverter : public Fortran::lower::AbstractConverter {
       const Fortran::evaluate::ProcedureRef *userDefinedAssignment) {
     mlir::Location loc = getCurrentLocation();
     fir::FirOpBuilder &builder = getFirOpBuilder();
+
+    if (Fortran::evaluate::HasCUDAAttrs(assign.lhs) ||
+        Fortran::evaluate::HasCUDAAttrs(assign.rhs))
+      TODO(loc, "Assignement with CUDA Fortran variables");
+
     // Gather some information about the assignment that will impact how it is
     // lowered.
     const bool isWholeAllocatableAssignment =
diff --git a/flang/lib/Lower/CMakeLists.txt b/flang/lib/Lower/CMakeLists.txt
index 5577a60f1daeac..f92d1a2bc7de18 100644
--- a/flang/lib/Lower/CMakeLists.txt
+++ b/flang/lib/Lower/CMakeLists.txt
@@ -25,6 +25,7 @@ add_flang_library(FortranLower
   Mangler.cpp
   OpenACC.cpp
   OpenMP/ClauseProcessor.cpp
+  OpenMP/Clauses.cpp
   OpenMP/DataSharingProcessor.cpp
   OpenMP/OpenMP.cpp
   OpenMP/ReductionProcessor.cpp
diff --git a/flang/lib/Lower/CallInterface.cpp b/flang/lib/Lower/CallInterface.cpp
index 6b71aabf7fdc89..c65becc497459c 100644
--- a/flang/lib/Lower/CallInterface.cpp
+++ b/flang/lib/Lower/CallInterface.cpp
@@ -310,21 +310,22 @@ bool Fortran::lower::CallerInterface::verifyActualInputs() const {
   return true;
 }
 
-void Fortran::lower::CallerInterface::walkResultLengths(
-    ExprVisitor visitor) const {
-  assert(characteristic && "characteristic was not computed");
-  const Fortran::evaluate::characteristics::FunctionResult &result =
-      characteristic->functionResult.value();
-  const Fortran::evaluate::characteristics::TypeAndShape *typeAndShape =
-      result.GetTypeAndShape();
-  assert(typeAndShape && "no result type");
-  Fortran::evaluate::DynamicType dynamicType = typeAndShape->type();
-  // Visit result length specification expressions that are explicit.
+mlir::Value
+Fortran::lower::CallerInterface::getInput(const PassedEntity &passedEntity) {
+  return actualInputs[passedEntity.firArgument];
+}
+
+static void walkLengths(
+    const Fortran::evaluate::characteristics::TypeAndShape &typeAndShape,
+    const Fortran::lower::CallerInterface::ExprVisitor &visitor,
+    Fortran::lower::AbstractConverter &converter) {
+  Fortran::evaluate::DynamicType dynamicType = typeAndShape.type();
+  // Visit length specification expressions that are explicit.
   if (dynamicType.category() == Fortran::common::TypeCategory::Character) {
     if (std::optional<Fortran::evaluate::ExtentExpr> length =
             dynamicType.GetCharLength())
-      visitor(toEvExpr(*length));
-  } else if (dynamicType.category() == common::TypeCategory::Derived &&
+      visitor(toEvExpr(*length), /*assumedSize=*/false);
+  } else if (dynamicType.category() == Fortran::common::TypeCategory::Derived &&
              !dynamicType.IsUnlimitedPolymorphic()) {
     const Fortran::semantics::DerivedTypeSpec &derivedTypeSpec =
         dynamicType.GetDerivedTypeSpec();
@@ -334,11 +335,33 @@ void Fortran::lower::CallerInterface::walkResultLengths(
   }
 }
 
+void Fortran::lower::CallerInterface::walkResultLengths(
+    const ExprVisitor &visitor) const {
+  assert(characteristic && "characteristic was not computed");
+  const Fortran::evaluate::characteristics::FunctionResult &result =
+      characteristic->functionResult.value();
+  const Fortran::evaluate::characteristics::TypeAndShape *typeAndShape =
+      result.GetTypeAndShape();
+  assert(typeAndShape && "no result type");
+  return walkLengths(*typeAndShape, visitor, converter);
+}
+
+void Fortran::lower::CallerInterface::walkDummyArgumentLengths(
+    const PassedEntity &passedEntity, const ExprVisitor &visitor) const {
+  if (!passedEntity.characteristics)
+    return;
+  if (const auto *dummy =
+          std::get_if<Fortran::evaluate::characteristics::DummyDataObject>(
+              &passedEntity.characteristics->u))
+    walkLengths(dummy->type, visitor, converter);
+}
+
 // Compute extent expr from shapeSpec of an explicit shape.
-// TODO: Allow evaluate shape analysis to work in a mode where it disregards
-// the non-constant aspects when building the shape to avoid having this here.
 static Fortran::evaluate::ExtentExpr
 getExtentExpr(const Fortran::semantics::ShapeSpec &shapeSpec) {
+  if (shapeSpec.ubound().isStar())
+    // F'2023 18.5.3 point 5.
+    return Fortran::evaluate::ExtentExpr{-1};
   const auto &ubound = shapeSpec.ubound().GetExplicit();
   const auto &lbound = shapeSpec.lbound().GetExplicit();
   assert(lbound && ubound && "shape must be explicit");
@@ -346,20 +369,27 @@ getExtentExpr(const Fortran::semantics::ShapeSpec &shapeSpec) {
          Fortran::evaluate::ExtentExpr{1};
 }
 
+static void
+walkExtents(const Fortran::semantics::Symbol &symbol,
+            const Fortran::lower::CallerInterface::ExprVisitor &visitor) {
+  if (const auto *objectDetails =
+          symbol.detailsIf<Fortran::semantics::ObjectEntityDetails>())
+    if (objectDetails->shape().IsExplicitShape() ||
+        Fortran::semantics::IsAssumedSizeArray(symbol))
+      for (const Fortran::semantics::ShapeSpec &shapeSpec :
+           objectDetails->shape())
+        visitor(Fortran::evaluate::AsGenericExpr(getExtentExpr(shapeSpec)),
+                /*assumedSize=*/shapeSpec.ubound().isStar());
+}
+
 void Fortran::lower::CallerInterface::walkResultExtents(
-    ExprVisitor visitor) const {
+    const ExprVisitor &visitor) const {
   // Walk directly the result symbol shape (the characteristic shape may contain
   // descriptor inquiries to it that would fail to lower on the caller side).
   const Fortran::semantics::SubprogramDetails *interfaceDetails =
       getInterfaceDetails();
   if (interfaceDetails) {
-    const Fortran::semantics::Symbol &result = interfaceDetails->result();
-    if (const auto *objectDetails =
-            result.detailsIf<Fortran::semantics::ObjectEntityDetails>())
-      if (objectDetails->shape().IsExplicitShape())
-        for (const Fortran::semantics::ShapeSpec &shapeSpec :
-             objectDetails->shape())
-          visitor(Fortran::evaluate::AsGenericExpr(getExtentExpr(shapeSpec)));
+    walkExtents(interfaceDetails->result(), visitor);
   } else {
     if (procRef.Rank() != 0)
       fir::emitFatalError(
@@ -368,7 +398,18 @@ void Fortran::lower::CallerInterface::walkResultExtents(
   }
 }
 
-bool Fortran::lower::CallerInterface::mustMapInterfaceSymbols() const {
+void Fortran::lower::CallerInterface::walkDummyArgumentExtents(
+    const PassedEntity &passedEntity, const ExprVisitor &visitor) const {
+  const Fortran::semantics::SubprogramDetails *interfaceDetails =
+      getInterfaceDetails();
+  if (!interfaceDetails)
+    return;
+  const Fortran::semantics::Symbol *dummy = getDummySymbol(passedEntity);
+  assert(dummy && "dummy symbol was not set");
+  walkExtents(*dummy, visitor);
+}
+
+bool Fortran::lower::CallerInterface::mustMapInterfaceSymbolsForResult() const {
   assert(characteristic && "characteristic was not computed");
   const std::optional<Fortran::evaluate::characteristics::FunctionResult>
       &result = characteristic->functionResult;
@@ -376,7 +417,7 @@ bool Fortran::lower::CallerInterface::mustMapInterfaceSymbols() const {
       !getInterfaceDetails() || result->IsProcedurePointer())
     return false;
   bool allResultSpecExprConstant = true;
-  auto visitor = [&](const Fortran::lower::SomeExpr &e) {
+  auto visitor = [&](const Fortran::lower::SomeExpr &e, bool) {
     allResultSpecExprConstant &= Fortran::evaluate::IsConstantExpr(e);
   };
   walkResultLengths(visitor);
@@ -384,6 +425,17 @@ bool Fortran::lower::CallerInterface::mustMapInterfaceSymbols() const {
   return !allResultSpecExprConstant;
 }
 
+bool Fortran::lower::CallerInterface::mustMapInterfaceSymbolsForDummyArgument(
+    const PassedEntity &arg) const {
+  bool allResultSpecExprConstant = true;
+  auto visitor = [&](const Fortran::lower::SomeExpr &e, bool) {
+    allResultSpecExprConstant &= Fortran::evaluate::IsConstantExpr(e);
+  };
+  walkDummyArgumentLengths(arg, visitor);
+  walkDummyArgumentExtents(arg, visitor);
+  return !allResultSpecExprConstant;
+}
+
 mlir::Value Fortran::lower::CallerInterface::getArgumentValue(
     const semantics::Symbol &sym) const {
   mlir::Location loc = converter.getCurrentLocation();
@@ -401,6 +453,24 @@ mlir::Value Fortran::lower::CallerInterface::getArgumentValue(
   return actualInputs[mlirArgIndex];
 }
 
+const Fortran::semantics::Symbol *
+Fortran::lower::CallerInterface::getDummySymbol(
+    const PassedEntity &passedEntity) const {
+  const Fortran::semantics::SubprogramDetails *ifaceDetails =
+      getInterfaceDetails();
+  if (!ifaceDetails)
+    return nullptr;
+  std::size_t argPosition = 0;
+  for (const auto &arg : getPassedArguments()) {
+    if (&arg == &passedEntity)
+      break;
+    ++argPosition;
+  }
+  if (argPosition >= ifaceDetails->dummyArgs().size())
+    return nullptr;
+  return ifaceDetails->dummyArgs()[argPosition];
+}
+
 mlir::Type Fortran::lower::CallerInterface::getResultStorageType() const {
   if (passedResult)
     return fir::dyn_cast_ptrEleTy(inputs[passedResult->firArgument].type);
@@ -408,6 +478,11 @@ mlir::Type Fortran::lower::CallerInterface::getResultStorageType() const {
   return outputs[0].type;
 }
 
+mlir::Type Fortran::lower::CallerInterface::getDummyArgumentType(
+    const PassedEntity &passedEntity) const {
+  return inputs[passedEntity.firArgument].type;
+}
+
 const Fortran::semantics::Symbol &
 Fortran::lower::CallerInterface::getResultSymbol() const {
   mlir::Location loc = converter.getCurrentLocation();
@@ -975,12 +1050,6 @@ class Fortran::lower::CallInterfaceImpl {
     Fortran::common::TypeCategory cat = dynamicType.category();
     // DERIVED
     if (cat == Fortran::common::TypeCategory::Derived) {
-      // TODO is kept under experimental flag until feature is complete.
-      if (dynamicType.IsPolymorphic() &&
-          !getConverter().getLoweringOptions().getPolymorphicTypeImpl())
-        TODO(interface.converter.getCurrentLocation(),
-             "support for polymorphic types");
-
       if (dynamicType.IsUnlimitedPolymorphic())
         return mlir::NoneType::get(&mlirContext);
       return getConverter().genType(dynamicType.GetDerivedTypeSpec());
@@ -1387,6 +1456,17 @@ bool Fortran::lower::CallInterface<
   return Fortran::semantics::IsFinalizable(*derived);
 }
 
+template <typename T>
+bool Fortran::lower::CallInterface<
+    T>::PassedEntity::isSequenceAssociatedDescriptor() const {
+  if (!characteristics || passBy != PassEntityBy::Box)
+    return false;
+  const auto *dummy =
+      std::get_if<Fortran::evaluate::characteristics::DummyDataObject>(
+          &characteristics->u);
+  return dummy && dummy->type.CanBeSequenceAssociated();
+}
+
 template <typename T>
 void Fortran::lower::CallInterface<T>::determineInterface(
     bool isImplicit,
diff --git a/flang/lib/Lower/ConvertCall.cpp b/flang/lib/Lower/ConvertCall.cpp
index 6e3ce101ef1af9..6eba243c237cf2 100644
--- a/flang/lib/Lower/ConvertCall.cpp
+++ b/flang/lib/Lower/ConvertCall.cpp
@@ -164,6 +164,125 @@ static mlir::Value readDim3Value(fir::FirOpBuilder &builder, mlir::Location loc,
   return hlfir::loadTrivialScalar(loc, builder, hlfir::Entity{designate});
 }
 
+static mlir::Value remapActualToDummyDescriptor(
+    mlir::Location loc, Fortran::lower::AbstractConverter &converter,
+    Fortran::lower::SymMap &symMap,
+    const Fortran::lower::CallerInterface::PassedEntity &arg,
+    Fortran::lower::CallerInterface &caller, bool isBindcCall) {
+  fir::FirOpBuilder &builder = converter.getFirOpBuilder();
+  mlir::IndexType idxTy = builder.getIndexType();
+  mlir::Value zero = builder.createIntegerConstant(loc, idxTy, 0);
+  Fortran::lower::StatementContext localStmtCtx;
+  auto lowerSpecExpr = [&](const auto &expr,
+                           bool isAssumedSizeExtent) -> mlir::Value {
+    mlir::Value convertExpr = builder.createConvert(
+        loc, idxTy, fir::getBase(converter.genExprValue(expr, localStmtCtx)));
+    if (isAssumedSizeExtent)
+      return convertExpr;
+    return fir::factory::genMaxWithZero(builder, loc, convertExpr);
+  };
+  bool mapSymbols = caller.mustMapInterfaceSymbolsForDummyArgument(arg);
+  if (mapSymbols) {
+    symMap.pushScope();
+    const Fortran::semantics::Symbol *sym = caller.getDummySymbol(arg);
+    assert(sym && "call must have explicit interface to map interface symbols");
+    Fortran::lower::mapCallInterfaceSymbolsForDummyArgument(converter, caller,
+                                                            symMap, *sym);
+  }
+  llvm::SmallVector<mlir::Value> extents;
+  llvm::SmallVector<mlir::Value> lengths;
+  mlir::Type dummyBoxType = caller.getDummyArgumentType(arg);
+  mlir::Type dummyBaseType = fir::unwrapPassByRefType(dummyBoxType);
+  if (dummyBaseType.isa<fir::SequenceType>())
+    caller.walkDummyArgumentExtents(
+        arg, [&](const Fortran::lower::SomeExpr &e, bool isAssumedSizeExtent) {
+          extents.emplace_back(lowerSpecExpr(e, isAssumedSizeExtent));
+        });
+  mlir::Value shape;
+  if (!extents.empty()) {
+    if (isBindcCall) {
+      // Preserve zero lower bounds (see F'2023 18.5.3).
+      llvm::SmallVector<mlir::Value> lowerBounds(extents.size(), zero);
+      shape = builder.genShape(loc, lowerBounds, extents);
+    } else {
+      shape = builder.genShape(loc, extents);
+    }
+  }
+
+  hlfir::Entity explicitArgument = hlfir::Entity{caller.getInput(arg)};
+  mlir::Type dummyElementType = fir::unwrapSequenceType(dummyBaseType);
+  if (auto recType = llvm::dyn_cast<fir::RecordType>(dummyElementType))
+    if (recType.getNumLenParams() > 0)
+      TODO(loc, "sequence association of length parameterized derived type "
+                "dummy arguments");
+  if (fir::isa_char(dummyElementType))
+    lengths.emplace_back(hlfir::genCharLength(loc, builder, explicitArgument));
+  mlir::Value baseAddr =
+      hlfir::genVariableRawAddress(loc, builder, explicitArgument);
+  baseAddr = builder.createConvert(loc, fir::ReferenceType::get(dummyBaseType),
+                                   baseAddr);
+  mlir::Value mold;
+  if (fir::isPolymorphicType(dummyBoxType))
+    mold = explicitArgument;
+  mlir::Value remapped =
+      builder.create<fir::EmboxOp>(loc, dummyBoxType, baseAddr, shape,
+                                   /*slice=*/mlir::Value{}, lengths, mold);
+  if (mapSymbols)
+    symMap.popScope();
+  return remapped;
+}
+
+/// Create a descriptor for sequenced associated descriptor that are passed
+/// by descriptor. Sequence association (F'2023 15.5.2.12) implies that the
+/// dummy shape and rank need to not be the same as the actual argument. This
+/// helper creates a descriptor based on the dummy shape and rank (sequence
+/// association can only happen with explicit and assumed-size array) so that it
+/// is safe to assume the rank of the incoming descriptor inside the callee.
+/// This helper must be called once all the actual arguments have been lowered
+/// and placed inside "caller". Copy-in/copy-out must already have been
+/// generated if needed using the actual argument shape (the dummy shape may be
+/// assumed-size).
+static void remapActualToDummyDescriptors(
+    mlir::Location loc, Fortran::lower::AbstractConverter &converter,
+    Fortran::lower::SymMap &symMap,
+    const Fortran::lower::PreparedActualArguments &loweredActuals,
+    Fortran::lower::CallerInterface &caller, bool isBindcCall) {
+  fir::FirOpBuilder &builder = converter.getFirOpBuilder();
+  for (auto [preparedActual, arg] :
+       llvm::zip(loweredActuals, caller.getPassedArguments())) {
+    if (arg.isSequenceAssociatedDescriptor()) {
+      if (!preparedActual.value().handleDynamicOptional()) {
+        mlir::Value remapped = remapActualToDummyDescriptor(
+            loc, converter, symMap, arg, caller, isBindcCall);
+        caller.placeInput(arg, remapped);
+      } else {
+        // Absent optional actual argument descriptor cannot be read and
+        // remapped unconditionally.
+        mlir::Type dummyType = caller.getDummyArgumentType(arg);
+        mlir::Value isPresent = preparedActual.value().getIsPresent();
+        auto &argLambdaCapture = arg;
+        mlir::Value remapped =
+            builder
+                .genIfOp(loc, {dummyType}, isPresent,
+                         /*withElseRegion=*/true)
+                .genThen([&]() {
+                  mlir::Value newBox = remapActualToDummyDescriptor(
+                      loc, converter, symMap, argLambdaCapture, caller,
+                      isBindcCall);
+                  builder.create<fir::ResultOp>(loc, newBox);
+                })
+                .genElse([&]() {
+                  mlir::Value absent =
+                      builder.create<fir::AbsentOp>(loc, dummyType);
+                  builder.create<fir::ResultOp>(loc, absent);
+                })
+                .getResults()[0];
+        caller.placeInput(arg, remapped);
+      }
+    }
+  }
+}
+
 std::pair<fir::ExtendedValue, bool> Fortran::lower::genCallOpAndResult(
     mlir::Location loc, Fortran::lower::AbstractConverter &converter,
     Fortran::lower::SymMap &symMap, Fortran::lower::StatementContext &stmtCtx,
@@ -171,12 +290,11 @@ std::pair<fir::ExtendedValue, bool> Fortran::lower::genCallOpAndResult(
     std::optional<mlir::Type> resultType, bool isElemental) {
   fir::FirOpBuilder &builder = converter.getFirOpBuilder();
   using PassBy = Fortran::lower::CallerInterface::PassEntityBy;
-  // Handle cases where caller must allocate the result or a fir.box for it.
   bool mustPopSymMap = false;
-  if (caller.mustMapInterfaceSymbols()) {
+  if (caller.mustMapInterfaceSymbolsForResult()) {
     symMap.pushScope();
     mustPopSymMap = true;
-    Fortran::lower::mapCallInterfaceSymbols(converter, caller, symMap);
+    Fortran::lower::mapCallInterfaceSymbolsForResult(converter, caller, symMap);
   }
   // If this is an indirect call, retrieve the function address. Also retrieve
   // the result length if this is a character function (note that this length
@@ -221,12 +339,16 @@ std::pair<fir::ExtendedValue, bool> Fortran::lower::genCallOpAndResult(
       return {};
     mlir::Type type = caller.getResultStorageType();
     if (type.isa<fir::SequenceType>())
-      caller.walkResultExtents([&](const Fortran::lower::SomeExpr &e) {
-        extents.emplace_back(lowerSpecExpr(e));
-      });
-    caller.walkResultLengths([&](const Fortran::lower::SomeExpr &e) {
-      lengths.emplace_back(lowerSpecExpr(e));
-    });
+      caller.walkResultExtents(
+          [&](const Fortran::lower::SomeExpr &e, bool isAssumedSizeExtent) {
+            assert(!isAssumedSizeExtent && "result cannot be assumed-size");
+            extents.emplace_back(lowerSpecExpr(e));
+          });
+    caller.walkResultLengths(
+        [&](const Fortran::lower::SomeExpr &e, bool isAssumedSizeExtent) {
+          assert(!isAssumedSizeExtent && "result cannot be assumed-size");
+          lengths.emplace_back(lowerSpecExpr(e));
+        });
 
     // Result length parameters should not be provided to box storage
     // allocation and save_results, but they are still useful information to
@@ -416,7 +538,7 @@ std::pair<fir::ExtendedValue, bool> Fortran::lower::genCallOpAndResult(
     mlir::Type i32Ty = builder.getI32Type();
     mlir::Value one = builder.createIntegerConstant(loc, i32Ty, 1);
 
-    mlir::Value grid_x, grid_y;
+    mlir::Value grid_x, grid_y, grid_z;
     if (caller.getCallDescription().chevrons()[0].GetType()->category() ==
         Fortran::common::TypeCategory::Integer) {
       // If grid is an integer, it is converted to dim3(grid,1,1). Since z is
@@ -426,11 +548,13 @@ std::pair<fir::ExtendedValue, bool> Fortran::lower::genCallOpAndResult(
           fir::getBase(converter.genExprValue(
               caller.getCallDescription().chevrons()[0], stmtCtx)));
       grid_y = one;
+      grid_z = one;
     } else {
       auto dim3Addr = converter.genExprAddr(
           caller.getCallDescription().chevrons()[0], stmtCtx);
       grid_x = readDim3Value(builder, loc, fir::getBase(dim3Addr), "x");
       grid_y = readDim3Value(builder, loc, fir::getBase(dim3Addr), "y");
+      grid_z = readDim3Value(builder, loc, fir::getBase(dim3Addr), "z");
     }
 
     mlir::Value block_x, block_y, block_z;
@@ -466,8 +590,8 @@ std::pair<fir::ExtendedValue, bool> Fortran::lower::genCallOpAndResult(
               caller.getCallDescription().chevrons()[3], stmtCtx)));
 
     builder.create<fir::CUDAKernelLaunch>(
-        loc, funcType.getResults(), funcSymbolAttr, grid_x, grid_y, block_x,
-        block_y, block_z, bytes, stream, operands);
+        loc, funcType.getResults(), funcSymbolAttr, grid_x, grid_y, grid_z,
+        block_x, block_y, block_z, bytes, stream, operands);
     callNumResults = 0;
   } else if (caller.requireDispatchCall()) {
     // Procedure call requiring a dynamic dispatch. Call is created with
@@ -1054,10 +1178,16 @@ static PreparedDummyArgument preparePresentUserCallActualArgument(
   // Create dummy type with actual argument rank when the dummy is an assumed
   // rank. That way, all the operation to create dummy descriptors are ranked if
   // the actual argument is ranked, which allows simple code generation.
+  // Also do the same when the dummy is a sequence associated descriptor
+  // because the actual shape/rank may mismatch with the dummy, and the dummy
+  // may be an assumed-size array, so any descriptor manipulation should use the
+  // actual argument shape information. A descriptor with the dummy shape
+  // information will be created later when all actual arguments are ready.
   mlir::Type dummyTypeWithActualRank = dummyType;
   if (auto baseBoxDummy = mlir::dyn_cast<fir::BaseBoxType>(dummyType))
     if (baseBoxDummy.isAssumedRank() ||
-        arg.testTKR(Fortran::common::IgnoreTKR::Rank))
+        arg.testTKR(Fortran::common::IgnoreTKR::Rank) ||
+        arg.isSequenceAssociatedDescriptor())
       dummyTypeWithActualRank =
           baseBoxDummy.getBoxTypeWithNewShape(actual.getType());
   // Preserve the actual type in the argument preparation in case IgnoreTKR(t)
@@ -1340,6 +1470,7 @@ genUserCall(Fortran::lower::PreparedActualArguments &loweredActuals,
             mlir::FunctionType callSiteType, CallContext &callContext) {
   using PassBy = Fortran::lower::CallerInterface::PassEntityBy;
   mlir::Location loc = callContext.loc;
+  bool mustRemapActualToDummyDescriptors = false;
   fir::FirOpBuilder &builder = callContext.getBuilder();
   llvm::SmallVector<CallCleanUp> callCleanUps;
   for (auto [preparedActual, arg] :
@@ -1396,6 +1527,9 @@ genUserCall(Fortran::lower::PreparedActualArguments &loweredActuals,
       callCleanUps.append(preparedDummy.cleanups.rbegin(),
                           preparedDummy.cleanups.rend());
       caller.placeInput(arg, preparedDummy.dummy);
+      if (arg.passBy == PassBy::Box)
+        mustRemapActualToDummyDescriptors |=
+            arg.isSequenceAssociatedDescriptor();
     } break;
     case PassBy::BoxProcRef: {
       PreparedDummyArgument preparedDummy =
@@ -1488,6 +1622,12 @@ genUserCall(Fortran::lower::PreparedActualArguments &loweredActuals,
     } break;
     }
   }
+  // Handle cases where caller must allocate the result or a fir.box for it.
+  if (mustRemapActualToDummyDescriptors)
+    remapActualToDummyDescriptors(loc, callContext.converter,
+                                  callContext.symMap, loweredActuals, caller,
+                                  callContext.isBindcCall());
+
   // Prepare lowered arguments according to the interface
   // and map the lowered values to the dummy
   // arguments.
diff --git a/flang/lib/Lower/ConvertType.cpp b/flang/lib/Lower/ConvertType.cpp
index 21564e8b81d70a..e6557d7f0b7675 100644
--- a/flang/lib/Lower/ConvertType.cpp
+++ b/flang/lib/Lower/ConvertType.cpp
@@ -263,10 +263,6 @@ struct TypeBuilderImpl {
         llvm::SmallVector<Fortran::lower::LenParameterTy> params;
         translateLenParameters(params, tySpec->category(), ultimate);
         ty = genFIRType(context, tySpec->category(), kind, params);
-      } else if (type->IsPolymorphic() &&
-                 !converter.getLoweringOptions().getPolymorphicTypeImpl()) {
-        // TODO is kept under experimental flag until feature is complete.
-        TODO(loc, "support for polymorphic types");
       } else if (type->IsUnlimitedPolymorphic()) {
         ty = mlir::NoneType::get(context);
       } else if (const Fortran::semantics::DerivedTypeSpec *tySpec =
diff --git a/flang/lib/Lower/ConvertVariable.cpp b/flang/lib/Lower/ConvertVariable.cpp
index a673a18cd20d91..94d849862099eb 100644
--- a/flang/lib/Lower/ConvertVariable.cpp
+++ b/flang/lib/Lower/ConvertVariable.cpp
@@ -2260,19 +2260,20 @@ void Fortran::lower::instantiateVariable(AbstractConverter &converter,
     instantiateLocal(converter, var, symMap);
 }
 
-void Fortran::lower::mapCallInterfaceSymbols(
-    AbstractConverter &converter, const Fortran::lower::CallerInterface &caller,
-    SymMap &symMap) {
+static void
+mapCallInterfaceSymbol(const Fortran::semantics::Symbol &interfaceSymbol,
+                       Fortran::lower::AbstractConverter &converter,
+                       const Fortran::lower::CallerInterface &caller,
+                       Fortran::lower::SymMap &symMap) {
   Fortran::lower::AggregateStoreMap storeMap;
-  const Fortran::semantics::Symbol &result = caller.getResultSymbol();
   for (Fortran::lower::pft::Variable var :
-       Fortran::lower::pft::getDependentVariableList(result)) {
+       Fortran::lower::pft::getDependentVariableList(interfaceSymbol)) {
     if (var.isAggregateStore()) {
       instantiateVariable(converter, var, symMap, storeMap);
       continue;
     }
     const Fortran::semantics::Symbol &sym = var.getSymbol();
-    if (&sym == &result)
+    if (&sym == &interfaceSymbol)
       continue;
     const auto *hostDetails =
         sym.detailsIf<Fortran::semantics::HostAssocDetails>();
@@ -2293,7 +2294,8 @@ void Fortran::lower::mapCallInterfaceSymbols(
       // instantiateVariable that would try to allocate a new storage.
       continue;
     }
-    if (Fortran::semantics::IsDummy(sym) && sym.owner() == result.owner()) {
+    if (Fortran::semantics::IsDummy(sym) &&
+        sym.owner() == interfaceSymbol.owner()) {
       // Get the argument for the dummy argument symbols of the current call.
       symMap.addSymbol(sym, caller.getArgumentValue(sym));
       // All the properties of the dummy variable may not come from the actual
@@ -2307,6 +2309,19 @@ void Fortran::lower::mapCallInterfaceSymbols(
   }
 }
 
+void Fortran::lower::mapCallInterfaceSymbolsForResult(
+    AbstractConverter &converter, const Fortran::lower::CallerInterface &caller,
+    SymMap &symMap) {
+  const Fortran::semantics::Symbol &result = caller.getResultSymbol();
+  mapCallInterfaceSymbol(result, converter, caller, symMap);
+}
+
+void Fortran::lower::mapCallInterfaceSymbolsForDummyArgument(
+    AbstractConverter &converter, const Fortran::lower::CallerInterface &caller,
+    SymMap &symMap, const Fortran::semantics::Symbol &dummySymbol) {
+  mapCallInterfaceSymbol(dummySymbol, converter, caller, symMap);
+}
+
 void Fortran::lower::mapSymbolAttributes(
     AbstractConverter &converter, const Fortran::semantics::SymbolRef &symbol,
     Fortran::lower::SymMap &symMap, Fortran::lower::StatementContext &stmtCtx,
diff --git a/flang/lib/Lower/OpenACC.cpp b/flang/lib/Lower/OpenACC.cpp
index d2c6006ecf914a..6539de4d88304c 100644
--- a/flang/lib/Lower/OpenACC.cpp
+++ b/flang/lib/Lower/OpenACC.cpp
@@ -406,19 +406,6 @@ fir::ShapeOp genShapeOp(mlir::OpBuilder &builder, fir::SequenceType seqTy,
   return builder.create<fir::ShapeOp>(loc, extents);
 }
 
-/// Return the nested sequence type if any.
-static mlir::Type extractSequenceType(mlir::Type ty) {
-  if (mlir::isa<fir::SequenceType>(ty))
-    return ty;
-  if (auto boxTy = mlir::dyn_cast<fir::BaseBoxType>(ty))
-    return extractSequenceType(boxTy.getEleTy());
-  if (auto heapTy = mlir::dyn_cast<fir::HeapType>(ty))
-    return extractSequenceType(heapTy.getEleTy());
-  if (auto ptrTy = mlir::dyn_cast<fir::PointerType>(ty))
-    return extractSequenceType(ptrTy.getEleTy());
-  return mlir::Type{};
-}
-
 template <typename RecipeOp>
 static void genPrivateLikeInitRegion(mlir::OpBuilder &builder, RecipeOp recipe,
                                      mlir::Type ty, mlir::Location loc) {
@@ -454,7 +441,7 @@ static void genPrivateLikeInitRegion(mlir::OpBuilder &builder, RecipeOp recipe,
       }
     }
   } else if (auto boxTy = mlir::dyn_cast_or_null<fir::BaseBoxType>(ty)) {
-    mlir::Type innerTy = extractSequenceType(boxTy);
+    mlir::Type innerTy = fir::extractSequenceType(boxTy);
     if (!innerTy)
       TODO(loc, "Unsupported boxed type in OpenACC privatization");
     fir::FirOpBuilder firBuilder{builder, recipe.getOperation()};
@@ -688,7 +675,7 @@ mlir::acc::FirstprivateRecipeOp Fortran::lower::createOrGetFirstprivateRecipe(
   } else if (auto boxTy = mlir::dyn_cast_or_null<fir::BaseBoxType>(ty)) {
     fir::FirOpBuilder firBuilder{builder, recipe.getOperation()};
     llvm::SmallVector<mlir::Value> tripletArgs;
-    mlir::Type innerTy = extractSequenceType(boxTy);
+    mlir::Type innerTy = fir::extractSequenceType(boxTy);
     fir::SequenceType seqTy =
         mlir::dyn_cast_or_null<fir::SequenceType>(innerTy);
     if (!seqTy)
@@ -1018,7 +1005,7 @@ static mlir::Value genReductionInitRegion(fir::FirOpBuilder &builder,
       return declareOp.getBase();
     }
   } else if (auto boxTy = mlir::dyn_cast_or_null<fir::BaseBoxType>(ty)) {
-    mlir::Type innerTy = extractSequenceType(boxTy);
+    mlir::Type innerTy = fir::extractSequenceType(boxTy);
     if (!mlir::isa<fir::SequenceType>(innerTy))
       TODO(loc, "Unsupported boxed type for reduction");
     // Create the private copy from the initial fir.box.
@@ -1230,7 +1217,7 @@ static void genCombiner(fir::FirOpBuilder &builder, mlir::Location loc,
     builder.create<fir::StoreOp>(loc, res, addr1);
     builder.setInsertionPointAfter(loops[0]);
   } else if (auto boxTy = mlir::dyn_cast<fir::BaseBoxType>(ty)) {
-    mlir::Type innerTy = extractSequenceType(boxTy);
+    mlir::Type innerTy = fir::extractSequenceType(boxTy);
     fir::SequenceType seqTy =
         mlir::dyn_cast_or_null<fir::SequenceType>(innerTy);
     if (!seqTy)
diff --git a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp
index a41f8312a28c9e..13347c8cf7b658 100644
--- a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp
+++ b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp
@@ -11,6 +11,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "ClauseProcessor.h"
+#include "Clauses.h"
 
 #include "flang/Lower/PFTBuilder.h"
 #include "flang/Parser/tools.h"
@@ -30,64 +31,55 @@ static void checkMapType(mlir::Location location, mlir::Type type) {
 }
 
 static mlir::omp::ScheduleModifier
-translateScheduleModifier(const Fortran::parser::OmpScheduleModifierType &m) {
-  switch (m.v) {
-  case Fortran::parser::OmpScheduleModifierType::ModType::Monotonic:
+translateScheduleModifier(const omp::clause::Schedule::ModType &m) {
+  switch (m) {
+  case omp::clause::Schedule::ModType::Monotonic:
     return mlir::omp::ScheduleModifier::monotonic;
-  case Fortran::parser::OmpScheduleModifierType::ModType::Nonmonotonic:
+  case omp::clause::Schedule::ModType::Nonmonotonic:
     return mlir::omp::ScheduleModifier::nonmonotonic;
-  case Fortran::parser::OmpScheduleModifierType::ModType::Simd:
+  case omp::clause::Schedule::ModType::Simd:
     return mlir::omp::ScheduleModifier::simd;
   }
   return mlir::omp::ScheduleModifier::none;
 }
 
 static mlir::omp::ScheduleModifier
-getScheduleModifier(const Fortran::parser::OmpScheduleClause &x) {
-  const auto &modifier =
-      std::get<std::optional<Fortran::parser::OmpScheduleModifier>>(x.t);
+getScheduleModifier(const omp::clause::Schedule &clause) {
+  using ScheduleModifier = omp::clause::Schedule::ScheduleModifier;
+  const auto &modifier = std::get<std::optional<ScheduleModifier>>(clause.t);
   // The input may have the modifier any order, so we look for one that isn't
   // SIMD. If modifier is not set at all, fall down to the bottom and return
   // "none".
   if (modifier) {
-    const auto &modType1 =
-        std::get<Fortran::parser::OmpScheduleModifier::Modifier1>(modifier->t);
-    if (modType1.v.v ==
-        Fortran::parser::OmpScheduleModifierType::ModType::Simd) {
-      const auto &modType2 = std::get<
-          std::optional<Fortran::parser::OmpScheduleModifier::Modifier2>>(
-          modifier->t);
-      if (modType2 &&
-          modType2->v.v !=
-              Fortran::parser::OmpScheduleModifierType::ModType::Simd)
-        return translateScheduleModifier(modType2->v);
-
+    using ModType = omp::clause::Schedule::ModType;
+    const auto &modType1 = std::get<ModType>(modifier->t);
+    if (modType1 == ModType::Simd) {
+      const auto &modType2 = std::get<std::optional<ModType>>(modifier->t);
+      if (modType2 && *modType2 != ModType::Simd)
+        return translateScheduleModifier(*modType2);
       return mlir::omp::ScheduleModifier::none;
     }
 
-    return translateScheduleModifier(modType1.v);
+    return translateScheduleModifier(modType1);
   }
   return mlir::omp::ScheduleModifier::none;
 }
 
 static mlir::omp::ScheduleModifier
-getSimdModifier(const Fortran::parser::OmpScheduleClause &x) {
-  const auto &modifier =
-      std::get<std::optional<Fortran::parser::OmpScheduleModifier>>(x.t);
+getSimdModifier(const omp::clause::Schedule &clause) {
+  using ScheduleModifier = omp::clause::Schedule::ScheduleModifier;
+  const auto &modifier = std::get<std::optional<ScheduleModifier>>(clause.t);
   // Either of the two possible modifiers in the input can be the SIMD modifier,
   // so look in either one, and return simd if we find one. Not found = return
   // "none".
   if (modifier) {
-    const auto &modType1 =
-        std::get<Fortran::parser::OmpScheduleModifier::Modifier1>(modifier->t);
-    if (modType1.v.v == Fortran::parser::OmpScheduleModifierType::ModType::Simd)
+    using ModType = omp::clause::Schedule::ModType;
+    const auto &modType1 = std::get<ModType>(modifier->t);
+    if (modType1 == ModType::Simd)
       return mlir::omp::ScheduleModifier::simd;
 
-    const auto &modType2 = std::get<
-        std::optional<Fortran::parser::OmpScheduleModifier::Modifier2>>(
-        modifier->t);
-    if (modType2 && modType2->v.v ==
-                        Fortran::parser::OmpScheduleModifierType::ModType::Simd)
+    const auto &modType2 = std::get<std::optional<ModType>>(modifier->t);
+    if (modType2 && *modType2 == ModType::Simd)
       return mlir::omp::ScheduleModifier::simd;
   }
   return mlir::omp::ScheduleModifier::none;
@@ -95,29 +87,25 @@ getSimdModifier(const Fortran::parser::OmpScheduleClause &x) {
 
 static void
 genAllocateClause(Fortran::lower::AbstractConverter &converter,
-                  const Fortran::parser::OmpAllocateClause &ompAllocateClause,
+                  const omp::clause::Allocate &clause,
                   llvm::SmallVectorImpl<mlir::Value> &allocatorOperands,
                   llvm::SmallVectorImpl<mlir::Value> &allocateOperands) {
   fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();
   mlir::Location currentLocation = converter.getCurrentLocation();
   Fortran::lower::StatementContext stmtCtx;
 
-  mlir::Value allocatorOperand;
-  const Fortran::parser::OmpObjectList &ompObjectList =
-      std::get<Fortran::parser::OmpObjectList>(ompAllocateClause.t);
-  const auto &allocateModifier = std::get<
-      std::optional<Fortran::parser::OmpAllocateClause::AllocateModifier>>(
-      ompAllocateClause.t);
+  const omp::ObjectList &objectList = std::get<omp::ObjectList>(clause.t);
+  const auto &modifier =
+      std::get<std::optional<omp::clause::Allocate::Modifier>>(clause.t);
 
   // If the allocate modifier is present, check if we only use the allocator
   // submodifier.  ALIGN in this context is unimplemented
   const bool onlyAllocator =
-      allocateModifier &&
-      std::holds_alternative<
-          Fortran::parser::OmpAllocateClause::AllocateModifier::Allocator>(
-          allocateModifier->u);
+      modifier &&
+      std::holds_alternative<omp::clause::Allocate::Modifier::Allocator>(
+          modifier->u);
 
-  if (allocateModifier && !onlyAllocator) {
+  if (modifier && !onlyAllocator) {
     TODO(currentLocation, "OmpAllocateClause ALIGN modifier");
   }
 
@@ -125,37 +113,34 @@ genAllocateClause(Fortran::lower::AbstractConverter &converter,
   // to list of allocators, otherwise, add default allocator to
   // list of allocators.
   if (onlyAllocator) {
-    const auto &allocatorValue = std::get<
-        Fortran::parser::OmpAllocateClause::AllocateModifier::Allocator>(
-        allocateModifier->u);
-    allocatorOperand = fir::getBase(converter.genExprValue(
-        *Fortran::semantics::GetExpr(allocatorValue.v), stmtCtx));
-    allocatorOperands.insert(allocatorOperands.end(), ompObjectList.v.size(),
-                             allocatorOperand);
+    const auto &value =
+        std::get<omp::clause::Allocate::Modifier::Allocator>(modifier->u);
+    mlir::Value operand =
+        fir::getBase(converter.genExprValue(value.v, stmtCtx));
+    allocatorOperands.append(objectList.size(), operand);
   } else {
-    allocatorOperand = firOpBuilder.createIntegerConstant(
+    mlir::Value operand = firOpBuilder.createIntegerConstant(
         currentLocation, firOpBuilder.getI32Type(), 1);
-    allocatorOperands.insert(allocatorOperands.end(), ompObjectList.v.size(),
-                             allocatorOperand);
+    allocatorOperands.append(objectList.size(), operand);
   }
-  genObjectList(ompObjectList, converter, allocateOperands);
+  genObjectList(objectList, converter, allocateOperands);
 }
 
-static mlir::omp::ClauseProcBindKindAttr genProcBindKindAttr(
-    fir::FirOpBuilder &firOpBuilder,
-    const Fortran::parser::OmpClause::ProcBind *procBindClause) {
+static mlir::omp::ClauseProcBindKindAttr
+genProcBindKindAttr(fir::FirOpBuilder &firOpBuilder,
+                    const omp::clause::ProcBind &clause) {
   mlir::omp::ClauseProcBindKind procBindKind;
-  switch (procBindClause->v.v) {
-  case Fortran::parser::OmpProcBindClause::Type::Master:
+  switch (clause.v) {
+  case omp::clause::ProcBind::Type::Master:
     procBindKind = mlir::omp::ClauseProcBindKind::Master;
     break;
-  case Fortran::parser::OmpProcBindClause::Type::Close:
+  case omp::clause::ProcBind::Type::Close:
     procBindKind = mlir::omp::ClauseProcBindKind::Close;
     break;
-  case Fortran::parser::OmpProcBindClause::Type::Spread:
+  case omp::clause::ProcBind::Type::Spread:
     procBindKind = mlir::omp::ClauseProcBindKind::Spread;
     break;
-  case Fortran::parser::OmpProcBindClause::Type::Primary:
+  case omp::clause::ProcBind::Type::Primary:
     procBindKind = mlir::omp::ClauseProcBindKind::Primary;
     break;
   }
@@ -165,20 +150,17 @@ static mlir::omp::ClauseProcBindKindAttr genProcBindKindAttr(
 
 static mlir::omp::ClauseTaskDependAttr
 genDependKindAttr(fir::FirOpBuilder &firOpBuilder,
-                  const Fortran::parser::OmpClause::Depend *dependClause) {
+                  const omp::clause::Depend &clause) {
   mlir::omp::ClauseTaskDepend pbKind;
-  switch (
-      std::get<Fortran::parser::OmpDependenceType>(
-          std::get<Fortran::parser::OmpDependClause::InOut>(dependClause->v.u)
-              .t)
-          .v) {
-  case Fortran::parser::OmpDependenceType::Type::In:
+  const auto &inOut = std::get<omp::clause::Depend::InOut>(clause.u);
+  switch (std::get<omp::clause::Depend::Type>(inOut.t)) {
+  case omp::clause::Depend::Type::In:
     pbKind = mlir::omp::ClauseTaskDepend::taskdependin;
     break;
-  case Fortran::parser::OmpDependenceType::Type::Out:
+  case omp::clause::Depend::Type::Out:
     pbKind = mlir::omp::ClauseTaskDepend::taskdependout;
     break;
-  case Fortran::parser::OmpDependenceType::Type::Inout:
+  case omp::clause::Depend::Type::Inout:
     pbKind = mlir::omp::ClauseTaskDepend::taskdependinout;
     break;
   default:
@@ -189,45 +171,41 @@ genDependKindAttr(fir::FirOpBuilder &firOpBuilder,
                                               pbKind);
 }
 
-static mlir::Value getIfClauseOperand(
-    Fortran::lower::AbstractConverter &converter,
-    const Fortran::parser::OmpClause::If *ifClause,
-    Fortran::parser::OmpIfClause::DirectiveNameModifier directiveName,
-    mlir::Location clauseLocation) {
+static mlir::Value
+getIfClauseOperand(Fortran::lower::AbstractConverter &converter,
+                   const omp::clause::If &clause,
+                   omp::clause::If::DirectiveNameModifier directiveName,
+                   mlir::Location clauseLocation) {
   // Only consider the clause if it's intended for the given directive.
-  auto &directive = std::get<
-      std::optional<Fortran::parser::OmpIfClause::DirectiveNameModifier>>(
-      ifClause->v.t);
+  auto &directive =
+      std::get<std::optional<omp::clause::If::DirectiveNameModifier>>(clause.t);
   if (directive && directive.value() != directiveName)
     return nullptr;
 
   Fortran::lower::StatementContext stmtCtx;
   fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();
-  auto &expr = std::get<Fortran::parser::ScalarLogicalExpr>(ifClause->v.t);
   mlir::Value ifVal = fir::getBase(
-      converter.genExprValue(*Fortran::semantics::GetExpr(expr), stmtCtx));
+      converter.genExprValue(std::get<omp::SomeExpr>(clause.t), stmtCtx));
   return firOpBuilder.createConvert(clauseLocation, firOpBuilder.getI1Type(),
                                     ifVal);
 }
 
 static void
 addUseDeviceClause(Fortran::lower::AbstractConverter &converter,
-                   const Fortran::parser::OmpObjectList &useDeviceClause,
+                   const omp::ObjectList &objects,
                    llvm::SmallVectorImpl<mlir::Value> &operands,
                    llvm::SmallVectorImpl<mlir::Type> &useDeviceTypes,
                    llvm::SmallVectorImpl<mlir::Location> &useDeviceLocs,
                    llvm::SmallVectorImpl<const Fortran::semantics::Symbol *>
                        &useDeviceSymbols) {
-  genObjectList(useDeviceClause, converter, operands);
+  genObjectList(objects, converter, operands);
   for (mlir::Value &operand : operands) {
     checkMapType(operand.getLoc(), operand.getType());
     useDeviceTypes.push_back(operand.getType());
     useDeviceLocs.push_back(operand.getLoc());
   }
-  for (const Fortran::parser::OmpObject &ompObject : useDeviceClause.v) {
-    Fortran::semantics::Symbol *sym = getOmpObjectSymbol(ompObject);
-    useDeviceSymbols.push_back(sym);
-  }
+  for (const omp::Object &object : objects)
+    useDeviceSymbols.push_back(object.id());
 }
 
 //===----------------------------------------------------------------------===//
@@ -253,9 +231,8 @@ bool ClauseProcessor::processCollapse(
   }
 
   std::int64_t collapseValue = 1l;
-  if (auto *collapseClause = findUniqueClause<ClauseTy::Collapse>()) {
-    const auto *expr = Fortran::semantics::GetExpr(collapseClause->v);
-    collapseValue = Fortran::evaluate::ToInt64(*expr).value();
+  if (auto *clause = findUniqueClause<omp::clause::Collapse>()) {
+    collapseValue = Fortran::evaluate::ToInt64(clause->v).value();
     found = true;
   }
 
@@ -294,19 +271,19 @@ bool ClauseProcessor::processCollapse(
 }
 
 bool ClauseProcessor::processDefault() const {
-  if (auto *defaultClause = findUniqueClause<ClauseTy::Default>()) {
+  if (auto *clause = findUniqueClause<omp::clause::Default>()) {
     // Private, Firstprivate, Shared, None
-    switch (defaultClause->v.v) {
-    case Fortran::parser::OmpDefaultClause::Type::Shared:
-    case Fortran::parser::OmpDefaultClause::Type::None:
+    switch (clause->v) {
+    case omp::clause::Default::Type::Shared:
+    case omp::clause::Default::Type::None:
       // Default clause with shared or none do not require any handling since
       // Shared is the default behavior in the IR and None is only required
       // for semantic checks.
       break;
-    case Fortran::parser::OmpDefaultClause::Type::Private:
+    case omp::clause::Default::Type::Private:
       // TODO Support default(private)
       break;
-    case Fortran::parser::OmpDefaultClause::Type::Firstprivate:
+    case omp::clause::Default::Type::Firstprivate:
       // TODO Support default(firstprivate)
       break;
     }
@@ -318,20 +295,17 @@ bool ClauseProcessor::processDefault() const {
 bool ClauseProcessor::processDevice(Fortran::lower::StatementContext &stmtCtx,
                                     mlir::Value &result) const {
   const Fortran::parser::CharBlock *source = nullptr;
-  if (auto *deviceClause = findUniqueClause<ClauseTy::Device>(&source)) {
+  if (auto *clause = findUniqueClause<omp::clause::Device>(&source)) {
     mlir::Location clauseLocation = converter.genLocation(*source);
-    if (auto deviceModifier = std::get<
-            std::optional<Fortran::parser::OmpDeviceClause::DeviceModifier>>(
-            deviceClause->v.t)) {
-      if (deviceModifier ==
-          Fortran::parser::OmpDeviceClause::DeviceModifier::Ancestor) {
+    if (auto deviceModifier =
+            std::get<std::optional<omp::clause::Device::DeviceModifier>>(
+                clause->t)) {
+      if (deviceModifier == omp::clause::Device::DeviceModifier::Ancestor) {
         TODO(clauseLocation, "OMPD_target Device Modifier Ancestor");
       }
     }
-    if (const auto *deviceExpr = Fortran::semantics::GetExpr(
-            std::get<Fortran::parser::ScalarIntExpr>(deviceClause->v.t))) {
-      result = fir::getBase(converter.genExprValue(*deviceExpr, stmtCtx));
-    }
+    const auto &deviceExpr = std::get<omp::SomeExpr>(clause->t);
+    result = fir::getBase(converter.genExprValue(deviceExpr, stmtCtx));
     return true;
   }
   return false;
@@ -339,16 +313,16 @@ bool ClauseProcessor::processDevice(Fortran::lower::StatementContext &stmtCtx,
 
 bool ClauseProcessor::processDeviceType(
     mlir::omp::DeclareTargetDeviceType &result) const {
-  if (auto *deviceTypeClause = findUniqueClause<ClauseTy::DeviceType>()) {
+  if (auto *clause = findUniqueClause<omp::clause::DeviceType>()) {
     // Case: declare target ... device_type(any | host | nohost)
-    switch (deviceTypeClause->v.v) {
-    case Fortran::parser::OmpDeviceTypeClause::Type::Nohost:
+    switch (clause->v) {
+    case omp::clause::DeviceType::Type::Nohost:
       result = mlir::omp::DeclareTargetDeviceType::nohost;
       break;
-    case Fortran::parser::OmpDeviceTypeClause::Type::Host:
+    case omp::clause::DeviceType::Type::Host:
       result = mlir::omp::DeclareTargetDeviceType::host;
       break;
-    case Fortran::parser::OmpDeviceTypeClause::Type::Any:
+    case omp::clause::DeviceType::Type::Any:
       result = mlir::omp::DeclareTargetDeviceType::any;
       break;
     }
@@ -360,12 +334,12 @@ bool ClauseProcessor::processDeviceType(
 bool ClauseProcessor::processFinal(Fortran::lower::StatementContext &stmtCtx,
                                    mlir::Value &result) const {
   const Fortran::parser::CharBlock *source = nullptr;
-  if (auto *finalClause = findUniqueClause<ClauseTy::Final>(&source)) {
+  if (auto *clause = findUniqueClause<omp::clause::Final>(&source)) {
     fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();
     mlir::Location clauseLocation = converter.genLocation(*source);
 
-    mlir::Value finalVal = fir::getBase(converter.genExprValue(
-        *Fortran::semantics::GetExpr(finalClause->v), stmtCtx));
+    mlir::Value finalVal =
+        fir::getBase(converter.genExprValue(clause->v, stmtCtx));
     result = firOpBuilder.createConvert(clauseLocation,
                                         firOpBuilder.getI1Type(), finalVal);
     return true;
@@ -374,10 +348,9 @@ bool ClauseProcessor::processFinal(Fortran::lower::StatementContext &stmtCtx,
 }
 
 bool ClauseProcessor::processHint(mlir::IntegerAttr &result) const {
-  if (auto *hintClause = findUniqueClause<ClauseTy::Hint>()) {
+  if (auto *clause = findUniqueClause<omp::clause::Hint>()) {
     fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();
-    const auto *expr = Fortran::semantics::GetExpr(hintClause->v);
-    int64_t hintValue = *Fortran::evaluate::ToInt64(*expr);
+    int64_t hintValue = *Fortran::evaluate::ToInt64(clause->v);
     result = firOpBuilder.getI64IntegerAttr(hintValue);
     return true;
   }
@@ -385,20 +358,19 @@ bool ClauseProcessor::processHint(mlir::IntegerAttr &result) const {
 }
 
 bool ClauseProcessor::processMergeable(mlir::UnitAttr &result) const {
-  return markClauseOccurrence<ClauseTy::Mergeable>(result);
+  return markClauseOccurrence<omp::clause::Mergeable>(result);
 }
 
 bool ClauseProcessor::processNowait(mlir::UnitAttr &result) const {
-  return markClauseOccurrence<ClauseTy::Nowait>(result);
+  return markClauseOccurrence<omp::clause::Nowait>(result);
 }
 
 bool ClauseProcessor::processNumTeams(Fortran::lower::StatementContext &stmtCtx,
                                       mlir::Value &result) const {
   // TODO Get lower and upper bounds for num_teams when parser is updated to
   // accept both.
-  if (auto *numTeamsClause = findUniqueClause<ClauseTy::NumTeams>()) {
-    result = fir::getBase(converter.genExprValue(
-        *Fortran::semantics::GetExpr(numTeamsClause->v), stmtCtx));
+  if (auto *clause = findUniqueClause<omp::clause::NumTeams>()) {
+    result = fir::getBase(converter.genExprValue(clause->v, stmtCtx));
     return true;
   }
   return false;
@@ -406,23 +378,20 @@ bool ClauseProcessor::processNumTeams(Fortran::lower::StatementContext &stmtCtx,
 
 bool ClauseProcessor::processNumThreads(
     Fortran::lower::StatementContext &stmtCtx, mlir::Value &result) const {
-  if (auto *numThreadsClause = findUniqueClause<ClauseTy::NumThreads>()) {
+  if (auto *clause = findUniqueClause<omp::clause::NumThreads>()) {
     // OMPIRBuilder expects `NUM_THREADS` clause as a `Value`.
-    result = fir::getBase(converter.genExprValue(
-        *Fortran::semantics::GetExpr(numThreadsClause->v), stmtCtx));
+    result = fir::getBase(converter.genExprValue(clause->v, stmtCtx));
     return true;
   }
   return false;
 }
 
 bool ClauseProcessor::processOrdered(mlir::IntegerAttr &result) const {
-  if (auto *orderedClause = findUniqueClause<ClauseTy::Ordered>()) {
+  if (auto *clause = findUniqueClause<omp::clause::Ordered>()) {
     fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();
     int64_t orderedClauseValue = 0l;
-    if (orderedClause->v.has_value()) {
-      const auto *expr = Fortran::semantics::GetExpr(orderedClause->v);
-      orderedClauseValue = *Fortran::evaluate::ToInt64(*expr);
-    }
+    if (clause->v.has_value())
+      orderedClauseValue = *Fortran::evaluate::ToInt64(*clause->v);
     result = firOpBuilder.getI64IntegerAttr(orderedClauseValue);
     return true;
   }
@@ -431,9 +400,8 @@ bool ClauseProcessor::processOrdered(mlir::IntegerAttr &result) const {
 
 bool ClauseProcessor::processPriority(Fortran::lower::StatementContext &stmtCtx,
                                       mlir::Value &result) const {
-  if (auto *priorityClause = findUniqueClause<ClauseTy::Priority>()) {
-    result = fir::getBase(converter.genExprValue(
-        *Fortran::semantics::GetExpr(priorityClause->v), stmtCtx));
+  if (auto *clause = findUniqueClause<omp::clause::Priority>()) {
+    result = fir::getBase(converter.genExprValue(clause->v, stmtCtx));
     return true;
   }
   return false;
@@ -441,20 +409,19 @@ bool ClauseProcessor::processPriority(Fortran::lower::StatementContext &stmtCtx,
 
 bool ClauseProcessor::processProcBind(
     mlir::omp::ClauseProcBindKindAttr &result) const {
-  if (auto *procBindClause = findUniqueClause<ClauseTy::ProcBind>()) {
+  if (auto *clause = findUniqueClause<omp::clause::ProcBind>()) {
     fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();
-    result = genProcBindKindAttr(firOpBuilder, procBindClause);
+    result = genProcBindKindAttr(firOpBuilder, *clause);
     return true;
   }
   return false;
 }
 
 bool ClauseProcessor::processSafelen(mlir::IntegerAttr &result) const {
-  if (auto *safelenClause = findUniqueClause<ClauseTy::Safelen>()) {
+  if (auto *clause = findUniqueClause<omp::clause::Safelen>()) {
     fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();
-    const auto *expr = Fortran::semantics::GetExpr(safelenClause->v);
     const std::optional<std::int64_t> safelenVal =
-        Fortran::evaluate::ToInt64(*expr);
+        Fortran::evaluate::ToInt64(clause->v);
     result = firOpBuilder.getI64IntegerAttr(*safelenVal);
     return true;
   }
@@ -465,41 +432,38 @@ bool ClauseProcessor::processSchedule(
     mlir::omp::ClauseScheduleKindAttr &valAttr,
     mlir::omp::ScheduleModifierAttr &modifierAttr,
     mlir::UnitAttr &simdModifierAttr) const {
-  if (auto *scheduleClause = findUniqueClause<ClauseTy::Schedule>()) {
+  if (auto *clause = findUniqueClause<omp::clause::Schedule>()) {
     fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();
     mlir::MLIRContext *context = firOpBuilder.getContext();
-    const Fortran::parser::OmpScheduleClause &scheduleType = scheduleClause->v;
-    const auto &scheduleClauseKind =
-        std::get<Fortran::parser::OmpScheduleClause::ScheduleType>(
-            scheduleType.t);
+    const auto &scheduleType =
+        std::get<omp::clause::Schedule::ScheduleType>(clause->t);
 
     mlir::omp::ClauseScheduleKind scheduleKind;
-    switch (scheduleClauseKind) {
-    case Fortran::parser::OmpScheduleClause::ScheduleType::Static:
+    switch (scheduleType) {
+    case omp::clause::Schedule::ScheduleType::Static:
       scheduleKind = mlir::omp::ClauseScheduleKind::Static;
       break;
-    case Fortran::parser::OmpScheduleClause::ScheduleType::Dynamic:
+    case omp::clause::Schedule::ScheduleType::Dynamic:
       scheduleKind = mlir::omp::ClauseScheduleKind::Dynamic;
       break;
-    case Fortran::parser::OmpScheduleClause::ScheduleType::Guided:
+    case omp::clause::Schedule::ScheduleType::Guided:
       scheduleKind = mlir::omp::ClauseScheduleKind::Guided;
       break;
-    case Fortran::parser::OmpScheduleClause::ScheduleType::Auto:
+    case omp::clause::Schedule::ScheduleType::Auto:
       scheduleKind = mlir::omp::ClauseScheduleKind::Auto;
       break;
-    case Fortran::parser::OmpScheduleClause::ScheduleType::Runtime:
+    case omp::clause::Schedule::ScheduleType::Runtime:
       scheduleKind = mlir::omp::ClauseScheduleKind::Runtime;
       break;
     }
 
-    mlir::omp::ScheduleModifier scheduleModifier =
-        getScheduleModifier(scheduleClause->v);
+    mlir::omp::ScheduleModifier scheduleModifier = getScheduleModifier(*clause);
 
     if (scheduleModifier != mlir::omp::ScheduleModifier::none)
       modifierAttr =
           mlir::omp::ScheduleModifierAttr::get(context, scheduleModifier);
 
-    if (getSimdModifier(scheduleClause->v) != mlir::omp::ScheduleModifier::none)
+    if (getSimdModifier(*clause) != mlir::omp::ScheduleModifier::none)
       simdModifierAttr = firOpBuilder.getUnitAttr();
 
     valAttr = mlir::omp::ClauseScheduleKindAttr::get(context, scheduleKind);
@@ -510,25 +474,19 @@ bool ClauseProcessor::processSchedule(
 
 bool ClauseProcessor::processScheduleChunk(
     Fortran::lower::StatementContext &stmtCtx, mlir::Value &result) const {
-  if (auto *scheduleClause = findUniqueClause<ClauseTy::Schedule>()) {
-    if (const auto &chunkExpr =
-            std::get<std::optional<Fortran::parser::ScalarIntExpr>>(
-                scheduleClause->v.t)) {
-      if (const auto *expr = Fortran::semantics::GetExpr(*chunkExpr)) {
-        result = fir::getBase(converter.genExprValue(*expr, stmtCtx));
-      }
-    }
+  if (auto *clause = findUniqueClause<omp::clause::Schedule>()) {
+    if (const auto &chunkExpr = std::get<omp::MaybeExpr>(clause->t))
+      result = fir::getBase(converter.genExprValue(*chunkExpr, stmtCtx));
     return true;
   }
   return false;
 }
 
 bool ClauseProcessor::processSimdlen(mlir::IntegerAttr &result) const {
-  if (auto *simdlenClause = findUniqueClause<ClauseTy::Simdlen>()) {
+  if (auto *clause = findUniqueClause<omp::clause::Simdlen>()) {
     fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();
-    const auto *expr = Fortran::semantics::GetExpr(simdlenClause->v);
     const std::optional<std::int64_t> simdlenVal =
-        Fortran::evaluate::ToInt64(*expr);
+        Fortran::evaluate::ToInt64(clause->v);
     result = firOpBuilder.getI64IntegerAttr(*simdlenVal);
     return true;
   }
@@ -537,16 +495,15 @@ bool ClauseProcessor::processSimdlen(mlir::IntegerAttr &result) const {
 
 bool ClauseProcessor::processThreadLimit(
     Fortran::lower::StatementContext &stmtCtx, mlir::Value &result) const {
-  if (auto *threadLmtClause = findUniqueClause<ClauseTy::ThreadLimit>()) {
-    result = fir::getBase(converter.genExprValue(
-        *Fortran::semantics::GetExpr(threadLmtClause->v), stmtCtx));
+  if (auto *clause = findUniqueClause<omp::clause::ThreadLimit>()) {
+    result = fir::getBase(converter.genExprValue(clause->v, stmtCtx));
     return true;
   }
   return false;
 }
 
 bool ClauseProcessor::processUntied(mlir::UnitAttr &result) const {
-  return markClauseOccurrence<ClauseTy::Untied>(result);
+  return markClauseOccurrence<omp::clause::Untied>(result);
 }
 
 //===----------------------------------------------------------------------===//
@@ -556,10 +513,10 @@ bool ClauseProcessor::processUntied(mlir::UnitAttr &result) const {
 bool ClauseProcessor::processAllocate(
     llvm::SmallVectorImpl<mlir::Value> &allocatorOperands,
     llvm::SmallVectorImpl<mlir::Value> &allocateOperands) const {
-  return findRepeatableClause<ClauseTy::Allocate>(
-      [&](const ClauseTy::Allocate *allocateClause,
+  return findRepeatableClause<omp::clause::Allocate>(
+      [&](const omp::clause::Allocate &clause,
           const Fortran::parser::CharBlock &) {
-        genAllocateClause(converter, allocateClause->v, allocatorOperands,
+        genAllocateClause(converter, clause, allocatorOperands,
                           allocateOperands);
       });
 }
@@ -576,12 +533,12 @@ bool ClauseProcessor::processCopyin() const {
         if (converter.isPresentShallowLookup(*sym))
           converter.copyHostAssociateVar(*sym, copyAssignIP);
       };
-  bool hasCopyin = findRepeatableClause<ClauseTy::Copyin>(
-      [&](const ClauseTy::Copyin *copyinClause,
+  bool hasCopyin = findRepeatableClause<omp::clause::Copyin>(
+      [&](const omp::clause::Copyin &clause,
           const Fortran::parser::CharBlock &) {
-        const Fortran::parser::OmpObjectList &ompObjectList = copyinClause->v;
-        for (const Fortran::parser::OmpObject &ompObject : ompObjectList.v) {
-          Fortran::semantics::Symbol *sym = getOmpObjectSymbol(ompObject);
+        for (const omp::Object &object : clause.v) {
+          Fortran::semantics::Symbol *sym = object.id();
+          assert(sym && "Expecting symbol");
           if (const auto *commonDetails =
                   sym->detailsIf<Fortran::semantics::CommonBlockDetails>()) {
             for (const auto &mem : commonDetails->objects())
@@ -620,7 +577,7 @@ class TypeInfo {
   }
 
   // Returns the shape of array types.
-  const llvm::SmallVector<int64_t> &getShape() const { return shape; }
+  llvm::ArrayRef<int64_t> getShape() const { return shape; }
 
   // Is the type inside a box?
   bool isBox() const { return inBox; }
@@ -745,13 +702,11 @@ bool ClauseProcessor::processCopyPrivate(
     copyPrivateFuncs.push_back(mlir::SymbolRefAttr::get(funcOp));
   };
 
-  bool hasCopyPrivate = findRepeatableClause<ClauseTy::Copyprivate>(
-      [&](const ClauseTy::Copyprivate *copyPrivateClause,
+  bool hasCopyPrivate = findRepeatableClause<clause::Copyprivate>(
+      [&](const clause::Copyprivate &clause,
           const Fortran::parser::CharBlock &) {
-        const Fortran::parser::OmpObjectList &ompObjectList =
-            copyPrivateClause->v;
-        for (const Fortran::parser::OmpObject &ompObject : ompObjectList.v) {
-          Fortran::semantics::Symbol *sym = getOmpObjectSymbol(ompObject);
+        for (const Object &object : clause.v) {
+          Fortran::semantics::Symbol *sym = object.id();
           if (const auto *commonDetails =
                   sym->detailsIf<Fortran::semantics::CommonBlockDetails>()) {
             for (const auto &mem : commonDetails->objects())
@@ -770,38 +725,30 @@ bool ClauseProcessor::processDepend(
     llvm::SmallVectorImpl<mlir::Value> &dependOperands) const {
   fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();
 
-  return findRepeatableClause<ClauseTy::Depend>(
-      [&](const ClauseTy::Depend *dependClause,
+  return findRepeatableClause<omp::clause::Depend>(
+      [&](const omp::clause::Depend &clause,
           const Fortran::parser::CharBlock &) {
-        const std::list<Fortran::parser::Designator> &depVal =
-            std::get<std::list<Fortran::parser::Designator>>(
-                std::get<Fortran::parser::OmpDependClause::InOut>(
-                    dependClause->v.u)
-                    .t);
+        assert(std::holds_alternative<omp::clause::Depend::InOut>(clause.u) &&
+               "Only InOut is handled at the moment");
+        const auto &inOut = std::get<omp::clause::Depend::InOut>(clause.u);
+        const auto &objects = std::get<omp::ObjectList>(inOut.t);
+
         mlir::omp::ClauseTaskDependAttr dependTypeOperand =
-            genDependKindAttr(firOpBuilder, dependClause);
-        dependTypeOperands.insert(dependTypeOperands.end(), depVal.size(),
-                                  dependTypeOperand);
-        for (const Fortran::parser::Designator &ompObject : depVal) {
-          Fortran::semantics::Symbol *sym = nullptr;
-          std::visit(
-              Fortran::common::visitors{
-                  [&](const Fortran::parser::DataRef &designator) {
-                    if (const Fortran::parser::Name *name =
-                            std::get_if<Fortran::parser::Name>(&designator.u)) {
-                      sym = name->symbol;
-                    } else if (std::get_if<Fortran::common::Indirection<
-                                   Fortran::parser::ArrayElement>>(
-                                   &designator.u)) {
-                      TODO(converter.getCurrentLocation(),
-                           "array sections not supported for task depend");
-                    }
-                  },
-                  [&](const Fortran::parser::Substring &designator) {
-                    TODO(converter.getCurrentLocation(),
-                         "substring not supported for task depend");
-                  }},
-              (ompObject).u);
+            genDependKindAttr(firOpBuilder, clause);
+        dependTypeOperands.append(objects.size(), dependTypeOperand);
+
+        for (const omp::Object &object : objects) {
+          assert(object.ref() && "Expecting designator");
+
+          if (Fortran::evaluate::ExtractSubstring(*object.ref())) {
+            TODO(converter.getCurrentLocation(),
+                 "substring not supported for task depend");
+          } else if (Fortran::evaluate::IsArrayElement(*object.ref())) {
+            TODO(converter.getCurrentLocation(),
+                 "array sections not supported for task depend");
+          }
+
+          Fortran::semantics::Symbol *sym = object.id();
           const mlir::Value variable = converter.getSymbolAddress(*sym);
           dependOperands.push_back(variable);
         }
@@ -809,14 +756,14 @@ bool ClauseProcessor::processDepend(
 }
 
 bool ClauseProcessor::processIf(
-    Fortran::parser::OmpIfClause::DirectiveNameModifier directiveName,
+    omp::clause::If::DirectiveNameModifier directiveName,
     mlir::Value &result) const {
   bool found = false;
-  findRepeatableClause<ClauseTy::If>(
-      [&](const ClauseTy::If *ifClause,
+  findRepeatableClause<omp::clause::If>(
+      [&](const omp::clause::If &clause,
           const Fortran::parser::CharBlock &source) {
         mlir::Location clauseLocation = converter.genLocation(source);
-        mlir::Value operand = getIfClauseOperand(converter, ifClause,
+        mlir::Value operand = getIfClauseOperand(converter, clause,
                                                  directiveName, clauseLocation);
         // Assume that, at most, a single 'if' clause will be applicable to the
         // given directive.
@@ -830,20 +777,19 @@ bool ClauseProcessor::processIf(
 
 bool ClauseProcessor::processLink(
     llvm::SmallVectorImpl<DeclareTargetCapturePair> &result) const {
-  return findRepeatableClause<ClauseTy::Link>(
-      [&](const ClauseTy::Link *linkClause,
-          const Fortran::parser::CharBlock &) {
+  return findRepeatableClause<omp::clause::Link>(
+      [&](const omp::clause::Link &clause, const Fortran::parser::CharBlock &) {
         // Case: declare target link(var1, var2)...
         gatherFuncAndVarSyms(
-            linkClause->v, mlir::omp::DeclareTargetCaptureClause::link, result);
+            clause.v, mlir::omp::DeclareTargetCaptureClause::link, result);
       });
 }
 
 mlir::omp::MapInfoOp
 createMapInfoOp(fir::FirOpBuilder &builder, mlir::Location loc,
                 mlir::Value baseAddr, mlir::Value varPtrPtr, std::string name,
-                mlir::SmallVector<mlir::Value> bounds,
-                mlir::SmallVector<mlir::Value> members, uint64_t mapType,
+                llvm::ArrayRef<mlir::Value> bounds,
+                llvm::ArrayRef<mlir::Value> members, uint64_t mapType,
                 mlir::omp::VariableCaptureKind mapCaptureType, mlir::Type retTy,
                 bool isVal) {
   if (auto boxTy = baseAddr.getType().dyn_cast<fir::BaseBoxType>()) {
@@ -872,7 +818,7 @@ bool ClauseProcessor::processMap(
     llvm::SmallVectorImpl<const Fortran::semantics::Symbol *> *mapSymbols)
     const {
   fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();
-  return findRepeatableClause<ClauseTy::Map>(
+  return findRepeatableClause2<ClauseTy::Map>(
       [&](const ClauseTy::Map *mapClause,
           const Fortran::parser::CharBlock &source) {
         mlir::Location clauseLocation = converter.genLocation(source);
@@ -964,43 +910,41 @@ bool ClauseProcessor::processReduction(
     llvm::SmallVectorImpl<mlir::Attribute> &reductionDeclSymbols,
     llvm::SmallVectorImpl<const Fortran::semantics::Symbol *> *reductionSymbols)
     const {
-  return findRepeatableClause<ClauseTy::Reduction>(
-      [&](const ClauseTy::Reduction *reductionClause,
+  return findRepeatableClause<omp::clause::Reduction>(
+      [&](const omp::clause::Reduction &clause,
           const Fortran::parser::CharBlock &) {
         ReductionProcessor rp;
-        rp.addReductionDecl(currentLocation, converter, reductionClause->v,
-                            reductionVars, reductionDeclSymbols,
-                            reductionSymbols);
+        rp.addReductionDecl(currentLocation, converter, clause, reductionVars,
+                            reductionDeclSymbols, reductionSymbols);
       });
 }
 
 bool ClauseProcessor::processSectionsReduction(
     mlir::Location currentLocation) const {
-  return findRepeatableClause<ClauseTy::Reduction>(
-      [&](const ClauseTy::Reduction *, const Fortran::parser::CharBlock &) {
+  return findRepeatableClause<omp::clause::Reduction>(
+      [&](const omp::clause::Reduction &, const Fortran::parser::CharBlock &) {
         TODO(currentLocation, "OMPC_Reduction");
       });
 }
 
 bool ClauseProcessor::processTo(
     llvm::SmallVectorImpl<DeclareTargetCapturePair> &result) const {
-  return findRepeatableClause<ClauseTy::To>(
-      [&](const ClauseTy::To *toClause, const Fortran::parser::CharBlock &) {
+  return findRepeatableClause<omp::clause::To>(
+      [&](const omp::clause::To &clause, const Fortran::parser::CharBlock &) {
         // Case: declare target to(func, var1, var2)...
-        gatherFuncAndVarSyms(toClause->v,
+        gatherFuncAndVarSyms(clause.v,
                              mlir::omp::DeclareTargetCaptureClause::to, result);
       });
 }
 
 bool ClauseProcessor::processEnter(
     llvm::SmallVectorImpl<DeclareTargetCapturePair> &result) const {
-  return findRepeatableClause<ClauseTy::Enter>(
-      [&](const ClauseTy::Enter *enterClause,
+  return findRepeatableClause<omp::clause::Enter>(
+      [&](const omp::clause::Enter &clause,
           const Fortran::parser::CharBlock &) {
         // Case: declare target enter(func, var1, var2)...
-        gatherFuncAndVarSyms(enterClause->v,
-                             mlir::omp::DeclareTargetCaptureClause::enter,
-                             result);
+        gatherFuncAndVarSyms(
+            clause.v, mlir::omp::DeclareTargetCaptureClause::enter, result);
       });
 }
 
@@ -1010,11 +954,11 @@ bool ClauseProcessor::processUseDeviceAddr(
     llvm::SmallVectorImpl<mlir::Location> &useDeviceLocs,
     llvm::SmallVectorImpl<const Fortran::semantics::Symbol *> &useDeviceSymbols)
     const {
-  return findRepeatableClause<ClauseTy::UseDeviceAddr>(
-      [&](const ClauseTy::UseDeviceAddr *devAddrClause,
+  return findRepeatableClause<omp::clause::UseDeviceAddr>(
+      [&](const omp::clause::UseDeviceAddr &clause,
           const Fortran::parser::CharBlock &) {
-        addUseDeviceClause(converter, devAddrClause->v, operands,
-                           useDeviceTypes, useDeviceLocs, useDeviceSymbols);
+        addUseDeviceClause(converter, clause.v, operands, useDeviceTypes,
+                           useDeviceLocs, useDeviceSymbols);
       });
 }
 
@@ -1024,10 +968,10 @@ bool ClauseProcessor::processUseDevicePtr(
     llvm::SmallVectorImpl<mlir::Location> &useDeviceLocs,
     llvm::SmallVectorImpl<const Fortran::semantics::Symbol *> &useDeviceSymbols)
     const {
-  return findRepeatableClause<ClauseTy::UseDevicePtr>(
-      [&](const ClauseTy::UseDevicePtr *devPtrClause,
+  return findRepeatableClause<omp::clause::UseDevicePtr>(
+      [&](const omp::clause::UseDevicePtr &clause,
           const Fortran::parser::CharBlock &) {
-        addUseDeviceClause(converter, devPtrClause->v, operands, useDeviceTypes,
+        addUseDeviceClause(converter, clause.v, operands, useDeviceTypes,
                            useDeviceLocs, useDeviceSymbols);
       });
 }
diff --git a/flang/lib/Lower/OpenMP/ClauseProcessor.h b/flang/lib/Lower/OpenMP/ClauseProcessor.h
index 11aff0be25053d..3f6adcce8ae877 100644
--- a/flang/lib/Lower/OpenMP/ClauseProcessor.h
+++ b/flang/lib/Lower/OpenMP/ClauseProcessor.h
@@ -12,6 +12,7 @@
 #ifndef FORTRAN_LOWER_CLAUASEPROCESSOR_H
 #define FORTRAN_LOWER_CLAUASEPROCESSOR_H
 
+#include "Clauses.h"
 #include "DirectivesCommon.h"
 #include "ReductionProcessor.h"
 #include "Utils.h"
@@ -51,7 +52,8 @@ class ClauseProcessor {
   ClauseProcessor(Fortran::lower::AbstractConverter &converter,
                   Fortran::semantics::SemanticsContext &semaCtx,
                   const Fortran::parser::OmpClauseList &clauses)
-      : converter(converter), semaCtx(semaCtx), clauses(clauses) {}
+      : converter(converter), semaCtx(semaCtx), clauses2(clauses),
+        clauses(makeList(clauses, semaCtx)) {}
 
   // 'Unique' clauses: They can appear at most once in the clause list.
   bool
@@ -103,9 +105,8 @@ class ClauseProcessor {
                      llvm::SmallVectorImpl<mlir::Value> &dependOperands) const;
   bool
   processEnter(llvm::SmallVectorImpl<DeclareTargetCapturePair> &result) const;
-  bool
-  processIf(Fortran::parser::OmpIfClause::DirectiveNameModifier directiveName,
-            mlir::Value &result) const;
+  bool processIf(omp::clause::If::DirectiveNameModifier directiveName,
+                 mlir::Value &result) const;
   bool
   processLink(llvm::SmallVectorImpl<DeclareTargetCapturePair> &result) const;
 
@@ -155,11 +156,15 @@ class ClauseProcessor {
                    llvm::omp::Directive directive) const;
 
 private:
-  using ClauseIterator = std::list<ClauseTy>::const_iterator;
+  using ClauseIterator = List<Clause>::const_iterator;
+  using ClauseIterator2 = std::list<ClauseTy>::const_iterator;
 
   /// Utility to find a clause within a range in the clause list.
   template <typename T>
   static ClauseIterator findClause(ClauseIterator begin, ClauseIterator end);
+  template <typename T>
+  static ClauseIterator2 findClause2(ClauseIterator2 begin,
+                                     ClauseIterator2 end);
 
   /// Return the first instance of the given clause found in the clause list or
   /// `nullptr` if not present. If more than one instance is expected, use
@@ -172,6 +177,10 @@ class ClauseProcessor {
   /// if at least one instance was found.
   template <typename T>
   bool findRepeatableClause(
+      std::function<void(const T &, const Fortran::parser::CharBlock &source)>
+          callbackFn) const;
+  template <typename T>
+  bool findRepeatableClause2(
       std::function<void(const T *, const Fortran::parser::CharBlock &source)>
           callbackFn) const;
 
@@ -181,14 +190,15 @@ class ClauseProcessor {
 
   Fortran::lower::AbstractConverter &converter;
   Fortran::semantics::SemanticsContext &semaCtx;
-  const Fortran::parser::OmpClauseList &clauses;
+  const Fortran::parser::OmpClauseList &clauses2;
+  List<Clause> clauses;
 };
 
 template <typename T>
 bool ClauseProcessor::processMotionClauses(
     Fortran::lower::StatementContext &stmtCtx,
     llvm::SmallVectorImpl<mlir::Value> &mapOperands) {
-  return findRepeatableClause<T>(
+  return findRepeatableClause2<T>(
       [&](const T *motionClause, const Fortran::parser::CharBlock &source) {
         mlir::Location clauseLocation = converter.genLocation(source);
         fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();
@@ -248,7 +258,7 @@ void ClauseProcessor::processTODO(mlir::Location currentLocation,
              " construct");
   };
 
-  for (ClauseIterator it = clauses.v.begin(); it != clauses.v.end(); ++it)
+  for (ClauseIterator2 it = clauses2.v.begin(); it != clauses2.v.end(); ++it)
     (checkUnhandledClause(std::get_if<Ts>(&it->u)), ...);
 }
 
@@ -263,11 +273,22 @@ ClauseProcessor::findClause(ClauseIterator begin, ClauseIterator end) {
   return end;
 }
 
+template <typename T>
+ClauseProcessor::ClauseIterator2
+ClauseProcessor::findClause2(ClauseIterator2 begin, ClauseIterator2 end) {
+  for (ClauseIterator2 it = begin; it != end; ++it) {
+    if (std::get_if<T>(&it->u))
+      return it;
+  }
+
+  return end;
+}
+
 template <typename T>
 const T *ClauseProcessor::findUniqueClause(
     const Fortran::parser::CharBlock **source) const {
-  ClauseIterator it = findClause<T>(clauses.v.begin(), clauses.v.end());
-  if (it != clauses.v.end()) {
+  ClauseIterator it = findClause<T>(clauses.begin(), clauses.end());
+  if (it != clauses.end()) {
     if (source)
       *source = &it->source;
     return &std::get<T>(it->u);
@@ -277,13 +298,31 @@ const T *ClauseProcessor::findUniqueClause(
 
 template <typename T>
 bool ClauseProcessor::findRepeatableClause(
-    std::function<void(const T *, const Fortran::parser::CharBlock &source)>
+    std::function<void(const T &, const Fortran::parser::CharBlock &source)>
         callbackFn) const {
   bool found = false;
-  ClauseIterator nextIt, endIt = clauses.v.end();
-  for (ClauseIterator it = clauses.v.begin(); it != endIt; it = nextIt) {
+  ClauseIterator nextIt, endIt = clauses.end();
+  for (ClauseIterator it = clauses.begin(); it != endIt; it = nextIt) {
     nextIt = findClause<T>(it, endIt);
 
+    if (nextIt != endIt) {
+      callbackFn(std::get<T>(nextIt->u), nextIt->source);
+      found = true;
+      ++nextIt;
+    }
+  }
+  return found;
+}
+
+template <typename T>
+bool ClauseProcessor::findRepeatableClause2(
+    std::function<void(const T *, const Fortran::parser::CharBlock &source)>
+        callbackFn) const {
+  bool found = false;
+  ClauseIterator2 nextIt, endIt = clauses2.v.end();
+  for (ClauseIterator2 it = clauses2.v.begin(); it != endIt; it = nextIt) {
+    nextIt = findClause2<T>(it, endIt);
+
     if (nextIt != endIt) {
       callbackFn(&std::get<T>(nextIt->u), nextIt->source);
       found = true;
diff --git a/flang/lib/Lower/OpenMP/ClauseT.h b/flang/lib/Lower/OpenMP/ClauseT.h
new file mode 100644
index 00000000000000..2aae29af29214a
--- /dev/null
+++ b/flang/lib/Lower/OpenMP/ClauseT.h
@@ -0,0 +1,714 @@
+//===- ClauseT -- clause template definitions -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#ifndef FORTRAN_LOWER_OPENMP_CLAUSET_H
+#define FORTRAN_LOWER_OPENMP_CLAUSET_H
+
+#include "flang/Parser/parse-tree.h" // For enum reuse
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Support/raw_ostream.h"
+
+#include <algorithm>
+#include <iterator>
+#include <optional>
+#include <tuple>
+#include <type_traits>
+#include <utility>
+#include <variant>
+
+#include "llvm/Frontend/OpenMP/OMP.h.inc"
+
+namespace tomp {
+
+template <typename T>
+using ListT = llvm::SmallVector<T, 0>;
+
+// A specialization of ObjectT<Id, Expr> must provide the following definitions:
+// {
+//    using IdType = Id;
+//    using ExprType = Expr;
+//
+//    auto id() const -> Id {
+//      return the identifier of the object (for use in tests for
+//         presence/absence of the object)
+//    }
+//
+//    auto ref() const -> const Expr& {
+//      return the expression accessing (referencing) the object
+//    }
+// }
+//
+// For example, the ObjectT instance created for "var[x+1]" would have
+// the `id()` return the identifier for `var`, and the `ref()` return the
+// representation of the array-access `var[x+1]`.
+template <typename Id, typename Expr>
+struct ObjectT;
+
+template <typename I, typename E>
+using ObjectListT = ListT<ObjectT<I, E>>;
+
+namespace clause {
+// Helper objects
+
+template <typename I, typename E>
+struct DefinedOperatorT {
+  struct DefinedOpName {
+    using WrapperTrait = std::true_type;
+    ObjectT<I, E> v;
+  };
+  using IntrinsicOperator = Fortran::parser::DefinedOperator::IntrinsicOperator;
+  using UnionTrait = std::true_type;
+  std::variant<DefinedOpName, IntrinsicOperator> u;
+};
+
+template <typename I, typename E>
+struct ProcedureDesignatorT {
+  using WrapperTrait = std::true_type;
+  ObjectT<I, E> v;
+};
+
+template <typename I, typename E>
+struct ReductionOperatorT {
+  using UnionTrait = std::true_type;
+  std::variant<DefinedOperatorT<I, E>, ProcedureDesignatorT<I, E>> u;
+};
+
+template <typename I, typename E>
+struct AcqRelT {
+  using EmptyTrait = std::true_type;
+};
+template <typename I, typename E>
+struct AcquireT {
+  using EmptyTrait = std::true_type;
+};
+template <typename I, typename E>
+struct AdjustArgsT {
+  using EmptyTrait = std::true_type;
+};
+template <typename I, typename E>
+struct AffinityT {
+  using EmptyTrait = std::true_type;
+};
+template <typename I, typename E>
+struct AlignT {
+  using EmptyTrait = std::true_type;
+};
+template <typename I, typename E>
+struct AppendArgsT {
+  using EmptyTrait = std::true_type;
+};
+template <typename I, typename E>
+struct AtT {
+  using EmptyTrait = std::true_type;
+};
+template <typename I, typename E>
+struct BindT {
+  using EmptyTrait = std::true_type;
+};
+template <typename I, typename E>
+struct CancellationConstructTypeT {
+  using EmptyTrait = std::true_type;
+};
+template <typename I, typename E>
+struct CaptureT {
+  using EmptyTrait = std::true_type;
+};
+template <typename I, typename E>
+struct CompareT {
+  using EmptyTrait = std::true_type;
+};
+template <typename I, typename E>
+struct DepobjT {
+  using EmptyTrait = std::true_type;
+};
+template <typename I, typename E>
+struct DestroyT {
+  using EmptyTrait = std::true_type;
+};
+template <typename I, typename E>
+struct DetachT {
+  using EmptyTrait = std::true_type;
+};
+template <typename I, typename E>
+struct DoacrossT {
+  using EmptyTrait = std::true_type;
+};
+template <typename I, typename E>
+struct DynamicAllocatorsT {
+  using EmptyTrait = std::true_type;
+};
+template <typename I, typename E>
+struct ExclusiveT {
+  using EmptyTrait = std::true_type;
+};
+template <typename I, typename E>
+struct FailT {
+  using EmptyTrait = std::true_type;
+};
+template <typename I, typename E>
+struct FlushT {
+  using EmptyTrait = std::true_type;
+};
+template <typename I, typename E>
+struct FullT {
+  using EmptyTrait = std::true_type;
+};
+template <typename I, typename E>
+struct InbranchT {
+  using EmptyTrait = std::true_type;
+};
+template <typename I, typename E>
+struct InclusiveT {
+  using EmptyTrait = std::true_type;
+};
+template <typename I, typename E>
+struct IndirectT {
+  using EmptyTrait = std::true_type;
+};
+template <typename I, typename E>
+struct InitT {
+  using EmptyTrait = std::true_type;
+};
+template <typename I, typename E>
+struct MatchT {
+  using EmptyTrait = std::true_type;
+};
+template <typename I, typename E>
+struct MemoryOrderT {
+  using EmptyTrait = std::true_type;
+};
+template <typename I, typename E>
+struct MergeableT {
+  using EmptyTrait = std::true_type;
+};
+template <typename I, typename E>
+struct MessageT {
+  using EmptyTrait = std::true_type;
+};
+template <typename I, typename E>
+struct NogroupT {
+  using EmptyTrait = std::true_type;
+};
+template <typename I, typename E>
+struct NotinbranchT {
+  using EmptyTrait = std::true_type;
+};
+template <typename I, typename E>
+struct NowaitT {
+  using EmptyTrait = std::true_type;
+};
+template <typename I, typename E>
+struct OmpxAttributeT {
+  using EmptyTrait = std::true_type;
+};
+template <typename I, typename E>
+struct OmpxBareT {
+  using EmptyTrait = std::true_type;
+};
+template <typename I, typename E>
+struct ReadT {
+  using EmptyTrait = std::true_type;
+};
+template <typename I, typename E>
+struct RelaxedT {
+  using EmptyTrait = std::true_type;
+};
+template <typename I, typename E>
+struct ReleaseT {
+  using EmptyTrait = std::true_type;
+};
+template <typename I, typename E>
+struct ReverseOffloadT {
+  using EmptyTrait = std::true_type;
+};
+template <typename I, typename E>
+struct SeqCstT {
+  using EmptyTrait = std::true_type;
+};
+template <typename I, typename E>
+struct SeverityT {
+  using EmptyTrait = std::true_type;
+};
+template <typename I, typename E>
+struct SimdT {
+  using EmptyTrait = std::true_type;
+};
+template <typename I, typename E>
+struct ThreadprivateT {
+  using EmptyTrait = std::true_type;
+};
+template <typename I, typename E>
+struct ThreadsT {
+  using EmptyTrait = std::true_type;
+};
+template <typename I, typename E>
+struct UnifiedAddressT {
+  using EmptyTrait = std::true_type;
+};
+template <typename I, typename E>
+struct UnifiedSharedMemoryT {
+  using EmptyTrait = std::true_type;
+};
+template <typename I, typename E>
+struct UnknownT {
+  using EmptyTrait = std::true_type;
+};
+template <typename I, typename E>
+struct UntiedT {
+  using EmptyTrait = std::true_type;
+};
+template <typename I, typename E>
+struct UpdateT {
+  using EmptyTrait = std::true_type;
+};
+template <typename I, typename E>
+struct UseT {
+  using EmptyTrait = std::true_type;
+};
+template <typename I, typename E>
+struct UsesAllocatorsT {
+  using EmptyTrait = std::true_type;
+};
+template <typename I, typename E>
+struct WeakT {
+  using EmptyTrait = std::true_type;
+};
+template <typename I, typename E>
+struct WhenT {
+  using EmptyTrait = std::true_type;
+};
+template <typename I, typename E>
+struct WriteT {
+  using EmptyTrait = std::true_type;
+};
+
+template <typename I, typename E>
+struct AlignedT {
+  using TupleTrait = std::true_type;
+  std::tuple<ObjectListT<I, E>, std::optional<E>> t;
+};
+
+template <typename I, typename E>
+struct AllocateT {
+  struct Modifier {
+    struct Allocator {
+      using WrapperTrait = std::true_type;
+      E v;
+    };
+    struct Align {
+      using WrapperTrait = std::true_type;
+      E v;
+    };
+    struct ComplexModifier {
+      using TupleTrait = std::true_type;
+      std::tuple<Allocator, Align> t;
+    };
+    using UnionTrait = std::true_type;
+    std::variant<Allocator, ComplexModifier, Align> u;
+  };
+  using TupleTrait = std::true_type;
+  std::tuple<std::optional<Modifier>, ObjectListT<I, E>> t;
+};
+
+template <typename I, typename E>
+struct AllocatorT {
+  using WrapperTrait = std::true_type;
+  E v;
+};
+
+template <typename I, typename E>
+struct AtomicDefaultMemOrderT {
+  using WrapperTrait = std::true_type;
+  using OmpAtomicDefaultMemOrderType =
+      Fortran::common::OmpAtomicDefaultMemOrderType;
+  OmpAtomicDefaultMemOrderType v;
+};
+
+template <typename I, typename E>
+struct CollapseT {
+  using WrapperTrait = std::true_type;
+  E v;
+};
+
+template <typename I, typename E>
+struct CopyinT {
+  using WrapperTrait = std::true_type;
+  ObjectListT<I, E> v;
+};
+
+template <typename I, typename E>
+struct CopyprivateT {
+  using WrapperTrait = std::true_type;
+  ObjectListT<I, E> v;
+};
+
+template <typename I, typename E>
+struct DefaultmapT {
+  using ImplicitBehavior =
+      Fortran::parser::OmpDefaultmapClause::ImplicitBehavior;
+  using VariableCategory =
+      Fortran::parser::OmpDefaultmapClause::VariableCategory;
+  using TupleTrait = std::true_type;
+  std::tuple<ImplicitBehavior, std::optional<VariableCategory>> t;
+};
+
+template <typename I, typename E>
+struct DefaultT {
+  using Type = Fortran::parser::OmpDefaultClause::Type;
+  using WrapperTrait = std::true_type;
+  Type v;
+};
+
+template <typename I, typename E>
+struct DependT {
+  struct Source {
+    using EmptyTrait = std::true_type;
+  };
+  struct Sink {
+    using Length = std::tuple<DefinedOperatorT<I, E>, E>;
+    using Vec = std::tuple<ObjectT<I, E>, std::optional<Length>>;
+    using WrapperTrait = std::true_type;
+    ListT<Vec> v;
+  };
+  using Type = Fortran::parser::OmpDependenceType::Type;
+  struct InOut {
+    using TupleTrait = std::true_type;
+    std::tuple<Type, ObjectListT<I, E>> t;
+  };
+  using UnionTrait = std::true_type;
+  std::variant<Source, Sink, InOut> u;
+};
+
+template <typename I, typename E>
+struct DeviceT {
+  using DeviceModifier = Fortran::parser::OmpDeviceClause::DeviceModifier;
+  using TupleTrait = std::true_type;
+  std::tuple<std::optional<DeviceModifier>, E> t;
+};
+
+template <typename I, typename E>
+struct DeviceTypeT {
+  using Type = Fortran::parser::OmpDeviceTypeClause::Type;
+  using WrapperTrait = std::true_type;
+  Type v;
+};
+
+template <typename I, typename E>
+struct DistScheduleT {
+  using WrapperTrait = std::true_type;
+  std::optional<E> v;
+};
+
+template <typename I, typename E>
+struct EnterT {
+  using WrapperTrait = std::true_type;
+  ObjectListT<I, E> v;
+};
+
+template <typename I, typename E>
+struct FilterT {
+  using WrapperTrait = std::true_type;
+  E v;
+};
+
+template <typename I, typename E>
+struct FinalT {
+  using WrapperTrait = std::true_type;
+  E v;
+};
+
+template <typename I, typename E>
+struct FirstprivateT {
+  using WrapperTrait = std::true_type;
+  ObjectListT<I, E> v;
+};
+
+template <typename I, typename E>
+struct FromT {
+  using WrapperTrait = std::true_type;
+  ObjectListT<I, E> v;
+};
+
+template <typename I, typename E>
+struct GrainsizeT {
+  using WrapperTrait = std::true_type;
+  E v;
+};
+
+template <typename I, typename E>
+struct HasDeviceAddrT {
+  using WrapperTrait = std::true_type;
+  ObjectListT<I, E> v;
+};
+
+template <typename I, typename E>
+struct HintT {
+  using WrapperTrait = std::true_type;
+  E v;
+};
+
+template <typename I, typename E>
+struct IfT {
+  using DirectiveNameModifier =
+      Fortran::parser::OmpIfClause::DirectiveNameModifier;
+  using TupleTrait = std::true_type;
+  std::tuple<std::optional<DirectiveNameModifier>, E> t;
+};
+
+template <typename I, typename E>
+struct InReductionT {
+  using TupleTrait = std::true_type;
+  std::tuple<ReductionOperatorT<I, E>, ObjectListT<I, E>> t;
+};
+
+template <typename I, typename E>
+struct IsDevicePtrT {
+  using WrapperTrait = std::true_type;
+  ObjectListT<I, E> v;
+};
+
+template <typename I, typename E>
+struct LastprivateT {
+  using WrapperTrait = std::true_type;
+  ObjectListT<I, E> v;
+};
+
+template <typename I, typename E>
+struct LinearT {
+  struct Modifier {
+    using Type = Fortran::parser::OmpLinearModifier::Type;
+    using WrapperTrait = std::true_type;
+    Type v;
+  };
+  using TupleTrait = std::true_type;
+  std::tuple<std::optional<Modifier>, ObjectListT<I, E>, std::optional<E>> t;
+};
+
+template <typename I, typename E>
+struct LinkT {
+  using WrapperTrait = std::true_type;
+  ObjectListT<I, E> v;
+};
+
+template <typename I, typename E>
+struct MapT {
+  struct MapType {
+    struct Always {
+      using EmptyTrait = std::true_type;
+    };
+    using Type = Fortran::parser::OmpMapType::Type;
+    using TupleTrait = std::true_type;
+    std::tuple<std::optional<Always>, Type> t;
+  };
+  using TupleTrait = std::true_type;
+  std::tuple<std::optional<MapType>, ObjectListT<I, E>> t;
+};
+
+template <typename I, typename E>
+struct NocontextT {
+  using WrapperTrait = std::true_type;
+  E v;
+};
+
+template <typename I, typename E>
+struct NontemporalT {
+  using WrapperTrait = std::true_type;
+  ObjectListT<I, E> v;
+};
+
+template <typename I, typename E>
+struct NovariantsT {
+  using WrapperTrait = std::true_type;
+  E v;
+};
+
+template <typename I, typename E>
+struct NumTasksT {
+  using WrapperTrait = std::true_type;
+  E v;
+};
+
+template <typename I, typename E>
+struct NumTeamsT {
+  using WrapperTrait = std::true_type;
+  E v;
+};
+
+template <typename I, typename E>
+struct NumThreadsT {
+  using WrapperTrait = std::true_type;
+  E v;
+};
+
+template <typename I, typename E>
+struct OmpxDynCgroupMemT {
+  using WrapperTrait = std::true_type;
+  E v;
+};
+
+template <typename I, typename E>
+struct OrderedT {
+  using WrapperTrait = std::true_type;
+  std::optional<E> v;
+};
+
+template <typename I, typename E>
+struct OrderT {
+  using Kind = Fortran::parser::OmpOrderModifier::Kind;
+  using Type = Fortran::parser::OmpOrderClause::Type;
+  using TupleTrait = std::true_type;
+  std::tuple<std::optional<Kind>, Type> t;
+};
+
+template <typename I, typename E>
+struct PartialT {
+  using WrapperTrait = std::true_type;
+  std::optional<E> v;
+};
+
+template <typename I, typename E>
+struct PriorityT {
+  using WrapperTrait = std::true_type;
+  E v;
+};
+
+template <typename I, typename E>
+struct PrivateT {
+  using WrapperTrait = std::true_type;
+  ObjectListT<I, E> v;
+};
+
+template <typename I, typename E>
+struct ProcBindT {
+  using Type = Fortran::parser::OmpProcBindClause::Type;
+  using WrapperTrait = std::true_type;
+  Type v;
+};
+
+template <typename I, typename E>
+struct ReductionT {
+  using TupleTrait = std::true_type;
+  std::tuple<ReductionOperatorT<I, E>, ObjectListT<I, E>> t;
+};
+
+template <typename I, typename E>
+struct SafelenT {
+  using WrapperTrait = std::true_type;
+  E v;
+};
+
+template <typename I, typename E>
+struct ScheduleT {
+  using ModType = Fortran::parser::OmpScheduleModifierType::ModType;
+  struct ScheduleModifier {
+    using TupleTrait = std::true_type;
+    std::tuple<ModType, std::optional<ModType>> t;
+  };
+  using ScheduleType = Fortran::parser::OmpScheduleClause::ScheduleType;
+  using TupleTrait = std::true_type;
+  std::tuple<std::optional<ScheduleModifier>, ScheduleType, std::optional<E>> t;
+};
+
+template <typename I, typename E>
+struct SharedT {
+  using WrapperTrait = std::true_type;
+  ObjectListT<I, E> v;
+};
+
+template <typename I, typename E>
+struct SimdlenT {
+  using WrapperTrait = std::true_type;
+  E v;
+};
+
+template <typename I, typename E>
+struct SizesT {
+  using WrapperTrait = std::true_type;
+  ListT<E> v;
+};
+
+template <typename I, typename E>
+struct TaskReductionT {
+  using TupleTrait = std::true_type;
+  std::tuple<ReductionOperatorT<I, E>, ObjectListT<I, E>> t;
+};
+
+template <typename I, typename E>
+struct ThreadLimitT {
+  using WrapperTrait = std::true_type;
+  E v;
+};
+
+template <typename I, typename E>
+struct ToT {
+  using WrapperTrait = std::true_type;
+  ObjectListT<I, E> v;
+};
+
+template <typename I, typename E>
+struct UniformT {
+  using WrapperTrait = std::true_type;
+  ObjectListT<I, E> v;
+};
+
+template <typename I, typename E>
+struct UseDeviceAddrT {
+  using WrapperTrait = std::true_type;
+  ObjectListT<I, E> v;
+};
+
+template <typename I, typename E>
+struct UseDevicePtrT {
+  using WrapperTrait = std::true_type;
+  ObjectListT<I, E> v;
+};
+
+template <typename I, typename E>
+using UnionOfAllClausesT = std::variant<
+    AcqRelT<I, E>, AcquireT<I, E>, AdjustArgsT<I, E>, AffinityT<I, E>,
+    AlignT<I, E>, AlignedT<I, E>, AllocateT<I, E>, AllocatorT<I, E>,
+    AppendArgsT<I, E>, AtT<I, E>, AtomicDefaultMemOrderT<I, E>, BindT<I, E>,
+    CancellationConstructTypeT<I, E>, CaptureT<I, E>, CollapseT<I, E>,
+    CompareT<I, E>, CopyprivateT<I, E>, CopyinT<I, E>, DefaultT<I, E>,
+    DefaultmapT<I, E>, DependT<I, E>, DepobjT<I, E>, DestroyT<I, E>,
+    DetachT<I, E>, DeviceT<I, E>, DeviceTypeT<I, E>, DistScheduleT<I, E>,
+    DoacrossT<I, E>, DynamicAllocatorsT<I, E>, EnterT<I, E>, ExclusiveT<I, E>,
+    FailT<I, E>, FilterT<I, E>, FinalT<I, E>, FirstprivateT<I, E>, FlushT<I, E>,
+    FromT<I, E>, FullT<I, E>, GrainsizeT<I, E>, HasDeviceAddrT<I, E>,
+    HintT<I, E>, IfT<I, E>, InReductionT<I, E>, InbranchT<I, E>,
+    InclusiveT<I, E>, IndirectT<I, E>, InitT<I, E>, IsDevicePtrT<I, E>,
+    LastprivateT<I, E>, LinearT<I, E>, LinkT<I, E>, MapT<I, E>, MatchT<I, E>,
+    MemoryOrderT<I, E>, MergeableT<I, E>, MessageT<I, E>, NogroupT<I, E>,
+    NowaitT<I, E>, NocontextT<I, E>, NontemporalT<I, E>, NotinbranchT<I, E>,
+    NovariantsT<I, E>, NumTasksT<I, E>, NumTeamsT<I, E>, NumThreadsT<I, E>,
+    OmpxAttributeT<I, E>, OmpxDynCgroupMemT<I, E>, OmpxBareT<I, E>,
+    OrderT<I, E>, OrderedT<I, E>, PartialT<I, E>, PriorityT<I, E>,
+    PrivateT<I, E>, ProcBindT<I, E>, ReadT<I, E>, ReductionT<I, E>,
+    RelaxedT<I, E>, ReleaseT<I, E>, ReverseOffloadT<I, E>, SafelenT<I, E>,
+    ScheduleT<I, E>, SeqCstT<I, E>, SeverityT<I, E>, SharedT<I, E>, SimdT<I, E>,
+    SimdlenT<I, E>, SizesT<I, E>, TaskReductionT<I, E>, ThreadLimitT<I, E>,
+    ThreadprivateT<I, E>, ThreadsT<I, E>, ToT<I, E>, UnifiedAddressT<I, E>,
+    UnifiedSharedMemoryT<I, E>, UniformT<I, E>, UnknownT<I, E>, UntiedT<I, E>,
+    UpdateT<I, E>, UseT<I, E>, UseDeviceAddrT<I, E>, UseDevicePtrT<I, E>,
+    UsesAllocatorsT<I, E>, WeakT<I, E>, WhenT<I, E>, WriteT<I, E>>;
+} // namespace clause
+
+template <typename Id, typename Expr>
+struct ClauseT {
+  llvm::omp::Clause id; // The numeric id of the clause
+  using UnionTrait = std::true_type;
+  clause::UnionOfAllClausesT<Id, Expr> u;
+};
+
+} // namespace tomp
+
+#endif // FORTRAN_LOWER_OPENMP_CLAUSET_H
diff --git a/flang/lib/Lower/OpenMP/Clauses.cpp b/flang/lib/Lower/OpenMP/Clauses.cpp
new file mode 100644
index 00000000000000..70f232a4858e1b
--- /dev/null
+++ b/flang/lib/Lower/OpenMP/Clauses.cpp
@@ -0,0 +1,728 @@
+//===-- Clauses.cpp -- OpenMP clause handling -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "Clauses.h"
+
+#include "flang/Common/idioms.h"
+#include "flang/Evaluate/expression.h"
+#include "flang/Parser/parse-tree.h"
+#include "flang/Semantics/expression.h"
+#include "flang/Semantics/symbol.h"
+
+#include "llvm/Frontend/OpenMP/OMPConstants.h"
+
+#include <list>
+#include <optional>
+#include <tuple>
+#include <utility>
+#include <variant>
+
+namespace detail {
+template <typename C>
+llvm::omp::Clause getClauseIdForClass(C &&) {
+  using namespace Fortran;
+  using A = llvm::remove_cvref_t<C>; // A is referenced in OMP.inc
+  // The code included below contains a sequence of checks like the following
+  // for each OpenMP clause
+  //   if constexpr (std::is_same_v<A, parser::OmpClause::AcqRel>)
+  //     return llvm::omp::Clause::OMPC_acq_rel;
+  //   [...]
+#define GEN_FLANG_CLAUSE_PARSER_KIND_MAP
+#include "llvm/Frontend/OpenMP/OMP.inc"
+}
+} // namespace detail
+
+static llvm::omp::Clause getClauseId(const Fortran::parser::OmpClause &clause) {
+  return std::visit([](auto &&s) { return detail::getClauseIdForClass(s); },
+                    clause.u);
+}
+
+namespace Fortran::lower::omp {
+using SymbolWithDesignator = std::tuple<semantics::Symbol *, MaybeExpr>;
+
+struct SymbolAndDesignatorExtractor {
+  template <typename T>
+  static T &&AsRvalueRef(T &&t) {
+    return std::move(t);
+  }
+  template <typename T>
+  static T AsRvalueRef(const T &t) {
+    return t;
+  }
+
+  static semantics::Symbol *symbol_addr(const evaluate::SymbolRef &ref) {
+    // Symbols cannot be created after semantic checks, so all symbol
+    // pointers that are non-null must point to one of those pre-existing
+    // objects. Throughout the code, symbols are often pointed to by
+    // non-const pointers, so there is no harm in casting the constness
+    // away.
+    return const_cast<semantics::Symbol *>(&ref.get());
+  }
+
+  template <typename T>
+  static SymbolWithDesignator visit(T &&) {
+    // Use this to see missing overloads:
+    // llvm::errs() << "NULL: " << __PRETTY_FUNCTION__ << '\n';
+    return SymbolWithDesignator{};
+  }
+
+  template <typename T>
+  static SymbolWithDesignator visit(const evaluate::Designator<T> &e) {
+    return std::make_tuple(symbol_addr(*e.GetLastSymbol()),
+                           evaluate::AsGenericExpr(AsRvalueRef(e)));
+  }
+
+  static SymbolWithDesignator visit(const evaluate::ProcedureDesignator &e) {
+    return std::make_tuple(symbol_addr(*e.GetSymbol()), std::nullopt);
+  }
+
+  template <typename T>
+  static SymbolWithDesignator visit(const evaluate::Expr<T> &e) {
+    return std::visit([](auto &&s) { return visit(s); }, e.u);
+  }
+
+  static void verify(const SymbolWithDesignator &sd) {
+    const semantics::Symbol *symbol = std::get<0>(sd);
+    assert(symbol && "Expecting symbol");
+    auto &maybeDsg = std::get<1>(sd);
+    if (!maybeDsg)
+      return; // Symbol with no designator -> OK
+    std::optional<evaluate::DataRef> maybeRef =
+        evaluate::ExtractDataRef(*maybeDsg);
+    if (maybeRef) {
+      if (&maybeRef->GetLastSymbol() == symbol)
+        return; // Symbol with a designator for it -> OK
+      llvm_unreachable("Expecting designator for given symbol");
+    } else {
+      // This could still be a Substring or ComplexPart, but at least Substring
+      // is not allowed in OpenMP.
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+      maybeDsg->dump();
+#endif
+      llvm_unreachable("Expecting DataRef designator");
+    }
+  }
+};
+
+SymbolWithDesignator getSymbolAndDesignator(const MaybeExpr &expr) {
+  if (!expr)
+    return SymbolWithDesignator{};
+  return std::visit(
+      [](auto &&s) { return SymbolAndDesignatorExtractor::visit(s); }, expr->u);
+}
+
+Object makeObject(const parser::Name &name,
+                  semantics::SemanticsContext &semaCtx) {
+  assert(name.symbol && "Expecting Symbol");
+  return Object{name.symbol, std::nullopt};
+}
+
+Object makeObject(const parser::Designator &dsg,
+                  semantics::SemanticsContext &semaCtx) {
+  evaluate::ExpressionAnalyzer ea{semaCtx};
+  SymbolWithDesignator sd = getSymbolAndDesignator(ea.Analyze(dsg));
+  SymbolAndDesignatorExtractor::verify(sd);
+  return Object{std::get<0>(sd), std::move(std::get<1>(sd))};
+}
+
+Object makeObject(const parser::StructureComponent &comp,
+                  semantics::SemanticsContext &semaCtx) {
+  evaluate::ExpressionAnalyzer ea{semaCtx};
+  SymbolWithDesignator sd = getSymbolAndDesignator(ea.Analyze(comp));
+  SymbolAndDesignatorExtractor::verify(sd);
+  return Object{std::get<0>(sd), std::move(std::get<1>(sd))};
+}
+
+Object makeObject(const parser::OmpObject &object,
+                  semantics::SemanticsContext &semaCtx) {
+  // If object is a common block, expression analyzer won't be able to
+  // do anything.
+  if (const auto *name = std::get_if<parser::Name>(&object.u)) {
+    assert(name->symbol && "Expecting Symbol");
+    return Object{name->symbol, std::nullopt};
+  }
+  // OmpObject is std::variant<Designator, /*common block*/ Name>;
+  return makeObject(std::get<parser::Designator>(object.u), semaCtx);
+}
+
+std::optional<Object>
+getBaseObject(const Object &object,
+              Fortran::semantics::SemanticsContext &semaCtx) {
+  // If it's just the symbol, then there is no base.
+  if (!object.id())
+    return std::nullopt;
+
+  auto maybeRef = evaluate::ExtractDataRef(*object.ref());
+  if (!maybeRef)
+    return std::nullopt;
+
+  evaluate::DataRef ref = *maybeRef;
+
+  if (std::get_if<evaluate::SymbolRef>(&ref.u)) {
+    return std::nullopt;
+  } else if (auto *comp = std::get_if<evaluate::Component>(&ref.u)) {
+    const evaluate::DataRef &base = comp->base();
+    return Object{
+        SymbolAndDesignatorExtractor::symbol_addr(base.GetLastSymbol()),
+        evaluate::AsGenericExpr(
+            SymbolAndDesignatorExtractor::AsRvalueRef(base))};
+  } else if (auto *arr = std::get_if<evaluate::ArrayRef>(&ref.u)) {
+    const evaluate::NamedEntity &base = arr->base();
+    evaluate::ExpressionAnalyzer ea{semaCtx};
+    if (auto *comp = base.UnwrapComponent()) {
+      return Object{SymbolAndDesignatorExtractor::symbol_addr(comp->symbol()),
+                    ea.Designate(evaluate::DataRef{
+                        SymbolAndDesignatorExtractor::AsRvalueRef(*comp)})};
+    } else if (base.UnwrapSymbolRef()) {
+      return std::nullopt;
+    }
+  } else {
+    assert(std::holds_alternative<evaluate::CoarrayRef>(ref.u) &&
+           "Unexpected variant alternative");
+    llvm_unreachable("Coarray reference not supported at the moment");
+  }
+  return std::nullopt;
+}
+
+namespace clause {
+// Helper objects
+#ifdef EMPTY_CLASS
+#undef EMPTY_CLASS
+#endif
+#define EMPTY_CLASS(cls)                                                       \
+  cls make(const parser::OmpClause::cls &, semantics::SemanticsContext &) {    \
+    return cls{};                                                              \
+  }                                                                            \
+  [[maybe_unused]] extern int xyzzy_semicolon_absorber
+
+#ifdef WRAPPER_CLASS
+#undef WRAPPER_CLASS
+#endif
+#define WRAPPER_CLASS(cls, content)                                            \
+  [[maybe_unused]] extern int xyzzy_semicolon_absorber
+#define GEN_FLANG_CLAUSE_PARSER_CLASSES
+#include "llvm/Frontend/OpenMP/OMP.inc"
+#undef EMPTY_CLASS
+#undef WRAPPER_CLASS
+
+DefinedOperator makeDefinedOperator(const parser::DefinedOperator &inp,
+                                    semantics::SemanticsContext &semaCtx) {
+  return std::visit(
+      common::visitors{
+          [&](const parser::DefinedOpName &s) {
+            return DefinedOperator{
+                DefinedOperator::DefinedOpName{makeObject(s.v, semaCtx)}};
+          },
+          [&](const parser::DefinedOperator::IntrinsicOperator &s) {
+            return DefinedOperator{s};
+          },
+      },
+      inp.u);
+}
+
+ProcedureDesignator
+makeProcedureDesignator(const parser::ProcedureDesignator &inp,
+                        semantics::SemanticsContext &semaCtx) {
+  return ProcedureDesignator{std::visit(
+      common::visitors{
+          [&](const parser::Name &t) { return makeObject(t, semaCtx); },
+          [&](const parser::ProcComponentRef &t) {
+            return makeObject(t.v.thing, semaCtx);
+          },
+      },
+      inp.u)};
+}
+
+ReductionOperator makeReductionOperator(const parser::OmpReductionOperator &inp,
+                                        semantics::SemanticsContext &semaCtx) {
+  return std::visit(
+      common::visitors{
+          [&](const parser::DefinedOperator &s) {
+            return ReductionOperator{makeDefinedOperator(s, semaCtx)};
+          },
+          [&](const parser::ProcedureDesignator &s) {
+            return ReductionOperator{makeProcedureDesignator(s, semaCtx)};
+          },
+      },
+      inp.u);
+}
+
+// Actual clauses. Each T (where OmpClause::T exists) has its "make".
+Aligned make(const parser::OmpClause::Aligned &inp,
+             semantics::SemanticsContext &semaCtx) {
+  // inp.v -> parser::OmpAlignedClause
+  auto &t0 = std::get<parser::OmpObjectList>(inp.v.t);
+  auto &t1 = std::get<std::optional<parser::ScalarIntConstantExpr>>(inp.v.t);
+
+  return Aligned{{
+      makeList(t0, semaCtx),
+      maybeApply(makeExprFn(semaCtx), t1),
+  }};
+}
+
+Allocate make(const parser::OmpClause::Allocate &inp,
+              semantics::SemanticsContext &semaCtx) {
+  // inp.v -> parser::OmpAllocateClause
+  using wrapped = parser::OmpAllocateClause;
+  auto &t0 = std::get<std::optional<wrapped::AllocateModifier>>(inp.v.t);
+  auto &t1 = std::get<parser::OmpObjectList>(inp.v.t);
+
+  auto convert = [&](auto &&s) -> Allocate::Modifier {
+    using Modifier = Allocate::Modifier;
+    using Allocator = Modifier::Allocator;
+    using Align = Modifier::Align;
+    using ComplexModifier = Modifier::ComplexModifier;
+
+    return std::visit(
+        common::visitors{
+            [&](const wrapped::AllocateModifier::Allocator &v) {
+              return Modifier{Allocator{makeExpr(v.v, semaCtx)}};
+            },
+            [&](const wrapped::AllocateModifier::ComplexModifier &v) {
+              auto &s0 = std::get<wrapped::AllocateModifier::Allocator>(v.t);
+              auto &s1 = std::get<wrapped::AllocateModifier::Align>(v.t);
+              return Modifier{ComplexModifier{{
+                  Allocator{makeExpr(s0.v, semaCtx)},
+                  Align{makeExpr(s1.v, semaCtx)},
+              }}};
+            },
+            [&](const wrapped::AllocateModifier::Align &v) {
+              return Modifier{Align{makeExpr(v.v, semaCtx)}};
+            },
+        },
+        s.u);
+  };
+
+  return Allocate{{maybeApply(convert, t0), makeList(t1, semaCtx)}};
+}
+
+Allocator make(const parser::OmpClause::Allocator &inp,
+               semantics::SemanticsContext &semaCtx) {
+  // inp.v -> parser::ScalarIntExpr
+  return Allocator{makeExpr(inp.v, semaCtx)};
+}
+
+// Never called, but needed for using "make" as a Clause visitor.
+// See comment about "requires" clauses in Clauses.h.
+AtomicDefaultMemOrder make(const parser::OmpClause::AtomicDefaultMemOrder &inp,
+                           semantics::SemanticsContext &semaCtx) {
+  // inp.v -> parser::OmpAtomicDefaultMemOrderClause
+  return AtomicDefaultMemOrder{inp.v.v};
+}
+
+Collapse make(const parser::OmpClause::Collapse &inp,
+              semantics::SemanticsContext &semaCtx) {
+  // inp.v -> parser::ScalarIntConstantExpr
+  return Collapse{makeExpr(inp.v, semaCtx)};
+}
+
+Copyin make(const parser::OmpClause::Copyin &inp,
+            semantics::SemanticsContext &semaCtx) {
+  // inp.v -> parser::OmpObjectList
+  return Copyin{makeList(inp.v, semaCtx)};
+}
+
+Copyprivate make(const parser::OmpClause::Copyprivate &inp,
+                 semantics::SemanticsContext &semaCtx) {
+  // inp.v -> parser::OmpObjectList
+  return Copyprivate{makeList(inp.v, semaCtx)};
+}
+
+Defaultmap make(const parser::OmpClause::Defaultmap &inp,
+                semantics::SemanticsContext &semaCtx) {
+  // inp.v -> parser::OmpDefaultmapClause
+  using wrapped = parser::OmpDefaultmapClause;
+
+  auto &t0 = std::get<wrapped::ImplicitBehavior>(inp.v.t);
+  auto &t1 = std::get<std::optional<wrapped::VariableCategory>>(inp.v.t);
+  return Defaultmap{{t0, t1}};
+}
+
+Default make(const parser::OmpClause::Default &inp,
+             semantics::SemanticsContext &semaCtx) {
+  // inp.v -> parser::OmpDefaultClause
+  return Default{inp.v.v};
+}
+
+Depend make(const parser::OmpClause::Depend &inp,
+            semantics::SemanticsContext &semaCtx) {
+  // inp.v -> parser::OmpDependClause
+  using wrapped = parser::OmpDependClause;
+
+  return std::visit(
+      common::visitors{
+          [&](const wrapped::Source &s) { return Depend{Depend::Source{}}; },
+          [&](const wrapped::Sink &s) {
+            auto convert = [&](const parser::OmpDependSinkVec &v) {
+              auto &t0 = std::get<parser::Name>(v.t);
+              auto &t1 =
+                  std::get<std::optional<parser::OmpDependSinkVecLength>>(v.t);
+              auto convert1 = [&](const parser::OmpDependSinkVecLength &u) {
+                auto &s0 = std::get<parser::DefinedOperator>(u.t);
+                auto &s1 = std::get<parser::ScalarIntConstantExpr>(u.t);
+                return Depend::Sink::Length{makeDefinedOperator(s0, semaCtx),
+                                            makeExpr(s1, semaCtx)};
+              };
+              return Depend::Sink::Vec{makeObject(t0, semaCtx),
+                                       maybeApply(convert1, t1)};
+            };
+            return Depend{Depend::Sink{makeList(s.v, convert)}};
+          },
+          [&](const wrapped::InOut &s) {
+            auto &t0 = std::get<parser::OmpDependenceType>(s.t);
+            auto &t1 = std::get<std::list<parser::Designator>>(s.t);
+            auto convert = [&](const parser::Designator &t) {
+              return makeObject(t, semaCtx);
+            };
+            return Depend{Depend::InOut{{t0.v, makeList(t1, convert)}}};
+          },
+      },
+      inp.v.u);
+}
+
+Device make(const parser::OmpClause::Device &inp,
+            semantics::SemanticsContext &semaCtx) {
+  // inp.v -> parser::OmpDeviceClause
+  using wrapped = parser::OmpDeviceClause;
+
+  auto &t0 = std::get<std::optional<wrapped::DeviceModifier>>(inp.v.t);
+  auto &t1 = std::get<parser::ScalarIntExpr>(inp.v.t);
+  return Device{{t0, makeExpr(t1, semaCtx)}};
+}
+
+DeviceType make(const parser::OmpClause::DeviceType &inp,
+                semantics::SemanticsContext &semaCtx) {
+  // inp.v -> parser::OmpDeviceTypeClause
+  return DeviceType{inp.v.v};
+}
+
+DistSchedule make(const parser::OmpClause::DistSchedule &inp,
+                  semantics::SemanticsContext &semaCtx) {
+  // inp.v -> std::optional<parser::ScalarIntExpr>
+  return DistSchedule{maybeApply(makeExprFn(semaCtx), inp.v)};
+}
+
+Enter make(const parser::OmpClause::Enter &inp,
+           semantics::SemanticsContext &semaCtx) {
+  // inp.v -> parser::OmpObjectList
+  return Enter{makeList(inp.v, semaCtx)};
+}
+
+Filter make(const parser::OmpClause::Filter &inp,
+            semantics::SemanticsContext &semaCtx) {
+  // inp.v -> parser::ScalarIntExpr
+  return Filter{makeExpr(inp.v, semaCtx)};
+}
+
+Final make(const parser::OmpClause::Final &inp,
+           semantics::SemanticsContext &semaCtx) {
+  // inp.v -> parser::ScalarLogicalExpr
+  return Final{makeExpr(inp.v, semaCtx)};
+}
+
+Firstprivate make(const parser::OmpClause::Firstprivate &inp,
+                  semantics::SemanticsContext &semaCtx) {
+  // inp.v -> parser::OmpObjectList
+  return Firstprivate{makeList(inp.v, semaCtx)};
+}
+
+From make(const parser::OmpClause::From &inp,
+          semantics::SemanticsContext &semaCtx) {
+  // inp.v -> parser::OmpObjectList
+  return From{makeList(inp.v, semaCtx)};
+}
+
+Grainsize make(const parser::OmpClause::Grainsize &inp,
+               semantics::SemanticsContext &semaCtx) {
+  // inp.v -> parser::ScalarIntExpr
+  return Grainsize{makeExpr(inp.v, semaCtx)};
+}
+
+HasDeviceAddr make(const parser::OmpClause::HasDeviceAddr &inp,
+                   semantics::SemanticsContext &semaCtx) {
+  // inp.v -> parser::OmpObjectList
+  return HasDeviceAddr{makeList(inp.v, semaCtx)};
+}
+
+Hint make(const parser::OmpClause::Hint &inp,
+          semantics::SemanticsContext &semaCtx) {
+  // inp.v -> parser::ConstantExpr
+  return Hint{makeExpr(inp.v, semaCtx)};
+}
+
+If make(const parser::OmpClause::If &inp,
+        semantics::SemanticsContext &semaCtx) {
+  // inp.v -> parser::OmpIfClause
+  using wrapped = parser::OmpIfClause;
+
+  auto &t0 = std::get<std::optional<wrapped::DirectiveNameModifier>>(inp.v.t);
+  auto &t1 = std::get<parser::ScalarLogicalExpr>(inp.v.t);
+  return If{{t0, makeExpr(t1, semaCtx)}};
+}
+
+InReduction make(const parser::OmpClause::InReduction &inp,
+                 semantics::SemanticsContext &semaCtx) {
+  // inp.v -> parser::OmpInReductionClause
+  auto &t0 = std::get<parser::OmpReductionOperator>(inp.v.t);
+  auto &t1 = std::get<parser::OmpObjectList>(inp.v.t);
+  return InReduction{
+      {makeReductionOperator(t0, semaCtx), makeList(t1, semaCtx)}};
+}
+
+IsDevicePtr make(const parser::OmpClause::IsDevicePtr &inp,
+                 semantics::SemanticsContext &semaCtx) {
+  // inp.v -> parser::OmpObjectList
+  return IsDevicePtr{makeList(inp.v, semaCtx)};
+}
+
+Lastprivate make(const parser::OmpClause::Lastprivate &inp,
+                 semantics::SemanticsContext &semaCtx) {
+  // inp.v -> parser::OmpObjectList
+  return Lastprivate{makeList(inp.v, semaCtx)};
+}
+
+Linear make(const parser::OmpClause::Linear &inp,
+            semantics::SemanticsContext &semaCtx) {
+  // inp.v -> parser::OmpLinearClause
+  using wrapped = parser::OmpLinearClause;
+
+  return std::visit(
+      common::visitors{
+          [&](const wrapped::WithModifier &s) {
+            return Linear{{Linear::Modifier{s.modifier.v},
+                           makeList(s.names, makeObjectFn(semaCtx)),
+                           maybeApply(makeExprFn(semaCtx), s.step)}};
+          },
+          [&](const wrapped::WithoutModifier &s) {
+            return Linear{{std::nullopt,
+                           makeList(s.names, makeObjectFn(semaCtx)),
+                           maybeApply(makeExprFn(semaCtx), s.step)}};
+          },
+      },
+      inp.v.u);
+}
+
+Link make(const parser::OmpClause::Link &inp,
+          semantics::SemanticsContext &semaCtx) {
+  // inp.v -> parser::OmpObjectList
+  return Link{makeList(inp.v, semaCtx)};
+}
+
+Map make(const parser::OmpClause::Map &inp,
+         semantics::SemanticsContext &semaCtx) {
+  // inp.v -> parser::OmpMapClause
+  auto &t0 = std::get<std::optional<parser::OmpMapType>>(inp.v.t);
+  auto &t1 = std::get<parser::OmpObjectList>(inp.v.t);
+  auto convert = [](const parser::OmpMapType &s) {
+    auto &s0 = std::get<std::optional<parser::OmpMapType::Always>>(s.t);
+    auto &s1 = std::get<parser::OmpMapType::Type>(s.t);
+    auto convertT = [](parser::OmpMapType::Always) {
+      return Map::MapType::Always{};
+    };
+    return Map::MapType{{maybeApply(convertT, s0), s1}};
+  };
+  return Map{{maybeApply(convert, t0), makeList(t1, semaCtx)}};
+}
+
+Nocontext make(const parser::OmpClause::Nocontext &inp,
+               semantics::SemanticsContext &semaCtx) {
+  // inp.v -> parser::ScalarLogicalExpr
+  return Nocontext{makeExpr(inp.v, semaCtx)};
+}
+
+Nontemporal make(const parser::OmpClause::Nontemporal &inp,
+                 semantics::SemanticsContext &semaCtx) {
+  // inp.v -> std::list<parser::Name>
+  return Nontemporal{makeList(inp.v, makeObjectFn(semaCtx))};
+}
+
+Novariants make(const parser::OmpClause::Novariants &inp,
+                semantics::SemanticsContext &semaCtx) {
+  // inp.v -> parser::ScalarLogicalExpr
+  return Novariants{makeExpr(inp.v, semaCtx)};
+}
+
+NumTasks make(const parser::OmpClause::NumTasks &inp,
+              semantics::SemanticsContext &semaCtx) {
+  // inp.v -> parser::ScalarIntExpr
+  return NumTasks{makeExpr(inp.v, semaCtx)};
+}
+
+NumTeams make(const parser::OmpClause::NumTeams &inp,
+              semantics::SemanticsContext &semaCtx) {
+  // inp.v -> parser::ScalarIntExpr
+  return NumTeams{makeExpr(inp.v, semaCtx)};
+}
+
+NumThreads make(const parser::OmpClause::NumThreads &inp,
+                semantics::SemanticsContext &semaCtx) {
+  // inp.v -> parser::ScalarIntExpr
+  return NumThreads{makeExpr(inp.v, semaCtx)};
+}
+
+OmpxDynCgroupMem make(const parser::OmpClause::OmpxDynCgroupMem &inp,
+                      semantics::SemanticsContext &semaCtx) {
+  // inp.v -> parser::ScalarIntExpr
+  return OmpxDynCgroupMem{makeExpr(inp.v, semaCtx)};
+}
+
+Ordered make(const parser::OmpClause::Ordered &inp,
+             semantics::SemanticsContext &semaCtx) {
+  // inp.v -> std::optional<parser::ScalarIntConstantExpr>
+  return Ordered{maybeApply(makeExprFn(semaCtx), inp.v)};
+}
+
+Order make(const parser::OmpClause::Order &inp,
+           semantics::SemanticsContext &semaCtx) {
+  // inp.v -> parser::OmpOrderClause
+  using wrapped = parser::OmpOrderClause;
+  auto &t0 = std::get<std::optional<parser::OmpOrderModifier>>(inp.v.t);
+  auto &t1 = std::get<wrapped::Type>(inp.v.t);
+  auto convert = [](const parser::OmpOrderModifier &s) -> Order::Kind {
+    return std::get<parser::OmpOrderModifier::Kind>(s.u);
+  };
+  return Order{{maybeApply(convert, t0), t1}};
+}
+
+Partial make(const parser::OmpClause::Partial &inp,
+             semantics::SemanticsContext &semaCtx) {
+  // inp.v -> std::optional<parser::ScalarIntConstantExpr>
+  return Partial{maybeApply(makeExprFn(semaCtx), inp.v)};
+}
+
+Priority make(const parser::OmpClause::Priority &inp,
+              semantics::SemanticsContext &semaCtx) {
+  // inp.v -> parser::ScalarIntExpr
+  return Priority{makeExpr(inp.v, semaCtx)};
+}
+
+Private make(const parser::OmpClause::Private &inp,
+             semantics::SemanticsContext &semaCtx) {
+  // inp.v -> parser::OmpObjectList
+  return Private{makeList(inp.v, semaCtx)};
+}
+
+ProcBind make(const parser::OmpClause::ProcBind &inp,
+              semantics::SemanticsContext &semaCtx) {
+  // inp.v -> parser::OmpProcBindClause
+  return ProcBind{inp.v.v};
+}
+
+Reduction make(const parser::OmpClause::Reduction &inp,
+               semantics::SemanticsContext &semaCtx) {
+  // inp.v -> parser::OmpReductionClause
+  auto &t0 = std::get<parser::OmpReductionOperator>(inp.v.t);
+  auto &t1 = std::get<parser::OmpObjectList>(inp.v.t);
+  return Reduction{{makeReductionOperator(t0, semaCtx), makeList(t1, semaCtx)}};
+}
+
+Safelen make(const parser::OmpClause::Safelen &inp,
+             semantics::SemanticsContext &semaCtx) {
+  // inp.v -> parser::ScalarIntConstantExpr
+  return Safelen{makeExpr(inp.v, semaCtx)};
+}
+
+Schedule make(const parser::OmpClause::Schedule &inp,
+              semantics::SemanticsContext &semaCtx) {
+  // inp.v -> parser::OmpScheduleClause
+  using wrapped = parser::OmpScheduleClause;
+
+  auto &t0 = std::get<std::optional<parser::OmpScheduleModifier>>(inp.v.t);
+  auto &t1 = std::get<wrapped::ScheduleType>(inp.v.t);
+  auto &t2 = std::get<std::optional<parser::ScalarIntExpr>>(inp.v.t);
+
+  auto convert = [](auto &&s) -> Schedule::ScheduleModifier {
+    auto &s0 = std::get<parser::OmpScheduleModifier::Modifier1>(s.t);
+    auto &s1 =
+        std::get<std::optional<parser::OmpScheduleModifier::Modifier2>>(s.t);
+
+    auto convert1 = [](auto &&v) { // Modifier1 or Modifier2
+      return v.v.v;
+    };
+    return Schedule::ScheduleModifier{{s0.v.v, maybeApply(convert1, s1)}};
+  };
+
+  return Schedule{
+      {maybeApply(convert, t0), t1, maybeApply(makeExprFn(semaCtx), t2)}};
+}
+
+Shared make(const parser::OmpClause::Shared &inp,
+            semantics::SemanticsContext &semaCtx) {
+  // inp.v -> parser::OmpObjectList
+  return Shared{makeList(inp.v, semaCtx)};
+}
+
+Simdlen make(const parser::OmpClause::Simdlen &inp,
+             semantics::SemanticsContext &semaCtx) {
+  // inp.v -> parser::ScalarIntConstantExpr
+  return Simdlen{makeExpr(inp.v, semaCtx)};
+}
+
+Sizes make(const parser::OmpClause::Sizes &inp,
+           semantics::SemanticsContext &semaCtx) {
+  // inp.v -> std::list<parser::ScalarIntExpr>
+  return Sizes{makeList(inp.v, makeExprFn(semaCtx))};
+}
+
+TaskReduction make(const parser::OmpClause::TaskReduction &inp,
+                   semantics::SemanticsContext &semaCtx) {
+  // inp.v -> parser::OmpReductionClause
+  auto &t0 = std::get<parser::OmpReductionOperator>(inp.v.t);
+  auto &t1 = std::get<parser::OmpObjectList>(inp.v.t);
+  return TaskReduction{
+      {makeReductionOperator(t0, semaCtx), makeList(t1, semaCtx)}};
+}
+
+ThreadLimit make(const parser::OmpClause::ThreadLimit &inp,
+                 semantics::SemanticsContext &semaCtx) {
+  // inp.v -> parser::ScalarIntExpr
+  return ThreadLimit{makeExpr(inp.v, semaCtx)};
+}
+
+To make(const parser::OmpClause::To &inp,
+        semantics::SemanticsContext &semaCtx) {
+  // inp.v -> parser::OmpObjectList
+  return To{makeList(inp.v, semaCtx)};
+}
+
+Uniform make(const parser::OmpClause::Uniform &inp,
+             semantics::SemanticsContext &semaCtx) {
+  // inp.v -> std::list<parser::Name>
+  return Uniform{makeList(inp.v, makeObjectFn(semaCtx))};
+}
+
+UseDeviceAddr make(const parser::OmpClause::UseDeviceAddr &inp,
+                   semantics::SemanticsContext &semaCtx) {
+  // inp.v -> parser::OmpObjectList
+  return UseDeviceAddr{makeList(inp.v, semaCtx)};
+}
+
+UseDevicePtr make(const parser::OmpClause::UseDevicePtr &inp,
+                  semantics::SemanticsContext &semaCtx) {
+  // inp.v -> parser::OmpObjectList
+  return UseDevicePtr{makeList(inp.v, semaCtx)};
+}
+} // namespace clause
+
+Clause makeClause(const Fortran::parser::OmpClause &cls,
+                  semantics::SemanticsContext &semaCtx) {
+  return std::visit(
+      [&](auto &&s) {
+        return makeClause(getClauseId(cls), clause::make(s, semaCtx),
+                          cls.source);
+      },
+      cls.u);
+}
+
+List<Clause> makeList(const parser::OmpClauseList &clauses,
+                      semantics::SemanticsContext &semaCtx) {
+  return makeList(clauses.v, [&](const parser::OmpClause &s) {
+    return makeClause(s, semaCtx);
+  });
+}
+} // namespace Fortran::lower::omp
diff --git a/flang/lib/Lower/OpenMP/Clauses.h b/flang/lib/Lower/OpenMP/Clauses.h
new file mode 100644
index 00000000000000..3fba593b5349c1
--- /dev/null
+++ b/flang/lib/Lower/OpenMP/Clauses.h
@@ -0,0 +1,212 @@
+//===-- Clauses.h -- OpenMP clause handling -------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#ifndef FORTRAN_LOWER_OPENMP_CLAUSES_H
+#define FORTRAN_LOWER_OPENMP_CLAUSES_H
+
+#include "ClauseT.h"
+
+#include "flang/Evaluate/expression.h"
+#include "flang/Parser/parse-tree.h"
+#include "flang/Semantics/expression.h"
+#include "flang/Semantics/semantics.h"
+#include "flang/Semantics/symbol.h"
+
+#include "llvm/ADT/STLExtras.h"
+
+#include <optional>
+#include <type_traits>
+#include <utility>
+
+namespace Fortran::lower::omp {
+using namespace Fortran;
+using SomeType = evaluate::SomeType;
+using SomeExpr = semantics::SomeExpr;
+using MaybeExpr = semantics::MaybeExpr;
+
+using SymIdent = semantics::Symbol *;
+using SymReference = SomeExpr;
+
+template <typename T>
+using List = tomp::ListT<T>;
+} // namespace Fortran::lower::omp
+
+namespace tomp {
+template <>
+struct ObjectT<Fortran::lower::omp::SymIdent,
+               Fortran::lower::omp::SymReference> {
+  using IdType = Fortran::lower::omp::SymIdent;
+  using ExprType = Fortran::lower::omp::SymReference;
+
+  const IdType &id() const { return symbol; }
+  const std::optional<ExprType> &ref() const { return designator; }
+
+  IdType symbol;
+  std::optional<ExprType> designator;
+};
+} // namespace tomp
+
+namespace Fortran::lower::omp {
+
+using Object = tomp::ObjectT<SymIdent, SymReference>;
+using ObjectList = tomp::ObjectListT<SymIdent, SymReference>;
+
+Object makeObject(const parser::OmpObject &object,
+                  semantics::SemanticsContext &semaCtx);
+Object makeObject(const parser::Name &name,
+                  semantics::SemanticsContext &semaCtx);
+Object makeObject(const parser::Designator &dsg,
+                  semantics::SemanticsContext &semaCtx);
+Object makeObject(const parser::StructureComponent &comp,
+                  semantics::SemanticsContext &semaCtx);
+
+inline auto makeObjectFn(semantics::SemanticsContext &semaCtx) {
+  return [&](auto &&s) { return makeObject(s, semaCtx); };
+}
+
+template <typename T>
+SomeExpr makeExpr(T &&pftExpr, semantics::SemanticsContext &semaCtx) {
+  auto maybeExpr = evaluate::ExpressionAnalyzer(semaCtx).Analyze(pftExpr);
+  assert(maybeExpr);
+  return std::move(*maybeExpr);
+}
+
+inline auto makeExprFn(semantics::SemanticsContext &semaCtx) {
+  return [&](auto &&s) { return makeExpr(s, semaCtx); };
+}
+
+template <
+    typename ContainerTy, typename FunctionTy,
+    typename ElemTy = typename llvm::remove_cvref_t<ContainerTy>::value_type,
+    typename ResultTy = std::invoke_result_t<FunctionTy, ElemTy>>
+List<ResultTy> makeList(ContainerTy &&container, FunctionTy &&func) {
+  List<ResultTy> v;
+  llvm::transform(container, std::back_inserter(v), func);
+  return v;
+}
+
+inline ObjectList makeList(const parser::OmpObjectList &objects,
+                           semantics::SemanticsContext &semaCtx) {
+  return makeList(objects.v, makeObjectFn(semaCtx));
+}
+
+template <typename FuncTy, typename ElemTy,
+          typename ResultTy = std::invoke_result_t<FuncTy, ElemTy>>
+std::optional<ResultTy> maybeApply(FuncTy &&func,
+                                   const std::optional<ElemTy> &inp) {
+  if (!inp)
+    return std::nullopt;
+  return std::move(func(*inp));
+}
+
+std::optional<Object>
+getBaseObject(const Object &object,
+              Fortran::semantics::SemanticsContext &semaCtx);
+
+namespace clause {
+using DefinedOperator = tomp::clause::DefinedOperatorT<SymIdent, SymReference>;
+using ProcedureDesignator =
+    tomp::clause::ProcedureDesignatorT<SymIdent, SymReference>;
+using ReductionOperator =
+    tomp::clause::ReductionOperatorT<SymIdent, SymReference>;
+
+#ifdef EMPTY_CLASS
+#undef EMPTY_CLASS
+#endif
+#define EMPTY_CLASS(cls)                                                       \
+  using cls = tomp::clause::cls##T<SymIdent, SymReference>
+
+#ifdef WRAPPER_CLASS
+#undef WRAPPER_CLASS
+#endif
+#define WRAPPER_CLASS(cls, content)                                            \
+  [[maybe_unused]] extern int xyzzy_semicolon_absorber
+#define GEN_FLANG_CLAUSE_PARSER_CLASSES
+#include "llvm/Frontend/OpenMP/OMP.inc"
+#undef EMPTY_CLASS
+#undef WRAPPER_CLASS
+
+// "Requires" clauses are handled early on, and the aggregated information
+// is stored in the Symbol details of modules, programs, and subprograms.
+// These clauses are still handled here to cover all alternatives in the
+// main clause variant.
+
+using Aligned = tomp::clause::AlignedT<SymIdent, SymReference>;
+using Allocate = tomp::clause::AllocateT<SymIdent, SymReference>;
+using Allocator = tomp::clause::AllocatorT<SymIdent, SymReference>;
+using AtomicDefaultMemOrder =
+    tomp::clause::AtomicDefaultMemOrderT<SymIdent, SymReference>;
+using Collapse = tomp::clause::CollapseT<SymIdent, SymReference>;
+using Copyin = tomp::clause::CopyinT<SymIdent, SymReference>;
+using Copyprivate = tomp::clause::CopyprivateT<SymIdent, SymReference>;
+using Defaultmap = tomp::clause::DefaultmapT<SymIdent, SymReference>;
+using Default = tomp::clause::DefaultT<SymIdent, SymReference>;
+using Depend = tomp::clause::DependT<SymIdent, SymReference>;
+using Device = tomp::clause::DeviceT<SymIdent, SymReference>;
+using DeviceType = tomp::clause::DeviceTypeT<SymIdent, SymReference>;
+using DistSchedule = tomp::clause::DistScheduleT<SymIdent, SymReference>;
+using Enter = tomp::clause::EnterT<SymIdent, SymReference>;
+using Filter = tomp::clause::FilterT<SymIdent, SymReference>;
+using Final = tomp::clause::FinalT<SymIdent, SymReference>;
+using Firstprivate = tomp::clause::FirstprivateT<SymIdent, SymReference>;
+using From = tomp::clause::FromT<SymIdent, SymReference>;
+using Grainsize = tomp::clause::GrainsizeT<SymIdent, SymReference>;
+using HasDeviceAddr = tomp::clause::HasDeviceAddrT<SymIdent, SymReference>;
+using Hint = tomp::clause::HintT<SymIdent, SymReference>;
+using If = tomp::clause::IfT<SymIdent, SymReference>;
+using InReduction = tomp::clause::InReductionT<SymIdent, SymReference>;
+using IsDevicePtr = tomp::clause::IsDevicePtrT<SymIdent, SymReference>;
+using Lastprivate = tomp::clause::LastprivateT<SymIdent, SymReference>;
+using Linear = tomp::clause::LinearT<SymIdent, SymReference>;
+using Link = tomp::clause::LinkT<SymIdent, SymReference>;
+using Map = tomp::clause::MapT<SymIdent, SymReference>;
+using Nocontext = tomp::clause::NocontextT<SymIdent, SymReference>;
+using Nontemporal = tomp::clause::NontemporalT<SymIdent, SymReference>;
+using Novariants = tomp::clause::NovariantsT<SymIdent, SymReference>;
+using NumTasks = tomp::clause::NumTasksT<SymIdent, SymReference>;
+using NumTeams = tomp::clause::NumTeamsT<SymIdent, SymReference>;
+using NumThreads = tomp::clause::NumThreadsT<SymIdent, SymReference>;
+using OmpxDynCgroupMem =
+    tomp::clause::OmpxDynCgroupMemT<SymIdent, SymReference>;
+using Ordered = tomp::clause::OrderedT<SymIdent, SymReference>;
+using Order = tomp::clause::OrderT<SymIdent, SymReference>;
+using Partial = tomp::clause::PartialT<SymIdent, SymReference>;
+using Priority = tomp::clause::PriorityT<SymIdent, SymReference>;
+using Private = tomp::clause::PrivateT<SymIdent, SymReference>;
+using ProcBind = tomp::clause::ProcBindT<SymIdent, SymReference>;
+using Reduction = tomp::clause::ReductionT<SymIdent, SymReference>;
+using Safelen = tomp::clause::SafelenT<SymIdent, SymReference>;
+using Schedule = tomp::clause::ScheduleT<SymIdent, SymReference>;
+using Shared = tomp::clause::SharedT<SymIdent, SymReference>;
+using Simdlen = tomp::clause::SimdlenT<SymIdent, SymReference>;
+using Sizes = tomp::clause::SizesT<SymIdent, SymReference>;
+using TaskReduction = tomp::clause::TaskReductionT<SymIdent, SymReference>;
+using ThreadLimit = tomp::clause::ThreadLimitT<SymIdent, SymReference>;
+using To = tomp::clause::ToT<SymIdent, SymReference>;
+using Uniform = tomp::clause::UniformT<SymIdent, SymReference>;
+using UseDeviceAddr = tomp::clause::UseDeviceAddrT<SymIdent, SymReference>;
+using UseDevicePtr = tomp::clause::UseDevicePtrT<SymIdent, SymReference>;
+} // namespace clause
+
+struct Clause : public tomp::ClauseT<SymIdent, SymReference> {
+  parser::CharBlock source;
+};
+
+template <typename Specific>
+Clause makeClause(llvm::omp::Clause id, Specific &&specific,
+                  parser::CharBlock source = {}) {
+  return Clause{{id, specific}, source};
+}
+
+Clause makeClause(const Fortran::parser::OmpClause &cls,
+                  semantics::SemanticsContext &semaCtx);
+
+List<Clause> makeList(const parser::OmpClauseList &clauses,
+                      semantics::SemanticsContext &semaCtx);
+} // namespace Fortran::lower::omp
+
+#endif // FORTRAN_LOWER_OPENMP_CLAUSES_H
diff --git a/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp b/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp
index 717b8cc0276a30..a5c087e4524146 100644
--- a/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp
+++ b/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp
@@ -50,6 +50,7 @@ void DataSharingProcessor::processStep2(mlir::Operation *op, bool isLoop) {
 }
 
 void DataSharingProcessor::insertDeallocs() {
+  // TODO Extend delayed privatization to include a `dealloc` region.
   for (const Fortran::semantics::Symbol *sym : privatizedSymbols)
     if (Fortran::semantics::IsAllocatable(sym->GetUltimate())) {
       converter.createHostAssociateVarCloneDealloc(*sym);
@@ -81,30 +82,26 @@ void DataSharingProcessor::copyLastPrivateSymbol(
 }
 
 void DataSharingProcessor::collectOmpObjectListSymbol(
-    const Fortran::parser::OmpObjectList &ompObjectList,
+    const omp::ObjectList &objects,
     llvm::SetVector<const Fortran::semantics::Symbol *> &symbolSet) {
-  for (const Fortran::parser::OmpObject &ompObject : ompObjectList.v) {
-    Fortran::semantics::Symbol *sym = getOmpObjectSymbol(ompObject);
-    symbolSet.insert(sym);
-  }
+  for (const omp::Object &object : objects)
+    symbolSet.insert(object.id());
 }
 
 void DataSharingProcessor::collectSymbolsForPrivatization() {
   bool hasCollapse = false;
-  for (const Fortran::parser::OmpClause &clause : opClauseList.v) {
+  for (const omp::Clause &clause : clauses) {
     if (const auto &privateClause =
-            std::get_if<Fortran::parser::OmpClause::Private>(&clause.u)) {
+            std::get_if<omp::clause::Private>(&clause.u)) {
       collectOmpObjectListSymbol(privateClause->v, privatizedSymbols);
     } else if (const auto &firstPrivateClause =
-                   std::get_if<Fortran::parser::OmpClause::Firstprivate>(
-                       &clause.u)) {
+                   std::get_if<omp::clause::Firstprivate>(&clause.u)) {
       collectOmpObjectListSymbol(firstPrivateClause->v, privatizedSymbols);
     } else if (const auto &lastPrivateClause =
-                   std::get_if<Fortran::parser::OmpClause::Lastprivate>(
-                       &clause.u)) {
+                   std::get_if<omp::clause::Lastprivate>(&clause.u)) {
       collectOmpObjectListSymbol(lastPrivateClause->v, privatizedSymbols);
       hasLastPrivateOp = true;
-    } else if (std::get_if<Fortran::parser::OmpClause::Collapse>(&clause.u)) {
+    } else if (std::get_if<omp::clause::Collapse>(&clause.u)) {
       hasCollapse = true;
     }
   }
@@ -137,138 +134,135 @@ void DataSharingProcessor::insertBarrier() {
 void DataSharingProcessor::insertLastPrivateCompare(mlir::Operation *op) {
   bool cmpCreated = false;
   mlir::OpBuilder::InsertPoint localInsPt = firOpBuilder.saveInsertionPoint();
-  for (const Fortran::parser::OmpClause &clause : opClauseList.v) {
-    if (std::get_if<Fortran::parser::OmpClause::Lastprivate>(&clause.u)) {
-      // TODO: Add lastprivate support for simd construct
-      if (mlir::isa<mlir::omp::SectionOp>(op)) {
-        if (&eval == &eval.parentConstruct->getLastNestedEvaluation()) {
-          // For `omp.sections`, lastprivatized variables occur in
-          // lexically final `omp.section` operation. The following FIR
-          // shall be generated for the same:
-          //
-          // omp.sections lastprivate(...) {
-          //  omp.section {...}
-          //  omp.section {...}
-          //  omp.section {
-          //      fir.allocate for `private`/`firstprivate`
-          //      <More operations here>
-          //      fir.if %true {
-          //          ^%lpv_update_blk
-          //      }
-          //  }
-          // }
-          //
-          // To keep code consistency while handling privatization
-          // through this control flow, add a `fir.if` operation
-          // that always evaluates to true, in order to create
-          // a dedicated sub-region in `omp.section` where
-          // lastprivate FIR can reside. Later canonicalizations
-          // will optimize away this operation.
-          if (!eval.lowerAsUnstructured()) {
-            auto ifOp = firOpBuilder.create<fir::IfOp>(
-                op->getLoc(),
-                firOpBuilder.createIntegerConstant(
-                    op->getLoc(), firOpBuilder.getIntegerType(1), 0x1),
-                /*else*/ false);
-            firOpBuilder.setInsertionPointToStart(
-                &ifOp.getThenRegion().front());
-
-            const Fortran::parser::OpenMPConstruct *parentOmpConstruct =
-                eval.parentConstruct->getIf<Fortran::parser::OpenMPConstruct>();
-            assert(parentOmpConstruct &&
-                   "Expected a valid enclosing OpenMP construct");
-            const Fortran::parser::OpenMPSectionsConstruct *sectionsConstruct =
-                std::get_if<Fortran::parser::OpenMPSectionsConstruct>(
-                    &parentOmpConstruct->u);
-            assert(sectionsConstruct &&
-                   "Expected an enclosing omp.sections construct");
-            const Fortran::parser::OmpClauseList &sectionsEndClauseList =
-                std::get<Fortran::parser::OmpClauseList>(
-                    std::get<Fortran::parser::OmpEndSectionsDirective>(
-                        sectionsConstruct->t)
-                        .t);
-            for (const Fortran::parser::OmpClause &otherClause :
-                 sectionsEndClauseList.v)
-              if (std::get_if<Fortran::parser::OmpClause::Nowait>(
-                      &otherClause.u))
-                // Emit implicit barrier to synchronize threads and avoid data
-                // races on post-update of lastprivate variables when `nowait`
-                // clause is present.
-                firOpBuilder.create<mlir::omp::BarrierOp>(
-                    converter.getCurrentLocation());
-            firOpBuilder.setInsertionPointToStart(
-                &ifOp.getThenRegion().front());
-            lastPrivIP = firOpBuilder.saveInsertionPoint();
-            firOpBuilder.setInsertionPoint(ifOp);
-            insPt = firOpBuilder.saveInsertionPoint();
-          } else {
-            // Lastprivate operation is inserted at the end
-            // of the lexically last section in the sections
-            // construct
-            mlir::OpBuilder::InsertPoint unstructuredSectionsIP =
-                firOpBuilder.saveInsertionPoint();
-            mlir::Operation *lastOper = op->getRegion(0).back().getTerminator();
-            firOpBuilder.setInsertionPoint(lastOper);
-            lastPrivIP = firOpBuilder.saveInsertionPoint();
-            firOpBuilder.restoreInsertionPoint(unstructuredSectionsIP);
-          }
-        }
-      } else if (mlir::isa<mlir::omp::WsLoopOp>(op)) {
-        // Update the original variable just before exiting the worksharing
-        // loop. Conversion as follows:
+  for (const omp::Clause &clause : clauses) {
+    if (clause.id != llvm::omp::OMPC_lastprivate)
+      continue;
+    // TODO: Add lastprivate support for simd construct
+    if (mlir::isa<mlir::omp::SectionOp>(op)) {
+      if (&eval == &eval.parentConstruct->getLastNestedEvaluation()) {
+        // For `omp.sections`, lastprivatized variables occur in
+        // lexically final `omp.section` operation. The following FIR
+        // shall be generated for the same:
         //
-        //                       omp.wsloop {
-        // omp.wsloop {            ...
-        //    ...                  store
-        //    store       ===>     %v = arith.addi %iv, %step
-        //    omp.yield            %cmp = %step < 0 ? %v < %ub : %v > %ub
-        // }                       fir.if %cmp {
-        //                           fir.store %v to %loopIV
-        //                           ^%lpv_update_blk:
-        //                         }
-        //                         omp.yield
-        //                       }
+        // omp.sections lastprivate(...) {
+        //  omp.section {...}
+        //  omp.section {...}
+        //  omp.section {
+        //      fir.allocate for `private`/`firstprivate`
+        //      <More operations here>
+        //      fir.if %true {
+        //          ^%lpv_update_blk
+        //      }
+        //  }
+        // }
         //
-
-        // Only generate the compare once in presence of multiple LastPrivate
-        // clauses.
-        if (cmpCreated)
-          continue;
-        cmpCreated = true;
-
-        mlir::Location loc = op->getLoc();
-        mlir::Operation *lastOper = op->getRegion(0).back().getTerminator();
-        firOpBuilder.setInsertionPoint(lastOper);
-
-        mlir::Value iv = op->getRegion(0).front().getArguments()[0];
-        mlir::Value ub =
-            mlir::dyn_cast<mlir::omp::WsLoopOp>(op).getUpperBound()[0];
-        mlir::Value step = mlir::dyn_cast<mlir::omp::WsLoopOp>(op).getStep()[0];
-
-        // v = iv + step
-        // cmp = step < 0 ? v < ub : v > ub
-        mlir::Value v = firOpBuilder.create<mlir::arith::AddIOp>(loc, iv, step);
-        mlir::Value zero =
-            firOpBuilder.createIntegerConstant(loc, step.getType(), 0);
-        mlir::Value negativeStep = firOpBuilder.create<mlir::arith::CmpIOp>(
-            loc, mlir::arith::CmpIPredicate::slt, step, zero);
-        mlir::Value vLT = firOpBuilder.create<mlir::arith::CmpIOp>(
-            loc, mlir::arith::CmpIPredicate::slt, v, ub);
-        mlir::Value vGT = firOpBuilder.create<mlir::arith::CmpIOp>(
-            loc, mlir::arith::CmpIPredicate::sgt, v, ub);
-        mlir::Value cmpOp = firOpBuilder.create<mlir::arith::SelectOp>(
-            loc, negativeStep, vLT, vGT);
-
-        auto ifOp = firOpBuilder.create<fir::IfOp>(loc, cmpOp, /*else*/ false);
-        firOpBuilder.setInsertionPointToStart(&ifOp.getThenRegion().front());
-        assert(loopIV && "loopIV was not set");
-        firOpBuilder.create<fir::StoreOp>(op->getLoc(), v, loopIV);
-        lastPrivIP = firOpBuilder.saveInsertionPoint();
-      } else {
-        TODO(converter.getCurrentLocation(),
-             "lastprivate clause in constructs other than "
-             "simd/worksharing-loop");
+        // To keep code consistency while handling privatization
+        // through this control flow, add a `fir.if` operation
+        // that always evaluates to true, in order to create
+        // a dedicated sub-region in `omp.section` where
+        // lastprivate FIR can reside. Later canonicalizations
+        // will optimize away this operation.
+        if (!eval.lowerAsUnstructured()) {
+          auto ifOp = firOpBuilder.create<fir::IfOp>(
+              op->getLoc(),
+              firOpBuilder.createIntegerConstant(
+                  op->getLoc(), firOpBuilder.getIntegerType(1), 0x1),
+              /*else*/ false);
+          firOpBuilder.setInsertionPointToStart(&ifOp.getThenRegion().front());
+
+          const Fortran::parser::OpenMPConstruct *parentOmpConstruct =
+              eval.parentConstruct->getIf<Fortran::parser::OpenMPConstruct>();
+          assert(parentOmpConstruct &&
+                 "Expected a valid enclosing OpenMP construct");
+          const Fortran::parser::OpenMPSectionsConstruct *sectionsConstruct =
+              std::get_if<Fortran::parser::OpenMPSectionsConstruct>(
+                  &parentOmpConstruct->u);
+          assert(sectionsConstruct &&
+                 "Expected an enclosing omp.sections construct");
+          const Fortran::parser::OmpClauseList &sectionsEndClauseList =
+              std::get<Fortran::parser::OmpClauseList>(
+                  std::get<Fortran::parser::OmpEndSectionsDirective>(
+                      sectionsConstruct->t)
+                      .t);
+          for (const Fortran::parser::OmpClause &otherClause :
+               sectionsEndClauseList.v)
+            if (std::get_if<Fortran::parser::OmpClause::Nowait>(&otherClause.u))
+              // Emit implicit barrier to synchronize threads and avoid data
+              // races on post-update of lastprivate variables when `nowait`
+              // clause is present.
+              firOpBuilder.create<mlir::omp::BarrierOp>(
+                  converter.getCurrentLocation());
+          firOpBuilder.setInsertionPointToStart(&ifOp.getThenRegion().front());
+          lastPrivIP = firOpBuilder.saveInsertionPoint();
+          firOpBuilder.setInsertionPoint(ifOp);
+          insPt = firOpBuilder.saveInsertionPoint();
+        } else {
+          // Lastprivate operation is inserted at the end
+          // of the lexically last section in the sections
+          // construct
+          mlir::OpBuilder::InsertPoint unstructuredSectionsIP =
+              firOpBuilder.saveInsertionPoint();
+          mlir::Operation *lastOper = op->getRegion(0).back().getTerminator();
+          firOpBuilder.setInsertionPoint(lastOper);
+          lastPrivIP = firOpBuilder.saveInsertionPoint();
+          firOpBuilder.restoreInsertionPoint(unstructuredSectionsIP);
+        }
       }
+    } else if (mlir::isa<mlir::omp::WsLoopOp>(op)) {
+      // Update the original variable just before exiting the worksharing
+      // loop. Conversion as follows:
+      //
+      //                       omp.wsloop {
+      // omp.wsloop {            ...
+      //    ...                  store
+      //    store       ===>     %v = arith.addi %iv, %step
+      //    omp.yield            %cmp = %step < 0 ? %v < %ub : %v > %ub
+      // }                       fir.if %cmp {
+      //                           fir.store %v to %loopIV
+      //                           ^%lpv_update_blk:
+      //                         }
+      //                         omp.yield
+      //                       }
+      //
+
+      // Only generate the compare once in presence of multiple LastPrivate
+      // clauses.
+      if (cmpCreated)
+        continue;
+      cmpCreated = true;
+
+      mlir::Location loc = op->getLoc();
+      mlir::Operation *lastOper = op->getRegion(0).back().getTerminator();
+      firOpBuilder.setInsertionPoint(lastOper);
+
+      mlir::Value iv = op->getRegion(0).front().getArguments()[0];
+      mlir::Value ub =
+          mlir::dyn_cast<mlir::omp::WsLoopOp>(op).getUpperBound()[0];
+      mlir::Value step = mlir::dyn_cast<mlir::omp::WsLoopOp>(op).getStep()[0];
+
+      // v = iv + step
+      // cmp = step < 0 ? v < ub : v > ub
+      mlir::Value v = firOpBuilder.create<mlir::arith::AddIOp>(loc, iv, step);
+      mlir::Value zero =
+          firOpBuilder.createIntegerConstant(loc, step.getType(), 0);
+      mlir::Value negativeStep = firOpBuilder.create<mlir::arith::CmpIOp>(
+          loc, mlir::arith::CmpIPredicate::slt, step, zero);
+      mlir::Value vLT = firOpBuilder.create<mlir::arith::CmpIOp>(
+          loc, mlir::arith::CmpIPredicate::slt, v, ub);
+      mlir::Value vGT = firOpBuilder.create<mlir::arith::CmpIOp>(
+          loc, mlir::arith::CmpIPredicate::sgt, v, ub);
+      mlir::Value cmpOp = firOpBuilder.create<mlir::arith::SelectOp>(
+          loc, negativeStep, vLT, vGT);
+
+      auto ifOp = firOpBuilder.create<fir::IfOp>(loc, cmpOp, /*else*/ false);
+      firOpBuilder.setInsertionPointToStart(&ifOp.getThenRegion().front());
+      assert(loopIV && "loopIV was not set");
+      firOpBuilder.create<fir::StoreOp>(op->getLoc(), v, loopIV);
+      lastPrivIP = firOpBuilder.saveInsertionPoint();
+    } else {
+      TODO(converter.getCurrentLocation(),
+           "lastprivate clause in constructs other than "
+           "simd/worksharing-loop");
     }
   }
   firOpBuilder.restoreInsertionPoint(localInsPt);
@@ -292,14 +286,12 @@ void DataSharingProcessor::collectSymbols(
 }
 
 void DataSharingProcessor::collectDefaultSymbols() {
-  for (const Fortran::parser::OmpClause &clause : opClauseList.v) {
-    if (const auto &defaultClause =
-            std::get_if<Fortran::parser::OmpClause::Default>(&clause.u)) {
-      if (defaultClause->v.v ==
-          Fortran::parser::OmpDefaultClause::Type::Private)
+  for (const omp::Clause &clause : clauses) {
+    if (const auto *defaultClause =
+            std::get_if<omp::clause::Default>(&clause.u)) {
+      if (defaultClause->v == omp::clause::Default::Type::Private)
         collectSymbols(Fortran::semantics::Symbol::Flag::OmpPrivate);
-      else if (defaultClause->v.v ==
-               Fortran::parser::OmpDefaultClause::Type::Firstprivate)
+      else if (defaultClause->v == omp::clause::Default::Type::Firstprivate)
         collectSymbols(Fortran::semantics::Symbol::Flag::OmpFirstPrivate);
     }
   }
@@ -376,6 +368,7 @@ void DataSharingProcessor::doPrivatize(const Fortran::semantics::Symbol *sym) {
         symLoc, uniquePrivatizerName, symType,
         isFirstPrivate ? mlir::omp::DataSharingClauseType::FirstPrivate
                        : mlir::omp::DataSharingClauseType::Private);
+    fir::ExtendedValue symExV = converter.getSymbolExtendedValue(*sym);
 
     symTable->pushScope();
 
@@ -386,7 +379,8 @@ void DataSharingProcessor::doPrivatize(const Fortran::semantics::Symbol *sym) {
           &allocRegion, /*insertPt=*/{}, symType, symLoc);
 
       firOpBuilder.setInsertionPointToEnd(allocEntryBlock);
-      symTable->addSymbol(*sym, allocRegion.getArgument(0));
+      symTable->addSymbol(*sym,
+                          fir::substBase(symExV, allocRegion.getArgument(0)));
       symTable->pushScope();
       cloneSymbol(sym);
       firOpBuilder.create<mlir::omp::YieldOp>(
@@ -403,10 +397,12 @@ void DataSharingProcessor::doPrivatize(const Fortran::semantics::Symbol *sym) {
       mlir::Block *copyEntryBlock = firOpBuilder.createBlock(
           &copyRegion, /*insertPt=*/{}, {symType, symType}, {symLoc, symLoc});
       firOpBuilder.setInsertionPointToEnd(copyEntryBlock);
-      symTable->addSymbol(*sym, copyRegion.getArgument(0),
+      symTable->addSymbol(*sym,
+                          fir::substBase(symExV, copyRegion.getArgument(0)),
                           /*force=*/true);
       symTable->pushScope();
-      symTable->addSymbol(*sym, copyRegion.getArgument(1));
+      symTable->addSymbol(*sym,
+                          fir::substBase(symExV, copyRegion.getArgument(1)));
       auto ip = firOpBuilder.saveInsertionPoint();
       copyFirstPrivateSymbol(sym, &ip);
 
diff --git a/flang/lib/Lower/OpenMP/DataSharingProcessor.h b/flang/lib/Lower/OpenMP/DataSharingProcessor.h
index 9f7301df07598f..226abe96705e35 100644
--- a/flang/lib/Lower/OpenMP/DataSharingProcessor.h
+++ b/flang/lib/Lower/OpenMP/DataSharingProcessor.h
@@ -12,6 +12,7 @@
 #ifndef FORTRAN_LOWER_DATASHARINGPROCESSOR_H
 #define FORTRAN_LOWER_DATASHARINGPROCESSOR_H
 
+#include "Clauses.h"
 #include "flang/Lower/AbstractConverter.h"
 #include "flang/Lower/OpenMP.h"
 #include "flang/Optimizer/Builder/FIRBuilder.h"
@@ -52,7 +53,7 @@ class DataSharingProcessor {
   llvm::SetVector<const Fortran::semantics::Symbol *> symbolsInParentRegions;
   Fortran::lower::AbstractConverter &converter;
   fir::FirOpBuilder &firOpBuilder;
-  const Fortran::parser::OmpClauseList &opClauseList;
+  omp::List<omp::Clause> clauses;
   Fortran::lower::pft::Evaluation &eval;
   bool useDelayedPrivatization;
   Fortran::lower::SymMap *symTable;
@@ -61,7 +62,7 @@ class DataSharingProcessor {
   bool needBarrier();
   void collectSymbols(Fortran::semantics::Symbol::Flag flag);
   void collectOmpObjectListSymbol(
-      const Fortran::parser::OmpObjectList &ompObjectList,
+      const omp::ObjectList &objects,
       llvm::SetVector<const Fortran::semantics::Symbol *> &symbolSet);
   void collectSymbolsForPrivatization();
   void insertBarrier();
@@ -81,14 +82,15 @@ class DataSharingProcessor {
 
 public:
   DataSharingProcessor(Fortran::lower::AbstractConverter &converter,
+                       Fortran::semantics::SemanticsContext &semaCtx,
                        const Fortran::parser::OmpClauseList &opClauseList,
                        Fortran::lower::pft::Evaluation &eval,
                        bool useDelayedPrivatization = false,
                        Fortran::lower::SymMap *symTable = nullptr)
       : hasLastPrivateOp(false), converter(converter),
-        firOpBuilder(converter.getFirOpBuilder()), opClauseList(opClauseList),
-        eval(eval), useDelayedPrivatization(useDelayedPrivatization),
-        symTable(symTable) {}
+        firOpBuilder(converter.getFirOpBuilder()),
+        clauses(omp::makeList(opClauseList, semaCtx)), eval(eval),
+        useDelayedPrivatization(useDelayedPrivatization), symTable(symTable) {}
 
   // Privatisation is split into two steps.
   // Step1 performs cloning of all privatisation clauses and copying for
diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp
index 25bb4d9cff5d16..eaaa8fcd165a91 100644
--- a/flang/lib/Lower/OpenMP/OpenMP.cpp
+++ b/flang/lib/Lower/OpenMP/OpenMP.cpp
@@ -287,9 +287,9 @@ struct OpWithBodyGenInfo {
     return *this;
   }
 
-  OpWithBodyGenInfo &
-  setReductions(llvm::SmallVector<const Fortran::semantics::Symbol *> *value1,
-                llvm::SmallVector<mlir::Type> *value2) {
+  OpWithBodyGenInfo &setReductions(
+      llvm::SmallVectorImpl<const Fortran::semantics::Symbol *> *value1,
+      llvm::SmallVectorImpl<mlir::Type> *value2) {
     reductionSymbols = value1;
     reductionTypes = value2;
     return *this;
@@ -317,10 +317,10 @@ struct OpWithBodyGenInfo {
   /// [in] if provided, processes the construct's data-sharing attributes.
   DataSharingProcessor *dsp = nullptr;
   /// [in] if provided, list of reduction symbols
-  llvm::SmallVector<const Fortran::semantics::Symbol *> *reductionSymbols =
+  llvm::SmallVectorImpl<const Fortran::semantics::Symbol *> *reductionSymbols =
       nullptr;
   /// [in] if provided, list of reduction types
-  llvm::SmallVector<mlir::Type> *reductionTypes = nullptr;
+  llvm::SmallVectorImpl<mlir::Type> *reductionTypes = nullptr;
   /// [in] if provided, emits the op's region entry. Otherwise, an emtpy block
   /// is created in the region.
   GenOMPRegionEntryCBFn genRegionEntryCB = nullptr;
@@ -373,7 +373,7 @@ static void createBodyOfOp(Op &op, OpWithBodyGenInfo &info) {
   std::optional<DataSharingProcessor> tempDsp;
   if (privatize) {
     if (!info.dsp) {
-      tempDsp.emplace(info.converter, *info.clauses, info.eval);
+      tempDsp.emplace(info.converter, info.semaCtx, *info.clauses, info.eval);
       tempDsp->processStep1();
     }
   }
@@ -465,11 +465,9 @@ static void genBodyOfTargetDataOp(
     Fortran::lower::AbstractConverter &converter,
     Fortran::semantics::SemanticsContext &semaCtx,
     Fortran::lower::pft::Evaluation &eval, bool genNested,
-    mlir::omp::DataOp &dataOp,
-    const llvm::SmallVector<mlir::Type> &useDeviceTypes,
-    const llvm::SmallVector<mlir::Location> &useDeviceLocs,
-    const llvm::SmallVector<const Fortran::semantics::Symbol *>
-        &useDeviceSymbols,
+    mlir::omp::DataOp &dataOp, llvm::ArrayRef<mlir::Type> useDeviceTypes,
+    llvm::ArrayRef<mlir::Location> useDeviceLocs,
+    llvm::ArrayRef<const Fortran::semantics::Symbol *> useDeviceSymbols,
     const mlir::Location &currentLocation) {
   fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();
   mlir::Region &region = dataOp.getRegion();
@@ -574,8 +572,7 @@ genParallelOp(Fortran::lower::AbstractConverter &converter,
   llvm::SmallVector<const Fortran::semantics::Symbol *> reductionSymbols;
 
   ClauseProcessor cp(converter, semaCtx, clauseList);
-  cp.processIf(Fortran::parser::OmpIfClause::DirectiveNameModifier::Parallel,
-               ifClauseOperand);
+  cp.processIf(clause::If::DirectiveNameModifier::Parallel, ifClauseOperand);
   cp.processNumThreads(stmtCtx, numThreadsClauseOperand);
   cp.processProcBind(procBindKindAttr);
   cp.processDefault();
@@ -628,7 +625,7 @@ genParallelOp(Fortran::lower::AbstractConverter &converter,
   }
 
   bool privatize = !outerCombined;
-  DataSharingProcessor dsp(converter, clauseList, eval,
+  DataSharingProcessor dsp(converter, semaCtx, clauseList, eval,
                            /*useDelayedPrivatization=*/true, &symTable);
 
   if (privatize)
@@ -751,8 +748,7 @@ genTaskOp(Fortran::lower::AbstractConverter &converter,
       dependOperands;
 
   ClauseProcessor cp(converter, semaCtx, clauseList);
-  cp.processIf(Fortran::parser::OmpIfClause::DirectiveNameModifier::Task,
-               ifClauseOperand);
+  cp.processIf(clause::If::DirectiveNameModifier::Task, ifClauseOperand);
   cp.processAllocate(allocatorOperands, allocateOperands);
   cp.processDefault();
   cp.processFinal(stmtCtx, finalClauseOperand);
@@ -816,11 +812,12 @@ genTaskGroupOp(Fortran::lower::AbstractConverter &converter,
 //  clause. Support for such list items in a use_device_ptr clause
 //  is deprecated."
 static void promoteNonCPtrUseDevicePtrArgsToUseDeviceAddr(
-    llvm::SmallVector<mlir::Value> &devicePtrOperands,
-    llvm::SmallVector<mlir::Value> &deviceAddrOperands,
-    llvm::SmallVector<mlir::Type> &useDeviceTypes,
-    llvm::SmallVector<mlir::Location> &useDeviceLocs,
-    llvm::SmallVector<const Fortran::semantics::Symbol *> &useDeviceSymbols) {
+    llvm::SmallVectorImpl<mlir::Value> &devicePtrOperands,
+    llvm::SmallVectorImpl<mlir::Value> &deviceAddrOperands,
+    llvm::SmallVectorImpl<mlir::Type> &useDeviceTypes,
+    llvm::SmallVectorImpl<mlir::Location> &useDeviceLocs,
+    llvm::SmallVectorImpl<const Fortran::semantics::Symbol *>
+        &useDeviceSymbols) {
   auto moveElementToBack = [](size_t idx, auto &vector) {
     auto *iter = std::next(vector.begin(), idx);
     vector.push_back(*iter);
@@ -865,8 +862,7 @@ genDataOp(Fortran::lower::AbstractConverter &converter,
   llvm::SmallVector<const Fortran::semantics::Symbol *> useDeviceSymbols;
 
   ClauseProcessor cp(converter, semaCtx, clauseList);
-  cp.processIf(Fortran::parser::OmpIfClause::DirectiveNameModifier::TargetData,
-               ifClauseOperand);
+  cp.processIf(clause::If::DirectiveNameModifier::TargetData, ifClauseOperand);
   cp.processDevice(stmtCtx, deviceOperand);
   cp.processUseDevicePtr(devicePtrOperands, useDeviceTypes, useDeviceLocs,
                          useDeviceSymbols);
@@ -911,20 +907,17 @@ genEnterExitUpdateDataOp(Fortran::lower::AbstractConverter &converter,
   llvm::SmallVector<mlir::Value> mapOperands, dependOperands;
   llvm::SmallVector<mlir::Attribute> dependTypeOperands;
 
-  Fortran::parser::OmpIfClause::DirectiveNameModifier directiveName;
+  clause::If::DirectiveNameModifier directiveName;
   // GCC 9.3.0 emits a (probably) bogus warning about an unused variable.
   [[maybe_unused]] llvm::omp::Directive directive;
   if constexpr (std::is_same_v<OpTy, mlir::omp::EnterDataOp>) {
-    directiveName =
-        Fortran::parser::OmpIfClause::DirectiveNameModifier::TargetEnterData;
+    directiveName = clause::If::DirectiveNameModifier::TargetEnterData;
     directive = llvm::omp::Directive::OMPD_target_enter_data;
   } else if constexpr (std::is_same_v<OpTy, mlir::omp::ExitDataOp>) {
-    directiveName =
-        Fortran::parser::OmpIfClause::DirectiveNameModifier::TargetExitData;
+    directiveName = clause::If::DirectiveNameModifier::TargetExitData;
     directive = llvm::omp::Directive::OMPD_target_exit_data;
   } else if constexpr (std::is_same_v<OpTy, mlir::omp::UpdateDataOp>) {
-    directiveName =
-        Fortran::parser::OmpIfClause::DirectiveNameModifier::TargetUpdate;
+    directiveName = clause::If::DirectiveNameModifier::TargetUpdate;
     directive = llvm::omp::Directive::OMPD_target_update;
   } else {
     return nullptr;
@@ -957,15 +950,15 @@ genEnterExitUpdateDataOp(Fortran::lower::AbstractConverter &converter,
 
 // This functions creates a block for the body of the targetOp's region. It adds
 // all the symbols present in mapSymbols as block arguments to this block.
-static void genBodyOfTargetOp(
-    Fortran::lower::AbstractConverter &converter,
-    Fortran::semantics::SemanticsContext &semaCtx,
-    Fortran::lower::pft::Evaluation &eval, bool genNested,
-    mlir::omp::TargetOp &targetOp,
-    const llvm::SmallVector<mlir::Type> &mapSymTypes,
-    const llvm::SmallVector<mlir::Location> &mapSymLocs,
-    const llvm::SmallVector<const Fortran::semantics::Symbol *> &mapSymbols,
-    const mlir::Location &currentLocation) {
+static void
+genBodyOfTargetOp(Fortran::lower::AbstractConverter &converter,
+                  Fortran::semantics::SemanticsContext &semaCtx,
+                  Fortran::lower::pft::Evaluation &eval, bool genNested,
+                  mlir::omp::TargetOp &targetOp,
+                  llvm::ArrayRef<mlir::Type> mapSymTypes,
+                  llvm::ArrayRef<mlir::Location> mapSymLocs,
+                  llvm::ArrayRef<const Fortran::semantics::Symbol *> mapSymbols,
+                  const mlir::Location &currentLocation) {
   assert(mapSymTypes.size() == mapSymLocs.size());
 
   fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();
@@ -1126,8 +1119,7 @@ genTargetOp(Fortran::lower::AbstractConverter &converter,
   llvm::SmallVector<const Fortran::semantics::Symbol *> mapSymbols;
 
   ClauseProcessor cp(converter, semaCtx, clauseList);
-  cp.processIf(Fortran::parser::OmpIfClause::DirectiveNameModifier::Target,
-               ifClauseOperand);
+  cp.processIf(clause::If::DirectiveNameModifier::Target, ifClauseOperand);
   cp.processDevice(stmtCtx, deviceOperand);
   cp.processThreadLimit(stmtCtx, threadLimitOperand);
   cp.processDepend(dependTypeOperands, dependOperands);
@@ -1258,8 +1250,7 @@ genTeamsOp(Fortran::lower::AbstractConverter &converter,
   llvm::SmallVector<mlir::Attribute> reductionDeclSymbols;
 
   ClauseProcessor cp(converter, semaCtx, clauseList);
-  cp.processIf(Fortran::parser::OmpIfClause::DirectiveNameModifier::Teams,
-               ifClauseOperand);
+  cp.processIf(clause::If::DirectiveNameModifier::Teams, ifClauseOperand);
   cp.processAllocate(allocatorOperands, allocateOperands);
   cp.processDefault();
   cp.processNumTeams(stmtCtx, numTeamsClauseOperand);
@@ -1298,8 +1289,9 @@ static mlir::omp::DeclareTargetDeviceType getDeclareTargetInfo(
 
   if (const auto *objectList{
           Fortran::parser::Unwrap<Fortran::parser::OmpObjectList>(spec.u)}) {
+    ObjectList objects{makeList(*objectList, semaCtx)};
     // Case: declare target(func, var1, var2)
-    gatherFuncAndVarSyms(*objectList, mlir::omp::DeclareTargetCaptureClause::to,
+    gatherFuncAndVarSyms(objects, mlir::omp::DeclareTargetCaptureClause::to,
                          symbolAndClause);
   } else if (const auto *clauseList{
                  Fortran::parser::Unwrap<Fortran::parser::OmpClauseList>(
@@ -1438,7 +1430,7 @@ genOmpFlush(Fortran::lower::AbstractConverter &converter,
   if (const auto &ompObjectList =
           std::get<std::optional<Fortran::parser::OmpObjectList>>(
               flushConstruct.t))
-    genObjectList(*ompObjectList, converter, operandRange);
+    genObjectList2(*ompObjectList, converter, operandRange);
   const auto &memOrderClause =
       std::get<std::optional<std::list<Fortran::parser::OmpMemoryOrderClause>>>(
           flushConstruct.t);
@@ -1498,7 +1490,7 @@ static void convertLoopBounds(Fortran::lower::AbstractConverter &converter,
 static llvm::SmallVector<const Fortran::semantics::Symbol *>
 genLoopVars(mlir::Operation *op, Fortran::lower::AbstractConverter &converter,
             mlir::Location &loc,
-            const llvm::SmallVector<const Fortran::semantics::Symbol *> &args) {
+            llvm::ArrayRef<const Fortran::semantics::Symbol *> args) {
   fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();
   auto &region = op->getRegion(0);
 
@@ -1519,16 +1511,16 @@ genLoopVars(mlir::Operation *op, Fortran::lower::AbstractConverter &converter,
   }
   firOpBuilder.setInsertionPointAfter(storeOp);
 
-  return args;
+  return llvm::SmallVector<const Fortran::semantics::Symbol *>(args);
 }
 
 static llvm::SmallVector<const Fortran::semantics::Symbol *>
 genLoopAndReductionVars(
     mlir::Operation *op, Fortran::lower::AbstractConverter &converter,
     mlir::Location &loc,
-    const llvm::SmallVector<const Fortran::semantics::Symbol *> &loopArgs,
-    const llvm::SmallVector<const Fortran::semantics::Symbol *> &reductionArgs,
-    llvm::SmallVector<mlir::Type> &reductionTypes) {
+    llvm::ArrayRef<const Fortran::semantics::Symbol *> loopArgs,
+    llvm::ArrayRef<const Fortran::semantics::Symbol *> reductionArgs,
+    llvm::SmallVectorImpl<mlir::Type> &reductionTypes) {
   fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();
 
   llvm::SmallVector<mlir::Type> blockArgTypes;
@@ -1571,7 +1563,7 @@ genLoopAndReductionVars(
     converter.bindSymbol(*arg, prv);
   }
 
-  return loopArgs;
+  return llvm::SmallVector<const Fortran::semantics::Symbol *>(loopArgs);
 }
 
 static void
@@ -1582,7 +1574,7 @@ createSimdLoop(Fortran::lower::AbstractConverter &converter,
                const Fortran::parser::OmpClauseList &loopOpClauseList,
                mlir::Location loc) {
   fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();
-  DataSharingProcessor dsp(converter, loopOpClauseList, eval);
+  DataSharingProcessor dsp(converter, semaCtx, loopOpClauseList, eval);
   dsp.processStep1();
 
   Fortran::lower::StatementContext stmtCtx;
@@ -1600,8 +1592,7 @@ createSimdLoop(Fortran::lower::AbstractConverter &converter,
                      loopVarTypeSize);
   cp.processScheduleChunk(stmtCtx, scheduleChunkClauseOperand);
   cp.processReduction(loc, reductionVars, reductionDeclSymbols);
-  cp.processIf(Fortran::parser::OmpIfClause::DirectiveNameModifier::Simd,
-               ifClauseOperand);
+  cp.processIf(clause::If::DirectiveNameModifier::Simd, ifClauseOperand);
   cp.processSimdlen(simdlenClauseOperand);
   cp.processSafelen(safelenClauseOperand);
   cp.processTODO<Fortran::parser::OmpClause::Aligned,
@@ -1642,7 +1633,7 @@ static void createWsLoop(Fortran::lower::AbstractConverter &converter,
                          const Fortran::parser::OmpClauseList *endClauseList,
                          mlir::Location loc) {
   fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();
-  DataSharingProcessor dsp(converter, beginClauseList, eval);
+  DataSharingProcessor dsp(converter, semaCtx, beginClauseList, eval);
   dsp.processStep1();
 
   Fortran::lower::StatementContext stmtCtx;
@@ -2419,106 +2410,100 @@ void Fortran::lower::genOpenMPReduction(
     const Fortran::parser::OmpClauseList &clauseList) {
   fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();
 
-  for (const Fortran::parser::OmpClause &clause : clauseList.v) {
+  List<Clause> clauses{makeList(clauseList, semaCtx)};
+
+  for (const Clause &clause : clauses) {
     if (const auto &reductionClause =
-            std::get_if<Fortran::parser::OmpClause::Reduction>(&clause.u)) {
-      const auto &redOperator{std::get<Fortran::parser::OmpReductionOperator>(
-          reductionClause->v.t)};
-      const auto &objectList{
-          std::get<Fortran::parser::OmpObjectList>(reductionClause->v.t)};
+            std::get_if<clause::Reduction>(&clause.u)) {
+      const auto &redOperator{
+          std::get<clause::ReductionOperator>(reductionClause->t)};
+      const auto &objects{std::get<ObjectList>(reductionClause->t)};
       if (const auto *reductionOp =
-              std::get_if<Fortran::parser::DefinedOperator>(&redOperator.u)) {
+              std::get_if<clause::DefinedOperator>(&redOperator.u)) {
         const auto &intrinsicOp{
-            std::get<Fortran::parser::DefinedOperator::IntrinsicOperator>(
+            std::get<clause::DefinedOperator::IntrinsicOperator>(
                 reductionOp->u)};
 
         switch (intrinsicOp) {
-        case Fortran::parser::DefinedOperator::IntrinsicOperator::Add:
-        case Fortran::parser::DefinedOperator::IntrinsicOperator::Multiply:
-        case Fortran::parser::DefinedOperator::IntrinsicOperator::AND:
-        case Fortran::parser::DefinedOperator::IntrinsicOperator::EQV:
-        case Fortran::parser::DefinedOperator::IntrinsicOperator::OR:
-        case Fortran::parser::DefinedOperator::IntrinsicOperator::NEQV:
+        case clause::DefinedOperator::IntrinsicOperator::Add:
+        case clause::DefinedOperator::IntrinsicOperator::Multiply:
+        case clause::DefinedOperator::IntrinsicOperator::AND:
+        case clause::DefinedOperator::IntrinsicOperator::EQV:
+        case clause::DefinedOperator::IntrinsicOperator::OR:
+        case clause::DefinedOperator::IntrinsicOperator::NEQV:
           break;
         default:
           continue;
         }
-        for (const Fortran::parser::OmpObject &ompObject : objectList.v) {
-          if (const auto *name{
-                  Fortran::parser::Unwrap<Fortran::parser::Name>(ompObject)}) {
-            if (const Fortran::semantics::Symbol * symbol{name->symbol}) {
-              mlir::Value reductionVal = converter.getSymbolAddress(*symbol);
-              if (auto declOp = reductionVal.getDefiningOp<hlfir::DeclareOp>())
-                reductionVal = declOp.getBase();
-              mlir::Type reductionType =
-                  reductionVal.getType().cast<fir::ReferenceType>().getEleTy();
-              if (!reductionType.isa<fir::LogicalType>()) {
-                if (!reductionType.isIntOrIndexOrFloat())
-                  continue;
-              }
-              for (mlir::OpOperand &reductionValUse : reductionVal.getUses()) {
-                if (auto loadOp = mlir::dyn_cast<fir::LoadOp>(
-                        reductionValUse.getOwner())) {
-                  mlir::Value loadVal = loadOp.getRes();
-                  if (reductionType.isa<fir::LogicalType>()) {
-                    mlir::Operation *reductionOp = findReductionChain(loadVal);
-                    fir::ConvertOp convertOp =
-                        getConvertFromReductionOp(reductionOp, loadVal);
-                    updateReduction(reductionOp, firOpBuilder, loadVal,
-                                    reductionVal, &convertOp);
-                    removeStoreOp(reductionOp, reductionVal);
-                  } else if (mlir::Operation *reductionOp =
-                                 findReductionChain(loadVal, &reductionVal)) {
-                    updateReduction(reductionOp, firOpBuilder, loadVal,
-                                    reductionVal);
-                  }
+        for (const Object &object : objects) {
+          if (const Fortran::semantics::Symbol *symbol = object.id()) {
+            mlir::Value reductionVal = converter.getSymbolAddress(*symbol);
+            if (auto declOp = reductionVal.getDefiningOp<hlfir::DeclareOp>())
+              reductionVal = declOp.getBase();
+            mlir::Type reductionType =
+                reductionVal.getType().cast<fir::ReferenceType>().getEleTy();
+            if (!reductionType.isa<fir::LogicalType>()) {
+              if (!reductionType.isIntOrIndexOrFloat())
+                continue;
+            }
+            for (mlir::OpOperand &reductionValUse : reductionVal.getUses()) {
+              if (auto loadOp =
+                      mlir::dyn_cast<fir::LoadOp>(reductionValUse.getOwner())) {
+                mlir::Value loadVal = loadOp.getRes();
+                if (reductionType.isa<fir::LogicalType>()) {
+                  mlir::Operation *reductionOp = findReductionChain(loadVal);
+                  fir::ConvertOp convertOp =
+                      getConvertFromReductionOp(reductionOp, loadVal);
+                  updateReduction(reductionOp, firOpBuilder, loadVal,
+                                  reductionVal, &convertOp);
+                  removeStoreOp(reductionOp, reductionVal);
+                } else if (mlir::Operation *reductionOp =
+                               findReductionChain(loadVal, &reductionVal)) {
+                  updateReduction(reductionOp, firOpBuilder, loadVal,
+                                  reductionVal);
                 }
               }
             }
           }
         }
       } else if (const auto *reductionIntrinsic =
-                     std::get_if<Fortran::parser::ProcedureDesignator>(
-                         &redOperator.u)) {
+                     std::get_if<clause::ProcedureDesignator>(&redOperator.u)) {
         if (!ReductionProcessor::supportedIntrinsicProcReduction(
                 *reductionIntrinsic))
           continue;
         ReductionProcessor::ReductionIdentifier redId =
             ReductionProcessor::getReductionType(*reductionIntrinsic);
-        for (const Fortran::parser::OmpObject &ompObject : objectList.v) {
-          if (const auto *name{
-                  Fortran::parser::Unwrap<Fortran::parser::Name>(ompObject)}) {
-            if (const Fortran::semantics::Symbol * symbol{name->symbol}) {
-              mlir::Value reductionVal = converter.getSymbolAddress(*symbol);
-              if (auto declOp = reductionVal.getDefiningOp<hlfir::DeclareOp>())
-                reductionVal = declOp.getBase();
-              for (const mlir::OpOperand &reductionValUse :
-                   reductionVal.getUses()) {
-                if (auto loadOp = mlir::dyn_cast<fir::LoadOp>(
-                        reductionValUse.getOwner())) {
-                  mlir::Value loadVal = loadOp.getRes();
-                  // Max is lowered as a compare -> select.
-                  // Match the pattern here.
-                  mlir::Operation *reductionOp =
-                      findReductionChain(loadVal, &reductionVal);
-                  if (reductionOp == nullptr)
-                    continue;
-
-                  if (redId == ReductionProcessor::ReductionIdentifier::MAX ||
-                      redId == ReductionProcessor::ReductionIdentifier::MIN) {
-                    assert(mlir::isa<mlir::arith::SelectOp>(reductionOp) &&
-                           "Selection Op not found in reduction intrinsic");
-                    mlir::Operation *compareOp =
-                        getCompareFromReductionOp(reductionOp, loadVal);
-                    updateReduction(compareOp, firOpBuilder, loadVal,
-                                    reductionVal);
-                  }
-                  if (redId == ReductionProcessor::ReductionIdentifier::IOR ||
-                      redId == ReductionProcessor::ReductionIdentifier::IEOR ||
-                      redId == ReductionProcessor::ReductionIdentifier::IAND) {
-                    updateReduction(reductionOp, firOpBuilder, loadVal,
-                                    reductionVal);
-                  }
+        for (const Object &object : objects) {
+          if (const Fortran::semantics::Symbol *symbol = object.id()) {
+            mlir::Value reductionVal = converter.getSymbolAddress(*symbol);
+            if (auto declOp = reductionVal.getDefiningOp<hlfir::DeclareOp>())
+              reductionVal = declOp.getBase();
+            for (const mlir::OpOperand &reductionValUse :
+                 reductionVal.getUses()) {
+              if (auto loadOp =
+                      mlir::dyn_cast<fir::LoadOp>(reductionValUse.getOwner())) {
+                mlir::Value loadVal = loadOp.getRes();
+                // Max is lowered as a compare -> select.
+                // Match the pattern here.
+                mlir::Operation *reductionOp =
+                    findReductionChain(loadVal, &reductionVal);
+                if (reductionOp == nullptr)
+                  continue;
+
+                if (redId == ReductionProcessor::ReductionIdentifier::MAX ||
+                    redId == ReductionProcessor::ReductionIdentifier::MIN) {
+                  assert(mlir::isa<mlir::arith::SelectOp>(reductionOp) &&
+                         "Selection Op not found in reduction intrinsic");
+                  mlir::Operation *compareOp =
+                      getCompareFromReductionOp(reductionOp, loadVal);
+                  updateReduction(compareOp, firOpBuilder, loadVal,
+                                  reductionVal);
+                }
+                if (redId == ReductionProcessor::ReductionIdentifier::IOR ||
+                    redId == ReductionProcessor::ReductionIdentifier::IEOR ||
+                    redId == ReductionProcessor::ReductionIdentifier::IAND) {
+                  updateReduction(reductionOp, firOpBuilder, loadVal,
+                                  reductionVal);
                 }
               }
             }
diff --git a/flang/lib/Lower/OpenMP/ReductionProcessor.cpp b/flang/lib/Lower/OpenMP/ReductionProcessor.cpp
index e6a63dd4b939ce..54d9fc556973f7 100644
--- a/flang/lib/Lower/OpenMP/ReductionProcessor.cpp
+++ b/flang/lib/Lower/OpenMP/ReductionProcessor.cpp
@@ -13,6 +13,7 @@
 #include "ReductionProcessor.h"
 
 #include "flang/Lower/AbstractConverter.h"
+#include "flang/Optimizer/Builder/HLFIRTools.h"
 #include "flang/Optimizer/Builder/Todo.h"
 #include "flang/Optimizer/Dialect/FIRType.h"
 #include "flang/Optimizer/HLFIR/HLFIROps.h"
@@ -30,9 +31,9 @@ namespace lower {
 namespace omp {
 
 ReductionProcessor::ReductionIdentifier ReductionProcessor::getReductionType(
-    const Fortran::parser::ProcedureDesignator &pd) {
+    const omp::clause::ProcedureDesignator &pd) {
   auto redType = llvm::StringSwitch<std::optional<ReductionIdentifier>>(
-                     ReductionProcessor::getRealName(pd).ToString())
+                     getRealName(pd.v.id()).ToString())
                      .Case("max", ReductionIdentifier::MAX)
                      .Case("min", ReductionIdentifier::MIN)
                      .Case("iand", ReductionIdentifier::IAND)
@@ -44,21 +45,21 @@ ReductionProcessor::ReductionIdentifier ReductionProcessor::getReductionType(
 }
 
 ReductionProcessor::ReductionIdentifier ReductionProcessor::getReductionType(
-    Fortran::parser::DefinedOperator::IntrinsicOperator intrinsicOp) {
+    omp::clause::DefinedOperator::IntrinsicOperator intrinsicOp) {
   switch (intrinsicOp) {
-  case Fortran::parser::DefinedOperator::IntrinsicOperator::Add:
+  case omp::clause::DefinedOperator::IntrinsicOperator::Add:
     return ReductionIdentifier::ADD;
-  case Fortran::parser::DefinedOperator::IntrinsicOperator::Subtract:
+  case omp::clause::DefinedOperator::IntrinsicOperator::Subtract:
     return ReductionIdentifier::SUBTRACT;
-  case Fortran::parser::DefinedOperator::IntrinsicOperator::Multiply:
+  case omp::clause::DefinedOperator::IntrinsicOperator::Multiply:
     return ReductionIdentifier::MULTIPLY;
-  case Fortran::parser::DefinedOperator::IntrinsicOperator::AND:
+  case omp::clause::DefinedOperator::IntrinsicOperator::AND:
     return ReductionIdentifier::AND;
-  case Fortran::parser::DefinedOperator::IntrinsicOperator::EQV:
+  case omp::clause::DefinedOperator::IntrinsicOperator::EQV:
     return ReductionIdentifier::EQV;
-  case Fortran::parser::DefinedOperator::IntrinsicOperator::OR:
+  case omp::clause::DefinedOperator::IntrinsicOperator::OR:
     return ReductionIdentifier::OR;
-  case Fortran::parser::DefinedOperator::IntrinsicOperator::NEQV:
+  case omp::clause::DefinedOperator::IntrinsicOperator::NEQV:
     return ReductionIdentifier::NEQV;
   default:
     llvm_unreachable("unexpected intrinsic operator in reduction");
@@ -66,13 +67,11 @@ ReductionProcessor::ReductionIdentifier ReductionProcessor::getReductionType(
 }
 
 bool ReductionProcessor::supportedIntrinsicProcReduction(
-    const Fortran::parser::ProcedureDesignator &pd) {
-  const auto *name{Fortran::parser::Unwrap<Fortran::parser::Name>(pd)};
-  assert(name && "Invalid Reduction Intrinsic.");
-  if (!name->symbol->GetUltimate().attrs().test(
-          Fortran::semantics::Attr::INTRINSIC))
+    const omp::clause::ProcedureDesignator &pd) {
+  Fortran::semantics::Symbol *sym = pd.v.id();
+  if (!sym->GetUltimate().attrs().test(Fortran::semantics::Attr::INTRINSIC))
     return false;
-  auto redType = llvm::StringSwitch<bool>(getRealName(name).ToString())
+  auto redType = llvm::StringSwitch<bool>(getRealName(sym).ToString())
                      .Case("max", true)
                      .Case("min", true)
                      .Case("iand", true)
@@ -92,31 +91,63 @@ std::string ReductionProcessor::getReductionName(llvm::StringRef name,
   if (isByRef)
     byrefAddition = "_byref";
 
-  return (llvm::Twine(name) +
-          (ty.isIntOrIndex() ? llvm::Twine("_i_") : llvm::Twine("_f_")) +
-          llvm::Twine(ty.getIntOrFloatBitWidth()) + byrefAddition)
-      .str();
+  if (fir::isa_trivial(ty))
+    return (llvm::Twine(name) +
+            (ty.isIntOrIndex() ? llvm::Twine("_i_") : llvm::Twine("_f_")) +
+            llvm::Twine(ty.getIntOrFloatBitWidth()) + byrefAddition)
+        .str();
+
+  // creates a name like reduction_i_64_box_ux4x3
+  if (auto boxTy = mlir::dyn_cast_or_null<fir::BaseBoxType>(ty)) {
+    // TODO: support for allocatable boxes:
+    // !fir.box<!fir.heap<!fir.array<...>>>
+    fir::SequenceType seqTy = fir::unwrapRefType(boxTy.getEleTy())
+                                  .dyn_cast_or_null<fir::SequenceType>();
+    if (!seqTy)
+      return {};
+
+    std::string prefix = getReductionName(
+        name, fir::unwrapSeqOrBoxedSeqType(ty), /*isByRef=*/false);
+    if (prefix.empty())
+      return {};
+    std::stringstream tyStr;
+    tyStr << prefix << "_box_";
+    bool first = true;
+    for (std::int64_t extent : seqTy.getShape()) {
+      if (first)
+        first = false;
+      else
+        tyStr << "x";
+      if (extent == seqTy.getUnknownExtent())
+        tyStr << 'u'; // I'm not sure that '?' is safe in symbol names
+      else
+        tyStr << extent;
+    }
+    return (tyStr.str() + byrefAddition).str();
+  }
+
+  return {};
 }
 
 std::string ReductionProcessor::getReductionName(
-    Fortran::parser::DefinedOperator::IntrinsicOperator intrinsicOp,
-    mlir::Type ty, bool isByRef) {
+    omp::clause::DefinedOperator::IntrinsicOperator intrinsicOp, mlir::Type ty,
+    bool isByRef) {
   std::string reductionName;
 
   switch (intrinsicOp) {
-  case Fortran::parser::DefinedOperator::IntrinsicOperator::Add:
+  case omp::clause::DefinedOperator::IntrinsicOperator::Add:
     reductionName = "add_reduction";
     break;
-  case Fortran::parser::DefinedOperator::IntrinsicOperator::Multiply:
+  case omp::clause::DefinedOperator::IntrinsicOperator::Multiply:
     reductionName = "multiply_reduction";
     break;
-  case Fortran::parser::DefinedOperator::IntrinsicOperator::AND:
+  case omp::clause::DefinedOperator::IntrinsicOperator::AND:
     return "and_reduction";
-  case Fortran::parser::DefinedOperator::IntrinsicOperator::EQV:
+  case omp::clause::DefinedOperator::IntrinsicOperator::EQV:
     return "eqv_reduction";
-  case Fortran::parser::DefinedOperator::IntrinsicOperator::OR:
+  case omp::clause::DefinedOperator::IntrinsicOperator::OR:
     return "or_reduction";
-  case Fortran::parser::DefinedOperator::IntrinsicOperator::NEQV:
+  case omp::clause::DefinedOperator::IntrinsicOperator::NEQV:
     return "neqv_reduction";
   default:
     reductionName = "other_reduction";
@@ -283,6 +314,148 @@ mlir::Value ReductionProcessor::createScalarCombiner(
   return reductionOp;
 }
 
+/// Create reduction combiner region for reduction variables which are boxed
+/// arrays
+static void genBoxCombiner(fir::FirOpBuilder &builder, mlir::Location loc,
+                           ReductionProcessor::ReductionIdentifier redId,
+                           fir::BaseBoxType boxTy, mlir::Value lhs,
+                           mlir::Value rhs) {
+  fir::SequenceType seqTy =
+      mlir::dyn_cast_or_null<fir::SequenceType>(boxTy.getEleTy());
+  // TODO: support allocatable arrays: !fir.box<!fir.heap<!fir.array<...>>>
+  if (!seqTy || seqTy.hasUnknownShape())
+    TODO(loc, "Unsupported boxed type in OpenMP reduction");
+
+  // load fir.ref<fir.box<...>>
+  mlir::Value lhsAddr = lhs;
+  lhs = builder.create<fir::LoadOp>(loc, lhs);
+  rhs = builder.create<fir::LoadOp>(loc, rhs);
+
+  const unsigned rank = seqTy.getDimension();
+  llvm::SmallVector<mlir::Value> extents;
+  extents.reserve(rank);
+  llvm::SmallVector<mlir::Value> lbAndExtents;
+  lbAndExtents.reserve(rank * 2);
+
+  // Get box lowerbounds and extents:
+  mlir::Type idxTy = builder.getIndexType();
+  for (unsigned i = 0; i < rank; ++i) {
+    // TODO: ideally we want to hoist box reads out of the critical section.
+    // We could do this by having box dimensions in block arguments like
+    // OpenACC does
+    mlir::Value dim = builder.createIntegerConstant(loc, idxTy, i);
+    auto dimInfo =
+        builder.create<fir::BoxDimsOp>(loc, idxTy, idxTy, idxTy, lhs, dim);
+    extents.push_back(dimInfo.getExtent());
+    lbAndExtents.push_back(dimInfo.getLowerBound());
+    lbAndExtents.push_back(dimInfo.getExtent());
+  }
+
+  auto shapeShiftTy = fir::ShapeShiftType::get(builder.getContext(), rank);
+  auto shapeShift =
+      builder.create<fir::ShapeShiftOp>(loc, shapeShiftTy, lbAndExtents);
+
+  // Iterate over array elements, applying the equivalent scalar reduction:
+
+  // A hlfir::elemental here gets inlined with a temporary so create the
+  // loop nest directly.
+  // This function already controls all of the code in this region so we
+  // know this won't miss any opportuinties for clever elemental inlining
+  hlfir::LoopNest nest =
+      hlfir::genLoopNest(loc, builder, extents, /*isUnordered=*/true);
+  builder.setInsertionPointToStart(nest.innerLoop.getBody());
+  mlir::Type refTy = fir::ReferenceType::get(seqTy.getEleTy());
+  auto lhsEleAddr = builder.create<fir::ArrayCoorOp>(
+      loc, refTy, lhs, shapeShift, /*slice=*/mlir::Value{},
+      nest.oneBasedIndices, /*typeparms=*/mlir::ValueRange{});
+  auto rhsEleAddr = builder.create<fir::ArrayCoorOp>(
+      loc, refTy, rhs, shapeShift, /*slice=*/mlir::Value{},
+      nest.oneBasedIndices, /*typeparms=*/mlir::ValueRange{});
+  auto lhsEle = builder.create<fir::LoadOp>(loc, lhsEleAddr);
+  auto rhsEle = builder.create<fir::LoadOp>(loc, rhsEleAddr);
+  mlir::Value scalarReduction = ReductionProcessor::createScalarCombiner(
+      builder, loc, redId, refTy, lhsEle, rhsEle);
+  builder.create<fir::StoreOp>(loc, scalarReduction, lhsEleAddr);
+
+  builder.setInsertionPointAfter(nest.outerLoop);
+  builder.create<mlir::omp::YieldOp>(loc, lhsAddr);
+}
+
+// generate combiner region for reduction operations
+static void genCombiner(fir::FirOpBuilder &builder, mlir::Location loc,
+                        ReductionProcessor::ReductionIdentifier redId,
+                        mlir::Type ty, mlir::Value lhs, mlir::Value rhs,
+                        bool isByRef) {
+  ty = fir::unwrapRefType(ty);
+
+  if (fir::isa_trivial(ty)) {
+    mlir::Value lhsLoaded = builder.loadIfRef(loc, lhs);
+    mlir::Value rhsLoaded = builder.loadIfRef(loc, rhs);
+
+    mlir::Value result = ReductionProcessor::createScalarCombiner(
+        builder, loc, redId, ty, lhsLoaded, rhsLoaded);
+    if (isByRef) {
+      builder.create<fir::StoreOp>(loc, result, lhs);
+      builder.create<mlir::omp::YieldOp>(loc, lhs);
+    } else {
+      builder.create<mlir::omp::YieldOp>(loc, result);
+    }
+    return;
+  }
+  // all arrays should have been boxed
+  if (auto boxTy = mlir::dyn_cast<fir::BaseBoxType>(ty)) {
+    genBoxCombiner(builder, loc, redId, boxTy, lhs, rhs);
+    return;
+  }
+
+  TODO(loc, "OpenMP genCombiner for unsupported reduction variable type");
+}
+
+static mlir::Value
+createReductionInitRegion(fir::FirOpBuilder &builder, mlir::Location loc,
+                          const ReductionProcessor::ReductionIdentifier redId,
+                          mlir::Type type, bool isByRef) {
+  mlir::Type ty = fir::unwrapRefType(type);
+  mlir::Value initValue = ReductionProcessor::getReductionInitValue(
+      loc, fir::unwrapSeqOrBoxedSeqType(ty), redId, builder);
+
+  if (fir::isa_trivial(ty)) {
+    if (isByRef) {
+      mlir::Value alloca = builder.create<fir::AllocaOp>(loc, ty);
+      builder.createStoreWithConvert(loc, initValue, alloca);
+      return alloca;
+    }
+    // by val
+    return initValue;
+  }
+
+  // all arrays are boxed
+  if (auto boxTy = mlir::dyn_cast_or_null<fir::BaseBoxType>(ty)) {
+    assert(isByRef && "passing arrays by value is unsupported");
+    // TODO: support allocatable arrays: !fir.box<!fir.heap<!fir.array<...>>>
+    mlir::Type innerTy = fir::extractSequenceType(boxTy);
+    if (!mlir::isa<fir::SequenceType>(innerTy))
+      TODO(loc, "Unsupported boxed type for reduction");
+    // Create the private copy from the initial fir.box:
+    hlfir::Entity source = hlfir::Entity{builder.getBlock()->getArgument(0)};
+
+    // TODO: if the whole reduction is nested inside of a loop, this alloca
+    // could lead to a stack overflow (the memory is only freed at the end of
+    // the stack frame). The reduction declare operation needs a deallocation
+    // region to undo the init region.
+    hlfir::Entity temp = createStackTempFromMold(loc, builder, source);
+
+    // Put the temporary inside of a box:
+    hlfir::Entity box = hlfir::genVariableBox(loc, builder, temp);
+    builder.create<hlfir::AssignOp>(loc, initValue, box);
+    mlir::Value boxAlloca = builder.create<fir::AllocaOp>(loc, ty);
+    builder.create<fir::StoreOp>(loc, box, boxAlloca);
+    return boxAlloca;
+  }
+
+  TODO(loc, "createReductionInitRegion for unsupported type");
+}
+
 mlir::omp::ReductionDeclareOp ReductionProcessor::createReductionDecl(
     fir::FirOpBuilder &builder, llvm::StringRef reductionOpName,
     const ReductionIdentifier redId, mlir::Type type, mlir::Location loc,
@@ -290,6 +463,9 @@ mlir::omp::ReductionDeclareOp ReductionProcessor::createReductionDecl(
   mlir::OpBuilder::InsertionGuard guard(builder);
   mlir::ModuleOp module = builder.getModule();
 
+  if (reductionOpName.empty())
+    TODO(loc, "Reduction of some types is not supported");
+
   auto decl =
       module.lookupSymbol<mlir::omp::ReductionDeclareOp>(reductionOpName);
   if (decl)
@@ -306,14 +482,9 @@ mlir::omp::ReductionDeclareOp ReductionProcessor::createReductionDecl(
                       decl.getInitializerRegion().end(), {type}, {loc});
   builder.setInsertionPointToEnd(&decl.getInitializerRegion().back());
 
-  mlir::Value init = getReductionInitValue(loc, type, redId, builder);
-  if (isByRef) {
-    mlir::Value alloca = builder.create<fir::AllocaOp>(loc, valTy);
-    builder.createStoreWithConvert(loc, init, alloca);
-    builder.create<mlir::omp::YieldOp>(loc, alloca);
-  } else {
-    builder.create<mlir::omp::YieldOp>(loc, init);
-  }
+  mlir::Value init =
+      createReductionInitRegion(builder, loc, redId, type, isByRef);
+  builder.create<mlir::omp::YieldOp>(loc, init);
 
   builder.createBlock(&decl.getReductionRegion(),
                       decl.getReductionRegion().end(), {type, type},
@@ -322,19 +493,7 @@ mlir::omp::ReductionDeclareOp ReductionProcessor::createReductionDecl(
   builder.setInsertionPointToEnd(&decl.getReductionRegion().back());
   mlir::Value op1 = decl.getReductionRegion().front().getArgument(0);
   mlir::Value op2 = decl.getReductionRegion().front().getArgument(1);
-  mlir::Value outAddr = op1;
-
-  op1 = builder.loadIfRef(loc, op1);
-  op2 = builder.loadIfRef(loc, op2);
-
-  mlir::Value reductionOp =
-      createScalarCombiner(builder, loc, redId, type, op1, op2);
-  if (isByRef) {
-    builder.create<fir::StoreOp>(loc, reductionOp, outAddr);
-    builder.create<mlir::omp::YieldOp>(loc, outAddr);
-  } else {
-    builder.create<mlir::omp::YieldOp>(loc, reductionOp);
-  }
+  genCombiner(builder, loc, redId, type, op1, op2, isByRef);
 
   return decl;
 }
@@ -364,7 +523,7 @@ bool ReductionProcessor::doReductionByRef(
 void ReductionProcessor::addReductionDecl(
     mlir::Location currentLocation,
     Fortran::lower::AbstractConverter &converter,
-    const Fortran::parser::OmpReductionClause &reduction,
+    const omp::clause::Reduction &reduction,
     llvm::SmallVectorImpl<mlir::Value> &reductionVars,
     llvm::SmallVectorImpl<mlir::Attribute> &reductionDeclSymbols,
     llvm::SmallVectorImpl<const Fortran::semantics::Symbol *>
@@ -372,13 +531,12 @@ void ReductionProcessor::addReductionDecl(
   fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();
   mlir::omp::ReductionDeclareOp decl;
   const auto &redOperator{
-      std::get<Fortran::parser::OmpReductionOperator>(reduction.t)};
-  const auto &objectList{std::get<Fortran::parser::OmpObjectList>(reduction.t)};
+      std::get<omp::clause::ReductionOperator>(reduction.t)};
+  const auto &objectList{std::get<omp::ObjectList>(reduction.t)};
 
-  if (!std::holds_alternative<Fortran::parser::DefinedOperator>(
-          redOperator.u)) {
+  if (!std::holds_alternative<omp::clause::DefinedOperator>(redOperator.u)) {
     if (const auto *reductionIntrinsic =
-            std::get_if<Fortran::parser::ProcedureDesignator>(&redOperator.u)) {
+            std::get_if<omp::clause::ProcedureDesignator>(&redOperator.u)) {
       if (!ReductionProcessor::supportedIntrinsicProcReduction(
               *reductionIntrinsic)) {
         return;
@@ -388,27 +546,43 @@ void ReductionProcessor::addReductionDecl(
     }
   }
 
-  // initial pass to collect all recuction vars so we can figure out if this
+  // initial pass to collect all reduction vars so we can figure out if this
   // should happen byref
-  for (const Fortran::parser::OmpObject &ompObject : objectList.v) {
-    if (const auto *name{
-            Fortran::parser::Unwrap<Fortran::parser::Name>(ompObject)}) {
-      if (const Fortran::semantics::Symbol * symbol{name->symbol}) {
-        if (reductionSymbols)
-          reductionSymbols->push_back(symbol);
-        mlir::Value symVal = converter.getSymbolAddress(*symbol);
-        if (auto declOp = symVal.getDefiningOp<hlfir::DeclareOp>())
-          symVal = declOp.getBase();
-        reductionVars.push_back(symVal);
-      }
+  fir::FirOpBuilder &builder = converter.getFirOpBuilder();
+  for (const Object &object : objectList) {
+    const Fortran::semantics::Symbol *symbol = object.id();
+    if (reductionSymbols)
+      reductionSymbols->push_back(symbol);
+    mlir::Value symVal = converter.getSymbolAddress(*symbol);
+    auto redType = mlir::cast<fir::ReferenceType>(symVal.getType());
+
+    // all arrays must be boxed so that we have convenient access to all the
+    // information needed to iterate over the array
+    if (mlir::isa<fir::SequenceType>(redType.getEleTy())) {
+      hlfir::Entity entity{symVal};
+      entity = genVariableBox(currentLocation, builder, entity);
+      mlir::Value box = entity.getBase();
+
+      // Always pass the box by reference so that the OpenMP dialect
+      // verifiers don't need to know anything about fir.box
+      auto alloca =
+          builder.create<fir::AllocaOp>(currentLocation, box.getType());
+      builder.create<fir::StoreOp>(currentLocation, box, alloca);
+
+      symVal = alloca;
+      redType = mlir::cast<fir::ReferenceType>(symVal.getType());
+    } else if (auto declOp = symVal.getDefiningOp<hlfir::DeclareOp>()) {
+      symVal = declOp.getBase();
     }
+
+    reductionVars.push_back(symVal);
   }
   const bool isByRef = doReductionByRef(reductionVars);
 
   if (const auto &redDefinedOp =
-          std::get_if<Fortran::parser::DefinedOperator>(&redOperator.u)) {
+          std::get_if<omp::clause::DefinedOperator>(&redOperator.u)) {
     const auto &intrinsicOp{
-        std::get<Fortran::parser::DefinedOperator::IntrinsicOperator>(
+        std::get<omp::clause::DefinedOperator::IntrinsicOperator>(
             redDefinedOp->u)};
     ReductionIdentifier redId = getReductionType(intrinsicOp);
     switch (redId) {
@@ -424,73 +598,56 @@ void ReductionProcessor::addReductionDecl(
            "Reduction of some intrinsic operators is not supported");
       break;
     }
-    for (const Fortran::parser::OmpObject &ompObject : objectList.v) {
-      if (const auto *name{
-              Fortran::parser::Unwrap<Fortran::parser::Name>(ompObject)}) {
-        if (const Fortran::semantics::Symbol * symbol{name->symbol}) {
-          mlir::Value symVal = converter.getSymbolAddress(*symbol);
-          if (auto declOp = symVal.getDefiningOp<hlfir::DeclareOp>())
-            symVal = declOp.getBase();
-          auto redType = symVal.getType().cast<fir::ReferenceType>();
-          if (redType.getEleTy().isa<fir::LogicalType>())
-            decl = createReductionDecl(
-                firOpBuilder,
-                getReductionName(intrinsicOp, firOpBuilder.getI1Type(),
-                                 isByRef),
-                redId, redType, currentLocation, isByRef);
-          else if (redType.getEleTy().isIntOrIndexOrFloat()) {
-            decl = createReductionDecl(
-                firOpBuilder, getReductionName(intrinsicOp, redType, isByRef),
-                redId, redType, currentLocation, isByRef);
-          } else {
-            TODO(currentLocation, "Reduction of some types is not supported");
-          }
-          reductionDeclSymbols.push_back(mlir::SymbolRefAttr::get(
-              firOpBuilder.getContext(), decl.getSymName()));
-        }
-      }
+
+    for (mlir::Value symVal : reductionVars) {
+      auto redType = mlir::cast<fir::ReferenceType>(symVal.getType());
+      if (redType.getEleTy().isa<fir::LogicalType>())
+        decl = createReductionDecl(
+            firOpBuilder,
+            getReductionName(intrinsicOp, firOpBuilder.getI1Type(), isByRef),
+            redId, redType, currentLocation, isByRef);
+      else
+        decl = createReductionDecl(
+            firOpBuilder, getReductionName(intrinsicOp, redType, isByRef),
+            redId, redType, currentLocation, isByRef);
+      reductionDeclSymbols.push_back(mlir::SymbolRefAttr::get(
+          firOpBuilder.getContext(), decl.getSymName()));
     }
   } else if (const auto *reductionIntrinsic =
-                 std::get_if<Fortran::parser::ProcedureDesignator>(
+                 std::get_if<omp::clause::ProcedureDesignator>(
                      &redOperator.u)) {
     if (ReductionProcessor::supportedIntrinsicProcReduction(
             *reductionIntrinsic)) {
       ReductionProcessor::ReductionIdentifier redId =
           ReductionProcessor::getReductionType(*reductionIntrinsic);
-      for (const Fortran::parser::OmpObject &ompObject : objectList.v) {
-        if (const auto *name{
-                Fortran::parser::Unwrap<Fortran::parser::Name>(ompObject)}) {
-          if (const Fortran::semantics::Symbol * symbol{name->symbol}) {
-            mlir::Value symVal = converter.getSymbolAddress(*symbol);
-            if (auto declOp = symVal.getDefiningOp<hlfir::DeclareOp>())
-              symVal = declOp.getBase();
-            auto redType = symVal.getType().cast<fir::ReferenceType>();
-            assert(redType.getEleTy().isIntOrIndexOrFloat() &&
-                   "Unsupported reduction type");
-            decl = createReductionDecl(
-                firOpBuilder,
-                getReductionName(getRealName(*reductionIntrinsic).ToString(),
-                                 redType, isByRef),
-                redId, redType, currentLocation, isByRef);
-            reductionDeclSymbols.push_back(mlir::SymbolRefAttr::get(
-                firOpBuilder.getContext(), decl.getSymName()));
-          }
-        }
+      for (const Object &object : objectList) {
+        const Fortran::semantics::Symbol *symbol = object.id();
+        mlir::Value symVal = converter.getSymbolAddress(*symbol);
+        if (auto declOp = symVal.getDefiningOp<hlfir::DeclareOp>())
+          symVal = declOp.getBase();
+        auto redType = symVal.getType().cast<fir::ReferenceType>();
+        if (!redType.getEleTy().isIntOrIndexOrFloat())
+          TODO(currentLocation, "User Defined Reduction on non-trivial type");
+        decl = createReductionDecl(
+            firOpBuilder,
+            getReductionName(getRealName(*reductionIntrinsic).ToString(),
+                             redType, isByRef),
+            redId, redType, currentLocation, isByRef);
+        reductionDeclSymbols.push_back(mlir::SymbolRefAttr::get(
+            firOpBuilder.getContext(), decl.getSymName()));
       }
     }
   }
 }
 
 const Fortran::semantics::SourceName
-ReductionProcessor::getRealName(const Fortran::parser::Name *name) {
-  return name->symbol->GetUltimate().name();
+ReductionProcessor::getRealName(const Fortran::semantics::Symbol *symbol) {
+  return symbol->GetUltimate().name();
 }
 
-const Fortran::semantics::SourceName ReductionProcessor::getRealName(
-    const Fortran::parser::ProcedureDesignator &pd) {
-  const auto *name{Fortran::parser::Unwrap<Fortran::parser::Name>(pd)};
-  assert(name && "Invalid Reduction Intrinsic.");
-  return getRealName(name);
+const Fortran::semantics::SourceName
+ReductionProcessor::getRealName(const omp::clause::ProcedureDesignator &pd) {
+  return getRealName(pd.v.id());
 }
 
 int ReductionProcessor::getOperationIdentity(ReductionIdentifier redId,
diff --git a/flang/lib/Lower/OpenMP/ReductionProcessor.h b/flang/lib/Lower/OpenMP/ReductionProcessor.h
index 679580f2a3cac7..f481d4cbd1c51a 100644
--- a/flang/lib/Lower/OpenMP/ReductionProcessor.h
+++ b/flang/lib/Lower/OpenMP/ReductionProcessor.h
@@ -13,6 +13,7 @@
 #ifndef FORTRAN_LOWER_REDUCTIONPROCESSOR_H
 #define FORTRAN_LOWER_REDUCTIONPROCESSOR_H
 
+#include "Clauses.h"
 #include "flang/Optimizer/Builder/FIRBuilder.h"
 #include "flang/Optimizer/Dialect/FIRType.h"
 #include "flang/Parser/parse-tree.h"
@@ -58,19 +59,19 @@ class ReductionProcessor {
   };
 
   static ReductionIdentifier
-  getReductionType(const Fortran::parser::ProcedureDesignator &pd);
+  getReductionType(const omp::clause::ProcedureDesignator &pd);
 
-  static ReductionIdentifier getReductionType(
-      Fortran::parser::DefinedOperator::IntrinsicOperator intrinsicOp);
+  static ReductionIdentifier
+  getReductionType(omp::clause::DefinedOperator::IntrinsicOperator intrinsicOp);
 
-  static bool supportedIntrinsicProcReduction(
-      const Fortran::parser::ProcedureDesignator &pd);
+  static bool
+  supportedIntrinsicProcReduction(const omp::clause::ProcedureDesignator &pd);
 
   static const Fortran::semantics::SourceName
-  getRealName(const Fortran::parser::Name *name);
+  getRealName(const Fortran::semantics::Symbol *symbol);
 
   static const Fortran::semantics::SourceName
-  getRealName(const Fortran::parser::ProcedureDesignator &pd);
+  getRealName(const omp::clause::ProcedureDesignator &pd);
 
   static bool
   doReductionByRef(const llvm::SmallVectorImpl<mlir::Value> &reductionVars);
@@ -78,9 +79,9 @@ class ReductionProcessor {
   static std::string getReductionName(llvm::StringRef name, mlir::Type ty,
                                       bool isByRef);
 
-  static std::string getReductionName(
-      Fortran::parser::DefinedOperator::IntrinsicOperator intrinsicOp,
-      mlir::Type ty, bool isByRef);
+  static std::string
+  getReductionName(omp::clause::DefinedOperator::IntrinsicOperator intrinsicOp,
+                   mlir::Type ty, bool isByRef);
 
   /// This function returns the identity value of the operator \p
   /// reductionOpName. For example:
@@ -107,7 +108,7 @@ class ReductionProcessor {
   /// Creates an OpenMP reduction declaration and inserts it into the provided
   /// symbol table. The declaration has a constant initializer with the neutral
   /// value `initValue`, and the reduction combiner carried over from `reduce`.
-  /// TODO: Generalize this for non-integer types, add atomic region.
+  /// TODO: add atomic region.
   static mlir::omp::ReductionDeclareOp
   createReductionDecl(fir::FirOpBuilder &builder,
                       llvm::StringRef reductionOpName,
@@ -119,7 +120,7 @@ class ReductionProcessor {
   static void
   addReductionDecl(mlir::Location currentLocation,
                    Fortran::lower::AbstractConverter &converter,
-                   const Fortran::parser::OmpReductionClause &reduction,
+                   const omp::clause::Reduction &reduction,
                    llvm::SmallVectorImpl<mlir::Value> &reductionVars,
                    llvm::SmallVectorImpl<mlir::Attribute> &reductionDeclSymbols,
                    llvm::SmallVectorImpl<const Fortran::semantics::Symbol *>
diff --git a/flang/lib/Lower/OpenMP/Utils.cpp b/flang/lib/Lower/OpenMP/Utils.cpp
index 49517f62895dfe..fa4a51e3384837 100644
--- a/flang/lib/Lower/OpenMP/Utils.cpp
+++ b/flang/lib/Lower/OpenMP/Utils.cpp
@@ -11,6 +11,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "Utils.h"
+#include "Clauses.h"
 
 #include <flang/Lower/AbstractConverter.h>
 #include <flang/Lower/ConvertType.h>
@@ -34,19 +35,33 @@ namespace Fortran {
 namespace lower {
 namespace omp {
 
-void genObjectList(const Fortran::parser::OmpObjectList &objectList,
+void genObjectList(const ObjectList &objects,
                    Fortran::lower::AbstractConverter &converter,
                    llvm::SmallVectorImpl<mlir::Value> &operands) {
+  for (const Object &object : objects) {
+    const Fortran::semantics::Symbol *sym = object.id();
+    assert(sym && "Expected Symbol");
+    if (mlir::Value variable = converter.getSymbolAddress(*sym)) {
+      operands.push_back(variable);
+    } else if (const auto *details =
+                   sym->detailsIf<Fortran::semantics::HostAssocDetails>()) {
+      operands.push_back(converter.getSymbolAddress(details->symbol()));
+      converter.copySymbolBinding(details->symbol(), *sym);
+    }
+  }
+}
+
+void genObjectList2(const Fortran::parser::OmpObjectList &objectList,
+                    Fortran::lower::AbstractConverter &converter,
+                    llvm::SmallVectorImpl<mlir::Value> &operands) {
   auto addOperands = [&](Fortran::lower::SymbolRef sym) {
     const mlir::Value variable = converter.getSymbolAddress(sym);
     if (variable) {
       operands.push_back(variable);
-    } else {
-      if (const auto *details =
-              sym->detailsIf<Fortran::semantics::HostAssocDetails>()) {
-        operands.push_back(converter.getSymbolAddress(details->symbol()));
-        converter.copySymbolBinding(details->symbol(), sym);
-      }
+    } else if (const auto *details =
+                   sym->detailsIf<Fortran::semantics::HostAssocDetails>()) {
+      operands.push_back(converter.getSymbolAddress(details->symbol()));
+      converter.copySymbolBinding(details->symbol(), sym);
     }
   };
   for (const Fortran::parser::OmpObject &ompObject : objectList.v) {
@@ -56,24 +71,10 @@ void genObjectList(const Fortran::parser::OmpObjectList &objectList,
 }
 
 void gatherFuncAndVarSyms(
-    const Fortran::parser::OmpObjectList &objList,
-    mlir::omp::DeclareTargetCaptureClause clause,
+    const ObjectList &objects, mlir::omp::DeclareTargetCaptureClause clause,
     llvm::SmallVectorImpl<DeclareTargetCapturePair> &symbolAndClause) {
-  for (const Fortran::parser::OmpObject &ompObject : objList.v) {
-    Fortran::common::visit(
-        Fortran::common::visitors{
-            [&](const Fortran::parser::Designator &designator) {
-              if (const Fortran::parser::Name *name =
-                      Fortran::semantics::getDesignatorNameIfDataRef(
-                          designator)) {
-                symbolAndClause.emplace_back(clause, *name->symbol);
-              }
-            },
-            [&](const Fortran::parser::Name &name) {
-              symbolAndClause.emplace_back(clause, *name.symbol);
-            }},
-        ompObject.u);
-  }
+  for (const Object &object : objects)
+    symbolAndClause.emplace_back(clause, *object.id());
 }
 
 Fortran::semantics::Symbol *
diff --git a/flang/lib/Lower/OpenMP/Utils.h b/flang/lib/Lower/OpenMP/Utils.h
index 76a15e8bcaab9b..3ab0823a462148 100644
--- a/flang/lib/Lower/OpenMP/Utils.h
+++ b/flang/lib/Lower/OpenMP/Utils.h
@@ -9,6 +9,7 @@
 #ifndef FORTRAN_LOWER_OPENMPUTILS_H
 #define FORTRAN_LOWER_OPENMPUTILS_H
 
+#include "Clauses.h"
 #include "mlir/Dialect/OpenMP/OpenMPDialect.h"
 #include "mlir/IR/Location.h"
 #include "mlir/IR/Value.h"
@@ -45,23 +46,26 @@ using DeclareTargetCapturePair =
 mlir::omp::MapInfoOp
 createMapInfoOp(fir::FirOpBuilder &builder, mlir::Location loc,
                 mlir::Value baseAddr, mlir::Value varPtrPtr, std::string name,
-                mlir::SmallVector<mlir::Value> bounds,
-                mlir::SmallVector<mlir::Value> members, uint64_t mapType,
+                mlir::ArrayRef<mlir::Value> bounds,
+                mlir::ArrayRef<mlir::Value> members, uint64_t mapType,
                 mlir::omp::VariableCaptureKind mapCaptureType, mlir::Type retTy,
                 bool isVal = false);
 
 void gatherFuncAndVarSyms(
-    const Fortran::parser::OmpObjectList &objList,
-    mlir::omp::DeclareTargetCaptureClause clause,
+    const ObjectList &objects, mlir::omp::DeclareTargetCaptureClause clause,
     llvm::SmallVectorImpl<DeclareTargetCapturePair> &symbolAndClause);
 
 Fortran::semantics::Symbol *
 getOmpObjectSymbol(const Fortran::parser::OmpObject &ompObject);
 
-void genObjectList(const Fortran::parser::OmpObjectList &objectList,
+void genObjectList(const ObjectList &objects,
                    Fortran::lower::AbstractConverter &converter,
                    llvm::SmallVectorImpl<mlir::Value> &operands);
 
+void genObjectList2(const Fortran::parser::OmpObjectList &objectList,
+                    Fortran::lower::AbstractConverter &converter,
+                    llvm::SmallVectorImpl<mlir::Value> &operands);
+
 } // namespace omp
 } // namespace lower
 } // namespace Fortran
diff --git a/flang/lib/Optimizer/Builder/FIRBuilder.cpp b/flang/lib/Optimizer/Builder/FIRBuilder.cpp
index 12da7412888a3b..f7327a299d9a5e 100644
--- a/flang/lib/Optimizer/Builder/FIRBuilder.cpp
+++ b/flang/lib/Optimizer/Builder/FIRBuilder.cpp
@@ -208,6 +208,8 @@ mlir::Block *fir::FirOpBuilder::getAllocaBlock() {
               .getParentOfType<mlir::omp::OutlineableOpenMPOpInterface>()) {
     return ompOutlineableIface.getAllocaBlock();
   }
+  if (mlir::isa<mlir::omp::ReductionDeclareOp>(getRegion().getParentOp()))
+    return &getRegion().front();
   if (auto accRecipeIface =
           getRegion().getParentOfType<mlir::acc::RecipeInterface>()) {
     return accRecipeIface.getAllocaBlock(getRegion());
diff --git a/flang/lib/Optimizer/Builder/HLFIRTools.cpp b/flang/lib/Optimizer/Builder/HLFIRTools.cpp
index c7a550814e1d58..db638ceb40700b 100644
--- a/flang/lib/Optimizer/Builder/HLFIRTools.cpp
+++ b/flang/lib/Optimizer/Builder/HLFIRTools.cpp
@@ -1111,6 +1111,35 @@ hlfir::createTempFromMold(mlir::Location loc, fir::FirOpBuilder &builder,
   return {hlfir::Entity{declareOp.getBase()}, isHeapAlloc};
 }
 
+hlfir::Entity hlfir::createStackTempFromMold(mlir::Location loc,
+                                             fir::FirOpBuilder &builder,
+                                             hlfir::Entity mold) {
+  llvm::SmallVector<mlir::Value> lenParams;
+  hlfir::genLengthParameters(loc, builder, mold, lenParams);
+  llvm::StringRef tmpName{".tmp"};
+  mlir::Value alloc;
+  mlir::Value shape{};
+  fir::FortranVariableFlagsAttr declAttrs;
+
+  if (mold.isPolymorphic()) {
+    // genAllocatableApplyMold does heap allocation
+    TODO(loc, "createStackTempFromMold for polymorphic type");
+  } else if (mold.isArray()) {
+    mlir::Type sequenceType =
+        hlfir::getFortranElementOrSequenceType(mold.getType());
+    shape = hlfir::genShape(loc, builder, mold);
+    auto extents = hlfir::getIndexExtents(loc, builder, shape);
+    alloc =
+        builder.createTemporary(loc, sequenceType, tmpName, extents, lenParams);
+  } else {
+    alloc = builder.createTemporary(loc, mold.getFortranElementType(), tmpName,
+                                    /*shape=*/std::nullopt, lenParams);
+  }
+  auto declareOp = builder.create<hlfir::DeclareOp>(loc, alloc, tmpName, shape,
+                                                    lenParams, declAttrs);
+  return hlfir::Entity{declareOp.getBase()};
+}
+
 hlfir::EntityWithAttributes
 hlfir::convertCharacterKind(mlir::Location loc, fir::FirOpBuilder &builder,
                             hlfir::Entity scalarChar, int toKind) {
diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
index 94fcfa3503114e..eb8f5135ff12e0 100644
--- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
+++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
@@ -177,6 +177,8 @@ static constexpr IntrinsicHandler handlers[]{
      /*isElemental=*/false},
     {"c_funloc", &I::genCFunLoc, {{{"x", asBox}}}, /*isElemental=*/false},
     {"c_loc", &I::genCLoc, {{{"x", asBox}}}, /*isElemental=*/false},
+    {"c_ptr_eq", &I::genCPtrCompare<mlir::arith::CmpIPredicate::eq>},
+    {"c_ptr_ne", &I::genCPtrCompare<mlir::arith::CmpIPredicate::ne>},
     {"ceiling", &I::genCeiling},
     {"char", &I::genChar},
     {"cmplx",
@@ -2797,6 +2799,23 @@ IntrinsicLibrary::genCLoc(mlir::Type resultType,
   return genCLocOrCFunLoc(builder, loc, resultType, args);
 }
 
+// C_PTR_EQ and C_PTR_NE
+template <mlir::arith::CmpIPredicate pred>
+fir::ExtendedValue
+IntrinsicLibrary::genCPtrCompare(mlir::Type resultType,
+                                 llvm::ArrayRef<fir::ExtendedValue> args) {
+  assert(args.size() == 2);
+  mlir::Value cPtr1 = fir::getBase(args[0]);
+  mlir::Value cPtrVal1 =
+      fir::factory::genCPtrOrCFunptrValue(builder, loc, cPtr1);
+  mlir::Value cPtr2 = fir::getBase(args[1]);
+  mlir::Value cPtrVal2 =
+      fir::factory::genCPtrOrCFunptrValue(builder, loc, cPtr2);
+  mlir::Value cmp =
+      builder.create<mlir::arith::CmpIOp>(loc, pred, cPtrVal1, cPtrVal2);
+  return builder.createConvert(loc, resultType, cmp);
+}
+
 // CEILING
 mlir::Value IntrinsicLibrary::genCeiling(mlir::Type resultType,
                                          llvm::ArrayRef<mlir::Value> args) {
@@ -5240,6 +5259,13 @@ mlir::Value IntrinsicLibrary::genModulo(mlir::Type resultType,
                                                  remainder);
   }
 
+  // F128 arith::RemFOp may be lowered to a runtime call that may be unsupported
+  // on the target, so generate a call to Fortran Runtime's ModuloReal16.
+  if (resultType == mlir::FloatType::getF128(builder.getContext()))
+    return builder.createConvert(
+        loc, resultType,
+        fir::runtime::genModulo(builder, loc, args[0], args[1]));
+
   auto remainder = builder.create<mlir::arith::RemFOp>(loc, args[0], args[1]);
   mlir::Value zero = builder.createRealZeroConstant(loc, remainder.getType());
   auto remainderIsNotZero = builder.create<mlir::arith::CmpFOp>(
diff --git a/flang/lib/Optimizer/Builder/Runtime/Numeric.cpp b/flang/lib/Optimizer/Builder/Runtime/Numeric.cpp
index b958a30eb6e5b5..4dcbd13cb319f5 100644
--- a/flang/lib/Optimizer/Builder/Runtime/Numeric.cpp
+++ b/flang/lib/Optimizer/Builder/Runtime/Numeric.cpp
@@ -118,6 +118,20 @@ struct ForcedMod16 {
   }
 };
 
+/// Placeholder for real*16 version of Modulo Intrinsic
+struct ForcedModulo16 {
+  static constexpr const char *name = ExpandAndQuoteKey(RTNAME(ModuloReal16));
+  static constexpr fir::runtime::FuncTypeBuilderFunc getTypeModel() {
+    return [](mlir::MLIRContext *ctx) {
+      auto fltTy = mlir::FloatType::getF128(ctx);
+      auto strTy = fir::ReferenceType::get(mlir::IntegerType::get(ctx, 8));
+      auto intTy = mlir::IntegerType::get(ctx, 8 * sizeof(int));
+      return mlir::FunctionType::get(ctx, {fltTy, fltTy, strTy, intTy},
+                                     {fltTy});
+    };
+  }
+};
+
 /// Placeholder for real*10 version of Nearest Intrinsic
 struct ForcedNearest10 {
   static constexpr const char *name = ExpandAndQuoteKey(RTNAME(Nearest10));
@@ -323,6 +337,33 @@ mlir::Value fir::runtime::genMod(fir::FirOpBuilder &builder, mlir::Location loc,
   return builder.create<fir::CallOp>(loc, func, args).getResult(0);
 }
 
+/// Generate call to Modulo intrinsic runtime routine.
+mlir::Value fir::runtime::genModulo(fir::FirOpBuilder &builder,
+                                    mlir::Location loc, mlir::Value a,
+                                    mlir::Value p) {
+  mlir::func::FuncOp func;
+  mlir::Type fltTy = a.getType();
+
+  if (fltTy != p.getType())
+    fir::emitFatalError(loc, "arguments type mismatch in MOD");
+
+  // MODULO is lowered into math operations in intrinsics lowering,
+  // so genModulo() should only be used for F128 data type now.
+  if (fltTy.isF128())
+    func = fir::runtime::getRuntimeFunc<ForcedModulo16>(loc, builder);
+  else
+    fir::intrinsicTypeTODO(builder, fltTy, loc, "MODULO");
+
+  auto funcTy = func.getFunctionType();
+  auto sourceFile = fir::factory::locationToFilename(builder, loc);
+  auto sourceLine =
+      fir::factory::locationToLineNo(builder, loc, funcTy.getInput(3));
+  auto args = fir::runtime::createArguments(builder, loc, funcTy, a, p,
+                                            sourceFile, sourceLine);
+
+  return builder.create<fir::CallOp>(loc, func, args).getResult(0);
+}
+
 /// Generate call to Nearest intrinsic runtime routine.
 mlir::Value fir::runtime::genNearest(fir::FirOpBuilder &builder,
                                      mlir::Location loc, mlir::Value x,
diff --git a/flang/lib/Optimizer/CodeGen/CodeGen.cpp b/flang/lib/Optimizer/CodeGen/CodeGen.cpp
index f81a08388da722..e941a0226ca50f 100644
--- a/flang/lib/Optimizer/CodeGen/CodeGen.cpp
+++ b/flang/lib/Optimizer/CodeGen/CodeGen.cpp
@@ -37,6 +37,7 @@
 #include "mlir/Conversion/VectorToLLVM/ConvertVectorToLLVM.h"
 #include "mlir/Dialect/Arith/IR/Arith.h"
 #include "mlir/Dialect/DLTI/DLTI.h"
+#include "mlir/Dialect/LLVMIR/LLVMAttrs.h"
 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
 #include "mlir/Dialect/LLVMIR/Transforms/AddComdats.h"
 #include "mlir/Dialect/OpenACC/OpenACC.h"
@@ -49,7 +50,6 @@
 #include "mlir/Target/LLVMIR/ModuleTranslation.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/TypeSwitch.h"
-#include <mlir/Dialect/LLVMIR/LLVMAttrs.h>
 
 namespace fir {
 #define GEN_PASS_DEF_FIRTOLLVMLOWERING
@@ -410,8 +410,15 @@ class FIROpConversion : public mlir::ConvertOpToLLVMPattern<FromOp> {
       mlir::ConversionPatternRewriter &rewriter) const {
     auto thisPt = rewriter.saveInsertionPoint();
     mlir::Operation *parentOp = rewriter.getInsertionBlock()->getParentOp();
-    mlir::Block *insertBlock = getBlockForAllocaInsert(parentOp);
-    rewriter.setInsertionPointToStart(insertBlock);
+    if (mlir::isa<mlir::omp::ReductionDeclareOp>(parentOp)) {
+      // ReductionDeclareOp has multiple child regions. We want to get the first
+      // block of whichever of those regions we are currently in
+      mlir::Region *parentRegion = rewriter.getInsertionBlock()->getParent();
+      rewriter.setInsertionPointToStart(&parentRegion->front());
+    } else {
+      mlir::Block *insertBlock = getBlockForAllocaInsert(parentOp);
+      rewriter.setInsertionPointToStart(insertBlock);
+    }
     auto size = genI32Constant(loc, rewriter, 1);
     unsigned allocaAs = getAllocaAddressSpace(rewriter);
     unsigned programAs = getProgramAddressSpace(rewriter);
diff --git a/flang/lib/Optimizer/CodeGen/PreCGRewrite.cpp b/flang/lib/Optimizer/CodeGen/PreCGRewrite.cpp
index 0170b56367cf3c..665bf09b8fc33b 100644
--- a/flang/lib/Optimizer/CodeGen/PreCGRewrite.cpp
+++ b/flang/lib/Optimizer/CodeGen/PreCGRewrite.cpp
@@ -283,7 +283,9 @@ class DeclareOpConversion : public mlir::OpRewritePattern<fir::DeclareOp> {
 
 class CodeGenRewrite : public fir::impl::CodeGenRewriteBase<CodeGenRewrite> {
 public:
-  void runOn(mlir::Operation *op, mlir::Region &region) {
+  void runOnOperation() override final {
+    mlir::ModuleOp mod = getOperation();
+
     auto &context = getContext();
     mlir::ConversionTarget target(context);
     target.addLegalDialect<mlir::arith::ArithDialect, fir::FIROpsDialect,
@@ -298,10 +300,9 @@ class CodeGenRewrite : public fir::impl::CodeGenRewriteBase<CodeGenRewrite> {
                                        .isa<fir::SequenceType>());
     });
     mlir::RewritePatternSet patterns(&context);
-    patterns.insert<EmboxConversion, ArrayCoorConversion, ReboxConversion,
-                    DeclareOpConversion>(&context);
+    fir::populatePreCGRewritePatterns(patterns);
     if (mlir::failed(
-            mlir::applyPartialConversion(op, target, std::move(patterns)))) {
+            mlir::applyPartialConversion(mod, target, std::move(patterns)))) {
       mlir::emitError(mlir::UnknownLoc::get(&context),
                       "error in running the pre-codegen conversions");
       signalPassFailure();
@@ -309,16 +310,7 @@ class CodeGenRewrite : public fir::impl::CodeGenRewriteBase<CodeGenRewrite> {
     }
     // Erase any residual (fir.shape, fir.slice...).
     mlir::IRRewriter rewriter(&context);
-    (void)mlir::runRegionDCE(rewriter, op->getRegions());
-  }
-
-  void runOnOperation() override final {
-    // Call runOn on all top level regions that may contain emboxOp/arrayCoorOp.
-    auto mod = getOperation();
-    for (auto func : mod.getOps<mlir::func::FuncOp>())
-      runOn(func, func.getBody());
-    for (auto global : mod.getOps<fir::GlobalOp>())
-      runOn(global, global.getRegion());
+    (void)mlir::runRegionDCE(rewriter, mod->getRegions());
   }
 };
 
@@ -327,3 +319,8 @@ class CodeGenRewrite : public fir::impl::CodeGenRewriteBase<CodeGenRewrite> {
 std::unique_ptr<mlir::Pass> fir::createFirCodeGenRewritePass() {
   return std::make_unique<CodeGenRewrite>();
 }
+
+void fir::populatePreCGRewritePatterns(mlir::RewritePatternSet &patterns) {
+  patterns.insert<EmboxConversion, ArrayCoorConversion, ReboxConversion,
+                  DeclareOpConversion>(patterns.getContext());
+}
diff --git a/flang/lib/Optimizer/Dialect/FIRType.cpp b/flang/lib/Optimizer/Dialect/FIRType.cpp
index 8a2c681d958609..5c4cad6d208344 100644
--- a/flang/lib/Optimizer/Dialect/FIRType.cpp
+++ b/flang/lib/Optimizer/Dialect/FIRType.cpp
@@ -254,6 +254,18 @@ bool hasDynamicSize(mlir::Type t) {
   return false;
 }
 
+mlir::Type extractSequenceType(mlir::Type ty) {
+  if (mlir::isa<fir::SequenceType>(ty))
+    return ty;
+  if (auto boxTy = mlir::dyn_cast<fir::BaseBoxType>(ty))
+    return extractSequenceType(boxTy.getEleTy());
+  if (auto heapTy = mlir::dyn_cast<fir::HeapType>(ty))
+    return extractSequenceType(heapTy.getEleTy());
+  if (auto ptrTy = mlir::dyn_cast<fir::PointerType>(ty))
+    return extractSequenceType(ptrTy.getEleTy());
+  return mlir::Type{};
+}
+
 bool isPointerType(mlir::Type ty) {
   if (auto refTy = fir::dyn_cast_ptrEleTy(ty))
     ty = refTy;
diff --git a/flang/lib/Optimizer/Transforms/ControlFlowConverter.cpp b/flang/lib/Optimizer/Transforms/ControlFlowConverter.cpp
index 0944b184ca0d4e..45609a99995d0a 100644
--- a/flang/lib/Optimizer/Transforms/ControlFlowConverter.cpp
+++ b/flang/lib/Optimizer/Transforms/ControlFlowConverter.cpp
@@ -24,7 +24,8 @@
 #include "llvm/Support/CommandLine.h"
 
 namespace fir {
-#define GEN_PASS_DEF_CFGCONVERSION
+#define GEN_PASS_DEF_CFGCONVERSIONONFUNC
+#define GEN_PASS_DEF_CFGCONVERSIONONREDUCTION
 #include "flang/Optimizer/Transforms/Passes.h.inc"
 } // namespace fir
 
@@ -308,13 +309,13 @@ class CfgIterWhileConv : public mlir::OpRewritePattern<fir::IterWhileOp> {
 };
 
 /// Convert FIR structured control flow ops to CFG ops.
-class CfgConversion : public fir::impl::CFGConversionBase<CfgConversion> {
+template <typename Pass, template <typename> class PassBase>
+class CfgConversionTemplate : public PassBase<Pass> {
 public:
   void runOnOperation() override {
-    auto *context = &getContext();
+    auto *context = &this->getContext();
     mlir::RewritePatternSet patterns(context);
-    patterns.insert<CfgLoopConv, CfgIfConv, CfgIterWhileConv>(
-        context, forceLoopToExecuteOnce);
+    fir::populateCfgConversionRewrites(patterns, this->forceLoopToExecuteOnce);
     mlir::ConversionTarget target(*context);
     target.addLegalDialect<mlir::affine::AffineDialect,
                            mlir::cf::ControlFlowDialect, FIROpsDialect,
@@ -323,19 +324,38 @@ class CfgConversion : public fir::impl::CFGConversionBase<CfgConversion> {
     // apply the patterns
     target.addIllegalOp<ResultOp, DoLoopOp, IfOp, IterWhileOp>();
     target.markUnknownOpDynamicallyLegal([](Operation *) { return true; });
-    if (mlir::failed(mlir::applyPartialConversion(getOperation(), target,
+    if (mlir::failed(mlir::applyPartialConversion(this->getOperation(), target,
                                                   std::move(patterns)))) {
       mlir::emitError(mlir::UnknownLoc::get(context),
                       "error in converting to CFG\n");
-      signalPassFailure();
+      this->signalPassFailure();
     }
   }
 };
+
+class CfgConversionOnFunc
+    : public CfgConversionTemplate<CfgConversionOnFunc,
+                                   ::fir::impl::CFGConversionOnFuncBase> {};
+
+class CfgConversionOnReduction
+    : public CfgConversionTemplate<CfgConversionOnReduction,
+                                   ::fir::impl::CFGConversionOnReductionBase> {
+};
 } // namespace
 
+/// Expose conversion rewriters to other passes
+void fir::populateCfgConversionRewrites(mlir::RewritePatternSet &patterns,
+                                        bool forceLoopToExecuteOnce) {
+  patterns.insert<CfgLoopConv, CfgIfConv, CfgIterWhileConv>(
+      patterns.getContext(), forceLoopToExecuteOnce);
+}
+
 /// Convert FIR's structured control flow ops to CFG ops.  This
 /// conversion enables the `createLowerToCFGPass` to transform these to CFG
 /// form.
-std::unique_ptr<mlir::Pass> fir::createFirToCfgPass() {
-  return std::make_unique<CfgConversion>();
+std::unique_ptr<mlir::Pass> fir::createFirToCfgOnFuncPass() {
+  return std::make_unique<CfgConversionOnFunc>();
+}
+std::unique_ptr<mlir::Pass> fir::createFirToCfgOnReductionPass() {
+  return std::make_unique<CfgConversionOnReduction>();
 }
diff --git a/flang/lib/Parser/executable-parsers.cpp b/flang/lib/Parser/executable-parsers.cpp
index de2be017508c37..07a570bd61e990 100644
--- a/flang/lib/Parser/executable-parsers.cpp
+++ b/flang/lib/Parser/executable-parsers.cpp
@@ -542,19 +542,19 @@ TYPE_CONTEXT_PARSER("UNLOCK statement"_en_US,
 // CUF-kernel-do-directive ->
 //     !$CUF KERNEL DO [ (scalar-int-constant-expr) ] <<< grid, block [, stream]
 //     >>> do-construct
-// grid -> * | scalar-int-expr | ( scalar-int-expr-list )
-// block -> * | scalar-int-expr | ( scalar-int-expr-list )
+// star-or-expr -> * | scalar-int-expr
+// grid -> * | scalar-int-expr | ( star-or-expr-list )
+// block -> * | scalar-int-expr | ( star-or-expr-list )
 // stream -> ( 0, | STREAM = ) scalar-int-expr
+constexpr auto starOrExpr{construct<CUFKernelDoConstruct::StarOrExpr>(
+    "*" >> pure<std::optional<ScalarIntExpr>>() ||
+    applyFunction(presentOptional<ScalarIntExpr>, scalarIntExpr))};
+constexpr auto gridOrBlock{parenthesized(nonemptyList(starOrExpr)) ||
+    applyFunction(singletonList<CUFKernelDoConstruct::StarOrExpr>, starOrExpr)};
 TYPE_PARSER(sourced(beginDirective >> "$CUF KERNEL DO"_tok >>
     construct<CUFKernelDoConstruct::Directive>(
-        maybe(parenthesized(scalarIntConstantExpr)),
-        "<<<" >>
-            ("*" >> pure<std::list<ScalarIntExpr>>() ||
-                parenthesized(nonemptyList(scalarIntExpr)) ||
-                applyFunction(singletonList<ScalarIntExpr>, scalarIntExpr)),
-        "," >> ("*" >> pure<std::list<ScalarIntExpr>>() ||
-                   parenthesized(nonemptyList(scalarIntExpr)) ||
-                   applyFunction(singletonList<ScalarIntExpr>, scalarIntExpr)),
+        maybe(parenthesized(scalarIntConstantExpr)), "<<<" >> gridOrBlock,
+        "," >> gridOrBlock,
         maybe((", 0 ,"_tok || ", STREAM ="_tok) >> scalarIntExpr) / ">>>" /
             endDirective)))
 TYPE_CONTEXT_PARSER("!$CUF KERNEL DO construct"_en_US,
diff --git a/flang/lib/Parser/misc-parsers.h b/flang/lib/Parser/misc-parsers.h
index e9b52b7d0fcd0f..4a318e05bb4b8c 100644
--- a/flang/lib/Parser/misc-parsers.h
+++ b/flang/lib/Parser/misc-parsers.h
@@ -57,5 +57,10 @@ template <typename A> common::IfNoLvalue<std::list<A>, A> singletonList(A &&x) {
   result.emplace_back(std::move(x));
   return result;
 }
+
+template <typename A>
+common::IfNoLvalue<std::optional<A>, A> presentOptional(A &&x) {
+  return std::make_optional(std::move(x));
+}
 } // namespace Fortran::parser
 #endif
diff --git a/flang/lib/Parser/unparse.cpp b/flang/lib/Parser/unparse.cpp
index 600aa01999dab7..baba4863f5775f 100644
--- a/flang/lib/Parser/unparse.cpp
+++ b/flang/lib/Parser/unparse.cpp
@@ -2729,6 +2729,13 @@ class UnparseVisitor {
   WALK_NESTED_ENUM(OmpOrderModifier, Kind) // OMP order-modifier
 #undef WALK_NESTED_ENUM
 
+  void Unparse(const CUFKernelDoConstruct::StarOrExpr &x) {
+    if (x.v) {
+      Walk(*x.v);
+    } else {
+      Word("*");
+    }
+  }
   void Unparse(const CUFKernelDoConstruct::Directive &x) {
     Word("!$CUF KERNEL DO");
     Walk(" (", std::get<std::optional<ScalarIntConstantExpr>>(x.t), ")");
diff --git a/flang/lib/Semantics/check-declarations.cpp b/flang/lib/Semantics/check-declarations.cpp
index 431cef20793508..581371ff7a0031 100644
--- a/flang/lib/Semantics/check-declarations.cpp
+++ b/flang/lib/Semantics/check-declarations.cpp
@@ -632,6 +632,11 @@ void CheckHelper::CheckObjectEntity(
     const Symbol &symbol, const ObjectEntityDetails &details) {
   CheckSymbolType(symbol);
   CheckArraySpec(symbol, details.shape());
+  CheckConflicting(symbol, Attr::ALLOCATABLE, Attr::PARAMETER);
+  CheckConflicting(symbol, Attr::ASYNCHRONOUS, Attr::PARAMETER);
+  CheckConflicting(symbol, Attr::SAVE, Attr::PARAMETER);
+  CheckConflicting(symbol, Attr::TARGET, Attr::PARAMETER);
+  CheckConflicting(symbol, Attr::VOLATILE, Attr::PARAMETER);
   Check(details.shape());
   Check(details.coshape());
   if (details.shape().Rank() > common::maxRank) {
diff --git a/flang/lib/Semantics/check-do-forall.cpp b/flang/lib/Semantics/check-do-forall.cpp
index 4e8578d0e1daff..51f536f3d77231 100644
--- a/flang/lib/Semantics/check-do-forall.cpp
+++ b/flang/lib/Semantics/check-do-forall.cpp
@@ -220,8 +220,11 @@ class DoConcurrentBodyEnforce {
       if (MightDeallocatePolymorphic(*entity, DeallocateNonCoarray)) {
         SayDeallocateOfPolymorph(variable.GetSource(), *entity, reason);
       }
-      if (const Symbol * impure{HasImpureFinal(*entity)}) {
-        SayDeallocateWithImpureFinal(*entity, reason, *impure);
+      if (const auto *assignment{GetAssignment(stmt)}) {
+        const auto &lhs{assignment->lhs};
+        if (const Symbol * impure{HasImpureFinal(*entity, lhs.Rank())}) {
+          SayDeallocateWithImpureFinal(*entity, reason, *impure);
+        }
       }
     }
     if (const auto *assignment{GetAssignment(stmt)}) {
@@ -435,6 +438,18 @@ class DoContext {
       CheckForallIndexesUsed(*assignment);
       CheckForImpureCall(assignment->lhs);
       CheckForImpureCall(assignment->rhs);
+
+      if (IsVariable(assignment->lhs)) {
+        if (const Symbol * symbol{GetLastSymbol(assignment->lhs)}) {
+          if (auto impureFinal{
+                  HasImpureFinal(*symbol, assignment->lhs.Rank())}) {
+            context_.SayWithDecl(*symbol, parser::FindSourceLocation(stmt),
+                "Impure procedure '%s' is referenced by finalization in a %s"_err_en_US,
+                impureFinal->name(), LoopKindName());
+          }
+        }
+      }
+
       if (const auto *proc{
               std::get_if<evaluate::ProcedureRef>(&assignment->u)}) {
         CheckForImpureCall(*proc);
diff --git a/flang/lib/Semantics/data-to-inits.cpp b/flang/lib/Semantics/data-to-inits.cpp
index fa22d498679053..2ebc4e561a339a 100644
--- a/flang/lib/Semantics/data-to-inits.cpp
+++ b/flang/lib/Semantics/data-to-inits.cpp
@@ -118,9 +118,10 @@ class DataInitializationCompiler {
   bool Scan(const parser::DataIDoObject &);
 
   // Initializes all elements of a designator, which can be an array or section.
-  bool InitDesignator(const SomeExpr &);
+  bool InitDesignator(const SomeExpr &, const Scope &);
   // Initializes a single scalar object.
-  bool InitElement(const evaluate::OffsetSymbol &, const SomeExpr &designator);
+  bool InitElement(const evaluate::OffsetSymbol &, const SomeExpr &designator,
+      const Scope &);
   // If the returned flag is true, emit a warning about CHARACTER misusage.
   std::optional<std::pair<SomeExpr, bool>> ConvertElement(
       const SomeExpr &, const evaluate::DynamicType &);
@@ -128,7 +129,6 @@ class DataInitializationCompiler {
   DataInitializations &inits_;
   evaluate::ExpressionAnalyzer &exprAnalyzer_;
   ValueListIterator<DSV> values_;
-  const Scope *scope_{nullptr};
 };
 
 template <typename DSV>
@@ -149,8 +149,7 @@ bool DataInitializationCompiler<DSV>::Scan(const parser::Variable &var) {
   if (const auto *expr{GetExpr(exprAnalyzer_.context(), var)}) {
     parser::CharBlock at{var.GetSource()};
     exprAnalyzer_.GetFoldingContext().messages().SetLocation(at);
-    scope_ = &exprAnalyzer_.context().FindScope(at);
-    if (InitDesignator(*expr)) {
+    if (InitDesignator(*expr, exprAnalyzer_.context().FindScope(at))) {
       return true;
     }
   }
@@ -170,8 +169,7 @@ bool DataInitializationCompiler<DSV>::Scan(
   if (expr) {
     parser::CharBlock at{parser::FindSourceLocation(designator)};
     exprAnalyzer_.GetFoldingContext().messages().SetLocation(at);
-    scope_ = &exprAnalyzer_.context().FindScope(at);
-    if (InitDesignator(*expr)) {
+    if (InitDesignator(*expr, exprAnalyzer_.context().FindScope(at))) {
       return true;
     }
   }
@@ -254,12 +252,12 @@ template <typename DSV>
 bool DataInitializationCompiler<DSV>::Scan(const Symbol &symbol) {
   auto designator{exprAnalyzer_.Designate(evaluate::DataRef{symbol})};
   CHECK(designator.has_value());
-  return InitDesignator(*designator);
+  return InitDesignator(*designator, symbol.owner());
 }
 
 template <typename DSV>
 bool DataInitializationCompiler<DSV>::InitDesignator(
-    const SomeExpr &designator) {
+    const SomeExpr &designator, const Scope &scope) {
   evaluate::FoldingContext &context{exprAnalyzer_.GetFoldingContext()};
   evaluate::DesignatorFolder folder{context};
   while (auto offsetSymbol{folder.FoldDesignator(designator)}) {
@@ -274,7 +272,7 @@ bool DataInitializationCompiler<DSV>::InitDesignator(
             designator.AsFortran());
       }
       return false;
-    } else if (!InitElement(*offsetSymbol, designator)) {
+    } else if (!InitElement(*offsetSymbol, designator, scope)) {
       return false;
     } else {
       ++values_;
@@ -314,7 +312,8 @@ DataInitializationCompiler<DSV>::ConvertElement(
 
 template <typename DSV>
 bool DataInitializationCompiler<DSV>::InitElement(
-    const evaluate::OffsetSymbol &offsetSymbol, const SomeExpr &designator) {
+    const evaluate::OffsetSymbol &offsetSymbol, const SomeExpr &designator,
+    const Scope &scope) {
   const Symbol &symbol{offsetSymbol.symbol()};
   const Symbol *lastSymbol{GetLastSymbol(designator)};
   bool isPointer{lastSymbol && IsPointer(*lastSymbol)};
@@ -390,7 +389,7 @@ bool DataInitializationCompiler<DSV>::InitElement(
     } else if (isProcPointer) {
       if (evaluate::IsProcedure(*expr)) {
         if (CheckPointerAssignment(exprAnalyzer_.context(), designator, *expr,
-                DEREF(scope_),
+                scope,
                 /*isBoundsRemapping=*/false, /*isAssumedRank=*/false)) {
           if (lastSymbol->has<ProcEntityDetails>()) {
             GetImage().AddPointer(offsetSymbol.offset(), *expr);
@@ -413,7 +412,7 @@ bool DataInitializationCompiler<DSV>::InitElement(
           "Procedure '%s' may not be used to initialize '%s', which is not a procedure pointer"_err_en_US,
           expr->AsFortran(), DescribeElement());
     } else if (CheckInitialDataPointerTarget(
-                   exprAnalyzer_.context(), designator, *expr, DEREF(scope_))) {
+                   exprAnalyzer_.context(), designator, *expr, scope)) {
       GetImage().AddPointer(offsetSymbol.offset(), *expr);
       return true;
     }
diff --git a/flang/lib/Semantics/resolve-directives.cpp b/flang/lib/Semantics/resolve-directives.cpp
index 215a3c9060a241..6d58013b87d298 100644
--- a/flang/lib/Semantics/resolve-directives.cpp
+++ b/flang/lib/Semantics/resolve-directives.cpp
@@ -487,6 +487,9 @@ class OmpAttributeVisitor : DirectiveAttributeVisitor<llvm::omp::Directive> {
       const auto namePair{
           currScope().try_emplace(name->source, Attrs{}, ProcEntityDetails{})};
       auto &newSymbol{*namePair.first->second};
+      if (context_.intrinsics().IsIntrinsic(name->ToString())) {
+        newSymbol.attrs().set(Attr::INTRINSIC);
+      }
       name->symbol = &newSymbol;
     };
     if (const auto *procD{parser::Unwrap<parser::ProcedureDesignator>(opr.u)}) {
diff --git a/flang/lib/Semantics/resolve-names.cpp b/flang/lib/Semantics/resolve-names.cpp
index b13674573fe07e..f89323f3e54a62 100644
--- a/flang/lib/Semantics/resolve-names.cpp
+++ b/flang/lib/Semantics/resolve-names.cpp
@@ -4432,7 +4432,7 @@ Symbol &SubprogramVisitor::PushSubprogramScope(const parser::Name &name,
         CHECK(context().HasError(genericSymbol));
       }
     }
-    set_inheritFromParent(false);
+    set_inheritFromParent(hasModulePrefix);
   }
   if (Symbol * found{FindSymbol(name)};
       found && found->has<HostAssocDetails>()) {
diff --git a/flang/lib/Semantics/runtime-type-info.cpp b/flang/lib/Semantics/runtime-type-info.cpp
index 9845a190bc756c..15ea34c66dba52 100644
--- a/flang/lib/Semantics/runtime-type-info.cpp
+++ b/flang/lib/Semantics/runtime-type-info.cpp
@@ -1239,6 +1239,16 @@ void RuntimeTableBuilder::IncorporateDefinedIoGenericInterfaces(
 RuntimeDerivedTypeTables BuildRuntimeDerivedTypeTables(
     SemanticsContext &context) {
   RuntimeDerivedTypeTables result;
+  // Do not attempt to read __fortran_type_info.mod when compiling
+  // the module on which it depends.
+  const auto &allSources{context.allCookedSources().allSources()};
+  if (auto firstProv{allSources.GetFirstFileProvenance()}) {
+    if (const auto *srcFile{allSources.GetSourceFile(firstProv->start())}) {
+      if (srcFile->path().find("__fortran_builtins.f90") != std::string::npos) {
+        return result;
+      }
+    }
+  }
   result.schemata = context.GetBuiltinModule(typeInfoBuiltinModule);
   if (result.schemata) {
     RuntimeTableBuilder builder{context, result};
diff --git a/flang/lib/Semantics/tools.cpp b/flang/lib/Semantics/tools.cpp
index bf999b090419c6..0484baae93cd59 100644
--- a/flang/lib/Semantics/tools.cpp
+++ b/flang/lib/Semantics/tools.cpp
@@ -827,15 +827,18 @@ static const Symbol *HasImpureFinal(
   return IsFinalizable(derived, nullptr, /*withImpureFinalizer=*/true, rank);
 }
 
-const Symbol *HasImpureFinal(const Symbol &original) {
+const Symbol *HasImpureFinal(const Symbol &original, std::optional<int> rank) {
   const Symbol &symbol{ResolveAssociations(original)};
   if (symbol.has<ObjectEntityDetails>()) {
     if (const DeclTypeSpec * symType{symbol.GetType()}) {
       if (const DerivedTypeSpec * derived{symType->AsDerived()}) {
-        // finalizable assumed-rank not allowed (C839)
-        return evaluate::IsAssumedRank(symbol)
-            ? nullptr
-            : HasImpureFinal(*derived, symbol.Rank());
+        if (evaluate::IsAssumedRank(symbol)) {
+          // finalizable assumed-rank not allowed (C839)
+          return nullptr;
+        } else {
+          int actualRank{rank.value_or(symbol.Rank())};
+          return HasImpureFinal(*derived, actualRank);
+        }
       }
     }
   }
diff --git a/flang/module/__fortran_builtins.f90 b/flang/module/__fortran_builtins.f90
index 62b20103221ade..3d3dbef6d018aa 100644
--- a/flang/module/__fortran_builtins.f90
+++ b/flang/module/__fortran_builtins.f90
@@ -110,7 +110,7 @@
   public :: operator(==)
 
   interface operator(/=)
-    module procedure __builtin_c_ptr_eq
+    module procedure __builtin_c_ptr_ne
   end interface
   public :: operator(/=)
 
diff --git a/flang/runtime/connection.h b/flang/runtime/connection.h
index c9a7566f209881..c41970d47e7b09 100644
--- a/flang/runtime/connection.h
+++ b/flang/runtime/connection.h
@@ -12,8 +12,8 @@
 #define FORTRAN_RUNTIME_IO_CONNECTION_H_
 
 #include "format.h"
+#include "flang/Common/optional.h"
 #include <cinttypes>
-#include <optional>
 
 namespace Fortran::runtime::io {
 
@@ -26,10 +26,10 @@ enum class Access { Sequential, Direct, Stream };
 // established in an OPEN statement.
 struct ConnectionAttributes {
   Access access{Access::Sequential}; // ACCESS='SEQUENTIAL', 'DIRECT', 'STREAM'
-  std::optional<bool> isUnformatted; // FORM='UNFORMATTED' if true
+  Fortran::common::optional<bool> isUnformatted; // FORM='UNFORMATTED' if true
   bool isUTF8{false}; // ENCODING='UTF-8'
   unsigned char internalIoCharKind{0}; // 0->external, 1/2/4->internal
-  std::optional<std::int64_t> openRecl; // RECL= on OPEN
+  Fortran::common::optional<std::int64_t> openRecl; // RECL= on OPEN
 
   bool IsRecordFile() const {
     // Formatted stream files are viewed as having records, at least on input
@@ -63,14 +63,14 @@ struct ConnectionState : public ConnectionAttributes {
     unterminatedRecord = false;
   }
 
-  std::optional<std::int64_t> EffectiveRecordLength() const {
+  Fortran::common::optional<std::int64_t> EffectiveRecordLength() const {
     // When an input record is longer than an explicit RECL= from OPEN
     // it is effectively truncated on input.
     return openRecl && recordLength && *openRecl < *recordLength ? openRecl
                                                                  : recordLength;
   }
 
-  std::optional<std::int64_t> recordLength;
+  Fortran::common::optional<std::int64_t> recordLength;
 
   std::int64_t currentRecordNumber{1}; // 1 is first
 
@@ -86,11 +86,12 @@ struct ConnectionState : public ConnectionAttributes {
   std::int64_t furthestPositionInRecord{0}; // max(position+bytes)
 
   // Set at end of non-advancing I/O data transfer
-  std::optional<std::int64_t> leftTabLimit; // offset in current record
+  Fortran::common::optional<std::int64_t>
+      leftTabLimit; // offset in current record
 
   // currentRecordNumber value captured after ENDFILE/REWIND/BACKSPACE statement
   // or an end-of-file READ condition on a sequential access file
-  std::optional<std::int64_t> endfileRecordNumber;
+  Fortran::common::optional<std::int64_t> endfileRecordNumber;
 
   // Mutable modes set at OPEN() that can be overridden in READ/WRITE & FORMAT
   MutableModes modes; // BLANK=, DECIMAL=, SIGN=, ROUND=, PAD=, DELIM=, kP
diff --git a/flang/runtime/derived-api.cpp b/flang/runtime/derived-api.cpp
index 321f50a1edfcfe..eca784be208d10 100644
--- a/flang/runtime/derived-api.cpp
+++ b/flang/runtime/derived-api.cpp
@@ -95,7 +95,7 @@ inline RT_API_ATTRS bool CompareDerivedType(
   return a == b || CompareDerivedTypeNames(a->name(), b->name());
 }
 
-static const RT_API_ATTRS typeInfo::DerivedType *GetDerivedType(
+static RT_API_ATTRS const typeInfo::DerivedType *GetDerivedType(
     const Descriptor &desc) {
   if (const DescriptorAddendum * addendum{desc.Addendum()}) {
     if (const auto *derived{addendum->derivedType()}) {
diff --git a/flang/runtime/descriptor-io.cpp b/flang/runtime/descriptor-io.cpp
index 6041104773cc49..7c7323b719adf8 100644
--- a/flang/runtime/descriptor-io.cpp
+++ b/flang/runtime/descriptor-io.cpp
@@ -12,11 +12,12 @@
 namespace Fortran::runtime::io::descr {
 
 // Defined formatted I/O (maybe)
-std::optional<bool> DefinedFormattedIo(IoStatementState &io,
+Fortran::common::optional<bool> DefinedFormattedIo(IoStatementState &io,
     const Descriptor &descriptor, const typeInfo::DerivedType &derived,
     const typeInfo::SpecialBinding &special,
     const SubscriptValue subscripts[]) {
-  std::optional<DataEdit> peek{io.GetNextDataEdit(0 /*to peek at it*/)};
+  Fortran::common::optional<DataEdit> peek{
+      io.GetNextDataEdit(0 /*to peek at it*/)};
   if (peek &&
       (peek->descriptor == DataEdit::DefinedDerivedType ||
           peek->descriptor == DataEdit::ListDirected)) {
@@ -55,7 +56,7 @@ std::optional<bool> DefinedFormattedIo(IoStatementState &io,
     int unit{external->unitNumber()};
     int ioStat{IostatOk};
     char ioMsg[100];
-    std::optional<std::int64_t> startPos;
+    Fortran::common::optional<std::int64_t> startPos;
     if (edit.descriptor == DataEdit::DefinedDerivedType &&
         special.which() == typeInfo::SpecialBinding::Which::ReadFormatted) {
       // DT is an edit descriptor so everything that the child
@@ -96,7 +97,7 @@ std::optional<bool> DefinedFormattedIo(IoStatementState &io,
     // There's a defined I/O subroutine, but there's a FORMAT present and
     // it does not have a DT data edit descriptor, so apply default formatting
     // to the components of the derived type as usual.
-    return std::nullopt;
+    return Fortran::common::nullopt;
   }
 }
 
diff --git a/flang/runtime/descriptor-io.h b/flang/runtime/descriptor-io.h
index 394578796faa79..b6b0fefcff870b 100644
--- a/flang/runtime/descriptor-io.h
+++ b/flang/runtime/descriptor-io.h
@@ -21,6 +21,7 @@
 #include "terminator.h"
 #include "type-info.h"
 #include "unit.h"
+#include "flang/Common/optional.h"
 #include "flang/Common/uint128.h"
 #include "flang/Runtime/cpp-type.h"
 #include "flang/Runtime/descriptor.h"
@@ -321,9 +322,9 @@ static bool DefaultComponentwiseUnformattedIO(IoStatementState &io,
   return true;
 }
 
-std::optional<bool> DefinedFormattedIo(IoStatementState &, const Descriptor &,
-    const typeInfo::DerivedType &, const typeInfo::SpecialBinding &,
-    const SubscriptValue[]);
+Fortran::common::optional<bool> DefinedFormattedIo(IoStatementState &,
+    const Descriptor &, const typeInfo::DerivedType &,
+    const typeInfo::SpecialBinding &, const SubscriptValue[]);
 
 template <Direction DIR>
 static bool FormattedDerivedTypeIO(IoStatementState &io,
@@ -334,7 +335,7 @@ static bool FormattedDerivedTypeIO(IoStatementState &io,
   RUNTIME_CHECK(handler, addendum != nullptr);
   const typeInfo::DerivedType *type{addendum->derivedType()};
   RUNTIME_CHECK(handler, type != nullptr);
-  std::optional<typeInfo::SpecialBinding> nonTbpSpecial;
+  Fortran::common::optional<typeInfo::SpecialBinding> nonTbpSpecial;
   const typeInfo::SpecialBinding *special{nullptr};
   if (table) {
     if (const auto *definedIo{table->Find(*type,
@@ -365,7 +366,7 @@ static bool FormattedDerivedTypeIO(IoStatementState &io,
   std::size_t numElements{descriptor.Elements()};
   for (std::size_t j{0}; j < numElements;
        ++j, descriptor.IncrementSubscripts(subscripts)) {
-    std::optional<bool> result;
+    Fortran::common::optional<bool> result;
     if (special) {
       result = DefinedFormattedIo(io, descriptor, *type, *special, subscripts);
     }
@@ -406,7 +407,7 @@ static bool UnformattedDescriptorIO(IoStatementState &io,
                   : typeInfo::SpecialBinding::Which::WriteUnformatted,
               definedIo->subroutine, definedIo->isDtvArgPolymorphic, false,
               false};
-          if (std::optional<bool> wasDefined{
+          if (Fortran::common::optional<bool> wasDefined{
                   DefinedUnformattedIo(io, descriptor, *type, special)}) {
             return *wasDefined;
           }
diff --git a/flang/runtime/edit-input.cpp b/flang/runtime/edit-input.cpp
index 85cce2f1a16623..fbeb1a595b327e 100644
--- a/flang/runtime/edit-input.cpp
+++ b/flang/runtime/edit-input.cpp
@@ -9,6 +9,7 @@
 #include "edit-input.h"
 #include "namelist.h"
 #include "utf.h"
+#include "flang/Common/optional.h"
 #include "flang/Common/real.h"
 #include "flang/Common/uint128.h"
 #include <algorithm>
@@ -54,9 +55,9 @@ template <int LOG2_BASE>
 static bool EditBOZInput(
     IoStatementState &io, const DataEdit &edit, void *n, std::size_t bytes) {
   // Skip leading white space & zeroes
-  std::optional<int> remaining{io.CueUpInput(edit)};
+  Fortran::common::optional<int> remaining{io.CueUpInput(edit)};
   auto start{io.GetConnectionState().positionInRecord};
-  std::optional<char32_t> next{io.NextInField(remaining, edit)};
+  Fortran::common::optional<char32_t> next{io.NextInField(remaining, edit)};
   if (next.value_or('?') == '0') {
     do {
       start = io.GetConnectionState().positionInRecord;
@@ -156,7 +157,8 @@ static inline char32_t GetRadixPointChar(const DataEdit &edit) {
 
 // Prepares input from a field, and returns the sign, if any, else '\0'.
 static char ScanNumericPrefix(IoStatementState &io, const DataEdit &edit,
-    std::optional<char32_t> &next, std::optional<int> &remaining) {
+    Fortran::common::optional<char32_t> &next,
+    Fortran::common::optional<int> &remaining) {
   remaining = io.CueUpInput(edit);
   next = io.NextInField(remaining, edit);
   char sign{'\0'};
@@ -198,8 +200,8 @@ bool EditIntegerInput(
         edit.descriptor);
     return false;
   }
-  std::optional<int> remaining;
-  std::optional<char32_t> next;
+  Fortran::common::optional<int> remaining;
+  Fortran::common::optional<char32_t> next;
   char sign{ScanNumericPrefix(io, edit, next, remaining)};
   common::UnsignedInt128 value{0};
   bool any{!!sign};
@@ -279,10 +281,10 @@ struct ScannedRealInput {
 };
 static ScannedRealInput ScanRealInput(
     char *buffer, int bufferSize, IoStatementState &io, const DataEdit &edit) {
-  std::optional<int> remaining;
-  std::optional<char32_t> next;
+  Fortran::common::optional<int> remaining;
+  Fortran::common::optional<char32_t> next;
   int got{0};
-  std::optional<int> radixPointOffset;
+  Fortran::common::optional<int> radixPointOffset;
   auto Put{[&](char ch) -> void {
     if (got < bufferSize) {
       buffer[got] = ch;
@@ -612,11 +614,23 @@ decimal::ConversionToBinaryResult<binaryPrecision> ConvertHexadecimal(
     } else {
       break;
     }
-    while (fraction >> binaryPrecision) {
-      guardBit |= roundingBit;
-      roundingBit = (int)fraction & 1;
-      fraction >>= 1;
-      ++expo;
+    if (fraction >> binaryPrecision) {
+      while (fraction >> binaryPrecision) {
+        guardBit |= roundingBit;
+        roundingBit = (int)fraction & 1;
+        fraction >>= 1;
+        ++expo;
+      }
+      // Consume excess digits
+      while (*++p) {
+        if (*p == '0') {
+        } else if ((*p >= '1' && *p <= '9') || (*p >= 'A' && *p <= 'F')) {
+          guardBit = 1;
+        } else {
+          break;
+        }
+      }
+      break;
     }
   }
   if (fraction) {
@@ -631,31 +645,32 @@ decimal::ConversionToBinaryResult<binaryPrecision> ConvertHexadecimal(
     while (expo > 1 && !(fraction >> (binaryPrecision - 1))) {
       fraction <<= 1;
       --expo;
+      guardBit = roundingBit = 0;
     }
-    // Rounding
-    bool increase{false};
-    switch (rounding) {
-    case decimal::RoundNearest: // RN & RP
-      increase = roundingBit && (guardBit | ((int)fraction & 1));
-      break;
-    case decimal::RoundUp: // RU
-      increase = !isNegative && (roundingBit | guardBit);
-      break;
-    case decimal::RoundDown: // RD
-      increase = isNegative && (roundingBit | guardBit);
-      break;
-    case decimal::RoundToZero: // RZ
-      break;
-    case decimal::RoundCompatible: // RC
-      increase = roundingBit != 0;
-      break;
-    }
-    if (increase) {
-      ++fraction;
-      if (fraction >> binaryPrecision) {
-        fraction >>= 1;
-        ++expo;
-      }
+  }
+  // Rounding
+  bool increase{false};
+  switch (rounding) {
+  case decimal::RoundNearest: // RN & RP
+    increase = roundingBit && (guardBit | ((int)fraction & 1));
+    break;
+  case decimal::RoundUp: // RU
+    increase = !isNegative && (roundingBit | guardBit);
+    break;
+  case decimal::RoundDown: // RD
+    increase = isNegative && (roundingBit | guardBit);
+    break;
+  case decimal::RoundToZero: // RZ
+    break;
+  case decimal::RoundCompatible: // RC
+    increase = roundingBit != 0;
+    break;
+  }
+  if (increase) {
+    ++fraction;
+    if (fraction >> binaryPrecision) {
+      fraction >>= 1;
+      ++expo;
     }
   }
   // Package & return result
@@ -833,8 +848,8 @@ bool EditLogicalInput(IoStatementState &io, const DataEdit &edit, bool &x) {
         edit.descriptor);
     return false;
   }
-  std::optional<int> remaining{io.CueUpInput(edit)};
-  std::optional<char32_t> next{io.NextInField(remaining, edit)};
+  Fortran::common::optional<int> remaining{io.CueUpInput(edit)};
+  Fortran::common::optional<char32_t> next{io.NextInField(remaining, edit)};
   if (next && *next == '.') { // skip optional period
     next = io.NextInField(remaining, edit);
   }
@@ -916,8 +931,9 @@ static bool EditListDirectedCharacterInput(
   // or the end of the current record.  Subtlety: the "remaining" count
   // here is a dummy that's used to avoid the interpretation of separators
   // in NextInField.
-  std::optional<int> remaining{length > 0 ? maxUTF8Bytes : 0};
-  while (std::optional<char32_t> next{io.NextInField(remaining, edit)}) {
+  Fortran::common::optional<int> remaining{length > 0 ? maxUTF8Bytes : 0};
+  while (Fortran::common::optional<char32_t> next{
+      io.NextInField(remaining, edit)}) {
     bool isSep{false};
     switch (*next) {
     case ' ':
diff --git a/flang/runtime/edit-output.cpp b/flang/runtime/edit-output.cpp
index b474c8cd91bae4..7267540370fc07 100644
--- a/flang/runtime/edit-output.cpp
+++ b/flang/runtime/edit-output.cpp
@@ -9,6 +9,7 @@
 #include "edit-output.h"
 #include "emit-encoded.h"
 #include "utf.h"
+#include "flang/Common/real.h"
 #include "flang/Common/uint128.h"
 #include <algorithm>
 
@@ -700,7 +701,9 @@ bool RealOutputEditing<KIND>::EditEXOutput(const DataEdit &edit) {
   if ((editWidth == 0 && !edit.digits) || editDigits == 0) {
     // EX0 or EXw.0
     flags |= decimal::Minimize;
-    significantDigits = 28; // enough for 128-bit F.P.
+    static constexpr int maxSigHexDigits{
+        (common::PrecisionOfRealKind(16) + 3) / 4};
+    significantDigits = maxSigHexDigits;
   }
   auto converted{
       ConvertToHexadecimal(significantDigits, edit.modes.round, flags)};
diff --git a/flang/runtime/environment.cpp b/flang/runtime/environment.cpp
index fe6701d72c9ff5..b74067a377774b 100644
--- a/flang/runtime/environment.cpp
+++ b/flang/runtime/environment.cpp
@@ -49,7 +49,8 @@ static void SetEnvironmentDefaults(const EnvironmentDefaultList *envDefaults) {
   }
 }
 
-std::optional<Convert> GetConvertFromString(const char *x, std::size_t n) {
+Fortran::common::optional<Convert> GetConvertFromString(
+    const char *x, std::size_t n) {
   static const char *keywords[]{
       "UNKNOWN", "NATIVE", "LITTLE_ENDIAN", "BIG_ENDIAN", "SWAP", nullptr};
   switch (IdentifyValue(x, n, keywords)) {
@@ -64,7 +65,7 @@ std::optional<Convert> GetConvertFromString(const char *x, std::size_t n) {
   case 4:
     return Convert::Swap;
   default:
-    return std::nullopt;
+    return Fortran::common::nullopt;
   }
 }
 
diff --git a/flang/runtime/environment.h b/flang/runtime/environment.h
index 49c7dbd2940f8c..9bc1158509615f 100644
--- a/flang/runtime/environment.h
+++ b/flang/runtime/environment.h
@@ -9,9 +9,8 @@
 #ifndef FORTRAN_RUNTIME_ENVIRONMENT_H_
 #define FORTRAN_RUNTIME_ENVIRONMENT_H_
 
+#include "flang/Common/optional.h"
 #include "flang/Decimal/decimal.h"
-#include "flang/Runtime/api-attrs.h"
-#include <optional>
 
 struct EnvironmentDefaultList;
 
@@ -30,7 +29,8 @@ constexpr bool isHostLittleEndian{true};
 // External unformatted I/O data conversions
 enum class Convert { Unknown, Native, LittleEndian, BigEndian, Swap };
 
-std::optional<Convert> GetConvertFromString(const char *, std::size_t);
+Fortran::common::optional<Convert> GetConvertFromString(
+    const char *, std::size_t);
 
 struct ExecutionEnvironment {
 #if !defined(_OPENMP)
diff --git a/flang/runtime/extrema.cpp b/flang/runtime/extrema.cpp
index 61afb0458430db..d6e9633372f524 100644
--- a/flang/runtime/extrema.cpp
+++ b/flang/runtime/extrema.cpp
@@ -18,7 +18,6 @@
 #include <cfloat>
 #include <cinttypes>
 #include <cmath>
-#include <optional>
 #include <type_traits>
 
 namespace Fortran::runtime {
diff --git a/flang/runtime/file.cpp b/flang/runtime/file.cpp
index 5cf91b8d64c863..6ca5776f812a04 100644
--- a/flang/runtime/file.cpp
+++ b/flang/runtime/file.cpp
@@ -60,7 +60,7 @@ static int openfile_mkstemp(IoErrorHandler &handler) {
   return fd;
 }
 
-void OpenFile::Open(OpenStatus status, std::optional<Action> action,
+void OpenFile::Open(OpenStatus status, Fortran::common::optional<Action> action,
     Position position, IoErrorHandler &handler) {
   if (fd_ >= 0 &&
       (status == OpenStatus::Old || status == OpenStatus::Unknown)) {
@@ -322,7 +322,7 @@ int OpenFile::WriteAsynchronously(FileOffset at, const char *buffer,
 }
 
 void OpenFile::Wait(int id, IoErrorHandler &handler) {
-  std::optional<int> ioStat;
+  Fortran::common::optional<int> ioStat;
   Pending *prev{nullptr};
   for (Pending *p{pending_.get()}; p; p = (prev = p)->next.get()) {
     if (p->id == id) {
diff --git a/flang/runtime/file.h b/flang/runtime/file.h
index a0712136a7f3df..17deeb0e2f8270 100644
--- a/flang/runtime/file.h
+++ b/flang/runtime/file.h
@@ -12,9 +12,9 @@
 #define FORTRAN_RUNTIME_FILE_H_
 
 #include "io-error.h"
+#include "flang/Common/optional.h"
 #include "flang/Runtime/memory.h"
 #include <cinttypes>
-#include <optional>
 
 namespace Fortran::runtime::io {
 
@@ -37,10 +37,11 @@ class OpenFile {
   void set_mayAsynchronous(bool yes) { mayAsynchronous_ = yes; }
   bool isTerminal() const { return isTerminal_; }
   bool isWindowsTextFile() const { return isWindowsTextFile_; }
-  std::optional<FileOffset> knownSize() const { return knownSize_; }
+  Fortran::common::optional<FileOffset> knownSize() const { return knownSize_; }
 
   bool IsConnected() const { return fd_ >= 0; }
-  void Open(OpenStatus, std::optional<Action>, Position, IoErrorHandler &);
+  void Open(OpenStatus, Fortran::common::optional<Action>, Position,
+      IoErrorHandler &);
   void Predefine(int fd);
   void Close(CloseStatus, IoErrorHandler &);
 
@@ -94,9 +95,10 @@ class OpenFile {
   bool mayWrite_{false};
   bool mayPosition_{false};
   bool mayAsynchronous_{false};
-  std::optional<Position> openPosition_; // from Open(); reset after positioning
+  Fortran::common::optional<Position>
+      openPosition_; // from Open(); reset after positioning
   FileOffset position_{0};
-  std::optional<FileOffset> knownSize_;
+  Fortran::common::optional<FileOffset> knownSize_;
   bool isTerminal_{false};
   bool isWindowsTextFile_{false}; // expands LF to CR+LF on write
 
diff --git a/flang/runtime/format-implementation.h b/flang/runtime/format-implementation.h
index f554f740573f18..b84e3208271b75 100644
--- a/flang/runtime/format-implementation.h
+++ b/flang/runtime/format-implementation.h
@@ -233,7 +233,7 @@ int FormatControl<CONTEXT>::CueUpNextDataEdit(Context &context, bool stop) {
     }
   }
   while (true) {
-    std::optional<int> repeat;
+    Fortran::common::optional<int> repeat;
     bool unlimited{false};
     auto maybeReversionPoint{offset_};
     CharType ch{GetNextChar(context)};
@@ -419,7 +419,7 @@ int FormatControl<CONTEXT>::CueUpNextDataEdit(Context &context, bool stop) {
 
 // Returns the next data edit descriptor
 template <typename CONTEXT>
-std::optional<DataEdit> FormatControl<CONTEXT>::GetNextDataEdit(
+Fortran::common::optional<DataEdit> FormatControl<CONTEXT>::GetNextDataEdit(
     Context &context, int maxRepeat) {
   int repeat{CueUpNextDataEdit(context)};
   auto start{offset_};
@@ -451,7 +451,7 @@ std::optional<DataEdit> FormatControl<CONTEXT>::GetNextDataEdit(
         }
         if (edit.ioTypeChars >= edit.maxIoTypeChars) {
           ReportBadFormat(context, "Excessive DT'iotype' in FORMAT", start);
-          return std::nullopt;
+          return Fortran::common::nullopt;
         }
         edit.ioType[edit.ioTypeChars++] = ch;
         if (ch == quote) {
@@ -460,7 +460,7 @@ std::optional<DataEdit> FormatControl<CONTEXT>::GetNextDataEdit(
       }
       if (!ok) {
         ReportBadFormat(context, "Unclosed DT'iotype' in FORMAT", start);
-        return std::nullopt;
+        return Fortran::common::nullopt;
       }
     }
     if (PeekNext() == '(') {
@@ -475,7 +475,7 @@ std::optional<DataEdit> FormatControl<CONTEXT>::GetNextDataEdit(
         }
         if (edit.vListEntries >= edit.maxVListEntries) {
           ReportBadFormat(context, "Excessive DT(v_list) in FORMAT", start);
-          return std::nullopt;
+          return Fortran::common::nullopt;
         }
         edit.vList[edit.vListEntries++] = n;
         auto ch{static_cast<char>(GetNextChar(context))};
@@ -486,7 +486,7 @@ std::optional<DataEdit> FormatControl<CONTEXT>::GetNextDataEdit(
       }
       if (!ok) {
         ReportBadFormat(context, "Unclosed DT(v_list) in FORMAT", start);
-        return std::nullopt;
+        return Fortran::common::nullopt;
       }
     }
   } else { // not DT'iotype'
diff --git a/flang/runtime/format.h b/flang/runtime/format.h
index 1fe0802ac43c69..e7d94559964041 100644
--- a/flang/runtime/format.h
+++ b/flang/runtime/format.h
@@ -14,9 +14,9 @@
 #include "environment.h"
 #include "io-error.h"
 #include "flang/Common/Fortran.h"
+#include "flang/Common/optional.h"
 #include "flang/Decimal/decimal.h"
 #include <cinttypes>
-#include <optional>
 
 namespace Fortran::runtime {
 class Descriptor;
@@ -64,9 +64,9 @@ struct DataEdit {
   static constexpr char DefinedDerivedType{'d'}; // DT defined I/O
 
   char variation{'\0'}; // N, S, or X for EN, ES, EX; G/l for original G/list
-  std::optional<int> width; // the 'w' field; optional for A
-  std::optional<int> digits; // the 'm' or 'd' field
-  std::optional<int> expoDigits; // 'Ee' field
+  Fortran::common::optional<int> width; // the 'w' field; optional for A
+  Fortran::common::optional<int> digits; // the 'm' or 'd' field
+  Fortran::common::optional<int> expoDigits; // 'Ee' field
   MutableModes modes;
   int repeat{1};
 
@@ -102,7 +102,8 @@ template <typename CONTEXT> class FormatControl {
   // Extracts the next data edit descriptor, handling control edit descriptors
   // along the way.  If maxRepeat==0, this is a peek at the next data edit
   // descriptor.
-  std::optional<DataEdit> GetNextDataEdit(Context &, int maxRepeat = 1);
+  Fortran::common::optional<DataEdit> GetNextDataEdit(
+      Context &, int maxRepeat = 1);
 
   // Emit any remaining character literals after the last data item (on output)
   // and perform remaining record positioning actions.
diff --git a/flang/runtime/freestanding-tools.h b/flang/runtime/freestanding-tools.h
index bdc11ae93ac909..682b4c9b89294b 100644
--- a/flang/runtime/freestanding-tools.h
+++ b/flang/runtime/freestanding-tools.h
@@ -42,6 +42,11 @@
 #define STD_REALLOC_UNSUPPORTED 1
 #endif
 
+#if !defined(STD_MEMCHR_UNSUPPORTED) && \
+    (defined(__CUDACC__) || defined(__CUDA__)) && defined(__CUDA_ARCH__)
+#define STD_MEMCHR_UNSUPPORTED 1
+#endif
+
 namespace Fortran::runtime {
 
 #if STD_FILL_N_UNSUPPORTED
@@ -134,5 +139,23 @@ static inline RT_API_ATTRS void *realloc(void *ptr, std::size_t newByteSize) {
 using std::realloc;
 #endif // !STD_REALLOC_UNSUPPORTED
 
+#if STD_MEMCHR_UNSUPPORTED
+// Provides alternative implementation for std::memchr(), if
+// it is not supported.
+static inline RT_API_ATTRS const void *memchr(
+    const void *ptr, int ch, std::size_t count) {
+  auto buf{reinterpret_cast<const unsigned char *>(ptr)};
+  auto c{static_cast<unsigned char>(ch)};
+  for (; count--; ++buf) {
+    if (*buf == c) {
+      return buf;
+    }
+  }
+  return nullptr;
+}
+#else // !STD_MEMCMP_UNSUPPORTED
+using std::memchr;
+#endif // !STD_MEMCMP_UNSUPPORTED
+
 } // namespace Fortran::runtime
 #endif // FORTRAN_RUNTIME_FREESTANDING_TOOLS_H_
diff --git a/flang/runtime/io-api.cpp b/flang/runtime/io-api.cpp
index 0ac20d3346dd5b..094db5572f15c2 100644
--- a/flang/runtime/io-api.cpp
+++ b/flang/runtime/io-api.cpp
@@ -18,6 +18,7 @@
 #include "terminator.h"
 #include "tools.h"
 #include "unit.h"
+#include "flang/Common/optional.h"
 #include "flang/Runtime/descriptor.h"
 #include "flang/Runtime/memory.h"
 #include <cstdlib>
@@ -168,7 +169,7 @@ static Cookie NoopUnit(const Terminator &terminator, int unitNumber,
 }
 
 static ExternalFileUnit *GetOrCreateUnit(int unitNumber, Direction direction,
-    std::optional<bool> isUnformatted, const Terminator &terminator,
+    Fortran::common::optional<bool> isUnformatted, const Terminator &terminator,
     Cookie &errorCookie) {
   if (ExternalFileUnit *
       unit{ExternalFileUnit::LookUpOrCreateAnonymous(
@@ -472,8 +473,8 @@ Cookie IONAME(BeginEndfile)(
   Terminator terminator{sourceFile, sourceLine};
   Cookie errorCookie{nullptr};
   if (ExternalFileUnit *
-      unit{GetOrCreateUnit(unitNumber, Direction::Output, std::nullopt,
-          terminator, errorCookie)}) {
+      unit{GetOrCreateUnit(unitNumber, Direction::Output,
+          Fortran::common::nullopt, terminator, errorCookie)}) {
     if (ChildIo * child{unit->GetChildIo()}) {
       return &child->BeginIoStatement<ErroneousIoStatementState>(
           IostatBadOpOnChildUnit, nullptr /* no unit */, sourceFile,
@@ -492,8 +493,8 @@ Cookie IONAME(BeginRewind)(
   Terminator terminator{sourceFile, sourceLine};
   Cookie errorCookie{nullptr};
   if (ExternalFileUnit *
-      unit{GetOrCreateUnit(unitNumber, Direction::Input, std::nullopt,
-          terminator, errorCookie)}) {
+      unit{GetOrCreateUnit(unitNumber, Direction::Input,
+          Fortran::common::nullopt, terminator, errorCookie)}) {
     if (ChildIo * child{unit->GetChildIo()}) {
       return &child->BeginIoStatement<ErroneousIoStatementState>(
           IostatBadOpOnChildUnit, nullptr /* no unit */, sourceFile,
@@ -801,7 +802,7 @@ bool IONAME(SetAction)(Cookie cookie, const char *keyword, std::size_t length) {
     io.GetIoErrorHandler().Crash(
         "SetAction() called after GetNewUnit() for an OPEN statement");
   }
-  std::optional<Action> action;
+  Fortran::common::optional<Action> action;
   static const char *keywords[]{"READ", "WRITE", "READWRITE", nullptr};
   switch (IdentifyValue(keyword, length, keywords)) {
   case 0:
diff --git a/flang/runtime/io-stmt.cpp b/flang/runtime/io-stmt.cpp
index 153195fd965624..075d7b5ae518a4 100644
--- a/flang/runtime/io-stmt.cpp
+++ b/flang/runtime/io-stmt.cpp
@@ -39,9 +39,9 @@ bool IoStatementBase::Receive(char *, std::size_t, std::size_t) {
   return false;
 }
 
-std::optional<DataEdit> IoStatementBase::GetNextDataEdit(
+Fortran::common::optional<DataEdit> IoStatementBase::GetNextDataEdit(
     IoStatementState &, int) {
-  return std::nullopt;
+  return Fortran::common::nullopt;
 }
 
 ExternalFileUnit *IoStatementBase::GetExternalFileUnit() const {
@@ -466,7 +466,7 @@ int ExternalFormattedIoStatementState<DIR, CHAR>::EndIoStatement() {
   return ExternalIoStatementState<DIR>::EndIoStatement();
 }
 
-std::optional<DataEdit> IoStatementState::GetNextDataEdit(int n) {
+Fortran::common::optional<DataEdit> IoStatementState::GetNextDataEdit(int n) {
   return common::visit(
       [&](auto &x) { return x.get().GetNextDataEdit(*this, n); }, u_);
 }
@@ -541,13 +541,13 @@ ExternalFileUnit *IoStatementState::GetExternalFileUnit() const {
       [](auto &x) { return x.get().GetExternalFileUnit(); }, u_);
 }
 
-std::optional<char32_t> IoStatementState::GetCurrentChar(
+Fortran::common::optional<char32_t> IoStatementState::GetCurrentChar(
     std::size_t &byteCount) {
   const char *p{nullptr};
   std::size_t bytes{GetNextInputBytes(p)};
   if (bytes == 0) {
     byteCount = 0;
-    return std::nullopt;
+    return Fortran::common::nullopt;
   } else {
     const ConnectionState &connection{GetConnectionState()};
     if (connection.isUTF8) {
@@ -573,8 +573,8 @@ std::optional<char32_t> IoStatementState::GetCurrentChar(
   }
 }
 
-std::optional<char32_t> IoStatementState::NextInField(
-    std::optional<int> &remaining, const DataEdit &edit) {
+Fortran::common::optional<char32_t> IoStatementState::NextInField(
+    Fortran::common::optional<int> &remaining, const DataEdit &edit) {
   std::size_t byteCount{0};
   if (!remaining) { // Stream, list-directed, or NAMELIST
     if (auto next{GetCurrentChar(byteCount)}) {
@@ -590,21 +590,21 @@ std::optional<char32_t> IoStatementState::NextInField(
         case '"':
         case '*':
         case '\n': // for stream access
-          return std::nullopt;
+          return Fortran::common::nullopt;
         case '&':
         case '$':
           if (edit.IsNamelist()) {
-            return std::nullopt;
+            return Fortran::common::nullopt;
           }
           break;
         case ',':
           if (!(edit.modes.editingFlags & decimalComma)) {
-            return std::nullopt;
+            return Fortran::common::nullopt;
           }
           break;
         case ';':
           if (edit.modes.editingFlags & decimalComma) {
-            return std::nullopt;
+            return Fortran::common::nullopt;
           }
           break;
         default:
@@ -618,7 +618,7 @@ std::optional<char32_t> IoStatementState::NextInField(
   } else if (*remaining > 0) {
     if (auto next{GetCurrentChar(byteCount)}) {
       if (byteCount > static_cast<std::size_t>(*remaining)) {
-        return std::nullopt;
+        return Fortran::common::nullopt;
       }
       *remaining -= byteCount;
       HandleRelativePosition(byteCount);
@@ -627,10 +627,10 @@ std::optional<char32_t> IoStatementState::NextInField(
     }
     if (CheckForEndOfRecord(0)) { // do padding
       --*remaining;
-      return std::optional<char32_t>{' '};
+      return Fortran::common::optional<char32_t>{' '};
     }
   }
-  return std::nullopt;
+  return Fortran::common::nullopt;
 }
 
 bool IoStatementState::CheckForEndOfRecord(std::size_t afterReading) {
@@ -722,7 +722,7 @@ bool ListDirectedStatementState<Direction::Output>::EmitLeadingSpaceOrAdvance(
   return true;
 }
 
-std::optional<DataEdit>
+Fortran::common::optional<DataEdit>
 ListDirectedStatementState<Direction::Output>::GetNextDataEdit(
     IoStatementState &io, int maxRepeat) {
   DataEdit edit;
@@ -739,7 +739,7 @@ int ListDirectedStatementState<Direction::Input>::EndIoStatement() {
   return IostatOk;
 }
 
-std::optional<DataEdit>
+Fortran::common::optional<DataEdit>
 ListDirectedStatementState<Direction::Input>::GetNextDataEdit(
     IoStatementState &io, int maxRepeat) {
   // N.B. list-directed transfers cannot be nonadvancing (C1221)
@@ -795,7 +795,7 @@ ListDirectedStatementState<Direction::Input>::GetNextDataEdit(
   }
   eatComma_ = true;
   if (!ch) {
-    return std::nullopt;
+    return Fortran::common::nullopt;
   }
   if (*ch == '/') {
     hitSlash_ = true;
diff --git a/flang/runtime/io-stmt.h b/flang/runtime/io-stmt.h
index dcee7f9365692a..e00d54980aae59 100644
--- a/flang/runtime/io-stmt.h
+++ b/flang/runtime/io-stmt.h
@@ -16,6 +16,8 @@
 #include "format.h"
 #include "internal-unit.h"
 #include "io-error.h"
+#include "flang/Common/optional.h"
+#include "flang/Common/reference-wrapper.h"
 #include "flang/Common/visit.h"
 #include "flang/Runtime/descriptor.h"
 #include "flang/Runtime/io-api.h"
@@ -94,7 +96,7 @@ class IoStatementState {
   void BackspaceRecord();
   void HandleRelativePosition(std::int64_t byteOffset);
   void HandleAbsolutePosition(std::int64_t byteOffset); // for r* in list I/O
-  std::optional<DataEdit> GetNextDataEdit(int maxRepeat = 1);
+  Fortran::common::optional<DataEdit> GetNextDataEdit(int maxRepeat = 1);
   ExternalFileUnit *GetExternalFileUnit() const; // null if internal unit
   bool BeginReadingRecord();
   void FinishReadingRecord();
@@ -122,7 +124,7 @@ class IoStatementState {
   }
 
   // Vacant after the end of the current record
-  std::optional<char32_t> GetCurrentChar(std::size_t &byteCount);
+  Fortran::common::optional<char32_t> GetCurrentChar(std::size_t &byteCount);
 
   // The "remaining" arguments to CueUpInput(), SkipSpaces(), & NextInField()
   // are always in units of bytes, not characters; the distinction matters
@@ -130,8 +132,8 @@ class IoStatementState {
 
   // For fixed-width fields, return the number of remaining bytes.
   // Skip over leading blanks.
-  std::optional<int> CueUpInput(const DataEdit &edit) {
-    std::optional<int> remaining;
+  Fortran::common::optional<int> CueUpInput(const DataEdit &edit) {
+    Fortran::common::optional<int> remaining;
     if (edit.IsListDirected()) {
       std::size_t byteCount{0};
       GetNextNonBlank(byteCount);
@@ -148,7 +150,8 @@ class IoStatementState {
     return remaining;
   }
 
-  std::optional<char32_t> SkipSpaces(std::optional<int> &remaining) {
+  Fortran::common::optional<char32_t> SkipSpaces(
+      Fortran::common::optional<int> &remaining) {
     while (!remaining || *remaining > 0) {
       std::size_t byteCount{0};
       if (auto ch{GetCurrentChar(byteCount)}) {
@@ -167,27 +170,27 @@ class IoStatementState {
         break;
       }
     }
-    return std::nullopt;
+    return Fortran::common::nullopt;
   }
 
   // Acquires the next input character, respecting any applicable field width
   // or separator character.
-  std::optional<char32_t> NextInField(
-      std::optional<int> &remaining, const DataEdit &);
+  Fortran::common::optional<char32_t> NextInField(
+      Fortran::common::optional<int> &remaining, const DataEdit &);
 
   // Detect and signal any end-of-record condition after input.
   // Returns true if at EOR and remaining input should be padded with blanks.
   bool CheckForEndOfRecord(std::size_t afterReading);
 
   // Skips spaces, advances records, and ignores NAMELIST comments
-  std::optional<char32_t> GetNextNonBlank(std::size_t &byteCount) {
+  Fortran::common::optional<char32_t> GetNextNonBlank(std::size_t &byteCount) {
     auto ch{GetCurrentChar(byteCount)};
     bool inNamelist{mutableModes().inNamelist};
     while (!ch || *ch == ' ' || *ch == '\t' || (inNamelist && *ch == '!')) {
       if (ch && (*ch == ' ' || *ch == '\t')) {
         HandleRelativePosition(byteCount);
       } else if (!AdvanceRecord()) {
-        return std::nullopt;
+        return Fortran::common::nullopt;
       }
       ch = GetCurrentChar(byteCount);
     }
@@ -208,39 +211,47 @@ class IoStatementState {
   }
 
 private:
-  std::variant<std::reference_wrapper<OpenStatementState>,
-      std::reference_wrapper<CloseStatementState>,
-      std::reference_wrapper<NoopStatementState>,
-      std::reference_wrapper<
+  std::variant<Fortran::common::reference_wrapper<OpenStatementState>,
+      Fortran::common::reference_wrapper<CloseStatementState>,
+      Fortran::common::reference_wrapper<NoopStatementState>,
+      Fortran::common::reference_wrapper<
           InternalFormattedIoStatementState<Direction::Output>>,
-      std::reference_wrapper<
+      Fortran::common::reference_wrapper<
           InternalFormattedIoStatementState<Direction::Input>>,
-      std::reference_wrapper<InternalListIoStatementState<Direction::Output>>,
-      std::reference_wrapper<InternalListIoStatementState<Direction::Input>>,
-      std::reference_wrapper<
+      Fortran::common::reference_wrapper<
+          InternalListIoStatementState<Direction::Output>>,
+      Fortran::common::reference_wrapper<
+          InternalListIoStatementState<Direction::Input>>,
+      Fortran::common::reference_wrapper<
           ExternalFormattedIoStatementState<Direction::Output>>,
-      std::reference_wrapper<
+      Fortran::common::reference_wrapper<
           ExternalFormattedIoStatementState<Direction::Input>>,
-      std::reference_wrapper<ExternalListIoStatementState<Direction::Output>>,
-      std::reference_wrapper<ExternalListIoStatementState<Direction::Input>>,
-      std::reference_wrapper<
+      Fortran::common::reference_wrapper<
+          ExternalListIoStatementState<Direction::Output>>,
+      Fortran::common::reference_wrapper<
+          ExternalListIoStatementState<Direction::Input>>,
+      Fortran::common::reference_wrapper<
           ExternalUnformattedIoStatementState<Direction::Output>>,
-      std::reference_wrapper<
+      Fortran::common::reference_wrapper<
           ExternalUnformattedIoStatementState<Direction::Input>>,
-      std::reference_wrapper<ChildFormattedIoStatementState<Direction::Output>>,
-      std::reference_wrapper<ChildFormattedIoStatementState<Direction::Input>>,
-      std::reference_wrapper<ChildListIoStatementState<Direction::Output>>,
-      std::reference_wrapper<ChildListIoStatementState<Direction::Input>>,
-      std::reference_wrapper<
+      Fortran::common::reference_wrapper<
+          ChildFormattedIoStatementState<Direction::Output>>,
+      Fortran::common::reference_wrapper<
+          ChildFormattedIoStatementState<Direction::Input>>,
+      Fortran::common::reference_wrapper<
+          ChildListIoStatementState<Direction::Output>>,
+      Fortran::common::reference_wrapper<
+          ChildListIoStatementState<Direction::Input>>,
+      Fortran::common::reference_wrapper<
           ChildUnformattedIoStatementState<Direction::Output>>,
-      std::reference_wrapper<
+      Fortran::common::reference_wrapper<
           ChildUnformattedIoStatementState<Direction::Input>>,
-      std::reference_wrapper<InquireUnitState>,
-      std::reference_wrapper<InquireNoUnitState>,
-      std::reference_wrapper<InquireUnconnectedFileState>,
-      std::reference_wrapper<InquireIOLengthState>,
-      std::reference_wrapper<ExternalMiscIoStatementState>,
-      std::reference_wrapper<ErroneousIoStatementState>>
+      Fortran::common::reference_wrapper<InquireUnitState>,
+      Fortran::common::reference_wrapper<InquireNoUnitState>,
+      Fortran::common::reference_wrapper<InquireUnconnectedFileState>,
+      Fortran::common::reference_wrapper<InquireIOLengthState>,
+      Fortran::common::reference_wrapper<ExternalMiscIoStatementState>,
+      Fortran::common::reference_wrapper<ErroneousIoStatementState>>
       u_;
 };
 
@@ -262,7 +273,7 @@ class IoStatementBase : public IoErrorHandler {
   void BackspaceRecord();
   void HandleRelativePosition(std::int64_t);
   void HandleAbsolutePosition(std::int64_t);
-  std::optional<DataEdit> GetNextDataEdit(
+  Fortran::common::optional<DataEdit> GetNextDataEdit(
       IoStatementState &, int maxRepeat = 1);
   ExternalFileUnit *GetExternalFileUnit() const;
   bool BeginReadingRecord();
@@ -287,7 +298,7 @@ class ListDirectedStatementState<Direction::Output>
 public:
   bool EmitLeadingSpaceOrAdvance(
       IoStatementState &, std::size_t = 1, bool isCharacter = false);
-  std::optional<DataEdit> GetNextDataEdit(
+  Fortran::common::optional<DataEdit> GetNextDataEdit(
       IoStatementState &, int maxRepeat = 1);
   bool lastWasUndelimitedCharacter() const {
     return lastWasUndelimitedCharacter_;
@@ -309,7 +320,7 @@ class ListDirectedStatementState<Direction::Input>
   // Skips value separators, handles repetition and null values.
   // Vacant when '/' appears; present with descriptor == ListDirectedNullValue
   // when a null value appears.
-  std::optional<DataEdit> GetNextDataEdit(
+  Fortran::common::optional<DataEdit> GetNextDataEdit(
       IoStatementState &, int maxRepeat = 1);
 
   // Each NAMELIST input item is treated like a distinct list-directed
@@ -328,7 +339,7 @@ class ListDirectedStatementState<Direction::Input>
 
 private:
   int remaining_{0}; // for "r*" repetition
-  std::optional<SavedPosition> repeatPosition_;
+  Fortran::common::optional<SavedPosition> repeatPosition_;
   bool eatComma_{false}; // consume comma after previously read item
   bool hitSlash_{false}; // once '/' is seen, nullify further items
   bool realPart_{false};
@@ -380,7 +391,7 @@ class InternalFormattedIoStatementState
   IoStatementState &ioStatementState() { return ioStatementState_; }
   void CompleteOperation();
   int EndIoStatement();
-  std::optional<DataEdit> GetNextDataEdit(
+  Fortran::common::optional<DataEdit> GetNextDataEdit(
       IoStatementState &, int maxRepeat = 1) {
     return format_.GetNextDataEdit(*this, maxRepeat);
   }
@@ -465,7 +476,7 @@ class ExternalFormattedIoStatementState
       const char *sourceFile = nullptr, int sourceLine = 0);
   void CompleteOperation();
   int EndIoStatement();
-  std::optional<DataEdit> GetNextDataEdit(
+  Fortran::common::optional<DataEdit> GetNextDataEdit(
       IoStatementState &, int maxRepeat = 1) {
     return format_.GetNextDataEdit(*this, maxRepeat);
   }
@@ -523,7 +534,7 @@ class ChildFormattedIoStatementState : public ChildIoStatementState<DIR>,
   void CompleteOperation();
   int EndIoStatement();
   bool AdvanceRecord(int = 1);
-  std::optional<DataEdit> GetNextDataEdit(
+  Fortran::common::optional<DataEdit> GetNextDataEdit(
       IoStatementState &, int maxRepeat = 1) {
     return format_.GetNextDataEdit(*this, maxRepeat);
   }
@@ -571,14 +582,14 @@ class OpenStatementState : public ExternalIoStatementBase {
 private:
   bool wasExtant_;
   bool isNewUnit_;
-  std::optional<OpenStatus> status_;
-  std::optional<Position> position_;
-  std::optional<Action> action_;
+  Fortran::common::optional<OpenStatus> status_;
+  Fortran::common::optional<Position> position_;
+  Fortran::common::optional<Action> action_;
   Convert convert_{Convert::Unknown};
   OwningPtr<char> path_;
   std::size_t pathLength_;
-  std::optional<bool> isUnformatted_;
-  std::optional<Access> access_;
+  Fortran::common::optional<bool> isUnformatted_;
+  Fortran::common::optional<Access> access_;
 };
 
 class CloseStatementState : public ExternalIoStatementBase {
diff --git a/flang/runtime/matmul-transpose.cpp b/flang/runtime/matmul-transpose.cpp
index ee5fcd842b0258..a12d188266f7c3 100644
--- a/flang/runtime/matmul-transpose.cpp
+++ b/flang/runtime/matmul-transpose.cpp
@@ -23,6 +23,7 @@
 #include "flang/Runtime/matmul-transpose.h"
 #include "terminator.h"
 #include "tools.h"
+#include "flang/Common/optional.h"
 #include "flang/Runtime/c-or-cpp.h"
 #include "flang/Runtime/cpp-type.h"
 #include "flang/Runtime/descriptor.h"
@@ -96,8 +97,8 @@ template <TypeCategory RCAT, int RKIND, typename XT, typename YT>
 inline static RT_API_ATTRS void MatrixTransposedTimesMatrixHelper(
     CppTypeFor<RCAT, RKIND> *RESTRICT product, SubscriptValue rows,
     SubscriptValue cols, const XT *RESTRICT x, const YT *RESTRICT y,
-    SubscriptValue n, std::optional<std::size_t> xColumnByteStride,
-    std::optional<std::size_t> yColumnByteStride) {
+    SubscriptValue n, Fortran::common::optional<std::size_t> xColumnByteStride,
+    Fortran::common::optional<std::size_t> yColumnByteStride) {
   if (!xColumnByteStride) {
     if (!yColumnByteStride) {
       MatrixTransposedTimesMatrix<RCAT, RKIND, XT, YT, false, false>(
@@ -163,7 +164,7 @@ template <TypeCategory RCAT, int RKIND, typename XT, typename YT>
 inline static RT_API_ATTRS void MatrixTransposedTimesVectorHelper(
     CppTypeFor<RCAT, RKIND> *RESTRICT product, SubscriptValue rows,
     SubscriptValue n, const XT *RESTRICT x, const YT *RESTRICT y,
-    std::optional<std::size_t> xColumnByteStride) {
+    Fortran::common::optional<std::size_t> xColumnByteStride) {
   if (!xColumnByteStride) {
     MatrixTransposedTimesVector<RCAT, RKIND, XT, YT, false>(
         product, rows, n, x, y);
@@ -229,7 +230,7 @@ inline static RT_API_ATTRS void DoMatmulTranspose(
         (IS_ALLOCATING || result.IsContiguous())) {
       // Contiguous numeric matrices (maybe with columns
       // separated by a stride).
-      std::optional<std::size_t> xColumnByteStride;
+      Fortran::common::optional<std::size_t> xColumnByteStride;
       if (!x.IsContiguous()) {
         // X's columns are strided.
         SubscriptValue xAt[2]{};
@@ -237,7 +238,7 @@ inline static RT_API_ATTRS void DoMatmulTranspose(
         xAt[1]++;
         xColumnByteStride = x.SubscriptsToByteOffset(xAt);
       }
-      std::optional<std::size_t> yColumnByteStride;
+      Fortran::common::optional<std::size_t> yColumnByteStride;
       if (!y.IsContiguous()) {
         // Y's columns are strided.
         SubscriptValue yAt[2]{};
diff --git a/flang/runtime/matmul.cpp b/flang/runtime/matmul.cpp
index e4595db779260f..543284cb5c3633 100644
--- a/flang/runtime/matmul.cpp
+++ b/flang/runtime/matmul.cpp
@@ -22,6 +22,7 @@
 #include "flang/Runtime/matmul.h"
 #include "terminator.h"
 #include "tools.h"
+#include "flang/Common/optional.h"
 #include "flang/Runtime/c-or-cpp.h"
 #include "flang/Runtime/cpp-type.h"
 #include "flang/Runtime/descriptor.h"
@@ -116,8 +117,8 @@ template <TypeCategory RCAT, int RKIND, typename XT, typename YT>
 inline RT_API_ATTRS void MatrixTimesMatrixHelper(
     CppTypeFor<RCAT, RKIND> *RESTRICT product, SubscriptValue rows,
     SubscriptValue cols, const XT *RESTRICT x, const YT *RESTRICT y,
-    SubscriptValue n, std::optional<std::size_t> xColumnByteStride,
-    std::optional<std::size_t> yColumnByteStride) {
+    SubscriptValue n, Fortran::common::optional<std::size_t> xColumnByteStride,
+    Fortran::common::optional<std::size_t> yColumnByteStride) {
   if (!xColumnByteStride) {
     if (!yColumnByteStride) {
       MatrixTimesMatrix<RCAT, RKIND, XT, YT, false, false>(
@@ -183,7 +184,7 @@ template <TypeCategory RCAT, int RKIND, typename XT, typename YT>
 inline RT_API_ATTRS void MatrixTimesVectorHelper(
     CppTypeFor<RCAT, RKIND> *RESTRICT product, SubscriptValue rows,
     SubscriptValue n, const XT *RESTRICT x, const YT *RESTRICT y,
-    std::optional<std::size_t> xColumnByteStride) {
+    Fortran::common::optional<std::size_t> xColumnByteStride) {
   if (!xColumnByteStride) {
     MatrixTimesVector<RCAT, RKIND, XT, YT, false>(product, rows, n, x, y);
   } else {
@@ -240,7 +241,7 @@ template <TypeCategory RCAT, int RKIND, typename XT, typename YT,
 inline RT_API_ATTRS void VectorTimesMatrixHelper(
     CppTypeFor<RCAT, RKIND> *RESTRICT product, SubscriptValue n,
     SubscriptValue cols, const XT *RESTRICT x, const YT *RESTRICT y,
-    std::optional<std::size_t> yColumnByteStride) {
+    Fortran::common::optional<std::size_t> yColumnByteStride) {
   if (!yColumnByteStride) {
     VectorTimesMatrix<RCAT, RKIND, XT, YT, false>(product, n, cols, x, y);
   } else {
@@ -301,7 +302,7 @@ static inline RT_API_ATTRS void DoMatmul(
         (IS_ALLOCATING || result.IsContiguous())) {
       // Contiguous numeric matrices (maybe with columns
       // separated by a stride).
-      std::optional<std::size_t> xColumnByteStride;
+      Fortran::common::optional<std::size_t> xColumnByteStride;
       if (!x.IsContiguous()) {
         // X's columns are strided.
         SubscriptValue xAt[2]{};
@@ -309,7 +310,7 @@ static inline RT_API_ATTRS void DoMatmul(
         xAt[1]++;
         xColumnByteStride = x.SubscriptsToByteOffset(xAt);
       }
-      std::optional<std::size_t> yColumnByteStride;
+      Fortran::common::optional<std::size_t> yColumnByteStride;
       if (!y.IsContiguous()) {
         // Y's columns are strided.
         SubscriptValue yAt[2]{};
diff --git a/flang/runtime/misc-intrinsic.cpp b/flang/runtime/misc-intrinsic.cpp
index 56f2028c2ff02c..f5b292a1f3d32c 100644
--- a/flang/runtime/misc-intrinsic.cpp
+++ b/flang/runtime/misc-intrinsic.cpp
@@ -9,16 +9,16 @@
 #include "flang/Runtime/misc-intrinsic.h"
 #include "terminator.h"
 #include "tools.h"
+#include "flang/Common/optional.h"
 #include "flang/Runtime/descriptor.h"
 #include <algorithm>
 #include <cstring>
-#include <optional>
 
 namespace Fortran::runtime {
 
 static RT_API_ATTRS void TransferImpl(Descriptor &result,
     const Descriptor &source, const Descriptor &mold, const char *sourceFile,
-    int line, std::optional<std::int64_t> resultExtent) {
+    int line, Fortran::common::optional<std::int64_t> resultExtent) {
   int rank{resultExtent.has_value() ? 1 : 0};
   std::size_t elementBytes{mold.ElementBytes()};
   result.Establish(mold.type(), elementBytes, nullptr, rank, nullptr,
@@ -57,7 +57,7 @@ RT_EXT_API_GROUP_BEGIN
 
 void RTDEF(Transfer)(Descriptor &result, const Descriptor &source,
     const Descriptor &mold, const char *sourceFile, int line) {
-  std::optional<std::int64_t> elements;
+  Fortran::common::optional<std::int64_t> elements;
   if (mold.rank() > 0) {
     if (std::size_t sourceElementBytes{
             source.Elements() * source.ElementBytes()}) {
diff --git a/flang/runtime/namelist.cpp b/flang/runtime/namelist.cpp
index f5d7404fdcb86b..ac9234f4af832b 100644
--- a/flang/runtime/namelist.cpp
+++ b/flang/runtime/namelist.cpp
@@ -116,10 +116,11 @@ static bool GetLowerCaseName(
   return false;
 }
 
-static std::optional<SubscriptValue> GetSubscriptValue(IoStatementState &io) {
-  std::optional<SubscriptValue> value;
+static Fortran::common::optional<SubscriptValue> GetSubscriptValue(
+    IoStatementState &io) {
+  Fortran::common::optional<SubscriptValue> value;
   std::size_t byteCount{0};
-  std::optional<char32_t> ch{io.GetCurrentChar(byteCount)};
+  Fortran::common::optional<char32_t> ch{io.GetCurrentChar(byteCount)};
   bool negate{ch && *ch == '-'};
   if ((ch && *ch == '+') || negate) {
     io.HandleRelativePosition(byteCount);
@@ -136,7 +137,7 @@ static std::optional<SubscriptValue> GetSubscriptValue(IoStatementState &io) {
   if (overflow) {
     io.GetIoErrorHandler().SignalError(
         "NAMELIST input subscript value overflow");
-    return std::nullopt;
+    return Fortran::common::nullopt;
   }
   if (negate) {
     if (value) {
@@ -158,7 +159,7 @@ static bool HandleSubscripts(IoStatementState &io, Descriptor &desc,
   std::size_t contiguousStride{source.ElementBytes()};
   bool ok{true};
   std::size_t byteCount{0};
-  std::optional<char32_t> ch{io.GetNextNonBlank(byteCount)};
+  Fortran::common::optional<char32_t> ch{io.GetNextNonBlank(byteCount)};
   char32_t comma{GetComma(io)};
   for (; ch && *ch != ')'; ++j) {
     SubscriptValue dimLower{0}, dimUpper{0}, dimStride{0};
@@ -282,9 +283,9 @@ static bool HandleSubstring(
   SubscriptValue chars{static_cast<SubscriptValue>(desc.ElementBytes()) / kind};
   // Allow for blanks in substring bounds; they're nonstandard, but not
   // ambiguous within the parentheses.
-  std::optional<SubscriptValue> lower, upper;
+  Fortran::common::optional<SubscriptValue> lower, upper;
   std::size_t byteCount{0};
-  std::optional<char32_t> ch{io.GetNextNonBlank(byteCount)};
+  Fortran::common::optional<char32_t> ch{io.GetNextNonBlank(byteCount)};
   if (ch) {
     if (*ch == ':') {
       lower = 1;
@@ -346,7 +347,8 @@ static bool HandleComponent(IoStatementState &io, Descriptor &desc,
           // If base and component are both arrays, the component name
           // must be followed by subscripts; process them now.
           std::size_t byteCount{0};
-          if (std::optional<char32_t> next{io.GetNextNonBlank(byteCount)};
+          if (Fortran::common::optional<char32_t> next{
+                  io.GetNextNonBlank(byteCount)};
               next && *next == '(') {
             io.HandleRelativePosition(byteCount); // skip over '('
             StaticDescriptor<maxRank, true, 16> staticDesc;
@@ -435,7 +437,7 @@ bool IONAME(InputNamelist)(Cookie cookie, const NamelistGroup &group) {
   RUNTIME_CHECK(handler, listInput != nullptr);
   // Find this namelist group's header in the input
   io.BeginReadingRecord();
-  std::optional<char32_t> next;
+  Fortran::common::optional<char32_t> next;
   char name[nameBufferSize];
   RUNTIME_CHECK(handler, group.groupName != nullptr);
   char32_t comma{GetComma(io)};
diff --git a/flang/runtime/numeric-templates.h b/flang/runtime/numeric-templates.h
index ecc3b2654d9652..8ea3daaa57bcf8 100644
--- a/flang/runtime/numeric-templates.h
+++ b/flang/runtime/numeric-templates.h
@@ -122,12 +122,19 @@ template <typename T> struct ABSTy {
   static constexpr RT_API_ATTRS T compute(T x) { return std::abs(x); }
 };
 
+// Suppress the warnings about calling __host__-only
+// 'long double' std::frexp, from __device__ code.
+RT_DIAG_PUSH
+RT_DIAG_DISABLE_CALL_HOST_FROM_DEVICE_WARN
+
 template <typename T> struct FREXPTy {
   static constexpr RT_API_ATTRS T compute(T x, int *e) {
     return std::frexp(x, e);
   }
 };
 
+RT_DIAG_POP
+
 template <typename T> struct ILOGBTy {
   static constexpr RT_API_ATTRS int compute(T x) { return std::ilogb(x); }
 };
diff --git a/flang/runtime/pointer.cpp b/flang/runtime/pointer.cpp
index 08a1223764f393..b01735dc30e691 100644
--- a/flang/runtime/pointer.cpp
+++ b/flang/runtime/pointer.cpp
@@ -185,6 +185,7 @@ int RTDEF(PointerDeallocate)(Descriptor &pointer, bool hasStat,
   if (!pointer.IsAllocated()) {
     return ReturnError(terminator, StatBaseNull, errMsg, hasStat);
   }
+#if !defined(RT_DEVICE_COMPILATION)
   if (executionEnvironment.checkPointerDeallocation) {
     // Validate the footer.  This should fail if the pointer doesn't
     // span the entire object, or the object was not allocated as a
@@ -200,6 +201,7 @@ int RTDEF(PointerDeallocate)(Descriptor &pointer, bool hasStat,
           terminator, StatBadPointerDeallocation, errMsg, hasStat);
     }
   }
+#endif
   return ReturnError(terminator,
       pointer.Destroy(/*finalize=*/true, /*destroyPointers=*/true, &terminator),
       errMsg, hasStat);
diff --git a/flang/runtime/random-templates.h b/flang/runtime/random-templates.h
index ce64a94901a281..f34422f6f5d9ac 100644
--- a/flang/runtime/random-templates.h
+++ b/flang/runtime/random-templates.h
@@ -11,6 +11,7 @@
 
 #include "lock.h"
 #include "numeric-templates.h"
+#include "flang/Common/optional.h"
 #include "flang/Runtime/descriptor.h"
 #include <algorithm>
 #include <random>
@@ -32,7 +33,7 @@ static constexpr int rangeBits{
 
 extern Lock lock;
 extern Generator generator;
-extern std::optional<GeneratedWord> nextValue;
+extern Fortran::common::optional<GeneratedWord> nextValue;
 
 // Call only with lock held
 static GeneratedWord GetNextValue() {
diff --git a/flang/runtime/random.cpp b/flang/runtime/random.cpp
index 13bed1f0abe10c..e0a421fd283962 100644
--- a/flang/runtime/random.cpp
+++ b/flang/runtime/random.cpp
@@ -28,7 +28,7 @@ namespace Fortran::runtime::random {
 
 Lock lock;
 Generator generator;
-std::optional<GeneratedWord> nextValue;
+Fortran::common::optional<GeneratedWord> nextValue;
 
 extern "C" {
 
diff --git a/flang/runtime/tools.cpp b/flang/runtime/tools.cpp
index e653323ed1de03..71022c7a8c179d 100644
--- a/flang/runtime/tools.cpp
+++ b/flang/runtime/tools.cpp
@@ -175,7 +175,7 @@ RT_API_ATTRS void ShallowCopy(const Descriptor &to, const Descriptor &from) {
 
 RT_API_ATTRS char *EnsureNullTerminated(
     char *str, std::size_t length, Terminator &terminator) {
-  if (std::memchr(str, '\0', length) == nullptr) {
+  if (runtime::memchr(str, '\0', length) == nullptr) {
     char *newCmd{(char *)AllocateMemoryOrCrash(terminator, length + 1)};
     std::memcpy(newCmd, str, length);
     newCmd[length] = '\0';
diff --git a/flang/runtime/tools.h b/flang/runtime/tools.h
index c1f89cadca06e7..df25eb8882335b 100644
--- a/flang/runtime/tools.h
+++ b/flang/runtime/tools.h
@@ -12,6 +12,7 @@
 #include "freestanding-tools.h"
 #include "stat.h"
 #include "terminator.h"
+#include "flang/Common/optional.h"
 #include "flang/Runtime/cpp-type.h"
 #include "flang/Runtime/descriptor.h"
 #include "flang/Runtime/memory.h"
@@ -95,7 +96,7 @@ static inline RT_API_ATTRS std::int64_t GetInt64(
   }
 }
 
-static inline RT_API_ATTRS std::optional<std::int64_t> GetInt64Safe(
+static inline RT_API_ATTRS Fortran::common::optional<std::int64_t> GetInt64Safe(
     const char *p, std::size_t bytes, Terminator &terminator) {
   switch (bytes) {
   case 1:
@@ -113,7 +114,7 @@ static inline RT_API_ATTRS std::optional<std::int64_t> GetInt64Safe(
     if (static_cast<Int128>(result) == n) {
       return result;
     }
-    return std::nullopt;
+    return Fortran::common::nullopt;
   }
   default:
     terminator.Crash("GetInt64Safe: no case for %zd bytes", bytes);
@@ -334,7 +335,8 @@ inline RT_API_ATTRS RESULT ApplyLogicalKind(
 }
 
 // Calculate result type of (X op Y) for *, //, DOT_PRODUCT, &c.
-std::optional<std::pair<TypeCategory, int>> inline constexpr RT_API_ATTRS
+Fortran::common::optional<
+    std::pair<TypeCategory, int>> inline constexpr RT_API_ATTRS
 GetResultType(TypeCategory xCat, int xKind, TypeCategory yCat, int yKind) {
   int maxKind{std::max(xKind, yKind)};
   switch (xCat) {
@@ -390,18 +392,18 @@ GetResultType(TypeCategory xCat, int xKind, TypeCategory yCat, int yKind) {
     if (yCat == TypeCategory::Character) {
       return std::make_pair(TypeCategory::Character, maxKind);
     } else {
-      return std::nullopt;
+      return Fortran::common::nullopt;
     }
   case TypeCategory::Logical:
     if (yCat == TypeCategory::Logical) {
       return std::make_pair(TypeCategory::Logical, maxKind);
     } else {
-      return std::nullopt;
+      return Fortran::common::nullopt;
     }
   default:
     break;
   }
-  return std::nullopt;
+  return Fortran::common::nullopt;
 }
 
 // Accumulate floating-point results in (at least) double precision
@@ -428,7 +430,7 @@ template <>
 inline RT_API_ATTRS const char *FindCharacter(
     const char *data, char ch, std::size_t chars) {
   return reinterpret_cast<const char *>(
-      std::memchr(data, static_cast<int>(ch), chars));
+      runtime::memchr(data, static_cast<int>(ch), chars));
 }
 
 // Copy payload data from one allocated descriptor to another.
diff --git a/flang/runtime/type-code.cpp b/flang/runtime/type-code.cpp
index b9ef307835dfba..cb1b944433aae5 100644
--- a/flang/runtime/type-code.cpp
+++ b/flang/runtime/type-code.cpp
@@ -112,7 +112,7 @@ RT_API_ATTRS TypeCode::TypeCode(TypeCategory f, int kind) {
   }
 }
 
-RT_API_ATTRS std::optional<std::pair<TypeCategory, int>>
+RT_API_ATTRS Fortran::common::optional<std::pair<TypeCategory, int>>
 TypeCode::GetCategoryAndKind() const {
   switch (raw_) {
   case CFI_type_signed_char:
@@ -204,7 +204,7 @@ TypeCode::GetCategoryAndKind() const {
   case CFI_type_char32_t:
     return std::make_pair(TypeCategory::Character, 4);
   default:
-    return std::nullopt;
+    return Fortran::common::nullopt;
   }
 }
 
diff --git a/flang/runtime/type-info.cpp b/flang/runtime/type-info.cpp
index b30a2c832a1388..cb18c5669b5ffc 100644
--- a/flang/runtime/type-info.cpp
+++ b/flang/runtime/type-info.cpp
@@ -15,7 +15,7 @@ namespace Fortran::runtime::typeInfo {
 
 RT_OFFLOAD_API_GROUP_BEGIN
 
-RT_API_ATTRS std::optional<TypeParameterValue> Value::GetValue(
+RT_API_ATTRS Fortran::common::optional<TypeParameterValue> Value::GetValue(
     const Descriptor *descriptor) const {
   switch (genre_) {
   case Genre::Explicit:
@@ -26,9 +26,9 @@ RT_API_ATTRS std::optional<TypeParameterValue> Value::GetValue(
         return addendum->LenParameterValue(value_);
       }
     }
-    return std::nullopt;
+    return Fortran::common::nullopt;
   default:
-    return std::nullopt;
+    return Fortran::common::nullopt;
   }
 }
 
diff --git a/flang/runtime/type-info.h b/flang/runtime/type-info.h
index bd8112d9d6d8c0..c3f3595e32ef28 100644
--- a/flang/runtime/type-info.h
+++ b/flang/runtime/type-info.h
@@ -15,10 +15,10 @@
 #include "terminator.h"
 #include "flang/Common/Fortran.h"
 #include "flang/Common/bit-population-count.h"
+#include "flang/Common/optional.h"
 #include "flang/Runtime/descriptor.h"
 #include <cinttypes>
 #include <memory>
-#include <optional>
 
 namespace Fortran::runtime::typeInfo {
 
@@ -39,7 +39,7 @@ class Value {
     LenParameter = 3
   };
   RT_API_ATTRS Genre genre() const { return genre_; }
-  RT_API_ATTRS std::optional<TypeParameterValue> GetValue(
+  RT_API_ATTRS Fortran::common::optional<TypeParameterValue> GetValue(
       const Descriptor *) const;
 
 private:
@@ -58,7 +58,7 @@ class Component {
     Automatic = 4
   };
 
-  const RT_API_ATTRS Descriptor &name() const { return name_.descriptor(); }
+  RT_API_ATTRS const Descriptor &name() const { return name_.descriptor(); }
   RT_API_ATTRS Genre genre() const { return genre_; }
   RT_API_ATTRS TypeCategory category() const {
     return static_cast<TypeCategory>(category_);
@@ -66,17 +66,17 @@ class Component {
   RT_API_ATTRS int kind() const { return kind_; }
   RT_API_ATTRS int rank() const { return rank_; }
   RT_API_ATTRS std::uint64_t offset() const { return offset_; }
-  const RT_API_ATTRS Value &characterLen() const { return characterLen_; }
-  const RT_API_ATTRS DerivedType *derivedType() const {
+  RT_API_ATTRS const Value &characterLen() const { return characterLen_; }
+  RT_API_ATTRS const DerivedType *derivedType() const {
     return derivedType_.descriptor().OffsetElement<const DerivedType>();
   }
-  const RT_API_ATTRS Value *lenValue() const {
+  RT_API_ATTRS const Value *lenValue() const {
     return lenValue_.descriptor().OffsetElement<const Value>();
   }
-  const RT_API_ATTRS Value *bounds() const {
+  RT_API_ATTRS const Value *bounds() const {
     return bounds_.descriptor().OffsetElement<const Value>();
   }
-  const RT_API_ATTRS char *initialization() const { return initialization_; }
+  RT_API_ATTRS const char *initialization() const { return initialization_; }
 
   RT_API_ATTRS std::size_t GetElementByteSize(const Descriptor &) const;
   RT_API_ATTRS std::size_t GetElements(const Descriptor &) const;
@@ -205,27 +205,27 @@ class DerivedType {
 public:
   ~DerivedType(); // never defined
 
-  const RT_API_ATTRS Descriptor &binding() const {
+  RT_API_ATTRS const Descriptor &binding() const {
     return binding_.descriptor();
   }
-  const RT_API_ATTRS Descriptor &name() const { return name_.descriptor(); }
+  RT_API_ATTRS const Descriptor &name() const { return name_.descriptor(); }
   RT_API_ATTRS std::uint64_t sizeInBytes() const { return sizeInBytes_; }
-  const RT_API_ATTRS Descriptor &uninstatiated() const {
+  RT_API_ATTRS const Descriptor &uninstatiated() const {
     return uninstantiated_.descriptor();
   }
-  const RT_API_ATTRS Descriptor &kindParameter() const {
+  RT_API_ATTRS const Descriptor &kindParameter() const {
     return kindParameter_.descriptor();
   }
-  const RT_API_ATTRS Descriptor &lenParameterKind() const {
+  RT_API_ATTRS const Descriptor &lenParameterKind() const {
     return lenParameterKind_.descriptor();
   }
-  const RT_API_ATTRS Descriptor &component() const {
+  RT_API_ATTRS const Descriptor &component() const {
     return component_.descriptor();
   }
-  const RT_API_ATTRS Descriptor &procPtr() const {
+  RT_API_ATTRS const Descriptor &procPtr() const {
     return procPtr_.descriptor();
   }
-  const RT_API_ATTRS Descriptor &special() const {
+  RT_API_ATTRS const Descriptor &special() const {
     return special_.descriptor();
   }
   RT_API_ATTRS bool hasParent() const { return hasParent_; }
@@ -241,14 +241,14 @@ class DerivedType {
     return lenParameterKind().Elements();
   }
 
-  const RT_API_ATTRS DerivedType *GetParentType() const;
+  RT_API_ATTRS const DerivedType *GetParentType() const;
 
   // Finds a data component by name in this derived type or its ancestors.
-  const RT_API_ATTRS Component *FindDataComponent(
+  RT_API_ATTRS const Component *FindDataComponent(
       const char *name, std::size_t nameLen) const;
 
   // O(1) look-up of special procedure bindings
-  const RT_API_ATTRS SpecialBinding *FindSpecialBinding(
+  RT_API_ATTRS const SpecialBinding *FindSpecialBinding(
       SpecialBinding::Which which) const {
     auto bitIndex{static_cast<std::uint32_t>(which)};
     auto bit{std::uint32_t{1} << bitIndex};
diff --git a/flang/runtime/unit-map.cpp b/flang/runtime/unit-map.cpp
index 6ebda5c45f0870..684a9b9e20b97a 100644
--- a/flang/runtime/unit-map.cpp
+++ b/flang/runtime/unit-map.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "unit-map.h"
+#include "flang/Common/optional.h"
 
 namespace Fortran::runtime::io {
 
@@ -29,7 +30,7 @@ void UnitMap::Initialize() {
 ExternalFileUnit &UnitMap::NewUnit(const Terminator &terminator) {
   CriticalSection critical{lock_};
   Initialize();
-  std::optional<int> n{freeNewUnits_.PopValue()};
+  Fortran::common::optional<int> n{freeNewUnits_.PopValue()};
   if (!n) {
     n = emergencyNewUnit_++;
   }
diff --git a/flang/runtime/unit.cpp b/flang/runtime/unit.cpp
index 09782d2f849278..82f0e68cc20a26 100644
--- a/flang/runtime/unit.cpp
+++ b/flang/runtime/unit.cpp
@@ -52,7 +52,7 @@ ExternalFileUnit *ExternalFileUnit::LookUpOrCreate(
 }
 
 ExternalFileUnit *ExternalFileUnit::LookUpOrCreateAnonymous(int unit,
-    Direction dir, std::optional<bool> isUnformatted,
+    Direction dir, Fortran::common::optional<bool> isUnformatted,
     const Terminator &terminator) {
   // Make sure that the returned anonymous unit has been opened
   // not just created in the unitMap.
@@ -95,9 +95,10 @@ ExternalFileUnit &ExternalFileUnit::NewUnit(
   return unit;
 }
 
-bool ExternalFileUnit::OpenUnit(std::optional<OpenStatus> status,
-    std::optional<Action> action, Position position, OwningPtr<char> &&newPath,
-    std::size_t newPathLength, Convert convert, IoErrorHandler &handler) {
+bool ExternalFileUnit::OpenUnit(Fortran::common::optional<OpenStatus> status,
+    Fortran::common::optional<Action> action, Position position,
+    OwningPtr<char> &&newPath, std::size_t newPathLength, Convert convert,
+    IoErrorHandler &handler) {
   if (convert == Convert::Unknown) {
     convert = executionEnvironment.conversion;
   }
@@ -176,9 +177,10 @@ bool ExternalFileUnit::OpenUnit(std::optional<OpenStatus> status,
   return impliedClose;
 }
 
-void ExternalFileUnit::OpenAnonymousUnit(std::optional<OpenStatus> status,
-    std::optional<Action> action, Position position, Convert convert,
-    IoErrorHandler &handler) {
+void ExternalFileUnit::OpenAnonymousUnit(
+    Fortran::common::optional<OpenStatus> status,
+    Fortran::common::optional<Action> action, Position position,
+    Convert convert, IoErrorHandler &handler) {
   // I/O to an unconnected unit reads/creates a local file, e.g. fort.7
   std::size_t pathMaxLen{32};
   auto path{SizedNew<char>{handler}(pathMaxLen)};
diff --git a/flang/runtime/unit.h b/flang/runtime/unit.h
index e3c8757645bb9e..fc5bead7e1d930 100644
--- a/flang/runtime/unit.h
+++ b/flang/runtime/unit.h
@@ -21,10 +21,10 @@
 #include "lock.h"
 #include "terminator.h"
 #include "flang/Common/constexpr-bitset.h"
+#include "flang/Common/optional.h"
 #include "flang/Runtime/memory.h"
 #include <cstdlib>
 #include <cstring>
-#include <optional>
 #include <variant>
 
 namespace Fortran::runtime::io {
@@ -55,7 +55,7 @@ class ExternalFileUnit : public ConnectionState,
   static ExternalFileUnit *LookUpOrCreate(
       int unit, const Terminator &, bool &wasExtant);
   static ExternalFileUnit *LookUpOrCreateAnonymous(int unit, Direction,
-      std::optional<bool> isUnformatted, const Terminator &);
+      Fortran::common::optional<bool> isUnformatted, const Terminator &);
   static ExternalFileUnit *LookUp(const char *path, std::size_t pathLen);
   static ExternalFileUnit &CreateNew(int unit, const Terminator &);
   static ExternalFileUnit *LookUpForClose(int unit);
@@ -64,11 +64,11 @@ class ExternalFileUnit : public ConnectionState,
   static void FlushAll(IoErrorHandler &);
 
   // Returns true if an existing unit was closed
-  bool OpenUnit(std::optional<OpenStatus>, std::optional<Action>, Position,
-      OwningPtr<char> &&path, std::size_t pathLength, Convert,
-      IoErrorHandler &);
-  void OpenAnonymousUnit(std::optional<OpenStatus>, std::optional<Action>,
-      Position, Convert, IoErrorHandler &);
+  bool OpenUnit(Fortran::common::optional<OpenStatus>,
+      Fortran::common::optional<Action>, Position, OwningPtr<char> &&path,
+      std::size_t pathLength, Convert, IoErrorHandler &);
+  void OpenAnonymousUnit(Fortran::common::optional<OpenStatus>,
+      Fortran::common::optional<Action>, Position, Convert, IoErrorHandler &);
   void CloseUnit(CloseStatus, IoErrorHandler &);
   void DestroyClosed();
 
@@ -169,7 +169,7 @@ class ExternalFileUnit : public ConnectionState,
       u_;
 
   // Points to the active alternative (if any) in u_ for use as a Cookie
-  std::optional<IoStatementState> io_;
+  Fortran::common::optional<IoStatementState> io_;
 
   // A stack of child I/O pseudo-units for defined I/O that have this
   // unit number.
@@ -211,7 +211,7 @@ class ChildIo {
       ChildUnformattedIoStatementState<Direction::Input>, InquireUnitState,
       ErroneousIoStatementState, ExternalMiscIoStatementState>
       u_;
-  std::optional<IoStatementState> io_;
+  Fortran::common::optional<IoStatementState> io_;
 };
 
 } // namespace Fortran::runtime::io
diff --git a/flang/runtime/utf.cpp b/flang/runtime/utf.cpp
index 8f59ddbb196632..e9ccc2c04b6b07 100644
--- a/flang/runtime/utf.cpp
+++ b/flang/runtime/utf.cpp
@@ -40,7 +40,7 @@ const std::uint8_t UTF8FirstByteTable[256]{
 // clang-format on
 
 // Non-minimal encodings are accepted.
-std::optional<char32_t> DecodeUTF8(const char *p0) {
+Fortran::common::optional<char32_t> DecodeUTF8(const char *p0) {
   const std::uint8_t *p{reinterpret_cast<const std::uint8_t *>(p0)};
   std::size_t bytes{MeasureUTF8Bytes(*p0)};
   if (bytes == 1) {
@@ -50,7 +50,7 @@ std::optional<char32_t> DecodeUTF8(const char *p0) {
     for (std::size_t j{1}; j < bytes; ++j) {
       std::uint8_t next{p[j]};
       if (next < 0x80 || next > 0xbf) {
-        return std::nullopt;
+        return Fortran::common::nullopt;
       }
       result = (result << 6) | (next & 0x3f);
     }
@@ -58,7 +58,7 @@ std::optional<char32_t> DecodeUTF8(const char *p0) {
       return static_cast<char32_t>(result);
     }
   }
-  return std::nullopt;
+  return Fortran::common::nullopt;
 }
 
 std::size_t EncodeUTF8(char *p0, char32_t ucs) {
diff --git a/flang/runtime/utf.h b/flang/runtime/utf.h
index 6d9943bb6b8a2f..2b4e4f9a188758 100644
--- a/flang/runtime/utf.h
+++ b/flang/runtime/utf.h
@@ -41,9 +41,9 @@
 #ifndef FORTRAN_RUNTIME_UTF_H_
 #define FORTRAN_RUNTIME_UTF_H_
 
+#include "flang/Common/optional.h"
 #include <cstddef>
 #include <cstdint>
-#include <optional>
 
 namespace Fortran::runtime {
 
@@ -58,7 +58,7 @@ static constexpr std::size_t maxUTF8Bytes{7};
 
 // Ensure that all bytes are present in sequence in the input buffer
 // before calling; use MeasureUTF8Bytes(first byte) to count them.
-std::optional<char32_t> DecodeUTF8(const char *);
+Fortran::common::optional<char32_t> DecodeUTF8(const char *);
 
 // Ensure that at least maxUTF8Bytes remain in the output
 // buffer before calling.
diff --git a/flang/test/Driver/bbc-mlir-pass-pipeline.f90 b/flang/test/Driver/bbc-mlir-pass-pipeline.f90
index 243a620a9fd003..b25edf3599175e 100644
--- a/flang/test/Driver/bbc-mlir-pass-pipeline.f90
+++ b/flang/test/Driver/bbc-mlir-pass-pipeline.f90
@@ -38,9 +38,12 @@
 ! CHECK-NEXT:   (S) 0 num-cse'd - Number of operations CSE'd
 ! CHECK-NEXT:   (S) 0 num-dce'd - Number of operations DCE'd
 
+! CHECK-NEXT: Pipeline Collection : ['func.func', 'omp.reduction.declare']
 ! CHECK-NEXT: 'func.func' Pipeline
 ! CHECK-NEXT:   PolymorphicOpConversion
-! CHECK-NEXT:   CFGConversion
+! CHECK-NEXT:   CFGConversionOnFunc
+! CHECK-NEXT: 'omp.reduction.declare' Pipeline
+! CHECK-NEXT:   CFGConversionOnReduction
 
 ! CHECK-NEXT: SCFToControlFlow
 ! CHECK-NEXT: Canonicalizer
diff --git a/flang/test/Driver/cuda-option.f90 b/flang/test/Driver/cuda-option.f90
index 112e1cb6c77f8c..562f8683b0ff7d 100644
--- a/flang/test/Driver/cuda-option.f90
+++ b/flang/test/Driver/cuda-option.f90
@@ -1,6 +1,6 @@
 ! Test -fcuda option
-! RUN: %flang -fc1 -cpp -x cuda -fdebug-unparse %s -o - | FileCheck %s
-! RUN: not %flang -fc1 -cpp %s -o - 2>&1 | FileCheck %s --check-prefix=ERROR
+! RUN: %flang_fc1 -cpp -x cuda -fdebug-unparse %s -o - | FileCheck %s
+! RUN: not %flang_fc1 -cpp %s -o - 2>&1 | FileCheck %s --check-prefix=ERROR
 program main
 #if _CUDA
   integer :: var = _CUDA
diff --git a/flang/test/Driver/driver-help-hidden.f90 b/flang/test/Driver/driver-help-hidden.f90
index 44dbac44772b29..bf3660d57cbb4f 100644
--- a/flang/test/Driver/driver-help-hidden.f90
+++ b/flang/test/Driver/driver-help-hidden.f90
@@ -52,8 +52,6 @@
 ! CHECK-NEXT:                         Do not use HLFIR lowering (deprecated)
 ! CHECK-NEXT: -flang-experimental-hlfir
 ! CHECK-NEXT:                         Use HLFIR lowering (experimental)
-! CHECK-NEXT: -flang-experimental-polymorphism
-! CHECK-NEXT:                         Enable Fortran 2003 polymorphism (experimental)
 ! CHECK-NEXT: -flarge-sizes           Use INTEGER(KIND=8) for the result type in size-related intrinsics
 ! CHECK-NEXT: -flogical-abbreviations Enable logical abbreviations
 ! CHECK-NEXT: -flto=auto              Enable LTO in 'full' mode
diff --git a/flang/test/Driver/flang-experimental-polymorphism-flag.f90 b/flang/test/Driver/flang-experimental-polymorphism-flag.f90
deleted file mode 100644
index 095c1cc929e67b..00000000000000
--- a/flang/test/Driver/flang-experimental-polymorphism-flag.f90
+++ /dev/null
@@ -1,10 +0,0 @@
-! Test -flang-experimental-hlfir flag
-! RUN: %flang_fc1 -flang-experimental-polymorphism -emit-fir -o - %s | FileCheck %s
-! RUN: %flang_fc1 -emit-fir -o - %s 2>&1 | FileCheck %s --check-prefix NO-POLYMORPHISM
-
-! CHECK: func.func @_QPtest(%{{.*}}: !fir.class<none> {fir.bindc_name = "poly"})
-subroutine test(poly)
-  class(*) :: poly
-end subroutine test
-
-! NO-POLYMORPHISM: func.func @_QPtest
diff --git a/flang/test/Driver/frontend-forwarding.f90 b/flang/test/Driver/frontend-forwarding.f90
index 8e9c9b78c3c10a..eac9773ce25c77 100644
--- a/flang/test/Driver/frontend-forwarding.f90
+++ b/flang/test/Driver/frontend-forwarding.f90
@@ -17,7 +17,6 @@
 ! RUN:     -fomit-frame-pointer \
 ! RUN:     -fpass-plugin=Bye%pluginext \
 ! RUN:     -fversion-loops-for-stride \
-! RUN:     -flang-experimental-polymorphism \
 ! RUN:     -flang-experimental-hlfir \
 ! RUN:     -flang-deprecated-no-hlfir \
 ! RUN:     -fno-ppc-native-vector-element-order \
@@ -49,7 +48,6 @@
 ! CHECK: "-fconvert=little-endian"
 ! CHECK: "-fpass-plugin=Bye
 ! CHECK: "-fversion-loops-for-stride"
-! CHECK: "-flang-experimental-polymorphism"
 ! CHECK: "-flang-experimental-hlfir"
 ! CHECK: "-flang-deprecated-no-hlfir"
 ! CHECK: "-fno-ppc-native-vector-element-order"
diff --git a/flang/test/Driver/mlir-debug-pass-pipeline.f90 b/flang/test/Driver/mlir-debug-pass-pipeline.f90
index 45b1717d7187db..a865b170c3d417 100644
--- a/flang/test/Driver/mlir-debug-pass-pipeline.f90
+++ b/flang/test/Driver/mlir-debug-pass-pipeline.f90
@@ -58,10 +58,12 @@
 ! ALL-NEXT:   (S) 0 num-cse'd - Number of operations CSE'd
 ! ALL-NEXT:   (S) 0 num-dce'd - Number of operations DCE'd
 
-! ALL-NEXT: 'func.func' Pipeline
-! ALL-NEXT:   PolymorphicOpConversion
-! ALL-NEXT:   CFGConversion
-
+! ALL-NEXT: Pipeline Collection : ['func.func', 'omp.reduction.declare']
+! ALL-NEXT:   'func.func' Pipeline
+! ALL-NEXT:     PolymorphicOpConversion
+! ALL-NEXT:     CFGConversionOnFunc
+! ALL-NEXT:   'omp.reduction.declare' Pipeline
+! ALL-NEXT:     CFGConversionOnReduction
 ! ALL-NEXT: SCFToControlFlow
 ! ALL-NEXT: Canonicalizer
 ! ALL-NEXT: SimplifyRegionLite
@@ -72,9 +74,9 @@
 
 ! ALL-NEXT: Pipeline Collection : ['fir.global', 'func.func']
 ! ALL-NEXT:   'fir.global' Pipeline
-! ALL-NEXT:    AbstractResultOnGlobalOpt
-! ALL-NEXT:  'func.func' Pipeline
-! ALL-NEXT:    AbstractResultOnFuncOpt
+! ALL-NEXT:   AbstractResultOnGlobalOpt
+! ALL-NEXT:   'func.func' Pipeline
+! ALL-NEXT:   AbstractResultOnFuncOpt
 
 ! ALL-NEXT: CodeGenRewrite
 ! ALL-NEXT:   (S) 0 num-dce'd - Number of operations eliminated
diff --git a/flang/test/Driver/mlir-pass-pipeline.f90 b/flang/test/Driver/mlir-pass-pipeline.f90
index 3d8c42f123e2eb..ee3f41a15ca481 100644
--- a/flang/test/Driver/mlir-pass-pipeline.f90
+++ b/flang/test/Driver/mlir-pass-pipeline.f90
@@ -1,8 +1,8 @@
 ! Test the MLIR pass pipeline
 
-! RUN: %flang_fc1 -S -mmlir --mlir-pass-statistics -mmlir --mlir-pass-statistics-display=pipeline -o /dev/null %s 2>&1 | FileCheck --check-prefixes=ALL %s
+! RUN: %flang_fc1 -S -mmlir --mlir-pass-statistics -mmlir --mlir-pass-statistics-display=pipeline -o /dev/null %s 2>&1 | FileCheck --check-prefixes=ALL,NOTO2 %s
 ! -O0 is the default:
-! RUN: %flang_fc1 -S -mmlir --mlir-pass-statistics -mmlir --mlir-pass-statistics-display=pipeline %s -O0 -o /dev/null 2>&1 | FileCheck --check-prefixes=ALL %s
+! RUN: %flang_fc1 -S -mmlir --mlir-pass-statistics -mmlir --mlir-pass-statistics-display=pipeline %s -O0 -o /dev/null 2>&1 | FileCheck --check-prefixes=ALL,NOTO2 %s
 ! RUN: %flang_fc1 -S -mmlir --mlir-pass-statistics -mmlir --mlir-pass-statistics-display=pipeline %s -O2 -o /dev/null 2>&1 | FileCheck --check-prefixes=ALL,O2 %s
 
 ! REQUIRES: asserts
@@ -49,11 +49,15 @@
 ! ALL-NEXT:   (S) 0 num-cse'd - Number of operations CSE'd
 ! ALL-NEXT:   (S) 0 num-dce'd - Number of operations DCE'd
 
-! ALL-NEXT: 'func.func' Pipeline
-! ALL-NEXT:   PolymorphicOpConversion
+! O2-NEXT:    'func.func' Pipeline
+! O2-NEXT:      PolymorphicOpConversion
 ! O2-NEXT:  AddAliasTags
-! O2-NEXT:  'func.func' Pipeline
-! ALL-NEXT:   CFGConversion
+! ALL-NEXT: Pipeline Collection : ['func.func', 'omp.reduction.declare']
+! ALL-NEXT:    'func.func' Pipeline
+! NOTO2-NEXT:      PolymorphicOpConversion
+! ALL-NEXT:      CFGConversionOnFunc
+! ALL-NEXT:   'omp.reduction.declare' Pipeline
+! ALL-NEXT:      CFGConversionOnReduction
 
 ! ALL-NEXT: SCFToControlFlow
 ! ALL-NEXT: Canonicalizer
diff --git a/flang/test/Evaluate/fold-scale.f90 b/flang/test/Evaluate/fold-scale.f90
index a5c5f081668b16..6d767a9f4c9a3c 100644
--- a/flang/test/Evaluate/fold-scale.f90
+++ b/flang/test/Evaluate/fold-scale.f90
@@ -7,5 +7,6 @@ module m
   logical, parameter :: test_4 = sign(1.0, scale(0.0, 0)) == 1.0
   logical, parameter :: test_5 = scale(1.0, -1) == 0.5
   logical, parameter :: test_6 = scale(2.0, -1) == 1.0
+  logical, parameter :: test_7 = scale(huge(0.d0), -1200) == 1.0440487148797638d-53
+  logical, parameter :: test_8 = scale(tiny(0.d0), 1200) == 3.8312388521647221d053
 end module
-
diff --git a/flang/test/Fir/array-value-copy-2.fir b/flang/test/Fir/array-value-copy-2.fir
index 21b340af10c6b8..cb8d6ca2b05540 100644
--- a/flang/test/Fir/array-value-copy-2.fir
+++ b/flang/test/Fir/array-value-copy-2.fir
@@ -1,5 +1,5 @@
-// RUN: fir-opt --array-value-copy --cfg-conversion %s | FileCheck %s
-// RUN: fir-opt --array-value-copy="optimize-conflicts=true" --cfg-conversion %s | FileCheck %s
+// RUN: fir-opt --array-value-copy --cfg-conversion-on-func-opt %s | FileCheck %s
+// RUN: fir-opt --array-value-copy="optimize-conflicts=true" --cfg-conversion-on-func-opt %s | FileCheck %s
 
 // CHECK-LABEL: func @_QPslice1(
 // CHECK-NOT: fir.allocmem
diff --git a/flang/test/Fir/basic-program.fir b/flang/test/Fir/basic-program.fir
index d8a9e74c318ce1..516946bd4fb46a 100644
--- a/flang/test/Fir/basic-program.fir
+++ b/flang/test/Fir/basic-program.fir
@@ -60,8 +60,11 @@ func.func @_QQmain() {
 
 // PASSES-NEXT: AddAliasTags
 
+// PASSES-NEXT: Pipeline Collection : ['func.func', 'omp.reduction.declare']
 // PASSES-NEXT: 'func.func' Pipeline
-// PASSES-NEXT:   CFGConversion
+// PASSES-NEXT:   CFGConversionOnFunc
+// PASSES-NEXT: 'omp.reduction.declare' Pipeline
+// PASSES-NEXT:   CFGConversionOnReduction
 
 // PASSES-NEXT: SCFToControlFlow
 // PASSES-NEXT: Canonicalizer
diff --git a/flang/test/Fir/convert-to-llvm-openmp-and-fir.fir b/flang/test/Fir/convert-to-llvm-openmp-and-fir.fir
index a1fc614334dbce..8d9bdca4bf5446 100644
--- a/flang/test/Fir/convert-to-llvm-openmp-and-fir.fir
+++ b/flang/test/Fir/convert-to-llvm-openmp-and-fir.fir
@@ -1,4 +1,4 @@
-// RUN: fir-opt --split-input-file --cfg-conversion --fir-to-llvm-ir="target=aarch64-unknown-linux-gnu" %s | FileCheck %s
+// RUN: fir-opt --split-input-file --cfg-conversion-on-func-opt --fir-to-llvm-ir="target=aarch64-unknown-linux-gnu" %s | FileCheck %s
 
 func.func @_QPsb1(%arg0: !fir.ref<i32> {fir.bindc_name = "n"}, %arg1: !fir.ref<!fir.array<?xi32>> {fir.bindc_name = "arr"}) {
   %c1_i64 = arith.constant 1 : i64
diff --git a/flang/test/Fir/dispatch.f90 b/flang/test/Fir/dispatch.f90
index 1dc71038813d87..1479d611b986aa 100644
--- a/flang/test/Fir/dispatch.f90
+++ b/flang/test/Fir/dispatch.f90
@@ -1,5 +1,5 @@
-! RUN: bbc -polymorphic-type -emit-hlfir %s -o - | fir-opt --fir-polymorphic-op | FileCheck %s
-! RUN: bbc -polymorphic-type -emit-hlfir %s -o - | FileCheck %s --check-prefix=BT
+! RUN: bbc -emit-hlfir %s -o - | fir-opt --fir-polymorphic-op | FileCheck %s
+! RUN: bbc -emit-hlfir %s -o - | FileCheck %s --check-prefix=BT
 
 ! Tests codegen of fir.dispatch operation. This test is intentionally run from
 ! Fortran through bbc and tco so we have all the binding tables lowered to FIR
diff --git a/flang/test/Fir/loop01.fir b/flang/test/Fir/loop01.fir
index 72ca1c3989e453..c849797b969eba 100644
--- a/flang/test/Fir/loop01.fir
+++ b/flang/test/Fir/loop01.fir
@@ -1,4 +1,4 @@
-// RUN: fir-opt --split-input-file --cfg-conversion %s | FileCheck %s
+// RUN: fir-opt --split-input-file --cfg-conversion-on-func-opt %s | FileCheck %s
 
 func.func @x(%lb : index, %ub : index, %step : index, %b : i1, %addr : !fir.ref<index>) {
   fir.do_loop %iv = %lb to %ub step %step unordered {
diff --git a/flang/test/Fir/loop02.fir b/flang/test/Fir/loop02.fir
index 50948e0e7aa6b5..8918666f0b3460 100644
--- a/flang/test/Fir/loop02.fir
+++ b/flang/test/Fir/loop02.fir
@@ -1,5 +1,5 @@
-// RUN: fir-opt --cfg-conversion="always-execute-loop-body=true" %s | FileCheck %s
-// RUN: fir-opt --cfg-conversion %s | FileCheck %s --check-prefix=NOOPT
+// RUN: fir-opt --cfg-conversion-on-func-opt="always-execute-loop-body=true" %s | FileCheck %s
+// RUN: fir-opt --cfg-conversion-on-func-opt %s | FileCheck %s --check-prefix=NOOPT
 
 func.func @x(%addr : !fir.ref<index>) {
   %bound = arith.constant 452 : index
diff --git a/flang/test/Fir/omp-reduction-embox-codegen.fir b/flang/test/Fir/omp-reduction-embox-codegen.fir
new file mode 100644
index 00000000000000..24bde536667b50
--- /dev/null
+++ b/flang/test/Fir/omp-reduction-embox-codegen.fir
@@ -0,0 +1,36 @@
+// RUN: tco %s | FileCheck %s
+
+// the fir.embox in the init region is turned into an alloca for the box. Test
+// that CodeGen.cpp knows where to place an alloca when it is inside of an
+// omp.reduction.declare
+
+// regretably this has to be nonsense IR because we need the subsequent patches
+// to process anything useful
+
+omp.reduction.declare @test_reduction : !fir.ref<!fir.box<i32>> init {
+^bb0(%arg0: !fir.ref<!fir.box<i32>>):
+  %0 = fir.alloca !fir.box<i32>
+  %1 = fir.alloca i32
+  %2 = fir.embox %1 : (!fir.ref<i32>) -> !fir.box<i32>
+
+  // use the embox for something so it isn't removed
+  fir.store %2 to %0 : !fir.ref<!fir.box<i32>>
+
+  omp.yield(%0 : !fir.ref<!fir.box<i32>>)
+} combiner {
+^bb0(%arg0: !fir.ref<!fir.box<i32>>, %arg1: !fir.ref<!fir.box<i32>>):
+  %0 = fir.undefined !fir.ref<!fir.box<i32>>
+  omp.yield(%0 : !fir.ref<!fir.box<i32>>)
+}
+
+func.func @_QQmain() attributes {fir.bindc_name = "reduce"} {
+  %4 = fir.alloca !fir.box<i32>
+  omp.parallel byref reduction(@test_reduction %4 -> %arg0 : !fir.ref<!fir.box<i32>>) {
+    omp.terminator
+  }
+  return
+}
+
+// basically we are testing that there isn't a crash
+// CHECK-LABEL: define void @_QQmain
+// CHECK-NEXT:    alloca { ptr, i64, i32, i8, i8, i8, i8 }, i64 1, align 8
diff --git a/flang/test/HLFIR/assumed-type-actual-args.f90 b/flang/test/HLFIR/assumed-type-actual-args.f90
index 58c282b6ab1884..dbdfc1785ce9d7 100644
--- a/flang/test/HLFIR/assumed-type-actual-args.f90
+++ b/flang/test/HLFIR/assumed-type-actual-args.f90
@@ -1,6 +1,6 @@
 ! Test lowering to FIR of actual arguments that are assumed type
 ! variables (Fortran 2018 7.3.2.2 point 3).
-! RUN: bbc --polymorphic-type -emit-hlfir -o - %s | FileCheck %s
+! RUN: bbc -emit-hlfir -o - %s | FileCheck %s
 
 subroutine test1(x)
   interface
diff --git a/flang/test/HLFIR/boxchar_emboxing.f90 b/flang/test/HLFIR/boxchar_emboxing.f90
index 3e9ccde6babab5..fbc41bbea72d8a 100644
--- a/flang/test/HLFIR/boxchar_emboxing.f90
+++ b/flang/test/HLFIR/boxchar_emboxing.f90
@@ -1,4 +1,4 @@
-! RUN: bbc -polymorphic-type -emit-hlfir %s -o - | FileCheck %s
+! RUN: bbc -emit-hlfir %s -o - | FileCheck %s
 
 ! CHECK-LABEL:   func.func @_QPtest1(
 ! CHECK-SAME:                        %[[VAL_0:.*]]: !fir.class<none> {fir.bindc_name = "x"}) {
diff --git a/flang/test/HLFIR/call_with_poly_dummy.f90 b/flang/test/HLFIR/call_with_poly_dummy.f90
index af6876e26603ed..00a795c5b1fbed 100644
--- a/flang/test/HLFIR/call_with_poly_dummy.f90
+++ b/flang/test/HLFIR/call_with_poly_dummy.f90
@@ -1,4 +1,4 @@
-! RUN: bbc -polymorphic-type -emit-hlfir %s -o - | FileCheck %s
+! RUN: bbc -emit-hlfir %s -o - | FileCheck %s
 
 ! Test passing arguments to subprograms with polymorphic dummy arguments.
 
diff --git a/flang/test/Lower/CUDA/cuda-kernel-calls.cuf b/flang/test/Lower/CUDA/cuda-kernel-calls.cuf
index c1e89d1978e4c2..f4327b3261751f 100644
--- a/flang/test/Lower/CUDA/cuda-kernel-calls.cuf
+++ b/flang/test/Lower/CUDA/cuda-kernel-calls.cuf
@@ -18,15 +18,17 @@ contains
 ! CHECK: %[[A:.*]]:2 = hlfir.declare %{{.*}} {cuda_attr = #fir.cuda<device>, uniq_name = "_QMtest_callFhostEa"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
 
     call dev_kernel0<<<10, 20>>>()
-! CHECK: fir.cuda_kernel_launch @_QMtest_callPdev_kernel0<<<%c10{{.*}}, %c1{{.*}}, %c20{{.*}}, %c1{{.*}}, %c1{{.*}}>>>()
+! CHECK: fir.cuda_kernel_launch @_QMtest_callPdev_kernel0<<<%c10{{.*}}, %c1{{.*}}, %c1{{.*}}, %c20{{.*}}, %c1{{.*}}, %c1{{.*}}>>>()
 
-    call dev_kernel0<<< __builtin_dim3(1,1), __builtin_dim3(32,1,1) >>>
+    call dev_kernel0<<< __builtin_dim3(1,1,4), __builtin_dim3(32,1,1) >>>
 ! CHECK: %[[ADDR_DIM3_GRID:.*]] = fir.address_of(@_QQro._QM__fortran_builtinsT__builtin_dim3.{{.*}}) : !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>
 ! CHECK: %[[DIM3_GRID:.*]]:2 = hlfir.declare %[[ADDR_DIM3_GRID]] {fortran_attrs = #fir.var_attrs<parameter>, uniq_name = "_QQro._QM__fortran_builtinsT__builtin_dim3.0"} : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>) -> (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>, !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>)
 ! CHECK: %[[GRID_X:.*]] = hlfir.designate %[[DIM3_GRID]]#1{"x"}   : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>) -> !fir.ref<i32>
 ! CHECK: %[[GRID_X_LOAD:.*]] = fir.load %[[GRID_X]] : !fir.ref<i32>
 ! CHECK: %[[GRID_Y:.*]] = hlfir.designate %[[DIM3_GRID]]#1{"y"}   : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>) -> !fir.ref<i32>
 ! CHECK: %[[GRID_Y_LOAD:.*]] = fir.load %[[GRID_Y]] : !fir.ref<i32>
+! CHECK: %[[GRID_Z:.*]] = hlfir.designate %[[DIM3_GRID]]#1{"z"}   : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>) -> !fir.ref<i32>
+! CHECK: %[[GRID_Z_LOAD:.*]] = fir.load %[[GRID_Z]] : !fir.ref<i32>
 ! CHECK: %[[ADDR_DIM3_BLOCK:.*]] = fir.address_of(@_QQro._QM__fortran_builtinsT__builtin_dim3.{{.*}}) : !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>
 ! CHECK: %[[DIM3_BLOCK:.*]]:2 = hlfir.declare %[[ADDR_DIM3_BLOCK]] {fortran_attrs = #fir.var_attrs<parameter>, uniq_name = "_QQro._QM__fortran_builtinsT__builtin_dim3.1"} : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>) -> (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>, !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>)
 ! CHECK: %[[BLOCK_X:.*]] = hlfir.designate %[[DIM3_BLOCK]]#1{"x"}   : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>) -> !fir.ref<i32>
@@ -35,16 +37,16 @@ contains
 ! CHECK: %[[BLOCK_Y_LOAD:.*]] = fir.load %[[BLOCK_Y]] : !fir.ref<i32>
 ! CHECK: %[[BLOCK_Z:.*]] = hlfir.designate %[[DIM3_BLOCK]]#1{"z"}   : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>) -> !fir.ref<i32>
 ! CHECK: %[[BLOCK_Z_LOAD:.*]] = fir.load %[[BLOCK_Z]] : !fir.ref<i32>
-! CHECK: fir.cuda_kernel_launch @_QMtest_callPdev_kernel0<<<%[[GRID_X_LOAD]], %[[GRID_Y_LOAD]], %[[BLOCK_X_LOAD]], %[[BLOCK_Y_LOAD]], %[[BLOCK_Z_LOAD]]>>>()
+! CHECK: fir.cuda_kernel_launch @_QMtest_callPdev_kernel0<<<%[[GRID_X_LOAD]], %[[GRID_Y_LOAD]], %[[GRID_Z_LOAD]], %[[BLOCK_X_LOAD]], %[[BLOCK_Y_LOAD]], %[[BLOCK_Z_LOAD]]>>>()
 
     call dev_kernel0<<<10, 20, 2>>>()
-! CHECK: fir.cuda_kernel_launch @_QMtest_callPdev_kernel0<<<%c10{{.*}}, %c1{{.*}}, %c20{{.*}}, %c1{{.*}}, %c1{{.*}}, %c2{{.*}}>>>() 
+! CHECK: fir.cuda_kernel_launch @_QMtest_callPdev_kernel0<<<%c10{{.*}}, %c1{{.*}}, %c1{{.*}}, %c20{{.*}}, %c1{{.*}}, %c1{{.*}}, %c2{{.*}}>>>() 
 
     call dev_kernel0<<<10, 20, 2, 0>>>()
-! CHECK: fir.cuda_kernel_launch @_QMtest_callPdev_kernel0<<<%c10{{.*}}, %c1{{.*}}, %c20{{.*}}, %c1{{.*}}, %c1{{.*}}, %c2{{.*}}, %c0{{.*}}>>>()
+! CHECK: fir.cuda_kernel_launch @_QMtest_callPdev_kernel0<<<%c10{{.*}}, %c1{{.*}}, %c1{{.*}}, %c20{{.*}}, %c1{{.*}}, %c1{{.*}}, %c2{{.*}}, %c0{{.*}}>>>()
 
     call dev_kernel1<<<1, 32>>>(a)
-! CHECK: fir.cuda_kernel_launch @_QMtest_callPdev_kernel1<<<%c1{{.*}}, %c1{{.*}}, %c32{{.*}}, %c1{{.*}}, %c1{{.*}}>>>(%1#1 : !fir.ref<f32>)
+! CHECK: fir.cuda_kernel_launch @_QMtest_callPdev_kernel1<<<%c1{{.*}}, %c1{{.*}}, %c1{{.*}}, %c32{{.*}}, %c1{{.*}}, %c1{{.*}}>>>(%1#1) : (!fir.ref<f32>)
   end
 
 end
diff --git a/flang/test/Lower/CUDA/cuda-kernel-loop-directive.cuf b/flang/test/Lower/CUDA/cuda-kernel-loop-directive.cuf
index db628fe756b952..6179e609db383c 100644
--- a/flang/test/Lower/CUDA/cuda-kernel-loop-directive.cuf
+++ b/flang/test/Lower/CUDA/cuda-kernel-loop-directive.cuf
@@ -42,10 +42,20 @@ subroutine sub1()
 ! CHECK: fir.cuda_kernel<<<%c1{{.*}}, (%c256{{.*}}, %c1{{.*}})>>> (%{{.*}} : index, %{{.*}} : index) = (%{{.*}}, %{{.*}} : index, index) to (%{{.*}}, %{{.*}} : index, index) step (%{{.*}}, %{{.*}} : index, index)
 ! CHECK: {n = 2 : i64}
 
-! TODO: currently these trigger error in the parser
-! !$cuf kernel do(2) <<< (1,*), (256,1) >>>
-! !$cuf kernel do(2) <<< (*,*), (32,4) >>>
-end
-
+  !$cuf kernel do(2) <<< (1,*), (256,1) >>>
+  do i = 1, n
+    do j = 1, n
+      c(i,j) = c(i,j) * d(i,j)
+    end do
+  end do
+! CHECK: fir.cuda_kernel<<<(%c1{{.*}}, %c0{{.*}}), (%c256{{.*}}, %c1{{.*}})>>> (%{{.*}} : index, %{{.*}} : index) = (%{{.*}}, %{{.*}} : index, index) to (%{{.*}}, %{{.*}} : index, index)  step (%{{.*}}, %{{.*}} : index, index)
 
+!$cuf kernel do(2) <<< (*,*), (32,4) >>>
+  do i = 1, n
+    do j = 1, n
+      c(i,j) = c(i,j) * d(i,j)
+    end do
+  end do
 
+! CHECK: fir.cuda_kernel<<<*, (%c32{{.*}}, %c4{{.*}})>>> (%{{.*}} : index, %{{.*}} : index) = (%{{.*}}, %{{.*}} : index, index) to (%{{.*}}, %{{.*}} : index, index)  step (%{{.*}}, %{{.*}} : index, index)
+end
diff --git a/flang/test/Lower/CUDA/cuda-module-use.cuf b/flang/test/Lower/CUDA/cuda-module-use.cuf
index f54083b026ee88..47d3805065a5a7 100644
--- a/flang/test/Lower/CUDA/cuda-module-use.cuf
+++ b/flang/test/Lower/CUDA/cuda-module-use.cuf
@@ -5,7 +5,7 @@
 
 subroutine sub1()
   use cuf_mod
-  md = 1.0
+!  md = 1.0 ! currently a TODO
 end
 
 ! CHECK-LABEL: func.func @_QPsub1()
diff --git a/flang/test/Lower/HLFIR/actual_target_for_dummy_pointer.f90 b/flang/test/Lower/HLFIR/actual_target_for_dummy_pointer.f90
index 76ed237fb6720a..129aa49b811db0 100644
--- a/flang/test/Lower/HLFIR/actual_target_for_dummy_pointer.f90
+++ b/flang/test/Lower/HLFIR/actual_target_for_dummy_pointer.f90
@@ -1,5 +1,5 @@
 ! Test actual TARGET argument association to dummy POINTER:
-! RUN: bbc -emit-hlfir --polymorphic-type -o - -I nowhere %s 2>&1 | FileCheck %s
+! RUN: bbc -emit-hlfir -o - -I nowhere %s 2>&1 | FileCheck %s
 
 module target_to_pointer_types
   type t1
diff --git a/flang/test/Lower/HLFIR/allocatable-return.f90 b/flang/test/Lower/HLFIR/allocatable-return.f90
index d652b2f9f767dc..e1bac34ef1a0ad 100644
--- a/flang/test/Lower/HLFIR/allocatable-return.f90
+++ b/flang/test/Lower/HLFIR/allocatable-return.f90
@@ -1,4 +1,4 @@
-! RUN: bbc -emit-hlfir --polymorphic-type -I nowhere %s -o - | FileCheck %s
+! RUN: bbc -emit-hlfir -I nowhere %s -o - | FileCheck %s
 
 ! Test allocatable return.
 ! Allocatable arrays must have default runtime lbounds after the return.
diff --git a/flang/test/Lower/HLFIR/array-ctor-derived.f90 b/flang/test/Lower/HLFIR/array-ctor-derived.f90
index 21a3eb78a19952..111225462a4bbe 100644
--- a/flang/test/Lower/HLFIR/array-ctor-derived.f90
+++ b/flang/test/Lower/HLFIR/array-ctor-derived.f90
@@ -1,5 +1,5 @@
 ! Test lowering of derived type array constructors to HLFIR.
-! RUN: bbc -emit-hlfir --polymorphic-type -o - %s | FileCheck %s
+! RUN: bbc -emit-hlfir -o - %s | FileCheck %s
 
 module types
   type simple
diff --git a/flang/test/Lower/HLFIR/call-sequence-associated-descriptors.f90 b/flang/test/Lower/HLFIR/call-sequence-associated-descriptors.f90
new file mode 100644
index 00000000000000..7a2ea5cc14b644
--- /dev/null
+++ b/flang/test/Lower/HLFIR/call-sequence-associated-descriptors.f90
@@ -0,0 +1,309 @@
+! Test lowering of sequence associated arguments (F'2023 15.5.2.12) passed
+! by descriptor. The descriptor on the caller side is prepared according to
+! the dummy argument shape.
+! RUN: bbc -emit-hlfir -o - %s | FileCheck %s
+
+module bindc_seq_assoc
+  interface
+    subroutine takes_char(x, n) bind(c)
+      integer :: n
+      character(*) :: x(n)
+    end subroutine
+    subroutine takes_char_assumed_size(x) bind(c)
+      character(*) :: x(10, *)
+    end subroutine
+    subroutine takes_optional_char(x, n) bind(c)
+      integer :: n
+      character(*), optional :: x(n)
+    end subroutine
+  end interface
+contains
+  subroutine test_char_1(x)
+    character(*) :: x(10, 20)
+    call takes_char(x, 100)
+  end subroutine
+! CHECK-LABEL:   func.func @_QMbindc_seq_assocPtest_char_1(
+! CHECK:           %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_2:.*]](%[[VAL_5:.*]]) typeparams %[[VAL_1:.*]]#1 {uniq_name = "_QMbindc_seq_assocFtest_char_1Ex"} : (!fir.ref<!fir.array<10x20x!fir.char<1,?>>>, !fir.shape<2>, index) -> (!fir.box<!fir.array<10x20x!fir.char<1,?>>>, !fir.ref<!fir.array<10x20x!fir.char<1,?>>>)
+! CHECK:           %[[VAL_7:.*]] = arith.constant 100 : i32
+! CHECK:           %[[VAL_8:.*]] = arith.constant 0 : index
+! CHECK:           %[[VAL_9:.*]] = fir.shift %[[VAL_8]], %[[VAL_8]] : (index, index) -> !fir.shift<2>
+! CHECK:           %[[VAL_10:.*]] = fir.rebox %[[VAL_6]]#0(%[[VAL_9]]) : (!fir.box<!fir.array<10x20x!fir.char<1,?>>>, !fir.shift<2>) -> !fir.box<!fir.array<10x20x!fir.char<1,?>>>
+! CHECK:           %[[VAL_11:.*]]:3 = hlfir.associate %[[VAL_7]] {adapt.valuebyref} : (i32) -> (!fir.ref<i32>, !fir.ref<i32>, i1)
+! CHECK:           %[[VAL_12:.*]] = arith.constant 0 : index
+! CHECK:           %[[VAL_13:.*]]:2 = hlfir.declare %[[VAL_11]]#1 {uniq_name = "_QMbindc_seq_assocFtakes_charEn"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_14:.*]] = fir.load %[[VAL_13]]#0 : !fir.ref<i32>
+! CHECK:           %[[VAL_15:.*]] = fir.convert %[[VAL_14]] : (i32) -> i64
+! CHECK:           %[[VAL_16:.*]] = arith.constant 1 : i64
+! CHECK:           %[[VAL_17:.*]] = arith.subi %[[VAL_15]], %[[VAL_16]] : i64
+! CHECK:           %[[VAL_18:.*]] = arith.constant 1 : i64
+! CHECK:           %[[VAL_19:.*]] = arith.addi %[[VAL_17]], %[[VAL_18]] : i64
+! CHECK:           %[[VAL_20:.*]] = fir.convert %[[VAL_19]] : (i64) -> index
+! CHECK:           %[[VAL_21:.*]] = arith.constant 0 : index
+! CHECK:           %[[VAL_22:.*]] = arith.cmpi sgt, %[[VAL_20]], %[[VAL_21]] : index
+! CHECK:           %[[VAL_23:.*]] = arith.select %[[VAL_22]], %[[VAL_20]], %[[VAL_21]] : index
+! CHECK:           %[[VAL_24:.*]] = fir.shape_shift %[[VAL_12]], %[[VAL_23]] : (index, index) -> !fir.shapeshift<1>
+! CHECK:           %[[VAL_25:.*]] = fir.box_elesize %[[VAL_10]] : (!fir.box<!fir.array<10x20x!fir.char<1,?>>>) -> index
+! CHECK:           %[[VAL_26:.*]] = fir.box_addr %[[VAL_10]] : (!fir.box<!fir.array<10x20x!fir.char<1,?>>>) -> !fir.ref<!fir.array<10x20x!fir.char<1,?>>>
+! CHECK:           %[[VAL_27:.*]] = fir.convert %[[VAL_26]] : (!fir.ref<!fir.array<10x20x!fir.char<1,?>>>) -> !fir.ref<!fir.array<?x!fir.char<1,?>>>
+! CHECK:           %[[VAL_28:.*]] = fir.embox %[[VAL_27]](%[[VAL_24]]) typeparams %[[VAL_25]] : (!fir.ref<!fir.array<?x!fir.char<1,?>>>, !fir.shapeshift<1>, index) -> !fir.box<!fir.array<?x!fir.char<1,?>>>
+! CHECK:           fir.call @takes_char(%[[VAL_28]], %[[VAL_11]]#1) fastmath<contract> : (!fir.box<!fir.array<?x!fir.char<1,?>>>, !fir.ref<i32>) -> ()
+! CHECK:           hlfir.end_associate %[[VAL_11]]#1, %[[VAL_11]]#2 : !fir.ref<i32>, i1
+! CHECK:           return
+! CHECK:         }
+
+  subroutine test_char_copy_in_copy_out(x)
+    character(*) :: x(:, :)
+    call takes_char(x, 100)
+  end subroutine
+! CHECK-LABEL:   func.func @_QMbindc_seq_assocPtest_char_copy_in_copy_out(
+! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:.*]] {uniq_name = "_QMbindc_seq_assocFtest_char_copy_in_copy_outEx"} : (!fir.box<!fir.array<?x?x!fir.char<1,?>>>) -> (!fir.box<!fir.array<?x?x!fir.char<1,?>>>, !fir.box<!fir.array<?x?x!fir.char<1,?>>>)
+! CHECK:           %[[VAL_2:.*]] = arith.constant 100 : i32
+! CHECK:           %[[VAL_3:.*]]:2 = hlfir.copy_in %[[VAL_1]]#0 : (!fir.box<!fir.array<?x?x!fir.char<1,?>>>) -> (!fir.box<!fir.array<?x?x!fir.char<1,?>>>, i1)
+! CHECK:           %[[VAL_4:.*]] = arith.constant 0 : index
+! CHECK:           %[[VAL_5:.*]] = fir.shift %[[VAL_4]], %[[VAL_4]] : (index, index) -> !fir.shift<2>
+! CHECK:           %[[VAL_6:.*]] = fir.rebox %[[VAL_3]]#0(%[[VAL_5]]) : (!fir.box<!fir.array<?x?x!fir.char<1,?>>>, !fir.shift<2>) -> !fir.box<!fir.array<?x?x!fir.char<1,?>>>
+! CHECK:           %[[VAL_7:.*]]:3 = hlfir.associate %[[VAL_2]] {adapt.valuebyref} : (i32) -> (!fir.ref<i32>, !fir.ref<i32>, i1)
+! CHECK:           %[[VAL_8:.*]] = arith.constant 0 : index
+! CHECK:           %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_7]]#1 {uniq_name = "_QMbindc_seq_assocFtakes_charEn"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_10:.*]] = fir.load %[[VAL_9]]#0 : !fir.ref<i32>
+! CHECK:           %[[VAL_11:.*]] = fir.convert %[[VAL_10]] : (i32) -> i64
+! CHECK:           %[[VAL_12:.*]] = arith.constant 1 : i64
+! CHECK:           %[[VAL_13:.*]] = arith.subi %[[VAL_11]], %[[VAL_12]] : i64
+! CHECK:           %[[VAL_14:.*]] = arith.constant 1 : i64
+! CHECK:           %[[VAL_15:.*]] = arith.addi %[[VAL_13]], %[[VAL_14]] : i64
+! CHECK:           %[[VAL_16:.*]] = fir.convert %[[VAL_15]] : (i64) -> index
+! CHECK:           %[[VAL_17:.*]] = arith.constant 0 : index
+! CHECK:           %[[VAL_18:.*]] = arith.cmpi sgt, %[[VAL_16]], %[[VAL_17]] : index
+! CHECK:           %[[VAL_19:.*]] = arith.select %[[VAL_18]], %[[VAL_16]], %[[VAL_17]] : index
+! CHECK:           %[[VAL_20:.*]] = fir.shape_shift %[[VAL_8]], %[[VAL_19]] : (index, index) -> !fir.shapeshift<1>
+! CHECK:           %[[VAL_21:.*]] = fir.box_elesize %[[VAL_6]] : (!fir.box<!fir.array<?x?x!fir.char<1,?>>>) -> index
+! CHECK:           %[[VAL_22:.*]] = fir.box_addr %[[VAL_6]] : (!fir.box<!fir.array<?x?x!fir.char<1,?>>>) -> !fir.ref<!fir.array<?x?x!fir.char<1,?>>>
+! CHECK:           %[[VAL_23:.*]] = fir.convert %[[VAL_22]] : (!fir.ref<!fir.array<?x?x!fir.char<1,?>>>) -> !fir.ref<!fir.array<?x!fir.char<1,?>>>
+! CHECK:           %[[VAL_24:.*]] = fir.embox %[[VAL_23]](%[[VAL_20]]) typeparams %[[VAL_21]] : (!fir.ref<!fir.array<?x!fir.char<1,?>>>, !fir.shapeshift<1>, index) -> !fir.box<!fir.array<?x!fir.char<1,?>>>
+! CHECK:           fir.call @takes_char(%[[VAL_24]], %[[VAL_7]]#1) fastmath<contract> : (!fir.box<!fir.array<?x!fir.char<1,?>>>, !fir.ref<i32>) -> ()
+! CHECK:           hlfir.copy_out %[[VAL_3]]#0, %[[VAL_3]]#1 to %[[VAL_1]]#0 : (!fir.box<!fir.array<?x?x!fir.char<1,?>>>, i1, !fir.box<!fir.array<?x?x!fir.char<1,?>>>) -> ()
+! CHECK:           hlfir.end_associate %[[VAL_7]]#1, %[[VAL_7]]#2 : !fir.ref<i32>, i1
+! CHECK:           return
+! CHECK:         }
+
+  subroutine test_char_assumed_size(x)
+    character(*) :: x(:, :)
+    call takes_char_assumed_size(x)
+  end subroutine
+! CHECK-LABEL:   func.func @_QMbindc_seq_assocPtest_char_assumed_size(
+! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:.*]] {uniq_name = "_QMbindc_seq_assocFtest_char_assumed_sizeEx"} : (!fir.box<!fir.array<?x?x!fir.char<1,?>>>) -> (!fir.box<!fir.array<?x?x!fir.char<1,?>>>, !fir.box<!fir.array<?x?x!fir.char<1,?>>>)
+! CHECK:           %[[VAL_2:.*]]:2 = hlfir.copy_in %[[VAL_1]]#0 : (!fir.box<!fir.array<?x?x!fir.char<1,?>>>) -> (!fir.box<!fir.array<?x?x!fir.char<1,?>>>, i1)
+! CHECK:           %[[VAL_3:.*]] = arith.constant 0 : index
+! CHECK:           %[[VAL_4:.*]] = fir.shift %[[VAL_3]], %[[VAL_3]] : (index, index) -> !fir.shift<2>
+! CHECK:           %[[VAL_5:.*]] = fir.rebox %[[VAL_2]]#0(%[[VAL_4]]) : (!fir.box<!fir.array<?x?x!fir.char<1,?>>>, !fir.shift<2>) -> !fir.box<!fir.array<?x?x!fir.char<1,?>>>
+! CHECK:           %[[VAL_6:.*]] = arith.constant 0 : index
+! CHECK:           %[[VAL_7:.*]] = arith.constant 10 : i64
+! CHECK:           %[[VAL_8:.*]] = arith.constant 1 : i64
+! CHECK:           %[[VAL_9:.*]] = arith.subi %[[VAL_7]], %[[VAL_8]] : i64
+! CHECK:           %[[VAL_10:.*]] = arith.constant 1 : i64
+! CHECK:           %[[VAL_11:.*]] = arith.addi %[[VAL_9]], %[[VAL_10]] : i64
+! CHECK:           %[[VAL_12:.*]] = fir.convert %[[VAL_11]] : (i64) -> index
+! CHECK:           %[[VAL_13:.*]] = arith.constant 0 : index
+! CHECK:           %[[VAL_14:.*]] = arith.cmpi sgt, %[[VAL_12]], %[[VAL_13]] : index
+! CHECK:           %[[VAL_15:.*]] = arith.select %[[VAL_14]], %[[VAL_12]], %[[VAL_13]] : index
+! CHECK:           %[[VAL_16:.*]] = arith.constant -1 : i64
+! CHECK:           %[[VAL_17:.*]] = fir.convert %[[VAL_16]] : (i64) -> index
+! CHECK:           %[[VAL_18:.*]] = fir.shape_shift %[[VAL_6]], %[[VAL_15]], %[[VAL_6]], %[[VAL_17]] : (index, index, index, index) -> !fir.shapeshift<2>
+! CHECK:           %[[VAL_19:.*]] = fir.box_elesize %[[VAL_5]] : (!fir.box<!fir.array<?x?x!fir.char<1,?>>>) -> index
+! CHECK:           %[[VAL_20:.*]] = fir.box_addr %[[VAL_5]] : (!fir.box<!fir.array<?x?x!fir.char<1,?>>>) -> !fir.ref<!fir.array<?x?x!fir.char<1,?>>>
+! CHECK:           %[[VAL_21:.*]] = fir.convert %[[VAL_20]] : (!fir.ref<!fir.array<?x?x!fir.char<1,?>>>) -> !fir.ref<!fir.array<10x?x!fir.char<1,?>>>
+! CHECK:           %[[VAL_22:.*]] = fir.embox %[[VAL_21]](%[[VAL_18]]) typeparams %[[VAL_19]] : (!fir.ref<!fir.array<10x?x!fir.char<1,?>>>, !fir.shapeshift<2>, index) -> !fir.box<!fir.array<10x?x!fir.char<1,?>>>
+! CHECK:           fir.call @takes_char_assumed_size(%[[VAL_22]]) fastmath<contract> : (!fir.box<!fir.array<10x?x!fir.char<1,?>>>) -> ()
+! CHECK:           hlfir.copy_out %[[VAL_2]]#0, %[[VAL_2]]#1 to %[[VAL_1]]#0 : (!fir.box<!fir.array<?x?x!fir.char<1,?>>>, i1, !fir.box<!fir.array<?x?x!fir.char<1,?>>>) -> ()
+! CHECK:           return
+! CHECK:         }
+
+  subroutine test_optional_char(x)
+    character(*), optional :: x(10, 20)
+    call takes_optional_char(x, 100)
+  end subroutine
+! CHECK-LABEL:   func.func @_QMbindc_seq_assocPtest_optional_char(
+! CHECK:           %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_2:.*]](%[[VAL_5:.*]]) typeparams %[[VAL_1:.*]]#1 {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QMbindc_seq_assocFtest_optional_charEx"} : (!fir.ref<!fir.array<10x20x!fir.char<1,?>>>, !fir.shape<2>, index) -> (!fir.box<!fir.array<10x20x!fir.char<1,?>>>, !fir.ref<!fir.array<10x20x!fir.char<1,?>>>)
+! CHECK:           %[[VAL_7:.*]] = fir.is_present %[[VAL_6]]#0 : (!fir.box<!fir.array<10x20x!fir.char<1,?>>>) -> i1
+! CHECK:           %[[VAL_8:.*]] = arith.constant 100 : i32
+! CHECK:           %[[VAL_9:.*]] = fir.if %[[VAL_7]] -> (!fir.box<!fir.array<10x20x!fir.char<1,?>>>) {
+! CHECK:             %[[VAL_10:.*]] = arith.constant 0 : index
+! CHECK:             %[[VAL_11:.*]] = fir.shift %[[VAL_10]], %[[VAL_10]] : (index, index) -> !fir.shift<2>
+! CHECK:             %[[VAL_12:.*]] = fir.rebox %[[VAL_6]]#0(%[[VAL_11]]) : (!fir.box<!fir.array<10x20x!fir.char<1,?>>>, !fir.shift<2>) -> !fir.box<!fir.array<10x20x!fir.char<1,?>>>
+! CHECK:             fir.result %[[VAL_12]] : !fir.box<!fir.array<10x20x!fir.char<1,?>>>
+! CHECK:           } else {
+! CHECK:             %[[VAL_13:.*]] = fir.absent !fir.box<!fir.array<10x20x!fir.char<1,?>>>
+! CHECK:             fir.result %[[VAL_13]] : !fir.box<!fir.array<10x20x!fir.char<1,?>>>
+! CHECK:           }
+! CHECK:           %[[VAL_14:.*]]:3 = hlfir.associate %[[VAL_8]] {adapt.valuebyref} : (i32) -> (!fir.ref<i32>, !fir.ref<i32>, i1)
+! CHECK:           %[[VAL_15:.*]] = fir.if %[[VAL_7]] -> (!fir.box<!fir.array<?x!fir.char<1,?>>>) {
+! CHECK:             %[[VAL_16:.*]] = arith.constant 0 : index
+! CHECK:             %[[VAL_17:.*]]:2 = hlfir.declare %[[VAL_14]]#1 {uniq_name = "_QMbindc_seq_assocFtakes_optional_charEn"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:             %[[VAL_18:.*]] = fir.load %[[VAL_17]]#0 : !fir.ref<i32>
+! CHECK:             %[[VAL_19:.*]] = fir.convert %[[VAL_18]] : (i32) -> i64
+! CHECK:             %[[VAL_20:.*]] = arith.constant 1 : i64
+! CHECK:             %[[VAL_21:.*]] = arith.subi %[[VAL_19]], %[[VAL_20]] : i64
+! CHECK:             %[[VAL_22:.*]] = arith.constant 1 : i64
+! CHECK:             %[[VAL_23:.*]] = arith.addi %[[VAL_21]], %[[VAL_22]] : i64
+! CHECK:             %[[VAL_24:.*]] = fir.convert %[[VAL_23]] : (i64) -> index
+! CHECK:             %[[VAL_25:.*]] = arith.constant 0 : index
+! CHECK:             %[[VAL_26:.*]] = arith.cmpi sgt, %[[VAL_24]], %[[VAL_25]] : index
+! CHECK:             %[[VAL_27:.*]] = arith.select %[[VAL_26]], %[[VAL_24]], %[[VAL_25]] : index
+! CHECK:             %[[VAL_28:.*]] = fir.shape_shift %[[VAL_16]], %[[VAL_27]] : (index, index) -> !fir.shapeshift<1>
+! CHECK:             %[[VAL_29:.*]] = fir.box_elesize %[[VAL_9]] : (!fir.box<!fir.array<10x20x!fir.char<1,?>>>) -> index
+! CHECK:             %[[VAL_30:.*]] = fir.box_addr %[[VAL_9]] : (!fir.box<!fir.array<10x20x!fir.char<1,?>>>) -> !fir.ref<!fir.array<10x20x!fir.char<1,?>>>
+! CHECK:             %[[VAL_31:.*]] = fir.convert %[[VAL_30]] : (!fir.ref<!fir.array<10x20x!fir.char<1,?>>>) -> !fir.ref<!fir.array<?x!fir.char<1,?>>>
+! CHECK:             %[[VAL_32:.*]] = fir.embox %[[VAL_31]](%[[VAL_28]]) typeparams %[[VAL_29]] : (!fir.ref<!fir.array<?x!fir.char<1,?>>>, !fir.shapeshift<1>, index) -> !fir.box<!fir.array<?x!fir.char<1,?>>>
+! CHECK:             fir.result %[[VAL_32]] : !fir.box<!fir.array<?x!fir.char<1,?>>>
+! CHECK:           } else {
+! CHECK:             %[[VAL_33:.*]] = fir.absent !fir.box<!fir.array<?x!fir.char<1,?>>>
+! CHECK:             fir.result %[[VAL_33]] : !fir.box<!fir.array<?x!fir.char<1,?>>>
+! CHECK:           }
+! CHECK:           fir.call @takes_optional_char(%[[VAL_15]], %[[VAL_14]]#1) fastmath<contract> : (!fir.box<!fir.array<?x!fir.char<1,?>>>, !fir.ref<i32>) -> ()
+! CHECK:           hlfir.end_associate %[[VAL_14]]#1, %[[VAL_14]]#2 : !fir.ref<i32>, i1
+! CHECK:           return
+! CHECK:         }
+end module
+
+module poly_seq_assoc
+  interface
+    subroutine takes_poly(x, n)
+      integer :: n
+      class(*) :: x(n)
+    end subroutine
+    subroutine takes_poly_assumed_size(x)
+      class(*) :: x(10, *)
+    end subroutine
+    subroutine takes_optional_poly(x, n)
+      integer :: n
+      class(*), optional :: x(n)
+    end subroutine
+  end interface
+contains
+  subroutine test_poly_1(x)
+    class(*) :: x(10, 20)
+    call takes_poly(x, 100)
+  end subroutine
+! CHECK-LABEL:   func.func @_QMpoly_seq_assocPtest_poly_1(
+! CHECK-SAME:                                             %[[VAL_0:.*]]: !fir.class<!fir.array<10x20xnone>> {fir.bindc_name = "x"}) {
+! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:.*]] {uniq_name = "_QMpoly_seq_assocFtest_poly_1Ex"} : (!fir.class<!fir.array<10x20xnone>>) -> (!fir.class<!fir.array<10x20xnone>>, !fir.class<!fir.array<10x20xnone>>)
+! CHECK:           %[[VAL_2:.*]] = arith.constant 100 : i32
+! CHECK:           %[[VAL_3:.*]]:3 = hlfir.associate %[[VAL_2]] {adapt.valuebyref} : (i32) -> (!fir.ref<i32>, !fir.ref<i32>, i1)
+! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]]#1 {uniq_name = "_QMpoly_seq_assocFtakes_polyEn"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_5:.*]] = fir.load %[[VAL_4]]#0 : !fir.ref<i32>
+! CHECK:           %[[VAL_6:.*]] = fir.convert %[[VAL_5]] : (i32) -> i64
+! CHECK:           %[[VAL_7:.*]] = arith.constant 1 : i64
+! CHECK:           %[[VAL_8:.*]] = arith.subi %[[VAL_6]], %[[VAL_7]] : i64
+! CHECK:           %[[VAL_9:.*]] = arith.constant 1 : i64
+! CHECK:           %[[VAL_10:.*]] = arith.addi %[[VAL_8]], %[[VAL_9]] : i64
+! CHECK:           %[[VAL_11:.*]] = fir.convert %[[VAL_10]] : (i64) -> index
+! CHECK:           %[[VAL_12:.*]] = arith.constant 0 : index
+! CHECK:           %[[VAL_13:.*]] = arith.cmpi sgt, %[[VAL_11]], %[[VAL_12]] : index
+! CHECK:           %[[VAL_14:.*]] = arith.select %[[VAL_13]], %[[VAL_11]], %[[VAL_12]] : index
+! CHECK:           %[[VAL_15:.*]] = fir.shape %[[VAL_14]] : (index) -> !fir.shape<1>
+! CHECK:           %[[VAL_16:.*]] = fir.box_addr %[[VAL_1]]#1 : (!fir.class<!fir.array<10x20xnone>>) -> !fir.ref<!fir.array<10x20xnone>>
+! CHECK:           %[[VAL_17:.*]] = fir.convert %[[VAL_16]] : (!fir.ref<!fir.array<10x20xnone>>) -> !fir.ref<!fir.array<?xnone>>
+! CHECK:           %[[VAL_18:.*]] = fir.embox %[[VAL_17]](%[[VAL_15]]) source_box %[[VAL_1]]#0 : (!fir.ref<!fir.array<?xnone>>, !fir.shape<1>, !fir.class<!fir.array<10x20xnone>>) -> !fir.class<!fir.array<?xnone>>
+! CHECK:           fir.call @_QPtakes_poly(%[[VAL_18]], %[[VAL_3]]#1) fastmath<contract> : (!fir.class<!fir.array<?xnone>>, !fir.ref<i32>) -> ()
+! CHECK:           hlfir.end_associate %[[VAL_3]]#1, %[[VAL_3]]#2 : !fir.ref<i32>, i1
+! CHECK:           return
+! CHECK:         }
+
+  subroutine test_poly_copy_in_copy_out(x)
+    class(*) :: x(:, :)
+    call takes_poly(x, 100)
+  end subroutine
+! CHECK-LABEL:   func.func @_QMpoly_seq_assocPtest_poly_copy_in_copy_out(
+! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:.*]] {uniq_name = "_QMpoly_seq_assocFtest_poly_copy_in_copy_outEx"} : (!fir.class<!fir.array<?x?xnone>>) -> (!fir.class<!fir.array<?x?xnone>>, !fir.class<!fir.array<?x?xnone>>)
+! CHECK:           %[[VAL_2:.*]] = arith.constant 100 : i32
+! CHECK:           %[[VAL_3:.*]]:2 = hlfir.copy_in %[[VAL_1]]#0 : (!fir.class<!fir.array<?x?xnone>>) -> (!fir.class<!fir.array<?x?xnone>>, i1)
+! CHECK:           %[[VAL_4:.*]]:3 = hlfir.associate %[[VAL_2]] {adapt.valuebyref} : (i32) -> (!fir.ref<i32>, !fir.ref<i32>, i1)
+! CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_4]]#1 {uniq_name = "_QMpoly_seq_assocFtakes_polyEn"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_6:.*]] = fir.load %[[VAL_5]]#0 : !fir.ref<i32>
+! CHECK:           %[[VAL_7:.*]] = fir.convert %[[VAL_6]] : (i32) -> i64
+! CHECK:           %[[VAL_8:.*]] = arith.constant 1 : i64
+! CHECK:           %[[VAL_9:.*]] = arith.subi %[[VAL_7]], %[[VAL_8]] : i64
+! CHECK:           %[[VAL_10:.*]] = arith.constant 1 : i64
+! CHECK:           %[[VAL_11:.*]] = arith.addi %[[VAL_9]], %[[VAL_10]] : i64
+! CHECK:           %[[VAL_12:.*]] = fir.convert %[[VAL_11]] : (i64) -> index
+! CHECK:           %[[VAL_13:.*]] = arith.constant 0 : index
+! CHECK:           %[[VAL_14:.*]] = arith.cmpi sgt, %[[VAL_12]], %[[VAL_13]] : index
+! CHECK:           %[[VAL_15:.*]] = arith.select %[[VAL_14]], %[[VAL_12]], %[[VAL_13]] : index
+! CHECK:           %[[VAL_16:.*]] = fir.shape %[[VAL_15]] : (index) -> !fir.shape<1>
+! CHECK:           %[[VAL_17:.*]] = fir.box_addr %[[VAL_3]]#0 : (!fir.class<!fir.array<?x?xnone>>) -> !fir.ref<!fir.array<?x?xnone>>
+! CHECK:           %[[VAL_18:.*]] = fir.convert %[[VAL_17]] : (!fir.ref<!fir.array<?x?xnone>>) -> !fir.ref<!fir.array<?xnone>>
+! CHECK:           %[[VAL_19:.*]] = fir.embox %[[VAL_18]](%[[VAL_16]]) source_box %[[VAL_3]]#0 : (!fir.ref<!fir.array<?xnone>>, !fir.shape<1>, !fir.class<!fir.array<?x?xnone>>) -> !fir.class<!fir.array<?xnone>>
+! CHECK:           fir.call @_QPtakes_poly(%[[VAL_19]], %[[VAL_4]]#1) fastmath<contract> : (!fir.class<!fir.array<?xnone>>, !fir.ref<i32>) -> ()
+! CHECK:           hlfir.copy_out %[[VAL_3]]#0, %[[VAL_3]]#1 to %[[VAL_1]]#0 : (!fir.class<!fir.array<?x?xnone>>, i1, !fir.class<!fir.array<?x?xnone>>) -> ()
+! CHECK:           hlfir.end_associate %[[VAL_4]]#1, %[[VAL_4]]#2 : !fir.ref<i32>, i1
+! CHECK:           return
+! CHECK:         }
+
+  subroutine test_poly_assumed_size(x)
+    class(*) :: x(:, :)
+    call takes_poly_assumed_size(x)
+  end subroutine
+! CHECK-LABEL:   func.func @_QMpoly_seq_assocPtest_poly_assumed_size(
+! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:.*]] {uniq_name = "_QMpoly_seq_assocFtest_poly_assumed_sizeEx"} : (!fir.class<!fir.array<?x?xnone>>) -> (!fir.class<!fir.array<?x?xnone>>, !fir.class<!fir.array<?x?xnone>>)
+! CHECK:           %[[VAL_2:.*]]:2 = hlfir.copy_in %[[VAL_1]]#0 : (!fir.class<!fir.array<?x?xnone>>) -> (!fir.class<!fir.array<?x?xnone>>, i1)
+! CHECK:           %[[VAL_3:.*]] = arith.constant 10 : i64
+! CHECK:           %[[VAL_4:.*]] = arith.constant 1 : i64
+! CHECK:           %[[VAL_5:.*]] = arith.subi %[[VAL_3]], %[[VAL_4]] : i64
+! CHECK:           %[[VAL_6:.*]] = arith.constant 1 : i64
+! CHECK:           %[[VAL_7:.*]] = arith.addi %[[VAL_5]], %[[VAL_6]] : i64
+! CHECK:           %[[VAL_8:.*]] = fir.convert %[[VAL_7]] : (i64) -> index
+! CHECK:           %[[VAL_9:.*]] = arith.constant 0 : index
+! CHECK:           %[[VAL_10:.*]] = arith.cmpi sgt, %[[VAL_8]], %[[VAL_9]] : index
+! CHECK:           %[[VAL_11:.*]] = arith.select %[[VAL_10]], %[[VAL_8]], %[[VAL_9]] : index
+! CHECK:           %[[VAL_12:.*]] = arith.constant -1 : i64
+! CHECK:           %[[VAL_13:.*]] = fir.convert %[[VAL_12]] : (i64) -> index
+! CHECK:           %[[VAL_14:.*]] = fir.shape %[[VAL_11]], %[[VAL_13]] : (index, index) -> !fir.shape<2>
+! CHECK:           %[[VAL_15:.*]] = fir.box_addr %[[VAL_2]]#0 : (!fir.class<!fir.array<?x?xnone>>) -> !fir.ref<!fir.array<?x?xnone>>
+! CHECK:           %[[VAL_16:.*]] = fir.convert %[[VAL_15]] : (!fir.ref<!fir.array<?x?xnone>>) -> !fir.ref<!fir.array<10x?xnone>>
+! CHECK:           %[[VAL_17:.*]] = fir.embox %[[VAL_16]](%[[VAL_14]]) source_box %[[VAL_2]]#0 : (!fir.ref<!fir.array<10x?xnone>>, !fir.shape<2>, !fir.class<!fir.array<?x?xnone>>) -> !fir.class<!fir.array<10x?xnone>>
+! CHECK:           fir.call @_QPtakes_poly_assumed_size(%[[VAL_17]]) fastmath<contract> : (!fir.class<!fir.array<10x?xnone>>) -> ()
+! CHECK:           hlfir.copy_out %[[VAL_2]]#0, %[[VAL_2]]#1 to %[[VAL_1]]#0 : (!fir.class<!fir.array<?x?xnone>>, i1, !fir.class<!fir.array<?x?xnone>>) -> ()
+! CHECK:           return
+! CHECK:         }
+
+  subroutine test_optional_poly(x)
+    class(*), optional :: x(10, 20)
+    call takes_optional_poly(x, 100)
+  end subroutine
+! CHECK-LABEL:   func.func @_QMpoly_seq_assocPtest_optional_poly(
+! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0:.*]] {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QMpoly_seq_assocFtest_optional_polyEx"} : (!fir.class<!fir.array<10x20xnone>>) -> (!fir.class<!fir.array<10x20xnone>>, !fir.class<!fir.array<10x20xnone>>)
+! CHECK:           %[[VAL_2:.*]] = fir.is_present %[[VAL_1]]#0 : (!fir.class<!fir.array<10x20xnone>>) -> i1
+! CHECK:           %[[VAL_3:.*]] = arith.constant 100 : i32
+! CHECK:           %[[VAL_4:.*]] = fir.if %[[VAL_2]] -> (!fir.class<!fir.array<10x20xnone>>) {
+! CHECK:             fir.result %[[VAL_1]]#0 : !fir.class<!fir.array<10x20xnone>>
+! CHECK:           } else {
+! CHECK:             %[[VAL_5:.*]] = fir.absent !fir.class<!fir.array<10x20xnone>>
+! CHECK:             fir.result %[[VAL_5]] : !fir.class<!fir.array<10x20xnone>>
+! CHECK:           }
+! CHECK:           %[[VAL_6:.*]]:3 = hlfir.associate %[[VAL_3]] {adapt.valuebyref} : (i32) -> (!fir.ref<i32>, !fir.ref<i32>, i1)
+! CHECK:           %[[VAL_7:.*]] = fir.if %[[VAL_2]] -> (!fir.class<!fir.array<?xnone>>) {
+! CHECK:             %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_6]]#1 {uniq_name = "_QMpoly_seq_assocFtakes_optional_polyEn"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:             %[[VAL_9:.*]] = fir.load %[[VAL_8]]#0 : !fir.ref<i32>
+! CHECK:             %[[VAL_10:.*]] = fir.convert %[[VAL_9]] : (i32) -> i64
+! CHECK:             %[[VAL_11:.*]] = arith.constant 1 : i64
+! CHECK:             %[[VAL_12:.*]] = arith.subi %[[VAL_10]], %[[VAL_11]] : i64
+! CHECK:             %[[VAL_13:.*]] = arith.constant 1 : i64
+! CHECK:             %[[VAL_14:.*]] = arith.addi %[[VAL_12]], %[[VAL_13]] : i64
+! CHECK:             %[[VAL_15:.*]] = fir.convert %[[VAL_14]] : (i64) -> index
+! CHECK:             %[[VAL_16:.*]] = arith.constant 0 : index
+! CHECK:             %[[VAL_17:.*]] = arith.cmpi sgt, %[[VAL_15]], %[[VAL_16]] : index
+! CHECK:             %[[VAL_18:.*]] = arith.select %[[VAL_17]], %[[VAL_15]], %[[VAL_16]] : index
+! CHECK:             %[[VAL_19:.*]] = fir.shape %[[VAL_18]] : (index) -> !fir.shape<1>
+! CHECK:             %[[VAL_20:.*]] = fir.box_addr %[[VAL_4]] : (!fir.class<!fir.array<10x20xnone>>) -> !fir.ref<!fir.array<10x20xnone>>
+! CHECK:             %[[VAL_21:.*]] = fir.convert %[[VAL_20]] : (!fir.ref<!fir.array<10x20xnone>>) -> !fir.ref<!fir.array<?xnone>>
+! CHECK:             %[[VAL_22:.*]] = fir.embox %[[VAL_21]](%[[VAL_19]]) source_box %[[VAL_4]] : (!fir.ref<!fir.array<?xnone>>, !fir.shape<1>, !fir.class<!fir.array<10x20xnone>>) -> !fir.class<!fir.array<?xnone>>
+! CHECK:             fir.result %[[VAL_22]] : !fir.class<!fir.array<?xnone>>
+! CHECK:           } else {
+! CHECK:             %[[VAL_23:.*]] = fir.absent !fir.class<!fir.array<?xnone>>
+! CHECK:             fir.result %[[VAL_23]] : !fir.class<!fir.array<?xnone>>
+! CHECK:           }
+! CHECK:           fir.call @_QPtakes_optional_poly(%[[VAL_7]], %[[VAL_6]]#1) fastmath<contract> : (!fir.class<!fir.array<?xnone>>, !fir.ref<i32>) -> ()
+! CHECK:           hlfir.end_associate %[[VAL_6]]#1, %[[VAL_6]]#2 : !fir.ref<i32>, i1
+! CHECK:           return
+! CHECK:         }
+end module
diff --git a/flang/test/Lower/HLFIR/calls-assumed-shape.f90 b/flang/test/Lower/HLFIR/calls-assumed-shape.f90
index 92b87a6987a558..a2094f1f1f0eb1 100644
--- a/flang/test/Lower/HLFIR/calls-assumed-shape.f90
+++ b/flang/test/Lower/HLFIR/calls-assumed-shape.f90
@@ -1,6 +1,6 @@
 ! Test lowering of calls involving assumed shape arrays or arrays with
 ! VALUE attribute.
-! RUN: bbc -emit-hlfir -polymorphic-type -o - %s | FileCheck %s
+! RUN: bbc -emit-hlfir -o - %s | FileCheck %s
 
 subroutine test_assumed_to_assumed(x)
   interface
diff --git a/flang/test/Lower/HLFIR/calls-constant-expr-arg-polymorphic.f90 b/flang/test/Lower/HLFIR/calls-constant-expr-arg-polymorphic.f90
index a862033c0fefaf..4ee70e55079dad 100644
--- a/flang/test/Lower/HLFIR/calls-constant-expr-arg-polymorphic.f90
+++ b/flang/test/Lower/HLFIR/calls-constant-expr-arg-polymorphic.f90
@@ -1,4 +1,4 @@
-! RUN: bbc -emit-hlfir --polymorphic-type -o - %s | FileCheck %s
+! RUN: bbc -emit-hlfir -o - %s | FileCheck %s
 
 ! Test when constant argument are copied in memory
 ! and passed to polymorphic arguments.
diff --git a/flang/test/Lower/HLFIR/calls-optional.f90 b/flang/test/Lower/HLFIR/calls-optional.f90
index cf684981f80031..df9519a24fb702 100644
--- a/flang/test/Lower/HLFIR/calls-optional.f90
+++ b/flang/test/Lower/HLFIR/calls-optional.f90
@@ -2,7 +2,7 @@
 ! that is syntactically present, but may be absent at runtime (is
 ! an optional or a pointer/allocatable).
 !
-! RUN: bbc -emit-hlfir -polymorphic-type -o - %s | FileCheck %s
+! RUN: bbc -emit-hlfir -o - %s | FileCheck %s
 
 subroutine optional_copy_in_out(x)
   interface
diff --git a/flang/test/Lower/HLFIR/calls-poly-to-assumed-type.f90 b/flang/test/Lower/HLFIR/calls-poly-to-assumed-type.f90
index ffd21e01ef98dd..b14f1bb1f443b5 100644
--- a/flang/test/Lower/HLFIR/calls-poly-to-assumed-type.f90
+++ b/flang/test/Lower/HLFIR/calls-poly-to-assumed-type.f90
@@ -1,6 +1,6 @@
 ! Test passing rank 2 CLASS(*) deferred shape to assumed size assumed type
 ! This requires copy-in/copy-out logic.
-! RUN: bbc -emit-hlfir -polymorphic-type -o - %s | FileCheck %s
+! RUN: bbc -emit-hlfir -o - %s | FileCheck %s
 
 subroutine pass_poly_to_assumed_type_assumed_size(x)
   class(*), target :: x(:,:)
diff --git a/flang/test/Lower/HLFIR/convert-mbox-to-value.f90 b/flang/test/Lower/HLFIR/convert-mbox-to-value.f90
index b943cd3225a564..b9d55d3fde4f09 100644
--- a/flang/test/Lower/HLFIR/convert-mbox-to-value.f90
+++ b/flang/test/Lower/HLFIR/convert-mbox-to-value.f90
@@ -1,5 +1,5 @@
 ! Test conversion of MutableBoxValue to value.
-! RUN: bbc -emit-hlfir -polymorphic-type -I nowhere %s -o - | FileCheck %s
+! RUN: bbc -emit-hlfir -I nowhere %s -o - | FileCheck %s
 
 subroutine test_int_allocatable(a)
   integer, allocatable :: a
diff --git a/flang/test/Lower/HLFIR/designators-component-ref.f90 b/flang/test/Lower/HLFIR/designators-component-ref.f90
index 3e9fa037040d52..392eda66fd03c1 100644
--- a/flang/test/Lower/HLFIR/designators-component-ref.f90
+++ b/flang/test/Lower/HLFIR/designators-component-ref.f90
@@ -1,5 +1,5 @@
 ! Test lowering of component reference to HLFIR
-! RUN: bbc -emit-hlfir --polymorphic-type -o - %s | FileCheck %s
+! RUN: bbc -emit-hlfir -o - %s | FileCheck %s
 module comp_ref
 type t1
   integer :: scalar_i
diff --git a/flang/test/Lower/HLFIR/designators-parameter-array-slice.f90 b/flang/test/Lower/HLFIR/designators-parameter-array-slice.f90
index 3bf19923453352..42e130ee3f49c3 100644
--- a/flang/test/Lower/HLFIR/designators-parameter-array-slice.f90
+++ b/flang/test/Lower/HLFIR/designators-parameter-array-slice.f90
@@ -1,5 +1,5 @@
 ! Test non-contiguous slice of parameter array.
-! RUN: bbc -emit-hlfir --polymorphic-type -o - %s | FileCheck %s
+! RUN: bbc -emit-hlfir -o - %s | FileCheck %s
 subroutine test2(i)
   integer, parameter :: a(*,*) = reshape( [ 1,2,3,4 ], [ 2,2 ])
   integer :: x(2)
diff --git a/flang/test/Lower/HLFIR/elemental-array-ops.f90 b/flang/test/Lower/HLFIR/elemental-array-ops.f90
index 6984e5ef74c697..9778adeb6179a0 100644
--- a/flang/test/Lower/HLFIR/elemental-array-ops.f90
+++ b/flang/test/Lower/HLFIR/elemental-array-ops.f90
@@ -1,5 +1,5 @@
 ! Test lowering of elemental intrinsic operations with array arguments to HLFIR
-! RUN: bbc -emit-hlfir --polymorphic-type -I nowhere -o - %s 2>&1 | FileCheck %s
+! RUN: bbc -emit-hlfir -I nowhere -o - %s 2>&1 | FileCheck %s
 
 subroutine binary(x, y)
   integer :: x(100), y(100)
diff --git a/flang/test/Lower/HLFIR/elemental-polymorphic-merge.f90 b/flang/test/Lower/HLFIR/elemental-polymorphic-merge.f90
index c2f97b6fccfeb9..eb93099b3890fb 100644
--- a/flang/test/Lower/HLFIR/elemental-polymorphic-merge.f90
+++ b/flang/test/Lower/HLFIR/elemental-polymorphic-merge.f90
@@ -1,5 +1,5 @@
 ! Test that the produced hlfir.elemental had proper result type and the mold.
-! RUN: bbc --emit-hlfir --polymorphic-type -I nowhere -o - %s | FileCheck %s
+! RUN: bbc --emit-hlfir -I nowhere -o - %s | FileCheck %s
 
 subroutine test_polymorphic_merge(x, y, r, m)
   type t
diff --git a/flang/test/Lower/HLFIR/elemental-user-procedure-ref-polymorphic.f90 b/flang/test/Lower/HLFIR/elemental-user-procedure-ref-polymorphic.f90
index 1cd5c5133bd961..84bb7a55377b89 100644
--- a/flang/test/Lower/HLFIR/elemental-user-procedure-ref-polymorphic.f90
+++ b/flang/test/Lower/HLFIR/elemental-user-procedure-ref-polymorphic.f90
@@ -1,6 +1,6 @@
 ! Test lowering of user defined elemental procedure reference to HLFIR
 ! With polymorphic arguments.
-! RUN: bbc -emit-hlfir -I nw -polymorphic-type -o - %s 2>&1 | FileCheck %s
+! RUN: bbc -emit-hlfir -I nw -o - %s 2>&1 | FileCheck %s
 module def_some_types
   type :: t
     integer :: i
diff --git a/flang/test/Lower/HLFIR/function-return-as-expr.f90 b/flang/test/Lower/HLFIR/function-return-as-expr.f90
index ae4f679a0fe602..95a0c090ef0433 100644
--- a/flang/test/Lower/HLFIR/function-return-as-expr.f90
+++ b/flang/test/Lower/HLFIR/function-return-as-expr.f90
@@ -1,4 +1,4 @@
-! RUN: bbc -emit-hlfir --polymorphic-type -o - %s -I nowhere 2>&1 | FileCheck %s
+! RUN: bbc -emit-hlfir -o - %s -I nowhere 2>&1 | FileCheck %s
 
 module types
   type t1
diff --git a/flang/test/Lower/HLFIR/function-return-destroy.f90 b/flang/test/Lower/HLFIR/function-return-destroy.f90
index 4663dc57e0457f..5bd014981c128d 100644
--- a/flang/test/Lower/HLFIR/function-return-destroy.f90
+++ b/flang/test/Lower/HLFIR/function-return-destroy.f90
@@ -1,4 +1,4 @@
-! RUN: bbc -emit-hlfir -polymorphic-type %s -o - -I nowhere | FileCheck %s
+! RUN: bbc -emit-hlfir %s -o - -I nowhere | FileCheck %s
 
 module types
   type t1
diff --git a/flang/test/Lower/HLFIR/ignore-rank-unlimited-polymorphic.f90 b/flang/test/Lower/HLFIR/ignore-rank-unlimited-polymorphic.f90
index 952e8f565eb93a..43986c8198b943 100644
--- a/flang/test/Lower/HLFIR/ignore-rank-unlimited-polymorphic.f90
+++ b/flang/test/Lower/HLFIR/ignore-rank-unlimited-polymorphic.f90
@@ -1,6 +1,6 @@
 ! Test passing mismatching rank arguments to unlimited polymorphic
 ! dummy with IGNORE_TKR(R).
-! RUN: bbc -emit-hlfir -polymorphic-type -o - -I nowhere %s 2>&1 | FileCheck %s
+! RUN: bbc -emit-hlfir -o - -I nowhere %s 2>&1 | FileCheck %s
 
 module m
   interface
diff --git a/flang/test/Lower/HLFIR/ignore-type-assumed-shape.f90 b/flang/test/Lower/HLFIR/ignore-type-assumed-shape.f90
index 3ad74ced61a3b9..27747ff2ad943d 100644
--- a/flang/test/Lower/HLFIR/ignore-type-assumed-shape.f90
+++ b/flang/test/Lower/HLFIR/ignore-type-assumed-shape.f90
@@ -2,7 +2,7 @@
 ! dummy has IGNORE_TKR(t). The descriptor should be prepared
 ! according to the actual argument type, but its bounds and
 ! attributes should still be set as expected for the dummy.
-! RUN: bbc -emit-hlfir --polymorphic-type -o - %s | FileCheck %s
+! RUN: bbc -emit-hlfir -o - %s | FileCheck %s
 
 module tkr_ifaces
   interface
diff --git a/flang/test/Lower/HLFIR/intentout-allocatable-components.f90 b/flang/test/Lower/HLFIR/intentout-allocatable-components.f90
index 932fafd322a3e3..797e4c89ae2396 100644
--- a/flang/test/Lower/HLFIR/intentout-allocatable-components.f90
+++ b/flang/test/Lower/HLFIR/intentout-allocatable-components.f90
@@ -1,6 +1,6 @@
 ! Test that allocatable components of non pointer/non allocatable INTENT(OUT)
 ! dummy arguments are deallocated.
-! RUN: bbc -emit-hlfir -polymorphic-type %s -o - -I nowhere | FileCheck %s
+! RUN: bbc -emit-hlfir %s -o - -I nowhere | FileCheck %s
 
 subroutine test_intentout_component_deallocate(a)
   type :: t
diff --git a/flang/test/Lower/HLFIR/internal-procedures-polymorphic.f90 b/flang/test/Lower/HLFIR/internal-procedures-polymorphic.f90
index 8645488290d715..5763d84cfd605a 100644
--- a/flang/test/Lower/HLFIR/internal-procedures-polymorphic.f90
+++ b/flang/test/Lower/HLFIR/internal-procedures-polymorphic.f90
@@ -1,6 +1,6 @@
 ! Test lowering of internal procedure capturing OPTIONAL polymorphic
 ! objects.
-! RUN: bbc -emit-hlfir --polymorphic-type -o - %s -I nw | FileCheck %s
+! RUN: bbc -emit-hlfir -o - %s -I nw | FileCheck %s
 
 
 module captured_optional_polymorphic
diff --git a/flang/test/Lower/HLFIR/intrinsic-assumed-type.f90 b/flang/test/Lower/HLFIR/intrinsic-assumed-type.f90
index d0381344c89314..e82bc342ff88f6 100644
--- a/flang/test/Lower/HLFIR/intrinsic-assumed-type.f90
+++ b/flang/test/Lower/HLFIR/intrinsic-assumed-type.f90
@@ -2,7 +2,7 @@
 ! arguments. These are a bit special because semantics do not represent
 ! assumed types actual arguments with an evaluate::Expr like for usual
 ! arguments.
-! RUN: bbc -emit-hlfir --polymorphic-type -o - %s | FileCheck %s
+! RUN: bbc -emit-hlfir -o - %s | FileCheck %s
 
 subroutine assumed_type_to_intrinsic(a)
   type(*) :: a(:)
diff --git a/flang/test/Lower/HLFIR/parent-component-ref.f90 b/flang/test/Lower/HLFIR/parent-component-ref.f90
index b08d8f450e6d71..7d04ef3b40a650 100644
--- a/flang/test/Lower/HLFIR/parent-component-ref.f90
+++ b/flang/test/Lower/HLFIR/parent-component-ref.f90
@@ -1,5 +1,5 @@
 ! Test lowering of parent component references to HLFIR.
-! RUN: bbc -emit-hlfir -polymorphic-type -o - %s -I nw | FileCheck %s
+! RUN: bbc -emit-hlfir -o - %s -I nw | FileCheck %s
 
 module pc_types
   type t
diff --git a/flang/test/Lower/HLFIR/poly_expr_for_nonpoly_dummy.f90 b/flang/test/Lower/HLFIR/poly_expr_for_nonpoly_dummy.f90
index ec5be37f551529..f5ec7c35594bdc 100644
--- a/flang/test/Lower/HLFIR/poly_expr_for_nonpoly_dummy.f90
+++ b/flang/test/Lower/HLFIR/poly_expr_for_nonpoly_dummy.f90
@@ -1,6 +1,6 @@
 ! Test passing polymorphic expression for non-polymorphic contiguous
 ! dummy argument:
-! RUN: bbc -emit-hlfir --polymorphic-type -o - -I nowhere %s | FileCheck %s
+! RUN: bbc -emit-hlfir -o - -I nowhere %s | FileCheck %s
 
 module types
   type t
diff --git a/flang/test/Lower/HLFIR/polymorphic-expressions.f90 b/flang/test/Lower/HLFIR/polymorphic-expressions.f90
index 37e602895db395..d6934e6ae6bd41 100644
--- a/flang/test/Lower/HLFIR/polymorphic-expressions.f90
+++ b/flang/test/Lower/HLFIR/polymorphic-expressions.f90
@@ -1,4 +1,4 @@
-! RUN: bbc -emit-hlfir --polymorphic-type -o - %s -I nowhere | FileCheck %s
+! RUN: bbc -emit-hlfir -o - %s -I nowhere | FileCheck %s
 
 module polymorphic_expressions_types
   type t
diff --git a/flang/test/Lower/HLFIR/proc-pointer-comp-nopass.f90 b/flang/test/Lower/HLFIR/proc-pointer-comp-nopass.f90
index ebb310f581c101..6f41f678dd84a7 100644
--- a/flang/test/Lower/HLFIR/proc-pointer-comp-nopass.f90
+++ b/flang/test/Lower/HLFIR/proc-pointer-comp-nopass.f90
@@ -1,5 +1,5 @@
 ! Test lowering of NOPASS procedure pointers components.
-! RUN: bbc -emit-hlfir -polymorphic-type -o - %s | FileCheck %s
+! RUN: bbc -emit-hlfir -o - %s | FileCheck %s
 
 module proc_comp_defs
   interface
diff --git a/flang/test/Lower/HLFIR/proc-pointer-comp-pass.f90 b/flang/test/Lower/HLFIR/proc-pointer-comp-pass.f90
index 25e4393f9dac7c..247008e3a93df2 100644
--- a/flang/test/Lower/HLFIR/proc-pointer-comp-pass.f90
+++ b/flang/test/Lower/HLFIR/proc-pointer-comp-pass.f90
@@ -1,5 +1,5 @@
 ! Test lowering of PASS procedure pointers components.
-! RUN: bbc -emit-hlfir -polymorphic-type -o - %s | FileCheck %s
+! RUN: bbc -emit-hlfir -o - %s | FileCheck %s
 
 module m
   type t
diff --git a/flang/test/Lower/HLFIR/select-type-selector.f90 b/flang/test/Lower/HLFIR/select-type-selector.f90
index a9536c9ccee692..256f003dcbf697 100644
--- a/flang/test/Lower/HLFIR/select-type-selector.f90
+++ b/flang/test/Lower/HLFIR/select-type-selector.f90
@@ -14,7 +14,7 @@
 ! (16.9.109) applied to the corresponding dimension of selector. The upper bound of each dimension is one less
 ! than the sum of the lower bound and the extent.
 
-! RUN: bbc -emit-hlfir -polymorphic-type -I nowhere -o - %s | FileCheck %s
+! RUN: bbc -emit-hlfir -I nowhere -o - %s | FileCheck %s
 
 subroutine test()
   type t
diff --git a/flang/test/Lower/HLFIR/transpose.f90 b/flang/test/Lower/HLFIR/transpose.f90
index e63e1ec1d6890d..e37e83c7a50112 100644
--- a/flang/test/Lower/HLFIR/transpose.f90
+++ b/flang/test/Lower/HLFIR/transpose.f90
@@ -1,5 +1,5 @@
 ! Test lowering of TRANSPOSE intrinsic to HLFIR
-! RUN: bbc -emit-hlfir --polymorphic-type -o - %s 2>&1 | FileCheck %s
+! RUN: bbc -emit-hlfir -o - %s 2>&1 | FileCheck %s
 
 subroutine transpose1(m, res)
   integer :: m(1,2), res(2, 1)
diff --git a/flang/test/Lower/HLFIR/type-bound-call-mismatch.f90 b/flang/test/Lower/HLFIR/type-bound-call-mismatch.f90
index 6794d11ece42d2..2e0c72ccfe0484 100644
--- a/flang/test/Lower/HLFIR/type-bound-call-mismatch.f90
+++ b/flang/test/Lower/HLFIR/type-bound-call-mismatch.f90
@@ -1,6 +1,6 @@
 ! Test interface that lowering handles small interface mismatch with
 ! type bound procedures.
-! RUN: bbc -emit-hlfir --polymorphic-type %s -o - -I nw | FileCheck %s
+! RUN: bbc -emit-hlfir %s -o - -I nw | FileCheck %s
 
 module dispatch_mismatch
 type t
diff --git a/flang/test/Lower/HLFIR/vector-subscript-as-value.f90 b/flang/test/Lower/HLFIR/vector-subscript-as-value.f90
index d4026a37720f75..ee8ded197c9592 100644
--- a/flang/test/Lower/HLFIR/vector-subscript-as-value.f90
+++ b/flang/test/Lower/HLFIR/vector-subscript-as-value.f90
@@ -1,6 +1,6 @@
 ! Test lowering of vector subscript designators outside of the
 ! assignment left-and side and input IO context.
-! RUN: bbc -emit-hlfir -o - -I nw %s --polymorphic-type 2>&1 | FileCheck %s
+! RUN: bbc -emit-hlfir -o - -I nw %s 2>&1 | FileCheck %s
 
 subroutine foo(x, y)
   integer :: x(100)
diff --git a/flang/test/Lower/Intrinsics/c_ptr_eq_ne.f90 b/flang/test/Lower/Intrinsics/c_ptr_eq_ne.f90
new file mode 100644
index 00000000000000..38468739ead526
--- /dev/null
+++ b/flang/test/Lower/Intrinsics/c_ptr_eq_ne.f90
@@ -0,0 +1,56 @@
+! Test C_PTR_EQ and C_PTR_NE lowering.
+! RUN: bbc -emit-hlfir -o - %s | FileCheck %s
+
+function test_c_ptr_eq(ptr1, ptr2)
+  use, intrinsic :: iso_c_binding
+  type(c_ptr), intent(in) :: ptr1, ptr2
+  logical :: test_c_ptr_eq
+  test_c_ptr_eq = (ptr1 .eq. ptr2)
+end
+
+! CHECK-LABEL: func.func @_QPtest_c_ptr_eq(
+! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>> {fir.bindc_name = "ptr1"}, %[[ARG1:.*]]: !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>> {fir.bindc_name = "ptr2"}) -> !fir.logical<4> {
+! CHECK: %[[DECL_ARG0:.*]]:2 = hlfir.declare %[[ARG0]] {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QFtest_c_ptr_eqEptr1"} : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>) -> (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>, !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>)
+! CHECK: %[[DECL_ARG1:.*]]:2 = hlfir.declare %[[ARG1]] {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QFtest_c_ptr_eqEptr2"} : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>) -> (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>, !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>)
+! CHECK: %[[ALLOCA:.*]] = fir.alloca !fir.logical<4> {bindc_name = "test_c_ptr_eq", uniq_name = "_QFtest_c_ptr_eqEtest_c_ptr_eq"}
+! CHECK: %[[DECL_RET:.*]]:2 = hlfir.declare %[[ALLOCA]] {uniq_name = "_QFtest_c_ptr_eqEtest_c_ptr_eq"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+! CHECK: %[[FIELD_ADDRESS:.*]] = fir.field_index __address, !fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>
+! CHECK: %[[COORD_ADDRESS0:.*]] = fir.coordinate_of %[[DECL_ARG0]]#1, %[[FIELD_ADDRESS]] : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>, !fir.field) -> !fir.ref<i64>
+! CHECK: %[[ADDRESS0:.*]] = fir.load %[[COORD_ADDRESS0]] : !fir.ref<i64>
+! CHECK: %[[FIELD_ADDRESS:.*]] = fir.field_index __address, !fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>
+! CHECK: %[[COORD_ADDRESS1:.*]] = fir.coordinate_of %[[DECL_ARG1]]#1, %[[FIELD_ADDRESS]] : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>, !fir.field) -> !fir.ref<i64>
+! CHECK: %[[ADDRESS1:.*]] = fir.load %[[COORD_ADDRESS1]] : !fir.ref<i64>
+! CHECK: %[[CMP:.*]] = arith.cmpi eq, %[[ADDRESS0]], %[[ADDRESS1]] : i64
+! CHECK: %[[RES:.*]] = fir.convert %[[CMP]] : (i1) -> !fir.logical<4>
+! CHECK: %[[NO_REASSOC_RES:.*]] = hlfir.no_reassoc %[[RES]] : !fir.logical<4>
+! CHECK: hlfir.assign %[[NO_REASSOC_RES]] to %[[DECL_RET]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
+! CHECK: %[[LOAD_RET:.*]] = fir.load %[[DECL_RET]]#1 : !fir.ref<!fir.logical<4>>
+! CHECK: return %[[LOAD_RET]] : !fir.logical<4>
+! CHECK: }
+
+function test_c_ptr_ne(ptr1, ptr2)
+  use, intrinsic :: iso_c_binding
+  type(c_ptr), intent(in) :: ptr1, ptr2
+  logical :: test_c_ptr_ne
+  test_c_ptr_ne = (ptr1 .ne. ptr2)
+end
+
+! CHECK-LABEL: func.func @_QPtest_c_ptr_ne(
+! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>> {fir.bindc_name = "ptr1"}, %[[ARG1:.*]]: !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>> {fir.bindc_name = "ptr2"}) -> !fir.logical<4> {
+! CHECK: %[[DECL_ARG0:.*]]:2 = hlfir.declare %[[ARG0]] {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QFtest_c_ptr_neEptr1"} : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>) -> (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>, !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>)
+! CHECK: %[[DECL_ARG1:.*]]:2 = hlfir.declare %[[ARG1]] {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QFtest_c_ptr_neEptr2"} : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>) -> (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>, !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>)
+! CHECK: %[[ALLOCA:.*]] = fir.alloca !fir.logical<4> {bindc_name = "test_c_ptr_ne", uniq_name = "_QFtest_c_ptr_neEtest_c_ptr_ne"}
+! CHECK: %[[DECL_RET:.*]]:2 = hlfir.declare %[[ALLOCA]] {uniq_name = "_QFtest_c_ptr_neEtest_c_ptr_ne"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+! CHECK: %[[FIELD_ADDRESS:.*]] = fir.field_index __address, !fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>
+! CHECK: %[[COORD_ADDRESS0:.*]] = fir.coordinate_of %[[DECL_ARG0]]#1, %[[FIELD_ADDRESS]] : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>, !fir.field) -> !fir.ref<i64>
+! CHECK: %[[ADDRESS0:.*]] = fir.load %[[COORD_ADDRESS0]] : !fir.ref<i64>
+! CHECK: %[[FIELD_ADDRESS:.*]] = fir.field_index __address, !fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>
+! CHECK: %[[COORD_ADDRESS1:.*]] = fir.coordinate_of %[[DECL_ARG1]]#1, %[[FIELD_ADDRESS]] : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>, !fir.field) -> !fir.ref<i64>
+! CHECK: %[[ADDRESS1:.*]] = fir.load %[[COORD_ADDRESS1]] : !fir.ref<i64>
+! CHECK: %[[CMP:.*]] = arith.cmpi ne, %[[ADDRESS0]], %[[ADDRESS1]] : i64
+! CHECK: %[[RES:.*]] = fir.convert %[[CMP]] : (i1) -> !fir.logical<4>
+! CHECK: %[[NO_REASSOC_RES:.*]] = hlfir.no_reassoc %[[RES]] : !fir.logical<4>
+! CHECK: hlfir.assign %[[NO_REASSOC_RES]] to %[[DECL_RET]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
+! CHECK: %[[LOAD_RET:.*]] = fir.load %[[DECL_RET]]#1 : !fir.ref<!fir.logical<4>>
+! CHECK: return %[[LOAD_RET]] : !fir.logical<4>
+! CHECK: }
diff --git a/flang/test/Lower/Intrinsics/extends_type_of.f90 b/flang/test/Lower/Intrinsics/extends_type_of.f90
index b36d4e8b7c0d4e..f99a63e30a552d 100644
--- a/flang/test/Lower/Intrinsics/extends_type_of.f90
+++ b/flang/test/Lower/Intrinsics/extends_type_of.f90
@@ -1,4 +1,4 @@
-! RUN: bbc -emit-fir -hlfir=false -polymorphic-type %s -o - | FileCheck %s
+! RUN: bbc -emit-fir -hlfir=false %s -o - | FileCheck %s
 
 module extends_type_of_mod
 
diff --git a/flang/test/Lower/Intrinsics/modulo.f90 b/flang/test/Lower/Intrinsics/modulo.f90
index 001e307aa077a3..383cb34f83c705 100644
--- a/flang/test/Lower/Intrinsics/modulo.f90
+++ b/flang/test/Lower/Intrinsics/modulo.f90
@@ -40,17 +40,6 @@ subroutine modulo_testi(r, a, p)
 ! CHECK-SAME: %[[arg0:.*]]: !fir.ref<f128>{{.*}}, %[[arg1:.*]]: !fir.ref<f128>{{.*}}, %[[arg2:.*]]: !fir.ref<f128>{{.*}}) {
 subroutine modulo_testr16(r, a, p)
   real(16) :: r, a, p
-  ! CHECK-DAG: %[[a:.*]] = fir.load %[[arg1]] : !fir.ref<f128>
-  ! CHECK-DAG: %[[p:.*]] = fir.load %[[arg2]] : !fir.ref<f128>
-  ! CHECK-DAG: %[[rem:.*]] = arith.remf %[[a]], %[[p]] {{.*}}: f128
-  ! CHECK-DAG: %[[zero:.*]] = arith.constant 0.000000e+00 : f128
-  ! CHECK-DAG: %[[remNotZero:.*]] = arith.cmpf une, %[[rem]], %[[zero]] {{.*}} : f128
-  ! CHECK-DAG: %[[aNeg:.*]] = arith.cmpf olt, %[[a]], %[[zero]] {{.*}} : f128
-  ! CHECK-DAG: %[[pNeg:.*]] = arith.cmpf olt, %[[p]], %[[zero]] {{.*}} : f128
-  ! CHECK-DAG: %[[signDifferent:.*]] = arith.xori %[[aNeg]], %[[pNeg]] : i1
-  ! CHECK-DAG: %[[mustAddP:.*]] = arith.andi %[[remNotZero]], %[[signDifferent]] : i1
-  ! CHECK-DAG: %[[remPlusP:.*]] = arith.addf %[[rem]], %[[p]] {{.*}}: f128
-  ! CHECK: %[[res:.*]] = arith.select %[[mustAddP]], %[[remPlusP]], %[[rem]] : f128
-  ! CHECK: fir.store %[[res]] to %[[arg0]] : !fir.ref<f128>
+  ! CHECK: fir.call @_FortranAModuloReal16({{.*}}){{.*}}: (f128, f128, !fir.ref<i8>, i32) -> f128
   r = modulo(a, p)
 end subroutine
diff --git a/flang/test/Lower/Intrinsics/same_type_as.f90 b/flang/test/Lower/Intrinsics/same_type_as.f90
index e1333024927d54..e0fda563f08bb1 100644
--- a/flang/test/Lower/Intrinsics/same_type_as.f90
+++ b/flang/test/Lower/Intrinsics/same_type_as.f90
@@ -1,4 +1,4 @@
-! RUN: bbc -emit-fir -hlfir=false -polymorphic-type %s -o - | FileCheck %s
+! RUN: bbc -emit-fir -hlfir=false %s -o - | FileCheck %s
 
 module same_type_as_mod
 
diff --git a/flang/test/Lower/Intrinsics/sizeof.f90 b/flang/test/Lower/Intrinsics/sizeof.f90
index 959ca1692b5144..e10cb79981a692 100644
--- a/flang/test/Lower/Intrinsics/sizeof.f90
+++ b/flang/test/Lower/Intrinsics/sizeof.f90
@@ -1,5 +1,5 @@
 ! Test SIZEOF lowering for polymorphic entities.
-! RUN: bbc -emit-hlfir --polymorphic-type -o - %s | FileCheck %s
+! RUN: bbc -emit-hlfir -o - %s | FileCheck %s
 
 integer(8) function test1(x)
   class(*) :: x
diff --git a/flang/test/Lower/Intrinsics/spread.f90 b/flang/test/Lower/Intrinsics/spread.f90
index 4cd823d8374539..d58725aba69874 100644
--- a/flang/test/Lower/Intrinsics/spread.f90
+++ b/flang/test/Lower/Intrinsics/spread.f90
@@ -1,4 +1,4 @@
-! RUN: bbc -emit-fir -hlfir=false -polymorphic-type %s -o - | FileCheck %s
+! RUN: bbc -emit-fir -hlfir=false %s -o - | FileCheck %s
 
 module spread_mod
 
diff --git a/flang/test/Lower/Intrinsics/storage_size.f90 b/flang/test/Lower/Intrinsics/storage_size.f90
index 668322f45bd851..b0c9d51f953286 100644
--- a/flang/test/Lower/Intrinsics/storage_size.f90
+++ b/flang/test/Lower/Intrinsics/storage_size.f90
@@ -1,4 +1,4 @@
-! RUN: bbc -emit-fir -hlfir=false -polymorphic-type %s -o - | FileCheck %s
+! RUN: bbc -emit-fir -hlfir=false %s -o - | FileCheck %s
 
 module storage_size_test
   type :: p1
diff --git a/flang/test/Lower/OpenMP/FIR/flush.f90 b/flang/test/Lower/OpenMP/FIR/flush.f90
index 2c281632b85cb0..2868367fbdba64 100644
--- a/flang/test/Lower/OpenMP/FIR/flush.f90
+++ b/flang/test/Lower/OpenMP/FIR/flush.f90
@@ -1,7 +1,7 @@
 ! This test checks lowering of OpenMP Flush Directive.
 
 !RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp %s -o - | FileCheck %s --check-prefixes="FIRDialect,OMPDialect"
-!RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp %s -o - | fir-opt --cfg-conversion | fir-opt --fir-to-llvm-ir | FileCheck %s --check-prefixes="LLVMIRDialect,OMPDialect"
+!RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp %s -o - | fir-opt --cfg-conversion-on-func-opt | fir-opt --fir-to-llvm-ir | FileCheck %s --check-prefixes="LLVMIRDialect,OMPDialect"
 
 subroutine flush_standalone(a, b, c)
     integer, intent(inout) :: a, b, c
diff --git a/flang/test/Lower/OpenMP/FIR/master.f90 b/flang/test/Lower/OpenMP/FIR/master.f90
index dd9910da2f4190..3bac582c7725a8 100644
--- a/flang/test/Lower/OpenMP/FIR/master.f90
+++ b/flang/test/Lower/OpenMP/FIR/master.f90
@@ -1,5 +1,5 @@
 !RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp %s -o - | FileCheck %s --check-prefixes="FIRDialect,OMPDialect"
-!RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp %s -o - | fir-opt --cfg-conversion | fir-opt --fir-to-llvm-ir | FileCheck %s --check-prefixes="OMPDialect"
+!RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp %s -o - | fir-opt --cfg-conversion-on-func-opt | fir-opt --fir-to-llvm-ir | FileCheck %s --check-prefixes="OMPDialect"
 
 !===============================================================================
 ! parallel construct with function call which has master construct internally
diff --git a/flang/test/Lower/OpenMP/FIR/parallel-sections.f90 b/flang/test/Lower/OpenMP/FIR/parallel-sections.f90
index 33fda178323f2b..78d73f038f0793 100644
--- a/flang/test/Lower/OpenMP/FIR/parallel-sections.f90
+++ b/flang/test/Lower/OpenMP/FIR/parallel-sections.f90
@@ -1,5 +1,5 @@
 !RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp %s -o - | FileCheck %s --check-prefixes="FIRDialect,OMPDialect"
-!RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp %s -o - | fir-opt --cfg-conversion | fir-opt --fir-to-llvm-ir | FileCheck %s --check-prefixes="OMPDialect,LLVMDialect"
+!RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp %s -o - | fir-opt --cfg-conversion-on-func-opt | fir-opt --fir-to-llvm-ir | FileCheck %s --check-prefixes="OMPDialect,LLVMDialect"
 
 !===============================================================================
 ! Parallel sections construct
diff --git a/flang/test/Lower/OpenMP/Todo/reduction-arrays.f90 b/flang/test/Lower/OpenMP/Todo/reduction-arrays.f90
deleted file mode 100644
index a21611faf248ca..00000000000000
--- a/flang/test/Lower/OpenMP/Todo/reduction-arrays.f90
+++ /dev/null
@@ -1,15 +0,0 @@
-! RUN: %not_todo_cmd bbc -emit-fir -fopenmp -o - %s 2>&1 | FileCheck %s
-! RUN: %not_todo_cmd %flang_fc1 -emit-fir -fopenmp -o - %s 2>&1 | FileCheck %s
-
-! CHECK: not yet implemented: Reduction of some types is not supported
-subroutine reduction_array(y)
-  integer :: x(100), y(100,100)
-  !$omp parallel
-  !$omp do reduction(+:x)
-  do i=1, 100
-    x = x + y(:,i)
-  end do
-  !$omp end do
-  !$omp end parallel
-  print *, x
-end subroutine
diff --git a/flang/test/Lower/OpenMP/delayed-privatization-allocatable-firstprivate.f90 b/flang/test/Lower/OpenMP/delayed-privatization-allocatable-firstprivate.f90
new file mode 100644
index 00000000000000..aa835f82b2730b
--- /dev/null
+++ b/flang/test/Lower/OpenMP/delayed-privatization-allocatable-firstprivate.f90
@@ -0,0 +1,36 @@
+! Test delayed privatization for allocatables: `firstprivate`.
+
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -mmlir --openmp-enable-delayed-privatization \
+! RUN:   -o - %s 2>&1 | FileCheck %s
+! RUN: bbc -emit-hlfir -fopenmp --openmp-enable-delayed-privatization -o - %s 2>&1 |\
+! RUN:   FileCheck %s
+
+subroutine delayed_privatization_allocatable
+  implicit none
+  integer, allocatable :: var1
+
+!$omp parallel firstprivate(var1)
+  var1 = 10
+!$omp end parallel
+end subroutine
+
+! CHECK-LABEL: omp.private {type = firstprivate}
+! CHECK-SAME: @[[PRIVATIZER_SYM:.*]] : [[TYPE:!fir.ref<!fir.box<!fir.heap<i32>>>]] alloc {
+
+! CHECK-NEXT: ^bb0(%[[PRIV_ARG:.*]]: [[TYPE]]):
+
+! CHECK: } copy {
+! CHECK: ^bb0(%[[PRIV_ORIG_ARG:.*]]: [[TYPE]], %[[PRIV_PRIV_ARG:.*]]: [[TYPE]]):
+
+! CHECK-NEXT:  %[[PRIV_BASE_VAL:.*]] = fir.load %[[PRIV_PRIV_ARG]]
+! CHECK-NEXT:  %[[PRIV_BASE_BOX:.*]] = fir.box_addr %[[PRIV_BASE_VAL]]
+! CHECK-NEXT:  %[[PRIV_BASE_ADDR:.*]] = fir.convert %[[PRIV_BASE_BOX]]
+! CHECK-NEXT:  %[[C0:.*]] = arith.constant 0 : i64
+! CHECK-NEXT:  %[[COPY_COND:.*]] = arith.cmpi ne, %[[PRIV_BASE_ADDR]], %[[C0]] : i64
+
+! CHECK-NEXT:  fir.if %[[COPY_COND]] {
+! CHECK-NEXT:    %[[ORIG_BASE_VAL:.*]] = fir.load %[[PRIV_ORIG_ARG]]
+! CHECK-NEXT:    %[[ORIG_BASE_ADDR:.*]] = fir.box_addr %[[ORIG_BASE_VAL]]
+! CHECK-NEXT:    %[[ORIG_BASE_LD:.*]] = fir.load %[[ORIG_BASE_ADDR]]
+! CHECK-NEXT:    hlfir.assign %[[ORIG_BASE_LD]] to %[[PRIV_BASE_BOX]] temporary_lhs
+! CHECK-NEXT:  }
diff --git a/flang/test/Lower/OpenMP/delayed-privatization-allocatable-private.f90 b/flang/test/Lower/OpenMP/delayed-privatization-allocatable-private.f90
new file mode 100644
index 00000000000000..cc1818b00b809c
--- /dev/null
+++ b/flang/test/Lower/OpenMP/delayed-privatization-allocatable-private.f90
@@ -0,0 +1,43 @@
+! Test delayed privatization for allocatables: `private`.
+
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -mmlir --openmp-enable-delayed-privatization \
+! RUN:   -o - %s 2>&1 | FileCheck %s
+! RUN: bbc -emit-hlfir -fopenmp --openmp-enable-delayed-privatization -o - %s 2>&1 |\
+! RUN:   FileCheck %s
+
+subroutine delayed_privatization_allocatable
+  implicit none
+  integer, allocatable :: var1
+
+!$omp parallel private(var1)
+  var1 = 10
+!$omp end parallel
+end subroutine
+
+! CHECK-LABEL: omp.private {type = private}
+! CHECK-SAME: @[[PRIVATIZER_SYM:.*]] : [[TYPE:!fir.ref<!fir.box<!fir.heap<i32>>>]] alloc {
+
+! CHECK-NEXT: ^bb0(%[[PRIV_ARG:.*]]: [[TYPE]]):
+
+! CHECK-NEXT:   %[[PRIV_ALLOC:.*]] = fir.alloca !fir.box<!fir.heap<i32>> {bindc_name = "var1", pinned, uniq_name = "_QFdelayed_privatization_allocatableEvar1"}
+
+! CHECK-NEXT:   %[[PRIV_ARG_VAL:.*]] = fir.load %[[PRIV_ARG]] : !fir.ref<!fir.box<!fir.heap<i32>>>
+! CHECK-NEXT:   %[[PRIV_ARG_BOX:.*]] = fir.box_addr %[[PRIV_ARG_VAL]] : (!fir.box<!fir.heap<i32>>) -> !fir.heap<i32>
+! CHECK-NEXT:   %[[PRIV_ARG_ADDR:.*]] = fir.convert %[[PRIV_ARG_BOX]] : (!fir.heap<i32>) -> i64
+! CHECK-NEXT:   %[[C0:.*]] = arith.constant 0 : i64
+! CHECK-NEXT:   %[[ALLOC_COND:.*]] = arith.cmpi ne, %[[PRIV_ARG_ADDR]], %[[C0]] : i64
+
+! CHECK-NEXT:   fir.if %[[ALLOC_COND]] {
+! CHECK-NEXT:     %[[PRIV_ALLOCMEM:.*]] = fir.allocmem i32 {fir.must_be_heap = true, uniq_name = "_QFdelayed_privatization_allocatableEvar1.alloc"}
+! CHECK-NEXT:     %[[PRIV_ALLOCMEM_BOX:.*]] = fir.embox %[[PRIV_ALLOCMEM]] : (!fir.heap<i32>) -> !fir.box<!fir.heap<i32>>
+! CHECK-NEXT:     fir.store %[[PRIV_ALLOCMEM_BOX]] to %[[PRIV_ALLOC]] : !fir.ref<!fir.box<!fir.heap<i32>>>
+! CHECK-NEXT:   } else {
+! CHECK-NEXT:     %[[ZERO_BITS:.*]] = fir.zero_bits !fir.heap<i32>
+! CHECK-NEXT:     %[[ZERO_BOX:.*]] = fir.embox %[[ZERO_BITS]] : (!fir.heap<i32>) -> !fir.box<!fir.heap<i32>>
+! CHECK-NEXT:     fir.store %[[ZERO_BOX]] to %[[PRIV_ALLOC]] : !fir.ref<!fir.box<!fir.heap<i32>>>
+! CHECK-NEXT:   }
+
+! CHECK-NEXT:   %[[PRIV_DECL:.*]]:2 = hlfir.declare %[[PRIV_ALLOC]]
+! CHECK-NEXT:   omp.yield(%[[PRIV_DECL]]#0 : [[TYPE]])
+
+! CHECK-NEXT: }
diff --git a/flang/test/Lower/OpenMP/delayed-privatization-pointer.f90 b/flang/test/Lower/OpenMP/delayed-privatization-pointer.f90
new file mode 100644
index 00000000000000..796e4720c8c954
--- /dev/null
+++ b/flang/test/Lower/OpenMP/delayed-privatization-pointer.f90
@@ -0,0 +1,31 @@
+! Test delayed privatization for pointers: `private`.
+
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -mmlir --openmp-enable-delayed-privatization \
+! RUN:   -o - %s 2>&1 | FileCheck %s
+! RUN: bbc -emit-hlfir -fopenmp --openmp-enable-delayed-privatization -o - %s 2>&1 |\
+! RUN:   FileCheck %s
+
+subroutine delayed_privatization_pointer
+  implicit none
+  integer, pointer :: var1
+
+!$omp parallel firstprivate(var1)
+  var1 = 10
+!$omp end parallel
+end subroutine
+
+! CHECK-LABEL: omp.private {type = firstprivate}
+! CHECK-SAME: @[[PRIVATIZER_SYM:.*]] : [[TYPE:!fir.ref<!fir.box<!fir.ptr<i32>>>]] alloc {
+
+! CHECK-NEXT: ^bb0(%[[PRIV_ARG:.*]]: [[TYPE]]):
+
+! CHECK-NEXT:   %[[PRIV_ALLOC:.*]] = fir.alloca !fir.box<!fir.ptr<i32>> {bindc_name = "var1", pinned, uniq_name = "_QFdelayed_privatization_pointerEvar1"}
+! CHECK-NEXT:   %[[PRIV_DECL:.*]]:2 = hlfir.declare %[[PRIV_ALLOC]]
+! CHECK-NEXT:   omp.yield(%[[PRIV_DECL]]#0 : [[TYPE]])
+
+! CHECK-NEXT: } copy {
+! CHECK: ^bb0(%[[PRIV_ORIG_ARG:.*]]: [[TYPE]], %[[PRIV_PRIV_ARG:.*]]: [[TYPE]]):
+! CHECK-NEXT:    %[[ORIG_BASE_VAL:.*]] = fir.load %[[PRIV_ORIG_ARG]]
+ ! CHECK-NEXT:   fir.store %[[ORIG_BASE_VAL]] to %[[PRIV_PRIV_ARG]] : !fir.ref<!fir.box<!fir.ptr<i32>>>
+! CHECK-NEXT:   omp.yield(%[[PRIV_PRIV_ARG]] : [[TYPE]])
+! CHECK-NEXT: }
diff --git a/flang/test/Lower/OpenMP/parallel-reduction-array.f90 b/flang/test/Lower/OpenMP/parallel-reduction-array.f90
new file mode 100644
index 00000000000000..19c45863108512
--- /dev/null
+++ b/flang/test/Lower/OpenMP/parallel-reduction-array.f90
@@ -0,0 +1,74 @@
+! RUN: bbc -emit-hlfir -fopenmp -o - %s 2>&1 | FileCheck %s
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -o - %s 2>&1 | FileCheck %s
+
+program reduce
+integer, dimension(3) :: i = 0
+
+!$omp parallel reduction(+:i)
+i(1) = 1
+i(2) = 2
+i(3) = 3
+!$omp end parallel
+
+print *,i
+end program
+
+! CHECK-LABEL:   omp.reduction.declare @add_reduction_i_32_box_3_byref : !fir.ref<!fir.box<!fir.array<3xi32>>> init {
+! CHECK:         ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.array<3xi32>>>):
+! CHECK:           %[[VAL_1:.*]] = fir.alloca !fir.array<3xi32> {bindc_name = ".tmp"}
+! CHECK:           %[[VAL_2:.*]] = arith.constant 0 : i32
+! CHECK:           %[[VAL_3:.*]] = fir.load %[[VAL_0]] : !fir.ref<!fir.box<!fir.array<3xi32>>>
+! CHECK:           %[[VAL_4:.*]] = arith.constant 3 : index
+! CHECK:           %[[VAL_5:.*]] = fir.shape %[[VAL_4]] : (index) -> !fir.shape<1>
+! CHECK:           %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_5]]) {uniq_name = ".tmp"} : (!fir.ref<!fir.array<3xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<3xi32>>, !fir.ref<!fir.array<3xi32>>)
+! CHECK:           %[[VAL_7:.*]] = fir.embox %[[VAL_6]]#0(%[[VAL_5]]) : (!fir.ref<!fir.array<3xi32>>, !fir.shape<1>) -> !fir.box<!fir.array<3xi32>>
+! CHECK:           hlfir.assign %[[VAL_2]] to %[[VAL_7]] : i32, !fir.box<!fir.array<3xi32>>
+! CHECK:           %[[VAL_8:.*]] = fir.alloca !fir.box<!fir.array<3xi32>>
+! CHECK:           fir.store %[[VAL_7]] to %[[VAL_8]] : !fir.ref<!fir.box<!fir.array<3xi32>>>
+! CHECK:           omp.yield(%[[VAL_8]] : !fir.ref<!fir.box<!fir.array<3xi32>>>)
+! CHECK:         } combiner {
+! CHECK:         ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.array<3xi32>>>, %[[VAL_1:.*]]: !fir.ref<!fir.box<!fir.array<3xi32>>>):
+! CHECK:           %[[VAL_2:.*]] = fir.load %[[VAL_0]] : !fir.ref<!fir.box<!fir.array<3xi32>>>
+! CHECK:           %[[VAL_3:.*]] = fir.load %[[VAL_1]] : !fir.ref<!fir.box<!fir.array<3xi32>>>
+! CHECK:           %[[VAL_4:.*]] = arith.constant 0 : index
+! CHECK:           %[[VAL_5:.*]]:3 = fir.box_dims %[[VAL_2]], %[[VAL_4]] : (!fir.box<!fir.array<3xi32>>, index) -> (index, index, index)
+! CHECK:           %[[VAL_6:.*]] = fir.shape_shift %[[VAL_5]]#0, %[[VAL_5]]#1 : (index, index) -> !fir.shapeshift<1>
+! CHECK:           %[[VAL_7:.*]] = arith.constant 1 : index
+! CHECK:           fir.do_loop %[[VAL_8:.*]] = %[[VAL_7]] to %[[VAL_5]]#1 step %[[VAL_7]] unordered {
+! CHECK:             %[[VAL_9:.*]] = fir.array_coor %[[VAL_2]](%[[VAL_6]]) %[[VAL_8]] : (!fir.box<!fir.array<3xi32>>, !fir.shapeshift<1>, index) -> !fir.ref<i32>
+! CHECK:             %[[VAL_10:.*]] = fir.array_coor %[[VAL_3]](%[[VAL_6]]) %[[VAL_8]] : (!fir.box<!fir.array<3xi32>>, !fir.shapeshift<1>, index) -> !fir.ref<i32>
+! CHECK:             %[[VAL_11:.*]] = fir.load %[[VAL_9]] : !fir.ref<i32>
+! CHECK:             %[[VAL_12:.*]] = fir.load %[[VAL_10]] : !fir.ref<i32>
+! CHECK:             %[[VAL_13:.*]] = arith.addi %[[VAL_11]], %[[VAL_12]] : i32
+! CHECK:             fir.store %[[VAL_13]] to %[[VAL_9]] : !fir.ref<i32>
+! CHECK:           }
+! CHECK:           omp.yield(%[[VAL_0]] : !fir.ref<!fir.box<!fir.array<3xi32>>>)
+! CHECK:         }
+
+! CHECK-LABEL:   func.func @_QQmain()
+! CHECK:           %[[VAL_0:.*]] = fir.address_of(@_QFEi) : !fir.ref<!fir.array<3xi32>>
+! CHECK:           %[[VAL_1:.*]] = arith.constant 3 : index
+! CHECK:           %[[VAL_2:.*]] = fir.shape %[[VAL_1]] : (index) -> !fir.shape<1>
+! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_2]]) {uniq_name = "_QFEi"} : (!fir.ref<!fir.array<3xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<3xi32>>, !fir.ref<!fir.array<3xi32>>)
+! CHECK:           %[[VAL_4:.*]] = fir.embox %[[VAL_3]]#1(%[[VAL_2]]) : (!fir.ref<!fir.array<3xi32>>, !fir.shape<1>) -> !fir.box<!fir.array<3xi32>>
+! CHECK:           %[[VAL_5:.*]] = fir.alloca !fir.box<!fir.array<3xi32>>
+! CHECK:           fir.store %[[VAL_4]] to %[[VAL_5]] : !fir.ref<!fir.box<!fir.array<3xi32>>>
+! CHECK:           omp.parallel byref reduction(@add_reduction_i_32_box_3_byref %[[VAL_5]] -> %[[VAL_6:.*]] : !fir.ref<!fir.box<!fir.array<3xi32>>>) {
+! CHECK:             %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_6]] {uniq_name = "_QFEi"} : (!fir.ref<!fir.box<!fir.array<3xi32>>>) -> (!fir.ref<!fir.box<!fir.array<3xi32>>>, !fir.ref<!fir.box<!fir.array<3xi32>>>)
+! CHECK:             %[[VAL_8:.*]] = arith.constant 1 : i32
+! CHECK:             %[[VAL_9:.*]] = fir.load %[[VAL_7]]#0 : !fir.ref<!fir.box<!fir.array<3xi32>>>
+! CHECK:             %[[VAL_10:.*]] = arith.constant 1 : index
+! CHECK:             %[[VAL_11:.*]] = hlfir.designate %[[VAL_9]] (%[[VAL_10]])  : (!fir.box<!fir.array<3xi32>>, index) -> !fir.ref<i32>
+! CHECK:             hlfir.assign %[[VAL_8]] to %[[VAL_11]] : i32, !fir.ref<i32>
+! CHECK:             %[[VAL_12:.*]] = arith.constant 2 : i32
+! CHECK:             %[[VAL_13:.*]] = fir.load %[[VAL_7]]#0 : !fir.ref<!fir.box<!fir.array<3xi32>>>
+! CHECK:             %[[VAL_14:.*]] = arith.constant 2 : index
+! CHECK:             %[[VAL_15:.*]] = hlfir.designate %[[VAL_13]] (%[[VAL_14]])  : (!fir.box<!fir.array<3xi32>>, index) -> !fir.ref<i32>
+! CHECK:             hlfir.assign %[[VAL_12]] to %[[VAL_15]] : i32, !fir.ref<i32>
+! CHECK:             %[[VAL_16:.*]] = arith.constant 3 : i32
+! CHECK:             %[[VAL_17:.*]] = fir.load %[[VAL_7]]#0 : !fir.ref<!fir.box<!fir.array<3xi32>>>
+! CHECK:             %[[VAL_18:.*]] = arith.constant 3 : index
+! CHECK:             %[[VAL_19:.*]] = hlfir.designate %[[VAL_17]] (%[[VAL_18]])  : (!fir.box<!fir.array<3xi32>>, index) -> !fir.ref<i32>
+! CHECK:             hlfir.assign %[[VAL_16]] to %[[VAL_19]] : i32, !fir.ref<i32>
+! CHECK:             omp.terminator
+! CHECK:           }
diff --git a/flang/test/Lower/OpenMP/parallel-reduction-array2.f90 b/flang/test/Lower/OpenMP/parallel-reduction-array2.f90
new file mode 100644
index 00000000000000..04c4045a6729ea
--- /dev/null
+++ b/flang/test/Lower/OpenMP/parallel-reduction-array2.f90
@@ -0,0 +1,89 @@
+! RUN: bbc -emit-hlfir -fopenmp -o - %s 2>&1 | FileCheck %s
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -o - %s 2>&1 | FileCheck %s
+
+program reduce
+integer, dimension(3) :: i = 0
+
+!$omp parallel reduction(+:i)
+i(1) = i(1) + 1
+i(2) = i(2) + 2
+i(3) = i(3) + 3
+!$omp end parallel
+
+print *,i
+end program
+
+! CHECK-LABEL:   omp.reduction.declare @add_reduction_i_32_box_3_byref : !fir.ref<!fir.box<!fir.array<3xi32>>> init {
+! CHECK:         ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.array<3xi32>>>):
+! CHECK:           %[[VAL_1:.*]] = fir.alloca !fir.array<3xi32> {bindc_name = ".tmp"}
+! CHECK:           %[[VAL_2:.*]] = arith.constant 0 : i32
+! CHECK:           %[[VAL_3:.*]] = fir.load %[[VAL_0]] : !fir.ref<!fir.box<!fir.array<3xi32>>>
+! CHECK:           %[[VAL_4:.*]] = arith.constant 3 : index
+! CHECK:           %[[VAL_5:.*]] = fir.shape %[[VAL_4]] : (index) -> !fir.shape<1>
+! CHECK:           %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_5]]) {uniq_name = ".tmp"} : (!fir.ref<!fir.array<3xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<3xi32>>, !fir.ref<!fir.array<3xi32>>)
+! CHECK:           %[[VAL_7:.*]] = fir.embox %[[VAL_6]]#0(%[[VAL_5]]) : (!fir.ref<!fir.array<3xi32>>, !fir.shape<1>) -> !fir.box<!fir.array<3xi32>>
+! CHECK:           hlfir.assign %[[VAL_2]] to %[[VAL_7]] : i32, !fir.box<!fir.array<3xi32>>
+! CHECK:           %[[VAL_8:.*]] = fir.alloca !fir.box<!fir.array<3xi32>>
+! CHECK:           fir.store %[[VAL_7]] to %[[VAL_8]] : !fir.ref<!fir.box<!fir.array<3xi32>>>
+! CHECK:           omp.yield(%[[VAL_8]] : !fir.ref<!fir.box<!fir.array<3xi32>>>)
+! CHECK:         } combiner {
+! CHECK:         ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.array<3xi32>>>, %[[VAL_1:.*]]: !fir.ref<!fir.box<!fir.array<3xi32>>>):
+! CHECK:           %[[VAL_2:.*]] = fir.load %[[VAL_0]] : !fir.ref<!fir.box<!fir.array<3xi32>>>
+! CHECK:           %[[VAL_3:.*]] = fir.load %[[VAL_1]] : !fir.ref<!fir.box<!fir.array<3xi32>>>
+! CHECK:           %[[VAL_4:.*]] = arith.constant 0 : index
+! CHECK:           %[[VAL_5:.*]]:3 = fir.box_dims %[[VAL_2]], %[[VAL_4]] : (!fir.box<!fir.array<3xi32>>, index) -> (index, index, index)
+! CHECK:           %[[VAL_6:.*]] = fir.shape_shift %[[VAL_5]]#0, %[[VAL_5]]#1 : (index, index) -> !fir.shapeshift<1>
+! CHECK:           %[[VAL_7:.*]] = arith.constant 1 : index
+! CHECK:           fir.do_loop %[[VAL_8:.*]] = %[[VAL_7]] to %[[VAL_5]]#1 step %[[VAL_7]] unordered {
+! CHECK:             %[[VAL_9:.*]] = fir.array_coor %[[VAL_2]](%[[VAL_6]]) %[[VAL_8]] : (!fir.box<!fir.array<3xi32>>, !fir.shapeshift<1>, index) -> !fir.ref<i32>
+! CHECK:             %[[VAL_10:.*]] = fir.array_coor %[[VAL_3]](%[[VAL_6]]) %[[VAL_8]] : (!fir.box<!fir.array<3xi32>>, !fir.shapeshift<1>, index) -> !fir.ref<i32>
+! CHECK:             %[[VAL_11:.*]] = fir.load %[[VAL_9]] : !fir.ref<i32>
+! CHECK:             %[[VAL_12:.*]] = fir.load %[[VAL_10]] : !fir.ref<i32>
+! CHECK:             %[[VAL_13:.*]] = arith.addi %[[VAL_11]], %[[VAL_12]] : i32
+! CHECK:             fir.store %[[VAL_13]] to %[[VAL_9]] : !fir.ref<i32>
+! CHECK:           }
+! CHECK:           omp.yield(%[[VAL_0]] : !fir.ref<!fir.box<!fir.array<3xi32>>>)
+! CHECK:         }
+
+! CHECK-LABEL:   func.func @_QQmain() attributes {fir.bindc_name = "reduce"} {
+! CHECK:           %[[VAL_0:.*]] = fir.address_of(@_QFEi) : !fir.ref<!fir.array<3xi32>>
+! CHECK:           %[[VAL_1:.*]] = arith.constant 3 : index
+! CHECK:           %[[VAL_2:.*]] = fir.shape %[[VAL_1]] : (index) -> !fir.shape<1>
+! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_2]]) {uniq_name = "_QFEi"} : (!fir.ref<!fir.array<3xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<3xi32>>, !fir.ref<!fir.array<3xi32>>)
+! CHECK:           %[[VAL_4:.*]] = fir.embox %[[VAL_3]]#1(%[[VAL_2]]) : (!fir.ref<!fir.array<3xi32>>, !fir.shape<1>) -> !fir.box<!fir.array<3xi32>>
+! CHECK:           %[[VAL_5:.*]] = fir.alloca !fir.box<!fir.array<3xi32>>
+! CHECK:           fir.store %[[VAL_4]] to %[[VAL_5]] : !fir.ref<!fir.box<!fir.array<3xi32>>>
+! CHECK:           omp.parallel byref reduction(@add_reduction_i_32_box_3_byref %[[VAL_5]] -> %[[VAL_6:.*]] : !fir.ref<!fir.box<!fir.array<3xi32>>>) {
+! CHECK:             %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_6]] {uniq_name = "_QFEi"} : (!fir.ref<!fir.box<!fir.array<3xi32>>>) -> (!fir.ref<!fir.box<!fir.array<3xi32>>>, !fir.ref<!fir.box<!fir.array<3xi32>>>)
+! CHECK:             %[[VAL_8:.*]] = fir.load %[[VAL_7]]#0 : !fir.ref<!fir.box<!fir.array<3xi32>>>
+! CHECK:             %[[VAL_9:.*]] = arith.constant 1 : index
+! CHECK:             %[[VAL_10:.*]] = hlfir.designate %[[VAL_8]] (%[[VAL_9]])  : (!fir.box<!fir.array<3xi32>>, index) -> !fir.ref<i32>
+! CHECK:             %[[VAL_11:.*]] = fir.load %[[VAL_10]] : !fir.ref<i32>
+! CHECK:             %[[VAL_12:.*]] = arith.constant 1 : i32
+! CHECK:             %[[VAL_13:.*]] = arith.addi %[[VAL_11]], %[[VAL_12]] : i32
+! CHECK:             %[[VAL_14:.*]] = fir.load %[[VAL_7]]#0 : !fir.ref<!fir.box<!fir.array<3xi32>>>
+! CHECK:             %[[VAL_15:.*]] = arith.constant 1 : index
+! CHECK:             %[[VAL_16:.*]] = hlfir.designate %[[VAL_14]] (%[[VAL_15]])  : (!fir.box<!fir.array<3xi32>>, index) -> !fir.ref<i32>
+! CHECK:             hlfir.assign %[[VAL_13]] to %[[VAL_16]] : i32, !fir.ref<i32>
+! CHECK:             %[[VAL_17:.*]] = fir.load %[[VAL_7]]#0 : !fir.ref<!fir.box<!fir.array<3xi32>>>
+! CHECK:             %[[VAL_18:.*]] = arith.constant 2 : index
+! CHECK:             %[[VAL_19:.*]] = hlfir.designate %[[VAL_17]] (%[[VAL_18]])  : (!fir.box<!fir.array<3xi32>>, index) -> !fir.ref<i32>
+! CHECK:             %[[VAL_20:.*]] = fir.load %[[VAL_19]] : !fir.ref<i32>
+! CHECK:             %[[VAL_21:.*]] = arith.constant 2 : i32
+! CHECK:             %[[VAL_22:.*]] = arith.addi %[[VAL_20]], %[[VAL_21]] : i32
+! CHECK:             %[[VAL_23:.*]] = fir.load %[[VAL_7]]#0 : !fir.ref<!fir.box<!fir.array<3xi32>>>
+! CHECK:             %[[VAL_24:.*]] = arith.constant 2 : index
+! CHECK:             %[[VAL_25:.*]] = hlfir.designate %[[VAL_23]] (%[[VAL_24]])  : (!fir.box<!fir.array<3xi32>>, index) -> !fir.ref<i32>
+! CHECK:             hlfir.assign %[[VAL_22]] to %[[VAL_25]] : i32, !fir.ref<i32>
+! CHECK:             %[[VAL_26:.*]] = fir.load %[[VAL_7]]#0 : !fir.ref<!fir.box<!fir.array<3xi32>>>
+! CHECK:             %[[VAL_27:.*]] = arith.constant 3 : index
+! CHECK:             %[[VAL_28:.*]] = hlfir.designate %[[VAL_26]] (%[[VAL_27]])  : (!fir.box<!fir.array<3xi32>>, index) -> !fir.ref<i32>
+! CHECK:             %[[VAL_29:.*]] = fir.load %[[VAL_28]] : !fir.ref<i32>
+! CHECK:             %[[VAL_30:.*]] = arith.constant 3 : i32
+! CHECK:             %[[VAL_31:.*]] = arith.addi %[[VAL_29]], %[[VAL_30]] : i32
+! CHECK:             %[[VAL_32:.*]] = fir.load %[[VAL_7]]#0 : !fir.ref<!fir.box<!fir.array<3xi32>>>
+! CHECK:             %[[VAL_33:.*]] = arith.constant 3 : index
+! CHECK:             %[[VAL_34:.*]] = hlfir.designate %[[VAL_32]] (%[[VAL_33]])  : (!fir.box<!fir.array<3xi32>>, index) -> !fir.ref<i32>
+! CHECK:             hlfir.assign %[[VAL_31]] to %[[VAL_34]] : i32, !fir.ref<i32>
+! CHECK:             omp.terminator
+! CHECK:           }
diff --git a/flang/test/Lower/OpenMP/parallel-reduction-rename.f90 b/flang/test/Lower/OpenMP/parallel-reduction-rename.f90
new file mode 100644
index 00000000000000..86db2ffbcfeba3
--- /dev/null
+++ b/flang/test/Lower/OpenMP/parallel-reduction-rename.f90
@@ -0,0 +1,36 @@
+! RUN: bbc -emit-hlfir -fopenmp -o - %s | FileCheck %s
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -o - %s | FileCheck %s
+
+module m1
+  intrinsic max
+end module m1
+program main
+  use m1, ren=>max
+  n=0
+  !$omp parallel reduction(ren:n)
+  print *, "par"
+  !$omp end parallel
+end program main
+
+! test that we understood that this should be a max reduction
+
+! CHECK-LABEL:   omp.reduction.declare @max_i_32 : i32 init {
+! CHECK:         ^bb0(%[[VAL_0:.*]]: i32):
+! CHECK:           %[[VAL_1:.*]] = arith.constant -2147483648 : i32
+! CHECK:           omp.yield(%[[VAL_1]] : i32)
+
+! CHECK-LABEL:   } combiner {
+! CHECK:         ^bb0(%[[VAL_0:.*]]: i32, %[[VAL_1:.*]]: i32):
+! CHECK:           %[[VAL_2:.*]] = arith.maxsi %[[VAL_0]], %[[VAL_1]] : i32
+! CHECK:           omp.yield(%[[VAL_2]] : i32)
+! CHECK:         }
+
+! CHECK-LABEL:   func.func @_QQmain() attributes {fir.bindc_name = "main"} {
+! CHECK:           %[[VAL_0:.*]] = fir.alloca i32 {bindc_name = "n", uniq_name = "_QFEn"}
+! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFEn"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_2:.*]] = arith.constant 0 : i32
+! CHECK:           hlfir.assign %[[VAL_2]] to %[[VAL_1]]#0 : i32, !fir.ref<i32>
+! CHECK:           omp.parallel reduction(@max_i_32 %[[VAL_1]]#0 -> %[[VAL_3:.*]] : !fir.ref<i32>) {
+! ...
+! CHECK:             omp.terminator
+
diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-array.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-array.f90
new file mode 100644
index 00000000000000..4350ace820c264
--- /dev/null
+++ b/flang/test/Lower/OpenMP/wsloop-reduction-array.f90
@@ -0,0 +1,84 @@
+! RUN: bbc -emit-hlfir -fopenmp -o - %s 2>&1 | FileCheck %s
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -o - %s 2>&1 | FileCheck %s
+program reduce
+integer :: i = 0
+integer, dimension(2) :: r = 0
+
+!$omp parallel do reduction(+:r)
+do i=0,10
+  r(1) = i
+  r(2) = -i
+enddo
+!$omp end parallel do
+
+print *,r
+end program
+
+! CHECK-LABEL   omp.reduction.declare @add_reduction_i_32_box_2_byref : !fir.ref<!fir.box<!fir.array<2xi32>>> init {
+! CHECK:         ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.array<2xi32>>>):
+! CHECK:           %[[VAL_1:.*]] = fir.alloca !fir.array<2xi32> {bindc_name = ".tmp"}
+! CHECK:           %[[VAL_2:.*]] = arith.constant 0 : i32
+! CHECK:           %[[VAL_3:.*]] = fir.load %[[VAL_0]] : !fir.ref<!fir.box<!fir.array<2xi32>>>
+! CHECK:           %[[VAL_4:.*]] = arith.constant 2 : index
+! CHECK:           %[[VAL_5:.*]] = fir.shape %[[VAL_4]] : (index) -> !fir.shape<1>
+! CHECK:           %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_5]]) {uniq_name = ".tmp"} : (!fir.ref<!fir.array<2xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<2xi32>>, !fir.ref<!fir.array<2xi32>>)
+! CHECK:           %[[VAL_7:.*]] = fir.embox %[[VAL_6]]#0(%[[VAL_5]]) : (!fir.ref<!fir.array<2xi32>>, !fir.shape<1>) -> !fir.box<!fir.array<2xi32>>
+! CHECK:           hlfir.assign %[[VAL_2]] to %[[VAL_7]] : i32, !fir.box<!fir.array<2xi32>>
+! CHECK:           %[[VAL_8:.*]] = fir.alloca !fir.box<!fir.array<2xi32>>
+! CHECK:           fir.store %[[VAL_7]] to %[[VAL_8]] : !fir.ref<!fir.box<!fir.array<2xi32>>>
+! CHECK:           omp.yield(%[[VAL_8]] : !fir.ref<!fir.box<!fir.array<2xi32>>>)
+
+! CHECK-LABEL   } combiner {
+! CHECK:         ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.array<2xi32>>>, %[[VAL_1:.*]]: !fir.ref<!fir.box<!fir.array<2xi32>>>):
+! CHECK:           %[[VAL_2:.*]] = fir.load %[[VAL_0]] : !fir.ref<!fir.box<!fir.array<2xi32>>>
+! CHECK:           %[[VAL_3:.*]] = fir.load %[[VAL_1]] : !fir.ref<!fir.box<!fir.array<2xi32>>>
+! CHECK:           %[[VAL_4:.*]] = arith.constant 0 : index
+! CHECK:           %[[VAL_5:.*]]:3 = fir.box_dims %[[VAL_2]], %[[VAL_4]] : (!fir.box<!fir.array<2xi32>>, index) -> (index, index, index)
+! CHECK:           %[[VAL_6:.*]] = fir.shape_shift %[[VAL_5]]#0, %[[VAL_5]]#1 : (index, index) -> !fir.shapeshift<1>
+! CHECK:           %[[VAL_7:.*]] = arith.constant 1 : index
+! CHECK:           fir.do_loop %[[VAL_8:.*]] = %[[VAL_7]] to %[[VAL_5]]#1 step %[[VAL_7]] unordered {
+! CHECK:             %[[VAL_9:.*]] = fir.array_coor %[[VAL_2]](%[[VAL_6]]) %[[VAL_8]] : (!fir.box<!fir.array<2xi32>>, !fir.shapeshift<1>, index) -> !fir.ref<i32>
+! CHECK:             %[[VAL_10:.*]] = fir.array_coor %[[VAL_3]](%[[VAL_6]]) %[[VAL_8]] : (!fir.box<!fir.array<2xi32>>, !fir.shapeshift<1>, index) -> !fir.ref<i32>
+! CHECK:             %[[VAL_11:.*]] = fir.load %[[VAL_9]] : !fir.ref<i32>
+! CHECK:             %[[VAL_12:.*]] = fir.load %[[VAL_10]] : !fir.ref<i32>
+! CHECK:             %[[VAL_13:.*]] = arith.addi %[[VAL_11]], %[[VAL_12]] : i32
+! CHECK:             fir.store %[[VAL_13]] to %[[VAL_9]] : !fir.ref<i32>
+! CHECK:           }
+! CHECK:           omp.yield(%[[VAL_0]] : !fir.ref<!fir.box<!fir.array<2xi32>>>)
+! CHECK:         }
+
+! CHECK-LABEL   func.func @_QQmain() attributes {fir.bindc_name = "reduce"} {
+! CHECK:           %[[VAL_0:.*]] = fir.address_of(@_QFEi) : !fir.ref<i32>
+! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_2:.*]] = fir.address_of(@_QFEr) : !fir.ref<!fir.array<2xi32>>
+! CHECK:           %[[VAL_3:.*]] = arith.constant 2 : index
+! CHECK:           %[[VAL_4:.*]] = fir.shape %[[VAL_3]] : (index) -> !fir.shape<1>
+! CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_2]](%[[VAL_4]]) {uniq_name = "_QFEr"} : (!fir.ref<!fir.array<2xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<2xi32>>, !fir.ref<!fir.array<2xi32>>)
+! CHECK:           omp.parallel {
+! CHECK:             %[[VAL_6:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+! CHECK:             %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_6]] {uniq_name = "_QFEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:             %[[VAL_8:.*]] = arith.constant 0 : i32
+! CHECK:             %[[VAL_9:.*]] = arith.constant 10 : i32
+! CHECK:             %[[VAL_10:.*]] = arith.constant 1 : i32
+! CHECK:             %[[VAL_11:.*]] = fir.embox %[[VAL_5]]#1(%[[VAL_4]]) : (!fir.ref<!fir.array<2xi32>>, !fir.shape<1>) -> !fir.box<!fir.array<2xi32>>
+! CHECK:             %[[VAL_12:.*]] = fir.alloca !fir.box<!fir.array<2xi32>>
+! CHECK:             fir.store %[[VAL_11]] to %[[VAL_12]] : !fir.ref<!fir.box<!fir.array<2xi32>>>
+! CHECK:             omp.wsloop byref reduction(@add_reduction_i_32_box_2_byref %[[VAL_12]] -> %[[VAL_13:.*]] : !fir.ref<!fir.box<!fir.array<2xi32>>>)  for  (%[[VAL_14:.*]]) : i32 = (%[[VAL_8]]) to (%[[VAL_9]]) inclusive step (%[[VAL_10]]) {
+! CHECK:               fir.store %[[VAL_14]] to %[[VAL_7]]#1 : !fir.ref<i32>
+! CHECK:               %[[VAL_15:.*]]:2 = hlfir.declare %[[VAL_13]] {uniq_name = "_QFEr"} : (!fir.ref<!fir.box<!fir.array<2xi32>>>) -> (!fir.ref<!fir.box<!fir.array<2xi32>>>, !fir.ref<!fir.box<!fir.array<2xi32>>>)
+! CHECK:               %[[VAL_16:.*]] = fir.load %[[VAL_7]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_17:.*]] = fir.load %[[VAL_15]]#0 : !fir.ref<!fir.box<!fir.array<2xi32>>>
+! CHECK:               %[[VAL_18:.*]] = arith.constant 1 : index
+! CHECK:               %[[VAL_19:.*]] = hlfir.designate %[[VAL_17]] (%[[VAL_18]])  : (!fir.box<!fir.array<2xi32>>, index) -> !fir.ref<i32>
+! CHECK:               hlfir.assign %[[VAL_16]] to %[[VAL_19]] : i32, !fir.ref<i32>
+! CHECK:               %[[VAL_20:.*]] = fir.load %[[VAL_7]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_21:.*]] = arith.constant 0 : i32
+! CHECK:               %[[VAL_22:.*]] = arith.subi %[[VAL_21]], %[[VAL_20]] : i32
+! CHECK:               %[[VAL_23:.*]] = fir.load %[[VAL_15]]#0 : !fir.ref<!fir.box<!fir.array<2xi32>>>
+! CHECK:               %[[VAL_24:.*]] = arith.constant 2 : index
+! CHECK:               %[[VAL_25:.*]] = hlfir.designate %[[VAL_23]] (%[[VAL_24]])  : (!fir.box<!fir.array<2xi32>>, index) -> !fir.ref<i32>
+! CHECK:               hlfir.assign %[[VAL_22]] to %[[VAL_25]] : i32, !fir.ref<i32>
+! CHECK:               omp.yield
+! CHECK:             }
+! CHECK:             omp.terminator
+! CHECK:           }
diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-array2.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-array2.f90
new file mode 100644
index 00000000000000..8543c58dec4d50
--- /dev/null
+++ b/flang/test/Lower/OpenMP/wsloop-reduction-array2.f90
@@ -0,0 +1,92 @@
+! RUN: bbc -emit-hlfir -fopenmp -o - %s 2>&1 | FileCheck %s
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -o - %s 2>&1 | FileCheck %s
+program reduce
+integer :: i = 0
+integer, dimension(2) :: r = 0
+
+!$omp parallel do reduction(+:r)
+do i=0,10
+  r(1) = r(1) + i
+  r(2) = r(2) - i
+enddo
+!$omp end parallel do
+
+print *,r
+end program
+
+! CHECK-LABEL   omp.reduction.declare @add_reduction_i_32_box_2_byref : !fir.ref<!fir.box<!fir.array<2xi32>>> init {
+! CHECK:         ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.array<2xi32>>>):
+! CHECK:           %[[VAL_1:.*]] = fir.alloca !fir.array<2xi32> {bindc_name = ".tmp"}
+! CHECK:           %[[VAL_2:.*]] = arith.constant 0 : i32
+! CHECK:           %[[VAL_3:.*]] = fir.load %[[VAL_0]] : !fir.ref<!fir.box<!fir.array<2xi32>>>
+! CHECK:           %[[VAL_4:.*]] = arith.constant 2 : index
+! CHECK:           %[[VAL_5:.*]] = fir.shape %[[VAL_4]] : (index) -> !fir.shape<1>
+! CHECK:           %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_5]]) {uniq_name = ".tmp"} : (!fir.ref<!fir.array<2xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<2xi32>>, !fir.ref<!fir.array<2xi32>>)
+! CHECK:           %[[VAL_7:.*]] = fir.embox %[[VAL_6]]#0(%[[VAL_5]]) : (!fir.ref<!fir.array<2xi32>>, !fir.shape<1>) -> !fir.box<!fir.array<2xi32>>
+! CHECK:           hlfir.assign %[[VAL_2]] to %[[VAL_7]] : i32, !fir.box<!fir.array<2xi32>>
+! CHECK:           %[[VAL_8:.*]] = fir.alloca !fir.box<!fir.array<2xi32>>
+! CHECK:           fir.store %[[VAL_7]] to %[[VAL_8]] : !fir.ref<!fir.box<!fir.array<2xi32>>>
+! CHECK:           omp.yield(%[[VAL_8]] : !fir.ref<!fir.box<!fir.array<2xi32>>>)
+
+! CHECK-LABEL   } combiner {
+! CHECK:         ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.array<2xi32>>>, %[[VAL_1:.*]]: !fir.ref<!fir.box<!fir.array<2xi32>>>):
+! CHECK:           %[[VAL_2:.*]] = fir.load %[[VAL_0]] : !fir.ref<!fir.box<!fir.array<2xi32>>>
+! CHECK:           %[[VAL_3:.*]] = fir.load %[[VAL_1]] : !fir.ref<!fir.box<!fir.array<2xi32>>>
+! CHECK:           %[[VAL_4:.*]] = arith.constant 0 : index
+! CHECK:           %[[VAL_5:.*]]:3 = fir.box_dims %[[VAL_2]], %[[VAL_4]] : (!fir.box<!fir.array<2xi32>>, index) -> (index, index, index)
+! CHECK:           %[[VAL_6:.*]] = fir.shape_shift %[[VAL_5]]#0, %[[VAL_5]]#1 : (index, index) -> !fir.shapeshift<1>
+! CHECK:           %[[VAL_7:.*]] = arith.constant 1 : index
+! CHECK:           fir.do_loop %[[VAL_8:.*]] = %[[VAL_7]] to %[[VAL_5]]#1 step %[[VAL_7]] unordered {
+! CHECK:             %[[VAL_9:.*]] = fir.array_coor %[[VAL_2]](%[[VAL_6]]) %[[VAL_8]] : (!fir.box<!fir.array<2xi32>>, !fir.shapeshift<1>, index) -> !fir.ref<i32>
+! CHECK:             %[[VAL_10:.*]] = fir.array_coor %[[VAL_3]](%[[VAL_6]]) %[[VAL_8]] : (!fir.box<!fir.array<2xi32>>, !fir.shapeshift<1>, index) -> !fir.ref<i32>
+! CHECK:             %[[VAL_11:.*]] = fir.load %[[VAL_9]] : !fir.ref<i32>
+! CHECK:             %[[VAL_12:.*]] = fir.load %[[VAL_10]] : !fir.ref<i32>
+! CHECK:             %[[VAL_13:.*]] = arith.addi %[[VAL_11]], %[[VAL_12]] : i32
+! CHECK:             fir.store %[[VAL_13]] to %[[VAL_9]] : !fir.ref<i32>
+! CHECK:           }
+! CHECK:           omp.yield(%[[VAL_0]] : !fir.ref<!fir.box<!fir.array<2xi32>>>)
+! CHECK:         }
+
+! CHECK-LABEL:   func.func @_QQmain() attributes {fir.bindc_name = "reduce"} {
+! CHECK:           %[[VAL_0:.*]] = fir.address_of(@_QFEi) : !fir.ref<i32>
+! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_2:.*]] = fir.address_of(@_QFEr) : !fir.ref<!fir.array<2xi32>>
+! CHECK:           %[[VAL_3:.*]] = arith.constant 2 : index
+! CHECK:           %[[VAL_4:.*]] = fir.shape %[[VAL_3]] : (index) -> !fir.shape<1>
+! CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_2]](%[[VAL_4]]) {uniq_name = "_QFEr"} : (!fir.ref<!fir.array<2xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<2xi32>>, !fir.ref<!fir.array<2xi32>>)
+! CHECK:           omp.parallel {
+! CHECK:             %[[VAL_6:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+! CHECK:             %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_6]] {uniq_name = "_QFEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:             %[[VAL_8:.*]] = arith.constant 0 : i32
+! CHECK:             %[[VAL_9:.*]] = arith.constant 10 : i32
+! CHECK:             %[[VAL_10:.*]] = arith.constant 1 : i32
+! CHECK:             %[[VAL_11:.*]] = fir.embox %[[VAL_5]]#1(%[[VAL_4]]) : (!fir.ref<!fir.array<2xi32>>, !fir.shape<1>) -> !fir.box<!fir.array<2xi32>>
+! CHECK:             %[[VAL_12:.*]] = fir.alloca !fir.box<!fir.array<2xi32>>
+! CHECK:             fir.store %[[VAL_11]] to %[[VAL_12]] : !fir.ref<!fir.box<!fir.array<2xi32>>>
+! CHECK:             omp.wsloop byref reduction(@add_reduction_i_32_box_2_byref %[[VAL_12]] -> %[[VAL_13:.*]] : !fir.ref<!fir.box<!fir.array<2xi32>>>)  for  (%[[VAL_14:.*]]) : i32 = (%[[VAL_8]]) to (%[[VAL_9]]) inclusive step (%[[VAL_10]]) {
+! CHECK:               fir.store %[[VAL_14]] to %[[VAL_7]]#1 : !fir.ref<i32>
+! CHECK:               %[[VAL_15:.*]]:2 = hlfir.declare %[[VAL_13]] {uniq_name = "_QFEr"} : (!fir.ref<!fir.box<!fir.array<2xi32>>>) -> (!fir.ref<!fir.box<!fir.array<2xi32>>>, !fir.ref<!fir.box<!fir.array<2xi32>>>)
+! CHECK:               %[[VAL_16:.*]] = fir.load %[[VAL_15]]#0 : !fir.ref<!fir.box<!fir.array<2xi32>>>
+! CHECK:               %[[VAL_17:.*]] = arith.constant 1 : index
+! CHECK:               %[[VAL_18:.*]] = hlfir.designate %[[VAL_16]] (%[[VAL_17]])  : (!fir.box<!fir.array<2xi32>>, index) -> !fir.ref<i32>
+! CHECK:               %[[VAL_19:.*]] = fir.load %[[VAL_18]] : !fir.ref<i32>
+! CHECK:               %[[VAL_20:.*]] = fir.load %[[VAL_7]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_21:.*]] = arith.addi %[[VAL_19]], %[[VAL_20]] : i32
+! CHECK:               %[[VAL_22:.*]] = fir.load %[[VAL_15]]#0 : !fir.ref<!fir.box<!fir.array<2xi32>>>
+! CHECK:               %[[VAL_23:.*]] = arith.constant 1 : index
+! CHECK:               %[[VAL_24:.*]] = hlfir.designate %[[VAL_22]] (%[[VAL_23]])  : (!fir.box<!fir.array<2xi32>>, index) -> !fir.ref<i32>
+! CHECK:               hlfir.assign %[[VAL_21]] to %[[VAL_24]] : i32, !fir.ref<i32>
+! CHECK:               %[[VAL_25:.*]] = fir.load %[[VAL_15]]#0 : !fir.ref<!fir.box<!fir.array<2xi32>>>
+! CHECK:               %[[VAL_26:.*]] = arith.constant 2 : index
+! CHECK:               %[[VAL_27:.*]] = hlfir.designate %[[VAL_25]] (%[[VAL_26]])  : (!fir.box<!fir.array<2xi32>>, index) -> !fir.ref<i32>
+! CHECK:               %[[VAL_28:.*]] = fir.load %[[VAL_27]] : !fir.ref<i32>
+! CHECK:               %[[VAL_29:.*]] = fir.load %[[VAL_7]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_30:.*]] = arith.subi %[[VAL_28]], %[[VAL_29]] : i32
+! CHECK:               %[[VAL_31:.*]] = fir.load %[[VAL_15]]#0 : !fir.ref<!fir.box<!fir.array<2xi32>>>
+! CHECK:               %[[VAL_32:.*]] = arith.constant 2 : index
+! CHECK:               %[[VAL_33:.*]] = hlfir.designate %[[VAL_31]] (%[[VAL_32]])  : (!fir.box<!fir.array<2xi32>>, index) -> !fir.ref<i32>
+! CHECK:               hlfir.assign %[[VAL_30]] to %[[VAL_33]] : i32, !fir.ref<i32>
+! CHECK:               omp.yield
+! CHECK:             }
+! CHECK:             omp.terminator
+! CHECK:           }
diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-min2.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-min2.f90
index 9289973bae2029..86fb067fc48eb7 100644
--- a/flang/test/Lower/OpenMP/wsloop-reduction-min2.f90
+++ b/flang/test/Lower/OpenMP/wsloop-reduction-min2.f90
@@ -17,8 +17,16 @@ program reduce
 
 end program
 
-! TODO: the reduction is not curently lowered correctly. This test is checking
-! that we do not crash and we still produce the same broken IR as before.
+! CHECK-LABEL:   omp.reduction.declare @min_i_32 : i32 init {
+! CHECK:         ^bb0(%[[VAL_0:.*]]: i32):
+! CHECK:           %[[VAL_1:.*]] = arith.constant 2147483647 : i32
+! CHECK:           omp.yield(%[[VAL_1]] : i32)
+
+! CHECK-LABEL:   } combiner {
+! CHECK:         ^bb0(%[[VAL_0:.*]]: i32, %[[VAL_1:.*]]: i32):
+! CHECK:           %[[VAL_2:.*]] = arith.minsi %[[VAL_0]], %[[VAL_1]] : i32
+! CHECK:           omp.yield(%[[VAL_2]] : i32)
+! CHECK:         }
 
 ! CHECK-LABEL:   func.func @_QQmain() attributes {fir.bindc_name = "reduce"} {
 ! CHECK:           %[[VAL_0:.*]] = fir.address_of(@_QFEi) : !fir.ref<i32>
@@ -31,10 +39,11 @@ program reduce
 ! CHECK:             %[[VAL_6:.*]] = arith.constant 0 : i32
 ! CHECK:             %[[VAL_7:.*]] = arith.constant 10 : i32
 ! CHECK:             %[[VAL_8:.*]] = arith.constant 1 : i32
-! CHECK:             omp.wsloop  for  (%[[VAL_9:.*]]) : i32 = (%[[VAL_6]]) to (%[[VAL_7]]) inclusive step (%[[VAL_8]]) {
-! CHECK:               fir.store %[[VAL_9]] to %[[VAL_5]]#1 : !fir.ref<i32>
-! CHECK:               %[[VAL_10:.*]] = fir.load %[[VAL_5]]#0 : !fir.ref<i32>
-! CHECK:               hlfir.assign %[[VAL_10]] to %[[VAL_3]]#0 : i32, !fir.ref<i32>
+! CHECK:             omp.wsloop reduction(@min_i_32 %[[VAL_3]]#0 -> %[[VAL_9:.*]] : !fir.ref<i32>)  for  (%[[VAL_10:.*]]) : i32 = (%[[VAL_6]]) to (%[[VAL_7]]) inclusive step (%[[VAL_8]]) {
+! CHECK:               fir.store %[[VAL_10]] to %[[VAL_5]]#1 : !fir.ref<i32>
+! CHECK:               %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_9]] {uniq_name = "_QFEr"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:               %[[VAL_12:.*]] = fir.load %[[VAL_5]]#0 : !fir.ref<i32>
+! CHECK:               hlfir.assign %[[VAL_12]] to %[[VAL_11]]#0 : i32, !fir.ref<i32>
 ! CHECK:               omp.yield
 ! CHECK:             }
 ! CHECK:             omp.terminator
diff --git a/flang/test/Lower/allocatable-polymorphic.f90 b/flang/test/Lower/allocatable-polymorphic.f90
index 6ba0b13b24b6c4..10d7d957a25732 100644
--- a/flang/test/Lower/allocatable-polymorphic.f90
+++ b/flang/test/Lower/allocatable-polymorphic.f90
@@ -1,5 +1,5 @@
-! RUN: bbc --use-desc-for-alloc=false -polymorphic-type -emit-hlfir %s -o - | FileCheck %s
-! RUN: bbc --use-desc-for-alloc=false -polymorphic-type -emit-hlfir %s -o - | tco | FileCheck %s --check-prefix=LLVM
+! RUN: bbc --use-desc-for-alloc=false -emit-hlfir %s -o - | FileCheck %s
+! RUN: bbc --use-desc-for-alloc=false -emit-hlfir %s -o - | tco | FileCheck %s --check-prefix=LLVM
 
 module poly
   type p1
diff --git a/flang/test/Lower/allocatable-return.f90 b/flang/test/Lower/allocatable-return.f90
index 92c5f8ad78d7cf..363d16237b9bfe 100644
--- a/flang/test/Lower/allocatable-return.f90
+++ b/flang/test/Lower/allocatable-return.f90
@@ -1,4 +1,4 @@
-! RUN: bbc -emit-fir -hlfir=false --polymorphic-type -I nowhere %s -o - | FileCheck %s
+! RUN: bbc -emit-fir -hlfir=false -I nowhere %s -o - | FileCheck %s
 
 ! Test allocatable return.
 ! Allocatable arrays must have default runtime lbounds after the return.
diff --git a/flang/test/Lower/assumed-type.f90 b/flang/test/Lower/assumed-type.f90
index 99472a255adb73..44ce41d7573790 100644
--- a/flang/test/Lower/assumed-type.f90
+++ b/flang/test/Lower/assumed-type.f90
@@ -1,4 +1,4 @@
-! RUN: bbc -polymorphic-type -emit-fir -hlfir=false %s -o - | FileCheck %s
+! RUN: bbc -emit-fir -hlfir=false %s -o - | FileCheck %s
 
 module assumed_type_test
 
diff --git a/flang/test/Lower/default-initialization.f90 b/flang/test/Lower/default-initialization.f90
index e692b9b08446ea..7a6133452b3a25 100644
--- a/flang/test/Lower/default-initialization.f90
+++ b/flang/test/Lower/default-initialization.f90
@@ -1,5 +1,5 @@
 ! Test default initialization of local and dummy variables (dynamic initialization)
-! RUN: bbc -emit-fir -hlfir=false -polymorphic-type %s -o - | FileCheck %s
+! RUN: bbc -emit-fir -hlfir=false %s -o - | FileCheck %s
 
 module test_dinit
   type t
diff --git a/flang/test/Lower/derived-type-finalization.f90 b/flang/test/Lower/derived-type-finalization.f90
index 5c4531884213d2..e7ade0d8145bb6 100644
--- a/flang/test/Lower/derived-type-finalization.f90
+++ b/flang/test/Lower/derived-type-finalization.f90
@@ -1,5 +1,5 @@
 ! Test derived type finalization
-! RUN: bbc --use-desc-for-alloc=false -polymorphic-type -emit-fir -hlfir=false %s -o - | FileCheck %s
+! RUN: bbc --use-desc-for-alloc=false -emit-fir -hlfir=false %s -o - | FileCheck %s
 
 ! Missing tests:
 ! - finalization within BLOCK construct
diff --git a/flang/test/Lower/dispatch-table.f90 b/flang/test/Lower/dispatch-table.f90
index 0df4981b3eaf86..023dbcbedeaf49 100644
--- a/flang/test/Lower/dispatch-table.f90
+++ b/flang/test/Lower/dispatch-table.f90
@@ -1,4 +1,4 @@
-! RUN: bbc -polymorphic-type -emit-fir %s -o - | FileCheck %s
+! RUN: bbc -emit-fir %s -o - | FileCheck %s
 
 ! Tests the generation of fir.type_info operations.
 
diff --git a/flang/test/Lower/dispatch.f90 b/flang/test/Lower/dispatch.f90
index 1aad4a4b8e46f5..60364076e633bf 100644
--- a/flang/test/Lower/dispatch.f90
+++ b/flang/test/Lower/dispatch.f90
@@ -1,4 +1,4 @@
-! RUN: bbc -polymorphic-type -emit-hlfir %s -o - | FileCheck %s
+! RUN: bbc -emit-hlfir %s -o - | FileCheck %s
 
 ! Tests the different possible type involving polymorphic entities.
 
diff --git a/flang/test/Lower/intentout-deallocate.f90 b/flang/test/Lower/intentout-deallocate.f90
index 30502fd5d722ab..8e7ccbcc9fdb9d 100644
--- a/flang/test/Lower/intentout-deallocate.f90
+++ b/flang/test/Lower/intentout-deallocate.f90
@@ -1,6 +1,6 @@
 ! Test correct deallocation of intent(out) allocatables.
-! RUN: bbc --use-desc-for-alloc=false -emit-fir -hlfir=false -polymorphic-type %s -o - | FileCheck %s --check-prefixes=CHECK,FIR
-! RUN: bbc -emit-hlfir -polymorphic-type %s -o - -I nw | FileCheck %s --check-prefixes=CHECK,HLFIR
+! RUN: bbc --use-desc-for-alloc=false -emit-fir -hlfir=false %s -o - | FileCheck %s --check-prefixes=CHECK,FIR
+! RUN: bbc -emit-hlfir %s -o - -I nw | FileCheck %s --check-prefixes=CHECK,HLFIR
 
 module mod1
   type, bind(c) :: t1
diff --git a/flang/test/Lower/io-derived-type-2.f90 b/flang/test/Lower/io-derived-type-2.f90
index c2f1ff1850725d..eeea2ae156f945 100644
--- a/flang/test/Lower/io-derived-type-2.f90
+++ b/flang/test/Lower/io-derived-type-2.f90
@@ -1,6 +1,6 @@
 ! Check that InputDerivedType/OutputDeriverType APIs are used
 ! for io of derived types.
-! RUN: bbc -polymorphic-type -emit-fir -o - %s | FileCheck %s
+! RUN: bbc -emit-fir -o - %s | FileCheck %s
 
 module p
   type :: person
diff --git a/flang/test/Lower/io-derived-type.f90 b/flang/test/Lower/io-derived-type.f90
index 84f57f46289bd9..08b1207f55ad1a 100644
--- a/flang/test/Lower/io-derived-type.f90
+++ b/flang/test/Lower/io-derived-type.f90
@@ -1,4 +1,4 @@
-! RUN: bbc -polymorphic-type -emit-fir -hlfir=false -o - %s | FileCheck %s
+! RUN: bbc -emit-fir -hlfir=false -o - %s | FileCheck %s
 
 module m
   type t
diff --git a/flang/test/Lower/nullify-polymorphic.f90 b/flang/test/Lower/nullify-polymorphic.f90
index ee271b815bdd52..5cb966810f1b92 100644
--- a/flang/test/Lower/nullify-polymorphic.f90
+++ b/flang/test/Lower/nullify-polymorphic.f90
@@ -1,4 +1,4 @@
-! RUN: bbc -polymorphic-type -emit-hlfir %s -o - | FileCheck %s
+! RUN: bbc -emit-hlfir %s -o - | FileCheck %s
 
 module poly
   type p1
diff --git a/flang/test/Lower/pass-null-for-class-arg.f90 b/flang/test/Lower/pass-null-for-class-arg.f90
index d3d657f29a36cd..f60cad30b1bf69 100644
--- a/flang/test/Lower/pass-null-for-class-arg.f90
+++ b/flang/test/Lower/pass-null-for-class-arg.f90
@@ -1,5 +1,5 @@
-! RUN: bbc -emit-fir -polymorphic-type %s -o - | FileCheck %s --check-prefix=FIR
-! RUN: bbc -emit-fir -polymorphic-type -hlfir %s -o - | FileCheck %s --check-prefix=HLFIR
+! RUN: bbc -emit-fir %s -o - | FileCheck %s --check-prefix=FIR
+! RUN: bbc -emit-fir -hlfir %s -o - | FileCheck %s --check-prefix=HLFIR
 
 subroutine test
   interface
diff --git a/flang/test/Lower/pointer-association-polymorphic.f90 b/flang/test/Lower/pointer-association-polymorphic.f90
index bde96429c0ee95..6c56db892d1b8e 100644
--- a/flang/test/Lower/pointer-association-polymorphic.f90
+++ b/flang/test/Lower/pointer-association-polymorphic.f90
@@ -1,4 +1,4 @@
-! RUN: bbc -polymorphic-type -emit-fir -hlfir=false %s -o - | FileCheck %s
+! RUN: bbc -emit-fir -hlfir=false %s -o - | FileCheck %s
 
 module poly
   type p1
diff --git a/flang/test/Lower/pointer-disassociate.f90 b/flang/test/Lower/pointer-disassociate.f90
index 8d719058316554..e341bca5cd89b3 100644
--- a/flang/test/Lower/pointer-disassociate.f90
+++ b/flang/test/Lower/pointer-disassociate.f90
@@ -1,5 +1,5 @@
 ! Test lowering of pointer disassociation
-! RUN: bbc -emit-fir -hlfir=false --polymorphic-type %s -o - | FileCheck %s
+! RUN: bbc -emit-fir -hlfir=false %s -o - | FileCheck %s
 
 
 ! -----------------------------------------------------------------------------
diff --git a/flang/test/Lower/polymorphic-temp.f90 b/flang/test/Lower/polymorphic-temp.f90
index 46cf14dcf93f4a..8633620e8430e1 100644
--- a/flang/test/Lower/polymorphic-temp.f90
+++ b/flang/test/Lower/polymorphic-temp.f90
@@ -1,5 +1,5 @@
 ! Test creation of temporary from polymorphic enities
-! RUN: bbc -polymorphic-type -emit-fir -hlfir=false %s -o - | FileCheck %s
+! RUN: bbc -emit-fir -hlfir=false %s -o - | FileCheck %s
 
 module poly_tmp
   type p1
diff --git a/flang/test/Lower/polymorphic-types.f90 b/flang/test/Lower/polymorphic-types.f90
index 0288fea6eb38b7..a06e0a29b6ae84 100644
--- a/flang/test/Lower/polymorphic-types.f90
+++ b/flang/test/Lower/polymorphic-types.f90
@@ -1,4 +1,4 @@
-! RUN: bbc -polymorphic-type -emit-fir -hlfir=false %s -o - | FileCheck %s
+! RUN: bbc -emit-fir -hlfir=false %s -o - | FileCheck %s
 
 ! Tests the different possible type involving polymorphic entities. 
 
diff --git a/flang/test/Lower/polymorphic.f90 b/flang/test/Lower/polymorphic.f90
index 15d8a86e4ef47c..e031b4805dc5b1 100644
--- a/flang/test/Lower/polymorphic.f90
+++ b/flang/test/Lower/polymorphic.f90
@@ -1,4 +1,4 @@
-! RUN: bbc --use-desc-for-alloc=false -polymorphic-type -emit-fir -hlfir=false %s -o - | FileCheck %s
+! RUN: bbc --use-desc-for-alloc=false -emit-fir -hlfir=false %s -o - | FileCheck %s
 
 ! Tests various aspect of the lowering of polymorphic entities.
 
diff --git a/flang/test/Lower/select-type-2.f90 b/flang/test/Lower/select-type-2.f90
index 3769132b01c1fb..7b4cf96069e53d 100644
--- a/flang/test/Lower/select-type-2.f90
+++ b/flang/test/Lower/select-type-2.f90
@@ -1,4 +1,4 @@
-! RUN: bbc -polymorphic-type -emit-fir -hlfir=false %s -o - | fir-opt --fir-polymorphic-op | FileCheck %s
+! RUN: bbc -emit-fir -hlfir=false %s -o - | fir-opt --fir-polymorphic-op | FileCheck %s
 module select_type_2
   type p1
     integer :: a
diff --git a/flang/test/Lower/select-type.f90 b/flang/test/Lower/select-type.f90
index ef91fb913a1c86..3243a813e9d59f 100644
--- a/flang/test/Lower/select-type.f90
+++ b/flang/test/Lower/select-type.f90
@@ -1,5 +1,5 @@
-! RUN: bbc -polymorphic-type -emit-fir -hlfir=false %s -o - | FileCheck %s
-! RUN: bbc -polymorphic-type -emit-fir -hlfir=false %s -o - | fir-opt --fir-polymorphic-op | FileCheck --check-prefix=CFG %s
+! RUN: bbc -emit-fir -hlfir=false %s -o - | FileCheck %s
+! RUN: bbc -emit-fir -hlfir=false %s -o - | fir-opt --fir-polymorphic-op | FileCheck --check-prefix=CFG %s
 module select_type_lower_test
   type p1
     integer :: a
diff --git a/flang/test/Parser/cuf-sanity-tree.CUF b/flang/test/Parser/cuf-sanity-tree.CUF
index f6cf9bbdd6b0cc..dc12759d3ce52f 100644
--- a/flang/test/Parser/cuf-sanity-tree.CUF
+++ b/flang/test/Parser/cuf-sanity-tree.CUF
@@ -144,11 +144,11 @@ include "cuf-sanity-common"
 !CHECK: | | | | | | EndDoStmt -> 
 !CHECK: | | | | ExecutionPartConstruct -> ExecutableConstruct -> CUFKernelDoConstruct
 !CHECK: | | | | | Directive
-!CHECK: | | | | | | Scalar -> Integer -> Expr = '1_4'
+!CHECK: | | | | | | StarOrExpr -> Scalar -> Integer -> Expr = '1_4'
 !CHECK: | | | | | | | LiteralConstant -> IntLiteralConstant = '1'
-!CHECK: | | | | | | Scalar -> Integer -> Expr = '2_4'
+!CHECK: | | | | | | StarOrExpr -> Scalar -> Integer -> Expr = '2_4'
 !CHECK: | | | | | | | LiteralConstant -> IntLiteralConstant = '2'
-!CHECK: | | | | | | Scalar -> Integer -> Expr = '3_4'
+!CHECK: | | | | | | StarOrExpr -> Scalar -> Integer -> Expr = '3_4'
 !CHECK: | | | | | | | LiteralConstant -> IntLiteralConstant = '3'
 !CHECK: | | | | | | Scalar -> Integer -> Expr = '1_4'
 !CHECK: | | | | | | | LiteralConstant -> IntLiteralConstant = '1'
diff --git a/flang/test/Semantics/OpenMP/reduction11.f90 b/flang/test/Semantics/OpenMP/reduction11.f90
new file mode 100644
index 00000000000000..3893fe70b407f4
--- /dev/null
+++ b/flang/test/Semantics/OpenMP/reduction11.f90
@@ -0,0 +1,22 @@
+! RUN: %flang_fc1 -fopenmp -fdebug-dump-symbols -o - %s 2>&1 | FileCheck %s
+! Check intrinsic reduction symbols (in this case "max" are marked as INTRINSIC
+
+! CHECK: MainProgram scope: omp_reduction
+program omp_reduction
+  ! CHECK: i size=4 offset=0: ObjectEntity type: INTEGER(4)
+  integer i
+  ! CHECK: k size=4 offset=4: ObjectEntity type: INTEGER(4) init:10_4
+  integer :: k = 10
+  ! CHECK: m size=4 offset=8: ObjectEntity type: INTEGER(4) init:12_4
+  integer :: m = 12
+
+  ! CHECK: OtherConstruct scope
+  ! CHECK: i (OmpPrivate, OmpPreDetermined): HostAssoc
+  ! CHECK: k (OmpReduction): HostAssoc
+  ! CHECK: max, INTRINSIC: ProcEntity
+  !$omp parallel do  reduction(max:k)
+  do i=1,10
+    k = i
+  end do
+  !$omp end parallel do
+end program omp_reduction
diff --git a/flang/test/Semantics/cuf09.cuf b/flang/test/Semantics/cuf09.cuf
index dd70c3b1ff5efd..4bc93132044fdd 100644
--- a/flang/test/Semantics/cuf09.cuf
+++ b/flang/test/Semantics/cuf09.cuf
@@ -10,6 +10,15 @@ module m
 end
 
 program main
+  !$cuf kernel do <<< *, * >>> ! ok
+  do j = 1, 0
+  end do
+  !$cuf kernel do <<< (*), (*) >>> ! ok
+  do j = 1, 0
+  end do
+  !$cuf kernel do <<< (1,*), (2,*) >>> ! ok
+  do j = 1, 0
+  end do
   !ERROR: !$CUF KERNEL DO (1) must be followed by a DO construct with tightly nested outer levels of counted DO loops
   !$cuf kernel do <<< 1, 2 >>>
   do while (.false.)
diff --git a/flang/test/Semantics/declarations02.f90 b/flang/test/Semantics/declarations02.f90
index 439527a0edb6af..f39c233c1c3a43 100644
--- a/flang/test/Semantics/declarations02.f90
+++ b/flang/test/Semantics/declarations02.f90
@@ -10,6 +10,20 @@ module m
   integer, parameter :: x3 = 1
   bind(c) :: x3
 
+  !ERROR: 'x4' may not have both the ALLOCATABLE and PARAMETER attributes
+  !ERROR: 'x4' may not have both the ASYNCHRONOUS and PARAMETER attributes
+  !ERROR: 'x4' may not have both the SAVE and PARAMETER attributes
+  !ERROR: 'x4' may not have both the TARGET and PARAMETER attributes
+  !ERROR: 'x4' may not have both the VOLATILE and PARAMETER attributes
+  !ERROR: The entity 'x4' with an explicit SAVE attribute must be a variable, procedure pointer, or COMMON block
+  !ERROR: An entity may not have the ASYNCHRONOUS attribute unless it is a variable
+  integer, parameter :: x4 = 1
+  allocatable x4
+  asynchronous x4
+  save x4
+  target x4
+  volatile x4
+
   type :: my_type1
     integer :: x4
   end type
diff --git a/flang/test/Semantics/doconcurrent08.f90 b/flang/test/Semantics/doconcurrent08.f90
index 41cd71e233d0d3..52b382741d0731 100644
--- a/flang/test/Semantics/doconcurrent08.f90
+++ b/flang/test/Semantics/doconcurrent08.f90
@@ -209,6 +209,8 @@ module m2
   type :: impureFinal
    contains
     final :: impureSub
+    final :: impureSubRank1
+    final :: impureSubRank2
   end type
 
   type :: pureFinal
@@ -222,16 +224,27 @@ impure subroutine impureSub(x)
     type(impureFinal), intent(in) :: x
   end subroutine
 
+  impure subroutine impureSubRank1(x)
+    type(impureFinal), intent(in) :: x(:)
+  end subroutine
+
+  impure subroutine impureSubRank2(x)
+    type(impureFinal), intent(in) :: x(:,:)
+  end subroutine
+
   pure subroutine pureSub(x)
     type(pureFinal), intent(in) :: x
   end subroutine
 
   subroutine s4()
     type(impureFinal), allocatable :: ifVar, ifvar1
+    type(impureFinal), allocatable :: ifArr1(:), ifArr2(:,:)
+    type(impureFinal) :: if0
     type(pureFinal), allocatable :: pfVar
     allocate(ifVar)
     allocate(ifVar1)
     allocate(pfVar)
+    allocate(ifArr1(5), ifArr2(5,5))
 
     ! OK for an ordinary DO loop
     do i = 1,10
@@ -239,11 +252,9 @@ subroutine s4()
     end do
 
     ! OK to invoke a PURE FINAL procedure in a DO CONCURRENT
-    ! This case does not work currently because the compiler's test for
-    ! HasImpureFinal() in .../lib/Semantics/tools.cc doesn't work correctly
-!    do concurrent (i = 1:10)
-!      if (i .eq. 1) deallocate(pfVar)
-!    end do
+    do concurrent (i = 1:10)
+      if (i .eq. 1) deallocate(pfVar)
+    end do
 
     ! Error to invoke an IMPURE FINAL procedure in a DO CONCURRENT
     do concurrent (i = 1:10)
@@ -271,6 +282,34 @@ subroutine s4()
         ifvar = ifvar1
       end if
     end do
+
+    do concurrent (i = 1:5)
+      if (i .eq. 1) then
+        !ERROR: Deallocation of an entity with an IMPURE FINAL procedure 'impuresub' caused by assignment not allowed in DO CONCURRENT
+        ifArr1(i) = if0
+      end if
+    end do
+
+    do concurrent (i = 1:5)
+      if (i .eq. 1) then
+        !ERROR: Deallocation of an entity with an IMPURE FINAL procedure 'impuresubrank1' caused by assignment not allowed in DO CONCURRENT
+        ifArr1 = if0
+      end if
+    end do
+
+    do concurrent (i = 1:5)
+      if (i .eq. 1) then
+        !ERROR: Deallocation of an entity with an IMPURE FINAL procedure 'impuresubrank1' caused by assignment not allowed in DO CONCURRENT
+        ifArr2(i,:) = if0
+      end if
+    end do
+
+    do concurrent (i = 1:5)
+      if (i .eq. 1) then
+        !ERROR: Deallocation of an entity with an IMPURE FINAL procedure 'impuresubrank2' caused by assignment not allowed in DO CONCURRENT
+        ifArr2(:,:) = if0
+      end if
+    end do
   end subroutine s4
 
 end module m2
diff --git a/flang/test/Semantics/forall02.f90 b/flang/test/Semantics/forall02.f90
new file mode 100644
index 00000000000000..c4f4311a175a3e
--- /dev/null
+++ b/flang/test/Semantics/forall02.f90
@@ -0,0 +1,67 @@
+! RUN: %python %S/test_errors.py %s %flang_fc1
+
+module m1
+  type :: impureFinal
+  contains
+    final :: impureSub
+    final :: impureSubRank1
+    final :: impureSubRank2
+  end type
+
+ contains
+
+  impure subroutine impureSub(x)
+    type(impureFinal), intent(in) :: x
+  end subroutine
+
+  impure subroutine impureSubRank1(x)
+    type(impureFinal), intent(in) :: x(:)
+  end subroutine
+
+  impure subroutine impureSubRank2(x)
+    type(impureFinal), intent(in) :: x(:,:)
+  end subroutine
+
+  subroutine s1()
+    implicit none
+    integer :: i
+    type(impureFinal), allocatable :: ifVar, ifvar1
+    type(impureFinal), allocatable :: ifArr1(:), ifArr2(:,:)
+    type(impureFinal) :: if0
+    integer a(10)
+    allocate(ifVar)
+    allocate(ifVar1)
+    allocate(ifArr1(5), ifArr2(5,5))
+
+    ! Error to invoke an IMPURE FINAL procedure in a FORALL
+    forall (i = 1:10)
+      !WARNING: FORALL index variable 'i' not used on left-hand side of assignment
+      !ERROR: Impure procedure 'impuresub' is referenced by finalization in a FORALL
+      ifvar = ifvar1
+    end forall
+
+    forall (i = 1:5)
+      !ERROR: Impure procedure 'impuresub' is referenced by finalization in a FORALL
+      ifArr1(i) = if0
+    end forall
+
+    forall (i = 1:5)
+      !WARNING: FORALL index variable 'i' not used on left-hand side of assignment
+      !ERROR: Impure procedure 'impuresubrank1' is referenced by finalization in a FORALL
+      ifArr1 = if0
+    end forall
+
+    forall (i = 1:5)
+      !ERROR: Impure procedure 'impuresubrank1' is referenced by finalization in a FORALL
+      ifArr2(i,:) = if0
+    end forall
+
+    forall (i = 1:5)
+      !WARNING: FORALL index variable 'i' not used on left-hand side of assignment
+      !ERROR: Impure procedure 'impuresubrank2' is referenced by finalization in a FORALL
+      ifArr2(:,:) = if0
+    end forall
+  end subroutine
+
+end module m1
+
diff --git a/flang/test/Semantics/separate-mp05.f90 b/flang/test/Semantics/separate-mp05.f90
index 5b7e2523a22866..ad002142fd285e 100644
--- a/flang/test/Semantics/separate-mp05.f90
+++ b/flang/test/Semantics/separate-mp05.f90
@@ -7,8 +7,10 @@ module m
   !DEF: /m/smp MODULE, PUBLIC, PURE (Function) Subprogram REAL(4)
   !DEF: /m/smp/f EXTERNAL, PURE (Function) Subprogram REAL(4)
   !DEF: /m/smp/x INTENT(IN) ObjectEntity REAL(4)
-  !DEF: /m/smp/res (Implicit) ObjectEntity REAL(4)
+  !DEF: /m/smp/res ObjectEntity REAL(4)
   pure module function smp(f, x) result(res)
+  !REF: /m/smp/res
+  real res
    interface
     !REF: /m/smp/f
     !DEF: /m/smp/f/x INTENT(IN) ObjectEntity REAL(4)
@@ -32,7 +34,7 @@ pure function f(x) result(r)
 contains
  !DEF: /m/sm/smp MODULE, PUBLIC, PURE (Function) Subprogram REAL(4)
  module procedure smp
-  !DEF: /m/sm/smp/res (Implicit) ObjectEntity REAL(4)
+  !DEF: /m/sm/smp/res ObjectEntity REAL(4)
   !DEF: /m/sm/smp/f EXTERNAL, PURE (Function) Subprogram REAL(4)
   !DEF: /m/sm/smp/x INTENT(IN) ObjectEntity REAL(4)
   res = f(x)
diff --git a/flang/test/Transforms/omp-reduction-cfg-conversion.fir b/flang/test/Transforms/omp-reduction-cfg-conversion.fir
new file mode 100644
index 00000000000000..c27bba2f7b0764
--- /dev/null
+++ b/flang/test/Transforms/omp-reduction-cfg-conversion.fir
@@ -0,0 +1,57 @@
+// RUN: fir-opt --cfg-conversion-on-reduce-opt %s | FileCheck %s
+
+omp.reduction.declare @add_reduction_i_32_box_3_byref : !fir.ref<!fir.box<!fir.array<3xi32>>> init {
+^bb0(%arg0: !fir.ref<!fir.box<!fir.array<3xi32>>>):
+  %c4_i32 = arith.constant 4 : i32
+  %c0_i32 = arith.constant 0 : i32
+  %c3 = arith.constant 3 : index
+  %0 = fir.alloca !fir.box<!fir.array<3xi32>>
+  %1 = fir.alloca !fir.array<3xi32> {bindc_name = "omp.reduction.array.init"}
+  %2 = fir.shape %c3 : (index) -> !fir.shape<1>
+  %3 = fir.declare %1(%2) {uniq_name = "omp.reduction.array.init"} : (!fir.ref<!fir.array<3xi32>>, !fir.shape<1>) -> !fir.ref<!fir.array<3xi32>>
+  %4 = fir.embox %3(%2) : (!fir.ref<!fir.array<3xi32>>, !fir.shape<1>) -> !fir.box<!fir.array<3xi32>>
+  %5 = fir.alloca i32
+  fir.store %c0_i32 to %5 : !fir.ref<i32>
+  %6 = fir.embox %5 : (!fir.ref<i32>) -> !fir.box<i32>
+  fir.store %4 to %0 : !fir.ref<!fir.box<!fir.array<3xi32>>>
+  %7 = fir.address_of(@_QQclX9a9fdf8c5fd329fbbf2b0c08e2ca9a1e) : !fir.ref<!fir.char<1,40>>
+  %8 = fir.convert %0 : (!fir.ref<!fir.box<!fir.array<3xi32>>>) -> !fir.ref<!fir.box<none>>
+  %9 = fir.convert %6 : (!fir.box<i32>) -> !fir.box<none>
+  %10 = fir.convert %7 : (!fir.ref<!fir.char<1,40>>) -> !fir.ref<i8>
+  %11 = fir.call @_FortranAAssign(%8, %9, %10, %c4_i32) : (!fir.ref<!fir.box<none>>, !fir.box<none>, !fir.ref<i8>, i32) -> none
+  %12 = fir.alloca !fir.box<!fir.array<3xi32>>
+  fir.store %4 to %12 : !fir.ref<!fir.box<!fir.array<3xi32>>>
+  omp.yield(%12 : !fir.ref<!fir.box<!fir.array<3xi32>>>)
+} combiner {
+^bb0(%arg0: !fir.ref<!fir.box<!fir.array<3xi32>>>, %arg1: !fir.ref<!fir.box<!fir.array<3xi32>>>):
+  %c1 = arith.constant 1 : index
+  %c0 = arith.constant 0 : index
+  %0 = fir.load %arg0 : !fir.ref<!fir.box<!fir.array<3xi32>>>
+  %1 = fir.load %arg1 : !fir.ref<!fir.box<!fir.array<3xi32>>>
+  %2:3 = fir.box_dims %0, %c0 : (!fir.box<!fir.array<3xi32>>, index) -> (index, index, index)
+  %3 = fir.shape_shift %2#0, %2#1 : (index, index) -> !fir.shapeshift<1>
+  fir.do_loop %arg2 = %c1 to %2#1 step %c1 unordered {
+    %4 = fir.array_coor %0(%3) %arg2 : (!fir.box<!fir.array<3xi32>>, !fir.shapeshift<1>, index) -> !fir.ref<i32>
+    %5 = fir.array_coor %1(%3) %arg2 : (!fir.box<!fir.array<3xi32>>, !fir.shapeshift<1>, index) -> !fir.ref<i32>
+    %6 = fir.load %4 : !fir.ref<i32>
+    %7 = fir.load %5 : !fir.ref<i32>
+    %8 = arith.addi %6, %7 : i32
+    fir.store %8 to %4 : !fir.ref<i32>
+  }
+  omp.yield(%arg0 : !fir.ref<!fir.box<!fir.array<3xi32>>>)
+}
+
+// ensure cfg conversion has run on the do loop
+// CHECK: combiner {
+// CHECK-NOT: fir.do_loop
+// CHECK: ^bb0({{.*}}):
+// ...
+// CHECK:   cf.br ^bb1
+// CHECK: ^bb1({{.*}}):
+// ...
+// CHECK:   cf.cond_br %{{.*}} ^bb2, ^bb3
+// CHECK: ^bb2:
+// ...
+// CHECK:   cf.br ^bb1
+// CHECK: ^bb3:
+
diff --git a/flang/tools/bbc/bbc.cpp b/flang/tools/bbc/bbc.cpp
index 73d740ff439421..a0870d3649c2e4 100644
--- a/flang/tools/bbc/bbc.cpp
+++ b/flang/tools/bbc/bbc.cpp
@@ -191,11 +191,6 @@ static llvm::cl::opt<bool> enableOpenACC("fopenacc",
                                          llvm::cl::desc("enable openacc"),
                                          llvm::cl::init(false));
 
-static llvm::cl::opt<bool> enablePolymorphic(
-    "polymorphic-type",
-    llvm::cl::desc("enable polymorphic type lowering (experimental)"),
-    llvm::cl::init(false));
-
 static llvm::cl::opt<bool> enableNoPPCNativeVecElemOrder(
     "fno-ppc-native-vector-element-order",
     llvm::cl::desc("no PowerPC native vector element order."),
@@ -351,7 +346,6 @@ static mlir::LogicalResult convertFortranSourceToMLIR(
   std::string targetTriple = targetMachine.getTargetTriple().normalize();
   // Use default lowering options for bbc.
   Fortran::lower::LoweringOptions loweringOptions{};
-  loweringOptions.setPolymorphicTypeImpl(enablePolymorphic);
   loweringOptions.setNoPPCNativeVecElemOrder(enableNoPPCNativeVecElemOrder);
   loweringOptions.setLowerToHighLevelFIR(useHLFIR || emitHLFIR);
   std::vector<Fortran::lower::EnvironmentDefault> envDefaults = {};
diff --git a/flang/tools/f18/CMakeLists.txt b/flang/tools/f18/CMakeLists.txt
index ba6c6642c0b62e..a249583168313e 100644
--- a/flang/tools/f18/CMakeLists.txt
+++ b/flang/tools/f18/CMakeLists.txt
@@ -63,7 +63,7 @@ if (NOT CMAKE_CROSSCOMPILING)
       COMMAND ${CMAKE_COMMAND} -E make_directory ${FLANG_INTRINSIC_MODULES_DIR}
       COMMAND flang-new -cpp -fsyntax-only ${opts} -module-dir ${FLANG_INTRINSIC_MODULES_DIR}
         ${FLANG_SOURCE_DIR}/module/${filename}.f90
-      DEPENDS flang-new ${FLANG_SOURCE_DIR}/module/${filename}.f90 ${depends}
+      DEPENDS flang-new ${FLANG_SOURCE_DIR}/module/${filename}.f90 ${FLANG_SOURCE_DIR}/module/__fortran_builtins.f90 ${depends}
     )
     add_custom_command(OUTPUT ${base}.f18.mod
       DEPENDS ${base}.mod
diff --git a/flang/unittests/Runtime/NumericalFormatTest.cpp b/flang/unittests/Runtime/NumericalFormatTest.cpp
index 37eecd7708a1eb..dee4dda4a22869 100644
--- a/flang/unittests/Runtime/NumericalFormatTest.cpp
+++ b/flang/unittests/Runtime/NumericalFormatTest.cpp
@@ -902,6 +902,14 @@ TEST(IOApiTests, EditDoubleInputValues) {
           0}, // max finite
       {"(EX22.0)", "0X.8P1025             ", 0x7ff0000000000000, ovf}, // +Inf
       {"(EX22.0)", "-0X.8P1025            ", 0xfff0000000000000, ovf}, // -Inf
+      {"(RC,EX22.0)", "0X1.0000000000000P0   ", 0x3ff0000000000000, 0},
+      {"(RC,EX22.0)", "0X1.00000000000008P0  ", 0x3ff0000000000001, 0},
+      {"(RC,EX22.0)", "0X1.000000000000008P0 ", 0x3ff0000000000000, 0},
+      {"(RC,EX22.0)", "0X1.00000000000004P0  ", 0x3ff0000000000000, 0},
+      {"(RC,EX22.0)", "0X.80000000000000P1   ", 0x3ff0000000000000, 0},
+      {"(RC,EX22.0)", "0X.80000000000004P1   ", 0x3ff0000000000001, 0},
+      {"(RC,EX22.0)", "0X.800000000000004P1  ", 0x3ff0000000000000, 0},
+      {"(RC,EX22.0)", "0X.80000000000002P1   ", 0x3ff0000000000000, 0},
       {"(RZ,F7.0)", " 2.e308", 0x7fefffffffffffff, 0}, // +HUGE()
       {"(RD,F7.0)", " 2.e308", 0x7fefffffffffffff, 0}, // +HUGE()
       {"(RU,F7.0)", " 2.e308", 0x7ff0000000000000, ovf}, // +Inf
diff --git a/libc/config/baremetal/api.td b/libc/config/baremetal/api.td
index 80d0e0ba22ca51..25aa06aacb642e 100644
--- a/libc/config/baremetal/api.td
+++ b/libc/config/baremetal/api.td
@@ -52,6 +52,14 @@ def IntTypesAPI : PublicAPI<"inttypes.h"> {
   let Types = ["imaxdiv_t"];
 }
 
+def MathAPI : PublicAPI<"math.h"> {
+  let Types = ["double_t", "float_t"];
+}
+
+def StdIOAPI : PublicAPI<"stdio.h"> {
+  let Types = ["size_t"];
+}
+
 def StdlibAPI : PublicAPI<"stdlib.h"> {
   let Types = [
     "div_t",
@@ -60,7 +68,6 @@ def StdlibAPI : PublicAPI<"stdlib.h"> {
     "size_t",
     "__bsearchcompare_t",
     "__qsortcompare_t",
-    "__atexithandler_t",
   ];
 }
 
@@ -68,6 +75,15 @@ def StringAPI : PublicAPI<"string.h"> {
   let Types = ["size_t"];
 }
 
+def TimeAPI : PublicAPI<"time.h"> {
+  let Types = [
+    "clock_t",
+    "time_t",
+    "struct tm",
+    "struct timespec",
+  ];
+}
+
 def UCharAPI : PublicAPI<"uchar.h"> {
   let Types = ["mbstate_t"];
 }
diff --git a/libc/config/baremetal/arm/entrypoints.txt b/libc/config/baremetal/arm/entrypoints.txt
index 589ec5237e98da..17ce56e228a6ac 100644
--- a/libc/config/baremetal/arm/entrypoints.txt
+++ b/libc/config/baremetal/arm/entrypoints.txt
@@ -79,6 +79,7 @@ set(TARGET_LIBC_ENTRYPOINTS
     libc.src.inttypes.strtoumax
 
     # stdio.h entrypoints
+    libc.src.stdio.remove
     libc.src.stdio.sprintf
     libc.src.stdio.snprintf
     libc.src.stdio.vsprintf
@@ -170,7 +171,6 @@ set(TARGET_LIBC_ENTRYPOINTS
     libc.src.stdlib.llabs
     libc.src.stdlib.lldiv
     libc.src.stdlib.qsort
-    libc.src.stdlib.qsort_r
     libc.src.stdlib.rand
     libc.src.stdlib.srand
     libc.src.stdlib.strtod
@@ -180,6 +180,9 @@ set(TARGET_LIBC_ENTRYPOINTS
     libc.src.stdlib.strtoll
     libc.src.stdlib.strtoul
     libc.src.stdlib.strtoull
+
+    # time.h entrypoints
+    libc.src.time.difftime
 )
 
 set(TARGET_LIBM_ENTRYPOINTS
diff --git a/libc/config/baremetal/arm/headers.txt b/libc/config/baremetal/arm/headers.txt
index 962981f8a2087d..3608364e45bdeb 100644
--- a/libc/config/baremetal/arm/headers.txt
+++ b/libc/config/baremetal/arm/headers.txt
@@ -13,5 +13,6 @@ set(TARGET_PUBLIC_HEADERS
     libc.include.string
     libc.include.strings
     libc.include.sys_queue
+    libc.include.time
     libc.include.uchar
 )
diff --git a/libc/config/baremetal/riscv/entrypoints.txt b/libc/config/baremetal/riscv/entrypoints.txt
index 09de1b416e0e94..39756e1ee29f54 100644
--- a/libc/config/baremetal/riscv/entrypoints.txt
+++ b/libc/config/baremetal/riscv/entrypoints.txt
@@ -79,6 +79,7 @@ set(TARGET_LIBC_ENTRYPOINTS
     libc.src.inttypes.strtoumax
 
     # stdio.h entrypoints
+    libc.src.stdio.remove
     libc.src.stdio.sprintf
     libc.src.stdio.snprintf
     libc.src.stdio.vsprintf
@@ -170,7 +171,6 @@ set(TARGET_LIBC_ENTRYPOINTS
     libc.src.stdlib.llabs
     libc.src.stdlib.lldiv
     libc.src.stdlib.qsort
-    libc.src.stdlib.qsort_r
     libc.src.stdlib.rand
     libc.src.stdlib.srand
     libc.src.stdlib.strtod
@@ -180,6 +180,9 @@ set(TARGET_LIBC_ENTRYPOINTS
     libc.src.stdlib.strtoll
     libc.src.stdlib.strtoul
     libc.src.stdlib.strtoull
+
+    # time.h entrypoints
+    libc.src.time.difftime
 )
 
 set(TARGET_LIBM_ENTRYPOINTS
diff --git a/libc/config/baremetal/riscv/headers.txt b/libc/config/baremetal/riscv/headers.txt
index 962981f8a2087d..3608364e45bdeb 100644
--- a/libc/config/baremetal/riscv/headers.txt
+++ b/libc/config/baremetal/riscv/headers.txt
@@ -13,5 +13,6 @@ set(TARGET_PUBLIC_HEADERS
     libc.include.string
     libc.include.strings
     libc.include.sys_queue
+    libc.include.time
     libc.include.uchar
 )
diff --git a/libc/config/gpu/api.td b/libc/config/gpu/api.td
index 607b8b6d5900c8..adaf5bfd747ac7 100644
--- a/libc/config/gpu/api.td
+++ b/libc/config/gpu/api.td
@@ -63,7 +63,6 @@ def StdIOAPI : PublicAPI<"stdio.h"> {
     SimpleMacroDef<"_IOFBF", "0">,
     SimpleMacroDef<"_IOLBF", "1">,
     SimpleMacroDef<"_IONBF", "2">,
-    SimpleMacroDef<"EOF", "-1">,
   ];
   let Types = ["size_t", "FILE"];
 }
diff --git a/libc/config/linux/aarch64/entrypoints.txt b/libc/config/linux/aarch64/entrypoints.txt
index 7d69099e3cb9db..dbf81c284e7845 100644
--- a/libc/config/linux/aarch64/entrypoints.txt
+++ b/libc/config/linux/aarch64/entrypoints.txt
@@ -216,6 +216,8 @@ set(TARGET_LIBC_ENTRYPOINTS
     libc.src.sys.mman.mlockall
     libc.src.sys.mman.munlockall
     libc.src.sys.mman.msync
+    libc.src.sys.mman.shm_open
+    libc.src.sys.mman.shm_unlink
 
     # sys/random.h entrypoints
     libc.src.sys.random.getrandom
@@ -415,9 +417,15 @@ set(TARGET_LIBM_ENTRYPOINTS
     libc.src.math.nextafter
     libc.src.math.nextafterf
     libc.src.math.nextafterl
+    libc.src.math.nextdown
+    libc.src.math.nextdownf
+    libc.src.math.nextdownl
     libc.src.math.nexttoward
     libc.src.math.nexttowardf
     libc.src.math.nexttowardl
+    libc.src.math.nextup
+    libc.src.math.nextupf
+    libc.src.math.nextupl
     libc.src.math.powf
     libc.src.math.remainderf
     libc.src.math.remainder
@@ -468,7 +476,10 @@ if(LIBC_TYPES_HAS_FLOAT128)
     libc.src.math.lrintf128
     libc.src.math.lroundf128
     libc.src.math.modff128
+    libc.src.math.nanf128
     libc.src.math.nextafterf128
+    libc.src.math.nextdownf128
+    libc.src.math.nextupf128
     libc.src.math.rintf128
     libc.src.math.roundf128
     libc.src.math.sqrtf128
@@ -539,6 +550,7 @@ if(LLVM_LIBC_FULL_BUILD)
     libc.src.stdio.ferror_unlocked
     libc.src.stdio.fgetc
     libc.src.stdio.fflush
+    libc.src.stdio.fileno
     libc.src.stdio.fopen
     libc.src.stdio.fputc
     libc.src.stdio.fputs
diff --git a/libc/config/linux/api.td b/libc/config/linux/api.td
index 75432a2a298652..e9e82c5d478945 100644
--- a/libc/config/linux/api.td
+++ b/libc/config/linux/api.td
@@ -76,7 +76,6 @@ def StdIOAPI : PublicAPI<"stdio.h"> {
     SimpleMacroDef<"_IOFBF", "0">,
     SimpleMacroDef<"_IOLBF", "1">,
     SimpleMacroDef<"_IONBF", "2">,
-    SimpleMacroDef<"EOF", "-1">,
   ];
   let Types = ["size_t", "FILE", "cookie_io_functions_t"];
 }
@@ -118,7 +117,7 @@ def SchedAPI : PublicAPI<"sched.h"> {
 }
 
 def SysMManAPI : PublicAPI<"sys/mman.h"> {
-  let Types = ["off_t", "size_t"];
+  let Types = ["off_t", "size_t", "mode_t"];
 }
 
 def SignalAPI : PublicAPI<"signal.h"> {
diff --git a/libc/config/linux/arm/entrypoints.txt b/libc/config/linux/arm/entrypoints.txt
index bf1559b2f02369..3bc5d8efc9d26b 100644
--- a/libc/config/linux/arm/entrypoints.txt
+++ b/libc/config/linux/arm/entrypoints.txt
@@ -285,9 +285,15 @@ set(TARGET_LIBM_ENTRYPOINTS
     libc.src.math.nextafter
     libc.src.math.nextafterf
     libc.src.math.nextafterl
+    libc.src.math.nextdown
+    libc.src.math.nextdownf
+    libc.src.math.nextdownl
     libc.src.math.nexttoward
     libc.src.math.nexttowardf
     libc.src.math.nexttowardl
+    libc.src.math.nextup
+    libc.src.math.nextupf
+    libc.src.math.nextupl
     libc.src.math.powf
     libc.src.math.remainder
     libc.src.math.remainderf
diff --git a/libc/config/linux/riscv/entrypoints.txt b/libc/config/linux/riscv/entrypoints.txt
index b1c9dd0428eea5..b42a55a4d712e1 100644
--- a/libc/config/linux/riscv/entrypoints.txt
+++ b/libc/config/linux/riscv/entrypoints.txt
@@ -221,6 +221,8 @@ set(TARGET_LIBC_ENTRYPOINTS
     libc.src.sys.mman.mlockall
     libc.src.sys.mman.munlockall
     libc.src.sys.mman.msync
+    libc.src.sys.mman.shm_open
+    libc.src.sys.mman.shm_unlink
 
     # sys/random.h entrypoints
     libc.src.sys.random.getrandom
@@ -423,9 +425,15 @@ set(TARGET_LIBM_ENTRYPOINTS
     libc.src.math.nextafter
     libc.src.math.nextafterf
     libc.src.math.nextafterl
+    libc.src.math.nextdown
+    libc.src.math.nextdownf
+    libc.src.math.nextdownl
     libc.src.math.nexttoward
     libc.src.math.nexttowardf
     libc.src.math.nexttowardl
+    libc.src.math.nextup
+    libc.src.math.nextupf
+    libc.src.math.nextupl
     libc.src.math.powf
     libc.src.math.remainderf
     libc.src.math.remainder
@@ -476,7 +484,10 @@ if(LIBC_TYPES_HAS_FLOAT128)
     libc.src.math.lrintf128
     libc.src.math.lroundf128
     libc.src.math.modff128
+    libc.src.math.nanf128
     libc.src.math.nextafterf128
+    libc.src.math.nextdownf128
+    libc.src.math.nextupf128
     libc.src.math.rintf128
     libc.src.math.roundf128
     libc.src.math.sqrtf128
@@ -562,6 +573,7 @@ if(LLVM_LIBC_FULL_BUILD)
     libc.src.stdio.fgetc_unlocked
     libc.src.stdio.fgets
     libc.src.stdio.fflush
+    libc.src.stdio.fileno
     libc.src.stdio.fopen
     libc.src.stdio.fputc
     libc.src.stdio.fputs
diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt
index 4fb31c593b9dc7..e8cf11266624a7 100644
--- a/libc/config/linux/x86_64/entrypoints.txt
+++ b/libc/config/linux/x86_64/entrypoints.txt
@@ -208,6 +208,7 @@ set(TARGET_LIBC_ENTRYPOINTS
     libc.src.stdio.sscanf
     libc.src.stdio.scanf
     libc.src.stdio.fscanf
+    libc.src.stdio.fileno
 
     # sys/epoll.h entrypoints
     libc.src.sys.epoll.epoll_wait
@@ -229,6 +230,8 @@ set(TARGET_LIBC_ENTRYPOINTS
     libc.src.sys.mman.mlockall
     libc.src.sys.mman.munlockall
     libc.src.sys.mman.msync
+    libc.src.sys.mman.shm_open
+    libc.src.sys.mman.shm_unlink
 
     # sys/random.h entrypoints
     libc.src.sys.random.getrandom
@@ -426,9 +429,15 @@ set(TARGET_LIBM_ENTRYPOINTS
     libc.src.math.nextafter
     libc.src.math.nextafterf
     libc.src.math.nextafterl
+    libc.src.math.nextdown
+    libc.src.math.nextdownf
+    libc.src.math.nextdownl
     libc.src.math.nexttoward
     libc.src.math.nexttowardf
     libc.src.math.nexttowardl
+    libc.src.math.nextup
+    libc.src.math.nextupf
+    libc.src.math.nextupl
     libc.src.math.powf
     libc.src.math.remainderf
     libc.src.math.remainder
@@ -481,7 +490,10 @@ if(LIBC_TYPES_HAS_FLOAT128)
     libc.src.math.lrintf128
     libc.src.math.lroundf128
     libc.src.math.modff128
+    libc.src.math.nanf128
     libc.src.math.nextafterf128
+    libc.src.math.nextdownf128
+    libc.src.math.nextupf128
     libc.src.math.rintf128
     libc.src.math.roundf128
     libc.src.math.sqrtf128
diff --git a/libc/docs/dev/printf_behavior.rst b/libc/docs/dev/printf_behavior.rst
index 00d6c83f4b0d36..9548bfda57aa7d 100644
--- a/libc/docs/dev/printf_behavior.rst
+++ b/libc/docs/dev/printf_behavior.rst
@@ -1,3 +1,5 @@
+.. _printf_behavior:
+
 ====================================
 Printf Behavior Under All Conditions
 ====================================
diff --git a/libc/docs/dev/undefined_behavior.rst b/libc/docs/dev/undefined_behavior.rst
index 6e73a305e8e054..50e8bdde89ddd6 100644
--- a/libc/docs/dev/undefined_behavior.rst
+++ b/libc/docs/dev/undefined_behavior.rst
@@ -70,3 +70,14 @@ Design Decisions
 Resizable Tables for hsearch
 ----------------------------
 The POSIX.1 standard does not delineate the behavior consequent to invoking hsearch or hdestroy without prior initialization of the hash table via hcreate. Furthermore, the standard does not specify the outcomes of successive invocations of hsearch absent intervening hdestroy calls. Libraries such as MUSL and Glibc do not apply checks to these scenarios, potentially leading to memory corruption or leakage. Conversely, FreeBSD's libc and Bionic automatically initialize the hash table to a minimal size if it is found uninitialized, and proceeding to destroy the table only if initialization has occurred. This approach also avoids redundant table allocation if an initialized hash table is already present. Given that the hash table starts with a minimal size, resizing becomes necessary to accommodate additional user insertions. LLVM's libc mirrors the approach of FreeBSD's libc and Bionic, owing to its enhanced robustness and user-friendliness. Notably, such resizing behavior itself aligns with POSIX.1 standards, which explicitly permit implementations to modify the capacity of the hash table.
+
+Path without Leading Slashs in shm_open
+----------------------------------------
+POSIX.1 leaves that when the name of a shared memory object does not begin with a slash, the behavior is implementation defined. In such cases, the shm_open in LLVM libc is implemented to behave as if the name began with a slash.
+
+Handling of NULL arguments to the 's' format specifier
+------------------------------------------------------
+The C standard does not specify behavior for ``printf("%s", NULL)``. We will
+print the string literal ``(null)`` unless using the 
+``LIBC_COPT_PRINTF_NO_NULLPTR_CHECKS`` option described in :ref:`printf 
+behavior<printf_behavior>`.
diff --git a/libc/docs/fullbuild_mode.rst b/libc/docs/fullbuild_mode.rst
index 0af923de42f84d..b1151017fbc794 100644
--- a/libc/docs/fullbuild_mode.rst
+++ b/libc/docs/fullbuild_mode.rst
@@ -6,7 +6,7 @@ Fullbuild Mode
 
 The *fullbuild* mode of LLVM's libc is the mode in which it is to be used as
 the only libc (as opposed to the :ref:`overlay_mode` in which it is used along
-with the system libc.) In to order use it as the only libc, one will have to
+with the system libc.) In order to use it as the only libc, one will have to
 build and install not only the static archives like ``libc.a`` from LLVM's libc,
 but also the start-up objects like ``crt1.o`` and the public headers.
 
diff --git a/libc/docs/gpu/building.rst b/libc/docs/gpu/building.rst
index dab21e1324d281..6d94134a407d34 100644
--- a/libc/docs/gpu/building.rst
+++ b/libc/docs/gpu/building.rst
@@ -220,11 +220,15 @@ targets. This section will briefly describe their purpose.
   be used to enable host services for anyone looking to interface with the
   :ref:`RPC client<libc_gpu_rpc>`.
 
+.. _gpu_cmake_options:
+
 CMake options
 =============
 
 This section briefly lists a few of the CMake variables that specifically
-control the GPU build of the C library.
+control the GPU build of the C library. These options can be passed individually
+to each target using ``-DRUNTIMES_<target>_<variable>=<value>`` when using a
+standard runtime build.
 
 **LLVM_LIBC_FULL_BUILD**:BOOL
   This flag controls whether or not the libc build will generate its own
diff --git a/libc/docs/gpu/testing.rst b/libc/docs/gpu/testing.rst
index 9842a675283619..9f17159fb6d5ee 100644
--- a/libc/docs/gpu/testing.rst
+++ b/libc/docs/gpu/testing.rst
@@ -1,9 +1,9 @@
 .. _libc_gpu_testing:
 
 
-============================
-Testing the GPU libc library
-============================
+=========================
+Testing the GPU C library
+=========================
 
 .. note::
    Running GPU tests with high parallelism is likely to cause spurious failures,
@@ -14,24 +14,134 @@ Testing the GPU libc library
   :depth: 4
   :local:
 
-Testing Infrastructure
+Testing infrastructure
 ======================
 
-The testing support in LLVM's libc implementation for GPUs is designed to mimic
-the standard unit tests as much as possible. We use the :ref:`libc_gpu_rpc`
-support to provide the necessary utilities like printing from the GPU. Execution
-is performed by emitting a ``_start`` kernel from the GPU
-that is then called by an external loader utility. This is an example of how
-this can be done manually:
+The LLVM C library supports different kinds of :ref:`tests <build_and_test>`
+depending on the build configuration. The GPU target is considered a full build
+and therefore provides all of its own utilities to build and run the generated
+tests. Currently the GPU supports two kinds of tests.
+
+#. **Hermetic tests** - These are unit tests built with a test suite similar to
+   Google's ``gtest`` infrastructure. These use the same infrastructure as unit
+   tests except that the entire environment is self-hosted. This allows us to
+   run them on the GPU using our custom utilities. These are used to test the
+   majority of functional implementations.
+
+#. **Integration tests** - These are lightweight tests that simply call a
+   ``main`` function and checks if it returns non-zero. These are primarily used
+   to test interfaces that are sensitive to threading.
+
+The GPU uses the same testing infrastructure as the other supported ``libc``
+targets. We do this by treating the GPU as a standard hosted environment capable
+of launching a ``main`` function. Effectively, this means building our own
+startup libraries and loader.
+
+Testing utilities
+=================
+
+We provide two utilities to execute arbitrary programs on the GPU. That is the
+``loader`` and the ``start`` object.
+
+Startup object
+--------------
+
+This object mimics the standard object used by existing C library
+implementations. Its job is to perform the necessary setup prior to calling the
+``main`` function. In the GPU case, this means exporting GPU kernels that will
+perform the necessary operations. Here we use ``_begin`` and ``_end`` to handle
+calling global constructors and destructors while ``_start`` begins the standard
+execution. The following code block shows the implementation for AMDGPU
+architectures.
+
+.. code-block:: c++
+
+  extern "C" [[gnu::visibility("protected"), clang::amdgpu_kernel]] void
+  _begin(int argc, char **argv, char **env) {
+    LIBC_NAMESPACE::atexit(&LIBC_NAMESPACE::call_fini_array_callbacks);
+    LIBC_NAMESPACE::call_init_array_callbacks(argc, argv, env);
+  }
+
+  extern "C" [[gnu::visibility("protected"), clang::amdgpu_kernel]] void
+  _start(int argc, char **argv, char **envp, int *ret) {
+    __atomic_fetch_or(ret, main(argc, argv, envp), __ATOMIC_RELAXED);
+  }
+
+  extern "C" [[gnu::visibility("protected"), clang::amdgpu_kernel]] void
+  _end(int retval) {
+    LIBC_NAMESPACE::exit(retval);
+  }
+
+Loader runtime
+--------------
+
+The startup object provides a GPU executable with callable kernels for the
+respective runtime. We can then define a minimal runtime that will launch these
+kernels on the given device. Currently we provide the ``amdhsa-loader`` and
+``nvptx-loader`` targeting the AMD HSA runtime and CUDA driver runtime
+respectively. By default these will launch with a single thread on the GPU.
 
 .. code-block:: sh
 
-   $> clang++ crt1.o test.cpp --target=amdgcn-amd-amdhsa -mcpu=gfx90a -flto
-   $> ./amdhsa_loader --threads 1 --blocks 1 a.out
+   $> clang++ crt1.o test.cpp --target=amdgcn-amd-amdhsa -mcpu=native -flto
+   $> amdhsa_loader --threads 1 --blocks 1 ./a.out
    Test Passed!
 
-Unlike the exported ``libcgpu.a``, the testing architecture can only support a
-single architecture at a time. This is either detected automatically, or set
-manually by the user using ``LIBC_GPU_TEST_ARCHITECTURE``. The latter is useful
-in cases where the user does not build LLVM's libc on machine with the GPU to
-use for testing.
+The loader utility will forward any arguments passed after the executable image
+to the program on the GPU as well as any set environment variables. The number
+of threads and blocks to be set can be controlled with ``--threads`` and
+``--blocks``. These also accept additional ``x``, ``y``, ``z`` variants for
+multidimensional grids.
+
+Running tests
+=============
+
+Tests will only be built and run if a GPU target architecture is set and the
+corresponding loader utility was built. These can be overridden with the
+``LIBC_GPU_TEST_ARCHITECTURE`` and ``LIBC_GPU_LOADER_EXECUTABLE`` :ref:`CMake
+options <gpu_cmake_options>`. Once built, they can be run like any other tests.
+The CMake target depends on how the library was built.
+
+#. **Cross build** - If the C library was built using ``LLVM_ENABLE_PROJECTS``
+   or a runtimes cross build, then the standard targets will be present in the
+   base CMake build directory.
+
+   #. All tests - You can run all supported tests with the command:
+
+      .. code-block:: sh
+
+        $> ninja check-libc
+
+   #. Hermetic tests - You can run hermetic with tests the command:
+
+      .. code-block:: sh
+
+        $> ninja libc-hermetic-tests
+
+   #. Integration tests - You can run integration tests by the command:
+
+      .. code-block:: sh
+
+        $> ninja libc-integration-tests
+
+#. **Runtimes build** - If the library was built using ``LLVM_ENABLE_RUNTIMES``
+   then the actual ``libc`` build will be in a separate directory.
+
+   #. All tests - You can run all supported tests with the command:
+
+      .. code-block:: sh
+
+        $> ninja check-libc-amdgcn-amd-amdhsa
+        $> ninja check-libc-nvptx64-nvidia-cuda
+
+   #. Specific tests - You can use the same targets as above by entering the
+      runtimes build directory.
+
+      .. code-block:: sh
+
+        $> ninja -C runtimes/runtimes-amdgcn-amd-amdhsa-bins check-libc
+        $> ninja -C runtimes/runtimes-nvptx64-nvidia-cuda-bins check-libc
+        $> cd runtimes/runtimes-amdgcn-amd-amdhsa-bins && ninja check-libc
+        $> cd runtimes/runtimes-nvptx64-nvidia-cuda-bins && ninja check-libc
+
+Tests can also be built and run manually using the respective loader utility.
diff --git a/libc/docs/gpu/using.rst b/libc/docs/gpu/using.rst
index 11a00cd620d866..1a9446eeb1130a 100644
--- a/libc/docs/gpu/using.rst
+++ b/libc/docs/gpu/using.rst
@@ -159,17 +159,21 @@ GPUs.
   }
 
 We can then compile this for both NVPTX and AMDGPU into LLVM-IR using the
-following commands.
+following commands. This will yield valid LLVM-IR for the given target just like
+if we were using CUDA, OpenCL, or OpenMP.
 
 .. code-block:: sh
 
   $> clang id.c --target=amdgcn-amd-amdhsa -mcpu=native -nogpulib -flto -c
   $> clang id.c --target=nvptx64-nvidia-cuda -march=native -nogpulib -flto -c
 
-We use this support to treat the GPU as a hosted environment by providing a C
-library and startup object just like a standard C library running on the host
-machine. Then, in order to execute these programs, we provide a loader utility
-to launch the executable on the GPU similar to a cross-compiling emulator.
+We can also use this support to treat the GPU as a hosted environment by
+providing a C library and startup object just like a standard C library running
+on the host machine. Then, in order to execute these programs, we provide a
+loader utility to launch the executable on the GPU similar to a cross-compiling
+emulator. This is how we run :ref:`unit tests <libc_gpu_testing>` targeting the
+GPU. This is clearly not the most efficient way to use a GPU, but it provides a
+simple method to test execution on a GPU for debugging or development.
 
 Building for AMDGPU targets
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/libc/docs/math/index.rst b/libc/docs/math/index.rst
index ed54a7d091ecb1..d337d060fb5dd9 100644
--- a/libc/docs/math/index.rst
+++ b/libc/docs/math/index.rst
@@ -259,6 +259,8 @@ Basic Operations
 +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
 | nanl         | |check| | |check| | |check| | |check| | |check| |         |         | |check| | |check| | |check| |         |         |
 +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
+| nanf128      | |check| | |check| |         | |check| |         |         |         |         |         |         |         |         |
++--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
 | nearbyint    | |check| | |check| | |check| | |check| | |check| |         |         | |check| | |check| | |check| |         |         |
 +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
 | nearbyintf   | |check| | |check| | |check| | |check| | |check| |         |         | |check| | |check| | |check| |         |         |
@@ -273,12 +275,28 @@ Basic Operations
 +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
 | nextafterf128| |check| | |check| |         | |check| |         |         |         |         |         |         |         |         |
 +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
+| nextdown     | |check| | |check| | |check| | |check| |         |         |         |         |         |         |         |         |
++--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
+| nextdownf    | |check| | |check| | |check| | |check| |         |         |         |         |         |         |         |         |
++--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
+| nextdownl    | |check| | |check| | |check| | |check| |         |         |         |         |         |         |         |         |
++--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
+| nextdownf128 | |check| | |check| |         | |check| |         |         |         |         |         |         |         |         |
++--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
 | nexttoward   | |check| | |check| | |check| | |check| | |check| |         |         | |check| | |check| | |check| |         |         |
 +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
 | nexttowardf  | |check| | |check| | |check| | |check| | |check| |         |         | |check| | |check| | |check| |         |         |
 +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
 | nexttowardl  | |check| | |check| | |check| | |check| | |check| |         |         | |check| | |check| | |check| |         |         |
 +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
+| nextup       | |check| | |check| | |check| | |check| |         |         |         |         |         |         |         |         |
++--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
+| nextupf      | |check| | |check| | |check| | |check| |         |         |         |         |         |         |         |         |
++--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
+| nextupl      | |check| | |check| | |check| | |check| |         |         |         |         |         |         |         |         |
++--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
+| nextupf128   | |check| | |check| |         | |check| |         |         |         |         |         |         |         |         |
++--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
 | remainder    | |check| | |check| | |check| | |check| | |check| |         |         | |check| | |check| | |check| |         |         |
 +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
 | remainderf   | |check| | |check| | |check| | |check| | |check| |         |         | |check| | |check| | |check| |         |         |
diff --git a/libc/fuzzing/CMakeLists.txt b/libc/fuzzing/CMakeLists.txt
index a3ef888167ee3c..82487688af1162 100644
--- a/libc/fuzzing/CMakeLists.txt
+++ b/libc/fuzzing/CMakeLists.txt
@@ -1,7 +1,8 @@
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=fuzzer")
 add_custom_target(libc-fuzzer)
 
-add_subdirectory(math)
+# TODO(#85680): Re-enable math fuzzing after headers are sorted out
+# add_subdirectory(math)
 add_subdirectory(stdlib)
 add_subdirectory(stdio)
 add_subdirectory(string)
diff --git a/libc/fuzzing/math/RemQuoDiff.h b/libc/fuzzing/math/RemQuoDiff.h
index 2a962249617673..95a9866f29dbbe 100644
--- a/libc/fuzzing/math/RemQuoDiff.h
+++ b/libc/fuzzing/math/RemQuoDiff.h
@@ -11,7 +11,7 @@
 
 #include "src/__support/FPUtil/FPBits.h"
 
-#include <math.h>
+#include "include/llvm-libc-macros/math-macros.h"
 #include <stddef.h>
 #include <stdint.h>
 
diff --git a/libc/fuzzing/math/SingleInputSingleOutputDiff.h b/libc/fuzzing/math/SingleInputSingleOutputDiff.h
index 8c1e4b0d0290ed..bdf54ee9af6c4f 100644
--- a/libc/fuzzing/math/SingleInputSingleOutputDiff.h
+++ b/libc/fuzzing/math/SingleInputSingleOutputDiff.h
@@ -10,9 +10,7 @@
 #define LLVM_LIBC_FUZZING_MATH_SINGLE_INPUT_SINGLE_OUTPUT_DIFF_H
 
 #include "fuzzing/math/Compare.h"
-#include "src/__support/FPUtil/FPBits.h"
 
-#include <math.h>
 #include <stddef.h>
 #include <stdint.h>
 
diff --git a/libc/fuzzing/math/TwoInputSingleOutputDiff.h b/libc/fuzzing/math/TwoInputSingleOutputDiff.h
index e3e3fe703b139e..f017dbd63f4af5 100644
--- a/libc/fuzzing/math/TwoInputSingleOutputDiff.h
+++ b/libc/fuzzing/math/TwoInputSingleOutputDiff.h
@@ -10,9 +10,7 @@
 #define LLVM_LIBC_FUZZING_MATH_TWO_INPUT_SINGLE_OUTPUT_DIFF_H
 
 #include "fuzzing/math/Compare.h"
-#include "src/__support/FPUtil/FPBits.h"
 
-#include <math.h>
 #include <stddef.h>
 #include <stdint.h>
 
diff --git a/libc/fuzzing/stdlib/strtofloat_fuzz.cpp b/libc/fuzzing/stdlib/strtofloat_fuzz.cpp
index b773043bda67d8..b000321854d16d 100644
--- a/libc/fuzzing/stdlib/strtofloat_fuzz.cpp
+++ b/libc/fuzzing/stdlib/strtofloat_fuzz.cpp
@@ -16,7 +16,7 @@
 
 #include "src/__support/FPUtil/FPBits.h"
 
-#include <math.h>
+#include "include/llvm-libc-macros/math-macros.h"
 #include <stddef.h>
 #include <stdint.h>
 
diff --git a/libc/include/llvm-libc-macros/stdio-macros.h b/libc/include/llvm-libc-macros/stdio-macros.h
index db747c5d5d6755..4664801c5731f0 100644
--- a/libc/include/llvm-libc-macros/stdio-macros.h
+++ b/libc/include/llvm-libc-macros/stdio-macros.h
@@ -9,6 +9,10 @@
 #ifndef LLVM_LIBC_MACROS_STDIO_MACROS_H
 #define LLVM_LIBC_MACROS_STDIO_MACROS_H
 
+#ifndef EOF
+#define EOF (-1)
+#endif
+
 #define BUFSIZ 1024
 
 #endif // LLVM_LIBC_MACROS_STDIO_MACROS_H
diff --git a/libc/spec/posix.td b/libc/spec/posix.td
index 591919aac95dd5..3b793ea90ffd32 100644
--- a/libc/spec/posix.td
+++ b/libc/spec/posix.td
@@ -246,6 +246,7 @@ def POSIX : StandardSpec<"POSIX"> {
       [
         SizeTType,
         OffTType,
+        ModeTType,
       ],
       [], // Enumerations
       [
@@ -310,6 +311,16 @@ def POSIX : StandardSpec<"POSIX"> {
           RetValSpec<IntType>,
           [ArgSpec<VoidPtr>, ArgSpec<SizeTType>, ArgSpec<IntType>]
         >,
+        FunctionSpec<
+          "shm_open",
+          RetValSpec<IntType>,
+          [ArgSpec<ConstCharPtr>, ArgSpec<IntType>, ArgSpec<ModeTType>]
+        >,
+        FunctionSpec<
+          "shm_unlink",
+          RetValSpec<IntType>,
+          [ArgSpec<ConstCharPtr>]
+        >,
       ]
   >;
 
@@ -1158,6 +1169,11 @@ def POSIX : StandardSpec<"POSIX"> {
               RetValSpec<IntType>,
               [ArgSpec<VoidType>]
           >,
+          FunctionSpec<
+            "fileno",
+            RetValSpec<IntType>,
+            [ArgSpec<FILEPtr>]
+          >,
       ]
   >;
 
diff --git a/libc/spec/stdc.td b/libc/spec/stdc.td
index afe01b1bb68566..84d28cc3350304 100644
--- a/libc/spec/stdc.td
+++ b/libc/spec/stdc.td
@@ -536,6 +536,16 @@ def StdC : StandardSpec<"stdc"> {
           FunctionSpec<"nexttoward", RetValSpec<DoubleType>, [ArgSpec<DoubleType>, ArgSpec<LongDoubleType>]>,
           FunctionSpec<"nexttowardl", RetValSpec<LongDoubleType>, [ArgSpec<LongDoubleType>, ArgSpec<LongDoubleType>]>,
 
+          FunctionSpec<"nextdown", RetValSpec<DoubleType>, [ArgSpec<DoubleType>]>,
+          FunctionSpec<"nextdownf", RetValSpec<FloatType>, [ArgSpec<FloatType>]>,
+          FunctionSpec<"nextdownl", RetValSpec<LongDoubleType>, [ArgSpec<LongDoubleType>]>,
+          GuardedFunctionSpec<"nextdownf128", RetValSpec<Float128Type>, [ArgSpec<Float128Type>], "LIBC_TYPES_HAS_FLOAT128">,
+
+          FunctionSpec<"nextup", RetValSpec<DoubleType>, [ArgSpec<DoubleType>]>,
+          FunctionSpec<"nextupf", RetValSpec<FloatType>, [ArgSpec<FloatType>]>,
+          FunctionSpec<"nextupl", RetValSpec<LongDoubleType>, [ArgSpec<LongDoubleType>]>,
+          GuardedFunctionSpec<"nextupf128", RetValSpec<Float128Type>, [ArgSpec<Float128Type>], "LIBC_TYPES_HAS_FLOAT128">,
+
           FunctionSpec<"powf", RetValSpec<FloatType>, [ArgSpec<FloatType>, ArgSpec<FloatType>]>,
           FunctionSpec<"pow", RetValSpec<DoubleType>, [ArgSpec<DoubleType>, ArgSpec<DoubleType>]>,
 
@@ -559,6 +569,7 @@ def StdC : StandardSpec<"stdc"> {
           FunctionSpec<"nanf", RetValSpec<FloatType>, [ArgSpec<ConstCharPtr>]>,
           FunctionSpec<"nan", RetValSpec<DoubleType>, [ArgSpec<ConstCharPtr>]>,
           FunctionSpec<"nanl", RetValSpec<LongDoubleType>, [ArgSpec<ConstCharPtr>]>,
+          GuardedFunctionSpec<"nanf128", RetValSpec<Float128Type>, [ArgSpec<ConstCharPtr>], "LIBC_TYPES_HAS_FLOAT128">,
       ]
   >;
 
diff --git a/libc/src/__support/CPP/array.h b/libc/src/__support/CPP/array.h
index fb5a79225beb7d..4e69ba003e800b 100644
--- a/libc/src/__support/CPP/array.h
+++ b/libc/src/__support/CPP/array.h
@@ -9,6 +9,7 @@
 #ifndef LLVM_LIBC_SRC___SUPPORT_CPP_ARRAY_H
 #define LLVM_LIBC_SRC___SUPPORT_CPP_ARRAY_H
 
+#include "src/__support/CPP/iterator.h" // reverse_iterator
 #include "src/__support/macros/attributes.h"
 #include <stddef.h> // For size_t.
 
@@ -23,6 +24,8 @@ template <class T, size_t N> struct array {
   using value_type = T;
   using iterator = T *;
   using const_iterator = const T *;
+  using reverse_iterator = cpp::reverse_iterator<iterator>;
+  using const_reverse_iterator = cpp::reverse_iterator<const_iterator>;
 
   LIBC_INLINE constexpr T *data() { return Data; }
   LIBC_INLINE constexpr const T *data() const { return Data; }
@@ -45,9 +48,29 @@ template <class T, size_t N> struct array {
 
   LIBC_INLINE constexpr iterator begin() { return Data; }
   LIBC_INLINE constexpr const_iterator begin() const { return Data; }
+  LIBC_INLINE constexpr const_iterator cbegin() const { return begin(); }
 
   LIBC_INLINE constexpr iterator end() { return Data + N; }
   LIBC_INLINE constexpr const_iterator end() const { return Data + N; }
+  LIBC_INLINE constexpr const_iterator cend() const { return end(); }
+
+  LIBC_INLINE constexpr reverse_iterator rbegin() {
+    return reverse_iterator{end()};
+  }
+  LIBC_INLINE constexpr const_reverse_iterator rbegin() const {
+    return const_reverse_iterator{end()};
+  }
+  LIBC_INLINE constexpr const_reverse_iterator crbegin() const {
+    return rbegin();
+  }
+
+  LIBC_INLINE constexpr reverse_iterator rend() {
+    return reverse_iterator{begin()};
+  }
+  LIBC_INLINE constexpr const_reverse_iterator rend() const {
+    return const_reverse_iterator{begin()};
+  }
+  LIBC_INLINE constexpr const_reverse_iterator crend() const { return rend(); }
 };
 
 } // namespace cpp
diff --git a/libc/src/__support/CPP/iterator.h b/libc/src/__support/CPP/iterator.h
new file mode 100644
index 00000000000000..c5bfb1912c7b74
--- /dev/null
+++ b/libc/src/__support/CPP/iterator.h
@@ -0,0 +1,63 @@
+//===-- Standalone implementation of iterator -------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_CPP_ITERATOR_H
+#define LLVM_LIBC_SRC___SUPPORT_CPP_ITERATOR_H
+
+#include "src/__support/CPP/type_traits/enable_if.h"
+#include "src/__support/CPP/type_traits/is_convertible.h"
+#include "src/__support/CPP/type_traits/is_same.h"
+#include "src/__support/macros/attributes.h"
+
+namespace LIBC_NAMESPACE {
+namespace cpp {
+
+template <typename T> struct iterator_traits;
+template <typename T> struct iterator_traits<T *> {
+  using reference = T &;
+};
+
+template <typename Iter> class reverse_iterator {
+  Iter current;
+
+public:
+  using reference = typename iterator_traits<Iter>::reference;
+
+  LIBC_INLINE reverse_iterator() : current() {}
+  LIBC_INLINE constexpr explicit reverse_iterator(Iter it) : current(it) {}
+
+  template <typename Other,
+            cpp::enable_if_t<!cpp::is_same_v<Iter, Other> &&
+                                 cpp::is_convertible_v<const Other &, Iter>,
+                             int> = 0>
+  LIBC_INLINE constexpr explicit reverse_iterator(const Other &it)
+      : current(it) {}
+
+  LIBC_INLINE constexpr reference operator*() const {
+    Iter tmp = current;
+    return *--tmp;
+  }
+  LIBC_INLINE constexpr reverse_iterator operator--() {
+    ++current;
+    return *this;
+  }
+  LIBC_INLINE constexpr reverse_iterator &operator++() {
+    --current;
+    return *this;
+  }
+  LIBC_INLINE constexpr reverse_iterator operator++(int) {
+    reverse_iterator tmp(*this);
+    --current;
+    return tmp;
+  }
+};
+
+} // namespace cpp
+} // namespace LIBC_NAMESPACE
+
+#endif // LLVM_LIBC_SRC___SUPPORT_CPP_ITERATOR_H
diff --git a/libc/src/__support/CPP/string_view.h b/libc/src/__support/CPP/string_view.h
index d23aa261ca0614..8aa96fa87f6986 100644
--- a/libc/src/__support/CPP/string_view.h
+++ b/libc/src/__support/CPP/string_view.h
@@ -179,7 +179,8 @@ class string_view {
   LIBC_INLINE char back() const { return Data[Len - 1]; }
 
   // Finds the first occurence of c in this view, starting at position From.
-  LIBC_INLINE size_t find_first_of(const char c, size_t From = 0) const {
+  LIBC_INLINE constexpr size_t find_first_of(const char c,
+                                             size_t From = 0) const {
     for (size_t Pos = From; Pos < size(); ++Pos)
       if ((*this)[Pos] == c)
         return Pos;
@@ -187,13 +188,29 @@ class string_view {
   }
 
   // Finds the last occurence of c in this view, ending at position End.
-  LIBC_INLINE size_t find_last_of(const char c, size_t End = npos) const {
+  LIBC_INLINE constexpr size_t find_last_of(const char c,
+                                            size_t End = npos) const {
     End = End >= size() ? size() : End + 1;
     for (; End > 0; --End)
       if ((*this)[End - 1] == c)
         return End - 1;
     return npos;
   }
+
+  // Finds the first character not equal to c in this view, starting at position
+  // From.
+  LIBC_INLINE constexpr size_t find_first_not_of(const char c,
+                                                 size_t From = 0) const {
+    for (size_t Pos = From; Pos < size(); ++Pos)
+      if ((*this)[Pos] != c)
+        return Pos;
+    return npos;
+  }
+
+  // Check if this view contains the given character.
+  LIBC_INLINE constexpr bool contains(char c) const {
+    return find_first_of(c) != npos;
+  }
 };
 
 } // namespace cpp
diff --git a/libc/src/__support/FPUtil/FEnvImpl.h b/libc/src/__support/FPUtil/FEnvImpl.h
index 6810659733de2c..a6a533dcfdf4aa 100644
--- a/libc/src/__support/FPUtil/FEnvImpl.h
+++ b/libc/src/__support/FPUtil/FEnvImpl.h
@@ -9,13 +9,12 @@
 #ifndef LLVM_LIBC_SRC___SUPPORT_FPUTIL_FENVIMPL_H
 #define LLVM_LIBC_SRC___SUPPORT_FPUTIL_FENVIMPL_H
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/macros/attributes.h" // LIBC_INLINE
 #include "src/__support/macros/config.h"     // LIBC_HAS_BUILTIN
 #include "src/__support/macros/properties/architectures.h"
 #include "src/errno/libc_errno.h"
-
 #include <fenv.h>
-#include <math.h>
 
 #if defined(LIBC_TARGET_ARCH_IS_AARCH64)
 #if defined(__APPLE__)
diff --git a/libc/src/__support/FPUtil/ManipulationFunctions.h b/libc/src/__support/FPUtil/ManipulationFunctions.h
index f148d984f5f35c..2c90b4888c2e5d 100644
--- a/libc/src/__support/FPUtil/ManipulationFunctions.h
+++ b/libc/src/__support/FPUtil/ManipulationFunctions.h
@@ -15,6 +15,7 @@
 #include "dyadic_float.h"
 #include "rounding_mode.h"
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/CPP/bit.h"
 #include "src/__support/CPP/limits.h" // INT_MAX, INT_MIN
 #include "src/__support/CPP/type_traits.h"
@@ -22,8 +23,6 @@
 #include "src/__support/macros/attributes.h"
 #include "src/__support/macros/optimization.h" // LIBC_UNLIKELY
 
-#include <math.h>
-
 namespace LIBC_NAMESPACE {
 namespace fputil {
 
@@ -230,11 +229,36 @@ LIBC_INLINE T nextafter(T from, U to) {
   return from_bits.get_val();
 }
 
+template <bool IsDown, typename T,
+          cpp::enable_if_t<cpp::is_floating_point_v<T>, int> = 0>
+LIBC_INLINE constexpr T nextupdown(T x) {
+  constexpr Sign sign = IsDown ? Sign::NEG : Sign::POS;
+
+  FPBits<T> xbits(x);
+  if (xbits.is_nan() || xbits == FPBits<T>::max_normal(sign) ||
+      xbits == FPBits<T>::inf(sign))
+    return x;
+
+  using StorageType = typename FPBits<T>::StorageType;
+  if (x != T(0)) {
+    if (xbits.sign() == sign) {
+      xbits = FPBits<T>(StorageType(xbits.uintval() + 1));
+    } else {
+      xbits = FPBits<T>(StorageType(xbits.uintval() - 1));
+    }
+  } else {
+    xbits = FPBits<T>::min_subnormal(sign);
+  }
+
+  return xbits.get_val();
+}
+
 } // namespace fputil
 } // namespace LIBC_NAMESPACE
 
 #ifdef LIBC_TYPES_LONG_DOUBLE_IS_X86_FLOAT80
 #include "x86_64/NextAfterLongDouble.h"
+#include "x86_64/NextUpDownLongDouble.h"
 #endif // LIBC_TYPES_LONG_DOUBLE_IS_X86_FLOAT80
 
 #endif // LLVM_LIBC_SRC___SUPPORT_FPUTIL_MANIPULATIONFUNCTIONS_H
diff --git a/libc/src/__support/FPUtil/NearestIntegerOperations.h b/libc/src/__support/FPUtil/NearestIntegerOperations.h
index 19ae75ea788912..e890e38ba4ae7d 100644
--- a/libc/src/__support/FPUtil/NearestIntegerOperations.h
+++ b/libc/src/__support/FPUtil/NearestIntegerOperations.h
@@ -13,11 +13,10 @@
 #include "FPBits.h"
 #include "rounding_mode.h"
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/CPP/type_traits.h"
 #include "src/__support/common.h"
 
-#include <math.h>
-
 namespace LIBC_NAMESPACE {
 namespace fputil {
 
diff --git a/libc/src/__support/FPUtil/x86_64/NextUpDownLongDouble.h b/libc/src/__support/FPUtil/x86_64/NextUpDownLongDouble.h
new file mode 100644
index 00000000000000..1bc849500be048
--- /dev/null
+++ b/libc/src/__support/FPUtil/x86_64/NextUpDownLongDouble.h
@@ -0,0 +1,60 @@
+//===-- nextupdown implementation for x86 long double numbers ---*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_FPUTIL_X86_64_NEXTUPDOWNLONGDOUBLE_H
+#define LLVM_LIBC_SRC___SUPPORT_FPUTIL_X86_64_NEXTUPDOWNLONGDOUBLE_H
+
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/__support/macros/attributes.h"
+#include "src/__support/macros/properties/architectures.h"
+
+#if !defined(LIBC_TARGET_ARCH_IS_X86)
+#error "Invalid include"
+#endif
+
+namespace LIBC_NAMESPACE::fputil {
+
+template <bool IsDown>
+LIBC_INLINE constexpr long double nextupdown(long double x) {
+  constexpr Sign sign = IsDown ? Sign::NEG : Sign::POS;
+
+  using FPBits_t = FPBits<long double>;
+  FPBits_t xbits(x);
+  if (xbits.is_nan() || xbits == FPBits_t::max_normal(sign) ||
+      xbits == FPBits_t::inf(sign))
+    return x;
+
+  if (x == 0.0l)
+    return FPBits_t::min_subnormal(sign).get_val();
+
+  using StorageType = typename FPBits_t::StorageType;
+
+  if (xbits.sign() == sign) {
+    if (xbits.get_mantissa() == FPBits_t::FRACTION_MASK) {
+      xbits.set_mantissa(0);
+      xbits.set_biased_exponent(xbits.get_biased_exponent() + 1);
+    } else {
+      xbits = FPBits_t(StorageType(xbits.uintval() + 1));
+    }
+
+    return xbits.get_val();
+  }
+
+  if (xbits.get_mantissa() == 0) {
+    xbits.set_mantissa(FPBits_t::FRACTION_MASK);
+    xbits.set_biased_exponent(xbits.get_biased_exponent() - 1);
+  } else {
+    xbits = FPBits_t(StorageType(xbits.uintval() - 1));
+  }
+
+  return xbits.get_val();
+}
+
+} // namespace LIBC_NAMESPACE::fputil
+
+#endif // LLVM_LIBC_SRC___SUPPORT_FPUTIL_X86_64_NEXTUPDOWNLONGDOUBLE_H
diff --git a/libc/src/__support/UInt.h b/libc/src/__support/UInt.h
index d92d61ed094ebe..df01e081e3c19e 100644
--- a/libc/src/__support/UInt.h
+++ b/libc/src/__support/UInt.h
@@ -993,6 +993,68 @@ struct is_big_int<BigInt<Bits, Signed, T>> : cpp::true_type {};
 template <class T>
 LIBC_INLINE_VAR constexpr bool is_big_int_v = is_big_int<T>::value;
 
+// extensions of type traits to include BigInt
+
+// is_integral_or_big_int
+template <typename T>
+struct is_integral_or_big_int
+    : cpp::bool_constant<(cpp::is_integral_v<T> || is_big_int_v<T>)> {};
+
+template <typename T>
+LIBC_INLINE_VAR constexpr bool is_integral_or_big_int_v =
+    is_integral_or_big_int<T>::value;
+
+// make_big_int_unsigned
+template <typename T> struct make_big_int_unsigned;
+
+template <size_t Bits, bool Signed, typename T>
+struct make_big_int_unsigned<BigInt<Bits, Signed, T>>
+    : cpp::type_identity<BigInt<Bits, false, T>> {};
+
+template <typename T>
+using make_big_int_unsigned_t = typename make_big_int_unsigned<T>::type;
+
+// make_big_int_signed
+template <typename T> struct make_big_int_signed;
+
+template <size_t Bits, bool Signed, typename T>
+struct make_big_int_signed<BigInt<Bits, Signed, T>>
+    : cpp::type_identity<BigInt<Bits, true, T>> {};
+
+template <typename T>
+using make_big_int_signed_t = typename make_big_int_signed<T>::type;
+
+// make_integral_or_big_int_unsigned
+template <typename T, class = void> struct make_integral_or_big_int_unsigned;
+
+template <typename T>
+struct make_integral_or_big_int_unsigned<
+    T, cpp::enable_if_t<cpp::is_integral_v<T>>> : cpp::make_unsigned<T> {};
+
+template <typename T>
+struct make_integral_or_big_int_unsigned<T, cpp::enable_if_t<is_big_int_v<T>>>
+    : make_big_int_unsigned<T> {};
+
+template <typename T>
+using make_integral_or_big_int_unsigned_t =
+    typename make_integral_or_big_int_unsigned<T>::type;
+
+// make_integral_or_big_int_signed
+template <typename T, class = void> struct make_integral_or_big_int_signed;
+
+template <typename T>
+struct make_integral_or_big_int_signed<T,
+                                       cpp::enable_if_t<cpp::is_integral_v<T>>>
+    : cpp::make_signed<T> {};
+
+template <typename T>
+struct make_integral_or_big_int_signed<T, cpp::enable_if_t<is_big_int_v<T>>>
+    : make_big_int_signed<T> {};
+
+template <typename T>
+using make_integral_or_big_int_signed_t =
+    typename make_integral_or_big_int_signed<T>::type;
+
 namespace cpp {
 
 // Specialization of cpp::bit_cast ('bit.h') from T to BigInt.
diff --git a/libc/src/__support/blockstore.h b/libc/src/__support/blockstore.h
index dc5fdd1b92fc29..ac0eb22692b40c 100644
--- a/libc/src/__support/blockstore.h
+++ b/libc/src/__support/blockstore.h
@@ -16,7 +16,6 @@
 #include <stdint.h>
 
 namespace LIBC_NAMESPACE {
-namespace cpp {
 
 // The difference between BlockStore a traditional vector types is that,
 // when more capacity is desired, a new block is added instead of allocating
@@ -203,7 +202,6 @@ void BlockStore<T, BLOCK_SIZE, REVERSE_ORDER>::destroy(
 template <typename T, size_t BLOCK_SIZE>
 using ReverseOrderBlockStore = BlockStore<T, BLOCK_SIZE, true>;
 
-} // namespace cpp
 } // namespace LIBC_NAMESPACE
 
 #endif // LLVM_LIBC_SRC___SUPPORT_BLOCKSTORE_H
diff --git a/libc/src/__support/integer_to_string.h b/libc/src/__support/integer_to_string.h
index ac0bdd688aeab2..f72d00d1a7456d 100644
--- a/libc/src/__support/integer_to_string.h
+++ b/libc/src/__support/integer_to_string.h
@@ -67,7 +67,7 @@
 #include "src/__support/CPP/span.h"
 #include "src/__support/CPP/string_view.h"
 #include "src/__support/CPP/type_traits.h"
-#include "src/__support/UInt.h" // is_big_int
+#include "src/__support/UInt.h" // make_integral_or_big_int_unsigned_t
 #include "src/__support/common.h"
 
 namespace LIBC_NAMESPACE {
@@ -150,18 +150,6 @@ template <bool forward> class StringBufferWriterImpl {
 using StringBufferWriter = StringBufferWriterImpl<true>;
 using BackwardStringBufferWriter = StringBufferWriterImpl<false>;
 
-template <typename T, class = void> struct IntegerWriterUnsigned {};
-
-template <typename T>
-struct IntegerWriterUnsigned<T, cpp::enable_if_t<cpp::is_integral_v<T>>> {
-  using type = cpp::make_unsigned_t<T>;
-};
-
-template <typename T>
-struct IntegerWriterUnsigned<T, cpp::enable_if_t<is_big_int_v<T>>> {
-  using type = typename T::unsigned_type;
-};
-
 } // namespace details
 
 namespace radix {
@@ -222,7 +210,7 @@ template <typename T, typename Fmt = radix::Dec> class IntegerToString {
   // An internal stateless structure that handles the number formatting logic.
   struct IntegerWriter {
     static_assert(cpp::is_integral_v<T> || is_big_int_v<T>);
-    using UNSIGNED_T = typename details::IntegerWriterUnsigned<T>::type;
+    using UNSIGNED_T = make_integral_or_big_int_unsigned_t<T>;
 
     LIBC_INLINE static char digit_char(uint8_t digit) {
       if (digit < 10)
diff --git a/libc/src/__support/str_to_float.h b/libc/src/__support/str_to_float.h
index 073e1dc6721723..2cf2cfb027243e 100644
--- a/libc/src/__support/str_to_float.h
+++ b/libc/src/__support/str_to_float.h
@@ -1056,17 +1056,17 @@ hexadecimal_string_to_float(const char *__restrict src,
   return output;
 }
 
-LIBC_INLINE uint64_t
+template <class T>
+LIBC_INLINE typename fputil::FPBits<T>::StorageType
 nan_mantissa_from_ncharseq(const cpp::string_view ncharseq) {
-  uint64_t nan_mantissa = 0;
+  using FPBits = typename fputil::FPBits<T>;
+  using StorageType = typename FPBits::StorageType;
+
+  StorageType nan_mantissa = 0;
 
   if (ncharseq.data() != nullptr && isdigit(ncharseq[0])) {
-    // This is to prevent errors when StorageType is larger than 64
-    // bits, since strtointeger only supports up to 64 bits. This is
-    // actually more than is required by the specification, which says
-    // for the input type "NAN(n-char-sequence)" that "the meaning of
-    // the n-char sequence is implementation-defined."
-    auto strtoint_result = strtointeger<uint64_t>(ncharseq.data(), 0);
+    StrToNumResult<StorageType> strtoint_result =
+        strtointeger<StorageType>(ncharseq.data(), 0);
     if (!strtoint_result.has_error())
       nan_mantissa = strtoint_result.value;
 
@@ -1172,9 +1172,8 @@ LIBC_INLINE StrToNumResult<T> strtofloatingpoint(const char *__restrict src) {
           ++index;
         if (src[index] == ')') {
           ++index;
-          auto nan_mantissa_result = nan_mantissa_from_ncharseq(
+          nan_mantissa = nan_mantissa_from_ncharseq<T>(
               cpp::string_view(src + (left_paren + 1), index - left_paren - 2));
-          nan_mantissa = static_cast<StorageType>(nan_mantissa_result);
         } else {
           index = left_paren;
         }
@@ -1221,11 +1220,8 @@ template <class T> LIBC_INLINE StrToNumResult<T> strtonan(const char *arg) {
   while (isalnum(arg[index]) || arg[index] == '_')
     ++index;
 
-  if (arg[index] == '\0') {
-    auto nan_mantissa_result =
-        nan_mantissa_from_ncharseq(cpp::string_view(arg, index));
-    nan_mantissa = static_cast<StorageType>(nan_mantissa_result);
-  }
+  if (arg[index] == '\0')
+    nan_mantissa = nan_mantissa_from_ncharseq<T>(cpp::string_view(arg, index));
 
   result = FPBits::quiet_nan(fputil::Sign::POS, nan_mantissa);
   return {result.get_val(), 0, error};
diff --git a/libc/src/__support/str_to_integer.h b/libc/src/__support/str_to_integer.h
index b87808993fee50..02c71d40a1c0ad 100644
--- a/libc/src/__support/str_to_integer.h
+++ b/libc/src/__support/str_to_integer.h
@@ -11,6 +11,7 @@
 
 #include "src/__support/CPP/limits.h"
 #include "src/__support/CPP/type_traits.h"
+#include "src/__support/UInt128.h"
 #include "src/__support/common.h"
 #include "src/__support/ctype_utils.h"
 #include "src/__support/str_to_num_result.h"
@@ -75,8 +76,12 @@ template <class T>
 LIBC_INLINE StrToNumResult<T>
 strtointeger(const char *__restrict src, int base,
              const size_t src_len = cpp::numeric_limits<size_t>::max()) {
-  // TODO: Rewrite to support numbers longer than long long
-  unsigned long long result = 0;
+  using ResultType = typename cpp::conditional_t<(cpp::is_same_v<T, UInt128> ||
+                                                  cpp::is_same_v<T, Int128>),
+                                                 UInt128, unsigned long long>;
+
+  ResultType result = 0;
+
   bool is_number = false;
   size_t src_cur = 0;
   int error_val = 0;
@@ -101,15 +106,16 @@ strtointeger(const char *__restrict src, int base,
   if (base == 16 && is_hex_start(src + src_cur, src_len - src_cur))
     src_cur = src_cur + 2;
 
-  constexpr bool IS_UNSIGNED = (cpp::numeric_limits<T>::min() == 0);
+  constexpr bool IS_UNSIGNED = cpp::is_unsigned_v<T>;
   const bool is_positive = (result_sign == '+');
-  unsigned long long constexpr NEGATIVE_MAX =
-      !IS_UNSIGNED
-          ? static_cast<unsigned long long>(cpp::numeric_limits<T>::max()) + 1
-          : cpp::numeric_limits<T>::max();
-  unsigned long long const abs_max =
+
+  ResultType constexpr NEGATIVE_MAX =
+      !IS_UNSIGNED ? static_cast<ResultType>(cpp::numeric_limits<T>::max()) + 1
+                   : cpp::numeric_limits<T>::max();
+  ResultType const abs_max =
       (is_positive ? cpp::numeric_limits<T>::max() : NEGATIVE_MAX);
-  unsigned long long const abs_max_div_by_base = abs_max / base;
+  ResultType const abs_max_div_by_base = abs_max / base;
+
   while (src_cur < src_len && isalnum(src[src_cur])) {
     int cur_digit = b36_char_to_int(src[src_cur]);
     if (cur_digit >= base)
@@ -149,10 +155,12 @@ strtointeger(const char *__restrict src, int base,
       return {cpp::numeric_limits<T>::min(), str_len, error_val};
   }
 
-  return {is_positive
-              ? static_cast<T>(result)
-              : static_cast<T>(-static_cast<cpp::make_unsigned_t<T>>(result)),
-          str_len, error_val};
+  return {
+      is_positive
+          ? static_cast<T>(result)
+          : static_cast<T>(
+                -static_cast<make_integral_or_big_int_unsigned_t<T>>(result)),
+      str_len, error_val};
 }
 
 } // namespace internal
diff --git a/libc/src/math/CMakeLists.txt b/libc/src/math/CMakeLists.txt
index 750fd5f0e3a9ba..5e2e6e699d0e0c 100644
--- a/libc/src/math/CMakeLists.txt
+++ b/libc/src/math/CMakeLists.txt
@@ -190,6 +190,7 @@ add_math_entrypoint_object(modff128)
 add_math_entrypoint_object(nan)
 add_math_entrypoint_object(nanf)
 add_math_entrypoint_object(nanl)
+add_math_entrypoint_object(nanf128)
 
 add_math_entrypoint_object(nearbyint)
 add_math_entrypoint_object(nearbyintf)
@@ -204,6 +205,16 @@ add_math_entrypoint_object(nexttoward)
 add_math_entrypoint_object(nexttowardf)
 add_math_entrypoint_object(nexttowardl)
 
+add_math_entrypoint_object(nextdown)
+add_math_entrypoint_object(nextdownf)
+add_math_entrypoint_object(nextdownl)
+add_math_entrypoint_object(nextdownf128)
+
+add_math_entrypoint_object(nextup)
+add_math_entrypoint_object(nextupf)
+add_math_entrypoint_object(nextupl)
+add_math_entrypoint_object(nextupf128)
+
 add_math_entrypoint_object(pow)
 add_math_entrypoint_object(powf)
 
diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt
index 667381d615d1e0..176c32e47768df 100644
--- a/libc/src/math/generic/CMakeLists.txt
+++ b/libc/src/math/generic/CMakeLists.txt
@@ -1780,6 +1780,19 @@ add_entrypoint_object(
     -O3
 )
 
+add_entrypoint_object(
+  nanf128
+  SRCS
+    nanf128.cpp
+  HDRS
+    ../nanf128.h
+  DEPENDS
+    libc.src.__support.str_to_float
+    libc.src.errno.errno
+  COMPILE_OPTIONS
+    -O3
+)
+
 add_entrypoint_object(
   nextafter
   SRCS
@@ -1865,6 +1878,104 @@ add_entrypoint_object(
     -O3
 )
 
+add_entrypoint_object(
+  nextdown
+  SRCS
+    nextdown.cpp
+  HDRS
+    ../nextdown.h
+  DEPENDS
+    libc.src.__support.FPUtil.manipulation_functions
+  COMPILE_OPTIONS
+    -O3
+)
+
+add_entrypoint_object(
+  nextdownl
+  SRCS
+    nextdownl.cpp
+  HDRS
+    ../nextdownl.h
+  DEPENDS
+    libc.src.__support.FPUtil.manipulation_functions
+  COMPILE_OPTIONS
+    -O3
+)
+
+add_entrypoint_object(
+  nextdownf
+  SRCS
+    nextdownf.cpp
+  HDRS
+    ../nextdownf.h
+  DEPENDS
+    libc.src.__support.FPUtil.manipulation_functions
+  COMPILE_OPTIONS
+    -O3
+)
+
+add_entrypoint_object(
+  nextdownf128
+  SRCS
+    nextdownf128.cpp
+  HDRS
+    ../nextdownf128.h
+  DEPENDS
+    libc.src.__support.macros.properties.types
+    libc.src.__support.FPUtil.manipulation_functions
+  COMPILE_OPTIONS
+    -O3
+)
+
+add_entrypoint_object(
+  nextup
+  SRCS
+    nextup.cpp
+  HDRS
+    ../nextup.h
+  DEPENDS
+    libc.src.__support.FPUtil.manipulation_functions
+  COMPILE_OPTIONS
+    -O3
+)
+
+add_entrypoint_object(
+  nextupl
+  SRCS
+    nextupl.cpp
+  HDRS
+    ../nextupl.h
+  DEPENDS
+    libc.src.__support.FPUtil.manipulation_functions
+  COMPILE_OPTIONS
+    -O3
+)
+
+add_entrypoint_object(
+  nextupf
+  SRCS
+    nextupf.cpp
+  HDRS
+    ../nextupf.h
+  DEPENDS
+    libc.src.__support.FPUtil.manipulation_functions
+  COMPILE_OPTIONS
+    -O3
+)
+
+add_entrypoint_object(
+  nextupf128
+  SRCS
+    nextupf128.cpp
+  HDRS
+    ../nextupf128.h
+  DEPENDS
+    libc.src.__support.macros.properties.types
+    libc.src.__support.FPUtil.manipulation_functions
+  COMPILE_OPTIONS
+    -O3
+)
+
 add_entrypoint_object(
   fmod
   SRCS
@@ -2044,16 +2155,9 @@ add_object_library(
   SRCS
     inv_trigf_utils.cpp
   DEPENDS
-    .math_utils
-    libc.src.__support.FPUtil.fp_bits
-    libc.src.__support.FPUtil.fenv_impl
-    libc.src.__support.FPUtil.nearest_integer
-    libc.src.__support.FPUtil.nearest_integer_operations
+    libc.src.__support.FPUtil.multiply_add
     libc.src.__support.FPUtil.polyeval
     libc.src.__support.common
-    libc.include.errno
-    libc.src.errno.errno
-    libc.include.math
 )
 
 add_entrypoint_object(
@@ -2100,8 +2204,11 @@ add_entrypoint_object(
     ../atanf.h
   DEPENDS
     .inv_trigf_utils
-    .math_utils
+    libc.src.__support.FPUtil.except_value_utils
     libc.src.__support.FPUtil.fp_bits
+    libc.src.__support.FPUtil.multiply_add
+    libc.src.__support.FPUtil.nearest_integer
+    libc.src.__support.FPUtil.polyeval
     libc.src.__support.FPUtil.rounding_mode
     libc.src.__support.macros.optimization
   COMPILE_OPTIONS
diff --git a/libc/src/math/generic/atanf.cpp b/libc/src/math/generic/atanf.cpp
index e0f8a1bfc2ecc9..5f66ea52d0d7ae 100644
--- a/libc/src/math/generic/atanf.cpp
+++ b/libc/src/math/generic/atanf.cpp
@@ -7,11 +7,14 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/math/atanf.h"
-#include "math_utils.h"
+#include "inv_trigf_utils.h"
 #include "src/__support/FPUtil/FPBits.h"
+#include "src/__support/FPUtil/PolyEval.h"
+#include "src/__support/FPUtil/except_value_utils.h"
+#include "src/__support/FPUtil/multiply_add.h"
+#include "src/__support/FPUtil/nearest_integer.h"
 #include "src/__support/FPUtil/rounding_mode.h"
 #include "src/__support/macros/optimization.h" // LIBC_UNLIKELY
-#include "src/math/generic/inv_trigf_utils.h"
 
 namespace LIBC_NAMESPACE {
 
@@ -19,48 +22,102 @@ LLVM_LIBC_FUNCTION(float, atanf, (float x)) {
   using FPBits = typename fputil::FPBits<float>;
   using Sign = fputil::Sign;
 
-  // x == 0.0
-  if (LIBC_UNLIKELY(x == 0.0f))
-    return x;
+  constexpr double FINAL_SIGN[2] = {1.0, -1.0};
+  constexpr double SIGNED_PI_OVER_2[2] = {0x1.921fb54442d18p0,
+                                          -0x1.921fb54442d18p0};
 
-  FPBits xbits(x);
-  Sign sign = xbits.sign();
-  xbits.set_sign(Sign::POS);
+  FPBits x_bits(x);
+  Sign sign = x_bits.sign();
+  x_bits.set_sign(Sign::POS);
+  uint32_t x_abs = x_bits.uintval();
 
-  if (LIBC_UNLIKELY(xbits.is_inf_or_nan())) {
-    if (xbits.is_inf())
-      return static_cast<float>(
-          opt_barrier(sign.is_neg() ? -M_MATH_PI_2 : M_MATH_PI_2));
-    else
+  // x is inf or nan, |x| < 2^-4 or |x|= > 16.
+  if (LIBC_UNLIKELY(x_abs <= 0x3d80'0000U || x_abs >= 0x4180'0000U)) {
+    double x_d = static_cast<double>(x);
+    double const_term = 0.0;
+    if (LIBC_UNLIKELY(x_abs >= 0x4180'0000)) {
+      // atan(+-Inf) = +-pi/2.
+      if (x_bits.is_inf()) {
+        volatile double sign_pi_over_2 = SIGNED_PI_OVER_2[sign.is_neg()];
+        return static_cast<float>(sign_pi_over_2);
+      }
+      if (x_bits.is_nan())
+        return x;
+      // x >= 16
+      x_d = -1.0 / x_d;
+      const_term = SIGNED_PI_OVER_2[sign.is_neg()];
+    }
+    // 0 <= x < 1/16;
+    if (LIBC_UNLIKELY(x_bits.is_zero()))
       return x;
-  }
-  // |x| == 0.06905200332403183
-  if (LIBC_UNLIKELY(xbits.uintval() == 0x3d8d6b23U)) {
-    if (fputil::fenv_is_round_to_nearest()) {
-      // 0.06894256919622421
-      FPBits br(0x3d8d31c3U);
-      br.set_sign(sign);
-      return br.get_val();
+    // x <= 2^-12;
+    if (LIBC_UNLIKELY(x_abs < 0x3980'0000)) {
+#if defined(LIBC_TARGET_CPU_HAS_FMA)
+      return fputil::multiply_add(x, -0x1.0p-25f, x);
+#else
+      double x_d = static_cast<double>(x);
+      return static_cast<float>(fputil::multiply_add(x_d, -0x1.0p-25, x_d));
+#endif // LIBC_TARGET_CPU_HAS_FMA
     }
+    // Use Taylor polynomial:
+    //   atan(x) ~ x * (1 - x^2 / 3 + x^4 / 5 - x^6 / 7 + x^8 / 9 - x^10 / 11).
+    double x2 = x_d * x_d;
+    double x4 = x2 * x2;
+    double c0 = fputil::multiply_add(x2, ATAN_COEFFS[0][1], ATAN_COEFFS[0][0]);
+    double c1 = fputil::multiply_add(x2, ATAN_COEFFS[0][3], ATAN_COEFFS[0][2]);
+    double c2 = fputil::multiply_add(x2, ATAN_COEFFS[0][5], ATAN_COEFFS[0][4]);
+    double p = fputil::polyeval(x4, c0, c1, c2);
+    double r = fputil::multiply_add(x_d, p, const_term);
+    return static_cast<float>(r);
   }
 
-  // |x| == 1.8670953512191772
-  if (LIBC_UNLIKELY(xbits.uintval() == 0x3feefcfbU)) {
-    int rounding_mode = fputil::quick_get_round();
-    if (sign.is_neg()) {
-      if (rounding_mode == FE_DOWNWARD) {
-        // -1.0790828466415405
-        return FPBits(0xbf8a1f63U).get_val();
-      }
-    } else {
-      if (rounding_mode == FE_UPWARD) {
-        // 1.0790828466415405
-        return FPBits(0x3f8a1f63U).get_val();
-      }
+  // Range reduction steps:
+  // 1)  atan(x) = sign(x) * atan(|x|)
+  // 2)  If |x| > 1, atan(|x|) = pi/2 - atan(1/|x|)
+  // 3)  For 1/16 < x <= 1, we find k such that: |x - k/16| <= 1/32.
+  // 4)  Then we use polynomial approximation:
+  //   atan(x) ~ atan((k/16) + (x - (k/16)) * Q(x - k/16)
+  //           = P(x - k/16)
+  double x_d, const_term, final_sign;
+  int idx;
+
+  if (x_abs > 0x3f80'0000U) {
+    // Exceptional value:
+    if (LIBC_UNLIKELY(x_abs == 0x3ffe'2ec1U)) { // |x| = 0x1.fc5d82p+0
+      return sign.is_pos() ? fputil::round_result_slightly_up(0x1.1ab2fp0f)
+                           : fputil::round_result_slightly_down(-0x1.1ab2fp0f);
     }
+    // |x| > 1, we need to invert x, so we will perform range reduction in
+    // double precision.
+    x_d = 1.0 / static_cast<double>(x_bits.get_val());
+    double k_d = fputil::nearest_integer(x_d * 0x1.0p4);
+    x_d = fputil::multiply_add(k_d, -0x1.0p-4, x_d);
+    idx = static_cast<int>(k_d);
+    final_sign = FINAL_SIGN[sign.is_pos()];
+    // Adjust constant term of the polynomial by +- pi/2.
+    const_term = fputil::multiply_add(final_sign, ATAN_COEFFS[idx][0],
+                                      SIGNED_PI_OVER_2[sign.is_neg()]);
+  } else {
+    // Exceptional value:
+    if (LIBC_UNLIKELY(x_abs == 0x3dbb'6ac7U)) { // |x| = 0x1.76d58ep-4
+      return sign.is_pos()
+                 ? fputil::round_result_slightly_up(0x1.75cb06p-4f)
+                 : fputil::round_result_slightly_down(-0x1.75cb06p-4f);
+    }
+    // Perform range reduction in single precision.
+    float x_f = x_bits.get_val();
+    float k_f = fputil::nearest_integer(x_f * 0x1.0p4f);
+    x_f = fputil::multiply_add(k_f, -0x1.0p-4f, x_f);
+    x_d = static_cast<double>(x_f);
+    idx = static_cast<int>(k_f);
+    final_sign = FINAL_SIGN[sign.is_neg()];
+    const_term = final_sign * ATAN_COEFFS[idx][0];
   }
 
-  return static_cast<float>(atan_eval(x));
+  double p = atan_eval(x_d, idx);
+  double r = fputil::multiply_add(final_sign * x_d, p, const_term);
+
+  return static_cast<float>(r);
 }
 
 } // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/generic/inv_trigf_utils.cpp b/libc/src/math/generic/inv_trigf_utils.cpp
index 8013c0470affb1..93d5bcbf7b567d 100644
--- a/libc/src/math/generic/inv_trigf_utils.cpp
+++ b/libc/src/math/generic/inv_trigf_utils.cpp
@@ -10,19 +10,71 @@
 
 namespace LIBC_NAMESPACE {
 
-// N[Table[ArcTan[x], {x, 1/16, 16/16, 1/16}], 40]
-alignas(64) const double ATAN_T[ATAN_T_SIZE] = {
-    0x1.ff55bb72cfdeap-5, 0x1.fd5ba9aac2f6ep-4, 0x1.7b97b4bce5b02p-3,
-    0x1.f5b75f92c80ddp-3, 0x1.362773707ebccp-2, 0x1.6f61941e4def1p-2,
-    0x1.a64eec3cc23fdp-2, 0x1.dac670561bb4fp-2, 0x1.0657e94db30d0p-1,
-    0x1.1e00babdefeb4p-1, 0x1.345f01cce37bbp-1, 0x1.4978fa3269ee1p-1,
-    0x1.5d58987169b18p-1, 0x1.700a7c5784634p-1, 0x1.819d0b7158a4dp-1,
-    0x1.921fb54442d18p-1};
-
-// for(int i = 0; i < 5; i++)
-//     printf("%.13a,\n", (-2 * (i % 2) + 1) * 1.0 / (2 * i + 1));
-alignas(64) const double ATAN_K[5] = {
-    0x1.0000000000000p+0, -0x1.5555555555555p-2, 0x1.999999999999ap-3,
-    -0x1.2492492492492p-3, 0x1.c71c71c71c71cp-4};
+// Polynomial approximation for 0 <= x <= 1:
+//   atan(x) ~ atan((i/16) + (x - (i/16)) * Q(x - i/16)
+//           = P(x - i/16)
+// Generated by Sollya with:
+// > for i from 1 to 16 do {
+//     mid_point = i/16;
+//     P = fpminimax(atan(mid_point + x), 7, [|D...|], [-1/32, 1/32]);
+//     print("{", coeff(P, 0), ",", coeff(P, 1), ",", coeff(P, 2), ",",
+//           coeff(P, 3), ",", coeff(P, 4), ",", coeff(P, 5), ",", coeff(P, 6),
+//           ",", coeff(P, 7), "},");
+//   };
+// For i = 0, ATAN_COEFFS[0][j] = (-1)^j * (1/(2*j + 1)) is the odd coefficients
+// of the Taylor polynomial of atan(x).
+double ATAN_COEFFS[17][8] = {
+    {0x1.0000000000000p+0, -0x1.5555555555555p-2, 0x1.999999999999ap-3,
+     -0x1.2492492492492p-3, 0x1.c71c71c71c71cp-4, -0x1.745d1745d1746p-4,
+     0x1.3b13b13b13b14p-4, -0x1.1111111111111p-4},
+    {0x1.ff55bb72cfdb1p-5, 0x1.fe01fe01fe1bp-1, -0x1.fc05f80821d1ap-5,
+     -0x1.4d6930419fc5fp-2, 0x1.f61b9f6d69313p-5, 0x1.8208a32f4346cp-3,
+     -0x1.ecb8fc53d04efp-5, -0x1.060710cb59cbcp-3},
+    {0x1.fd5ba9aac2f3cp-4, 0x1.f81f81f81f96ap-1, -0x1.f05e09cf4c1b2p-4,
+     -0x1.368c3aac7543ep-2, 0x1.d9b14bddfac55p-4, 0x1.4048e55ec725ep-3,
+     -0x1.b98ca3c1594b5p-4, -0x1.664eabaabbc16p-4},
+    {0x1.7b97b4bce5ae7p-3, 0x1.ee9c7f8458f06p-1, -0x1.665c226c8dc69p-3,
+     -0x1.1344bb77961b7p-2, 0x1.42ac97745d3ccp-3, 0x1.c32e142047ec1p-4,
+     -0x1.137ae41ab96cbp-3, -0x1.1a6ae8c09a4b6p-5},
+    {0x1.f5b75f92c80c6p-3, 0x1.e1e1e1e1e1ed4p-1, -0x1.c5894d101ad4p-3,
+     -0x1.ce6de02b38c38p-3, 0x1.78a3920c336b9p-3, 0x1.dd5ff94a9d499p-5,
+     -0x1.1ac2d3f9d072ep-3, 0x1.0af9735dff373p-6},
+    {0x1.362773707ebc5p-2, 0x1.d272ca3fc5b8bp-1, -0x1.0997e8ae90cb6p-2,
+     -0x1.6cf6667146798p-3, 0x1.8dd1dff17f3d3p-3, 0x1.24860eced656fp-7,
+     -0x1.f4220e8f18ed5p-4, 0x1.b700aed7cdc34p-5},
+    {0x1.6f61941e4deeep-2, 0x1.c0e070381c115p-1, -0x1.2726dd1347c7ep-2,
+     -0x1.09f37b3ad010dp-3, 0x1.85eaca5196f5cp-3, -0x1.04d640117852ap-5,
+     -0x1.802c2956871c7p-4, 0x1.2992b45df0ee7p-4},
+    {0x1.a64eec3cc23fep-2, 0x1.adbe87f94906bp-1, -0x1.3b9d8eab5eae5p-2,
+     -0x1.57c09646faabbp-4, 0x1.6795330e73aep-3, -0x1.f2d89a702a652p-5,
+     -0x1.f3afd90a9d4d7p-5, 0x1.3261723d3f153p-4},
+    {0x1.dac670561bb53p-2, 0x1.999999999998fp-1, -0x1.47ae147afd8cap-2,
+     -0x1.5d867c3dfd72ap-5, 0x1.3a92a76cba833p-3, -0x1.3ec460286928ap-4,
+     -0x1.ed02ff86892acp-6, 0x1.0a674c8f05727p-4},
+    {0x1.0657e94db30d2p-1, 0x1.84f00c27805ffp-1, -0x1.4c62cb564f677p-2,
+     -0x1.e6495b262dfe7p-8, 0x1.063c34eca262bp-3, -0x1.58b78dc79b5aep-4,
+     -0x1.4623815233be1p-8, 0x1.93afe94328089p-5},
+    {0x1.1e00babdefeb6p-1, 0x1.702e05c0b8159p-1, -0x1.4af2b78236bd6p-2,
+     0x1.5d0b7ea46ed08p-6, 0x1.a124870236935p-4, -0x1.519e1ec133a88p-4,
+     0x1.a54632a3f48c7p-7, 0x1.099ca0945096dp-5},
+    {0x1.345f01cce37bdp-1, 0x1.5babcc647fa7ep-1, -0x1.449db09443a67p-2,
+     0x1.655caac78a0fcp-5, 0x1.3bbbdb0d09efap-4, -0x1.34a306c27e021p-4,
+     0x1.83fe749c7966p-6, 0x1.2057cc96d9edcp-6},
+    {0x1.4978fa3269ee2p-1, 0x1.47ae147ae146bp-1, -0x1.3a92a305652e1p-2,
+     0x1.ec21b5172657fp-5, 0x1.c2f8c45d2f4eep-5, -0x1.0ba99c4aeb8acp-4,
+     0x1.d716a4af4d1d6p-6, 0x1.97fba0a9696dep-8},
+    {0x1.5d58987169b19p-1, 0x1.34679ace0133cp-1, -0x1.2ddfb03920e2fp-2,
+     0x1.2491307c0fa0bp-4, 0x1.29c7eca0136fp-5, -0x1.bca792caa6f1cp-5,
+     0x1.e5d92545576bcp-6, -0x1.8ca76fcf5ccd2p-10},
+    {0x1.700a7c5784634p-1, 0x1.21fb78121fb71p-1, -0x1.1f6a8499ea541p-2,
+     0x1.41b15e5e77bcfp-4, 0x1.59bc9bf54fb02p-6, -0x1.63b54ff058e0fp-5,
+     0x1.c8da01221306fp-6, -0x1.906b2c274c39cp-8},
+    {0x1.819d0b7158a4dp-1, 0x1.107fbbe01107cp-1, -0x1.0feeb40897d4ep-2,
+     0x1.50e5afb95f5d6p-4, 0x1.2a7c2f0c7495dp-7, -0x1.12bd2bb5062cdp-5,
+     0x1.93e8ceb89afebp-6, -0x1.10da9b8c6b731p-7},
+    {0x1.921fb54442d18p-1, 0x1.fffffffffffebp-2, -0x1.fffffffffcbbcp-3,
+     0x1.555555564e2fep-4, -0x1.20b17d5dd89dcp-30, -0x1.9999c5ad71711p-6,
+     0x1.5558b76e7aaf9p-6, -0x1.236e803c6c1f6p-7},
+};
 
 } // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/generic/inv_trigf_utils.h b/libc/src/math/generic/inv_trigf_utils.h
index 20f912de2ac008..c621f27e101460 100644
--- a/libc/src/math/generic/inv_trigf_utils.h
+++ b/libc/src/math/generic/inv_trigf_utils.h
@@ -9,85 +9,32 @@
 #ifndef LLVM_LIBC_SRC_MATH_GENERIC_INV_TRIGF_UTILS_H
 #define LLVM_LIBC_SRC_MATH_GENERIC_INV_TRIGF_UTILS_H
 
-#include "math_utils.h"
-#include "src/__support/FPUtil/FEnvImpl.h"
-#include "src/__support/FPUtil/FPBits.h"
 #include "src/__support/FPUtil/PolyEval.h"
-#include "src/__support/FPUtil/nearest_integer.h"
+#include "src/__support/FPUtil/multiply_add.h"
 #include "src/__support/common.h"
 
-#include <errno.h>
-
 namespace LIBC_NAMESPACE {
 
 // PI and PI / 2
 constexpr double M_MATH_PI = 0x1.921fb54442d18p+1;
 constexpr double M_MATH_PI_2 = 0x1.921fb54442d18p+0;
 
-// atan table size
-constexpr int ATAN_T_BITS = 4;
-constexpr int ATAN_T_SIZE = 1 << ATAN_T_BITS;
-
-// N[Table[ArcTan[x], {x, 1/8, 8/8, 1/8}], 40]
-extern const double ATAN_T[ATAN_T_SIZE];
-extern const double ATAN_K[5];
-
-// The main idea of the function is to use formula
-// atan(u) + atan(v) = atan((u+v)/(1-uv))
-
-// x should be positive, normal finite value
-LIBC_INLINE double atan_eval(double x) {
-  using FPB = fputil::FPBits<double>;
-  using Sign = fputil::Sign;
-  // Added some small value to umin and umax mantissa to avoid possible rounding
-  // errors.
-  FPB::StorageType umin =
-      FPB::create_value(Sign::POS, FPB::EXP_BIAS - ATAN_T_BITS - 1,
-                        0x100000000000UL)
-          .uintval();
-  FPB::StorageType umax =
-      FPB::create_value(Sign::POS, FPB::EXP_BIAS + ATAN_T_BITS,
-                        0xF000000000000UL)
-          .uintval();
-
-  FPB bs(x);
-  auto x_abs = bs.abs().uintval();
-
-  if (x_abs <= umin) {
-    double pe = LIBC_NAMESPACE::fputil::polyeval(
-        x * x, 0.0, ATAN_K[1], ATAN_K[2], ATAN_K[3], ATAN_K[4]);
-    return fputil::multiply_add(pe, x, x);
-  }
-
-  if (x_abs >= umax) {
-    double one_over_x_m = -1.0 / x;
-    double one_over_x2 = one_over_x_m * one_over_x_m;
-    double pe = LIBC_NAMESPACE::fputil::polyeval(
-        one_over_x2, ATAN_K[0], ATAN_K[1], ATAN_K[2], ATAN_K[3]);
-    return fputil::multiply_add(pe, one_over_x_m,
-                                bs.is_neg() ? (-M_MATH_PI_2) : (M_MATH_PI_2));
-  }
+extern double ATAN_COEFFS[17][8];
 
-  double pos_x = FPB(x_abs).get_val();
-  bool one_over_x = pos_x > 1.0;
-  if (one_over_x) {
-    pos_x = 1.0 / pos_x;
-  }
+// For |x| <= 1/32 and 1 <= i <= 16, return Q(x) such that:
+//   Q(x) ~ (atan(x + i/16) - atan(i/16)) / x.
+LIBC_INLINE double atan_eval(double x, int i) {
+  double x2 = x * x;
 
-  double near_x = fputil::nearest_integer(pos_x * ATAN_T_SIZE);
-  int val = static_cast<int>(near_x);
-  near_x *= 1.0 / ATAN_T_SIZE;
+  double c0 = fputil::multiply_add(x, ATAN_COEFFS[i][2], ATAN_COEFFS[i][1]);
+  double c1 = fputil::multiply_add(x, ATAN_COEFFS[i][4], ATAN_COEFFS[i][3]);
+  double c2 = fputil::multiply_add(x, ATAN_COEFFS[i][6], ATAN_COEFFS[i][5]);
 
-  double v = (pos_x - near_x) / fputil::multiply_add(near_x, pos_x, 1.0);
-  double v2 = v * v;
-  double pe = LIBC_NAMESPACE::fputil::polyeval(v2, ATAN_K[0], ATAN_K[1],
-                                               ATAN_K[2], ATAN_K[3], ATAN_K[4]);
-  double result;
-  if (one_over_x)
-    result = M_MATH_PI_2 - fputil::multiply_add(pe, v, ATAN_T[val - 1]);
-  else
-    result = fputil::multiply_add(pe, v, ATAN_T[val - 1]);
-  return bs.is_neg() ? -result : result;
+  double x4 = x2 * x2;
+  double d1 = fputil::multiply_add(x2, c1, c0);
+  double d2 = fputil::multiply_add(x2, ATAN_COEFFS[i][7], c2);
+  double p = fputil::multiply_add(x4, d2, d1);
+  return p;
 }
 
 // > Q = fpminimax(asin(x)/x, [|0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20|],
diff --git a/libc/src/math/generic/math_utils.h b/libc/src/math/generic/math_utils.h
index e884fe2deae284..cced761fc8c822 100644
--- a/libc/src/math/generic/math_utils.h
+++ b/libc/src/math/generic/math_utils.h
@@ -9,13 +9,12 @@
 #ifndef LLVM_LIBC_SRC_MATH_GENERIC_MATH_UTILS_H
 #define LLVM_LIBC_SRC_MATH_GENERIC_MATH_UTILS_H
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/CPP/bit.h"
 #include "src/__support/CPP/type_traits.h"
 #include "src/__support/common.h"
 #include "src/errno/libc_errno.h"
 
-#include <math.h>
-
 #include <stdint.h>
 
 // TODO: evaluate which functions from this file are actually used.
diff --git a/libc/src/math/generic/nanf128.cpp b/libc/src/math/generic/nanf128.cpp
new file mode 100644
index 00000000000000..f087c9f074fde8
--- /dev/null
+++ b/libc/src/math/generic/nanf128.cpp
@@ -0,0 +1,23 @@
+//===-- Implementation of nanf128 function --------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/nanf128.h"
+#include "src/__support/common.h"
+#include "src/__support/str_to_float.h"
+#include "src/errno/libc_errno.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(float128, nanf128, (const char *arg)) {
+  auto result = internal::strtonan<float128>(arg);
+  if (result.has_error())
+    libc_errno = result.error;
+  return result.value;
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/generic/nextdown.cpp b/libc/src/math/generic/nextdown.cpp
new file mode 100644
index 00000000000000..51dee483b70e62
--- /dev/null
+++ b/libc/src/math/generic/nextdown.cpp
@@ -0,0 +1,19 @@
+//===-- Implementation of nextdown function -------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/nextdown.h"
+#include "src/__support/FPUtil/ManipulationFunctions.h"
+#include "src/__support/common.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(double, nextdown, (double x)) {
+  return fputil::nextupdown</*IsDown=*/true>(x);
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/generic/nextdownf.cpp b/libc/src/math/generic/nextdownf.cpp
new file mode 100644
index 00000000000000..857b412d64c3c4
--- /dev/null
+++ b/libc/src/math/generic/nextdownf.cpp
@@ -0,0 +1,19 @@
+//===-- Implementation of nextdownf function ------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/nextdownf.h"
+#include "src/__support/FPUtil/ManipulationFunctions.h"
+#include "src/__support/common.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(float, nextdownf, (float x)) {
+  return fputil::nextupdown</*IsDown=*/true>(x);
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/generic/nextdownf128.cpp b/libc/src/math/generic/nextdownf128.cpp
new file mode 100644
index 00000000000000..2585a13df3a3c2
--- /dev/null
+++ b/libc/src/math/generic/nextdownf128.cpp
@@ -0,0 +1,19 @@
+//===-- Implementation of nextdownf128 function ---------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/nextdownf128.h"
+#include "src/__support/FPUtil/ManipulationFunctions.h"
+#include "src/__support/common.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(float128, nextdownf128, (float128 x)) {
+  return fputil::nextupdown</*IsDown=*/true>(x);
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/generic/nextdownl.cpp b/libc/src/math/generic/nextdownl.cpp
new file mode 100644
index 00000000000000..06a09c9daea595
--- /dev/null
+++ b/libc/src/math/generic/nextdownl.cpp
@@ -0,0 +1,19 @@
+//===-- Implementation of nextdownl function ------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/nextdownl.h"
+#include "src/__support/FPUtil/ManipulationFunctions.h"
+#include "src/__support/common.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(long double, nextdownl, (long double x)) {
+  return fputil::nextupdown</*IsDown=*/true>(x);
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/generic/nextup.cpp b/libc/src/math/generic/nextup.cpp
new file mode 100644
index 00000000000000..d75a336eefcc8a
--- /dev/null
+++ b/libc/src/math/generic/nextup.cpp
@@ -0,0 +1,19 @@
+//===-- Implementation of nextup function ---------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/nextup.h"
+#include "src/__support/FPUtil/ManipulationFunctions.h"
+#include "src/__support/common.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(double, nextup, (double x)) {
+  return fputil::nextupdown</*IsDown=*/false>(x);
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/generic/nextupf.cpp b/libc/src/math/generic/nextupf.cpp
new file mode 100644
index 00000000000000..3b18dae4488d4d
--- /dev/null
+++ b/libc/src/math/generic/nextupf.cpp
@@ -0,0 +1,19 @@
+//===-- Implementation of nextupf function --------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/nextupf.h"
+#include "src/__support/FPUtil/ManipulationFunctions.h"
+#include "src/__support/common.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(float, nextupf, (float x)) {
+  return fputil::nextupdown</*IsDown=*/false>(x);
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/generic/nextupf128.cpp b/libc/src/math/generic/nextupf128.cpp
new file mode 100644
index 00000000000000..7d862c30976cb4
--- /dev/null
+++ b/libc/src/math/generic/nextupf128.cpp
@@ -0,0 +1,19 @@
+//===-- Implementation of nextupf128 function -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/nextupf128.h"
+#include "src/__support/FPUtil/ManipulationFunctions.h"
+#include "src/__support/common.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(float128, nextupf128, (float128 x)) {
+  return fputil::nextupdown</*IsDown=*/false>(x);
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/generic/nextupl.cpp b/libc/src/math/generic/nextupl.cpp
new file mode 100644
index 00000000000000..ccc52445eec5c1
--- /dev/null
+++ b/libc/src/math/generic/nextupl.cpp
@@ -0,0 +1,19 @@
+//===-- Implementation of nextupl function --------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/nextupl.h"
+#include "src/__support/FPUtil/ManipulationFunctions.h"
+#include "src/__support/common.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(long double, nextupl, (long double x)) {
+  return fputil::nextupdown</*IsDown=*/false>(x);
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/nanf128.h b/libc/src/math/nanf128.h
new file mode 100644
index 00000000000000..b06d14e2f945e1
--- /dev/null
+++ b/libc/src/math/nanf128.h
@@ -0,0 +1,20 @@
+//===-- Implementation header for nanf128 -----------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_NANF128_H
+#define LLVM_LIBC_SRC_MATH_NANF128_H
+
+#include "src/__support/macros/properties/types.h"
+
+namespace LIBC_NAMESPACE {
+
+float128 nanf128(const char *arg);
+
+} // namespace LIBC_NAMESPACE
+
+#endif // LLVM_LIBC_SRC_MATH_NANF128_H
diff --git a/libc/src/math/nextdown.h b/libc/src/math/nextdown.h
new file mode 100644
index 00000000000000..8049b170ee7211
--- /dev/null
+++ b/libc/src/math/nextdown.h
@@ -0,0 +1,18 @@
+//===-- Implementation header for nextdown ----------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_NEXTDOWN_H
+#define LLVM_LIBC_SRC_MATH_NEXTDOWN_H
+
+namespace LIBC_NAMESPACE {
+
+double nextdown(double x);
+
+} // namespace LIBC_NAMESPACE
+
+#endif // LLVM_LIBC_SRC_MATH_NEXTDOWN_H
diff --git a/libc/src/math/nextdownf.h b/libc/src/math/nextdownf.h
new file mode 100644
index 00000000000000..0a2f23480574dc
--- /dev/null
+++ b/libc/src/math/nextdownf.h
@@ -0,0 +1,18 @@
+//===-- Implementation header for nextdownf ---------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_NEXTDOWNF_H
+#define LLVM_LIBC_SRC_MATH_NEXTDOWNF_H
+
+namespace LIBC_NAMESPACE {
+
+float nextdownf(float x);
+
+} // namespace LIBC_NAMESPACE
+
+#endif // LLVM_LIBC_SRC_MATH_NEXTDOWNF_H
diff --git a/libc/src/math/nextdownf128.h b/libc/src/math/nextdownf128.h
new file mode 100644
index 00000000000000..0a3043bb431d82
--- /dev/null
+++ b/libc/src/math/nextdownf128.h
@@ -0,0 +1,20 @@
+//===-- Implementation header for nextdownf128 ------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_NEXTDOWNF128_H
+#define LLVM_LIBC_SRC_MATH_NEXTDOWNF128_H
+
+#include "src/__support/macros/properties/types.h"
+
+namespace LIBC_NAMESPACE {
+
+float128 nextdownf128(float128 x);
+
+} // namespace LIBC_NAMESPACE
+
+#endif // LLVM_LIBC_SRC_MATH_NEXTDOWNF128_H
diff --git a/libc/src/math/nextdownl.h b/libc/src/math/nextdownl.h
new file mode 100644
index 00000000000000..9cb274a89b1c53
--- /dev/null
+++ b/libc/src/math/nextdownl.h
@@ -0,0 +1,18 @@
+//===-- Implementation header for nextdownl ---------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_NEXTDOWNL_H
+#define LLVM_LIBC_SRC_MATH_NEXTDOWNL_H
+
+namespace LIBC_NAMESPACE {
+
+long double nextdownl(long double x);
+
+} // namespace LIBC_NAMESPACE
+
+#endif // LLVM_LIBC_SRC_MATH_NEXTDOWNL_H
diff --git a/libc/src/math/nextup.h b/libc/src/math/nextup.h
new file mode 100644
index 00000000000000..97ae82270b06e4
--- /dev/null
+++ b/libc/src/math/nextup.h
@@ -0,0 +1,18 @@
+//===-- Implementation header for nextup ------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_NEXTUP_H
+#define LLVM_LIBC_SRC_MATH_NEXTUP_H
+
+namespace LIBC_NAMESPACE {
+
+double nextup(double x);
+
+} // namespace LIBC_NAMESPACE
+
+#endif // LLVM_LIBC_SRC_MATH_NEXTUP_H
diff --git a/libc/src/math/nextupf.h b/libc/src/math/nextupf.h
new file mode 100644
index 00000000000000..ffc0fa168a1086
--- /dev/null
+++ b/libc/src/math/nextupf.h
@@ -0,0 +1,18 @@
+//===-- Implementation header for nextupf -----------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_NEXTUPF_H
+#define LLVM_LIBC_SRC_MATH_NEXTUPF_H
+
+namespace LIBC_NAMESPACE {
+
+float nextupf(float x);
+
+} // namespace LIBC_NAMESPACE
+
+#endif // LLVM_LIBC_SRC_MATH_NEXTUPF_H
diff --git a/libc/src/math/nextupf128.h b/libc/src/math/nextupf128.h
new file mode 100644
index 00000000000000..b4429922e4beb7
--- /dev/null
+++ b/libc/src/math/nextupf128.h
@@ -0,0 +1,20 @@
+//===-- Implementation header for nextupf128 --------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_NEXTUPF128_H
+#define LLVM_LIBC_SRC_MATH_NEXTUPF128_H
+
+#include "src/__support/macros/properties/types.h"
+
+namespace LIBC_NAMESPACE {
+
+float128 nextupf128(float128 x);
+
+} // namespace LIBC_NAMESPACE
+
+#endif // LLVM_LIBC_SRC_MATH_NEXTUPF128_H
diff --git a/libc/src/math/nextupl.h b/libc/src/math/nextupl.h
new file mode 100644
index 00000000000000..cbc6a168e4bea2
--- /dev/null
+++ b/libc/src/math/nextupl.h
@@ -0,0 +1,18 @@
+//===-- Implementation header for nextupl -----------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_NEXTUPL_H
+#define LLVM_LIBC_SRC_MATH_NEXTUPL_H
+
+namespace LIBC_NAMESPACE {
+
+long double nextupl(long double x);
+
+} // namespace LIBC_NAMESPACE
+
+#endif // LLVM_LIBC_SRC_MATH_NEXTUPL_H
diff --git a/libc/src/stdio/CMakeLists.txt b/libc/src/stdio/CMakeLists.txt
index bb8e41606c5dfb..ece93fd56ef0c5 100644
--- a/libc/src/stdio/CMakeLists.txt
+++ b/libc/src/stdio/CMakeLists.txt
@@ -236,6 +236,16 @@ add_entrypoint_object(
     libc.src.stdio.printf_core.vfprintf_internal
 )
 
+add_stdio_entrypoint_object(
+  fileno
+  SRCS
+    fileno.cpp
+  HDRS
+    fileno.h
+  DEPENDS
+    libc.src.stdio.fileno
+)
+
 add_subdirectory(printf_core)
 add_subdirectory(scanf_core)
 
diff --git a/libc/src/stdio/baremetal/CMakeLists.txt b/libc/src/stdio/baremetal/CMakeLists.txt
new file mode 100644
index 00000000000000..65089274050bdd
--- /dev/null
+++ b/libc/src/stdio/baremetal/CMakeLists.txt
@@ -0,0 +1,9 @@
+add_entrypoint_object(
+  remove
+  SRCS
+    remove.cpp
+  HDRS
+    ../remove.h
+  DEPENDS
+    libc.include.stdio
+)
diff --git a/libc/src/stdio/baremetal/remove.cpp b/libc/src/stdio/baremetal/remove.cpp
new file mode 100644
index 00000000000000..f4f2cdca7c69b5
--- /dev/null
+++ b/libc/src/stdio/baremetal/remove.cpp
@@ -0,0 +1,19 @@
+//===-- Linux implementation of remove ------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/stdio/remove.h"
+
+#include "src/__support/common.h"
+
+namespace LIBC_NAMESPACE {
+
+// TODO: This is a temporary workaround for issue #85335.
+
+LLVM_LIBC_FUNCTION(int, remove, (const char *)) { return -1; }
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/stdio/fileno.h b/libc/src/stdio/fileno.h
new file mode 100644
index 00000000000000..d41f112226c512
--- /dev/null
+++ b/libc/src/stdio/fileno.h
@@ -0,0 +1,21 @@
+//===-- Implementation header of fileno --------------------------*- C++
+//-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_STDIO_FILENO_H
+#define LLVM_LIBC_SRC_STDIO_FILENO_H
+
+#include "include/llvm-libc-types/FILE.h"
+
+namespace LIBC_NAMESPACE {
+
+int fileno(::FILE *f);
+
+} // namespace LIBC_NAMESPACE
+
+#endif // LLVM_LIBC_SRC_STDIO_FILENO_H
diff --git a/libc/src/stdio/generic/CMakeLists.txt b/libc/src/stdio/generic/CMakeLists.txt
index 4e4a709e94061b..0aa213caba7b8a 100644
--- a/libc/src/stdio/generic/CMakeLists.txt
+++ b/libc/src/stdio/generic/CMakeLists.txt
@@ -70,6 +70,18 @@ add_entrypoint_object(
     libc.src.__support.File.platform_file
 )
 
+add_entrypoint_object(
+  fileno
+  SRCS
+    fileno.cpp
+  HDRS
+    ../fileno.h
+  DEPENDS
+    libc.include.stdio
+    libc.src.__support.File.file
+    libc.src.__support.File.platform_file
+)
+
 add_entrypoint_object(
   fflush
   SRCS
diff --git a/libc/src/stdio/generic/fileno.cpp b/libc/src/stdio/generic/fileno.cpp
new file mode 100644
index 00000000000000..663ba926637622
--- /dev/null
+++ b/libc/src/stdio/generic/fileno.cpp
@@ -0,0 +1,21 @@
+//===-- Implementation of fileno
+//-------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/stdio/fileno.h"
+
+#include "include/llvm-libc-types/FILE.h"
+#include "src/__support/File/file.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(int, fileno, (::FILE * stream)) {
+  return get_fileno(reinterpret_cast<LIBC_NAMESPACE::File *>(stream));
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/stdio/linux/CMakeLists.txt b/libc/src/stdio/linux/CMakeLists.txt
index 3b5a9960dc125e..774f24b2db0bdc 100644
--- a/libc/src/stdio/linux/CMakeLists.txt
+++ b/libc/src/stdio/linux/CMakeLists.txt
@@ -6,6 +6,7 @@ add_entrypoint_object(
     ../remove.h
   DEPENDS
     libc.include.fcntl
+    libc.include.stdio
     libc.include.unistd
     libc.include.sys_syscall
     libc.src.__support.OSUtil.osutil
diff --git a/libc/src/stdio/printf_core/string_converter.h b/libc/src/stdio/printf_core/string_converter.h
index 04dc5a06da2226..9e05591079faa4 100644
--- a/libc/src/stdio/printf_core/string_converter.h
+++ b/libc/src/stdio/printf_core/string_converter.h
@@ -25,7 +25,7 @@ LIBC_INLINE int convert_string(Writer *writer, const FormatSection &to_conv) {
 
 #ifndef LIBC_COPT_PRINTF_NO_NULLPTR_CHECKS
   if (str_ptr == nullptr) {
-    str_ptr = "null";
+    str_ptr = "(null)";
   }
 #endif // LIBC_COPT_PRINTF_NO_NULLPTR_CHECKS
 
diff --git a/libc/src/stdlib/atexit.cpp b/libc/src/stdlib/atexit.cpp
index 1513b7969f0dbc..741ea4f25103ac 100644
--- a/libc/src/stdlib/atexit.cpp
+++ b/libc/src/stdlib/atexit.cpp
@@ -36,7 +36,7 @@ struct AtExitUnit {
 //        mutexes simply passthrough. We will need a lock free stack.
 using ExitCallbackList = FixedVector<AtExitUnit, 64>;
 #elif defined(LIBC_COPT_PUBLIC_PACKAGING)
-using ExitCallbackList = cpp::ReverseOrderBlockStore<AtExitUnit, 32>;
+using ExitCallbackList = ReverseOrderBlockStore<AtExitUnit, 32>;
 #else
 // BlockStore uses dynamic memory allocation. To avoid dynamic memory
 // allocation in tests, we use a fixed size callback list when built for
diff --git a/libc/src/sys/mman/CMakeLists.txt b/libc/src/sys/mman/CMakeLists.txt
index b49f73873c2006..9c74202a09f035 100644
--- a/libc/src/sys/mman/CMakeLists.txt
+++ b/libc/src/sys/mman/CMakeLists.txt
@@ -85,3 +85,17 @@ add_entrypoint_object(
   DEPENDS
     .${LIBC_TARGET_OS}.msync
 )
+
+add_entrypoint_object(
+  shm_open
+  ALIAS
+  DEPENDS
+    .${LIBC_TARGET_OS}.shm_open
+)
+
+add_entrypoint_object(
+  shm_unlink
+  ALIAS
+  DEPENDS
+    .${LIBC_TARGET_OS}.shm_unlink
+)
diff --git a/libc/src/sys/mman/linux/CMakeLists.txt b/libc/src/sys/mman/linux/CMakeLists.txt
index 04086ee5d33254..00f4f0e64ec06b 100644
--- a/libc/src/sys/mman/linux/CMakeLists.txt
+++ b/libc/src/sys/mman/linux/CMakeLists.txt
@@ -152,3 +152,40 @@ add_entrypoint_object(
     libc.src.__support.OSUtil.osutil
     libc.src.errno.errno
 )
+
+add_header_library(
+  shm_common
+  HDRS
+    shm_common.h
+  DEPENDS
+    libc.src.__support.CPP.array
+    libc.src.__support.CPP.string_view
+    libc.src.__support.CPP.optional
+    libc.src.__support.common
+    libc.src.errno.errno
+    libc.src.string.memory_utils.inline_memcpy
+)
+
+add_entrypoint_object(
+  shm_open
+  SRCS
+    shm_open.cpp
+  HDRS
+    ../shm_open.h
+  DEPENDS
+    libc.src.fcntl.open
+    libc.include.llvm-libc-macros.fcntl_macros
+    libc.include.llvm-libc-types.mode_t
+    .shm_common
+)
+
+add_entrypoint_object(
+  shm_unlink
+  SRCS
+    shm_unlink.cpp
+  HDRS
+    ../shm_unlink.h
+  DEPENDS
+    libc.src.unistd.unlink
+    .shm_common
+)
diff --git a/libc/src/sys/mman/linux/shm_common.h b/libc/src/sys/mman/linux/shm_common.h
new file mode 100644
index 00000000000000..6f2a3fdc71b647
--- /dev/null
+++ b/libc/src/sys/mman/linux/shm_common.h
@@ -0,0 +1,53 @@
+//===---------- Shared implementations for shm_open/shm_unlink ------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/CPP/array.h"
+#include "src/__support/CPP/optional.h"
+#include "src/__support/CPP/string_view.h"
+#include "src/errno/libc_errno.h"
+#include "src/string/memory_utils/inline_memcpy.h"
+
+// TODO: Get PATH_MAX via https://github.com/llvm/llvm-project/issues/85121
+#include <linux/limits.h>
+
+namespace LIBC_NAMESPACE {
+
+namespace shm_common {
+
+LIBC_INLINE_VAR constexpr cpp::string_view SHM_PREFIX = "/dev/shm/";
+using SHMPath = cpp::array<char, NAME_MAX + SHM_PREFIX.size() + 1>;
+
+LIBC_INLINE cpp::optional<SHMPath> translate_name(cpp::string_view name) {
+  // trim leading slashes
+  size_t offset = name.find_first_not_of('/');
+  if (offset == cpp::string_view::npos) {
+    libc_errno = EINVAL;
+    return cpp::nullopt;
+  }
+  name = name.substr(offset);
+
+  // check the name
+  if (name.size() > NAME_MAX) {
+    libc_errno = ENAMETOOLONG;
+    return cpp::nullopt;
+  }
+  if (name == "." || name == ".." || name.contains('/')) {
+    libc_errno = EINVAL;
+    return cpp::nullopt;
+  }
+
+  // prepend the prefix
+  SHMPath buffer;
+  inline_memcpy(buffer.data(), SHM_PREFIX.data(), SHM_PREFIX.size());
+  inline_memcpy(buffer.data() + SHM_PREFIX.size(), name.data(), name.size());
+  buffer[SHM_PREFIX.size() + name.size()] = '\0';
+  return buffer;
+}
+} // namespace shm_common
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/sys/mman/linux/shm_open.cpp b/libc/src/sys/mman/linux/shm_open.cpp
new file mode 100644
index 00000000000000..0d39b8b4e53dbf
--- /dev/null
+++ b/libc/src/sys/mman/linux/shm_open.cpp
@@ -0,0 +1,25 @@
+//===---------- Linux implementation of the shm_open function -------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/sys/mman/shm_open.h"
+#include "llvm-libc-macros/fcntl-macros.h"
+#include "src/fcntl/open.h"
+#include "src/sys/mman/linux/shm_common.h"
+
+namespace LIBC_NAMESPACE {
+
+static constexpr int DEFAULT_OFLAGS = O_NOFOLLOW | O_CLOEXEC | O_NONBLOCK;
+
+LLVM_LIBC_FUNCTION(int, shm_open, (const char *name, int oflags, mode_t mode)) {
+  using namespace shm_common;
+  if (cpp::optional<SHMPath> buffer = translate_name(name))
+    return open(buffer->data(), oflags | DEFAULT_OFLAGS, mode);
+  return -1;
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/sys/mman/linux/shm_unlink.cpp b/libc/src/sys/mman/linux/shm_unlink.cpp
new file mode 100644
index 00000000000000..32f48d3e3e7187
--- /dev/null
+++ b/libc/src/sys/mman/linux/shm_unlink.cpp
@@ -0,0 +1,22 @@
+//===---------- Linux implementation of the shm_unlink function -----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/sys/mman/shm_unlink.h"
+#include "src/sys/mman/linux/shm_common.h"
+#include "src/unistd/unlink.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(int, shm_unlink, (const char *name)) {
+  using namespace shm_common;
+  if (cpp::optional<SHMPath> buffer = translate_name(name))
+    return unlink(buffer->data());
+  return -1;
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/sys/mman/shm_open.h b/libc/src/sys/mman/shm_open.h
new file mode 100644
index 00000000000000..91796d7b5c0501
--- /dev/null
+++ b/libc/src/sys/mman/shm_open.h
@@ -0,0 +1,20 @@
+//===-- Implementation header for shm_open function -------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_SYS_MMAN_SHM_OPEN_H
+#define LLVM_LIBC_SRC_SYS_MMAN_SHM_OPEN_H
+
+#include <llvm-libc-types/mode_t.h>
+
+namespace LIBC_NAMESPACE {
+
+int shm_open(const char *name, int oflag, mode_t mode);
+
+} // namespace LIBC_NAMESPACE
+
+#endif // LLVM_LIBC_SRC_SYS_MMAN_SHM_OPEN_H
diff --git a/libc/src/sys/mman/shm_unlink.h b/libc/src/sys/mman/shm_unlink.h
new file mode 100644
index 00000000000000..c38c06adbaa292
--- /dev/null
+++ b/libc/src/sys/mman/shm_unlink.h
@@ -0,0 +1,18 @@
+//===-- Implementation header for shm_unlink function ------------*- C++-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_SYS_MMAN_SHM_UNLINK_H
+#define LLVM_LIBC_SRC_SYS_MMAN_SHM_UNLINK_H
+
+namespace LIBC_NAMESPACE {
+
+int shm_unlink(const char *name);
+
+} // namespace LIBC_NAMESPACE
+
+#endif // LLVM_LIBC_SRC_SYS_MMAN_SHM_UNLINK_H
diff --git a/libc/test/UnitTest/FPMatcher.h b/libc/test/UnitTest/FPMatcher.h
index 4525b9e830195e..43000efa09a39c 100644
--- a/libc/test/UnitTest/FPMatcher.h
+++ b/libc/test/UnitTest/FPMatcher.h
@@ -17,7 +17,7 @@
 #include "test/UnitTest/StringUtils.h"
 #include "test/UnitTest/Test.h"
 
-#include <math.h>
+#include "include/llvm-libc-macros/math-macros.h"
 
 namespace LIBC_NAMESPACE {
 namespace testing {
@@ -102,8 +102,10 @@ template <typename T> struct FPTest : public Test {
   const T inf = FPBits::inf(Sign::POS).get_val();                              \
   const T neg_inf = FPBits::inf(Sign::NEG).get_val();                          \
   const T min_normal = FPBits::min_normal().get_val();                         \
-  const T max_normal = FPBits::max_normal().get_val();                         \
-  const T min_denormal = FPBits::min_subnormal().get_val();                    \
+  const T max_normal = FPBits::max_normal(Sign::POS).get_val();                \
+  const T neg_max_normal = FPBits::max_normal(Sign::NEG).get_val();            \
+  const T min_denormal = FPBits::min_subnormal(Sign::POS).get_val();           \
+  const T neg_min_denormal = FPBits::min_subnormal(Sign::NEG).get_val();       \
   const T max_denormal = FPBits::max_subnormal().get_val();
 
 #define EXPECT_FP_EQ(expected, actual)                                         \
diff --git a/libc/test/src/__support/CPP/CMakeLists.txt b/libc/test/src/__support/CPP/CMakeLists.txt
index f94429e03b3cbc..74aa0c705ec467 100644
--- a/libc/test/src/__support/CPP/CMakeLists.txt
+++ b/libc/test/src/__support/CPP/CMakeLists.txt
@@ -1,5 +1,15 @@
 add_custom_target(libc-cpp-utils-tests)
 
+add_libc_test(
+  array_test
+  SUITE
+    libc-cpp-utils-tests
+  SRCS
+    array_test.cpp
+  DEPENDS
+    libc.src.__support.CPP.array
+  )
+
 add_libc_test(
   bit_test
   SUITE
diff --git a/libc/test/src/__support/CPP/array_test.cpp b/libc/test/src/__support/CPP/array_test.cpp
new file mode 100644
index 00000000000000..f2d7bff636e42c
--- /dev/null
+++ b/libc/test/src/__support/CPP/array_test.cpp
@@ -0,0 +1,66 @@
+//===-- Unittests for Array -----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/CPP/array.h"
+#include "test/UnitTest/Test.h"
+
+using LIBC_NAMESPACE::cpp::array;
+
+TEST(LlvmLibcArrayTest, Basic) {
+  array<int, 3> a = {0, 1, 2};
+
+  ASSERT_EQ(a.data(), &a.front());
+  ASSERT_EQ(a.front(), 0);
+  ASSERT_EQ(a.back(), 2);
+  ASSERT_EQ(a.size(), size_t{3});
+  ASSERT_EQ(a[1], 1);
+  ASSERT_FALSE(a.empty());
+  ASSERT_NE(a.begin(), a.end());
+  ASSERT_EQ(*a.begin(), a.front());
+
+  auto it = a.rbegin();
+  ASSERT_EQ(*it, 2);
+  ASSERT_EQ(*(++it), 1);
+  ASSERT_EQ(*(++it), 0);
+
+  for (int &x : a)
+    ASSERT_GE(x, 0);
+}
+
+// Test const_iterator and const variant methods.
+TEST(LlvmLibcArrayTest, Const) {
+  const array<int, 3> z = {3, 4, 5};
+
+  ASSERT_EQ(3, z.front());
+  ASSERT_EQ(4, z[1]);
+  ASSERT_EQ(5, z.back());
+  ASSERT_EQ(3, *z.data());
+
+  // begin, cbegin, end, cend
+  array<int, 3>::const_iterator it2 = z.begin();
+  ASSERT_EQ(*it2, z.front());
+  it2 = z.cbegin();
+  ASSERT_EQ(*it2, z.front());
+  it2 = z.end();
+  ASSERT_NE(it2, z.begin());
+  it2 = z.cend();
+  ASSERT_NE(it2, z.begin());
+
+  // rbegin, crbegin, rend, crend
+  array<int, 3>::const_reverse_iterator it = z.rbegin();
+  ASSERT_EQ(*it, z.back());
+  it = z.crbegin();
+  ASSERT_EQ(*it, z.back());
+  it = z.rend();
+  ASSERT_EQ(*--it, z.front());
+  it = z.crend();
+  ASSERT_EQ(*--it, z.front());
+
+  for (const int &x : z)
+    ASSERT_GE(x, 0);
+}
diff --git a/libc/test/src/__support/CPP/stringview_test.cpp b/libc/test/src/__support/CPP/stringview_test.cpp
index 33eb8ab4765728..6b68f2a1c47a9d 100644
--- a/libc/test/src/__support/CPP/stringview_test.cpp
+++ b/libc/test/src/__support/CPP/stringview_test.cpp
@@ -174,3 +174,111 @@ TEST(LlvmLibcStringViewTest, FindLastOf) {
   ASSERT_EQ(Empty1.find_last_of('a', 0), string_view::npos);
   ASSERT_EQ(Empty1.find_last_of('a', 123), string_view::npos);
 }
+
+TEST(LlvmLibcStringViewTest, FindFirstNotOf) {
+  string_view Tmp("abada");
+
+  EXPECT_EQ(Tmp.find_first_not_of('a'), size_t(1));
+  EXPECT_EQ(Tmp.find_first_not_of('a', 123), string_view::npos);
+  EXPECT_EQ(Tmp.find_first_not_of('a', 5), string_view::npos);
+  EXPECT_EQ(Tmp.find_first_not_of('a', 4), string_view::npos);
+  EXPECT_EQ(Tmp.find_first_not_of('a', 3), size_t(3));
+  EXPECT_EQ(Tmp.find_first_not_of('a', 2), size_t(3));
+  EXPECT_EQ(Tmp.find_first_not_of('a', 1), size_t(1));
+  EXPECT_EQ(Tmp.find_first_not_of('a', 0), size_t(1));
+
+  EXPECT_EQ(Tmp.find_first_not_of('b'), size_t(0));
+  EXPECT_EQ(Tmp.find_first_not_of('b', 123), string_view::npos);
+  EXPECT_EQ(Tmp.find_first_not_of('b', 5), string_view::npos);
+  EXPECT_EQ(Tmp.find_first_not_of('b', 4), size_t(4));
+  EXPECT_EQ(Tmp.find_first_not_of('b', 3), size_t(3));
+  EXPECT_EQ(Tmp.find_first_not_of('b', 2), size_t(2));
+  EXPECT_EQ(Tmp.find_first_not_of('b', 1), size_t(2));
+  EXPECT_EQ(Tmp.find_first_not_of('b', 0), size_t(0));
+
+  EXPECT_EQ(Tmp.find_first_not_of('d'), size_t(0));
+  EXPECT_EQ(Tmp.find_first_not_of('d', 123), string_view::npos);
+  EXPECT_EQ(Tmp.find_first_not_of('d', 5), string_view::npos);
+  EXPECT_EQ(Tmp.find_first_not_of('d', 4), size_t(4));
+  EXPECT_EQ(Tmp.find_first_not_of('d', 3), size_t(4));
+  EXPECT_EQ(Tmp.find_first_not_of('d', 2), size_t(2));
+  EXPECT_EQ(Tmp.find_first_not_of('d', 1), size_t(1));
+  EXPECT_EQ(Tmp.find_first_not_of('d', 0), size_t(0));
+
+  EXPECT_EQ(Tmp.find_first_not_of('e'), size_t(0));
+  EXPECT_EQ(Tmp.find_first_not_of('e', 123), string_view::npos);
+  EXPECT_EQ(Tmp.find_first_not_of('e', 5), string_view::npos);
+  EXPECT_EQ(Tmp.find_first_not_of('e', 4), size_t(4));
+  EXPECT_EQ(Tmp.find_first_not_of('e', 3), size_t(3));
+  EXPECT_EQ(Tmp.find_first_not_of('e', 2), size_t(2));
+  EXPECT_EQ(Tmp.find_first_not_of('e', 1), size_t(1));
+  EXPECT_EQ(Tmp.find_first_not_of('e', 0), size_t(0));
+
+  string_view Empty;
+  EXPECT_EQ(Empty.find_first_not_of('a'), string_view::npos);
+  EXPECT_EQ(Empty.find_first_not_of('a', 0), string_view::npos);
+  EXPECT_EQ(Empty.find_first_not_of('a', 123), string_view::npos);
+
+  string_view Empty1("");
+  EXPECT_EQ(Empty1.find_first_not_of('a'), string_view::npos);
+  EXPECT_EQ(Empty1.find_first_not_of('a', 0), string_view::npos);
+  EXPECT_EQ(Empty1.find_first_not_of('a', 123), string_view::npos);
+
+  string_view Full("aaaaaaa");
+  EXPECT_EQ(Full.find_first_not_of('a'), string_view::npos);
+  EXPECT_EQ(Full.find_first_not_of('a', 0), string_view::npos);
+  EXPECT_EQ(Full.find_first_not_of('a', 123), string_view::npos);
+
+  EXPECT_EQ(Full.find_first_not_of('b'), size_t(0));
+  EXPECT_EQ(Full.find_first_not_of('b', 0), size_t(0));
+  EXPECT_EQ(Full.find_first_not_of('b', 123), string_view::npos);
+}
+
+TEST(LlvmLibcStringViewTest, Contains) {
+  string_view Empty;
+  for (char c = 'a'; c < 'z'; ++c)
+    EXPECT_FALSE(Empty.contains(c));
+
+  string_view Tmp("abada");
+  EXPECT_TRUE(Tmp.contains('a'));
+  EXPECT_TRUE(Tmp.contains('b'));
+  EXPECT_FALSE(Tmp.contains('c'));
+  EXPECT_TRUE(Tmp.contains('d'));
+  EXPECT_FALSE(Tmp.contains('e'));
+
+  EXPECT_TRUE(Tmp.substr(1).contains('a'));
+  EXPECT_TRUE(Tmp.substr(1).contains('b'));
+  EXPECT_FALSE(Tmp.substr(1).contains('c'));
+  EXPECT_TRUE(Tmp.substr(1).contains('d'));
+  EXPECT_FALSE(Tmp.substr(1).contains('e'));
+
+  EXPECT_TRUE(Tmp.substr(2).contains('a'));
+  EXPECT_FALSE(Tmp.substr(2).contains('b'));
+  EXPECT_FALSE(Tmp.substr(2).contains('c'));
+  EXPECT_TRUE(Tmp.substr(2).contains('d'));
+  EXPECT_FALSE(Tmp.substr(2).contains('e'));
+
+  EXPECT_TRUE(Tmp.substr(3).contains('a'));
+  EXPECT_FALSE(Tmp.substr(3).contains('b'));
+  EXPECT_FALSE(Tmp.substr(3).contains('c'));
+  EXPECT_TRUE(Tmp.substr(3).contains('d'));
+  EXPECT_FALSE(Tmp.substr(3).contains('e'));
+
+  EXPECT_TRUE(Tmp.substr(4).contains('a'));
+  EXPECT_FALSE(Tmp.substr(4).contains('b'));
+  EXPECT_FALSE(Tmp.substr(4).contains('c'));
+  EXPECT_FALSE(Tmp.substr(4).contains('d'));
+  EXPECT_FALSE(Tmp.substr(4).contains('e'));
+
+  EXPECT_FALSE(Tmp.substr(5).contains('a'));
+  EXPECT_FALSE(Tmp.substr(5).contains('b'));
+  EXPECT_FALSE(Tmp.substr(5).contains('c'));
+  EXPECT_FALSE(Tmp.substr(5).contains('d'));
+  EXPECT_FALSE(Tmp.substr(5).contains('e'));
+
+  EXPECT_FALSE(Tmp.substr(6).contains('a'));
+  EXPECT_FALSE(Tmp.substr(6).contains('b'));
+  EXPECT_FALSE(Tmp.substr(6).contains('c'));
+  EXPECT_FALSE(Tmp.substr(6).contains('d'));
+  EXPECT_FALSE(Tmp.substr(6).contains('e'));
+}
diff --git a/libc/test/src/__support/blockstore_test.cpp b/libc/test/src/__support/blockstore_test.cpp
index f62857275fe4c9..5fe8fef1b6edcc 100644
--- a/libc/test/src/__support/blockstore_test.cpp
+++ b/libc/test/src/__support/blockstore_test.cpp
@@ -19,7 +19,7 @@ class LlvmLibcBlockStoreTest : public LIBC_NAMESPACE::testing::Test {
 public:
   template <size_t BLOCK_SIZE, size_t ELEMENT_COUNT, bool REVERSE>
   void populate_and_iterate() {
-    LIBC_NAMESPACE::cpp::BlockStore<Element, BLOCK_SIZE, REVERSE> block_store;
+    LIBC_NAMESPACE::BlockStore<Element, BLOCK_SIZE, REVERSE> block_store;
     for (int i = 0; i < int(ELEMENT_COUNT); ++i)
       ASSERT_TRUE(block_store.push_back({i, 2 * i, 3 * unsigned(i)}));
     auto end = block_store.end();
@@ -38,12 +38,12 @@ class LlvmLibcBlockStoreTest : public LIBC_NAMESPACE::testing::Test {
       }
     }
     ASSERT_EQ(i, int(ELEMENT_COUNT));
-    LIBC_NAMESPACE::cpp::BlockStore<Element, BLOCK_SIZE, REVERSE>::destroy(
+    LIBC_NAMESPACE::BlockStore<Element, BLOCK_SIZE, REVERSE>::destroy(
         &block_store);
   }
 
   template <bool REVERSE> void back_test() {
-    using LIBC_NAMESPACE::cpp::BlockStore;
+    using LIBC_NAMESPACE::BlockStore;
     BlockStore<int, 4, REVERSE> block_store;
     for (int i = 0; i < 20; i++)
       ASSERT_TRUE(block_store.push_back(i));
@@ -53,7 +53,7 @@ class LlvmLibcBlockStoreTest : public LIBC_NAMESPACE::testing::Test {
   }
 
   template <bool REVERSE> void empty_test() {
-    using LIBC_NAMESPACE::cpp::BlockStore;
+    using LIBC_NAMESPACE::BlockStore;
     BlockStore<int, 2, REVERSE> block_store;
 
     ASSERT_TRUE(block_store.empty());
diff --git a/libc/test/src/__support/uint_test.cpp b/libc/test/src/__support/uint_test.cpp
index 851656e9fcbbb0..5764324ca28815 100644
--- a/libc/test/src/__support/uint_test.cpp
+++ b/libc/test/src/__support/uint_test.cpp
@@ -10,8 +10,8 @@
 #include "src/__support/UInt.h"
 #include "src/__support/macros/properties/types.h" // LIBC_TYPES_HAS_INT128
 
+#include "include/llvm-libc-macros/math-macros.h" // HUGE_VALF, HUGE_VALF
 #include "test/UnitTest/Test.h"
-#include <math.h> // HUGE_VALF, HUGE_VALF
 
 namespace LIBC_NAMESPACE {
 
diff --git a/libc/test/src/math/CMakeLists.txt b/libc/test/src/math/CMakeLists.txt
index 7b952901da76e9..026bcd12928bdf 100644
--- a/libc/test/src/math/CMakeLists.txt
+++ b/libc/test/src/math/CMakeLists.txt
@@ -1518,6 +1518,8 @@ add_fp_unittest(
   DEPENDS
     libc.include.math
     libc.src.math.generic.explogxf
+    libc.src.math.fabs
+    libc.src.math.fabsf
     libc.src.__support.FPUtil.fp_bits
 )
 
@@ -1643,20 +1645,6 @@ add_fp_unittest(
     libc.src.__support.FPUtil.fp_bits
 )
 
-add_fp_unittest(
-  inv_trigf_utils_test
-  NEED_MPFR
-  SUITE
-    libc-math-unittests
-  HDRS
-    in_float_range_test_helper.h
-  SRCS
-    inv_trigf_utils_test.cpp
-  DEPENDS
-    libc.src.math.generic.inv_trigf_utils
-    libc.src.__support.FPUtil.fp_bits
-)
-
 add_fp_unittest(
   scalbn_test
   NEED_MPFR
diff --git a/libc/test/src/math/CeilTest.h b/libc/test/src/math/CeilTest.h
index 5ea4f349d008b3..74cc90614dfc2a 100644
--- a/libc/test/src/math/CeilTest.h
+++ b/libc/test/src/math/CeilTest.h
@@ -10,7 +10,7 @@
 #include "test/UnitTest/Test.h"
 #include "utils/MPFRWrapper/MPFRUtils.h"
 
-#include <math.h>
+#include "include/llvm-libc-macros/math-macros.h"
 
 namespace mpfr = LIBC_NAMESPACE::testing::mpfr;
 
diff --git a/libc/test/src/math/CopySignTest.h b/libc/test/src/math/CopySignTest.h
index 8b81e8d7de2523..206626d66f5806 100644
--- a/libc/test/src/math/CopySignTest.h
+++ b/libc/test/src/math/CopySignTest.h
@@ -10,7 +10,7 @@
 #include "test/UnitTest/Test.h"
 #include "utils/MPFRWrapper/MPFRUtils.h"
 
-#include <math.h>
+#include "include/llvm-libc-macros/math-macros.h"
 
 namespace mpfr = LIBC_NAMESPACE::testing::mpfr;
 
diff --git a/libc/test/src/math/FAbsTest.h b/libc/test/src/math/FAbsTest.h
index 54f5f87e08e7e3..942991f23be1c6 100644
--- a/libc/test/src/math/FAbsTest.h
+++ b/libc/test/src/math/FAbsTest.h
@@ -13,7 +13,7 @@
 #include "test/UnitTest/Test.h"
 #include "utils/MPFRWrapper/MPFRUtils.h"
 
-#include <math.h>
+#include "include/llvm-libc-macros/math-macros.h"
 
 namespace mpfr = LIBC_NAMESPACE::testing::mpfr;
 
diff --git a/libc/test/src/math/FDimTest.h b/libc/test/src/math/FDimTest.h
index e00b4fd5c42ec0..76f0f18bbc68d6 100644
--- a/libc/test/src/math/FDimTest.h
+++ b/libc/test/src/math/FDimTest.h
@@ -6,11 +6,11 @@
 //
 //===---------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/BasicOperations.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
-#include <math.h>
 
 template <typename T>
 class FDimTestTemplate : public LIBC_NAMESPACE::testing::Test {
diff --git a/libc/test/src/math/FMaxTest.h b/libc/test/src/math/FMaxTest.h
index f8046f380f5fd5..2c7dc3dc13ec5a 100644
--- a/libc/test/src/math/FMaxTest.h
+++ b/libc/test/src/math/FMaxTest.h
@@ -13,7 +13,7 @@
 #include "test/UnitTest/Test.h"
 #include "utils/MPFRWrapper/MPFRUtils.h"
 
-#include <math.h>
+#include "include/llvm-libc-macros/math-macros.h"
 
 namespace mpfr = LIBC_NAMESPACE::testing::mpfr;
 
diff --git a/libc/test/src/math/FMinTest.h b/libc/test/src/math/FMinTest.h
index 7a6534f320c92a..a986d5240d0da7 100644
--- a/libc/test/src/math/FMinTest.h
+++ b/libc/test/src/math/FMinTest.h
@@ -13,7 +13,7 @@
 #include "test/UnitTest/Test.h"
 #include "utils/MPFRWrapper/MPFRUtils.h"
 
-#include <math.h>
+#include "include/llvm-libc-macros/math-macros.h"
 
 namespace mpfr = LIBC_NAMESPACE::testing::mpfr;
 
diff --git a/libc/test/src/math/FModTest.h b/libc/test/src/math/FModTest.h
index 2b1442923268d9..96ad299258a176 100644
--- a/libc/test/src/math/FModTest.h
+++ b/libc/test/src/math/FModTest.h
@@ -14,7 +14,7 @@
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
 
-#include <math.h>
+#include "include/llvm-libc-macros/math-macros.h"
 
 #define TEST_SPECIAL(x, y, expected, dom_err, expected_exception)              \
   EXPECT_FP_EQ(expected, f(x, y));                                             \
diff --git a/libc/test/src/math/FloorTest.h b/libc/test/src/math/FloorTest.h
index 66b37d69d7ba3d..21ae291e61bc7e 100644
--- a/libc/test/src/math/FloorTest.h
+++ b/libc/test/src/math/FloorTest.h
@@ -13,7 +13,7 @@
 #include "test/UnitTest/Test.h"
 #include "utils/MPFRWrapper/MPFRUtils.h"
 
-#include <math.h>
+#include "include/llvm-libc-macros/math-macros.h"
 
 namespace mpfr = LIBC_NAMESPACE::testing::mpfr;
 
diff --git a/libc/test/src/math/FrexpTest.h b/libc/test/src/math/FrexpTest.h
index f3a64ce4aac31f..f971b45628f09f 100644
--- a/libc/test/src/math/FrexpTest.h
+++ b/libc/test/src/math/FrexpTest.h
@@ -11,7 +11,7 @@
 #include "test/UnitTest/Test.h"
 #include "utils/MPFRWrapper/MPFRUtils.h"
 
-#include <math.h>
+#include "include/llvm-libc-macros/math-macros.h"
 
 namespace mpfr = LIBC_NAMESPACE::testing::mpfr;
 
diff --git a/libc/test/src/math/HypotTest.h b/libc/test/src/math/HypotTest.h
index 8f84024a6ee117..46fcc462a2793a 100644
--- a/libc/test/src/math/HypotTest.h
+++ b/libc/test/src/math/HypotTest.h
@@ -14,7 +14,7 @@
 #include "test/UnitTest/Test.h"
 #include "utils/MPFRWrapper/MPFRUtils.h"
 
-#include <math.h>
+#include "include/llvm-libc-macros/math-macros.h"
 
 namespace mpfr = LIBC_NAMESPACE::testing::mpfr;
 
diff --git a/libc/test/src/math/ILogbTest.h b/libc/test/src/math/ILogbTest.h
index 3e2db33e2c0524..dcc9d554eb3c28 100644
--- a/libc/test/src/math/ILogbTest.h
+++ b/libc/test/src/math/ILogbTest.h
@@ -9,11 +9,11 @@
 #ifndef LLVM_LIBC_TEST_SRC_MATH_ILOGBTEST_H
 #define LLVM_LIBC_TEST_SRC_MATH_ILOGBTEST_H
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/CPP/limits.h" // INT_MAX
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/__support/FPUtil/ManipulationFunctions.h"
 #include "test/UnitTest/Test.h"
-#include <math.h>
 
 class LlvmLibcILogbTest : public LIBC_NAMESPACE::testing::Test {
 public:
diff --git a/libc/test/src/math/LdExpTest.h b/libc/test/src/math/LdExpTest.h
index fe84b5f4c192a4..738135d6afe27e 100644
--- a/libc/test/src/math/LdExpTest.h
+++ b/libc/test/src/math/LdExpTest.h
@@ -15,7 +15,7 @@
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
 
-#include <math.h>
+#include "include/llvm-libc-macros/math-macros.h"
 #include <stdint.h>
 
 template <typename T>
diff --git a/libc/test/src/math/LogbTest.h b/libc/test/src/math/LogbTest.h
index d64c5c44e4281a..3859b56582e5e4 100644
--- a/libc/test/src/math/LogbTest.h
+++ b/libc/test/src/math/LogbTest.h
@@ -11,7 +11,7 @@
 #include "test/UnitTest/Test.h"
 #include "utils/MPFRWrapper/MPFRUtils.h"
 
-#include <math.h>
+#include "include/llvm-libc-macros/math-macros.h"
 
 namespace mpfr = LIBC_NAMESPACE::testing::mpfr;
 
diff --git a/libc/test/src/math/ModfTest.h b/libc/test/src/math/ModfTest.h
index a7e5e8810611b1..84e26db49695d5 100644
--- a/libc/test/src/math/ModfTest.h
+++ b/libc/test/src/math/ModfTest.h
@@ -12,7 +12,7 @@
 #include "test/UnitTest/Test.h"
 #include "utils/MPFRWrapper/MPFRUtils.h"
 
-#include <math.h>
+#include "include/llvm-libc-macros/math-macros.h"
 
 namespace mpfr = LIBC_NAMESPACE::testing::mpfr;
 
diff --git a/libc/test/src/math/NextAfterTest.h b/libc/test/src/math/NextAfterTest.h
index 2f1450a16fd1e5..d45d819bfdb6cd 100644
--- a/libc/test/src/math/NextAfterTest.h
+++ b/libc/test/src/math/NextAfterTest.h
@@ -9,13 +9,13 @@
 #ifndef LLVM_LIBC_TEST_SRC_MATH_NEXTAFTERTEST_H
 #define LLVM_LIBC_TEST_SRC_MATH_NEXTAFTERTEST_H
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/CPP/bit.h"
 #include "src/__support/CPP/type_traits.h"
 #include "src/__support/FPUtil/BasicOperations.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
-#include <math.h>
 
 template <typename T>
 class NextAfterTestTemplate : public LIBC_NAMESPACE::testing::Test {
diff --git a/libc/test/src/math/RIntTest.h b/libc/test/src/math/RIntTest.h
index 3b16b902bcf5d3..d392d4fb14a2e2 100644
--- a/libc/test/src/math/RIntTest.h
+++ b/libc/test/src/math/RIntTest.h
@@ -15,8 +15,8 @@
 #include "test/UnitTest/Test.h"
 #include "utils/MPFRWrapper/MPFRUtils.h"
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include <fenv.h>
-#include <math.h>
 #include <stdio.h>
 
 namespace mpfr = LIBC_NAMESPACE::testing::mpfr;
diff --git a/libc/test/src/math/RemQuoTest.h b/libc/test/src/math/RemQuoTest.h
index 7b3011d23055d9..d61b97554199e7 100644
--- a/libc/test/src/math/RemQuoTest.h
+++ b/libc/test/src/math/RemQuoTest.h
@@ -9,12 +9,12 @@
 #ifndef LLVM_LIBC_TEST_SRC_MATH_REMQUOTEST_H
 #define LLVM_LIBC_TEST_SRC_MATH_REMQUOTEST_H
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/BasicOperations.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
 #include "utils/MPFRWrapper/MPFRUtils.h"
-#include <math.h>
 
 namespace mpfr = LIBC_NAMESPACE::testing::mpfr;
 
diff --git a/libc/test/src/math/RoundTest.h b/libc/test/src/math/RoundTest.h
index b255ecc4fa84e8..17da00f869d3b3 100644
--- a/libc/test/src/math/RoundTest.h
+++ b/libc/test/src/math/RoundTest.h
@@ -13,7 +13,7 @@
 #include "test/UnitTest/Test.h"
 #include "utils/MPFRWrapper/MPFRUtils.h"
 
-#include <math.h>
+#include "include/llvm-libc-macros/math-macros.h"
 
 namespace mpfr = LIBC_NAMESPACE::testing::mpfr;
 
diff --git a/libc/test/src/math/RoundToIntegerTest.h b/libc/test/src/math/RoundToIntegerTest.h
index 9bd4ba52f61ba8..017f5867fc8de7 100644
--- a/libc/test/src/math/RoundToIntegerTest.h
+++ b/libc/test/src/math/RoundToIntegerTest.h
@@ -15,8 +15,8 @@
 #include "test/UnitTest/Test.h"
 #include "utils/MPFRWrapper/MPFRUtils.h"
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include <errno.h>
-#include <math.h>
 
 namespace mpfr = LIBC_NAMESPACE::testing::mpfr;
 
diff --git a/libc/test/src/math/SqrtTest.h b/libc/test/src/math/SqrtTest.h
index 75eb411810f5ee..9811b2767ee337 100644
--- a/libc/test/src/math/SqrtTest.h
+++ b/libc/test/src/math/SqrtTest.h
@@ -11,7 +11,7 @@
 #include "test/UnitTest/Test.h"
 #include "utils/MPFRWrapper/MPFRUtils.h"
 
-#include <math.h>
+#include "include/llvm-libc-macros/math-macros.h"
 
 namespace mpfr = LIBC_NAMESPACE::testing::mpfr;
 
diff --git a/libc/test/src/math/TruncTest.h b/libc/test/src/math/TruncTest.h
index 6d0ea1182ec11b..c3a89dbb837b52 100644
--- a/libc/test/src/math/TruncTest.h
+++ b/libc/test/src/math/TruncTest.h
@@ -13,7 +13,7 @@
 #include "test/UnitTest/Test.h"
 #include "utils/MPFRWrapper/MPFRUtils.h"
 
-#include <math.h>
+#include "include/llvm-libc-macros/math-macros.h"
 
 namespace mpfr = LIBC_NAMESPACE::testing::mpfr;
 
diff --git a/libc/test/src/math/acosf_test.cpp b/libc/test/src/math/acosf_test.cpp
index c273184d58f3e6..6f8321bd7182ae 100644
--- a/libc/test/src/math/acosf_test.cpp
+++ b/libc/test/src/math/acosf_test.cpp
@@ -6,13 +6,13 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/errno/libc_errno.h"
 #include "src/math/acosf.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
 #include "utils/MPFRWrapper/MPFRUtils.h"
-#include <math.h>
 
 #include <errno.h>
 #include <stdint.h>
diff --git a/libc/test/src/math/acoshf_test.cpp b/libc/test/src/math/acoshf_test.cpp
index a0e845b2b247ee..41d1166fb430db 100644
--- a/libc/test/src/math/acoshf_test.cpp
+++ b/libc/test/src/math/acoshf_test.cpp
@@ -6,13 +6,13 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/errno/libc_errno.h"
 #include "src/math/acoshf.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
 #include "utils/MPFRWrapper/MPFRUtils.h"
-#include <math.h>
 
 #include <errno.h>
 #include <stdint.h>
diff --git a/libc/test/src/math/asinf_test.cpp b/libc/test/src/math/asinf_test.cpp
index a24fdcc36e140a..4e36f03f489551 100644
--- a/libc/test/src/math/asinf_test.cpp
+++ b/libc/test/src/math/asinf_test.cpp
@@ -7,13 +7,13 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/errno/libc_errno.h"
 #include "src/math/asinf.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
 #include "utils/MPFRWrapper/MPFRUtils.h"
-#include <math.h>
 
 #include <errno.h>
 #include <stdint.h>
diff --git a/libc/test/src/math/asinhf_test.cpp b/libc/test/src/math/asinhf_test.cpp
index 3127861c9a1be5..9a3bfbed1068d8 100644
--- a/libc/test/src/math/asinhf_test.cpp
+++ b/libc/test/src/math/asinhf_test.cpp
@@ -6,13 +6,13 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/errno/libc_errno.h"
 #include "src/math/asinhf.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
 #include "utils/MPFRWrapper/MPFRUtils.h"
-#include <math.h>
 
 #include <errno.h>
 #include <stdint.h>
diff --git a/libc/test/src/math/atanf_test.cpp b/libc/test/src/math/atanf_test.cpp
index 1fa7165805c7d1..e51932fc495525 100644
--- a/libc/test/src/math/atanf_test.cpp
+++ b/libc/test/src/math/atanf_test.cpp
@@ -6,13 +6,13 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/errno/libc_errno.h"
 #include "src/math/atanf.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
 #include "utils/MPFRWrapper/MPFRUtils.h"
-#include <math.h>
 
 #include <errno.h>
 #include <stdint.h>
@@ -53,8 +53,15 @@ TEST_F(LlvmLibcAtanfTest, InFloatRange) {
 
 // For small values, tanh(x) is x.
 TEST_F(LlvmLibcAtanfTest, SpecialValues) {
-  uint32_t val_arr[] = {0x3d8d6b23U, 0x3feefcfbU, 0xbd8d6b23U,
-                        0xbfeefcfbU, 0x7F800000U, 0xFF800000U};
+  uint32_t val_arr[] = {
+      0x3d8d6b23U, // x = 0x1.1ad646p-4f
+      0x3feefcfbU, // x = 0x1.ddf9f6p+0f
+      0xbd8d6b23U, // x = -0x1.1ad646p-4f
+      0xbfeefcfbU, // x = -0x1.ddf9f6p+0f
+      0x7F800000U, // x = +Inf
+      0xFF800000U, // x = -Inf
+      0xbffe2ec1U, // x = -0x1.fc5d82p+0f
+  };
   for (uint32_t v : val_arr) {
     float x = FPBits(v).get_val();
     EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Atan, x,
diff --git a/libc/test/src/math/atanhf_test.cpp b/libc/test/src/math/atanhf_test.cpp
index 1b45436094da44..39c067f3e764c2 100644
--- a/libc/test/src/math/atanhf_test.cpp
+++ b/libc/test/src/math/atanhf_test.cpp
@@ -6,13 +6,13 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/errno/libc_errno.h"
 #include "src/math/atanhf.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
 #include "utils/MPFRWrapper/MPFRUtils.h"
-#include <math.h>
 
 #include <errno.h>
 #include <stdint.h>
diff --git a/libc/test/src/math/cos_test.cpp b/libc/test/src/math/cos_test.cpp
index 1f55e9e9a14959..6a1122997c51a2 100644
--- a/libc/test/src/math/cos_test.cpp
+++ b/libc/test/src/math/cos_test.cpp
@@ -11,7 +11,7 @@
 #include "test/UnitTest/Test.h"
 #include "utils/MPFRWrapper/MPFRUtils.h"
 
-#include <math.h>
+#include "include/llvm-libc-macros/math-macros.h"
 
 using LlvmLibcCosTest = LIBC_NAMESPACE::testing::FPTest<double>;
 
diff --git a/libc/test/src/math/cosf_test.cpp b/libc/test/src/math/cosf_test.cpp
index 93ab06dc80b261..8a5eb17fdcea50 100644
--- a/libc/test/src/math/cosf_test.cpp
+++ b/libc/test/src/math/cosf_test.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/errno/libc_errno.h"
 #include "src/math/cosf.h"
@@ -13,7 +14,6 @@
 #include "test/UnitTest/Test.h"
 #include "test/src/math/sdcomp26094.h"
 #include "utils/MPFRWrapper/MPFRUtils.h"
-#include <math.h>
 
 #include <errno.h>
 #include <stdint.h>
diff --git a/libc/test/src/math/coshf_test.cpp b/libc/test/src/math/coshf_test.cpp
index 3b8e14fb5f0342..8792f56b03461a 100644
--- a/libc/test/src/math/coshf_test.cpp
+++ b/libc/test/src/math/coshf_test.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/CPP/array.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/errno/libc_errno.h"
@@ -13,7 +14,6 @@
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
 #include "utils/MPFRWrapper/MPFRUtils.h"
-#include <math.h>
 
 #include <errno.h>
 #include <stdint.h>
diff --git a/libc/test/src/math/erff_test.cpp b/libc/test/src/math/erff_test.cpp
index 8ebde4ec24fb55..1e43c206aef0da 100644
--- a/libc/test/src/math/erff_test.cpp
+++ b/libc/test/src/math/erff_test.cpp
@@ -6,12 +6,12 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/math/erff.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
 #include "utils/MPFRWrapper/MPFRUtils.h"
-#include <math.h>
 
 #include <errno.h>
 #include <stdint.h>
diff --git a/libc/test/src/math/exhaustive/CMakeLists.txt b/libc/test/src/math/exhaustive/CMakeLists.txt
index 84a5ebc75ef04b..df32dd4f943f30 100644
--- a/libc/test/src/math/exhaustive/CMakeLists.txt
+++ b/libc/test/src/math/exhaustive/CMakeLists.txt
@@ -273,6 +273,7 @@ add_fp_unittest(
     fmod_generic_impl_test.cpp
   DEPENDS
     libc.src.__support.FPUtil.fp_bits
+    libc.src.__support.FPUtil.manipulation_functions
     libc.src.__support.FPUtil.generic.fmod
 )
 
diff --git a/libc/test/src/math/exhaustive/atanf_test.cpp b/libc/test/src/math/exhaustive/atanf_test.cpp
index 508c288b050c55..6f23231d426741 100644
--- a/libc/test/src/math/exhaustive/atanf_test.cpp
+++ b/libc/test/src/math/exhaustive/atanf_test.cpp
@@ -25,7 +25,7 @@ TEST_F(LlvmLibcAtanfExhaustiveTest, PostiveRange) {
 }
 
 // Range: [-Inf, 0];
-static constexpr uint32_t NEG_START = 0xb000'0000U;
+static constexpr uint32_t NEG_START = 0x8000'0000U;
 static constexpr uint32_t NEG_STOP = 0xff80'0000U;
 
 TEST_F(LlvmLibcAtanfExhaustiveTest, NegativeRange) {
diff --git a/libc/test/src/math/exhaustive/fmod_generic_impl_test.cpp b/libc/test/src/math/exhaustive/fmod_generic_impl_test.cpp
index 25a5e3898599ae..c7aec5b7bc21b4 100644
--- a/libc/test/src/math/exhaustive/fmod_generic_impl_test.cpp
+++ b/libc/test/src/math/exhaustive/fmod_generic_impl_test.cpp
@@ -6,53 +6,55 @@
 //
 //===----------------------------------------------------------------------===//
 #include "src/__support/CPP/type_traits.h"
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/__support/FPUtil/ManipulationFunctions.h" // ldexp
 #include "src/__support/FPUtil/generic/FMod.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
 #include "utils/MPFRWrapper/MPFRUtils.h"
 
 #include <array>
-#include <limits>
 
 namespace mpfr = LIBC_NAMESPACE::testing::mpfr;
 
 template <typename T, bool InverseMultiplication>
 class LlvmLibcFModTest : public LIBC_NAMESPACE::testing::Test {
 
-  using U = typename LIBC_NAMESPACE::fputil::FPBits<T>::StorageType;
+  using FPBits = LIBC_NAMESPACE::fputil::FPBits<T>;
+  using U = typename FPBits::StorageType;
   using DivisionHelper = LIBC_NAMESPACE::cpp::conditional_t<
       InverseMultiplication,
       LIBC_NAMESPACE::fputil::generic::FModDivisionInvMultHelper<U>,
       LIBC_NAMESPACE::fputil::generic::FModDivisionSimpleHelper<U>>;
 
-  static constexpr std::array<T, 11> test_bases = {
+  static constexpr std::array<T, 11> TEST_BASES = {
       T(0.0),
       T(1.0),
       T(3.0),
       T(27.0),
       T(11.0 / 8.0),
       T(2.764443),
-      T(1.0) - std::numeric_limits<T>::epsilon(),
-      T(1.0) + std::numeric_limits<T>::epsilon(),
-      T(M_PI),
-      T(M_SQRT2),
-      T(M_E)};
+      T(1.0) - T(0x1.0p-23) - T(0x1.0p-52) - T(0x1.0p-112),
+      T(1.0) + T(0x1.0p-23) + T(0x1.0p-52) + T(0x1.0p-112),
+      T(3.14159265),
+      T(1.41421356),
+      T(2.71828183)};
 
 public:
   void testExtensive() {
     using FMod = LIBC_NAMESPACE::fputil::generic::FMod<T, U, DivisionHelper>;
-    using nl = std::numeric_limits<T>;
-    int min2 = nl::min_exponent - nl::digits - 5;
-    int max2 = nl::max_exponent + 3;
-    for (T by : test_bases) {
+    int min2 = -(FPBits::MAX_BIASED_EXPONENT + FPBits::SIG_LEN) / 2;
+    int max2 = 3 + FPBits::MAX_BIASED_EXPONENT / 2;
+    for (T by : TEST_BASES) {
       for (int iy = min2; iy < max2; iy++) {
-        T y = by * std::ldexp(2, iy);
-        if (y == 0 || !std::isfinite(y))
+        T y = by * LIBC_NAMESPACE::fputil::ldexp(2.0, iy);
+        FPBits y_bits(y);
+        if (y_bits.is_zero() || !y_bits.is_finite())
           continue;
-        for (T bx : test_bases) {
+        for (T bx : TEST_BASES) {
           for (int ix = min2; ix < max2; ix++) {
-            T x = bx * std::ldexp(2, ix);
-            if (!std::isfinite(x))
+            T x = bx * LIBC_NAMESPACE::fputil::ldexp(2.0, ix);
+            if (!FPBits(x).is_finite())
               continue;
             T result = FMod::eval(x, y);
             mpfr::BinaryInput<T> input{x, y};
diff --git a/libc/test/src/math/exp10_test.cpp b/libc/test/src/math/exp10_test.cpp
index d71b5a22303745..778189626a6179 100644
--- a/libc/test/src/math/exp10_test.cpp
+++ b/libc/test/src/math/exp10_test.cpp
@@ -6,13 +6,13 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/errno/libc_errno.h"
 #include "src/math/exp10.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
 #include "utils/MPFRWrapper/MPFRUtils.h"
-#include <math.h>
 
 #include <errno.h>
 #include <stdint.h>
diff --git a/libc/test/src/math/exp10f_test.cpp b/libc/test/src/math/exp10f_test.cpp
index 4e2d065f1292ab..9d44e8f65deccf 100644
--- a/libc/test/src/math/exp10f_test.cpp
+++ b/libc/test/src/math/exp10f_test.cpp
@@ -6,13 +6,13 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/errno/libc_errno.h"
 #include "src/math/exp10f.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
 #include "utils/MPFRWrapper/MPFRUtils.h"
-#include <math.h>
 
 #include <stdint.h>
 
diff --git a/libc/test/src/math/exp2_test.cpp b/libc/test/src/math/exp2_test.cpp
index 2f9d7b3a2a6c90..845fda5451d4b7 100644
--- a/libc/test/src/math/exp2_test.cpp
+++ b/libc/test/src/math/exp2_test.cpp
@@ -6,13 +6,13 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/errno/libc_errno.h"
 #include "src/math/exp2.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
 #include "utils/MPFRWrapper/MPFRUtils.h"
-#include <math.h>
 
 #include <errno.h>
 #include <stdint.h>
diff --git a/libc/test/src/math/exp2f_test.cpp b/libc/test/src/math/exp2f_test.cpp
index f5ea8554be5c0f..f63f091eab9a83 100644
--- a/libc/test/src/math/exp2f_test.cpp
+++ b/libc/test/src/math/exp2f_test.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/__support/macros/properties/cpu_features.h" // LIBC_TARGET_CPU_HAS_FMA
 #include "src/errno/libc_errno.h"
@@ -13,7 +14,6 @@
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
 #include "utils/MPFRWrapper/MPFRUtils.h"
-#include <math.h>
 
 #include <stdint.h>
 
diff --git a/libc/test/src/math/exp_test.cpp b/libc/test/src/math/exp_test.cpp
index 006db00b91949e..42018e608ae45b 100644
--- a/libc/test/src/math/exp_test.cpp
+++ b/libc/test/src/math/exp_test.cpp
@@ -6,13 +6,13 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/errno/libc_errno.h"
 #include "src/math/exp.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
 #include "utils/MPFRWrapper/MPFRUtils.h"
-#include <math.h>
 
 #include <errno.h>
 #include <stdint.h>
diff --git a/libc/test/src/math/expf_test.cpp b/libc/test/src/math/expf_test.cpp
index ffd9da500488b1..634958bdc43e56 100644
--- a/libc/test/src/math/expf_test.cpp
+++ b/libc/test/src/math/expf_test.cpp
@@ -6,13 +6,13 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/errno/libc_errno.h"
 #include "src/math/expf.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
 #include "utils/MPFRWrapper/MPFRUtils.h"
-#include <math.h>
 
 #include <stdint.h>
 
diff --git a/libc/test/src/math/explogxf_test.cpp b/libc/test/src/math/explogxf_test.cpp
index 24f9f3c0f8e45e..a536a9f3ab8dea 100644
--- a/libc/test/src/math/explogxf_test.cpp
+++ b/libc/test/src/math/explogxf_test.cpp
@@ -7,12 +7,14 @@
 //===----------------------------------------------------------------------===//
 
 #include "in_float_range_test_helper.h"
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
+#include "src/math/fabs.h"
+#include "src/math/fabsf.h"
 #include "src/math/generic/explogxf.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
 #include "utils/MPFRWrapper/MPFRUtils.h"
-#include <math.h>
 
 using LlvmLibcExplogfTest = LIBC_NAMESPACE::testing::FPTest<float>;
 
@@ -22,7 +24,7 @@ constexpr int def_count = 100003;
 constexpr float def_prec = 0.500001f;
 
 auto f_normal = [](float x) -> bool {
-  return !(isnan(x) || isinf(x) || fabs(x) < 2E-38);
+  return !(isnan(x) || isinf(x) || LIBC_NAMESPACE::fabs(x) < 2E-38);
 };
 
 TEST_F(LlvmLibcExplogfTest, ExpInFloatRange) {
@@ -32,8 +34,8 @@ TEST_F(LlvmLibcExplogfTest, ExpInFloatRange) {
     return static_cast<float>(result.mh * r);
   };
   auto f_check = [](float x) -> bool {
-    return !(
-        (isnan(x) || isinf(x) || x < -70 || x > 70 || fabsf(x) < 0x1.0p-10));
+    return !((isnan(x) || isinf(x) || x < -70 || x > 70 ||
+              LIBC_NAMESPACE::fabsf(x) < 0x1.0p-10));
   };
   CHECK_DATA(0.0f, neg_inf, mpfr::Operation::Exp, fx, f_check, def_count,
              def_prec);
diff --git a/libc/test/src/math/expm1_test.cpp b/libc/test/src/math/expm1_test.cpp
index ccc6e36095481f..198e6d5cdd8ab2 100644
--- a/libc/test/src/math/expm1_test.cpp
+++ b/libc/test/src/math/expm1_test.cpp
@@ -6,13 +6,13 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/errno/libc_errno.h"
 #include "src/math/expm1.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
 #include "utils/MPFRWrapper/MPFRUtils.h"
-#include <math.h>
 
 #include <errno.h>
 #include <stdint.h>
diff --git a/libc/test/src/math/expm1f_test.cpp b/libc/test/src/math/expm1f_test.cpp
index 94a9301f8eb222..c72815887ba8b3 100644
--- a/libc/test/src/math/expm1f_test.cpp
+++ b/libc/test/src/math/expm1f_test.cpp
@@ -6,13 +6,13 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/errno/libc_errno.h"
 #include "src/math/expm1f.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
 #include "utils/MPFRWrapper/MPFRUtils.h"
-#include <math.h>
 
 #include <stdint.h>
 
diff --git a/libc/test/src/math/fdim_test.cpp b/libc/test/src/math/fdim_test.cpp
index 2f00a30ad1ee6f..6c0c3e204c5f9d 100644
--- a/libc/test/src/math/fdim_test.cpp
+++ b/libc/test/src/math/fdim_test.cpp
@@ -8,11 +8,11 @@
 
 #include "FDimTest.h"
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/math/fdim.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
-#include <math.h>
 
 using LlvmLibcFDimTest = FDimTestTemplate<double>;
 
diff --git a/libc/test/src/math/fdimf_test.cpp b/libc/test/src/math/fdimf_test.cpp
index 27511baf25b6da..a74011b5a22498 100644
--- a/libc/test/src/math/fdimf_test.cpp
+++ b/libc/test/src/math/fdimf_test.cpp
@@ -8,11 +8,11 @@
 
 #include "FDimTest.h"
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/math/fdimf.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
-#include <math.h>
 
 using LlvmLibcFDimTest = FDimTestTemplate<float>;
 
diff --git a/libc/test/src/math/fdiml_test.cpp b/libc/test/src/math/fdiml_test.cpp
index 45aedb0a1cdea6..d3f2e68a7c1d79 100644
--- a/libc/test/src/math/fdiml_test.cpp
+++ b/libc/test/src/math/fdiml_test.cpp
@@ -8,11 +8,11 @@
 
 #include "FDimTest.h"
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/math/fdiml.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
-#include <math.h>
 
 using LlvmLibcFDimTest = FDimTestTemplate<long double>;
 
diff --git a/libc/test/src/math/ilogb_test.cpp b/libc/test/src/math/ilogb_test.cpp
index 7011c43386e66a..45756ffa3d9a7c 100644
--- a/libc/test/src/math/ilogb_test.cpp
+++ b/libc/test/src/math/ilogb_test.cpp
@@ -8,12 +8,12 @@
 
 #include "ILogbTest.h"
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/__support/FPUtil/ManipulationFunctions.h"
 #include "src/math/ilogb.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
-#include <math.h>
 
 TEST_F(LlvmLibcILogbTest, SpecialNumbers_ilogb) {
   test_special_numbers<double>(&LIBC_NAMESPACE::ilogb);
diff --git a/libc/test/src/math/ilogbf_test.cpp b/libc/test/src/math/ilogbf_test.cpp
index dcff8eeb151805..ff19dd145a198d 100644
--- a/libc/test/src/math/ilogbf_test.cpp
+++ b/libc/test/src/math/ilogbf_test.cpp
@@ -8,12 +8,12 @@
 
 #include "ILogbTest.h"
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/__support/FPUtil/ManipulationFunctions.h"
 #include "src/math/ilogbf.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
-#include <math.h>
 
 TEST_F(LlvmLibcILogbTest, SpecialNumbers_ilogbf) {
   test_special_numbers<float>(&LIBC_NAMESPACE::ilogbf);
diff --git a/libc/test/src/math/ilogbl_test.cpp b/libc/test/src/math/ilogbl_test.cpp
index 29a221ad7f08f7..b2c5246669946a 100644
--- a/libc/test/src/math/ilogbl_test.cpp
+++ b/libc/test/src/math/ilogbl_test.cpp
@@ -8,12 +8,12 @@
 
 #include "ILogbTest.h"
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/__support/FPUtil/ManipulationFunctions.h"
 #include "src/math/ilogbl.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
-#include <math.h>
 
 TEST_F(LlvmLibcILogbTest, SpecialNumbers_ilogbl) {
   test_special_numbers<long double>(&LIBC_NAMESPACE::ilogbl);
diff --git a/libc/test/src/math/inv_trigf_utils_test.cpp b/libc/test/src/math/inv_trigf_utils_test.cpp
deleted file mode 100644
index 23420edcd0ca1a..00000000000000
--- a/libc/test/src/math/inv_trigf_utils_test.cpp
+++ /dev/null
@@ -1,34 +0,0 @@
-//===-- Unittests for supfuncf --------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "in_float_range_test_helper.h"
-#include "src/__support/FPUtil/FPBits.h"
-#include "src/math/generic/inv_trigf_utils.h"
-#include "test/UnitTest/FPMatcher.h"
-#include "test/UnitTest/Test.h"
-#include "utils/MPFRWrapper/MPFRUtils.h"
-#include <math.h>
-
-using LlvmLibcAtanfTest = LIBC_NAMESPACE::testing::FPTest<float>;
-
-namespace mpfr = LIBC_NAMESPACE::testing::mpfr;
-
-constexpr int def_count = 100003;
-constexpr float def_prec = 0.500001f;
-
-auto f_normal = [](float x) -> bool { return !(isnan(x) || isinf(x)); };
-
-TEST_F(LlvmLibcAtanfTest, InPositiveRange) {
-  CHECK_DATA(0.0f, inf, mpfr::Operation::Atan, LIBC_NAMESPACE::atan_eval,
-             f_normal, def_count, def_prec);
-}
-
-TEST_F(LlvmLibcAtanfTest, InNegativeRange) {
-  CHECK_DATA(-0.0f, neg_inf, mpfr::Operation::Atan, LIBC_NAMESPACE::atan_eval,
-             f_normal, def_count, def_prec);
-}
diff --git a/libc/test/src/math/log10_test.cpp b/libc/test/src/math/log10_test.cpp
index ed4a24b9515951..dc4ac895546c43 100644
--- a/libc/test/src/math/log10_test.cpp
+++ b/libc/test/src/math/log10_test.cpp
@@ -6,13 +6,13 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/errno/libc_errno.h"
 #include "src/math/log10.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
 #include "utils/MPFRWrapper/MPFRUtils.h"
-#include <math.h>
 
 #include <errno.h>
 #include <stdint.h>
diff --git a/libc/test/src/math/log10f_test.cpp b/libc/test/src/math/log10f_test.cpp
index c38a5159682cfd..f8a137e44c351e 100644
--- a/libc/test/src/math/log10f_test.cpp
+++ b/libc/test/src/math/log10f_test.cpp
@@ -6,12 +6,12 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/math/log10f.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
 #include "utils/MPFRWrapper/MPFRUtils.h"
-#include <math.h>
 
 #include <errno.h>
 #include <stdint.h>
diff --git a/libc/test/src/math/log1p_test.cpp b/libc/test/src/math/log1p_test.cpp
index d769659356c941..975fb8e05c35ea 100644
--- a/libc/test/src/math/log1p_test.cpp
+++ b/libc/test/src/math/log1p_test.cpp
@@ -6,13 +6,13 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/errno/libc_errno.h"
 #include "src/math/log1p.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
 #include "utils/MPFRWrapper/MPFRUtils.h"
-#include <math.h>
 
 #include <errno.h>
 #include <stdint.h>
diff --git a/libc/test/src/math/log1pf_test.cpp b/libc/test/src/math/log1pf_test.cpp
index 2f6330ee6ce631..a1108fee48196f 100644
--- a/libc/test/src/math/log1pf_test.cpp
+++ b/libc/test/src/math/log1pf_test.cpp
@@ -6,13 +6,13 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/errno/libc_errno.h"
 #include "src/math/log1pf.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
 #include "utils/MPFRWrapper/MPFRUtils.h"
-#include <math.h>
 
 #include <errno.h>
 #include <stdint.h>
diff --git a/libc/test/src/math/log2_test.cpp b/libc/test/src/math/log2_test.cpp
index f4ee0ff5185b47..8765279005798a 100644
--- a/libc/test/src/math/log2_test.cpp
+++ b/libc/test/src/math/log2_test.cpp
@@ -6,13 +6,13 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/errno/libc_errno.h"
 #include "src/math/log2.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
 #include "utils/MPFRWrapper/MPFRUtils.h"
-#include <math.h>
 
 #include <errno.h>
 #include <stdint.h>
diff --git a/libc/test/src/math/log2f_test.cpp b/libc/test/src/math/log2f_test.cpp
index d8b4808f5cda12..c05b6b93cff777 100644
--- a/libc/test/src/math/log2f_test.cpp
+++ b/libc/test/src/math/log2f_test.cpp
@@ -6,13 +6,13 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/errno/libc_errno.h"
 #include "src/math/log2f.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
 #include "utils/MPFRWrapper/MPFRUtils.h"
-#include <math.h>
 
 #include <stdint.h>
 
diff --git a/libc/test/src/math/log_test.cpp b/libc/test/src/math/log_test.cpp
index b1f1ab775462b1..06a0dc574be518 100644
--- a/libc/test/src/math/log_test.cpp
+++ b/libc/test/src/math/log_test.cpp
@@ -6,13 +6,13 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/errno/libc_errno.h"
 #include "src/math/log.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
 #include "utils/MPFRWrapper/MPFRUtils.h"
-#include <math.h>
 
 #include <errno.h>
 #include <stdint.h>
diff --git a/libc/test/src/math/logf_test.cpp b/libc/test/src/math/logf_test.cpp
index 3ab67ba807823b..1ab480744ba596 100644
--- a/libc/test/src/math/logf_test.cpp
+++ b/libc/test/src/math/logf_test.cpp
@@ -6,12 +6,12 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/math/logf.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
 #include "utils/MPFRWrapper/MPFRUtils.h"
-#include <math.h>
 
 #include <stdint.h>
 
diff --git a/libc/test/src/math/powf_test.cpp b/libc/test/src/math/powf_test.cpp
index 3dffeb603499fe..cf674ecf8f99e7 100644
--- a/libc/test/src/math/powf_test.cpp
+++ b/libc/test/src/math/powf_test.cpp
@@ -6,12 +6,12 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/math/powf.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
 #include "utils/MPFRWrapper/MPFRUtils.h"
-#include <math.h>
 
 #include <errno.h>
 #include <stdint.h>
diff --git a/libc/test/src/math/sin_test.cpp b/libc/test/src/math/sin_test.cpp
index 69981ed22ee69f..fa1c5370c30fbb 100644
--- a/libc/test/src/math/sin_test.cpp
+++ b/libc/test/src/math/sin_test.cpp
@@ -12,7 +12,7 @@
 #include "test/UnitTest/Test.h"
 #include "utils/MPFRWrapper/MPFRUtils.h"
 
-#include <math.h>
+#include "include/llvm-libc-macros/math-macros.h"
 
 using LlvmLibcSinTest = LIBC_NAMESPACE::testing::FPTest<double>;
 
diff --git a/libc/test/src/math/sincosf_test.cpp b/libc/test/src/math/sincosf_test.cpp
index 2c0c7eaaa25f07..a7372fd53b319e 100644
--- a/libc/test/src/math/sincosf_test.cpp
+++ b/libc/test/src/math/sincosf_test.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/errno/libc_errno.h"
 #include "src/math/sincosf.h"
@@ -13,7 +14,6 @@
 #include "test/UnitTest/Test.h"
 #include "test/src/math/sdcomp26094.h"
 #include "utils/MPFRWrapper/MPFRUtils.h"
-#include <math.h>
 
 #include <errno.h>
 #include <stdint.h>
diff --git a/libc/test/src/math/sinf_test.cpp b/libc/test/src/math/sinf_test.cpp
index 94c1114e688984..a3c5384e3e626f 100644
--- a/libc/test/src/math/sinf_test.cpp
+++ b/libc/test/src/math/sinf_test.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/errno/libc_errno.h"
 #include "src/math/sinf.h"
@@ -13,7 +14,6 @@
 #include "test/UnitTest/Test.h"
 #include "test/src/math/sdcomp26094.h"
 #include "utils/MPFRWrapper/MPFRUtils.h"
-#include <math.h>
 
 #include <errno.h>
 #include <stdint.h>
diff --git a/libc/test/src/math/sinhf_test.cpp b/libc/test/src/math/sinhf_test.cpp
index f14150bb04420e..bea976055dbdff 100644
--- a/libc/test/src/math/sinhf_test.cpp
+++ b/libc/test/src/math/sinhf_test.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/CPP/array.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/errno/libc_errno.h"
@@ -13,7 +14,6 @@
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
 #include "utils/MPFRWrapper/MPFRUtils.h"
-#include <math.h>
 
 #include <errno.h>
 #include <stdint.h>
diff --git a/libc/test/src/math/smoke/CMakeLists.txt b/libc/test/src/math/smoke/CMakeLists.txt
index 293e65abd44f5a..85dacce3b21dca 100644
--- a/libc/test/src/math/smoke/CMakeLists.txt
+++ b/libc/test/src/math/smoke/CMakeLists.txt
@@ -1506,6 +1506,22 @@ add_fp_unittest(
   UNIT_TEST_ONLY
 )
 
+add_fp_unittest(
+  nanf128_test
+  SUITE
+    libc-math-smoke-tests
+  SRCS
+    nanf128_test.cpp
+  DEPENDS
+    libc.include.math
+    libc.include.signal
+    libc.src.math.nanf128
+    libc.src.__support.FPUtil.fp_bits
+  # FIXME: The nan tests currently have death tests, which aren't supported for
+  # hermetic tests.
+  UNIT_TEST_ONLY
+)
+
 add_fp_unittest(
   nextafter_test
   SUITE
@@ -1614,6 +1630,118 @@ add_fp_unittest(
     libc.src.__support.FPUtil.fp_bits
 )
 
+add_fp_unittest(
+  nextdown_test
+  SUITE
+    libc-math-smoke-tests
+  SRCS
+    nextdown_test.cpp
+  HDRS
+    NextDownTest.h
+  DEPENDS
+    libc.include.math
+    libc.src.math.nextdown
+    libc.src.__support.FPUtil.manipulation_functions
+)
+
+add_fp_unittest(
+  nextdownf_test
+  SUITE
+    libc-math-smoke-tests
+  SRCS
+    nextdownf_test.cpp
+  HDRS
+    NextDownTest.h
+  DEPENDS
+    libc.include.math
+    libc.src.math.nextdownf
+    libc.src.__support.FPUtil.manipulation_functions
+)
+
+add_fp_unittest(
+  nextdownl_test
+  SUITE
+    libc-math-smoke-tests
+  SRCS
+    nextdownl_test.cpp
+  HDRS
+    NextDownTest.h
+  DEPENDS
+    libc.include.math
+    libc.src.math.nextdownl
+    libc.src.__support.FPUtil.manipulation_functions
+)
+
+add_fp_unittest(
+  nextdownf128_test
+  SUITE
+    libc-math-smoke-tests
+  SRCS
+    nextdownf128_test.cpp
+  HDRS
+    NextDownTest.h
+  DEPENDS
+    libc.include.math
+    libc.src.math.nextdownf128
+    libc.src.__support.FPUtil.manipulation_functions
+)
+
+add_fp_unittest(
+  nextup_test
+  SUITE
+    libc-math-smoke-tests
+  SRCS
+    nextup_test.cpp
+  HDRS
+    NextUpTest.h
+  DEPENDS
+    libc.include.math
+    libc.src.math.nextup
+    libc.src.__support.FPUtil.manipulation_functions
+)
+
+add_fp_unittest(
+  nextupf_test
+  SUITE
+    libc-math-smoke-tests
+  SRCS
+    nextupf_test.cpp
+  HDRS
+    NextUpTest.h
+  DEPENDS
+    libc.include.math
+    libc.src.math.nextupf
+    libc.src.__support.FPUtil.manipulation_functions
+)
+
+add_fp_unittest(
+  nextupl_test
+  SUITE
+    libc-math-smoke-tests
+  SRCS
+    nextupl_test.cpp
+  HDRS
+    NextUpTest.h
+  DEPENDS
+    libc.include.math
+    libc.src.math.nextupl
+    libc.src.__support.FPUtil.manipulation_functions
+)
+
+add_fp_unittest(
+  nextupf128_test
+  SUITE
+    libc-math-smoke-tests
+  SRCS
+    nextupf128_test.cpp
+  HDRS
+    NextUpTest.h
+  DEPENDS
+    libc.include.math
+    libc.src.math.nextupf128
+    libc.src.__support.FPUtil.manipulation_functions
+)
+
 # TODO(lntue): The current implementation of fputil::general::fma<float> is only
 # correctly rounded for the default rounding mode round-to-nearest tie-to-even.
 add_fp_unittest(
diff --git a/libc/test/src/math/smoke/CeilTest.h b/libc/test/src/math/smoke/CeilTest.h
index 5248dbca503701..ec70258fddec15 100644
--- a/libc/test/src/math/smoke/CeilTest.h
+++ b/libc/test/src/math/smoke/CeilTest.h
@@ -12,7 +12,7 @@
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
 
-#include <math.h>
+#include "include/llvm-libc-macros/math-macros.h"
 
 template <typename T> class CeilTest : public LIBC_NAMESPACE::testing::Test {
 
diff --git a/libc/test/src/math/smoke/CopySignTest.h b/libc/test/src/math/smoke/CopySignTest.h
index 9ee34338ba8073..70a6a419e0a030 100644
--- a/libc/test/src/math/smoke/CopySignTest.h
+++ b/libc/test/src/math/smoke/CopySignTest.h
@@ -12,7 +12,7 @@
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
 
-#include <math.h>
+#include "include/llvm-libc-macros/math-macros.h"
 
 template <typename T>
 class CopySignTest : public LIBC_NAMESPACE::testing::Test {
diff --git a/libc/test/src/math/smoke/FAbsTest.h b/libc/test/src/math/smoke/FAbsTest.h
index cf05882e22f97d..9309c2ada4a11f 100644
--- a/libc/test/src/math/smoke/FAbsTest.h
+++ b/libc/test/src/math/smoke/FAbsTest.h
@@ -12,7 +12,7 @@
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
 
-#include <math.h>
+#include "include/llvm-libc-macros/math-macros.h"
 
 template <typename T> class FAbsTest : public LIBC_NAMESPACE::testing::Test {
 
diff --git a/libc/test/src/math/smoke/FModTest.h b/libc/test/src/math/smoke/FModTest.h
index 2b1442923268d9..96ad299258a176 100644
--- a/libc/test/src/math/smoke/FModTest.h
+++ b/libc/test/src/math/smoke/FModTest.h
@@ -14,7 +14,7 @@
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
 
-#include <math.h>
+#include "include/llvm-libc-macros/math-macros.h"
 
 #define TEST_SPECIAL(x, y, expected, dom_err, expected_exception)              \
   EXPECT_FP_EQ(expected, f(x, y));                                             \
diff --git a/libc/test/src/math/smoke/FloorTest.h b/libc/test/src/math/smoke/FloorTest.h
index 610f5c206ed3a0..8886e8e7518368 100644
--- a/libc/test/src/math/smoke/FloorTest.h
+++ b/libc/test/src/math/smoke/FloorTest.h
@@ -12,7 +12,7 @@
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
 
-#include <math.h>
+#include "include/llvm-libc-macros/math-macros.h"
 
 template <typename T> class FloorTest : public LIBC_NAMESPACE::testing::Test {
 
diff --git a/libc/test/src/math/smoke/HypotTest.h b/libc/test/src/math/smoke/HypotTest.h
index 619879188a3b31..43499267b71136 100644
--- a/libc/test/src/math/smoke/HypotTest.h
+++ b/libc/test/src/math/smoke/HypotTest.h
@@ -13,7 +13,7 @@
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
 
-#include <math.h>
+#include "include/llvm-libc-macros/math-macros.h"
 
 template <typename T>
 class HypotTestTemplate : public LIBC_NAMESPACE::testing::Test {
diff --git a/libc/test/src/math/smoke/ModfTest.h b/libc/test/src/math/smoke/ModfTest.h
index d7e15a7ed68206..107963665b8357 100644
--- a/libc/test/src/math/smoke/ModfTest.h
+++ b/libc/test/src/math/smoke/ModfTest.h
@@ -11,7 +11,7 @@
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
 
-#include <math.h>
+#include "include/llvm-libc-macros/math-macros.h"
 
 template <typename T> class ModfTest : public LIBC_NAMESPACE::testing::Test {
 
diff --git a/libc/test/src/math/smoke/NextAfterTest.h b/libc/test/src/math/smoke/NextAfterTest.h
index dda86eeeb6e03c..23b3b15347407d 100644
--- a/libc/test/src/math/smoke/NextAfterTest.h
+++ b/libc/test/src/math/smoke/NextAfterTest.h
@@ -9,13 +9,13 @@
 #ifndef LLVM_LIBC_TEST_SRC_MATH_NEXTAFTERTEST_H
 #define LLVM_LIBC_TEST_SRC_MATH_NEXTAFTERTEST_H
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/CPP/bit.h"
 #include "src/__support/CPP/type_traits.h"
 #include "src/__support/FPUtil/BasicOperations.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
-#include <math.h>
 
 #define ASSERT_FP_EQ_WITH_EXCEPTION(result, expected, expected_exception)      \
   ASSERT_FP_EQ(result, expected);                                              \
diff --git a/libc/test/src/math/smoke/NextDownTest.h b/libc/test/src/math/smoke/NextDownTest.h
new file mode 100644
index 00000000000000..c678ab1db1deff
--- /dev/null
+++ b/libc/test/src/math/smoke/NextDownTest.h
@@ -0,0 +1,43 @@
+//===-- Utility class to test different flavors of nextdown -----*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_TEST_SRC_MATH_NEXTDOWNTEST_H
+#define LLVM_LIBC_TEST_SRC_MATH_NEXTDOWNTEST_H
+
+#include "test/UnitTest/FPMatcher.h"
+#include "test/UnitTest/Test.h"
+
+template <typename T>
+class NextDownTestTemplate : public LIBC_NAMESPACE::testing::Test {
+
+  DECLARE_SPECIAL_CONSTANTS(T)
+
+public:
+  typedef T (*NextDownFunc)(T);
+
+  void testNaN(NextDownFunc func) { ASSERT_FP_EQ(func(aNaN), aNaN); }
+
+  void testBoundaries(NextDownFunc func) {
+    ASSERT_FP_EQ(zero, func(min_denormal));
+
+    ASSERT_FP_EQ(neg_min_denormal, func(zero));
+    ASSERT_FP_EQ(neg_min_denormal, func(neg_zero));
+
+    ASSERT_FP_EQ(neg_max_normal, func(neg_max_normal));
+    ASSERT_FP_EQ(neg_inf, func(neg_inf));
+
+    ASSERT_FP_EQ(max_normal, func(inf));
+  }
+};
+
+#define LIST_NEXTDOWN_TESTS(T, func)                                           \
+  using LlvmLibcNextDownTest = NextDownTestTemplate<T>;                        \
+  TEST_F(LlvmLibcNextDownTest, TestNaN) { testNaN(&func); }                    \
+  TEST_F(LlvmLibcNextDownTest, TestBoundaries) { testBoundaries(&func); }
+
+#endif // LLVM_LIBC_TEST_SRC_MATH_NEXTDOWNTEST_H
diff --git a/libc/test/src/math/smoke/NextTowardTest.h b/libc/test/src/math/smoke/NextTowardTest.h
index 42a9a56aeb0a72..caf98262c5d157 100644
--- a/libc/test/src/math/smoke/NextTowardTest.h
+++ b/libc/test/src/math/smoke/NextTowardTest.h
@@ -9,6 +9,7 @@
 #ifndef LLVM_LIBC_TEST_SRC_MATH_NEXTTOWARDTEST_H
 #define LLVM_LIBC_TEST_SRC_MATH_NEXTTOWARDTEST_H
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/CPP/bit.h"
 #include "src/__support/CPP/type_traits.h"
 #include "src/__support/FPUtil/BasicOperations.h"
@@ -16,7 +17,6 @@
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
 #include <fenv.h>
-#include <math.h>
 
 #define ASSERT_FP_EQ_WITH_EXCEPTION(result, expected, expected_exception)      \
   ASSERT_FP_EQ(result, expected);                                              \
diff --git a/libc/test/src/math/smoke/NextUpTest.h b/libc/test/src/math/smoke/NextUpTest.h
new file mode 100644
index 00000000000000..ebbdb5c73def9e
--- /dev/null
+++ b/libc/test/src/math/smoke/NextUpTest.h
@@ -0,0 +1,43 @@
+//===-- Utility class to test different flavors of nextup -------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_TEST_SRC_MATH_NEXTUPTEST_H
+#define LLVM_LIBC_TEST_SRC_MATH_NEXTUPTEST_H
+
+#include "test/UnitTest/FPMatcher.h"
+#include "test/UnitTest/Test.h"
+
+template <typename T>
+class NextUpTestTemplate : public LIBC_NAMESPACE::testing::Test {
+
+  DECLARE_SPECIAL_CONSTANTS(T)
+
+public:
+  typedef T (*NextUpFunc)(T);
+
+  void testNaN(NextUpFunc func) { ASSERT_FP_EQ(func(aNaN), aNaN); }
+
+  void testBoundaries(NextUpFunc func) {
+    ASSERT_FP_EQ(neg_zero, func(neg_min_denormal));
+
+    ASSERT_FP_EQ(min_denormal, func(zero));
+    ASSERT_FP_EQ(min_denormal, func(neg_zero));
+
+    ASSERT_FP_EQ(max_normal, func(max_normal));
+    ASSERT_FP_EQ(inf, func(inf));
+
+    ASSERT_FP_EQ(neg_max_normal, func(neg_inf));
+  }
+};
+
+#define LIST_NEXTUP_TESTS(T, func)                                             \
+  using LlvmLibcNextUpTest = NextUpTestTemplate<T>;                            \
+  TEST_F(LlvmLibcNextUpTest, TestNaN) { testNaN(&func); }                      \
+  TEST_F(LlvmLibcNextUpTest, TestBoundaries) { testBoundaries(&func); }
+
+#endif // LLVM_LIBC_TEST_SRC_MATH_NEXTUPTEST_H
diff --git a/libc/test/src/math/smoke/RIntTest.h b/libc/test/src/math/smoke/RIntTest.h
index 4c90dffa39cb15..903fbe9ce34300 100644
--- a/libc/test/src/math/smoke/RIntTest.h
+++ b/libc/test/src/math/smoke/RIntTest.h
@@ -14,8 +14,8 @@
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include <fenv.h>
-#include <math.h>
 #include <stdio.h>
 
 static constexpr int ROUNDING_MODES[4] = {FE_UPWARD, FE_DOWNWARD, FE_TOWARDZERO,
diff --git a/libc/test/src/math/smoke/RemQuoTest.h b/libc/test/src/math/smoke/RemQuoTest.h
index 87551faeda9cf1..a9fa405b270001 100644
--- a/libc/test/src/math/smoke/RemQuoTest.h
+++ b/libc/test/src/math/smoke/RemQuoTest.h
@@ -9,11 +9,11 @@
 #ifndef LLVM_LIBC_TEST_SRC_MATH_REMQUOTEST_H
 #define LLVM_LIBC_TEST_SRC_MATH_REMQUOTEST_H
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/BasicOperations.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
-#include <math.h>
 
 template <typename T>
 class RemQuoTestTemplate : public LIBC_NAMESPACE::testing::Test {
diff --git a/libc/test/src/math/smoke/RoundTest.h b/libc/test/src/math/smoke/RoundTest.h
index d2a5906b1e29ea..8cf96f4569034a 100644
--- a/libc/test/src/math/smoke/RoundTest.h
+++ b/libc/test/src/math/smoke/RoundTest.h
@@ -12,7 +12,7 @@
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
 
-#include <math.h>
+#include "include/llvm-libc-macros/math-macros.h"
 
 template <typename T> class RoundTest : public LIBC_NAMESPACE::testing::Test {
 
diff --git a/libc/test/src/math/smoke/RoundToIntegerTest.h b/libc/test/src/math/smoke/RoundToIntegerTest.h
index e86533ca09e178..1b5135d016cc41 100644
--- a/libc/test/src/math/smoke/RoundToIntegerTest.h
+++ b/libc/test/src/math/smoke/RoundToIntegerTest.h
@@ -14,8 +14,8 @@
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include <errno.h>
-#include <math.h>
 
 static constexpr int ROUNDING_MODES[4] = {FE_UPWARD, FE_DOWNWARD, FE_TOWARDZERO,
                                           FE_TONEAREST};
diff --git a/libc/test/src/math/smoke/SqrtTest.h b/libc/test/src/math/smoke/SqrtTest.h
index edb6e74236e316..eea5dc1534e0bb 100644
--- a/libc/test/src/math/smoke/SqrtTest.h
+++ b/libc/test/src/math/smoke/SqrtTest.h
@@ -10,7 +10,7 @@
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
 
-#include <math.h>
+#include "include/llvm-libc-macros/math-macros.h"
 
 template <typename T> class SqrtTest : public LIBC_NAMESPACE::testing::Test {
 
diff --git a/libc/test/src/math/smoke/TruncTest.h b/libc/test/src/math/smoke/TruncTest.h
index 71b1ab9df3f08b..5612d27fef21d8 100644
--- a/libc/test/src/math/smoke/TruncTest.h
+++ b/libc/test/src/math/smoke/TruncTest.h
@@ -12,7 +12,7 @@
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
 
-#include <math.h>
+#include "include/llvm-libc-macros/math-macros.h"
 
 template <typename T> class TruncTest : public LIBC_NAMESPACE::testing::Test {
 
diff --git a/libc/test/src/math/smoke/acosf_test.cpp b/libc/test/src/math/smoke/acosf_test.cpp
index 864c9ea4d3175d..573a2c39492f0c 100644
--- a/libc/test/src/math/smoke/acosf_test.cpp
+++ b/libc/test/src/math/smoke/acosf_test.cpp
@@ -6,12 +6,12 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/errno/libc_errno.h"
 #include "src/math/acosf.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
-#include <math.h>
 
 #include <errno.h>
 #include <stdint.h>
diff --git a/libc/test/src/math/smoke/acoshf_test.cpp b/libc/test/src/math/smoke/acoshf_test.cpp
index b3ba740c2024d3..f561f23eb99ad3 100644
--- a/libc/test/src/math/smoke/acoshf_test.cpp
+++ b/libc/test/src/math/smoke/acoshf_test.cpp
@@ -6,12 +6,12 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/errno/libc_errno.h"
 #include "src/math/acoshf.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
-#include <math.h>
 
 #include <errno.h>
 #include <stdint.h>
diff --git a/libc/test/src/math/smoke/asinf_test.cpp b/libc/test/src/math/smoke/asinf_test.cpp
index e6afb23a397fe1..39d25e72c143b9 100644
--- a/libc/test/src/math/smoke/asinf_test.cpp
+++ b/libc/test/src/math/smoke/asinf_test.cpp
@@ -6,12 +6,12 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/errno/libc_errno.h"
 #include "src/math/asinf.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
-#include <math.h>
 
 #include <errno.h>
 #include <stdint.h>
diff --git a/libc/test/src/math/smoke/asinhf_test.cpp b/libc/test/src/math/smoke/asinhf_test.cpp
index 7a520cdc050a22..9637bfa5394885 100644
--- a/libc/test/src/math/smoke/asinhf_test.cpp
+++ b/libc/test/src/math/smoke/asinhf_test.cpp
@@ -6,12 +6,12 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/errno/libc_errno.h"
 #include "src/math/asinhf.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
-#include <math.h>
 
 #include <errno.h>
 #include <stdint.h>
diff --git a/libc/test/src/math/smoke/atanf_test.cpp b/libc/test/src/math/smoke/atanf_test.cpp
index 8cc1b418f869c2..abd9835d38a057 100644
--- a/libc/test/src/math/smoke/atanf_test.cpp
+++ b/libc/test/src/math/smoke/atanf_test.cpp
@@ -6,12 +6,12 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/errno/libc_errno.h"
 #include "src/math/atanf.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
-#include <math.h>
 
 #include <errno.h>
 #include <stdint.h>
diff --git a/libc/test/src/math/smoke/atanhf_test.cpp b/libc/test/src/math/smoke/atanhf_test.cpp
index e1d6d2532d39e3..df0746e0c9c3aa 100644
--- a/libc/test/src/math/smoke/atanhf_test.cpp
+++ b/libc/test/src/math/smoke/atanhf_test.cpp
@@ -6,12 +6,12 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/errno/libc_errno.h"
 #include "src/math/atanhf.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
-#include <math.h>
 
 #include <errno.h>
 #include <stdint.h>
diff --git a/libc/test/src/math/smoke/cosf_test.cpp b/libc/test/src/math/smoke/cosf_test.cpp
index 539c428eadf113..62132990ed5473 100644
--- a/libc/test/src/math/smoke/cosf_test.cpp
+++ b/libc/test/src/math/smoke/cosf_test.cpp
@@ -6,12 +6,12 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/errno/libc_errno.h"
 #include "src/math/cosf.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
-#include <math.h>
 
 #include <errno.h>
 #include <stdint.h>
diff --git a/libc/test/src/math/smoke/coshf_test.cpp b/libc/test/src/math/smoke/coshf_test.cpp
index e38802f9364743..9d7ef505ae7497 100644
--- a/libc/test/src/math/smoke/coshf_test.cpp
+++ b/libc/test/src/math/smoke/coshf_test.cpp
@@ -6,13 +6,13 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/CPP/array.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/errno/libc_errno.h"
 #include "src/math/coshf.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
-#include <math.h>
 
 #include <errno.h>
 #include <stdint.h>
diff --git a/libc/test/src/math/smoke/erff_test.cpp b/libc/test/src/math/smoke/erff_test.cpp
index 843cc7f4c8c04d..24778f8d653add 100644
--- a/libc/test/src/math/smoke/erff_test.cpp
+++ b/libc/test/src/math/smoke/erff_test.cpp
@@ -6,11 +6,11 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/math/erff.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
-#include <math.h>
 
 #include <errno.h>
 #include <stdint.h>
diff --git a/libc/test/src/math/smoke/exp10_test.cpp b/libc/test/src/math/smoke/exp10_test.cpp
index daa7b5b48842bc..fffffeb4c78abf 100644
--- a/libc/test/src/math/smoke/exp10_test.cpp
+++ b/libc/test/src/math/smoke/exp10_test.cpp
@@ -6,12 +6,12 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/errno/libc_errno.h"
 #include "src/math/exp10.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
-#include <math.h>
 
 #include <errno.h>
 #include <stdint.h>
diff --git a/libc/test/src/math/smoke/exp10f_test.cpp b/libc/test/src/math/smoke/exp10f_test.cpp
index f6533e983dacd8..c0dcc125033248 100644
--- a/libc/test/src/math/smoke/exp10f_test.cpp
+++ b/libc/test/src/math/smoke/exp10f_test.cpp
@@ -6,12 +6,12 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/errno/libc_errno.h"
 #include "src/math/exp10f.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
-#include <math.h>
 
 #include <stdint.h>
 
diff --git a/libc/test/src/math/smoke/exp2_test.cpp b/libc/test/src/math/smoke/exp2_test.cpp
index 7bcc2a3cb6c742..d362d32f678b25 100644
--- a/libc/test/src/math/smoke/exp2_test.cpp
+++ b/libc/test/src/math/smoke/exp2_test.cpp
@@ -6,12 +6,12 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/errno/libc_errno.h"
 #include "src/math/exp2.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
-#include <math.h>
 
 #include <errno.h>
 #include <stdint.h>
diff --git a/libc/test/src/math/smoke/exp2f_test.cpp b/libc/test/src/math/smoke/exp2f_test.cpp
index 7baab630a4cbc4..e2989a6ec4d8af 100644
--- a/libc/test/src/math/smoke/exp2f_test.cpp
+++ b/libc/test/src/math/smoke/exp2f_test.cpp
@@ -6,13 +6,13 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/__support/macros/properties/cpu_features.h" // LIBC_TARGET_CPU_HAS_FMA
 #include "src/errno/libc_errno.h"
 #include "src/math/exp2f.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
-#include <math.h>
 
 #include <stdint.h>
 
diff --git a/libc/test/src/math/smoke/exp_test.cpp b/libc/test/src/math/smoke/exp_test.cpp
index 455cb2ea135328..a2becc74f526f9 100644
--- a/libc/test/src/math/smoke/exp_test.cpp
+++ b/libc/test/src/math/smoke/exp_test.cpp
@@ -6,12 +6,12 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/errno/libc_errno.h"
 #include "src/math/exp.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
-#include <math.h>
 
 #include <errno.h>
 #include <stdint.h>
diff --git a/libc/test/src/math/smoke/expf_test.cpp b/libc/test/src/math/smoke/expf_test.cpp
index 2968704f83e355..42710c5fa404e9 100644
--- a/libc/test/src/math/smoke/expf_test.cpp
+++ b/libc/test/src/math/smoke/expf_test.cpp
@@ -6,12 +6,12 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/errno/libc_errno.h"
 #include "src/math/expf.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
-#include <math.h>
 
 #include <stdint.h>
 
diff --git a/libc/test/src/math/smoke/expm1_test.cpp b/libc/test/src/math/smoke/expm1_test.cpp
index 4581060075e32c..07963ec2d34c8d 100644
--- a/libc/test/src/math/smoke/expm1_test.cpp
+++ b/libc/test/src/math/smoke/expm1_test.cpp
@@ -6,12 +6,12 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/errno/libc_errno.h"
 #include "src/math/expm1.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
-#include <math.h>
 
 #include <errno.h>
 #include <stdint.h>
diff --git a/libc/test/src/math/smoke/expm1f_test.cpp b/libc/test/src/math/smoke/expm1f_test.cpp
index 4ef8d50ba551a7..82e0b154635044 100644
--- a/libc/test/src/math/smoke/expm1f_test.cpp
+++ b/libc/test/src/math/smoke/expm1f_test.cpp
@@ -6,12 +6,12 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/errno/libc_errno.h"
 #include "src/math/expm1f.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
-#include <math.h>
 
 #include <stdint.h>
 
diff --git a/libc/test/src/math/smoke/log10_test.cpp b/libc/test/src/math/smoke/log10_test.cpp
index c1658a3d5c9653..36d75341976482 100644
--- a/libc/test/src/math/smoke/log10_test.cpp
+++ b/libc/test/src/math/smoke/log10_test.cpp
@@ -6,12 +6,12 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/errno/libc_errno.h"
 #include "src/math/log10.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
-#include <math.h>
 
 #include <errno.h>
 #include <stdint.h>
diff --git a/libc/test/src/math/smoke/log10f_test.cpp b/libc/test/src/math/smoke/log10f_test.cpp
index 53950233cabf5b..53e699417fb7c0 100644
--- a/libc/test/src/math/smoke/log10f_test.cpp
+++ b/libc/test/src/math/smoke/log10f_test.cpp
@@ -6,11 +6,11 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/math/log10f.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
-#include <math.h>
 
 #include <errno.h>
 #include <stdint.h>
diff --git a/libc/test/src/math/smoke/log1p_test.cpp b/libc/test/src/math/smoke/log1p_test.cpp
index e1966c28397c08..5fe9c60f90abff 100644
--- a/libc/test/src/math/smoke/log1p_test.cpp
+++ b/libc/test/src/math/smoke/log1p_test.cpp
@@ -6,12 +6,12 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/errno/libc_errno.h"
 #include "src/math/log1p.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
-#include <math.h>
 
 #include <errno.h>
 #include <stdint.h>
diff --git a/libc/test/src/math/smoke/log1pf_test.cpp b/libc/test/src/math/smoke/log1pf_test.cpp
index 377a46adef9c79..e2fb2f057d2eb7 100644
--- a/libc/test/src/math/smoke/log1pf_test.cpp
+++ b/libc/test/src/math/smoke/log1pf_test.cpp
@@ -6,12 +6,12 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/errno/libc_errno.h"
 #include "src/math/log1pf.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
-#include <math.h>
 
 #include <errno.h>
 #include <stdint.h>
diff --git a/libc/test/src/math/smoke/log2_test.cpp b/libc/test/src/math/smoke/log2_test.cpp
index 826c51eb8c1b09..fbeba9527bcb7b 100644
--- a/libc/test/src/math/smoke/log2_test.cpp
+++ b/libc/test/src/math/smoke/log2_test.cpp
@@ -6,12 +6,12 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/errno/libc_errno.h"
 #include "src/math/log2.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
-#include <math.h>
 
 #include <errno.h>
 #include <stdint.h>
diff --git a/libc/test/src/math/smoke/log2f_test.cpp b/libc/test/src/math/smoke/log2f_test.cpp
index 2387ff59d70d9e..46906e78dcaf71 100644
--- a/libc/test/src/math/smoke/log2f_test.cpp
+++ b/libc/test/src/math/smoke/log2f_test.cpp
@@ -6,12 +6,12 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/errno/libc_errno.h"
 #include "src/math/log2f.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
-#include <math.h>
 
 #include <stdint.h>
 
diff --git a/libc/test/src/math/smoke/log_test.cpp b/libc/test/src/math/smoke/log_test.cpp
index 423b9d8bb818a7..b1e39059948001 100644
--- a/libc/test/src/math/smoke/log_test.cpp
+++ b/libc/test/src/math/smoke/log_test.cpp
@@ -6,12 +6,12 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/errno/libc_errno.h"
 #include "src/math/log.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
-#include <math.h>
 
 #include <errno.h>
 #include <stdint.h>
diff --git a/libc/test/src/math/smoke/logf_test.cpp b/libc/test/src/math/smoke/logf_test.cpp
index 21b39ee6896dd2..97b6bdde307b3d 100644
--- a/libc/test/src/math/smoke/logf_test.cpp
+++ b/libc/test/src/math/smoke/logf_test.cpp
@@ -6,11 +6,11 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/math/logf.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
-#include <math.h>
 
 #include <stdint.h>
 
diff --git a/libc/test/src/math/smoke/nanf128_test.cpp b/libc/test/src/math/smoke/nanf128_test.cpp
new file mode 100644
index 00000000000000..2a9f57de5b43bc
--- /dev/null
+++ b/libc/test/src/math/smoke/nanf128_test.cpp
@@ -0,0 +1,60 @@
+//===-- Unittests for nanf128 ---------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/__support/UInt128.h"
+#include "src/math/nanf128.h"
+#include "test/UnitTest/FPMatcher.h"
+#include "test/UnitTest/Test.h"
+
+class LlvmLibcNanf128Test : public LIBC_NAMESPACE::testing::Test {
+public:
+  using FPBits128 = LIBC_NAMESPACE::fputil::FPBits<float128>;
+  using StorageType = FPBits128::StorageType;
+
+  const UInt128 QUIET_NAN = FPBits128::quiet_nan().uintval();
+  const UInt128 ONE = UInt128(1);
+
+  void run_test(const char *input_str, StorageType bits) {
+    float128 result = LIBC_NAMESPACE::nanf128(input_str);
+    auto actual_fp = FPBits128(result);
+    auto expected_fp = FPBits128(bits);
+    EXPECT_EQ(actual_fp.uintval(), expected_fp.uintval());
+  };
+};
+
+TEST_F(LlvmLibcNanf128Test, NCharSeq) {
+  run_test("", QUIET_NAN);
+  run_test("1234", QUIET_NAN | 1234);
+  run_test("0x1234", QUIET_NAN | 0x1234);
+  run_test("2417851639229258349412352", QUIET_NAN | (ONE << 81));
+  run_test("0x200000000000000000000", QUIET_NAN | (ONE << 81));
+  run_test("10384593717069655257060992658440191",
+           QUIET_NAN | FPBits128::SIG_MASK);
+  run_test("0x1ffffffffffffffffffffffffffff", QUIET_NAN | FPBits128::SIG_MASK);
+  run_test("10384593717069655257060992658440192", QUIET_NAN);
+  run_test("0x20000000000000000000000000000", QUIET_NAN);
+  run_test("1a", QUIET_NAN);
+  run_test("10000000000000000000000000000000000000000000000000", QUIET_NAN);
+}
+
+TEST_F(LlvmLibcNanf128Test, RandomString) {
+  run_test(" 1234", QUIET_NAN);
+  run_test("-1234", QUIET_NAN);
+  run_test("asd&f", QUIET_NAN);
+  run_test("123 ", QUIET_NAN);
+  run_test("1234567890qwertyuiopasdfghjklzxcvbnmQWERTYUIOPASDFGHJKLZXCVBNM_",
+           QUIET_NAN);
+}
+
+#ifndef LIBC_HAVE_ADDRESS_SANITIZER
+#include <signal.h>
+TEST_F(LlvmLibcNanf128Test, InvalidInput) {
+  EXPECT_DEATH([] { LIBC_NAMESPACE::nanf128(nullptr); }, WITH_SIGNAL(SIGSEGV));
+}
+#endif // LIBC_HAVE_ADDRESS_SANITIZER
diff --git a/libc/test/src/math/smoke/nextdown_test.cpp b/libc/test/src/math/smoke/nextdown_test.cpp
new file mode 100644
index 00000000000000..6b0f5c6c5a6aa6
--- /dev/null
+++ b/libc/test/src/math/smoke/nextdown_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for nextdown --------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "NextDownTest.h"
+
+#include "src/math/nextdown.h"
+
+LIST_NEXTDOWN_TESTS(double, LIBC_NAMESPACE::nextdown)
diff --git a/libc/test/src/math/smoke/nextdownf128_test.cpp b/libc/test/src/math/smoke/nextdownf128_test.cpp
new file mode 100644
index 00000000000000..932a8b36a59e47
--- /dev/null
+++ b/libc/test/src/math/smoke/nextdownf128_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for nextdownf128 ----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "NextDownTest.h"
+
+#include "src/math/nextdownf128.h"
+
+LIST_NEXTDOWN_TESTS(float128, LIBC_NAMESPACE::nextdownf128)
diff --git a/libc/test/src/math/smoke/nextdownf_test.cpp b/libc/test/src/math/smoke/nextdownf_test.cpp
new file mode 100644
index 00000000000000..3c05c22d82049f
--- /dev/null
+++ b/libc/test/src/math/smoke/nextdownf_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for nextdownf -------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "NextDownTest.h"
+
+#include "src/math/nextdownf.h"
+
+LIST_NEXTDOWN_TESTS(float, LIBC_NAMESPACE::nextdownf)
diff --git a/libc/test/src/math/smoke/nextdownl_test.cpp b/libc/test/src/math/smoke/nextdownl_test.cpp
new file mode 100644
index 00000000000000..f1785eb58ce512
--- /dev/null
+++ b/libc/test/src/math/smoke/nextdownl_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for nextdownl -------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "NextDownTest.h"
+
+#include "src/math/nextdownl.h"
+
+LIST_NEXTDOWN_TESTS(long double, LIBC_NAMESPACE::nextdownl)
diff --git a/libc/test/src/math/smoke/nextup_test.cpp b/libc/test/src/math/smoke/nextup_test.cpp
new file mode 100644
index 00000000000000..04c73ac9492f34
--- /dev/null
+++ b/libc/test/src/math/smoke/nextup_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for nextup ----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "NextUpTest.h"
+
+#include "src/math/nextup.h"
+
+LIST_NEXTUP_TESTS(double, LIBC_NAMESPACE::nextup)
diff --git a/libc/test/src/math/smoke/nextupf128_test.cpp b/libc/test/src/math/smoke/nextupf128_test.cpp
new file mode 100644
index 00000000000000..ddd385a7b159b4
--- /dev/null
+++ b/libc/test/src/math/smoke/nextupf128_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for nextupf128 ------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "NextUpTest.h"
+
+#include "src/math/nextupf128.h"
+
+LIST_NEXTUP_TESTS(float128, LIBC_NAMESPACE::nextupf128)
diff --git a/libc/test/src/math/smoke/nextupf_test.cpp b/libc/test/src/math/smoke/nextupf_test.cpp
new file mode 100644
index 00000000000000..df73bee0117111
--- /dev/null
+++ b/libc/test/src/math/smoke/nextupf_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for nextupf ---------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "NextUpTest.h"
+
+#include "src/math/nextupf.h"
+
+LIST_NEXTUP_TESTS(float, LIBC_NAMESPACE::nextupf)
diff --git a/libc/test/src/math/smoke/nextupl_test.cpp b/libc/test/src/math/smoke/nextupl_test.cpp
new file mode 100644
index 00000000000000..50f765633c2a50
--- /dev/null
+++ b/libc/test/src/math/smoke/nextupl_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for nextupl ---------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "NextUpTest.h"
+
+#include "src/math/nextupl.h"
+
+LIST_NEXTUP_TESTS(long double, LIBC_NAMESPACE::nextupl)
diff --git a/libc/test/src/math/smoke/powf_test.cpp b/libc/test/src/math/smoke/powf_test.cpp
index 1867dde0ac3bea..e9de1554ec6148 100644
--- a/libc/test/src/math/smoke/powf_test.cpp
+++ b/libc/test/src/math/smoke/powf_test.cpp
@@ -6,11 +6,11 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/math/powf.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
-#include <math.h>
 
 #include <errno.h>
 #include <stdint.h>
diff --git a/libc/test/src/math/smoke/sincosf_test.cpp b/libc/test/src/math/smoke/sincosf_test.cpp
index a1447ad57e1162..5952b20fc5bff3 100644
--- a/libc/test/src/math/smoke/sincosf_test.cpp
+++ b/libc/test/src/math/smoke/sincosf_test.cpp
@@ -6,12 +6,12 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/errno/libc_errno.h"
 #include "src/math/sincosf.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
-#include <math.h>
 
 #include <errno.h>
 #include <stdint.h>
diff --git a/libc/test/src/math/smoke/sinf_test.cpp b/libc/test/src/math/smoke/sinf_test.cpp
index e51726646bc686..94508950418742 100644
--- a/libc/test/src/math/smoke/sinf_test.cpp
+++ b/libc/test/src/math/smoke/sinf_test.cpp
@@ -6,12 +6,12 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/errno/libc_errno.h"
 #include "src/math/sinf.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
-#include <math.h>
 
 #include <errno.h>
 #include <stdint.h>
diff --git a/libc/test/src/math/smoke/sinhf_test.cpp b/libc/test/src/math/smoke/sinhf_test.cpp
index 23334293b73fe4..0f005f752e6982 100644
--- a/libc/test/src/math/smoke/sinhf_test.cpp
+++ b/libc/test/src/math/smoke/sinhf_test.cpp
@@ -6,13 +6,13 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/CPP/array.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/errno/libc_errno.h"
 #include "src/math/sinhf.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
-#include <math.h>
 
 #include <errno.h>
 #include <stdint.h>
diff --git a/libc/test/src/math/smoke/tanf_test.cpp b/libc/test/src/math/smoke/tanf_test.cpp
index 53a153254b3111..68bf493f7e8227 100644
--- a/libc/test/src/math/smoke/tanf_test.cpp
+++ b/libc/test/src/math/smoke/tanf_test.cpp
@@ -6,12 +6,12 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/errno/libc_errno.h"
 #include "src/math/tanf.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
-#include <math.h>
 
 #include <errno.h>
 #include <stdint.h>
diff --git a/libc/test/src/math/smoke/tanhf_test.cpp b/libc/test/src/math/smoke/tanhf_test.cpp
index 7cb80dca7919eb..f1ce8b40d43ac2 100644
--- a/libc/test/src/math/smoke/tanhf_test.cpp
+++ b/libc/test/src/math/smoke/tanhf_test.cpp
@@ -6,12 +6,12 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/errno/libc_errno.h"
 #include "src/math/tanhf.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
-#include <math.h>
 
 #include <errno.h>
 #include <stdint.h>
diff --git a/libc/test/src/math/tan_test.cpp b/libc/test/src/math/tan_test.cpp
index 9cdc7c4abd32cc..85174db9364e36 100644
--- a/libc/test/src/math/tan_test.cpp
+++ b/libc/test/src/math/tan_test.cpp
@@ -11,7 +11,7 @@
 #include "test/UnitTest/Test.h"
 #include "utils/MPFRWrapper/MPFRUtils.h"
 
-#include <math.h>
+#include "include/llvm-libc-macros/math-macros.h"
 
 using LlvmLibcTanTest = LIBC_NAMESPACE::testing::FPTest<double>;
 
diff --git a/libc/test/src/math/tanf_test.cpp b/libc/test/src/math/tanf_test.cpp
index c8d9d52e3f6954..d40bc44d6442f9 100644
--- a/libc/test/src/math/tanf_test.cpp
+++ b/libc/test/src/math/tanf_test.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/errno/libc_errno.h"
 #include "src/math/tanf.h"
@@ -13,7 +14,6 @@
 #include "test/UnitTest/Test.h"
 #include "test/src/math/sdcomp26094.h"
 #include "utils/MPFRWrapper/MPFRUtils.h"
-#include <math.h>
 
 #include <errno.h>
 #include <stdint.h>
diff --git a/libc/test/src/math/tanhf_test.cpp b/libc/test/src/math/tanhf_test.cpp
index 28da7ffbeddd18..ef272b17d68cad 100644
--- a/libc/test/src/math/tanhf_test.cpp
+++ b/libc/test/src/math/tanhf_test.cpp
@@ -6,13 +6,13 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/llvm-libc-macros/math-macros.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/errno/libc_errno.h"
 #include "src/math/tanhf.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
 #include "utils/MPFRWrapper/MPFRUtils.h"
-#include <math.h>
 
 #include <errno.h>
 #include <stdint.h>
diff --git a/libc/test/src/stdio/CMakeLists.txt b/libc/test/src/stdio/CMakeLists.txt
index 6e1c86e070a823..3ccce16a76a2d5 100644
--- a/libc/test/src/stdio/CMakeLists.txt
+++ b/libc/test/src/stdio/CMakeLists.txt
@@ -14,6 +14,7 @@ add_libc_test(
     libc.src.stdio.feof
     libc.src.stdio.ferror
     libc.src.stdio.fflush
+    libc.src.stdio.fileno
     libc.src.stdio.fopen
     libc.src.stdio.fputs
     libc.src.stdio.fread
diff --git a/libc/test/src/stdio/fileop_test.cpp b/libc/test/src/stdio/fileop_test.cpp
index f5dbc49818390d..0fbe19cf08d837 100644
--- a/libc/test/src/stdio/fileop_test.cpp
+++ b/libc/test/src/stdio/fileop_test.cpp
@@ -11,6 +11,7 @@
 #include "src/stdio/feof.h"
 #include "src/stdio/ferror.h"
 #include "src/stdio/fflush.h"
+#include "src/stdio/fileno.h"
 #include "src/stdio/fopen.h"
 #include "src/stdio/fputs.h"
 #include "src/stdio/fread.h"
@@ -30,6 +31,7 @@ TEST(LlvmLibcFILETest, SimpleFileOperations) {
   constexpr char FILENAME[] = "testdata/simple_operations.test";
   ::FILE *file = LIBC_NAMESPACE::fopen(FILENAME, "w");
   ASSERT_FALSE(file == nullptr);
+  ASSERT_EQ(LIBC_NAMESPACE::fileno(file), 3);
   constexpr char CONTENT[] = "1234567890987654321";
   ASSERT_EQ(sizeof(CONTENT) - 1,
             LIBC_NAMESPACE::fwrite(CONTENT, 1, sizeof(CONTENT) - 1, file));
diff --git a/libc/test/src/stdio/sprintf_test.cpp b/libc/test/src/stdio/sprintf_test.cpp
index ab8bdb23b1187c..8dde95d02a96dc 100644
--- a/libc/test/src/stdio/sprintf_test.cpp
+++ b/libc/test/src/stdio/sprintf_test.cpp
@@ -121,8 +121,8 @@ TEST(LlvmLibcSPrintfTest, StringConv) {
 
 #ifndef LIBC_COPT_PRINTF_NO_NULLPTR_CHECKS
   written = LIBC_NAMESPACE::sprintf(buff, "%s", nullptr);
-  EXPECT_EQ(written, 4);
-  ASSERT_STREQ(buff, "null");
+  EXPECT_EQ(written, 6);
+  ASSERT_STREQ(buff, "(null)");
 #endif // LIBC_COPT_PRINTF_NO_NULLPTR_CHECKS
 }
 
diff --git a/libc/test/src/sys/mman/linux/CMakeLists.txt b/libc/test/src/sys/mman/linux/CMakeLists.txt
index 6f7fc34d8d3ce3..0762a2d846a8e0 100644
--- a/libc/test/src/sys/mman/linux/CMakeLists.txt
+++ b/libc/test/src/sys/mman/linux/CMakeLists.txt
@@ -127,3 +127,23 @@ add_libc_unittest(
     libc.src.unistd.sysconf
     libc.test.UnitTest.ErrnoSetterMatcher
 )
+
+add_libc_unittest(
+  shm_test
+  SUITE
+    libc_sys_mman_unittests
+  SRCS
+    shm_test.cpp
+  DEPENDS
+    libc.include.sys_mman
+    libc.include.sys_syscall
+    libc.src.errno.errno
+    libc.src.sys.mman.shm_open
+    libc.src.sys.mman.shm_unlink
+    libc.src.sys.mman.mmap
+    libc.src.sys.mman.munmap
+    libc.src.unistd.ftruncate
+    libc.src.unistd.close
+    libc.src.__support.OSUtil.osutil
+    libc.test.UnitTest.ErrnoSetterMatcher
+)
diff --git a/libc/test/src/sys/mman/linux/shm_test.cpp b/libc/test/src/sys/mman/linux/shm_test.cpp
new file mode 100644
index 00000000000000..3b1a2aa33b56ad
--- /dev/null
+++ b/libc/test/src/sys/mman/linux/shm_test.cpp
@@ -0,0 +1,77 @@
+//===-- Unittests for shm_open/shm_unlink ---------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/OSUtil/syscall.h"
+#include "src/sys/mman/mmap.h"
+#include "src/sys/mman/munmap.h"
+#include "src/sys/mman/shm_open.h"
+#include "src/sys/mman/shm_unlink.h"
+#include "src/unistd/close.h"
+#include "src/unistd/ftruncate.h"
+#include "test/UnitTest/ErrnoSetterMatcher.h"
+#include "test/UnitTest/LibcTest.h"
+#include <asm-generic/fcntl.h>
+#include <sys/syscall.h>
+
+using namespace LIBC_NAMESPACE::testing::ErrnoSetterMatcher;
+// since shm_open/shm_unlink are wrappers around open/unlink, we only focus on
+// testing basic cases and name conversions.
+
+TEST(LlvmLibcShmTest, Basic) {
+  const char *name = "/test_shm_open";
+  int fd;
+  ASSERT_THAT(fd = LIBC_NAMESPACE::shm_open(name, O_CREAT | O_RDWR, 0666),
+              returns(GE(0)).with_errno(EQ(0)));
+
+  // check that FD_CLOEXEC is set by default.
+  // TODO: use fcntl when implemented.
+  // https://github.com/llvm/llvm-project/issues/84968
+  long flag = LIBC_NAMESPACE::syscall_impl(SYS_fcntl, fd, F_GETFD);
+  ASSERT_GE(static_cast<int>(flag), 0);
+  EXPECT_NE(static_cast<int>(flag) & FD_CLOEXEC, 0);
+
+  // allocate space using ftruncate
+  ASSERT_THAT(LIBC_NAMESPACE::ftruncate(fd, 4096), Succeeds());
+  // map the shared memory
+  void *addr = LIBC_NAMESPACE::mmap(nullptr, 4096, PROT_READ | PROT_WRITE,
+                                    MAP_SHARED, fd, 0);
+  ASSERT_NE(addr, MAP_FAILED);
+  // just write random data to the shared memory
+  char data[] = "Despite its name, LLVM has little to do with traditional "
+                "virtual machines.";
+  for (size_t i = 0; i < sizeof(data); ++i)
+    static_cast<char *>(addr)[i] = data[i];
+
+  // close fd does not affect the mapping
+  ASSERT_THAT(LIBC_NAMESPACE::close(fd), Succeeds());
+  for (size_t i = 0; i < sizeof(data); ++i)
+    EXPECT_EQ(static_cast<char *>(addr)[i], data[i]);
+
+  // unmap the shared memory
+  ASSERT_THAT(LIBC_NAMESPACE::munmap(addr, 4096), Succeeds());
+  // remove the shared memory
+  ASSERT_THAT(LIBC_NAMESPACE::shm_unlink(name), Succeeds());
+}
+
+TEST(LlvmLibcShmTest, NameConversion) {
+  const char *name = "////test_shm_open";
+  int fd;
+  ASSERT_THAT(fd = LIBC_NAMESPACE::shm_open(name, O_CREAT | O_RDWR, 0666),
+              returns(GE(0)).with_errno(EQ(0)));
+  ASSERT_THAT(LIBC_NAMESPACE::close(fd), Succeeds());
+  ASSERT_THAT(LIBC_NAMESPACE::shm_unlink(name), Succeeds());
+
+  ASSERT_THAT(LIBC_NAMESPACE::shm_open("/123/123", O_CREAT | O_RDWR, 0666),
+              Fails(EINVAL));
+
+  ASSERT_THAT(LIBC_NAMESPACE::shm_open("/.", O_CREAT | O_RDWR, 0666),
+              Fails(EINVAL));
+
+  ASSERT_THAT(LIBC_NAMESPACE::shm_open("/..", O_CREAT | O_RDWR, 0666),
+              Fails(EINVAL));
+}
diff --git a/libc/test/utils/FPUtil/x86_long_double_test.cpp b/libc/test/utils/FPUtil/x86_long_double_test.cpp
index bafbbe2a410759..3b140c6c026670 100644
--- a/libc/test/utils/FPUtil/x86_long_double_test.cpp
+++ b/libc/test/utils/FPUtil/x86_long_double_test.cpp
@@ -9,7 +9,7 @@
 #include "src/__support/FPUtil/FPBits.h"
 #include "test/UnitTest/Test.h"
 
-#include <math.h>
+#include "include/llvm-libc-macros/math-macros.h"
 
 using FPBits = LIBC_NAMESPACE::fputil::FPBits<long double>;
 
diff --git a/libc/utils/MPFRWrapper/MPFRUtils.cpp b/libc/utils/MPFRWrapper/MPFRUtils.cpp
index 7cc18cfb624389..2e1c44e6fd5da9 100644
--- a/libc/utils/MPFRWrapper/MPFRUtils.cpp
+++ b/libc/utils/MPFRWrapper/MPFRUtils.cpp
@@ -14,7 +14,7 @@
 #include "src/__support/FPUtil/fpbits_str.h"
 #include "test/UnitTest/FPMatcher.h"
 
-#include <cmath>
+#include "include/llvm-libc-macros/math-macros.h"
 #include <fenv.h>
 #include <memory>
 #include <stdint.h>
diff --git a/libclc/CMakeLists.txt b/libclc/CMakeLists.txt
index fa1d8e4adbcc4f..745b848fba4901 100644
--- a/libclc/CMakeLists.txt
+++ b/libclc/CMakeLists.txt
@@ -40,8 +40,7 @@ set( LIBCLC_MIN_LLVM "3.9.0" )
 set( LIBCLC_TARGETS_TO_BUILD "all"
     CACHE STRING "Semicolon-separated list of targets to build, or 'all'." )
 
-option( ENABLE_RUNTIME_SUBNORMAL "Enable runtime linking of subnormal support."
-OFF )
+option( ENABLE_RUNTIME_SUBNORMAL "Enable runtime linking of subnormal support." OFF )
 
 find_package(LLVM REQUIRED HINTS "${LLVM_CMAKE_DIR}")
 include(AddLLVM)
@@ -49,16 +48,16 @@ include(AddLLVM)
 message( "LLVM version: ${LLVM_PACKAGE_VERSION}" )
 
 if( ${LLVM_PACKAGE_VERSION} VERSION_LESS ${LIBCLC_MIN_LLVM} )
-	message( FATAL_ERROR "libclc needs at least LLVM ${LIBCLC_MIN_LLVM}" )
+  message( FATAL_ERROR "libclc needs at least LLVM ${LIBCLC_MIN_LLVM}" )
 endif()
 
 # mesa3d environment is only available since LLVM 4.0
 if( ${LLVM_PACKAGE_VERSION} VERSION_GREATER "3.9.0" )
-	set( LIBCLC_TARGETS_ALL ${LIBCLC_TARGETS_ALL} amdgcn-mesa-mesa3d )
+  set( LIBCLC_TARGETS_ALL ${LIBCLC_TARGETS_ALL} amdgcn-mesa-mesa3d )
 endif()
 
 if( LIBCLC_TARGETS_TO_BUILD STREQUAL "all" )
-	set( LIBCLC_TARGETS_TO_BUILD ${LIBCLC_TARGETS_ALL} )
+  set( LIBCLC_TARGETS_TO_BUILD ${LIBCLC_TARGETS_ALL} )
 endif()
 
 find_program( LLVM_CLANG clang PATHS ${LLVM_TOOLS_BINARY_DIR} NO_DEFAULT_PATH )
@@ -75,15 +74,15 @@ message( "opt: ${LLVM_OPT}" )
 message( "llvm-spirv: ${LLVM_SPIRV}" )
 message( "" )
 if( NOT LLVM_CLANG OR NOT LLVM_OPT OR NOT LLVM_AS OR NOT LLVM_LINK )
-	message( FATAL_ERROR "toolchain incomplete!" )
+  message( FATAL_ERROR "toolchain incomplete!" )
 endif()
 
 list( SORT LIBCLC_TARGETS_TO_BUILD )
 
 if( "spirv-mesa3d-" IN_LIST LIBCLC_TARGETS_TO_BUILD OR "spirv64-mesa3d-" IN_LIST LIBCLC_TARGETS_TO_BUILD )
-	if( NOT LLVM_SPIRV )
-		message( FATAL_ERROR "SPIR-V targets requested, but spirv-tools is not installed" )
-	endif()
+  if( NOT LLVM_SPIRV )
+    message( FATAL_ERROR "SPIR-V targets requested, but spirv-tools is not installed" )
+  endif()
 endif()
 
 set( CMAKE_MODULE_PATH ${CMAKE_SOURCE_DIR}/cmake )
@@ -99,8 +98,8 @@ set( LLVM_VERSION_DEFINE "-DHAVE_LLVM=0x${LLVM_VERSION_MAJOR}0${LLVM_VERSION_MIN
 
 # LLVM 13 enables standard includes by default
 if( ${LLVM_PACKAGE_VERSION} VERSION_GREATER "12.99.99" )
-				set( CMAKE_LLAsm_FLAGS "${CMAKE_LLAsm_FLAGS} -cl-no-stdinc")
-				set( CMAKE_CLC_FLAGS "${CMAKE_CLC_FLAGS} -cl-no-stdinc")
+  set( CMAKE_LLAsm_FLAGS "${CMAKE_LLAsm_FLAGS} -cl-no-stdinc" )
+  set( CMAKE_CLC_FLAGS "${CMAKE_CLC_FLAGS} -cl-no-stdinc" )
 endif()
 
 enable_language( CLC LLAsm )
@@ -142,14 +141,14 @@ set( cypress_aliases hemlock )
 set( barts_aliases turks caicos )
 set( cayman_aliases aruba )
 set( tahiti_aliases pitcairn verde oland hainan bonaire kabini kaveri hawaii
-	mullins tonga tongapro iceland carrizo fiji stoney polaris10 polaris11
-	gfx602 gfx705 gfx805
-	gfx900 gfx902 gfx904 gfx906 gfx908 gfx909 gfx90a gfx90c gfx940 gfx941 gfx942
-	gfx1010 gfx1011 gfx1012 gfx1013
-	gfx1030 gfx1031 gfx1032 gfx1033 gfx1034 gfx1035 gfx1036
-	gfx1100 gfx1101 gfx1102 gfx1103
-	gfx1150 gfx1151
-	gfx1200 gfx1201
+  mullins tonga tongapro iceland carrizo fiji stoney polaris10 polaris11
+  gfx602 gfx705 gfx805
+  gfx900 gfx902 gfx904 gfx906 gfx908 gfx909 gfx90a gfx90c gfx940 gfx941 gfx942
+  gfx1010 gfx1011 gfx1012 gfx1013
+  gfx1030 gfx1031 gfx1032 gfx1033 gfx1034 gfx1035 gfx1036
+  gfx1100 gfx1101 gfx1102 gfx1103
+  gfx1150 gfx1151
+  gfx1200 gfx1201
 )
 
 # pkg-config file
@@ -158,199 +157,195 @@ install( FILES ${CMAKE_CURRENT_BINARY_DIR}/libclc.pc DESTINATION "${CMAKE_INSTAL
 install( DIRECTORY generic/include/clc DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}" )
 
 if( ENABLE_RUNTIME_SUBNORMAL )
-	add_library( subnormal_use_default STATIC
-		generic/lib/subnormal_use_default.ll )
-	add_library( subnormal_disable STATIC
-		generic/lib/subnormal_disable.ll )
-	install( TARGETS subnormal_use_default subnormal_disable ARCHIVE
-		DESTINATION "${CMAKE_INSTALL_DATADIR}/clc" )
+  add_library( subnormal_use_default STATIC
+    generic/lib/subnormal_use_default.ll )
+  add_library( subnormal_disable STATIC
+    generic/lib/subnormal_disable.ll )
+  install( TARGETS subnormal_use_default subnormal_disable ARCHIVE
+    DESTINATION "${CMAKE_INSTALL_DATADIR}/clc" )
 endif()
 
 find_package( Python3 REQUIRED COMPONENTS Interpreter )
 file( TO_CMAKE_PATH ${CMAKE_SOURCE_DIR}/generic/lib/gen_convert.py script_loc )
 add_custom_command(
-	OUTPUT convert.cl
-	COMMAND ${Python3_EXECUTABLE} ${script_loc} > convert.cl
-	DEPENDS ${script_loc} )
+  OUTPUT convert.cl
+  COMMAND ${Python3_EXECUTABLE} ${script_loc} > convert.cl
+  DEPENDS ${script_loc} )
 add_custom_target( "generate_convert.cl" DEPENDS convert.cl )
 
+add_custom_command(
+  OUTPUT clspv-convert.cl
+  COMMAND ${Python3_EXECUTABLE} ${script_loc} --clspv > clspv-convert.cl
+  DEPENDS ${script_loc} )
+add_custom_target( "clspv-generate_convert.cl" DEPENDS clspv-convert.cl )
+
 enable_testing()
 
 foreach( t ${LIBCLC_TARGETS_TO_BUILD} )
-	message( "BUILDING ${t}" )
-	string( REPLACE "-" ";" TRIPLE  ${t} )
-	list( GET TRIPLE 0 ARCH )
-	list( GET TRIPLE 1 VENDOR )
-	list( GET TRIPLE 2 OS )
-
-	set( dirs )
-
-	if ( NOT ${ARCH} STREQUAL spirv AND NOT ${ARCH} STREQUAL spirv64 AND
-			 NOT ${ARCH} STREQUAL clspv AND NOT ${ARCH} STREQUAL clspv64)
-		LIST( APPEND dirs generic )
-	endif()
-
-	if( ${ARCH} STREQUAL r600 OR ${ARCH} STREQUAL amdgcn )
-		list( APPEND dirs amdgpu )
-	endif()
-
-	#nvptx is special
-	if( ${ARCH} STREQUAL nvptx OR ${ARCH} STREQUAL nvptx64 )
-		set( DARCH ptx )
-	else()
-		set( DARCH ${ARCH} )
-	endif()
-
-	# Enumerate SOURCES* files
-	set( source_list )
-	foreach( l ${dirs} ${DARCH} ${DARCH}-${OS} ${DARCH}-${VENDOR}-${OS} )
-		foreach( s "SOURCES" "SOURCES_${LLVM_MAJOR}.${LLVM_MINOR}" )
-			file( TO_CMAKE_PATH ${l}/lib/${s} file_loc )
-			file( TO_CMAKE_PATH ${CMAKE_SOURCE_DIR}/${file_loc} loc )
-			# Prepend the location to give higher priority to
-			# specialized implementation
-			if( EXISTS ${loc} )
-				set( source_list ${file_loc} ${source_list} )
-			endif()
-		endforeach()
-	endforeach()
-
-	# Add the generated convert.cl here to prevent adding
-	# the one listed in SOURCES
-	if( NOT ${ARCH} STREQUAL "spirv" AND NOT ${ARCH} STREQUAL "spirv64" )
-		set( rel_files convert.cl )
-		set( objects convert.cl )
-		if( NOT ENABLE_RUNTIME_SUBNORMAL AND NOT ${ARCH} STREQUAL "clspv" AND
-		    NOT ${ARCH} STREQUAL "clspv64" )
-			list( APPEND rel_files generic/lib/subnormal_use_default.ll )
-		endif()
-	else()
-		set( rel_files )
-		set( objects )
-	endif()
-
-	foreach( l ${source_list} )
-		file( READ ${l} file_list )
-		string( REPLACE "\n" ";" file_list ${file_list} )
-		get_filename_component( dir ${l} DIRECTORY )
-		foreach( f ${file_list} )
-			list( FIND objects ${f} found )
-			if( found EQUAL  -1 )
-				list( APPEND objects ${f} )
-				list( APPEND rel_files ${dir}/${f} )
-				# FIXME: This should really go away
-				file( TO_CMAKE_PATH ${CMAKE_SOURCE_DIR}/${dir}/${f} src_loc )
-				get_filename_component( fdir ${src_loc} DIRECTORY )
-
-				set_source_files_properties( ${dir}/${f}
-					PROPERTIES COMPILE_FLAGS "-I ${fdir}" )
-			endif()
-		endforeach()
-	endforeach()
-
-	foreach( d ${${t}_devices} )
-		# Some targets don't have a specific GPU to target
-		if( ${d} STREQUAL "none" OR ${ARCH} STREQUAL "spirv" OR ${ARCH} STREQUAL "spirv64" )
-			set( mcpu )
-			set( arch_suffix "${t}" )
-		else()
-			set( mcpu "-mcpu=${d}" )
-			set( arch_suffix "${d}-${t}" )
-		endif()
-		message( "	DEVICE: ${d} ( ${${d}_aliases} )" )
-
-		if ( ${ARCH} STREQUAL "spirv" OR ${ARCH} STREQUAL "spirv64" )
-			if( ${ARCH} STREQUAL "spirv" )
-				set( t "spir--" )
-			else()
-				set( t "spir64--" )
-			endif()
-			set( build_flags -O0 -finline-hint-functions )
-			set( opt_flags )
-			set( spvflags --spirv-max-version=1.1 )
-		elseif( ${ARCH} STREQUAL "clspv" )
-			set( t "spir--" )
-			set( build_flags "-Wno-unknown-assumption")
-			set( opt_flags -O3 )
-		elseif( ${ARCH} STREQUAL "clspv64" )
-			set( t "spir64--" )
-			set( build_flags "-Wno-unknown-assumption")
-			set( opt_flags -O3 )
-		else()
-			set( build_flags )
-			set( opt_flags -O3 )
-		endif()
-
-		add_library( builtins.link.${arch_suffix} STATIC ${rel_files} )
-		# Make sure we depend on the pseudo target to prevent
-		# multiple invocations
-		add_dependencies( builtins.link.${arch_suffix}
-			generate_convert.cl )
-		# CMake will turn this include into absolute path
-		target_include_directories( builtins.link.${arch_suffix} PRIVATE
-			"generic/include" )
-		target_compile_definitions( builtins.link.${arch_suffix} PRIVATE
-			"__CLC_INTERNAL" )
-		string( TOUPPER "-DCLC_${ARCH}" CLC_TARGET_DEFINE )
-		target_compile_definitions( builtins.link.${arch_suffix} PRIVATE
-			${CLC_TARGET_DEFINE} )
-		target_compile_options( builtins.link.${arch_suffix} PRIVATE  -target
-			${t} ${mcpu} -fno-builtin -nostdlib ${build_flags} )
-		set_target_properties( builtins.link.${arch_suffix} PROPERTIES
-			LINKER_LANGUAGE CLC )
-
-		set( obj_suffix ${arch_suffix}.bc )
-
-		# Add opt target
-		add_custom_command( OUTPUT "builtins.opt.${obj_suffix}"
-				    COMMAND ${LLVM_OPT} ${opt_flags} -o
-				    "builtins.opt.${obj_suffix}"
-				    "builtins.link.${obj_suffix}"
-				    DEPENDS "builtins.link.${arch_suffix}" )
-		add_custom_target( "opt.${obj_suffix}" ALL
-		                   DEPENDS "builtins.opt.${obj_suffix}" )
-
-		if( ${ARCH} STREQUAL "spirv" OR ${ARCH} STREQUAL "spirv64" )
-			set( spv_suffix ${arch_suffix}.spv )
-			add_custom_command( OUTPUT "${spv_suffix}"
-					    COMMAND ${LLVM_SPIRV} ${spvflags}
-					    -o "${spv_suffix}"
-					    "builtins.link.${obj_suffix}"
-					    DEPENDS "builtins.link.${arch_suffix}" )
-			add_custom_target( "prepare-${spv_suffix}" ALL
-			                   DEPENDS "${spv_suffix}" )
-			install( FILES ${CMAKE_CURRENT_BINARY_DIR}/${spv_suffix}
-				 DESTINATION "${CMAKE_INSTALL_DATADIR}/clc" )
-		else()
-
-			# Add prepare target
-			add_custom_command( OUTPUT "${obj_suffix}"
-				            COMMAND prepare_builtins -o
-					    "${obj_suffix}"
-					    "builtins.opt.${obj_suffix}"
-					    DEPENDS "opt.${obj_suffix}"
-					            "builtins.opt.${obj_suffix}"
-					            prepare_builtins )
-			add_custom_target( "prepare-${obj_suffix}" ALL
-					   DEPENDS "${obj_suffix}" )
-
-			# nvptx-- targets don't include workitem builtins
-			if( NOT ${t} MATCHES ".*ptx.*--$" )
-				add_test( NAME external-calls-${obj_suffix}
-   					  COMMAND ./check_external_calls.sh ${CMAKE_CURRENT_BINARY_DIR}/${obj_suffix} ${LLVM_TOOLS_BINARY_DIR}
-					  WORKING_DIRECTORY ${CMAKE_SOURCE_DIR} )
-			endif()
-
-			install( FILES ${CMAKE_CURRENT_BINARY_DIR}/${obj_suffix} DESTINATION "${CMAKE_INSTALL_DATADIR}/clc" )
-			foreach( a ${${d}_aliases} )
-				set( alias_suffix "${a}-${t}.bc" )
-				add_custom_target( ${alias_suffix} ALL
-						   COMMAND ${CMAKE_COMMAND} -E
-						   create_symlink ${obj_suffix}
-						   ${alias_suffix}
-				                   DEPENDS "prepare-${obj_suffix}" )
-				install( FILES ${CMAKE_CURRENT_BINARY_DIR}/${alias_suffix} DESTINATION "${CMAKE_INSTALL_DATADIR}/clc" )
-			endforeach( a )
-		endif()
-	endforeach( d )
+  message( "BUILDING ${t}" )
+  string( REPLACE "-" ";" TRIPLE  ${t} )
+  list( GET TRIPLE 0 ARCH )
+  list( GET TRIPLE 1 VENDOR )
+  list( GET TRIPLE 2 OS )
+
+  set( dirs )
+
+  if ( NOT ${ARCH} STREQUAL spirv AND NOT ${ARCH} STREQUAL spirv64 AND
+       NOT ${ARCH} STREQUAL clspv AND NOT ${ARCH} STREQUAL clspv64)
+    LIST( APPEND dirs generic )
+  endif()
+
+  if( ${ARCH} STREQUAL r600 OR ${ARCH} STREQUAL amdgcn )
+    list( APPEND dirs amdgpu )
+  endif()
+
+  #nvptx is special
+  if( ${ARCH} STREQUAL nvptx OR ${ARCH} STREQUAL nvptx64 )
+    set( DARCH ptx )
+  else()
+    set( DARCH ${ARCH} )
+  endif()
+
+  # Enumerate SOURCES* files
+  set( source_list )
+  foreach( l ${dirs} ${DARCH} ${DARCH}-${OS} ${DARCH}-${VENDOR}-${OS} )
+    foreach( s "SOURCES" "SOURCES_${LLVM_MAJOR}.${LLVM_MINOR}" )
+      file( TO_CMAKE_PATH ${l}/lib/${s} file_loc )
+      file( TO_CMAKE_PATH ${CMAKE_SOURCE_DIR}/${file_loc} loc )
+      # Prepend the location to give higher priority to
+      # specialized implementation
+      if( EXISTS ${loc} )
+        set( source_list ${file_loc} ${source_list} )
+      endif()
+    endforeach()
+  endforeach()
+
+  # Add the generated convert.cl here to prevent adding
+  # the one listed in SOURCES
+  if( NOT ${ARCH} STREQUAL "spirv" AND NOT ${ARCH} STREQUAL "spirv64" )
+    if( NOT ENABLE_RUNTIME_SUBNORMAL AND NOT ${ARCH} STREQUAL "clspv" AND
+        NOT ${ARCH} STREQUAL "clspv64" )
+      set( rel_files convert.cl )
+      set( objects convert.cl )
+      list( APPEND rel_files generic/lib/subnormal_use_default.ll )
+    elseif(${ARCH} STREQUAL "clspv" OR ${ARCH} STREQUAL "clspv64")
+      set( rel_files clspv-convert.cl )
+      set( objects clspv-convert.cl )
+    endif()
+  else()
+    set( rel_files )
+    set( objects )
+  endif()
+
+  foreach( l ${source_list} )
+    file( READ ${l} file_list )
+    string( REPLACE "\n" ";" file_list ${file_list} )
+    get_filename_component( dir ${l} DIRECTORY )
+    foreach( f ${file_list} )
+      list( FIND objects ${f} found )
+      if( found EQUAL  -1 )
+        list( APPEND objects ${f} )
+        list( APPEND rel_files ${dir}/${f} )
+        # FIXME: This should really go away
+        file( TO_CMAKE_PATH ${CMAKE_SOURCE_DIR}/${dir}/${f} src_loc )
+        get_filename_component( fdir ${src_loc} DIRECTORY )
+
+        set_source_files_properties( ${dir}/${f}
+          PROPERTIES COMPILE_FLAGS "-I ${fdir}" )
+      endif()
+    endforeach()
+  endforeach()
+
+  foreach( d ${${t}_devices} )
+    # Some targets don't have a specific GPU to target
+    if( ${d} STREQUAL "none" OR ${ARCH} STREQUAL "spirv" OR ${ARCH} STREQUAL "spirv64" )
+      set( mcpu )
+      set( arch_suffix "${t}" )
+    else()
+      set( mcpu "-mcpu=${d}" )
+      set( arch_suffix "${d}-${t}" )
+    endif()
+    message( "  DEVICE: ${d} ( ${${d}_aliases} )" )
+
+    if ( ${ARCH} STREQUAL "spirv" OR ${ARCH} STREQUAL "spirv64" )
+      if( ${ARCH} STREQUAL "spirv" )
+        set( t "spir--" )
+      else()
+        set( t "spir64--" )
+      endif()
+      set( build_flags -O0 -finline-hint-functions )
+      set( opt_flags )
+      set( spvflags --spirv-max-version=1.1 )
+    elseif( ${ARCH} STREQUAL "clspv" )
+      set( t "spir--" )
+      set( build_flags "-Wno-unknown-assumption")
+      set( opt_flags -O3 )
+    elseif( ${ARCH} STREQUAL "clspv64" )
+      set( t "spir64--" )
+      set( build_flags "-Wno-unknown-assumption")
+      set( opt_flags -O3 )
+    else()
+      set( build_flags )
+      set( opt_flags -O3 )
+    endif()
+
+    add_library( builtins.link.${arch_suffix} STATIC ${rel_files} )
+    # Make sure we depend on the pseudo target to prevent
+    # multiple invocations
+    add_dependencies( builtins.link.${arch_suffix} generate_convert.cl )
+    add_dependencies( builtins.link.${arch_suffix} clspv-generate_convert.cl )
+    # CMake will turn this include into absolute path
+    target_include_directories( builtins.link.${arch_suffix} PRIVATE
+      "generic/include" )
+    target_compile_definitions( builtins.link.${arch_suffix} PRIVATE
+      "__CLC_INTERNAL" )
+    string( TOUPPER "-DCLC_${ARCH}" CLC_TARGET_DEFINE )
+    target_compile_definitions( builtins.link.${arch_suffix} PRIVATE
+      ${CLC_TARGET_DEFINE} )
+    target_compile_options( builtins.link.${arch_suffix} PRIVATE  -target
+      ${t} ${mcpu} -fno-builtin -nostdlib ${build_flags} )
+    set_target_properties( builtins.link.${arch_suffix} PROPERTIES
+      LINKER_LANGUAGE CLC )
+
+    set( obj_suffix ${arch_suffix}.bc )
+
+    # Add opt target
+    add_custom_command( OUTPUT "builtins.opt.${obj_suffix}"
+      COMMAND ${LLVM_OPT} ${opt_flags} -o "builtins.opt.${obj_suffix}" "builtins.link.${obj_suffix}"
+      DEPENDS "builtins.link.${arch_suffix}" )
+    add_custom_target( "opt.${obj_suffix}" ALL
+      DEPENDS "builtins.opt.${obj_suffix}" )
+
+    if( ${ARCH} STREQUAL "spirv" OR ${ARCH} STREQUAL "spirv64" )
+      set( spv_suffix ${arch_suffix}.spv )
+      add_custom_command( OUTPUT "${spv_suffix}"
+        COMMAND ${LLVM_SPIRV} ${spvflags} -o "${spv_suffix}" "builtins.link.${obj_suffix}"
+        DEPENDS "builtins.link.${arch_suffix}" )
+      add_custom_target( "prepare-${spv_suffix}" ALL DEPENDS "${spv_suffix}" )
+      install( FILES ${CMAKE_CURRENT_BINARY_DIR}/${spv_suffix}
+         DESTINATION "${CMAKE_INSTALL_DATADIR}/clc" )
+    else()
+      # Add prepare target
+      add_custom_command( OUTPUT "${obj_suffix}"
+        COMMAND prepare_builtins -o "${obj_suffix}" "builtins.opt.${obj_suffix}"
+        DEPENDS "opt.${obj_suffix}" "builtins.opt.${obj_suffix}" prepare_builtins )
+      add_custom_target( "prepare-${obj_suffix}" ALL DEPENDS "${obj_suffix}" )
+
+      # nvptx-- targets don't include workitem builtins
+      if( NOT ${t} MATCHES ".*ptx.*--$" )
+        add_test( NAME external-calls-${obj_suffix}
+          COMMAND ./check_external_calls.sh ${CMAKE_CURRENT_BINARY_DIR}/${obj_suffix} ${LLVM_TOOLS_BINARY_DIR}
+          WORKING_DIRECTORY ${CMAKE_SOURCE_DIR} )
+      endif()
+
+      install( FILES ${CMAKE_CURRENT_BINARY_DIR}/${obj_suffix} DESTINATION "${CMAKE_INSTALL_DATADIR}/clc" )
+      foreach( a ${${d}_aliases} )
+        set( alias_suffix "${a}-${t}.bc" )
+        add_custom_target( ${alias_suffix} ALL
+          COMMAND ${CMAKE_COMMAND} -E create_symlink ${obj_suffix} ${alias_suffix}
+          DEPENDS "prepare-${obj_suffix}" )
+        install( FILES ${CMAKE_CURRENT_BINARY_DIR}/${alias_suffix} DESTINATION "${CMAKE_INSTALL_DATADIR}/clc" )
+      endforeach( a )
+    endif()
+  endforeach( d )
 endforeach( t )
diff --git a/libclc/check_external_calls.sh b/libclc/check_external_calls.sh
index 4de31a220d070f..25792e249d6b66 100755
--- a/libclc/check_external_calls.sh
+++ b/libclc/check_external_calls.sh
@@ -3,15 +3,15 @@
 FILE=$1
 BIN_DIR=$2
 if [ ! -f $FILE ]; then
-	echo "ERROR: Not a file: $FILE"
-	exit 3
+  echo "ERROR: Not a file: $FILE"
+  exit 3
 fi
 ret=0
 
 DIS="$BIN_DIR/llvm-dis"
 if [ ! -x $DIS ]; then
-	echo "ERROR: Disassembler '$DIS' is not executable"
-	exit 3
+  echo "ERROR: Disassembler '$DIS' is not executable"
+  exit 3
 fi
 
 TMP_FILE=$(mktemp)
@@ -21,10 +21,10 @@ $DIS < $FILE | grep ' call ' | grep -v '@llvm' > "$TMP_FILE"
 COUNT=$(wc -l < "$TMP_FILE")
 
 if [ "$COUNT" -ne "0" ]; then
-	echo "ERROR: $COUNT unresolved calls detected in $FILE"
-	cat $TMP_FILE
-	ret=1
+  echo "ERROR: $COUNT unresolved calls detected in $FILE"
+  cat $TMP_FILE
+  ret=1
 else
-	echo "File $FILE is OK"
+  echo "File $FILE is OK"
 fi
 exit $ret
diff --git a/libclc/cmake/CMakeLLAsmInformation.cmake b/libclc/cmake/CMakeLLAsmInformation.cmake
index 218e20a52fe823..35ec3081da0f7d 100644
--- a/libclc/cmake/CMakeLLAsmInformation.cmake
+++ b/libclc/cmake/CMakeLLAsmInformation.cmake
@@ -1,7 +1,7 @@
 if(NOT CMAKE_LLAsm_COMPILE_OBJECT)
   set(CMAKE_LLAsm_COMPILE_OBJECT
-	  "${CMAKE_LLAsm_PREPROCESSOR} -E -P  <DEFINES> <INCLUDES> <FLAGS> -x cl <SOURCE> -o <OBJECT>.temp"
-	  "<CMAKE_LLAsm_COMPILER> -o <OBJECT> <OBJECT>.temp")
+    "${CMAKE_LLAsm_PREPROCESSOR} -E -P  <DEFINES> <INCLUDES> <FLAGS> -x cl <SOURCE> -o <OBJECT>.temp"
+    "<CMAKE_LLAsm_COMPILER> -o <OBJECT> <OBJECT>.temp")
 endif()
 
 if(NOT CMAKE_LLAsm_CREATE_STATIC_LIBRARY)
diff --git a/libclc/generic/lib/gen_convert.py b/libclc/generic/lib/gen_convert.py
index 612a9184f4b271..21fc8ebc80d156 100644
--- a/libclc/generic/lib/gen_convert.py
+++ b/libclc/generic/lib/gen_convert.py
@@ -2,6 +2,7 @@
 #
 # Copyright (c) 2013 Victor Oliveira <victormatheus@gmail.com>
 # Copyright (c) 2013 Jesse Towner <jessetowner@lavabit.com>
+# Copyright (c) 2024 Romaric Jodin <rjodin@chromium.org>
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
@@ -26,6 +27,16 @@
 #
 # convert_<destTypen><_sat><_roundingMode>(<sourceTypen>)
 
+import argparse
+
+parser = argparse.ArgumentParser()
+parser.add_argument(
+    "--clspv", action="store_true", help="Generate the clspv variant of the code"
+)
+args = parser.parse_args()
+
+clspv = args.clspv
+
 types = [
     "char",
     "uchar",
@@ -251,13 +262,19 @@ def generate_default_conversion(src, dst, mode):
         print("#endif")
 
 
-for src in types:
-    for dst in types:
-        generate_default_conversion(src, dst, "")
+# Do not generate default conversion for clspv as they are handled natively
+if not clspv:
+    for src in types:
+        for dst in types:
+            generate_default_conversion(src, dst, "")
 
 for src in int_types:
     for dst in int_types:
         for mode in rounding_modes:
+            # Do not generate "_rte" conversion for clspv as they are handled
+            # natively
+            if clspv and mode == "_rte":
+                continue
             generate_default_conversion(src, dst, mode)
 
 #
@@ -304,21 +321,38 @@ def generate_saturated_conversion(src, dst, size):
 
     elif src in float_types:
 
-        # Conversion from float to int
-        print(
-            """  {DST}{N} y = convert_{DST}{N}(x);
-  y = select(y, ({DST}{N}){DST_MIN}, {BP}(x < ({SRC}{N}){DST_MIN}){BS});
-  y = select(y, ({DST}{N}){DST_MAX}, {BP}(x > ({SRC}{N}){DST_MAX}){BS});
-  return y;""".format(
-                SRC=src,
-                DST=dst,
-                N=size,
-                DST_MIN=limit_min[dst],
-                DST_MAX=limit_max[dst],
-                BP=bool_prefix,
-                BS=bool_suffix,
+        if clspv:
+            # Conversion from float to int
+            print(
+                """  {DST}{N} y = convert_{DST}{N}(x);
+                y = select(y, ({DST}{N}){DST_MIN}, {BP}(x <= ({SRC}{N}){DST_MIN}){BS});
+                y = select(y, ({DST}{N}){DST_MAX}, {BP}(x >= ({SRC}{N}){DST_MAX}){BS});
+                return y;""".format(
+                    SRC=src,
+                    DST=dst,
+                    N=size,
+                    DST_MIN=limit_min[dst],
+                    DST_MAX=limit_max[dst],
+                    BP=bool_prefix,
+                    BS=bool_suffix,
+                )
+            )
+        else:
+            # Conversion from float to int
+            print(
+                """  {DST}{N} y = convert_{DST}{N}(x);
+                y = select(y, ({DST}{N}){DST_MIN}, {BP}(x < ({SRC}{N}){DST_MIN}){BS});
+                y = select(y, ({DST}{N}){DST_MAX}, {BP}(x > ({SRC}{N}){DST_MAX}){BS});
+                return y;""".format(
+                    SRC=src,
+                    DST=dst,
+                    N=size,
+                    DST_MIN=limit_min[dst],
+                    DST_MAX=limit_max[dst],
+                    BP=bool_prefix,
+                    BS=bool_suffix,
+                )
             )
-        )
 
     else:
 
@@ -432,7 +466,10 @@ def generate_float_conversion(src, dst, size, mode, sat):
         print("  return convert_{DST}{N}(x);".format(DST=dst, N=size))
     else:
         print("  {DST}{N} r = convert_{DST}{N}(x);".format(DST=dst, N=size))
-        print("  {SRC}{N} y = convert_{SRC}{N}(r);".format(SRC=src, N=size))
+        if clspv:
+            print("  {SRC}{N} y = convert_{SRC}{N}_sat(r);".format(SRC=src, N=size))
+        else:
+            print("  {SRC}{N} y = convert_{SRC}{N}(r);".format(SRC=src, N=size))
         if mode == "_rtz":
             if src in int_types:
                 print(
@@ -448,11 +485,29 @@ def generate_float_conversion(src, dst, size, mode, sat):
             else:
                 print("  {SRC}{N} abs_x = fabs(x);".format(SRC=src, N=size))
                 print("  {SRC}{N} abs_y = fabs(y);".format(SRC=src, N=size))
-            print(
-                "  return select(r, nextafter(r, sign(r) * ({DST}{N})-INFINITY), convert_{BOOL}{N}(abs_y > abs_x));".format(
-                    DST=dst, N=size, BOOL=bool_type[dst]
+            if clspv:
+                print(
+                    "  {BOOL}{N} c = convert_{BOOL}{N}(abs_y > abs_x);".format(
+                        BOOL=bool_type[dst], N=size
+                    )
+                )
+                if sizeof_type[src] >= 4 and src in int_types:
+                    print(
+                        "  c = c || convert_{BOOL}{N}(({SRC}{N}){SRC_MAX} == x);".format(
+                            BOOL=bool_type[dst], N=size, SRC=src, SRC_MAX=limit_max[src]
+                        )
+                    )
+                print(
+                    "  return select(r, nextafter(r, sign(r) * ({DST}{N})-INFINITY), c);".format(
+                        DST=dst, N=size, BOOL=bool_type[dst], SRC=src
+                    )
+                )
+            else:
+                print(
+                    "  return select(r, nextafter(r, sign(r) * ({DST}{N})-INFINITY), convert_{BOOL}{N}(abs_y > abs_x));".format(
+                        DST=dst, N=size, BOOL=bool_type[dst]
+                    )
                 )
-            )
         if mode == "_rtp":
             print(
                 "  return select(r, nextafter(r, ({DST}{N})INFINITY), convert_{BOOL}{N}(y < x));".format(
@@ -460,11 +515,29 @@ def generate_float_conversion(src, dst, size, mode, sat):
                 )
             )
         if mode == "_rtn":
-            print(
-                "  return select(r, nextafter(r, ({DST}{N})-INFINITY), convert_{BOOL}{N}(y > x));".format(
-                    DST=dst, N=size, BOOL=bool_type[dst]
+            if clspv:
+                print(
+                    "  {BOOL}{N} c = convert_{BOOL}{N}(y > x);".format(
+                        BOOL=bool_type[dst], N=size
+                    )
+                )
+                if sizeof_type[src] >= 4 and src in int_types:
+                    print(
+                        "  c = c || convert_{BOOL}{N}(({SRC}{N}){SRC_MAX} == x);".format(
+                            BOOL=bool_type[dst], N=size, SRC=src, SRC_MAX=limit_max[src]
+                        )
+                    )
+                print(
+                    "  return select(r, nextafter(r, ({DST}{N})-INFINITY), c);".format(
+                        DST=dst, N=size, BOOL=bool_type[dst], SRC=src
+                    )
+                )
+            else:
+                print(
+                    "  return select(r, nextafter(r, ({DST}{N})-INFINITY), convert_{BOOL}{N}(y > x));".format(
+                        DST=dst, N=size, BOOL=bool_type[dst]
+                    )
                 )
-            )
 
     # Footer
     print("}")
@@ -484,4 +557,8 @@ def generate_float_conversion(src, dst, size, mode, sat):
     for dst in float_types:
         for size in vector_sizes:
             for mode in rounding_modes:
+                # Do not generate "_rte" conversion for clspv as they are
+                # handled natively
+                if clspv and mode == "_rte":
+                    continue
                 generate_float_conversion(src, dst, size, mode, "")
diff --git a/libcxx/benchmarks/CMakeLists.txt b/libcxx/benchmarks/CMakeLists.txt
index b436e96f178b70..3dec6faea13a0c 100644
--- a/libcxx/benchmarks/CMakeLists.txt
+++ b/libcxx/benchmarks/CMakeLists.txt
@@ -176,6 +176,7 @@ set(BENCHMARK_TESTS
     algorithms/count.bench.cpp
     algorithms/equal.bench.cpp
     algorithms/find.bench.cpp
+    algorithms/fill.bench.cpp
     algorithms/for_each.bench.cpp
     algorithms/lower_bound.bench.cpp
     algorithms/make_heap.bench.cpp
diff --git a/libcxx/benchmarks/algorithms/fill.bench.cpp b/libcxx/benchmarks/algorithms/fill.bench.cpp
new file mode 100644
index 00000000000000..40f37425c394cf
--- /dev/null
+++ b/libcxx/benchmarks/algorithms/fill.bench.cpp
@@ -0,0 +1,49 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <algorithm>
+#include <benchmark/benchmark.h>
+#include <vector>
+
+static void bm_fill_n(benchmark::State& state) {
+  std::vector<bool> vec1(state.range());
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(vec1);
+    benchmark::DoNotOptimize(std::fill_n(vec1.begin(), vec1.size(), false));
+  }
+}
+BENCHMARK(bm_fill_n)->DenseRange(1, 8)->Range(16, 1 << 20);
+
+static void bm_ranges_fill_n(benchmark::State& state) {
+  std::vector<bool> vec1(state.range());
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(vec1);
+    benchmark::DoNotOptimize(std::ranges::fill_n(vec1.begin(), vec1.size(), false));
+  }
+}
+BENCHMARK(bm_ranges_fill_n)->DenseRange(1, 8)->Range(16, 1 << 20);
+
+static void bm_fill(benchmark::State& state) {
+  std::vector<bool> vec1(state.range());
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(vec1);
+    std::fill(vec1.begin(), vec1.end(), false);
+  }
+}
+BENCHMARK(bm_fill)->DenseRange(1, 8)->Range(16, 1 << 20);
+
+static void bm_ranges_fill(benchmark::State& state) {
+  std::vector<bool> vec1(state.range());
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(vec1);
+    benchmark::DoNotOptimize(std::ranges::fill(vec1, false));
+  }
+}
+BENCHMARK(bm_ranges_fill)->DenseRange(1, 8)->Range(16, 1 << 20);
+
+BENCHMARK_MAIN();
diff --git a/libcxx/docs/Modules.rst b/libcxx/docs/Modules.rst
index ee2b81d3b9e7ca..5b027ed1bd0729 100644
--- a/libcxx/docs/Modules.rst
+++ b/libcxx/docs/Modules.rst
@@ -178,34 +178,13 @@ This is a small sample program that uses the module ``std``. It consists of a
   )
   FetchContent_MakeAvailable(std)
 
-  #
-  # Adjust project compiler flags
-  #
-
-  add_compile_options($<$<COMPILE_LANGUAGE:CXX>:-fprebuilt-module-path=${std_BINARY_DIR}/CMakeFiles/std.dir/>)
-  add_compile_options($<$<COMPILE_LANGUAGE:CXX>:-fprebuilt-module-path=${std_BINARY_DIR}/CMakeFiles/std.compat.dir/>)
-  add_compile_options($<$<COMPILE_LANGUAGE:CXX>:-nostdinc++>)
-  # The include path needs to be set to be able to use macros from headers.
-  # For example from, the headers <cassert> and <version>.
-  add_compile_options($<$<COMPILE_LANGUAGE:CXX>:-isystem>)
-  add_compile_options($<$<COMPILE_LANGUAGE:CXX>:${LIBCXX_BUILD}/include/c++/v1>)
-
-  #
-  # Adjust project linker flags
-  #
-
-  add_link_options($<$<COMPILE_LANGUAGE:CXX>:-nostdlib++>)
-  add_link_options($<$<COMPILE_LANGUAGE:CXX>:-L${LIBCXX_BUILD}/lib>)
-  add_link_options($<$<COMPILE_LANGUAGE:CXX>:-Wl,-rpath,${LIBCXX_BUILD}/lib>)
-  # Linking against the standard c++ library is required for CMake to get the proper dependencies.
-  link_libraries(std c++)
-  link_libraries(std.compat c++)
-
   #
   # Add the project
   #
 
   add_executable(main)
+  add_dependencies(main std.compat)
+  target_link_libraries(main std.compat)
   target_sources(main
     PRIVATE
       main.cpp
@@ -218,13 +197,9 @@ Building this project is done with the following steps, assuming the files
 
   $ mkdir build
   $ cmake -G Ninja -S . -B build -DCMAKE_CXX_COMPILER=<path-to-compiler> -DLIBCXX_BUILD=<build>
-  $ ninja -j1 std -C build
   $ ninja -C build
   $ build/main
 
-.. note:: The ``std`` dependencies of ``std.compat`` is not always resolved when
-          building the ``std`` target using multiple jobs.
-
 .. warning:: ``<path-to-compiler>`` should point point to the real binary and
              not to a symlink.
 
diff --git a/libcxx/docs/ReleaseNotes/19.rst b/libcxx/docs/ReleaseNotes/19.rst
index 2b62a36ca8e5cd..c70ae477fafc1d 100644
--- a/libcxx/docs/ReleaseNotes/19.rst
+++ b/libcxx/docs/ReleaseNotes/19.rst
@@ -49,6 +49,8 @@ Improvements and New Features
 -----------------------------
 
 - The performance of growing ``std::vector`` has been improved for trivially relocatable types.
+- The performance of ``ranges::fill`` and ``ranges::fill_n`` has been improved for ``vector<bool>::iterator``\s,
+  resulting in a performance increase of up to 1400x.
 
 Deprecations and Removals
 -------------------------
diff --git a/libcxx/docs/Status/Cxx23Issues.csv b/libcxx/docs/Status/Cxx23Issues.csv
index e00345533b865d..a103192350069b 100644
--- a/libcxx/docs/Status/Cxx23Issues.csv
+++ b/libcxx/docs/Status/Cxx23Issues.csv
@@ -46,7 +46,7 @@
 "`3473 <https://wg21.link/LWG3473>`__","Normative encouragement in non-normative note","November 2020","|Complete|","15.0","|format|"
 "`3474 <https://wg21.link/LWG3474>`__","Nesting ``join_views`` is broken because of CTAD","November 2020","|Complete|","15.0","|ranges|"
 "`3476 <https://wg21.link/LWG3476>`__","``thread`` and ``jthread`` constructors require that the parameters be move-constructible but never move construct the parameters","November 2020","",""
-"`3477 <https://wg21.link/LWG3477>`__","Simplify constraints for ``semiregular-box``","November 2020","","","|ranges|"
+"`3477 <https://wg21.link/LWG3477>`__","Simplify constraints for ``semiregular-box``","November 2020","|Complete|","13.0","|ranges|"
 "`3482 <https://wg21.link/LWG3482>`__","``drop_view``'s const begin should additionally require ``sized_range``","November 2020","|Complete|","14.0","|ranges|"
 "`3483 <https://wg21.link/LWG3483>`__","``transform_view::iterator``'s difference is overconstrained","November 2020","|Complete|","14.0","|ranges|"
 "","","","","",""
@@ -181,7 +181,7 @@
 "`3711 <https://wg21.link/LWG3711>`__","Missing preconditions for slide_view constructor","July 2022","","","|ranges|"
 "`3712 <https://wg21.link/LWG3712>`__","``chunk_view`` and ``slide_view`` should not be ``default_initializable``","July 2022","","","|ranges|"
 "`3713 <https://wg21.link/LWG3713>`__","Sorted with respect to comparator (only)","July 2022","",""
-"`3715 <https://wg21.link/LWG3715>`__","``view_interface::empty`` is overconstrained","July 2022","","","|ranges|"
+"`3715 <https://wg21.link/LWG3715>`__","``view_interface::empty`` is overconstrained","July 2022","|Complete|","19.0","|ranges|"
 "`3719 <https://wg21.link/LWG3719>`__","Directory iterators should be usable with default sentinel","July 2022","|Complete|","17.0","|ranges|"
 "`3721 <https://wg21.link/LWG3721>`__","Allow an ``arg-id`` with a value of zero for ``width`` in ``std-format-spec``","July 2022","|Complete|","16.0","|format|"
 "`3724 <https://wg21.link/LWG3724>`__","``decay-copy`` should be constrained","July 2022","|Complete|","14.0"
@@ -283,7 +283,7 @@
 "`3655 <https://wg21.link/LWG3655>`__","The ``INVOKE`` operation and union types","February 2023","|Complete|","18.0",""
 "`3723 <https://wg21.link/LWG3723>`__","``priority_queue::push_range`` needs to ``append_range``","February 2023","","","|ranges|"
 "`3734 <https://wg21.link/LWG3734>`__","Inconsistency in ``inout_ptr`` and ``out_ptr`` for empty case","February 2023","","",""
-"`3772 <https://wg21.link/LWG3772>`__","``repeat_view``'s ``piecewise`` constructor is missing Postconditions","February 2023","","","|ranges|"
+"`3772 <https://wg21.link/LWG3772>`__","``repeat_view``'s ``piecewise`` constructor is missing Postconditions","February 2023","|Complete|","17.0","|ranges|"
 "`3786 <https://wg21.link/LWG3786>`__","Flat maps' deduction guide needs to default ``Allocator`` to be useful","February 2023","","",""
 "`3803 <https://wg21.link/LWG3803>`__","``flat_foo`` constructors taking ``KeyContainer`` lack ``KeyCompare`` parameter","February 2023","","",""
 "`3810 <https://wg21.link/LWG3810>`__","CTAD for ``std::basic_format_args``","February 2023","|Complete|","17.0","|format|"
diff --git a/libcxx/docs/Status/Cxx23Papers.csv b/libcxx/docs/Status/Cxx23Papers.csv
index 56e1468b4ca1a3..80547c5c1f3f57 100644
--- a/libcxx/docs/Status/Cxx23Papers.csv
+++ b/libcxx/docs/Status/Cxx23Papers.csv
@@ -100,7 +100,7 @@
 "`P2396R1 <https://wg21.link/P2396R1>`__","LWG", "Concurrency TS 2 fixes ", "November 2022","","","|concurrency TS|"
 "`P2505R5 <https://wg21.link/P2505R5>`__","LWG", "Monadic Functions for ``std::expected``", "November 2022","|Complete|","17.0",""
 "`P2539R4 <https://wg21.link/P2539R4>`__","LWG", "Should the output of ``std::print`` to a terminal be synchronized with the underlying stream?", "November 2022","|Complete|","18.0","|format|"
-"`P2602R2 <https://wg21.link/P2602R2>`__","LWG", "Poison Pills are Too Toxic", "November 2022","","","|ranges|"
+"`P2602R2 <https://wg21.link/P2602R2>`__","LWG", "Poison Pills are Too Toxic", "November 2022","|Complete|","19.0","|ranges|"
 "`P2708R1 <https://wg21.link/P2708R1>`__","LWG", "No Further Fundamentals TSes", "November 2022","|Nothing to do|","",""
 "","","","","","",""
 "`P0290R4 <https://wg21.link/P0290R4>`__","LWG", "``apply()`` for ``synchronized_value<T>``","February 2023","","","|concurrency TS|"
diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt
index 63adc03fae2980..6ed8d21d98a15a 100644
--- a/libcxx/include/CMakeLists.txt
+++ b/libcxx/include/CMakeLists.txt
@@ -755,10 +755,7 @@ set(files
   __type_traits/is_constant_evaluated.h
   __type_traits/is_constructible.h
   __type_traits/is_convertible.h
-  __type_traits/is_copy_assignable.h
-  __type_traits/is_copy_constructible.h
   __type_traits/is_core_convertible.h
-  __type_traits/is_default_constructible.h
   __type_traits/is_destructible.h
   __type_traits/is_empty.h
   __type_traits/is_enum.h
@@ -774,17 +771,10 @@ set(files
   __type_traits/is_member_function_pointer.h
   __type_traits/is_member_object_pointer.h
   __type_traits/is_member_pointer.h
-  __type_traits/is_move_assignable.h
-  __type_traits/is_move_constructible.h
   __type_traits/is_nothrow_assignable.h
   __type_traits/is_nothrow_constructible.h
   __type_traits/is_nothrow_convertible.h
-  __type_traits/is_nothrow_copy_assignable.h
-  __type_traits/is_nothrow_copy_constructible.h
-  __type_traits/is_nothrow_default_constructible.h
   __type_traits/is_nothrow_destructible.h
-  __type_traits/is_nothrow_move_assignable.h
-  __type_traits/is_nothrow_move_constructible.h
   __type_traits/is_null_pointer.h
   __type_traits/is_object.h
   __type_traits/is_pod.h
@@ -805,14 +795,9 @@ set(files
   __type_traits/is_trivial.h
   __type_traits/is_trivially_assignable.h
   __type_traits/is_trivially_constructible.h
-  __type_traits/is_trivially_copy_assignable.h
-  __type_traits/is_trivially_copy_constructible.h
   __type_traits/is_trivially_copyable.h
-  __type_traits/is_trivially_default_constructible.h
   __type_traits/is_trivially_destructible.h
   __type_traits/is_trivially_lexicographically_comparable.h
-  __type_traits/is_trivially_move_assignable.h
-  __type_traits/is_trivially_move_constructible.h
   __type_traits/is_trivially_relocatable.h
   __type_traits/is_unbounded_array.h
   __type_traits/is_union.h
diff --git a/libcxx/include/__algorithm/copy_backward.h b/libcxx/include/__algorithm/copy_backward.h
index 3ec88d8bd5cc3a..591dd21e2b032e 100644
--- a/libcxx/include/__algorithm/copy_backward.h
+++ b/libcxx/include/__algorithm/copy_backward.h
@@ -15,7 +15,7 @@
 #include <__config>
 #include <__iterator/segmented_iterator.h>
 #include <__type_traits/common_type.h>
-#include <__type_traits/is_copy_constructible.h>
+#include <__type_traits/is_constructible.h>
 #include <__utility/move.h>
 #include <__utility/pair.h>
 
diff --git a/libcxx/include/__algorithm/copy_move_common.h b/libcxx/include/__algorithm/copy_move_common.h
index 0fc7a5e3cee700..845967b05038d6 100644
--- a/libcxx/include/__algorithm/copy_move_common.h
+++ b/libcxx/include/__algorithm/copy_move_common.h
@@ -19,7 +19,7 @@
 #include <__type_traits/enable_if.h>
 #include <__type_traits/is_always_bitcastable.h>
 #include <__type_traits/is_constant_evaluated.h>
-#include <__type_traits/is_copy_constructible.h>
+#include <__type_traits/is_constructible.h>
 #include <__type_traits/is_trivially_assignable.h>
 #include <__type_traits/is_trivially_copyable.h>
 #include <__type_traits/is_volatile.h>
diff --git a/libcxx/include/__algorithm/equal.h b/libcxx/include/__algorithm/equal.h
index 3c0e3060e39a99..c76a16b47f5da4 100644
--- a/libcxx/include/__algorithm/equal.h
+++ b/libcxx/include/__algorithm/equal.h
@@ -69,20 +69,6 @@ equal(_InputIterator1 __first1, _InputIterator1 __last1, _InputIterator2 __first
 }
 
 #if _LIBCPP_STD_VER >= 14
-template <class _BinaryPredicate, class _InputIterator1, class _InputIterator2>
-inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 bool
-__equal(_InputIterator1 __first1,
-        _InputIterator1 __last1,
-        _InputIterator2 __first2,
-        _InputIterator2 __last2,
-        _BinaryPredicate __pred,
-        input_iterator_tag,
-        input_iterator_tag) {
-  for (; __first1 != __last1 && __first2 != __last2; ++__first1, (void)++__first2)
-    if (!__pred(*__first1, *__first2))
-      return false;
-  return __first1 == __last1 && __first2 == __last2;
-}
 
 template <class _Iter1, class _Sent1, class _Iter2, class _Sent2, class _Pred, class _Proj1, class _Proj2>
 _LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 bool __equal_impl(
@@ -110,17 +96,18 @@ __equal_impl(_Tp* __first1, _Tp* __last1, _Up* __first2, _Up*, _Pred&, _Proj1&,
   return std::__constexpr_memcmp_equal(__first1, __first2, __element_count(__last1 - __first1));
 }
 
-template <class _BinaryPredicate, class _RandomAccessIterator1, class _RandomAccessIterator2>
-inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 bool
-__equal(_RandomAccessIterator1 __first1,
-        _RandomAccessIterator1 __last1,
-        _RandomAccessIterator2 __first2,
-        _RandomAccessIterator2 __last2,
-        _BinaryPredicate __pred,
-        random_access_iterator_tag,
-        random_access_iterator_tag) {
-  if (std::distance(__first1, __last1) != std::distance(__first2, __last2))
-    return false;
+template <class _InputIterator1, class _InputIterator2, class _BinaryPredicate>
+_LIBCPP_NODISCARD_EXT inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 bool
+equal(_InputIterator1 __first1,
+      _InputIterator1 __last1,
+      _InputIterator2 __first2,
+      _InputIterator2 __last2,
+      _BinaryPredicate __pred) {
+  if constexpr (__has_random_access_iterator_category<_InputIterator1>::value &&
+                __has_random_access_iterator_category<_InputIterator2>::value) {
+    if (std::distance(__first1, __last1) != std::distance(__first2, __last2))
+      return false;
+  }
   __identity __proj;
   return std::__equal_impl(
       std::__unwrap_iter(__first1),
@@ -132,36 +119,13 @@ __equal(_RandomAccessIterator1 __first1,
       __proj);
 }
 
-template <class _InputIterator1, class _InputIterator2, class _BinaryPredicate>
-_LIBCPP_NODISCARD_EXT inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 bool
-equal(_InputIterator1 __first1,
-      _InputIterator1 __last1,
-      _InputIterator2 __first2,
-      _InputIterator2 __last2,
-      _BinaryPredicate __pred) {
-  return std::__equal<_BinaryPredicate&>(
-      __first1,
-      __last1,
-      __first2,
-      __last2,
-      __pred,
-      typename iterator_traits<_InputIterator1>::iterator_category(),
-      typename iterator_traits<_InputIterator2>::iterator_category());
-}
-
 template <class _InputIterator1, class _InputIterator2>
 _LIBCPP_NODISCARD_EXT inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 bool
 equal(_InputIterator1 __first1, _InputIterator1 __last1, _InputIterator2 __first2, _InputIterator2 __last2) {
-  return std::__equal(
-      __first1,
-      __last1,
-      __first2,
-      __last2,
-      __equal_to(),
-      typename iterator_traits<_InputIterator1>::iterator_category(),
-      typename iterator_traits<_InputIterator2>::iterator_category());
+  return std::equal(__first1, __last1, __first2, __last2, __equal_to());
 }
-#endif
+
+#endif // _LIBCPP_STD_VER >= 14
 
 _LIBCPP_END_NAMESPACE_STD
 
diff --git a/libcxx/include/__algorithm/equal_range.h b/libcxx/include/__algorithm/equal_range.h
index a94290431971c4..2b086abf1794fd 100644
--- a/libcxx/include/__algorithm/equal_range.h
+++ b/libcxx/include/__algorithm/equal_range.h
@@ -23,7 +23,7 @@
 #include <__iterator/iterator_traits.h>
 #include <__iterator/next.h>
 #include <__type_traits/is_callable.h>
-#include <__type_traits/is_copy_constructible.h>
+#include <__type_traits/is_constructible.h>
 #include <__utility/move.h>
 #include <__utility/pair.h>
 
diff --git a/libcxx/include/__algorithm/fill_n.h b/libcxx/include/__algorithm/fill_n.h
index 36f3349d9e7a37..f29633f88087f0 100644
--- a/libcxx/include/__algorithm/fill_n.h
+++ b/libcxx/include/__algorithm/fill_n.h
@@ -9,18 +9,74 @@
 #ifndef _LIBCPP___ALGORITHM_FILL_N_H
 #define _LIBCPP___ALGORITHM_FILL_N_H
 
+#include <__algorithm/min.h>
 #include <__config>
+#include <__fwd/bit_reference.h>
 #include <__iterator/iterator_traits.h>
+#include <__memory/pointer_traits.h>
 #include <__utility/convert_to_integral.h>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
 #  pragma GCC system_header
 #endif
 
+_LIBCPP_PUSH_MACROS
+#include <__undef_macros>
+
 _LIBCPP_BEGIN_NAMESPACE_STD
 
 // fill_n isn't specialized for std::memset, because the compiler already optimizes the loop to a call to std::memset.
 
+template <class _OutputIterator, class _Size, class _Tp>
+inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _OutputIterator
+__fill_n(_OutputIterator __first, _Size __n, const _Tp& __value);
+
+template <bool _FillVal, class _Cp>
+_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void
+__fill_n_bool(__bit_iterator<_Cp, false> __first, typename _Cp::size_type __n) {
+  using _It            = __bit_iterator<_Cp, false>;
+  using __storage_type = typename _It::__storage_type;
+
+  const int __bits_per_word = _It::__bits_per_word;
+  // do first partial word
+  if (__first.__ctz_ != 0) {
+    __storage_type __clz_f = static_cast<__storage_type>(__bits_per_word - __first.__ctz_);
+    __storage_type __dn    = std::min(__clz_f, __n);
+    __storage_type __m     = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz_f - __dn));
+    if (_FillVal)
+      *__first.__seg_ |= __m;
+    else
+      *__first.__seg_ &= ~__m;
+    __n -= __dn;
+    ++__first.__seg_;
+  }
+  // do middle whole words
+  __storage_type __nw = __n / __bits_per_word;
+  std::__fill_n(std::__to_address(__first.__seg_), __nw, _FillVal ? static_cast<__storage_type>(-1) : 0);
+  __n -= __nw * __bits_per_word;
+  // do last partial word
+  if (__n > 0) {
+    __first.__seg_ += __nw;
+    __storage_type __m = ~__storage_type(0) >> (__bits_per_word - __n);
+    if (_FillVal)
+      *__first.__seg_ |= __m;
+    else
+      *__first.__seg_ &= ~__m;
+  }
+}
+
+template <class _Cp, class _Size>
+inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __bit_iterator<_Cp, false>
+__fill_n(__bit_iterator<_Cp, false> __first, _Size __n, const bool& __value) {
+  if (__n > 0) {
+    if (__value)
+      std::__fill_n_bool<true>(__first, __n);
+    else
+      std::__fill_n_bool<false>(__first, __n);
+  }
+  return __first + __n;
+}
+
 template <class _OutputIterator, class _Size, class _Tp>
 inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _OutputIterator
 __fill_n(_OutputIterator __first, _Size __n, const _Tp& __value) {
@@ -37,4 +93,6 @@ fill_n(_OutputIterator __first, _Size __n, const _Tp& __value) {
 
 _LIBCPP_END_NAMESPACE_STD
 
+_LIBCPP_POP_MACROS
+
 #endif // _LIBCPP___ALGORITHM_FILL_N_H
diff --git a/libcxx/include/__algorithm/inplace_merge.h b/libcxx/include/__algorithm/inplace_merge.h
index eb3c0bdbc2db7f..a6bcc66a2fa47a 100644
--- a/libcxx/include/__algorithm/inplace_merge.h
+++ b/libcxx/include/__algorithm/inplace_merge.h
@@ -114,8 +114,8 @@ _LIBCPP_HIDE_FROM_ABI void __buffered_inplace_merge(
     for (_BidirectionalIterator __i = __middle; __i != __last;
          __d.template __incr<value_type>(), (void)++__i, (void)++__p)
       ::new ((void*)__p) value_type(_IterOps<_AlgPolicy>::__iter_move(__i));
-    typedef __unconstrained_reverse_iterator<_BidirectionalIterator> _RBi;
-    typedef __unconstrained_reverse_iterator<value_type*> _Rv;
+    typedef reverse_iterator<_BidirectionalIterator> _RBi;
+    typedef reverse_iterator<value_type*> _Rv;
     typedef __invert<_Compare> _Inverted;
     std::__half_inplace_merge<_AlgPolicy>(
         _Rv(__p), _Rv(__buff), _RBi(__middle), _RBi(__first), _RBi(__last), _Inverted(__comp));
diff --git a/libcxx/include/__algorithm/lexicographical_compare_three_way.h b/libcxx/include/__algorithm/lexicographical_compare_three_way.h
index 32de97d07a1319..50ebdc647a97ab 100644
--- a/libcxx/include/__algorithm/lexicographical_compare_three_way.h
+++ b/libcxx/include/__algorithm/lexicographical_compare_three_way.h
@@ -17,7 +17,7 @@
 #include <__config>
 #include <__iterator/iterator_traits.h>
 #include <__type_traits/common_type.h>
-#include <__type_traits/is_copy_constructible.h>
+#include <__type_traits/is_constructible.h>
 #include <__utility/move.h>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
diff --git a/libcxx/include/__algorithm/move.h b/libcxx/include/__algorithm/move.h
index dba6d487fff77e..bf574b5274093d 100644
--- a/libcxx/include/__algorithm/move.h
+++ b/libcxx/include/__algorithm/move.h
@@ -16,7 +16,7 @@
 #include <__config>
 #include <__iterator/segmented_iterator.h>
 #include <__type_traits/common_type.h>
-#include <__type_traits/is_copy_constructible.h>
+#include <__type_traits/is_constructible.h>
 #include <__utility/move.h>
 #include <__utility/pair.h>
 
diff --git a/libcxx/include/__algorithm/move_backward.h b/libcxx/include/__algorithm/move_backward.h
index aeedf4241dce9c..6bb7c91d66c7ce 100644
--- a/libcxx/include/__algorithm/move_backward.h
+++ b/libcxx/include/__algorithm/move_backward.h
@@ -15,7 +15,7 @@
 #include <__config>
 #include <__iterator/segmented_iterator.h>
 #include <__type_traits/common_type.h>
-#include <__type_traits/is_copy_constructible.h>
+#include <__type_traits/is_constructible.h>
 #include <__utility/move.h>
 #include <__utility/pair.h>
 
diff --git a/libcxx/include/__algorithm/partial_sort.h b/libcxx/include/__algorithm/partial_sort.h
index 85a8fdc77aa228..7f8d0c49147e3a 100644
--- a/libcxx/include/__algorithm/partial_sort.h
+++ b/libcxx/include/__algorithm/partial_sort.h
@@ -18,8 +18,8 @@
 #include <__config>
 #include <__debug_utils/randomize_range.h>
 #include <__iterator/iterator_traits.h>
-#include <__type_traits/is_copy_assignable.h>
-#include <__type_traits/is_copy_constructible.h>
+#include <__type_traits/is_assignable.h>
+#include <__type_traits/is_constructible.h>
 #include <__utility/move.h>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
diff --git a/libcxx/include/__algorithm/pop_heap.h b/libcxx/include/__algorithm/pop_heap.h
index 798a1d09934bc3..6d23830097ff96 100644
--- a/libcxx/include/__algorithm/pop_heap.h
+++ b/libcxx/include/__algorithm/pop_heap.h
@@ -17,8 +17,8 @@
 #include <__assert>
 #include <__config>
 #include <__iterator/iterator_traits.h>
-#include <__type_traits/is_copy_assignable.h>
-#include <__type_traits/is_copy_constructible.h>
+#include <__type_traits/is_assignable.h>
+#include <__type_traits/is_constructible.h>
 #include <__utility/move.h>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
diff --git a/libcxx/include/__algorithm/push_heap.h b/libcxx/include/__algorithm/push_heap.h
index 7d8720e3a93d44..ec0b445f2b70f3 100644
--- a/libcxx/include/__algorithm/push_heap.h
+++ b/libcxx/include/__algorithm/push_heap.h
@@ -14,8 +14,8 @@
 #include <__algorithm/iterator_operations.h>
 #include <__config>
 #include <__iterator/iterator_traits.h>
-#include <__type_traits/is_copy_assignable.h>
-#include <__type_traits/is_copy_constructible.h>
+#include <__type_traits/is_assignable.h>
+#include <__type_traits/is_constructible.h>
 #include <__utility/move.h>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
diff --git a/libcxx/include/__algorithm/rotate.h b/libcxx/include/__algorithm/rotate.h
index 9a4d07883e320f..df4ca95aac95bc 100644
--- a/libcxx/include/__algorithm/rotate.h
+++ b/libcxx/include/__algorithm/rotate.h
@@ -15,7 +15,7 @@
 #include <__algorithm/swap_ranges.h>
 #include <__config>
 #include <__iterator/iterator_traits.h>
-#include <__type_traits/is_trivially_move_assignable.h>
+#include <__type_traits/is_trivially_assignable.h>
 #include <__utility/move.h>
 #include <__utility/pair.h>
 
diff --git a/libcxx/include/__algorithm/sort_heap.h b/libcxx/include/__algorithm/sort_heap.h
index 060fc33c3c6e93..f20b110c7fd12e 100644
--- a/libcxx/include/__algorithm/sort_heap.h
+++ b/libcxx/include/__algorithm/sort_heap.h
@@ -16,8 +16,8 @@
 #include <__config>
 #include <__debug_utils/strict_weak_ordering_check.h>
 #include <__iterator/iterator_traits.h>
-#include <__type_traits/is_copy_assignable.h>
-#include <__type_traits/is_copy_constructible.h>
+#include <__type_traits/is_assignable.h>
+#include <__type_traits/is_constructible.h>
 #include <__utility/move.h>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
diff --git a/libcxx/include/__algorithm/stable_sort.h b/libcxx/include/__algorithm/stable_sort.h
index 9be192bd65a6ef..726e7e16b3564a 100644
--- a/libcxx/include/__algorithm/stable_sort.h
+++ b/libcxx/include/__algorithm/stable_sort.h
@@ -20,7 +20,7 @@
 #include <__memory/destruct_n.h>
 #include <__memory/temporary_buffer.h>
 #include <__memory/unique_ptr.h>
-#include <__type_traits/is_trivially_copy_assignable.h>
+#include <__type_traits/is_trivially_assignable.h>
 #include <__utility/move.h>
 #include <__utility/pair.h>
 #include <new>
diff --git a/libcxx/include/__algorithm/unwrap_iter.h b/libcxx/include/__algorithm/unwrap_iter.h
index 50d815c9708849..8cc0d22d4fc211 100644
--- a/libcxx/include/__algorithm/unwrap_iter.h
+++ b/libcxx/include/__algorithm/unwrap_iter.h
@@ -13,7 +13,7 @@
 #include <__iterator/iterator_traits.h>
 #include <__memory/pointer_traits.h>
 #include <__type_traits/enable_if.h>
-#include <__type_traits/is_copy_constructible.h>
+#include <__type_traits/is_constructible.h>
 #include <__utility/declval.h>
 #include <__utility/move.h>
 
diff --git a/libcxx/include/__algorithm/upper_bound.h b/libcxx/include/__algorithm/upper_bound.h
index f499f7a80aa6d9..9c7d8fbcde07b5 100644
--- a/libcxx/include/__algorithm/upper_bound.h
+++ b/libcxx/include/__algorithm/upper_bound.h
@@ -18,7 +18,7 @@
 #include <__iterator/advance.h>
 #include <__iterator/distance.h>
 #include <__iterator/iterator_traits.h>
-#include <__type_traits/is_copy_constructible.h>
+#include <__type_traits/is_constructible.h>
 #include <__utility/move.h>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
diff --git a/libcxx/include/__atomic/atomic_base.h b/libcxx/include/__atomic/atomic_base.h
index 6ca01a7f1bf9b9..e9badccc25a620 100644
--- a/libcxx/include/__atomic/atomic_base.h
+++ b/libcxx/include/__atomic/atomic_base.h
@@ -18,7 +18,7 @@
 #include <__config>
 #include <__memory/addressof.h>
 #include <__type_traits/is_integral.h>
-#include <__type_traits/is_nothrow_default_constructible.h>
+#include <__type_traits/is_nothrow_constructible.h>
 #include <__type_traits/is_same.h>
 #include <version>
 
diff --git a/libcxx/include/__bit_reference b/libcxx/include/__bit_reference
index 3a5339b72ddc31..9579b9eaf70bbd 100644
--- a/libcxx/include/__bit_reference
+++ b/libcxx/include/__bit_reference
@@ -171,61 +171,6 @@ private:
   __bit_const_reference& operator=(const __bit_const_reference&) = delete;
 };
 
-// fill_n
-
-template <bool _FillVal, class _Cp>
-_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void
-__fill_n(__bit_iterator<_Cp, false> __first, typename _Cp::size_type __n) {
-  using _It            = __bit_iterator<_Cp, false>;
-  using __storage_type = typename _It::__storage_type;
-
-  const int __bits_per_word = _It::__bits_per_word;
-  // do first partial word
-  if (__first.__ctz_ != 0) {
-    __storage_type __clz_f = static_cast<__storage_type>(__bits_per_word - __first.__ctz_);
-    __storage_type __dn    = std::min(__clz_f, __n);
-    __storage_type __m     = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz_f - __dn));
-    if (_FillVal)
-      *__first.__seg_ |= __m;
-    else
-      *__first.__seg_ &= ~__m;
-    __n -= __dn;
-    ++__first.__seg_;
-  }
-  // do middle whole words
-  __storage_type __nw = __n / __bits_per_word;
-  std::fill_n(std::__to_address(__first.__seg_), __nw, _FillVal ? static_cast<__storage_type>(-1) : 0);
-  __n -= __nw * __bits_per_word;
-  // do last partial word
-  if (__n > 0) {
-    __first.__seg_ += __nw;
-    __storage_type __m = ~__storage_type(0) >> (__bits_per_word - __n);
-    if (_FillVal)
-      *__first.__seg_ |= __m;
-    else
-      *__first.__seg_ &= ~__m;
-  }
-}
-
-template <class _Cp>
-inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void
-fill_n(__bit_iterator<_Cp, false> __first, typename _Cp::size_type __n, bool __value) {
-  if (__n > 0) {
-    if (__value)
-      std::__fill_n<true>(__first, __n);
-    else
-      std::__fill_n<false>(__first, __n);
-  }
-}
-
-// fill
-
-template <class _Cp>
-inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void
-fill(__bit_iterator<_Cp, false> __first, __bit_iterator<_Cp, false> __last, bool __value) {
-  std::fill_n(__first, static_cast<typename _Cp::size_type>(__last - __first), __value);
-}
-
 // copy
 
 template <class _Cp, bool _IsConst>
@@ -1007,8 +952,10 @@ private:
   friend class __bit_iterator<_Cp, true>;
   template <class _Dp>
   friend struct __bit_array;
+
   template <bool _FillVal, class _Dp>
-  _LIBCPP_CONSTEXPR_SINCE_CXX20 friend void __fill_n(__bit_iterator<_Dp, false> __first, typename _Dp::size_type __n);
+  _LIBCPP_CONSTEXPR_SINCE_CXX20 friend void
+  __fill_n_bool(__bit_iterator<_Dp, false> __first, typename _Dp::size_type __n);
 
   template <class _Dp, bool _IC>
   _LIBCPP_CONSTEXPR_SINCE_CXX20 friend __bit_iterator<_Dp, false> __copy_aligned(
diff --git a/libcxx/include/__compare/partial_order.h b/libcxx/include/__compare/partial_order.h
index f3ed4900fbff25..1d2fae63e5f248 100644
--- a/libcxx/include/__compare/partial_order.h
+++ b/libcxx/include/__compare/partial_order.h
@@ -28,6 +28,8 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 
 // [cmp.alg]
 namespace __partial_order {
+void partial_order() = delete;
+
 struct __fn {
   // NOLINTBEGIN(libcpp-robust-against-adl) partial_order should use ADL, but only here
   template <class _Tp, class _Up>
diff --git a/libcxx/include/__compare/strong_order.h b/libcxx/include/__compare/strong_order.h
index 3dc819e642515c..8c363b56382221 100644
--- a/libcxx/include/__compare/strong_order.h
+++ b/libcxx/include/__compare/strong_order.h
@@ -37,6 +37,8 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 
 // [cmp.alg]
 namespace __strong_order {
+void strong_order() = delete;
+
 struct __fn {
   // NOLINTBEGIN(libcpp-robust-against-adl) strong_order should use ADL, but only here
   template <class _Tp, class _Up>
diff --git a/libcxx/include/__compare/weak_order.h b/libcxx/include/__compare/weak_order.h
index b82a708c29a146..1a3e85feb233b3 100644
--- a/libcxx/include/__compare/weak_order.h
+++ b/libcxx/include/__compare/weak_order.h
@@ -30,6 +30,8 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 
 // [cmp.alg]
 namespace __weak_order {
+void weak_order() = delete;
+
 struct __fn {
   // NOLINTBEGIN(libcpp-robust-against-adl) weak_order should use ADL, but only here
   template <class _Tp, class _Up>
diff --git a/libcxx/include/__concepts/class_or_enum.h b/libcxx/include/__concepts/class_or_enum.h
index c1b4a8c258f3a5..2739e31e14ba65 100644
--- a/libcxx/include/__concepts/class_or_enum.h
+++ b/libcxx/include/__concepts/class_or_enum.h
@@ -28,11 +28,6 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 template <class _Tp>
 concept __class_or_enum = is_class_v<_Tp> || is_union_v<_Tp> || is_enum_v<_Tp>;
 
-// Work around Clang bug https://llvm.org/PR52970
-// TODO: remove this workaround once libc++ no longer has to support Clang 13 (it was fixed in Clang 14).
-template <class _Tp>
-concept __workaround_52970 = is_class_v<__remove_cvref_t<_Tp>> || is_union_v<__remove_cvref_t<_Tp>>;
-
 #endif // _LIBCPP_STD_VER >= 20
 
 _LIBCPP_END_NAMESPACE_STD
diff --git a/libcxx/include/__concepts/swappable.h b/libcxx/include/__concepts/swappable.h
index 1337dc49d75b10..d339488a087a5c 100644
--- a/libcxx/include/__concepts/swappable.h
+++ b/libcxx/include/__concepts/swappable.h
@@ -15,8 +15,8 @@
 #include <__concepts/constructible.h>
 #include <__config>
 #include <__type_traits/extent.h>
-#include <__type_traits/is_nothrow_move_assignable.h>
-#include <__type_traits/is_nothrow_move_constructible.h>
+#include <__type_traits/is_nothrow_assignable.h>
+#include <__type_traits/is_nothrow_constructible.h>
 #include <__type_traits/remove_cvref.h>
 #include <__utility/exchange.h>
 #include <__utility/forward.h>
diff --git a/libcxx/include/__exception/nested_exception.h b/libcxx/include/__exception/nested_exception.h
index 417db54e6eaacd..1bf2df939258a6 100644
--- a/libcxx/include/__exception/nested_exception.h
+++ b/libcxx/include/__exception/nested_exception.h
@@ -15,8 +15,8 @@
 #include <__type_traits/decay.h>
 #include <__type_traits/is_base_of.h>
 #include <__type_traits/is_class.h>
+#include <__type_traits/is_constructible.h>
 #include <__type_traits/is_convertible.h>
-#include <__type_traits/is_copy_constructible.h>
 #include <__type_traits/is_final.h>
 #include <__type_traits/is_polymorphic.h>
 #include <__utility/forward.h>
diff --git a/libcxx/include/__expected/expected.h b/libcxx/include/__expected/expected.h
index 443d9257dc598d..d7adaac7567b2f 100644
--- a/libcxx/include/__expected/expected.h
+++ b/libcxx/include/__expected/expected.h
@@ -23,24 +23,14 @@
 #include <__type_traits/is_assignable.h>
 #include <__type_traits/is_constructible.h>
 #include <__type_traits/is_convertible.h>
-#include <__type_traits/is_copy_assignable.h>
-#include <__type_traits/is_copy_constructible.h>
-#include <__type_traits/is_default_constructible.h>
 #include <__type_traits/is_function.h>
-#include <__type_traits/is_move_assignable.h>
-#include <__type_traits/is_move_constructible.h>
+#include <__type_traits/is_nothrow_assignable.h>
 #include <__type_traits/is_nothrow_constructible.h>
-#include <__type_traits/is_nothrow_copy_assignable.h>
-#include <__type_traits/is_nothrow_copy_constructible.h>
-#include <__type_traits/is_nothrow_default_constructible.h>
-#include <__type_traits/is_nothrow_move_assignable.h>
-#include <__type_traits/is_nothrow_move_constructible.h>
 #include <__type_traits/is_reference.h>
 #include <__type_traits/is_same.h>
 #include <__type_traits/is_swappable.h>
-#include <__type_traits/is_trivially_copy_constructible.h>
+#include <__type_traits/is_trivially_constructible.h>
 #include <__type_traits/is_trivially_destructible.h>
-#include <__type_traits/is_trivially_move_constructible.h>
 #include <__type_traits/is_void.h>
 #include <__type_traits/lazy.h>
 #include <__type_traits/negation.h>
diff --git a/libcxx/include/__format/format_arg_store.h b/libcxx/include/__format/format_arg_store.h
index 066cd369eb8918..23a599e9957599 100644
--- a/libcxx/include/__format/format_arg_store.h
+++ b/libcxx/include/__format/format_arg_store.h
@@ -151,7 +151,7 @@ consteval __arg_t __determine_arg_t() {
 // The overload for not formattable types allows triggering the static
 // assertion below.
 template <class _Context, class _Tp>
-  requires(!__formattable<_Tp, typename _Context::char_type>)
+  requires(!__formattable_with<_Tp, _Context>)
 consteval __arg_t __determine_arg_t() {
   return __arg_t::__none;
 }
@@ -165,7 +165,6 @@ _LIBCPP_HIDE_FROM_ABI basic_format_arg<_Context> __create_format_arg(_Tp& __valu
   using _Dp               = remove_const_t<_Tp>;
   constexpr __arg_t __arg = __determine_arg_t<_Context, _Dp>();
   static_assert(__arg != __arg_t::__none, "the supplied type is not formattable");
-
   static_assert(__formattable_with<_Tp, _Context>);
 
   // Not all types can be used to directly initialize the
diff --git a/libcxx/include/__format/format_context.h b/libcxx/include/__format/format_context.h
index d131e942aca60b..bf603c5c62d9db 100644
--- a/libcxx/include/__format/format_context.h
+++ b/libcxx/include/__format/format_context.h
@@ -27,7 +27,7 @@
 #include <cstddef>
 
 #ifndef _LIBCPP_HAS_NO_LOCALIZATION
-#  include <locale>
+#  include <__locale>
 #  include <optional>
 #endif
 
diff --git a/libcxx/include/__format/format_functions.h b/libcxx/include/__format/format_functions.h
index 3ee53539f4ee6c..c7810140105a07 100644
--- a/libcxx/include/__format/format_functions.h
+++ b/libcxx/include/__format/format_functions.h
@@ -41,7 +41,7 @@
 #include <string_view>
 
 #ifndef _LIBCPP_HAS_NO_LOCALIZATION
-#  include <locale>
+#  include <__locale>
 #endif
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
diff --git a/libcxx/include/__format/formatter_floating_point.h b/libcxx/include/__format/formatter_floating_point.h
index f01d323efff5fc..1d94cc349c0dd6 100644
--- a/libcxx/include/__format/formatter_floating_point.h
+++ b/libcxx/include/__format/formatter_floating_point.h
@@ -39,7 +39,7 @@
 #include <cstddef>
 
 #ifndef _LIBCPP_HAS_NO_LOCALIZATION
-#  include <locale>
+#  include <__locale>
 #endif
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
diff --git a/libcxx/include/__functional/bind_front.h b/libcxx/include/__functional/bind_front.h
index 30dda533615b28..11d0bac3f839ce 100644
--- a/libcxx/include/__functional/bind_front.h
+++ b/libcxx/include/__functional/bind_front.h
@@ -17,7 +17,6 @@
 #include <__type_traits/decay.h>
 #include <__type_traits/enable_if.h>
 #include <__type_traits/is_constructible.h>
-#include <__type_traits/is_move_constructible.h>
 #include <__utility/forward.h>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
diff --git a/libcxx/include/__functional/function.h b/libcxx/include/__functional/function.h
index 416c26a0c73f2e..1faa9e92ebd63e 100644
--- a/libcxx/include/__functional/function.h
+++ b/libcxx/include/__functional/function.h
@@ -28,7 +28,7 @@
 #include <__type_traits/decay.h>
 #include <__type_traits/is_core_convertible.h>
 #include <__type_traits/is_scalar.h>
-#include <__type_traits/is_trivially_copy_constructible.h>
+#include <__type_traits/is_trivially_constructible.h>
 #include <__type_traits/is_trivially_destructible.h>
 #include <__type_traits/is_void.h>
 #include <__type_traits/strip_signature.h>
@@ -768,7 +768,7 @@ class __func<_Rp1 (^)(_ArgTypes1...), _Alloc, _Rp(_ArgTypes...)> : public __base
   {
   }
 
-  virtual __base<_Rp(_ArgTypes...)>* __clone() const {
+  _LIBCPP_HIDE_FROM_ABI_VIRTUAL virtual __base<_Rp(_ArgTypes...)>* __clone() const {
     _LIBCPP_ASSERT_INTERNAL(
         false,
         "Block pointers are just pointers, so they should always fit into "
@@ -777,9 +777,11 @@ class __func<_Rp1 (^)(_ArgTypes1...), _Alloc, _Rp(_ArgTypes...)> : public __base
     return nullptr;
   }
 
-  virtual void __clone(__base<_Rp(_ArgTypes...)>* __p) const { ::new ((void*)__p) __func(__f_); }
+  _LIBCPP_HIDE_FROM_ABI_VIRTUAL virtual void __clone(__base<_Rp(_ArgTypes...)>* __p) const {
+    ::new ((void*)__p) __func(__f_);
+  }
 
-  virtual void destroy() _NOEXCEPT {
+  _LIBCPP_HIDE_FROM_ABI_VIRTUAL virtual void destroy() _NOEXCEPT {
 #    ifndef _LIBCPP_HAS_OBJC_ARC
     if (__f_)
       _Block_release(__f_);
@@ -787,7 +789,7 @@ class __func<_Rp1 (^)(_ArgTypes1...), _Alloc, _Rp(_ArgTypes...)> : public __base
     __f_ = 0;
   }
 
-  virtual void destroy_deallocate() _NOEXCEPT {
+  _LIBCPP_HIDE_FROM_ABI_VIRTUAL virtual void destroy_deallocate() _NOEXCEPT {
     _LIBCPP_ASSERT_INTERNAL(
         false,
         "Block pointers are just pointers, so they should always fit into "
@@ -795,16 +797,20 @@ class __func<_Rp1 (^)(_ArgTypes1...), _Alloc, _Rp(_ArgTypes...)> : public __base
         "never be invoked.");
   }
 
-  virtual _Rp operator()(_ArgTypes&&... __arg) { return std::__invoke(__f_, std::forward<_ArgTypes>(__arg)...); }
+  _LIBCPP_HIDE_FROM_ABI_VIRTUAL virtual _Rp operator()(_ArgTypes&&... __arg) {
+    return std::__invoke(__f_, std::forward<_ArgTypes>(__arg)...);
+  }
 
 #    ifndef _LIBCPP_HAS_NO_RTTI
-  virtual const void* target(type_info const& __ti) const _NOEXCEPT {
+  _LIBCPP_HIDE_FROM_ABI_VIRTUAL virtual const void* target(type_info const& __ti) const _NOEXCEPT {
     if (__ti == typeid(__func::__block_type))
       return &__f_;
     return (const void*)nullptr;
   }
 
-  virtual const std::type_info& target_type() const _NOEXCEPT { return typeid(__func::__block_type); }
+  _LIBCPP_HIDE_FROM_ABI_VIRTUAL virtual const std::type_info& target_type() const _NOEXCEPT {
+    return typeid(__func::__block_type);
+  }
 #    endif // _LIBCPP_HAS_NO_RTTI
 };
 
diff --git a/libcxx/include/__functional/hash.h b/libcxx/include/__functional/hash.h
index a466c837038f8d..a9e450edd39f53 100644
--- a/libcxx/include/__functional/hash.h
+++ b/libcxx/include/__functional/hash.h
@@ -10,23 +10,18 @@
 #define _LIBCPP___FUNCTIONAL_HASH_H
 
 #include <__config>
-#include <__functional/invoke.h>
 #include <__functional/unary_function.h>
 #include <__fwd/functional.h>
-#include <__tuple/sfinae_helpers.h>
-#include <__type_traits/is_copy_constructible.h>
-#include <__type_traits/is_default_constructible.h>
+#include <__type_traits/conjunction.h>
+#include <__type_traits/invoke.h>
+#include <__type_traits/is_constructible.h>
 #include <__type_traits/is_enum.h>
-#include <__type_traits/is_move_constructible.h>
 #include <__type_traits/underlying_type.h>
-#include <__utility/forward.h>
-#include <__utility/move.h>
 #include <__utility/pair.h>
 #include <__utility/swap.h>
 #include <cstddef>
 #include <cstdint>
 #include <cstring>
-#include <limits>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
 #  pragma GCC system_header
diff --git a/libcxx/include/__functional/not_fn.h b/libcxx/include/__functional/not_fn.h
index 23a491c135d79d..4b3ce5524a7434 100644
--- a/libcxx/include/__functional/not_fn.h
+++ b/libcxx/include/__functional/not_fn.h
@@ -16,7 +16,6 @@
 #include <__type_traits/decay.h>
 #include <__type_traits/enable_if.h>
 #include <__type_traits/is_constructible.h>
-#include <__type_traits/is_move_constructible.h>
 #include <__utility/forward.h>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
diff --git a/libcxx/include/__hash_table b/libcxx/include/__hash_table
index e6691e78a267f8..a705117d0173f9 100644
--- a/libcxx/include/__hash_table
+++ b/libcxx/include/__hash_table
@@ -28,12 +28,9 @@
 #include <__type_traits/can_extract_key.h>
 #include <__type_traits/conditional.h>
 #include <__type_traits/is_const.h>
-#include <__type_traits/is_copy_constructible.h>
+#include <__type_traits/is_constructible.h>
+#include <__type_traits/is_nothrow_assignable.h>
 #include <__type_traits/is_nothrow_constructible.h>
-#include <__type_traits/is_nothrow_copy_constructible.h>
-#include <__type_traits/is_nothrow_default_constructible.h>
-#include <__type_traits/is_nothrow_move_assignable.h>
-#include <__type_traits/is_nothrow_move_constructible.h>
 #include <__type_traits/is_pointer.h>
 #include <__type_traits/is_reference.h>
 #include <__type_traits/is_swappable.h>
diff --git a/libcxx/include/__iterator/cpp17_iterator_concepts.h b/libcxx/include/__iterator/cpp17_iterator_concepts.h
index d1ad2b4e284808..cdb561e68452af 100644
--- a/libcxx/include/__iterator/cpp17_iterator_concepts.h
+++ b/libcxx/include/__iterator/cpp17_iterator_concepts.h
@@ -14,10 +14,8 @@
 #include <__concepts/same_as.h>
 #include <__config>
 #include <__iterator/iterator_traits.h>
+#include <__type_traits/is_constructible.h>
 #include <__type_traits/is_convertible.h>
-#include <__type_traits/is_copy_constructible.h>
-#include <__type_traits/is_default_constructible.h>
-#include <__type_traits/is_move_constructible.h>
 #include <__type_traits/is_signed.h>
 #include <__type_traits/is_void.h>
 #include <__utility/as_const.h>
diff --git a/libcxx/include/__iterator/iter_move.h b/libcxx/include/__iterator/iter_move.h
index 202b94cccc5ac6..ba8aed3c0ffbbd 100644
--- a/libcxx/include/__iterator/iter_move.h
+++ b/libcxx/include/__iterator/iter_move.h
@@ -35,7 +35,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 namespace ranges {
 namespace __iter_move {
 
-void iter_move();
+void iter_move() = delete;
 
 template <class _Tp>
 concept __unqualified_iter_move = __class_or_enum<remove_cvref_t<_Tp>> && requires(_Tp&& __t) {
diff --git a/libcxx/include/__iterator/iterator_traits.h b/libcxx/include/__iterator/iterator_traits.h
index 2cd82525ba0639..11af9e301842cf 100644
--- a/libcxx/include/__iterator/iterator_traits.h
+++ b/libcxx/include/__iterator/iterator_traits.h
@@ -21,7 +21,6 @@
 #include <__fwd/pair.h>
 #include <__iterator/incrementable_traits.h>
 #include <__iterator/readable_traits.h>
-#include <__type_traits/add_const.h>
 #include <__type_traits/common_reference.h>
 #include <__type_traits/conditional.h>
 #include <__type_traits/disjunction.h>
@@ -493,8 +492,8 @@ using __iter_mapped_type = typename iterator_traits<_InputIterator>::value_type:
 
 template <class _InputIterator>
 using __iter_to_alloc_type =
-    pair< typename add_const<typename iterator_traits<_InputIterator>::value_type::first_type>::type,
-          typename iterator_traits<_InputIterator>::value_type::second_type>;
+    pair<const typename iterator_traits<_InputIterator>::value_type::first_type,
+         typename iterator_traits<_InputIterator>::value_type::second_type>;
 
 template <class _Iter>
 using __iterator_category_type = typename iterator_traits<_Iter>::iterator_category;
diff --git a/libcxx/include/__iterator/ranges_iterator_traits.h b/libcxx/include/__iterator/ranges_iterator_traits.h
index a30864199df76b..859e7082048ac1 100644
--- a/libcxx/include/__iterator/ranges_iterator_traits.h
+++ b/libcxx/include/__iterator/ranges_iterator_traits.h
@@ -13,7 +13,6 @@
 #include <__config>
 #include <__fwd/pair.h>
 #include <__ranges/concepts.h>
-#include <__type_traits/add_const.h>
 #include <__type_traits/remove_const.h>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
@@ -32,8 +31,7 @@ using __range_mapped_type = typename ranges::range_value_t<_Range>::second_type;
 
 template <ranges::input_range _Range>
 using __range_to_alloc_type =
-    pair<add_const_t<typename ranges::range_value_t<_Range>::first_type>,
-         typename ranges::range_value_t<_Range>::second_type>;
+    pair<const typename ranges::range_value_t<_Range>::first_type, typename ranges::range_value_t<_Range>::second_type>;
 
 #endif
 
diff --git a/libcxx/include/__iterator/reverse_iterator.h b/libcxx/include/__iterator/reverse_iterator.h
index 5900b1c5ac154e..2ae14619348536 100644
--- a/libcxx/include/__iterator/reverse_iterator.h
+++ b/libcxx/include/__iterator/reverse_iterator.h
@@ -34,7 +34,7 @@
 #include <__type_traits/enable_if.h>
 #include <__type_traits/is_assignable.h>
 #include <__type_traits/is_convertible.h>
-#include <__type_traits/is_nothrow_copy_constructible.h>
+#include <__type_traits/is_nothrow_constructible.h>
 #include <__type_traits/is_pointer.h>
 #include <__type_traits/is_same.h>
 #include <__utility/declval.h>
@@ -316,172 +316,6 @@ inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 reverse_iterator<_Ite
 }
 #endif
 
-#if _LIBCPP_STD_VER <= 17
-template <class _Iter>
-using __unconstrained_reverse_iterator = reverse_iterator<_Iter>;
-#else
-
-// __unconstrained_reverse_iterator allows us to use reverse iterators in the implementation of algorithms by working
-// around a language issue in C++20.
-// In C++20, when a reverse iterator wraps certain C++20-hostile iterators, calling comparison operators on it will
-// result in a compilation error. However, calling comparison operators on the pristine hostile iterator is not
-// an error. Thus, we cannot use reverse_iterators in the implementation of an algorithm that accepts a
-// C++20-hostile iterator. This class is an internal workaround -- it is a copy of reverse_iterator with
-// tweaks to make it support hostile iterators.
-//
-// A C++20-hostile iterator is one that defines a comparison operator where one of the arguments is an exact match
-// and the other requires an implicit conversion, for example:
-//   friend bool operator==(const BaseIter&, const DerivedIter&);
-//
-// C++20 rules for rewriting equality operators create another overload of this function with parameters reversed:
-//   friend bool operator==(const DerivedIter&, const BaseIter&);
-//
-// This creates an ambiguity in overload resolution.
-//
-// Clang treats this ambiguity differently in different contexts. When operator== is actually called in the function
-// body, the code is accepted with a warning. When a concept requires operator== to be a valid expression, however,
-// it evaluates to false. Thus, the implementation of reverse_iterator::operator== can actually call operator== on its
-// base iterators, but the constraints on reverse_iterator::operator== prevent it from being considered during overload
-// resolution. This class simply removes the problematic constraints from comparison functions.
-template <class _Iter>
-class __unconstrained_reverse_iterator {
-  _Iter __iter_;
-
-public:
-  static_assert(__has_bidirectional_iterator_category<_Iter>::value || bidirectional_iterator<_Iter>);
-
-  using iterator_type = _Iter;
-  using iterator_category =
-      _If<__has_random_access_iterator_category<_Iter>::value,
-          random_access_iterator_tag,
-          __iterator_category_type<_Iter>>;
-  using pointer         = __iterator_pointer_type<_Iter>;
-  using value_type      = iter_value_t<_Iter>;
-  using difference_type = iter_difference_t<_Iter>;
-  using reference       = iter_reference_t<_Iter>;
-
-  _LIBCPP_HIDE_FROM_ABI constexpr __unconstrained_reverse_iterator()                                        = default;
-  _LIBCPP_HIDE_FROM_ABI constexpr __unconstrained_reverse_iterator(const __unconstrained_reverse_iterator&) = default;
-  _LIBCPP_HIDE_FROM_ABI constexpr explicit __unconstrained_reverse_iterator(_Iter __iter) : __iter_(__iter) {}
-
-  _LIBCPP_HIDE_FROM_ABI constexpr _Iter base() const { return __iter_; }
-  _LIBCPP_HIDE_FROM_ABI constexpr reference operator*() const {
-    auto __tmp = __iter_;
-    return *--__tmp;
-  }
-
-  _LIBCPP_HIDE_FROM_ABI constexpr pointer operator->() const {
-    if constexpr (is_pointer_v<_Iter>) {
-      return std::prev(__iter_);
-    } else {
-      return std::prev(__iter_).operator->();
-    }
-  }
-
-  _LIBCPP_HIDE_FROM_ABI friend constexpr iter_rvalue_reference_t<_Iter>
-  iter_move(const __unconstrained_reverse_iterator& __i) noexcept(
-      is_nothrow_copy_constructible_v<_Iter>&& noexcept(ranges::iter_move(--std::declval<_Iter&>()))) {
-    auto __tmp = __i.base();
-    return ranges::iter_move(--__tmp);
-  }
-
-  _LIBCPP_HIDE_FROM_ABI constexpr __unconstrained_reverse_iterator& operator++() {
-    --__iter_;
-    return *this;
-  }
-
-  _LIBCPP_HIDE_FROM_ABI constexpr __unconstrained_reverse_iterator operator++(int) {
-    auto __tmp = *this;
-    --__iter_;
-    return __tmp;
-  }
-
-  _LIBCPP_HIDE_FROM_ABI constexpr __unconstrained_reverse_iterator& operator--() {
-    ++__iter_;
-    return *this;
-  }
-
-  _LIBCPP_HIDE_FROM_ABI constexpr __unconstrained_reverse_iterator operator--(int) {
-    auto __tmp = *this;
-    ++__iter_;
-    return __tmp;
-  }
-
-  _LIBCPP_HIDE_FROM_ABI constexpr __unconstrained_reverse_iterator& operator+=(difference_type __n) {
-    __iter_ -= __n;
-    return *this;
-  }
-
-  _LIBCPP_HIDE_FROM_ABI constexpr __unconstrained_reverse_iterator& operator-=(difference_type __n) {
-    __iter_ += __n;
-    return *this;
-  }
-
-  _LIBCPP_HIDE_FROM_ABI constexpr __unconstrained_reverse_iterator operator+(difference_type __n) const {
-    return __unconstrained_reverse_iterator(__iter_ - __n);
-  }
-
-  _LIBCPP_HIDE_FROM_ABI constexpr __unconstrained_reverse_iterator operator-(difference_type __n) const {
-    return __unconstrained_reverse_iterator(__iter_ + __n);
-  }
-
-  _LIBCPP_HIDE_FROM_ABI constexpr difference_type operator-(const __unconstrained_reverse_iterator& __other) const {
-    return __other.__iter_ - __iter_;
-  }
-
-  _LIBCPP_HIDE_FROM_ABI constexpr auto operator[](difference_type __n) const { return *(*this + __n); }
-
-  // Deliberately unconstrained unlike the comparison functions in `reverse_iterator` -- see the class comment for the
-  // rationale.
-  _LIBCPP_HIDE_FROM_ABI friend constexpr bool
-  operator==(const __unconstrained_reverse_iterator& __lhs, const __unconstrained_reverse_iterator& __rhs) {
-    return __lhs.base() == __rhs.base();
-  }
-
-  _LIBCPP_HIDE_FROM_ABI friend constexpr bool
-  operator!=(const __unconstrained_reverse_iterator& __lhs, const __unconstrained_reverse_iterator& __rhs) {
-    return __lhs.base() != __rhs.base();
-  }
-
-  _LIBCPP_HIDE_FROM_ABI friend constexpr bool
-  operator<(const __unconstrained_reverse_iterator& __lhs, const __unconstrained_reverse_iterator& __rhs) {
-    return __lhs.base() > __rhs.base();
-  }
-
-  _LIBCPP_HIDE_FROM_ABI friend constexpr bool
-  operator>(const __unconstrained_reverse_iterator& __lhs, const __unconstrained_reverse_iterator& __rhs) {
-    return __lhs.base() < __rhs.base();
-  }
-
-  _LIBCPP_HIDE_FROM_ABI friend constexpr bool
-  operator<=(const __unconstrained_reverse_iterator& __lhs, const __unconstrained_reverse_iterator& __rhs) {
-    return __lhs.base() >= __rhs.base();
-  }
-
-  _LIBCPP_HIDE_FROM_ABI friend constexpr bool
-  operator>=(const __unconstrained_reverse_iterator& __lhs, const __unconstrained_reverse_iterator& __rhs) {
-    return __lhs.base() <= __rhs.base();
-  }
-};
-
-#endif // _LIBCPP_STD_VER <= 17
-
-template <template <class> class _RevIter1, template <class> class _RevIter2, class _Iter>
-struct __unwrap_reverse_iter_impl {
-  using _UnwrappedIter  = decltype(__unwrap_iter_impl<_Iter>::__unwrap(std::declval<_Iter>()));
-  using _ReverseWrapper = _RevIter1<_RevIter2<_Iter> >;
-
-  static _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR _ReverseWrapper
-  __rewrap(_ReverseWrapper __orig_iter, _UnwrappedIter __unwrapped_iter) {
-    return _ReverseWrapper(
-        _RevIter2<_Iter>(__unwrap_iter_impl<_Iter>::__rewrap(__orig_iter.base().base(), __unwrapped_iter)));
-  }
-
-  static _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR _UnwrappedIter __unwrap(_ReverseWrapper __i) _NOEXCEPT {
-    return __unwrap_iter_impl<_Iter>::__unwrap(__i.base().base());
-  }
-};
-
 #if _LIBCPP_STD_VER >= 20
 template <ranges::bidirectional_range _Range>
 _LIBCPP_HIDE_FROM_ABI constexpr ranges::subrange<reverse_iterator<ranges::iterator_t<_Range>>,
@@ -493,24 +327,20 @@ __reverse_range(_Range&& __range) {
 #endif
 
 template <class _Iter, bool __b>
-struct __unwrap_iter_impl<reverse_iterator<reverse_iterator<_Iter> >, __b>
-    : __unwrap_reverse_iter_impl<reverse_iterator, reverse_iterator, _Iter> {};
-
-#if _LIBCPP_STD_VER >= 20
-
-template <class _Iter, bool __b>
-struct __unwrap_iter_impl<reverse_iterator<__unconstrained_reverse_iterator<_Iter>>, __b>
-    : __unwrap_reverse_iter_impl<reverse_iterator, __unconstrained_reverse_iterator, _Iter> {};
-
-template <class _Iter, bool __b>
-struct __unwrap_iter_impl<__unconstrained_reverse_iterator<reverse_iterator<_Iter>>, __b>
-    : __unwrap_reverse_iter_impl<__unconstrained_reverse_iterator, reverse_iterator, _Iter> {};
+struct __unwrap_iter_impl<reverse_iterator<reverse_iterator<_Iter> >, __b> {
+  using _UnwrappedIter  = decltype(__unwrap_iter_impl<_Iter>::__unwrap(std::declval<_Iter>()));
+  using _ReverseWrapper = reverse_iterator<reverse_iterator<_Iter> >;
 
-template <class _Iter, bool __b>
-struct __unwrap_iter_impl<__unconstrained_reverse_iterator<__unconstrained_reverse_iterator<_Iter>>, __b>
-    : __unwrap_reverse_iter_impl<__unconstrained_reverse_iterator, __unconstrained_reverse_iterator, _Iter> {};
+  static _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR _ReverseWrapper
+  __rewrap(_ReverseWrapper __orig_iter, _UnwrappedIter __unwrapped_iter) {
+    return _ReverseWrapper(
+        reverse_iterator<_Iter>(__unwrap_iter_impl<_Iter>::__rewrap(__orig_iter.base().base(), __unwrapped_iter)));
+  }
 
-#endif // _LIBCPP_STD_VER >= 20
+  static _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR _UnwrappedIter __unwrap(_ReverseWrapper __i) _NOEXCEPT {
+    return __unwrap_iter_impl<_Iter>::__unwrap(__i.base().base());
+  }
+};
 
 _LIBCPP_END_NAMESPACE_STD
 
diff --git a/libcxx/include/__mdspan/mdspan.h b/libcxx/include/__mdspan/mdspan.h
index 684828eb90ec7c..d9a0108d6d3507 100644
--- a/libcxx/include/__mdspan/mdspan.h
+++ b/libcxx/include/__mdspan/mdspan.h
@@ -27,7 +27,6 @@
 #include <__type_traits/is_array.h>
 #include <__type_traits/is_constructible.h>
 #include <__type_traits/is_convertible.h>
-#include <__type_traits/is_default_constructible.h>
 #include <__type_traits/is_nothrow_constructible.h>
 #include <__type_traits/is_pointer.h>
 #include <__type_traits/is_same.h>
diff --git a/libcxx/include/__memory/allocator_traits.h b/libcxx/include/__memory/allocator_traits.h
index e79512b2ed1468..efdaa5e4ed8a5f 100644
--- a/libcxx/include/__memory/allocator_traits.h
+++ b/libcxx/include/__memory/allocator_traits.h
@@ -14,9 +14,8 @@
 #include <__memory/construct_at.h>
 #include <__memory/pointer_traits.h>
 #include <__type_traits/enable_if.h>
-#include <__type_traits/is_copy_constructible.h>
+#include <__type_traits/is_constructible.h>
 #include <__type_traits/is_empty.h>
-#include <__type_traits/is_move_constructible.h>
 #include <__type_traits/make_unsigned.h>
 #include <__type_traits/remove_reference.h>
 #include <__type_traits/void_t.h>
diff --git a/libcxx/include/__memory/compressed_pair.h b/libcxx/include/__memory/compressed_pair.h
index 373e131a7877da..328849d7cc12d8 100644
--- a/libcxx/include/__memory/compressed_pair.h
+++ b/libcxx/include/__memory/compressed_pair.h
@@ -16,7 +16,7 @@
 #include <__type_traits/decay.h>
 #include <__type_traits/dependent_type.h>
 #include <__type_traits/enable_if.h>
-#include <__type_traits/is_default_constructible.h>
+#include <__type_traits/is_constructible.h>
 #include <__type_traits/is_empty.h>
 #include <__type_traits/is_final.h>
 #include <__type_traits/is_same.h>
diff --git a/libcxx/include/__memory/shared_ptr.h b/libcxx/include/__memory/shared_ptr.h
index d90e1449678e5c..794a794d8fd85a 100644
--- a/libcxx/include/__memory/shared_ptr.h
+++ b/libcxx/include/__memory/shared_ptr.h
@@ -37,8 +37,8 @@
 #include <__type_traits/disjunction.h>
 #include <__type_traits/is_array.h>
 #include <__type_traits/is_bounded_array.h>
+#include <__type_traits/is_constructible.h>
 #include <__type_traits/is_convertible.h>
-#include <__type_traits/is_move_constructible.h>
 #include <__type_traits/is_reference.h>
 #include <__type_traits/is_unbounded_array.h>
 #include <__type_traits/nat.h>
diff --git a/libcxx/include/__memory/uninitialized_algorithms.h b/libcxx/include/__memory/uninitialized_algorithms.h
index 52cce1cf7c7c6a..7475ef5cf85def 100644
--- a/libcxx/include/__memory/uninitialized_algorithms.h
+++ b/libcxx/include/__memory/uninitialized_algorithms.h
@@ -25,10 +25,8 @@
 #include <__type_traits/extent.h>
 #include <__type_traits/is_array.h>
 #include <__type_traits/is_constant_evaluated.h>
-#include <__type_traits/is_trivially_copy_assignable.h>
-#include <__type_traits/is_trivially_copy_constructible.h>
-#include <__type_traits/is_trivially_move_assignable.h>
-#include <__type_traits/is_trivially_move_constructible.h>
+#include <__type_traits/is_trivially_assignable.h>
+#include <__type_traits/is_trivially_constructible.h>
 #include <__type_traits/is_trivially_relocatable.h>
 #include <__type_traits/is_unbounded_array.h>
 #include <__type_traits/negation.h>
diff --git a/libcxx/include/__memory/unique_ptr.h b/libcxx/include/__memory/unique_ptr.h
index a505dab8dd7491..46d9405e31596d 100644
--- a/libcxx/include/__memory/unique_ptr.h
+++ b/libcxx/include/__memory/unique_ptr.h
@@ -28,7 +28,6 @@
 #include <__type_traits/is_assignable.h>
 #include <__type_traits/is_constructible.h>
 #include <__type_traits/is_convertible.h>
-#include <__type_traits/is_default_constructible.h>
 #include <__type_traits/is_function.h>
 #include <__type_traits/is_pointer.h>
 #include <__type_traits/is_reference.h>
diff --git a/libcxx/include/__mutex/mutex.h b/libcxx/include/__mutex/mutex.h
index ddc85cf5a00d51..1ed01547126f48 100644
--- a/libcxx/include/__mutex/mutex.h
+++ b/libcxx/include/__mutex/mutex.h
@@ -11,7 +11,7 @@
 
 #include <__config>
 #include <__thread/support.h>
-#include <__type_traits/is_nothrow_default_constructible.h>
+#include <__type_traits/is_nothrow_constructible.h>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
 #  pragma GCC system_header
diff --git a/libcxx/include/__ranges/access.h b/libcxx/include/__ranges/access.h
index 263fdd637fd965..3db4f11b40f2d0 100644
--- a/libcxx/include/__ranges/access.h
+++ b/libcxx/include/__ranges/access.h
@@ -41,12 +41,11 @@ concept __can_borrow = is_lvalue_reference_v<_Tp> || enable_borrowed_range<remov
 namespace ranges {
 namespace __begin {
 template <class _Tp>
-concept __member_begin = __can_borrow<_Tp> && __workaround_52970<_Tp> && requires(_Tp&& __t) {
+concept __member_begin = __can_borrow<_Tp> && requires(_Tp&& __t) {
   { _LIBCPP_AUTO_CAST(__t.begin()) } -> input_or_output_iterator;
 };
 
-void begin(auto&)       = delete;
-void begin(const auto&) = delete;
+void begin() = delete;
 
 template <class _Tp>
 concept __unqualified_begin =
@@ -104,13 +103,12 @@ using iterator_t = decltype(ranges::begin(std::declval<_Tp&>()));
 namespace ranges {
 namespace __end {
 template <class _Tp>
-concept __member_end = __can_borrow<_Tp> && __workaround_52970<_Tp> && requires(_Tp&& __t) {
+concept __member_end = __can_borrow<_Tp> && requires(_Tp&& __t) {
   typename iterator_t<_Tp>;
   { _LIBCPP_AUTO_CAST(__t.end()) } -> sentinel_for<iterator_t<_Tp>>;
 };
 
-void end(auto&)       = delete;
-void end(const auto&) = delete;
+void end() = delete;
 
 template <class _Tp>
 concept __unqualified_end =
diff --git a/libcxx/include/__ranges/data.h b/libcxx/include/__ranges/data.h
index 18002bb52cc8ce..131f6cdad8f21c 100644
--- a/libcxx/include/__ranges/data.h
+++ b/libcxx/include/__ranges/data.h
@@ -40,7 +40,7 @@ template <class _Tp>
 concept __ptr_to_object = is_pointer_v<_Tp> && is_object_v<remove_pointer_t<_Tp>>;
 
 template <class _Tp>
-concept __member_data = __can_borrow<_Tp> && __workaround_52970<_Tp> && requires(_Tp&& __t) {
+concept __member_data = __can_borrow<_Tp> && requires(_Tp&& __t) {
   { _LIBCPP_AUTO_CAST(__t.data()) } -> __ptr_to_object;
 };
 
diff --git a/libcxx/include/__ranges/empty.h b/libcxx/include/__ranges/empty.h
index acd55dae224ce7..5c1004042aba51 100644
--- a/libcxx/include/__ranges/empty.h
+++ b/libcxx/include/__ranges/empty.h
@@ -29,7 +29,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 namespace ranges {
 namespace __empty {
 template <class _Tp>
-concept __member_empty = __workaround_52970<_Tp> && requires(_Tp&& __t) { bool(__t.empty()); };
+concept __member_empty = requires(_Tp&& __t) { bool(__t.empty()); };
 
 template <class _Tp>
 concept __can_invoke_size = !__member_empty<_Tp> && requires(_Tp&& __t) { ranges::size(__t); };
diff --git a/libcxx/include/__ranges/iota_view.h b/libcxx/include/__ranges/iota_view.h
index c8314dd848b447..9e6f724241ccf1 100644
--- a/libcxx/include/__ranges/iota_view.h
+++ b/libcxx/include/__ranges/iota_view.h
@@ -31,7 +31,7 @@
 #include <__ranges/movable_box.h>
 #include <__ranges/view_interface.h>
 #include <__type_traits/conditional.h>
-#include <__type_traits/is_nothrow_copy_constructible.h>
+#include <__type_traits/is_nothrow_constructible.h>
 #include <__type_traits/make_unsigned.h>
 #include <__type_traits/type_identity.h>
 #include <__utility/forward.h>
diff --git a/libcxx/include/__ranges/movable_box.h b/libcxx/include/__ranges/movable_box.h
index 9b38877494eaa4..5a456cc3a1b664 100644
--- a/libcxx/include/__ranges/movable_box.h
+++ b/libcxx/include/__ranges/movable_box.h
@@ -17,8 +17,6 @@
 #include <__memory/addressof.h>
 #include <__memory/construct_at.h>
 #include <__type_traits/is_nothrow_constructible.h>
-#include <__type_traits/is_nothrow_copy_constructible.h>
-#include <__type_traits/is_nothrow_default_constructible.h>
 #include <__utility/move.h>
 #include <optional>
 
diff --git a/libcxx/include/__ranges/rbegin.h b/libcxx/include/__ranges/rbegin.h
index 7111201ae7d6bb..12e739e1a2b852 100644
--- a/libcxx/include/__ranges/rbegin.h
+++ b/libcxx/include/__ranges/rbegin.h
@@ -36,12 +36,11 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 namespace ranges {
 namespace __rbegin {
 template <class _Tp>
-concept __member_rbegin = __can_borrow<_Tp> && __workaround_52970<_Tp> && requires(_Tp&& __t) {
+concept __member_rbegin = __can_borrow<_Tp> && requires(_Tp&& __t) {
   { _LIBCPP_AUTO_CAST(__t.rbegin()) } -> input_or_output_iterator;
 };
 
-void rbegin(auto&)       = delete;
-void rbegin(const auto&) = delete;
+void rbegin() = delete;
 
 template <class _Tp>
 concept __unqualified_rbegin =
diff --git a/libcxx/include/__ranges/rend.h b/libcxx/include/__ranges/rend.h
index 58d98aafd264b8..5edbc4e3160c5f 100644
--- a/libcxx/include/__ranges/rend.h
+++ b/libcxx/include/__ranges/rend.h
@@ -37,13 +37,12 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 namespace ranges {
 namespace __rend {
 template <class _Tp>
-concept __member_rend = __can_borrow<_Tp> && __workaround_52970<_Tp> && requires(_Tp&& __t) {
+concept __member_rend = __can_borrow<_Tp> && requires(_Tp&& __t) {
   ranges::rbegin(__t);
   { _LIBCPP_AUTO_CAST(__t.rend()) } -> sentinel_for<decltype(ranges::rbegin(__t))>;
 };
 
-void rend(auto&)       = delete;
-void rend(const auto&) = delete;
+void rend() = delete;
 
 template <class _Tp>
 concept __unqualified_rend =
diff --git a/libcxx/include/__ranges/size.h b/libcxx/include/__ranges/size.h
index 14e21aae6bf1d5..40b0c6b6aad7a3 100644
--- a/libcxx/include/__ranges/size.h
+++ b/libcxx/include/__ranges/size.h
@@ -41,14 +41,13 @@ inline constexpr bool disable_sized_range = false;
 
 namespace ranges {
 namespace __size {
-void size(auto&)       = delete;
-void size(const auto&) = delete;
+void size() = delete;
 
 template <class _Tp>
 concept __size_enabled = !disable_sized_range<remove_cvref_t<_Tp>>;
 
 template <class _Tp>
-concept __member_size = __size_enabled<_Tp> && __workaround_52970<_Tp> && requires(_Tp&& __t) {
+concept __member_size = __size_enabled<_Tp> && requires(_Tp&& __t) {
   { _LIBCPP_AUTO_CAST(__t.size()) } -> __integer_like;
 };
 
diff --git a/libcxx/include/__ranges/view_interface.h b/libcxx/include/__ranges/view_interface.h
index 84dd1c316de379..3bcfbaf3a2f9ed 100644
--- a/libcxx/include/__ranges/view_interface.h
+++ b/libcxx/include/__ranges/view_interface.h
@@ -21,6 +21,7 @@
 #include <__ranges/access.h>
 #include <__ranges/concepts.h>
 #include <__ranges/empty.h>
+#include <__ranges/size.h>
 #include <__type_traits/is_class.h>
 #include <__type_traits/make_unsigned.h>
 #include <__type_traits/remove_cv.h>
@@ -51,16 +52,24 @@ class view_interface {
 public:
   template <class _D2 = _Derived>
   [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr bool empty()
-    requires forward_range<_D2>
+    requires sized_range<_D2> || forward_range<_D2>
   {
-    return ranges::begin(__derived()) == ranges::end(__derived());
+    if constexpr (sized_range<_D2>) {
+      return ranges::size(__derived()) == 0;
+    } else {
+      return ranges::begin(__derived()) == ranges::end(__derived());
+    }
   }
 
   template <class _D2 = _Derived>
   [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr bool empty() const
-    requires forward_range<const _D2>
+    requires sized_range<const _D2> || forward_range<const _D2>
   {
-    return ranges::begin(__derived()) == ranges::end(__derived());
+    if constexpr (sized_range<const _D2>) {
+      return ranges::size(__derived()) == 0;
+    } else {
+      return ranges::begin(__derived()) == ranges::end(__derived());
+    }
   }
 
   template <class _D2 = _Derived>
diff --git a/libcxx/include/__ranges/zip_view.h b/libcxx/include/__ranges/zip_view.h
index 4898c0afc87a6e..ce00a4e53a489b 100644
--- a/libcxx/include/__ranges/zip_view.h
+++ b/libcxx/include/__ranges/zip_view.h
@@ -30,7 +30,7 @@
 #include <__ranges/enable_borrowed_range.h>
 #include <__ranges/size.h>
 #include <__ranges/view_interface.h>
-#include <__type_traits/is_nothrow_move_constructible.h>
+#include <__type_traits/is_nothrow_constructible.h>
 #include <__type_traits/make_unsigned.h>
 #include <__utility/declval.h>
 #include <__utility/forward.h>
diff --git a/libcxx/include/__split_buffer b/libcxx/include/__split_buffer
index aaf955685d2d39..c68349e0979c93 100644
--- a/libcxx/include/__split_buffer
+++ b/libcxx/include/__split_buffer
@@ -26,9 +26,8 @@
 #include <__type_traits/add_lvalue_reference.h>
 #include <__type_traits/enable_if.h>
 #include <__type_traits/integral_constant.h>
-#include <__type_traits/is_nothrow_default_constructible.h>
-#include <__type_traits/is_nothrow_move_assignable.h>
-#include <__type_traits/is_nothrow_move_constructible.h>
+#include <__type_traits/is_nothrow_assignable.h>
+#include <__type_traits/is_nothrow_constructible.h>
 #include <__type_traits/is_swappable.h>
 #include <__type_traits/is_trivially_destructible.h>
 #include <__type_traits/remove_reference.h>
diff --git a/libcxx/include/__tree b/libcxx/include/__tree
index 0d727ad5c9a72a..5a0d8f42a69ceb 100644
--- a/libcxx/include/__tree
+++ b/libcxx/include/__tree
@@ -26,11 +26,9 @@
 #include <__type_traits/can_extract_key.h>
 #include <__type_traits/conditional.h>
 #include <__type_traits/is_const.h>
-#include <__type_traits/is_copy_constructible.h>
-#include <__type_traits/is_nothrow_copy_constructible.h>
-#include <__type_traits/is_nothrow_default_constructible.h>
-#include <__type_traits/is_nothrow_move_assignable.h>
-#include <__type_traits/is_nothrow_move_constructible.h>
+#include <__type_traits/is_constructible.h>
+#include <__type_traits/is_nothrow_assignable.h>
+#include <__type_traits/is_nothrow_constructible.h>
 #include <__type_traits/is_pointer.h>
 #include <__type_traits/is_same.h>
 #include <__type_traits/is_swappable.h>
diff --git a/libcxx/include/__tuple/sfinae_helpers.h b/libcxx/include/__tuple/sfinae_helpers.h
index 90e9b1e5800478..35a57ff7765923 100644
--- a/libcxx/include/__tuple/sfinae_helpers.h
+++ b/libcxx/include/__tuple/sfinae_helpers.h
@@ -16,6 +16,7 @@
 #include <__tuple/tuple_like_ext.h>
 #include <__tuple/tuple_size.h>
 #include <__tuple/tuple_types.h>
+#include <__type_traits/conjunction.h>
 #include <__type_traits/enable_if.h>
 #include <__type_traits/integral_constant.h>
 #include <__type_traits/is_constructible.h>
@@ -32,12 +33,6 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 
 #ifndef _LIBCPP_CXX03_LANG
 
-template <bool... _Preds>
-struct __all_dummy;
-
-template <bool... _Pred>
-struct __all : _IsSame<__all_dummy<_Pred...>, __all_dummy<((void)_Pred, true)...>> {};
-
 struct __tuple_sfinae_base {
   template <template <class, class...> class _Trait, class... _LArgs, class... _RArgs>
   static auto __do_test(__tuple_types<_LArgs...>, __tuple_types<_RArgs...>)
diff --git a/libcxx/include/__tuple/tuple_element.h b/libcxx/include/__tuple/tuple_element.h
index 2b9ac6696ca412..55b3b47619f648 100644
--- a/libcxx/include/__tuple/tuple_element.h
+++ b/libcxx/include/__tuple/tuple_element.h
@@ -12,9 +12,6 @@
 #include <__config>
 #include <__tuple/tuple_indices.h>
 #include <__tuple/tuple_types.h>
-#include <__type_traits/add_const.h>
-#include <__type_traits/add_cv.h>
-#include <__type_traits/add_volatile.h>
 #include <cstddef>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
@@ -28,17 +25,17 @@ struct _LIBCPP_TEMPLATE_VIS tuple_element;
 
 template <size_t _Ip, class _Tp>
 struct _LIBCPP_TEMPLATE_VIS tuple_element<_Ip, const _Tp> {
-  typedef _LIBCPP_NODEBUG typename add_const<typename tuple_element<_Ip, _Tp>::type>::type type;
+  typedef _LIBCPP_NODEBUG const typename tuple_element<_Ip, _Tp>::type type;
 };
 
 template <size_t _Ip, class _Tp>
 struct _LIBCPP_TEMPLATE_VIS tuple_element<_Ip, volatile _Tp> {
-  typedef _LIBCPP_NODEBUG typename add_volatile<typename tuple_element<_Ip, _Tp>::type>::type type;
+  typedef _LIBCPP_NODEBUG volatile typename tuple_element<_Ip, _Tp>::type type;
 };
 
 template <size_t _Ip, class _Tp>
 struct _LIBCPP_TEMPLATE_VIS tuple_element<_Ip, const volatile _Tp> {
-  typedef _LIBCPP_NODEBUG typename add_cv<typename tuple_element<_Ip, _Tp>::type>::type type;
+  typedef _LIBCPP_NODEBUG const volatile typename tuple_element<_Ip, _Tp>::type type;
 };
 
 #ifndef _LIBCPP_CXX03_LANG
diff --git a/libcxx/include/__type_traits/conjunction.h b/libcxx/include/__type_traits/conjunction.h
index 4bfa5a27300d1e..c2995591bbc28f 100644
--- a/libcxx/include/__type_traits/conjunction.h
+++ b/libcxx/include/__type_traits/conjunction.h
@@ -13,6 +13,7 @@
 #include <__type_traits/conditional.h>
 #include <__type_traits/enable_if.h>
 #include <__type_traits/integral_constant.h>
+#include <__type_traits/is_same.h>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
 #  pragma GCC system_header
@@ -37,6 +38,12 @@ false_type __and_helper(...);
 template <class... _Pred>
 using _And _LIBCPP_NODEBUG = decltype(std::__and_helper<_Pred...>(0));
 
+template <bool... _Preds>
+struct __all_dummy;
+
+template <bool... _Pred>
+struct __all : _IsSame<__all_dummy<_Pred...>, __all_dummy<((void)_Pred, true)...> > {};
+
 #if _LIBCPP_STD_VER >= 17
 
 template <class...>
diff --git a/libcxx/include/__type_traits/copy_cv.h b/libcxx/include/__type_traits/copy_cv.h
index 3c4ee857f7bc7a..b1c057ff778b1b 100644
--- a/libcxx/include/__type_traits/copy_cv.h
+++ b/libcxx/include/__type_traits/copy_cv.h
@@ -10,9 +10,6 @@
 #define _LIBCPP___TYPE_TRAITS_COPY_CV_H
 
 #include <__config>
-#include <__type_traits/add_const.h>
-#include <__type_traits/add_cv.h>
-#include <__type_traits/add_volatile.h>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
 #  pragma GCC system_header
@@ -29,17 +26,17 @@ struct __copy_cv {
 
 template <class _From, class _To>
 struct __copy_cv<const _From, _To> {
-  using type = typename add_const<_To>::type;
+  using type = const _To;
 };
 
 template <class _From, class _To>
 struct __copy_cv<volatile _From, _To> {
-  using type = typename add_volatile<_To>::type;
+  using type = volatile _To;
 };
 
 template <class _From, class _To>
 struct __copy_cv<const volatile _From, _To> {
-  using type = typename add_cv<_To>::type;
+  using type = const volatile _To;
 };
 
 template <class _From, class _To>
diff --git a/libcxx/include/__type_traits/is_assignable.h b/libcxx/include/__type_traits/is_assignable.h
index 11134b1c1abaf9..cfb46997778782 100644
--- a/libcxx/include/__type_traits/is_assignable.h
+++ b/libcxx/include/__type_traits/is_assignable.h
@@ -10,6 +10,8 @@
 #define _LIBCPP___TYPE_TRAITS_IS_ASSIGNABLE_H
 
 #include <__config>
+#include <__type_traits/add_lvalue_reference.h>
+#include <__type_traits/add_rvalue_reference.h>
 #include <__type_traits/integral_constant.h>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
@@ -26,6 +28,25 @@ template <class _Tp, class _Arg>
 inline constexpr bool is_assignable_v = __is_assignable(_Tp, _Arg);
 #endif
 
+template <class _Tp>
+struct _LIBCPP_TEMPLATE_VIS is_copy_assignable
+    : public integral_constant<bool,
+                               __is_assignable(__add_lvalue_reference_t<_Tp>, __add_lvalue_reference_t<const _Tp>)> {};
+
+#if _LIBCPP_STD_VER >= 17
+template <class _Tp>
+inline constexpr bool is_copy_assignable_v = is_copy_assignable<_Tp>::value;
+#endif
+
+template <class _Tp>
+struct _LIBCPP_TEMPLATE_VIS is_move_assignable
+    : public integral_constant<bool, __is_assignable(__add_lvalue_reference_t<_Tp>, __add_rvalue_reference_t<_Tp>)> {};
+
+#if _LIBCPP_STD_VER >= 17
+template <class _Tp>
+inline constexpr bool is_move_assignable_v = is_move_assignable<_Tp>::value;
+#endif
+
 _LIBCPP_END_NAMESPACE_STD
 
 #endif // _LIBCPP___TYPE_TRAITS_IS_ASSIGNABLE_H
diff --git a/libcxx/include/__type_traits/is_constructible.h b/libcxx/include/__type_traits/is_constructible.h
index 4e62eb061fd593..567bd165c71520 100644
--- a/libcxx/include/__type_traits/is_constructible.h
+++ b/libcxx/include/__type_traits/is_constructible.h
@@ -10,6 +10,8 @@
 #define _LIBCPP___TYPE_IS_CONSTRUCTIBLE_H
 
 #include <__config>
+#include <__type_traits/add_lvalue_reference.h>
+#include <__type_traits/add_rvalue_reference.h>
 #include <__type_traits/integral_constant.h>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
@@ -26,6 +28,32 @@ template <class _Tp, class... _Args>
 inline constexpr bool is_constructible_v = __is_constructible(_Tp, _Args...);
 #endif
 
+template <class _Tp>
+struct _LIBCPP_TEMPLATE_VIS is_copy_constructible
+    : public integral_constant<bool, __is_constructible(_Tp, __add_lvalue_reference_t<const _Tp>)> {};
+
+#if _LIBCPP_STD_VER >= 17
+template <class _Tp>
+inline constexpr bool is_copy_constructible_v = is_copy_constructible<_Tp>::value;
+#endif
+
+template <class _Tp>
+struct _LIBCPP_TEMPLATE_VIS is_move_constructible
+    : public integral_constant<bool, __is_constructible(_Tp, __add_rvalue_reference_t<_Tp>)> {};
+
+#if _LIBCPP_STD_VER >= 17
+template <class _Tp>
+inline constexpr bool is_move_constructible_v = is_move_constructible<_Tp>::value;
+#endif
+
+template <class _Tp>
+struct _LIBCPP_TEMPLATE_VIS is_default_constructible : public integral_constant<bool, __is_constructible(_Tp)> {};
+
+#if _LIBCPP_STD_VER >= 17
+template <class _Tp>
+inline constexpr bool is_default_constructible_v = __is_constructible(_Tp);
+#endif
+
 _LIBCPP_END_NAMESPACE_STD
 
 #endif // _LIBCPP___TYPE_IS_CONSTRUCTIBLE_H
diff --git a/libcxx/include/__type_traits/is_copy_assignable.h b/libcxx/include/__type_traits/is_copy_assignable.h
deleted file mode 100644
index e607ace5404372..00000000000000
--- a/libcxx/include/__type_traits/is_copy_assignable.h
+++ /dev/null
@@ -1,36 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef _LIBCPP___TYPE_TRAITS_IS_COPY_ASSIGNABLE_H
-#define _LIBCPP___TYPE_TRAITS_IS_COPY_ASSIGNABLE_H
-
-#include <__config>
-#include <__type_traits/add_const.h>
-#include <__type_traits/add_lvalue_reference.h>
-#include <__type_traits/integral_constant.h>
-
-#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
-#  pragma GCC system_header
-#endif
-
-_LIBCPP_BEGIN_NAMESPACE_STD
-
-template <class _Tp>
-struct _LIBCPP_TEMPLATE_VIS is_copy_assignable
-    : public integral_constant<
-          bool,
-          __is_assignable(__add_lvalue_reference_t<_Tp>, __add_lvalue_reference_t<typename add_const<_Tp>::type>)> {};
-
-#if _LIBCPP_STD_VER >= 17
-template <class _Tp>
-inline constexpr bool is_copy_assignable_v = is_copy_assignable<_Tp>::value;
-#endif
-
-_LIBCPP_END_NAMESPACE_STD
-
-#endif // _LIBCPP___TYPE_TRAITS_IS_COPY_ASSIGNABLE_H
diff --git a/libcxx/include/__type_traits/is_copy_constructible.h b/libcxx/include/__type_traits/is_copy_constructible.h
deleted file mode 100644
index 402f2b89875bbe..00000000000000
--- a/libcxx/include/__type_traits/is_copy_constructible.h
+++ /dev/null
@@ -1,35 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef _LIBCPP___TYPE_TRAITS_IS_COPY_CONSTRUCTIBLE_H
-#define _LIBCPP___TYPE_TRAITS_IS_COPY_CONSTRUCTIBLE_H
-
-#include <__config>
-#include <__type_traits/add_const.h>
-#include <__type_traits/add_lvalue_reference.h>
-#include <__type_traits/integral_constant.h>
-
-#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
-#  pragma GCC system_header
-#endif
-
-_LIBCPP_BEGIN_NAMESPACE_STD
-
-template <class _Tp>
-struct _LIBCPP_TEMPLATE_VIS is_copy_constructible
-    : public integral_constant<bool, __is_constructible(_Tp, __add_lvalue_reference_t<typename add_const<_Tp>::type>)> {
-};
-
-#if _LIBCPP_STD_VER >= 17
-template <class _Tp>
-inline constexpr bool is_copy_constructible_v = is_copy_constructible<_Tp>::value;
-#endif
-
-_LIBCPP_END_NAMESPACE_STD
-
-#endif // _LIBCPP___TYPE_TRAITS_IS_COPY_CONSTRUCTIBLE_H
diff --git a/libcxx/include/__type_traits/is_default_constructible.h b/libcxx/include/__type_traits/is_default_constructible.h
deleted file mode 100644
index e73e9b98f5dca8..00000000000000
--- a/libcxx/include/__type_traits/is_default_constructible.h
+++ /dev/null
@@ -1,31 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef _LIBCPP___TYPE_TRAITS_IS_DEFAULT_CONSTRUCTIBLE_H
-#define _LIBCPP___TYPE_TRAITS_IS_DEFAULT_CONSTRUCTIBLE_H
-
-#include <__config>
-#include <__type_traits/integral_constant.h>
-
-#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
-#  pragma GCC system_header
-#endif
-
-_LIBCPP_BEGIN_NAMESPACE_STD
-
-template <class _Tp>
-struct _LIBCPP_TEMPLATE_VIS is_default_constructible : public integral_constant<bool, __is_constructible(_Tp)> {};
-
-#if _LIBCPP_STD_VER >= 17
-template <class _Tp>
-inline constexpr bool is_default_constructible_v = __is_constructible(_Tp);
-#endif
-
-_LIBCPP_END_NAMESPACE_STD
-
-#endif // _LIBCPP___TYPE_TRAITS_IS_DEFAULT_CONSTRUCTIBLE_H
diff --git a/libcxx/include/__type_traits/is_implicitly_default_constructible.h b/libcxx/include/__type_traits/is_implicitly_default_constructible.h
index 576166e52698a8..d5dadd7b870dd9 100644
--- a/libcxx/include/__type_traits/is_implicitly_default_constructible.h
+++ b/libcxx/include/__type_traits/is_implicitly_default_constructible.h
@@ -11,7 +11,7 @@
 
 #include <__config>
 #include <__type_traits/integral_constant.h>
-#include <__type_traits/is_default_constructible.h>
+#include <__type_traits/is_constructible.h>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
 #  pragma GCC system_header
diff --git a/libcxx/include/__type_traits/is_move_assignable.h b/libcxx/include/__type_traits/is_move_assignable.h
deleted file mode 100644
index 867bc00d824ae3..00000000000000
--- a/libcxx/include/__type_traits/is_move_assignable.h
+++ /dev/null
@@ -1,34 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef _LIBCPP___TYPE_TRAITS_IS_MOVE_ASSIGNABLE_H
-#define _LIBCPP___TYPE_TRAITS_IS_MOVE_ASSIGNABLE_H
-
-#include <__config>
-#include <__type_traits/add_lvalue_reference.h>
-#include <__type_traits/add_rvalue_reference.h>
-#include <__type_traits/integral_constant.h>
-
-#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
-#  pragma GCC system_header
-#endif
-
-_LIBCPP_BEGIN_NAMESPACE_STD
-
-template <class _Tp>
-struct _LIBCPP_TEMPLATE_VIS is_move_assignable
-    : public integral_constant<bool, __is_assignable(__add_lvalue_reference_t<_Tp>, __add_rvalue_reference_t<_Tp>)> {};
-
-#if _LIBCPP_STD_VER >= 17
-template <class _Tp>
-inline constexpr bool is_move_assignable_v = is_move_assignable<_Tp>::value;
-#endif
-
-_LIBCPP_END_NAMESPACE_STD
-
-#endif // _LIBCPP___TYPE_TRAITS_IS_MOVE_ASSIGNABLE_H
diff --git a/libcxx/include/__type_traits/is_move_constructible.h b/libcxx/include/__type_traits/is_move_constructible.h
deleted file mode 100644
index 40ec4a070816b5..00000000000000
--- a/libcxx/include/__type_traits/is_move_constructible.h
+++ /dev/null
@@ -1,33 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef _LIBCPP___TYPE_TRAITS_IS_MOVE_CONSTRUCTIBLE_H
-#define _LIBCPP___TYPE_TRAITS_IS_MOVE_CONSTRUCTIBLE_H
-
-#include <__config>
-#include <__type_traits/add_rvalue_reference.h>
-#include <__type_traits/integral_constant.h>
-
-#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
-#  pragma GCC system_header
-#endif
-
-_LIBCPP_BEGIN_NAMESPACE_STD
-
-template <class _Tp>
-struct _LIBCPP_TEMPLATE_VIS is_move_constructible
-    : public integral_constant<bool, __is_constructible(_Tp, __add_rvalue_reference_t<_Tp>)> {};
-
-#if _LIBCPP_STD_VER >= 17
-template <class _Tp>
-inline constexpr bool is_move_constructible_v = is_move_constructible<_Tp>::value;
-#endif
-
-_LIBCPP_END_NAMESPACE_STD
-
-#endif // _LIBCPP___TYPE_TRAITS_IS_MOVE_CONSTRUCTIBLE_H
diff --git a/libcxx/include/__type_traits/is_nothrow_assignable.h b/libcxx/include/__type_traits/is_nothrow_assignable.h
index cc574933138592..7e00c741f83e30 100644
--- a/libcxx/include/__type_traits/is_nothrow_assignable.h
+++ b/libcxx/include/__type_traits/is_nothrow_assignable.h
@@ -10,6 +10,8 @@
 #define _LIBCPP___TYPE_TRAITS_IS_NOTHROW_ASSIGNABLE_H
 
 #include <__config>
+#include <__type_traits/add_lvalue_reference.h>
+#include <__type_traits/add_rvalue_reference.h>
 #include <__type_traits/integral_constant.h>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
@@ -27,6 +29,28 @@ template <class _Tp, class _Arg>
 inline constexpr bool is_nothrow_assignable_v = __is_nothrow_assignable(_Tp, _Arg);
 #endif
 
+template <class _Tp>
+struct _LIBCPP_TEMPLATE_VIS is_nothrow_copy_assignable
+    : public integral_constant<
+          bool,
+          __is_nothrow_assignable(__add_lvalue_reference_t<_Tp>, __add_lvalue_reference_t<const _Tp>)> {};
+
+#if _LIBCPP_STD_VER >= 17
+template <class _Tp>
+inline constexpr bool is_nothrow_copy_assignable_v = is_nothrow_copy_assignable<_Tp>::value;
+#endif
+
+template <class _Tp>
+struct _LIBCPP_TEMPLATE_VIS is_nothrow_move_assignable
+    : public integral_constant<bool,
+                               __is_nothrow_assignable(__add_lvalue_reference_t<_Tp>, __add_rvalue_reference_t<_Tp>)> {
+};
+
+#if _LIBCPP_STD_VER >= 17
+template <class _Tp>
+inline constexpr bool is_nothrow_move_assignable_v = is_nothrow_move_assignable<_Tp>::value;
+#endif
+
 _LIBCPP_END_NAMESPACE_STD
 
 #endif // _LIBCPP___TYPE_TRAITS_IS_NOTHROW_ASSIGNABLE_H
diff --git a/libcxx/include/__type_traits/is_nothrow_constructible.h b/libcxx/include/__type_traits/is_nothrow_constructible.h
index f56816b943c4cb..2f7ed8487e76bb 100644
--- a/libcxx/include/__type_traits/is_nothrow_constructible.h
+++ b/libcxx/include/__type_traits/is_nothrow_constructible.h
@@ -10,6 +10,8 @@
 #define _LIBCPP___TYPE_TRAITS_IS_NOTHROW_CONSTRUCTIBLE_H
 
 #include <__config>
+#include <__type_traits/add_lvalue_reference.h>
+#include <__type_traits/add_rvalue_reference.h>
 #include <__type_traits/integral_constant.h>
 #include <__type_traits/is_constructible.h>
 #include <__type_traits/is_reference.h>
@@ -66,6 +68,55 @@ template <class _Tp, class... _Args>
 inline constexpr bool is_nothrow_constructible_v = is_nothrow_constructible<_Tp, _Args...>::value;
 #endif
 
+// TODO: remove this implementation once https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106611 is fixed
+#ifdef _LIBCPP_COMPILER_GCC
+
+template <class _Tp>
+struct _LIBCPP_TEMPLATE_VIS is_nothrow_copy_constructible
+    : public is_nothrow_constructible<_Tp, __add_lvalue_reference_t<const _Tp> > {};
+
+#else // _LIBCPP_COMPILER_GCC
+
+template <class _Tp>
+struct _LIBCPP_TEMPLATE_VIS is_nothrow_copy_constructible
+    : public integral_constant< bool, __is_nothrow_constructible(_Tp, __add_lvalue_reference_t<const _Tp>)> {};
+
+#endif // _LIBCPP_COMPILER_GCC
+
+#if _LIBCPP_STD_VER >= 17
+template <class _Tp>
+inline constexpr bool is_nothrow_copy_constructible_v = is_nothrow_copy_constructible<_Tp>::value;
+#endif
+
+// TODO: remove this implementation once https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106611 is fixed
+#ifndef _LIBCPP_COMPILER_GCC
+
+template <class _Tp>
+struct _LIBCPP_TEMPLATE_VIS is_nothrow_move_constructible
+    : public integral_constant<bool, __is_nothrow_constructible(_Tp, __add_rvalue_reference_t<_Tp>)> {};
+
+#else // _LIBCPP_COMPILER_GCC
+
+template <class _Tp>
+struct _LIBCPP_TEMPLATE_VIS is_nothrow_move_constructible
+    : public is_nothrow_constructible<_Tp, __add_rvalue_reference_t<_Tp> > {};
+
+#endif // _LIBCPP_COMPILER_GCC
+
+#if _LIBCPP_STD_VER >= 17
+template <class _Tp>
+inline constexpr bool is_nothrow_move_constructible_v = is_nothrow_move_constructible<_Tp>::value;
+#endif
+
+template <class _Tp>
+struct _LIBCPP_TEMPLATE_VIS is_nothrow_default_constructible
+    : public integral_constant<bool, __is_nothrow_constructible(_Tp)> {};
+
+#if _LIBCPP_STD_VER >= 17
+template <class _Tp>
+inline constexpr bool is_nothrow_default_constructible_v = __is_nothrow_constructible(_Tp);
+#endif
+
 _LIBCPP_END_NAMESPACE_STD
 
 #endif // _LIBCPP___TYPE_TRAITS_IS_NOTHROW_CONSTRUCTIBLE_H
diff --git a/libcxx/include/__type_traits/is_nothrow_copy_assignable.h b/libcxx/include/__type_traits/is_nothrow_copy_assignable.h
deleted file mode 100644
index a97e962b308c8b..00000000000000
--- a/libcxx/include/__type_traits/is_nothrow_copy_assignable.h
+++ /dev/null
@@ -1,36 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef _LIBCPP___TYPE_TRAITS_IS_NOTHROW_COPY_ASSIGNABLE_H
-#define _LIBCPP___TYPE_TRAITS_IS_NOTHROW_COPY_ASSIGNABLE_H
-
-#include <__config>
-#include <__type_traits/add_const.h>
-#include <__type_traits/add_lvalue_reference.h>
-#include <__type_traits/integral_constant.h>
-
-#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
-#  pragma GCC system_header
-#endif
-
-_LIBCPP_BEGIN_NAMESPACE_STD
-
-template <class _Tp>
-struct _LIBCPP_TEMPLATE_VIS is_nothrow_copy_assignable
-    : public integral_constant<bool,
-                               __is_nothrow_assignable(__add_lvalue_reference_t<_Tp>,
-                                                       __add_lvalue_reference_t<typename add_const<_Tp>::type>)> {};
-
-#if _LIBCPP_STD_VER >= 17
-template <class _Tp>
-inline constexpr bool is_nothrow_copy_assignable_v = is_nothrow_copy_assignable<_Tp>::value;
-#endif
-
-_LIBCPP_END_NAMESPACE_STD
-
-#endif // _LIBCPP___TYPE_TRAITS_IS_NOTHROW_COPY_ASSIGNABLE_H
diff --git a/libcxx/include/__type_traits/is_nothrow_copy_constructible.h b/libcxx/include/__type_traits/is_nothrow_copy_constructible.h
deleted file mode 100644
index dd7f1d5ef6276d..00000000000000
--- a/libcxx/include/__type_traits/is_nothrow_copy_constructible.h
+++ /dev/null
@@ -1,48 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef _LIBCPP___TYPE_TRAITS_IS_NOTHROW_COPY_CONSTRUCTIBLE_H
-#define _LIBCPP___TYPE_TRAITS_IS_NOTHROW_COPY_CONSTRUCTIBLE_H
-
-#include <__config>
-#include <__type_traits/add_const.h>
-#include <__type_traits/add_lvalue_reference.h>
-#include <__type_traits/integral_constant.h>
-#include <__type_traits/is_nothrow_constructible.h>
-
-#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
-#  pragma GCC system_header
-#endif
-
-_LIBCPP_BEGIN_NAMESPACE_STD
-
-// TODO: remove this implementation once https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106611 is fixed
-#ifdef _LIBCPP_COMPILER_GCC
-
-template <class _Tp>
-struct _LIBCPP_TEMPLATE_VIS is_nothrow_copy_constructible
-    : public is_nothrow_constructible<_Tp, __add_lvalue_reference_t<typename add_const<_Tp>::type> > {};
-
-#else // _LIBCPP_COMPILER_GCC
-
-template <class _Tp>
-struct _LIBCPP_TEMPLATE_VIS is_nothrow_copy_constructible
-    : public integral_constant<
-          bool,
-          __is_nothrow_constructible(_Tp, typename add_lvalue_reference<typename add_const<_Tp>::type>::type)> {};
-
-#endif // _LIBCPP_COMPILER_GCC
-
-#if _LIBCPP_STD_VER >= 17
-template <class _Tp>
-inline constexpr bool is_nothrow_copy_constructible_v = is_nothrow_copy_constructible<_Tp>::value;
-#endif
-
-_LIBCPP_END_NAMESPACE_STD
-
-#endif // _LIBCPP___TYPE_TRAITS_IS_NOTHROW_COPY_CONSTRUCTIBLE_H
diff --git a/libcxx/include/__type_traits/is_nothrow_default_constructible.h b/libcxx/include/__type_traits/is_nothrow_default_constructible.h
deleted file mode 100644
index 58f31f21b0267f..00000000000000
--- a/libcxx/include/__type_traits/is_nothrow_default_constructible.h
+++ /dev/null
@@ -1,32 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef _LIBCPP___TYPE_TRAITS_IS_NOTHROW_DEFAULT_CONSTRUCTIBLE_H
-#define _LIBCPP___TYPE_TRAITS_IS_NOTHROW_DEFAULT_CONSTRUCTIBLE_H
-
-#include <__config>
-#include <__type_traits/integral_constant.h>
-
-#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
-#  pragma GCC system_header
-#endif
-
-_LIBCPP_BEGIN_NAMESPACE_STD
-
-template <class _Tp>
-struct _LIBCPP_TEMPLATE_VIS is_nothrow_default_constructible
-    : public integral_constant<bool, __is_nothrow_constructible(_Tp)> {};
-
-#if _LIBCPP_STD_VER >= 17
-template <class _Tp>
-inline constexpr bool is_nothrow_default_constructible_v = __is_nothrow_constructible(_Tp);
-#endif
-
-_LIBCPP_END_NAMESPACE_STD
-
-#endif // _LIBCPP___TYPE_TRAITS_IS_NOTHROW_DEFAULT_CONSTRUCTIBLE_H
diff --git a/libcxx/include/__type_traits/is_nothrow_move_assignable.h b/libcxx/include/__type_traits/is_nothrow_move_assignable.h
deleted file mode 100644
index aa87e369cd5d01..00000000000000
--- a/libcxx/include/__type_traits/is_nothrow_move_assignable.h
+++ /dev/null
@@ -1,36 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef _LIBCPP___TYPE_TRAITS_IS_NOTHROW_MOVE_ASSIGNABLE_H
-#define _LIBCPP___TYPE_TRAITS_IS_NOTHROW_MOVE_ASSIGNABLE_H
-
-#include <__config>
-#include <__type_traits/add_lvalue_reference.h>
-#include <__type_traits/add_rvalue_reference.h>
-#include <__type_traits/integral_constant.h>
-
-#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
-#  pragma GCC system_header
-#endif
-
-_LIBCPP_BEGIN_NAMESPACE_STD
-
-template <class _Tp>
-struct _LIBCPP_TEMPLATE_VIS is_nothrow_move_assignable
-    : public integral_constant<bool,
-                               __is_nothrow_assignable(__add_lvalue_reference_t<_Tp>, __add_rvalue_reference_t<_Tp>)> {
-};
-
-#if _LIBCPP_STD_VER >= 17
-template <class _Tp>
-inline constexpr bool is_nothrow_move_assignable_v = is_nothrow_move_assignable<_Tp>::value;
-#endif
-
-_LIBCPP_END_NAMESPACE_STD
-
-#endif // _LIBCPP___TYPE_TRAITS_IS_NOTHROW_MOVE_ASSIGNABLE_H
diff --git a/libcxx/include/__type_traits/is_nothrow_move_constructible.h b/libcxx/include/__type_traits/is_nothrow_move_constructible.h
deleted file mode 100644
index dab5a019b33bf0..00000000000000
--- a/libcxx/include/__type_traits/is_nothrow_move_constructible.h
+++ /dev/null
@@ -1,45 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef _LIBCPP___TYPE_TRAITS_IS_NOTHROW_MOVE_CONSTRUCTIBLE_H
-#define _LIBCPP___TYPE_TRAITS_IS_NOTHROW_MOVE_CONSTRUCTIBLE_H
-
-#include <__config>
-#include <__type_traits/add_rvalue_reference.h>
-#include <__type_traits/integral_constant.h>
-#include <__type_traits/is_nothrow_constructible.h>
-
-#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
-#  pragma GCC system_header
-#endif
-
-_LIBCPP_BEGIN_NAMESPACE_STD
-
-// TODO: remove this implementation once https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106611 is fixed
-#ifndef _LIBCPP_COMPILER_GCC
-
-template <class _Tp>
-struct _LIBCPP_TEMPLATE_VIS is_nothrow_move_constructible
-    : public integral_constant<bool, __is_nothrow_constructible(_Tp, __add_rvalue_reference_t<_Tp>)> {};
-
-#else // _LIBCPP_COMPILER_GCC
-
-template <class _Tp>
-struct _LIBCPP_TEMPLATE_VIS is_nothrow_move_constructible
-    : public is_nothrow_constructible<_Tp, __add_rvalue_reference_t<_Tp> > {};
-
-#endif // _LIBCPP_COMPILER_GCC
-
-#if _LIBCPP_STD_VER >= 17
-template <class _Tp>
-inline constexpr bool is_nothrow_move_constructible_v = is_nothrow_move_constructible<_Tp>::value;
-#endif
-
-_LIBCPP_END_NAMESPACE_STD
-
-#endif // _LIBCPP___TYPE_TRAITS_IS_NOTHROW_MOVE_CONSTRUCTIBLE_H
diff --git a/libcxx/include/__type_traits/is_scoped_enum.h b/libcxx/include/__type_traits/is_scoped_enum.h
index 43b3a6b66b1f5d..1db88e13356e83 100644
--- a/libcxx/include/__type_traits/is_scoped_enum.h
+++ b/libcxx/include/__type_traits/is_scoped_enum.h
@@ -22,6 +22,18 @@
 _LIBCPP_BEGIN_NAMESPACE_STD
 
 #if _LIBCPP_STD_VER >= 23
+
+// TODO: GCC and Clang both have this builtin. Remove the false case once we've updated to GCC 14.
+#  if __has_builtin(__is_scoped_enum)
+
+template <class _Tp>
+struct _LIBCPP_TEMPLATE_VIS is_scoped_enum : bool_constant<__is_scoped_enum(_Tp)> {};
+
+template <class _Tp>
+inline constexpr bool is_scoped_enum_v = __is_scoped_enum(_Tp);
+
+#  else
+
 template <class _Tp, bool = is_enum_v<_Tp> >
 struct __is_scoped_enum_helper : false_type {};
 
@@ -33,7 +45,10 @@ struct _LIBCPP_TEMPLATE_VIS is_scoped_enum : public __is_scoped_enum_helper<_Tp>
 
 template <class _Tp>
 inline constexpr bool is_scoped_enum_v = is_scoped_enum<_Tp>::value;
-#endif
+
+#  endif // __has_builtin(__is_scoped_enum)
+
+#endif // _LIBCPP_STD_VER >= 23
 
 _LIBCPP_END_NAMESPACE_STD
 
diff --git a/libcxx/include/__type_traits/is_swappable.h b/libcxx/include/__type_traits/is_swappable.h
index 06e092692df3d9..06b59e5c25d045 100644
--- a/libcxx/include/__type_traits/is_swappable.h
+++ b/libcxx/include/__type_traits/is_swappable.h
@@ -13,10 +13,10 @@
 #include <__type_traits/add_lvalue_reference.h>
 #include <__type_traits/conditional.h>
 #include <__type_traits/enable_if.h>
-#include <__type_traits/is_move_assignable.h>
-#include <__type_traits/is_move_constructible.h>
-#include <__type_traits/is_nothrow_move_assignable.h>
-#include <__type_traits/is_nothrow_move_constructible.h>
+#include <__type_traits/is_assignable.h>
+#include <__type_traits/is_constructible.h>
+#include <__type_traits/is_nothrow_assignable.h>
+#include <__type_traits/is_nothrow_constructible.h>
 #include <__type_traits/is_referenceable.h>
 #include <__type_traits/is_same.h>
 #include <__type_traits/is_void.h>
diff --git a/libcxx/include/__type_traits/is_trivially_assignable.h b/libcxx/include/__type_traits/is_trivially_assignable.h
index 19169d13d6d237..201333b0fa0b33 100644
--- a/libcxx/include/__type_traits/is_trivially_assignable.h
+++ b/libcxx/include/__type_traits/is_trivially_assignable.h
@@ -10,6 +10,9 @@
 #define _LIBCPP___TYPE_TRAITS_IS_TRIVIALLY_ASSIGNABLE_H
 
 #include <__config>
+#include <__type_traits/add_const.h>
+#include <__type_traits/add_lvalue_reference.h>
+#include <__type_traits/add_rvalue_reference.h>
 #include <__type_traits/integral_constant.h>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
@@ -26,6 +29,28 @@ template <class _Tp, class _Arg>
 inline constexpr bool is_trivially_assignable_v = __is_trivially_assignable(_Tp, _Arg);
 #endif
 
+template <class _Tp>
+struct _LIBCPP_TEMPLATE_VIS is_trivially_copy_assignable
+    : public integral_constant<
+          bool,
+          __is_trivially_assignable(__add_lvalue_reference_t<_Tp>, __add_lvalue_reference_t<const _Tp>)> {};
+
+#if _LIBCPP_STD_VER >= 17
+template <class _Tp>
+inline constexpr bool is_trivially_copy_assignable_v = is_trivially_copy_assignable<_Tp>::value;
+#endif
+
+template <class _Tp>
+struct _LIBCPP_TEMPLATE_VIS is_trivially_move_assignable
+    : public integral_constant<
+          bool,
+          __is_trivially_assignable(__add_lvalue_reference_t<_Tp>, __add_rvalue_reference_t<_Tp>)> {};
+
+#if _LIBCPP_STD_VER >= 17
+template <class _Tp>
+inline constexpr bool is_trivially_move_assignable_v = is_trivially_move_assignable<_Tp>::value;
+#endif
+
 _LIBCPP_END_NAMESPACE_STD
 
 #endif // _LIBCPP___TYPE_TRAITS_IS_TRIVIALLY_ASSIGNABLE_H
diff --git a/libcxx/include/__type_traits/is_trivially_constructible.h b/libcxx/include/__type_traits/is_trivially_constructible.h
index 4faaf9323cd5bb..3a77e9fe164da1 100644
--- a/libcxx/include/__type_traits/is_trivially_constructible.h
+++ b/libcxx/include/__type_traits/is_trivially_constructible.h
@@ -10,6 +10,8 @@
 #define _LIBCPP___TYPE_TRAITS_IS_TRIVIALLY_CONSTRUCTIBLE_H
 
 #include <__config>
+#include <__type_traits/add_lvalue_reference.h>
+#include <__type_traits/add_rvalue_reference.h>
 #include <__type_traits/integral_constant.h>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
@@ -27,6 +29,33 @@ template <class _Tp, class... _Args>
 inline constexpr bool is_trivially_constructible_v = __is_trivially_constructible(_Tp, _Args...);
 #endif
 
+template <class _Tp>
+struct _LIBCPP_TEMPLATE_VIS is_trivially_copy_constructible
+    : public integral_constant<bool, __is_trivially_constructible(_Tp, __add_lvalue_reference_t<const _Tp>)> {};
+
+#if _LIBCPP_STD_VER >= 17
+template <class _Tp>
+inline constexpr bool is_trivially_copy_constructible_v = is_trivially_copy_constructible<_Tp>::value;
+#endif
+
+template <class _Tp>
+struct _LIBCPP_TEMPLATE_VIS is_trivially_move_constructible
+    : public integral_constant<bool, __is_trivially_constructible(_Tp, __add_rvalue_reference_t<_Tp>)> {};
+
+#if _LIBCPP_STD_VER >= 17
+template <class _Tp>
+inline constexpr bool is_trivially_move_constructible_v = is_trivially_move_constructible<_Tp>::value;
+#endif
+
+template <class _Tp>
+struct _LIBCPP_TEMPLATE_VIS is_trivially_default_constructible
+    : public integral_constant<bool, __is_trivially_constructible(_Tp)> {};
+
+#if _LIBCPP_STD_VER >= 17
+template <class _Tp>
+inline constexpr bool is_trivially_default_constructible_v = __is_trivially_constructible(_Tp);
+#endif
+
 _LIBCPP_END_NAMESPACE_STD
 
 #endif // _LIBCPP___TYPE_TRAITS_IS_TRIVIALLY_CONSTRUCTIBLE_H
diff --git a/libcxx/include/__type_traits/is_trivially_copy_assignable.h b/libcxx/include/__type_traits/is_trivially_copy_assignable.h
deleted file mode 100644
index b6b3c119746f5b..00000000000000
--- a/libcxx/include/__type_traits/is_trivially_copy_assignable.h
+++ /dev/null
@@ -1,36 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef _LIBCPP___TYPE_TRAITS_IS_TRIVIALLY_COPY_ASSIGNABLE_H
-#define _LIBCPP___TYPE_TRAITS_IS_TRIVIALLY_COPY_ASSIGNABLE_H
-
-#include <__config>
-#include <__type_traits/add_const.h>
-#include <__type_traits/add_lvalue_reference.h>
-#include <__type_traits/integral_constant.h>
-
-#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
-#  pragma GCC system_header
-#endif
-
-_LIBCPP_BEGIN_NAMESPACE_STD
-
-template <class _Tp>
-struct _LIBCPP_TEMPLATE_VIS is_trivially_copy_assignable
-    : public integral_constant<bool,
-                               __is_trivially_assignable(__add_lvalue_reference_t<_Tp>,
-                                                         __add_lvalue_reference_t<typename add_const<_Tp>::type>)> {};
-
-#if _LIBCPP_STD_VER >= 17
-template <class _Tp>
-inline constexpr bool is_trivially_copy_assignable_v = is_trivially_copy_assignable<_Tp>::value;
-#endif
-
-_LIBCPP_END_NAMESPACE_STD
-
-#endif // _LIBCPP___TYPE_TRAITS_IS_TRIVIALLY_COPY_ASSIGNABLE_H
diff --git a/libcxx/include/__type_traits/is_trivially_copy_constructible.h b/libcxx/include/__type_traits/is_trivially_copy_constructible.h
deleted file mode 100644
index 8e71fd1fbf8b3e..00000000000000
--- a/libcxx/include/__type_traits/is_trivially_copy_constructible.h
+++ /dev/null
@@ -1,33 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef _LIBCPP___TYPE_TRAITS_IS_TRIVIALLY_COPY_CONSTRUCTIBLE_H
-#define _LIBCPP___TYPE_TRAITS_IS_TRIVIALLY_COPY_CONSTRUCTIBLE_H
-
-#include <__config>
-#include <__type_traits/add_lvalue_reference.h>
-#include <__type_traits/integral_constant.h>
-
-#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
-#  pragma GCC system_header
-#endif
-
-_LIBCPP_BEGIN_NAMESPACE_STD
-
-template <class _Tp>
-struct _LIBCPP_TEMPLATE_VIS is_trivially_copy_constructible
-    : public integral_constant<bool, __is_trivially_constructible(_Tp, __add_lvalue_reference_t<const _Tp>)> {};
-
-#if _LIBCPP_STD_VER >= 17
-template <class _Tp>
-inline constexpr bool is_trivially_copy_constructible_v = is_trivially_copy_constructible<_Tp>::value;
-#endif
-
-_LIBCPP_END_NAMESPACE_STD
-
-#endif // _LIBCPP___TYPE_TRAITS_IS_TRIVIALLY_COPY_CONSTRUCTIBLE_H
diff --git a/libcxx/include/__type_traits/is_trivially_default_constructible.h b/libcxx/include/__type_traits/is_trivially_default_constructible.h
deleted file mode 100644
index c3b6152a9fcad8..00000000000000
--- a/libcxx/include/__type_traits/is_trivially_default_constructible.h
+++ /dev/null
@@ -1,32 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef _LIBCPP___TYPE_TRAITS_IS_TRIVIALLY_DEFAULT_CONSTRUCTIBLE_H
-#define _LIBCPP___TYPE_TRAITS_IS_TRIVIALLY_DEFAULT_CONSTRUCTIBLE_H
-
-#include <__config>
-#include <__type_traits/integral_constant.h>
-
-#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
-#  pragma GCC system_header
-#endif
-
-_LIBCPP_BEGIN_NAMESPACE_STD
-
-template <class _Tp>
-struct _LIBCPP_TEMPLATE_VIS is_trivially_default_constructible
-    : public integral_constant<bool, __is_trivially_constructible(_Tp)> {};
-
-#if _LIBCPP_STD_VER >= 17
-template <class _Tp>
-inline constexpr bool is_trivially_default_constructible_v = __is_trivially_constructible(_Tp);
-#endif
-
-_LIBCPP_END_NAMESPACE_STD
-
-#endif // _LIBCPP___TYPE_TRAITS_IS_TRIVIALLY_DEFAULT_CONSTRUCTIBLE_H
diff --git a/libcxx/include/__type_traits/is_trivially_move_assignable.h b/libcxx/include/__type_traits/is_trivially_move_assignable.h
deleted file mode 100644
index daf890b26c7432..00000000000000
--- a/libcxx/include/__type_traits/is_trivially_move_assignable.h
+++ /dev/null
@@ -1,36 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef _LIBCPP___TYPE_TRAITS_IS_TRIVIALLY_MOVE_ASSIGNABLE_H
-#define _LIBCPP___TYPE_TRAITS_IS_TRIVIALLY_MOVE_ASSIGNABLE_H
-
-#include <__config>
-#include <__type_traits/add_lvalue_reference.h>
-#include <__type_traits/add_rvalue_reference.h>
-#include <__type_traits/integral_constant.h>
-
-#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
-#  pragma GCC system_header
-#endif
-
-_LIBCPP_BEGIN_NAMESPACE_STD
-
-template <class _Tp>
-struct _LIBCPP_TEMPLATE_VIS is_trivially_move_assignable
-    : public integral_constant<
-          bool,
-          __is_trivially_assignable(__add_lvalue_reference_t<_Tp>, __add_rvalue_reference_t<_Tp>)> {};
-
-#if _LIBCPP_STD_VER >= 17
-template <class _Tp>
-inline constexpr bool is_trivially_move_assignable_v = is_trivially_move_assignable<_Tp>::value;
-#endif
-
-_LIBCPP_END_NAMESPACE_STD
-
-#endif // _LIBCPP___TYPE_TRAITS_IS_TRIVIALLY_MOVE_ASSIGNABLE_H
diff --git a/libcxx/include/__type_traits/is_trivially_move_constructible.h b/libcxx/include/__type_traits/is_trivially_move_constructible.h
deleted file mode 100644
index 71e6f898fb3f38..00000000000000
--- a/libcxx/include/__type_traits/is_trivially_move_constructible.h
+++ /dev/null
@@ -1,33 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef _LIBCPP___TYPE_TRAITS_IS_TRIVIALLY_MOVE_CONSTRUCTIBLE_H
-#define _LIBCPP___TYPE_TRAITS_IS_TRIVIALLY_MOVE_CONSTRUCTIBLE_H
-
-#include <__config>
-#include <__type_traits/add_rvalue_reference.h>
-#include <__type_traits/integral_constant.h>
-
-#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
-#  pragma GCC system_header
-#endif
-
-_LIBCPP_BEGIN_NAMESPACE_STD
-
-template <class _Tp>
-struct _LIBCPP_TEMPLATE_VIS is_trivially_move_constructible
-    : public integral_constant<bool, __is_trivially_constructible(_Tp, __add_rvalue_reference_t<_Tp>)> {};
-
-#if _LIBCPP_STD_VER >= 17
-template <class _Tp>
-inline constexpr bool is_trivially_move_constructible_v = is_trivially_move_constructible<_Tp>::value;
-#endif
-
-_LIBCPP_END_NAMESPACE_STD
-
-#endif // _LIBCPP___TYPE_TRAITS_IS_TRIVIALLY_MOVE_CONSTRUCTIBLE_H
diff --git a/libcxx/include/__type_traits/noexcept_move_assign_container.h b/libcxx/include/__type_traits/noexcept_move_assign_container.h
index 1d6c07e8cf96d3..baaf36d9980e94 100644
--- a/libcxx/include/__type_traits/noexcept_move_assign_container.h
+++ b/libcxx/include/__type_traits/noexcept_move_assign_container.h
@@ -12,7 +12,7 @@
 #include <__config>
 #include <__memory/allocator_traits.h>
 #include <__type_traits/integral_constant.h>
-#include <__type_traits/is_nothrow_move_assignable.h>
+#include <__type_traits/is_nothrow_assignable.h>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
 #  pragma GCC system_header
diff --git a/libcxx/include/__type_traits/remove_reference.h b/libcxx/include/__type_traits/remove_reference.h
index fd66417bd84fed..ba67891758adce 100644
--- a/libcxx/include/__type_traits/remove_reference.h
+++ b/libcxx/include/__type_traits/remove_reference.h
@@ -10,7 +10,6 @@
 #define _LIBCPP___TYPE_TRAITS_REMOVE_REFERENCE_H
 
 #include <__config>
-#include <cstddef>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
 #  pragma GCC system_header
@@ -26,15 +25,16 @@ struct remove_reference {
 
 template <class _Tp>
 using __libcpp_remove_reference_t = __remove_reference_t(_Tp);
-#else
-// clang-format off
-template <class _Tp> struct _LIBCPP_TEMPLATE_VIS remove_reference        {typedef _LIBCPP_NODEBUG _Tp type;};
-template <class _Tp> struct _LIBCPP_TEMPLATE_VIS remove_reference<_Tp&>  {typedef _LIBCPP_NODEBUG _Tp type;};
-template <class _Tp> struct _LIBCPP_TEMPLATE_VIS remove_reference<_Tp&&> {typedef _LIBCPP_NODEBUG _Tp type;};
-// clang-format on
+#elif __has_builtin(__remove_reference)
+template <class _Tp>
+struct remove_reference {
+  using type _LIBCPP_NODEBUG = __remove_reference(_Tp);
+};
 
 template <class _Tp>
 using __libcpp_remove_reference_t = typename remove_reference<_Tp>::type;
+#else
+#  error "remove_reference not implemented!"
 #endif // __has_builtin(__remove_reference_t)
 
 #if _LIBCPP_STD_VER >= 14
diff --git a/libcxx/include/__utility/exception_guard.h b/libcxx/include/__utility/exception_guard.h
index 8d90dfd5f1907a..93290be40e7148 100644
--- a/libcxx/include/__utility/exception_guard.h
+++ b/libcxx/include/__utility/exception_guard.h
@@ -11,7 +11,7 @@
 
 #include <__assert>
 #include <__config>
-#include <__type_traits/is_nothrow_move_constructible.h>
+#include <__type_traits/is_nothrow_constructible.h>
 #include <__utility/exchange.h>
 #include <__utility/move.h>
 
diff --git a/libcxx/include/__utility/exchange.h b/libcxx/include/__utility/exchange.h
index 72312c06b5886d..957e9d0acaa65e 100644
--- a/libcxx/include/__utility/exchange.h
+++ b/libcxx/include/__utility/exchange.h
@@ -11,7 +11,7 @@
 
 #include <__config>
 #include <__type_traits/is_nothrow_assignable.h>
-#include <__type_traits/is_nothrow_move_constructible.h>
+#include <__type_traits/is_nothrow_constructible.h>
 #include <__utility/forward.h>
 #include <__utility/move.h>
 
diff --git a/libcxx/include/__utility/move.h b/libcxx/include/__utility/move.h
index 86d7fc88473bd2..626535c8b1ee39 100644
--- a/libcxx/include/__utility/move.h
+++ b/libcxx/include/__utility/move.h
@@ -12,8 +12,8 @@
 
 #include <__config>
 #include <__type_traits/conditional.h>
-#include <__type_traits/is_copy_constructible.h>
-#include <__type_traits/is_nothrow_move_constructible.h>
+#include <__type_traits/is_constructible.h>
+#include <__type_traits/is_nothrow_constructible.h>
 #include <__type_traits/remove_reference.h>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
diff --git a/libcxx/include/__utility/pair.h b/libcxx/include/__utility/pair.h
index b488a9829c3849..0056806877e13c 100644
--- a/libcxx/include/__utility/pair.h
+++ b/libcxx/include/__utility/pair.h
@@ -30,16 +30,9 @@
 #include <__type_traits/is_assignable.h>
 #include <__type_traits/is_constructible.h>
 #include <__type_traits/is_convertible.h>
-#include <__type_traits/is_copy_assignable.h>
-#include <__type_traits/is_default_constructible.h>
 #include <__type_traits/is_implicitly_default_constructible.h>
-#include <__type_traits/is_move_assignable.h>
 #include <__type_traits/is_nothrow_assignable.h>
 #include <__type_traits/is_nothrow_constructible.h>
-#include <__type_traits/is_nothrow_copy_assignable.h>
-#include <__type_traits/is_nothrow_copy_constructible.h>
-#include <__type_traits/is_nothrow_default_constructible.h>
-#include <__type_traits/is_nothrow_move_assignable.h>
 #include <__type_traits/is_same.h>
 #include <__type_traits/is_swappable.h>
 #include <__type_traits/nat.h>
diff --git a/libcxx/include/__utility/small_buffer.h b/libcxx/include/__utility/small_buffer.h
index f3415a5e50f999..9e13797573d2d7 100644
--- a/libcxx/include/__utility/small_buffer.h
+++ b/libcxx/include/__utility/small_buffer.h
@@ -12,8 +12,8 @@
 #include <__config>
 #include <__memory/construct_at.h>
 #include <__type_traits/decay.h>
+#include <__type_traits/is_trivially_constructible.h>
 #include <__type_traits/is_trivially_destructible.h>
-#include <__type_traits/is_trivially_move_constructible.h>
 #include <__utility/exception_guard.h>
 #include <__utility/forward.h>
 #include <cstddef>
diff --git a/libcxx/include/__utility/swap.h b/libcxx/include/__utility/swap.h
index ca8280c729df1d..d678d2cba7dc9f 100644
--- a/libcxx/include/__utility/swap.h
+++ b/libcxx/include/__utility/swap.h
@@ -10,10 +10,10 @@
 #define _LIBCPP___UTILITY_SWAP_H
 
 #include <__config>
-#include <__type_traits/is_move_assignable.h>
-#include <__type_traits/is_move_constructible.h>
-#include <__type_traits/is_nothrow_move_assignable.h>
-#include <__type_traits/is_nothrow_move_constructible.h>
+#include <__type_traits/is_assignable.h>
+#include <__type_traits/is_constructible.h>
+#include <__type_traits/is_nothrow_assignable.h>
+#include <__type_traits/is_nothrow_constructible.h>
 #include <__type_traits/is_swappable.h>
 #include <__utility/declval.h>
 #include <__utility/move.h>
diff --git a/libcxx/include/any b/libcxx/include/any
index ce54803cd91b5b..a6212fedfa2cd0 100644
--- a/libcxx/include/any
+++ b/libcxx/include/any
@@ -92,9 +92,8 @@ namespace std {
 #include <__type_traits/conditional.h>
 #include <__type_traits/decay.h>
 #include <__type_traits/is_constructible.h>
-#include <__type_traits/is_copy_constructible.h>
 #include <__type_traits/is_function.h>
-#include <__type_traits/is_nothrow_move_constructible.h>
+#include <__type_traits/is_nothrow_constructible.h>
 #include <__type_traits/is_reference.h>
 #include <__type_traits/is_same.h>
 #include <__type_traits/is_void.h>
diff --git a/libcxx/include/array b/libcxx/include/array
index 7fa5dc1479349e..977f913c3e748a 100644
--- a/libcxx/include/array
+++ b/libcxx/include/array
@@ -123,12 +123,11 @@ template <size_t I, class T, size_t N> const T&& get(const array<T, N>&&) noexce
 #include <__iterator/wrap_iter.h>
 #include <__tuple/sfinae_helpers.h>
 #include <__type_traits/conditional.h>
+#include <__type_traits/conjunction.h>
 #include <__type_traits/is_array.h>
 #include <__type_traits/is_const.h>
 #include <__type_traits/is_constructible.h>
-#include <__type_traits/is_move_constructible.h>
 #include <__type_traits/is_nothrow_constructible.h>
-#include <__type_traits/is_nothrow_move_constructible.h>
 #include <__type_traits/is_same.h>
 #include <__type_traits/is_swappable.h>
 #include <__type_traits/remove_cv.h>
diff --git a/libcxx/include/chrono b/libcxx/include/chrono
index 0320c1dc4c2f96..5bab3f8ad5cf05 100644
--- a/libcxx/include/chrono
+++ b/libcxx/include/chrono
@@ -883,6 +883,7 @@ constexpr chrono::year                                  operator ""y(unsigned lo
 
 #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER == 20
 #  include <charconv>
+#  include <locale>
 #endif
 
 #endif // _LIBCPP_CHRONO
diff --git a/libcxx/include/complex b/libcxx/include/complex
index e996485a38ae67..a81f968143c4cf 100644
--- a/libcxx/include/complex
+++ b/libcxx/include/complex
@@ -258,6 +258,7 @@ template<class T> complex<T> tanh (const complex<T>&);
 
 #include <__config>
 #include <__fwd/complex.h>
+#include <__fwd/tuple.h>
 #include <__tuple/tuple_element.h>
 #include <__tuple/tuple_size.h>
 #include <__utility/move.h>
@@ -1443,15 +1444,9 @@ operator<<(basic_ostream<_CharT, _Traits>& __os, const complex<_Tp>& __x) {
 
 // [complex.tuple], tuple interface
 
-template <class _Tp>
-struct tuple_size;
-
 template <class _Tp>
 struct tuple_size<complex<_Tp>> : integral_constant<size_t, 2> {};
 
-template <size_t _Ip, class _Tp>
-struct tuple_element;
-
 template <size_t _Ip, class _Tp>
 struct tuple_element<_Ip, complex<_Tp>> {
   static_assert(_Ip < 2, "Index value is out of range.");
diff --git a/libcxx/include/coroutine b/libcxx/include/coroutine
index 4bd1d4e9c3103a..b1ba83b541b4b7 100644
--- a/libcxx/include/coroutine
+++ b/libcxx/include/coroutine
@@ -56,6 +56,7 @@ struct suspend_always;
 
 #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20
 #  include <iosfwd>
+#  include <limits>
 #  include <type_traits>
 #endif
 
diff --git a/libcxx/include/cwchar b/libcxx/include/cwchar
index 7442438d8f447f..4cc6f56c389bfe 100644
--- a/libcxx/include/cwchar
+++ b/libcxx/include/cwchar
@@ -254,4 +254,8 @@ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Tp* __constexpr_wmemchr(_Tp
 
 _LIBCPP_END_NAMESPACE_STD
 
+#if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20
+#  include <cstddef>
+#endif
+
 #endif // _LIBCPP_CWCHAR
diff --git a/libcxx/include/execution b/libcxx/include/execution
index 822ffa1fd3ebc4..94d434b2e4603e 100644
--- a/libcxx/include/execution
+++ b/libcxx/include/execution
@@ -142,4 +142,8 @@ _LIBCPP_END_NAMESPACE_STD
 
 #endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
 
+#if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20
+#  include <cstddef>
+#endif
+
 #endif // _LIBCPP_EXECUTION
diff --git a/libcxx/include/experimental/memory b/libcxx/include/experimental/memory
index 7e698fe9dee771..e9663d43a8ab73 100644
--- a/libcxx/include/experimental/memory
+++ b/libcxx/include/experimental/memory
@@ -191,4 +191,8 @@ _LIBCPP_END_NAMESPACE_STD
 
 #endif // _LIBCPP_ENABLE_EXPERIMENTAL
 
+#if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20
+#  include <limits>
+#endif
+
 #endif /* _LIBCPP_EXPERIMENTAL_MEMORY */
diff --git a/libcxx/include/format b/libcxx/include/format
index c0485c5a103596..146613464534f7 100644
--- a/libcxx/include/format
+++ b/libcxx/include/format
@@ -221,4 +221,8 @@ namespace std {
 #  pragma GCC system_header
 #endif
 
+#if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20
+#  include <locale>
+#endif
+
 #endif // _LIBCPP_FORMAT
diff --git a/libcxx/include/forward_list b/libcxx/include/forward_list
index a62b171a46783b..1dcbe8f88b90db 100644
--- a/libcxx/include/forward_list
+++ b/libcxx/include/forward_list
@@ -221,9 +221,8 @@ template <class T, class Allocator, class Predicate>
 #include <__type_traits/conditional.h>
 #include <__type_traits/is_allocator.h>
 #include <__type_traits/is_const.h>
-#include <__type_traits/is_nothrow_default_constructible.h>
-#include <__type_traits/is_nothrow_move_assignable.h>
-#include <__type_traits/is_nothrow_move_constructible.h>
+#include <__type_traits/is_nothrow_assignable.h>
+#include <__type_traits/is_nothrow_constructible.h>
 #include <__type_traits/is_pointer.h>
 #include <__type_traits/is_same.h>
 #include <__type_traits/type_identity.h>
diff --git a/libcxx/include/fstream b/libcxx/include/fstream
index 776641b347e6c1..7a084d114b1855 100644
--- a/libcxx/include/fstream
+++ b/libcxx/include/fstream
@@ -308,6 +308,43 @@ private:
   state_type __st_;
   state_type __st_last_;
   ios_base::openmode __om_;
+  // There have been no file operations yet, which allows setting unbuffered
+  // I/O mode.
+  static const ios_base::openmode __no_io_operations = ios_base::trunc;
+  // Unbuffered I/O mode has been requested.
+  static const ios_base::openmode __use_unbuffered_io = ios_base::ate;
+  // Used to track the currently used mode and track whether the output should
+  // be unbuffered.
+  // [filebuf.virtuals]/12
+  //   If setbuf(0, 0) is called on a stream before any I/O has occurred on
+  //   that stream, the stream becomes unbuffered. Otherwise the results are
+  //   implementation-defined.
+  // This allows calling setbuf(0, 0)
+  // - before opening a file,
+  // - after opening a file, before
+  //   - a read
+  //   - a write
+  //   - a seek.
+  // Note that opening a file with ios_base::ate does a seek operation.
+  // Normally underflow, overflow, and sync change this flag to ios_base::in,
+  // ios_base_out, or 0.
+  //
+  // The ios_base::trunc and ios_base::ate flags are not used in __cm_. They
+  // are used to track the state of the unbuffered request. For readability
+  // they have the aliases __no_io_operations and __use_unbuffered_io
+  // respectively.
+  //
+  // The __no_io_operations and __use_unbuffered_io flags are used in the
+  // following way:
+  // - __no_io_operations is set upon construction to indicate the unbuffered
+  //   state can be set.
+  // - When requesting unbuffered output:
+  //   - If the file is open it sets the mode.
+  //   - Else places a request by adding the __use_unbuffered_io flag.
+  // - When a file is opened it checks whether both __no_io_operations and
+  //   __use_unbuffered_io are set. If so switches to unbuffered mode.
+  // - All file I/O operations change the mode effectively clearing the
+  //   __no_io_operations and __use_unbuffered_io flags.
   ios_base::openmode __cm_;
   bool __owns_eb_;
   bool __owns_ib_;
@@ -327,7 +364,13 @@ private:
       return nullptr;
 
     __om_ = __mode;
+    if (__cm_ == (__no_io_operations | __use_unbuffered_io)) {
+      std::setbuf(__file_, nullptr);
+      __cm_ = 0;
+    }
+
     if (__mode & ios_base::ate) {
+      __cm_ = 0;
       if (fseek(__file_, 0, SEEK_END)) {
         fclose(__file_);
         __file_ = nullptr;
@@ -337,6 +380,20 @@ private:
 
     return this;
   }
+
+  // If the file is already open, switch to unbuffered mode. Otherwise, record
+  // the request to use unbuffered mode so that we use that mode when we
+  // eventually open the file.
+  _LIBCPP_HIDE_FROM_ABI void __request_unbuffered_mode(char_type* __s, streamsize __n) {
+    if (__cm_ == __no_io_operations && __s == nullptr && __n == 0) {
+      if (__file_) {
+        std::setbuf(__file_, nullptr);
+        __cm_ = 0;
+      } else {
+        __cm_ = __no_io_operations | __use_unbuffered_io;
+      }
+    }
+  }
 };
 
 template <class _CharT, class _Traits>
@@ -352,7 +409,7 @@ basic_filebuf<_CharT, _Traits>::basic_filebuf()
       __st_(),
       __st_last_(),
       __om_(0),
-      __cm_(0),
+      __cm_(__no_io_operations),
       __owns_eb_(false),
       __owns_ib_(false),
       __always_noconv_(false) {
@@ -810,6 +867,7 @@ template <class _CharT, class _Traits>
 basic_streambuf<_CharT, _Traits>* basic_filebuf<_CharT, _Traits>::setbuf(char_type* __s, streamsize __n) {
   this->setg(nullptr, nullptr, nullptr);
   this->setp(nullptr, nullptr);
+  __request_unbuffered_mode(__s, __n);
   if (__owns_eb_)
     delete[] __extbuf_;
   if (__owns_ib_)
diff --git a/libcxx/include/libcxx.imp b/libcxx/include/libcxx.imp
index b1e728cde868da..77b7befd44f56c 100644
--- a/libcxx/include/libcxx.imp
+++ b/libcxx/include/libcxx.imp
@@ -751,10 +751,7 @@
   { include: [ "<__type_traits/is_constant_evaluated.h>", "private", "<type_traits>", "public" ] },
   { include: [ "<__type_traits/is_constructible.h>", "private", "<type_traits>", "public" ] },
   { include: [ "<__type_traits/is_convertible.h>", "private", "<type_traits>", "public" ] },
-  { include: [ "<__type_traits/is_copy_assignable.h>", "private", "<type_traits>", "public" ] },
-  { include: [ "<__type_traits/is_copy_constructible.h>", "private", "<type_traits>", "public" ] },
   { include: [ "<__type_traits/is_core_convertible.h>", "private", "<type_traits>", "public" ] },
-  { include: [ "<__type_traits/is_default_constructible.h>", "private", "<type_traits>", "public" ] },
   { include: [ "<__type_traits/is_destructible.h>", "private", "<type_traits>", "public" ] },
   { include: [ "<__type_traits/is_empty.h>", "private", "<type_traits>", "public" ] },
   { include: [ "<__type_traits/is_enum.h>", "private", "<type_traits>", "public" ] },
@@ -770,17 +767,10 @@
   { include: [ "<__type_traits/is_member_function_pointer.h>", "private", "<type_traits>", "public" ] },
   { include: [ "<__type_traits/is_member_object_pointer.h>", "private", "<type_traits>", "public" ] },
   { include: [ "<__type_traits/is_member_pointer.h>", "private", "<type_traits>", "public" ] },
-  { include: [ "<__type_traits/is_move_assignable.h>", "private", "<type_traits>", "public" ] },
-  { include: [ "<__type_traits/is_move_constructible.h>", "private", "<type_traits>", "public" ] },
   { include: [ "<__type_traits/is_nothrow_assignable.h>", "private", "<type_traits>", "public" ] },
   { include: [ "<__type_traits/is_nothrow_constructible.h>", "private", "<type_traits>", "public" ] },
   { include: [ "<__type_traits/is_nothrow_convertible.h>", "private", "<type_traits>", "public" ] },
-  { include: [ "<__type_traits/is_nothrow_copy_assignable.h>", "private", "<type_traits>", "public" ] },
-  { include: [ "<__type_traits/is_nothrow_copy_constructible.h>", "private", "<type_traits>", "public" ] },
-  { include: [ "<__type_traits/is_nothrow_default_constructible.h>", "private", "<type_traits>", "public" ] },
   { include: [ "<__type_traits/is_nothrow_destructible.h>", "private", "<type_traits>", "public" ] },
-  { include: [ "<__type_traits/is_nothrow_move_assignable.h>", "private", "<type_traits>", "public" ] },
-  { include: [ "<__type_traits/is_nothrow_move_constructible.h>", "private", "<type_traits>", "public" ] },
   { include: [ "<__type_traits/is_null_pointer.h>", "private", "<type_traits>", "public" ] },
   { include: [ "<__type_traits/is_object.h>", "private", "<type_traits>", "public" ] },
   { include: [ "<__type_traits/is_pod.h>", "private", "<type_traits>", "public" ] },
@@ -801,14 +791,9 @@
   { include: [ "<__type_traits/is_trivial.h>", "private", "<type_traits>", "public" ] },
   { include: [ "<__type_traits/is_trivially_assignable.h>", "private", "<type_traits>", "public" ] },
   { include: [ "<__type_traits/is_trivially_constructible.h>", "private", "<type_traits>", "public" ] },
-  { include: [ "<__type_traits/is_trivially_copy_assignable.h>", "private", "<type_traits>", "public" ] },
-  { include: [ "<__type_traits/is_trivially_copy_constructible.h>", "private", "<type_traits>", "public" ] },
   { include: [ "<__type_traits/is_trivially_copyable.h>", "private", "<type_traits>", "public" ] },
-  { include: [ "<__type_traits/is_trivially_default_constructible.h>", "private", "<type_traits>", "public" ] },
   { include: [ "<__type_traits/is_trivially_destructible.h>", "private", "<type_traits>", "public" ] },
   { include: [ "<__type_traits/is_trivially_lexicographically_comparable.h>", "private", "<type_traits>", "public" ] },
-  { include: [ "<__type_traits/is_trivially_move_assignable.h>", "private", "<type_traits>", "public" ] },
-  { include: [ "<__type_traits/is_trivially_move_constructible.h>", "private", "<type_traits>", "public" ] },
   { include: [ "<__type_traits/is_trivially_relocatable.h>", "private", "<type_traits>", "public" ] },
   { include: [ "<__type_traits/is_unbounded_array.h>", "private", "<type_traits>", "public" ] },
   { include: [ "<__type_traits/is_union.h>", "private", "<type_traits>", "public" ] },
diff --git a/libcxx/include/limits b/libcxx/include/limits
index f15b5b1ab1d52f..f022048cc26554 100644
--- a/libcxx/include/limits
+++ b/libcxx/include/limits
@@ -137,9 +137,9 @@ protected:
   typedef _Tp type;
 
   static _LIBCPP_CONSTEXPR const bool is_specialized = false;
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type min() _NOEXCEPT { return type(); }
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type max() _NOEXCEPT { return type(); }
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type lowest() _NOEXCEPT { return type(); }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type min() _NOEXCEPT { return type(); }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type max() _NOEXCEPT { return type(); }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type lowest() _NOEXCEPT { return type(); }
 
   static _LIBCPP_CONSTEXPR const int digits       = 0;
   static _LIBCPP_CONSTEXPR const int digits10     = 0;
@@ -148,8 +148,8 @@ protected:
   static _LIBCPP_CONSTEXPR const bool is_integer  = false;
   static _LIBCPP_CONSTEXPR const bool is_exact    = false;
   static _LIBCPP_CONSTEXPR const int radix        = 0;
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type epsilon() _NOEXCEPT { return type(); }
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type round_error() _NOEXCEPT { return type(); }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type epsilon() _NOEXCEPT { return type(); }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type round_error() _NOEXCEPT { return type(); }
 
   static _LIBCPP_CONSTEXPR const int min_exponent   = 0;
   static _LIBCPP_CONSTEXPR const int min_exponent10 = 0;
@@ -161,10 +161,10 @@ protected:
   static _LIBCPP_CONSTEXPR const bool has_signaling_NaN                                    = false;
   static _LIBCPP_DEPRECATED_IN_CXX23 _LIBCPP_CONSTEXPR const float_denorm_style has_denorm = denorm_absent;
   static _LIBCPP_DEPRECATED_IN_CXX23 _LIBCPP_CONSTEXPR const bool has_denorm_loss          = false;
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type infinity() _NOEXCEPT { return type(); }
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type quiet_NaN() _NOEXCEPT { return type(); }
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type signaling_NaN() _NOEXCEPT { return type(); }
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type denorm_min() _NOEXCEPT { return type(); }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type infinity() _NOEXCEPT { return type(); }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type quiet_NaN() _NOEXCEPT { return type(); }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type signaling_NaN() _NOEXCEPT { return type(); }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type denorm_min() _NOEXCEPT { return type(); }
 
   static _LIBCPP_CONSTEXPR const bool is_iec559  = false;
   static _LIBCPP_CONSTEXPR const bool is_bounded = false;
@@ -198,15 +198,15 @@ protected:
   static _LIBCPP_CONSTEXPR const int max_digits10 = 0;
   static _LIBCPP_CONSTEXPR const type __min       = __libcpp_compute_min<type, digits, is_signed>::value;
   static _LIBCPP_CONSTEXPR const type __max       = is_signed ? type(type(~0) ^ __min) : type(~0);
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type min() _NOEXCEPT { return __min; }
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type max() _NOEXCEPT { return __max; }
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type lowest() _NOEXCEPT { return min(); }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type min() _NOEXCEPT { return __min; }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type max() _NOEXCEPT { return __max; }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type lowest() _NOEXCEPT { return min(); }
 
   static _LIBCPP_CONSTEXPR const bool is_integer = true;
   static _LIBCPP_CONSTEXPR const bool is_exact   = true;
   static _LIBCPP_CONSTEXPR const int radix       = 2;
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type epsilon() _NOEXCEPT { return type(0); }
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type round_error() _NOEXCEPT { return type(0); }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type epsilon() _NOEXCEPT { return type(0); }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type round_error() _NOEXCEPT { return type(0); }
 
   static _LIBCPP_CONSTEXPR const int min_exponent   = 0;
   static _LIBCPP_CONSTEXPR const int min_exponent10 = 0;
@@ -218,10 +218,10 @@ protected:
   static _LIBCPP_CONSTEXPR const bool has_signaling_NaN                                    = false;
   static _LIBCPP_DEPRECATED_IN_CXX23 _LIBCPP_CONSTEXPR const float_denorm_style has_denorm = denorm_absent;
   static _LIBCPP_DEPRECATED_IN_CXX23 _LIBCPP_CONSTEXPR const bool has_denorm_loss          = false;
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type infinity() _NOEXCEPT { return type(0); }
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type quiet_NaN() _NOEXCEPT { return type(0); }
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type signaling_NaN() _NOEXCEPT { return type(0); }
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type denorm_min() _NOEXCEPT { return type(0); }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type infinity() _NOEXCEPT { return type(0); }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type quiet_NaN() _NOEXCEPT { return type(0); }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type signaling_NaN() _NOEXCEPT { return type(0); }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type denorm_min() _NOEXCEPT { return type(0); }
 
   static _LIBCPP_CONSTEXPR const bool is_iec559  = false;
   static _LIBCPP_CONSTEXPR const bool is_bounded = true;
@@ -249,15 +249,15 @@ protected:
   static _LIBCPP_CONSTEXPR const int max_digits10 = 0;
   static _LIBCPP_CONSTEXPR const type __min       = false;
   static _LIBCPP_CONSTEXPR const type __max       = true;
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type min() _NOEXCEPT { return __min; }
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type max() _NOEXCEPT { return __max; }
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type lowest() _NOEXCEPT { return min(); }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type min() _NOEXCEPT { return __min; }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type max() _NOEXCEPT { return __max; }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type lowest() _NOEXCEPT { return min(); }
 
   static _LIBCPP_CONSTEXPR const bool is_integer = true;
   static _LIBCPP_CONSTEXPR const bool is_exact   = true;
   static _LIBCPP_CONSTEXPR const int radix       = 2;
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type epsilon() _NOEXCEPT { return type(0); }
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type round_error() _NOEXCEPT { return type(0); }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type epsilon() _NOEXCEPT { return type(0); }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type round_error() _NOEXCEPT { return type(0); }
 
   static _LIBCPP_CONSTEXPR const int min_exponent   = 0;
   static _LIBCPP_CONSTEXPR const int min_exponent10 = 0;
@@ -269,10 +269,10 @@ protected:
   static _LIBCPP_CONSTEXPR const bool has_signaling_NaN                                    = false;
   static _LIBCPP_DEPRECATED_IN_CXX23 _LIBCPP_CONSTEXPR const float_denorm_style has_denorm = denorm_absent;
   static _LIBCPP_DEPRECATED_IN_CXX23 _LIBCPP_CONSTEXPR const bool has_denorm_loss          = false;
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type infinity() _NOEXCEPT { return type(0); }
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type quiet_NaN() _NOEXCEPT { return type(0); }
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type signaling_NaN() _NOEXCEPT { return type(0); }
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type denorm_min() _NOEXCEPT { return type(0); }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type infinity() _NOEXCEPT { return type(0); }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type quiet_NaN() _NOEXCEPT { return type(0); }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type signaling_NaN() _NOEXCEPT { return type(0); }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type denorm_min() _NOEXCEPT { return type(0); }
 
   static _LIBCPP_CONSTEXPR const bool is_iec559  = false;
   static _LIBCPP_CONSTEXPR const bool is_bounded = true;
@@ -294,15 +294,15 @@ protected:
   static _LIBCPP_CONSTEXPR const int digits       = __FLT_MANT_DIG__;
   static _LIBCPP_CONSTEXPR const int digits10     = __FLT_DIG__;
   static _LIBCPP_CONSTEXPR const int max_digits10 = 2 + (digits * 30103l) / 100000l;
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type min() _NOEXCEPT { return __FLT_MIN__; }
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type max() _NOEXCEPT { return __FLT_MAX__; }
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type lowest() _NOEXCEPT { return -max(); }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type min() _NOEXCEPT { return __FLT_MIN__; }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type max() _NOEXCEPT { return __FLT_MAX__; }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type lowest() _NOEXCEPT { return -max(); }
 
   static _LIBCPP_CONSTEXPR const bool is_integer = false;
   static _LIBCPP_CONSTEXPR const bool is_exact   = false;
   static _LIBCPP_CONSTEXPR const int radix       = __FLT_RADIX__;
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type epsilon() _NOEXCEPT { return __FLT_EPSILON__; }
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type round_error() _NOEXCEPT { return 0.5F; }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type epsilon() _NOEXCEPT { return __FLT_EPSILON__; }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type round_error() _NOEXCEPT { return 0.5F; }
 
   static _LIBCPP_CONSTEXPR const int min_exponent   = __FLT_MIN_EXP__;
   static _LIBCPP_CONSTEXPR const int min_exponent10 = __FLT_MIN_10_EXP__;
@@ -314,10 +314,10 @@ protected:
   static _LIBCPP_CONSTEXPR const bool has_signaling_NaN                                    = true;
   static _LIBCPP_DEPRECATED_IN_CXX23 _LIBCPP_CONSTEXPR const float_denorm_style has_denorm = denorm_present;
   static _LIBCPP_DEPRECATED_IN_CXX23 _LIBCPP_CONSTEXPR const bool has_denorm_loss          = false;
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type infinity() _NOEXCEPT { return __builtin_huge_valf(); }
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type quiet_NaN() _NOEXCEPT { return __builtin_nanf(""); }
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type signaling_NaN() _NOEXCEPT { return __builtin_nansf(""); }
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type denorm_min() _NOEXCEPT { return __FLT_DENORM_MIN__; }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type infinity() _NOEXCEPT { return __builtin_huge_valf(); }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type quiet_NaN() _NOEXCEPT { return __builtin_nanf(""); }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type signaling_NaN() _NOEXCEPT { return __builtin_nansf(""); }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type denorm_min() _NOEXCEPT { return __FLT_DENORM_MIN__; }
 
   static _LIBCPP_CONSTEXPR const bool is_iec559  = true;
   static _LIBCPP_CONSTEXPR const bool is_bounded = true;
@@ -343,15 +343,15 @@ protected:
   static _LIBCPP_CONSTEXPR const int digits       = __DBL_MANT_DIG__;
   static _LIBCPP_CONSTEXPR const int digits10     = __DBL_DIG__;
   static _LIBCPP_CONSTEXPR const int max_digits10 = 2 + (digits * 30103l) / 100000l;
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type min() _NOEXCEPT { return __DBL_MIN__; }
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type max() _NOEXCEPT { return __DBL_MAX__; }
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type lowest() _NOEXCEPT { return -max(); }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type min() _NOEXCEPT { return __DBL_MIN__; }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type max() _NOEXCEPT { return __DBL_MAX__; }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type lowest() _NOEXCEPT { return -max(); }
 
   static _LIBCPP_CONSTEXPR const bool is_integer = false;
   static _LIBCPP_CONSTEXPR const bool is_exact   = false;
   static _LIBCPP_CONSTEXPR const int radix       = __FLT_RADIX__;
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type epsilon() _NOEXCEPT { return __DBL_EPSILON__; }
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type round_error() _NOEXCEPT { return 0.5; }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type epsilon() _NOEXCEPT { return __DBL_EPSILON__; }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type round_error() _NOEXCEPT { return 0.5; }
 
   static _LIBCPP_CONSTEXPR const int min_exponent   = __DBL_MIN_EXP__;
   static _LIBCPP_CONSTEXPR const int min_exponent10 = __DBL_MIN_10_EXP__;
@@ -363,10 +363,10 @@ protected:
   static _LIBCPP_CONSTEXPR const bool has_signaling_NaN                                    = true;
   static _LIBCPP_DEPRECATED_IN_CXX23 _LIBCPP_CONSTEXPR const float_denorm_style has_denorm = denorm_present;
   static _LIBCPP_DEPRECATED_IN_CXX23 _LIBCPP_CONSTEXPR const bool has_denorm_loss          = false;
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type infinity() _NOEXCEPT { return __builtin_huge_val(); }
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type quiet_NaN() _NOEXCEPT { return __builtin_nan(""); }
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type signaling_NaN() _NOEXCEPT { return __builtin_nans(""); }
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type denorm_min() _NOEXCEPT { return __DBL_DENORM_MIN__; }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type infinity() _NOEXCEPT { return __builtin_huge_val(); }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type quiet_NaN() _NOEXCEPT { return __builtin_nan(""); }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type signaling_NaN() _NOEXCEPT { return __builtin_nans(""); }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type denorm_min() _NOEXCEPT { return __DBL_DENORM_MIN__; }
 
   static _LIBCPP_CONSTEXPR const bool is_iec559  = true;
   static _LIBCPP_CONSTEXPR const bool is_bounded = true;
@@ -392,15 +392,15 @@ protected:
   static _LIBCPP_CONSTEXPR const int digits       = __LDBL_MANT_DIG__;
   static _LIBCPP_CONSTEXPR const int digits10     = __LDBL_DIG__;
   static _LIBCPP_CONSTEXPR const int max_digits10 = 2 + (digits * 30103l) / 100000l;
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type min() _NOEXCEPT { return __LDBL_MIN__; }
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type max() _NOEXCEPT { return __LDBL_MAX__; }
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type lowest() _NOEXCEPT { return -max(); }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type min() _NOEXCEPT { return __LDBL_MIN__; }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type max() _NOEXCEPT { return __LDBL_MAX__; }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type lowest() _NOEXCEPT { return -max(); }
 
   static _LIBCPP_CONSTEXPR const bool is_integer = false;
   static _LIBCPP_CONSTEXPR const bool is_exact   = false;
   static _LIBCPP_CONSTEXPR const int radix       = __FLT_RADIX__;
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type epsilon() _NOEXCEPT { return __LDBL_EPSILON__; }
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type round_error() _NOEXCEPT { return 0.5L; }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type epsilon() _NOEXCEPT { return __LDBL_EPSILON__; }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type round_error() _NOEXCEPT { return 0.5L; }
 
   static _LIBCPP_CONSTEXPR const int min_exponent   = __LDBL_MIN_EXP__;
   static _LIBCPP_CONSTEXPR const int min_exponent10 = __LDBL_MIN_10_EXP__;
@@ -412,10 +412,10 @@ protected:
   static _LIBCPP_CONSTEXPR const bool has_signaling_NaN                                    = true;
   static _LIBCPP_DEPRECATED_IN_CXX23 _LIBCPP_CONSTEXPR const float_denorm_style has_denorm = denorm_present;
   static _LIBCPP_DEPRECATED_IN_CXX23 _LIBCPP_CONSTEXPR const bool has_denorm_loss          = false;
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type infinity() _NOEXCEPT { return __builtin_huge_vall(); }
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type quiet_NaN() _NOEXCEPT { return __builtin_nanl(""); }
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type signaling_NaN() _NOEXCEPT { return __builtin_nansl(""); }
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type denorm_min() _NOEXCEPT { return __LDBL_DENORM_MIN__; }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type infinity() _NOEXCEPT { return __builtin_huge_vall(); }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type quiet_NaN() _NOEXCEPT { return __builtin_nanl(""); }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type signaling_NaN() _NOEXCEPT { return __builtin_nansl(""); }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type denorm_min() _NOEXCEPT { return __LDBL_DENORM_MIN__; }
 
 #if defined(__powerpc__) && defined(__LONG_DOUBLE_IBM128__)
   static _LIBCPP_CONSTEXPR const bool is_iec559 = false;
@@ -441,9 +441,9 @@ class _LIBCPP_TEMPLATE_VIS numeric_limits : private __libcpp_numeric_limits<_Tp>
 
 public:
   static _LIBCPP_CONSTEXPR const bool is_specialized = __base::is_specialized;
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type min() _NOEXCEPT { return __base::min(); }
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type max() _NOEXCEPT { return __base::max(); }
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type lowest() _NOEXCEPT { return __base::lowest(); }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type min() _NOEXCEPT { return __base::min(); }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type max() _NOEXCEPT { return __base::max(); }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type lowest() _NOEXCEPT { return __base::lowest(); }
 
   static _LIBCPP_CONSTEXPR const int digits       = __base::digits;
   static _LIBCPP_CONSTEXPR const int digits10     = __base::digits10;
@@ -452,8 +452,8 @@ public:
   static _LIBCPP_CONSTEXPR const bool is_integer  = __base::is_integer;
   static _LIBCPP_CONSTEXPR const bool is_exact    = __base::is_exact;
   static _LIBCPP_CONSTEXPR const int radix        = __base::radix;
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type epsilon() _NOEXCEPT { return __base::epsilon(); }
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type round_error() _NOEXCEPT { return __base::round_error(); }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type epsilon() _NOEXCEPT { return __base::epsilon(); }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type round_error() _NOEXCEPT { return __base::round_error(); }
 
   static _LIBCPP_CONSTEXPR const int min_exponent   = __base::min_exponent;
   static _LIBCPP_CONSTEXPR const int min_exponent10 = __base::min_exponent10;
@@ -467,10 +467,10 @@ public:
   static _LIBCPP_DEPRECATED_IN_CXX23 _LIBCPP_CONSTEXPR const float_denorm_style has_denorm = __base::has_denorm;
   static _LIBCPP_DEPRECATED_IN_CXX23 _LIBCPP_CONSTEXPR const bool has_denorm_loss          = __base::has_denorm_loss;
   _LIBCPP_SUPPRESS_DEPRECATED_POP
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type infinity() _NOEXCEPT { return __base::infinity(); }
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type quiet_NaN() _NOEXCEPT { return __base::quiet_NaN(); }
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type signaling_NaN() _NOEXCEPT { return __base::signaling_NaN(); }
-  _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type denorm_min() _NOEXCEPT { return __base::denorm_min(); }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type infinity() _NOEXCEPT { return __base::infinity(); }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type quiet_NaN() _NOEXCEPT { return __base::quiet_NaN(); }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type signaling_NaN() _NOEXCEPT { return __base::signaling_NaN(); }
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR type denorm_min() _NOEXCEPT { return __base::denorm_min(); }
 
   static _LIBCPP_CONSTEXPR const bool is_iec559  = __base::is_iec559;
   static _LIBCPP_CONSTEXPR const bool is_bounded = __base::is_bounded;
diff --git a/libcxx/include/list b/libcxx/include/list
index 8f0689268e2a5a..9de3d1f60a1f79 100644
--- a/libcxx/include/list
+++ b/libcxx/include/list
@@ -227,9 +227,8 @@ template <class T, class Allocator, class Predicate>
 #include <__ranges/from_range.h>
 #include <__type_traits/conditional.h>
 #include <__type_traits/is_allocator.h>
-#include <__type_traits/is_nothrow_default_constructible.h>
-#include <__type_traits/is_nothrow_move_assignable.h>
-#include <__type_traits/is_nothrow_move_constructible.h>
+#include <__type_traits/is_nothrow_assignable.h>
+#include <__type_traits/is_nothrow_constructible.h>
 #include <__type_traits/is_pointer.h>
 #include <__type_traits/is_same.h>
 #include <__type_traits/type_identity.h>
diff --git a/libcxx/include/module.modulemap b/libcxx/include/module.modulemap
index 0bd2831b7f159c..f36a47cef00977 100644
--- a/libcxx/include/module.modulemap
+++ b/libcxx/include/module.modulemap
@@ -1902,7 +1902,6 @@ module std_private_type_traits_is_core_convertible                       [system
   header "__type_traits/is_core_convertible.h"
   export std_private_type_traits_integral_constant
 }
-module std_private_type_traits_is_default_constructible                  [system] { header "__type_traits/is_default_constructible.h" }
 module std_private_type_traits_is_destructible                           [system] { header "__type_traits/is_destructible.h" }
 module std_private_type_traits_is_empty                                  [system] { header "__type_traits/is_empty.h" }
 module std_private_type_traits_is_enum                                   [system] {
@@ -1927,26 +1926,16 @@ module std_private_type_traits_is_literal_type                           [system
 module std_private_type_traits_is_member_function_pointer                [system] { header "__type_traits/is_member_function_pointer.h" }
 module std_private_type_traits_is_member_object_pointer                  [system] { header "__type_traits/is_member_object_pointer.h" }
 module std_private_type_traits_is_member_pointer                         [system] { header "__type_traits/is_member_pointer.h" }
-module std_private_type_traits_is_move_assignable                        [system] { header "__type_traits/is_move_assignable.h" }
-module std_private_type_traits_is_move_constructible                     [system] { header "__type_traits/is_move_constructible.h" }
 module std_private_type_traits_is_nothrow_assignable                     [system] { header "__type_traits/is_nothrow_assignable.h" }
-module std_private_type_traits_is_nothrow_constructible                  [system] { header "__type_traits/is_nothrow_constructible.h" }
-module std_private_type_traits_is_nothrow_convertible                    [system] { header "__type_traits/is_nothrow_convertible.h" }
-module std_private_type_traits_is_nothrow_copy_assignable                [system] { header "__type_traits/is_nothrow_copy_assignable.h" }
-module std_private_type_traits_is_nothrow_copy_constructible             [system] { header "__type_traits/is_nothrow_copy_constructible.h" }
-module std_private_type_traits_is_nothrow_default_constructible          [system] {
-  header "__type_traits/is_nothrow_default_constructible.h"
+module std_private_type_traits_is_nothrow_constructible                  [system] {
+  header "__type_traits/is_nothrow_constructible.h"
   export std_private_type_traits_integral_constant
 }
+module std_private_type_traits_is_nothrow_convertible                    [system] { header "__type_traits/is_nothrow_convertible.h" }
 module std_private_type_traits_is_nothrow_destructible                   [system] {
   header "__type_traits/is_nothrow_destructible.h"
   export std_private_type_traits_is_destructible
 }
-module std_private_type_traits_is_nothrow_move_assignable                [system] { header "__type_traits/is_nothrow_move_assignable.h" }
-module std_private_type_traits_is_nothrow_move_constructible             [system] {
-  header "__type_traits/is_nothrow_move_constructible.h"
-  export std_private_type_traits_is_nothrow_constructible
-}
 module std_private_type_traits_is_null_pointer                           [system] {
   header "__type_traits/is_null_pointer.h"
   export std_cstddef
@@ -1985,14 +1974,9 @@ module std_private_type_traits_is_swappable                              [system
 module std_private_type_traits_is_trivial                                [system] { header "__type_traits/is_trivial.h" }
 module std_private_type_traits_is_trivially_assignable                   [system] { header "__type_traits/is_trivially_assignable.h" }
 module std_private_type_traits_is_trivially_constructible                [system] { header "__type_traits/is_trivially_constructible.h" }
-module std_private_type_traits_is_trivially_copy_assignable              [system] { header "__type_traits/is_trivially_copy_assignable.h" }
-module std_private_type_traits_is_trivially_copy_constructible           [system] { header "__type_traits/is_trivially_copy_constructible.h" }
 module std_private_type_traits_is_trivially_copyable                     [system] { header "__type_traits/is_trivially_copyable.h" }
-module std_private_type_traits_is_trivially_default_constructible        [system] { header "__type_traits/is_trivially_default_constructible.h" }
 module std_private_type_traits_is_trivially_destructible                 [system] { header "__type_traits/is_trivially_destructible.h" }
 module std_private_type_traits_is_trivially_lexicographically_comparable [system] { header "__type_traits/is_trivially_lexicographically_comparable.h" }
-module std_private_type_traits_is_trivially_move_assignable              [system] { header "__type_traits/is_trivially_move_assignable.h" }
-module std_private_type_traits_is_trivially_move_constructible           [system] { header "__type_traits/is_trivially_move_constructible.h" }
 module std_private_type_traits_is_trivially_relocatable                  [system] { header "__type_traits/is_trivially_relocatable.h" }
 module std_private_type_traits_is_unbounded_array                        [system] { header "__type_traits/is_unbounded_array.h" }
 module std_private_type_traits_is_union                                  [system] { header "__type_traits/is_union.h" }
diff --git a/libcxx/include/mutex b/libcxx/include/mutex
index ea56e3051908a7..12fae9a88b9d7e 100644
--- a/libcxx/include/mutex
+++ b/libcxx/include/mutex
@@ -418,24 +418,6 @@ inline _LIBCPP_HIDE_FROM_ABI void lock(_L0& __l0, _L1& __l1, _L2& __l2, _L3&...
   std::__lock_first(0, __l0, __l1, __l2, __l3...);
 }
 
-template <class _L0>
-inline _LIBCPP_HIDE_FROM_ABI void __unlock(_L0& __l0) {
-  __l0.unlock();
-}
-
-template <class _L0, class _L1>
-inline _LIBCPP_HIDE_FROM_ABI void __unlock(_L0& __l0, _L1& __l1) {
-  __l0.unlock();
-  __l1.unlock();
-}
-
-template <class _L0, class _L1, class _L2, class... _L3>
-inline _LIBCPP_HIDE_FROM_ABI void __unlock(_L0& __l0, _L1& __l1, _L2& __l2, _L3&... __l3) {
-  __l0.unlock();
-  __l1.unlock();
-  std::__unlock(__l2, __l3...);
-}
-
 #  endif // _LIBCPP_CXX03_LANG
 
 #  if _LIBCPP_STD_VER >= 17
@@ -498,7 +480,7 @@ public:
 private:
   template <size_t... _Indx>
   _LIBCPP_HIDE_FROM_ABI static void __unlock_unpack(__tuple_indices<_Indx...>, _MutexTuple& __mt) {
-    std::__unlock(std::get<_Indx>(__mt)...);
+    (std::get<_Indx>(__mt).unlock(), ...);
   }
 
   _MutexTuple __t_;
diff --git a/libcxx/include/optional b/libcxx/include/optional
index 99bfd0dd900d3a..a16e48502e2509 100644
--- a/libcxx/include/optional
+++ b/libcxx/include/optional
@@ -200,22 +200,16 @@ namespace std {
 #include <__type_traits/is_assignable.h>
 #include <__type_traits/is_constructible.h>
 #include <__type_traits/is_convertible.h>
-#include <__type_traits/is_copy_assignable.h>
-#include <__type_traits/is_copy_constructible.h>
 #include <__type_traits/is_destructible.h>
-#include <__type_traits/is_move_assignable.h>
-#include <__type_traits/is_move_constructible.h>
-#include <__type_traits/is_nothrow_move_assignable.h>
-#include <__type_traits/is_nothrow_move_constructible.h>
+#include <__type_traits/is_nothrow_assignable.h>
+#include <__type_traits/is_nothrow_constructible.h>
 #include <__type_traits/is_object.h>
 #include <__type_traits/is_reference.h>
 #include <__type_traits/is_scalar.h>
 #include <__type_traits/is_swappable.h>
-#include <__type_traits/is_trivially_copy_assignable.h>
-#include <__type_traits/is_trivially_copy_constructible.h>
+#include <__type_traits/is_trivially_assignable.h>
+#include <__type_traits/is_trivially_constructible.h>
 #include <__type_traits/is_trivially_destructible.h>
-#include <__type_traits/is_trivially_move_assignable.h>
-#include <__type_traits/is_trivially_move_constructible.h>
 #include <__type_traits/negation.h>
 #include <__type_traits/remove_const.h>
 #include <__type_traits/remove_cvref.h>
@@ -1291,6 +1285,7 @@ _LIBCPP_POP_MACROS
 #  include <concepts>
 #  include <ctime>
 #  include <iterator>
+#  include <limits>
 #  include <memory>
 #  include <ratio>
 #  include <stdexcept>
diff --git a/libcxx/include/string b/libcxx/include/string
index ca5b3fa6a01472..a456f8cb80ee35 100644
--- a/libcxx/include/string
+++ b/libcxx/include/string
@@ -603,8 +603,8 @@ basic_string<char32_t> operator""s( const char32_t *str, size_t len );
 #include <__type_traits/is_allocator.h>
 #include <__type_traits/is_array.h>
 #include <__type_traits/is_convertible.h>
-#include <__type_traits/is_nothrow_default_constructible.h>
-#include <__type_traits/is_nothrow_move_assignable.h>
+#include <__type_traits/is_nothrow_assignable.h>
+#include <__type_traits/is_nothrow_constructible.h>
 #include <__type_traits/is_same.h>
 #include <__type_traits/is_standard_layout.h>
 #include <__type_traits/is_trivial.h>
diff --git a/libcxx/include/tuple b/libcxx/include/tuple
index 02ace66542feb8..f78db061b844b0 100644
--- a/libcxx/include/tuple
+++ b/libcxx/include/tuple
@@ -232,20 +232,11 @@ template <class... Types>
 #include <__type_traits/is_assignable.h>
 #include <__type_traits/is_constructible.h>
 #include <__type_traits/is_convertible.h>
-#include <__type_traits/is_copy_assignable.h>
-#include <__type_traits/is_copy_constructible.h>
-#include <__type_traits/is_default_constructible.h>
 #include <__type_traits/is_empty.h>
 #include <__type_traits/is_final.h>
 #include <__type_traits/is_implicitly_default_constructible.h>
-#include <__type_traits/is_move_assignable.h>
-#include <__type_traits/is_move_constructible.h>
 #include <__type_traits/is_nothrow_assignable.h>
 #include <__type_traits/is_nothrow_constructible.h>
-#include <__type_traits/is_nothrow_copy_assignable.h>
-#include <__type_traits/is_nothrow_copy_constructible.h>
-#include <__type_traits/is_nothrow_default_constructible.h>
-#include <__type_traits/is_nothrow_move_assignable.h>
 #include <__type_traits/is_reference.h>
 #include <__type_traits/is_same.h>
 #include <__type_traits/is_swappable.h>
diff --git a/libcxx/include/type_traits b/libcxx/include/type_traits
index 54c8abec340c4d..10f9b881c0e4a3 100644
--- a/libcxx/include/type_traits
+++ b/libcxx/include/type_traits
@@ -458,9 +458,6 @@ namespace std
 #include <__type_traits/is_constant_evaluated.h>
 #include <__type_traits/is_constructible.h>
 #include <__type_traits/is_convertible.h>
-#include <__type_traits/is_copy_assignable.h>
-#include <__type_traits/is_copy_constructible.h>
-#include <__type_traits/is_default_constructible.h>
 #include <__type_traits/is_destructible.h>
 #include <__type_traits/is_empty.h>
 #include <__type_traits/is_enum.h>
@@ -474,17 +471,10 @@ namespace std
 #include <__type_traits/is_member_function_pointer.h>
 #include <__type_traits/is_member_object_pointer.h>
 #include <__type_traits/is_member_pointer.h>
-#include <__type_traits/is_move_assignable.h>
-#include <__type_traits/is_move_constructible.h>
 #include <__type_traits/is_nothrow_assignable.h>
 #include <__type_traits/is_nothrow_constructible.h>
 #include <__type_traits/is_nothrow_convertible.h>
-#include <__type_traits/is_nothrow_copy_assignable.h>
-#include <__type_traits/is_nothrow_copy_constructible.h>
-#include <__type_traits/is_nothrow_default_constructible.h>
 #include <__type_traits/is_nothrow_destructible.h>
-#include <__type_traits/is_nothrow_move_assignable.h>
-#include <__type_traits/is_nothrow_move_constructible.h>
 #include <__type_traits/is_null_pointer.h>
 #include <__type_traits/is_object.h>
 #include <__type_traits/is_pod.h>
@@ -503,13 +493,8 @@ namespace std
 #include <__type_traits/is_trivial.h>
 #include <__type_traits/is_trivially_assignable.h>
 #include <__type_traits/is_trivially_constructible.h>
-#include <__type_traits/is_trivially_copy_assignable.h>
-#include <__type_traits/is_trivially_copy_constructible.h>
 #include <__type_traits/is_trivially_copyable.h>
-#include <__type_traits/is_trivially_default_constructible.h>
 #include <__type_traits/is_trivially_destructible.h>
-#include <__type_traits/is_trivially_move_assignable.h>
-#include <__type_traits/is_trivially_move_constructible.h>
 #include <__type_traits/is_unbounded_array.h>
 #include <__type_traits/is_union.h>
 #include <__type_traits/is_unsigned.h>
diff --git a/libcxx/include/variant b/libcxx/include/variant
index 59d7f9b740f3be..f5e99fc3239e84 100644
--- a/libcxx/include/variant
+++ b/libcxx/include/variant
@@ -228,17 +228,16 @@ namespace std {
 #include <__type_traits/add_pointer.h>
 #include <__type_traits/add_volatile.h>
 #include <__type_traits/common_type.h>
+#include <__type_traits/conjunction.h>
 #include <__type_traits/dependent_type.h>
 #include <__type_traits/is_array.h>
-#include <__type_traits/is_default_constructible.h>
+#include <__type_traits/is_constructible.h>
 #include <__type_traits/is_destructible.h>
 #include <__type_traits/is_nothrow_assignable.h>
-#include <__type_traits/is_nothrow_move_constructible.h>
-#include <__type_traits/is_trivially_copy_assignable.h>
-#include <__type_traits/is_trivially_copy_constructible.h>
+#include <__type_traits/is_nothrow_constructible.h>
+#include <__type_traits/is_trivially_assignable.h>
+#include <__type_traits/is_trivially_constructible.h>
 #include <__type_traits/is_trivially_destructible.h>
-#include <__type_traits/is_trivially_move_assignable.h>
-#include <__type_traits/is_trivially_move_constructible.h>
 #include <__type_traits/is_void.h>
 #include <__type_traits/remove_const.h>
 #include <__type_traits/type_identity.h>
diff --git a/libcxx/include/vector b/libcxx/include/vector
index 90507e6baaf243..0908482600c533 100644
--- a/libcxx/include/vector
+++ b/libcxx/include/vector
@@ -346,7 +346,7 @@ template<class T, class charT> requires is-vector-bool-reference<T> // Since C++
 #include <__split_buffer>
 #include <__type_traits/is_allocator.h>
 #include <__type_traits/is_constructible.h>
-#include <__type_traits/is_nothrow_move_assignable.h>
+#include <__type_traits/is_nothrow_assignable.h>
 #include <__type_traits/noexcept_move_assign_container.h>
 #include <__type_traits/type_identity.h>
 #include <__utility/exception_guard.h>
diff --git a/libcxx/modules/CMakeLists.txt b/libcxx/modules/CMakeLists.txt
index 0dea8cfca94ac3..8dd1b51aac2f72 100644
--- a/libcxx/modules/CMakeLists.txt
+++ b/libcxx/modules/CMakeLists.txt
@@ -206,9 +206,15 @@ add_custom_target(generate-cxx-modules
 # Configure the modules manifest.
 # Use the relative path between the installation and the module in the json
 # file. This allows moving the entire installation to a different location.
+cmake_path(ABSOLUTE_PATH LIBCXX_INSTALL_LIBRARY_DIR
+  BASE_DIRECTORY ${CMAKE_INSTALL_PREFIX}
+  OUTPUT_VARIABLE ABS_LIBRARY_DIR)
+cmake_path(ABSOLUTE_PATH LIBCXX_INSTALL_MODULES_DIR
+  BASE_DIRECTORY ${CMAKE_INSTALL_PREFIX}
+  OUTPUT_VARIABLE ABS_MODULES_DIR)
 file(RELATIVE_PATH LIBCXX_MODULE_RELATIVE_PATH
-  ${CMAKE_INSTALL_PREFIX}/${LIBCXX_INSTALL_LIBRARY_DIR}
-  ${CMAKE_INSTALL_PREFIX}/${LIBCXX_INSTALL_MODULES_DIR})
+  ${ABS_LIBRARY_DIR}
+  ${ABS_MODULES_DIR})
 configure_file(
   "modules.json.in"
   "${LIBCXX_LIBRARY_DIR}/libc++.modules.json"
diff --git a/libcxx/modules/CMakeLists.txt.in b/libcxx/modules/CMakeLists.txt.in
index e332d70cc16333..c35f6fecb1fd76 100644
--- a/libcxx/modules/CMakeLists.txt.in
+++ b/libcxx/modules/CMakeLists.txt.in
@@ -50,10 +50,15 @@ endif()
 target_compile_options(std
   PUBLIC
     -nostdinc++
+    @LIBCXX_COMPILE_FLAGS@
+)
+target_compile_options(std
+  PRIVATE
     -Wno-reserved-module-identifier
     -Wno-reserved-user-defined-literal
-    @LIBCXX_COMPILE_FLAGS@
 )
+target_link_options(std PUBLIC -nostdlib++ -Wl,-rpath,@LIBCXX_LIBRARY_DIR@ -L@LIBCXX_LIBRARY_DIR@)
+target_link_libraries(std c++)
 set_target_properties(std
   PROPERTIES
     OUTPUT_NAME   "c++std"
@@ -67,7 +72,7 @@ target_sources(std.compat
     std.compat.cppm
 )
 
-target_include_directories(std.compat SYSTEM PRIVATE @LIBCXX_CONFIGURED_INCLUDE_DIRS@)
+target_include_directories(std.compat SYSTEM PUBLIC @LIBCXX_CONFIGURED_INCLUDE_DIRS@)
 
 if (NOT @LIBCXX_ENABLE_EXCEPTIONS@)
   target_compile_options(std.compat PUBLIC -fno-exceptions)
@@ -76,13 +81,16 @@ endif()
 target_compile_options(std.compat
   PUBLIC
     -nostdinc++
+    @LIBCXX_COMPILE_FLAGS@
+)
+target_compile_options(std.compat
+ PRIVATE
     -Wno-reserved-module-identifier
     -Wno-reserved-user-defined-literal
-	-fmodule-file=std=${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/std.dir/std.pcm
-    @LIBCXX_COMPILE_FLAGS@
 )
 set_target_properties(std.compat
   PROPERTIES
     OUTPUT_NAME   "c++std.compat"
 )
 add_dependencies(std.compat std)
+target_link_libraries(std.compat PUBLIC std c++)
diff --git a/libcxx/test/libcxx/containers/sequences/vector/robust_against_adl.pass.cpp b/libcxx/test/libcxx/containers/sequences/vector/robust_against_adl.pass.cpp
index 83f90ac4184bf1..9c780ae98d1e84 100644
--- a/libcxx/test/libcxx/containers/sequences/vector/robust_against_adl.pass.cpp
+++ b/libcxx/test/libcxx/containers/sequences/vector/robust_against_adl.pass.cpp
@@ -31,7 +31,8 @@ struct MyAlloc {
 int main(int, char**)
 {
     std::vector<bool, MyAlloc<bool>> vb;
-    std::vector<bool, MyAlloc<bool>> wb(100);
+    // std::fill_n triggers ADL because __bit_iterator has the container type as a template argument
+    // std::vector<bool, MyAlloc<bool>> wb(100);
 
     std::vector<int, MyAlloc<int>> v;
     std::vector<int, MyAlloc<int>> w(100);
diff --git a/libcxx/test/libcxx/diagnostics/limits.nodiscard_extensions.compile.pass.cpp b/libcxx/test/libcxx/diagnostics/limits.nodiscard_extensions.compile.pass.cpp
new file mode 100644
index 00000000000000..a93f55b888b95a
--- /dev/null
+++ b/libcxx/test/libcxx/diagnostics/limits.nodiscard_extensions.compile.pass.cpp
@@ -0,0 +1,71 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03
+
+// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_NODISCARD_EXT
+
+// Check that <limits> functions aren't marked [[nodiscard]] when
+// _LIBCPP_DISABLE_NODISCARD_EXT is defined
+
+#include <limits>
+
+#include "test_macros.h"
+
+void func() {
+  // arithmetic
+  std::numeric_limits<int>::min();
+  std::numeric_limits<int>::max();
+  std::numeric_limits<int>::lowest();
+  std::numeric_limits<int>::epsilon();
+  std::numeric_limits<int>::round_error();
+  std::numeric_limits<int>::infinity();
+  std::numeric_limits<int>::quiet_NaN();
+  std::numeric_limits<int>::signaling_NaN();
+  std::numeric_limits<int>::denorm_min();
+  // bool
+  std::numeric_limits<bool>::min();
+  std::numeric_limits<bool>::max();
+  std::numeric_limits<bool>::lowest();
+  std::numeric_limits<bool>::epsilon();
+  std::numeric_limits<bool>::round_error();
+  std::numeric_limits<bool>::infinity();
+  std::numeric_limits<bool>::quiet_NaN();
+  std::numeric_limits<bool>::signaling_NaN();
+  std::numeric_limits<bool>::denorm_min();
+  // float
+  std::numeric_limits<float>::min();
+  std::numeric_limits<float>::max();
+  std::numeric_limits<float>::lowest();
+  std::numeric_limits<float>::epsilon();
+  std::numeric_limits<float>::round_error();
+  std::numeric_limits<float>::infinity();
+  std::numeric_limits<float>::quiet_NaN();
+  std::numeric_limits<float>::signaling_NaN();
+  std::numeric_limits<float>::denorm_min();
+  // double
+  std::numeric_limits<double>::min();
+  std::numeric_limits<double>::max();
+  std::numeric_limits<double>::lowest();
+  std::numeric_limits<double>::epsilon();
+  std::numeric_limits<double>::round_error();
+  std::numeric_limits<double>::infinity();
+  std::numeric_limits<double>::quiet_NaN();
+  std::numeric_limits<double>::signaling_NaN();
+  std::numeric_limits<double>::denorm_min();
+  // long double
+  std::numeric_limits<long double>::min();
+  std::numeric_limits<long double>::max();
+  std::numeric_limits<long double>::lowest();
+  std::numeric_limits<long double>::epsilon();
+  std::numeric_limits<long double>::round_error();
+  std::numeric_limits<long double>::infinity();
+  std::numeric_limits<long double>::quiet_NaN();
+  std::numeric_limits<long double>::signaling_NaN();
+  std::numeric_limits<long double>::denorm_min();
+}
diff --git a/libcxx/test/libcxx/diagnostics/limits.nodiscard_extensions.verify.cpp b/libcxx/test/libcxx/diagnostics/limits.nodiscard_extensions.verify.cpp
new file mode 100644
index 00000000000000..7a81b84378c52e
--- /dev/null
+++ b/libcxx/test/libcxx/diagnostics/limits.nodiscard_extensions.verify.cpp
@@ -0,0 +1,70 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03
+
+// check that <limits> functions are marked [[nodiscard]]
+
+#include <limits>
+
+#include "test_macros.h"
+
+void func() {
+  // clang-format off
+  // arithmetic
+  std::numeric_limits<int>::min(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::numeric_limits<int>::max(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::numeric_limits<int>::lowest(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::numeric_limits<int>::epsilon(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::numeric_limits<int>::round_error(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::numeric_limits<int>::infinity(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::numeric_limits<int>::quiet_NaN(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::numeric_limits<int>::signaling_NaN(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::numeric_limits<int>::denorm_min(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  // bool
+  std::numeric_limits<bool>::min(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::numeric_limits<bool>::max(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::numeric_limits<bool>::lowest(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::numeric_limits<bool>::epsilon(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::numeric_limits<bool>::round_error(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::numeric_limits<bool>::infinity(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::numeric_limits<bool>::quiet_NaN(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::numeric_limits<bool>::signaling_NaN(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::numeric_limits<bool>::denorm_min(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  // float
+  std::numeric_limits<float>::min(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::numeric_limits<float>::max(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::numeric_limits<float>::lowest(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::numeric_limits<float>::epsilon(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::numeric_limits<float>::round_error(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::numeric_limits<float>::infinity(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::numeric_limits<float>::quiet_NaN(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::numeric_limits<float>::signaling_NaN(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::numeric_limits<float>::denorm_min(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  // double
+  std::numeric_limits<double>::min(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::numeric_limits<double>::max(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::numeric_limits<double>::lowest(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::numeric_limits<double>::epsilon(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::numeric_limits<double>::round_error(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::numeric_limits<double>::infinity(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::numeric_limits<double>::quiet_NaN(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::numeric_limits<double>::signaling_NaN(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::numeric_limits<double>::denorm_min(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  // long double
+  std::numeric_limits<long double>::min(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::numeric_limits<long double>::max(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::numeric_limits<long double>::lowest(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::numeric_limits<long double>::epsilon(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::numeric_limits<long double>::round_error(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::numeric_limits<long double>::infinity(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::numeric_limits<long double>::quiet_NaN(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::numeric_limits<long double>::signaling_NaN(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::numeric_limits<long double>::denorm_min(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  // clang-format on
+}
diff --git a/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.cmp/equal.pass.cpp b/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.cmp/equal.pass.cpp
deleted file mode 100644
index 583e733c07cb0b..00000000000000
--- a/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.cmp/equal.pass.cpp
+++ /dev/null
@@ -1,47 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// <iterator>
-
-// __unconstrained_reverse_iterator
-
-// template <BidirectionalIterator Iter1, BidirectionalIterator Iter2>
-//   requires HasEqualTo<Iter1, Iter2>
-// bool operator==(const __unconstrained_reverse_iterator<Iter1>& x, const __unconstrained_reverse_iterator<Iter2>& y); // constexpr since C++17
-
-#include <iterator>
-#include <cassert>
-
-#include "test_macros.h"
-#include "test_iterators.h"
-
-template <class It>
-TEST_CONSTEXPR_CXX17 void test(It l, It r, bool x) {
-    const std::__unconstrained_reverse_iterator<It> r1(l);
-    const std::__unconstrained_reverse_iterator<It> r2(r);
-    assert((r1 == r2) == x);
-}
-
-TEST_CONSTEXPR_CXX17 bool tests() {
-    const char* s = "1234567890";
-    test(bidirectional_iterator<const char*>(s), bidirectional_iterator<const char*>(s), true);
-    test(bidirectional_iterator<const char*>(s), bidirectional_iterator<const char*>(s+1), false);
-    test(random_access_iterator<const char*>(s), random_access_iterator<const char*>(s), true);
-    test(random_access_iterator<const char*>(s), random_access_iterator<const char*>(s+1), false);
-    test(s, s, true);
-    test(s, s+1, false);
-    return true;
-}
-
-int main(int, char**) {
-    tests();
-#if TEST_STD_VER > 14
-    static_assert(tests(), "");
-#endif
-    return 0;
-}
diff --git a/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.cmp/greater-equal.pass.cpp b/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.cmp/greater-equal.pass.cpp
deleted file mode 100644
index 9e908418d07565..00000000000000
--- a/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.cmp/greater-equal.pass.cpp
+++ /dev/null
@@ -1,47 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// <iterator>
-
-// __unconstrained_reverse_iterator
-
-// template <RandomAccessIterator Iter1, RandomAccessIterator Iter2>
-//   requires HasGreater<Iter1, Iter2>
-// bool operator>=(const __unconstrained_reverse_iterator<Iter1>& x, const __unconstrained_reverse_iterator<Iter2>& y); // constexpr since C++17
-
-#include <iterator>
-#include <cassert>
-
-#include "test_macros.h"
-#include "test_iterators.h"
-
-template <class It>
-TEST_CONSTEXPR_CXX17 void test(It l, It r, bool x) {
-    const std::__unconstrained_reverse_iterator<It> r1(l);
-    const std::__unconstrained_reverse_iterator<It> r2(r);
-    assert((r1 >= r2) == x);
-}
-
-TEST_CONSTEXPR_CXX17 bool tests() {
-    const char* s = "1234567890";
-    test(random_access_iterator<const char*>(s), random_access_iterator<const char*>(s), true);
-    test(random_access_iterator<const char*>(s), random_access_iterator<const char*>(s+1), true);
-    test(random_access_iterator<const char*>(s+1), random_access_iterator<const char*>(s), false);
-    test(s, s, true);
-    test(s, s+1, true);
-    test(s+1, s, false);
-    return true;
-}
-
-int main(int, char**) {
-    tests();
-#if TEST_STD_VER > 14
-    static_assert(tests(), "");
-#endif
-    return 0;
-}
diff --git a/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.cmp/greater.pass.cpp b/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.cmp/greater.pass.cpp
deleted file mode 100644
index f1afd23bab1338..00000000000000
--- a/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.cmp/greater.pass.cpp
+++ /dev/null
@@ -1,47 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// <iterator>
-
-// __unconstrained_reverse_iterator
-
-// template <RandomAccessIterator Iter1, RandomAccessIterator Iter2>
-//   requires HasGreater<Iter1, Iter2>
-// bool operator>(const __unconstrained_reverse_iterator<Iter1>& x, const __unconstrained_reverse_iterator<Iter2>& y); // constexpr in C++17
-
-#include <iterator>
-#include <cassert>
-
-#include "test_macros.h"
-#include "test_iterators.h"
-
-template <class It>
-TEST_CONSTEXPR_CXX17 void test(It l, It r, bool x) {
-    const std::__unconstrained_reverse_iterator<It> r1(l);
-    const std::__unconstrained_reverse_iterator<It> r2(r);
-    assert((r1 > r2) == x);
-}
-
-TEST_CONSTEXPR_CXX17 bool tests() {
-    const char* s = "1234567890";
-    test(random_access_iterator<const char*>(s), random_access_iterator<const char*>(s), false);
-    test(random_access_iterator<const char*>(s), random_access_iterator<const char*>(s+1), true);
-    test(random_access_iterator<const char*>(s+1), random_access_iterator<const char*>(s), false);
-    test(s, s, false);
-    test(s, s+1, true);
-    test(s+1, s, false);
-    return true;
-}
-
-int main(int, char**) {
-    tests();
-#if TEST_STD_VER > 14
-    static_assert(tests(), "");
-#endif
-    return 0;
-}
diff --git a/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.cmp/less-equal.pass.cpp b/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.cmp/less-equal.pass.cpp
deleted file mode 100644
index c710212308fac7..00000000000000
--- a/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.cmp/less-equal.pass.cpp
+++ /dev/null
@@ -1,47 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// <iterator>
-
-// __unconstrained_reverse_iterator
-
-// template <RandomAccessIterator Iter1, RandomAccessIterator Iter2>
-//   requires HasGreater<Iter1, Iter2>
-// bool operator<=(const __unconstrained_reverse_iterator<Iter1>& x, const __unconstrained_reverse_iterator<Iter2>& y); // constexpr in C++17
-
-#include <iterator>
-#include <cassert>
-
-#include "test_macros.h"
-#include "test_iterators.h"
-
-template <class It>
-TEST_CONSTEXPR_CXX17 void test(It l, It r, bool x) {
-    const std::__unconstrained_reverse_iterator<It> r1(l);
-    const std::__unconstrained_reverse_iterator<It> r2(r);
-    assert((r1 <= r2) == x);
-}
-
-TEST_CONSTEXPR_CXX17 bool tests() {
-    const char* s = "1234567890";
-    test(random_access_iterator<const char*>(s), random_access_iterator<const char*>(s), true);
-    test(random_access_iterator<const char*>(s), random_access_iterator<const char*>(s+1), false);
-    test(random_access_iterator<const char*>(s+1), random_access_iterator<const char*>(s), true);
-    test(s, s, true);
-    test(s, s+1, false);
-    test(s+1, s, true);
-    return true;
-}
-
-int main(int, char**) {
-    tests();
-#if TEST_STD_VER > 14
-    static_assert(tests(), "");
-#endif
-    return 0;
-}
diff --git a/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.cmp/less.pass.cpp b/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.cmp/less.pass.cpp
deleted file mode 100644
index ffd3a0323373fb..00000000000000
--- a/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.cmp/less.pass.cpp
+++ /dev/null
@@ -1,47 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// <iterator>
-
-// __unconstrained_reverse_iterator
-
-// template <RandomAccessIterator Iter1, RandomAccessIterator Iter2>
-//   requires HasGreater<Iter1, Iter2>
-// bool operator<(const __unconstrained_reverse_iterator<Iter1>& x, const __unconstrained_reverse_iterator<Iter2>& y); // constexpr in C++17
-
-#include <iterator>
-#include <cassert>
-
-#include "test_macros.h"
-#include "test_iterators.h"
-
-template <class It>
-TEST_CONSTEXPR_CXX17 void test(It l, It r, bool x) {
-    const std::__unconstrained_reverse_iterator<It> r1(l);
-    const std::__unconstrained_reverse_iterator<It> r2(r);
-    assert((r1 < r2) == x);
-}
-
-TEST_CONSTEXPR_CXX17 bool tests() {
-    const char* s = "1234567890";
-    test(random_access_iterator<const char*>(s), random_access_iterator<const char*>(s), false);
-    test(random_access_iterator<const char*>(s), random_access_iterator<const char*>(s+1), false);
-    test(random_access_iterator<const char*>(s+1), random_access_iterator<const char*>(s), true);
-    test(s, s, false);
-    test(s, s+1, false);
-    test(s+1, s, true);
-    return true;
-}
-
-int main(int, char**) {
-    tests();
-#if TEST_STD_VER > 14
-    static_assert(tests(), "");
-#endif
-    return 0;
-}
diff --git a/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.cmp/not-equal.pass.cpp b/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.cmp/not-equal.pass.cpp
deleted file mode 100644
index 614f159cc80522..00000000000000
--- a/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.cmp/not-equal.pass.cpp
+++ /dev/null
@@ -1,47 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// <iterator>
-
-// __unconstrained_reverse_iterator
-
-// template <BidirectionalIterator Iter1, BidirectionalIterator Iter2>
-//   requires HasEqualTo<Iter1, Iter2>
-// bool operator!=(const __unconstrained_reverse_iterator<Iter1>& x, const __unconstrained_reverse_iterator<Iter2>& y); // constexpr in C++17
-
-#include <iterator>
-#include <cassert>
-
-#include "test_macros.h"
-#include "test_iterators.h"
-
-template <class It>
-TEST_CONSTEXPR_CXX17 void test(It l, It r, bool x) {
-    const std::__unconstrained_reverse_iterator<It> r1(l);
-    const std::__unconstrained_reverse_iterator<It> r2(r);
-    assert((r1 != r2) == x);
-}
-
-TEST_CONSTEXPR_CXX17 bool tests() {
-    const char* s = "1234567890";
-    test(bidirectional_iterator<const char*>(s), bidirectional_iterator<const char*>(s), false);
-    test(bidirectional_iterator<const char*>(s), bidirectional_iterator<const char*>(s+1), true);
-    test(random_access_iterator<const char*>(s), random_access_iterator<const char*>(s), false);
-    test(random_access_iterator<const char*>(s), random_access_iterator<const char*>(s+1), true);
-    test(s, s, false);
-    test(s, s+1, true);
-    return true;
-}
-
-int main(int, char**) {
-    tests();
-#if TEST_STD_VER > 14
-    static_assert(tests(), "");
-#endif
-    return 0;
-}
diff --git a/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.cons/assign.LWG3435.verify.cpp b/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.cons/assign.LWG3435.verify.cpp
deleted file mode 100644
index 835e2b65c19c20..00000000000000
--- a/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.cons/assign.LWG3435.verify.cpp
+++ /dev/null
@@ -1,26 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// <iterator>
-
-// __unconstrained_reverse_iterator
-
-// template <class U>
-//  requires !same_as<U, Iter> && convertible_to<const U&, Iter> && assignable_from<Iter&, const U&>
-// __unconstrained_reverse_iterator& operator=(const __unconstrained_reverse_iterator<U>& u);
-
-#include <iterator>
-
-struct Base { };
-struct Derived : Base { };
-
-void test() {
-    std::__unconstrained_reverse_iterator<Base*> base;
-    std::__unconstrained_reverse_iterator<Derived*> derived;
-    derived = base; // expected-error {{no viable overloaded '='}}
-}
diff --git a/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.cons/ctor.default.pass.cpp b/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.cons/ctor.default.pass.cpp
deleted file mode 100644
index 66972d7243cc83..00000000000000
--- a/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.cons/ctor.default.pass.cpp
+++ /dev/null
@@ -1,40 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// <iterator>
-
-// __unconstrained_reverse_iterator
-
-// __unconstrained_reverse_iterator(); // constexpr since C++17
-
-#include <iterator>
-
-#include "test_macros.h"
-#include "test_iterators.h"
-
-template <class It>
-TEST_CONSTEXPR_CXX17 void test() {
-    std::__unconstrained_reverse_iterator<It> r;
-    (void)r;
-}
-
-TEST_CONSTEXPR_CXX17 bool tests() {
-    test<bidirectional_iterator<const char*> >();
-    test<random_access_iterator<char*> >();
-    test<char*>();
-    test<const char*>();
-    return true;
-}
-
-int main(int, char**) {
-    tests();
-#if TEST_STD_VER > 14
-    static_assert(tests(), "");
-#endif
-    return 0;
-}
diff --git a/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.cons/ctor.iter.explicit.verify.cpp b/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.cons/ctor.iter.explicit.verify.cpp
deleted file mode 100644
index 6440e284f6a016..00000000000000
--- a/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.cons/ctor.iter.explicit.verify.cpp
+++ /dev/null
@@ -1,22 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// <iterator>
-
-// __unconstrained_reverse_iterator
-
-// explicit __unconstrained_reverse_iterator(Iter x);
-
-// test explicitness
-
-#include <iterator>
-
-void f() {
-    char const* it = "";
-    std::__unconstrained_reverse_iterator<char const*> r = it; // expected-error{{no viable conversion from 'const char *' to 'std::__unconstrained_reverse_iterator<const char *>'}}
-}
diff --git a/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.cons/ctor.iter.pass.cpp b/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.cons/ctor.iter.pass.cpp
deleted file mode 100644
index e4d0874d50b5e9..00000000000000
--- a/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.cons/ctor.iter.pass.cpp
+++ /dev/null
@@ -1,41 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// <iterator>
-
-// __unconstrained_reverse_iterator
-
-// explicit __unconstrained_reverse_iterator(Iter x); // constexpr since C++17
-
-#include <iterator>
-#include <cassert>
-
-#include "test_macros.h"
-#include "test_iterators.h"
-
-template <class It>
-TEST_CONSTEXPR_CXX17 void test(It i) {
-    std::__unconstrained_reverse_iterator<It> r(i);
-    assert(r.base() == i);
-}
-
-TEST_CONSTEXPR_CXX17 bool tests() {
-    const char s[] = "123";
-    test(bidirectional_iterator<const char*>(s));
-    test(random_access_iterator<const char*>(s));
-    test(s);
-    return true;
-}
-
-int main(int, char**) {
-    tests();
-#if TEST_STD_VER > 14
-    static_assert(tests(), "");
-#endif
-    return 0;
-}
diff --git a/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.cons/ctor.reverse_iterator.LWG3435.verify.cpp b/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.cons/ctor.reverse_iterator.LWG3435.verify.cpp
deleted file mode 100644
index 7ea4a61ce66020..00000000000000
--- a/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.cons/ctor.reverse_iterator.LWG3435.verify.cpp
+++ /dev/null
@@ -1,25 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// <iterator>
-
-// __unconstrained_reverse_iterator
-
-// template <class U>
-//  requires !same_as<U, Iter> && convertible_to<const U&, Iter>
-// __unconstrained_reverse_iterator(const __unconstrained_reverse_iterator<U> &);
-
-#include <iterator>
-
-struct Base { };
-struct Derived : Base { };
-
-void test() {
-    std::__unconstrained_reverse_iterator<Base*> base;
-    std::__unconstrained_reverse_iterator<Derived*> derived(base); // expected-error {{no matching constructor for initialization of 'std::__unconstrained_reverse_iterator<Derived *>'}}
-}
diff --git a/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.conv/base.pass.cpp b/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.conv/base.pass.cpp
deleted file mode 100644
index 7fd85c92b32771..00000000000000
--- a/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.conv/base.pass.cpp
+++ /dev/null
@@ -1,37 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// <iterator>
-
-// __unconstrained_reverse_iterator
-
-// iterator_type base() const; // constexpr since C++17
-
-#include <iterator>
-#include <cassert>
-
-#include "test_macros.h"
-#include "test_iterators.h"
-
-TEST_CONSTEXPR_CXX17 bool test() {
-    typedef bidirectional_iterator<int*> Iter;
-    int i = 0;
-    Iter iter(&i);
-    std::__unconstrained_reverse_iterator<Iter> const reverse(iter);
-    std::__unconstrained_reverse_iterator<Iter>::iterator_type base = reverse.base();
-    assert(base == Iter(&i));
-    return true;
-}
-
-int main(int, char**) {
-    test();
-#if TEST_STD_VER > 14
-    static_assert(test(), "");
-#endif
-    return 0;
-}
diff --git a/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.elem/arrow.pass.cpp b/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.elem/arrow.pass.cpp
deleted file mode 100644
index f0a181bcba88f1..00000000000000
--- a/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.elem/arrow.pass.cpp
+++ /dev/null
@@ -1,122 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// <iterator>
-
-// __unconstrained_reverse_iterator
-
-// pointer operator->() const; // constexpr in C++17
-
-// Be sure to respect LWG 198:
-//    http://www.open-std.org/jtc1/sc22/wg21/docs/lwg-defects.html#198
-// LWG 198 was superseded by LWG 2360
-//    http://www.open-std.org/jtc1/sc22/wg21/docs/lwg-defects.html#2360
-
-
-#include <iterator>
-#include <list>
-#include <cassert>
-
-#include "test_macros.h"
-
-class A
-{
-    int data_;
-public:
-    A() : data_(1) {}
-    A(const A&) = default;
-    A& operator=(const A&) = default;
-    ~A() {data_ = -1;}
-
-    int get() const {return data_;}
-
-    friend bool operator==(const A& x, const A& y)
-        {return x.data_ == y.data_;}
-};
-
-template <class It>
-void
-test(It i, typename std::iterator_traits<It>::value_type x)
-{
-    std::__unconstrained_reverse_iterator<It> r(i);
-    assert(r->get() == x.get());
-}
-
-class B
-{
-    int data_;
-public:
-    B(int d=1) : data_(d) {}
-    B(const B&) = default;
-    B& operator=(const B&) = default;
-    ~B() {data_ = -1;}
-
-    int get() const {return data_;}
-
-    friend bool operator==(const B& x, const B& y)
-        {return x.data_ == y.data_;}
-    const B *operator&() const { return nullptr; }
-    B       *operator&()       { return nullptr; }
-};
-
-class C
-{
-    int data_;
-public:
-    TEST_CONSTEXPR C() : data_(1) {}
-
-    TEST_CONSTEXPR int get() const {return data_;}
-
-    friend TEST_CONSTEXPR bool operator==(const C& x, const C& y)
-        {return x.data_ == y.data_;}
-};
-
-TEST_CONSTEXPR  C gC;
-
-int main(int, char**)
-{
-  A a;
-  test(&a+1, A());
-
-  {
-    std::list<B> l;
-    l.push_back(B(0));
-    l.push_back(B(1));
-    l.push_back(B(2));
-
-    {
-      std::list<B>::const_iterator i = l.begin();
-      assert ( i->get() == 0 );  ++i;
-      assert ( i->get() == 1 );  ++i;
-      assert ( i->get() == 2 );  ++i;
-      assert ( i == l.end ());
-    }
-
-    {
-      std::list<B>::const_reverse_iterator ri = l.rbegin();
-      assert ( ri->get() == 2 );  ++ri;
-      assert ( ri->get() == 1 );  ++ri;
-      assert ( ri->get() == 0 );  ++ri;
-      assert ( ri == l.rend ());
-    }
-  }
-
-#if TEST_STD_VER > 14
-  {
-    typedef std::__unconstrained_reverse_iterator<const C *> RI;
-    constexpr RI it1 = RI(&gC+1);
-
-    static_assert(it1->get() == gC.get(), "");
-  }
-#endif
-  {
-    ((void)gC);
-  }
-
-  return 0;
-}
diff --git a/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.elem/bracket.pass.cpp b/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.elem/bracket.pass.cpp
deleted file mode 100644
index f9beada9e4e64b..00000000000000
--- a/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.elem/bracket.pass.cpp
+++ /dev/null
@@ -1,47 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// <iterator>
-
-// __unconstrained_reverse_iterator
-
-// requires RandomAccessIterator<Iter>
-// unspecified operator[](difference_type n) const; // constexpr since C++17
-
-#include <iterator>
-#include <cassert>
-
-#include "test_macros.h"
-#include "test_iterators.h"
-
-template <class It>
-TEST_CONSTEXPR_CXX17 void test(It i,
-                               typename std::iterator_traits<It>::difference_type n,
-                               typename std::iterator_traits<It>::value_type x) {
-    typedef typename std::iterator_traits<It>::value_type value_type;
-    const std::__unconstrained_reverse_iterator<It> r(i);
-    value_type rr = r[n];
-    assert(rr == x);
-}
-
-TEST_CONSTEXPR_CXX17 bool tests() {
-    const char* s = "1234567890";
-    test(random_access_iterator<const char*>(s+5), 4, '1');
-    test(random_access_iterator<const char*>(s+5), 0, '5');
-    test(s+5, 4, '1');
-    test(s+5, 0, '5');
-    return true;
-}
-
-int main(int, char**) {
-    tests();
-#if TEST_STD_VER > 14
-    static_assert(tests(), "");
-#endif
-    return 0;
-}
diff --git a/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.elem/dereference.pass.cpp b/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.elem/dereference.pass.cpp
deleted file mode 100644
index bd6b6e0df038de..00000000000000
--- a/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.elem/dereference.pass.cpp
+++ /dev/null
@@ -1,63 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// <iterator>
-
-// __unconstrained_reverse_iterator
-
-// reference operator*() const; // constexpr in C++17
-
-// Be sure to respect LWG 198:
-//    http://www.open-std.org/jtc1/sc22/wg21/docs/lwg-defects.html#198
-// LWG 198 was superseded by LWG 2360
-//    http://www.open-std.org/jtc1/sc22/wg21/docs/lwg-defects.html#2360
-
-#include <iterator>
-#include <cassert>
-
-#include "test_macros.h"
-
-class A
-{
-    int data_;
-public:
-    A() : data_(1) {}
-    A(const A&) = default;
-    A& operator=(const A&) = default;
-    ~A() {data_ = -1;}
-
-    friend bool operator==(const A& x, const A& y)
-        {return x.data_ == y.data_;}
-};
-
-template <class It>
-void
-test(It i, typename std::iterator_traits<It>::value_type x)
-{
-    std::__unconstrained_reverse_iterator<It> r(i);
-    assert(*r == x);
-}
-
-int main(int, char**)
-{
-    A a;
-    test(&a+1, A());
-
-#if TEST_STD_VER > 14
-    {
-        constexpr const char *p = "123456789";
-        typedef std::__unconstrained_reverse_iterator<const char *> RI;
-        constexpr RI it1 = RI(p+1);
-        constexpr RI it2 = RI(p+2);
-        static_assert(*it1 == p[0], "");
-        static_assert(*it2 == p[1], "");
-    }
-#endif
-
-  return 0;
-}
diff --git a/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.nav/decrement-assign.pass.cpp b/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.nav/decrement-assign.pass.cpp
deleted file mode 100644
index 48be8a7399e427..00000000000000
--- a/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.nav/decrement-assign.pass.cpp
+++ /dev/null
@@ -1,43 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// <iterator>
-
-// __unconstrained_reverse_iterator
-
-// requires RandomAccessIterator<Iter>
-// __unconstrained_reverse_iterator& operator-=(difference_type n); // constexpr in C++17
-
-#include <iterator>
-#include <cassert>
-
-#include "test_macros.h"
-#include "test_iterators.h"
-
-template <class It>
-TEST_CONSTEXPR_CXX17 void test(It i, typename std::iterator_traits<It>::difference_type n, It x) {
-    std::__unconstrained_reverse_iterator<It> r(i);
-    std::__unconstrained_reverse_iterator<It>& rr = r -= n;
-    assert(r.base() == x);
-    assert(&rr == &r);
-}
-
-TEST_CONSTEXPR_CXX17 bool tests() {
-    const char* s = "1234567890";
-    test(random_access_iterator<const char*>(s+5), 5, random_access_iterator<const char*>(s+10));
-    test(s+5, 5, s+10);
-    return true;
-}
-
-int main(int, char**) {
-    tests();
-#if TEST_STD_VER > 14
-    static_assert(tests(), "");
-#endif
-    return 0;
-}
diff --git a/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.nav/increment-assign.pass.cpp b/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.nav/increment-assign.pass.cpp
deleted file mode 100644
index 115d95e1485cdc..00000000000000
--- a/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.nav/increment-assign.pass.cpp
+++ /dev/null
@@ -1,43 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// <iterator>
-
-// __unconstrained_reverse_iterator
-
-// requires RandomAccessIterator<Iter>
-// __unconstrained_reverse_iterator& operator+=(difference_type n); // constexpr in C++17
-
-#include <iterator>
-#include <cassert>
-
-#include "test_macros.h"
-#include "test_iterators.h"
-
-template <class It>
-TEST_CONSTEXPR_CXX17 void test(It i, typename std::iterator_traits<It>::difference_type n, It x) {
-    std::__unconstrained_reverse_iterator<It> r(i);
-    std::__unconstrained_reverse_iterator<It>& rr = r += n;
-    assert(r.base() == x);
-    assert(&rr == &r);
-}
-
-TEST_CONSTEXPR_CXX17 bool tests() {
-    char const* s = "1234567890";
-    test(random_access_iterator<const char*>(s+5), 5, random_access_iterator<const char*>(s));
-    test(s+5, 5, s);
-    return true;
-}
-
-int main(int, char**) {
-    tests();
-#if TEST_STD_VER > 14
-    static_assert(tests(), "");
-#endif
-    return 0;
-}
diff --git a/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.nav/minus.pass.cpp b/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.nav/minus.pass.cpp
deleted file mode 100644
index c3a4d1fd9e36fb..00000000000000
--- a/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.nav/minus.pass.cpp
+++ /dev/null
@@ -1,42 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// <iterator>
-
-// __unconstrained_reverse_iterator
-
-// requires RandomAccessIterator<Iter>
-// __unconstrained_reverse_iterator operator-(difference_type n) const; // constexpr in C++17
-
-#include <iterator>
-#include <cassert>
-
-#include "test_macros.h"
-#include "test_iterators.h"
-
-template <class It>
-TEST_CONSTEXPR_CXX17 void test(It i, typename std::iterator_traits<It>::difference_type n, It x) {
-    const std::__unconstrained_reverse_iterator<It> r(i);
-    std::__unconstrained_reverse_iterator<It> rr = r - n;
-    assert(rr.base() == x);
-}
-
-TEST_CONSTEXPR_CXX17 bool tests() {
-    const char* s = "1234567890";
-    test(random_access_iterator<const char*>(s+5), 5, random_access_iterator<const char*>(s+10));
-    test(s+5, 5, s+10);
-    return true;
-}
-
-int main(int, char**) {
-    tests();
-#if TEST_STD_VER > 14
-    static_assert(tests(), "");
-#endif
-    return 0;
-}
diff --git a/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.nav/plus.pass.cpp b/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.nav/plus.pass.cpp
deleted file mode 100644
index 164c5abe8a3533..00000000000000
--- a/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.nav/plus.pass.cpp
+++ /dev/null
@@ -1,42 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// <iterator>
-
-// __unconstrained_reverse_iterator
-
-// requires RandomAccessIterator<Iter>
-// __unconstrained_reverse_iterator operator+(difference_type n) const; // constexpr in C++17
-
-#include <iterator>
-#include <cassert>
-
-#include "test_macros.h"
-#include "test_iterators.h"
-
-template <class It>
-TEST_CONSTEXPR_CXX17 void test(It i, typename std::iterator_traits<It>::difference_type n, It x) {
-    const std::__unconstrained_reverse_iterator<It> r(i);
-    std::__unconstrained_reverse_iterator<It> rr = r + n;
-    assert(rr.base() == x);
-}
-
-TEST_CONSTEXPR_CXX17 bool tests() {
-    const char* s = "1234567890";
-    test(random_access_iterator<const char*>(s+5), 5, random_access_iterator<const char*>(s));
-    test(s+5, 5, s);
-    return true;
-}
-
-int main(int, char**) {
-    tests();
-#if TEST_STD_VER > 14
-    static_assert(tests(), "");
-#endif
-    return 0;
-}
diff --git a/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.nav/postdecrement.pass.cpp b/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.nav/postdecrement.pass.cpp
deleted file mode 100644
index 3220c1f9b1eb1d..00000000000000
--- a/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.nav/postdecrement.pass.cpp
+++ /dev/null
@@ -1,43 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// <iterator>
-
-// __unconstrained_reverse_iterator
-
-// __unconstrained_reverse_iterator operator--(int); // constexpr in C++17
-
-#include <iterator>
-#include <cassert>
-
-#include "test_macros.h"
-#include "test_iterators.h"
-
-template <class It>
-TEST_CONSTEXPR_CXX17 void test(It i, It x) {
-    std::__unconstrained_reverse_iterator<It> r(i);
-    std::__unconstrained_reverse_iterator<It> rr = r--;
-    assert(r.base() == x);
-    assert(rr.base() == i);
-}
-
-TEST_CONSTEXPR_CXX17 bool tests() {
-    const char* s = "123";
-    test(bidirectional_iterator<const char*>(s+1), bidirectional_iterator<const char*>(s+2));
-    test(random_access_iterator<const char*>(s+1), random_access_iterator<const char*>(s+2));
-    test(s+1, s+2);
-    return true;
-}
-
-int main(int, char**) {
-    tests();
-#if TEST_STD_VER > 14
-    static_assert(tests(), "");
-#endif
-    return 0;
-}
diff --git a/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.nav/postincrement.pass.cpp b/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.nav/postincrement.pass.cpp
deleted file mode 100644
index 47477fe89545b6..00000000000000
--- a/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.nav/postincrement.pass.cpp
+++ /dev/null
@@ -1,43 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// <iterator>
-
-// __unconstrained_reverse_iterator
-
-// __unconstrained_reverse_iterator operator++(int); // constexpr in C++17
-
-#include <iterator>
-#include <cassert>
-
-#include "test_macros.h"
-#include "test_iterators.h"
-
-template <class It>
-TEST_CONSTEXPR_CXX17 void test(It i, It x) {
-    std::__unconstrained_reverse_iterator<It> r(i);
-    std::__unconstrained_reverse_iterator<It> rr = r++;
-    assert(r.base() == x);
-    assert(rr.base() == i);
-}
-
-TEST_CONSTEXPR_CXX17 bool tests() {
-    const char* s = "123";
-    test(bidirectional_iterator<const char*>(s+1), bidirectional_iterator<const char*>(s));
-    test(random_access_iterator<const char*>(s+1), random_access_iterator<const char*>(s));
-    test(s+1, s);
-    return true;
-}
-
-int main(int, char**) {
-    tests();
-#if TEST_STD_VER > 14
-    static_assert(tests(), "");
-#endif
-    return 0;
-}
diff --git a/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.nav/predecrement.pass.cpp b/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.nav/predecrement.pass.cpp
deleted file mode 100644
index 6ad41aeaf17a28..00000000000000
--- a/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.nav/predecrement.pass.cpp
+++ /dev/null
@@ -1,43 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// <iterator>
-
-// __unconstrained_reverse_iterator
-
-// __unconstrained_reverse_iterator& operator--(); // constexpr in C++17
-
-#include <iterator>
-#include <cassert>
-
-#include "test_macros.h"
-#include "test_iterators.h"
-
-template <class It>
-TEST_CONSTEXPR_CXX17 void test(It i, It x) {
-    std::__unconstrained_reverse_iterator<It> r(i);
-    std::__unconstrained_reverse_iterator<It>& rr = --r;
-    assert(r.base() == x);
-    assert(&rr == &r);
-}
-
-TEST_CONSTEXPR_CXX17 bool tests() {
-    const char* s = "123";
-    test(bidirectional_iterator<const char*>(s+1), bidirectional_iterator<const char*>(s+2));
-    test(random_access_iterator<const char*>(s+1), random_access_iterator<const char*>(s+2));
-    test(s+1, s+2);
-    return true;
-}
-
-int main(int, char**) {
-    tests();
-#if TEST_STD_VER > 14
-    static_assert(tests(), "");
-#endif
-    return 0;
-}
diff --git a/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.nav/preincrement.pass.cpp b/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.nav/preincrement.pass.cpp
deleted file mode 100644
index 9c7e5b41738e9b..00000000000000
--- a/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.nav/preincrement.pass.cpp
+++ /dev/null
@@ -1,43 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// <iterator>
-
-// __unconstrained_reverse_iterator
-
-// __unconstrained_reverse_iterator& operator++(); // constexpr in C++17
-
-#include <iterator>
-#include <cassert>
-
-#include "test_macros.h"
-#include "test_iterators.h"
-
-template <class It>
-TEST_CONSTEXPR_CXX17 void test(It i, It x) {
-    std::__unconstrained_reverse_iterator<It> r(i);
-    std::__unconstrained_reverse_iterator<It>& rr = ++r;
-    assert(r.base() == x);
-    assert(&rr == &r);
-}
-
-TEST_CONSTEXPR_CXX17 bool tests() {
-    const char* s = "123";
-    test(bidirectional_iterator<const char*>(s+1), bidirectional_iterator<const char*>(s));
-    test(random_access_iterator<const char*>(s+1), random_access_iterator<const char*>(s));
-    test(s+1, s);
-    return true;
-}
-
-int main(int, char**) {
-    tests();
-#if TEST_STD_VER > 14
-    static_assert(tests(), "");
-#endif
-    return 0;
-}
diff --git a/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.nonmember/minus.pass.cpp b/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.nonmember/minus.pass.cpp
deleted file mode 100644
index 632e2655dea07f..00000000000000
--- a/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/reverse.iter.nonmember/minus.pass.cpp
+++ /dev/null
@@ -1,59 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// <iterator>
-
-// __unconstrained_reverse_iterator
-
-// template <RandomAccessIterator Iter1, RandomAccessIterator Iter2>
-//   requires HasMinus<Iter2, Iter1>
-// auto operator-(const __unconstrained_reverse_iterator<Iter1>& x, const __unconstrained_reverse_iterator<Iter2>& y) // constexpr in C++17
-//  -> decltype(y.base() - x.base());
-
-#include <iterator>
-#include <cstddef>
-#include <cassert>
-#include <type_traits>
-
-#include "test_macros.h"
-#include "test_iterators.h"
-
-template <class, class, class = void> struct HasMinus : std::false_type {};
-template <class R1, class R2> struct HasMinus<R1, R2, decltype((R1() - R2(), void()))> : std::true_type {};
-
-template <class It1, class It2>
-TEST_CONSTEXPR_CXX17 void test(It1 l, It2 r, std::ptrdiff_t x) {
-    const std::__unconstrained_reverse_iterator<It1> r1(l);
-    const std::__unconstrained_reverse_iterator<It2> r2(r);
-    assert((r1 - r2) == x);
-}
-
-TEST_CONSTEXPR_CXX17 bool tests() {
-    char s[3] = {0};
-
-    // Test same base iterator type
-    test(s, s, 0);
-    test(s, s+1, 1);
-    test(s+1, s, -1);
-
-    // Test non-subtractable base iterator types
-    static_assert( HasMinus<std::__unconstrained_reverse_iterator<int*>, std::__unconstrained_reverse_iterator<int*> >::value, "");
-#if TEST_STD_VER >= 11
-    static_assert(!HasMinus<std::__unconstrained_reverse_iterator<int*>, std::__unconstrained_reverse_iterator<char*> >::value, "");
-#endif
-
-    return true;
-}
-
-int main(int, char**) {
-    tests();
-#if TEST_STD_VER > 14
-    static_assert(tests(), "");
-#endif
-    return 0;
-}
diff --git a/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/types.compile.pass.cpp b/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/types.compile.pass.cpp
deleted file mode 100644
index f8ffef364f37a5..00000000000000
--- a/libcxx/test/libcxx/iterators/predef.iterators/__unconstrained_reverse_iterator/types.compile.pass.cpp
+++ /dev/null
@@ -1,106 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// <iterator>
-
-// __unconstrained_reverse_iterator
-
-// Test nested types and data member:
-
-// template <BidirectionalIterator Iter>
-// class __unconstrained_reverse_iterator {
-// protected:
-//   Iter current;
-// public:
-//   iterator<typename iterator_traits<Iterator>::iterator_category,
-//   typename iterator_traits<Iterator>::value_type,
-//   typename iterator_traits<Iterator>::difference_type,
-//   typename iterator_traits<Iterator>::pointer,
-//   typename iterator_traits<Iterator>::reference> {
-// };
-
-#include <iterator>
-#include <type_traits>
-
-#include "test_macros.h"
-#include "test_iterators.h"
-
-template <class It>
-void test() {
-  typedef std::__unconstrained_reverse_iterator<It> R;
-  typedef std::iterator_traits<It> T;
-  static_assert((std::is_same<typename R::iterator_type, It>::value), "");
-  static_assert((std::is_same<typename R::value_type, typename T::value_type>::value), "");
-  static_assert((std::is_same<typename R::difference_type, typename T::difference_type>::value), "");
-  static_assert((std::is_same<typename R::reference, typename T::reference>::value), "");
-  static_assert((std::is_same<typename R::pointer, typename std::iterator_traits<It>::pointer>::value), "");
-
-#if TEST_STD_VER <= 14
-  typedef std::iterator<typename T::iterator_category, typename T::value_type> iterator_base;
-  static_assert((std::is_base_of<iterator_base, R>::value), "");
-#endif
-#if TEST_STD_VER > 17
-  if constexpr (std::is_same_v<typename T::iterator_category, std::contiguous_iterator_tag>) {
-    static_assert((std::is_same<typename R::iterator_category, std::random_access_iterator_tag>::value), "");
-  } else {
-    static_assert((std::is_same<typename R::iterator_category, typename T::iterator_category>::value), "");
-  }
-#else
-  static_assert((std::is_same<typename R::iterator_category, typename T::iterator_category>::value), "");
-#endif
-}
-
-#if TEST_STD_VER > 17
-
-struct FooIter {
-  using iterator_category = std::bidirectional_iterator_tag;
-  using value_type = void*;
-  using difference_type = void*;
-  using pointer = void*;
-  using reference = int&;
-  int& operator*() const;
-};
-template <>
-struct std::indirectly_readable_traits<FooIter> {
-  using value_type = int;
-};
-template <>
-struct std::incrementable_traits<FooIter> {
-  using difference_type = char;
-};
-
-// Not using `FooIter::value_type`.
-static_assert(std::is_same_v<typename std::__unconstrained_reverse_iterator<FooIter>::value_type, int>);
-// Not using `FooIter::difference_type`.
-static_assert(std::is_same_v<typename std::__unconstrained_reverse_iterator<FooIter>::difference_type, char>);
-
-#endif
-
-struct BarIter {
-  bool& operator*() const;
-};
-template <>
-struct std::iterator_traits<BarIter> {
-  using difference_type = char;
-  using value_type = char;
-  using pointer = char*;
-  using reference = char&;
-  using iterator_category = std::bidirectional_iterator_tag;
-};
-
-#if TEST_STD_VER > 17
-  static_assert(std::is_same_v<typename std::__unconstrained_reverse_iterator<BarIter>::reference, bool&>);
-#else
-  static_assert(std::is_same<typename std::__unconstrained_reverse_iterator<BarIter>::reference, char&>::value, "");
-#endif
-
-void test_all() {
-  test<bidirectional_iterator<char*> >();
-  test<random_access_iterator<char*> >();
-  test<char*>();
-}
diff --git a/libcxx/test/libcxx/module_std.gen.py b/libcxx/test/libcxx/module_std.gen.py
index 5acaa837d37e0a..fc23985caf30de 100644
--- a/libcxx/test/libcxx/module_std.gen.py
+++ b/libcxx/test/libcxx/module_std.gen.py
@@ -16,11 +16,7 @@
 # to be one monolitic test. Since the test doesn't take very long it's
 # not a huge issue.
 
-# WARNING: Disabled at the bottom. Fix this test and remove the UNSUPPORTED line
-# TODO: Re-enable this test once we understand why it keeps timing out.
-
 # RUN: %{python} %s %{libcxx-dir}/utils
-# END.
 
 import sys
 
@@ -39,5 +35,4 @@
 
 
 print("//--- module_std.sh.cpp")
-print('// UNSUPPORTED: clang')
 generator.write_test("std")
diff --git a/libcxx/test/libcxx/module_std_compat.gen.py b/libcxx/test/libcxx/module_std_compat.gen.py
index a502276a1ccfc5..000aa299861220 100644
--- a/libcxx/test/libcxx/module_std_compat.gen.py
+++ b/libcxx/test/libcxx/module_std_compat.gen.py
@@ -16,11 +16,7 @@
 # to be one monolitic test. Since the test doesn't take very long it's
 # not a huge issue.
 
-# WARNING: Disabled at the bottom. Fix this test and remove the UNSUPPORTED line
-# TODO: Re-enable this test once we understand why it keeps timing out.
-
 # RUN: %{python} %s %{libcxx-dir}/utils
-# END.
 
 import sys
 
@@ -40,7 +36,6 @@
 
 
 print("//--- module_std_compat.sh.cpp")
-print("// UNSUPPORTED: clang")
 generator.write_test(
     "std.compat",
     module_c_headers,
diff --git a/libcxx/test/libcxx/transitive_includes/cxx03.csv b/libcxx/test/libcxx/transitive_includes/cxx03.csv
index 678a986e522aa0..c65b9b9d705e27 100644
--- a/libcxx/test/libcxx/transitive_includes/cxx03.csv
+++ b/libcxx/test/libcxx/transitive_includes/cxx03.csv
@@ -266,9 +266,14 @@ filesystem system_error
 filesystem type_traits
 filesystem version
 format array
+format cctype
+format clocale
 format cmath
 format cstddef
 format cstdint
+format cstdlib
+format cstring
+format cwchar
 format initializer_list
 format limits
 format locale
diff --git a/libcxx/test/libcxx/transitive_includes/cxx11.csv b/libcxx/test/libcxx/transitive_includes/cxx11.csv
index c3875fa2cfc06f..b3d9e327fc7aa9 100644
--- a/libcxx/test/libcxx/transitive_includes/cxx11.csv
+++ b/libcxx/test/libcxx/transitive_includes/cxx11.csv
@@ -267,9 +267,14 @@ filesystem system_error
 filesystem type_traits
 filesystem version
 format array
+format cctype
+format clocale
 format cmath
 format cstddef
 format cstdint
+format cstdlib
+format cstring
+format cwchar
 format initializer_list
 format limits
 format locale
diff --git a/libcxx/test/libcxx/transitive_includes/cxx14.csv b/libcxx/test/libcxx/transitive_includes/cxx14.csv
index e28e0cd44fed95..d723409422a3eb 100644
--- a/libcxx/test/libcxx/transitive_includes/cxx14.csv
+++ b/libcxx/test/libcxx/transitive_includes/cxx14.csv
@@ -269,9 +269,14 @@ filesystem system_error
 filesystem type_traits
 filesystem version
 format array
+format cctype
+format clocale
 format cmath
 format cstddef
 format cstdint
+format cstdlib
+format cstring
+format cwchar
 format initializer_list
 format limits
 format locale
diff --git a/libcxx/test/libcxx/transitive_includes/cxx17.csv b/libcxx/test/libcxx/transitive_includes/cxx17.csv
index e28e0cd44fed95..d723409422a3eb 100644
--- a/libcxx/test/libcxx/transitive_includes/cxx17.csv
+++ b/libcxx/test/libcxx/transitive_includes/cxx17.csv
@@ -269,9 +269,14 @@ filesystem system_error
 filesystem type_traits
 filesystem version
 format array
+format cctype
+format clocale
 format cmath
 format cstddef
 format cstdint
+format cstdlib
+format cstring
+format cwchar
 format initializer_list
 format limits
 format locale
diff --git a/libcxx/test/libcxx/transitive_includes/cxx20.csv b/libcxx/test/libcxx/transitive_includes/cxx20.csv
index eec71f4fc6282e..03b4eda8b4d868 100644
--- a/libcxx/test/libcxx/transitive_includes/cxx20.csv
+++ b/libcxx/test/libcxx/transitive_includes/cxx20.csv
@@ -113,14 +113,19 @@ charconv type_traits
 charconv version
 chrono array
 chrono bit
+chrono cctype
+chrono cerrno
 chrono charconv
+chrono clocale
 chrono cmath
 chrono compare
 chrono concepts
 chrono cstddef
 chrono cstdint
+chrono cstdlib
 chrono cstring
 chrono ctime
+chrono cwchar
 chrono forward_list
 chrono limits
 chrono locale
@@ -275,9 +280,14 @@ filesystem system_error
 filesystem type_traits
 filesystem version
 format array
+format cctype
+format clocale
 format cmath
 format cstddef
 format cstdint
+format cstdlib
+format cstring
+format cwchar
 format initializer_list
 format limits
 format locale
diff --git a/libcxx/test/libcxx/transitive_includes/cxx23.csv b/libcxx/test/libcxx/transitive_includes/cxx23.csv
index 142204c4477cde..062127364adfee 100644
--- a/libcxx/test/libcxx/transitive_includes/cxx23.csv
+++ b/libcxx/test/libcxx/transitive_includes/cxx23.csv
@@ -68,15 +68,20 @@ charconv limits
 charconv new
 charconv version
 chrono array
+chrono cctype
+chrono cerrno
+chrono clocale
 chrono cmath
 chrono compare
 chrono cstddef
 chrono cstdint
+chrono cstdlib
+chrono cstring
 chrono ctime
+chrono cwchar
 chrono forward_list
 chrono initializer_list
 chrono limits
-chrono locale
 chrono new
 chrono optional
 chrono ostream
@@ -85,6 +90,8 @@ chrono sstream
 chrono stdexcept
 chrono string
 chrono string_view
+chrono tuple
+chrono typeinfo
 chrono vector
 chrono version
 cinttypes cstdint
@@ -131,12 +138,10 @@ coroutine compare
 coroutine cstddef
 coroutine cstdint
 coroutine cstring
-coroutine limits
 coroutine version
 cstddef version
 ctgmath ccomplex
 ctgmath cmath
-cwchar cstddef
 cwchar cwctype
 cwctype cctype
 deque compare
@@ -155,7 +160,6 @@ exception cstdlib
 exception new
 exception typeinfo
 exception version
-execution cstddef
 execution version
 expected cstddef
 expected initializer_list
@@ -166,7 +170,6 @@ experimental/iterator iterator
 experimental/memory cstddef
 experimental/memory cstdint
 experimental/memory cstring
-experimental/memory limits
 experimental/propagate_const cstddef
 experimental/simd cstddef
 experimental/simd cstdint
@@ -186,12 +189,16 @@ filesystem string
 filesystem string_view
 filesystem version
 format array
+format cctype
+format clocale
 format cmath
 format cstddef
 format cstdint
+format cstdlib
+format cstring
+format cwchar
 format initializer_list
 format limits
-format locale
 format new
 format optional
 format queue
@@ -200,6 +207,7 @@ format stdexcept
 format string
 format string_view
 format tuple
+format typeinfo
 format version
 forward_list compare
 forward_list cstddef
@@ -396,7 +404,6 @@ optional cstddef
 optional cstdint
 optional cstring
 optional initializer_list
-optional limits
 optional new
 optional version
 ostream bitset
diff --git a/libcxx/test/libcxx/transitive_includes/cxx26.csv b/libcxx/test/libcxx/transitive_includes/cxx26.csv
index 142204c4477cde..062127364adfee 100644
--- a/libcxx/test/libcxx/transitive_includes/cxx26.csv
+++ b/libcxx/test/libcxx/transitive_includes/cxx26.csv
@@ -68,15 +68,20 @@ charconv limits
 charconv new
 charconv version
 chrono array
+chrono cctype
+chrono cerrno
+chrono clocale
 chrono cmath
 chrono compare
 chrono cstddef
 chrono cstdint
+chrono cstdlib
+chrono cstring
 chrono ctime
+chrono cwchar
 chrono forward_list
 chrono initializer_list
 chrono limits
-chrono locale
 chrono new
 chrono optional
 chrono ostream
@@ -85,6 +90,8 @@ chrono sstream
 chrono stdexcept
 chrono string
 chrono string_view
+chrono tuple
+chrono typeinfo
 chrono vector
 chrono version
 cinttypes cstdint
@@ -131,12 +138,10 @@ coroutine compare
 coroutine cstddef
 coroutine cstdint
 coroutine cstring
-coroutine limits
 coroutine version
 cstddef version
 ctgmath ccomplex
 ctgmath cmath
-cwchar cstddef
 cwchar cwctype
 cwctype cctype
 deque compare
@@ -155,7 +160,6 @@ exception cstdlib
 exception new
 exception typeinfo
 exception version
-execution cstddef
 execution version
 expected cstddef
 expected initializer_list
@@ -166,7 +170,6 @@ experimental/iterator iterator
 experimental/memory cstddef
 experimental/memory cstdint
 experimental/memory cstring
-experimental/memory limits
 experimental/propagate_const cstddef
 experimental/simd cstddef
 experimental/simd cstdint
@@ -186,12 +189,16 @@ filesystem string
 filesystem string_view
 filesystem version
 format array
+format cctype
+format clocale
 format cmath
 format cstddef
 format cstdint
+format cstdlib
+format cstring
+format cwchar
 format initializer_list
 format limits
-format locale
 format new
 format optional
 format queue
@@ -200,6 +207,7 @@ format stdexcept
 format string
 format string_view
 format tuple
+format typeinfo
 format version
 forward_list compare
 forward_list cstddef
@@ -396,7 +404,6 @@ optional cstddef
 optional cstdint
 optional cstring
 optional initializer_list
-optional limits
 optional new
 optional version
 ostream bitset
diff --git a/libcxx/test/std/algorithms/alg.modifying.operations/alg.fill/fill.pass.cpp b/libcxx/test/std/algorithms/alg.modifying.operations/alg.fill/fill.pass.cpp
index da56ec30f128b1..481d565961b2b5 100644
--- a/libcxx/test/std/algorithms/alg.modifying.operations/alg.fill/fill.pass.cpp
+++ b/libcxx/test/std/algorithms/alg.modifying.operations/alg.fill/fill.pass.cpp
@@ -14,62 +14,91 @@
 //   fill(Iter first, Iter last, const T& value);
 
 #include <algorithm>
+#include <array>
 #include <cassert>
+#include <vector>
 
 #include "test_macros.h"
 #include "test_iterators.h"
 
-#if TEST_STD_VER > 17
-TEST_CONSTEXPR bool test_constexpr() {
-    int ia[] = {0, 1, 2, 3, 4};
-
-    std::fill(std::begin(ia), std::end(ia), 5);
+template <class Iter, class Container>
+TEST_CONSTEXPR_CXX20 void
+test(Container in, size_t from, size_t to, typename Container::value_type value, Container expected) {
+  std::fill(Iter(in.data() + from), Iter(in.data() + to), value);
+  assert(in == expected);
+}
 
-    return std::all_of(std::begin(ia), std::end(ia), [](int a) {return a == 5; })
-        ;
+template <class T>
+struct Test {
+  template <class Iter>
+  TEST_CONSTEXPR_CXX20 void operator()() {
+    {
+      std::array<T, 4> in       = {1, 2, 3, 4};
+      std::array<T, 4> expected = {5, 5, 5, 5};
+      test<Iter>(in, 0, 4, 5, expected);
     }
-#endif
-
-template <class Iter>
-void
-test_char()
-{
-    const unsigned n = 4;
-    char ca[n] = {0};
-    std::fill(Iter(ca), Iter(ca+n), char(1));
-    assert(ca[0] == 1);
-    assert(ca[1] == 1);
-    assert(ca[2] == 1);
-    assert(ca[3] == 1);
-}
+    {
+      std::array<T, 4> in       = {1, 2, 3, 4};
+      std::array<T, 4> expected = {1, 5, 5, 4};
+      test<Iter>(in, 1, 3, 5, expected);
+    }
+  }
+};
 
-template <class Iter>
-void
-test_int()
-{
-    const unsigned n = 4;
-    int ia[n] = {0};
-    std::fill(Iter(ia), Iter(ia+n), 1);
-    assert(ia[0] == 1);
-    assert(ia[1] == 1);
-    assert(ia[2] == 1);
-    assert(ia[3] == 1);
+TEST_CONSTEXPR_CXX20 bool test() {
+  types::for_each(types::forward_iterator_list<char*>(), Test<char>());
+  types::for_each(types::forward_iterator_list<int*>(), Test<int>());
+  {   // test vector<bool>::iterator optimization
+    { // simple case
+      std::vector<bool> in(4, false);
+      std::vector<bool> expected(4, true);
+      std::fill(in.begin(), in.end(), true);
+      assert(in == expected);
+    }
+    { // partial byte in the front is not filled
+      std::vector<bool> in(8, false);
+      std::vector<bool> expected(8, true);
+      expected[0] = false;
+      expected[1] = false;
+      std::fill(in.begin() + 2, in.end(), true);
+      assert(in == expected);
+    }
+    { // partial byte in the back is not filled
+      std::vector<bool> in(8, false);
+      std::vector<bool> expected(8, true);
+      expected[6] = false;
+      expected[7] = false;
+      std::fill(in.begin(), in.end() - 2, true);
+      assert(in == expected);
+    }
+    { // partial byte in the front and back is not filled
+      std::vector<bool> in(16, false);
+      std::vector<bool> expected(16, true);
+      expected[0]  = false;
+      expected[1]  = false;
+      expected[14] = false;
+      expected[15] = false;
+      std::fill(in.begin() + 2, in.end() - 2, true);
+      assert(in == expected);
+    }
+    { // only a few bits of a byte are set
+      std::vector<bool> in(8, false);
+      std::vector<bool> expected(8, true);
+      expected[0] = false;
+      expected[1] = false;
+      expected[6] = false;
+      expected[7] = false;
+      std::fill(in.begin() + 2, in.end() - 2, true);
+      assert(in == expected);
+    }
+  }
+  return true;
 }
 
-int main(int, char**)
-{
-    test_char<forward_iterator<char*> >();
-    test_char<bidirectional_iterator<char*> >();
-    test_char<random_access_iterator<char*> >();
-    test_char<char*>();
-
-    test_int<forward_iterator<int*> >();
-    test_int<bidirectional_iterator<int*> >();
-    test_int<random_access_iterator<int*> >();
-    test_int<int*>();
-
-#if TEST_STD_VER > 17
-    static_assert(test_constexpr());
+int main(int, char**) {
+  test();
+#if TEST_STD_VER >= 20
+  static_assert(test());
 #endif
 
   return 0;
diff --git a/libcxx/test/std/algorithms/alg.nonmodifying/alg.count/count.pass.cpp b/libcxx/test/std/algorithms/alg.nonmodifying/alg.count/count.pass.cpp
index 904100c1cf0bb1..7654a4b0c7f007 100644
--- a/libcxx/test/std/algorithms/alg.nonmodifying/alg.count/count.pass.cpp
+++ b/libcxx/test/std/algorithms/alg.nonmodifying/alg.count/count.pass.cpp
@@ -14,7 +14,7 @@
 //   count(Iter first, Iter last, const T& value);
 
 // ADDITIONAL_COMPILE_FLAGS(has-fconstexpr-steps): -fconstexpr-steps=20000000
-// ADDITIONAL_COMPILE_FLAGS(has-fconstexpr-ops-limit): -fconstexpr-ops-limit=70000000
+// ADDITIONAL_COMPILE_FLAGS(has-fconstexpr-ops-limit): -fconstexpr-ops-limit=80000000
 
 #include <algorithm>
 #include <cassert>
diff --git a/libcxx/test/std/algorithms/alg.nonmodifying/alg.count/ranges.count.pass.cpp b/libcxx/test/std/algorithms/alg.nonmodifying/alg.count/ranges.count.pass.cpp
index b17272ea90cddd..b6631add7e48a7 100644
--- a/libcxx/test/std/algorithms/alg.nonmodifying/alg.count/ranges.count.pass.cpp
+++ b/libcxx/test/std/algorithms/alg.nonmodifying/alg.count/ranges.count.pass.cpp
@@ -11,7 +11,7 @@
 // UNSUPPORTED: c++03, c++11, c++14, c++17
 
 // ADDITIONAL_COMPILE_FLAGS(has-fconstexpr-steps): -fconstexpr-steps=20000000
-// ADDITIONAL_COMPILE_FLAGS(has-fconstexpr-ops-limit): -fconstexpr-ops-limit=70000000
+// ADDITIONAL_COMPILE_FLAGS(has-fconstexpr-ops-limit): -fconstexpr-ops-limit=80000000
 
 // template<input_iterator I, sentinel_for<I> S, class T, class Proj = identity>
 //   requires indirect_binary_predicate<ranges::equal_to, projected<I, Proj>, const T*>
diff --git a/libcxx/test/std/diagnostics/syserr/syserr.errcat/syserr.errcat.objects/generic_category.pass.cpp b/libcxx/test/std/diagnostics/syserr/syserr.errcat/syserr.errcat.objects/generic_category.pass.cpp
index d4bbde75ae8821..068202c6e41508 100644
--- a/libcxx/test/std/diagnostics/syserr/syserr.errcat/syserr.errcat.objects/generic_category.pass.cpp
+++ b/libcxx/test/std/diagnostics/syserr/syserr.errcat/syserr.errcat.objects/generic_category.pass.cpp
@@ -44,19 +44,14 @@ int main(int, char**)
         errno = E2BIG; // something that message will never generate
         const std::error_category& e_cat1 = std::generic_category();
         const std::string msg = e_cat1.message(-1);
-        // Exact message format varies by platform.  We can't detect
-        // some of these (Musl in particular) using the preprocessor,
-        // so accept a few sensible messages.  Newlib unfortunately
-        // responds with an empty message, which we probably want to
-        // treat as a failure code otherwise, but we can detect that
-        // with the preprocessor.
-        LIBCPP_ASSERT(msg.rfind("Error -1 occurred", 0) == 0       // AIX
-                      || msg.rfind("No error information", 0) == 0 // Musl
-                      || msg.rfind("Unknown error", 0) == 0        // Glibc
-#if defined(_NEWLIB_VERSION)
-                      || msg.empty()
+        // Exact message format varies by platform.
+#if defined(_AIX)
+        LIBCPP_ASSERT(msg.rfind("Error -1 occurred", 0) == 0);
+#elif defined(_NEWLIB_VERSION)
+        LIBCPP_ASSERT(msg.empty());
+#else
+        LIBCPP_ASSERT(msg.rfind("Unknown error", 0) == 0);
 #endif
-        );
         assert(errno == E2BIG);
     }
 
diff --git a/libcxx/test/std/diagnostics/syserr/syserr.errcat/syserr.errcat.objects/system_category.pass.cpp b/libcxx/test/std/diagnostics/syserr/syserr.errcat/syserr.errcat.objects/system_category.pass.cpp
index eefbddd27a7f53..42fdd1cb3b91bd 100644
--- a/libcxx/test/std/diagnostics/syserr/syserr.errcat/syserr.errcat.objects/system_category.pass.cpp
+++ b/libcxx/test/std/diagnostics/syserr/syserr.errcat/syserr.errcat.objects/system_category.pass.cpp
@@ -50,19 +50,14 @@ int main(int, char**) {
     errno                             = E2BIG; // something that message will never generate
     const std::error_category& e_cat1 = std::system_category();
     const std::string msg             = e_cat1.message(-1);
-    // Exact message format varies by platform.  We can't detect
-    // some of these (Musl in particular) using the preprocessor,
-    // so accept a few sensible messages.  Newlib unfortunately
-    // responds with an empty message, which we probably want to
-    // treat as a failure code otherwise, but we can detect that
-    // with the preprocessor.
-    LIBCPP_ASSERT(msg.rfind("Error -1 occurred", 0) == 0       // AIX
-                  || msg.rfind("No error information", 0) == 0 // Musl
-                  || msg.rfind("Unknown error", 0) == 0        // Glibc
-#if defined(_NEWLIB_VERSION)
-                  || msg.empty()
+    // Exact message format varies by platform.
+#if defined(_AIX)
+    LIBCPP_ASSERT(msg.rfind("Error -1 occurred", 0) == 0);
+#elif defined(_NEWLIB_VERSION)
+    LIBCPP_ASSERT(msg.empty());
+#else
+    LIBCPP_ASSERT(msg.rfind("Unknown error", 0) == 0);
 #endif
-    );
     assert(errno == E2BIG);
   }
 
diff --git a/libcxx/test/std/input.output/file.streams/fstreams/filebuf.virtuals/setbuf.pass.cpp b/libcxx/test/std/input.output/file.streams/fstreams/filebuf.virtuals/setbuf.pass.cpp
new file mode 100644
index 00000000000000..8bcce281620332
--- /dev/null
+++ b/libcxx/test/std/input.output/file.streams/fstreams/filebuf.virtuals/setbuf.pass.cpp
@@ -0,0 +1,120 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// <fstream>
+
+// basic_streambuf<charT, traits>* setbuf(char_type* s, streamsize n) override;
+
+#include <fstream>
+#include <cstddef>
+#include <cassert>
+
+#include "test_macros.h"
+
+template <class CharT>
+static std::size_t file_size(const char* filename) {
+  FILE* f = std::fopen(filename, "rb");
+  std::fseek(f, 0, SEEK_END);
+  long result = std::ftell(f);
+  std::fclose(f);
+  return result;
+}
+
+// Helper class to expose some protected std::basic_filebuf<CharT> members.
+template <class CharT>
+struct filebuf : public std::basic_filebuf<CharT> {
+  CharT* base() { return this->pbase(); }
+  CharT* ptr() { return this->pptr(); }
+};
+
+template <class CharT>
+static void buffered_request() {
+  filebuf<CharT> buffer;
+
+  CharT b[10] = {0};
+  assert(buffer.pubsetbuf(b, 10) == &buffer);
+
+  buffer.open("test.dat", std::ios_base::out);
+  buffer.sputc(CharT('a'));
+  assert(b[0] == 'a');
+
+  buffer.close();
+  assert(file_size<CharT>("test.dat") == 1);
+}
+
+template <class CharT>
+static void unbuffered_request_before_open() {
+  filebuf<CharT> buffer;
+
+  assert(buffer.pubsetbuf(nullptr, 0) == &buffer);
+  assert(buffer.base() == nullptr);
+  assert(buffer.ptr() == nullptr);
+
+  buffer.open("test.dat", std::ios_base::out);
+  assert(buffer.base() == nullptr);
+  assert(buffer.ptr() == nullptr);
+
+  buffer.sputc(CharT('a'));
+  assert(buffer.base() == nullptr);
+  assert(buffer.ptr() == nullptr);
+
+  assert(file_size<CharT>("test.dat") == 1);
+}
+
+template <class CharT>
+static void unbuffered_request_after_open() {
+  filebuf<CharT> buffer;
+
+  buffer.open("test.dat", std::ios_base::out);
+
+  assert(buffer.pubsetbuf(nullptr, 0) == &buffer);
+  assert(buffer.base() == nullptr);
+  assert(buffer.ptr() == nullptr);
+
+  buffer.sputc(CharT('a'));
+  assert(buffer.base() == nullptr);
+  assert(buffer.ptr() == nullptr);
+
+  assert(file_size<CharT>("test.dat") == 1);
+}
+
+template <class CharT>
+static void unbuffered_request_after_open_ate() {
+  filebuf<CharT> buffer;
+
+  buffer.open("test.dat", std::ios_base::out | std::ios_base::ate);
+
+  assert(buffer.pubsetbuf(nullptr, 0) == &buffer);
+
+  buffer.sputc(CharT('a'));
+  assert(file_size<CharT>("test.dat") <= 1);
+  // on libc++ buffering is used by default.
+  LIBCPP_ASSERT(file_size<CharT>("test.dat") == 0);
+
+  buffer.close();
+  assert(file_size<CharT>("test.dat") == 1);
+}
+
+template <class CharT>
+static void test() {
+  buffered_request<CharT>();
+
+  unbuffered_request_before_open<CharT>();
+  unbuffered_request_after_open<CharT>();
+  unbuffered_request_after_open_ate<CharT>();
+}
+
+int main(int, char**) {
+  test<char>();
+
+#ifndef TEST_HAS_NO_WIDE_CHARACTERS
+  test<wchar_t>();
+#endif
+
+  return 0;
+}
diff --git a/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.cons/copy.pass.cpp b/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.cons/copy.pass.cpp
index 842aece4121fbe..719c231d33e75e 100644
--- a/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.cons/copy.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.cons/copy.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 
 // <filesystem>
diff --git a/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.cons/copy_assign.pass.cpp b/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.cons/copy_assign.pass.cpp
index 4be769612dec6c..e897676ec2bfe5 100644
--- a/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.cons/copy_assign.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.cons/copy_assign.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 
 // <filesystem>
diff --git a/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.cons/move.pass.cpp b/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.cons/move.pass.cpp
index 8b41854fb4bdcc..1118ba23ab92f3 100644
--- a/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.cons/move.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.cons/move.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 
 // <filesystem>
diff --git a/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.cons/move_assign.pass.cpp b/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.cons/move_assign.pass.cpp
index f663f1aa536e04..a71f8ce3147928 100644
--- a/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.cons/move_assign.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.cons/move_assign.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 
 // <filesystem>
diff --git a/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.cons/path.pass.cpp b/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.cons/path.pass.cpp
index 1476f29360fbbe..a1a37012369d4c 100644
--- a/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.cons/path.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.cons/path.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 
 // <filesystem>
diff --git a/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.mods/assign.pass.cpp b/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.mods/assign.pass.cpp
index c071d44de70c4b..ba14af1240d4b7 100644
--- a/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.mods/assign.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.mods/assign.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 
 // <filesystem>
diff --git a/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.mods/refresh.pass.cpp b/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.mods/refresh.pass.cpp
index 22f94def8529e2..4f5002e068c19d 100644
--- a/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.mods/refresh.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.mods/refresh.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 
 // The string reported on errors changed, which makes those tests fail when run
diff --git a/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.mods/replace_filename.pass.cpp b/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.mods/replace_filename.pass.cpp
index 62b6837f463235..03e9de882ab0ab 100644
--- a/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.mods/replace_filename.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.mods/replace_filename.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 
 // <filesystem>
diff --git a/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.obs/file_size.pass.cpp b/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.obs/file_size.pass.cpp
index 5d25485479ea6d..e1407858f66ac5 100644
--- a/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.obs/file_size.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.obs/file_size.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 
 // The string reported on errors changed, which makes those tests fail when run
diff --git a/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.obs/file_type_obs.pass.cpp b/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.obs/file_type_obs.pass.cpp
index 91d6017f76a9db..303a95a0128bc9 100644
--- a/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.obs/file_type_obs.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.obs/file_type_obs.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 
 // Starting in Android N (API 24), SELinux policy prevents the shell user from
diff --git a/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.obs/hard_link_count.pass.cpp b/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.obs/hard_link_count.pass.cpp
index 34e60c66357b5d..514bbb27c2c475 100644
--- a/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.obs/hard_link_count.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.obs/hard_link_count.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 
 // The string reported on errors changed, which makes those tests fail when run
diff --git a/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.obs/last_write_time.pass.cpp b/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.obs/last_write_time.pass.cpp
index 75877bc0a2752a..41e2ee668f525d 100644
--- a/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.obs/last_write_time.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.obs/last_write_time.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 
 // The string reported on errors changed, which makes those tests fail when run
diff --git a/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.obs/status.pass.cpp b/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.obs/status.pass.cpp
index d28497395be6c0..dd72232ee530af 100644
--- a/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.obs/status.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.obs/status.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 
 // <filesystem>
diff --git a/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.obs/symlink_status.pass.cpp b/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.obs/symlink_status.pass.cpp
index bcb1f615986e9a..24e80695095276 100644
--- a/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.obs/symlink_status.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.obs/symlink_status.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 
 // <filesystem>
diff --git a/libcxx/test/std/input.output/filesystems/class.directory_iterator/directory_iterator.members/copy.pass.cpp b/libcxx/test/std/input.output/filesystems/class.directory_iterator/directory_iterator.members/copy.pass.cpp
index 31dfd8dba0e73a..b7100b22dcd429 100644
--- a/libcxx/test/std/input.output/filesystems/class.directory_iterator/directory_iterator.members/copy.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/class.directory_iterator/directory_iterator.members/copy.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 // UNSUPPORTED: no-filesystem
 // UNSUPPORTED: availability-filesystem-missing
diff --git a/libcxx/test/std/input.output/filesystems/class.directory_iterator/directory_iterator.members/copy_assign.pass.cpp b/libcxx/test/std/input.output/filesystems/class.directory_iterator/directory_iterator.members/copy_assign.pass.cpp
index ff47ad2d0a9687..11748f942bc860 100644
--- a/libcxx/test/std/input.output/filesystems/class.directory_iterator/directory_iterator.members/copy_assign.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/class.directory_iterator/directory_iterator.members/copy_assign.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 // UNSUPPORTED: no-filesystem
 // UNSUPPORTED: availability-filesystem-missing
diff --git a/libcxx/test/std/input.output/filesystems/class.directory_iterator/directory_iterator.members/ctor.pass.cpp b/libcxx/test/std/input.output/filesystems/class.directory_iterator/directory_iterator.members/ctor.pass.cpp
index 780994f26386ac..3e363ae9bceec2 100644
--- a/libcxx/test/std/input.output/filesystems/class.directory_iterator/directory_iterator.members/ctor.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/class.directory_iterator/directory_iterator.members/ctor.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 // UNSUPPORTED: no-filesystem
 // UNSUPPORTED: availability-filesystem-missing
diff --git a/libcxx/test/std/input.output/filesystems/class.directory_iterator/directory_iterator.members/increment.pass.cpp b/libcxx/test/std/input.output/filesystems/class.directory_iterator/directory_iterator.members/increment.pass.cpp
index d224b44aa0318e..a7a4fd4c1df8be 100644
--- a/libcxx/test/std/input.output/filesystems/class.directory_iterator/directory_iterator.members/increment.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/class.directory_iterator/directory_iterator.members/increment.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 // UNSUPPORTED: no-filesystem
 // UNSUPPORTED: availability-filesystem-missing
diff --git a/libcxx/test/std/input.output/filesystems/class.directory_iterator/directory_iterator.members/move.pass.cpp b/libcxx/test/std/input.output/filesystems/class.directory_iterator/directory_iterator.members/move.pass.cpp
index e6b779c7563f84..5dcd91d6c26dc4 100644
--- a/libcxx/test/std/input.output/filesystems/class.directory_iterator/directory_iterator.members/move.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/class.directory_iterator/directory_iterator.members/move.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 // UNSUPPORTED: no-filesystem
 // UNSUPPORTED: availability-filesystem-missing
diff --git a/libcxx/test/std/input.output/filesystems/class.directory_iterator/directory_iterator.members/move_assign.pass.cpp b/libcxx/test/std/input.output/filesystems/class.directory_iterator/directory_iterator.members/move_assign.pass.cpp
index 943708a8336a9e..5a1750dcd1329d 100644
--- a/libcxx/test/std/input.output/filesystems/class.directory_iterator/directory_iterator.members/move_assign.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/class.directory_iterator/directory_iterator.members/move_assign.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 // UNSUPPORTED: no-filesystem
 // UNSUPPORTED: availability-filesystem-missing
diff --git a/libcxx/test/std/input.output/filesystems/class.directory_iterator/directory_iterator.nonmembers/begin_end.pass.cpp b/libcxx/test/std/input.output/filesystems/class.directory_iterator/directory_iterator.nonmembers/begin_end.pass.cpp
index 80024b95b0e579..d110912c0a4b10 100644
--- a/libcxx/test/std/input.output/filesystems/class.directory_iterator/directory_iterator.nonmembers/begin_end.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/class.directory_iterator/directory_iterator.nonmembers/begin_end.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 // UNSUPPORTED: no-filesystem
 // UNSUPPORTED: availability-filesystem-missing
diff --git a/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/copy.pass.cpp b/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/copy.pass.cpp
index 61dab710ce6405..05346693f4bb6c 100644
--- a/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/copy.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/copy.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 // UNSUPPORTED: no-filesystem
 // UNSUPPORTED: availability-filesystem-missing
diff --git a/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/copy_assign.pass.cpp b/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/copy_assign.pass.cpp
index 3f1e2a3cc9dffe..e1452c365a3875 100644
--- a/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/copy_assign.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/copy_assign.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 // UNSUPPORTED: no-filesystem
 // UNSUPPORTED: availability-filesystem-missing
diff --git a/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/ctor.pass.cpp b/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/ctor.pass.cpp
index 4dd4a39a3d5d8e..f0cc8d9a25ffc0 100644
--- a/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/ctor.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/ctor.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 // UNSUPPORTED: no-filesystem
 // UNSUPPORTED: availability-filesystem-missing
diff --git a/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/depth.pass.cpp b/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/depth.pass.cpp
index e5d2c64731e43c..30804c0e54e54d 100644
--- a/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/depth.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/depth.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 // UNSUPPORTED: no-filesystem
 // UNSUPPORTED: availability-filesystem-missing
diff --git a/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/disable_recursion_pending.pass.cpp b/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/disable_recursion_pending.pass.cpp
index b0f6cbbbbdc73f..26e24ad988bcdf 100644
--- a/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/disable_recursion_pending.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/disable_recursion_pending.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 // UNSUPPORTED: no-filesystem
 // UNSUPPORTED: availability-filesystem-missing
diff --git a/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/increment.pass.cpp b/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/increment.pass.cpp
index 9b3a50fb62c72e..243bed0783dba7 100644
--- a/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/increment.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/increment.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 // UNSUPPORTED: no-filesystem
 // UNSUPPORTED: availability-filesystem-missing
diff --git a/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/move.pass.cpp b/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/move.pass.cpp
index e2969b7dc63b10..5fc3efee9cd06f 100644
--- a/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/move.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/move.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 // UNSUPPORTED: no-filesystem
 // UNSUPPORTED: availability-filesystem-missing
diff --git a/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/move_assign.pass.cpp b/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/move_assign.pass.cpp
index 4aa24dea475e60..e4a32e0595c35b 100644
--- a/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/move_assign.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/move_assign.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 // UNSUPPORTED: no-filesystem
 // UNSUPPORTED: availability-filesystem-missing
diff --git a/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/pop.pass.cpp b/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/pop.pass.cpp
index 9f8e5da5a9fa6c..aa458a655ffc94 100644
--- a/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/pop.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/pop.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 // UNSUPPORTED: no-filesystem
 // UNSUPPORTED: availability-filesystem-missing
diff --git a/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/recursion_pending.pass.cpp b/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/recursion_pending.pass.cpp
index e0abd19dcd4e97..f54ee86d5ec0b5 100644
--- a/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/recursion_pending.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/recursion_pending.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 // UNSUPPORTED: no-filesystem
 // UNSUPPORTED: availability-filesystem-missing
diff --git a/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.nonmembers/begin_end.pass.cpp b/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.nonmembers/begin_end.pass.cpp
index d0f3047bad51ae..2dc9271fbe3b24 100644
--- a/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.nonmembers/begin_end.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.nonmembers/begin_end.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 // UNSUPPORTED: no-filesystem
 // UNSUPPORTED: availability-filesystem-missing
diff --git a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.canonical/canonical.pass.cpp b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.canonical/canonical.pass.cpp
index 0098fe8ee698ef..279a856511b998 100644
--- a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.canonical/canonical.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.canonical/canonical.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 // UNSUPPORTED: no-filesystem
 // UNSUPPORTED: availability-filesystem-missing
diff --git a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.copy/copy.pass.cpp b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.copy/copy.pass.cpp
index cc801922578ccc..3f3effbf267520 100644
--- a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.copy/copy.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.copy/copy.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 // UNSUPPORTED: no-filesystem
 // UNSUPPORTED: availability-filesystem-missing
diff --git a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.copy_symlink/copy_symlink.pass.cpp b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.copy_symlink/copy_symlink.pass.cpp
index 49eb30ebf39ce8..7b27213d83b01d 100644
--- a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.copy_symlink/copy_symlink.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.copy_symlink/copy_symlink.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 // UNSUPPORTED: no-filesystem
 // UNSUPPORTED: availability-filesystem-missing
diff --git a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.create_directories/create_directories.pass.cpp b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.create_directories/create_directories.pass.cpp
index e0bb40c3943bf1..9955e08710c567 100644
--- a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.create_directories/create_directories.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.create_directories/create_directories.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 // UNSUPPORTED: no-filesystem
 // UNSUPPORTED: availability-filesystem-missing
diff --git a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.create_directory/create_directory.pass.cpp b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.create_directory/create_directory.pass.cpp
index dbb36e2889522e..21f283cae86d59 100644
--- a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.create_directory/create_directory.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.create_directory/create_directory.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 // UNSUPPORTED: no-filesystem
 // UNSUPPORTED: availability-filesystem-missing
diff --git a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.create_directory/create_directory_with_attributes.pass.cpp b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.create_directory/create_directory_with_attributes.pass.cpp
index 8e6194f8c98629..a0ec52643e8c01 100644
--- a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.create_directory/create_directory_with_attributes.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.create_directory/create_directory_with_attributes.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 // UNSUPPORTED: no-filesystem
 // UNSUPPORTED: availability-filesystem-missing
diff --git a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.create_directory_symlink/create_directory_symlink.pass.cpp b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.create_directory_symlink/create_directory_symlink.pass.cpp
index 7faf90f038b9bf..65bb9ee452fc3b 100644
--- a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.create_directory_symlink/create_directory_symlink.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.create_directory_symlink/create_directory_symlink.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 // UNSUPPORTED: no-filesystem
 // UNSUPPORTED: availability-filesystem-missing
diff --git a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.create_hard_link/create_hard_link.pass.cpp b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.create_hard_link/create_hard_link.pass.cpp
index 5e801e01f4f2aa..bb8fd1c344a728 100644
--- a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.create_hard_link/create_hard_link.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.create_hard_link/create_hard_link.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 // UNSUPPORTED: no-filesystem
 // UNSUPPORTED: availability-filesystem-missing
diff --git a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.create_symlink/create_symlink.pass.cpp b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.create_symlink/create_symlink.pass.cpp
index 2cee2469c838f1..99d0a3b07ad2b8 100644
--- a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.create_symlink/create_symlink.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.create_symlink/create_symlink.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 // UNSUPPORTED: no-filesystem
 // UNSUPPORTED: availability-filesystem-missing
diff --git a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.current_path/current_path.pass.cpp b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.current_path/current_path.pass.cpp
index 5a48be10637e60..57df465bac18c3 100644
--- a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.current_path/current_path.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.current_path/current_path.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 // UNSUPPORTED: no-filesystem
 // UNSUPPORTED: availability-filesystem-missing
diff --git a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.equivalent/equivalent.pass.cpp b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.equivalent/equivalent.pass.cpp
index 710d1a60b64c8b..f9b00a28af2123 100644
--- a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.equivalent/equivalent.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.equivalent/equivalent.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 // UNSUPPORTED: no-filesystem
 // UNSUPPORTED: availability-filesystem-missing
diff --git a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.exists/exists.pass.cpp b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.exists/exists.pass.cpp
index 2bcdf0fda1b3f3..50181cb8d76c84 100644
--- a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.exists/exists.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.exists/exists.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 // UNSUPPORTED: no-filesystem
 // UNSUPPORTED: availability-filesystem-missing
diff --git a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.file_size/file_size.pass.cpp b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.file_size/file_size.pass.cpp
index 397617d489b63c..99d303bc4244dc 100644
--- a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.file_size/file_size.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.file_size/file_size.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 // UNSUPPORTED: no-filesystem
 // UNSUPPORTED: availability-filesystem-missing
diff --git a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.hard_lk_ct/hard_link_count.pass.cpp b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.hard_lk_ct/hard_link_count.pass.cpp
index b2fa8e2b57c16c..1890696c4bf191 100644
--- a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.hard_lk_ct/hard_link_count.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.hard_lk_ct/hard_link_count.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 // UNSUPPORTED: no-filesystem
 // UNSUPPORTED: availability-filesystem-missing
diff --git a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.is_block_file/is_block_file.pass.cpp b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.is_block_file/is_block_file.pass.cpp
index b9d15f521fa11d..c16664c9a3f56e 100644
--- a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.is_block_file/is_block_file.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.is_block_file/is_block_file.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 // UNSUPPORTED: no-filesystem
 // UNSUPPORTED: availability-filesystem-missing
diff --git a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.is_char_file/is_character_file.pass.cpp b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.is_char_file/is_character_file.pass.cpp
index f1bb81a22341d8..79645b9c2cfded 100644
--- a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.is_char_file/is_character_file.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.is_char_file/is_character_file.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 // UNSUPPORTED: no-filesystem
 // UNSUPPORTED: availability-filesystem-missing
diff --git a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.is_directory/is_directory.pass.cpp b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.is_directory/is_directory.pass.cpp
index d493924f97e3ff..70fc0122427eb4 100644
--- a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.is_directory/is_directory.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.is_directory/is_directory.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 // UNSUPPORTED: no-filesystem
 // UNSUPPORTED: availability-filesystem-missing
diff --git a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.is_empty/is_empty.pass.cpp b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.is_empty/is_empty.pass.cpp
index a05054e226047c..647d2e6a255a95 100644
--- a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.is_empty/is_empty.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.is_empty/is_empty.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 // UNSUPPORTED: no-filesystem
 // UNSUPPORTED: availability-filesystem-missing
diff --git a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.is_fifo/is_fifo.pass.cpp b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.is_fifo/is_fifo.pass.cpp
index 8a9b0225c133fd..76375d7b3b78c3 100644
--- a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.is_fifo/is_fifo.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.is_fifo/is_fifo.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 // UNSUPPORTED: no-filesystem
 // UNSUPPORTED: availability-filesystem-missing
diff --git a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.is_other/is_other.pass.cpp b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.is_other/is_other.pass.cpp
index 16bd2912bbeb21..ba3070e7bdc7d1 100644
--- a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.is_other/is_other.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.is_other/is_other.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 // UNSUPPORTED: no-filesystem
 // UNSUPPORTED: availability-filesystem-missing
diff --git a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.is_regular_file/is_regular_file.pass.cpp b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.is_regular_file/is_regular_file.pass.cpp
index a3e12a7532d159..06a6fb4d22debb 100644
--- a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.is_regular_file/is_regular_file.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.is_regular_file/is_regular_file.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 // UNSUPPORTED: no-filesystem
 // UNSUPPORTED: availability-filesystem-missing
diff --git a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.is_socket/is_socket.pass.cpp b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.is_socket/is_socket.pass.cpp
index 2c4fe7d1bf6d35..b6f92e47a67a22 100644
--- a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.is_socket/is_socket.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.is_socket/is_socket.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 // UNSUPPORTED: no-filesystem
 // UNSUPPORTED: availability-filesystem-missing
diff --git a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.is_symlink/is_symlink.pass.cpp b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.is_symlink/is_symlink.pass.cpp
index 1d70e64bb234ad..a71361c8470493 100644
--- a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.is_symlink/is_symlink.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.is_symlink/is_symlink.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 // UNSUPPORTED: no-filesystem
 // UNSUPPORTED: availability-filesystem-missing
diff --git a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.last_write_time/last_write_time.pass.cpp b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.last_write_time/last_write_time.pass.cpp
index 0678b4d767d23e..d43bc5eea51214 100644
--- a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.last_write_time/last_write_time.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.last_write_time/last_write_time.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 // UNSUPPORTED: no-filesystem
 // UNSUPPORTED: availability-filesystem-missing
diff --git a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.permissions/permissions.pass.cpp b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.permissions/permissions.pass.cpp
index f009befa49c379..085fa65e3559f5 100644
--- a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.permissions/permissions.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.permissions/permissions.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 // UNSUPPORTED: no-filesystem
 // UNSUPPORTED: availability-filesystem-missing
diff --git a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.read_symlink/read_symlink.pass.cpp b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.read_symlink/read_symlink.pass.cpp
index ee0ba7284903da..9bb102ab4da5c3 100644
--- a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.read_symlink/read_symlink.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.read_symlink/read_symlink.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 // UNSUPPORTED: no-filesystem
 // UNSUPPORTED: availability-filesystem-missing
diff --git a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.relative/relative.pass.cpp b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.relative/relative.pass.cpp
index 9bd4051f790218..7bba86252498ec 100644
--- a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.relative/relative.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.relative/relative.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 // UNSUPPORTED: no-filesystem
 // UNSUPPORTED: availability-filesystem-missing
diff --git a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.remove/remove.pass.cpp b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.remove/remove.pass.cpp
index 3685bbf78d5064..c8a5d7f6aa10e4 100644
--- a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.remove/remove.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.remove/remove.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 // UNSUPPORTED: no-filesystem
 // UNSUPPORTED: availability-filesystem-missing
diff --git a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.remove_all/remove_all.pass.cpp b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.remove_all/remove_all.pass.cpp
index 5cdbd087843165..5513c18f00585e 100644
--- a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.remove_all/remove_all.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.remove_all/remove_all.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 // UNSUPPORTED: no-filesystem
 // UNSUPPORTED: availability-filesystem-missing
diff --git a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.rename/rename.pass.cpp b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.rename/rename.pass.cpp
index 61a31543dd7adf..972a2a024c2124 100644
--- a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.rename/rename.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.rename/rename.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 // UNSUPPORTED: no-filesystem
 // UNSUPPORTED: availability-filesystem-missing
diff --git a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.resize_file/resize_file.pass.cpp b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.resize_file/resize_file.pass.cpp
index 8218504115d854..e21e6b9ddc1ffe 100644
--- a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.resize_file/resize_file.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.resize_file/resize_file.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 // UNSUPPORTED: no-filesystem
 // UNSUPPORTED: availability-filesystem-missing
diff --git a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.space/space.pass.cpp b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.space/space.pass.cpp
index 9f7b5aa94b68d1..60e9ac778967ef 100644
--- a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.space/space.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.space/space.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 // UNSUPPORTED: no-filesystem
 // UNSUPPORTED: availability-filesystem-missing
diff --git a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.status/status.pass.cpp b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.status/status.pass.cpp
index 02a68712dabdb4..75eae80194ba80 100644
--- a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.status/status.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.status/status.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 // UNSUPPORTED: no-filesystem
 // UNSUPPORTED: availability-filesystem-missing
diff --git a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.symlink_status/symlink_status.pass.cpp b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.symlink_status/symlink_status.pass.cpp
index 5c2078e4b48564..7a4658756f0c30 100644
--- a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.symlink_status/symlink_status.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.symlink_status/symlink_status.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 // UNSUPPORTED: no-filesystem
 // UNSUPPORTED: availability-filesystem-missing
diff --git a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.weakly_canonical/weakly_canonical.pass.cpp b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.weakly_canonical/weakly_canonical.pass.cpp
index 035e875e4ec83d..053bddcad9b43a 100644
--- a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.weakly_canonical/weakly_canonical.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.weakly_canonical/weakly_canonical.pass.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// REQUIRES: can-create-symlinks
 // UNSUPPORTED: c++03, c++11, c++14
 // UNSUPPORTED: no-filesystem
 // UNSUPPORTED: availability-filesystem-missing
diff --git a/libcxx/test/std/iterators/iterator.requirements/iterator.cust/iterator.cust.swap/iter_swap.pass.cpp b/libcxx/test/std/iterators/iterator.requirements/iterator.cust/iterator.cust.swap/iter_swap.pass.cpp
index dd78707f25409c..e6507f7e776735 100644
--- a/libcxx/test/std/iterators/iterator.requirements/iterator.cust/iterator.cust.swap/iter_swap.pass.cpp
+++ b/libcxx/test/std/iterators/iterator.requirements/iterator.cust/iterator.cust.swap/iter_swap.pass.cpp
@@ -47,6 +47,13 @@ static_assert( std::is_invocable_v<IterSwapT&&, HasIterSwap&, HasIterSwap&>);
 static_assert( std::is_invocable_v<IterSwapT&&, HasIterSwap&, int&>);
 static_assert(!std::is_invocable_v<IterSwapT&&, int&, HasIterSwap&>);
 
+struct StructWithNotMoreSpecializedIterSwap {
+  friend void iter_swap(auto&, auto&);
+};
+
+static_assert(
+    !std::is_invocable_v<IterSwapT, StructWithNotMoreSpecializedIterSwap&, StructWithNotMoreSpecializedIterSwap&>);
+
 struct NodiscardIterSwap {
   [[nodiscard]] friend int iter_swap(NodiscardIterSwap&, NodiscardIterSwap&) { return 0; }
 };
diff --git a/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_long_double.pass.cpp b/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_long_double.pass.cpp
index 0258ebf8724310..8637a933008fb9 100644
--- a/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_long_double.pass.cpp
+++ b/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_long_double.pass.cpp
@@ -22,7 +22,6 @@
 #include <locale>
 #include <ios>
 #include <cassert>
-#include <cstdio>
 #include <streambuf>
 #include <cmath>
 #include "test_macros.h"
@@ -8935,12 +8934,11 @@ void test4()
     char str[200];
     std::locale lc = std::locale::classic();
     std::locale lg(lc, new my_numpunct);
-
-    std::string inf;
-
-    // This should match the underlying C library
-    std::sprintf(str, "%f", INFINITY);
-    inf = str;
+#ifdef _AIX
+    std::string inf = "INF";
+#else
+    std::string inf = "inf";
+#endif
 
     const my_facet f(1);
     {
@@ -10729,27 +10727,24 @@ void test5()
     std::locale lc = std::locale::classic();
     std::locale lg(lc, new my_numpunct);
     const my_facet f(1);
-
-    std::string nan;
-    std::string NaN;
-    std::string pnan_sign;
-
-    // The output here depends on the underlying C library, so work out what
-    // that does.
-    std::sprintf(str, "%f", std::nan(""));
-    nan = str;
-
-    std::sprintf(str, "%F", std::nan(""));
-    NaN = str;
-
-    std::sprintf(str, "%+f", std::nan(""));
-    if (str[0] == '+') {
-      pnan_sign = "+";
-    }
-
-    std::string nan_padding25  = std::string(25 - nan.length(), '*');
-    std::string pnan_padding25 = std::string(25 - nan.length() - pnan_sign.length(), '*');
-
+#if defined(_AIX)
+    std::string nan= "NaNQ";
+    std::string NaN = "NaNQ";
+    std::string nan_padding25 = "*********************";
+    std::string pnan_sign = "+";
+    std::string pnan_padding25 = "********************";
+#else
+    std::string nan= "nan";
+    std::string NaN = "NAN";
+    std::string nan_padding25 = "**********************";
+#if defined(TEST_HAS_GLIBC) || defined(_WIN32)
+    std::string pnan_sign = "+";
+    std::string pnan_padding25 = "*********************";
+#else
+    std::string pnan_sign = "";
+    std::string pnan_padding25 = "**********************";
+#endif
+#endif
     {
         long double v = std::nan("");
         std::ios ios(0);
diff --git a/libcxx/test/std/ranges/range.access/begin.pass.cpp b/libcxx/test/std/ranges/range.access/begin.pass.cpp
index ca25fc297a0119..5ca3d59abb1407 100644
--- a/libcxx/test/std/ranges/range.access/begin.pass.cpp
+++ b/libcxx/test/std/ranges/range.access/begin.pass.cpp
@@ -192,7 +192,7 @@ struct BeginFunction {
 };
 static_assert( std::is_invocable_v<RangeBeginT,  BeginFunction const&>);
 static_assert(!std::is_invocable_v<RangeBeginT,  BeginFunction &&>);
-static_assert(!std::is_invocable_v<RangeBeginT,  BeginFunction &>);
+static_assert(std::is_invocable_v<RangeBeginT, BeginFunction&>); // Ill-formed before P2602R2 Poison Pills are Too Toxic
 static_assert( std::is_invocable_v<RangeCBeginT, BeginFunction const&>);
 static_assert( std::is_invocable_v<RangeCBeginT, BeginFunction &>);
 
@@ -245,7 +245,7 @@ struct BeginFunctionWithPrivateBeginMember {
 constexpr bool testBeginFunction() {
   BeginFunction a{};
   const BeginFunction aa{};
-  static_assert(!std::invocable<RangeBeginT, decltype((a))>);
+  assert(std::ranges::begin(a) == &a.x); // Ill-formed before P2602R2 Poison Pills are Too Toxic
   assert(std::ranges::cbegin(a) == &a.x);
   assert(std::ranges::begin(aa) == &aa.x);
   assert(std::ranges::cbegin(aa) == &aa.x);
@@ -266,21 +266,21 @@ constexpr bool testBeginFunction() {
 
   BeginFunctionReturnsEmptyPtr d{};
   const BeginFunctionReturnsEmptyPtr dd{};
-  static_assert(!std::invocable<RangeBeginT, decltype((d))>);
+  assert(std::ranges::begin(d) == &d.x); // Ill-formed before P2602R2 Poison Pills are Too Toxic
   assert(std::ranges::cbegin(d) == &d.x);
   assert(std::ranges::begin(dd) == &dd.x);
   assert(std::ranges::cbegin(dd) == &dd.x);
 
   BeginFunctionWithDataMember e{};
   const BeginFunctionWithDataMember ee{};
-  static_assert(!std::invocable<RangeBeginT, decltype((e))>);
+  assert(std::ranges::begin(e) == &e.x); // Ill-formed before P2602R2 Poison Pills are Too Toxic
   assert(std::ranges::begin(ee) == &ee.x);
   assert(std::ranges::cbegin(e) == &e.x);
   assert(std::ranges::cbegin(ee) == &ee.x);
 
   BeginFunctionWithPrivateBeginMember f{};
   const BeginFunctionWithPrivateBeginMember ff{};
-  static_assert(!std::invocable<RangeBeginT, decltype((f))>);
+  assert(std::ranges::begin(f) == &f.y); // Ill-formed before P2602R2 Poison Pills are Too Toxic
   assert(std::ranges::cbegin(f) == &f.y);
   assert(std::ranges::begin(ff) == &ff.y);
   assert(std::ranges::cbegin(ff) == &ff.y);
diff --git a/libcxx/test/std/ranges/range.access/end.pass.cpp b/libcxx/test/std/ranges/range.access/end.pass.cpp
index 21d97354ef08d0..3e465b357e9851 100644
--- a/libcxx/test/std/ranges/range.access/end.pass.cpp
+++ b/libcxx/test/std/ranges/range.access/end.pass.cpp
@@ -193,7 +193,7 @@ static_assert(!std::is_invocable_v<RangeEndT, EndFunction &&>);
 
 static_assert( std::is_invocable_v<RangeEndT,  EndFunction const&>);
 static_assert(!std::is_invocable_v<RangeEndT,  EndFunction &&>);
-static_assert(!std::is_invocable_v<RangeEndT,  EndFunction &>);
+static_assert(std::is_invocable_v<RangeEndT, EndFunction&>); // Ill-formed before P2602R2 Poison Pills are Too Toxic
 static_assert( std::is_invocable_v<RangeCEndT, EndFunction const&>);
 static_assert( std::is_invocable_v<RangeCEndT, EndFunction &>);
 
@@ -271,7 +271,7 @@ constexpr bool testEndFunction() {
   assert(std::ranges::end(a) == &a.x);
   assert(std::ranges::cend(a) == &a.x);
   EndFunction aa{};
-  static_assert(!std::is_invocable_v<RangeEndT, decltype((aa))>);
+  assert(std::ranges::end(aa) == &aa.x); // Ill-formed before P2602R2 Poison Pills are Too Toxic
   assert(std::ranges::cend(aa) == &aa.x);
 
   EndFunctionByValue b;
@@ -286,28 +286,28 @@ constexpr bool testEndFunction() {
   assert(std::ranges::end(d) == &d.x);
   assert(std::ranges::cend(d) == &d.x);
   EndFunctionReturnsEmptyPtr dd{};
-  static_assert(!std::is_invocable_v<RangeEndT, decltype((dd))>);
+  assert(std::ranges::end(dd) == &dd.x); // Ill-formed before P2602R2 Poison Pills are Too Toxic
   assert(std::ranges::cend(dd) == &dd.x);
 
   const EndFunctionWithDataMember e{};
   assert(std::ranges::end(e) == &e.x);
   assert(std::ranges::cend(e) == &e.x);
   EndFunctionWithDataMember ee{};
-  static_assert(!std::is_invocable_v<RangeEndT, decltype((ee))>);
+  assert(std::ranges::end(ee) == &ee.x); // Ill-formed before P2602R2 Poison Pills are Too Toxic
   assert(std::ranges::cend(ee) == &ee.x);
 
   const EndFunctionWithPrivateEndMember f{};
   assert(std::ranges::end(f) == &f.y);
   assert(std::ranges::cend(f) == &f.y);
   EndFunctionWithPrivateEndMember ff{};
-  static_assert(!std::is_invocable_v<RangeEndT, decltype((ff))>);
+  assert(std::ranges::end(ff) == &ff.y); // Ill-formed before P2602R2 Poison Pills are Too Toxic
   assert(std::ranges::cend(ff) == &ff.y);
 
   const BeginMemberEndFunction g{};
   assert(std::ranges::end(g) == &g.x);
   assert(std::ranges::cend(g) == &g.x);
   BeginMemberEndFunction gg{};
-  static_assert(!std::is_invocable_v<RangeEndT, decltype((gg))>);
+  assert(std::ranges::end(gg) == &gg.x); // Ill-formed before P2602R2 Poison Pills are Too Toxic
   assert(std::ranges::cend(gg) == &gg.x);
 
   return true;
diff --git a/libcxx/test/std/ranges/range.access/rbegin.pass.cpp b/libcxx/test/std/ranges/range.access/rbegin.pass.cpp
index e1a564c94ed948..3997f38efd0298 100644
--- a/libcxx/test/std/ranges/range.access/rbegin.pass.cpp
+++ b/libcxx/test/std/ranges/range.access/rbegin.pass.cpp
@@ -187,7 +187,8 @@ struct RBeginFunction {
 };
 static_assert( std::is_invocable_v<RangeRBeginT,  RBeginFunction const&>);
 static_assert(!std::is_invocable_v<RangeRBeginT,  RBeginFunction &&>);
-static_assert(!std::is_invocable_v<RangeRBeginT,  RBeginFunction &>);
+static_assert(
+    std::is_invocable_v<RangeRBeginT, RBeginFunction&>); // Ill-formed before P2602R2 Poison Pills are Too Toxic
 static_assert( std::is_invocable_v<RangeCRBeginT, RBeginFunction const&>);
 static_assert( std::is_invocable_v<RangeCRBeginT, RBeginFunction &>);
 
@@ -246,7 +247,7 @@ struct RBeginFunctionWithPrivateBeginMember {
 constexpr bool testRBeginFunction() {
   RBeginFunction a{};
   const RBeginFunction aa{};
-  static_assert(!std::invocable<RangeRBeginT, decltype((a))>);
+  assert(std::ranges::rbegin(a) == &a.x); // Ill-formed before P2602R2 Poison Pills are Too Toxic
   assert(std::ranges::crbegin(a) == &a.x);
   assert(std::ranges::rbegin(aa) == &aa.x);
   assert(std::ranges::crbegin(aa) == &aa.x);
@@ -267,21 +268,21 @@ constexpr bool testRBeginFunction() {
 
   RBeginFunctionReturnsEmptyPtr d{};
   const RBeginFunctionReturnsEmptyPtr dd{};
-  static_assert(!std::invocable<RangeRBeginT, decltype((d))>);
+  assert(std::ranges::rbegin(d) == &d.x); // Ill-formed before P2602R2 Poison Pills are Too Toxic
   assert(std::ranges::crbegin(d) == &d.x);
   assert(std::ranges::rbegin(dd) == &dd.x);
   assert(std::ranges::crbegin(dd) == &dd.x);
 
   RBeginFunctionWithDataMember e{};
   const RBeginFunctionWithDataMember ee{};
-  static_assert(!std::invocable<RangeRBeginT, decltype((e))>);
+  assert(std::ranges::rbegin(e) == &e.x); // Ill-formed before P2602R2 Poison Pills are Too Toxic
   assert(std::ranges::rbegin(ee) == &ee.x);
   assert(std::ranges::crbegin(e) == &e.x);
   assert(std::ranges::crbegin(ee) == &ee.x);
 
   RBeginFunctionWithPrivateBeginMember f{};
   const RBeginFunctionWithPrivateBeginMember ff{};
-  static_assert(!std::invocable<RangeRBeginT, decltype((f))>);
+  assert(std::ranges::rbegin(f) == &f.y); // Ill-formed before P2602R2 Poison Pills are Too Toxic
   assert(std::ranges::crbegin(f) == &f.y);
   assert(std::ranges::rbegin(ff) == &ff.y);
   assert(std::ranges::crbegin(ff) == &ff.y);
diff --git a/libcxx/test/std/ranges/range.access/rend.pass.cpp b/libcxx/test/std/ranges/range.access/rend.pass.cpp
index 5ba244b6b18cfe..f5f59edf19393f 100644
--- a/libcxx/test/std/ranges/range.access/rend.pass.cpp
+++ b/libcxx/test/std/ranges/range.access/rend.pass.cpp
@@ -196,7 +196,7 @@ static_assert(!std::is_invocable_v<RangeREndT, REndFunction &&>);
 
 static_assert( std::is_invocable_v<RangeREndT,  REndFunction const&>);
 static_assert(!std::is_invocable_v<RangeREndT,  REndFunction &&>);
-static_assert(!std::is_invocable_v<RangeREndT,  REndFunction &>);
+static_assert(std::is_invocable_v<RangeREndT, REndFunction&>); // Ill-formed before P2602R2 Poison Pills are Too Toxic
 static_assert( std::is_invocable_v<RangeCREndT, REndFunction const&>);
 static_assert( std::is_invocable_v<RangeCREndT, REndFunction &>);
 
@@ -272,7 +272,7 @@ constexpr bool testREndFunction() {
   assert(std::ranges::rend(a) == &a.x);
   assert(std::ranges::crend(a) == &a.x);
   REndFunction aa{};
-  static_assert(!std::is_invocable_v<RangeREndT, decltype((aa))>);
+  assert(std::ranges::rend(aa) == &aa.x); // Ill-formed before P2602R2 Poison Pills are Too Toxic
   assert(std::ranges::crend(aa) == &aa.x);
 
   REndFunctionByValue b;
@@ -287,28 +287,28 @@ constexpr bool testREndFunction() {
   assert(std::ranges::rend(d) == &d.x);
   assert(std::ranges::crend(d) == &d.x);
   REndFunctionReturnsEmptyPtr dd{};
-  static_assert(!std::is_invocable_v<RangeREndT, decltype((dd))>);
+  assert(std::ranges::rend(dd) == &dd.x); // Ill-formed before P2602R2 Poison Pills are Too Toxic
   assert(std::ranges::crend(dd) == &dd.x);
 
   const REndFunctionWithDataMember e{};
   assert(std::ranges::rend(e) == &e.x);
   assert(std::ranges::crend(e) == &e.x);
   REndFunctionWithDataMember ee{};
-  static_assert(!std::is_invocable_v<RangeREndT, decltype((ee))>);
+  assert(std::ranges::rend(ee) == &ee.x); // Ill-formed before P2602R2 Poison Pills are Too Toxic
   assert(std::ranges::crend(ee) == &ee.x);
 
   const REndFunctionWithPrivateEndMember f{};
   assert(std::ranges::rend(f) == &f.y);
   assert(std::ranges::crend(f) == &f.y);
   REndFunctionWithPrivateEndMember ff{};
-  static_assert(!std::is_invocable_v<RangeREndT, decltype((ff))>);
+  assert(std::ranges::rend(ff) == &ff.y); // Ill-formed before P2602R2 Poison Pills are Too Toxic
   assert(std::ranges::crend(ff) == &ff.y);
 
   const RBeginMemberEndFunction g{};
   assert(std::ranges::rend(g) == &g.x);
   assert(std::ranges::crend(g) == &g.x);
   RBeginMemberEndFunction gg{};
-  static_assert(!std::is_invocable_v<RangeREndT, decltype((gg))>);
+  assert(std::ranges::rend(gg) == &gg.x); // Ill-formed before P2602R2 Poison Pills are Too Toxic
   assert(std::ranges::crend(gg) == &gg.x);
 
   return true;
diff --git a/libcxx/test/std/ranges/range.access/size.pass.cpp b/libcxx/test/std/ranges/range.access/size.pass.cpp
index fd7d0a8b997522..ee44aa815ba993 100644
--- a/libcxx/test/std/ranges/range.access/size.pass.cpp
+++ b/libcxx/test/std/ranges/range.access/size.pass.cpp
@@ -219,7 +219,8 @@ inline constexpr bool std::ranges::disable_sized_range<const ImproperlyDisabledF
 
 static_assert( std::is_invocable_v<RangeSizeT, ImproperlyDisabledMember&>);
 static_assert( std::is_invocable_v<RangeSizeT, const ImproperlyDisabledMember&>);
-static_assert(!std::is_invocable_v<RangeSizeT, ImproperlyDisabledFunction&>);
+static_assert(std::is_invocable_v<RangeSizeT,
+                                  ImproperlyDisabledFunction&>); // Ill-formed before P2602R2 Poison Pills are Too Toxic
 static_assert( std::is_invocable_v<RangeSizeT, const ImproperlyDisabledFunction&>);
 
 // No begin end.
diff --git a/libcxx/test/std/ranges/range.utility/view.interface/view.interface.pass.cpp b/libcxx/test/std/ranges/range.utility/view.interface/view.interface.pass.cpp
index 2fc2fa8579996a..3f7c174d3fe48a 100644
--- a/libcxx/test/std/ranges/range.utility/view.interface/view.interface.pass.cpp
+++ b/libcxx/test/std/ranges/range.utility/view.interface/view.interface.pass.cpp
@@ -39,6 +39,14 @@ struct InputRange : std::ranges::view_interface<InputRange> {
   constexpr InputIter end() const { return InputIter(buff + 8); }
 };
 
+struct SizedInputRange : std::ranges::view_interface<SizedInputRange> {
+  int buff[8] = {0, 1, 2, 3, 4, 5, 6, 7};
+  constexpr InputIter begin() const { return InputIter(buff); }
+  constexpr sentinel_wrapper<InputIter> end() const { return sentinel_wrapper(InputIter(buff + 8)); }
+  constexpr std::size_t size() const { return 8; }
+};
+static_assert(std::ranges::sized_range<SizedInputRange>);
+
 struct NotSizedSentinel {
   using value_type = int;
   using difference_type = std::ptrdiff_t;
@@ -155,11 +163,24 @@ concept BoolOpInvocable = requires (T const& obj) { bool(obj); };
 
 constexpr bool testEmpty() {
   static_assert(!EmptyInvocable<InputRange>);
+  // LWG 3715: `view_interface::empty` is overconstrained
+  static_assert(EmptyInvocable<SizedInputRange>);
   static_assert( EmptyInvocable<ForwardRange>);
 
   static_assert(!BoolOpInvocable<InputRange>);
+  static_assert(BoolOpInvocable<SizedInputRange>);
   static_assert( BoolOpInvocable<ForwardRange>);
 
+  SizedInputRange sizedInputRange;
+  assert(!sizedInputRange.empty());
+  assert(!static_cast<SizedInputRange const&>(sizedInputRange).empty());
+
+  assert(sizedInputRange);
+  assert(static_cast<SizedInputRange const&>(sizedInputRange));
+
+  assert(!std::ranges::empty(sizedInputRange));
+  assert(!std::ranges::empty(static_cast<SizedInputRange const&>(sizedInputRange)));
+
   ForwardRange forwardRange;
   assert(!forwardRange.empty());
   assert(!static_cast<ForwardRange const&>(forwardRange).empty());
diff --git a/libcxx/test/std/ranges/robust_against_poison_pills.compile.pass.cpp b/libcxx/test/std/ranges/robust_against_poison_pills.compile.pass.cpp
new file mode 100644
index 00000000000000..1b3da814d08002
--- /dev/null
+++ b/libcxx/test/std/ranges/robust_against_poison_pills.compile.pass.cpp
@@ -0,0 +1,102 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+
+// <compare>, <iterator>, <ranges>
+
+// ADL should be performed. Ordinary unqualified lookup should not be performed.
+
+namespace ns {
+struct StructWithGlobalFunctions {};
+} // namespace ns
+
+struct ConvertibleToCmpType;
+ConvertibleToCmpType strong_order(const ns::StructWithGlobalFunctions&, const ns::StructWithGlobalFunctions&);
+ConvertibleToCmpType weak_order(const ns::StructWithGlobalFunctions&, const ns::StructWithGlobalFunctions&);
+ConvertibleToCmpType partial_order(const ns::StructWithGlobalFunctions&, const ns::StructWithGlobalFunctions&);
+
+int&& iter_move(const ns::StructWithGlobalFunctions&);
+void iter_swap(const ns::StructWithGlobalFunctions&, const ns::StructWithGlobalFunctions&);
+
+int* begin(const ns::StructWithGlobalFunctions&);
+int* end(const ns::StructWithGlobalFunctions&);
+int* rbegin(const ns::StructWithGlobalFunctions&);
+int* rend(const ns::StructWithGlobalFunctions&);
+unsigned int size(const ns::StructWithGlobalFunctions&);
+
+#include <compare>
+#include <ranges>
+#include <type_traits>
+
+struct ConvertibleToCmpType {
+  operator std::strong_ordering() const;
+  operator std::weak_ordering() const;
+  operator std::partial_ordering() const;
+};
+
+struct StructWithHiddenFriends {
+  friend ConvertibleToCmpType strong_order(const StructWithHiddenFriends&, const StructWithHiddenFriends&);
+  friend ConvertibleToCmpType weak_order(const StructWithHiddenFriends&, const StructWithHiddenFriends&);
+  friend ConvertibleToCmpType partial_order(const StructWithHiddenFriends&, const StructWithHiddenFriends&);
+
+  friend int&& iter_move(const StructWithHiddenFriends&);
+  friend void iter_swap(const StructWithHiddenFriends&, const StructWithHiddenFriends&);
+
+  friend int* begin(const StructWithHiddenFriends&);
+  friend int* end(const StructWithHiddenFriends&);
+  friend int* rbegin(const StructWithHiddenFriends&);
+  friend int* rend(const StructWithHiddenFriends&);
+  friend unsigned int size(const StructWithHiddenFriends&);
+};
+
+// [cmp.alg] ADL should be performed.
+static_assert(std::is_invocable_v<decltype(std::strong_order), StructWithHiddenFriends&, StructWithHiddenFriends&>);
+static_assert(std::is_invocable_v<decltype(std::weak_order), StructWithHiddenFriends&, StructWithHiddenFriends&>);
+static_assert(std::is_invocable_v<decltype(std::partial_order), StructWithHiddenFriends&, StructWithHiddenFriends&>);
+
+// [cmp.alg] Ordinary unqualified lookup should not be performed.
+static_assert(
+    !std::is_invocable_v<decltype(std::strong_order), ns::StructWithGlobalFunctions&, ns::StructWithGlobalFunctions&>);
+static_assert(
+    !std::is_invocable_v<decltype(std::weak_order), ns::StructWithGlobalFunctions&, ns::StructWithGlobalFunctions&>);
+static_assert(
+    !std::is_invocable_v<decltype(std::partial_order), ns::StructWithGlobalFunctions&, ns::StructWithGlobalFunctions&>);
+
+// [iterator.cust] ADL should be performed.
+static_assert(std::is_invocable_v<decltype(std::ranges::iter_move), StructWithHiddenFriends&>);
+static_assert(
+    std::is_invocable_v<decltype(std::ranges::iter_swap), StructWithHiddenFriends&, StructWithHiddenFriends&>);
+
+// [iterator.cust] Ordinary unqualified lookup should not be performed.
+static_assert(!std::is_invocable_v<decltype(std::ranges::iter_move), ns::StructWithGlobalFunctions&>);
+static_assert(!std::is_invocable_v<decltype(std::ranges::iter_swap),
+                                   ns::StructWithGlobalFunctions&,
+                                   ns::StructWithGlobalFunctions&>);
+
+// [range.access] ADL should be performed.
+static_assert(std::is_invocable_v<decltype(std::ranges::begin), StructWithHiddenFriends&>);
+static_assert(std::is_invocable_v<decltype(std::ranges::cbegin), StructWithHiddenFriends&>);
+static_assert(std::is_invocable_v<decltype(std::ranges::end), StructWithHiddenFriends&>);
+static_assert(std::is_invocable_v<decltype(std::ranges::cend), StructWithHiddenFriends&>);
+static_assert(std::is_invocable_v<decltype(std::ranges::rbegin), StructWithHiddenFriends&>);
+static_assert(std::is_invocable_v<decltype(std::ranges::crbegin), StructWithHiddenFriends&>);
+static_assert(std::is_invocable_v<decltype(std::ranges::rend), StructWithHiddenFriends&>);
+static_assert(std::is_invocable_v<decltype(std::ranges::crend), StructWithHiddenFriends&>);
+static_assert(std::is_invocable_v<decltype(std::ranges::size), StructWithHiddenFriends&>);
+
+// [range.access] Ordinary unqualified lookup should not be performed.
+static_assert(!std::is_invocable_v<decltype(std::ranges::begin), ns::StructWithGlobalFunctions&>);
+static_assert(!std::is_invocable_v<decltype(std::ranges::cbegin), ns::StructWithGlobalFunctions&>);
+static_assert(!std::is_invocable_v<decltype(std::ranges::end), ns::StructWithGlobalFunctions&>);
+static_assert(!std::is_invocable_v<decltype(std::ranges::cend), ns::StructWithGlobalFunctions&>);
+static_assert(!std::is_invocable_v<decltype(std::ranges::rbegin), ns::StructWithGlobalFunctions&>);
+static_assert(!std::is_invocable_v<decltype(std::ranges::crbegin), ns::StructWithGlobalFunctions&>);
+static_assert(!std::is_invocable_v<decltype(std::ranges::rend), ns::StructWithGlobalFunctions&>);
+static_assert(!std::is_invocable_v<decltype(std::ranges::crend), ns::StructWithGlobalFunctions&>);
+static_assert(!std::is_invocable_v<decltype(std::ranges::size), ns::StructWithGlobalFunctions&>);
diff --git a/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.access/get_tzdb.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.access/get_tzdb.pass.cpp
index 335e8d20c23910..b6204c615d965e 100644
--- a/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.access/get_tzdb.pass.cpp
+++ b/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.access/get_tzdb.pass.cpp
@@ -12,9 +12,6 @@
 // XFAIL: libcpp-has-no-incomplete-tzdb
 // XFAIL: availability-tzdb-missing
 
-// TODO TZDB (#81654) Enable tests
-// UNSUPPORTED: c++20, c++23, c++26
-
 // <chrono>
 
 // const tzdb& get_tzdb();
@@ -30,8 +27,6 @@ int main(int, const char**) {
 
   assert(!db.version.empty());
 
-  LIBCPP_ASSERT(!db.__rules.empty());
-
   assert(!db.zones.empty());
   assert(std::ranges::is_sorted(db.zones));
   assert(std::ranges::adjacent_find(db.zones) == db.zones.end()); // is unique?
diff --git a/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.access/get_tzdb_list.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.access/get_tzdb_list.pass.cpp
index 34af9b576361fa..a5579a3820b6ac 100644
--- a/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.access/get_tzdb_list.pass.cpp
+++ b/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.access/get_tzdb_list.pass.cpp
@@ -12,9 +12,6 @@
 // XFAIL: libcpp-has-no-incomplete-tzdb
 // XFAIL: availability-tzdb-missing
 
-// TODO TZDB (#81654) Enable tests
-// UNSUPPORTED: c++20, c++23, c++26
-
 // <chrono>
 
 // const tzdb& get_tzdb_list();
diff --git a/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.list/front.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.list/front.pass.cpp
index ac5fee8183b9bc..12c5310772f6a9 100644
--- a/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.list/front.pass.cpp
+++ b/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.list/front.pass.cpp
@@ -12,9 +12,6 @@
 // XFAIL: libcpp-has-no-incomplete-tzdb
 // XFAIL: availability-tzdb-missing
 
-// TODO TZDB (#81654) Enable tests
-// UNSUPPORTED: c++20, c++23, c++26
-
 // <chrono>
 //
 // class tzdb_list;
diff --git a/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.list/iterators.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.list/iterators.pass.cpp
index 8bd9b321b2a0fb..b00b8b44188d0b 100644
--- a/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.list/iterators.pass.cpp
+++ b/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.list/iterators.pass.cpp
@@ -12,9 +12,6 @@
 // XFAIL: libcpp-has-no-incomplete-tzdb
 // XFAIL: availability-tzdb-missing
 
-// TODO TZDB (#81654) Enable tests
-// UNSUPPORTED: c++20, c++23, c++26
-
 // <chrono>
 //
 // class tzdb_list;
diff --git a/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.remote/reload_tzdb.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.remote/reload_tzdb.pass.cpp
index bbf9002c0430ca..af38772ee3cb28 100644
--- a/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.remote/reload_tzdb.pass.cpp
+++ b/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.remote/reload_tzdb.pass.cpp
@@ -12,9 +12,6 @@
 // XFAIL: libcpp-has-no-incomplete-tzdb
 // XFAIL: availability-tzdb-missing
 
-// TODO TZDB (#81654) Enable tests
-// UNSUPPORTED: c++20, c++23, c++26
-
 // <chrono>
 
 // Note there is no Standard way to change the remote database used.
diff --git a/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.remote/remote_version.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.remote/remote_version.pass.cpp
index 861075cd82aa60..36b68cefc8d31a 100644
--- a/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.remote/remote_version.pass.cpp
+++ b/libcxx/test/std/time/time.zone/time.zone.db/time.zone.db.remote/remote_version.pass.cpp
@@ -12,9 +12,6 @@
 // XFAIL: libcpp-has-no-incomplete-tzdb
 // XFAIL: availability-tzdb-missing
 
-// TODO TZDB (#81654) Enable tests
-// UNSUPPORTED: c++20, c++23, c++26
-
 // <chrono>
 
 // const string remote_version();
diff --git a/libcxx/test/std/time/time.zone/time.zone.link/time.zone.link.members/name.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.link/time.zone.link.members/name.pass.cpp
index 95d86d586666ea..c2412bac461bc5 100644
--- a/libcxx/test/std/time/time.zone/time.zone.link/time.zone.link.members/name.pass.cpp
+++ b/libcxx/test/std/time/time.zone/time.zone.link/time.zone.link.members/name.pass.cpp
@@ -12,9 +12,6 @@
 // XFAIL: libcpp-has-no-incomplete-tzdb
 // XFAIL: availability-tzdb-missing
 
-// TODO TZDB (#81654) Enable tests
-// UNSUPPORTED: c++20, c++23, c++26
-
 // <chrono>
 
 // class time_zone_link;
diff --git a/libcxx/test/std/time/time.zone/time.zone.link/time.zone.link.members/target.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.link/time.zone.link.members/target.pass.cpp
index 305fbd21f625b5..2f8b5b9421d63c 100644
--- a/libcxx/test/std/time/time.zone/time.zone.link/time.zone.link.members/target.pass.cpp
+++ b/libcxx/test/std/time/time.zone/time.zone.link/time.zone.link.members/target.pass.cpp
@@ -12,9 +12,6 @@
 // XFAIL: libcpp-has-no-incomplete-tzdb
 // XFAIL: availability-tzdb-missing
 
-// TODO TZDB (#81654) Enable tests
-// UNSUPPORTED: c++20, c++23, c++26
-
 // <chrono>
 
 // class time_zone_link;
diff --git a/libcxx/test/std/time/time.zone/time.zone.link/time.zone.link.nonmembers/comparison.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.link/time.zone.link.nonmembers/comparison.pass.cpp
index e375d7e443ce49..944818c1ad0c16 100644
--- a/libcxx/test/std/time/time.zone/time.zone.link/time.zone.link.nonmembers/comparison.pass.cpp
+++ b/libcxx/test/std/time/time.zone/time.zone.link/time.zone.link.nonmembers/comparison.pass.cpp
@@ -12,9 +12,6 @@
 // XFAIL: libcpp-has-no-incomplete-tzdb
 // XFAIL: availability-tzdb-missing
 
-// TODO TZDB (#81654) Enable tests
-// UNSUPPORTED: c++20, c++23, c++26
-
 // <chrono>
 
 // bool operator==(const time_zone_link& x, const time_zone_link& y) noexcept;
diff --git a/libcxx/test/std/time/time.zone/time.zone.timezone/time.zone.members/name.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.timezone/time.zone.members/name.pass.cpp
index 2bbe714b71a683..d1ff2fe683108c 100644
--- a/libcxx/test/std/time/time.zone/time.zone.timezone/time.zone.members/name.pass.cpp
+++ b/libcxx/test/std/time/time.zone/time.zone.timezone/time.zone.members/name.pass.cpp
@@ -12,9 +12,6 @@
 // XFAIL: libcpp-has-no-incomplete-tzdb
 // XFAIL: availability-tzdb-missing
 
-// TODO TZDB (#81654) Enable tests
-// UNSUPPORTED: c++20, c++23, c++26
-
 // <chrono>
 
 // class time_zone;
diff --git a/libcxx/test/std/time/time.zone/time.zone.timezone/time.zone.nonmembers/comparison.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.timezone/time.zone.nonmembers/comparison.pass.cpp
index 9eae91e80a42d9..7c680707bc518b 100644
--- a/libcxx/test/std/time/time.zone/time.zone.timezone/time.zone.nonmembers/comparison.pass.cpp
+++ b/libcxx/test/std/time/time.zone/time.zone.timezone/time.zone.nonmembers/comparison.pass.cpp
@@ -12,9 +12,6 @@
 // XFAIL: libcpp-has-no-incomplete-tzdb
 // XFAIL: availability-tzdb-missing
 
-// TODO TZDB (#81654) Enable tests
-// UNSUPPORTED: c++20, c++23, c++26
-
 // <chrono>
 
 // bool operator==(const time_zone& x, const time_zone& y) noexcept;
diff --git a/libcxx/test/std/utilities/format/format.functions/bug_81590.compile.pass.cpp b/libcxx/test/std/utilities/format/format.functions/bug_81590.compile.pass.cpp
new file mode 100644
index 00000000000000..127f6546ccbe71
--- /dev/null
+++ b/libcxx/test/std/utilities/format/format.functions/bug_81590.compile.pass.cpp
@@ -0,0 +1,36 @@
+//===----------------------------------------------------------------------===//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+
+// XFAIL: availability-fp_to_chars-missing
+
+// The sample code is based on the bug report
+// https://github.com/llvm/llvm-project/issues/81590
+//
+// Tests whether this formatter does not fail to compile due to nested concept
+// evaluation.
+
+#include <format>
+#include <variant>
+
+struct X : std::variant<X*> {
+  X* p = nullptr;
+  constexpr const std::variant<X*>& decay() const noexcept { return *this; }
+};
+
+template <>
+struct std::formatter<X, char> : std::formatter<std::string, char> {
+  static constexpr auto format(const X& x, auto ctx) {
+    if (!x.p)
+      return ctx.out();
+    auto m = [&](const X* t) { return std::format_to(ctx.out(), "{}", *t); };
+    return std::visit(m, x.decay());
+  }
+};
+
+void bug_81590() { (void)std::format("{}", X{}); }
diff --git a/libcxx/utils/ci/docker-compose.yml b/libcxx/utils/ci/docker-compose.yml
index 5cefcb28bd860c..af9a48481e8b67 100644
--- a/libcxx/utils/ci/docker-compose.yml
+++ b/libcxx/utils/ci/docker-compose.yml
@@ -2,7 +2,7 @@ version: '3.7'
 
 x-versions: &compiler_versions
   GCC_LATEST_VERSION: 13
-  LLVM_HEAD_VERSION: 18
+  LLVM_HEAD_VERSION: 19
 
 services:
   buildkite-builder:
diff --git a/libcxx/utils/ci/run-buildbot-container b/libcxx/utils/ci/run-buildbot-container
index 7c00b88097f17b..74e7dab81d734d 100755
--- a/libcxx/utils/ci/run-buildbot-container
+++ b/libcxx/utils/ci/run-buildbot-container
@@ -26,6 +26,6 @@ if [[ ! -d "${MONOREPO_ROOT}/libcxx/utils/ci" ]]; then
     echo "Was unable to find the root of the LLVM monorepo; are you running from within the monorepo?"
     exit 1
 fi
-docker pull ghcr.io/libcxx/libcxx-builder
-docker run -it --volume "${MONOREPO_ROOT}:/llvm" --workdir "/llvm" --cap-add=SYS_PTRACE ghcr.io/libcxx/libcxx-builder \
+docker pull ghcr.io/libcxx/actions-builder
+docker run -it --volume "${MONOREPO_ROOT}:/llvm" --workdir "/llvm" --cap-add=SYS_PTRACE ghcr.io/libcxx/actions-builder \
     bash -c 'git config --global --add safe.directory /llvm ; exec bash'
diff --git a/libcxx/utils/libcxx/test/features.py b/libcxx/utils/libcxx/test/features.py
index 872bff372b3dba..307d35349f3a46 100644
--- a/libcxx/utils/libcxx/test/features.py
+++ b/libcxx/utils/libcxx/test/features.py
@@ -421,6 +421,56 @@ def _getAndroidDeviceApi(cfg):
           """,
         ),
     ),
+    Feature(
+        name="can-create-symlinks",
+        when=lambda cfg: "_WIN32" not in compilerMacros(cfg)
+        or programSucceeds(
+            cfg,
+            # Creation of symlinks require elevated privileges on Windows unless
+            # Windows developer mode is enabled.
+            """
+            #include <stdio.h>
+            #include <windows.h>
+            int main() {
+              CHAR tempDirPath[MAX_PATH];
+              DWORD tempPathRet = GetTempPathA(MAX_PATH, tempDirPath);
+              if (tempPathRet == 0 || tempPathRet > MAX_PATH) {
+                return 1;
+              }
+
+              CHAR tempFilePath[MAX_PATH];
+              UINT uRetVal = GetTempFileNameA(
+                tempDirPath,
+                "cxx", // Prefix
+                0, // Unique=0 also implies file creation.
+                tempFilePath);
+              if (uRetVal == 0) {
+                return 1;
+              }
+
+              CHAR symlinkFilePath[MAX_PATH];
+              int ret = sprintf_s(symlinkFilePath, MAX_PATH, "%s_symlink", tempFilePath);
+              if (ret == -1) {
+                DeleteFileA(tempFilePath);
+                return 1;
+              }
+
+              // Requires either administrator, or developer mode enabled.
+              BOOL bCreatedSymlink = CreateSymbolicLinkA(symlinkFilePath,
+                tempFilePath,
+                SYMBOLIC_LINK_FLAG_ALLOW_UNPRIVILEGED_CREATE);
+              if (!bCreatedSymlink) {
+                DeleteFileA(tempFilePath);
+                return 1;
+              }
+
+              DeleteFileA(tempFilePath);
+              DeleteFileA(symlinkFilePath);
+              return 0;
+            }
+            """,
+        ),
+    ),
 ]
 
 # Add features representing the build host platform name.
diff --git a/lld/ELF/Arch/PPC64.cpp b/lld/ELF/Arch/PPC64.cpp
index 657332deebfde1..3188772f7c4904 100644
--- a/lld/ELF/Arch/PPC64.cpp
+++ b/lld/ELF/Arch/PPC64.cpp
@@ -286,7 +286,7 @@ static void writeSequence(MutableArrayRef<uint32_t> buf, const char *prefix,
   // The full section content has the extent of [begin, end). We drop unused
   // instructions and write [first,end).
   auto *sec = make<InputSection>(
-      nullptr, SHF_ALLOC, SHT_PROGBITS, 4,
+      ctx.internalFile, SHF_ALLOC, SHT_PROGBITS, 4,
       ArrayRef(reinterpret_cast<uint8_t *>(buf.data() + first),
                4 * (buf.size() - first)),
       ".text");
diff --git a/lld/ELF/Config.h b/lld/ELF/Config.h
index 9ae01eb90fa4a3..bb3608da80b21f 100644
--- a/lld/ELF/Config.h
+++ b/lld/ELF/Config.h
@@ -126,7 +126,7 @@ class LinkerDriver {
 private:
   void createFiles(llvm::opt::InputArgList &args);
   void inferMachineType();
-  void link(llvm::opt::InputArgList &args);
+  template <class ELFT> void link(llvm::opt::InputArgList &args);
   template <class ELFT> void compileBitcodeFiles(bool skipLinkedOutput);
   bool tryAddFatLTOFile(MemoryBufferRef mb, StringRef archiveName,
                         uint64_t offsetInArchive, bool lazy);
@@ -273,6 +273,7 @@ struct Config {
   bool printGcSections;
   bool printIcfSections;
   bool printMemoryUsage;
+  bool rejectMismatch;
   bool relax;
   bool relaxGP;
   bool relocatable;
diff --git a/lld/ELF/Driver.cpp b/lld/ELF/Driver.cpp
index 2439d141fb6643..7257ebd0fac994 100644
--- a/lld/ELF/Driver.cpp
+++ b/lld/ELF/Driver.cpp
@@ -650,7 +650,7 @@ void LinkerDriver::linkerMain(ArrayRef<const char *> argsArr) {
     if (errorCount())
       return;
 
-    link(args);
+    invokeELFT(link, args);
   }
 
   if (config->timeTraceEnabled) {
@@ -1348,6 +1348,7 @@ static void readConfigs(opt::InputArgList &args) {
   config->printArchiveStats = args.getLastArgValue(OPT_print_archive_stats);
   config->printSymbolOrder =
       args.getLastArgValue(OPT_print_symbol_order);
+  config->rejectMismatch = !args.hasArg(OPT_no_warn_mismatch);
   config->relax = args.hasFlag(OPT_relax, OPT_no_relax, true);
   config->relaxGP = args.hasFlag(OPT_relax_gp, OPT_no_relax_gp, false);
   config->rpath = getRpath(args);
@@ -1799,6 +1800,37 @@ static void setConfigs(opt::InputArgList &args) {
       args.hasFlag(OPT_toc_optimize, OPT_no_toc_optimize, m == EM_PPC64);
   config->pcRelOptimize =
       args.hasFlag(OPT_pcrel_optimize, OPT_no_pcrel_optimize, m == EM_PPC64);
+
+  if (!args.hasArg(OPT_hash_style)) {
+    if (config->emachine == EM_MIPS)
+      config->sysvHash = true;
+    else
+      config->sysvHash = config->gnuHash = true;
+  }
+
+  // Set default entry point and output file if not specified by command line or
+  // linker scripts.
+  config->warnMissingEntry =
+      (!config->entry.empty() || (!config->shared && !config->relocatable));
+  if (config->entry.empty() && !config->relocatable)
+    config->entry = config->emachine == EM_MIPS ? "__start" : "_start";
+  if (config->outputFile.empty())
+    config->outputFile = "a.out";
+
+  // Fail early if the output file or map file is not writable. If a user has a
+  // long link, e.g. due to a large LTO link, they do not wish to run it and
+  // find that it failed because there was a mistake in their command-line.
+  {
+    llvm::TimeTraceScope timeScope("Create output files");
+    if (auto e = tryCreateFile(config->outputFile))
+      error("cannot open output file " + config->outputFile + ": " +
+            e.message());
+    if (auto e = tryCreateFile(config->mapFile))
+      error("cannot open map file " + config->mapFile + ": " + e.message());
+    if (auto e = tryCreateFile(config->whyExtract))
+      error("cannot open --why-extract= file " + config->whyExtract + ": " +
+            e.message());
+  }
 }
 
 static bool isFormatBinary(StringRef s) {
@@ -2679,45 +2711,8 @@ static void postParseObjectFile(ELFFileBase *file) {
 
 // Do actual linking. Note that when this function is called,
 // all linker scripts have already been parsed.
-void LinkerDriver::link(opt::InputArgList &args) {
+template <class ELFT> void LinkerDriver::link(opt::InputArgList &args) {
   llvm::TimeTraceScope timeScope("Link", StringRef("LinkerDriver::Link"));
-  // If a --hash-style option was not given, set to a default value,
-  // which varies depending on the target.
-  if (!args.hasArg(OPT_hash_style)) {
-    if (config->emachine == EM_MIPS)
-      config->sysvHash = true;
-    else
-      config->sysvHash = config->gnuHash = true;
-  }
-
-  // Default output filename is "a.out" by the Unix tradition.
-  if (config->outputFile.empty())
-    config->outputFile = "a.out";
-
-  // Fail early if the output file or map file is not writable. If a user has a
-  // long link, e.g. due to a large LTO link, they do not wish to run it and
-  // find that it failed because there was a mistake in their command-line.
-  {
-    llvm::TimeTraceScope timeScope("Create output files");
-    if (auto e = tryCreateFile(config->outputFile))
-      error("cannot open output file " + config->outputFile + ": " +
-            e.message());
-    if (auto e = tryCreateFile(config->mapFile))
-      error("cannot open map file " + config->mapFile + ": " + e.message());
-    if (auto e = tryCreateFile(config->whyExtract))
-      error("cannot open --why-extract= file " + config->whyExtract + ": " +
-            e.message());
-  }
-  if (errorCount())
-    return;
-
-  // Use default entry point name if no name was given via the command
-  // line nor linker scripts. For some reason, MIPS entry point name is
-  // different from others.
-  config->warnMissingEntry =
-      (!config->entry.empty() || (!config->shared && !config->relocatable));
-  if (config->entry.empty() && !config->relocatable)
-    config->entry = (config->emachine == EM_MIPS) ? "__start" : "_start";
 
   // Handle --trace-symbol.
   for (auto *arg : args.filtered(OPT_trace_symbol))
@@ -2738,7 +2733,7 @@ void LinkerDriver::link(opt::InputArgList &args) {
     llvm::TimeTraceScope timeScope("Parse input files");
     for (size_t i = 0; i < files.size(); ++i) {
       llvm::TimeTraceScope timeScope("Parse input files", files[i]->getName());
-      parseFile(files[i]);
+      doParseFile<ELFT>(files[i]);
     }
     if (armCmseImpLib)
       parseArmCMSEImportLib(*armCmseImpLib);
@@ -2872,7 +2867,7 @@ void LinkerDriver::link(opt::InputArgList &args) {
 
   // Handle --lto-validate-all-vtables-have-type-infos.
   if (config->ltoValidateAllVtablesHaveTypeInfos)
-    invokeELFT(ltoValidateAllVtablesHaveTypeInfos, args);
+    ltoValidateAllVtablesHaveTypeInfos<ELFT>(args);
 
   // Do link-time optimization if given files are LLVM bitcode files.
   // This compiles bitcode files into real object files.
@@ -2880,7 +2875,7 @@ void LinkerDriver::link(opt::InputArgList &args) {
   // With this the symbol table should be complete. After this, no new names
   // except a few linker-synthesized ones will be added to the symbol table.
   const size_t numObjsBeforeLTO = ctx.objectFiles.size();
-  invokeELFT(compileBitcodeFiles, skipLinkedOutput);
+  compileBitcodeFiles<ELFT>(skipLinkedOutput);
 
   // Symbol resolution finished. Report backward reference problems,
   // --print-archive-stats=, and --why-extract=.
@@ -2945,7 +2940,7 @@ void LinkerDriver::link(opt::InputArgList &args) {
       llvm::erase_if(ctx.inputSections, [](InputSectionBase *s) {
         if (s->type != SHT_LLVM_SYMPART)
           return false;
-        invokeELFT(readSymbolPartitionSection, s);
+        readSymbolPartitionSection<ELFT>(s);
         return true;
       });
     }
@@ -3003,10 +2998,10 @@ void LinkerDriver::link(opt::InputArgList &args) {
     ctx.inputSections.push_back(createCommentSection());
 
   // Split SHF_MERGE and .eh_frame sections into pieces in preparation for garbage collection.
-  invokeELFT(splitSections,);
+  splitSections<ELFT>();
 
   // Garbage collection and removal of shared symbols from unused shared objects.
-  invokeELFT(markLive,);
+  markLive<ELFT>();
 
   // Make copies of any input sections that need to be copied into each
   // partition.
@@ -3019,7 +3014,7 @@ void LinkerDriver::link(opt::InputArgList &args) {
 
   // Create synthesized sections such as .got and .plt. This is called before
   // processSectionCommands() so that they can be placed by SECTIONS commands.
-  invokeELFT(createSyntheticSections,);
+  createSyntheticSections<ELFT>();
 
   // Some input sections that are used for exception handling need to be moved
   // into synthetic sections. Do that now so that they aren't assigned to
@@ -3059,8 +3054,8 @@ void LinkerDriver::link(opt::InputArgList &args) {
   // Two input sections with different output sections should not be folded.
   // ICF runs after processSectionCommands() so that we know the output sections.
   if (config->icf != ICFLevel::None) {
-    invokeELFT(findKeepUniqueSections, args);
-    invokeELFT(doIcf,);
+    findKeepUniqueSections<ELFT>(args);
+    doIcf<ELFT>();
   }
 
   // Read the callgraph now that we know what was gced or icfed
@@ -3068,9 +3063,9 @@ void LinkerDriver::link(opt::InputArgList &args) {
     if (auto *arg = args.getLastArg(OPT_call_graph_ordering_file))
       if (std::optional<MemoryBufferRef> buffer = readFile(arg->getValue()))
         readCallGraph(*buffer);
-    invokeELFT(readCallGraphsFromObjectFiles,);
+    readCallGraphsFromObjectFiles<ELFT>();
   }
 
   // Write the result to the file.
-  invokeELFT(writeResult,);
+  writeResult<ELFT>();
 }
diff --git a/lld/ELF/InputFiles.cpp b/lld/ELF/InputFiles.cpp
index 00aebb47640e84..4a6e691938cf46 100644
--- a/lld/ELF/InputFiles.cpp
+++ b/lld/ELF/InputFiles.cpp
@@ -288,7 +288,7 @@ static bool isCompatible(InputFile *file) {
   return false;
 }
 
-template <class ELFT> static void doParseFile(InputFile *file) {
+template <class ELFT> void elf::doParseFile(InputFile *file) {
   if (!isCompatible(file))
     return;
 
@@ -741,6 +741,15 @@ template <class ELFT> void ObjFile<ELFT>::initializeJustSymbols() {
   sections.resize(numELFShdrs);
 }
 
+static bool isKnownSpecificSectionType(uint32_t t, uint32_t flags) {
+  if (SHT_LOUSER <= t && t <= SHT_HIUSER && !(flags & SHF_ALLOC))
+    return true;
+  if (SHT_LOOS <= t && t <= SHT_HIOS && !(flags & SHF_OS_NONCONFORMING))
+    return true;
+  // Allow all processor-specific types. This is different from GNU ld.
+  return SHT_LOPROC <= t && t <= SHT_HIPROC;
+}
+
 template <class ELFT>
 void ObjFile<ELFT>::initializeSections(bool ignoreComdats,
                                        const llvm::object::ELFFile<ELFT> &obj) {
@@ -752,14 +761,15 @@ void ObjFile<ELFT>::initializeSections(bool ignoreComdats,
     if (this->sections[i] == &InputSection::discarded)
       continue;
     const Elf_Shdr &sec = objSections[i];
+    const uint32_t type = sec.sh_type;
 
     // SHF_EXCLUDE'ed sections are discarded by the linker. However,
     // if -r is given, we'll let the final link discard such sections.
     // This is compatible with GNU.
     if ((sec.sh_flags & SHF_EXCLUDE) && !config->relocatable) {
-      if (sec.sh_type == SHT_LLVM_CALL_GRAPH_PROFILE)
+      if (type == SHT_LLVM_CALL_GRAPH_PROFILE)
         cgProfileSectionIndex = i;
-      if (sec.sh_type == SHT_LLVM_ADDRSIG) {
+      if (type == SHT_LLVM_ADDRSIG) {
         // We ignore the address-significance table if we know that the object
         // file was created by objcopy or ld -r. This is because these tools
         // will reorder the symbols in the symbol table, invalidating the data
@@ -778,7 +788,7 @@ void ObjFile<ELFT>::initializeSections(bool ignoreComdats,
       continue;
     }
 
-    switch (sec.sh_type) {
+    switch (type) {
     case SHT_GROUP: {
       if (!config->relocatable)
         sections[i] = &InputSection::discarded;
@@ -801,12 +811,25 @@ void ObjFile<ELFT>::initializeSections(bool ignoreComdats,
     case SHT_RELA:
     case SHT_NULL:
       break;
-    case SHT_LLVM_SYMPART:
-      ctx.hasSympart.store(true, std::memory_order_relaxed);
-      [[fallthrough]];
+    case SHT_PROGBITS:
+    case SHT_NOTE:
+    case SHT_NOBITS:
+    case SHT_INIT_ARRAY:
+    case SHT_FINI_ARRAY:
+    case SHT_PREINIT_ARRAY:
+      this->sections[i] =
+          createInputSection(i, sec, check(obj.getSectionName(sec, shstrtab)));
+      break;
     default:
       this->sections[i] =
           createInputSection(i, sec, check(obj.getSectionName(sec, shstrtab)));
+      if (type == SHT_LLVM_SYMPART)
+        ctx.hasSympart.store(true, std::memory_order_relaxed);
+      else if (config->rejectMismatch &&
+               !isKnownSpecificSectionType(type, sec.sh_flags))
+        errorOrWarn(toString(this->sections[i]) + ": unknown section type 0x" +
+                    Twine::utohexstr(type));
+      break;
     }
   }
 
diff --git a/lld/ELF/InputFiles.h b/lld/ELF/InputFiles.h
index 54de842a81cf35..3beb5a3cb9a894 100644
--- a/lld/ELF/InputFiles.h
+++ b/lld/ELF/InputFiles.h
@@ -46,6 +46,7 @@ extern std::unique_ptr<llvm::TarWriter> tar;
 std::optional<MemoryBufferRef> readFile(StringRef path);
 
 // Add symbols in File to the symbol table.
+template <class ELFT> void doParseFile(InputFile *file);
 void parseFile(InputFile *file);
 
 void parseArmCMSEImportLib(InputFile *file);
diff --git a/lld/ELF/InputSection.cpp b/lld/ELF/InputSection.cpp
index 7508a1336c91a4..082e840adde4ab 100644
--- a/lld/ELF/InputSection.cpp
+++ b/lld/ELF/InputSection.cpp
@@ -316,7 +316,9 @@ InputSection::InputSection(InputFile *f, uint64_t flags, uint32_t type,
                            StringRef name, Kind k)
     : InputSectionBase(f, flags, type,
                        /*Entsize*/ 0, /*Link*/ 0, /*Info*/ 0, addralign, data,
-                       name, k) {}
+                       name, k) {
+  assert(f || this == &InputSection::discarded);
+}
 
 template <class ELFT>
 InputSection::InputSection(ObjFile<ELFT> &f, const typename ELFT::Shdr &header,
@@ -346,7 +348,7 @@ template <class ELFT> void InputSection::copyShtGroup(uint8_t *buf) {
 }
 
 InputSectionBase *InputSection::getRelocatedSection() const {
-  if (!file || file->isInternal() || (type != SHT_RELA && type != SHT_REL))
+  if (file->isInternal() || (type != SHT_RELA && type != SHT_REL))
     return nullptr;
   ArrayRef<InputSectionBase *> sections = file->getSections();
   return sections[info];
@@ -408,7 +410,7 @@ void InputSection::copyRelocations(uint8_t *buf,
     // Output section VA is zero for -r, so r_offset is an offset within the
     // section, but for --emit-relocs it is a virtual address.
     p->r_offset = sec->getVA(rel.offset);
-    p->setSymbolAndType(in.symTab->getSymbolIndex(&sym), type,
+    p->setSymbolAndType(in.symTab->getSymbolIndex(sym), type,
                         config->isMips64EL);
 
     if (sym.type == STT_SECTION) {
diff --git a/lld/ELF/Options.td b/lld/ELF/Options.td
index 3819b86238ea64..c5e95d0d25c1ae 100644
--- a/lld/ELF/Options.td
+++ b/lld/ELF/Options.td
@@ -312,6 +312,9 @@ def no_dynamic_linker: F<"no-dynamic-linker">,
 def noinhibit_exec: F<"noinhibit-exec">,
   HelpText<"Retain the executable output file whenever it is still usable">;
 
+def no_warn_mismatch: F<"no-warn-mismatch">,
+  HelpText<"Suppress errors for certain unknown seciton types">;
+
 def no_nmagic: F<"no-nmagic">, MetaVarName<"<magic>">,
   HelpText<"Page align sections (default)">;
 
@@ -753,7 +756,6 @@ def: FF<"no-add-needed">;
 def: F<"no-copy-dt-needed-entries">;
 def: F<"no-ctors-in-init-array">;
 def: F<"no-keep-memory">;
-def: F<"no-warn-mismatch">;
 def: Separate<["--", "-"], "rpath-link">;
 def: J<"rpath-link=">;
 def: F<"secure-plt">;
diff --git a/lld/ELF/OutputSections.cpp b/lld/ELF/OutputSections.cpp
index 55e6a14f103e02..f986aa5f675707 100644
--- a/lld/ELF/OutputSections.cpp
+++ b/lld/ELF/OutputSections.cpp
@@ -583,7 +583,7 @@ static void finalizeShtGroup(OutputSection *os, InputSection *section) {
   // sh_info then contain index of an entry in symbol table section which
   // provides signature of the section group.
   ArrayRef<Symbol *> symbols = section->file->getSymbols();
-  os->info = in.symTab->getSymbolIndex(symbols[section->info]);
+  os->info = in.symTab->getSymbolIndex(*symbols[section->info]);
 
   // Some group members may be combined or discarded, so we need to compute the
   // new size. The content will be rewritten in InputSection::copyShtGroup.
diff --git a/lld/ELF/SyntheticSections.cpp b/lld/ELF/SyntheticSections.cpp
index 206fb0f5376664..248ff6b4a865a3 100644
--- a/lld/ELF/SyntheticSections.cpp
+++ b/lld/ELF/SyntheticSections.cpp
@@ -1600,7 +1600,7 @@ uint32_t DynamicReloc::getSymIndex(SymbolTableBaseSection *symTab) const {
   if (!needsDynSymIndex())
     return 0;
 
-  size_t index = symTab->getSymbolIndex(sym);
+  size_t index = symTab->getSymbolIndex(*sym);
   assert((index != 0 || (type != target->gotRel && type != target->pltRel) ||
           !mainPart->dynSymTab->getParent()) &&
          "GOT or PLT relocation must refer to symbol in dynamic symbol table");
@@ -2172,9 +2172,9 @@ void SymbolTableBaseSection::addSymbol(Symbol *b) {
   symbols.push_back({b, strTabSec.addString(b->getName(), false)});
 }
 
-size_t SymbolTableBaseSection::getSymbolIndex(Symbol *sym) {
+size_t SymbolTableBaseSection::getSymbolIndex(const Symbol &sym) {
   if (this == mainPart->dynSymTab.get())
-    return sym->dynsymIndex;
+    return sym.dynsymIndex;
 
   // Initializes symbol lookup tables lazily. This is used only for -r,
   // --emit-relocs and dynsyms in partitions other than the main one.
@@ -2191,9 +2191,9 @@ size_t SymbolTableBaseSection::getSymbolIndex(Symbol *sym) {
 
   // Section symbols are mapped based on their output sections
   // to maintain their semantics.
-  if (sym->type == STT_SECTION)
-    return sectionIndexMap.lookup(sym->getOutputSection());
-  return symbolIndexMap.lookup(sym);
+  if (sym.type == STT_SECTION)
+    return sectionIndexMap.lookup(sym.getOutputSection());
+  return symbolIndexMap.lookup(&sym);
 }
 
 template <class ELFT>
@@ -2427,7 +2427,7 @@ void GnuHashTableSection::writeTo(uint8_t *buf) {
     // Write a hash bucket. Hash buckets contain indices in the following hash
     // value table.
     write32(buckets + i->bucketIdx,
-            getPartition().dynSymTab->getSymbolIndex(i->sym));
+            getPartition().dynSymTab->getSymbolIndex(*i->sym));
     oldBucket = i->bucketIdx;
   }
 }
diff --git a/lld/ELF/SyntheticSections.h b/lld/ELF/SyntheticSections.h
index 7882ad87c241d7..b41e6941005412 100644
--- a/lld/ELF/SyntheticSections.h
+++ b/lld/ELF/SyntheticSections.h
@@ -641,7 +641,7 @@ class SymbolTableBaseSection : public SyntheticSection {
   size_t getSize() const override { return getNumSymbols() * entsize; }
   void addSymbol(Symbol *sym);
   unsigned getNumSymbols() const { return symbols.size() + 1; }
-  size_t getSymbolIndex(Symbol *sym);
+  size_t getSymbolIndex(const Symbol &sym);
   ArrayRef<SymbolTableEntry> getSymbols() const { return symbols; }
 
 protected:
diff --git a/lld/ELF/Writer.cpp b/lld/ELF/Writer.cpp
index a9292b3b1a2241..d8782affe879ba 100644
--- a/lld/ELF/Writer.cpp
+++ b/lld/ELF/Writer.cpp
@@ -261,6 +261,9 @@ static void demoteDefined(Defined &sym, DenseMap<SectionBase *, size_t> &map) {
   Undefined(sym.file, sym.getName(), binding, sym.stOther, sym.type,
             /*discardedSecIdx=*/map.lookup(sym.section))
       .overwrite(sym);
+  // Eliminate from the symbol table, otherwise we would leave an undefined
+  // symbol if the symbol is unreferenced in the absence of GC.
+  sym.isUsedInRegularObj = false;
 }
 
 // If all references to a DSO happen to be weak, the DSO is not added to
diff --git a/lld/MachO/Driver.cpp b/lld/MachO/Driver.cpp
index 9edb6b9c60a1fe..36248925d65ad2 100644
--- a/lld/MachO/Driver.cpp
+++ b/lld/MachO/Driver.cpp
@@ -1437,6 +1437,8 @@ bool link(ArrayRef<const char *> argsArr, llvm::raw_ostream &stdoutOS,
     resetOutputSegments();
     resetWriter();
     InputFile::resetIdCount();
+
+    objc::doCleanup();
   };
 
   ctx->e.logName = args::getFilenameWithoutExe(argsArr[0]);
@@ -1979,9 +1981,16 @@ bool link(ArrayRef<const char *> argsArr, llvm::raw_ostream &stdoutOS,
     if (config->deadStrip)
       markLive();
 
+    // Categories are not subject to dead-strip. The __objc_catlist section is
+    // marked as NO_DEAD_STRIP and that propagates into all category data.
     if (args.hasArg(OPT_check_category_conflicts))
       objc::checkCategories();
 
+    // Category merging uses "->live = false" to erase old category data, so
+    // it has to run after dead-stripping (markLive).
+    if (args.hasArg(OPT_objc_category_merging, OPT_no_objc_category_merging))
+      objc::mergeCategories();
+
     // ICF assumes that all literals have been folded already, so we must run
     // foldIdenticalLiterals before foldIdenticalSections.
     foldIdenticalLiterals();
diff --git a/lld/MachO/InputSection.h b/lld/MachO/InputSection.h
index becb01017d633a..b25f0638f4c6cb 100644
--- a/lld/MachO/InputSection.h
+++ b/lld/MachO/InputSection.h
@@ -93,9 +93,9 @@ class InputSection {
   // .subsections_via_symbols, there is typically only one element here.
   llvm::TinyPtrVector<Defined *> symbols;
 
-protected:
   const Section &section;
 
+protected:
   const Defined *getContainingSymbol(uint64_t off) const;
 };
 
diff --git a/lld/MachO/ObjC.cpp b/lld/MachO/ObjC.cpp
index 67254ec53a2145..40df2243b26f06 100644
--- a/lld/MachO/ObjC.cpp
+++ b/lld/MachO/ObjC.cpp
@@ -7,16 +7,19 @@
 //===----------------------------------------------------------------------===//
 
 #include "ObjC.h"
+#include "ConcatOutputSection.h"
 #include "InputFiles.h"
 #include "InputSection.h"
 #include "Layout.h"
 #include "OutputSegment.h"
+#include "SyntheticSections.h"
 #include "Target.h"
 
 #include "lld/Common/ErrorHandler.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/BinaryFormat/MachO.h"
 #include "llvm/Bitcode/BitcodeReader.h"
+#include "llvm/Support/TimeProfiler.h"
 
 using namespace llvm;
 using namespace llvm::MachO;
@@ -78,7 +81,8 @@ namespace {
   DO(Ptr, classMethods)                                                        \
   DO(Ptr, protocols)                                                           \
   DO(Ptr, instanceProps)                                                       \
-  DO(Ptr, classProps)
+  DO(Ptr, classProps)                                                          \
+  DO(uint32_t, size)
 
 CREATE_LAYOUT_CLASS(Category, FOR_EACH_CATEGORY_FIELD);
 
@@ -112,13 +116,19 @@ CREATE_LAYOUT_CLASS(ROClass, FOR_EACH_RO_CLASS_FIELD);
 #undef FOR_EACH_RO_CLASS_FIELD
 
 #define FOR_EACH_LIST_HEADER(DO)                                               \
-  DO(uint32_t, size)                                                           \
-  DO(uint32_t, count)
+  DO(uint32_t, structSize)                                                     \
+  DO(uint32_t, structCount)
 
 CREATE_LAYOUT_CLASS(ListHeader, FOR_EACH_LIST_HEADER);
 
 #undef FOR_EACH_LIST_HEADER
 
+#define FOR_EACH_PROTOCOL_LIST_HEADER(DO) DO(Ptr, protocolCount)
+
+CREATE_LAYOUT_CLASS(ProtocolListHeader, FOR_EACH_PROTOCOL_LIST_HEADER);
+
+#undef FOR_EACH_PROTOCOL_LIST_HEADER
+
 #define FOR_EACH_METHOD(DO)                                                    \
   DO(Ptr, name)                                                                \
   DO(Ptr, type)                                                                \
@@ -311,6 +321,8 @@ void ObjcCategoryChecker::parseClass(const Defined *classSym) {
 }
 
 void objc::checkCategories() {
+  TimeTraceScope timeScope("ObjcCategoryChecker");
+
   ObjcCategoryChecker checker;
   for (const InputSection *isec : inputSections) {
     if (isec->getName() == section_names::objcCatList)
@@ -320,3 +332,914 @@ void objc::checkCategories() {
       }
   }
 }
+
+namespace {
+
+class ObjcCategoryMerger {
+  // Information about an input category
+  struct InfoInputCategory {
+    ConcatInputSection *catListIsec;
+    ConcatInputSection *catBodyIsec;
+    uint32_t offCatListIsec = 0;
+
+    bool wasMerged = false;
+  };
+
+  // To write new (merged) categories or classes, we will try make limited
+  // assumptions about the alignment and the sections the various class/category
+  // info are stored in and . So we'll just reuse the same sections and
+  // alignment as already used in existing (input) categories. To do this we
+  // have InfoCategoryWriter which contains the various sections that the
+  // generated categories will be written to.
+  template <typename T> struct InfoWriteSection {
+    bool valid = false; // Data has been successfully collected from input
+    uint32_t align = 0;
+    Section *inputSection;
+    Reloc relocTemplate;
+    T *outputSection;
+  };
+
+  struct InfoCategoryWriter {
+    InfoWriteSection<ConcatOutputSection> catListInfo;
+    InfoWriteSection<ConcatOutputSection> catBodyInfo;
+    InfoWriteSection<CStringSection> catNameInfo;
+    InfoWriteSection<ConcatOutputSection> catPtrListInfo;
+  };
+
+  // Information about a pointer list in the original categories (method lists,
+  // protocol lists, etc)
+  struct PointerListInfo {
+    PointerListInfo(const char *_categoryPrefix, uint32_t _categoryOffset,
+                    uint32_t _pointersPerStruct)
+        : categoryPrefix(_categoryPrefix), categoryOffset(_categoryOffset),
+          pointersPerStruct(_pointersPerStruct) {}
+    const char *categoryPrefix;
+    uint32_t categoryOffset = 0;
+
+    uint32_t pointersPerStruct = 0;
+
+    uint32_t structSize = 0;
+    uint32_t structCount = 0;
+
+    std::vector<Symbol *> allPtrs;
+  };
+
+  // Full information about all the categories that extend a class. This will
+  // include all the additional methods, protocols, and properties that are
+  // contained in all the categories that extend a particular class.
+  struct ClassExtensionInfo {
+    ClassExtensionInfo(CategoryLayout &_catLayout) : catLayout(_catLayout){};
+
+    // Merged names of containers. Ex: base|firstCategory|secondCategory|...
+    std::string mergedContainerName;
+    std::string baseClassName;
+    Symbol *baseClass = nullptr;
+    CategoryLayout &catLayout;
+
+    // In case we generate new data, mark the new data as belonging to this file
+    ObjFile *objFileForMergeData = nullptr;
+
+    PointerListInfo instanceMethods = {
+        objc::symbol_names::categoryInstanceMethods,
+        /*_categoryOffset=*/catLayout.instanceMethodsOffset,
+        /*pointersPerStruct=*/3};
+    PointerListInfo classMethods = {
+        objc::symbol_names::categoryClassMethods,
+        /*_categoryOffset=*/catLayout.classMethodsOffset,
+        /*pointersPerStruct=*/3};
+    PointerListInfo protocols = {objc::symbol_names::categoryProtocols,
+                                 /*_categoryOffset=*/catLayout.protocolsOffset,
+                                 /*pointersPerStruct=*/0};
+    PointerListInfo instanceProps = {
+        objc::symbol_names::listProprieties,
+        /*_categoryOffset=*/catLayout.instancePropsOffset,
+        /*pointersPerStruct=*/2};
+    PointerListInfo classProps = {
+        objc::symbol_names::klassPropList,
+        /*_categoryOffset=*/catLayout.classPropsOffset,
+        /*pointersPerStruct=*/2};
+  };
+
+public:
+  ObjcCategoryMerger(std::vector<ConcatInputSection *> &_allInputSections);
+  void doMerge();
+  static void doCleanup();
+
+private:
+  void collectAndValidateCategoriesData();
+  void
+  mergeCategoriesIntoSingleCategory(std::vector<InfoInputCategory> &categories);
+
+  void eraseISec(ConcatInputSection *isec);
+  void eraseMergedCategories();
+
+  void generateCatListForNonErasedCategories(
+      std::map<ConcatInputSection *, std::set<uint64_t>>
+          catListToErasedOffsets);
+  template <typename T>
+  void collectSectionWriteInfoFromIsec(const InputSection *isec,
+                                       InfoWriteSection<T> &catWriteInfo);
+  void collectCategoryWriterInfoFromCategory(const InfoInputCategory &catInfo);
+  void parseCatInfoToExtInfo(const InfoInputCategory &catInfo,
+                             ClassExtensionInfo &extInfo);
+
+  void parseProtocolListInfo(const ConcatInputSection *isec, uint32_t secOffset,
+                             PointerListInfo &ptrList);
+
+  void parsePointerListInfo(const ConcatInputSection *isec, uint32_t secOffset,
+                            PointerListInfo &ptrList);
+
+  void emitAndLinkPointerList(Defined *parentSym, uint32_t linkAtOffset,
+                              const ClassExtensionInfo &extInfo,
+                              const PointerListInfo &ptrList);
+
+  void emitAndLinkProtocolList(Defined *parentSym, uint32_t linkAtOffset,
+                               const ClassExtensionInfo &extInfo,
+                               const PointerListInfo &ptrList);
+
+  Defined *emitCategory(const ClassExtensionInfo &extInfo);
+  Defined *emitCatListEntrySec(const std::string &forCateogryName,
+                               const std::string &forBaseClassName,
+                               ObjFile *objFile);
+  Defined *emitCategoryBody(const std::string &name, const Defined *nameSym,
+                            const Symbol *baseClassSym,
+                            const std::string &baseClassName, ObjFile *objFile);
+  Defined *emitCategoryName(const std::string &name, ObjFile *objFile);
+  void createSymbolReference(Defined *refFrom, const Symbol *refTo,
+                             uint32_t offset, const Reloc &relocTemplate);
+  Symbol *tryGetSymbolAtIsecOffset(const ConcatInputSection *isec,
+                                   uint32_t offset);
+  Defined *tryGetDefinedAtIsecOffset(const ConcatInputSection *isec,
+                                     uint32_t offset);
+  void tryEraseDefinedAtIsecOffset(const ConcatInputSection *isec,
+                                   uint32_t offset);
+
+  // Allocate a null-terminated StringRef backed by generatedSectionData
+  StringRef newStringData(const char *str);
+  // Allocate section data, backed by generatedSectionData
+  SmallVector<uint8_t> &newSectionData(uint32_t size);
+
+  CategoryLayout catLayout;
+  ClassLayout classLayout;
+  ROClassLayout roClassLayout;
+  ListHeaderLayout listHeaderLayout;
+  MethodLayout methodLayout;
+  ProtocolListHeaderLayout protocolListHeaderLayout;
+
+  InfoCategoryWriter infoCategoryWriter;
+  std::vector<ConcatInputSection *> &allInputSections;
+  // Map of base class Symbol to list of InfoInputCategory's for it
+  DenseMap<const Symbol *, std::vector<InfoInputCategory>> categoryMap;
+
+  // Normally, the binary data comes from the input files, but since we're
+  // generating binary data ourselves, we use the below array to store it in.
+  // Need this to be 'static' so the data survives past the ObjcCategoryMerger
+  // object, as the data will be read by the Writer when the final binary is
+  // generated.
+  static SmallVector<std::unique_ptr<SmallVector<uint8_t>>>
+      generatedSectionData;
+};
+
+SmallVector<std::unique_ptr<SmallVector<uint8_t>>>
+    ObjcCategoryMerger::generatedSectionData;
+
+ObjcCategoryMerger::ObjcCategoryMerger(
+    std::vector<ConcatInputSection *> &_allInputSections)
+    : catLayout(target->wordSize), classLayout(target->wordSize),
+      roClassLayout(target->wordSize), listHeaderLayout(target->wordSize),
+      methodLayout(target->wordSize),
+      protocolListHeaderLayout(target->wordSize),
+      allInputSections(_allInputSections) {}
+
+// This is a template so that it can be used both for CStringSection and
+// ConcatOutputSection
+template <typename T>
+void ObjcCategoryMerger::collectSectionWriteInfoFromIsec(
+    const InputSection *isec, InfoWriteSection<T> &catWriteInfo) {
+
+  catWriteInfo.inputSection = const_cast<Section *>(&isec->section);
+  catWriteInfo.align = isec->align;
+  catWriteInfo.outputSection = dyn_cast_or_null<T>(isec->parent);
+
+  assert(catWriteInfo.outputSection &&
+         "outputSection may not be null in collectSectionWriteInfoFromIsec.");
+
+  if (isec->relocs.size())
+    catWriteInfo.relocTemplate = isec->relocs[0];
+
+  catWriteInfo.valid = true;
+}
+
+Symbol *
+ObjcCategoryMerger::tryGetSymbolAtIsecOffset(const ConcatInputSection *isec,
+                                             uint32_t offset) {
+  const Reloc *reloc = isec->getRelocAt(offset);
+
+  if (!reloc)
+    return nullptr;
+
+  return reloc->referent.get<Symbol *>();
+}
+
+Defined *
+ObjcCategoryMerger::tryGetDefinedAtIsecOffset(const ConcatInputSection *isec,
+                                              uint32_t offset) {
+  Symbol *sym = tryGetSymbolAtIsecOffset(isec, offset);
+  return dyn_cast_or_null<Defined>(sym);
+}
+
+// Given an ConcatInputSection or CStringInputSection and an offset, if there is
+// a symbol(Defined) at that offset, then erase the symbol (mark it not live)
+void ObjcCategoryMerger::tryEraseDefinedAtIsecOffset(
+    const ConcatInputSection *isec, uint32_t offset) {
+  const Reloc *reloc = isec->getRelocAt(offset);
+
+  if (!reloc)
+    return;
+
+  Defined *sym = dyn_cast_or_null<Defined>(reloc->referent.get<Symbol *>());
+  if (!sym)
+    return;
+
+  if (auto *cisec = dyn_cast_or_null<ConcatInputSection>(sym->isec))
+    eraseISec(cisec);
+  else if (auto *csisec = dyn_cast_or_null<CStringInputSection>(sym->isec)) {
+    uint32_t totalOffset = sym->value + reloc->addend;
+    StringPiece &piece = csisec->getStringPiece(totalOffset);
+    piece.live = false;
+  } else {
+    llvm_unreachable("erased symbol has to be Defined or CStringInputSection");
+  }
+}
+
+void ObjcCategoryMerger::collectCategoryWriterInfoFromCategory(
+    const InfoInputCategory &catInfo) {
+
+  if (!infoCategoryWriter.catListInfo.valid)
+    collectSectionWriteInfoFromIsec<ConcatOutputSection>(
+        catInfo.catListIsec, infoCategoryWriter.catListInfo);
+  if (!infoCategoryWriter.catBodyInfo.valid)
+    collectSectionWriteInfoFromIsec<ConcatOutputSection>(
+        catInfo.catBodyIsec, infoCategoryWriter.catBodyInfo);
+
+  if (!infoCategoryWriter.catNameInfo.valid) {
+    lld::macho::Defined *catNameSym =
+        tryGetDefinedAtIsecOffset(catInfo.catBodyIsec, catLayout.nameOffset);
+    assert(catNameSym && "Category does not have a valid name Symbol");
+
+    collectSectionWriteInfoFromIsec<CStringSection>(
+        catNameSym->isec, infoCategoryWriter.catNameInfo);
+  }
+
+  // Collect writer info from all the category lists (we're assuming they all
+  // would provide the same info)
+  if (!infoCategoryWriter.catPtrListInfo.valid) {
+    for (uint32_t off = catLayout.instanceMethodsOffset;
+         off <= catLayout.classPropsOffset; off += target->wordSize) {
+      if (Defined *ptrList =
+              tryGetDefinedAtIsecOffset(catInfo.catBodyIsec, off)) {
+        collectSectionWriteInfoFromIsec<ConcatOutputSection>(
+            ptrList->isec, infoCategoryWriter.catPtrListInfo);
+        // we've successfully collected data, so we can break
+        break;
+      }
+    }
+  }
+}
+
+// Parse a protocol list that might be linked to ConcatInputSection at a given
+// offset. The format of the protocol list is different than other lists (prop
+// lists, method lists) so we need to parse it differently
+void ObjcCategoryMerger::parseProtocolListInfo(const ConcatInputSection *isec,
+                                               uint32_t secOffset,
+                                               PointerListInfo &ptrList) {
+  assert((isec && (secOffset + target->wordSize <= isec->data.size())) &&
+         "Tried to read pointer list beyond protocol section end");
+
+  const Reloc *reloc = isec->getRelocAt(secOffset);
+  if (!reloc)
+    return;
+
+  auto *ptrListSym = dyn_cast_or_null<Defined>(reloc->referent.get<Symbol *>());
+  assert(ptrListSym && "Protocol list reloc does not have a valid Defined");
+
+  // Theoretically protocol count can be either 32b or 64b, depending on
+  // platform pointer size, but to simplify implementation we always just read
+  // the lower 32b which should be good enough.
+  uint32_t protocolCount = *reinterpret_cast<const uint32_t *>(
+      ptrListSym->isec->data.data() + listHeaderLayout.structSizeOffset);
+
+  ptrList.structCount += protocolCount;
+  ptrList.structSize = target->wordSize;
+
+  uint32_t expectedListSize =
+      (protocolCount * target->wordSize) +
+      /*header(count)*/ protocolListHeaderLayout.totalSize +
+      /*extra null value*/ target->wordSize;
+  assert(expectedListSize == ptrListSym->isec->data.size() &&
+         "Protocol list does not match expected size");
+
+  // Suppress unsuded var warning
+  (void)expectedListSize;
+
+  uint32_t off = protocolListHeaderLayout.totalSize;
+  for (uint32_t inx = 0; inx < protocolCount; ++inx) {
+    const Reloc *reloc = ptrListSym->isec->getRelocAt(off);
+    assert(reloc && "No reloc found at protocol list offset");
+
+    auto *listSym = dyn_cast_or_null<Defined>(reloc->referent.get<Symbol *>());
+    assert(listSym && "Protocol list reloc does not have a valid Defined");
+
+    ptrList.allPtrs.push_back(listSym);
+    off += target->wordSize;
+  }
+  assert((ptrListSym->isec->getRelocAt(off) == nullptr) &&
+         "expected null terminating protocol");
+  assert(off + /*extra null value*/ target->wordSize == expectedListSize &&
+         "Protocol list end offset does not match expected size");
+}
+
+// Parse a pointer list that might be linked to ConcatInputSection at a given
+// offset. This can be used for instance methods, class methods, instance props
+// and class props since they have the same format.
+void ObjcCategoryMerger::parsePointerListInfo(const ConcatInputSection *isec,
+                                              uint32_t secOffset,
+                                              PointerListInfo &ptrList) {
+  assert(ptrList.pointersPerStruct == 2 || ptrList.pointersPerStruct == 3);
+  assert(isec && "Trying to parse pointer list from null isec");
+  assert(secOffset + target->wordSize <= isec->data.size() &&
+         "Trying to read pointer list beyond section end");
+
+  const Reloc *reloc = isec->getRelocAt(secOffset);
+  if (!reloc)
+    return;
+
+  auto *ptrListSym = dyn_cast_or_null<Defined>(reloc->referent.get<Symbol *>());
+  assert(ptrListSym && "Reloc does not have a valid Defined");
+
+  uint32_t thisStructSize = *reinterpret_cast<const uint32_t *>(
+      ptrListSym->isec->data.data() + listHeaderLayout.structSizeOffset);
+  uint32_t thisStructCount = *reinterpret_cast<const uint32_t *>(
+      ptrListSym->isec->data.data() + listHeaderLayout.structCountOffset);
+  assert(thisStructSize == ptrList.pointersPerStruct * target->wordSize);
+
+  assert(!ptrList.structSize || (thisStructSize == ptrList.structSize));
+
+  ptrList.structCount += thisStructCount;
+  ptrList.structSize = thisStructSize;
+
+  uint32_t expectedListSize =
+      listHeaderLayout.totalSize + (thisStructSize * thisStructCount);
+  assert(expectedListSize == ptrListSym->isec->data.size() &&
+         "Pointer list does not match expected size");
+
+  for (uint32_t off = listHeaderLayout.totalSize; off < expectedListSize;
+       off += target->wordSize) {
+    const Reloc *reloc = ptrListSym->isec->getRelocAt(off);
+    assert(reloc && "No reloc found at pointer list offset");
+
+    auto *listSym = dyn_cast_or_null<Defined>(reloc->referent.get<Symbol *>());
+    assert(listSym && "Reloc does not have a valid Defined");
+
+    ptrList.allPtrs.push_back(listSym);
+  }
+}
+
+// Here we parse all the information of an input category (catInfo) and
+// append the parsed info into the structure which will contain all the
+// information about how a class is extended (extInfo)
+void ObjcCategoryMerger::parseCatInfoToExtInfo(const InfoInputCategory &catInfo,
+                                               ClassExtensionInfo &extInfo) {
+  const Reloc *catNameReloc =
+      catInfo.catBodyIsec->getRelocAt(catLayout.nameOffset);
+
+  // Parse name
+  assert(catNameReloc && "Category does not have a reloc at 'nameOffset'");
+
+  // is this the first category we are parsing?
+  if (extInfo.mergedContainerName.empty())
+    extInfo.objFileForMergeData =
+        dyn_cast_or_null<ObjFile>(catInfo.catBodyIsec->getFile());
+  else
+    extInfo.mergedContainerName += "|";
+
+  assert(extInfo.objFileForMergeData &&
+         "Expected to already have valid objextInfo.objFileForMergeData");
+
+  StringRef catName = getReferentString(*catNameReloc);
+  extInfo.mergedContainerName += catName.str();
+
+  // Parse base class
+  if (!extInfo.baseClass) {
+    Symbol *classSym =
+        tryGetSymbolAtIsecOffset(catInfo.catBodyIsec, catLayout.klassOffset);
+    assert(extInfo.baseClassName.empty());
+    extInfo.baseClass = classSym;
+    llvm::StringRef classPrefix(objc::symbol_names::klass);
+    assert(classSym->getName().starts_with(classPrefix) &&
+           "Base class symbol does not start with expected prefix");
+    extInfo.baseClassName = classSym->getName().substr(classPrefix.size());
+  } else {
+    assert((extInfo.baseClass ==
+            tryGetSymbolAtIsecOffset(catInfo.catBodyIsec,
+                                     catLayout.klassOffset)) &&
+           "Trying to parse category info into container with different base "
+           "class");
+  }
+
+  parsePointerListInfo(catInfo.catBodyIsec, catLayout.instanceMethodsOffset,
+                       extInfo.instanceMethods);
+
+  parsePointerListInfo(catInfo.catBodyIsec, catLayout.classMethodsOffset,
+                       extInfo.classMethods);
+
+  parseProtocolListInfo(catInfo.catBodyIsec, catLayout.protocolsOffset,
+                        extInfo.protocols);
+
+  parsePointerListInfo(catInfo.catBodyIsec, catLayout.instancePropsOffset,
+                       extInfo.instanceProps);
+
+  parsePointerListInfo(catInfo.catBodyIsec, catLayout.classPropsOffset,
+                       extInfo.classProps);
+}
+
+// Generate a protocol list (including header) and link it into the parent at
+// the specified offset.
+void ObjcCategoryMerger::emitAndLinkProtocolList(
+    Defined *parentSym, uint32_t linkAtOffset,
+    const ClassExtensionInfo &extInfo, const PointerListInfo &ptrList) {
+  if (ptrList.allPtrs.empty())
+    return;
+
+  assert(ptrList.allPtrs.size() == ptrList.structCount);
+
+  uint32_t bodySize = (ptrList.structCount * target->wordSize) +
+                      /*header(count)*/ protocolListHeaderLayout.totalSize +
+                      /*extra null value*/ target->wordSize;
+  llvm::ArrayRef<uint8_t> bodyData = newSectionData(bodySize);
+
+  // This theoretically can be either 32b or 64b, but writing just the first 32b
+  // is good enough
+  const uint32_t *ptrProtoCount = reinterpret_cast<const uint32_t *>(
+      bodyData.data() + protocolListHeaderLayout.protocolCountOffset);
+
+  *const_cast<uint32_t *>(ptrProtoCount) = ptrList.allPtrs.size();
+
+  ConcatInputSection *listSec = make<ConcatInputSection>(
+      *infoCategoryWriter.catPtrListInfo.inputSection, bodyData,
+      infoCategoryWriter.catPtrListInfo.align);
+  listSec->parent = infoCategoryWriter.catPtrListInfo.outputSection;
+  listSec->live = true;
+  allInputSections.push_back(listSec);
+
+  listSec->parent = infoCategoryWriter.catPtrListInfo.outputSection;
+
+  std::string symName = ptrList.categoryPrefix;
+  symName += extInfo.baseClassName + "_$_(" + extInfo.mergedContainerName + ")";
+
+  Defined *ptrListSym = make<Defined>(
+      newStringData(symName.c_str()), /*file=*/parentSym->getObjectFile(),
+      listSec, /*value=*/0, bodyData.size(), /*isWeakDef=*/false,
+      /*isExternal=*/false, /*isPrivateExtern=*/false, /*includeInSymtab=*/true,
+      /*isReferencedDynamically=*/false, /*noDeadStrip=*/false,
+      /*isWeakDefCanBeHidden=*/false);
+
+  ptrListSym->used = true;
+  parentSym->getObjectFile()->symbols.push_back(ptrListSym);
+
+  createSymbolReference(parentSym, ptrListSym, linkAtOffset,
+                        infoCategoryWriter.catBodyInfo.relocTemplate);
+
+  uint32_t offset = protocolListHeaderLayout.totalSize;
+  for (Symbol *symbol : ptrList.allPtrs) {
+    createSymbolReference(ptrListSym, symbol, offset,
+                          infoCategoryWriter.catPtrListInfo.relocTemplate);
+    offset += target->wordSize;
+  }
+}
+
+// Generate a pointer list (including header) and link it into the parent at the
+// specified offset. This is used for instance and class methods and
+// proprieties.
+void ObjcCategoryMerger::emitAndLinkPointerList(
+    Defined *parentSym, uint32_t linkAtOffset,
+    const ClassExtensionInfo &extInfo, const PointerListInfo &ptrList) {
+  if (ptrList.allPtrs.empty())
+    return;
+
+  assert(ptrList.allPtrs.size() * target->wordSize ==
+         ptrList.structCount * ptrList.structSize);
+
+  // Generate body
+  uint32_t bodySize =
+      listHeaderLayout.totalSize + (ptrList.structSize * ptrList.structCount);
+  llvm::ArrayRef<uint8_t> bodyData = newSectionData(bodySize);
+
+  const uint32_t *ptrStructSize = reinterpret_cast<const uint32_t *>(
+      bodyData.data() + listHeaderLayout.structSizeOffset);
+  const uint32_t *ptrStructCount = reinterpret_cast<const uint32_t *>(
+      bodyData.data() + listHeaderLayout.structCountOffset);
+
+  *const_cast<uint32_t *>(ptrStructSize) = ptrList.structSize;
+  *const_cast<uint32_t *>(ptrStructCount) = ptrList.structCount;
+
+  ConcatInputSection *listSec = make<ConcatInputSection>(
+      *infoCategoryWriter.catPtrListInfo.inputSection, bodyData,
+      infoCategoryWriter.catPtrListInfo.align);
+  listSec->parent = infoCategoryWriter.catPtrListInfo.outputSection;
+  listSec->live = true;
+  allInputSections.push_back(listSec);
+
+  listSec->parent = infoCategoryWriter.catPtrListInfo.outputSection;
+
+  std::string symName = ptrList.categoryPrefix;
+  symName += extInfo.baseClassName + "_$_" + extInfo.mergedContainerName;
+
+  Defined *ptrListSym = make<Defined>(
+      newStringData(symName.c_str()), /*file=*/parentSym->getObjectFile(),
+      listSec, /*value=*/0, bodyData.size(), /*isWeakDef=*/false,
+      /*isExternal=*/false, /*isPrivateExtern=*/false, /*includeInSymtab=*/true,
+      /*isReferencedDynamically=*/false, /*noDeadStrip=*/false,
+      /*isWeakDefCanBeHidden=*/false);
+
+  ptrListSym->used = true;
+  parentSym->getObjectFile()->symbols.push_back(ptrListSym);
+
+  createSymbolReference(parentSym, ptrListSym, linkAtOffset,
+                        infoCategoryWriter.catBodyInfo.relocTemplate);
+
+  uint32_t offset = listHeaderLayout.totalSize;
+  for (Symbol *symbol : ptrList.allPtrs) {
+    createSymbolReference(ptrListSym, symbol, offset,
+                          infoCategoryWriter.catPtrListInfo.relocTemplate);
+    offset += target->wordSize;
+  }
+}
+
+// This method creates an __objc_catlist ConcatInputSection with a single slot
+Defined *
+ObjcCategoryMerger::emitCatListEntrySec(const std::string &forCateogryName,
+                                        const std::string &forBaseClassName,
+                                        ObjFile *objFile) {
+  uint32_t sectionSize = target->wordSize;
+  llvm::ArrayRef<uint8_t> bodyData = newSectionData(sectionSize);
+
+  ConcatInputSection *newCatList =
+      make<ConcatInputSection>(*infoCategoryWriter.catListInfo.inputSection,
+                               bodyData, infoCategoryWriter.catListInfo.align);
+  newCatList->parent = infoCategoryWriter.catListInfo.outputSection;
+  newCatList->live = true;
+  allInputSections.push_back(newCatList);
+
+  newCatList->parent = infoCategoryWriter.catListInfo.outputSection;
+
+  std::string catSymName = "<__objc_catlist slot for merged category ";
+  catSymName += forBaseClassName + "(" + forCateogryName + ")>";
+
+  Defined *catListSym = make<Defined>(
+      newStringData(catSymName.c_str()), /*file=*/objFile, newCatList,
+      /*value=*/0, bodyData.size(), /*isWeakDef=*/false, /*isExternal=*/false,
+      /*isPrivateExtern=*/false, /*includeInSymtab=*/false,
+      /*isReferencedDynamically=*/false, /*noDeadStrip=*/false,
+      /*isWeakDefCanBeHidden=*/false);
+
+  catListSym->used = true;
+  objFile->symbols.push_back(catListSym);
+  return catListSym;
+}
+
+// Here we generate the main category body and link the name and base class into
+// it. We don't link any other info yet like the protocol and class/instance
+// methods/props.
+Defined *ObjcCategoryMerger::emitCategoryBody(const std::string &name,
+                                              const Defined *nameSym,
+                                              const Symbol *baseClassSym,
+                                              const std::string &baseClassName,
+                                              ObjFile *objFile) {
+  llvm::ArrayRef<uint8_t> bodyData = newSectionData(catLayout.totalSize);
+
+  uint32_t *ptrSize = (uint32_t *)(const_cast<uint8_t *>(bodyData.data()) +
+                                   catLayout.sizeOffset);
+  *ptrSize = catLayout.totalSize;
+
+  ConcatInputSection *newBodySec =
+      make<ConcatInputSection>(*infoCategoryWriter.catBodyInfo.inputSection,
+                               bodyData, infoCategoryWriter.catBodyInfo.align);
+  newBodySec->parent = infoCategoryWriter.catBodyInfo.outputSection;
+  newBodySec->live = true;
+  allInputSections.push_back(newBodySec);
+
+  std::string symName =
+      objc::symbol_names::category + baseClassName + "_$_(" + name + ")";
+  Defined *catBodySym = make<Defined>(
+      newStringData(symName.c_str()), /*file=*/objFile, newBodySec,
+      /*value=*/0, bodyData.size(), /*isWeakDef=*/false, /*isExternal=*/false,
+      /*isPrivateExtern=*/false, /*includeInSymtab=*/true,
+      /*isReferencedDynamically=*/false, /*noDeadStrip=*/false,
+      /*isWeakDefCanBeHidden=*/false);
+
+  catBodySym->used = true;
+  objFile->symbols.push_back(catBodySym);
+
+  createSymbolReference(catBodySym, nameSym, catLayout.nameOffset,
+                        infoCategoryWriter.catBodyInfo.relocTemplate);
+
+  // Create a reloc to the base class (either external or internal)
+  createSymbolReference(catBodySym, baseClassSym, catLayout.klassOffset,
+                        infoCategoryWriter.catBodyInfo.relocTemplate);
+
+  return catBodySym;
+}
+
+// This writes the new category name (for the merged category) into the binary
+// and returns the sybmol for it.
+Defined *ObjcCategoryMerger::emitCategoryName(const std::string &name,
+                                              ObjFile *objFile) {
+  StringRef nameStrData = newStringData(name.c_str());
+  // We use +1 below to include the null terminator
+  llvm::ArrayRef<uint8_t> nameData(
+      reinterpret_cast<const uint8_t *>(nameStrData.data()),
+      nameStrData.size() + 1);
+
+  auto *parentSection = infoCategoryWriter.catNameInfo.inputSection;
+  CStringInputSection *newStringSec = make<CStringInputSection>(
+      *infoCategoryWriter.catNameInfo.inputSection, nameData,
+      infoCategoryWriter.catNameInfo.align, /*dedupLiterals=*/true);
+
+  parentSection->subsections.push_back({0, newStringSec});
+
+  newStringSec->splitIntoPieces();
+  newStringSec->pieces[0].live = true;
+  newStringSec->parent = infoCategoryWriter.catNameInfo.outputSection;
+  in.cStringSection->addInput(newStringSec);
+  assert(newStringSec->pieces.size() == 1);
+
+  Defined *catNameSym = make<Defined>(
+      "<merged category name>", /*file=*/objFile, newStringSec,
+      /*value=*/0, nameData.size(),
+      /*isWeakDef=*/false, /*isExternal=*/false, /*isPrivateExtern=*/false,
+      /*includeInSymtab=*/false, /*isReferencedDynamically=*/false,
+      /*noDeadStrip=*/false, /*isWeakDefCanBeHidden=*/false);
+
+  catNameSym->used = true;
+  objFile->symbols.push_back(catNameSym);
+  return catNameSym;
+}
+
+// This method fully creates a new category from the given ClassExtensionInfo.
+// It creates the category name, body and method/protocol/prop lists and links
+// them all together. Then it creates a new __objc_catlist entry and adds the
+// category to it. Calling this method will fully generate a category which will
+// be available in the final binary.
+Defined *ObjcCategoryMerger::emitCategory(const ClassExtensionInfo &extInfo) {
+  Defined *catNameSym = emitCategoryName(extInfo.mergedContainerName,
+                                         extInfo.objFileForMergeData);
+
+  Defined *catBodySym = emitCategoryBody(
+      extInfo.mergedContainerName, catNameSym, extInfo.baseClass,
+      extInfo.baseClassName, extInfo.objFileForMergeData);
+
+  Defined *catListSym =
+      emitCatListEntrySec(extInfo.mergedContainerName, extInfo.baseClassName,
+                          extInfo.objFileForMergeData);
+
+  // Add the single category body to the category list at the offset 0.
+  createSymbolReference(catListSym, catBodySym, /*offset=*/0,
+                        infoCategoryWriter.catListInfo.relocTemplate);
+
+  emitAndLinkPointerList(catBodySym, catLayout.instanceMethodsOffset, extInfo,
+                         extInfo.instanceMethods);
+
+  emitAndLinkPointerList(catBodySym, catLayout.classMethodsOffset, extInfo,
+                         extInfo.classMethods);
+
+  emitAndLinkProtocolList(catBodySym, catLayout.protocolsOffset, extInfo,
+                          extInfo.protocols);
+
+  emitAndLinkPointerList(catBodySym, catLayout.instancePropsOffset, extInfo,
+                         extInfo.instanceProps);
+
+  emitAndLinkPointerList(catBodySym, catLayout.classPropsOffset, extInfo,
+                         extInfo.classProps);
+
+  return catBodySym;
+}
+
+// This method merges all the categories (sharing a base class) into a single
+// category.
+void ObjcCategoryMerger::mergeCategoriesIntoSingleCategory(
+    std::vector<InfoInputCategory> &categories) {
+  assert(categories.size() > 1 && "Expected at least 2 categories");
+
+  ClassExtensionInfo extInfo(catLayout);
+
+  for (auto &catInfo : categories)
+    parseCatInfoToExtInfo(catInfo, extInfo);
+
+  Defined *newCatDef = emitCategory(extInfo);
+  assert(newCatDef && "Failed to create a new category");
+
+  // Suppress unsuded var warning
+  (void)newCatDef;
+
+  for (auto &catInfo : categories)
+    catInfo.wasMerged = true;
+}
+
+void ObjcCategoryMerger::createSymbolReference(Defined *refFrom,
+                                               const Symbol *refTo,
+                                               uint32_t offset,
+                                               const Reloc &relocTemplate) {
+  Reloc r = relocTemplate;
+  r.offset = offset;
+  r.addend = 0;
+  r.referent = const_cast<Symbol *>(refTo);
+  refFrom->isec->relocs.push_back(r);
+}
+
+void ObjcCategoryMerger::collectAndValidateCategoriesData() {
+  for (InputSection *sec : allInputSections) {
+    if (sec->getName() != section_names::objcCatList)
+      continue;
+    ConcatInputSection *catListCisec = dyn_cast<ConcatInputSection>(sec);
+    assert(catListCisec &&
+           "__objc_catList InputSection is not a ConcatInputSection");
+
+    for (uint32_t off = 0; off < catListCisec->getSize();
+         off += target->wordSize) {
+      Defined *categorySym = tryGetDefinedAtIsecOffset(catListCisec, off);
+      assert(categorySym &&
+             "Failed to get a valid cateogry at __objc_catlit offset");
+
+      // We only support ObjC categories (no swift + @objc)
+      // TODO: Support swift + @objc categories also
+      if (!categorySym->getName().starts_with(objc::symbol_names::category))
+        continue;
+
+      auto *catBodyIsec = dyn_cast<ConcatInputSection>(categorySym->isec);
+      assert(catBodyIsec &&
+             "Category data section is not an ConcatInputSection");
+
+      // Check that the category has a reloc at 'klassOffset' (which is
+      // a pointer to the class symbol)
+
+      Symbol *classSym =
+          tryGetSymbolAtIsecOffset(catBodyIsec, catLayout.klassOffset);
+      assert(classSym && "Category does not have a valid base class");
+
+      InfoInputCategory catInputInfo{catListCisec, catBodyIsec, off};
+      categoryMap[classSym].push_back(catInputInfo);
+
+      collectCategoryWriterInfoFromCategory(catInputInfo);
+    }
+  }
+}
+
+// In the input we have multiple __objc_catlist InputSection, each of which may
+// contain links to multiple categories. Of these categories, we will merge (and
+// erase) only some. There will be some categories that will remain untouched
+// (not erased). For these not erased categories, we generate new __objc_catlist
+// entries since the parent __objc_catlist entry will be erased
+void ObjcCategoryMerger::generateCatListForNonErasedCategories(
+    const std::map<ConcatInputSection *, std::set<uint64_t>>
+        catListToErasedOffsets) {
+
+  // Go through all offsets of all __objc_catlist's that we process and if there
+  // are categories that we didn't process - generate a new __objc_catlist for
+  // each.
+  for (auto &mapEntry : catListToErasedOffsets) {
+    ConcatInputSection *catListIsec = mapEntry.first;
+    for (uint32_t catListIsecOffset = 0;
+         catListIsecOffset < catListIsec->data.size();
+         catListIsecOffset += target->wordSize) {
+      // This slot was erased, we can just skip it
+      if (mapEntry.second.count(catListIsecOffset))
+        continue;
+
+      Defined *nonErasedCatBody =
+          tryGetDefinedAtIsecOffset(catListIsec, catListIsecOffset);
+      assert(nonErasedCatBody && "Failed to relocate non-deleted category");
+
+      // Allocate data for the new __objc_catlist slot
+      auto bodyData = newSectionData(target->wordSize);
+
+      // We mark the __objc_catlist slot as belonging to the same file as the
+      // category
+      ObjFile *objFile = dyn_cast<ObjFile>(nonErasedCatBody->getFile());
+
+      ConcatInputSection *listSec = make<ConcatInputSection>(
+          *infoCategoryWriter.catListInfo.inputSection, bodyData,
+          infoCategoryWriter.catListInfo.align);
+      listSec->parent = infoCategoryWriter.catListInfo.outputSection;
+      listSec->live = true;
+      allInputSections.push_back(listSec);
+
+      std::string slotSymName = "<__objc_catlist slot for category ";
+      slotSymName += nonErasedCatBody->getName();
+      slotSymName += ">";
+
+      Defined *catListSlotSym = make<Defined>(
+          newStringData(slotSymName.c_str()), /*file=*/objFile, listSec,
+          /*value=*/0, bodyData.size(),
+          /*isWeakDef=*/false, /*isExternal=*/false, /*isPrivateExtern=*/false,
+          /*includeInSymtab=*/false, /*isReferencedDynamically=*/false,
+          /*noDeadStrip=*/false, /*isWeakDefCanBeHidden=*/false);
+
+      catListSlotSym->used = true;
+      objFile->symbols.push_back(catListSlotSym);
+
+      // Now link the category body into the newly created slot
+      createSymbolReference(catListSlotSym, nonErasedCatBody, 0,
+                            infoCategoryWriter.catListInfo.relocTemplate);
+    }
+  }
+}
+
+void ObjcCategoryMerger::eraseISec(ConcatInputSection *isec) {
+  isec->live = false;
+  for (auto &sym : isec->symbols)
+    sym->used = false;
+}
+
+// This fully erases the merged categories, including their body, their names,
+// their method/protocol/prop lists and the __objc_catlist entries that link to
+// them.
+void ObjcCategoryMerger::eraseMergedCategories() {
+  // Map of InputSection to a set of offsets of the categories that were merged
+  std::map<ConcatInputSection *, std::set<uint64_t>> catListToErasedOffsets;
+
+  for (auto &mapEntry : categoryMap) {
+    for (InfoInputCategory &catInfo : mapEntry.second) {
+      if (catInfo.wasMerged) {
+        eraseISec(catInfo.catListIsec);
+        catListToErasedOffsets[catInfo.catListIsec].insert(
+            catInfo.offCatListIsec);
+      }
+    }
+  }
+
+  // If there were categories that we did not erase, we need to generate a new
+  // __objc_catList that contains only the un-merged categories, and get rid of
+  // the references to the ones we merged.
+  generateCatListForNonErasedCategories(catListToErasedOffsets);
+
+  // Erase the old method lists & names of the categories that were merged
+  for (auto &mapEntry : categoryMap) {
+    for (InfoInputCategory &catInfo : mapEntry.second) {
+      if (!catInfo.wasMerged)
+        continue;
+
+      eraseISec(catInfo.catBodyIsec);
+      tryEraseDefinedAtIsecOffset(catInfo.catBodyIsec, catLayout.nameOffset);
+      tryEraseDefinedAtIsecOffset(catInfo.catBodyIsec,
+                                  catLayout.instanceMethodsOffset);
+      tryEraseDefinedAtIsecOffset(catInfo.catBodyIsec,
+                                  catLayout.classMethodsOffset);
+      tryEraseDefinedAtIsecOffset(catInfo.catBodyIsec,
+                                  catLayout.protocolsOffset);
+      tryEraseDefinedAtIsecOffset(catInfo.catBodyIsec,
+                                  catLayout.classPropsOffset);
+      tryEraseDefinedAtIsecOffset(catInfo.catBodyIsec,
+                                  catLayout.instancePropsOffset);
+    }
+  }
+}
+
+void ObjcCategoryMerger::doMerge() {
+  collectAndValidateCategoriesData();
+
+  for (auto &entry : categoryMap)
+    if (entry.second.size() > 1)
+      // Merge all categories into a new, single category
+      mergeCategoriesIntoSingleCategory(entry.second);
+
+  // Erase all categories that were merged
+  eraseMergedCategories();
+}
+
+void ObjcCategoryMerger::doCleanup() { generatedSectionData.clear(); }
+
+StringRef ObjcCategoryMerger::newStringData(const char *str) {
+  uint32_t len = strlen(str);
+  auto &data = newSectionData(len + 1);
+  char *strData = reinterpret_cast<char *>(data.data());
+  strncpy(strData, str, len);
+  return StringRef(strData, len);
+}
+
+SmallVector<uint8_t> &ObjcCategoryMerger::newSectionData(uint32_t size) {
+  generatedSectionData.push_back(
+      std::make_unique<SmallVector<uint8_t>>(size, 0));
+  return *generatedSectionData.back();
+}
+
+} // namespace
+
+void objc::mergeCategories() {
+  TimeTraceScope timeScope("ObjcCategoryMerger");
+
+  ObjcCategoryMerger merger(inputSections);
+  merger.doMerge();
+}
+
+void objc::doCleanup() { ObjcCategoryMerger::doCleanup(); }
diff --git a/lld/MachO/ObjC.h b/lld/MachO/ObjC.h
index 4c65f9a1f78812..9fbe984e6223ec 100644
--- a/lld/MachO/ObjC.h
+++ b/lld/MachO/ObjC.h
@@ -17,14 +17,26 @@ namespace objc {
 
 namespace symbol_names {
 constexpr const char klass[] = "_OBJC_CLASS_$_";
+constexpr const char klassPropList[] = "__OBJC_$_CLASS_PROP_LIST_";
+
 constexpr const char metaclass[] = "_OBJC_METACLASS_$_";
 constexpr const char ehtype[] = "_OBJC_EHTYPE_$_";
 constexpr const char ivar[] = "_OBJC_IVAR_$_";
+constexpr const char listProprieties[] = "__OBJC_$_PROP_LIST_";
+
+constexpr const char category[] = "__OBJC_$_CATEGORY_";
+constexpr const char categoryInstanceMethods[] =
+    "__OBJC_$_CATEGORY_INSTANCE_METHODS_";
+constexpr const char categoryClassMethods[] =
+    "__OBJC_$_CATEGORY_CLASS_METHODS_";
+constexpr const char categoryProtocols[] = "__OBJC_CATEGORY_PROTOCOLS_$_";
 } // namespace symbol_names
 
 // Check for duplicate method names within related categories / classes.
 void checkCategories();
+void mergeCategories();
 
+void doCleanup();
 } // namespace objc
 
 bool hasObjCSection(llvm::MemoryBufferRef);
diff --git a/lld/MachO/Options.td b/lld/MachO/Options.td
index a524e4a4c50841..0d8ee2a0926be2 100644
--- a/lld/MachO/Options.td
+++ b/lld/MachO/Options.td
@@ -129,6 +129,12 @@ def strict_auto_link : Flag<["--"], "strict-auto-link">,
 def check_category_conflicts : Flag<["--"], "check-category-conflicts">,
     HelpText<"Check for conflicts between category & class methods">,
     Group<grp_lld>;
+def objc_category_merging : Flag<["-"], "objc_category_merging">,
+    HelpText<"Merge Objective-C categories that share the same base class">,
+    Group<grp_lld>;
+def no_objc_category_merging : Flag<["-"], "no_objc_category_merging">,
+    HelpText<"Do not merge Objective-C categories">,
+    Group<grp_lld>;
 def lto_debug_pass_manager: Flag<["--"], "lto-debug-pass-manager">,
     HelpText<"Debug new pass manager">, Group<grp_lld>;
 def cs_profile_generate: Flag<["--"], "cs-profile-generate">,
@@ -966,10 +972,6 @@ def interposable_list : Separate<["-"], "interposable_list">,
 def no_function_starts : Flag<["-"], "no_function_starts">,
     HelpText<"Do not create table of function start addresses">,
     Group<grp_rare>;
-def no_objc_category_merging : Flag<["-"], "no_objc_category_merging">,
-    HelpText<"Do not merge Objective-C categories into their classes">,
-    Flags<[HelpHidden]>,
-    Group<grp_rare>;
 def object_path_lto : Separate<["-"], "object_path_lto">,
     MetaVarName<"<path>">,
     HelpText<"Retain any temporary mach-o file in <path> that would otherwise be deleted during LTO">,
diff --git a/lld/docs/ld.lld.1 b/lld/docs/ld.lld.1
index e759776c8d55a6..65e50e349c8cc0 100644
--- a/lld/docs/ld.lld.1
+++ b/lld/docs/ld.lld.1
@@ -378,6 +378,8 @@ Do not put read-only non-executable sections in their own segment.
 Do not report version scripts that refer to undefined symbols.
 .It Fl -no-undefined
 Report unresolved symbols even if the linker is creating a shared library.
+.It Fl -no-warn-mismatch
+Do not reject unknown section types.
 .It Fl -no-warn-symbol-ordering
 Do not warn about problems with the symbol ordering file or call graph profile.
 .It Fl -no-warnings , Fl w
diff --git a/lld/test/ELF/aarch64-tlsdesc-zrel.s b/lld/test/ELF/aarch64-tlsdesc-zrel.s
index 1b35e0d29a26f8..83e6894ddb3b5e 100644
--- a/lld/test/ELF/aarch64-tlsdesc-zrel.s
+++ b/lld/test/ELF/aarch64-tlsdesc-zrel.s
@@ -13,6 +13,7 @@
 // RELA-NEXT:     0x[[#ADDR+16]]  R_AARCH64_TLSDESC - 0x4
 // RELA-NEXT:   }
 // RELA-NEXT: ]
+// RELA-EMPTY:
 // RELA-NEXT:              Hex dump of section '.got':
 // RELA-NEXT:              0x000[[#ADDR]]    00000000 00000000 00000000 00000000
 // RELA-NO-ADDENDS-NEXT:   0x000[[#ADDR+16]] 00000000 00000000 00000000 00000000
@@ -28,6 +29,7 @@
 // REL-NEXT:     0x[[#ADDR+16]]  R_AARCH64_TLSDESC -{{$}}
 // REL-NEXT:   }
 // REL-NEXT: ]
+// REL-EMPTY:
 // REL-NEXT: Hex dump of section '.got':
 // REL-NEXT: 0x000[[#ADDR]]    00000000 00000000 00000000 00000000
 // REL-NEXT: 0x000[[#ADDR+16]] 00000000 00000000 04000000 00000000
diff --git a/lld/test/ELF/incompatible-section-types2.s b/lld/test/ELF/incompatible-section-types2.s
index 3e281ce6c5da38..919515fe37e396 100644
--- a/lld/test/ELF/incompatible-section-types2.s
+++ b/lld/test/ELF/incompatible-section-types2.s
@@ -6,5 +6,5 @@
 // CHECK-NEXT: >>> <internal>:(.shstrtab): SHT_STRTAB
 // CHECK-NEXT: >>> output section .shstrtab: Unknown
 
-.section .shstrtab,"",@12345
+.section .shstrtab,"",@0x60000000
 .short 20
diff --git a/lld/test/ELF/linkerscript/custom-section-type.s b/lld/test/ELF/linkerscript/custom-section-type.s
index 68454f4df1c860..8ca0a4db325bda 100644
--- a/lld/test/ELF/linkerscript/custom-section-type.s
+++ b/lld/test/ELF/linkerscript/custom-section-type.s
@@ -76,7 +76,7 @@ SECTIONS {
 .section progbits,"a",@note
 .byte 0
 
-.section expr,"a",@12345
+.section expr,"a",@0x60000000
 .byte 0
 
 #--- unknown1.lds
diff --git a/lld/test/ELF/linkerscript/discard-section.s b/lld/test/ELF/linkerscript/discard-section.s
index 24f3b2b73e991f..0bbebac59bb345 100644
--- a/lld/test/ELF/linkerscript/discard-section.s
+++ b/lld/test/ELF/linkerscript/discard-section.s
@@ -9,6 +9,9 @@
 # RUN: ld.lld -r -T a.lds a.o b.o -o a.ro 2>&1 | FileCheck %s --check-prefix=WARNING --implicit-check-not=warning:
 # RUN: llvm-readelf -r -s a.ro | FileCheck %s --check-prefix=RELOC
 
+# RUN: ld.lld -r --gc-sections -T a.lds a.o b.o -o a.gc.ro --no-fatal-warnings
+# RUN: llvm-readelf -r -s a.gc.ro | FileCheck %s --check-prefix=RELOC-GC
+
 # LOCAL:      error: relocation refers to a discarded section: .aaa
 # LOCAL-NEXT: >>> defined in a.o
 # LOCAL-NEXT: >>> referenced by a.o:(.bbb+0x0)
@@ -32,16 +35,18 @@
 # WARNING:      warning: relocation refers to a discarded section: .aaa
 # WARNING-NEXT: >>> referenced by a.o:(.rela.bbb+0x0)
 
+## GNU ld reports "defined in discarded secion" errors even in -r mode.
+## We set the symbol index to 0.
 # RELOC:      Relocation section '.rela.bbb' at offset {{.*}} contains 1 entries:
 # RELOC-NEXT:     Offset             Info             Type               Symbol's Value  Symbol's Name + Addend
 # RELOC-NEXT: 0000000000000000  0000000000000000 R_X86_64_NONE                             0
 # RELOC-EMPTY:
 # RELOC-NEXT: Relocation section '.rela.data' at offset {{.*}} contains 4 entries:
 # RELOC-NEXT:     Offset             Info             Type               Symbol's Value  Symbol's Name + Addend
-# RELOC-NEXT: 0000000000000000  0000000500000001 R_X86_64_64            0000000000000000 global + 0
-# RELOC-NEXT: 0000000000000008  0000000700000001 R_X86_64_64            0000000000000000 weak + 0
-# RELOC-NEXT: 0000000000000010  0000000600000001 R_X86_64_64            0000000000000000 weakref1 + 0
-# RELOC-NEXT: 0000000000000018  0000000800000001 R_X86_64_64            0000000000000000 weakref2 + 0
+# RELOC-NEXT: 0000000000000000  0000000000000001 R_X86_64_64                             0
+# RELOC-NEXT: 0000000000000008  0000000000000001 R_X86_64_64                             0
+# RELOC-NEXT: 0000000000000010  0000000000000001 R_X86_64_64                             0
+# RELOC-NEXT: 0000000000000018  0000000000000001 R_X86_64_64                             0
 
 # RELOC:      Num:    Value          Size Type    Bind   Vis      Ndx Name
 # RELOC-NEXT:   0: 0000000000000000     0 NOTYPE  LOCAL  DEFAULT  UND
@@ -49,23 +54,25 @@
 # RELOC-NEXT:   2: 0000000000000000     0 SECTION LOCAL  DEFAULT    2 .bbb
 # RELOC-NEXT:   3: 0000000000000000     0 SECTION LOCAL  DEFAULT    4 .data
 # RELOC-NEXT:   4: 0000000000000000     0 NOTYPE  GLOBAL DEFAULT    1 _start
-# RELOC-NEXT:   5: 0000000000000000     0 NOTYPE  GLOBAL DEFAULT   UND global
-# RELOC-NEXT:   6: 0000000000000000     0 NOTYPE  GLOBAL DEFAULT   UND weakref1
-# RELOC-NEXT:   7: 0000000000000000     0 NOTYPE  GLOBAL DEFAULT   UND weak
-# RELOC-NEXT:   8: 0000000000000000     0 NOTYPE  GLOBAL DEFAULT   UND weakref2
 # RELOC-EMPTY:
 
+# RELOC-GC:   There are no relocations in this file.
+
 #--- a.s
 .globl _start
 _start:
 
 .section .aaa,"a"
-.globl global, weakref1
+.globl global, weakref1, unused
 .weak weak, weakref2
 global:
 weak:
 weakref1:
 weakref2:
+## Eliminate `unused` just like GC discarded definitions.
+## Linux kernel's CONFIG_DEBUG_FORCE_WEAK_PER_CPU=y configuration expects
+## that the unreferenced `unused` is not emitted to .symtab.
+unused:
   .quad 0
 
 .section .bbb,"aw"
diff --git a/lld/test/ELF/lto/Inputs/devirt_validate_vtable_typeinfos_ref.ll b/lld/test/ELF/lto/Inputs/devirt_validate_vtable_typeinfos_ref.ll
index 43df8366aa2ae0..8219b8bea15754 100644
--- a/lld/test/ELF/lto/Inputs/devirt_validate_vtable_typeinfos_ref.ll
+++ b/lld/test/ELF/lto/Inputs/devirt_validate_vtable_typeinfos_ref.ll
@@ -36,7 +36,7 @@ entry:
   store ptr %this, ptr %this.addr
   %this1 = load ptr, ptr %this.addr
   call void @_ZN1AC2Ev(ptr noundef nonnull align 8 dereferenceable(8) %this1)
-  store ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTV1B, i32 0, inrange i32 0, i32 2), ptr %this1
+  store ptr getelementptr inbounds inrange(-16, 8) ({ [3 x ptr] }, ptr @_ZTV1B, i32 0, i32 0, i32 2), ptr %this1
   ret void
 }
 
@@ -47,7 +47,7 @@ entry:
   %this.addr = alloca ptr
   store ptr %this, ptr %this.addr
   %this1 = load ptr, ptr %this.addr
-  store ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTV1A, i32 0, inrange i32 0, i32 2), ptr %this1
+  store ptr getelementptr inbounds inrange(-16, 8) ({ [3 x ptr] }, ptr @_ZTV1A, i32 0, i32 0, i32 2), ptr %this1
   ret void
 }
 
diff --git a/lld/test/ELF/lto/Inputs/devirt_validate_vtable_typeinfos_undef.ll b/lld/test/ELF/lto/Inputs/devirt_validate_vtable_typeinfos_undef.ll
index 6cc55df82e2f28..43230948116d30 100644
--- a/lld/test/ELF/lto/Inputs/devirt_validate_vtable_typeinfos_undef.ll
+++ b/lld/test/ELF/lto/Inputs/devirt_validate_vtable_typeinfos_undef.ll
@@ -9,7 +9,7 @@ define linkonce_odr void @_ZN1BC2Ev(ptr %this) #0 {
   %this.addr = alloca ptr, align 8
   store ptr %this, ptr %this.addr, align 8
   %this1 = load ptr, ptr %this.addr, align 8
-  store ptr getelementptr inbounds ({ [4 x ptr] }, ptr @_ZTV1B, i32 0, inrange i32 0, i32 2), ptr %this1, align 8
+  store ptr getelementptr inbounds inrange(-16, 8) ({ [4 x ptr] }, ptr @_ZTV1B, i32 0, i32 0, i32 2), ptr %this1, align 8
   ret void
 }
 
diff --git a/lld/test/ELF/lto/devirt_split_unit_localize.ll b/lld/test/ELF/lto/devirt_split_unit_localize.ll
index 04f30dcd76b46c..fb3564b268863e 100644
--- a/lld/test/ELF/lto/devirt_split_unit_localize.ll
+++ b/lld/test/ELF/lto/devirt_split_unit_localize.ll
@@ -91,7 +91,7 @@ declare dso_local void @_ZNK3Cat9makeNoiseEv(ptr nonnull dereferenceable(8)) unn
 define dso_local void @_Z17useDoThingWithCatv() local_unnamed_addr {
 entry:
   %call = tail call noalias nonnull dereferenceable(8) ptr @_Znwm(i64 8)
-  store ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTV3Cat, i64 0, inrange i32 0, i64 2), ptr %call, align 8, !tbaa !4
+  store ptr getelementptr inbounds inrange(-16, 8) ({ [3 x ptr] }, ptr @_ZTV3Cat, i64 0, i32 0, i64 2), ptr %call, align 8, !tbaa !4
   tail call void @_Z14doThingWithCatP6Animal(ptr nonnull %call)
   ret void
 }
diff --git a/lld/test/ELF/lto/devirt_validate_vtable_typeinfos_ref.ll b/lld/test/ELF/lto/devirt_validate_vtable_typeinfos_ref.ll
index 2318af4b16ab46..2d6a30da572002 100644
--- a/lld/test/ELF/lto/devirt_validate_vtable_typeinfos_ref.ll
+++ b/lld/test/ELF/lto/devirt_validate_vtable_typeinfos_ref.ll
@@ -117,7 +117,7 @@ entry:
   %this.addr = alloca ptr, align 8
   store ptr %this, ptr %this.addr, align 8
   %this1 = load ptr, ptr %this.addr, align 8
-  store ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTV1A, i32 0, inrange i32 0, i32 2), ptr %this1, align 8
+  store ptr getelementptr inbounds inrange(-16, 8) ({ [3 x ptr] }, ptr @_ZTV1A, i32 0, i32 0, i32 2), ptr %this1, align 8
   ret void
 }
 
diff --git a/lld/test/ELF/pack-dyn-relocs-arm2.s b/lld/test/ELF/pack-dyn-relocs-arm2.s
index cf2cd8bb597ec2..ed4fa52e7a5ab2 100644
--- a/lld/test/ELF/pack-dyn-relocs-arm2.s
+++ b/lld/test/ELF/pack-dyn-relocs-arm2.s
@@ -5,7 +5,7 @@
 
 // RUN: llvm-mc -filetype=obj -triple=armv7a-none-linux-gnueabi %s -o %t.o
 // RUN: ld.lld -pie --pack-dyn-relocs=relr %t.o %t.so -o %t.exe
-// RUN: llvm-readobj -r %t.exe | FileCheck %s
+// RUN: llvm-readobj -r -x .data %t.exe | FileCheck %s
 
 // CHECK:      Section (5) .relr.dyn {
 // CHECK-NEXT:   0x301E8 R_ARM_RELATIVE -
@@ -43,6 +43,9 @@
 // CHECK-NEXT:   0x30268 R_ARM_RELATIVE -
 // CHECK-NEXT:   0x3026C R_ARM_RELATIVE -
 // CHECK-NEXT: }
+// CHECK:      Hex dump of section '.data':
+// CHECK-NEXT: 0x000301e8 00000000 01000000 02000000 ffffffff .
+// CHECK-NEXT: 0x000301f8 00000080 00000000 00000000 00000000 .
 
 // RUN: llvm-readobj -S --dynamic-table %t.exe | FileCheck --check-prefix=HEADER %s
 // HEADER: 0x00000023 RELRSZ 12 (bytes)
@@ -50,10 +53,10 @@
 .data
 .align 2
 .dc.a __ehdr_start
-.dc.a __ehdr_start
-.dc.a __ehdr_start
-.dc.a __ehdr_start
-.dc.a __ehdr_start
+.dc.a __ehdr_start+1
+.dc.a __ehdr_start+2
+.dc.a __ehdr_start-1
+.dc.a __ehdr_start-0x80000000
 .dc.a __ehdr_start
 .dc.a __ehdr_start
 .dc.a __ehdr_start
diff --git a/lld/test/ELF/pack-dyn-relocs.s b/lld/test/ELF/pack-dyn-relocs.s
index da97e388c3d0c5..733ddd4ecad396 100644
--- a/lld/test/ELF/pack-dyn-relocs.s
+++ b/lld/test/ELF/pack-dyn-relocs.s
@@ -198,11 +198,11 @@
 // RUN: llvm-readobj -r %t2.a64 | FileCheck --check-prefix=UNPACKED64 %s
 
 // UNPACKED64:          Section ({{.+}}) .rela.dyn {
-// UNPACKED64-NEXT:     0x30690 R_AARCH64_RELATIVE - 0x1
-// UNPACKED64-NEXT:     0x30698 R_AARCH64_RELATIVE - 0x2
-// UNPACKED64-NEXT:     0x306A0 R_AARCH64_RELATIVE - 0x3
-// UNPACKED64-NEXT:     0x306A8 R_AARCH64_RELATIVE - 0x4
-// UNPACKED64-NEXT:     0x306B0 R_AARCH64_RELATIVE - 0x5
+// UNPACKED64-NEXT:     0x30690 R_AARCH64_RELATIVE - 0x0
+// UNPACKED64-NEXT:     0x30698 R_AARCH64_RELATIVE - 0x1
+// UNPACKED64-NEXT:     0x306A0 R_AARCH64_RELATIVE - 0x2
+// UNPACKED64-NEXT:     0x306A8 R_AARCH64_RELATIVE - 0xFFFFFFFFFFFFFFFF
+// UNPACKED64-NEXT:     0x306B0 R_AARCH64_RELATIVE - 0x80000000
 // UNPACKED64-NEXT:     0x306B8 R_AARCH64_RELATIVE - 0x6
 // UNPACKED64-NEXT:     0x306C0 R_AARCH64_RELATIVE - 0x7
 // UNPACKED64-NEXT:     0x306C8 R_AARCH64_RELATIVE - 0x8
@@ -238,91 +238,72 @@
 // UNPACKED64-NEXT:     }
 
 // RUN: ld.lld -pie --pack-dyn-relocs=android %t.a64.o %t.a64.so -o %t3.a64
-// RUN: llvm-readobj -S --dynamic-table %t3.a64 | FileCheck --check-prefix=ANDROID64-HEADERS %s
-// RUN: llvm-readobj -r %t3.a64 | FileCheck --check-prefix=ANDROID64 %s
-
-// ANDROID64-HEADERS:       Index: 1
-// ANDROID64-HEADERS-NEXT:  Name: .dynsym
-
-// ANDROID64-HEADERS:       Name: .rela.dyn
-// ANDROID64-HEADERS-NEXT:  Type: SHT_ANDROID_RELA
-// ANDROID64-HEADERS-NEXT:  Flags [ (0x2)
-// ANDROID64-HEADERS-NEXT:    SHF_ALLOC (0x2)
-// ANDROID64-HEADERS-NEXT:  ]
-// ANDROID64-HEADERS-NEXT:  Address: [[ADDR:.*]]
-// ANDROID64-HEADERS-NEXT:  Offset: [[ADDR]]
-// ANDROID64-HEADERS-NEXT:  Size: [[SIZE:.*]]
-// ANDROID64-HEADERS-NEXT:  Link: 1
-// ANDROID64-HEADERS-NEXT:  Info: 0
-// ANDROID64-HEADERS-NEXT:  AddressAlignment: 8
-// ANDROID64-HEADERS-NEXT:  EntrySize: 1
+// RUN: llvm-readelf -S -d -r %t3.a64 | FileCheck --check-prefix=ANDROID64 %s
+
+// ANDROID64:      Name              Type            Address          Off    Size   ES Flg Lk Inf Al
+// ANDROID64:      .dynstr           STRTAB          {{.*}}                         00   A  0   0  1
+// ANDROID64-NEXT: .rela.dyn         ANDROID_RELA    {{0*}}[[#%x,ANDROID:]] {{.*}}  01   A  1   0  8
+// ANDROID64-NEXT: .text             PROGBITS        {{.*}}                         00  AX  0   0  4
+
+// ANDROID64:      (DEBUG)           0x0
+// ANDROID64-NEXT: (ANDROID_RELA)    0x[[#ANDROID]]
+// ANDROID64-NEXT: (ANDROID_RELASZ)  122 (bytes)
+// ANDROID64-NEXT: (RELAENT)         24 (bytes)
 
 // ANDROID64-HEADERS: 0x0000000060000011 ANDROID_RELA          [[ADDR]]
 // ANDROID64-HEADERS: 0x0000000060000012 ANDROID_RELASZ        [[SIZE]]
 
-// ANDROID64:          Section ({{.+}}) .rela.dyn {
-// ANDROID64-NEXT:     0x303E0 R_AARCH64_RELATIVE - 0x1
-// ANDROID64-NEXT:     0x303E8 R_AARCH64_RELATIVE - 0x2
-// ANDROID64-NEXT:     0x303F0 R_AARCH64_RELATIVE - 0x3
-// ANDROID64-NEXT:     0x303F8 R_AARCH64_RELATIVE - 0x4
-// ANDROID64-NEXT:     0x30400 R_AARCH64_RELATIVE - 0x5
-// ANDROID64-NEXT:     0x30408 R_AARCH64_RELATIVE - 0x6
-// ANDROID64-NEXT:     0x30410 R_AARCH64_RELATIVE - 0x7
-// ANDROID64-NEXT:     0x30418 R_AARCH64_RELATIVE - 0x8
-// ANDROID64-NEXT:     0x30470 R_AARCH64_RELATIVE - 0x1
-// ANDROID64-NEXT:     0x30478 R_AARCH64_RELATIVE - 0x2
-// ANDROID64-NEXT:     0x30480 R_AARCH64_RELATIVE - 0x3
-// ANDROID64-NEXT:     0x30488 R_AARCH64_RELATIVE - 0x4
-// ANDROID64-NEXT:     0x30490 R_AARCH64_RELATIVE - 0x5
-// ANDROID64-NEXT:     0x30498 R_AARCH64_RELATIVE - 0x6
-// ANDROID64-NEXT:     0x304A0 R_AARCH64_RELATIVE - 0x7
-// ANDROID64-NEXT:     0x304A8 R_AARCH64_RELATIVE - 0x8
-// ANDROID64-NEXT:     0x304B0 R_AARCH64_RELATIVE - 0x9
-
-// ANDROID64-NEXT:     0x30428 R_AARCH64_RELATIVE - 0x1
-// ANDROID64-NEXT:     0x30430 R_AARCH64_RELATIVE - 0x2
-// ANDROID64-NEXT:     0x30438 R_AARCH64_RELATIVE - 0x3
-// ANDROID64-NEXT:     0x30440 R_AARCH64_RELATIVE - 0x4
-// ANDROID64-NEXT:     0x30448 R_AARCH64_RELATIVE - 0x5
-// ANDROID64-NEXT:     0x30450 R_AARCH64_RELATIVE - 0x6
-// ANDROID64-NEXT:     0x30458 R_AARCH64_RELATIVE - 0x7
-
-// ANDROID64-NEXT:     0x304B9 R_AARCH64_RELATIVE - 0xA
-
-// ANDROID64-NEXT:     0x30468 R_AARCH64_ABS64 bar2 0x0
-// ANDROID64-NEXT:     0x304C1 R_AARCH64_ABS64 bar2 0x0
-// ANDROID64-NEXT:     0x304C9 R_AARCH64_ABS64 bar2 0x0
-// ANDROID64-NEXT:     0x304E1 R_AARCH64_ABS64 bar2 0x0
-// ANDROID64-NEXT:     0x30420 R_AARCH64_ABS64 bar2 0x1
-// ANDROID64-NEXT:     0x30460 R_AARCH64_ABS64 zed2 0x0
-// ANDROID64-NEXT:     0x304D1 R_AARCH64_ABS64 bar2 0x1
-// ANDROID64-NEXT:     0x304D9 R_AARCH64_ABS64 bar2 0x1
-// ANDROID64-NEXT:     }
+// ANDROID64:       Relocation section '.rela.dyn' at offset {{.*}} contains 33 entries:
+// ANDROID64-NEXT:      Offset             Info             Type               Symbol's Value  Symbol's Name + Addend
+// ANDROID64-NEXT:  00000000000303e8  0000000000000403 R_AARCH64_RELATIVE                0
+// ANDROID64-NEXT:  00000000000303f0  0000000000000403 R_AARCH64_RELATIVE                1
+// ANDROID64-NEXT:  00000000000303f8  0000000000000403 R_AARCH64_RELATIVE                2
+// ANDROID64-NEXT:  0000000000030400  0000000000000403 R_AARCH64_RELATIVE                ffffffffffffffff
+// ANDROID64-NEXT:  0000000000030408  0000000000000403 R_AARCH64_RELATIVE                80000000
+// ANDROID64-NEXT:  0000000000030410  0000000000000403 R_AARCH64_RELATIVE                6
+// ANDROID64-NEXT:  0000000000030418  0000000000000403 R_AARCH64_RELATIVE                7
+// ANDROID64-NEXT:  0000000000030420  0000000000000403 R_AARCH64_RELATIVE                8
+// ANDROID64-NEXT:  0000000000030478  0000000000000403 R_AARCH64_RELATIVE                1
+// ANDROID64-NEXT:  0000000000030480  0000000000000403 R_AARCH64_RELATIVE                2
+// ANDROID64-NEXT:  0000000000030488  0000000000000403 R_AARCH64_RELATIVE                3
+// ANDROID64-NEXT:  0000000000030490  0000000000000403 R_AARCH64_RELATIVE                4
+// ANDROID64-NEXT:  0000000000030498  0000000000000403 R_AARCH64_RELATIVE                5
+// ANDROID64-NEXT:  00000000000304a0  0000000000000403 R_AARCH64_RELATIVE                6
+// ANDROID64-NEXT:  00000000000304a8  0000000000000403 R_AARCH64_RELATIVE                7
+// ANDROID64-NEXT:  00000000000304b0  0000000000000403 R_AARCH64_RELATIVE                8
+// ANDROID64-NEXT:  00000000000304b8  0000000000000403 R_AARCH64_RELATIVE                9
+// ANDROID64-NEXT:  0000000000030430  0000000000000403 R_AARCH64_RELATIVE                1
+// ANDROID64-NEXT:  0000000000030438  0000000000000403 R_AARCH64_RELATIVE                2
+// ANDROID64-NEXT:  0000000000030440  0000000000000403 R_AARCH64_RELATIVE                3
+// ANDROID64-NEXT:  0000000000030448  0000000000000403 R_AARCH64_RELATIVE                4
+// ANDROID64-NEXT:  0000000000030450  0000000000000403 R_AARCH64_RELATIVE                5
+// ANDROID64-NEXT:  0000000000030458  0000000000000403 R_AARCH64_RELATIVE                6
+// ANDROID64-NEXT:  0000000000030460  0000000000000403 R_AARCH64_RELATIVE                7
+// ANDROID64-NEXT:  00000000000304c1  0000000000000403 R_AARCH64_RELATIVE                a
+// ANDROID64-NEXT:  0000000000030470  0000000100000101 R_AARCH64_ABS64        0000000000000000 bar2 + 0
+// ANDROID64-NEXT:  00000000000304c9  0000000100000101 R_AARCH64_ABS64        0000000000000000 bar2 + 0
+// ANDROID64-NEXT:  00000000000304d1  0000000100000101 R_AARCH64_ABS64        0000000000000000 bar2 + 0
+// ANDROID64-NEXT:  00000000000304e9  0000000100000101 R_AARCH64_ABS64        0000000000000000 bar2 + 0
+// ANDROID64-NEXT:  0000000000030428  0000000100000101 R_AARCH64_ABS64        0000000000000000 bar2 + 1
+// ANDROID64-NEXT:  0000000000030468  0000000200000101 R_AARCH64_ABS64        0000000000000000 zed2 + 0
+// ANDROID64-NEXT:  00000000000304d9  0000000100000101 R_AARCH64_ABS64        0000000000000000 bar2 + 1
+// ANDROID64-NEXT:  00000000000304e1  0000000100000101 R_AARCH64_ABS64        0000000000000000 bar2 + 1
+// ANDROID64-EMPTY:
 
 // RUN: ld.lld -pie --pack-dyn-relocs=relr %t.a64.o %t.a64.so -o %t4.a64
-// RUN: llvm-readobj -S --dynamic-table %t4.a64 | FileCheck --check-prefix=RELR64-HEADERS %s
+// RUN: llvm-readelf -Sdr -x .data %t4.a64 | FileCheck --check-prefix=RELR64 %s
 // RUN: llvm-readobj -r --raw-relr %t4.a64 | FileCheck --check-prefix=RAW-RELR64 %s
-// RUN: llvm-readobj -r %t4.a64 | FileCheck --check-prefix=RELR64 %s
-
-// RELR64-HEADERS:       Index: 1
-// RELR64-HEADERS-NEXT:  Name: .dynsym
-
-// RELR64-HEADERS:       Name: .relr.dyn
-// RELR64-HEADERS-NEXT:  Type: SHT_RELR
-// RELR64-HEADERS-NEXT:  Flags [ (0x2)
-// RELR64-HEADERS-NEXT:    SHF_ALLOC (0x2)
-// RELR64-HEADERS-NEXT:  ]
-// RELR64-HEADERS-NEXT:  Address: [[ADDR:.*]]
-// RELR64-HEADERS-NEXT:  Offset: [[ADDR]]
-// RELR64-HEADERS-NEXT:  Size: 16
-// RELR64-HEADERS-NEXT:  Link: 0
-// RELR64-HEADERS-NEXT:  Info: 0
-// RELR64-HEADERS-NEXT:  AddressAlignment: 8
-// RELR64-HEADERS-NEXT:  EntrySize: 8
-
-// RELR64-HEADERS:       0x0000000000000024 RELR                 [[ADDR]]
-// RELR64-HEADERS:       0x0000000000000023 RELRSZ               16 (bytes)
-// RELR64-HEADERS:       0x0000000000000025 RELRENT              8 (bytes)
+
+// RELR64:      Name              Type            Address          Off    Size   ES Flg Lk Inf Al
+// RELR64:      .dynstr           STRTAB          {{.*}}                         00   A  0   0  1
+// RELR64-NEXT: .rela.dyn         RELA            {{.*}}                         18   A  1   0  8
+// RELR64-NEXT: .relr.dyn         RELR            {{0*}}[[#%x,RELR:]] {{.*}}     08   A  0   0  8
+// RELR64-NEXT: .text             PROGBITS        0000000000010380 000380 000000 00  AX  0   0  4
+
+// RELR64:      (RELACOUNT)   1
+// RELR64:      (RELR)        0x[[#RELR]]
+// RELR64-NEXT: (RELRSZ)      16 (bytes)
+// RELR64-NEXT: (RELRENT)     8 (bytes)
 
 /// SHT_RELR section contains address/bitmap entries
 /// encoding the offsets for relative relocation.
@@ -334,51 +315,57 @@
 /// Decoded SHT_RELR section is same as UNPACKED,
 /// but contains only the relative relocations.
 /// Any relative relocations with odd offset stay in SHT_RELA.
-// RELR64:      Section ({{.+}}) .rela.dyn {
-// RELR64-NEXT:   0x30569 R_AARCH64_RELATIVE - 0xA
-// RELR64-NEXT:   0x304D0 R_AARCH64_ABS64 bar2 0x1
-// RELR64-NEXT:   0x30518 R_AARCH64_ABS64 bar2 0x0
-// RELR64-NEXT:   0x30571 R_AARCH64_ABS64 bar2 0x0
-// RELR64-NEXT:   0x30579 R_AARCH64_ABS64 bar2 0x0
-// RELR64-NEXT:   0x30581 R_AARCH64_ABS64 bar2 0x1
-// RELR64-NEXT:   0x30589 R_AARCH64_ABS64 bar2 0x1
-// RELR64-NEXT:   0x30591 R_AARCH64_ABS64 bar2 0x0
-// RELR64-NEXT:   0x30510 R_AARCH64_ABS64 zed2 0x0
-// RELR64-NEXT: }
-// RELR64-NEXT: Section ({{.+}}) .relr.dyn {
-// RELR64-NEXT:   0x30490 R_AARCH64_RELATIVE -
-// RELR64-NEXT:   0x30498 R_AARCH64_RELATIVE -
-// RELR64-NEXT:   0x304A0 R_AARCH64_RELATIVE -
-// RELR64-NEXT:   0x304A8 R_AARCH64_RELATIVE -
-// RELR64-NEXT:   0x304B0 R_AARCH64_RELATIVE -
-// RELR64-NEXT:   0x304B8 R_AARCH64_RELATIVE -
-// RELR64-NEXT:   0x304C0 R_AARCH64_RELATIVE -
-// RELR64-NEXT:   0x304C8 R_AARCH64_RELATIVE -
-// RELR64-NEXT:   0x304D8 R_AARCH64_RELATIVE -
-// RELR64-NEXT:   0x304E0 R_AARCH64_RELATIVE -
-// RELR64-NEXT:   0x304E8 R_AARCH64_RELATIVE -
-// RELR64-NEXT:   0x304F0 R_AARCH64_RELATIVE -
-// RELR64-NEXT:   0x304F8 R_AARCH64_RELATIVE -
-// RELR64-NEXT:   0x30500 R_AARCH64_RELATIVE -
-// RELR64-NEXT:   0x30508 R_AARCH64_RELATIVE -
-// RELR64-NEXT:   0x30520 R_AARCH64_RELATIVE -
-// RELR64-NEXT:   0x30528 R_AARCH64_RELATIVE -
-// RELR64-NEXT:   0x30530 R_AARCH64_RELATIVE -
-// RELR64-NEXT:   0x30538 R_AARCH64_RELATIVE -
-// RELR64-NEXT:   0x30540 R_AARCH64_RELATIVE -
-// RELR64-NEXT:   0x30548 R_AARCH64_RELATIVE -
-// RELR64-NEXT:   0x30550 R_AARCH64_RELATIVE -
-// RELR64-NEXT:   0x30558 R_AARCH64_RELATIVE -
-// RELR64-NEXT:   0x30560 R_AARCH64_RELATIVE -
-// RELR64-NEXT: }
+// RELR64:       Relocation section '.rela.dyn' at offset {{.*}} contains 9 entries:
+// RELR64-NEXT:      Offset             Info             Type               Symbol's Value  Symbol's Name + Addend
+// RELR64-NEXT:  0000000000030569  0000000000000403 R_AARCH64_RELATIVE                a
+// RELR64-NEXT:  00000000000304d0  0000000100000101 R_AARCH64_ABS64        0000000000000000 bar2 + 1
+// RELR64-NEXT:  0000000000030518  0000000100000101 R_AARCH64_ABS64        0000000000000000 bar2 + 0
+// RELR64-NEXT:  0000000000030571  0000000100000101 R_AARCH64_ABS64        0000000000000000 bar2 + 0
+// RELR64-NEXT:  0000000000030579  0000000100000101 R_AARCH64_ABS64        0000000000000000 bar2 + 0
+// RELR64-NEXT:  0000000000030581  0000000100000101 R_AARCH64_ABS64        0000000000000000 bar2 + 1
+// RELR64-NEXT:  0000000000030589  0000000100000101 R_AARCH64_ABS64        0000000000000000 bar2 + 1
+// RELR64-NEXT:  0000000000030591  0000000100000101 R_AARCH64_ABS64        0000000000000000 bar2 + 0
+// RELR64-NEXT:  0000000000030510  0000000200000101 R_AARCH64_ABS64        0000000000000000 zed2 + 0
+// RELR64-EMPTY: 
+// RELR64-NEXT:  Relocation section '.relr.dyn' at offset {{.*}} contains 24 entries:
+// RELR64-NEXT:      Offset             Info             Type               Symbol's Value  Symbol's Name
+// RELR64-NEXT:  0000000000030490  0000000000000403 R_AARCH64_RELATIVE
+// RELR64-NEXT:  0000000000030498  0000000000000403 R_AARCH64_RELATIVE
+// RELR64-NEXT:  00000000000304a0  0000000000000403 R_AARCH64_RELATIVE
+// RELR64-NEXT:  00000000000304a8  0000000000000403 R_AARCH64_RELATIVE
+// RELR64-NEXT:  00000000000304b0  0000000000000403 R_AARCH64_RELATIVE
+// RELR64-NEXT:  00000000000304b8  0000000000000403 R_AARCH64_RELATIVE
+// RELR64-NEXT:  00000000000304c0  0000000000000403 R_AARCH64_RELATIVE
+// RELR64-NEXT:  00000000000304c8  0000000000000403 R_AARCH64_RELATIVE
+// RELR64-NEXT:  00000000000304d8  0000000000000403 R_AARCH64_RELATIVE
+// RELR64-NEXT:  00000000000304e0  0000000000000403 R_AARCH64_RELATIVE
+// RELR64-NEXT:  00000000000304e8  0000000000000403 R_AARCH64_RELATIVE
+// RELR64-NEXT:  00000000000304f0  0000000000000403 R_AARCH64_RELATIVE
+// RELR64-NEXT:  00000000000304f8  0000000000000403 R_AARCH64_RELATIVE
+// RELR64-NEXT:  0000000000030500  0000000000000403 R_AARCH64_RELATIVE
+// RELR64-NEXT:  0000000000030508  0000000000000403 R_AARCH64_RELATIVE
+// RELR64-NEXT:  0000000000030520  0000000000000403 R_AARCH64_RELATIVE
+// RELR64-NEXT:  0000000000030528  0000000000000403 R_AARCH64_RELATIVE
+// RELR64-NEXT:  0000000000030530  0000000000000403 R_AARCH64_RELATIVE
+// RELR64-NEXT:  0000000000030538  0000000000000403 R_AARCH64_RELATIVE
+// RELR64-NEXT:  0000000000030540  0000000000000403 R_AARCH64_RELATIVE
+// RELR64-NEXT:  0000000000030548  0000000000000403 R_AARCH64_RELATIVE
+// RELR64-NEXT:  0000000000030550  0000000000000403 R_AARCH64_RELATIVE
+// RELR64-NEXT:  0000000000030558  0000000000000403 R_AARCH64_RELATIVE
+// RELR64-NEXT:  0000000000030560  0000000000000403 R_AARCH64_RELATIVE
+// RELR64-EMPTY:
+// RELR64-NEXT: Hex dump of section '.data':
+// RELR64-NEXT: 0x00030490 00000000 00000000 01000000 00000000 .
+// RELR64-NEXT: 0x000304a0 02000000 00000000 ffffffff ffffffff .
+// RELR64-NEXT: 0x000304b0 00000080 00000000 06000000 00000000 .
 
 .data
-.align 2
+.balign 2
+.dc.a __ehdr_start
 .dc.a __ehdr_start + 1
 .dc.a __ehdr_start + 2
-.dc.a __ehdr_start + 3
-.dc.a __ehdr_start + 4
-.dc.a __ehdr_start + 5
+.dc.a __ehdr_start - 1
+.dc.a __ehdr_start + 0x80000000
 .dc.a __ehdr_start + 6
 .dc.a __ehdr_start + 7
 .dc.a __ehdr_start + 8
diff --git a/lld/test/ELF/unknown-section.test b/lld/test/ELF/unknown-section.test
new file mode 100644
index 00000000000000..f6ecca29a22aed
--- /dev/null
+++ b/lld/test/ELF/unknown-section.test
@@ -0,0 +1,48 @@
+# RUN: rm -rf %t && mkdir %t && cd %t
+# RUN: yaml2obj %s -o a.o
+# RUN: not ld.lld a.o -o /dev/null 2>&1 | FileCheck %s --implicit-check-not=error:
+
+# CHECK:      error: a.o:(relr): unknown section type 0x13
+# CHECK-NEXT: error: a.o:(regular): unknown section type 0x15
+# CHECK-NEXT: error: a.o:(loos_nonconforming): unknown section type 0x60000000
+# CHECK-NEXT: error: a.o:(hios_nonconforming): unknown section type 0x6fffffff
+# CHECK-NEXT: error: a.o:(louser_alloc): unknown section type 0x80000000
+# CHECK-NEXT: error: a.o:(hiuser_alloc): unknown section type 0xffffffff
+
+--- !ELF
+FileHeader:
+  Class:           ELFCLASS64
+  Data:            ELFDATA2LSB
+  Type:            ET_REL
+  Machine:         EM_X86_64
+Sections:
+  - Name:  relr
+    Type:  19
+  - Name:  regular
+    Type:  21
+  - Name:  loos
+    Type:  0x60000000
+  - Name:  hios
+    Type:  0x6fffffff
+  - Name:  loos_nonconforming
+    Type:  0x60000000
+    Flags: [ SHF_OS_NONCONFORMING ]
+  - Name:  hios_nonconforming
+    Type:  0x6fffffff
+    Flags: [ SHF_OS_NONCONFORMING ]
+
+  - Name:  loproc
+    Type:  0x70000000
+  - Name:  hiproc
+    Type:  0x7fffffff
+
+  - Name:  louser
+    Type:  0x80000000
+  - Name:  hiuser
+    Type:  0xffffffff
+  - Name:  louser_alloc
+    Type:  0x80000000
+    Flags: [ SHF_ALLOC ]
+  - Name:  hiuser_alloc
+    Type:  0xffffffff
+    Flags: [ SHF_ALLOC ]
diff --git a/lld/test/MachO/objc-category-merging-complete-test.s b/lld/test/MachO/objc-category-merging-complete-test.s
new file mode 100644
index 00000000000000..3bc3ca26b6ae6c
--- /dev/null
+++ b/lld/test/MachO/objc-category-merging-complete-test.s
@@ -0,0 +1,762 @@
+# REQUIRES: aarch64
+# RUN: rm -rf %t; split-file %s %t && cd %t
+
+## Create a dylib to link against(a64_file1.dylib) and merge categories in the main binary (file2_merge_a64.exe)
+# RUN: llvm-mc -filetype=obj -triple=arm64-apple-macos -o a64_file1.o a64_file1.s
+# RUN: %lld -arch arm64 a64_file1.o -o a64_file1.dylib -dylib
+
+# RUN: llvm-mc -filetype=obj -triple=arm64-apple-macos -o a64_file2.o a64_file2.s
+# RUN: %lld -arch arm64 -o a64_file2_no_merge.exe a64_file1.dylib a64_file2.o
+# RUN: %lld -arch arm64 -o a64_file2_merge.exe -objc_category_merging a64_file1.dylib a64_file2.o
+
+# RUN: llvm-objdump --objc-meta-data --macho a64_file2_no_merge.exe | FileCheck %s --check-prefixes=NO_MERGE_CATS
+# RUN: llvm-objdump --objc-meta-data --macho a64_file2_merge.exe | FileCheck %s --check-prefixes=MERGE_CATS
+
+
+MERGE_CATS:     __OBJC_$_CATEGORY_MyBaseClass_$_(Category02|Category03)
+MERGE_CATS-NEXT:              name {{.*}} Category02|Category03
+MERGE_CATS:           instanceMethods
+MERGE_CATS-NEXT:           entsize 24
+MERGE_CATS-NEXT:             count 4
+MERGE_CATS-NEXT:              name {{.*}} class02InstanceMethod
+MERGE_CATS-NEXT:             types {{.*}} v16@0:8
+MERGE_CATS-NEXT:               imp -[MyBaseClass(Category02) class02InstanceMethod]
+MERGE_CATS-NEXT:              name {{.*}} myProtocol02Method
+MERGE_CATS-NEXT:             types {{.*}} v16@0:8
+MERGE_CATS-NEXT:               imp -[MyBaseClass(Category02) myProtocol02Method]
+MERGE_CATS-NEXT:              name {{.*}} class03InstanceMethod
+MERGE_CATS-NEXT:             types {{.*}} v16@0:8
+MERGE_CATS-NEXT:               imp -[MyBaseClass(Category03) class03InstanceMethod]
+MERGE_CATS-NEXT:              name {{.*}} myProtocol03Method
+MERGE_CATS-NEXT:             types {{.*}} v16@0:8
+MERGE_CATS-NEXT:               imp -[MyBaseClass(Category03) myProtocol03Method]
+MERGE_CATS-NEXT:      classMethods {{.*}}
+MERGE_CATS-NEXT:           entsize 24
+MERGE_CATS-NEXT:             count 4
+MERGE_CATS-NEXT:              name {{.*}} class02ClassMethod
+MERGE_CATS-NEXT:             types {{.*}} v16@0:8
+MERGE_CATS-NEXT:               imp +[MyBaseClass(Category02) class02ClassMethod]
+MERGE_CATS-NEXT:              name {{.*}} MyProtocol02Prop
+MERGE_CATS-NEXT:             types {{.*}} i16@0:8
+MERGE_CATS-NEXT:               imp +[MyBaseClass(Category02) MyProtocol02Prop]
+MERGE_CATS-NEXT:              name {{.*}} class03ClassMethod
+MERGE_CATS-NEXT:             types {{.*}} v16@0:8
+MERGE_CATS-NEXT:               imp +[MyBaseClass(Category03) class03ClassMethod]
+MERGE_CATS-NEXT:              name {{.*}} MyProtocol03Prop
+MERGE_CATS-NEXT:             types {{.*}} i16@0:8
+MERGE_CATS-NEXT:               imp +[MyBaseClass(Category03) MyProtocol03Prop]
+MERGE_CATS-NEXT:         protocols
+MERGE_CATS-NEXT:                      count 2
+MERGE_CATS-NEXT:              list[0] {{.*}} (struct protocol_t *)
+MERGE_CATS-NEXT:                  isa 0x0
+MERGE_CATS-NEXT:                 name {{.*}} MyProtocol02
+MERGE_CATS-NEXT:            protocols 0x0
+MERGE_CATS-NEXT:          instanceMethods
+MERGE_CATS-NEXT:               entsize 24
+MERGE_CATS-NEXT:                 count 2
+MERGE_CATS-NEXT:                  name {{.*}} myProtocol02Method
+MERGE_CATS-NEXT:                 types {{.*}} v16@0:8
+MERGE_CATS-NEXT:                   imp 0x0
+MERGE_CATS-NEXT:                  name {{.*}} MyProtocol02Prop
+MERGE_CATS-NEXT:                 types {{.*}} i16@0:8
+MERGE_CATS-NEXT:                   imp 0x0
+MERGE_CATS-NEXT:             classMethods
+MERGE_CATS-NEXT:      optionalInstanceMethods 0x0
+MERGE_CATS-NEXT:         optionalClassMethods 0x0
+MERGE_CATS-NEXT:           instanceProperties {{.*}}
+MERGE_CATS-NEXT:              list[1] {{.*}}
+MERGE_CATS-NEXT:                  isa 0x0
+MERGE_CATS-NEXT:                 name {{.*}} MyProtocol03
+MERGE_CATS-NEXT:            protocols 0x0
+MERGE_CATS-NEXT:          instanceMethods
+MERGE_CATS-NEXT:               entsize 24
+MERGE_CATS-NEXT:                 count 2
+MERGE_CATS-NEXT:                  name {{.*}} myProtocol03Method
+MERGE_CATS-NEXT:                 types {{.*}} v16@0:8
+MERGE_CATS-NEXT:                   imp 0x0
+MERGE_CATS-NEXT:                  name {{.*}} MyProtocol03Prop
+MERGE_CATS-NEXT:                 types {{.*}} i16@0:8
+MERGE_CATS-NEXT:                   imp 0x0
+MERGE_CATS-NEXT:             classMethods 0x0
+MERGE_CATS-NEXT:      optionalInstanceMethods 0x0
+MERGE_CATS-NEXT:         optionalClassMethods 0x0
+MERGE_CATS-NEXT:           instanceProperties {{.*}}
+MERGE_CATS-NEXT:      instanceProperties
+MERGE_CATS-NEXT:                    entsize 16
+MERGE_CATS-NEXT:                      count 2
+MERGE_CATS-NEXT:                 name {{.*}} MyProtocol02Prop
+MERGE_CATS-NEXT:            attributes {{.*}} Ti,R,D
+MERGE_CATS-NEXT:                 name {{.*}} MyProtocol03Prop
+MERGE_CATS-NEXT:            attributes {{.*}} Ti,R,D
+
+
+NO_MERGE_CATS-NOT: __OBJC_$_CATEGORY_MyBaseClass_$_(Category02|Category03)
+NO_MERGE_CATS: __OBJC_$_CATEGORY_MyBaseClass_$_Category02
+NO_MERGE_CATS: instanceMethods
+NO_MERGE_CATS-NEXT: 24
+NO_MERGE_CATS-NEXT: 2
+NO_MERGE_CATS: classMethods
+NO_MERGE_CATS-NEXT: 24
+NO_MERGE_CATS-NEXT: 2
+
+
+#--- a64_file1.s
+
+## @protocol MyProtocol01
+## - (void)myProtocol01Method;
+## @property (nonatomic) int MyProtocol01Prop;
+## @end
+##
+## __attribute__((objc_root_class))
+## @interface MyBaseClass<MyProtocol01>
+## - (void)baseInstanceMethod;
+## - (void)myProtocol01Method;
+## + (void)baseClassMethod;
+## @end
+##
+## @implementation MyBaseClass
+## @synthesize MyProtocol01Prop;
+## - (void)baseInstanceMethod {}
+## - (void)myProtocol01Method {}
+## + (void)baseClassMethod {}
+## @end
+##
+## void *_objc_empty_cache;
+
+	.section	__TEXT,__text,regular,pure_instructions
+	.p2align	2                               ; -- Begin function -[MyBaseClass baseInstanceMethod]
+"-[MyBaseClass baseInstanceMethod]":    ; @"\01-[MyBaseClass baseInstanceMethod]"
+	.cfi_startproc
+; %bb.0:                                ; %entry
+	ret
+	.cfi_endproc
+                                        ; -- End function
+	.p2align	2                               ; -- Begin function -[MyBaseClass myProtocol01Method]
+"-[MyBaseClass myProtocol01Method]":    ; @"\01-[MyBaseClass myProtocol01Method]"
+	.cfi_startproc
+; %bb.0:                                ; %entry
+	ret
+	.cfi_endproc
+                                        ; -- End function
+	.p2align	2                               ; -- Begin function +[MyBaseClass baseClassMethod]
+"+[MyBaseClass baseClassMethod]":       ; @"\01+[MyBaseClass baseClassMethod]"
+	.cfi_startproc
+; %bb.0:                                ; %entry
+	ret
+	.cfi_endproc
+                                        ; -- End function
+	.p2align	2                               ; -- Begin function -[MyBaseClass MyProtocol01Prop]
+"-[MyBaseClass MyProtocol01Prop]":      ; @"\01-[MyBaseClass MyProtocol01Prop]"
+	.cfi_startproc
+; %bb.0:                                ; %entry
+Lloh0:
+	adrp	x8, _OBJC_IVAR_$_MyBaseClass.MyProtocol01Prop@PAGE
+Lloh1:
+	ldrsw	x8, [x8, _OBJC_IVAR_$_MyBaseClass.MyProtocol01Prop@PAGEOFF]
+	ldr	w0, [x0, x8]
+	ret
+	.loh AdrpLdr	Lloh0, Lloh1
+	.cfi_endproc
+                                        ; -- End function
+	.p2align	2                               ; -- Begin function -[MyBaseClass setMyProtocol01Prop:]
+"-[MyBaseClass setMyProtocol01Prop:]":  ; @"\01-[MyBaseClass setMyProtocol01Prop:]"
+	.cfi_startproc
+; %bb.0:                                ; %entry
+Lloh2:
+	adrp	x8, _OBJC_IVAR_$_MyBaseClass.MyProtocol01Prop@PAGE
+Lloh3:
+	ldrsw	x8, [x8, _OBJC_IVAR_$_MyBaseClass.MyProtocol01Prop@PAGEOFF]
+	str	w2, [x0, x8]
+	ret
+	.loh AdrpLdr	Lloh2, Lloh3
+	.cfi_endproc
+                                        ; -- End function
+	.private_extern	_OBJC_IVAR_$_MyBaseClass.MyProtocol01Prop ; @"OBJC_IVAR_$_MyBaseClass.MyProtocol01Prop"
+	.section	__DATA,__objc_ivar
+	.globl	_OBJC_IVAR_$_MyBaseClass.MyProtocol01Prop
+	.p2align	2, 0x0
+_OBJC_IVAR_$_MyBaseClass.MyProtocol01Prop:
+	.long	0                               ; 0x0
+	.section	__DATA,__objc_data
+	.globl	_OBJC_CLASS_$_MyBaseClass       ; @"OBJC_CLASS_$_MyBaseClass"
+	.p2align	3, 0x0
+_OBJC_CLASS_$_MyBaseClass:
+	.quad	_OBJC_METACLASS_$_MyBaseClass
+	.quad	0
+	.quad	__objc_empty_cache
+	.quad	0
+	.quad	__OBJC_CLASS_RO_$_MyBaseClass
+	.globl	_OBJC_METACLASS_$_MyBaseClass   ; @"OBJC_METACLASS_$_MyBaseClass"
+	.p2align	3, 0x0
+_OBJC_METACLASS_$_MyBaseClass:
+	.quad	_OBJC_METACLASS_$_MyBaseClass
+	.quad	_OBJC_CLASS_$_MyBaseClass
+	.quad	__objc_empty_cache
+	.quad	0
+	.quad	__OBJC_METACLASS_RO_$_MyBaseClass
+	.section	__TEXT,__objc_classname,cstring_literals
+l_OBJC_CLASS_NAME_:                     ; @OBJC_CLASS_NAME_
+	.asciz	"MyBaseClass"
+	.section	__TEXT,__objc_methname,cstring_literals
+l_OBJC_METH_VAR_NAME_:                  ; @OBJC_METH_VAR_NAME_
+	.asciz	"baseClassMethod"
+	.section	__TEXT,__objc_methtype,cstring_literals
+l_OBJC_METH_VAR_TYPE_:                  ; @OBJC_METH_VAR_TYPE_
+	.asciz	"v16@0:8"
+	.section	__DATA,__objc_const
+	.p2align	3, 0x0                          ; @"_OBJC_$_CLASS_METHODS_MyBaseClass"
+__OBJC_$_CLASS_METHODS_MyBaseClass:
+	.long	24                              ; 0x18
+	.long	1                               ; 0x1
+	.quad	l_OBJC_METH_VAR_NAME_
+	.quad	l_OBJC_METH_VAR_TYPE_
+	.quad	"+[MyBaseClass baseClassMethod]"
+	.section	__TEXT,__objc_classname,cstring_literals
+l_OBJC_CLASS_NAME_.1:                   ; @OBJC_CLASS_NAME_.1
+	.asciz	"MyProtocol01"
+	.section	__TEXT,__objc_methname,cstring_literals
+l_OBJC_METH_VAR_NAME_.2:                ; @OBJC_METH_VAR_NAME_.2
+	.asciz	"myProtocol01Method"
+l_OBJC_METH_VAR_NAME_.3:                ; @OBJC_METH_VAR_NAME_.3
+	.asciz	"MyProtocol01Prop"
+	.section	__TEXT,__objc_methtype,cstring_literals
+l_OBJC_METH_VAR_TYPE_.4:                ; @OBJC_METH_VAR_TYPE_.4
+	.asciz	"i16@0:8"
+	.section	__TEXT,__objc_methname,cstring_literals
+l_OBJC_METH_VAR_NAME_.5:                ; @OBJC_METH_VAR_NAME_.5
+	.asciz	"setMyProtocol01Prop:"
+	.section	__TEXT,__objc_methtype,cstring_literals
+l_OBJC_METH_VAR_TYPE_.6:                ; @OBJC_METH_VAR_TYPE_.6
+	.asciz	"v20@0:8i16"
+	.section	__DATA,__objc_const
+	.p2align	3, 0x0                          ; @"_OBJC_$_PROTOCOL_INSTANCE_METHODS_MyProtocol01"
+__OBJC_$_PROTOCOL_INSTANCE_METHODS_MyProtocol01:
+	.long	24                              ; 0x18
+	.long	3                               ; 0x3
+	.quad	l_OBJC_METH_VAR_NAME_.2
+	.quad	l_OBJC_METH_VAR_TYPE_
+	.quad	0
+	.quad	l_OBJC_METH_VAR_NAME_.3
+	.quad	l_OBJC_METH_VAR_TYPE_.4
+	.quad	0
+	.quad	l_OBJC_METH_VAR_NAME_.5
+	.quad	l_OBJC_METH_VAR_TYPE_.6
+	.quad	0
+	.section	__TEXT,__objc_methname,cstring_literals
+l_OBJC_PROP_NAME_ATTR_:                 ; @OBJC_PROP_NAME_ATTR_
+	.asciz	"MyProtocol01Prop"
+l_OBJC_PROP_NAME_ATTR_.7:               ; @OBJC_PROP_NAME_ATTR_.7
+	.asciz	"Ti,N"
+	.section	__DATA,__objc_const
+	.p2align	3, 0x0                          ; @"_OBJC_$_PROP_LIST_MyProtocol01"
+__OBJC_$_PROP_LIST_MyProtocol01:
+	.long	16                              ; 0x10
+	.long	1                               ; 0x1
+	.quad	l_OBJC_PROP_NAME_ATTR_
+	.quad	l_OBJC_PROP_NAME_ATTR_.7
+	.p2align	3, 0x0                          ; @"_OBJC_$_PROTOCOL_METHOD_TYPES_MyProtocol01"
+__OBJC_$_PROTOCOL_METHOD_TYPES_MyProtocol01:
+	.quad	l_OBJC_METH_VAR_TYPE_
+	.quad	l_OBJC_METH_VAR_TYPE_.4
+	.quad	l_OBJC_METH_VAR_TYPE_.6
+	.private_extern	__OBJC_PROTOCOL_$_MyProtocol01 ; @"_OBJC_PROTOCOL_$_MyProtocol01"
+	.section	__DATA,__data
+	.globl	__OBJC_PROTOCOL_$_MyProtocol01
+	.weak_definition	__OBJC_PROTOCOL_$_MyProtocol01
+	.p2align	3, 0x0
+__OBJC_PROTOCOL_$_MyProtocol01:
+	.quad	0
+	.quad	l_OBJC_CLASS_NAME_.1
+	.quad	0
+	.quad	__OBJC_$_PROTOCOL_INSTANCE_METHODS_MyProtocol01
+	.quad	0
+	.quad	0
+	.quad	0
+	.quad	__OBJC_$_PROP_LIST_MyProtocol01
+	.long	96                              ; 0x60
+	.long	0                               ; 0x0
+	.quad	__OBJC_$_PROTOCOL_METHOD_TYPES_MyProtocol01
+	.quad	0
+	.quad	0
+	.private_extern	__OBJC_LABEL_PROTOCOL_$_MyProtocol01 ; @"_OBJC_LABEL_PROTOCOL_$_MyProtocol01"
+	.section	__DATA,__objc_protolist,coalesced,no_dead_strip
+	.globl	__OBJC_LABEL_PROTOCOL_$_MyProtocol01
+	.weak_definition	__OBJC_LABEL_PROTOCOL_$_MyProtocol01
+	.p2align	3, 0x0
+__OBJC_LABEL_PROTOCOL_$_MyProtocol01:
+	.quad	__OBJC_PROTOCOL_$_MyProtocol01
+	.section	__DATA,__objc_const
+	.p2align	3, 0x0                          ; @"_OBJC_CLASS_PROTOCOLS_$_MyBaseClass"
+__OBJC_CLASS_PROTOCOLS_$_MyBaseClass:
+	.quad	1                               ; 0x1
+	.quad	__OBJC_PROTOCOL_$_MyProtocol01
+	.quad	0
+	.p2align	3, 0x0                          ; @"_OBJC_METACLASS_RO_$_MyBaseClass"
+__OBJC_METACLASS_RO_$_MyBaseClass:
+	.long	3                               ; 0x3
+	.long	40                              ; 0x28
+	.long	40                              ; 0x28
+	.space	4
+	.quad	0
+	.quad	l_OBJC_CLASS_NAME_
+	.quad	__OBJC_$_CLASS_METHODS_MyBaseClass
+	.quad	__OBJC_CLASS_PROTOCOLS_$_MyBaseClass
+	.quad	0
+	.quad	0
+	.quad	0
+	.section	__TEXT,__objc_methname,cstring_literals
+l_OBJC_METH_VAR_NAME_.8:                ; @OBJC_METH_VAR_NAME_.8
+	.asciz	"baseInstanceMethod"
+	.section	__DATA,__objc_const
+	.p2align	3, 0x0                          ; @"_OBJC_$_INSTANCE_METHODS_MyBaseClass"
+__OBJC_$_INSTANCE_METHODS_MyBaseClass:
+	.long	24                              ; 0x18
+	.long	4                               ; 0x4
+	.quad	l_OBJC_METH_VAR_NAME_.8
+	.quad	l_OBJC_METH_VAR_TYPE_
+	.quad	"-[MyBaseClass baseInstanceMethod]"
+	.quad	l_OBJC_METH_VAR_NAME_.2
+	.quad	l_OBJC_METH_VAR_TYPE_
+	.quad	"-[MyBaseClass myProtocol01Method]"
+	.quad	l_OBJC_METH_VAR_NAME_.3
+	.quad	l_OBJC_METH_VAR_TYPE_.4
+	.quad	"-[MyBaseClass MyProtocol01Prop]"
+	.quad	l_OBJC_METH_VAR_NAME_.5
+	.quad	l_OBJC_METH_VAR_TYPE_.6
+	.quad	"-[MyBaseClass setMyProtocol01Prop:]"
+	.section	__TEXT,__objc_methtype,cstring_literals
+l_OBJC_METH_VAR_TYPE_.9:                ; @OBJC_METH_VAR_TYPE_.9
+	.asciz	"i"
+	.section	__DATA,__objc_const
+	.p2align	3, 0x0                          ; @"_OBJC_$_INSTANCE_VARIABLES_MyBaseClass"
+__OBJC_$_INSTANCE_VARIABLES_MyBaseClass:
+	.long	32                              ; 0x20
+	.long	1                               ; 0x1
+	.quad	_OBJC_IVAR_$_MyBaseClass.MyProtocol01Prop
+	.quad	l_OBJC_METH_VAR_NAME_.3
+	.quad	l_OBJC_METH_VAR_TYPE_.9
+	.long	2                               ; 0x2
+	.long	4                               ; 0x4
+	.section	__TEXT,__objc_methname,cstring_literals
+l_OBJC_PROP_NAME_ATTR_.10:              ; @OBJC_PROP_NAME_ATTR_.10
+	.asciz	"Ti,N,VMyProtocol01Prop"
+	.section	__DATA,__objc_const
+	.p2align	3, 0x0                          ; @"_OBJC_$_PROP_LIST_MyBaseClass"
+__OBJC_$_PROP_LIST_MyBaseClass:
+	.long	16                              ; 0x10
+	.long	1                               ; 0x1
+	.quad	l_OBJC_PROP_NAME_ATTR_
+	.quad	l_OBJC_PROP_NAME_ATTR_.10
+	.p2align	3, 0x0                          ; @"_OBJC_CLASS_RO_$_MyBaseClass"
+__OBJC_CLASS_RO_$_MyBaseClass:
+	.long	2                               ; 0x2
+	.long	0                               ; 0x0
+	.long	4                               ; 0x4
+	.space	4
+	.quad	0
+	.quad	l_OBJC_CLASS_NAME_
+	.quad	__OBJC_$_INSTANCE_METHODS_MyBaseClass
+	.quad	__OBJC_CLASS_PROTOCOLS_$_MyBaseClass
+	.quad	__OBJC_$_INSTANCE_VARIABLES_MyBaseClass
+	.quad	0
+	.quad	__OBJC_$_PROP_LIST_MyBaseClass
+	.globl	__objc_empty_cache              ; @_objc_empty_cache
+.zerofill __DATA,__common,__objc_empty_cache,8,3
+	.section	__DATA,__objc_classlist,regular,no_dead_strip
+	.p2align	3, 0x0                          ; @"OBJC_LABEL_CLASS_$"
+l_OBJC_LABEL_CLASS_$:
+	.quad	_OBJC_CLASS_$_MyBaseClass
+	.no_dead_strip	__OBJC_LABEL_PROTOCOL_$_MyProtocol01
+	.no_dead_strip	__OBJC_PROTOCOL_$_MyProtocol01
+	.section	__DATA,__objc_imageinfo,regular,no_dead_strip
+L_OBJC_IMAGE_INFO:
+	.long	0
+	.long	96
+.subsections_via_symbols
+
+
+#--- a64_file2.s
+
+## @protocol MyProtocol01
+## - (void)myProtocol01Method;
+## @end
+##
+## @protocol MyProtocol02
+## - (void)myProtocol02Method;
+## @property(readonly) int MyProtocol02Prop;
+## @end
+##
+## @protocol MyProtocol03
+## - (void)myProtocol03Method;
+## @property(readonly) int MyProtocol03Prop;
+## @end
+##
+##
+## __attribute__((objc_root_class))
+## @interface MyBaseClass<MyProtocol01>
+## - (void)baseInstanceMethod;
+## - (void)myProtocol01Method;
+## + (void)baseClassMethod;
+## @end
+##
+##
+##
+## @interface MyBaseClass(Category02)<MyProtocol02>
+## - (void)class02InstanceMethod;
+## - (void)myProtocol02Method;
+## + (void)class02ClassMethod;
+## + (int)MyProtocol02Prop;
+## @end
+##
+## @implementation MyBaseClass(Category02)
+## - (void)class02InstanceMethod {}
+## - (void)myProtocol02Method {}
+## + (void)class02ClassMethod {}
+## + (int)MyProtocol02Prop { return 0;}
+## @dynamic MyProtocol02Prop;
+## @end
+##
+## @interface MyBaseClass(Category03)<MyProtocol03>
+## - (void)class03InstanceMethod;
+## - (void)myProtocol03Method;
+## + (void)class03ClassMethod;
+## + (int)MyProtocol03Prop;
+## @end
+##
+## @implementation MyBaseClass(Category03)
+## - (void)class03InstanceMethod {}
+## - (void)myProtocol03Method {}
+## + (void)class03ClassMethod {}
+## + (int)MyProtocol03Prop { return 0;}
+## @dynamic MyProtocol03Prop;
+## @end
+##
+## int main() {
+##     return 0;
+## }
+
+
+	.section	__TEXT,__text,regular,pure_instructions
+	.p2align	2                               ; -- Begin function -[MyBaseClass(Category02) class02InstanceMethod]
+"-[MyBaseClass(Category02) class02InstanceMethod]": ; @"\01-[MyBaseClass(Category02) class02InstanceMethod]"
+	.cfi_startproc
+; %bb.0:                                ; %entry
+	ret
+	.cfi_endproc
+                                        ; -- End function
+	.p2align	2                               ; -- Begin function -[MyBaseClass(Category02) myProtocol02Method]
+"-[MyBaseClass(Category02) myProtocol02Method]": ; @"\01-[MyBaseClass(Category02) myProtocol02Method]"
+	.cfi_startproc
+; %bb.0:                                ; %entry
+	ret
+	.cfi_endproc
+                                        ; -- End function
+	.p2align	2                               ; -- Begin function +[MyBaseClass(Category02) class02ClassMethod]
+"+[MyBaseClass(Category02) class02ClassMethod]": ; @"\01+[MyBaseClass(Category02) class02ClassMethod]"
+	.cfi_startproc
+; %bb.0:                                ; %entry
+	ret
+	.cfi_endproc
+                                        ; -- End function
+	.p2align	2                               ; -- Begin function +[MyBaseClass(Category02) MyProtocol02Prop]
+"+[MyBaseClass(Category02) MyProtocol02Prop]": ; @"\01+[MyBaseClass(Category02) MyProtocol02Prop]"
+	.cfi_startproc
+; %bb.0:                                ; %entry
+	b	_OUTLINED_FUNCTION_0
+	.cfi_endproc
+                                        ; -- End function
+	.p2align	2                               ; -- Begin function -[MyBaseClass(Category03) class03InstanceMethod]
+"-[MyBaseClass(Category03) class03InstanceMethod]": ; @"\01-[MyBaseClass(Category03) class03InstanceMethod]"
+	.cfi_startproc
+; %bb.0:                                ; %entry
+	ret
+	.cfi_endproc
+                                        ; -- End function
+	.p2align	2                               ; -- Begin function -[MyBaseClass(Category03) myProtocol03Method]
+"-[MyBaseClass(Category03) myProtocol03Method]": ; @"\01-[MyBaseClass(Category03) myProtocol03Method]"
+	.cfi_startproc
+; %bb.0:                                ; %entry
+	ret
+	.cfi_endproc
+                                        ; -- End function
+	.p2align	2                               ; -- Begin function +[MyBaseClass(Category03) class03ClassMethod]
+"+[MyBaseClass(Category03) class03ClassMethod]": ; @"\01+[MyBaseClass(Category03) class03ClassMethod]"
+	.cfi_startproc
+; %bb.0:                                ; %entry
+	ret
+	.cfi_endproc
+                                        ; -- End function
+	.p2align	2                               ; -- Begin function +[MyBaseClass(Category03) MyProtocol03Prop]
+"+[MyBaseClass(Category03) MyProtocol03Prop]": ; @"\01+[MyBaseClass(Category03) MyProtocol03Prop]"
+	.cfi_startproc
+; %bb.0:                                ; %entry
+	b	_OUTLINED_FUNCTION_0
+	.cfi_endproc
+                                        ; -- End function
+	.globl	_main                           ; -- Begin function main
+	.p2align	2
+_main:                                  ; @main
+	.cfi_startproc
+; %bb.0:                                ; %entry
+	b	_OUTLINED_FUNCTION_0
+	.cfi_endproc
+                                        ; -- End function
+	.p2align	2                               ; -- Begin function OUTLINED_FUNCTION_0
+_OUTLINED_FUNCTION_0:                   ; @OUTLINED_FUNCTION_0 Tail Call
+	.cfi_startproc
+; %bb.0:
+	mov	w0, #0
+	ret
+	.cfi_endproc
+                                        ; -- End function
+	.section	__TEXT,__objc_classname,cstring_literals
+l_OBJC_CLASS_NAME_:                     ; @OBJC_CLASS_NAME_
+	.asciz	"Category02"
+	.section	__TEXT,__objc_methname,cstring_literals
+l_OBJC_METH_VAR_NAME_:                  ; @OBJC_METH_VAR_NAME_
+	.asciz	"class02InstanceMethod"
+	.section	__TEXT,__objc_methtype,cstring_literals
+l_OBJC_METH_VAR_TYPE_:                  ; @OBJC_METH_VAR_TYPE_
+	.asciz	"v16@0:8"
+	.section	__TEXT,__objc_methname,cstring_literals
+l_OBJC_METH_VAR_NAME_.1:                ; @OBJC_METH_VAR_NAME_.1
+	.asciz	"myProtocol02Method"
+	.section	__DATA,__objc_const
+	.p2align	3, 0x0                          ; @"_OBJC_$_CATEGORY_INSTANCE_METHODS_MyBaseClass_$_Category02"
+__OBJC_$_CATEGORY_INSTANCE_METHODS_MyBaseClass_$_Category02:
+	.long	24                              ; 0x18
+	.long	2                               ; 0x2
+	.quad	l_OBJC_METH_VAR_NAME_
+	.quad	l_OBJC_METH_VAR_TYPE_
+	.quad	"-[MyBaseClass(Category02) class02InstanceMethod]"
+	.quad	l_OBJC_METH_VAR_NAME_.1
+	.quad	l_OBJC_METH_VAR_TYPE_
+	.quad	"-[MyBaseClass(Category02) myProtocol02Method]"
+	.section	__TEXT,__objc_methname,cstring_literals
+l_OBJC_METH_VAR_NAME_.2:                ; @OBJC_METH_VAR_NAME_.2
+	.asciz	"class02ClassMethod"
+l_OBJC_METH_VAR_NAME_.3:                ; @OBJC_METH_VAR_NAME_.3
+	.asciz	"MyProtocol02Prop"
+	.section	__TEXT,__objc_methtype,cstring_literals
+l_OBJC_METH_VAR_TYPE_.4:                ; @OBJC_METH_VAR_TYPE_.4
+	.asciz	"i16@0:8"
+	.section	__DATA,__objc_const
+	.p2align	3, 0x0                          ; @"_OBJC_$_CATEGORY_CLASS_METHODS_MyBaseClass_$_Category02"
+__OBJC_$_CATEGORY_CLASS_METHODS_MyBaseClass_$_Category02:
+	.long	24                              ; 0x18
+	.long	2                               ; 0x2
+	.quad	l_OBJC_METH_VAR_NAME_.2
+	.quad	l_OBJC_METH_VAR_TYPE_
+	.quad	"+[MyBaseClass(Category02) class02ClassMethod]"
+	.quad	l_OBJC_METH_VAR_NAME_.3
+	.quad	l_OBJC_METH_VAR_TYPE_.4
+	.quad	"+[MyBaseClass(Category02) MyProtocol02Prop]"
+	.section	__TEXT,__objc_classname,cstring_literals
+l_OBJC_CLASS_NAME_.5:                   ; @OBJC_CLASS_NAME_.5
+	.asciz	"MyProtocol02"
+	.section	__DATA,__objc_const
+	.p2align	3, 0x0                          ; @"_OBJC_$_PROTOCOL_INSTANCE_METHODS_MyProtocol02"
+__OBJC_$_PROTOCOL_INSTANCE_METHODS_MyProtocol02:
+	.long	24                              ; 0x18
+	.long	2                               ; 0x2
+	.quad	l_OBJC_METH_VAR_NAME_.1
+	.quad	l_OBJC_METH_VAR_TYPE_
+	.quad	0
+	.quad	l_OBJC_METH_VAR_NAME_.3
+	.quad	l_OBJC_METH_VAR_TYPE_.4
+	.quad	0
+	.section	__TEXT,__objc_methname,cstring_literals
+l_OBJC_PROP_NAME_ATTR_:                 ; @OBJC_PROP_NAME_ATTR_
+	.asciz	"MyProtocol02Prop"
+l_OBJC_PROP_NAME_ATTR_.6:               ; @OBJC_PROP_NAME_ATTR_.6
+	.asciz	"Ti,R"
+	.section	__DATA,__objc_const
+	.p2align	3, 0x0                          ; @"_OBJC_$_PROP_LIST_MyProtocol02"
+__OBJC_$_PROP_LIST_MyProtocol02:
+	.long	16                              ; 0x10
+	.long	1                               ; 0x1
+	.quad	l_OBJC_PROP_NAME_ATTR_
+	.quad	l_OBJC_PROP_NAME_ATTR_.6
+	.p2align	3, 0x0                          ; @"_OBJC_$_PROTOCOL_METHOD_TYPES_MyProtocol02"
+__OBJC_$_PROTOCOL_METHOD_TYPES_MyProtocol02:
+	.quad	l_OBJC_METH_VAR_TYPE_
+	.quad	l_OBJC_METH_VAR_TYPE_.4
+	.private_extern	__OBJC_PROTOCOL_$_MyProtocol02 ; @"_OBJC_PROTOCOL_$_MyProtocol02"
+	.section	__DATA,__data
+	.globl	__OBJC_PROTOCOL_$_MyProtocol02
+	.weak_definition	__OBJC_PROTOCOL_$_MyProtocol02
+	.p2align	3, 0x0
+__OBJC_PROTOCOL_$_MyProtocol02:
+	.quad	0
+	.quad	l_OBJC_CLASS_NAME_.5
+	.quad	0
+	.quad	__OBJC_$_PROTOCOL_INSTANCE_METHODS_MyProtocol02
+	.quad	0
+	.quad	0
+	.quad	0
+	.quad	__OBJC_$_PROP_LIST_MyProtocol02
+	.long	96                              ; 0x60
+	.long	0                               ; 0x0
+	.quad	__OBJC_$_PROTOCOL_METHOD_TYPES_MyProtocol02
+	.quad	0
+	.quad	0
+	.private_extern	__OBJC_LABEL_PROTOCOL_$_MyProtocol02 ; @"_OBJC_LABEL_PROTOCOL_$_MyProtocol02"
+	.section	__DATA,__objc_protolist,coalesced,no_dead_strip
+	.globl	__OBJC_LABEL_PROTOCOL_$_MyProtocol02
+	.weak_definition	__OBJC_LABEL_PROTOCOL_$_MyProtocol02
+	.p2align	3, 0x0
+__OBJC_LABEL_PROTOCOL_$_MyProtocol02:
+	.quad	__OBJC_PROTOCOL_$_MyProtocol02
+	.section	__DATA,__objc_const
+	.p2align	3, 0x0                          ; @"_OBJC_CATEGORY_PROTOCOLS_$_MyBaseClass_$_Category02"
+__OBJC_CATEGORY_PROTOCOLS_$_MyBaseClass_$_Category02:
+	.quad	1                               ; 0x1
+	.quad	__OBJC_PROTOCOL_$_MyProtocol02
+	.quad	0
+	.section	__TEXT,__objc_methname,cstring_literals
+l_OBJC_PROP_NAME_ATTR_.7:               ; @OBJC_PROP_NAME_ATTR_.7
+	.asciz	"Ti,R,D"
+	.section	__DATA,__objc_const
+	.p2align	3, 0x0                          ; @"_OBJC_$_PROP_LIST_MyBaseClass_$_Category02"
+__OBJC_$_PROP_LIST_MyBaseClass_$_Category02:
+	.long	16                              ; 0x10
+	.long	1                               ; 0x1
+	.quad	l_OBJC_PROP_NAME_ATTR_
+	.quad	l_OBJC_PROP_NAME_ATTR_.7
+	.p2align	3, 0x0                          ; @"_OBJC_$_CATEGORY_MyBaseClass_$_Category02"
+__OBJC_$_CATEGORY_MyBaseClass_$_Category02:
+	.quad	l_OBJC_CLASS_NAME_
+	.quad	_OBJC_CLASS_$_MyBaseClass
+	.quad	__OBJC_$_CATEGORY_INSTANCE_METHODS_MyBaseClass_$_Category02
+	.quad	__OBJC_$_CATEGORY_CLASS_METHODS_MyBaseClass_$_Category02
+	.quad	__OBJC_CATEGORY_PROTOCOLS_$_MyBaseClass_$_Category02
+	.quad	__OBJC_$_PROP_LIST_MyBaseClass_$_Category02
+	.quad	0
+	.long	64                              ; 0x40
+	.space	4
+	.section	__TEXT,__objc_classname,cstring_literals
+l_OBJC_CLASS_NAME_.8:                   ; @OBJC_CLASS_NAME_.8
+	.asciz	"Category03"
+	.section	__TEXT,__objc_methname,cstring_literals
+l_OBJC_METH_VAR_NAME_.9:                ; @OBJC_METH_VAR_NAME_.9
+	.asciz	"class03InstanceMethod"
+l_OBJC_METH_VAR_NAME_.10:               ; @OBJC_METH_VAR_NAME_.10
+	.asciz	"myProtocol03Method"
+	.section	__DATA,__objc_const
+	.p2align	3, 0x0                          ; @"_OBJC_$_CATEGORY_INSTANCE_METHODS_MyBaseClass_$_Category03"
+__OBJC_$_CATEGORY_INSTANCE_METHODS_MyBaseClass_$_Category03:
+	.long	24                              ; 0x18
+	.long	2                               ; 0x2
+	.quad	l_OBJC_METH_VAR_NAME_.9
+	.quad	l_OBJC_METH_VAR_TYPE_
+	.quad	"-[MyBaseClass(Category03) class03InstanceMethod]"
+	.quad	l_OBJC_METH_VAR_NAME_.10
+	.quad	l_OBJC_METH_VAR_TYPE_
+	.quad	"-[MyBaseClass(Category03) myProtocol03Method]"
+	.section	__TEXT,__objc_methname,cstring_literals
+l_OBJC_METH_VAR_NAME_.11:               ; @OBJC_METH_VAR_NAME_.11
+	.asciz	"class03ClassMethod"
+l_OBJC_METH_VAR_NAME_.12:               ; @OBJC_METH_VAR_NAME_.12
+	.asciz	"MyProtocol03Prop"
+	.section	__DATA,__objc_const
+	.p2align	3, 0x0                          ; @"_OBJC_$_CATEGORY_CLASS_METHODS_MyBaseClass_$_Category03"
+__OBJC_$_CATEGORY_CLASS_METHODS_MyBaseClass_$_Category03:
+	.long	24                              ; 0x18
+	.long	2                               ; 0x2
+	.quad	l_OBJC_METH_VAR_NAME_.11
+	.quad	l_OBJC_METH_VAR_TYPE_
+	.quad	"+[MyBaseClass(Category03) class03ClassMethod]"
+	.quad	l_OBJC_METH_VAR_NAME_.12
+	.quad	l_OBJC_METH_VAR_TYPE_.4
+	.quad	"+[MyBaseClass(Category03) MyProtocol03Prop]"
+	.section	__TEXT,__objc_classname,cstring_literals
+l_OBJC_CLASS_NAME_.13:                  ; @OBJC_CLASS_NAME_.13
+	.asciz	"MyProtocol03"
+	.section	__DATA,__objc_const
+	.p2align	3, 0x0                          ; @"_OBJC_$_PROTOCOL_INSTANCE_METHODS_MyProtocol03"
+__OBJC_$_PROTOCOL_INSTANCE_METHODS_MyProtocol03:
+	.long	24                              ; 0x18
+	.long	2                               ; 0x2
+	.quad	l_OBJC_METH_VAR_NAME_.10
+	.quad	l_OBJC_METH_VAR_TYPE_
+	.quad	0
+	.quad	l_OBJC_METH_VAR_NAME_.12
+	.quad	l_OBJC_METH_VAR_TYPE_.4
+	.quad	0
+	.section	__TEXT,__objc_methname,cstring_literals
+l_OBJC_PROP_NAME_ATTR_.14:              ; @OBJC_PROP_NAME_ATTR_.14
+	.asciz	"MyProtocol03Prop"
+	.section	__DATA,__objc_const
+	.p2align	3, 0x0                          ; @"_OBJC_$_PROP_LIST_MyProtocol03"
+__OBJC_$_PROP_LIST_MyProtocol03:
+	.long	16                              ; 0x10
+	.long	1                               ; 0x1
+	.quad	l_OBJC_PROP_NAME_ATTR_.14
+	.quad	l_OBJC_PROP_NAME_ATTR_.6
+	.p2align	3, 0x0                          ; @"_OBJC_$_PROTOCOL_METHOD_TYPES_MyProtocol03"
+__OBJC_$_PROTOCOL_METHOD_TYPES_MyProtocol03:
+	.quad	l_OBJC_METH_VAR_TYPE_
+	.quad	l_OBJC_METH_VAR_TYPE_.4
+	.private_extern	__OBJC_PROTOCOL_$_MyProtocol03 ; @"_OBJC_PROTOCOL_$_MyProtocol03"
+	.section	__DATA,__data
+	.globl	__OBJC_PROTOCOL_$_MyProtocol03
+	.weak_definition	__OBJC_PROTOCOL_$_MyProtocol03
+	.p2align	3, 0x0
+__OBJC_PROTOCOL_$_MyProtocol03:
+	.quad	0
+	.quad	l_OBJC_CLASS_NAME_.13
+	.quad	0
+	.quad	__OBJC_$_PROTOCOL_INSTANCE_METHODS_MyProtocol03
+	.quad	0
+	.quad	0
+	.quad	0
+	.quad	__OBJC_$_PROP_LIST_MyProtocol03
+	.long	96                              ; 0x60
+	.long	0                               ; 0x0
+	.quad	__OBJC_$_PROTOCOL_METHOD_TYPES_MyProtocol03
+	.quad	0
+	.quad	0
+	.private_extern	__OBJC_LABEL_PROTOCOL_$_MyProtocol03 ; @"_OBJC_LABEL_PROTOCOL_$_MyProtocol03"
+	.section	__DATA,__objc_protolist,coalesced,no_dead_strip
+	.globl	__OBJC_LABEL_PROTOCOL_$_MyProtocol03
+	.weak_definition	__OBJC_LABEL_PROTOCOL_$_MyProtocol03
+	.p2align	3, 0x0
+__OBJC_LABEL_PROTOCOL_$_MyProtocol03:
+	.quad	__OBJC_PROTOCOL_$_MyProtocol03
+	.section	__DATA,__objc_const
+	.p2align	3, 0x0                          ; @"_OBJC_CATEGORY_PROTOCOLS_$_MyBaseClass_$_Category03"
+__OBJC_CATEGORY_PROTOCOLS_$_MyBaseClass_$_Category03:
+	.quad	1                               ; 0x1
+	.quad	__OBJC_PROTOCOL_$_MyProtocol03
+	.quad	0
+	.p2align	3, 0x0                          ; @"_OBJC_$_PROP_LIST_MyBaseClass_$_Category03"
+__OBJC_$_PROP_LIST_MyBaseClass_$_Category03:
+	.long	16                              ; 0x10
+	.long	1                               ; 0x1
+	.quad	l_OBJC_PROP_NAME_ATTR_.14
+	.quad	l_OBJC_PROP_NAME_ATTR_.7
+	.p2align	3, 0x0                          ; @"_OBJC_$_CATEGORY_MyBaseClass_$_Category03"
+__OBJC_$_CATEGORY_MyBaseClass_$_Category03:
+	.quad	l_OBJC_CLASS_NAME_.8
+	.quad	_OBJC_CLASS_$_MyBaseClass
+	.quad	__OBJC_$_CATEGORY_INSTANCE_METHODS_MyBaseClass_$_Category03
+	.quad	__OBJC_$_CATEGORY_CLASS_METHODS_MyBaseClass_$_Category03
+	.quad	__OBJC_CATEGORY_PROTOCOLS_$_MyBaseClass_$_Category03
+	.quad	__OBJC_$_PROP_LIST_MyBaseClass_$_Category03
+	.quad	0
+	.long	64                              ; 0x40
+	.space	4
+	.section	__DATA,__objc_catlist,regular,no_dead_strip
+	.p2align	3, 0x0                          ; @"OBJC_LABEL_CATEGORY_$"
+l_OBJC_LABEL_CATEGORY_$:
+	.quad	__OBJC_$_CATEGORY_MyBaseClass_$_Category02
+	.quad	__OBJC_$_CATEGORY_MyBaseClass_$_Category03
+	.no_dead_strip	__OBJC_LABEL_PROTOCOL_$_MyProtocol02
+	.no_dead_strip	__OBJC_LABEL_PROTOCOL_$_MyProtocol03
+	.no_dead_strip	__OBJC_PROTOCOL_$_MyProtocol02
+	.no_dead_strip	__OBJC_PROTOCOL_$_MyProtocol03
+	.section	__DATA,__objc_imageinfo,regular,no_dead_strip
+L_OBJC_IMAGE_INFO:
+	.long	0
+	.long	96
+.subsections_via_symbols
diff --git a/lld/test/MachO/objc-category-merging-extern-class-minimal.s b/lld/test/MachO/objc-category-merging-extern-class-minimal.s
new file mode 100644
index 00000000000000..ede7ef5d9c32d4
--- /dev/null
+++ b/lld/test/MachO/objc-category-merging-extern-class-minimal.s
@@ -0,0 +1,155 @@
+# REQUIRES: aarch64
+# RUN: rm -rf %t; split-file %s %t && cd %t
+
+## Create a dylib with a fake base class to link against
+# RUN: llvm-mc -filetype=obj -triple=arm64-apple-macos -o a64_fakedylib.o a64_fakedylib.s
+# RUN: %lld -arch arm64 a64_fakedylib.o -o a64_fakedylib.dylib -dylib
+
+## Create our main testing dylib - linking against the fake dylib above
+# RUN: llvm-mc -filetype=obj -triple=arm64-apple-macos -o merge_cat_minimal.o merge_cat_minimal.s
+# RUN: %lld -arch arm64 -dylib -o merge_cat_minimal_no_merge.dylib a64_fakedylib.dylib merge_cat_minimal.o
+# RUN: %lld -arch arm64 -dylib -o merge_cat_minimal_merge.dylib -objc_category_merging a64_fakedylib.dylib merge_cat_minimal.o
+
+## Now verify that the flag caused category merging to happen appropriatelly
+# RUN: llvm-objdump --objc-meta-data --macho merge_cat_minimal_no_merge.dylib | FileCheck %s --check-prefixes=NO_MERGE_CATS
+# RUN: llvm-objdump --objc-meta-data --macho merge_cat_minimal_merge.dylib | FileCheck %s --check-prefixes=MERGE_CATS
+
+#### Check merge categories enabled ###
+# Check that the original categories are not there
+MERGE_CATS-NOT: __OBJC_$_CATEGORY_MyBaseClass_$_Category01
+MERGE_CATS-NOT: __OBJC_$_CATEGORY_MyBaseClass_$_Category02
+
+# Check that the merged cateogry is there, in the correct format
+MERGE_CATS: __OBJC_$_CATEGORY_MyBaseClass_$_(Category01|Category02)
+MERGE_CATS-NEXT:   name {{.*}} Category01|Category02
+MERGE_CATS:       instanceMethods
+MERGE_CATS-NEXT:  24
+MERGE_CATS-NEXT:  2
+MERGE_CATS-NEXT:   name {{.*}} cat01_InstanceMethod
+MERGE_CATS-NEXT:  types {{.*}} v16@0:8
+MERGE_CATS-NEXT:    imp -[MyBaseClass(Category01) cat01_InstanceMethod]
+MERGE_CATS-NEXT:   name {{.*}} cat02_InstanceMethod
+MERGE_CATS-NEXT:  types {{.*}} v16@0:8
+MERGE_CATS-NEXT:    imp -[MyBaseClass(Category02) cat02_InstanceMethod]
+MERGE_CATS-NEXT:         classMethods 0x0
+MERGE_CATS-NEXT:            protocols 0x0
+MERGE_CATS-NEXT:   instanceProperties 0x0
+
+#### Check merge categories disabled ###
+# Check that the merged category is not there
+NO_MERGE_CATS-NOT: __OBJC_$_CATEGORY_MyBaseClass_$_(Category01|Category02)
+
+# Check that the original categories are there
+NO_MERGE_CATS: __OBJC_$_CATEGORY_MyBaseClass_$_Category01
+NO_MERGE_CATS: __OBJC_$_CATEGORY_MyBaseClass_$_Category02
+
+
+
+#--- a64_fakedylib.s
+
+    .section    __DATA,__objc_data
+    .globl    _OBJC_CLASS_$_MyBaseClass
+_OBJC_CLASS_$_MyBaseClass:
+    .quad    0
+
+#--- merge_cat_minimal.s
+
+;  ================== Generated from ObjC: ==================
+; __attribute__((objc_root_class))
+; @interface MyBaseClass
+; - (void)baseInstanceMethod;
+; @end
+;
+; @interface MyBaseClass(Category01)
+; - (void)cat01_InstanceMethod;
+; @end
+;
+; @implementation MyBaseClass(Category01)
+; - (void)cat01_InstanceMethod {}
+; @end
+;
+; @interface MyBaseClass(Category02)
+; - (void)cat02_InstanceMethod;
+; @end
+;
+; @implementation MyBaseClass(Category02)
+; - (void)cat02_InstanceMethod {}
+; @end
+;  ================== Generated from ObjC: ==================
+
+	.section	__TEXT,__text,regular,pure_instructions
+	.p2align	2                               ; -- Begin function -[MyBaseClass(Category01) cat01_InstanceMethod]
+"-[MyBaseClass(Category01) cat01_InstanceMethod]": ; @"\01-[MyBaseClass(Category01) cat01_InstanceMethod]"
+	.cfi_startproc
+	ret
+	.cfi_endproc
+                                        ; -- End function
+	.p2align	2                               ; -- Begin function -[MyBaseClass(Category02) cat02_InstanceMethod]
+"-[MyBaseClass(Category02) cat02_InstanceMethod]": ; @"\01-[MyBaseClass(Category02) cat02_InstanceMethod]"
+	.cfi_startproc
+	ret
+	.cfi_endproc
+                                        ; -- End function
+	.section	__TEXT,__objc_classname,cstring_literals
+l_OBJC_CLASS_NAME_:                     ; @OBJC_CLASS_NAME_
+	.asciz	"Category01"
+	.section	__TEXT,__objc_methname,cstring_literals
+l_OBJC_METH_VAR_NAME_:                  ; @OBJC_METH_VAR_NAME_
+	.asciz	"cat01_InstanceMethod"
+	.section	__TEXT,__objc_methtype,cstring_literals
+l_OBJC_METH_VAR_TYPE_:                  ; @OBJC_METH_VAR_TYPE_
+	.asciz	"v16@0:8"
+	.section	__DATA,__objc_const
+	.p2align	3, 0x0                          ; @"_OBJC_$_CATEGORY_INSTANCE_METHODS_MyBaseClass_$_Category01"
+__OBJC_$_CATEGORY_INSTANCE_METHODS_MyBaseClass_$_Category01:
+	.long	24                              ; 0x18
+	.long	1                               ; 0x1
+	.quad	l_OBJC_METH_VAR_NAME_
+	.quad	l_OBJC_METH_VAR_TYPE_
+	.quad	"-[MyBaseClass(Category01) cat01_InstanceMethod]"
+	.p2align	3, 0x0                          ; @"_OBJC_$_CATEGORY_MyBaseClass_$_Category01"
+__OBJC_$_CATEGORY_MyBaseClass_$_Category01:
+	.quad	l_OBJC_CLASS_NAME_
+	.quad	_OBJC_CLASS_$_MyBaseClass
+	.quad	__OBJC_$_CATEGORY_INSTANCE_METHODS_MyBaseClass_$_Category01
+	.quad	0
+	.quad	0
+	.quad	0
+	.quad	0
+	.long	64                              ; 0x40
+	.space	4
+	.section	__TEXT,__objc_classname,cstring_literals
+l_OBJC_CLASS_NAME_.1:                   ; @OBJC_CLASS_NAME_.1
+	.asciz	"Category02"
+	.section	__TEXT,__objc_methname,cstring_literals
+l_OBJC_METH_VAR_NAME_.2:                ; @OBJC_METH_VAR_NAME_.2
+	.asciz	"cat02_InstanceMethod"
+	.section	__DATA,__objc_const
+	.p2align	3, 0x0                          ; @"_OBJC_$_CATEGORY_INSTANCE_METHODS_MyBaseClass_$_Category02"
+__OBJC_$_CATEGORY_INSTANCE_METHODS_MyBaseClass_$_Category02:
+	.long	24                              ; 0x18
+	.long	1                               ; 0x1
+	.quad	l_OBJC_METH_VAR_NAME_.2
+	.quad	l_OBJC_METH_VAR_TYPE_
+	.quad	"-[MyBaseClass(Category02) cat02_InstanceMethod]"
+	.p2align	3, 0x0                          ; @"_OBJC_$_CATEGORY_MyBaseClass_$_Category02"
+__OBJC_$_CATEGORY_MyBaseClass_$_Category02:
+	.quad	l_OBJC_CLASS_NAME_.1
+	.quad	_OBJC_CLASS_$_MyBaseClass
+	.quad	__OBJC_$_CATEGORY_INSTANCE_METHODS_MyBaseClass_$_Category02
+	.quad	0
+	.quad	0
+	.quad	0
+	.quad	0
+	.long	64                              ; 0x40
+	.space	4
+	.section	__DATA,__objc_catlist,regular,no_dead_strip
+	.p2align	3, 0x0                          ; @"OBJC_LABEL_CATEGORY_$"
+l_OBJC_LABEL_CATEGORY_$:
+	.quad	__OBJC_$_CATEGORY_MyBaseClass_$_Category01
+	.quad	__OBJC_$_CATEGORY_MyBaseClass_$_Category02
+	.section	__DATA,__objc_imageinfo,regular,no_dead_strip
+L_OBJC_IMAGE_INFO:
+	.long	0
+	.long	96
+.subsections_via_symbols
diff --git a/lld/test/MachO/thinlto-split-unit-start-lib.ll b/lld/test/MachO/thinlto-split-unit-start-lib.ll
index 8c643b7c594e89..1c4a2f68a8b47f 100644
--- a/lld/test/MachO/thinlto-split-unit-start-lib.ll
+++ b/lld/test/MachO/thinlto-split-unit-start-lib.ll
@@ -57,14 +57,14 @@ target triple = "x86_64-apple-macosx11.0.0"
 ; Function Attrs: mustprogress nofree norecurse nosync nounwind ssp willreturn memory(argmem: write) uwtable
 define void @_ZN1SC2Ev(ptr nocapture noundef nonnull writeonly align 8 dereferenceable(8) %this) unnamed_addr align 2 {
 entry:
-  store ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTV1S, i64 0, inrange i32 0, i64 2), ptr %this, align 8
+  store ptr getelementptr inbounds inrange(-16, 8) ({ [3 x ptr] }, ptr @_ZTV1S, i64 0, i32 0, i64 2), ptr %this, align 8
   ret void
 }
 
 ; Function Attrs: mustprogress nofree norecurse nosync nounwind ssp willreturn memory(argmem: write) uwtable
 define void @_ZN1SC1Ev(ptr nocapture noundef nonnull writeonly align 8 dereferenceable(8) %this) unnamed_addr align 2 {
 entry:
-  store ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTV1S, i64 0, inrange i32 0, i64 2), ptr %this, align 8
+  store ptr getelementptr inbounds inrange(-16, 8) ({ [3 x ptr] }, ptr @_ZTV1S, i64 0, i32 0, i64 2), ptr %this, align 8
   ret void
 }
 
diff --git a/lldb/cmake/caches/Apple-lldb-Linux.cmake b/lldb/cmake/caches/Apple-lldb-Linux.cmake
index 13d3839f852f61..b2d3cf595fe18d 100644
--- a/lldb/cmake/caches/Apple-lldb-Linux.cmake
+++ b/lldb/cmake/caches/Apple-lldb-Linux.cmake
@@ -5,4 +5,5 @@ set(LLVM_DISTRIBUTION_COMPONENTS
   liblldb
   lldb-argdumper
   lldb-server
+  lldb-python-scripts
   CACHE STRING "")
diff --git a/lldb/include/lldb/API/SBTarget.h b/lldb/include/lldb/API/SBTarget.h
index f7bdd3093d2025..3644ac056da3dc 100644
--- a/lldb/include/lldb/API/SBTarget.h
+++ b/lldb/include/lldb/API/SBTarget.h
@@ -42,7 +42,8 @@ class LLDB_API SBTarget {
     eBroadcastBitModulesLoaded = (1 << 1),
     eBroadcastBitModulesUnloaded = (1 << 2),
     eBroadcastBitWatchpointChanged = (1 << 3),
-    eBroadcastBitSymbolsLoaded = (1 << 4)
+    eBroadcastBitSymbolsLoaded = (1 << 4),
+    eBroadcastBitSymbolsChanged = (1 << 5),
   };
 
   // Constructors
diff --git a/lldb/include/lldb/Host/Alarm.h b/lldb/include/lldb/Host/Alarm.h
new file mode 100644
index 00000000000000..23b1ff1af56890
--- /dev/null
+++ b/lldb/include/lldb/Host/Alarm.h
@@ -0,0 +1,115 @@
+//===-- Alarm.h -------------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLDB_HOST_ALARM_H
+#define LLDB_HOST_ALARM_H
+
+#include "lldb/Host/HostThread.h"
+#include "lldb/lldb-types.h"
+#include "llvm/Support/Chrono.h"
+
+#include <condition_variable>
+#include <mutex>
+
+namespace lldb_private {
+
+/// \class Alarm abstraction that enables scheduling a callback function after a
+/// specified timeout. Creating an alarm for a callback returns a Handle that
+/// can be used to restart or cancel the alarm.
+class Alarm {
+public:
+  using Handle = uint64_t;
+  using Callback = std::function<void()>;
+  using TimePoint = llvm::sys::TimePoint<>;
+  using Duration = std::chrono::milliseconds;
+
+  Alarm(Duration timeout, bool run_callback_on_exit = false);
+  ~Alarm();
+
+  /// Create an alarm for the given callback. The alarm will expire and the
+  /// callback will be called after the timeout.
+  ///
+  /// \returns
+  ///   Handle which can be used to restart or cancel the alarm.
+  Handle Create(Callback callback);
+
+  /// Restart the alarm for the given Handle. The alarm will expire and the
+  /// callback will be called after the timeout.
+  ///
+  /// \returns
+  ///   True if the alarm was successfully restarted. False if there is no alarm
+  ///   for the given Handle or the alarm already expired.
+  bool Restart(Handle handle);
+
+  /// Cancel the alarm for the given Handle. The alarm and its handle will be
+  /// removed.
+  ///
+  /// \returns
+  ///   True if the alarm was successfully canceled and the Handle removed.
+  ///   False if there is no alarm for the given Handle or the alarm already
+  ///   expired.
+  bool Cancel(Handle handle);
+
+  static constexpr Handle INVALID_HANDLE = 0;
+
+private:
+  /// Helper functions to start, stop and check the status of the alarm thread.
+  /// @{
+  void StartAlarmThread();
+  void StopAlarmThread();
+  bool AlarmThreadRunning();
+  /// @}
+
+  /// Return an unique, monotonically increasing handle.
+  static Handle GetNextUniqueHandle();
+
+  /// Helper to compute the next time the alarm thread needs to wake up.
+  TimePoint GetNextExpiration() const;
+
+  /// Alarm entry.
+  struct Entry {
+    Handle handle;
+    Callback callback;
+    TimePoint expiration;
+
+    Entry(Callback callback, TimePoint expiration);
+    bool operator==(const Entry &rhs) { return handle == rhs.handle; }
+  };
+
+  /// List of alarm entries.
+  std::vector<Entry> m_entries;
+
+  /// Timeout between when an alarm is created and when it fires.
+  Duration m_timeout;
+
+  /// The alarm thread.
+  /// @{
+  HostThread m_alarm_thread;
+  lldb::thread_result_t AlarmThread();
+  /// @}
+
+  /// Synchronize access between the alarm thread and the main thread.
+  std::mutex m_alarm_mutex;
+
+  /// Condition variable used to wake up the alarm thread.
+  std::condition_variable m_alarm_cv;
+
+  /// Flag to signal the alarm thread that something changed and we need to
+  /// recompute the next alarm.
+  bool m_recompute_next_alarm = false;
+
+  /// Flag to signal the alarm thread to exit.
+  bool m_exit = false;
+
+  /// Flag to signal we should run all callbacks on exit.
+  bool m_run_callbacks_on_exit = false;
+};
+
+} // namespace lldb_private
+
+#endif // LLDB_HOST_ALARM_H
diff --git a/lldb/include/lldb/Symbol/LineEntry.h b/lldb/include/lldb/Symbol/LineEntry.h
index c2daba916e3f98..31e1cd0b36f96e 100644
--- a/lldb/include/lldb/Symbol/LineEntry.h
+++ b/lldb/include/lldb/Symbol/LineEntry.h
@@ -130,31 +130,40 @@ struct LineEntry {
   ///     Shared pointer to the target this LineEntry belongs to.
   void ApplyFileMappings(lldb::TargetSP target_sp);
 
-  // Member variables.
-  AddressRange range; ///< The section offset address range for this line entry.
-  FileSpec file; ///< The source file, possibly mapped by the target.source-map
-                 ///setting
-  lldb::SupportFileSP
-      original_file_sp; ///< The original source file, from debug info.
-  uint32_t line = LLDB_INVALID_LINE_NUMBER; ///< The source line number, or zero
-                                            ///< if there is no line number
-                                            /// information.
-  uint16_t column =
-      0; ///< The column number of the source line, or zero if there
-         /// is no column information.
-  uint16_t is_start_of_statement : 1, ///< Indicates this entry is the beginning
-                                      ///of a statement.
-      is_start_of_basic_block : 1, ///< Indicates this entry is the beginning of
-                                   ///a basic block.
-      is_prologue_end : 1,   ///< Indicates this entry is one (of possibly many)
-                             ///where execution should be suspended for an entry
-                             ///breakpoint of a function.
-      is_epilogue_begin : 1, ///< Indicates this entry is one (of possibly many)
-                             ///where execution should be suspended for an exit
-                             ///breakpoint of a function.
-      is_terminal_entry : 1; ///< Indicates this entry is that of the first byte
-                             ///after the end of a sequence of target machine
-                             ///instructions.
+  /// The section offset address range for this line entry.
+  AddressRange range;
+
+  /// The source file, possibly mapped by the target.source-map setting.
+  FileSpec file;
+
+  /// The original source file, from debug info.
+  lldb::SupportFileSP original_file_sp;
+
+  /// The source line number, or LLDB_INVALID_LINE_NUMBER if there is no line
+  /// number information.
+  uint32_t line = LLDB_INVALID_LINE_NUMBER;
+
+  /// The column number of the source line, or zero if there is no column
+  /// information.
+  uint16_t column = 0;
+
+  /// Indicates this entry is the beginning of a statement.
+  uint16_t is_start_of_statement : 1;
+
+  /// Indicates this entry is the beginning of a basic block.
+  uint16_t is_start_of_basic_block : 1;
+
+  /// Indicates this entry is one (of possibly many) where execution should be
+  /// suspended for an entry breakpoint of a function.
+  uint16_t is_prologue_end : 1;
+
+  /// Indicates this entry is one (of possibly many) where execution should be
+  /// suspended for an exit breakpoint of a function.
+  uint16_t is_epilogue_begin : 1;
+
+  /// Indicates this entry is that of the first byte after the end of a sequence
+  /// of target machine instructions.
+  uint16_t is_terminal_entry : 1;
 };
 
 /// Less than operator.
diff --git a/lldb/include/lldb/Target/Language.h b/lldb/include/lldb/Target/Language.h
index 0cbd8a32dccd54..67714e6fdf942e 100644
--- a/lldb/include/lldb/Target/Language.h
+++ b/lldb/include/lldb/Target/Language.h
@@ -26,6 +26,15 @@
 
 namespace lldb_private {
 
+class LanguageProperties : public Properties {
+public:
+  LanguageProperties();
+
+  static llvm::StringRef GetSettingName();
+
+  bool GetEnableFilterForLineBreakpoints() const;
+};
+
 class Language : public PluginInterface {
 public:
   class TypeScavenger {
@@ -324,6 +333,8 @@ class Language : public PluginInterface {
   static LanguageSet GetLanguagesSupportingTypeSystemsForExpressions();
   static LanguageSet GetLanguagesSupportingREPLs();
 
+  static LanguageProperties &GetGlobalLanguageProperties();
+
   // Given a mangled function name, calculates some alternative manglings since
   // the compiler mangling may not line up with the symbol we are expecting.
   virtual std::vector<ConstString>
@@ -339,6 +350,15 @@ class Language : public PluginInterface {
 
   virtual llvm::StringRef GetInstanceVariableName() { return {}; }
 
+  /// Returns true if this SymbolContext should be ignored when setting
+  /// breakpoints by line (number or regex). Helpful for languages that create
+  /// artificial functions without meaningful user code associated with them
+  /// (e.g. code that gets expanded in late compilation stages, like by
+  /// CoroSplitter).
+  virtual bool IgnoreForLineBreakpoints(const SymbolContext &) const {
+    return false;
+  }
+
 protected:
   // Classes that inherit from Language can see and modify these
 
diff --git a/lldb/source/Breakpoint/BreakpointResolver.cpp b/lldb/source/Breakpoint/BreakpointResolver.cpp
index bc6348716ef418..1861a0fe7c4fe4 100644
--- a/lldb/source/Breakpoint/BreakpointResolver.cpp
+++ b/lldb/source/Breakpoint/BreakpointResolver.cpp
@@ -23,6 +23,7 @@
 #include "lldb/Symbol/CompileUnit.h"
 #include "lldb/Symbol/Function.h"
 #include "lldb/Symbol/SymbolContext.h"
+#include "lldb/Target/Language.h"
 #include "lldb/Target/Target.h"
 #include "lldb/Utility/LLDBLog.h"
 #include "lldb/Utility/Log.h"
@@ -203,8 +204,15 @@ void BreakpointResolver::SetSCMatchesByLine(
     SearchFilter &filter, SymbolContextList &sc_list, bool skip_prologue,
     llvm::StringRef log_ident, uint32_t line, std::optional<uint16_t> column) {
   llvm::SmallVector<SymbolContext, 16> all_scs;
-  for (uint32_t i = 0; i < sc_list.GetSize(); ++i)
-    all_scs.push_back(sc_list[i]);
+
+  for (const auto &sc : sc_list) {
+    if (Language::GetGlobalLanguageProperties()
+            .GetEnableFilterForLineBreakpoints())
+      if (Language *lang = Language::FindPlugin(sc.GetLanguage());
+          lang && lang->IgnoreForLineBreakpoints(sc))
+        continue;
+    all_scs.push_back(sc);
+  }
 
   while (all_scs.size()) {
     uint32_t closest_line = UINT32_MAX;
diff --git a/lldb/source/Commands/CommandObjectDWIMPrint.cpp b/lldb/source/Commands/CommandObjectDWIMPrint.cpp
index b183cb423111fb..e1255f37d9bc58 100644
--- a/lldb/source/Commands/CommandObjectDWIMPrint.cpp
+++ b/lldb/source/Commands/CommandObjectDWIMPrint.cpp
@@ -23,7 +23,6 @@
 #include "lldb/lldb-enumerations.h"
 #include "lldb/lldb-forward.h"
 #include "llvm/ADT/StringRef.h"
-#include "llvm/Support/FormatVariadic.h"
 
 #include <regex>
 
@@ -130,6 +129,19 @@ void CommandObjectDWIMPrint::DoExecute(StringRef command,
     }
   };
 
+  // Dump `valobj` according to whether `po` was requested or not.
+  auto dump_val_object = [&](ValueObject &valobj) {
+    if (is_po) {
+      StreamString temp_result_stream;
+      valobj.Dump(temp_result_stream, dump_options);
+      llvm::StringRef output = temp_result_stream.GetString();
+      maybe_add_hint(output);
+      result.GetOutputStream() << output;
+    } else {
+      valobj.Dump(result.GetOutputStream(), dump_options);
+    }
+  };
+
   // First, try `expr` as the name of a frame variable.
   if (frame) {
     auto valobj_sp = frame->FindVariable(ConstString(expr));
@@ -147,21 +159,23 @@ void CommandObjectDWIMPrint::DoExecute(StringRef command,
                                         flags, expr);
       }
 
-      if (is_po) {
-        StreamString temp_result_stream;
-        valobj_sp->Dump(temp_result_stream, dump_options);
-        llvm::StringRef output = temp_result_stream.GetString();
-        maybe_add_hint(output);
-        result.GetOutputStream() << output;
-      } else {
-        valobj_sp->Dump(result.GetOutputStream(), dump_options);
-      }
+      dump_val_object(*valobj_sp);
       result.SetStatus(eReturnStatusSuccessFinishResult);
       return;
     }
   }
 
-  // Second, also lastly, try `expr` as a source expression to evaluate.
+  // Second, try `expr` as a persistent variable.
+  if (expr.starts_with("$"))
+    if (auto *state = target.GetPersistentExpressionStateForLanguage(language))
+      if (auto var_sp = state->GetVariable(expr))
+        if (auto valobj_sp = var_sp->GetValueObject()) {
+          dump_val_object(*valobj_sp);
+          result.SetStatus(eReturnStatusSuccessFinishResult);
+          return;
+        }
+
+  // Third, and lastly, try `expr` as a source expression to evaluate.
   {
     auto *exe_scope = m_exe_ctx.GetBestExecutionContextScope();
     ValueObjectSP valobj_sp;
@@ -187,17 +201,8 @@ void CommandObjectDWIMPrint::DoExecute(StringRef command,
                                         expr);
       }
 
-      if (valobj_sp->GetError().GetError() != UserExpression::kNoResult) {
-        if (is_po) {
-          StreamString temp_result_stream;
-          valobj_sp->Dump(temp_result_stream, dump_options);
-          llvm::StringRef output = temp_result_stream.GetString();
-          maybe_add_hint(output);
-          result.GetOutputStream() << output;
-        } else {
-          valobj_sp->Dump(result.GetOutputStream(), dump_options);
-        }
-      }
+      if (valobj_sp->GetError().GetError() != UserExpression::kNoResult)
+        dump_val_object(*valobj_sp);
 
       if (suppress_result)
         if (auto result_var_sp =
diff --git a/lldb/source/Commands/CommandObjectTarget.cpp b/lldb/source/Commands/CommandObjectTarget.cpp
index b2346c2402a81d..ae6c6d5479a198 100644
--- a/lldb/source/Commands/CommandObjectTarget.cpp
+++ b/lldb/source/Commands/CommandObjectTarget.cpp
@@ -3377,7 +3377,7 @@ class CommandObjectTargetModulesList : public CommandObjectParsed {
       case 'r': {
         size_t ref_count = 0;
         char in_shared_cache = 'Y';
-        
+
         ModuleSP module_sp(module->shared_from_this());
         if (!ModuleList::ModuleIsInCache(module))
           in_shared_cache = 'N';
@@ -4508,11 +4508,8 @@ class CommandObjectTargetSymbolsAdd : public CommandObjectParsed {
 
     ModuleSpec module_spec;
     module_spec.GetUUID() = frame_module_sp->GetUUID();
-
-    if (FileSystem::Instance().Exists(frame_module_sp->GetPlatformFileSpec())) {
-      module_spec.GetArchitecture() = frame_module_sp->GetArchitecture();
-      module_spec.GetFileSpec() = frame_module_sp->GetPlatformFileSpec();
-    }
+    module_spec.GetArchitecture() = frame_module_sp->GetArchitecture();
+    module_spec.GetFileSpec() = frame_module_sp->GetPlatformFileSpec();
 
     if (!DownloadObjectAndSymbolFile(module_spec, result, flush)) {
       result.AppendError("unable to find debug symbols for the current frame");
@@ -4557,12 +4554,8 @@ class CommandObjectTargetSymbolsAdd : public CommandObjectParsed {
 
       ModuleSpec module_spec;
       module_spec.GetUUID() = frame_module_sp->GetUUID();
-
-      if (FileSystem::Instance().Exists(
-              frame_module_sp->GetPlatformFileSpec())) {
-        module_spec.GetArchitecture() = frame_module_sp->GetArchitecture();
-        module_spec.GetFileSpec() = frame_module_sp->GetPlatformFileSpec();
-      }
+      module_spec.GetFileSpec() = frame_module_sp->GetPlatformFileSpec();
+      module_spec.GetArchitecture() = frame_module_sp->GetArchitecture();
 
       bool current_frame_flush = false;
       if (DownloadObjectAndSymbolFile(module_spec, result, current_frame_flush))
diff --git a/lldb/source/Core/Debugger.cpp b/lldb/source/Core/Debugger.cpp
index 90aabde2b764f1..ebd112110e5f2d 100644
--- a/lldb/source/Core/Debugger.cpp
+++ b/lldb/source/Core/Debugger.cpp
@@ -860,6 +860,9 @@ Debugger::Debugger(lldb::LogOutputCallback log_callback, void *baton)
   m_collection_sp->AppendProperty(
       "symbols", "Symbol lookup and cache settings.", true,
       ModuleList::GetGlobalModuleListProperties().GetValueProperties());
+  m_collection_sp->AppendProperty(
+      LanguageProperties::GetSettingName(), "Language settings.", true,
+      Language::GetGlobalLanguageProperties().GetValueProperties());
   if (m_command_interpreter_up) {
     m_collection_sp->AppendProperty(
         "interpreter",
diff --git a/lldb/source/Host/CMakeLists.txt b/lldb/source/Host/CMakeLists.txt
index fe6e539f758fda..c2e091ee8555b7 100644
--- a/lldb/source/Host/CMakeLists.txt
+++ b/lldb/source/Host/CMakeLists.txt
@@ -13,6 +13,7 @@ macro(add_host_subdirectory group)
 endmacro()
 
 add_host_subdirectory(common
+  common/Alarm.cpp
   common/FileAction.cpp
   common/FileCache.cpp
   common/File.cpp
diff --git a/lldb/source/Host/common/Alarm.cpp b/lldb/source/Host/common/Alarm.cpp
new file mode 100644
index 00000000000000..245cdc7ae5c2da
--- /dev/null
+++ b/lldb/source/Host/common/Alarm.cpp
@@ -0,0 +1,216 @@
+//===-- Alarm.cpp ---------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "lldb/Host/Alarm.h"
+#include "lldb/Host/ThreadLauncher.h"
+#include "lldb/Utility/LLDBLog.h"
+#include "lldb/Utility/Log.h"
+
+using namespace lldb;
+using namespace lldb_private;
+
+Alarm::Alarm(Duration timeout, bool run_callback_on_exit)
+    : m_timeout(timeout), m_run_callbacks_on_exit(run_callback_on_exit) {
+  StartAlarmThread();
+}
+
+Alarm::~Alarm() { StopAlarmThread(); }
+
+Alarm::Handle Alarm::Create(std::function<void()> callback) {
+  // Gracefully deal with the unlikely event that the alarm thread failed to
+  // launch.
+  if (!AlarmThreadRunning())
+    return INVALID_HANDLE;
+
+  // Compute the next expiration before we take the lock. This ensures that
+  // waiting on the lock doesn't eat into the timeout.
+  const TimePoint expiration = GetNextExpiration();
+
+  Handle handle = INVALID_HANDLE;
+
+  {
+    std::lock_guard<std::mutex> alarm_guard(m_alarm_mutex);
+
+    // Create a new unique entry and remember its handle.
+    m_entries.emplace_back(callback, expiration);
+    handle = m_entries.back().handle;
+
+    // Tell the alarm thread we need to recompute the next alarm.
+    m_recompute_next_alarm = true;
+  }
+
+  m_alarm_cv.notify_one();
+  return handle;
+}
+
+bool Alarm::Restart(Handle handle) {
+  // Gracefully deal with the unlikely event that the alarm thread failed to
+  // launch.
+  if (!AlarmThreadRunning())
+    return false;
+
+  // Compute the next expiration before we take the lock. This ensures that
+  // waiting on the lock doesn't eat into the timeout.
+  const TimePoint expiration = GetNextExpiration();
+
+  {
+    std::lock_guard<std::mutex> alarm_guard(m_alarm_mutex);
+
+    // Find the entry corresponding to the given handle.
+    const auto it =
+        std::find_if(m_entries.begin(), m_entries.end(),
+                     [handle](Entry &entry) { return entry.handle == handle; });
+    if (it == m_entries.end())
+      return false;
+
+    // Update the expiration.
+    it->expiration = expiration;
+
+    // Tell the alarm thread we need to recompute the next alarm.
+    m_recompute_next_alarm = true;
+  }
+
+  m_alarm_cv.notify_one();
+  return true;
+}
+
+bool Alarm::Cancel(Handle handle) {
+  // Gracefully deal with the unlikely event that the alarm thread failed to
+  // launch.
+  if (!AlarmThreadRunning())
+    return false;
+
+  {
+    std::lock_guard<std::mutex> alarm_guard(m_alarm_mutex);
+
+    const auto it =
+        std::find_if(m_entries.begin(), m_entries.end(),
+                     [handle](Entry &entry) { return entry.handle == handle; });
+
+    if (it == m_entries.end())
+      return false;
+
+    m_entries.erase(it);
+  }
+
+  // No need to notify the alarm thread. This only affects the alarm thread if
+  // we removed the entry that corresponds to the next alarm. If that's the
+  // case, the thread will wake up as scheduled, find no expired events, and
+  // recompute the next alarm time.
+  return true;
+}
+
+Alarm::Entry::Entry(Alarm::Callback callback, Alarm::TimePoint expiration)
+    : handle(Alarm::GetNextUniqueHandle()), callback(std::move(callback)),
+      expiration(std::move(expiration)) {}
+
+void Alarm::StartAlarmThread() {
+  if (!m_alarm_thread.IsJoinable()) {
+    llvm::Expected<HostThread> alarm_thread = ThreadLauncher::LaunchThread(
+        "lldb.debugger.alarm-thread", [this] { return AlarmThread(); },
+        8 * 1024 * 1024); // Use larger 8MB stack for this thread
+    if (alarm_thread) {
+      m_alarm_thread = *alarm_thread;
+    } else {
+      LLDB_LOG_ERROR(GetLog(LLDBLog::Host), alarm_thread.takeError(),
+                     "failed to launch host thread: {0}");
+    }
+  }
+}
+
+void Alarm::StopAlarmThread() {
+  if (m_alarm_thread.IsJoinable()) {
+    {
+      std::lock_guard<std::mutex> alarm_guard(m_alarm_mutex);
+      m_exit = true;
+    }
+    m_alarm_cv.notify_one();
+    m_alarm_thread.Join(nullptr);
+  }
+}
+
+bool Alarm::AlarmThreadRunning() { return m_alarm_thread.IsJoinable(); }
+
+lldb::thread_result_t Alarm::AlarmThread() {
+  bool exit = false;
+  std::optional<TimePoint> next_alarm;
+
+  const auto predicate = [this] { return m_exit || m_recompute_next_alarm; };
+
+  while (!exit) {
+    // Synchronization between the main thread and the alarm thread using a
+    // mutex and condition variable. There are 2 reasons the thread can wake up:
+    //
+    // 1. The timeout for the next alarm expired.
+    //
+    // 2. The condition variable is notified that one of our shared variables
+    //    (see predicate) was modified. Either the thread is asked to shut down
+    //    or a new alarm came in and we need to recompute the next timeout.
+    //
+    // Below we only deal with the timeout expiring and fall through for dealing
+    // with the rest.
+    std::unique_lock<std::mutex> alarm_lock(m_alarm_mutex);
+    if (next_alarm) {
+      if (!m_alarm_cv.wait_until(alarm_lock, *next_alarm, predicate)) {
+        // The timeout for the next alarm expired.
+
+        // Clear the next timeout to signal that we need to recompute the next
+        // timeout.
+        next_alarm.reset();
+
+        // Iterate over all the callbacks. Call the ones that have expired
+        // and remove them from the list.
+        const TimePoint now = std::chrono::system_clock::now();
+        auto it = m_entries.begin();
+        while (it != m_entries.end()) {
+          if (it->expiration <= now) {
+            it->callback();
+            it = m_entries.erase(it);
+          } else {
+            it++;
+          }
+        }
+      }
+    } else {
+      m_alarm_cv.wait(alarm_lock, predicate);
+    }
+
+    // Fall through after waiting on the condition variable. At this point
+    // either the predicate is true or we woke up because an alarm expired.
+
+    // The alarm thread is shutting down.
+    if (m_exit) {
+      exit = true;
+      if (m_run_callbacks_on_exit) {
+        for (Entry &entry : m_entries)
+          entry.callback();
+      }
+      continue;
+    }
+
+    // A new alarm was added or an alarm expired. Either way we need to
+    // recompute when this thread should wake up for the next alarm.
+    if (m_recompute_next_alarm || !next_alarm) {
+      for (Entry &entry : m_entries) {
+        if (!next_alarm || entry.expiration < *next_alarm)
+          next_alarm = entry.expiration;
+      }
+      m_recompute_next_alarm = false;
+    }
+  }
+  return {};
+}
+
+Alarm::TimePoint Alarm::GetNextExpiration() const {
+  return std::chrono::system_clock::now() + m_timeout;
+}
+
+Alarm::Handle Alarm::GetNextUniqueHandle() {
+  static std::atomic<Handle> g_next_handle = 1;
+  return g_next_handle++;
+}
diff --git a/lldb/source/Plugins/Language/CPlusPlus/CMakeLists.txt b/lldb/source/Plugins/Language/CPlusPlus/CMakeLists.txt
index 97fa894ea73761..0c6fdb2b957315 100644
--- a/lldb/source/Plugins/Language/CPlusPlus/CMakeLists.txt
+++ b/lldb/source/Plugins/Language/CPlusPlus/CMakeLists.txt
@@ -13,6 +13,7 @@ add_lldb_library(lldbPluginCPlusPlusLanguage PLUGIN
   LibCxxMap.cpp
   LibCxxQueue.cpp
   LibCxxRangesRefView.cpp
+  LibCxxSliceArray.cpp
   LibCxxSpan.cpp
   LibCxxTuple.cpp
   LibCxxUnorderedMap.cpp
diff --git a/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.cpp b/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.cpp
index 675ca385186102..4a536096a066ff 100644
--- a/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.cpp
+++ b/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.cpp
@@ -755,6 +755,11 @@ static void LoadLibCxxFormatters(lldb::TypeCategoryImplSP cpp_category_sp) {
       lldb_private::formatters::LibcxxStdValarraySyntheticFrontEndCreator,
       "libc++ std::valarray synthetic children",
       "^std::__[[:alnum:]]+::valarray<.+>$", stl_deref_flags, true);
+  AddCXXSynthetic(
+      cpp_category_sp,
+      lldb_private::formatters::LibcxxStdSliceArraySyntheticFrontEndCreator,
+      "libc++ std::slice_array synthetic children",
+      "^std::__[[:alnum:]]+::slice_array<.+>$", stl_deref_flags, true);
   AddCXXSynthetic(
       cpp_category_sp,
       lldb_private::formatters::LibcxxStdForwardListSyntheticFrontEndCreator,
@@ -880,6 +885,11 @@ static void LoadLibCxxFormatters(lldb::TypeCategoryImplSP cpp_category_sp) {
                 lldb_private::formatters::LibcxxContainerSummaryProvider,
                 "libc++ std::valarray summary provider",
                 "^std::__[[:alnum:]]+::valarray<.+>$", stl_summary_flags, true);
+  AddCXXSummary(cpp_category_sp,
+                lldb_private::formatters::LibcxxStdSliceArraySummaryProvider,
+                "libc++ std::slice_array summary provider",
+                "^std::__[[:alnum:]]+::slice_array<.+>$", stl_summary_flags,
+                true);
   AddCXXSummary(
       cpp_category_sp, lldb_private::formatters::LibcxxContainerSummaryProvider,
       "libc++ std::list summary provider",
diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibCxx.h b/lldb/source/Plugins/Language/CPlusPlus/LibCxx.h
index a59f21841ec890..d8b807d180e068 100644
--- a/lldb/source/Plugins/Language/CPlusPlus/LibCxx.h
+++ b/lldb/source/Plugins/Language/CPlusPlus/LibCxx.h
@@ -59,6 +59,10 @@ bool LibcxxWStringViewSummaryProvider(
     ValueObject &valobj, Stream &stream,
     const TypeSummaryOptions &options); // libc++ std::wstring_view
 
+bool LibcxxStdSliceArraySummaryProvider(
+    ValueObject &valobj, Stream &stream,
+    const TypeSummaryOptions &options); // libc++ std::slice_array
+
 bool LibcxxSmartPointerSummaryProvider(
     ValueObject &valobj, Stream &stream,
     const TypeSummaryOptions
@@ -223,6 +227,10 @@ SyntheticChildrenFrontEnd *
 LibcxxStdValarraySyntheticFrontEndCreator(CXXSyntheticChildren *,
                                           lldb::ValueObjectSP);
 
+SyntheticChildrenFrontEnd *
+LibcxxStdSliceArraySyntheticFrontEndCreator(CXXSyntheticChildren *,
+                                            lldb::ValueObjectSP);
+
 SyntheticChildrenFrontEnd *
 LibcxxStdListSyntheticFrontEndCreator(CXXSyntheticChildren *,
                                       lldb::ValueObjectSP);
diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibCxxSliceArray.cpp b/lldb/source/Plugins/Language/CPlusPlus/LibCxxSliceArray.cpp
new file mode 100644
index 00000000000000..32e67d2e38c5d3
--- /dev/null
+++ b/lldb/source/Plugins/Language/CPlusPlus/LibCxxSliceArray.cpp
@@ -0,0 +1,166 @@
+//===-- LibCxxSliceArray.cpp-----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "LibCxx.h"
+
+#include "lldb/Core/ValueObject.h"
+#include "lldb/DataFormatters/FormattersHelpers.h"
+#include <optional>
+
+using namespace lldb;
+using namespace lldb_private;
+using namespace lldb_private::formatters;
+
+namespace lldb_private {
+namespace formatters {
+
+bool LibcxxStdSliceArraySummaryProvider(ValueObject &valobj, Stream &stream,
+                                        const TypeSummaryOptions &options) {
+  ValueObjectSP obj = valobj.GetNonSyntheticValue();
+  if (!obj)
+    return false;
+
+  ValueObjectSP ptr_sp = obj->GetChildMemberWithName("__size_");
+  if (!ptr_sp)
+    return false;
+  const size_t size = ptr_sp->GetValueAsUnsigned(0);
+
+  ptr_sp = obj->GetChildMemberWithName("__stride_");
+  if (!ptr_sp)
+    return false;
+  const size_t stride = ptr_sp->GetValueAsUnsigned(0);
+
+  stream.Printf("stride=%zu size=%zu", stride, size);
+
+  return true;
+}
+
+/// Data formatter for libc++'s std::slice_array.
+///
+/// A slice_array is created by using:
+///   operator[](std::slice slicearr);
+/// and std::slice is created by:
+///   slice(std::size_t start, std::size_t size, std::size_t stride);
+/// The std::slice_array has the following members:
+/// - __vp_ points to std::valarray::__begin_ + @a start
+/// - __size_ is @a size
+/// - __stride_is @a stride
+class LibcxxStdSliceArraySyntheticFrontEnd : public SyntheticChildrenFrontEnd {
+public:
+  LibcxxStdSliceArraySyntheticFrontEnd(lldb::ValueObjectSP valobj_sp);
+
+  ~LibcxxStdSliceArraySyntheticFrontEnd() override;
+
+  llvm::Expected<uint32_t> CalculateNumChildren() override;
+
+  lldb::ValueObjectSP GetChildAtIndex(uint32_t idx) override;
+
+  lldb::ChildCacheState Update() override;
+
+  bool MightHaveChildren() override;
+
+  size_t GetIndexOfChildWithName(ConstString name) override;
+
+private:
+  /// A non-owning pointer to slice_array.__vp_.
+  ValueObject *m_start = nullptr;
+  /// slice_array.__size_.
+  size_t m_size = 0;
+  /// slice_array.__stride_.
+  size_t m_stride = 0;
+  /// The type of slize_array's template argument T.
+  CompilerType m_element_type;
+  /// The sizeof slize_array's template argument T.
+  uint32_t m_element_size = 0;
+};
+
+} // namespace formatters
+} // namespace lldb_private
+
+lldb_private::formatters::LibcxxStdSliceArraySyntheticFrontEnd::
+    LibcxxStdSliceArraySyntheticFrontEnd(lldb::ValueObjectSP valobj_sp)
+    : SyntheticChildrenFrontEnd(*valobj_sp), m_element_type() {
+  if (valobj_sp)
+    Update();
+}
+
+lldb_private::formatters::LibcxxStdSliceArraySyntheticFrontEnd::
+    ~LibcxxStdSliceArraySyntheticFrontEnd() {
+  // these need to stay around because they are child objects who will follow
+  // their parent's life cycle
+  // delete m_start;
+}
+
+llvm::Expected<uint32_t> lldb_private::formatters::
+    LibcxxStdSliceArraySyntheticFrontEnd::CalculateNumChildren() {
+  return m_size;
+}
+
+lldb::ValueObjectSP
+lldb_private::formatters::LibcxxStdSliceArraySyntheticFrontEnd::GetChildAtIndex(
+    uint32_t idx) {
+  if (!m_start)
+    return lldb::ValueObjectSP();
+
+  uint64_t offset = idx * m_stride * m_element_size;
+  offset = offset + m_start->GetValueAsUnsigned(0);
+  StreamString name;
+  name.Printf("[%" PRIu64 "]", (uint64_t)idx);
+  return CreateValueObjectFromAddress(name.GetString(), offset,
+                                      m_backend.GetExecutionContextRef(),
+                                      m_element_type);
+}
+
+lldb::ChildCacheState
+lldb_private::formatters::LibcxxStdSliceArraySyntheticFrontEnd::Update() {
+  m_start = nullptr;
+
+  CompilerType type = m_backend.GetCompilerType();
+  if (type.GetNumTemplateArguments() == 0)
+    return ChildCacheState::eRefetch;
+
+  m_element_type = type.GetTypeTemplateArgument(0);
+  if (std::optional<uint64_t> size = m_element_type.GetByteSize(nullptr))
+    m_element_size = *size;
+
+  if (m_element_size == 0)
+    return ChildCacheState::eRefetch;
+
+  ValueObjectSP start = m_backend.GetChildMemberWithName("__vp_");
+  ValueObjectSP size = m_backend.GetChildMemberWithName("__size_");
+  ValueObjectSP stride = m_backend.GetChildMemberWithName("__stride_");
+
+  if (!start || !size || !stride)
+    return ChildCacheState::eRefetch;
+
+  m_start = start.get();
+  m_size = size->GetValueAsUnsigned(0);
+  m_stride = stride->GetValueAsUnsigned(0);
+
+  return ChildCacheState::eRefetch;
+}
+
+bool lldb_private::formatters::LibcxxStdSliceArraySyntheticFrontEnd::
+    MightHaveChildren() {
+  return true;
+}
+
+size_t lldb_private::formatters::LibcxxStdSliceArraySyntheticFrontEnd::
+    GetIndexOfChildWithName(ConstString name) {
+  if (!m_start)
+    return std::numeric_limits<size_t>::max();
+  return ExtractIndexFromString(name.GetCString());
+}
+
+lldb_private::SyntheticChildrenFrontEnd *
+lldb_private::formatters::LibcxxStdSliceArraySyntheticFrontEndCreator(
+    CXXSyntheticChildren *, lldb::ValueObjectSP valobj_sp) {
+  if (!valobj_sp)
+    return nullptr;
+  return new LibcxxStdSliceArraySyntheticFrontEnd(valobj_sp);
+}
diff --git a/lldb/source/Plugins/Process/mach-core/ProcessMachCore.cpp b/lldb/source/Plugins/Process/mach-core/ProcessMachCore.cpp
index 3961dcf0fbcc0e..7b9938d4f02020 100644
--- a/lldb/source/Plugins/Process/mach-core/ProcessMachCore.cpp
+++ b/lldb/source/Plugins/Process/mach-core/ProcessMachCore.cpp
@@ -652,7 +652,7 @@ size_t ProcessMachCore::ReadMemory(addr_t addr, void *buf, size_t size,
                                    Status &error) {
   // Don't allow the caching that lldb_private::Process::ReadMemory does since
   // in core files we have it all cached our our core file anyway.
-  return DoReadMemory(addr, buf, size, error);
+  return DoReadMemory(FixAnyAddress(addr), buf, size, error);
 }
 
 size_t ProcessMachCore::DoReadMemory(addr_t addr, void *buf, size_t size,
diff --git a/lldb/source/Plugins/SymbolLocator/DebugSymbols/SymbolLocatorDebugSymbols.cpp b/lldb/source/Plugins/SymbolLocator/DebugSymbols/SymbolLocatorDebugSymbols.cpp
index f7df4650941a80..69595c6c14e4c0 100644
--- a/lldb/source/Plugins/SymbolLocator/DebugSymbols/SymbolLocatorDebugSymbols.cpp
+++ b/lldb/source/Plugins/SymbolLocator/DebugSymbols/SymbolLocatorDebugSymbols.cpp
@@ -1066,11 +1066,21 @@ bool SymbolLocatorDebugSymbols::DownloadObjectAndSymbolFile(
   command << lookup_arg;
 
   // Log and report progress.
+  std::string lookup_desc;
+  if (uuid_ptr && file_spec_ptr)
+    lookup_desc =
+        llvm::formatv("{0} ({1})", file_spec_ptr->GetFilename().GetString(),
+                      uuid_ptr->GetAsString());
+  else if (uuid_ptr)
+    lookup_desc = uuid_ptr->GetAsString();
+  else if (file_spec_ptr)
+    lookup_desc = file_spec_ptr->GetFilename().GetString();
+
   Log *log = GetLog(LLDBLog::Host);
-  LLDB_LOG(log, "Calling {0} with {1} to find dSYM: {2}", dsymForUUID_exe_path,
-           lookup_arg, command.GetString());
+  LLDB_LOG(log, "Calling {0} for {1} to find dSYM: {2}", dsymForUUID_exe_path,
+           lookup_desc, command.GetString());
 
-  Progress progress("Downloading symbol file", lookup_arg);
+  Progress progress("Downloading symbol file for", lookup_desc);
 
   // Invoke dsymForUUID.
   int exit_status = -1;
diff --git a/lldb/source/Target/Language.cpp b/lldb/source/Target/Language.cpp
index caf3e6636c1de1..1542c8cb68ce65 100644
--- a/lldb/source/Target/Language.cpp
+++ b/lldb/source/Target/Language.cpp
@@ -13,6 +13,7 @@
 #include "lldb/Target/Language.h"
 
 #include "lldb/Core/PluginManager.h"
+#include "lldb/Interpreter/OptionValueProperties.h"
 #include "lldb/Symbol/SymbolFile.h"
 #include "lldb/Symbol/TypeList.h"
 #include "lldb/Target/Target.h"
@@ -27,6 +28,35 @@ using namespace lldb_private::formatters;
 typedef std::unique_ptr<Language> LanguageUP;
 typedef std::map<lldb::LanguageType, LanguageUP> LanguagesMap;
 
+#define LLDB_PROPERTIES_language
+#include "TargetProperties.inc"
+
+enum {
+#define LLDB_PROPERTIES_language
+#include "TargetPropertiesEnum.inc"
+};
+
+LanguageProperties &Language::GetGlobalLanguageProperties() {
+  static LanguageProperties g_settings;
+  return g_settings;
+}
+
+llvm::StringRef LanguageProperties::GetSettingName() {
+  static constexpr llvm::StringLiteral g_setting_name("language");
+  return g_setting_name;
+}
+
+LanguageProperties::LanguageProperties() {
+  m_collection_sp = std::make_shared<OptionValueProperties>(GetSettingName());
+  m_collection_sp->Initialize(g_language_properties);
+}
+
+bool LanguageProperties::GetEnableFilterForLineBreakpoints() const {
+  const uint32_t idx = ePropertyEnableFilterForLineBreakpoints;
+  return GetPropertyAtIndexAs<bool>(
+      idx, g_language_properties[idx].default_uint_value != 0);
+}
+
 static LanguagesMap &GetLanguagesMap() {
   static LanguagesMap *g_map = nullptr;
   static llvm::once_flag g_initialize;
diff --git a/lldb/source/Target/TargetProperties.td b/lldb/source/Target/TargetProperties.td
index d2fccdb7b9b39c..7f79218e0a6a4d 100644
--- a/lldb/source/Target/TargetProperties.td
+++ b/lldb/source/Target/TargetProperties.td
@@ -311,3 +311,9 @@ let Definition = "thread" in {
     DefaultUnsignedValue<600000>,
     Desc<"Maximum number of frames to backtrace.">;
 }
+
+let Definition = "language" in {
+  def EnableFilterForLineBreakpoints: Property<"enable-filter-for-line-breakpoints", "Boolean">,
+    DefaultTrue,
+    Desc<"If true, allow Language plugins to filter locations when setting breakpoints by line number or regex.">;
+}
diff --git a/lldb/test/API/commands/dwim-print/TestDWIMPrint.py b/lldb/test/API/commands/dwim-print/TestDWIMPrint.py
index 040632096c70e7..c650b1e3533e08 100644
--- a/lldb/test/API/commands/dwim-print/TestDWIMPrint.py
+++ b/lldb/test/API/commands/dwim-print/TestDWIMPrint.py
@@ -146,3 +146,15 @@ def test_void_result(self):
             self, "// break here", lldb.SBFileSpec("main.c")
         )
         self.expect("dwim-print (void)15", matching=False, patterns=["(?i)error"])
+
+    def test_preserves_persistent_variables(self):
+        """Test dwim-print does not delete persistent variables."""
+        self.build()
+        lldbutil.run_to_source_breakpoint(
+            self, "// break here", lldb.SBFileSpec("main.c")
+        )
+        self.expect("dwim-print int $i = 15")
+        # Run the same expression twice and verify success. This ensures the
+        # first command does not delete the persistent variable.
+        for _ in range(2):
+            self.expect("dwim-print $i", startstr="(int) 15")
diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/valarray/TestDataFormatterLibcxxValarray.py b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/valarray/TestDataFormatterLibcxxValarray.py
index 7b54b3485d04d4..b59b770ed6790d 100644
--- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/valarray/TestDataFormatterLibcxxValarray.py
+++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/valarray/TestDataFormatterLibcxxValarray.py
@@ -18,6 +18,10 @@ def test_with_run_command(self):
             self, "break here", lldb.SBFileSpec("main.cpp", False)
         )
 
+        #
+        # std::valarray
+        #
+
         self.expect(
             "frame variable va_int",
             substrs=[
@@ -76,3 +80,30 @@ def test_with_run_command(self):
             error=True,
             substrs=['array index 4 is not valid for "(valarray<double>) va_double"'],
         )
+
+        #
+        # std::slice_array
+        #
+
+        self.expect(
+            "frame variable sa",
+            substrs=[
+                "sa = stride=2 size=4",
+                "[0] = 1",
+                "[1] = 3",
+                "[2] = 5",
+                "[3] = 7",
+                "}",
+            ],
+        )
+
+        # check access-by-index
+        self.expect("frame variable sa[0]", substrs=["1"])
+        self.expect("frame variable sa[1]", substrs=["3"])
+        self.expect("frame variable sa[2]", substrs=["5"])
+        self.expect("frame variable sa[3]", substrs=["7"])
+        self.expect(
+            "frame variable sa[4]",
+            error=True,
+            substrs=['array index 4 is not valid for "(slice_array<int>) sa"'],
+        )
diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/valarray/main.cpp b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/valarray/main.cpp
index f32921e16fa10e..1481d8b4032927 100644
--- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/valarray/main.cpp
+++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/valarray/main.cpp
@@ -13,5 +13,8 @@ int main() {
 
   std::valarray<double> va_double({1.0, 0.5, 0.25, 0.125});
 
+  std::valarray<int> va({0, 1, 2, 3, 4, 5, 6, 7, 8, 9});
+  std::slice_array<int> sa = va[std::slice(1, 4, 2)];
+
   std::cout << "break here\n";
 }
diff --git a/lldb/test/API/macosx/tbi-honored/Makefile b/lldb/test/API/macosx/tbi-honored/Makefile
new file mode 100644
index 00000000000000..10495940055b63
--- /dev/null
+++ b/lldb/test/API/macosx/tbi-honored/Makefile
@@ -0,0 +1,3 @@
+C_SOURCES := main.c
+
+include Makefile.rules
diff --git a/lldb/test/API/macosx/tbi-honored/TestTBIHonored.py b/lldb/test/API/macosx/tbi-honored/TestTBIHonored.py
new file mode 100644
index 00000000000000..a5c0abd70e5a90
--- /dev/null
+++ b/lldb/test/API/macosx/tbi-honored/TestTBIHonored.py
@@ -0,0 +1,57 @@
+"""Test that lldb on Darwin ignores metadata in the top byte of addresses, both corefile and live."""
+
+import lldb
+from lldbsuite.test.decorators import *
+from lldbsuite.test.lldbtest import *
+from lldbsuite.test import lldbutil
+
+
+class TestTBIHonored(TestBase):
+    NO_DEBUG_INFO_TESTCASE = True
+
+    def do_variable_access_tests(self, frame):
+        self.assertEqual(
+            frame.variables["pb"][0]
+            .GetChildMemberWithName("p")
+            .Dereference()
+            .GetValueAsUnsigned(),
+            15,
+        )
+        addr = frame.variables["pb"][0].GetChildMemberWithName("p").GetValueAsUnsigned()
+        # Confirm that there is metadata in the top byte of our pointer
+        self.assertEqual((addr >> 56) & 0xFF, 0xFE)
+        self.expect("expr -- *pb.p", substrs=["15"])
+        self.expect("frame variable *pb.p", substrs=["15"])
+        self.expect("expr -- *(int*)0x%x" % addr, substrs=["15"])
+
+    # This test is valid on AArch64 systems with TBI mode enabled,
+    # and an address mask that clears the top byte before reading
+    # from memory.
+    @skipUnlessDarwin
+    @skipIf(archs=no_match(["arm64", "arm64e"]))
+    @skipIfRemote
+    def test(self):
+        corefile = self.getBuildArtifact("process.core")
+        self.build()
+        (target, process, thread, bkpt) = lldbutil.run_to_source_breakpoint(
+            self, "// break here", lldb.SBFileSpec("main.c")
+        )
+
+        # Test that we can dereference a pointer with TBI data
+        # in a live process.
+        self.do_variable_access_tests(thread.GetFrameAtIndex(0))
+
+        # Create a corefile, delete this process
+        self.runCmd("process save-core -s stack " + corefile)
+        process.Destroy()
+        self.dbg.DeleteTarget(target)
+
+        # Now load the corefile
+        target = self.dbg.CreateTarget("")
+        process = target.LoadCore(corefile)
+        thread = process.GetSelectedThread()
+        self.assertTrue(process.GetSelectedThread().IsValid())
+
+        # Test that we can dereference a pointer with TBI data
+        # in a corefile process.
+        self.do_variable_access_tests(thread.GetFrameAtIndex(0))
diff --git a/lldb/test/API/macosx/tbi-honored/main.c b/lldb/test/API/macosx/tbi-honored/main.c
new file mode 100644
index 00000000000000..3d7ad0b04cd664
--- /dev/null
+++ b/lldb/test/API/macosx/tbi-honored/main.c
@@ -0,0 +1,13 @@
+#include <stdint.h>
+#include <stdio.h>
+union ptrbytes {
+  int *p;
+  uint8_t bytes[8];
+};
+int main() {
+  int c = 15;
+  union ptrbytes pb;
+  pb.p = &c;
+  pb.bytes[7] = 0xfe;
+  printf("%d\n", *pb.p); // break here
+}
diff --git a/lldb/unittests/Host/AlarmTest.cpp b/lldb/unittests/Host/AlarmTest.cpp
new file mode 100644
index 00000000000000..9f6ad189dee970
--- /dev/null
+++ b/lldb/unittests/Host/AlarmTest.cpp
@@ -0,0 +1,159 @@
+//===-- AlarmTest.cpp -----------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "lldb/Host/Alarm.h"
+#include "gtest/gtest.h"
+
+#include <chrono>
+#include <thread>
+
+using namespace lldb_private;
+using namespace std::chrono_literals;
+
+// Increase the timeout tenfold when running under ASan as it can have about the
+// same performance overhead.
+#if __has_feature(address_sanitizer)
+static constexpr auto TEST_TIMEOUT = 10000ms;
+#else
+static constexpr auto TEST_TIMEOUT = 1000ms;
+#endif
+
+// The time between scheduling a callback and it getting executed. This should
+// NOT be increased under ASan.
+static constexpr auto ALARM_TIMEOUT = 500ms;
+
+// If there are any pending callbacks, make sure they run before the Alarm
+// object is destroyed.
+static constexpr bool RUN_CALLBACKS_ON_EXIT = true;
+
+TEST(AlarmTest, Create) {
+  std::mutex m;
+
+  std::vector<Alarm::TimePoint> callbacks_actual;
+  std::vector<Alarm::TimePoint> callbacks_expected;
+
+  Alarm alarm(ALARM_TIMEOUT, RUN_CALLBACKS_ON_EXIT);
+
+  // Create 5 alarms some time apart.
+  for (size_t i = 0; i < 5; ++i) {
+    callbacks_actual.emplace_back();
+    callbacks_expected.emplace_back(std::chrono::system_clock::now() +
+                                    ALARM_TIMEOUT);
+
+    alarm.Create([&callbacks_actual, &m, i]() {
+      std::lock_guard<std::mutex> guard(m);
+      callbacks_actual[i] = std::chrono::system_clock::now();
+    });
+
+    std::this_thread::sleep_for(ALARM_TIMEOUT / 5);
+  }
+
+  // Leave plenty of time for all the alarms to fire.
+  std::this_thread::sleep_for(TEST_TIMEOUT);
+
+  // Make sure all the alarms fired around the expected time.
+  for (size_t i = 0; i < 5; ++i)
+    EXPECT_GE(callbacks_actual[i], callbacks_expected[i]);
+}
+
+TEST(AlarmTest, Exit) {
+  std::mutex m;
+
+  std::vector<Alarm::Handle> handles;
+  std::vector<bool> callbacks;
+
+  {
+    Alarm alarm(ALARM_TIMEOUT, RUN_CALLBACKS_ON_EXIT);
+
+    // Create 5 alarms.
+    for (size_t i = 0; i < 5; ++i) {
+      callbacks.emplace_back(false);
+
+      handles.push_back(alarm.Create([&callbacks, &m, i]() {
+        std::lock_guard<std::mutex> guard(m);
+        callbacks[i] = true;
+      }));
+    }
+
+    // Let the alarm go out of scope before any alarm had a chance to fire.
+  }
+
+  // Make sure none of the alarms fired.
+  for (bool callback : callbacks)
+    EXPECT_TRUE(callback);
+}
+
+TEST(AlarmTest, Cancel) {
+  std::mutex m;
+
+  std::vector<Alarm::Handle> handles;
+  std::vector<bool> callbacks;
+
+  Alarm alarm(ALARM_TIMEOUT, RUN_CALLBACKS_ON_EXIT);
+
+  // Create 5 alarms.
+  for (size_t i = 0; i < 5; ++i) {
+    callbacks.emplace_back(false);
+
+    handles.push_back(alarm.Create([&callbacks, &m, i]() {
+      std::lock_guard<std::mutex> guard(m);
+      callbacks[i] = true;
+    }));
+  }
+
+  // Make sure we can cancel the first 4 alarms.
+  for (size_t i = 0; i < 4; ++i)
+    EXPECT_TRUE(alarm.Cancel(handles[i]));
+
+  // Leave plenty of time for all the alarms to fire.
+  std::this_thread::sleep_for(TEST_TIMEOUT);
+
+  // Make sure none of the first 4 alarms fired.
+  for (size_t i = 0; i < 4; ++i)
+    EXPECT_FALSE(callbacks[i]);
+
+  // Make sure the fifth alarm still fired.
+  EXPECT_TRUE(callbacks[4]);
+}
+
+TEST(AlarmTest, Restart) {
+  std::mutex m;
+
+  std::vector<Alarm::Handle> handles;
+  std::vector<Alarm::TimePoint> callbacks_actual;
+  std::vector<Alarm::TimePoint> callbacks_expected;
+
+  Alarm alarm(ALARM_TIMEOUT, RUN_CALLBACKS_ON_EXIT);
+
+  // Create 5 alarms some time apart.
+  for (size_t i = 0; i < 5; ++i) {
+    callbacks_actual.emplace_back();
+    callbacks_expected.emplace_back(std::chrono::system_clock::now() +
+                                    ALARM_TIMEOUT);
+
+    handles.push_back(alarm.Create([&callbacks_actual, &m, i]() {
+      std::lock_guard<std::mutex> guard(m);
+      callbacks_actual[i] = std::chrono::system_clock::now();
+    }));
+
+    std::this_thread::sleep_for(ALARM_TIMEOUT / 5);
+  }
+
+  // Update the last 2 alarms.
+  for (size_t i = 3; i < 5; ++i) {
+    callbacks_expected[i] = std::chrono::system_clock::now() + ALARM_TIMEOUT;
+    EXPECT_TRUE(alarm.Restart(handles[i]));
+  }
+
+  // Leave plenty of time for all the alarms to fire.
+  std::this_thread::sleep_for(TEST_TIMEOUT);
+
+  // Make sure all the alarms around the expected time.
+  for (size_t i = 0; i < 5; ++i)
+    EXPECT_GE(callbacks_actual[i], callbacks_expected[i]);
+}
diff --git a/lldb/unittests/Host/CMakeLists.txt b/lldb/unittests/Host/CMakeLists.txt
index c959478970d184..7c09932b39c2ad 100644
--- a/lldb/unittests/Host/CMakeLists.txt
+++ b/lldb/unittests/Host/CMakeLists.txt
@@ -1,4 +1,5 @@
 set (FILES
+  AlarmTest.cpp
   ConnectionFileDescriptorTest.cpp
   FileActionTest.cpp
   FileSystemTest.cpp
diff --git a/llvm/cmake/modules/AddLLVM.cmake b/llvm/cmake/modules/AddLLVM.cmake
index ad2a7114270bbb..805c0c1b75fbf6 100644
--- a/llvm/cmake/modules/AddLLVM.cmake
+++ b/llvm/cmake/modules/AddLLVM.cmake
@@ -1641,8 +1641,14 @@ function(add_unittest test_suite test_name)
   # The runtime benefits of LTO don't outweight the compile time costs for tests.
   if(LLVM_ENABLE_LTO)
     if((UNIX OR MINGW) AND LINKER_IS_LLD)
-      set_property(TARGET ${test_name} APPEND_STRING PROPERTY
-                    LINK_FLAGS " -Wl,--lto-O0")
+      if(LLVM_ENABLE_FATLTO AND NOT APPLE)
+        # When using FatLTO, just use relocatable linking.
+        set_property(TARGET ${test_name} APPEND_STRING PROPERTY
+                      LINK_FLAGS " -Wl,--no-fat-lto-objects")
+      else()
+        set_property(TARGET ${test_name} APPEND_STRING PROPERTY
+                      LINK_FLAGS " -Wl,--lto-O0")
+      endif()
     elseif(LINKER_IS_LLD_LINK)
       set_property(TARGET ${test_name} APPEND_STRING PROPERTY
                     LINK_FLAGS " /opt:lldlto=0")
@@ -2094,7 +2100,7 @@ function(add_lit_testsuites project directory)
 endfunction()
 
 function(llvm_install_library_symlink name dest type)
-  cmake_parse_arguments(ARG "" "COMPONENT;SOVERSION" "" ${ARGN})
+  cmake_parse_arguments(ARG "FULL_DEST" "COMPONENT" "" ${ARGN})
   foreach(path ${CMAKE_MODULE_PATH})
     if(EXISTS ${path}/LLVMInstallSymlink.cmake)
       set(INSTALL_SYMLINK ${path}/LLVMInstallSymlink.cmake)
@@ -2108,8 +2114,8 @@ function(llvm_install_library_symlink name dest type)
   endif()
 
   set(full_name ${CMAKE_${type}_LIBRARY_PREFIX}${name}${CMAKE_${type}_LIBRARY_SUFFIX})
-  if (ARG_SOVERSION)
-    set(full_dest ${CMAKE_${type}_LIBRARY_PREFIX}${dest}${CMAKE_${type}_LIBRARY_SUFFIX}.${ARG_SOVERSION})
+  if (ARG_FULL_DEST)
+    set(full_dest ${dest})
   else()
     set(full_dest ${CMAKE_${type}_LIBRARY_PREFIX}${dest}${CMAKE_${type}_LIBRARY_SUFFIX})
   endif()
diff --git a/llvm/cmake/modules/HandleLLVMOptions.cmake b/llvm/cmake/modules/HandleLLVMOptions.cmake
index 745a8354f11896..185266c0861e86 100644
--- a/llvm/cmake/modules/HandleLLVMOptions.cmake
+++ b/llvm/cmake/modules/HandleLLVMOptions.cmake
@@ -32,6 +32,8 @@ endif()
 set(LLVM_ENABLE_LTO OFF CACHE STRING "Build LLVM with LTO. May be specified as Thin or Full to use a particular kind of LTO")
 string(TOUPPER "${LLVM_ENABLE_LTO}" uppercase_LLVM_ENABLE_LTO)
 
+option(LLVM_ENABLE_FATLTO "Build LLVM with -ffat-lto-objects." OFF)
+
 # Ninja Job Pool support
 # The following only works with the Ninja generator in CMake >= 3.0.
 set(LLVM_PARALLEL_COMPILE_JOBS "" CACHE STRING
@@ -1280,6 +1282,13 @@ elseif(LLVM_ENABLE_LTO)
   endif()
 endif()
 
+if(LLVM_ENABLE_FATLTO AND UNIX AND NOT APPLE)
+  append("-ffat-lto-objects" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
+  if(NOT LINKER_IS_LLD_LINK)
+    append("-ffat-lto-objects" CMAKE_EXE_LINKER_FLAGS CMAKE_SHARED_LINKER_FLAGS CMAKE_MODULE_LINKER_FLAGS)
+  endif()
+endif()
+
 # Set an AIX default for LLVM_EXPORT_SYMBOLS_FOR_PLUGINS based on whether we are
 # doing dynamic linking (see below).
 set(LLVM_EXPORT_SYMBOLS_FOR_PLUGINS_AIX_default OFF)
diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index fe37e85c2a40a6..29ea5005c0c409 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -1552,6 +1552,25 @@ The AMDGPU backend supports the following calling conventions:
 
      =============================== ==========================================================
 
+AMDGPU MCExpr
+-------------
+
+As part of the AMDGPU MC layer, AMDGPU provides the following target specific
+``MCExpr``\s.
+
+  .. table:: AMDGPU MCExpr types:
+     :name: amdgpu-mcexpr-table
+
+     =================== ================= ========================================================
+     MCExpr              Operands          Return value
+     =================== ================= ========================================================
+     ``max(arg, ...)``   1 or more         Variadic signed operation that returns the maximum
+                                           value of all its arguments.
+
+     ``or(arg, ...)``    1 or more         Variadic signed operation that returns the bitwise-or
+                                           result of all its arguments.
+
+     =================== ================= ========================================================
 
 .. _amdgpu-elf-code-object:
 
diff --git a/llvm/docs/CommandGuide/dsymutil.rst b/llvm/docs/CommandGuide/dsymutil.rst
index af9d7f16b36196..e3f2f33224b015 100644
--- a/llvm/docs/CommandGuide/dsymutil.rst
+++ b/llvm/docs/CommandGuide/dsymutil.rst
@@ -140,10 +140,6 @@ OPTIONS
  (in bytes) to the linked dSYM. The table is sorted by the output size listing
  the object files with the largest contribution first.
 
-.. option:: --symbol-map <bcsymbolmap>
-
- Update the existing dSYMs inplace using symbol map specified.
-
 .. option:: -s, --symtab
 
  Dumps the symbol table found in *executable* or object file(s) and exits.
diff --git a/llvm/docs/CommandGuide/llvm-debuginfo-analyzer.rst b/llvm/docs/CommandGuide/llvm-debuginfo-analyzer.rst
index 00c070cedb479c..54622cc61fdfda 100644
--- a/llvm/docs/CommandGuide/llvm-debuginfo-analyzer.rst
+++ b/llvm/docs/CommandGuide/llvm-debuginfo-analyzer.rst
@@ -1982,7 +1982,7 @@ level and debug info format.
   llvm-debuginfo-analyzer --attribute=level,format
                           --output-sort=offset
                           --print=scopes,symbols,types,lines,instructions
-                          test-clang.wasm
+                          test-clang.o
 
 or
 
@@ -1991,7 +1991,7 @@ or
   llvm-debuginfo-analyzer --attribute=level,format
                           --output-sort=offset
                           --print=elements
-                          test-clang.wasm
+                          test-clang.o
 
 Each row represents an element that is present within the debug
 information. The first column represents the scope level, followed by
@@ -2001,7 +2001,7 @@ the element.
 .. code-block:: none
 
   Logical View:
-  [000]           {File} 'test-clang.wasm' -> WASM
+  [000]           {File} 'test-clang.o' -> WASM
 
   [001]             {CompileUnit} 'test.cpp'
   [002]     2         {Function} extern not_inlined 'foo' -> 'int'
@@ -2099,10 +2099,10 @@ layout and given the number of matches.
                           --select=BLOCK --select=.store
                           --report=list
                           --print=symbols,types,instructions,summary
-                          test-clang.wasm
+                          test-clang.o
 
   Logical View:
-  [000]           {File} 'test-clang.wasm'
+  [000]           {File} 'test-clang.o'
 
   [001]           {CompileUnit} 'test.cpp'
   [003]           {Code} 'block'
@@ -2139,10 +2139,10 @@ with the printing mode to obtain the following logical view output.
                           --select-regex --select-nocase --select=INTe
                           --report=list
                           --print=symbols,types
-                          test-clang.wasm test-dwarf-gcc.o
+                          test-clang.o test-dwarf-gcc.o
 
   Logical View:
-  [000]           {File} 'test-clang.wasm'
+  [000]           {File} 'test-clang.o'
 
   [001]           {CompileUnit} 'test.cpp'
   [003]     4     {TypeAlias} 'INTEGER' -> 'int'
@@ -2175,13 +2175,13 @@ giving more context by swapping the reference and target object files.
                           --compare=types
                           --report=view
                           --print=symbols,types
-                          test-clang.wasm test-dwarf-gcc.o
+                          test-clang.o test-dwarf-gcc.o
 
-  Reference: 'test-clang.wasm'
+  Reference: 'test-clang.o'
   Target:    'test-dwarf-gcc.o'
 
   Logical View:
-   [000]           {File} 'test-clang.wasm'
+   [000]           {File} 'test-clang.o'
 
    [001]             {CompileUnit} 'test.cpp'
    [002]     1         {TypeAlias} 'INTPTR' -> '* const int'
@@ -2209,9 +2209,9 @@ includes the name, source code location, type, lexical scope level.
                           --compare=types
                           --report=list
                           --print=symbols,types,summary
-                          test-clang.wasm test-dwarf-gcc.o
+                          test-clang.o test-dwarf-gcc.o
 
-  Reference: 'test-clang.wasm'
+  Reference: 'test-clang.o'
   Target:    'test-dwarf-gcc.o'
 
   (1) Missing Types:
@@ -2238,10 +2238,10 @@ Changing the *Reference* and *Target* order:
                           --compare=types
                           --report=list
                           --print=symbols,types,summary
-                          test-dwarf-gcc.o test-clang.wasm
+                          test-dwarf-gcc.o test-clang.o
 
   Reference: 'test-dwarf-gcc.o'
-  Target:    'test-clang.wasm'
+  Target:    'test-clang.o'
 
   (1) Missing Types:
   -[004]     4     {TypeAlias} 'INTEGER' -> 'int'
diff --git a/llvm/docs/GettingInvolved.rst b/llvm/docs/GettingInvolved.rst
index 763aeb87c68805..a4247796cb6591 100644
--- a/llvm/docs/GettingInvolved.rst
+++ b/llvm/docs/GettingInvolved.rst
@@ -323,6 +323,11 @@ The :doc:`CodeOfConduct` applies to all office hours.
     - Monthly, 4th Wednesday of the month at 9:30am PT, for 30 minutes.
     - `Google meet <https://meet.google.com/pdd-dibg-cwv>`__
     - English
+  * - Maksim Levental and Jeremy Kun
+    - MLIR newcomers and general discussion (`livestreamed <https://www.youtube.com/playlist?list=PLhxO86S3jsX2k7kOhZaV-qKWm8tNsUdAE>`__)
+    - Every two weeks, Fridays at 3:00pm US Pacific, for 90 minutes.
+    - Livestream chat or `Google meet <https://meet.google.com/wit-tvzc-dwc>`__
+    - English
   * - Rotating hosts
     - Getting Started, beginner questions, new contributors.
     - Every Tuesday at 2 PM ET (11 AM PT), for 30 minutes.
diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index ecedd3a32c7b36..8bc1cab01bf0a6 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -11049,9 +11049,10 @@ Syntax:
 
 ::
 
-      <result> = getelementptr <ty>, ptr <ptrval>{, [inrange] <ty> <idx>}*
-      <result> = getelementptr inbounds <ty>, ptr <ptrval>{, [inrange] <ty> <idx>}*
-      <result> = getelementptr <ty>, <N x ptr> <ptrval>, [inrange] <vector index type> <idx>
+      <result> = getelementptr <ty>, ptr <ptrval>{, <ty> <idx>}*
+      <result> = getelementptr inbounds <ty>, ptr <ptrval>{, <ty> <idx>}*
+      <result> = getelementptr inrange(S,E) <ty>, ptr <ptrval>{, <ty> <idx>}*
+      <result> = getelementptr <ty>, <N x ptr> <ptrval>, <vector index type> <idx>
 
 Overview:
 """""""""
@@ -11196,16 +11197,15 @@ These rules are based on the assumption that no allocated object may cross
 the unsigned address space boundary, and no allocated object may be larger
 than half the pointer index type space.
 
-If the ``inrange`` keyword is present before any index, loading from or
+If the ``inrange(Start, End)`` attribute is present, loading from or
 storing to any pointer derived from the ``getelementptr`` has undefined
-behavior if the load or store would access memory outside of the bounds of
-the element selected by the index marked as ``inrange``. The result of a
-pointer comparison or ``ptrtoint`` (including ``ptrtoint``-like operations
+behavior if the load or store would access memory outside the half-open range
+``[Start, End)`` from the ``getelementptr`` expression result. The result of
+a pointer comparison or ``ptrtoint`` (including ``ptrtoint``-like operations
 involving memory) involving a pointer derived from a ``getelementptr`` with
 the ``inrange`` keyword is undefined, with the exception of comparisons
-in the case where both operands are in the range of the element selected
-by the ``inrange`` keyword, inclusive of the address one past the end of
-that element. Note that the ``inrange`` keyword is currently only allowed
+in the case where both operands are in the closed range ``[Start, End]``.
+Note that the ``inrange`` keyword is currently only allowed
 in constant ``getelementptr`` expressions.
 
 The getelementptr instruction is often confusing. For some more insight
@@ -14574,6 +14574,63 @@ The arguments (``%a`` and ``%b``) may be of any integer type or a vector with
 integer element type. The argument types must match each other, and the return
 type must match the argument type.
 
+.. _int_scmp:
+
+'``llvm.scmp.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+This is an overloaded intrinsic. You can use ``@llvm.scmp`` on any
+integer bit width or any vector of integer elements.
+
+::
+
+      declare i2 @llvm.scmp.i2.i32(i32 %a, i32 %b)
+      declare <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> %a, <4 x i32> %b)
+
+Overview:
+"""""""""
+
+Return ``-1`` if ``%a`` is signed less than ``%b``, ``0`` if they are equal, and 
+``1`` if ``%a`` is signed greater than ``%b``. Vector intrinsics operate on a per-element basis. 
+
+Arguments:
+""""""""""
+
+The arguments (``%a`` and ``%b``) may be of any integer type or a vector with
+integer element type. The argument types must match each other, and the return
+type must be at least as wide as ``i2``, to hold the three possible return values.
+
+.. _int_ucmp:
+
+'``llvm.ucmp.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+This is an overloaded intrinsic. You can use ``@llvm.ucmp`` on any
+integer bit width or any vector of integer elements.
+
+::
+
+      declare i2 @llvm.ucmp.i2.i32(i32 %a, i32 %b)
+      declare <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> %a, <4 x i32> %b)
+
+Overview:
+"""""""""
+
+Return ``-1`` if ``%a`` is unsigned less than ``%b``, ``0`` if they are equal, and 
+``1`` if ``%a`` is unsigned greater than ``%b``. Vector intrinsics operate on a per-element basis. 
+
+Arguments:
+""""""""""
+
+The arguments (``%a`` and ``%b``) may be of any integer type or a vector with
+integer element type. The argument types must match each other, and the return
+type must be at least as wide as ``i2``, to hold the three possible return values.
 
 .. _int_memcpy:
 
diff --git a/llvm/docs/PointerAuth.md b/llvm/docs/PointerAuth.md
index 41266b43bc29ab..a8d2b4d8f5f0bd 100644
--- a/llvm/docs/PointerAuth.md
+++ b/llvm/docs/PointerAuth.md
@@ -10,6 +10,9 @@ Before the pointer is used, it needs to be authenticated, i.e., have its
 signature checked.  This prevents pointer values of unknown origin from being
 used to replace the signed pointer value.
 
+For more details, see the clang documentation page for
+[Pointer Authentication](https://clang.llvm.org/docs/PointerAuthentication.html).
+
 At the IR level, it is represented using:
 
 * a [set of intrinsics](#intrinsics) (to sign/authenticate pointers)
diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst
index 7be51730663bd1..03691efe836fe8 100644
--- a/llvm/docs/ReleaseNotes.rst
+++ b/llvm/docs/ReleaseNotes.rst
@@ -67,6 +67,8 @@ Changes to Interprocedural Optimizations
 Changes to the AArch64 Backend
 ------------------------------
 
+* Added support for Cortex-A78AE, Cortex-A520AE and Cortex-A720AE CPUs.
+
 Changes to the AMDGPU Backend
 -----------------------------
 
@@ -106,6 +108,7 @@ Changes to the RISC-V Backend
 * The experimental Ssnpm, Smnpm, Smmpm, Sspm, and Supm 0.8.1 Pointer Masking extensions are supported.
 * The experimental Ssqosid extension is supported.
 * Zacas is no longer experimental.
+* Added the CSR names from the Resumable Non-Maskable Interrupts (Smrnmi) extension.
 
 Changes to the WebAssembly Backend
 ----------------------------------
@@ -131,6 +134,18 @@ Changes to the C API
 * Added ``LLVMConstStringInContext2`` function, which better matches the C++
   API by using ``size_t`` for string length. Deprecated ``LLVMConstStringInContext``. 
 
+* Added the following functions for accessing a function's prefix data:
+
+  * ``LLVMHasPrefixData``
+  * ``LLVMGetPrefixData``
+  * ``LLVMSetPrefixData``
+
+* Added the following functions for accessing a function's prologue data:
+
+  * ``LLVMHasPrologueData``
+  * ``LLVMGetPrologueData``
+  * ``LLVMSetPrologueData``
+
 Changes to the CodeGen infrastructure
 -------------------------------------
 
diff --git a/llvm/docs/RemoveDIsDebugInfo.md b/llvm/docs/RemoveDIsDebugInfo.md
index df66c26308a3c6..2cb17e2b5e4483 100644
--- a/llvm/docs/RemoveDIsDebugInfo.md
+++ b/llvm/docs/RemoveDIsDebugInfo.md
@@ -30,6 +30,27 @@ There are two significant changes to be aware of. Firstly, we're adding a single
 
 The second matter is that if you transfer sequences of instructions from one place to another manually, i.e. repeatedly using `moveBefore` where you might have used `splice`, then you should instead use the method `moveBeforePreserving`. `moveBeforePreserving` will transfer debug info records with the instruction they're attached to. This is something that happens automatically today -- if you use `moveBefore` on every element of an instruction sequence, then debug intrinsics will be moved in the normal course of your code, but we lose this behaviour with non-instruction debug info.
 
+# C-API changes
+
+All the functions that have been added are temporary and will be deprecated in the future. The intention is that they'll help downstream projects adapt during the transition period.
+
+```
+New functions (all to be deprecated)
+------------------------------------
+LLVMIsNewDbgInfoFormat                      # Returns true if the module is in the new non-instruction mode.
+LLVMSetIsNewDbgInfoFormat                   # Convert to the requested debug info format.
+
+LLVMDIBuilderInsertDeclareIntrinsicBefore   # Insert a debug intrinsic (old debug info format). 
+LLVMDIBuilderInsertDeclareIntrinsicAtEnd    # Same as above.
+LLVMDIBuilderInsertDbgValueIntrinsicBefore  # Same as above.
+LLVMDIBuilderInsertDbgValueIntrinsicAtEnd   # Same as above.
+
+LLVMDIBuilderInsertDeclareRecordBefore      # Insert a debug record (new debug info format). 
+LLVMDIBuilderInsertDeclareRecordAtEnd       # Same as above.
+LLVMDIBuilderInsertDbgValueRecordBefore     # Same as above.
+LLVMDIBuilderInsertDbgValueRecordAtEnd      # Same as above.
+```
+
 # Anything else?
 
 Not really, but here's an "old vs new" comparison of how to do certain things and quickstart for how this "new" debug info is structured.
@@ -43,10 +64,10 @@ This will all happen transparently without needing to think about it!
 We're using a dedicated C++ class called `DbgRecord` to store debug info, with a one-to-one relationship between each instance of a debug intrinsic and each `DbgRecord` object in any LLVM IR program; these `DbgRecord`s are represented in the IR as non-instruction debug records, as described in the [Source Level Debugging](project:SourceLevelDebugging.rst#Debug Records) document. This class has a set of subclasses that store exactly the same information as is stored in debugging intrinsics. Each one also has almost entirely the same set of methods, that behave in the same way:
 
   https://llvm.org/docs/doxygen/classllvm_1_1DbgRecord.html
-  https://llvm.org/docs/doxygen/classllvm_1_1DPValue.html
+  https://llvm.org/docs/doxygen/classllvm_1_1DbgVariableRecord.html
   https://llvm.org/docs/doxygen/classllvm_1_1DPLabel.html
 
-This allows you to treat a `DPValue` as if it's a `dbg.value`/`dbg.declare`/`dbg.assign` intrinsic most of the time, for example in generic (auto-param) lambdas, and the same for `DPLabel` and `dbg.label`s.
+This allows you to treat a `DbgVariableRecord` as if it's a `dbg.value`/`dbg.declare`/`dbg.assign` intrinsic most of the time, for example in generic (auto-param) lambdas, and the same for `DPLabel` and `dbg.label`s.
 
 ## How do these `DbgRecords` fit into the instruction stream?
 
@@ -74,13 +95,13 @@ Like so:
 
 Each instruction has a pointer to a `DPMarker` (which will become optional), that contains a list of `DbgRecord` objects. No debugging records appear in the instruction list at all. `DbgRecord`s have a parent pointer to their owning `DPMarker`, and each `DPMarker` has a pointer back to it's owning instruction.
 
-Not shown are the links from DbgRecord to other parts of the `Value`/`Metadata` hierachy: `DbgRecord` subclasses have tracking pointers to the DIMetadata that they use, and `DPValue` has references to `Value`s that are stored in a `DebugValueUser` base class. This refers to a `ValueAsMetadata` object referring to `Value`s, via the `TrackingMetadata` facility.
+Not shown are the links from DbgRecord to other parts of the `Value`/`Metadata` hierachy: `DbgRecord` subclasses have tracking pointers to the DIMetadata that they use, and `DbgVariableRecord` has references to `Value`s that are stored in a `DebugValueUser` base class. This refers to a `ValueAsMetadata` object referring to `Value`s, via the `TrackingMetadata` facility.
 
-The various kinds of debug intrinsic (value, declare, assign, label) are all stored in `DbgRecord` subclasses, with a "RecordKind" field distinguishing `DPLabel`s from `DPValue`s, and a `LocationType` field in the `DPValue` class further disambiguating the various debug variable intrinsics it can represent.
+The various kinds of debug intrinsic (value, declare, assign, label) are all stored in `DbgRecord` subclasses, with a "RecordKind" field distinguishing `DPLabel`s from `DbgVariableRecord`s, and a `LocationType` field in the `DbgVariableRecord` class further disambiguating the various debug variable intrinsics it can represent.
 
 ## Finding debug info records
 
-Utilities such as `findDbgUsers` and the like now have an optional argument that will return the set of `DPValue` records that refer to a `Value`. You should be able to treat them the same as intrinsics.
+Utilities such as `findDbgUsers` and the like now have an optional argument that will return the set of `DbgVariableRecord` records that refer to a `Value`. You should be able to treat them the same as intrinsics.
 
 ## Examining debug info records at positions
 
diff --git a/llvm/include/llvm-c/Core.h b/llvm/include/llvm-c/Core.h
index 7cfe4dc4f775fd..f56a6c961aad74 100644
--- a/llvm/include/llvm-c/Core.h
+++ b/llvm/include/llvm-c/Core.h
@@ -744,6 +744,24 @@ LLVMModuleRef LLVMCloneModule(LLVMModuleRef M);
  */
 void LLVMDisposeModule(LLVMModuleRef M);
 
+/**
+ * Soon to be deprecated.
+ * See https://llvm.org/docs/RemoveDIsDebugInfo.html#c-api-changes
+ *
+ * Returns true if the module is in the new debug info mode which uses
+ * non-instruction debug records instead of debug intrinsics for variable
+ * location tracking.
+ */
+LLVMBool LLVMIsNewDbgInfoFormat(LLVMModuleRef M);
+
+/**
+ * Soon to be deprecated.
+ * See https://llvm.org/docs/RemoveDIsDebugInfo.html#c-api-changes
+ *
+ * Convert module into desired debug info format.
+ */
+void LLVMSetIsNewDbgInfoFormat(LLVMModuleRef M, LLVMBool UseNewFormat);
+
 /**
  * Obtain the identifier of a module.
  *
@@ -2730,6 +2748,44 @@ const char *LLVMGetGC(LLVMValueRef Fn);
  */
 void LLVMSetGC(LLVMValueRef Fn, const char *Name);
 
+/**
+ * Gets the prefix data associated with a function. Only valid on functions, and
+ * only if LLVMHasPrefixData returns true.
+ * See https://llvm.org/docs/LangRef.html#prefix-data
+ */
+LLVMValueRef LLVMGetPrefixData(LLVMValueRef Fn);
+
+/**
+ * Check if a given function has prefix data. Only valid on functions.
+ * See https://llvm.org/docs/LangRef.html#prefix-data
+ */
+LLVMBool LLVMHasPrefixData(LLVMValueRef Fn);
+
+/**
+ * Sets the prefix data for the function. Only valid on functions.
+ * See https://llvm.org/docs/LangRef.html#prefix-data
+ */
+void LLVMSetPrefixData(LLVMValueRef Fn, LLVMValueRef prefixData);
+
+/**
+ * Gets the prologue data associated with a function. Only valid on functions,
+ * and only if LLVMHasPrologueData returns true.
+ * See https://llvm.org/docs/LangRef.html#prologue-data
+ */
+LLVMValueRef LLVMGetPrologueData(LLVMValueRef Fn);
+
+/**
+ * Check if a given function has prologue data. Only valid on functions.
+ * See https://llvm.org/docs/LangRef.html#prologue-data
+ */
+LLVMBool LLVMHasPrologueData(LLVMValueRef Fn);
+
+/**
+ * Sets the prologue data for the function. Only valid on functions.
+ * See https://llvm.org/docs/LangRef.html#prologue-data
+ */
+void LLVMSetPrologueData(LLVMValueRef Fn, LLVMValueRef prologueData);
+
 /**
  * Add an attribute to a function.
  *
diff --git a/llvm/include/llvm-c/DebugInfo.h b/llvm/include/llvm-c/DebugInfo.h
index 5924294708cc35..b23ff63c862f84 100644
--- a/llvm/include/llvm-c/DebugInfo.h
+++ b/llvm/include/llvm-c/DebugInfo.h
@@ -1248,7 +1248,24 @@ LLVMMetadataRef LLVMDIBuilderCreateTempGlobalVariableFwdDecl(
     unsigned LineNo, LLVMMetadataRef Ty, LLVMBool LocalToUnit,
     LLVMMetadataRef Decl, uint32_t AlignInBits);
 
+/*
+ * Insert a new llvm.dbg.declare intrinsic call before the given instruction.
+ * \param Builder     The DIBuilder.
+ * \param Storage     The storage of the variable to declare.
+ * \param VarInfo     The variable's debug info descriptor.
+ * \param Expr        A complex location expression for the variable.
+ * \param DebugLoc    Debug info location.
+ * \param Instr       Instruction acting as a location for the new intrinsic.
+ */
+LLVMValueRef
+LLVMDIBuilderInsertDeclareBefore(LLVMDIBuilderRef Builder, LLVMValueRef Storage,
+                                 LLVMMetadataRef VarInfo, LLVMMetadataRef Expr,
+                                 LLVMMetadataRef DebugLoc, LLVMValueRef Instr);
 /**
+ * Soon to be deprecated.
+ * Only use in "old debug mode" (LLVMIsNewDbgFormat() is false).
+ * See https://llvm.org/docs/RemoveDIsDebugInfo.html#c-api-changes
+ *
  * Insert a new llvm.dbg.declare intrinsic call before the given instruction.
  * \param Builder     The DIBuilder.
  * \param Storage     The storage of the variable to declare.
@@ -1257,9 +1274,25 @@ LLVMMetadataRef LLVMDIBuilderCreateTempGlobalVariableFwdDecl(
  * \param DebugLoc    Debug info location.
  * \param Instr       Instruction acting as a location for the new intrinsic.
  */
-LLVMValueRef LLVMDIBuilderInsertDeclareBefore(
-  LLVMDIBuilderRef Builder, LLVMValueRef Storage, LLVMMetadataRef VarInfo,
-  LLVMMetadataRef Expr, LLVMMetadataRef DebugLoc, LLVMValueRef Instr);
+LLVMValueRef LLVMDIBuilderInsertDeclareIntrinsicBefore(
+    LLVMDIBuilderRef Builder, LLVMValueRef Storage, LLVMMetadataRef VarInfo,
+    LLVMMetadataRef Expr, LLVMMetadataRef DebugLoc, LLVMValueRef Instr);
+/**
+ * Soon to be deprecated.
+ * Only use in "new debug mode" (LLVMIsNewDbgFormat() is true).
+ * See https://llvm.org/docs/RemoveDIsDebugInfo.html#c-api-changes
+ *
+ * Insert a Declare DbgRecord before the given instruction.
+ * \param Builder     The DIBuilder.
+ * \param Storage     The storage of the variable to declare.
+ * \param VarInfo     The variable's debug info descriptor.
+ * \param Expr        A complex location expression for the variable.
+ * \param DebugLoc    Debug info location.
+ * \param Instr       Instruction acting as a location for the new record.
+ */
+LLVMDbgRecordRef LLVMDIBuilderInsertDeclareRecordBefore(
+    LLVMDIBuilderRef Builder, LLVMValueRef Storage, LLVMMetadataRef VarInfo,
+    LLVMMetadataRef Expr, LLVMMetadataRef DebugLoc, LLVMValueRef Instr);
 
 /**
  * Insert a new llvm.dbg.declare intrinsic call at the end of the given basic
@@ -1275,6 +1308,42 @@ LLVMValueRef LLVMDIBuilderInsertDeclareBefore(
 LLVMValueRef LLVMDIBuilderInsertDeclareAtEnd(
     LLVMDIBuilderRef Builder, LLVMValueRef Storage, LLVMMetadataRef VarInfo,
     LLVMMetadataRef Expr, LLVMMetadataRef DebugLoc, LLVMBasicBlockRef Block);
+/**
+ * Soon to be deprecated.
+ * Only use in "old debug mode" (LLVMIsNewDbgFormat() is false).
+ * See https://llvm.org/docs/RemoveDIsDebugInfo.html#c-api-changes
+ *
+ * Insert a new llvm.dbg.declare intrinsic call at the end of the given basic
+ * block. If the basic block has a terminator instruction, the intrinsic is
+ * inserted before that terminator instruction.
+ * \param Builder     The DIBuilder.
+ * \param Storage     The storage of the variable to declare.
+ * \param VarInfo     The variable's debug info descriptor.
+ * \param Expr        A complex location expression for the variable.
+ * \param DebugLoc    Debug info location.
+ * \param Block       Basic block acting as a location for the new intrinsic.
+ */
+LLVMValueRef LLVMDIBuilderInsertDeclareIntrinsicAtEnd(
+    LLVMDIBuilderRef Builder, LLVMValueRef Storage, LLVMMetadataRef VarInfo,
+    LLVMMetadataRef Expr, LLVMMetadataRef DebugLoc, LLVMBasicBlockRef Block);
+/**
+ * Soon to be deprecated.
+ * Only use in "new debug mode" (LLVMIsNewDbgFormat() is true).
+ * See https://llvm.org/docs/RemoveDIsDebugInfo.html#c-api-changes
+ *
+ * Insert a Declare DbgRecord at the end of the given basic block. If the basic
+ * block has a terminator instruction, the record is inserted before that
+ * terminator instruction.
+ * \param Builder     The DIBuilder.
+ * \param Storage     The storage of the variable to declare.
+ * \param VarInfo     The variable's debug info descriptor.
+ * \param Expr        A complex location expression for the variable.
+ * \param DebugLoc    Debug info location.
+ * \param Block       Basic block acting as a location for the new record.
+ */
+LLVMDbgRecordRef LLVMDIBuilderInsertDeclareRecordAtEnd(
+    LLVMDIBuilderRef Builder, LLVMValueRef Storage, LLVMMetadataRef VarInfo,
+    LLVMMetadataRef Expr, LLVMMetadataRef DebugLoc, LLVMBasicBlockRef Block);
 
 /**
  * Insert a new llvm.dbg.value intrinsic call before the given instruction.
@@ -1285,12 +1354,42 @@ LLVMValueRef LLVMDIBuilderInsertDeclareAtEnd(
  * \param DebugLoc    Debug info location.
  * \param Instr       Instruction acting as a location for the new intrinsic.
  */
-LLVMValueRef LLVMDIBuilderInsertDbgValueBefore(LLVMDIBuilderRef Builder,
-                                               LLVMValueRef Val,
-                                               LLVMMetadataRef VarInfo,
-                                               LLVMMetadataRef Expr,
-                                               LLVMMetadataRef DebugLoc,
-                                               LLVMValueRef Instr);
+LLVMValueRef
+LLVMDIBuilderInsertDbgValueBefore(LLVMDIBuilderRef Builder, LLVMValueRef Val,
+                                  LLVMMetadataRef VarInfo, LLVMMetadataRef Expr,
+                                  LLVMMetadataRef DebugLoc, LLVMValueRef Instr);
+/**
+ * Soon to be deprecated.
+ * Only use in "old debug mode" (Module::IsNewDbgInfoFormat is false).
+ * See https://llvm.org/docs/RemoveDIsDebugInfo.html#c-api-changes
+ *
+ * Insert a new llvm.dbg.value intrinsic call before the given instruction.
+ * \param Builder     The DIBuilder.
+ * \param Val         The value of the variable.
+ * \param VarInfo     The variable's debug info descriptor.
+ * \param Expr        A complex location expression for the variable.
+ * \param DebugLoc    Debug info location.
+ * \param Instr       Instruction acting as a location for the new intrinsic.
+ */
+LLVMValueRef LLVMDIBuilderInsertDbgValueIntrinsicBefore(
+    LLVMDIBuilderRef Builder, LLVMValueRef Val, LLVMMetadataRef VarInfo,
+    LLVMMetadataRef Expr, LLVMMetadataRef DebugLoc, LLVMValueRef Instr);
+/**
+ * Soon to be deprecated.
+ * Only use in "new debug mode" (Module::IsNewDbgInfoFormat is true).
+ * See https://llvm.org/docs/RemoveDIsDebugInfo.html#c-api-changes
+ *
+ * Insert a new llvm.dbg.value intrinsic call before the given instruction.
+ * \param Builder     The DIBuilder.
+ * \param Val         The value of the variable.
+ * \param VarInfo     The variable's debug info descriptor.
+ * \param Expr        A complex location expression for the variable.
+ * \param DebugLoc    Debug info location.
+ * \param Instr       Instruction acting as a location for the new intrinsic.
+ */
+LLVMDbgRecordRef LLVMDIBuilderInsertDbgValueRecordBefore(
+    LLVMDIBuilderRef Builder, LLVMValueRef Val, LLVMMetadataRef VarInfo,
+    LLVMMetadataRef Expr, LLVMMetadataRef DebugLoc, LLVMValueRef Instr);
 
 /**
  * Insert a new llvm.dbg.value intrinsic call at the end of the given basic
@@ -1303,12 +1402,45 @@ LLVMValueRef LLVMDIBuilderInsertDbgValueBefore(LLVMDIBuilderRef Builder,
  * \param DebugLoc    Debug info location.
  * \param Block       Basic block acting as a location for the new intrinsic.
  */
-LLVMValueRef LLVMDIBuilderInsertDbgValueAtEnd(LLVMDIBuilderRef Builder,
-                                              LLVMValueRef Val,
-                                              LLVMMetadataRef VarInfo,
-                                              LLVMMetadataRef Expr,
-                                              LLVMMetadataRef DebugLoc,
-                                              LLVMBasicBlockRef Block);
+LLVMValueRef LLVMDIBuilderInsertDbgValueAtEnd(
+    LLVMDIBuilderRef Builder, LLVMValueRef Val, LLVMMetadataRef VarInfo,
+    LLVMMetadataRef Expr, LLVMMetadataRef DebugLoc, LLVMBasicBlockRef Block);
+/**
+ * Soon to be deprecated.
+ * Only use in "old debug mode" (Module::IsNewDbgInfoFormat is false).
+ * See https://llvm.org/docs/RemoveDIsDebugInfo.html#c-api-changes
+ *
+ * Insert a new llvm.dbg.value intrinsic call at the end of the given basic
+ * block. If the basic block has a terminator instruction, the intrinsic is
+ * inserted before that terminator instruction.
+ * \param Builder     The DIBuilder.
+ * \param Val         The value of the variable.
+ * \param VarInfo     The variable's debug info descriptor.
+ * \param Expr        A complex location expression for the variable.
+ * \param DebugLoc    Debug info location.
+ * \param Block       Basic block acting as a location for the new intrinsic.
+ */
+LLVMValueRef LLVMDIBuilderInsertDbgValueIntrinsicAtEnd(
+    LLVMDIBuilderRef Builder, LLVMValueRef Val, LLVMMetadataRef VarInfo,
+    LLVMMetadataRef Expr, LLVMMetadataRef DebugLoc, LLVMBasicBlockRef Block);
+/**
+ * Soon to be deprecated.
+ * Only use in "new debug mode" (Module::IsNewDbgInfoFormat is true).
+ * See https://llvm.org/docs/RemoveDIsDebugInfo.html#c-api-changes
+ *
+ * Insert a new llvm.dbg.value intrinsic call at the end of the given basic
+ * block. If the basic block has a terminator instruction, the intrinsic is
+ * inserted before that terminator instruction.
+ * \param Builder     The DIBuilder.
+ * \param Val         The value of the variable.
+ * \param VarInfo     The variable's debug info descriptor.
+ * \param Expr        A complex location expression for the variable.
+ * \param DebugLoc    Debug info location.
+ * \param Block       Basic block acting as a location for the new intrinsic.
+ */
+LLVMDbgRecordRef LLVMDIBuilderInsertDbgValueRecordAtEnd(
+    LLVMDIBuilderRef Builder, LLVMValueRef Val, LLVMMetadataRef VarInfo,
+    LLVMMetadataRef Expr, LLVMMetadataRef DebugLoc, LLVMBasicBlockRef Block);
 
 /**
  * Create a new descriptor for a local auto variable.
diff --git a/llvm/include/llvm-c/Types.h b/llvm/include/llvm-c/Types.h
index d5474d986309fa..4681500ef9da3d 100644
--- a/llvm/include/llvm-c/Types.h
+++ b/llvm/include/llvm-c/Types.h
@@ -169,6 +169,11 @@ typedef struct LLVMOpaqueJITEventListener *LLVMJITEventListenerRef;
  */
 typedef struct LLVMOpaqueBinary *LLVMBinaryRef;
 
+/**
+ * @see llvm::DbgRecord
+ */
+typedef struct LLVMOpaqueDbgRecord *LLVMDbgRecordRef;
+
 /**
  * @}
  */
diff --git a/llvm/include/llvm/ADT/APInt.h b/llvm/include/llvm/ADT/APInt.h
index b10e2107b794a3..1abea9eb24a3c4 100644
--- a/llvm/include/llvm/ADT/APInt.h
+++ b/llvm/include/llvm/ADT/APInt.h
@@ -997,6 +997,12 @@ class [[nodiscard]] APInt {
   APInt ushl_ov(const APInt &Amt, bool &Overflow) const;
   APInt ushl_ov(unsigned Amt, bool &Overflow) const;
 
+  /// Signed integer floor division operation.
+  ///
+  /// Rounds towards negative infinity, i.e. 5 / -2 = -3. Iff minimum value
+  /// divided by -1 set Overflow to true.
+  APInt sfloordiv_ov(const APInt &RHS, bool &Overflow) const;
+
   // Operations that saturate
   APInt sadd_sat(const APInt &RHS) const;
   APInt uadd_sat(const APInt &RHS) const;
diff --git a/llvm/include/llvm/Analysis/ConstantFolding.h b/llvm/include/llvm/Analysis/ConstantFolding.h
index fd885a8c0ea3e5..c54b1e8f01d2b6 100644
--- a/llvm/include/llvm/Analysis/ConstantFolding.h
+++ b/llvm/include/llvm/Analysis/ConstantFolding.h
@@ -22,6 +22,11 @@
 #include <stdint.h>
 
 namespace llvm {
+
+namespace Intrinsic {
+using ID = unsigned;
+}
+
 class APInt;
 template <typename T> class ArrayRef;
 class CallBase;
@@ -187,6 +192,10 @@ Constant *ConstantFoldCall(const CallBase *Call, Function *F,
                            ArrayRef<Constant *> Operands,
                            const TargetLibraryInfo *TLI = nullptr);
 
+Constant *ConstantFoldBinaryIntrinsic(Intrinsic::ID ID, Constant *LHS,
+                                      Constant *RHS, Type *Ty,
+                                      Instruction *FMFSource);
+
 /// ConstantFoldLoadThroughBitcast - try to cast constant to destination type
 /// returning null if unsuccessful. Can cast pointer to pointer or pointer to
 /// integer and vice versa if their sizes are equal.
diff --git a/llvm/include/llvm/Analysis/InstSimplifyFolder.h b/llvm/include/llvm/Analysis/InstSimplifyFolder.h
index 23e2ea80e8cbe6..8a3269d6add0e0 100644
--- a/llvm/include/llvm/Analysis/InstSimplifyFolder.h
+++ b/llvm/include/llvm/Analysis/InstSimplifyFolder.h
@@ -117,6 +117,12 @@ class InstSimplifyFolder final : public IRBuilderFolder {
     return simplifyCastInst(Op, V, DestTy, SQ);
   }
 
+  Value *FoldBinaryIntrinsic(Intrinsic::ID ID, Value *LHS, Value *RHS, Type *Ty,
+                             Instruction *FMFSource) const override {
+    return simplifyBinaryIntrinsic(ID, Ty, LHS, RHS, SQ,
+                                   dyn_cast_if_present<CallBase>(FMFSource));
+  }
+
   //===--------------------------------------------------------------------===//
   // Cast/Conversion Operators
   //===--------------------------------------------------------------------===//
diff --git a/llvm/include/llvm/Analysis/InstructionSimplify.h b/llvm/include/llvm/Analysis/InstructionSimplify.h
index a29955a06cf4e0..03d7ad12c12d8f 100644
--- a/llvm/include/llvm/Analysis/InstructionSimplify.h
+++ b/llvm/include/llvm/Analysis/InstructionSimplify.h
@@ -186,6 +186,11 @@ Value *simplifyExtractElementInst(Value *Vec, Value *Idx,
 Value *simplifyCastInst(unsigned CastOpc, Value *Op, Type *Ty,
                         const SimplifyQuery &Q);
 
+/// Given operands for a BinaryIntrinsic, fold the result or return null.
+Value *simplifyBinaryIntrinsic(Intrinsic::ID IID, Type *ReturnType, Value *Op0,
+                               Value *Op1, const SimplifyQuery &Q,
+                               const CallBase *Call);
+
 /// Given operands for a ShuffleVectorInst, fold the result or return null.
 /// See class ShuffleVectorInst for a description of the mask representation.
 Value *simplifyShuffleVectorInst(Value *Op0, Value *Op1, ArrayRef<int> Mask,
diff --git a/llvm/include/llvm/Analysis/MemoryLocation.h b/llvm/include/llvm/Analysis/MemoryLocation.h
index b72a27cab86b34..830eed5d60ee46 100644
--- a/llvm/include/llvm/Analysis/MemoryLocation.h
+++ b/llvm/include/llvm/Analysis/MemoryLocation.h
@@ -191,8 +191,14 @@ class LocationSize {
     return Value == Other.Value;
   }
 
+  bool operator==(const TypeSize &Other) const {
+    return hasValue() && getValue() == Other;
+  }
+
   bool operator!=(const LocationSize &Other) const { return !(*this == Other); }
 
+  bool operator!=(const TypeSize &Other) const { return !(*this == Other); }
+
   // Ordering operators are not provided, since it's unclear if there's only one
   // reasonable way to compare:
   // - values that don't exist against values that do, and
@@ -292,9 +298,10 @@ class MemoryLocation {
   }
 
   // Return the exact size if the exact size is known at compiletime,
-  // otherwise return MemoryLocation::UnknownSize.
-  static uint64_t getSizeOrUnknown(const TypeSize &T) {
-    return T.isScalable() ? UnknownSize : T.getFixedValue();
+  // otherwise return LocationSize::beforeOrAfterPointer().
+  static LocationSize getSizeOrUnknown(const TypeSize &T) {
+    return T.isScalable() ? LocationSize::beforeOrAfterPointer()
+                          : LocationSize::precise(T.getFixedValue());
   }
 
   MemoryLocation() : Ptr(nullptr), Size(LocationSize::beforeOrAfterPointer()) {}
diff --git a/llvm/include/llvm/Analysis/TargetFolder.h b/llvm/include/llvm/Analysis/TargetFolder.h
index 978e1002515fc0..b4105ad76c02e2 100644
--- a/llvm/include/llvm/Analysis/TargetFolder.h
+++ b/llvm/include/llvm/Analysis/TargetFolder.h
@@ -191,6 +191,15 @@ class TargetFolder final : public IRBuilderFolder {
     return nullptr;
   }
 
+  Value *FoldBinaryIntrinsic(Intrinsic::ID ID, Value *LHS, Value *RHS, Type *Ty,
+                             Instruction *FMFSource) const override {
+    auto *C1 = dyn_cast<Constant>(LHS);
+    auto *C2 = dyn_cast<Constant>(RHS);
+    if (C1 && C2)
+      return ConstantFoldBinaryIntrinsic(ID, C1, C2, Ty, FMFSource);
+    return nullptr;
+  }
+
   //===--------------------------------------------------------------------===//
   // Cast/Conversion Operators
   //===--------------------------------------------------------------------===//
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 4eab357f1b33b6..bad0a77b0f2da2 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -696,6 +696,12 @@ class TargetTransformInfo {
   /// immediate without having to materialize the immediate into a register.
   bool isLegalAddImmediate(int64_t Imm) const;
 
+  /// Return true if adding the specified scalable immediate is legal, that is
+  /// the target has add instructions which can add a register with the
+  /// immediate (multiplied by vscale) without having to materialize the
+  /// immediate into a register.
+  bool isLegalAddScalableImmediate(int64_t Imm) const;
+
   /// Return true if the specified immediate is legal icmp immediate,
   /// that is the target has icmp instructions which can compare a register
   /// against the immediate without having to materialize the immediate into a
@@ -707,11 +713,15 @@ class TargetTransformInfo {
   /// The type may be VoidTy, in which case only return true if the addressing
   /// mode is legal for a load/store of any legal type.
   /// If target returns true in LSRWithInstrQueries(), I may be valid.
+  /// \param ScalableOffset represents a quantity of bytes multiplied by vscale,
+  /// an invariant value known only at runtime. Most targets should not accept
+  /// a scalable offset.
+  ///
   /// TODO: Handle pre/postinc as well.
   bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
                              bool HasBaseReg, int64_t Scale,
-                             unsigned AddrSpace = 0,
-                             Instruction *I = nullptr) const;
+                             unsigned AddrSpace = 0, Instruction *I = nullptr,
+                             int64_t ScalableOffset = 0) const;
 
   /// Return true if LSR cost of C1 is lower than C2.
   bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
@@ -1247,13 +1257,16 @@ class TargetTransformInfo {
   /// cases or optimizations based on those values.
   /// \p CxtI is the optional original context instruction, if one exists, to
   /// provide even more information.
+  /// \p TLibInfo is used to search for platform specific vector library
+  /// functions for instructions that might be converted to calls (e.g. frem).
   InstructionCost getArithmeticInstrCost(
       unsigned Opcode, Type *Ty,
       TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
       TTI::OperandValueInfo Opd1Info = {TTI::OK_AnyValue, TTI::OP_None},
       TTI::OperandValueInfo Opd2Info = {TTI::OK_AnyValue, TTI::OP_None},
       ArrayRef<const Value *> Args = ArrayRef<const Value *>(),
-      const Instruction *CxtI = nullptr) const;
+      const Instruction *CxtI = nullptr,
+      const TargetLibraryInfo *TLibInfo = nullptr) const;
 
   /// Returns the cost estimation for alternating opcode pattern that can be
   /// lowered to a single instruction on the target. In X86 this is for the
@@ -1835,11 +1848,13 @@ class TargetTransformInfo::Concept {
       std::function<void(Instruction *, unsigned, APInt, APInt &)>
           SimplifyAndSetOp) = 0;
   virtual bool isLegalAddImmediate(int64_t Imm) = 0;
+  virtual bool isLegalAddScalableImmediate(int64_t Imm) = 0;
   virtual bool isLegalICmpImmediate(int64_t Imm) = 0;
   virtual bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV,
                                      int64_t BaseOffset, bool HasBaseReg,
                                      int64_t Scale, unsigned AddrSpace,
-                                     Instruction *I) = 0;
+                                     Instruction *I,
+                                     int64_t ScalableOffset) = 0;
   virtual bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
                              const TargetTransformInfo::LSRCost &C2) = 0;
   virtual bool isNumRegsMajorCostOfLSR() = 0;
@@ -2295,14 +2310,17 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
   bool isLegalAddImmediate(int64_t Imm) override {
     return Impl.isLegalAddImmediate(Imm);
   }
+  bool isLegalAddScalableImmediate(int64_t Imm) override {
+    return Impl.isLegalAddScalableImmediate(Imm);
+  }
   bool isLegalICmpImmediate(int64_t Imm) override {
     return Impl.isLegalICmpImmediate(Imm);
   }
   bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
                              bool HasBaseReg, int64_t Scale, unsigned AddrSpace,
-                             Instruction *I) override {
+                             Instruction *I, int64_t ScalableOffset) override {
     return Impl.isLegalAddressingMode(Ty, BaseGV, BaseOffset, HasBaseReg, Scale,
-                                      AddrSpace, I);
+                                      AddrSpace, I, ScalableOffset);
   }
   bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
                      const TargetTransformInfo::LSRCost &C2) override {
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 7f661bb4a1df20..7c47d3c2338a87 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -216,11 +216,14 @@ class TargetTransformInfoImplBase {
 
   bool isLegalAddImmediate(int64_t Imm) const { return false; }
 
+  bool isLegalAddScalableImmediate(int64_t Imm) const { return false; }
+
   bool isLegalICmpImmediate(int64_t Imm) const { return false; }
 
   bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
                              bool HasBaseReg, int64_t Scale, unsigned AddrSpace,
-                             Instruction *I = nullptr) const {
+                             Instruction *I = nullptr,
+                             int64_t ScalableOffset = 0) const {
     // Guess that only reg and reg+reg addressing is allowed. This heuristic is
     // taken from the implementation of LSR.
     return !BaseGV && BaseOffset == 0 && (Scale == 0 || Scale == 1);
diff --git a/llvm/include/llvm/AsmParser/LLParser.h b/llvm/include/llvm/AsmParser/LLParser.h
index e85728aa3c0da0..8ebd0d3409c899 100644
--- a/llvm/include/llvm/AsmParser/LLParser.h
+++ b/llvm/include/llvm/AsmParser/LLParser.h
@@ -563,8 +563,7 @@ namespace llvm {
                     Type *ExpectedTy = nullptr);
     bool parseGlobalValue(Type *Ty, Constant *&C);
     bool parseGlobalTypeAndValue(Constant *&V);
-    bool parseGlobalValueVector(SmallVectorImpl<Constant *> &Elts,
-                                std::optional<unsigned> *InRangeOp = nullptr);
+    bool parseGlobalValueVector(SmallVectorImpl<Constant *> &Elts);
     bool parseOptionalComdat(StringRef GlobalName, Comdat *&C);
     bool parseSanitizer(GlobalVariable *GV);
     bool parseMetadataAsValue(Value *&V, PerFunctionState &PFS);
diff --git a/llvm/include/llvm/BinaryFormat/DXContainer.h b/llvm/include/llvm/BinaryFormat/DXContainer.h
index a28e19edb4c6a6..532f9481766a95 100644
--- a/llvm/include/llvm/BinaryFormat/DXContainer.h
+++ b/llvm/include/llvm/BinaryFormat/DXContainer.h
@@ -141,7 +141,7 @@ enum class PartType {
 #include "DXContainerConstants.def"
 };
 
-#define SHADER_FEATURE_FLAG(Num, Val, Str) Val = 1ull << Num,
+#define SHADER_FEATURE_FLAG(Num, DxilModuleNum, Val, Str) Val = 1ull << Num,
 enum class FeatureFlags : uint64_t {
 #include "DXContainerConstants.def"
 };
diff --git a/llvm/include/llvm/BinaryFormat/DXContainerConstants.def b/llvm/include/llvm/BinaryFormat/DXContainerConstants.def
index 80ed86bc3a499e..62dc573555198b 100644
--- a/llvm/include/llvm/BinaryFormat/DXContainerConstants.def
+++ b/llvm/include/llvm/BinaryFormat/DXContainerConstants.def
@@ -9,46 +9,63 @@ CONTAINER_PART(OSG1)
 CONTAINER_PART(PSG1)
 
 #undef CONTAINER_PART
-#endif 
+#endif // CONTAINER_PART
 
 #ifdef SHADER_FEATURE_FLAG
 
-SHADER_FEATURE_FLAG(0, Doubles, "Double-precision floating point")
-SHADER_FEATURE_FLAG(1, ComputeShadersPlusRawAndStructuredBuffers, "Raw and Structured buffers")
-SHADER_FEATURE_FLAG(2, UAVsAtEveryStage, "UAVs at every shader stage")
-SHADER_FEATURE_FLAG(3, Max64UAVs, "64 UAV slots")
-SHADER_FEATURE_FLAG(4, MinimumPrecision, "Minimum-precision data types")
-SHADER_FEATURE_FLAG(5, DX11_1_DoubleExtensions, "Double-precision extensions for 11.1")
-SHADER_FEATURE_FLAG(6, DX11_1_ShaderExtensions, "Shader extensions for 11.1")
-SHADER_FEATURE_FLAG(7, LEVEL9ComparisonFiltering, "Comparison filtering for feature level 9")
-SHADER_FEATURE_FLAG(8, TiledResources, "Tiled resources")
-SHADER_FEATURE_FLAG(9, StencilRef, "PS Output Stencil Ref")
-SHADER_FEATURE_FLAG(10, InnerCoverage, "PS Inner Coverage")
-SHADER_FEATURE_FLAG(11, TypedUAVLoadAdditionalFormats, "Typed UAV Load Additional Formats")
-SHADER_FEATURE_FLAG(12, ROVs, "Raster Ordered UAVs")
-SHADER_FEATURE_FLAG(13, ViewportAndRTArrayIndexFromAnyShaderFeedingRasterizer, "SV_RenderTargetArrayIndex or SV_ViewportArrayIndex from any shader feeding rasterizer")
-SHADER_FEATURE_FLAG(14, WaveOps, "Wave level operations")
-SHADER_FEATURE_FLAG(15, Int64Ops, "64-Bit integer")
-SHADER_FEATURE_FLAG(16, ViewID, "View Instancing")
-SHADER_FEATURE_FLAG(17, Barycentrics, "Barycentrics")
-SHADER_FEATURE_FLAG(18, NativeLowPrecision, "Use native low precision")
-SHADER_FEATURE_FLAG(19, ShadingRate, "Shading Rate")
-SHADER_FEATURE_FLAG(20, Raytracing_Tier_1_1, "Raytracing tier 1.1 features")
-SHADER_FEATURE_FLAG(21, SamplerFeedback, "Sampler feedback")
-SHADER_FEATURE_FLAG(22, AtomicInt64OnTypedResource, "64-bit Atomics on Typed Resources")
-SHADER_FEATURE_FLAG(23, AtomicInt64OnGroupShared, "64-bit Atomics on Group Shared")
-SHADER_FEATURE_FLAG(24, DerivativesInMeshAndAmpShaders, "Derivatives in mesh and amplification shaders")
-SHADER_FEATURE_FLAG(25, ResourceDescriptorHeapIndexing, "Resource descriptor heap indexing")
-SHADER_FEATURE_FLAG(26, SamplerDescriptorHeapIndexing, "Sampler descriptor heap indexing")
-SHADER_FEATURE_FLAG(27, RESERVED, "<RESERVED>")
-SHADER_FEATURE_FLAG(28, AtomicInt64OnHeapResource, "64-bit Atomics on Heap Resources")
-SHADER_FEATURE_FLAG(29, AdvancedTextureOps, "Advanced Texture Ops")
-SHADER_FEATURE_FLAG(30, WriteableMSAATextures, "Writeable MSAA Textures")
-
-SHADER_FEATURE_FLAG(31, NextUnusedBit, "Next reserved shader flag bit (not a flag)")
+// SHADER_FEATURE_FLAG(bit offset for the shader info flag, bit offset for DXIL module flag, name, description.
+
+SHADER_FEATURE_FLAG(0,   2, Doubles, "Double-precision floating point")
+SHADER_FEATURE_FLAG(1,  17, ComputeShadersPlusRawAndStructuredBuffers, "Raw and Structured buffers")
+SHADER_FEATURE_FLAG(2,  16, UAVsAtEveryStage, "UAVs at every shader stage")
+SHADER_FEATURE_FLAG(3,  15, Max64UAVs, "64 UAV slots")
+SHADER_FEATURE_FLAG(4,  -1, MinimumPrecision, "Minimum-precision data types")
+SHADER_FEATURE_FLAG(5,   6, DX11_1_DoubleExtensions, "Double-precision extensions for 11.1")
+SHADER_FEATURE_FLAG(6,   7, DX11_1_ShaderExtensions, "Shader extensions for 11.1")
+SHADER_FEATURE_FLAG(7,  14, LEVEL9ComparisonFiltering, "Comparison filtering for feature level 9")
+SHADER_FEATURE_FLAG(8,  12, TiledResources, "Tiled resources")
+SHADER_FEATURE_FLAG(9,  11, StencilRef, "PS Output Stencil Ref")
+SHADER_FEATURE_FLAG(10, 10, InnerCoverage, "PS Inner Coverage")
+SHADER_FEATURE_FLAG(11, 13, TypedUAVLoadAdditionalFormats, "Typed UAV Load Additional Formats")
+SHADER_FEATURE_FLAG(12, 18, ROVs, "Raster Ordered UAVs")
+SHADER_FEATURE_FLAG(13,  9, ViewportAndRTArrayIndexFromAnyShaderFeedingRasterizer, "SV_RenderTargetArrayIndex or SV_ViewportArrayIndex from any shader feeding rasterizer")
+SHADER_FEATURE_FLAG(14, 19, WaveOps, "Wave level operations")
+SHADER_FEATURE_FLAG(15, 20, Int64Ops, "64-Bit integer")
+SHADER_FEATURE_FLAG(16, 21, ViewID, "View Instancing")
+SHADER_FEATURE_FLAG(17, 22, Barycentrics, "Barycentrics")
+SHADER_FEATURE_FLAG(18, -1, NativeLowPrecision, "Use native low precision")
+SHADER_FEATURE_FLAG(19, 24, ShadingRate, "Shading Rate")
+SHADER_FEATURE_FLAG(20, 25, Raytracing_Tier_1_1, "Raytracing tier 1.1 features")
+SHADER_FEATURE_FLAG(21, 26, SamplerFeedback, "Sampler feedback")
+SHADER_FEATURE_FLAG(22, 27, AtomicInt64OnTypedResource, "64-bit Atomics on Typed Resources")
+SHADER_FEATURE_FLAG(23, 28, AtomicInt64OnGroupShared, "64-bit Atomics on Group Shared")
+SHADER_FEATURE_FLAG(24, 29, DerivativesInMeshAndAmpShaders, "Derivatives in mesh and amplification shaders")
+SHADER_FEATURE_FLAG(25, 30, ResourceDescriptorHeapIndexing, "Resource descriptor heap indexing")
+SHADER_FEATURE_FLAG(26, 31, SamplerDescriptorHeapIndexing, "Sampler descriptor heap indexing")
+SHADER_FEATURE_FLAG(27, 63, RESERVED, "<RESERVED>")
+SHADER_FEATURE_FLAG(28, 32, AtomicInt64OnHeapResource, "64-bit Atomics on Heap Resources")
+SHADER_FEATURE_FLAG(29, 34, AdvancedTextureOps, "Advanced Texture Ops")
+SHADER_FEATURE_FLAG(30, 35, WriteableMSAATextures, "Writeable MSAA Textures")
+
+SHADER_FEATURE_FLAG(31, 36, NextUnusedBit, "Next reserved shader flag bit (not a flag)")
 
 #undef SHADER_FEATURE_FLAG
-#endif
+#endif // SHADER_FEATURE_FLAG
+
+#ifdef DXIL_MODULE_FLAG
+
+// Only save DXIL module flags which not map to feature flags here.
+DXIL_MODULE_FLAG( 0,  DisableOptimizations,   "D3D11_1_SB_GLOBAL_FLAG_SKIP_OPTIMIZATION")
+DXIL_MODULE_FLAG( 1,  DisableMathRefactoring, "D3D10_SB_GLOBAL_FLAG_REFACTORING_ALLOWED")
+DXIL_MODULE_FLAG( 3,  ForceEarlyDepthStencil, "D3D11_SB_GLOBAL_FLAG_FORCE_EARLY_DEPTH_STENCIL")
+DXIL_MODULE_FLAG( 4,  EnableRawAndStructuredBuffers, "D3D11_SB_GLOBAL_FLAG_ENABLE_RAW_AND_STRUCTURED_BUFFERS")
+DXIL_MODULE_FLAG( 5,  LowPrecisionPresent, "D3D11_1_SB_GLOBAL_FLAG_ENABLE_MINIMUM_PRECISION")
+DXIL_MODULE_FLAG( 8,  AllResourcesBound, "D3D12_SB_GLOBAL_FLAG_ALL_RESOURCES_BOUND")
+DXIL_MODULE_FLAG(23,  UseNativeLowPrecision, "Native 16bit types enabled")
+DXIL_MODULE_FLAG(33,  ResMayNotAlias, "Any UAV may not alias any other UAV")
+
+#undef DXIL_MODULE_FLAG
+#endif // DXIL_MODULE_FLAG
 
 #ifdef SEMANTIC_KIND
 
@@ -86,7 +103,7 @@ SEMANTIC_KIND(30, CullPrimitive)
 SEMANTIC_KIND(30, Invalid)
 
 #undef SEMANTIC_KIND
-#endif
+#endif // SEMANTIC_KIND
 
 #ifdef COMPONENT_TYPE
 
@@ -102,7 +119,7 @@ COMPONENT_TYPE(8, SInt64)
 COMPONENT_TYPE(9, Float64)
 
 #undef COMPONENT_TYPE
-#endif
+#endif // COMPONENT_TYPE
 
 #ifdef COMPONENT_PRECISION
 
@@ -116,7 +133,7 @@ COMPONENT_PRECISION(0xf0, Any16)
 COMPONENT_PRECISION(0xf1, Any10)
 
 #undef COMPONENT_PRECISION
-#endif
+#endif // COMPONENT_PRECISION
 
 #ifdef INTERPOLATION_MODE
 
@@ -131,7 +148,7 @@ INTERPOLATION_MODE(7, LinearNoperspectiveSample)
 INTERPOLATION_MODE(8, Invalid)
 
 #undef INTERPOLATION_MODE
-#endif
+#endif // INTERPOLATION_MODE
 
 #ifdef D3D_SYSTEM_VALUE
 
@@ -165,4 +182,4 @@ D3D_SYSTEM_VALUE(70, InnerCoverage)
 
 #undef D3D_SYSTEM_VALUE
 
-#endif
+#endif // D3D_SYSTEM_VALUE
diff --git a/llvm/include/llvm/BinaryFormat/ELF.h b/llvm/include/llvm/BinaryFormat/ELF.h
index bace3a92677a82..877f3f7862c8ba 100644
--- a/llvm/include/llvm/BinaryFormat/ELF.h
+++ b/llvm/include/llvm/BinaryFormat/ELF.h
@@ -1141,6 +1141,8 @@ enum : unsigned {
 
   SHT_CSKY_ATTRIBUTES = 0x70000001U,
 
+  SHT_HEXAGON_ATTRIBUTES = 0x70000003U,
+
   SHT_HIPROC = 0x7fffffff, // Highest processor arch-specific type.
   SHT_LOUSER = 0x80000000, // Lowest type reserved for applications.
   SHT_HIUSER = 0xffffffff  // Highest type reserved for applications.
diff --git a/llvm/include/llvm/Bitcode/LLVMBitCodes.h b/llvm/include/llvm/Bitcode/LLVMBitCodes.h
index c0a52d64a101d0..4018ef03f960ca 100644
--- a/llvm/include/llvm/Bitcode/LLVMBitCodes.h
+++ b/llvm/include/llvm/Bitcode/LLVMBitCodes.h
@@ -399,18 +399,19 @@ enum ConstantsCodes {
   CST_CODE_DATA = 22,            // DATA:          [n x elements]
   CST_CODE_INLINEASM_OLD2 = 23,  // INLINEASM:     [sideeffect|alignstack|
                                  //                 asmdialect,asmstr,conststr]
-  CST_CODE_CE_GEP_WITH_INRANGE_INDEX = 24, //      [opty, flags, n x operands]
-  CST_CODE_CE_UNOP = 25,                   // CE_UNOP:      [opcode, opval]
-  CST_CODE_POISON = 26,                    // POISON
-  CST_CODE_DSO_LOCAL_EQUIVALENT = 27,      // DSO_LOCAL_EQUIVALENT [gvty, gv]
-  CST_CODE_INLINEASM_OLD3 = 28,    // INLINEASM:     [sideeffect|alignstack|
-                                   //                 asmdialect|unwind,
-                                   //                 asmstr,conststr]
-  CST_CODE_NO_CFI_VALUE = 29, // NO_CFI [ fty, f ]
-  CST_CODE_INLINEASM = 30,    // INLINEASM:     [fnty,
-                              //                 sideeffect|alignstack|
-                              //                 asmdialect|unwind,
-                              //                 asmstr,conststr]
+  CST_CODE_CE_GEP_WITH_INRANGE_INDEX_OLD = 24, //  [opty, flags, n x operands]
+  CST_CODE_CE_UNOP = 25,                       // CE_UNOP:      [opcode, opval]
+  CST_CODE_POISON = 26,                        // POISON
+  CST_CODE_DSO_LOCAL_EQUIVALENT = 27, // DSO_LOCAL_EQUIVALENT [gvty, gv]
+  CST_CODE_INLINEASM_OLD3 = 28,       // INLINEASM:     [sideeffect|alignstack|
+                                      //                 asmdialect|unwind,
+                                      //                 asmstr,conststr]
+  CST_CODE_NO_CFI_VALUE = 29,         // NO_CFI [ fty, f ]
+  CST_CODE_INLINEASM = 30,            // INLINEASM:     [fnty,
+                                      //                 sideeffect|alignstack|
+                                      //                 asmdialect|unwind,
+                                      //                 asmstr,conststr]
+  CST_CODE_CE_GEP_WITH_INRANGE = 31,  // [opty, flags, range, n x operands]
 };
 
 /// CastOpcodes - These are values used in the bitcode files to encode which
@@ -624,6 +625,17 @@ enum FunctionCodes {
                                   //             operation, align, vol,
                                   //             ordering, synchscope]
   FUNC_CODE_BLOCKADDR_USERS = 60, // BLOCKADDR_USERS: [value...]
+
+  FUNC_CODE_DEBUG_RECORD_VALUE =
+      61, // [DILocation, DILocalVariable, DIExpression, ValueAsMetadata]
+  FUNC_CODE_DEBUG_RECORD_DECLARE =
+      62, // [DILocation, DILocalVariable, DIExpression, ValueAsMetadata]
+  FUNC_CODE_DEBUG_RECORD_ASSIGN =
+      63, // [DILocation, DILocalVariable, DIExpression, ValueAsMetadata,
+          //  DIAssignID, DIExpression (addr), ValueAsMetadata (addr)]
+  FUNC_CODE_DEBUG_RECORD_VALUE_SIMPLE =
+      64, // [DILocation, DILocalVariable, DIExpression, Value]
+  FUNC_CODE_DEBUG_RECORD_LABEL = 65, // [DILocation, DILabel]
 };
 
 enum UseListCodes {
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index 61f6564e8cd79b..92fa726c31df14 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -328,18 +328,24 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
     return getTLI()->isLegalAddImmediate(imm);
   }
 
+  bool isLegalAddScalableImmediate(int64_t Imm) {
+    return getTLI()->isLegalAddScalableImmediate(Imm);
+  }
+
   bool isLegalICmpImmediate(int64_t imm) {
     return getTLI()->isLegalICmpImmediate(imm);
   }
 
   bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
-                             bool HasBaseReg, int64_t Scale,
-                             unsigned AddrSpace, Instruction *I = nullptr) {
+                             bool HasBaseReg, int64_t Scale, unsigned AddrSpace,
+                             Instruction *I = nullptr,
+                             int64_t ScalableOffset = 0) {
     TargetLoweringBase::AddrMode AM;
     AM.BaseGV = BaseGV;
     AM.BaseOffs = BaseOffset;
     AM.HasBaseReg = HasBaseReg;
     AM.Scale = Scale;
+    AM.ScalableOffset = ScalableOffset;
     return getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace, I);
   }
 
diff --git a/llvm/include/llvm/CodeGen/FunctionLoweringInfo.h b/llvm/include/llvm/CodeGen/FunctionLoweringInfo.h
index 31af3014afe4e7..45a47d7333e35a 100644
--- a/llvm/include/llvm/CodeGen/FunctionLoweringInfo.h
+++ b/llvm/include/llvm/CodeGen/FunctionLoweringInfo.h
@@ -191,7 +191,7 @@ class FunctionLoweringInfo {
   /// Collection of dbg.declare instructions handled after argument
   /// lowering and before ISel proper.
   SmallPtrSet<const DbgDeclareInst *, 8> PreprocessedDbgDeclares;
-  SmallPtrSet<const DPValue *, 8> PreprocessedDPVDeclares;
+  SmallPtrSet<const DbgVariableRecord *, 8> PreprocessedDVRDeclares;
 
   /// set - Initialize this FunctionLoweringInfo with the given Function
   /// and its associated MachineFunction.
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CallLowering.h b/llvm/include/llvm/CodeGen/GlobalISel/CallLowering.h
index 3c3b91661d5788..4c187a3068d823 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CallLowering.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CallLowering.h
@@ -117,6 +117,9 @@ class CallLowering {
     /// vreg that the swifterror should be copied into after the call.
     Register SwiftErrorVReg;
 
+    /// Valid if the call is a controlled convergent operation.
+    Register ConvergenceCtrlToken;
+
     /// Original IR callsite corresponding to this call, if available.
     const CallBase *CB = nullptr;
 
@@ -584,6 +587,7 @@ class CallLowering {
   bool lowerCall(MachineIRBuilder &MIRBuilder, const CallBase &Call,
                  ArrayRef<Register> ResRegs,
                  ArrayRef<ArrayRef<Register>> ArgRegs, Register SwiftErrorVReg,
+                 Register ConvergenceCtrlToken,
                  std::function<unsigned()> GetCalleeReg) const;
 
   /// For targets which want to use big-endian can enable it with
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h b/llvm/include/llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h
index 8e456015cd3736..c73ac2c9f55b7b 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h
@@ -647,15 +647,15 @@ bool GIMatchTableExecutor::executeMatchTable(
 
       unsigned Size = MRI.getType(MO.getReg()).getSizeInBits();
       if (MatcherOpcode == GIM_CheckMemorySizeEqualToLLT &&
-          MMO->getSizeInBits() != Size) {
+          MMO->getSizeInBits().getValue() != Size) {
         if (handleReject() == RejectAndGiveUp)
           return false;
       } else if (MatcherOpcode == GIM_CheckMemorySizeLessThanLLT &&
-                 MMO->getSizeInBits() >= Size) {
+                 MMO->getSizeInBits().getValue() >= Size) {
         if (handleReject() == RejectAndGiveUp)
           return false;
       } else if (MatcherOpcode == GIM_CheckMemorySizeGreaterThanLLT &&
-                 MMO->getSizeInBits() <= Size)
+                 MMO->getSizeInBits().getValue() <= Size)
         if (handleReject() == RejectAndGiveUp)
           return false;
 
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h b/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h
index 6b03703192df91..261cfcf504d5fe 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h
@@ -54,9 +54,9 @@ class GMemOperation : public GenericMachineInstr {
   bool isUnordered() const { return getMMO().isUnordered(); }
 
   /// Returns the size in bytes of the memory access.
-  uint64_t getMemSize() const { return getMMO().getSize(); }
+  LocationSize getMemSize() const { return getMMO().getSize(); }
   /// Returns the size in bits of the memory access.
-  uint64_t getMemSizeInBits() const { return getMMO().getSizeInBits(); }
+  LocationSize getMemSizeInBits() const { return getMMO().getSizeInBits(); }
 
   static bool classof(const MachineInstr *MI) {
     return GenericMachineInstr::classof(MI) && MI->hasOneMemOperand();
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h b/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h
index 29f675b2203b6b..6ae7c144090798 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h
@@ -243,6 +243,14 @@ class IRTranslator : public MachineFunctionPass {
   bool translateMemFunc(const CallInst &CI, MachineIRBuilder &MIRBuilder,
                         unsigned Opcode);
 
+  // Translate @llvm.experimental.vector.interleave2 and
+  // @llvm.experimental.vector.deinterleave2 intrinsics for fixed-width vector
+  // types into vector shuffles.
+  bool translateVectorInterleave2Intrinsic(const CallInst &CI,
+                                           MachineIRBuilder &MIRBuilder);
+  bool translateVectorDeinterleave2Intrinsic(const CallInst &CI,
+                                             MachineIRBuilder &MIRBuilder);
+
   void getStackGuard(Register DstReg, MachineIRBuilder &MIRBuilder);
 
   bool translateOverflowIntrinsic(const CallInst &CI, unsigned Op,
@@ -579,6 +587,10 @@ class IRTranslator : public MachineFunctionPass {
     return false;
   }
 
+  bool translateConvergenceControlIntrinsic(const CallInst &CI,
+                                            Intrinsic::ID ID,
+                                            MachineIRBuilder &MIRBuilder);
+
   /// @}
 
   // Builder for machine instruction a la IRBuilder.
@@ -697,6 +709,23 @@ class IRTranslator : public MachineFunctionPass {
     return Regs[0];
   }
 
+  Register getOrCreateConvergenceTokenVReg(const Value &Token) {
+    assert(Token.getType()->isTokenTy());
+    auto &Regs = *VMap.getVRegs(Token);
+    if (!Regs.empty()) {
+      assert(Regs.size() == 1 &&
+             "Expected a single register for convergence tokens.");
+      return Regs[0];
+    }
+
+    auto Reg = MRI->createGenericVirtualRegister(LLT::token());
+    Regs.push_back(Reg);
+    auto &Offsets = *VMap.getOffsets(Token);
+    if (Offsets.empty())
+      Offsets.push_back(0);
+    return Reg;
+  }
+
   /// Allocate some vregs and offsets in the VMap. Then populate just the
   /// offsets while leaving the vregs empty.
   ValueToVRegInfo::VRegListT &allocateVRegs(const Value &Val);
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h b/llvm/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h
index da330b517c2801..ca62f38061b115 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h
@@ -432,9 +432,13 @@ class LegalizationArtifactCombiner {
             DestTy.isVector() ? CastSrcTy.getNumElements() / NumDefs : 1;
         LLT UnmergeTy = CastSrcTy.changeElementCount(
             ElementCount::getFixed(UnmergeNumElts));
+        LLT SrcWideTy =
+            SrcTy.changeElementCount(ElementCount::getFixed(UnmergeNumElts));
 
         if (isInstUnsupported(
-                {TargetOpcode::G_UNMERGE_VALUES, {UnmergeTy, CastSrcTy}}))
+                {TargetOpcode::G_UNMERGE_VALUES, {UnmergeTy, CastSrcTy}}) ||
+            LI.getAction({TargetOpcode::G_TRUNC, {SrcWideTy, UnmergeTy}})
+                    .Action == LegalizeActions::MoreElements)
           return false;
 
         Builder.setInstr(MI);
diff --git a/llvm/include/llvm/CodeGen/MachineFrameInfo.h b/llvm/include/llvm/CodeGen/MachineFrameInfo.h
index 7d11d63d4066f4..0fe73fec7ee67f 100644
--- a/llvm/include/llvm/CodeGen/MachineFrameInfo.h
+++ b/llvm/include/llvm/CodeGen/MachineFrameInfo.h
@@ -638,13 +638,17 @@ class MachineFrameInfo {
   bool hasTailCall() const { return HasTailCall; }
   void setHasTailCall(bool V = true) { HasTailCall = V; }
 
-  /// Computes the maximum size of a callframe and the AdjustsStack property.
+  /// Computes the maximum size of a callframe.
   /// This only works for targets defining
   /// TargetInstrInfo::getCallFrameSetupOpcode(), getCallFrameDestroyOpcode(),
   /// and getFrameSize().
   /// This is usually computed by the prologue epilogue inserter but some
   /// targets may call this to compute it earlier.
-  void computeMaxCallFrameSize(const MachineFunction &MF);
+  /// If FrameSDOps is passed, the frame instructions in the MF will be
+  /// inserted into it.
+  void computeMaxCallFrameSize(
+      MachineFunction &MF,
+      std::vector<MachineBasicBlock::iterator> *FrameSDOps = nullptr);
 
   /// Return the maximum size of a call frame that must be
   /// allocated for an outgoing function call.  This is only available if
diff --git a/llvm/include/llvm/CodeGen/MachineFunction.h b/llvm/include/llvm/CodeGen/MachineFunction.h
index a2c90c9f42f38f..dfbf7a1e7aae53 100644
--- a/llvm/include/llvm/CodeGen/MachineFunction.h
+++ b/llvm/include/llvm/CodeGen/MachineFunction.h
@@ -1026,18 +1026,27 @@ class LLVM_EXTERNAL_VISIBILITY MachineFunction {
   /// MachineMemOperands are owned by the MachineFunction and need not be
   /// explicitly deallocated.
   MachineMemOperand *getMachineMemOperand(
-      MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, uint64_t s,
+      MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy,
       Align base_alignment, const AAMDNodes &AAInfo = AAMDNodes(),
       const MDNode *Ranges = nullptr, SyncScope::ID SSID = SyncScope::System,
       AtomicOrdering Ordering = AtomicOrdering::NotAtomic,
       AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic);
-
   MachineMemOperand *getMachineMemOperand(
-      MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy,
-      Align base_alignment, const AAMDNodes &AAInfo = AAMDNodes(),
+      MachinePointerInfo PtrInfo, MachineMemOperand::Flags F, LocationSize Size,
+      Align BaseAlignment, const AAMDNodes &AAInfo = AAMDNodes(),
       const MDNode *Ranges = nullptr, SyncScope::ID SSID = SyncScope::System,
       AtomicOrdering Ordering = AtomicOrdering::NotAtomic,
       AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic);
+  MachineMemOperand *getMachineMemOperand(
+      MachinePointerInfo PtrInfo, MachineMemOperand::Flags F, uint64_t Size,
+      Align BaseAlignment, const AAMDNodes &AAInfo = AAMDNodes(),
+      const MDNode *Ranges = nullptr, SyncScope::ID SSID = SyncScope::System,
+      AtomicOrdering Ordering = AtomicOrdering::NotAtomic,
+      AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic) {
+    return getMachineMemOperand(PtrInfo, F, LocationSize::precise(Size),
+                                BaseAlignment, AAInfo, Ranges, SSID, Ordering,
+                                FailureOrdering);
+  }
 
   /// getMachineMemOperand - Allocate a new MachineMemOperand by copying
   /// an existing one, adjusting by an offset and using the given size.
@@ -1046,9 +1055,16 @@ class LLVM_EXTERNAL_VISIBILITY MachineFunction {
   MachineMemOperand *getMachineMemOperand(const MachineMemOperand *MMO,
                                           int64_t Offset, LLT Ty);
   MachineMemOperand *getMachineMemOperand(const MachineMemOperand *MMO,
-                                          int64_t Offset, uint64_t Size) {
+                                          int64_t Offset, LocationSize Size) {
     return getMachineMemOperand(
-        MMO, Offset, Size == ~UINT64_C(0) ? LLT() : LLT::scalar(8 * Size));
+        MMO, Offset,
+        !Size.hasValue() || Size.isScalable()
+            ? LLT()
+            : LLT::scalar(8 * Size.getValue().getKnownMinValue()));
+  }
+  MachineMemOperand *getMachineMemOperand(const MachineMemOperand *MMO,
+                                          int64_t Offset, uint64_t Size) {
+    return getMachineMemOperand(MMO, Offset, LocationSize::precise(Size));
   }
 
   /// getMachineMemOperand - Allocate a new MachineMemOperand by copying
@@ -1057,10 +1073,15 @@ class LLVM_EXTERNAL_VISIBILITY MachineFunction {
   /// explicitly deallocated.
   MachineMemOperand *getMachineMemOperand(const MachineMemOperand *MMO,
                                           const MachinePointerInfo &PtrInfo,
-                                          uint64_t Size);
+                                          LocationSize Size);
   MachineMemOperand *getMachineMemOperand(const MachineMemOperand *MMO,
                                           const MachinePointerInfo &PtrInfo,
                                           LLT Ty);
+  MachineMemOperand *getMachineMemOperand(const MachineMemOperand *MMO,
+                                          const MachinePointerInfo &PtrInfo,
+                                          uint64_t Size) {
+    return getMachineMemOperand(MMO, PtrInfo, LocationSize::precise(Size));
+  }
 
   /// Allocate a new MachineMemOperand by copying an existing one,
   /// replacing only AliasAnalysis information. MachineMemOperands are owned
diff --git a/llvm/include/llvm/CodeGen/MachineLoopInfo.h b/llvm/include/llvm/CodeGen/MachineLoopInfo.h
index 445c9b1c3bc00f..967c4a70ca4699 100644
--- a/llvm/include/llvm/CodeGen/MachineLoopInfo.h
+++ b/llvm/include/llvm/CodeGen/MachineLoopInfo.h
@@ -89,6 +89,9 @@ class MachineLoop : public LoopBase<MachineBasicBlock, MachineLoop> {
 private:
   friend class LoopInfoBase<MachineBasicBlock, MachineLoop>;
 
+  /// Returns true if the given physreg has no defs inside the loop.
+  bool isLoopInvariantImplicitPhysReg(Register Reg) const;
+
   explicit MachineLoop(MachineBasicBlock *MBB)
     : LoopBase<MachineBasicBlock, MachineLoop>(MBB) {}
 
diff --git a/llvm/include/llvm/CodeGen/MachineMemOperand.h b/llvm/include/llvm/CodeGen/MachineMemOperand.h
index 12c18aaea5b26c..da4ca582cb9e4f 100644
--- a/llvm/include/llvm/CodeGen/MachineMemOperand.h
+++ b/llvm/include/llvm/CodeGen/MachineMemOperand.h
@@ -17,6 +17,7 @@
 
 #include "llvm/ADT/BitmaskEnum.h"
 #include "llvm/ADT/PointerUnion.h"
+#include "llvm/Analysis/MemoryLocation.h"
 #include "llvm/CodeGen/PseudoSourceValue.h"
 #include "llvm/CodeGenTypes/LowLevelType.h"
 #include "llvm/IR/DerivedTypes.h"
@@ -186,7 +187,7 @@ class MachineMemOperand {
   /// and atomic ordering requirements must also be specified. For cmpxchg
   /// atomic operations the atomic ordering requirements when store does not
   /// occur must also be specified.
-  MachineMemOperand(MachinePointerInfo PtrInfo, Flags flags, uint64_t s,
+  MachineMemOperand(MachinePointerInfo PtrInfo, Flags flags, LocationSize TS,
                     Align a, const AAMDNodes &AAInfo = AAMDNodes(),
                     const MDNode *Ranges = nullptr,
                     SyncScope::ID SSID = SyncScope::System,
@@ -235,13 +236,17 @@ class MachineMemOperand {
   LLT getMemoryType() const { return MemoryType; }
 
   /// Return the size in bytes of the memory reference.
-  uint64_t getSize() const {
-    return MemoryType.isValid() ? MemoryType.getSizeInBytes() : ~UINT64_C(0);
+  LocationSize getSize() const {
+    return MemoryType.isValid()
+               ? LocationSize::precise(MemoryType.getSizeInBytes())
+               : LocationSize::beforeOrAfterPointer();
   }
 
   /// Return the size in bits of the memory reference.
-  uint64_t getSizeInBits() const {
-    return MemoryType.isValid() ? MemoryType.getSizeInBits() : ~UINT64_C(0);
+  LocationSize getSizeInBits() const {
+    return MemoryType.isValid()
+               ? LocationSize::precise(MemoryType.getSizeInBits())
+               : LocationSize::beforeOrAfterPointer();
   }
 
   LLT getType() const {
diff --git a/llvm/include/llvm/CodeGen/MachinePassManager.h b/llvm/include/llvm/CodeGen/MachinePassManager.h
index b1166e3a3fd529..3faffe5c4cab29 100644
--- a/llvm/include/llvm/CodeGen/MachinePassManager.h
+++ b/llvm/include/llvm/CodeGen/MachinePassManager.h
@@ -56,7 +56,7 @@ struct MachinePassConcept
 };
 
 template <typename PassT> struct MachinePassModel : MachinePassConcept {
-  explicit MachinePassModel(PassT Pass) : Pass(std::move(Pass)) {}
+  explicit MachinePassModel(PassT &&Pass) : Pass(std::move(Pass)) {}
   // We have to explicitly define all the special member functions because MSVC
   // refuses to generate them.
   MachinePassModel(const MachinePassModel &Arg) : Pass(Arg.Pass) {}
diff --git a/llvm/include/llvm/CodeGen/NonRelocatableStringpool.h b/llvm/include/llvm/CodeGen/NonRelocatableStringpool.h
index 3dc0731f5a04ee..4b8eed7bdb1b53 100644
--- a/llvm/include/llvm/CodeGen/NonRelocatableStringpool.h
+++ b/llvm/include/llvm/CodeGen/NonRelocatableStringpool.h
@@ -27,10 +27,7 @@ class NonRelocatableStringpool {
   /// order.
   using MapTy = StringMap<DwarfStringPoolEntry, BumpPtrAllocator>;
 
-  NonRelocatableStringpool(
-      std::function<StringRef(StringRef Input)> Translator = nullptr,
-      bool PutEmptyString = false)
-      : Translator(Translator) {
+  NonRelocatableStringpool(bool PutEmptyString = false) {
     if (PutEmptyString)
       getEntry("");
   }
@@ -59,7 +56,6 @@ class NonRelocatableStringpool {
   MapTy Strings;
   uint64_t CurrentEndOffset = 0;
   unsigned NumEntries = 0;
-  std::function<StringRef(StringRef Input)> Translator;
 };
 
 /// Helper for making strong types.
diff --git a/llvm/include/llvm/CodeGen/SDPatternMatch.h b/llvm/include/llvm/CodeGen/SDPatternMatch.h
index 96ab6b160a1c77..541c0ecb5be373 100644
--- a/llvm/include/llvm/CodeGen/SDPatternMatch.h
+++ b/llvm/include/llvm/CodeGen/SDPatternMatch.h
@@ -725,6 +725,18 @@ inline auto m_False() {
       },
       m_Value()};
 }
+
+/// Match a negate as a sub(0, v)
+template <typename ValTy>
+inline BinaryOpc_match<SpecificInt_match, ValTy> m_Neg(const ValTy &V) {
+  return m_Sub(m_Zero(), V);
+}
+
+/// Match a Not as a xor(v, -1) or xor(-1, v)
+template <typename ValTy>
+inline BinaryOpc_match<ValTy, SpecificInt_match, true> m_Not(const ValTy &V) {
+  return m_Xor(V, m_AllOnes());
+}
 } // namespace SDPatternMatch
 } // namespace llvm
 #endif
diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h
index 25e6c525b672a1..4785e93d72d1cc 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAG.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAG.h
@@ -1299,7 +1299,7 @@ class SelectionDAG {
       EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment,
       MachineMemOperand::Flags Flags = MachineMemOperand::MOLoad |
                                        MachineMemOperand::MOStore,
-      uint64_t Size = 0, const AAMDNodes &AAInfo = AAMDNodes());
+      LocationSize Size = 0, const AAMDNodes &AAInfo = AAMDNodes());
 
   inline SDValue getMemIntrinsicNode(
       unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef<SDValue> Ops,
@@ -1307,7 +1307,7 @@ class SelectionDAG {
       MaybeAlign Alignment = std::nullopt,
       MachineMemOperand::Flags Flags = MachineMemOperand::MOLoad |
                                        MachineMemOperand::MOStore,
-      uint64_t Size = 0, const AAMDNodes &AAInfo = AAMDNodes()) {
+      LocationSize Size = 0, const AAMDNodes &AAInfo = AAMDNodes()) {
     // Ensure that codegen never sees alignment 0
     return getMemIntrinsicNode(Opcode, dl, VTList, Ops, MemVT, PtrInfo,
                                Alignment.value_or(getEVTAlign(MemVT)), Flags,
diff --git a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
index d1015630b05d12..261f7e49e5c8ca 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
@@ -558,6 +558,7 @@ BEGIN_TWO_BYTE_PACK()
 
   class LoadSDNodeBitfields {
     friend class LoadSDNode;
+    friend class AtomicSDNode;
     friend class VPLoadSDNode;
     friend class VPStridedLoadSDNode;
     friend class MaskedLoadSDNode;
@@ -1475,6 +1476,16 @@ class AtomicSDNode : public MemSDNode {
             MMO->isAtomic()) && "then why are we using an AtomicSDNode?");
   }
 
+  void setExtensionType(ISD::LoadExtType ETy) {
+    assert(getOpcode() == ISD::ATOMIC_LOAD && "Only used for atomic loads.");
+    LoadSDNodeBits.ExtTy = ETy;
+  }
+
+  ISD::LoadExtType getExtensionType() const {
+    assert(getOpcode() == ISD::ATOMIC_LOAD && "Only used for atomic loads.");
+    return static_cast<ISD::LoadExtType>(LoadSDNodeBits.ExtTy);
+  }
+
   const SDValue &getBasePtr() const {
     return getOpcode() == ISD::ATOMIC_STORE ? getOperand(2) : getOperand(1);
   }
diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
index be4ee5b6f9e29a..9fd0ebe6956fbe 100644
--- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
@@ -204,6 +204,7 @@ class TargetInstrInfo : public MCInstrInfo {
   /// if they exist (-1 otherwise).  Some targets use pseudo instructions in
   /// order to abstract away the difference between operating with a frame
   /// pointer and operating without, through the use of these two instructions.
+  /// A FrameSetup MI in MF implies MFI::AdjustsStack.
   ///
   unsigned getCallFrameSetupOpcode() const { return CallFrameSetupOpcode; }
   unsigned getCallFrameDestroyOpcode() const { return CallFrameDestroyOpcode; }
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 2f164a460db843..59fad88f91b1d1 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -2722,17 +2722,19 @@ class TargetLoweringBase {
   }
 
   /// This represents an addressing mode of:
-  ///    BaseGV + BaseOffs + BaseReg + Scale*ScaleReg
+  ///    BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*vscale
   /// If BaseGV is null,  there is no BaseGV.
   /// If BaseOffs is zero, there is no base offset.
   /// If HasBaseReg is false, there is no base register.
   /// If Scale is zero, there is no ScaleReg.  Scale of 1 indicates a reg with
   /// no scale.
+  /// If ScalableOffset is zero, there is no scalable offset.
   struct AddrMode {
     GlobalValue *BaseGV = nullptr;
     int64_t      BaseOffs = 0;
     bool         HasBaseReg = false;
     int64_t      Scale = 0;
+    int64_t ScalableOffset = 0;
     AddrMode() = default;
   };
 
@@ -2770,6 +2772,12 @@ class TargetLoweringBase {
     return true;
   }
 
+  /// Return true if adding the specified scalable immediate is legal, that is
+  /// the target has add instructions which can add a register with the
+  /// immediate (multiplied by vscale) without having to materialize the
+  /// immediate into a register.
+  virtual bool isLegalAddScalableImmediate(int64_t) const { return false; }
+
   /// Return true if the specified immediate is legal for the value input of a
   /// store instruction.
   virtual bool isLegalStoreImmediate(int64_t Value) const {
diff --git a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
index e7c9ecd2e1851a..117d3f71829747 100644
--- a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
@@ -572,6 +572,12 @@ class TargetRegisterInfo : public MCRegisterInfo {
     return false;
   }
 
+  /// Returns true if MachineLoopInfo should analyze the given physreg
+  /// for loop invariance.
+  virtual bool shouldAnalyzePhysregInMachineLoopInfo(MCRegister R) const {
+    return false;
+  }
+
   /// Physical registers that may be modified within a function but are
   /// guaranteed to be restored before any uses. This is useful for targets that
   /// have call sequences where a GOT register may be updated by the caller
diff --git a/llvm/include/llvm/CodeGenTypes/LowLevelType.h b/llvm/include/llvm/CodeGenTypes/LowLevelType.h
index 5a16cffb80b32b..62ee28cfac99c5 100644
--- a/llvm/include/llvm/CodeGenTypes/LowLevelType.h
+++ b/llvm/include/llvm/CodeGenTypes/LowLevelType.h
@@ -45,6 +45,14 @@ class LLT {
                /*AddressSpace=*/0};
   }
 
+  /// Get a low-level token; just a scalar with zero bits (or no size).
+  static constexpr LLT token() {
+    return LLT{/*isPointer=*/false, /*isVector=*/false,
+               /*isScalar=*/true,   ElementCount::getFixed(0),
+               /*SizeInBits=*/0,
+               /*AddressSpace=*/0};
+  }
+
   /// Get a low-level pointer in the given address space.
   static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits) {
     assert(SizeInBits > 0 && "invalid pointer size");
@@ -134,17 +142,17 @@ class LLT {
 
   explicit LLT(MVT VT);
 
-  constexpr bool isValid() const { return IsScalar || IsPointer || IsVector; }
-
+  constexpr bool isValid() const { return IsScalar || RawData != 0; }
   constexpr bool isScalar() const { return IsScalar; }
-
-  constexpr bool isPointer() const { return IsPointer && !IsVector; }
-
-  constexpr bool isPointerVector() const { return IsPointer && IsVector; }
-
-  constexpr bool isPointerOrPointerVector() const { return IsPointer; }
-
-  constexpr bool isVector() const { return IsVector; }
+  constexpr bool isToken() const { return IsScalar && RawData == 0; };
+  constexpr bool isVector() const { return isValid() && IsVector; }
+  constexpr bool isPointer() const {
+    return isValid() && IsPointer && !IsVector;
+  }
+  constexpr bool isPointerVector() const { return IsPointer && isVector(); }
+  constexpr bool isPointerOrPointerVector() const {
+    return IsPointer && isValid();
+  }
 
   /// Returns the number of elements in a vector LLT. Must only be called on
   /// vector types.
@@ -314,6 +322,28 @@ class LLT {
   /// described in static const *Field variables. Each of these variables
   /// is a 2-element array, with the first element describing the bitfield size
   /// and the second element describing the bitfield offset.
+  ///
+  /// +--------+---------+--------+----------+----------------------+
+  /// |isScalar|isPointer|isVector| RawData  |Notes                 |
+  /// +--------+---------+--------+----------+----------------------+
+  /// |   0    |    0    |   0    |    0     |Invalid               |
+  /// +--------+---------+--------+----------+----------------------+
+  /// |   0    |    0    |   1    |    0     |Tombstone Key         |
+  /// +--------+---------+--------+----------+----------------------+
+  /// |   0    |    1    |   0    |    0     |Empty Key             |
+  /// +--------+---------+--------+----------+----------------------+
+  /// |   1    |    0    |   0    |    0     |Token                 |
+  /// +--------+---------+--------+----------+----------------------+
+  /// |   1    |    0    |   0    | non-zero |Scalar                |
+  /// +--------+---------+--------+----------+----------------------+
+  /// |   0    |    1    |   0    | non-zero |Pointer               |
+  /// +--------+---------+--------+----------+----------------------+
+  /// |   0    |    0    |   1    | non-zero |Vector of non-pointer |
+  /// +--------+---------+--------+----------+----------------------+
+  /// |   0    |    1    |   1    | non-zero |Vector of pointer     |
+  /// +--------+---------+--------+----------+----------------------+
+  ///
+  /// Everything else is reserved.
   typedef int BitFieldInfo[2];
   ///
   /// This is how the bitfields are packed per Kind:
diff --git a/llvm/include/llvm/DWARFLinker/Classic/DWARFStreamer.h b/llvm/include/llvm/DWARFLinker/Classic/DWARFStreamer.h
index bebdfd5e60257e..e7a1a3cd838c22 100644
--- a/llvm/include/llvm/DWARFLinker/Classic/DWARFStreamer.h
+++ b/llvm/include/llvm/DWARFLinker/Classic/DWARFStreamer.h
@@ -45,16 +45,13 @@ class DwarfStreamer : public DwarfEmitter {
 public:
   DwarfStreamer(DWARFLinkerBase::OutputFileType OutFileType,
                 raw_pwrite_stream &OutFile,
-                DWARFLinkerBase::TranslatorFuncTy Translator,
                 DWARFLinkerBase::MessageHandlerTy Warning)
-      : OutFile(OutFile), OutFileType(OutFileType), Translator(Translator),
-        WarningHandler(Warning) {}
+      : OutFile(OutFile), OutFileType(OutFileType), WarningHandler(Warning) {}
   virtual ~DwarfStreamer() = default;
 
   static Expected<std::unique_ptr<DwarfStreamer>> createStreamer(
       const Triple &TheTriple, DWARFLinkerBase::OutputFileType FileType,
-      raw_pwrite_stream &OutFile, DWARFLinkerBase::TranslatorFuncTy Translator,
-      DWARFLinkerBase::MessageHandlerTy Warning);
+      raw_pwrite_stream &OutFile, DWARFLinkerBase::MessageHandlerTy Warning);
 
   Error init(Triple TheTriple, StringRef Swift5ReflectionSegmentName);
 
@@ -295,7 +292,6 @@ class DwarfStreamer : public DwarfEmitter {
   /// The output file we stream the linked Dwarf to.
   raw_pwrite_stream &OutFile;
   DWARFLinker::OutputFileType OutFileType = DWARFLinker::OutputFileType::Object;
-  std::function<StringRef(StringRef Input)> Translator;
 
   uint64_t RangesSectionSize = 0;
   uint64_t RngListsSectionSize = 0;
diff --git a/llvm/include/llvm/DWARFLinker/DWARFLinkerBase.h b/llvm/include/llvm/DWARFLinker/DWARFLinkerBase.h
index 5c811b668f0a31..70b25cf02df7f8 100644
--- a/llvm/include/llvm/DWARFLinker/DWARFLinkerBase.h
+++ b/llvm/include/llvm/DWARFLinker/DWARFLinkerBase.h
@@ -82,7 +82,6 @@ class DWARFLinkerBase {
       std::function<void(const DWARFFile &File, llvm::StringRef Output)>;
   using ObjectPrefixMapTy = std::map<std::string, std::string>;
   using CompileUnitHandlerTy = function_ref<void(const DWARFUnit &Unit)>;
-  using TranslatorFuncTy = std::function<StringRef(StringRef)>;
   using SwiftInterfacesMapTy = std::map<std::string, std::string>;
   /// Type of output file.
   enum class OutputFileType : uint8_t {
diff --git a/llvm/include/llvm/DWARFLinker/Parallel/DWARFLinker.h b/llvm/include/llvm/DWARFLinker/Parallel/DWARFLinker.h
index 5312712a4a5b81..8acc046d072493 100644
--- a/llvm/include/llvm/DWARFLinker/Parallel/DWARFLinker.h
+++ b/llvm/include/llvm/DWARFLinker/Parallel/DWARFLinker.h
@@ -123,8 +123,7 @@ class DWARFLinker : public DWARFLinkerBase {
 
   /// Creates dwarf linker instance.
   static std::unique_ptr<DWARFLinker>
-  createLinker(MessageHandlerTy ErrorHandler, MessageHandlerTy WarningHandler,
-               TranslatorFuncTy StringsTranslator = nullptr);
+  createLinker(MessageHandlerTy ErrorHandler, MessageHandlerTy WarningHandler);
 
   /// Set output DWARF handler. Result of linking DWARF is set of sections
   /// containing final debug info. DWARFLinkerBase::link() pass generated
diff --git a/llvm/include/llvm/DebugInfo/LogicalView/Readers/LVELFReader.h b/llvm/include/llvm/DebugInfo/LogicalView/Readers/LVDWARFReader.h
similarity index 88%
rename from llvm/include/llvm/DebugInfo/LogicalView/Readers/LVELFReader.h
rename to llvm/include/llvm/DebugInfo/LogicalView/Readers/LVDWARFReader.h
index 0837b886a273a6..22e804a459f873 100644
--- a/llvm/include/llvm/DebugInfo/LogicalView/Readers/LVELFReader.h
+++ b/llvm/include/llvm/DebugInfo/LogicalView/Readers/LVDWARFReader.h
@@ -1,4 +1,4 @@
-//===-- LVELFReader.h -------------------------------------------*- C++ -*-===//
+//===-- LVDWARFReader.h -----------------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,13 +6,13 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This file defines the LVELFReader class, which is used to describe a
+// This file defines the LVDWARFReader class, which is used to describe a
 // debug information (DWARF) reader.
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_DEBUGINFO_LOGICALVIEW_READERS_LVELFREADER_H
-#define LLVM_DEBUGINFO_LOGICALVIEW_READERS_LVELFREADER_H
+#ifndef LLVM_DEBUGINFO_LOGICALVIEW_READERS_LVDWARFREADER_H
+#define LLVM_DEBUGINFO_LOGICALVIEW_READERS_LVDWARFREADER_H
 
 #include "llvm/DebugInfo/DWARF/DWARFAbbreviationDeclaration.h"
 #include "llvm/DebugInfo/DWARF/DWARFContext.h"
@@ -30,7 +30,7 @@ class LVType;
 
 using AttributeSpec = DWARFAbbreviationDeclaration::AttributeSpec;
 
-class LVELFReader final : public LVBinaryReader {
+class LVDWARFReader final : public LVBinaryReader {
   object::ObjectFile &Obj;
 
   // Indicates if ranges data are available; in the case of split DWARF any
@@ -127,14 +127,14 @@ class LVELFReader final : public LVBinaryReader {
   void sortScopes() override;
 
 public:
-  LVELFReader() = delete;
-  LVELFReader(StringRef Filename, StringRef FileFormatName,
-              object::ObjectFile &Obj, ScopedPrinter &W)
+  LVDWARFReader() = delete;
+  LVDWARFReader(StringRef Filename, StringRef FileFormatName,
+                object::ObjectFile &Obj, ScopedPrinter &W)
       : LVBinaryReader(Filename, FileFormatName, W, LVBinaryType::ELF),
         Obj(Obj) {}
-  LVELFReader(const LVELFReader &) = delete;
-  LVELFReader &operator=(const LVELFReader &) = delete;
-  ~LVELFReader() = default;
+  LVDWARFReader(const LVDWARFReader &) = delete;
+  LVDWARFReader &operator=(const LVDWARFReader &) = delete;
+  ~LVDWARFReader() = default;
 
   LVAddress getCUBaseAddress() const { return CUBaseAddress; }
   void setCUBaseAddress(LVAddress Address) { CUBaseAddress = Address; }
@@ -158,4 +158,4 @@ class LVELFReader final : public LVBinaryReader {
 } // end namespace logicalview
 } // end namespace llvm
 
-#endif // LLVM_DEBUGINFO_LOGICALVIEW_READERS_LVELFREADER_H
+#endif // LLVM_DEBUGINFO_LOGICALVIEW_READERS_LVDWARFREADER_H
diff --git a/llvm/include/llvm/ExecutionEngine/Orc/MachOPlatform.h b/llvm/include/llvm/ExecutionEngine/Orc/MachOPlatform.h
index e928faf4788559..2ffde1f9eb10e8 100644
--- a/llvm/include/llvm/ExecutionEngine/Orc/MachOPlatform.h
+++ b/llvm/include/llvm/ExecutionEngine/Orc/MachOPlatform.h
@@ -60,9 +60,9 @@ class MachOPlatform : public Platform {
 
     struct BuildVersionOpts {
 
-      // Derive platform from triple.
-      static BuildVersionOpts fromTriple(const Triple &TT, uint32_t MinOS,
-                                         uint32_t SDK);
+      // Derive platform from triple if possible.
+      static std::optional<BuildVersionOpts>
+      fromTriple(const Triple &TT, uint32_t MinOS, uint32_t SDK);
 
       uint32_t Platform; // Platform.
       uint32_t MinOS;    // X.Y.Z is encoded in nibbles xxxx.yy.zz
@@ -73,13 +73,12 @@ class MachOPlatform : public Platform {
     /// will be used.
     std::optional<Dylib> IDDylib;
 
-    /// Override for LC_BUILD_VERSION. If this is nullopt then
-    std::optional<BuildVersionOpts> BuildVersion;
-
     /// List of LC_LOAD_DYLIBs.
     std::vector<Dylib> LoadDylibs;
     /// List of LC_RPATHs.
     std::vector<std::string> RPaths;
+    /// List of LC_BUILD_VERSIONs.
+    std::vector<BuildVersionOpts> BuildVersions;
 
     HeaderOptions() = default;
     HeaderOptions(Dylib D) : IDDylib(std::move(D)) {}
diff --git a/llvm/include/llvm/ExecutionEngine/Orc/Shared/VTuneSharedStructs.h b/llvm/include/llvm/ExecutionEngine/Orc/Shared/VTuneSharedStructs.h
index 667d3446faff74..3517d0b0917199 100644
--- a/llvm/include/llvm/ExecutionEngine/Orc/Shared/VTuneSharedStructs.h
+++ b/llvm/include/llvm/ExecutionEngine/Orc/Shared/VTuneSharedStructs.h
@@ -13,6 +13,10 @@
 #ifndef LLVM_EXECUTIONENGINE_ORC_SHARED_VTUNESHAREDSTRUCTS_H
 #define LLVM_EXECUTIONENGINE_ORC_SHARED_VTUNESHAREDSTRUCTS_H
 
+#include "ExecutorAddress.h"
+#include <utility>
+#include <vector>
+
 namespace llvm {
 namespace orc {
 
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h b/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h
index 1bfaf3718552e5..338b56226f2041 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h
@@ -72,7 +72,10 @@ enum class IdentFlag {
 #include "llvm/Frontend/OpenMP/OMPKinds.def"
 
 // Version of the kernel argument format used by the omp runtime.
-#define OMP_KERNEL_ARG_VERSION 2
+#define OMP_KERNEL_ARG_VERSION 3
+
+// Minimum version of the compiler that generates a kernel dynamic pointer.
+#define OMP_KERNEL_ARG_MIN_VERSION_WITH_DYN_PTR 3
 
 /// \note This needs to be kept in sync with kmp.h enum sched_type.
 /// Todo: Update kmp.h to include this file, and remove the enums in kmp.h
diff --git a/llvm/include/llvm/IR/AutoUpgrade.h b/llvm/include/llvm/IR/AutoUpgrade.h
index 1ef32bcb121bec..152f781ffa9b30 100644
--- a/llvm/include/llvm/IR/AutoUpgrade.h
+++ b/llvm/include/llvm/IR/AutoUpgrade.h
@@ -88,9 +88,6 @@ namespace llvm {
   /// info. Return true if module is modified.
   bool UpgradeDebugInfo(Module &M);
 
-  /// Copies module attributes to the functions in the module.
-  void CopyModuleAttrToFunctions(Module &M);
-
   /// Check whether a string looks like an old loop attachment tag.
   inline bool mayBeOldLoopAttachmentTag(StringRef Name) {
     return Name.starts_with("llvm.vectorizer.");
diff --git a/llvm/include/llvm/IR/BasicBlock.h b/llvm/include/llvm/IR/BasicBlock.h
index 71c1a8394896b7..51444c7f8c9ccd 100644
--- a/llvm/include/llvm/IR/BasicBlock.h
+++ b/llvm/include/llvm/IR/BasicBlock.h
@@ -14,6 +14,7 @@
 #define LLVM_IR_BASICBLOCK_H
 
 #include "llvm-c/Types.h"
+#include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/Twine.h"
 #include "llvm/ADT/ilist.h"
 #include "llvm/ADT/ilist_node.h"
@@ -21,7 +22,6 @@
 #include "llvm/ADT/iterator_range.h"
 #include "llvm/IR/DebugProgramInstruction.h"
 #include "llvm/IR/Instruction.h"
-#include "llvm/IR/DebugProgramInstruction.h"
 #include "llvm/IR/SymbolTableListTraits.h"
 #include "llvm/IR/Value.h"
 #include <cassert>
@@ -38,7 +38,7 @@ class LLVMContext;
 class Module;
 class PHINode;
 class ValueSymbolTable;
-class DPValue;
+class DbgVariableRecord;
 class DPMarker;
 
 /// LLVM Basic Block Representation
@@ -121,10 +121,10 @@ class BasicBlock final : public Value, // Basic blocks are data objects also
   DPMarker *getNextMarker(Instruction *I);
 
   /// Insert a DbgRecord into a block at the position given by \p I.
-  void insertDbgRecordAfter(DbgRecord *DPV, Instruction *I);
+  void insertDbgRecordAfter(DbgRecord *DR, Instruction *I);
 
   /// Insert a DbgRecord into a block at the position given by \p Here.
-  void insertDbgRecordBefore(DbgRecord *DPV, InstListType::iterator Here);
+  void insertDbgRecordBefore(DbgRecord *DR, InstListType::iterator Here);
 
   /// Eject any debug-info trailing at the end of a block. DbgRecords can
   /// transiently be located "off the end" of a block if the blocks terminator
@@ -774,6 +774,32 @@ BasicBlock::iterator skipDebugIntrinsics(BasicBlock::iterator It);
 inline void BasicBlock::validateInstrOrdering() const {}
 #endif
 
+// Specialize DenseMapInfo for iterators, so that ththey can be installed into
+// maps and sets. The iterator is made up of its node pointer, and the
+// debug-info "head" bit.
+template <> struct DenseMapInfo<BasicBlock::iterator> {
+  static inline BasicBlock::iterator getEmptyKey() {
+    return BasicBlock::iterator(nullptr);
+  }
+
+  static inline BasicBlock::iterator getTombstoneKey() {
+    BasicBlock::iterator It(nullptr);
+    It.setHeadBit(true);
+    return It;
+  }
+
+  static unsigned getHashValue(const BasicBlock::iterator &It) {
+    return DenseMapInfo<void *>::getHashValue(
+               reinterpret_cast<void *>(It.getNodePtr())) ^
+           It.getHeadBit();
+  }
+
+  static bool isEqual(const BasicBlock::iterator &LHS,
+                      const BasicBlock::iterator &RHS) {
+    return LHS == RHS && LHS.getHeadBit() == RHS.getHeadBit();
+  }
+};
+
 } // end namespace llvm
 
 #endif // LLVM_IR_BASICBLOCK_H
diff --git a/llvm/include/llvm/IR/ConstantFold.h b/llvm/include/llvm/IR/ConstantFold.h
index 77f5f0eb174a2b..9b3c8a0e5a6329 100644
--- a/llvm/include/llvm/IR/ConstantFold.h
+++ b/llvm/include/llvm/IR/ConstantFold.h
@@ -53,7 +53,7 @@ namespace llvm {
   Constant *ConstantFoldCompareInstruction(CmpInst::Predicate Predicate,
                                            Constant *C1, Constant *C2);
   Constant *ConstantFoldGetElementPtr(Type *Ty, Constant *C, bool InBounds,
-                                      std::optional<unsigned> InRangeIndex,
+                                      std::optional<ConstantRange> InRange,
                                       ArrayRef<Value *> Idxs);
 } // End llvm namespace
 
diff --git a/llvm/include/llvm/IR/ConstantFolder.h b/llvm/include/llvm/IR/ConstantFolder.h
index c2b30a65e32e25..3e74a563a58421 100644
--- a/llvm/include/llvm/IR/ConstantFolder.h
+++ b/llvm/include/llvm/IR/ConstantFolder.h
@@ -18,8 +18,8 @@
 
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/STLExtras.h"
-#include "llvm/IR/Constants.h"
 #include "llvm/IR/ConstantFold.h"
+#include "llvm/IR/Constants.h"
 #include "llvm/IR/IRBuilderFolder.h"
 #include "llvm/IR/Instruction.h"
 #include "llvm/IR/Operator.h"
@@ -89,7 +89,7 @@ class ConstantFolder final : public IRBuilderFolder {
   }
 
   Value *FoldUnOpFMF(Instruction::UnaryOps Opc, Value *V,
-                      FastMathFlags FMF) const override {
+                     FastMathFlags FMF) const override {
     if (Constant *C = dyn_cast<Constant>(V))
       return ConstantFoldUnaryInstruction(Opc, C);
     return nullptr;
@@ -183,6 +183,12 @@ class ConstantFolder final : public IRBuilderFolder {
     return nullptr;
   }
 
+  Value *FoldBinaryIntrinsic(Intrinsic::ID ID, Value *LHS, Value *RHS, Type *Ty,
+                             Instruction *FMFSource) const override {
+    // Use TargetFolder or InstSimplifyFolder instead.
+    return nullptr;
+  }
+
   //===--------------------------------------------------------------------===//
   // Cast/Conversion Operators
   //===--------------------------------------------------------------------===//
diff --git a/llvm/include/llvm/IR/Constants.h b/llvm/include/llvm/IR/Constants.h
index c0ac9a4aa6750c..e50cd1f1c73efa 100644
--- a/llvm/include/llvm/IR/Constants.h
+++ b/llvm/include/llvm/IR/Constants.h
@@ -26,6 +26,7 @@
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/IR/Constant.h"
+#include "llvm/IR/ConstantRange.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/OperandTraits.h"
@@ -1195,31 +1196,31 @@ class ConstantExpr : public Constant {
   /// Getelementptr form.  Value* is only accepted for convenience;
   /// all elements must be Constants.
   ///
-  /// \param InRangeIndex the inrange index if present or std::nullopt.
+  /// \param InRange the inrange range if present or std::nullopt.
   /// \param OnlyIfReducedTy see \a getWithOperands() docs.
   static Constant *
   getGetElementPtr(Type *Ty, Constant *C, ArrayRef<Constant *> IdxList,
                    bool InBounds = false,
-                   std::optional<unsigned> InRangeIndex = std::nullopt,
+                   std::optional<ConstantRange> InRange = std::nullopt,
                    Type *OnlyIfReducedTy = nullptr) {
     return getGetElementPtr(
         Ty, C, ArrayRef((Value *const *)IdxList.data(), IdxList.size()),
-        InBounds, InRangeIndex, OnlyIfReducedTy);
+        InBounds, InRange, OnlyIfReducedTy);
   }
   static Constant *
   getGetElementPtr(Type *Ty, Constant *C, Constant *Idx, bool InBounds = false,
-                   std::optional<unsigned> InRangeIndex = std::nullopt,
+                   std::optional<ConstantRange> InRange = std::nullopt,
                    Type *OnlyIfReducedTy = nullptr) {
     // This form of the function only exists to avoid ambiguous overload
     // warnings about whether to convert Idx to ArrayRef<Constant *> or
     // ArrayRef<Value *>.
-    return getGetElementPtr(Ty, C, cast<Value>(Idx), InBounds, InRangeIndex,
+    return getGetElementPtr(Ty, C, cast<Value>(Idx), InBounds, InRange,
                             OnlyIfReducedTy);
   }
   static Constant *
   getGetElementPtr(Type *Ty, Constant *C, ArrayRef<Value *> IdxList,
                    bool InBounds = false,
-                   std::optional<unsigned> InRangeIndex = std::nullopt,
+                   std::optional<ConstantRange> InRange = std::nullopt,
                    Type *OnlyIfReducedTy = nullptr);
 
   /// Create an "inbounds" getelementptr. See the documentation for the
@@ -1289,14 +1290,13 @@ class ConstantExpr : public Constant {
                             Type *SrcTy = nullptr) const;
 
   /// Returns an Instruction which implements the same operation as this
-  /// ConstantExpr. If \p InsertBefore is not null, the new instruction is
-  /// inserted before it, otherwise it is not inserted into any basic block.
+  /// ConstantExpr. It is not inserted into any basic block.
   ///
   /// A better approach to this could be to have a constructor for Instruction
   /// which would take a ConstantExpr parameter, but that would have spread
   /// implementation details of ConstantExpr outside of Constants.cpp, which
   /// would make it harder to remove ConstantExprs altogether.
-  Instruction *getAsInstruction(Instruction *InsertBefore = nullptr) const;
+  Instruction *getAsInstruction() const;
 
   /// Whether creating a constant expression for this binary operator is
   /// desirable.
diff --git a/llvm/include/llvm/IR/DIBuilder.h b/llvm/include/llvm/IR/DIBuilder.h
index 94af17af8160e9..002f2db6da5447 100644
--- a/llvm/include/llvm/IR/DIBuilder.h
+++ b/llvm/include/llvm/IR/DIBuilder.h
@@ -101,9 +101,10 @@ namespace llvm {
     DbgInstPtr insertLabel(DILabel *LabelInfo, const DILocation *DL,
                            BasicBlock *InsertBB, Instruction *InsertBefore);
 
-    /// Internal helper. Track metadata if untracked and insert \p DPV.
-    void insertDPValue(DPValue *DPV, BasicBlock *InsertBB,
-                       Instruction *InsertBefore, bool InsertAtHead = false);
+    /// Internal helper. Track metadata if untracked and insert \p DVR.
+    void insertDbgVariableRecord(DbgVariableRecord *DVR, BasicBlock *InsertBB,
+                                 Instruction *InsertBefore,
+                                 bool InsertAtHead = false);
 
     /// Internal helper with common code used by insertDbg{Value,Addr}Intrinsic.
     Instruction *insertDbgIntrinsic(llvm::Function *Intrinsic, llvm::Value *Val,
@@ -270,6 +271,13 @@ namespace llvm {
                       std::optional<unsigned> DWARFAddressSpace = std::nullopt,
                       StringRef Name = "", DINodeArray Annotations = nullptr);
 
+    /// Create a __ptrauth qualifier.
+    DIDerivedType *createPtrAuthQualifiedType(DIType *FromTy, unsigned Key,
+                                              bool IsAddressDiscriminated,
+                                              unsigned ExtraDiscriminator,
+                                              bool IsaPointer,
+                                              bool authenticatesNullValues);
+
     /// Create debugging information entry for a pointer to member.
     /// \param PointeeTy Type pointed to by this pointer.
     /// \param SizeInBits  Size.
diff --git a/llvm/include/llvm/IR/DebugInfo.h b/llvm/include/llvm/IR/DebugInfo.h
index 6673908d3ed355..53cede5409e260 100644
--- a/llvm/include/llvm/IR/DebugInfo.h
+++ b/llvm/include/llvm/IR/DebugInfo.h
@@ -34,23 +34,25 @@ namespace llvm {
 class DbgDeclareInst;
 class DbgValueInst;
 class DbgVariableIntrinsic;
-class DPValue;
+class DbgVariableRecord;
 class Instruction;
 class Module;
 
 /// Finds dbg.declare intrinsics declaring local variables as living in the
 /// memory that 'V' points to.
 TinyPtrVector<DbgDeclareInst *> findDbgDeclares(Value *V);
-/// As above, for DPVDeclares.
-TinyPtrVector<DPValue *> findDPVDeclares(Value *V);
+/// As above, for DVRDeclares.
+TinyPtrVector<DbgVariableRecord *> findDVRDeclares(Value *V);
 
 /// Finds the llvm.dbg.value intrinsics describing a value.
-void findDbgValues(SmallVectorImpl<DbgValueInst *> &DbgValues,
-                   Value *V, SmallVectorImpl<DPValue *> *DPValues = nullptr);
+void findDbgValues(
+    SmallVectorImpl<DbgValueInst *> &DbgValues, Value *V,
+    SmallVectorImpl<DbgVariableRecord *> *DbgVariableRecords = nullptr);
 
 /// Finds the debug info intrinsics describing a value.
-void findDbgUsers(SmallVectorImpl<DbgVariableIntrinsic *> &DbgInsts,
-                  Value *V, SmallVectorImpl<DPValue *> *DPValues = nullptr);
+void findDbgUsers(
+    SmallVectorImpl<DbgVariableIntrinsic *> &DbgInsts, Value *V,
+    SmallVectorImpl<DbgVariableRecord *> *DbgVariableRecords = nullptr);
 
 /// Find subprogram that is enclosing this scope.
 DISubprogram *getDISubprogram(const MDNode *Scope);
@@ -58,7 +60,7 @@ DISubprogram *getDISubprogram(const MDNode *Scope);
 /// Produce a DebugLoc to use for each dbg.declare that is promoted to a
 /// dbg.value.
 DebugLoc getDebugValueLoc(DbgVariableIntrinsic *DII);
-DebugLoc getDebugValueLoc(DPValue *DPV);
+DebugLoc getDebugValueLoc(DbgVariableRecord *DVR);
 
 /// Strip debug info in the module if it exists.
 ///
@@ -109,7 +111,8 @@ class DebugInfoFinder {
   void processVariable(const Module &M, const DILocalVariable *DVI);
   /// Process debug info location.
   void processLocation(const Module &M, const DILocation *Loc);
-  /// Process a DbgRecord (e.g, treat a DPValue like a DbgVariableIntrinsic).
+  /// Process a DbgRecord (e.g, treat a DbgVariableRecord like a
+  /// DbgVariableIntrinsic).
   void processDbgRecord(const Module &M, const DbgRecord &DR);
 
   /// Process subprogram.
@@ -193,10 +196,10 @@ inline AssignmentInstRange getAssignmentInsts(const DbgAssignIntrinsic *DAI) {
   return getAssignmentInsts(DAI->getAssignID());
 }
 
-inline AssignmentInstRange getAssignmentInsts(const DPValue *DPV) {
-  assert(DPV->isDbgAssign() &&
-         "Can't get assignment instructions for non-assign DPV!");
-  return getAssignmentInsts(DPV->getAssignID());
+inline AssignmentInstRange getAssignmentInsts(const DbgVariableRecord *DVR) {
+  assert(DVR->isDbgAssign() &&
+         "Can't get assignment instructions for non-assign DVR!");
+  return getAssignmentInsts(DVR->getAssignID());
 }
 
 //
@@ -231,9 +234,10 @@ inline AssignmentMarkerRange getAssignmentMarkers(const Instruction *Inst) {
     return make_range(Value::user_iterator(), Value::user_iterator());
 }
 
-inline SmallVector<DPValue *> getDPVAssignmentMarkers(const Instruction *Inst) {
+inline SmallVector<DbgVariableRecord *>
+getDVRAssignmentMarkers(const Instruction *Inst) {
   if (auto *ID = Inst->getMetadata(LLVMContext::MD_DIAssignID))
-    return cast<DIAssignID>(ID)->getAllDPValueUsers();
+    return cast<DIAssignID>(ID)->getAllDbgVariableRecordUsers();
   return {};
 }
 
@@ -261,7 +265,7 @@ bool calculateFragmentIntersect(
     std::optional<DIExpression::FragmentInfo> &Result);
 bool calculateFragmentIntersect(
     const DataLayout &DL, const Value *Dest, uint64_t SliceOffsetInBits,
-    uint64_t SliceSizeInBits, const DPValue *DPVAssign,
+    uint64_t SliceSizeInBits, const DbgVariableRecord *DVRAssign,
     std::optional<DIExpression::FragmentInfo> &Result);
 
 /// Helper struct for trackAssignments, below. We don't use the similar
@@ -276,8 +280,8 @@ struct VarRecord {
 
   VarRecord(DbgVariableIntrinsic *DVI)
       : Var(DVI->getVariable()), DL(getDebugValueLoc(DVI)) {}
-  VarRecord(DPValue *DPV)
-      : Var(DPV->getVariable()), DL(getDebugValueLoc(DPV)) {}
+  VarRecord(DbgVariableRecord *DVR)
+      : Var(DVR->getVariable()), DL(getDebugValueLoc(DVR)) {}
   VarRecord(DILocalVariable *Var, DILocation *DL) : Var(Var), DL(DL) {}
   friend bool operator<(const VarRecord &LHS, const VarRecord &RHS) {
     return std::tie(LHS.Var, LHS.DL) < std::tie(RHS.Var, RHS.DL);
diff --git a/llvm/include/llvm/IR/DebugInfoMetadata.h b/llvm/include/llvm/IR/DebugInfoMetadata.h
index 156f6eb49253de..2805f6c4780578 100644
--- a/llvm/include/llvm/IR/DebugInfoMetadata.h
+++ b/llvm/include/llvm/IR/DebugInfoMetadata.h
@@ -65,7 +65,7 @@ enum Tag : uint16_t;
 }
 
 class DbgVariableIntrinsic;
-class DPValue;
+class DbgVariableRecord;
 
 extern cl::opt<bool> EnableFSDiscriminator;
 
@@ -323,8 +323,8 @@ class DIAssignID : public MDNode {
   // This node has no operands to replace.
   void replaceOperandWith(unsigned I, Metadata *New) = delete;
 
-  SmallVector<DPValue *> getAllDPValueUsers() {
-    return Context.getReplaceableUses()->getAllDPValueUsers();
+  SmallVector<DbgVariableRecord *> getAllDbgVariableRecordUsers() {
+    return Context.getReplaceableUses()->getAllDbgVariableRecordUsers();
   }
 
   static DIAssignID *getDistinct(LLVMContext &Context) {
@@ -745,7 +745,7 @@ class DIType : public DIScope {
 
   unsigned getLine() const { return Line; }
   uint64_t getSizeInBits() const { return SizeInBits; }
-  uint32_t getAlignInBits() const { return SubclassData32; }
+  uint32_t getAlignInBits() const;
   uint32_t getAlignInBytes() const { return getAlignInBits() / CHAR_BIT; }
   uint64_t getOffsetInBits() const { return OffsetInBits; }
   DIFlags getFlags() const { return Flags; }
@@ -972,6 +972,35 @@ class DIStringType : public DIType {
 ///
 /// TODO: Split out members (inheritance, fields, methods, etc.).
 class DIDerivedType : public DIType {
+public:
+  /// Pointer authentication (__ptrauth) metadata.
+  struct PtrAuthData {
+    // RawData layout:
+    // - Bits 0..3:  Key
+    // - Bit  4:     IsAddressDiscriminated
+    // - Bits 5..20: ExtraDiscriminator
+    // - Bit  21:    IsaPointer
+    // - Bit  22:    AuthenticatesNullValues
+    unsigned RawData;
+
+    PtrAuthData(unsigned FromRawData) : RawData(FromRawData) {}
+    PtrAuthData(unsigned Key, bool IsDiscr, unsigned Discriminator,
+                bool IsaPointer, bool AuthenticatesNullValues) {
+      assert(Key < 16);
+      assert(Discriminator <= 0xffff);
+      RawData = (Key << 0) | (IsDiscr ? (1 << 4) : 0) | (Discriminator << 5) |
+                (IsaPointer ? (1 << 21) : 0) |
+                (AuthenticatesNullValues ? (1 << 22) : 0);
+    }
+
+    unsigned key() { return (RawData >> 0) & 0b1111; }
+    bool isAddressDiscriminated() { return (RawData >> 4) & 1; }
+    unsigned extraDiscriminator() { return (RawData >> 5) & 0xffff; }
+    bool isaPointer() { return (RawData >> 21) & 1; }
+    bool authenticatesNullValues() { return (RawData >> 22) & 1; }
+  };
+
+private:
   friend class LLVMContextImpl;
   friend class MDNode;
 
@@ -982,59 +1011,70 @@ class DIDerivedType : public DIType {
   DIDerivedType(LLVMContext &C, StorageType Storage, unsigned Tag,
                 unsigned Line, uint64_t SizeInBits, uint32_t AlignInBits,
                 uint64_t OffsetInBits,
-                std::optional<unsigned> DWARFAddressSpace, DIFlags Flags,
+                std::optional<unsigned> DWARFAddressSpace,
+                std::optional<PtrAuthData> PtrAuthData, DIFlags Flags,
                 ArrayRef<Metadata *> Ops)
       : DIType(C, DIDerivedTypeKind, Storage, Tag, Line, SizeInBits,
                AlignInBits, OffsetInBits, Flags, Ops),
-        DWARFAddressSpace(DWARFAddressSpace) {}
+        DWARFAddressSpace(DWARFAddressSpace) {
+    if (PtrAuthData)
+      SubclassData32 = PtrAuthData->RawData;
+  }
   ~DIDerivedType() = default;
   static DIDerivedType *
   getImpl(LLVMContext &Context, unsigned Tag, StringRef Name, DIFile *File,
           unsigned Line, DIScope *Scope, DIType *BaseType, uint64_t SizeInBits,
           uint32_t AlignInBits, uint64_t OffsetInBits,
-          std::optional<unsigned> DWARFAddressSpace, DIFlags Flags,
+          std::optional<unsigned> DWARFAddressSpace,
+          std::optional<PtrAuthData> PtrAuthData, DIFlags Flags,
           Metadata *ExtraData, DINodeArray Annotations, StorageType Storage,
           bool ShouldCreate = true) {
     return getImpl(Context, Tag, getCanonicalMDString(Context, Name), File,
                    Line, Scope, BaseType, SizeInBits, AlignInBits, OffsetInBits,
-                   DWARFAddressSpace, Flags, ExtraData, Annotations.get(),
-                   Storage, ShouldCreate);
+                   DWARFAddressSpace, PtrAuthData, Flags, ExtraData,
+                   Annotations.get(), Storage, ShouldCreate);
   }
   static DIDerivedType *
   getImpl(LLVMContext &Context, unsigned Tag, MDString *Name, Metadata *File,
           unsigned Line, Metadata *Scope, Metadata *BaseType,
           uint64_t SizeInBits, uint32_t AlignInBits, uint64_t OffsetInBits,
-          std::optional<unsigned> DWARFAddressSpace, DIFlags Flags,
+          std::optional<unsigned> DWARFAddressSpace,
+          std::optional<PtrAuthData> PtrAuthData, DIFlags Flags,
           Metadata *ExtraData, Metadata *Annotations, StorageType Storage,
           bool ShouldCreate = true);
 
   TempDIDerivedType cloneImpl() const {
-    return getTemporary(
-        getContext(), getTag(), getName(), getFile(), getLine(), getScope(),
-        getBaseType(), getSizeInBits(), getAlignInBits(), getOffsetInBits(),
-        getDWARFAddressSpace(), getFlags(), getExtraData(), getAnnotations());
+    return getTemporary(getContext(), getTag(), getName(), getFile(), getLine(),
+                        getScope(), getBaseType(), getSizeInBits(),
+                        getAlignInBits(), getOffsetInBits(),
+                        getDWARFAddressSpace(), getPtrAuthData(), getFlags(),
+                        getExtraData(), getAnnotations());
   }
 
 public:
-  DEFINE_MDNODE_GET(
-      DIDerivedType,
-      (unsigned Tag, MDString *Name, Metadata *File, unsigned Line,
-       Metadata *Scope, Metadata *BaseType, uint64_t SizeInBits,
-       uint32_t AlignInBits, uint64_t OffsetInBits,
-       std::optional<unsigned> DWARFAddressSpace, DIFlags Flags,
-       Metadata *ExtraData = nullptr, Metadata *Annotations = nullptr),
-      (Tag, Name, File, Line, Scope, BaseType, SizeInBits, AlignInBits,
-       OffsetInBits, DWARFAddressSpace, Flags, ExtraData, Annotations))
+  DEFINE_MDNODE_GET(DIDerivedType,
+                    (unsigned Tag, MDString *Name, Metadata *File,
+                     unsigned Line, Metadata *Scope, Metadata *BaseType,
+                     uint64_t SizeInBits, uint32_t AlignInBits,
+                     uint64_t OffsetInBits,
+                     std::optional<unsigned> DWARFAddressSpace,
+                     std::optional<PtrAuthData> PtrAuthData, DIFlags Flags,
+                     Metadata *ExtraData = nullptr,
+                     Metadata *Annotations = nullptr),
+                    (Tag, Name, File, Line, Scope, BaseType, SizeInBits,
+                     AlignInBits, OffsetInBits, DWARFAddressSpace, PtrAuthData,
+                     Flags, ExtraData, Annotations))
   DEFINE_MDNODE_GET(DIDerivedType,
                     (unsigned Tag, StringRef Name, DIFile *File, unsigned Line,
                      DIScope *Scope, DIType *BaseType, uint64_t SizeInBits,
                      uint32_t AlignInBits, uint64_t OffsetInBits,
-                     std::optional<unsigned> DWARFAddressSpace, DIFlags Flags,
+                     std::optional<unsigned> DWARFAddressSpace,
+                     std::optional<PtrAuthData> PtrAuthData, DIFlags Flags,
                      Metadata *ExtraData = nullptr,
                      DINodeArray Annotations = nullptr),
                     (Tag, Name, File, Line, Scope, BaseType, SizeInBits,
-                     AlignInBits, OffsetInBits, DWARFAddressSpace, Flags,
-                     ExtraData, Annotations))
+                     AlignInBits, OffsetInBits, DWARFAddressSpace, PtrAuthData,
+                     Flags, ExtraData, Annotations))
 
   TempDIDerivedType clone() const { return cloneImpl(); }
 
@@ -1048,6 +1088,8 @@ class DIDerivedType : public DIType {
     return DWARFAddressSpace;
   }
 
+  std::optional<PtrAuthData> getPtrAuthData() const;
+
   /// Get extra data associated with this derived type.
   ///
   /// Class type for pointer-to-members, objective-c property node for ivars,
@@ -1087,6 +1129,16 @@ class DIDerivedType : public DIType {
   }
 };
 
+inline bool operator==(DIDerivedType::PtrAuthData Lhs,
+                       DIDerivedType::PtrAuthData Rhs) {
+  return Lhs.RawData == Rhs.RawData;
+}
+
+inline bool operator!=(DIDerivedType::PtrAuthData Lhs,
+                       DIDerivedType::PtrAuthData Rhs) {
+  return !(Lhs == Rhs);
+}
+
 /// Composite types.
 ///
 /// TODO: Detach from DerivedTypeBase (split out MDEnumType?).
@@ -3788,8 +3840,8 @@ class DIArgList : public Metadata, ReplaceableMetadataImpl {
     return MD->getMetadataID() == DIArgListKind;
   }
 
-  SmallVector<DPValue *> getAllDPValueUsers() {
-    return ReplaceableMetadataImpl::getAllDPValueUsers();
+  SmallVector<DbgVariableRecord *> getAllDbgVariableRecordUsers() {
+    return ReplaceableMetadataImpl::getAllDbgVariableRecordUsers();
   }
 
   void handleChangedOperand(void *Ref, Metadata *New);
@@ -3819,7 +3871,7 @@ class DebugVariable {
 
 public:
   DebugVariable(const DbgVariableIntrinsic *DII);
-  DebugVariable(const DPValue *DPV);
+  DebugVariable(const DbgVariableRecord *DVR);
 
   DebugVariable(const DILocalVariable *Var,
                 std::optional<FragmentInfo> FragmentInfo,
diff --git a/llvm/include/llvm/IR/DebugProgramInstruction.h b/llvm/include/llvm/IR/DebugProgramInstruction.h
index 971c22fa9aa22a..db41e9acc7be42 100644
--- a/llvm/include/llvm/IR/DebugProgramInstruction.h
+++ b/llvm/include/llvm/IR/DebugProgramInstruction.h
@@ -27,12 +27,13 @@
 //    %bar = void call @ext(%foo)
 //       ; bar->DbgMarker = {
 //       ;   StoredDbgRecords = {
-//       ;     DPValue(metadata i32 %foo, ...)
+//       ;     DbgVariableRecord(metadata i32 %foo, ...)
 //       ;   }
 //       ; }
 //       ;; There is a debug-info record in front of the %bar instruction,
 //       ;; thus it points at a DPMarker object. That DPMarker contains a
-//       ;; DPValue in it's ilist, storing the equivalent information to the
+//       ;; DbgVariableRecord in it's ilist, storing the equivalent information
+//       to the
 //       ;; dbg.value above: the Value, DILocalVariable, etc.
 //
 // This structure separates the two concerns of the position of the debug-info
@@ -66,7 +67,7 @@ class DbgInfoIntrinsic;
 class DbgLabelInst;
 class DIAssignID;
 class DPMarker;
-class DPValue;
+class DbgVariableRecord;
 class raw_ostream;
 
 /// A typed tracking MDNode reference that does not require a definition for its
@@ -219,7 +220,8 @@ inline raw_ostream &operator<<(raw_ostream &OS, const DbgRecord &R) {
 
 /// Records a position in IR for a source label (DILabel). Corresponds to the
 /// llvm.dbg.label intrinsic.
-/// FIXME: Rename DbgLabelRecord when DPValue is renamed to DbgVariableRecord.
+/// FIXME: Rename DbgLabelRecord when DbgVariableRecord is renamed to
+/// DbgVariableRecord.
 class DPLabel : public DbgRecord {
   DbgRecordParamRef<DILabel> Label;
 
@@ -257,7 +259,7 @@ class DPLabel : public DbgRecord {
 ///
 /// This class inherits from DebugValueUser to allow LLVM's metadata facilities
 /// to update our references to metadata beneath our feet.
-class DPValue : public DbgRecord, protected DebugValueUser {
+class DbgVariableRecord : public DbgRecord, protected DebugValueUser {
   friend class DebugValueUser;
 
 public:
@@ -269,9 +271,10 @@ class DPValue : public DbgRecord, protected DebugValueUser {
     End, ///< Marks the end of the concrete types.
     Any, ///< To indicate all LocationTypes in searches.
   };
-  /// Classification of the debug-info record that this DPValue represents.
-  /// Essentially, "is this a dbg.value or dbg.declare?". dbg.declares are not
-  /// currently supported, but it would be trivial to do so.
+  /// Classification of the debug-info record that this DbgVariableRecord
+  /// represents. Essentially, "is this a dbg.value or dbg.declare?".
+  /// dbg.declares are not currently supported, but it would be trivial to do
+  /// so.
   /// FIXME: We could use spare padding bits from DbgRecord for this.
   LocationType Type;
 
@@ -284,63 +287,69 @@ class DPValue : public DbgRecord, protected DebugValueUser {
   DbgRecordParamRef<DIExpression> AddressExpression;
 
 public:
-  /// Create a new DPValue representing the intrinsic \p DVI, for example the
-  /// assignment represented by a dbg.value.
-  DPValue(const DbgVariableIntrinsic *DVI);
-  DPValue(const DPValue &DPV);
-  /// Directly construct a new DPValue representing a dbg.value intrinsic
-  /// assigning \p Location to the DV / Expr / DI variable.
-  DPValue(Metadata *Location, DILocalVariable *DV, DIExpression *Expr,
-          const DILocation *DI, LocationType Type = LocationType::Value);
-  DPValue(Metadata *Value, DILocalVariable *Variable, DIExpression *Expression,
-          DIAssignID *AssignID, Metadata *Address,
-          DIExpression *AddressExpression, const DILocation *DI);
+  /// Create a new DbgVariableRecord representing the intrinsic \p DVI, for
+  /// example the assignment represented by a dbg.value.
+  DbgVariableRecord(const DbgVariableIntrinsic *DVI);
+  DbgVariableRecord(const DbgVariableRecord &DVR);
+  /// Directly construct a new DbgVariableRecord representing a dbg.value
+  /// intrinsic assigning \p Location to the DV / Expr / DI variable.
+  DbgVariableRecord(Metadata *Location, DILocalVariable *DV, DIExpression *Expr,
+                    const DILocation *DI,
+                    LocationType Type = LocationType::Value);
+  DbgVariableRecord(Metadata *Value, DILocalVariable *Variable,
+                    DIExpression *Expression, DIAssignID *AssignID,
+                    Metadata *Address, DIExpression *AddressExpression,
+                    const DILocation *DI);
 
 private:
   /// Private constructor for creating new instances during parsing only. Only
-  /// called through `createUnresolvedDPValue` below, which makes clear that
-  /// this is used for parsing only, and will later return a subclass depending
-  /// on which Type is passed.
-  DPValue(LocationType Type, Metadata *Val, MDNode *Variable,
-          MDNode *Expression, MDNode *AssignID, Metadata *Address,
-          MDNode *AddressExpression, MDNode *DI);
+  /// called through `createUnresolvedDbgVariableRecord` below, which makes
+  /// clear that this is used for parsing only, and will later return a subclass
+  /// depending on which Type is passed.
+  DbgVariableRecord(LocationType Type, Metadata *Val, MDNode *Variable,
+                    MDNode *Expression, MDNode *AssignID, Metadata *Address,
+                    MDNode *AddressExpression, MDNode *DI);
 
 public:
-  /// Used to create DPValues during parsing, where some metadata references may
-  /// still be unresolved. Although for some fields a generic `Metadata*`
-  /// argument is accepted for forward type-references, the verifier and
-  /// accessors will reject incorrect types later on. The function is used for
-  /// all types of DPValues for simplicity while parsing, but asserts if any
-  /// necessary fields are empty or unused fields are not empty, i.e. if the
-  /// #dbg_assign fields are used for a non-dbg-assign type.
-  static DPValue *createUnresolvedDPValue(LocationType Type, Metadata *Val,
-                                          MDNode *Variable, MDNode *Expression,
-                                          MDNode *AssignID, Metadata *Address,
-                                          MDNode *AddressExpression,
-                                          MDNode *DI);
-
-  static DPValue *createDPVAssign(Value *Val, DILocalVariable *Variable,
-                                  DIExpression *Expression,
-                                  DIAssignID *AssignID, Value *Address,
-                                  DIExpression *AddressExpression,
-                                  const DILocation *DI);
-  static DPValue *createLinkedDPVAssign(Instruction *LinkedInstr, Value *Val,
-                                        DILocalVariable *Variable,
-                                        DIExpression *Expression,
-                                        Value *Address,
-                                        DIExpression *AddressExpression,
-                                        const DILocation *DI);
-
-  static DPValue *createDPValue(Value *Location, DILocalVariable *DV,
-                                DIExpression *Expr, const DILocation *DI);
-  static DPValue *createDPValue(Value *Location, DILocalVariable *DV,
-                                DIExpression *Expr, const DILocation *DI,
-                                DPValue &InsertBefore);
-  static DPValue *createDPVDeclare(Value *Address, DILocalVariable *DV,
-                                   DIExpression *Expr, const DILocation *DI);
-  static DPValue *createDPVDeclare(Value *Address, DILocalVariable *DV,
-                                   DIExpression *Expr, const DILocation *DI,
-                                   DPValue &InsertBefore);
+  /// Used to create DbgVariableRecords during parsing, where some metadata
+  /// references may still be unresolved. Although for some fields a generic
+  /// `Metadata*` argument is accepted for forward type-references, the verifier
+  /// and accessors will reject incorrect types later on. The function is used
+  /// for all types of DbgVariableRecords for simplicity while parsing, but
+  /// asserts if any necessary fields are empty or unused fields are not empty,
+  /// i.e. if the #dbg_assign fields are used for a non-dbg-assign type.
+  static DbgVariableRecord *
+  createUnresolvedDbgVariableRecord(LocationType Type, Metadata *Val,
+                                    MDNode *Variable, MDNode *Expression,
+                                    MDNode *AssignID, Metadata *Address,
+                                    MDNode *AddressExpression, MDNode *DI);
+
+  static DbgVariableRecord *
+  createDVRAssign(Value *Val, DILocalVariable *Variable,
+                  DIExpression *Expression, DIAssignID *AssignID,
+                  Value *Address, DIExpression *AddressExpression,
+                  const DILocation *DI);
+  static DbgVariableRecord *
+  createLinkedDVRAssign(Instruction *LinkedInstr, Value *Val,
+                        DILocalVariable *Variable, DIExpression *Expression,
+                        Value *Address, DIExpression *AddressExpression,
+                        const DILocation *DI);
+
+  static DbgVariableRecord *createDbgVariableRecord(Value *Location,
+                                                    DILocalVariable *DV,
+                                                    DIExpression *Expr,
+                                                    const DILocation *DI);
+  static DbgVariableRecord *
+  createDbgVariableRecord(Value *Location, DILocalVariable *DV,
+                          DIExpression *Expr, const DILocation *DI,
+                          DbgVariableRecord &InsertBefore);
+  static DbgVariableRecord *createDVRDeclare(Value *Address,
+                                             DILocalVariable *DV,
+                                             DIExpression *Expr,
+                                             const DILocation *DI);
+  static DbgVariableRecord *
+  createDVRDeclare(Value *Address, DILocalVariable *DV, DIExpression *Expr,
+                   const DILocation *DI, DbgVariableRecord &InsertBefore);
 
   /// Iterator for ValueAsMetadata that internally uses direct pointer iteration
   /// over either a ValueAsMetadata* or a ValueAsMetadata**, dereferencing to the
@@ -412,7 +421,8 @@ class DPValue : public DbgRecord, protected DebugValueUser {
   unsigned getNumVariableLocationOps() const;
 
   bool hasArgList() const { return isa<DIArgList>(getRawLocation()); }
-  /// Returns true if this DPValue has no empty MDNodes in its location list.
+  /// Returns true if this DbgVariableRecord has no empty MDNodes in its
+  /// location list.
   bool hasValidLocation() const { return getVariableLocationOp(0) != nullptr; }
 
   /// Does this describe the address of a local variable. True for dbg.addr
@@ -445,10 +455,10 @@ class DPValue : public DbgRecord, protected DebugValueUser {
   /// replaceVariableLocationOp and addVariableLocationOps should be used where
   /// possible to avoid creating invalid state.
   void setRawLocation(Metadata *NewLocation) {
-    assert(
-        (isa<ValueAsMetadata>(NewLocation) || isa<DIArgList>(NewLocation) ||
-         isa<MDNode>(NewLocation)) &&
-        "Location for a DPValue must be either ValueAsMetadata or DIArgList");
+    assert((isa<ValueAsMetadata>(NewLocation) || isa<DIArgList>(NewLocation) ||
+            isa<MDNode>(NewLocation)) &&
+           "Location for a DbgVariableRecord must be either ValueAsMetadata or "
+           "DIArgList");
     resetDebugValue(0, NewLocation);
   }
 
@@ -456,12 +466,12 @@ class DPValue : public DbgRecord, protected DebugValueUser {
   /// is described.
   std::optional<uint64_t> getFragmentSizeInBits() const;
 
-  bool isEquivalentTo(const DPValue &Other) const {
+  bool isEquivalentTo(const DbgVariableRecord &Other) const {
     return DbgLoc == Other.DbgLoc && isIdenticalToWhenDefined(Other);
   }
   // Matches the definition of the Instruction version, equivalent to above but
   // without checking DbgLoc.
-  bool isIdenticalToWhenDefined(const DPValue &Other) const {
+  bool isIdenticalToWhenDefined(const DbgVariableRecord &Other) const {
     return std::tie(Type, DebugValues, Variable, Expression,
                     AddressExpression) ==
            std::tie(Other.Type, Other.DebugValues, Other.Variable,
@@ -497,10 +507,10 @@ class DPValue : public DbgRecord, protected DebugValueUser {
 
   /// @}
 
-  DPValue *clone() const;
-  /// Convert this DPValue back into a dbg.value intrinsic.
+  DbgVariableRecord *clone() const;
+  /// Convert this DbgVariableRecord back into a dbg.value intrinsic.
   /// \p InsertBefore Optional position to insert this intrinsic.
-  /// \returns A new dbg.value intrinsic representiung this DPValue.
+  /// \returns A new dbg.value intrinsic representiung this DbgVariableRecord.
   DbgVariableIntrinsic *createDebugIntrinsic(Module *M,
                                              Instruction *InsertBefore) const;
 
@@ -517,12 +527,13 @@ class DPValue : public DbgRecord, protected DebugValueUser {
   }
 };
 
-/// Filter the DbgRecord range to DPValue types only and downcast.
+/// Filter the DbgRecord range to DbgVariableRecord types only and downcast.
 static inline auto
 filterDbgVars(iterator_range<simple_ilist<DbgRecord>::iterator> R) {
   return map_range(
-      make_filter_range(R, [](DbgRecord &E) { return isa<DPValue>(E); }),
-      [](DbgRecord &E) { return std::ref(cast<DPValue>(E)); });
+      make_filter_range(R,
+                        [](DbgRecord &E) { return isa<DbgVariableRecord>(E); }),
+      [](DbgRecord &E) { return std::ref(cast<DbgVariableRecord>(E)); });
 }
 
 /// Per-instruction record of debug-info. If an Instruction is the position of
@@ -645,6 +656,8 @@ getDbgRecordRange(DPMarker *DbgMarker) {
   return DbgMarker->getDbgRecordRange();
 }
 
+DEFINE_ISA_CONVERSION_FUNCTIONS(DbgRecord, LLVMDbgRecordRef)
+
 } // namespace llvm
 
 #endif // LLVM_IR_DEBUGPROGRAMINSTRUCTION_H
diff --git a/llvm/include/llvm/IR/IRBuilder.h b/llvm/include/llvm/IR/IRBuilder.h
index f2922311097e9b..c07ffea7115115 100644
--- a/llvm/include/llvm/IR/IRBuilder.h
+++ b/llvm/include/llvm/IR/IRBuilder.h
@@ -962,9 +962,9 @@ class IRBuilderBase {
 
   /// Create a call to intrinsic \p ID with 2 operands which is mangled on the
   /// first type.
-  CallInst *CreateBinaryIntrinsic(Intrinsic::ID ID, Value *LHS, Value *RHS,
-                                  Instruction *FMFSource = nullptr,
-                                  const Twine &Name = "");
+  Value *CreateBinaryIntrinsic(Intrinsic::ID ID, Value *LHS, Value *RHS,
+                               Instruction *FMFSource = nullptr,
+                               const Twine &Name = "");
 
   /// Create a call to intrinsic \p ID with \p Args, mangled using \p Types. If
   /// \p FMFSource is provided, copy fast-math-flags from that instruction to
@@ -983,7 +983,7 @@ class IRBuilderBase {
                             const Twine &Name = "");
 
   /// Create call to the minnum intrinsic.
-  CallInst *CreateMinNum(Value *LHS, Value *RHS, const Twine &Name = "") {
+  Value *CreateMinNum(Value *LHS, Value *RHS, const Twine &Name = "") {
     if (IsFPConstrained) {
       return CreateConstrainedFPUnroundedBinOp(
           Intrinsic::experimental_constrained_minnum, LHS, RHS, nullptr, Name);
@@ -993,7 +993,7 @@ class IRBuilderBase {
   }
 
   /// Create call to the maxnum intrinsic.
-  CallInst *CreateMaxNum(Value *LHS, Value *RHS, const Twine &Name = "") {
+  Value *CreateMaxNum(Value *LHS, Value *RHS, const Twine &Name = "") {
     if (IsFPConstrained) {
       return CreateConstrainedFPUnroundedBinOp(
           Intrinsic::experimental_constrained_maxnum, LHS, RHS, nullptr, Name);
@@ -1003,19 +1003,19 @@ class IRBuilderBase {
   }
 
   /// Create call to the minimum intrinsic.
-  CallInst *CreateMinimum(Value *LHS, Value *RHS, const Twine &Name = "") {
+  Value *CreateMinimum(Value *LHS, Value *RHS, const Twine &Name = "") {
     return CreateBinaryIntrinsic(Intrinsic::minimum, LHS, RHS, nullptr, Name);
   }
 
   /// Create call to the maximum intrinsic.
-  CallInst *CreateMaximum(Value *LHS, Value *RHS, const Twine &Name = "") {
+  Value *CreateMaximum(Value *LHS, Value *RHS, const Twine &Name = "") {
     return CreateBinaryIntrinsic(Intrinsic::maximum, LHS, RHS, nullptr, Name);
   }
 
   /// Create call to the copysign intrinsic.
-  CallInst *CreateCopySign(Value *LHS, Value *RHS,
-                           Instruction *FMFSource = nullptr,
-                           const Twine &Name = "") {
+  Value *CreateCopySign(Value *LHS, Value *RHS,
+                        Instruction *FMFSource = nullptr,
+                        const Twine &Name = "") {
     return CreateBinaryIntrinsic(Intrinsic::copysign, LHS, RHS, FMFSource,
                                  Name);
   }
diff --git a/llvm/include/llvm/IR/IRBuilderFolder.h b/llvm/include/llvm/IR/IRBuilderFolder.h
index bd2324dfc5f1ba..3020f2684ee457 100644
--- a/llvm/include/llvm/IR/IRBuilderFolder.h
+++ b/llvm/include/llvm/IR/IRBuilderFolder.h
@@ -73,6 +73,10 @@ class IRBuilderFolder {
   virtual Value *FoldCast(Instruction::CastOps Op, Value *V,
                           Type *DestTy) const = 0;
 
+  virtual Value *
+  FoldBinaryIntrinsic(Intrinsic::ID ID, Value *LHS, Value *RHS, Type *Ty,
+                      Instruction *FMFSource = nullptr) const = 0;
+
   //===--------------------------------------------------------------------===//
   // Cast/Conversion Operators
   //===--------------------------------------------------------------------===//
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index 144298fd7c0162..091f9b38107989 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -1528,6 +1528,12 @@ def int_umax : DefaultAttrsIntrinsic<
 def int_umin : DefaultAttrsIntrinsic<
     [llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>],
     [IntrNoMem, IntrSpeculatable, IntrWillReturn]>;
+def int_scmp : DefaultAttrsIntrinsic<
+    [llvm_anyint_ty], [llvm_anyint_ty, LLVMMatchType<1>],
+    [IntrNoMem, IntrSpeculatable, IntrWillReturn]>;
+def int_ucmp : DefaultAttrsIntrinsic<
+    [llvm_anyint_ty], [llvm_anyint_ty, LLVMMatchType<1>],
+    [IntrNoMem, IntrSpeculatable, IntrWillReturn]>;
 
 //===------------------------- Memory Use Markers -------------------------===//
 //
diff --git a/llvm/include/llvm/IR/IntrinsicsDirectX.td b/llvm/include/llvm/IR/IntrinsicsDirectX.td
index 7229292e377a83..1164b241ba7b0d 100644
--- a/llvm/include/llvm/IR/IntrinsicsDirectX.td
+++ b/llvm/include/llvm/IR/IntrinsicsDirectX.td
@@ -21,20 +21,33 @@ def int_dx_create_handle : ClangBuiltin<"__builtin_hlsl_create_handle">,
     Intrinsic<[ llvm_ptr_ty ], [llvm_i8_ty], [IntrWillReturn]>;
 
 def int_dx_any  : DefaultAttrsIntrinsic<[llvm_i1_ty], [llvm_any_ty]>;
+def int_dx_clamp : DefaultAttrsIntrinsic<[llvm_any_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>]>;
+def int_dx_uclamp : DefaultAttrsIntrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>]>; 
 
 def int_dx_dot : 
     Intrinsic<[LLVMVectorElementType<0>], 
-    [llvm_anyvector_ty, LLVMScalarOrSameVectorWidth<0, LLVMVectorElementType<0>>],
+    [llvm_anyfloat_ty, LLVMScalarOrSameVectorWidth<0, LLVMVectorElementType<0>>],
+    [IntrNoMem, IntrWillReturn, Commutative] >;
+def int_dx_sdot : 
+    Intrinsic<[LLVMVectorElementType<0>], 
+    [llvm_anyint_ty, LLVMScalarOrSameVectorWidth<0, LLVMVectorElementType<0>>],
+    [IntrNoMem, IntrWillReturn, Commutative] >;
+def int_dx_udot : 
+    Intrinsic<[LLVMVectorElementType<0>], 
+    [llvm_anyint_ty, LLVMScalarOrSameVectorWidth<0, LLVMVectorElementType<0>>],
     [IntrNoMem, IntrWillReturn, Commutative] >;
 
 def int_dx_frac  : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>;
 
-def int_dx_lerp :
-    Intrinsic<[LLVMScalarOrSameVectorWidth<0, LLVMVectorElementType<0>>],
-    [llvm_anyvector_ty, LLVMScalarOrSameVectorWidth<0, LLVMVectorElementType<0>>,LLVMScalarOrSameVectorWidth<0, LLVMVectorElementType<0>>],
+def int_dx_isinf :
+    DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>],
+    [llvm_anyfloat_ty]>;
+
+def int_dx_lerp : Intrinsic<[LLVMMatchType<0>], [llvm_anyfloat_ty, LLVMMatchType<0>,LLVMMatchType<0>], 
     [IntrNoMem, IntrWillReturn] >;
 
 def int_dx_imad : DefaultAttrsIntrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>]>;
 def int_dx_umad : DefaultAttrsIntrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>]>;
 def int_dx_rcp  : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>;
+def int_dx_rsqrt  : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>;
 }
diff --git a/llvm/include/llvm/IR/IntrinsicsSPIRV.td b/llvm/include/llvm/IR/IntrinsicsSPIRV.td
index 766b3b542a9e06..0eb09b1699aff4 100644
--- a/llvm/include/llvm/IR/IntrinsicsSPIRV.td
+++ b/llvm/include/llvm/IR/IntrinsicsSPIRV.td
@@ -40,6 +40,18 @@ let TargetPrefix = "spv" in {
   def int_spv_assume : Intrinsic<[], [llvm_i1_ty]>;
   def int_spv_expect : Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>]>;
 
+  // Memory Use Markers
+  def int_spv_lifetime_start : Intrinsic<[],
+                                [llvm_i64_ty, llvm_anyptr_ty],
+                                [IntrArgMemOnly, IntrWillReturn,
+                                NoCapture<ArgIndex<1>>,
+                                ImmArg<ArgIndex<0>>]>;
+  def int_spv_lifetime_end   : Intrinsic<[],
+                                [llvm_i64_ty, llvm_anyptr_ty],
+                                [IntrArgMemOnly, IntrWillReturn,
+                                NoCapture<ArgIndex<1>>,
+                                ImmArg<ArgIndex<0>>]>;
+
   // The following intrinsic(s) are mirrored from IntrinsicsDirectX.td for HLSL support.
   def int_spv_thread_id : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem, IntrWillReturn]>;
   def int_spv_create_handle : ClangBuiltin<"__builtin_hlsl_create_handle">,
diff --git a/llvm/include/llvm/IR/Metadata.h b/llvm/include/llvm/IR/Metadata.h
index da6744fdd09166..22da54a1f03c59 100644
--- a/llvm/include/llvm/IR/Metadata.h
+++ b/llvm/include/llvm/IR/Metadata.h
@@ -43,7 +43,7 @@ namespace llvm {
 class Module;
 class ModuleSlotTracker;
 class raw_ostream;
-class DPValue;
+class DbgVariableRecord;
 template <typename T> class StringMapEntry;
 template <typename ValueTy> class StringMapEntryStorage;
 class Type;
@@ -205,23 +205,23 @@ class MetadataAsValue : public Value {
 /// Base class for tracking ValueAsMetadata/DIArgLists with user lookups and
 /// Owner callbacks outside of ValueAsMetadata.
 ///
-/// Currently only inherited by DPValue; if other classes need to use it, then
-/// a SubclassID will need to be added (either as a new field or by making
-/// DebugValue into a PointerIntUnion) to discriminate between the subclasses in
-/// lookup and callback handling.
+/// Currently only inherited by DbgVariableRecord; if other classes need to use
+/// it, then a SubclassID will need to be added (either as a new field or by
+/// making DebugValue into a PointerIntUnion) to discriminate between the
+/// subclasses in lookup and callback handling.
 class DebugValueUser {
 protected:
   // Capacity to store 3 debug values.
   // TODO: Not all DebugValueUser instances need all 3 elements, if we
-  // restructure the DPValue class then we can template parameterize this array
-  // size.
+  // restructure the DbgVariableRecord class then we can template parameterize
+  // this array size.
   std::array<Metadata *, 3> DebugValues;
 
   ArrayRef<Metadata *> getDebugValues() const { return DebugValues; }
 
 public:
-  DPValue *getUser();
-  const DPValue *getUser() const;
+  DbgVariableRecord *getUser();
+  const DbgVariableRecord *getUser() const;
   /// To be called by ReplaceableMetadataImpl::replaceAllUsesWith, where `Old`
   /// is a pointer to one of the pointers in `DebugValues` (so should be type
   /// Metadata**), and `NewDebugValue` is the new Metadata* that is replacing
@@ -407,8 +407,8 @@ class ReplaceableMetadataImpl {
   static void SalvageDebugInfo(const Constant &C); 
   /// Returns the list of all DIArgList users of this.
   SmallVector<Metadata *> getAllArgListUsers();
-  /// Returns the list of all DPValue users of this.
-  SmallVector<DPValue *> getAllDPValueUsers();
+  /// Returns the list of all DbgVariableRecord users of this.
+  SmallVector<DbgVariableRecord *> getAllDbgVariableRecordUsers();
 
   /// Resolve all uses of this.
   ///
@@ -494,8 +494,8 @@ class ValueAsMetadata : public Metadata, ReplaceableMetadataImpl {
   SmallVector<Metadata *> getAllArgListUsers() {
     return ReplaceableMetadataImpl::getAllArgListUsers();
   }
-  SmallVector<DPValue *> getAllDPValueUsers() {
-    return ReplaceableMetadataImpl::getAllDPValueUsers();
+  SmallVector<DbgVariableRecord *> getAllDbgVariableRecordUsers() {
+    return ReplaceableMetadataImpl::getAllDbgVariableRecordUsers();
   }
 
   static void handleDeletion(Value *V);
diff --git a/llvm/include/llvm/IR/NoFolder.h b/llvm/include/llvm/IR/NoFolder.h
index a612f98465aeaa..7bb5d5e696e9ea 100644
--- a/llvm/include/llvm/IR/NoFolder.h
+++ b/llvm/include/llvm/IR/NoFolder.h
@@ -112,6 +112,11 @@ class NoFolder final : public IRBuilderFolder {
     return nullptr;
   }
 
+  Value *FoldBinaryIntrinsic(Intrinsic::ID ID, Value *LHS, Value *RHS, Type *Ty,
+                             Instruction *FMFSource) const override {
+    return nullptr;
+  }
+
   //===--------------------------------------------------------------------===//
   // Cast/Conversion Operators
   //===--------------------------------------------------------------------===//
diff --git a/llvm/include/llvm/IR/Operator.h b/llvm/include/llvm/IR/Operator.h
index e5264983390109..e14a916b474935 100644
--- a/llvm/include/llvm/IR/Operator.h
+++ b/llvm/include/llvm/IR/Operator.h
@@ -404,7 +404,6 @@ class GEPOperator
 
   enum {
     IsInBounds = (1 << 0),
-    // InRangeIndex: bits 1-6
   };
 
   void setIsInBounds(bool B) {
@@ -423,11 +422,7 @@ class GEPOperator
 
   /// Returns the offset of the index with an inrange attachment, or
   /// std::nullopt if none.
-  std::optional<unsigned> getInRangeIndex() const {
-    if (SubclassOptionalData >> 1 == 0)
-      return std::nullopt;
-    return (SubclassOptionalData >> 1) - 1;
-  }
+  std::optional<ConstantRange> getInRange() const;
 
   inline op_iterator       idx_begin()       { return op_begin()+1; }
   inline const_op_iterator idx_begin() const { return op_begin()+1; }
diff --git a/llvm/include/llvm/IR/PatternMatch.h b/llvm/include/llvm/IR/PatternMatch.h
index 487ae170210de9..382009d9df785d 100644
--- a/llvm/include/llvm/IR/PatternMatch.h
+++ b/llvm/include/llvm/IR/PatternMatch.h
@@ -68,6 +68,22 @@ template <typename T> inline OneUse_match<T> m_OneUse(const T &SubPattern) {
   return SubPattern;
 }
 
+template <typename SubPattern_t> struct AllowReassoc_match {
+  SubPattern_t SubPattern;
+
+  AllowReassoc_match(const SubPattern_t &SP) : SubPattern(SP) {}
+
+  template <typename OpTy> bool match(OpTy *V) {
+    auto *I = dyn_cast<FPMathOperator>(V);
+    return I && I->hasAllowReassoc() && SubPattern.match(I);
+  }
+};
+
+template <typename T>
+inline AllowReassoc_match<T> m_AllowReassoc(const T &SubPattern) {
+  return SubPattern;
+}
+
 template <typename Class> struct class_match {
   template <typename ITy> bool match(ITy *V) { return isa<Class>(V); }
 };
diff --git a/llvm/include/llvm/MC/MCInstrAnalysis.h b/llvm/include/llvm/MC/MCInstrAnalysis.h
index e3ddf0b8b8939c..87db57f74f520e 100644
--- a/llvm/include/llvm/MC/MCInstrAnalysis.h
+++ b/llvm/include/llvm/MC/MCInstrAnalysis.h
@@ -123,14 +123,14 @@ class MCInstrAnalysis {
   /// broken. Each bit of the mask is associated with a specific input operand.
   /// Bits associated with explicit input operands are laid out first in the
   /// mask; implicit operands come after explicit operands.
-  /// 
+  ///
   /// Dependencies are broken only for operands that have their corresponding bit
   /// set. Operands that have their bit cleared, or that don't have a
   /// corresponding bit in the mask don't have their dependency broken.  Note
   /// that Mask may not be big enough to describe all operands.  The assumption
   /// for operands that don't have a correspondent bit in the mask is that those
   /// are still data dependent.
-  /// 
+  ///
   /// The only exception to the rule is for when Mask has all zeroes.
   /// A zero mask means: dependencies are broken for all explicit register
   /// operands.
diff --git a/llvm/include/llvm/MC/MCSymbolXCOFF.h b/llvm/include/llvm/MC/MCSymbolXCOFF.h
index 11c3b8831ba517..3bf4491e8fe2f8 100644
--- a/llvm/include/llvm/MC/MCSymbolXCOFF.h
+++ b/llvm/include/llvm/MC/MCSymbolXCOFF.h
@@ -26,6 +26,8 @@ class MCSymbolXCOFF : public MCSymbol {
 
   static bool classof(const MCSymbol *S) { return S->isXCOFF(); }
 
+  enum CodeModel : uint8_t { CM_Small, CM_Large };
+
   static StringRef getUnqualifiedName(StringRef Name) {
     if (Name.back() == ']') {
       StringRef Lhs, Rhs;
@@ -72,8 +74,22 @@ class MCSymbolXCOFF : public MCSymbol {
 
   void setEHInfo() const { modifyFlags(SF_EHInfo, SF_EHInfo); }
 
+  bool hasPerSymbolCodeModel() const { return PerSymbolCodeModel.has_value(); }
+
+  CodeModel getPerSymbolCodeModel() const {
+    assert(hasPerSymbolCodeModel() &&
+           "Requested code model for symbol without one");
+    return *PerSymbolCodeModel;
+  }
+
+  void setPerSymbolCodeModel(MCSymbolXCOFF::CodeModel Model) {
+    PerSymbolCodeModel = Model;
+  }
+
 private:
   std::optional<XCOFF::StorageClass> StorageClass;
+  std::optional<CodeModel> PerSymbolCodeModel;
+
   MCSectionXCOFF *RepresentedCsect = nullptr;
   XCOFF::VisibilityType VisibilityType = XCOFF::SYM_V_UNSPECIFIED;
   StringRef SymbolTableName;
diff --git a/llvm/include/llvm/Object/ELFObjectFile.h b/llvm/include/llvm/Object/ELFObjectFile.h
index c9227da65708cc..f57a7ab8882ad2 100644
--- a/llvm/include/llvm/Object/ELFObjectFile.h
+++ b/llvm/include/llvm/Object/ELFObjectFile.h
@@ -60,6 +60,7 @@ class ELFObjectFileBase : public ObjectFile {
 
   SubtargetFeatures getMIPSFeatures() const;
   SubtargetFeatures getARMFeatures() const;
+  SubtargetFeatures getHexagonFeatures() const;
   Expected<SubtargetFeatures> getRISCVFeatures() const;
   SubtargetFeatures getLoongArchFeatures() const;
 
@@ -389,25 +390,38 @@ template <class ELFT> class ELFObjectFile : public ELFObjectFileBase {
   }
 
   Error getBuildAttributes(ELFAttributeParser &Attributes) const override {
+    uint32_t Type;
+    switch (getEMachine()) {
+    case ELF::EM_ARM:
+      Type = ELF::SHT_ARM_ATTRIBUTES;
+      break;
+    case ELF::EM_RISCV:
+      Type = ELF::SHT_RISCV_ATTRIBUTES;
+      break;
+    case ELF::EM_HEXAGON:
+      Type = ELF::SHT_HEXAGON_ATTRIBUTES;
+      break;
+    default:
+      return Error::success();
+    }
+
     auto SectionsOrErr = EF.sections();
     if (!SectionsOrErr)
       return SectionsOrErr.takeError();
-
     for (const Elf_Shdr &Sec : *SectionsOrErr) {
-      if (Sec.sh_type == ELF::SHT_ARM_ATTRIBUTES ||
-          Sec.sh_type == ELF::SHT_RISCV_ATTRIBUTES) {
-        auto ErrorOrContents = EF.getSectionContents(Sec);
-        if (!ErrorOrContents)
-          return ErrorOrContents.takeError();
-
-        auto Contents = ErrorOrContents.get();
-        if (Contents[0] != ELFAttrs::Format_Version || Contents.size() == 1)
-          return Error::success();
-
-        if (Error E = Attributes.parse(Contents, ELFT::TargetEndianness))
-          return E;
-        break;
-      }
+      if (Sec.sh_type != Type)
+        continue;
+      auto ErrorOrContents = EF.getSectionContents(Sec);
+      if (!ErrorOrContents)
+        return ErrorOrContents.takeError();
+
+      auto Contents = ErrorOrContents.get();
+      if (Contents[0] != ELFAttrs::Format_Version || Contents.size() == 1)
+        return Error::success();
+
+      if (Error E = Attributes.parse(Contents, ELFT::TargetEndianness))
+        return E;
+      break;
     }
     return Error::success();
   }
diff --git a/llvm/include/llvm/ObjectYAML/DXContainerYAML.h b/llvm/include/llvm/ObjectYAML/DXContainerYAML.h
index 497f82bbd0f32a..f7f8d5e6bf4722 100644
--- a/llvm/include/llvm/ObjectYAML/DXContainerYAML.h
+++ b/llvm/include/llvm/ObjectYAML/DXContainerYAML.h
@@ -56,7 +56,7 @@ struct DXILProgram {
   std::optional<std::vector<llvm::yaml::Hex8>> DXIL;
 };
 
-#define SHADER_FEATURE_FLAG(Num, Val, Str) bool Val = false;
+#define SHADER_FEATURE_FLAG(Num, DxilModuleNum, Val, Str) bool Val = false;
 struct ShaderFeatureFlags {
   ShaderFeatureFlags() = default;
   ShaderFeatureFlags(uint64_t FlagData);
diff --git a/llvm/include/llvm/Passes/PassBuilder.h b/llvm/include/llvm/Passes/PassBuilder.h
index 6822cfdb4957b6..8817a2585646ca 100644
--- a/llvm/include/llvm/Passes/PassBuilder.h
+++ b/llvm/include/llvm/Passes/PassBuilder.h
@@ -626,6 +626,52 @@ class PassBuilder {
   void invokePipelineEarlySimplificationEPCallbacks(ModulePassManager &MPM,
                                                     OptimizationLevel Level);
 
+  static bool checkParametrizedPassName(StringRef Name, StringRef PassName) {
+    if (!Name.consume_front(PassName))
+      return false;
+    // normal pass name w/o parameters == default parameters
+    if (Name.empty())
+      return true;
+    return Name.starts_with("<") && Name.ends_with(">");
+  }
+
+  /// This performs customized parsing of pass name with parameters.
+  ///
+  /// We do not need parametrization of passes in textual pipeline very often,
+  /// yet on a rare occasion ability to specify parameters right there can be
+  /// useful.
+  ///
+  /// \p Name - parameterized specification of a pass from a textual pipeline
+  /// is a string in a form of :
+  ///      PassName '<' parameter-list '>'
+  ///
+  /// Parameter list is being parsed by the parser callable argument, \p Parser,
+  /// It takes a string-ref of parameters and returns either StringError or a
+  /// parameter list in a form of a custom parameters type, all wrapped into
+  /// Expected<> template class.
+  ///
+  template <typename ParametersParseCallableT>
+  static auto parsePassParameters(ParametersParseCallableT &&Parser,
+                                  StringRef Name, StringRef PassName)
+      -> decltype(Parser(StringRef{})) {
+    using ParametersT = typename decltype(Parser(StringRef{}))::value_type;
+
+    StringRef Params = Name;
+    if (!Params.consume_front(PassName)) {
+      llvm_unreachable(
+          "unable to strip pass name from parametrized pass specification");
+    }
+    if (!Params.empty() &&
+        (!Params.consume_front("<") || !Params.consume_back(">"))) {
+      llvm_unreachable("invalid format for parametrized pass name");
+    }
+
+    Expected<ParametersT> Result = Parser(Params);
+    assert((Result || Result.template errorIsA<StringError>()) &&
+           "Pass parameter parser can only return StringErrors.");
+    return Result;
+  }
+
 private:
   // O1 pass pipeline
   FunctionPassManager
diff --git a/llvm/include/llvm/Support/AMDHSAKernelDescriptor.h b/llvm/include/llvm/Support/AMDHSAKernelDescriptor.h
index 84cac3ef700e05..a119b0724d677f 100644
--- a/llvm/include/llvm/Support/AMDHSAKernelDescriptor.h
+++ b/llvm/include/llvm/Support/AMDHSAKernelDescriptor.h
@@ -46,9 +46,12 @@
 
 // Sets bits for specified bit mask in specified destination.
 #ifndef AMDHSA_BITS_SET
-#define AMDHSA_BITS_SET(DST, MSK, VAL)  \
-  DST &= ~MSK;                          \
-  DST |= ((VAL << MSK ## _SHIFT) & MSK)
+#define AMDHSA_BITS_SET(DST, MSK, VAL)                                         \
+  do {                                                                         \
+    auto local = VAL;                                                          \
+    DST &= ~MSK;                                                               \
+    DST |= ((local << MSK##_SHIFT) & MSK);                                     \
+  } while (0)
 #endif // AMDHSA_BITS_SET
 
 namespace llvm {
diff --git a/llvm/include/llvm/Support/HexagonAttributeParser.h b/llvm/include/llvm/Support/HexagonAttributeParser.h
new file mode 100644
index 00000000000000..1116dd42b1ad01
--- /dev/null
+++ b/llvm/include/llvm/Support/HexagonAttributeParser.h
@@ -0,0 +1,37 @@
+//===-- HexagonAttributeParser.h - Hexagon Attribute Parser -----*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_HEXAGONATTRIBUTEPARSER_H
+#define LLVM_SUPPORT_HEXAGONATTRIBUTEPARSER_H
+
+#include "llvm/Support/ELFAttributeParser.h"
+#include "llvm/Support/HexagonAttributes.h"
+
+namespace llvm {
+class HexagonAttributeParser : public ELFAttributeParser {
+  struct DisplayHandler {
+    HexagonAttrs::AttrType Attribute;
+    Error (HexagonAttributeParser::*Routine)(unsigned);
+  };
+
+  static const DisplayHandler DisplayRoutines[];
+
+  Error handler(uint64_t Tag, bool &Handled) override;
+
+public:
+  HexagonAttributeParser(ScopedPrinter *SP)
+      : ELFAttributeParser(SP, HexagonAttrs::getHexagonAttributeTags(),
+                           "hexagon") {}
+  HexagonAttributeParser()
+      : ELFAttributeParser(HexagonAttrs::getHexagonAttributeTags(), "hexagon") {
+  }
+};
+
+} // namespace llvm
+
+#endif
diff --git a/llvm/include/llvm/Support/HexagonAttributes.h b/llvm/include/llvm/Support/HexagonAttributes.h
new file mode 100644
index 00000000000000..8a50d8993e633b
--- /dev/null
+++ b/llvm/include/llvm/Support/HexagonAttributes.h
@@ -0,0 +1,32 @@
+//===-- HexagonAttributes.h - Qualcomm Hexagon Attributes -----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_HEXAGONATTRIBUTES_H
+#define LLVM_SUPPORT_HEXAGONATTRIBUTES_H
+
+#include "llvm/Support/ELFAttributes.h"
+
+namespace llvm {
+namespace HexagonAttrs {
+
+const TagNameMap &getHexagonAttributeTags();
+
+enum AttrType : unsigned {
+  ARCH = 4,
+  HVXARCH = 5,
+  HVXIEEEFP = 6,
+  HVXQFLOAT = 7,
+  ZREG = 8,
+  AUDIO = 9,
+  CABAC = 10
+};
+
+} // namespace HexagonAttrs
+} // namespace llvm
+
+#endif
diff --git a/llvm/include/llvm/Support/LEB128.h b/llvm/include/llvm/Support/LEB128.h
index 7fc572b6ff06ef..c4e741549f3ff1 100644
--- a/llvm/include/llvm/Support/LEB128.h
+++ b/llvm/include/llvm/Support/LEB128.h
@@ -200,6 +200,20 @@ inline int64_t decodeSLEB128(const uint8_t *p, unsigned *n = nullptr,
   return Value;
 }
 
+inline uint64_t decodeULEB128AndInc(const uint8_t *&p) {
+  unsigned n;
+  auto ret = decodeULEB128(p, &n);
+  p += n;
+  return ret;
+}
+
+inline int64_t decodeSLEB128AndInc(const uint8_t *&p) {
+  unsigned n;
+  auto ret = decodeSLEB128(p, &n);
+  p += n;
+  return ret;
+}
+
 /// Utility function to get the size of the ULEB128-encoded value.
 extern unsigned getULEB128Size(uint64_t Value);
 
diff --git a/llvm/include/llvm/Support/TimeProfiler.h b/llvm/include/llvm/Support/TimeProfiler.h
index 454a65f70231f4..31f7df10916db9 100644
--- a/llvm/include/llvm/Support/TimeProfiler.h
+++ b/llvm/include/llvm/Support/TimeProfiler.h
@@ -86,6 +86,8 @@ class raw_pwrite_stream;
 struct TimeTraceProfiler;
 TimeTraceProfiler *getTimeTraceProfilerInstance();
 
+struct TimeTraceProfilerEntry;
+
 /// Initialize the time trace profiler.
 /// This sets up the global \p TimeTraceProfilerInstance
 /// variable to be the profiler instance.
@@ -120,19 +122,30 @@ Error timeTraceProfilerWrite(StringRef PreferredFileName,
 /// Profiler copies the string data, so the pointers can be given into
 /// temporaries. Time sections can be hierarchical; every Begin must have a
 /// matching End pair but they can nest.
-void timeTraceProfilerBegin(StringRef Name, StringRef Detail);
-void timeTraceProfilerBegin(StringRef Name,
-                            llvm::function_ref<std::string()> Detail);
+TimeTraceProfilerEntry *timeTraceProfilerBegin(StringRef Name,
+                                               StringRef Detail);
+TimeTraceProfilerEntry *
+timeTraceProfilerBegin(StringRef Name,
+                       llvm::function_ref<std::string()> Detail);
+
+/// Manually begin a time section, with the given \p Name and \p Detail.
+/// This starts Async Events having \p Name as a category which is shown
+/// separately from other traces. See
+/// https://docs.google.com/document/d/1CvAClvFfyA5R-PhYUmn5OOQtYMH4h6I0nSsKchNAySU/preview#heading=h.jh64i9l3vwa1
+/// for more details.
+TimeTraceProfilerEntry *timeTraceAsyncProfilerBegin(StringRef Name,
+                                                    StringRef Detail);
 
 /// Manually end the last time section.
 void timeTraceProfilerEnd();
+void timeTraceProfilerEnd(TimeTraceProfilerEntry *E);
 
 /// The TimeTraceScope is a helper class to call the begin and end functions
 /// of the time trace profiler.  When the object is constructed, it begins
 /// the section; and when it is destroyed, it stops it. If the time profiler
 /// is not initialized, the overhead is a single branch.
-struct TimeTraceScope {
-
+class TimeTraceScope {
+public:
   TimeTraceScope() = delete;
   TimeTraceScope(const TimeTraceScope &) = delete;
   TimeTraceScope &operator=(const TimeTraceScope &) = delete;
@@ -141,20 +154,23 @@ struct TimeTraceScope {
 
   TimeTraceScope(StringRef Name) {
     if (getTimeTraceProfilerInstance() != nullptr)
-      timeTraceProfilerBegin(Name, StringRef(""));
+      Entry = timeTraceProfilerBegin(Name, StringRef(""));
   }
   TimeTraceScope(StringRef Name, StringRef Detail) {
     if (getTimeTraceProfilerInstance() != nullptr)
-      timeTraceProfilerBegin(Name, Detail);
+      Entry = timeTraceProfilerBegin(Name, Detail);
   }
   TimeTraceScope(StringRef Name, llvm::function_ref<std::string()> Detail) {
     if (getTimeTraceProfilerInstance() != nullptr)
-      timeTraceProfilerBegin(Name, Detail);
+      Entry = timeTraceProfilerBegin(Name, Detail);
   }
   ~TimeTraceScope() {
     if (getTimeTraceProfilerInstance() != nullptr)
-      timeTraceProfilerEnd();
+      timeTraceProfilerEnd(Entry);
   }
+
+private:
+  TimeTraceProfilerEntry *Entry = nullptr;
 };
 
 } // end namespace llvm
diff --git a/llvm/include/llvm/Support/VirtualFileSystem.h b/llvm/include/llvm/Support/VirtualFileSystem.h
index ef1fac92c2fa4a..770ca8764426a4 100644
--- a/llvm/include/llvm/Support/VirtualFileSystem.h
+++ b/llvm/include/llvm/Support/VirtualFileSystem.h
@@ -929,12 +929,12 @@ class RedirectingFileSystem
   /// Canonicalize path by removing ".", "..", "./", components. This is
   /// a VFS request, do not bother about symlinks in the path components
   /// but canonicalize in order to perform the correct entry search.
-  std::error_code makeCanonical(SmallVectorImpl<char> &Path) const;
+  std::error_code makeCanonicalForLookup(SmallVectorImpl<char> &Path) const;
 
   /// Get the File status, or error, from the underlying external file system.
   /// This returns the status with the originally requested name, while looking
-  /// up the entry using the canonical path.
-  ErrorOr<Status> getExternalStatus(const Twine &CanonicalPath,
+  /// up the entry using a potentially different path.
+  ErrorOr<Status> getExternalStatus(const Twine &LookupPath,
                                     const Twine &OriginalPath) const;
 
   /// Make \a Path an absolute path.
@@ -1022,7 +1022,7 @@ class RedirectingFileSystem
                  llvm::SmallVectorImpl<Entry *> &Entries) const;
 
   /// Get the status for a path with the provided \c LookupResult.
-  ErrorOr<Status> status(const Twine &CanonicalPath, const Twine &OriginalPath,
+  ErrorOr<Status> status(const Twine &LookupPath, const Twine &OriginalPath,
                          const LookupResult &Result);
 
 public:
diff --git a/llvm/include/llvm/Target/Target.td b/llvm/include/llvm/Target/Target.td
index 0577c58f8da2df..1e40cc49040d6f 100644
--- a/llvm/include/llvm/Target/Target.td
+++ b/llvm/include/llvm/Target/Target.td
@@ -947,8 +947,6 @@ class AsmOperandClass {
   /// instruction if it hasn't matched all the operands yet.  However, this
   /// error will be suppressed if all of the remaining unmatched operands are
   /// marked as IsOptional.
-  ///
-  /// Optional arguments must be at the end of the operand list.
   bit IsOptional = false;
 
   /// The name of the method on the target specific asm parser that returns the
@@ -1564,6 +1562,20 @@ class AsmParser {
   // method shall be called for all operands as opposed to only those
   // that have their own specified custom parsers.
   bit CallCustomParserForAllOperands = false;
+
+  // PreferSmallerInstructions - Should the assembly matcher prefer the smaller
+  // instructions.
+  // 
+  // This is useful for the ARM instructions set where smaller encodings must
+  // be preferentially selected.
+  //
+  // The preference order is: 
+  //    Instrution size (if this option is enabled, smallest first)
+  //    Number of Operands (least first), 
+  //    Operand Classes (lexicographically by operand), 
+  //    (Optional) Instruction id (see AsmMatcherEmitter.cpp for details), 
+  //    Number of required features (least first)
+  bit PreferSmallerInstructions = false;
 }
 def DefaultAsmParser : AsmParser;
 
@@ -1865,3 +1877,8 @@ include "llvm/Target/GlobalISel/SelectionDAGCompat.td"
 // Pull in the common support for Pfm Counters generation.
 //
 include "llvm/Target/TargetPfmCounters.td"
+
+//===----------------------------------------------------------------------===//
+// Pull in the common support for macro fusion.
+//
+include "llvm/Target/TargetMacroFusion.td"
diff --git a/llvm/include/llvm/Target/TargetMacroFusion.td b/llvm/include/llvm/Target/TargetMacroFusion.td
new file mode 100644
index 00000000000000..0306794ef50d1f
--- /dev/null
+++ b/llvm/include/llvm/Target/TargetMacroFusion.td
@@ -0,0 +1,153 @@
+//===-- TargetMacroFusion.td - Target Macro Fusion ---------*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the TableGen-based macro fusion classes.
+
+// The target instruction that FusionPredicate will be evaluated on.
+class FusionTarget;
+def first_fusion_target : FusionTarget;
+def second_fusion_target : FusionTarget;
+def both_fusion_target : FusionTarget;
+
+// Base class of FusionPredicate, etc. The avaliable variables are:
+// * const TargetInstrInfo &TII
+// * const TargetSubtargetInfo &STI
+// * const MachineRegisterInfo &MRI
+// * const MachineInstr *FirstMI
+// * const MachineInstr &SecondMI
+class FusionPredicate<FusionTarget target> {
+  FusionTarget Target = target;
+}
+class FirstFusionPredicate: FusionPredicate<first_fusion_target>;
+class SecondFusionPredicate: FusionPredicate<second_fusion_target>;
+class BothFusionPredicate: FusionPredicate<both_fusion_target>;
+
+// FusionPredicate with raw code predicate.
+class FusionPredicateWithCode<code pred> : FusionPredicate<both_fusion_target> {
+  code Predicate = pred;
+}
+
+// FusionPredicate with MCInstPredicate.
+class FusionPredicateWithMCInstPredicate<FusionTarget target, MCInstPredicate pred>
+  : FusionPredicate<target> {
+  MCInstPredicate Predicate = pred;
+}
+class FirstFusionPredicateWithMCInstPredicate<MCInstPredicate pred>
+  : FusionPredicateWithMCInstPredicate<first_fusion_target, pred>;
+class SecondFusionPredicateWithMCInstPredicate<MCInstPredicate pred>
+  : FusionPredicateWithMCInstPredicate<second_fusion_target, pred>;
+// The pred will be applied on both firstMI and secondMI.
+class BothFusionPredicateWithMCInstPredicate<MCInstPredicate pred>
+  : FusionPredicateWithMCInstPredicate<both_fusion_target, pred>;
+
+// Tie firstOpIdx and secondOpIdx. The operand of `FirstMI` at position
+// `firstOpIdx` should be the same as the operand of `SecondMI` at position
+// `secondOpIdx`.
+// If the fusion has `IsCommutable` being true and the operand at `secondOpIdx`
+// has commutable operand, then the commutable operand will be checked too.
+class TieReg<int firstOpIdx, int secondOpIdx> : BothFusionPredicate {
+  int FirstOpIdx = firstOpIdx;
+  int SecondOpIdx = secondOpIdx;
+}
+
+// The operand of `SecondMI` at position `firstOpIdx` should be the same as the
+// operand at position `secondOpIdx`.
+// If the fusion has `IsCommutable` being true and the operand at `secondOpIdx`
+// has commutable operand, then the commutable operand will be checked too.
+class SameReg<int firstOpIdx, int secondOpIdx> : SecondFusionPredicate {
+  int FirstOpIdx = firstOpIdx;
+  int SecondOpIdx = secondOpIdx;
+}
+
+// A predicate for wildcard. The generated code will be like:
+// ```
+// if (!FirstMI)
+//   return ReturnValue;
+// ```
+class WildcardPred<bit ret> : FirstFusionPredicate {
+  bit ReturnValue = ret;
+}
+def WildcardFalse : WildcardPred<0>;
+def WildcardTrue : WildcardPred<1>;
+
+// Indicates that the destination register of `FirstMI` should have one use if
+// it is a virtual register.
+class OneUsePred : FirstFusionPredicate;
+def OneUse : OneUsePred;
+
+// Handled by MacroFusionPredicatorEmitter backend.
+// The generated predicator will be like:
+// ```
+// bool isNAME(const TargetInstrInfo &TII,
+//             const TargetSubtargetInfo &STI,
+//             const MachineInstr *FirstMI,
+//             const MachineInstr &SecondMI) {
+//   auto &MRI = SecondMI.getMF()->getRegInfo();
+//   /* Predicates */
+//   return true;
+// }
+// ```
+//
+// `IsCommutable` means whether we should handle commutable operands.
+class Fusion<string name, string fieldName, string desc, list<FusionPredicate> predicates>
+  : SubtargetFeature<name, fieldName, "true", desc> {
+  list<FusionPredicate> Predicates = predicates;
+  bit IsCommutable = 0;
+}
+
+// The generated predicator will be like:
+// ```
+// bool isNAME(const TargetInstrInfo &TII,
+//             const TargetSubtargetInfo &STI,
+//             const MachineInstr *FirstMI,
+//             const MachineInstr &SecondMI) {
+//   auto &MRI = SecondMI.getMF()->getRegInfo();
+//   /* Prolog */
+//   /* Predicate for `SecondMI` */
+//   /* Wildcard */
+//   /* Predicate for `FirstMI` */
+//   /* Check same registers */
+//   /* Check One Use */
+//   /* Tie registers */
+//   /* Epilog */
+//   return true;
+// }
+// ```
+class SimpleFusion<string name, string fieldName, string desc,
+                   MCInstPredicate firstPred, MCInstPredicate secondPred,
+                   list<FusionPredicate> prolog = [],
+                   list<FusionPredicate> epilog = []>
+  : Fusion<name, fieldName, desc,
+           !listconcat(
+              prolog,
+              [
+                SecondFusionPredicateWithMCInstPredicate<secondPred>,
+                WildcardTrue,
+                FirstFusionPredicateWithMCInstPredicate<firstPred>,
+                SameReg<0, 1>,
+                OneUse,
+                TieReg<0, 1>,
+              ],
+              epilog)>;
+
+class SingleFusion<string name, string fieldName, string desc,
+                   Instruction firstInst, Instruction secondInst,
+                   MCInstPredicate firstInstPred = TruePred,
+                   MCInstPredicate secondInstPred = TruePred,
+                   list<FusionPredicate> prolog = [],
+                   list<FusionPredicate> epilog = []>
+  : SimpleFusion<name, fieldName, desc,
+                 CheckAll<!listconcat(
+                            [CheckOpcode<[firstInst]>],
+                            [firstInstPred])>,
+                 CheckAll<!listconcat(
+                            [CheckOpcode<[secondInst]>],
+                            [secondInstPred])>,
+                 prolog, epilog> {
+  let IsCommutable = secondInst.isCommutable;
+}
diff --git a/llvm/include/llvm/Target/TargetSchedule.td b/llvm/include/llvm/Target/TargetSchedule.td
index 069eb2900bfe68..2562ed0901303f 100644
--- a/llvm/include/llvm/Target/TargetSchedule.td
+++ b/llvm/include/llvm/Target/TargetSchedule.td
@@ -581,119 +581,3 @@ class MemoryQueue<ProcResourceKind PR> {
 
 class LoadQueue<ProcResourceKind LDQueue> : MemoryQueue<LDQueue>;
 class StoreQueue<ProcResourceKind STQueue> : MemoryQueue<STQueue>;
-
-// The target instruction that FusionPredicate will be evaluated on.
-class FusionTarget;
-def first_fusion_target : FusionTarget;
-def second_fusion_target : FusionTarget;
-def both_fusion_target : FusionTarget;
-
-// Base class of FusionPredicate, etc. The avaliable variables are:
-// * const TargetInstrInfo &TII
-// * const TargetSubtargetInfo &STI
-// * const MachineRegisterInfo &MRI
-// * const MachineInstr *FirstMI
-// * const MachineInstr &SecondMI
-class FusionPredicate<FusionTarget target> {
-  FusionTarget Target = target;
-}
-class FirstFusionPredicate: FusionPredicate<first_fusion_target>;
-class SecondFusionPredicate: FusionPredicate<second_fusion_target>;
-class BothFusionPredicate: FusionPredicate<both_fusion_target>;
-
-// FusionPredicate with raw code predicate.
-class FusionPredicateWithCode<code pred> : FusionPredicate<both_fusion_target> {
-  code Predicate = pred;
-}
-
-// FusionPredicate with MCInstPredicate.
-class FusionPredicateWithMCInstPredicate<FusionTarget target, MCInstPredicate pred>
-  : FusionPredicate<target> {
-  MCInstPredicate Predicate = pred;
-}
-class FirstFusionPredicateWithMCInstPredicate<MCInstPredicate pred>
-  : FusionPredicateWithMCInstPredicate<first_fusion_target, pred>;
-class SecondFusionPredicateWithMCInstPredicate<MCInstPredicate pred>
-  : FusionPredicateWithMCInstPredicate<second_fusion_target, pred>;
-// The pred will be applied on both firstMI and secondMI.
-class BothFusionPredicateWithMCInstPredicate<MCInstPredicate pred>
-  : FusionPredicateWithMCInstPredicate<both_fusion_target, pred>;
-
-// Tie firstOpIdx and secondOpIdx. The operand of `FirstMI` at position
-// `firstOpIdx` should be the same as the operand of `SecondMI` at position
-// `secondOpIdx`.
-class TieReg<int firstOpIdx, int secondOpIdx> : BothFusionPredicate {
-  int FirstOpIdx = firstOpIdx;
-  int SecondOpIdx = secondOpIdx;
-}
-
-// A predicate for wildcard. The generated code will be like:
-// ```
-// if (!FirstMI)
-//   return ReturnValue;
-// ```
-class WildcardPred<bit ret> : FirstFusionPredicate {
-  bit ReturnValue = ret;
-}
-def WildcardFalse : WildcardPred<0>;
-def WildcardTrue : WildcardPred<1>;
-
-// Indicates that the destination register of `FirstMI` should have one use if
-// it is a virtual register.
-class OneUsePred : FirstFusionPredicate;
-def OneUse : OneUsePred;
-
-// Handled by MacroFusionPredicatorEmitter backend.
-// The generated predicator will be like:
-// ```
-// bool isNAME(const TargetInstrInfo &TII,
-//             const TargetSubtargetInfo &STI,
-//             const MachineInstr *FirstMI,
-//             const MachineInstr &SecondMI) {
-//   auto &MRI = SecondMI.getMF()->getRegInfo();
-//   /* Predicates */
-//   return true;
-// }
-// ```
-class Fusion<string name, string fieldName, string desc, list<FusionPredicate> predicates>
-  : SubtargetFeature<name, fieldName, "true", desc> {
-  list<FusionPredicate> Predicates = predicates;
-}
-
-// The generated predicator will be like:
-// ```
-// bool isNAME(const TargetInstrInfo &TII,
-//             const TargetSubtargetInfo &STI,
-//             const MachineInstr *FirstMI,
-//             const MachineInstr &SecondMI) {
-//   auto &MRI = SecondMI.getMF()->getRegInfo();
-//   /* Prolog */
-//   /* Predicate for `SecondMI` */
-//   /* Wildcard */
-//   /* Predicate for `FirstMI` */
-//   /* Check One Use */
-//   /* Tie registers */
-//   /* Epilog */
-//   return true;
-// }
-// ```
-class SimpleFusion<string name, string fieldName, string desc,
-                   MCInstPredicate firstPred, MCInstPredicate secondPred,
-                   list<FusionPredicate> prolog = [],
-                   list<FusionPredicate> epilog = []>
-  : Fusion<name, fieldName, desc,
-           !listconcat(
-              prolog,
-              [
-                SecondFusionPredicateWithMCInstPredicate<secondPred>,
-                WildcardTrue,
-                FirstFusionPredicateWithMCInstPredicate<firstPred>,
-                SecondFusionPredicateWithMCInstPredicate<
-                  CheckAny<[
-                    CheckIsVRegOperand<0>,
-                    CheckSameRegOperand<0, 1>
-                  ]>>,
-                OneUse,
-                TieReg<0, 1>,
-              ],
-              epilog)>;
diff --git a/llvm/include/llvm/Target/TargetSelectionDAG.td b/llvm/include/llvm/Target/TargetSelectionDAG.td
index d7bf8c35ee1072..ea3520835fa07d 100644
--- a/llvm/include/llvm/Target/TargetSelectionDAG.td
+++ b/llvm/include/llvm/Target/TargetSelectionDAG.td
@@ -318,7 +318,7 @@ def SDTAtomicStore : SDTypeProfile<0, 2, [
   SDTCisInt<0>, SDTCisPtrTy<1>
 ]>;
 def SDTAtomicLoad : SDTypeProfile<1, 1, [
-  SDTCisInt<0>, SDTCisPtrTy<1>
+  SDTCisPtrTy<1>
 ]>;
 
 class SDCallSeqStart<list<SDTypeConstraint> constraints> :
diff --git a/llvm/include/llvm/TargetParser/AArch64TargetParser.h b/llvm/include/llvm/TargetParser/AArch64TargetParser.h
index f4bb94f98bcd77..805b963a7a13c7 100644
--- a/llvm/include/llvm/TargetParser/AArch64TargetParser.h
+++ b/llvm/include/llvm/TargetParser/AArch64TargetParser.h
@@ -554,6 +554,11 @@ inline constexpr CpuInfo CpuInfos[] = {
          {AArch64::AEK_SB, AArch64::AEK_SSBS, AArch64::AEK_MTE,
           AArch64::AEK_FP16FML, AArch64::AEK_PAUTH, AArch64::AEK_SVE2BITPERM,
           AArch64::AEK_FLAGM, AArch64::AEK_PERFMON, AArch64::AEK_PREDRES})},
+    {"cortex-a520ae", ARMV9_2A,
+     AArch64::ExtensionBitset(
+         {AArch64::AEK_SB, AArch64::AEK_SSBS, AArch64::AEK_MTE,
+          AArch64::AEK_FP16FML, AArch64::AEK_PAUTH, AArch64::AEK_SVE2BITPERM,
+          AArch64::AEK_FLAGM, AArch64::AEK_PERFMON, AArch64::AEK_PREDRES})},
     {"cortex-a57", ARMV8A,
      AArch64::ExtensionBitset(
          {AArch64::AEK_AES, AArch64::AEK_SHA2, AArch64::AEK_CRC})},
@@ -621,6 +626,12 @@ inline constexpr CpuInfo CpuInfos[] = {
                                AArch64::AEK_PAUTH, AArch64::AEK_SVE2BITPERM,
                                AArch64::AEK_FLAGM, AArch64::AEK_PERFMON,
                                AArch64::AEK_PREDRES, AArch64::AEK_PROFILE})},
+    {"cortex-a720ae", ARMV9_2A,
+     AArch64::ExtensionBitset({AArch64::AEK_SB, AArch64::AEK_SSBS,
+                               AArch64::AEK_MTE, AArch64::AEK_FP16FML,
+                               AArch64::AEK_PAUTH, AArch64::AEK_SVE2BITPERM,
+                               AArch64::AEK_FLAGM, AArch64::AEK_PERFMON,
+                               AArch64::AEK_PREDRES, AArch64::AEK_PROFILE})},
     {"cortex-r82", ARMV8R, AArch64::ExtensionBitset({AArch64::AEK_LSE})},
     {"cortex-x1", ARMV8_2A,
      AArch64::ExtensionBitset({AArch64::AEK_AES, AArch64::AEK_SHA2,
diff --git a/llvm/include/llvm/TextAPI/Record.h b/llvm/include/llvm/TextAPI/Record.h
index 98639b064eaadd..ef152ce433877c 100644
--- a/llvm/include/llvm/TextAPI/Record.h
+++ b/llvm/include/llvm/TextAPI/Record.h
@@ -51,7 +51,8 @@ class Record {
 public:
   Record() = default;
   Record(StringRef Name, RecordLinkage Linkage, SymbolFlags Flags)
-      : Name(Name), Linkage(Linkage), Flags(mergeFlags(Flags, Linkage)) {}
+      : Name(Name), Linkage(Linkage), Flags(mergeFlags(Flags, Linkage)),
+        Verified(false) {}
 
   bool isWeakDefined() const {
     return (Flags & SymbolFlags::WeakDefined) == SymbolFlags::WeakDefined;
@@ -79,6 +80,9 @@ class Record {
   bool isExported() const { return Linkage >= RecordLinkage::Rexported; }
   bool isRexported() const { return Linkage == RecordLinkage::Rexported; }
 
+  bool isVerified() const { return Verified; }
+  void setVerify(bool V = true) { Verified = V; }
+
   StringRef getName() const { return Name; }
   SymbolFlags getFlags() const { return Flags; }
 
@@ -89,6 +93,7 @@ class Record {
   StringRef Name;
   RecordLinkage Linkage;
   SymbolFlags Flags;
+  bool Verified;
 
   friend class RecordsSlice;
 };
diff --git a/llvm/include/llvm/Transforms/Scalar/ConstantHoisting.h b/llvm/include/llvm/Transforms/Scalar/ConstantHoisting.h
index a4d1653fdf4bc6..48f8421c1813c8 100644
--- a/llvm/include/llvm/Transforms/Scalar/ConstantHoisting.h
+++ b/llvm/include/llvm/Transforms/Scalar/ConstantHoisting.h
@@ -172,11 +172,12 @@ class ConstantHoistingPass : public PassInfoMixin<ConstantHoistingPass> {
 
   void collectMatInsertPts(
       const consthoist::RebasedConstantListType &RebasedConstants,
-      SmallVectorImpl<Instruction *> &MatInsertPts) const;
-  Instruction *findMatInsertPt(Instruction *Inst, unsigned Idx = ~0U) const;
-  SetVector<Instruction *>
-  findConstantInsertionPoint(const consthoist::ConstantInfo &ConstInfo,
-                             const ArrayRef<Instruction *> MatInsertPts) const;
+      SmallVectorImpl<BasicBlock::iterator> &MatInsertPts) const;
+  BasicBlock::iterator findMatInsertPt(Instruction *Inst,
+                                       unsigned Idx = ~0U) const;
+  SetVector<BasicBlock::iterator> findConstantInsertionPoint(
+      const consthoist::ConstantInfo &ConstInfo,
+      const ArrayRef<BasicBlock::iterator> MatInsertPts) const;
   void collectConstantCandidates(ConstCandMapType &ConstCandMap,
                                  Instruction *Inst, unsigned Idx,
                                  ConstantInt *ConstInt);
@@ -203,9 +204,9 @@ class ConstantHoistingPass : public PassInfoMixin<ConstantHoistingPass> {
   struct UserAdjustment {
     Constant *Offset;
     Type *Ty;
-    Instruction *MatInsertPt;
+    BasicBlock::iterator MatInsertPt;
     const consthoist::ConstantUser User;
-    UserAdjustment(Constant *O, Type *T, Instruction *I,
+    UserAdjustment(Constant *O, Type *T, BasicBlock::iterator I,
                    consthoist::ConstantUser U)
         : Offset(O), Ty(T), MatInsertPt(I), User(U) {}
   };
diff --git a/llvm/include/llvm/Transforms/Scalar/Float2Int.h b/llvm/include/llvm/Transforms/Scalar/Float2Int.h
index 337e229efcf379..83be329bed60ba 100644
--- a/llvm/include/llvm/Transforms/Scalar/Float2Int.h
+++ b/llvm/include/llvm/Transforms/Scalar/Float2Int.h
@@ -44,7 +44,7 @@ class Float2IntPass : public PassInfoMixin<Float2IntPass> {
   std::optional<ConstantRange> calcRange(Instruction *I);
   void walkBackwards();
   void walkForwards();
-  bool validateAndTransform(const DataLayout &DL);
+  bool validateAndTransform();
   Value *convert(Instruction *I, Type *ToTy);
   void cleanup();
 
diff --git a/llvm/include/llvm/Transforms/Utils/Local.h b/llvm/include/llvm/Transforms/Utils/Local.h
index cc133628743629..9ae026fa95d21a 100644
--- a/llvm/include/llvm/Transforms/Utils/Local.h
+++ b/llvm/include/llvm/Transforms/Utils/Local.h
@@ -262,21 +262,21 @@ CallInst *changeToCall(InvokeInst *II, DomTreeUpdater *DTU = nullptr);
 /// that has an associated llvm.dbg.declare intrinsic.
 void ConvertDebugDeclareToDebugValue(DbgVariableIntrinsic *DII,
                                      StoreInst *SI, DIBuilder &Builder);
-void ConvertDebugDeclareToDebugValue(DPValue *DPV, StoreInst *SI,
+void ConvertDebugDeclareToDebugValue(DbgVariableRecord *DVR, StoreInst *SI,
                                      DIBuilder &Builder);
 
 /// Inserts a llvm.dbg.value intrinsic before a load of an alloca'd value
 /// that has an associated llvm.dbg.declare intrinsic.
 void ConvertDebugDeclareToDebugValue(DbgVariableIntrinsic *DII,
                                      LoadInst *LI, DIBuilder &Builder);
-void ConvertDebugDeclareToDebugValue(DPValue *DPV, LoadInst *LI,
+void ConvertDebugDeclareToDebugValue(DbgVariableRecord *DVR, LoadInst *LI,
                                      DIBuilder &Builder);
 
 /// Inserts a llvm.dbg.value intrinsic after a phi that has an associated
 /// llvm.dbg.declare intrinsic.
 void ConvertDebugDeclareToDebugValue(DbgVariableIntrinsic *DII,
                                      PHINode *LI, DIBuilder &Builder);
-void ConvertDebugDeclareToDebugValue(DPValue *DPV, PHINode *LI,
+void ConvertDebugDeclareToDebugValue(DbgVariableRecord *DVR, PHINode *LI,
                                      DIBuilder &Builder);
 
 /// Lowers llvm.dbg.declare intrinsics into appropriate set of
@@ -313,7 +313,7 @@ void salvageDebugInfo(Instruction &I);
 /// Mark undef if salvaging cannot be completed.
 void salvageDebugInfoForDbgValues(Instruction &I,
                                   ArrayRef<DbgVariableIntrinsic *> Insns,
-                                  ArrayRef<DPValue *> DPInsns);
+                                  ArrayRef<DbgVariableRecord *> DPInsns);
 
 /// Given an instruction \p I and DIExpression \p DIExpr operating on
 /// it, append the effects of \p I to the DIExpression operand list
diff --git a/llvm/include/llvm/Transforms/Utils/LoopUtils.h b/llvm/include/llvm/Transforms/Utils/LoopUtils.h
index 345e09dce0b2b1..187ace3a0cbedf 100644
--- a/llvm/include/llvm/Transforms/Utils/LoopUtils.h
+++ b/llvm/include/llvm/Transforms/Utils/LoopUtils.h
@@ -372,6 +372,15 @@ RecurKind getMinMaxReductionRecurKind(Intrinsic::ID RdxID);
 /// Returns the comparison predicate used when expanding a min/max reduction.
 CmpInst::Predicate getMinMaxReductionPredicate(RecurKind RK);
 
+/// See RecurrenceDescriptor::isAnyOfPattern for a description of the pattern we
+/// are trying to match. In this pattern, we are only ever selecting between two
+/// values: 1) an initial start value \p StartVal of the reduction PHI, and 2) a
+/// loop invariant value. If any of lane value in \p Left, \p Right is not equal
+/// to \p StartVal, select the loop invariant value. This is done by selecting
+/// \p Right iff \p Left is equal to \p StartVal.
+Value *createAnyOfOp(IRBuilderBase &Builder, Value *StartVal, RecurKind RK,
+                     Value *Left, Value *Right);
+
 /// Returns a Min/Max operation corresponding to MinMaxRecurrenceKind.
 /// The Builder's fast-math-flags must be set to propagate the expected values.
 Value *createMinMaxOp(IRBuilderBase &Builder, RecurKind RK, Value *Left,
diff --git a/llvm/include/llvm/Transforms/Utils/MemoryTaggingSupport.h b/llvm/include/llvm/Transforms/Utils/MemoryTaggingSupport.h
index eb00e6c4e856df..0a0e16d2a9e6e7 100644
--- a/llvm/include/llvm/Transforms/Utils/MemoryTaggingSupport.h
+++ b/llvm/include/llvm/Transforms/Utils/MemoryTaggingSupport.h
@@ -17,6 +17,7 @@
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/StackSafetyAnalysis.h"
+#include "llvm/IR/IRBuilder.h"
 #include "llvm/Support/Alignment.h"
 
 namespace llvm {
@@ -53,7 +54,7 @@ struct AllocaInfo {
   SmallVector<IntrinsicInst *, 2> LifetimeEnd;
   SmallVector<DbgVariableIntrinsic *, 2> DbgVariableIntrinsics;
   // Non-intrinsic records of variable locations.
-  SmallVector<DPValue *, 2> DbgVariableRecords;
+  SmallVector<DbgVariableRecord *, 2> DbgVariableRecords;
 };
 
 struct StackInfo {
@@ -78,6 +79,11 @@ class StackInfoBuilder {
 
 uint64_t getAllocaSizeInBytes(const AllocaInst &AI);
 void alignAndPadAlloca(memtag::AllocaInfo &Info, llvm::Align Align);
+bool isLifetimeIntrinsic(Value *V);
+
+Value *readRegister(IRBuilder<> &IRB, StringRef Name);
+Value *getFP(IRBuilder<> &IRB);
+Value *getPC(const Triple &TargetTriple, IRBuilder<> &IRB);
 
 } // namespace memtag
 } // namespace llvm
diff --git a/llvm/include/llvm/Transforms/Utils/SSAUpdater.h b/llvm/include/llvm/Transforms/Utils/SSAUpdater.h
index c79d436fc676bb..29d96a0ab6bf5b 100644
--- a/llvm/include/llvm/Transforms/Utils/SSAUpdater.h
+++ b/llvm/include/llvm/Transforms/Utils/SSAUpdater.h
@@ -23,7 +23,7 @@ class BasicBlock;
 class Instruction;
 class LoadInst;
 class PHINode;
-class DPValue;
+class DbgVariableRecord;
 template <typename T> class SmallVectorImpl;
 template <typename T> class SSAUpdaterTraits;
 class Type;
@@ -124,7 +124,8 @@ class SSAUpdater {
   void UpdateDebugValues(Instruction *I);
   void UpdateDebugValues(Instruction *I,
                          SmallVectorImpl<DbgValueInst *> &DbgValues);
-  void UpdateDebugValues(Instruction *I, SmallVectorImpl<DPValue *> &DbgValues);
+  void UpdateDebugValues(Instruction *I,
+                         SmallVectorImpl<DbgVariableRecord *> &DbgValues);
 
   /// Rewrite a use like \c RewriteUse but handling in-block definitions.
   ///
@@ -136,7 +137,7 @@ class SSAUpdater {
 private:
   Value *GetValueAtEndOfBlockInternal(BasicBlock *BB);
   void UpdateDebugValue(Instruction *I, DbgValueInst *DbgValue);
-  void UpdateDebugValue(Instruction *I, DPValue *DbgValue);
+  void UpdateDebugValue(Instruction *I, DbgVariableRecord *DbgValue);
 };
 
 /// Helper class for promoting a collection of loads and stores into SSA
diff --git a/llvm/include/llvm/Transforms/Utils/ValueMapper.h b/llvm/include/llvm/Transforms/Utils/ValueMapper.h
index dd183bc04f0c24..54e3e62dc3af50 100644
--- a/llvm/include/llvm/Transforms/Utils/ValueMapper.h
+++ b/llvm/include/llvm/Transforms/Utils/ValueMapper.h
@@ -180,8 +180,9 @@ class ValueMapper {
   Constant *mapConstant(const Constant &C);
 
   void remapInstruction(Instruction &I);
-  void remapDPValue(Module *M, DPValue &V);
-  void remapDPValueRange(Module *M, iterator_range<DbgRecordIterator> Range);
+  void remapDbgVariableRecord(Module *M, DbgVariableRecord &V);
+  void remapDbgVariableRecordRange(Module *M,
+                                   iterator_range<DbgRecordIterator> Range);
   void remapFunction(Function &F);
   void remapGlobalObjectMetadata(GlobalObject &GO);
 
@@ -267,21 +268,26 @@ inline void RemapInstruction(Instruction *I, ValueToValueMapTy &VM,
   ValueMapper(VM, Flags, TypeMapper, Materializer).remapInstruction(*I);
 }
 
-/// Remap the Values used in the DPValue \a V using the value map \a VM.
-inline void RemapDPValue(Module *M, DPValue *V, ValueToValueMapTy &VM,
-                         RemapFlags Flags = RF_None,
-                         ValueMapTypeRemapper *TypeMapper = nullptr,
-                         ValueMaterializer *Materializer = nullptr) {
-  ValueMapper(VM, Flags, TypeMapper, Materializer).remapDPValue(M, *V);
+/// Remap the Values used in the DbgVariableRecord \a V using the value map \a
+/// VM.
+inline void RemapDbgVariableRecord(Module *M, DbgVariableRecord *V,
+                                   ValueToValueMapTy &VM,
+                                   RemapFlags Flags = RF_None,
+                                   ValueMapTypeRemapper *TypeMapper = nullptr,
+                                   ValueMaterializer *Materializer = nullptr) {
+  ValueMapper(VM, Flags, TypeMapper, Materializer)
+      .remapDbgVariableRecord(M, *V);
 }
 
-/// Remap the Values used in the DPValue \a V using the value map \a VM.
-inline void RemapDPValueRange(Module *M,
-                              iterator_range<DbgRecordIterator> Range,
-                              ValueToValueMapTy &VM, RemapFlags Flags = RF_None,
-                              ValueMapTypeRemapper *TypeMapper = nullptr,
-                              ValueMaterializer *Materializer = nullptr) {
-  ValueMapper(VM, Flags, TypeMapper, Materializer).remapDPValueRange(M, Range);
+/// Remap the Values used in the DbgVariableRecord \a V using the value map \a
+/// VM.
+inline void
+RemapDbgVariableRecordRange(Module *M, iterator_range<DbgRecordIterator> Range,
+                            ValueToValueMapTy &VM, RemapFlags Flags = RF_None,
+                            ValueMapTypeRemapper *TypeMapper = nullptr,
+                            ValueMaterializer *Materializer = nullptr) {
+  ValueMapper(VM, Flags, TypeMapper, Materializer)
+      .remapDbgVariableRecordRange(M, Range);
 }
 
 /// Remap the operands, metadata, arguments, and instructions of a function.
diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp
index 8b7031e7fe4a6f..6139b5be85be34 100644
--- a/llvm/lib/Analysis/ConstantFolding.cpp
+++ b/llvm/lib/Analysis/ConstantFolding.cpp
@@ -828,7 +828,7 @@ Constant *SymbolicallyEvaluateBinop(unsigned Opc, Constant *Op0, Constant *Op1,
 /// that they aren't implicitly casted by the getelementptr.
 Constant *CastGEPIndices(Type *SrcElemTy, ArrayRef<Constant *> Ops,
                          Type *ResultTy, bool InBounds,
-                         std::optional<unsigned> InRangeIndex,
+                         std::optional<ConstantRange> InRange,
                          const DataLayout &DL, const TargetLibraryInfo *TLI) {
   Type *IntIdxTy = DL.getIndexType(ResultTy);
   Type *IntIdxScalarTy = IntIdxTy->getScalarType();
@@ -856,8 +856,8 @@ Constant *CastGEPIndices(Type *SrcElemTy, ArrayRef<Constant *> Ops,
   if (!Any)
     return nullptr;
 
-  Constant *C = ConstantExpr::getGetElementPtr(
-      SrcElemTy, Ops[0], NewIdxs, InBounds, InRangeIndex);
+  Constant *C = ConstantExpr::getGetElementPtr(SrcElemTy, Ops[0], NewIdxs,
+                                               InBounds, InRange);
   return ConstantFoldConstant(C, DL, TLI);
 }
 
@@ -866,7 +866,6 @@ Constant *SymbolicallyEvaluateGEP(const GEPOperator *GEP,
                                   ArrayRef<Constant *> Ops,
                                   const DataLayout &DL,
                                   const TargetLibraryInfo *TLI) {
-  const GEPOperator *InnermostGEP = GEP;
   bool InBounds = GEP->isInBounds();
 
   Type *SrcElemTy = GEP->getSourceElementType();
@@ -875,9 +874,8 @@ Constant *SymbolicallyEvaluateGEP(const GEPOperator *GEP,
   if (!SrcElemTy->isSized() || isa<ScalableVectorType>(SrcElemTy))
     return nullptr;
 
-  if (Constant *C = CastGEPIndices(SrcElemTy, Ops, ResTy,
-                                   GEP->isInBounds(), GEP->getInRangeIndex(),
-                                   DL, TLI))
+  if (Constant *C = CastGEPIndices(SrcElemTy, Ops, ResTy, GEP->isInBounds(),
+                                   GEP->getInRange(), DL, TLI))
     return C;
 
   Constant *Ptr = Ops[0];
@@ -896,9 +894,12 @@ Constant *SymbolicallyEvaluateGEP(const GEPOperator *GEP,
       DL.getIndexedOffsetInType(
           SrcElemTy, ArrayRef((Value *const *)Ops.data() + 1, Ops.size() - 1)));
 
+  std::optional<ConstantRange> InRange = GEP->getInRange();
+  if (InRange)
+    InRange = InRange->sextOrTrunc(BitWidth);
+
   // If this is a GEP of a GEP, fold it all into a single GEP.
   while (auto *GEP = dyn_cast<GEPOperator>(Ptr)) {
-    InnermostGEP = GEP;
     InBounds &= GEP->isInBounds();
 
     SmallVector<Value *, 4> NestedOps(llvm::drop_begin(GEP->operands()));
@@ -913,6 +914,14 @@ Constant *SymbolicallyEvaluateGEP(const GEPOperator *GEP,
     if (!AllConstantInt)
       break;
 
+    // TODO: Try to intersect two inrange attributes?
+    if (!InRange) {
+      InRange = GEP->getInRange();
+      if (InRange)
+        // Adjust inrange by offset until now.
+        InRange = InRange->sextOrTrunc(BitWidth).subtract(Offset);
+    }
+
     Ptr = cast<Constant>(GEP->getOperand(0));
     SrcElemTy = GEP->getSourceElementType();
     Offset += APInt(BitWidth, DL.getIndexedOffsetInType(SrcElemTy, NestedOps));
@@ -971,21 +980,8 @@ Constant *SymbolicallyEvaluateGEP(const GEPOperator *GEP,
     NewIdxs.push_back(ConstantInt::get(
         Type::getIntNTy(Ptr->getContext(), Index.getBitWidth()), Index));
 
-  // Preserve the inrange index from the innermost GEP if possible. We must
-  // have calculated the same indices up to and including the inrange index.
-  std::optional<unsigned> InRangeIndex;
-  if (std::optional<unsigned> LastIRIndex = InnermostGEP->getInRangeIndex())
-    if (SrcElemTy == InnermostGEP->getSourceElementType() &&
-        NewIdxs.size() > *LastIRIndex) {
-      InRangeIndex = LastIRIndex;
-      for (unsigned I = 0; I <= *LastIRIndex; ++I)
-        if (NewIdxs[I] != InnermostGEP->getOperand(I + 1))
-          return nullptr;
-    }
-
-  // Create a GEP.
   return ConstantExpr::getGetElementPtr(SrcElemTy, Ptr, NewIdxs, InBounds,
-                                        InRangeIndex);
+                                        InRange);
 }
 
 /// Attempt to constant fold an instruction with the
@@ -1033,8 +1029,7 @@ Constant *ConstantFoldInstOperandsImpl(const Value *InstOrCE, unsigned Opcode,
       return C;
 
     return ConstantExpr::getGetElementPtr(SrcElemTy, Ops[0], Ops.slice(1),
-                                          GEP->isInBounds(),
-                                          GEP->getInRangeIndex());
+                                          GEP->isInBounds(), GEP->getInRange());
   }
 
   if (auto *CE = dyn_cast<ConstantExpr>(InstOrCE)) {
@@ -2534,12 +2529,73 @@ static Constant *evaluateCompare(const APFloat &Op1, const APFloat &Op2,
   return nullptr;
 }
 
-static Constant *ConstantFoldScalarCall2(StringRef Name,
-                                         Intrinsic::ID IntrinsicID,
-                                         Type *Ty,
-                                         ArrayRef<Constant *> Operands,
-                                         const TargetLibraryInfo *TLI,
-                                         const CallBase *Call) {
+static Constant *ConstantFoldLibCall2(StringRef Name, Type *Ty,
+                                      ArrayRef<Constant *> Operands,
+                                      const TargetLibraryInfo *TLI) {
+  if (!TLI)
+    return nullptr;
+
+  LibFunc Func = NotLibFunc;
+  if (!TLI->getLibFunc(Name, Func))
+    return nullptr;
+
+  const auto *Op1 = dyn_cast<ConstantFP>(Operands[0]);
+  if (!Op1)
+    return nullptr;
+
+  const auto *Op2 = dyn_cast<ConstantFP>(Operands[1]);
+  if (!Op2)
+    return nullptr;
+
+  const APFloat &Op1V = Op1->getValueAPF();
+  const APFloat &Op2V = Op2->getValueAPF();
+
+  switch (Func) {
+  default:
+    break;
+  case LibFunc_pow:
+  case LibFunc_powf:
+  case LibFunc_pow_finite:
+  case LibFunc_powf_finite:
+    if (TLI->has(Func))
+      return ConstantFoldBinaryFP(pow, Op1V, Op2V, Ty);
+    break;
+  case LibFunc_fmod:
+  case LibFunc_fmodf:
+    if (TLI->has(Func)) {
+      APFloat V = Op1->getValueAPF();
+      if (APFloat::opStatus::opOK == V.mod(Op2->getValueAPF()))
+        return ConstantFP::get(Ty->getContext(), V);
+    }
+    break;
+  case LibFunc_remainder:
+  case LibFunc_remainderf:
+    if (TLI->has(Func)) {
+      APFloat V = Op1->getValueAPF();
+      if (APFloat::opStatus::opOK == V.remainder(Op2->getValueAPF()))
+        return ConstantFP::get(Ty->getContext(), V);
+    }
+    break;
+  case LibFunc_atan2:
+  case LibFunc_atan2f:
+    // atan2(+/-0.0, +/-0.0) is known to raise an exception on some libm
+    // (Solaris), so we do not assume a known result for that.
+    if (Op1V.isZero() && Op2V.isZero())
+      return nullptr;
+    [[fallthrough]];
+  case LibFunc_atan2_finite:
+  case LibFunc_atan2f_finite:
+    if (TLI->has(Func))
+      return ConstantFoldBinaryFP(atan2, Op1V, Op2V, Ty);
+    break;
+  }
+
+  return nullptr;
+}
+
+static Constant *ConstantFoldIntrinsicCall2(Intrinsic::ID IntrinsicID, Type *Ty,
+                                            ArrayRef<Constant *> Operands,
+                                            const CallBase *Call) {
   assert(Operands.size() == 2 && "Wrong number of operands.");
 
   if (Ty->isFloatingPointTy()) {
@@ -2569,7 +2625,8 @@ static Constant *ConstantFoldScalarCall2(StringRef Name,
         return nullptr;
       const APFloat &Op2V = Op2->getValueAPF();
 
-      if (const auto *ConstrIntr = dyn_cast<ConstrainedFPIntrinsic>(Call)) {
+      if (const auto *ConstrIntr =
+              dyn_cast_if_present<ConstrainedFPIntrinsic>(Call)) {
         RoundingMode RM = getEvaluationRoundingMode(ConstrIntr);
         APFloat Res = Op1V;
         APFloat::opStatus St;
@@ -2632,52 +2689,6 @@ static Constant *ConstantFoldScalarCall2(StringRef Name,
         return ConstantFP::get(Ty->getContext(), Op1V * Op2V);
       }
 
-      if (!TLI)
-        return nullptr;
-
-      LibFunc Func = NotLibFunc;
-      if (!TLI->getLibFunc(Name, Func))
-        return nullptr;
-
-      switch (Func) {
-      default:
-        break;
-      case LibFunc_pow:
-      case LibFunc_powf:
-      case LibFunc_pow_finite:
-      case LibFunc_powf_finite:
-        if (TLI->has(Func))
-          return ConstantFoldBinaryFP(pow, Op1V, Op2V, Ty);
-        break;
-      case LibFunc_fmod:
-      case LibFunc_fmodf:
-        if (TLI->has(Func)) {
-          APFloat V = Op1->getValueAPF();
-          if (APFloat::opStatus::opOK == V.mod(Op2->getValueAPF()))
-            return ConstantFP::get(Ty->getContext(), V);
-        }
-        break;
-      case LibFunc_remainder:
-      case LibFunc_remainderf:
-        if (TLI->has(Func)) {
-          APFloat V = Op1->getValueAPF();
-          if (APFloat::opStatus::opOK == V.remainder(Op2->getValueAPF()))
-            return ConstantFP::get(Ty->getContext(), V);
-        }
-        break;
-      case LibFunc_atan2:
-      case LibFunc_atan2f:
-        // atan2(+/-0.0, +/-0.0) is known to raise an exception on some libm
-        // (Solaris), so we do not assume a known result for that.
-        if (Op1V.isZero() && Op2V.isZero())
-          return nullptr;
-        [[fallthrough]];
-      case LibFunc_atan2_finite:
-      case LibFunc_atan2f_finite:
-        if (TLI->has(Func))
-          return ConstantFoldBinaryFP(atan2, Op1V, Op2V, Ty);
-        break;
-      }
     } else if (auto *Op2C = dyn_cast<ConstantInt>(Operands[1])) {
       switch (IntrinsicID) {
       case Intrinsic::ldexp: {
@@ -3168,8 +3179,13 @@ static Constant *ConstantFoldScalarCall(StringRef Name,
   if (Operands.size() == 1)
     return ConstantFoldScalarCall1(Name, IntrinsicID, Ty, Operands, TLI, Call);
 
-  if (Operands.size() == 2)
-    return ConstantFoldScalarCall2(Name, IntrinsicID, Ty, Operands, TLI, Call);
+  if (Operands.size() == 2) {
+    if (Constant *FoldedLibCall =
+            ConstantFoldLibCall2(Name, Ty, Operands, TLI)) {
+      return FoldedLibCall;
+    }
+    return ConstantFoldIntrinsicCall2(IntrinsicID, Ty, Operands, Call);
+  }
 
   if (Operands.size() == 3)
     return ConstantFoldScalarCall3(Name, IntrinsicID, Ty, Operands, TLI, Call);
@@ -3376,6 +3392,13 @@ ConstantFoldStructCall(StringRef Name, Intrinsic::ID IntrinsicID,
 
 } // end anonymous namespace
 
+Constant *llvm::ConstantFoldBinaryIntrinsic(Intrinsic::ID ID, Constant *LHS,
+                                            Constant *RHS, Type *Ty,
+                                            Instruction *FMFSource) {
+  return ConstantFoldIntrinsicCall2(ID, Ty, {LHS, RHS},
+                                    dyn_cast_if_present<CallBase>(FMFSource));
+}
+
 Constant *llvm::ConstantFoldCall(const CallBase *Call, Function *F,
                                  ArrayRef<Constant *> Operands,
                                  const TargetLibraryInfo *TLI) {
diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp
index ce651783caf16b..06283cb4bb934b 100644
--- a/llvm/lib/Analysis/InstructionSimplify.cpp
+++ b/llvm/lib/Analysis/InstructionSimplify.cpp
@@ -5767,11 +5767,16 @@ static Value *simplifyFMAFMul(Value *Op0, Value *Op1, FastMathFlags FMF,
     if (FMF.noNaNs() && FMF.noSignedZeros())
       return ConstantFP::getZero(Op0->getType());
 
-    // +normal number * (-)0.0 --> (-)0.0
     KnownFPClass Known =
         computeKnownFPClass(Op0, FMF, fcInf | fcNan, /*Depth=*/0, Q);
-    if (Known.SignBit == false && Known.isKnownNever(fcInf | fcNan))
-      return Op1;
+    if (Known.isKnownNever(fcInf | fcNan)) {
+      // +normal number * (-)0.0 --> (-)0.0
+      if (Known.SignBit == false)
+        return Op1;
+      // -normal number * (-)0.0 --> -(-)0.0
+      if (Known.SignBit == true)
+        return foldConstant(Instruction::FNeg, Op1, Q);
+    }
   }
 
   // sqrt(X) * sqrt(X) --> X, if we can:
@@ -6384,11 +6389,10 @@ static Value *foldMinimumMaximumSharedOp(Intrinsic::ID IID, Value *Op0,
   return nullptr;
 }
 
-static Value *simplifyBinaryIntrinsic(Function *F, Value *Op0, Value *Op1,
-                                      const SimplifyQuery &Q,
-                                      const CallBase *Call) {
-  Intrinsic::ID IID = F->getIntrinsicID();
-  Type *ReturnType = F->getReturnType();
+Value *llvm::simplifyBinaryIntrinsic(Intrinsic::ID IID, Type *ReturnType,
+                                     Value *Op0, Value *Op1,
+                                     const SimplifyQuery &Q,
+                                     const CallBase *Call) {
   unsigned BitWidth = ReturnType->getScalarSizeInBits();
   switch (IID) {
   case Intrinsic::abs:
@@ -6646,19 +6650,21 @@ static Value *simplifyBinaryIntrinsic(Function *F, Value *Op0, Value *Op1,
     // float, if the ninf flag is set.
     const APFloat *C;
     if (match(Op1, m_APFloat(C)) &&
-        (C->isInfinity() || (Call->hasNoInfs() && C->isLargest()))) {
+        (C->isInfinity() || (Call && Call->hasNoInfs() && C->isLargest()))) {
       // minnum(X, -inf) -> -inf
       // maxnum(X, +inf) -> +inf
       // minimum(X, -inf) -> -inf if nnan
       // maximum(X, +inf) -> +inf if nnan
-      if (C->isNegative() == IsMin && (!PropagateNaN || Call->hasNoNaNs()))
+      if (C->isNegative() == IsMin &&
+          (!PropagateNaN || (Call && Call->hasNoNaNs())))
         return ConstantFP::get(ReturnType, *C);
 
       // minnum(X, +inf) -> X if nnan
       // maxnum(X, -inf) -> X if nnan
       // minimum(X, +inf) -> X
       // maximum(X, -inf) -> X
-      if (C->isNegative() != IsMin && (PropagateNaN || Call->hasNoNaNs()))
+      if (C->isNegative() != IsMin &&
+          (PropagateNaN || (Call && Call->hasNoNaNs())))
         return Op0;
     }
 
@@ -6672,8 +6678,6 @@ static Value *simplifyBinaryIntrinsic(Function *F, Value *Op0, Value *Op1,
     break;
   }
   case Intrinsic::vector_extract: {
-    Type *ReturnType = F->getReturnType();
-
     // (extract_vector (insert_vector _, X, 0), 0) -> X
     unsigned IdxN = cast<ConstantInt>(Op1)->getZExtValue();
     Value *X = nullptr;
@@ -6720,7 +6724,8 @@ static Value *simplifyIntrinsic(CallBase *Call, Value *Callee,
     return simplifyUnaryIntrinsic(F, Args[0], Q, Call);
 
   if (NumOperands == 2)
-    return simplifyBinaryIntrinsic(F, Args[0], Args[1], Q, Call);
+    return simplifyBinaryIntrinsic(IID, F->getReturnType(), Args[0], Args[1], Q,
+                                   Call);
 
   // Handle intrinsics with 3 or more arguments.
   switch (IID) {
diff --git a/llvm/lib/Analysis/PHITransAddr.cpp b/llvm/lib/Analysis/PHITransAddr.cpp
index 5700fd664a4cd3..d16dec0d42927d 100644
--- a/llvm/lib/Analysis/PHITransAddr.cpp
+++ b/llvm/lib/Analysis/PHITransAddr.cpp
@@ -369,7 +369,7 @@ Value *PHITransAddr::insertTranslatedSubExpr(
     // Otherwise insert a cast at the end of PredBB.
     CastInst *New = CastInst::Create(Cast->getOpcode(), OpVal, InVal->getType(),
                                      InVal->getName() + ".phi.trans.insert",
-                                     PredBB->getTerminator());
+                                     PredBB->getTerminator()->getIterator());
     New->setDebugLoc(Inst->getDebugLoc());
     NewInsts.push_back(New);
     return New;
@@ -387,7 +387,8 @@ Value *PHITransAddr::insertTranslatedSubExpr(
 
     GetElementPtrInst *Result = GetElementPtrInst::Create(
         GEP->getSourceElementType(), GEPOps[0], ArrayRef(GEPOps).slice(1),
-        InVal->getName() + ".phi.trans.insert", PredBB->getTerminator());
+        InVal->getName() + ".phi.trans.insert",
+        PredBB->getTerminator()->getIterator());
     Result->setDebugLoc(Inst->getDebugLoc());
     Result->setIsInBounds(GEP->isInBounds());
     NewInsts.push_back(Result);
@@ -408,9 +409,9 @@ Value *PHITransAddr::insertTranslatedSubExpr(
     if (OpVal == nullptr)
       return nullptr;
 
-    BinaryOperator *Res = BinaryOperator::CreateAdd(OpVal, Inst->getOperand(1),
-                                           InVal->getName()+".phi.trans.insert",
-                                                    PredBB->getTerminator());
+    BinaryOperator *Res = BinaryOperator::CreateAdd(
+        OpVal, Inst->getOperand(1), InVal->getName() + ".phi.trans.insert",
+        PredBB->getTerminator()->getIterator());
     Res->setHasNoSignedWrap(cast<BinaryOperator>(Inst)->hasNoSignedWrap());
     Res->setHasNoUnsignedWrap(cast<BinaryOperator>(Inst)->hasNoUnsignedWrap());
     NewInsts.push_back(Res);
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 15311be4dba277..5f933b4587843c 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -9,6 +9,7 @@
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Analysis/CFG.h"
 #include "llvm/Analysis/LoopIterator.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Analysis/TargetTransformInfoImpl.h"
 #include "llvm/IR/CFG.h"
 #include "llvm/IR/Dominators.h"
@@ -395,6 +396,10 @@ bool TargetTransformInfo::isLegalAddImmediate(int64_t Imm) const {
   return TTIImpl->isLegalAddImmediate(Imm);
 }
 
+bool TargetTransformInfo::isLegalAddScalableImmediate(int64_t Imm) const {
+  return TTIImpl->isLegalAddScalableImmediate(Imm);
+}
+
 bool TargetTransformInfo::isLegalICmpImmediate(int64_t Imm) const {
   return TTIImpl->isLegalICmpImmediate(Imm);
 }
@@ -403,9 +408,10 @@ bool TargetTransformInfo::isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV,
                                                 int64_t BaseOffset,
                                                 bool HasBaseReg, int64_t Scale,
                                                 unsigned AddrSpace,
-                                                Instruction *I) const {
+                                                Instruction *I,
+                                                int64_t ScalableOffset) const {
   return TTIImpl->isLegalAddressingMode(Ty, BaseGV, BaseOffset, HasBaseReg,
-                                        Scale, AddrSpace, I);
+                                        Scale, AddrSpace, I, ScalableOffset);
 }
 
 bool TargetTransformInfo::isLSRCostLess(const LSRCost &C1,
@@ -874,7 +880,22 @@ TargetTransformInfo::getOperandInfo(const Value *V) {
 InstructionCost TargetTransformInfo::getArithmeticInstrCost(
     unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
     OperandValueInfo Op1Info, OperandValueInfo Op2Info,
-    ArrayRef<const Value *> Args, const Instruction *CxtI) const {
+    ArrayRef<const Value *> Args, const Instruction *CxtI,
+    const TargetLibraryInfo *TLibInfo) const {
+
+  // Use call cost for frem intructions that have platform specific vector math
+  // functions, as those will be replaced with calls later by SelectionDAG or
+  // ReplaceWithVecLib pass.
+  if (TLibInfo && Opcode == Instruction::FRem) {
+    VectorType *VecTy = dyn_cast<VectorType>(Ty);
+    LibFunc Func;
+    if (VecTy &&
+        TLibInfo->getLibFunc(Instruction::FRem, Ty->getScalarType(), Func) &&
+        TLibInfo->isFunctionVectorizable(TLibInfo->getName(Func),
+                                         VecTy->getElementCount()))
+      return getCallInstrCost(nullptr, VecTy, {VecTy, VecTy}, CostKind);
+  }
+
   InstructionCost Cost =
       TTIImpl->getArithmeticInstrCost(Opcode, Ty, CostKind,
                                       Op1Info, Op2Info,
diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp
index edbeede910d7f7..58bc68b576b0b2 100644
--- a/llvm/lib/Analysis/ValueTracking.cpp
+++ b/llvm/lib/Analysis/ValueTracking.cpp
@@ -2759,27 +2759,33 @@ static bool isKnownNonZeroFromOperator(const Operator *I,
     auto *LI = cast<LoadInst>(I);
     // A Load tagged with nonnull or dereferenceable with null pointer undefined
     // is never null.
-    if (auto *PtrT = dyn_cast<PointerType>(I->getType()))
+    if (auto *PtrT = dyn_cast<PointerType>(I->getType())) {
       if (Q.IIQ.getMetadata(LI, LLVMContext::MD_nonnull) ||
           (Q.IIQ.getMetadata(LI, LLVMContext::MD_dereferenceable) &&
            !NullPointerIsDefined(LI->getFunction(), PtrT->getAddressSpace())))
         return true;
+    } else if (MDNode *Ranges = Q.IIQ.getMetadata(LI, LLVMContext::MD_range)) {
+      return rangeMetadataExcludesValue(Ranges, APInt::getZero(BitWidth));
+    }
 
     // No need to fall through to computeKnownBits as range metadata is already
     // handled in isKnownNonZero.
     return false;
   }
   case Instruction::Call:
-  case Instruction::Invoke:
+  case Instruction::Invoke: {
+    const auto *Call = cast<CallBase>(I);
     if (I->getType()->isPointerTy()) {
-      const auto *Call = cast<CallBase>(I);
       if (Call->isReturnNonNull())
         return true;
       if (const auto *RP = getArgumentAliasingToReturnedPointer(Call, true))
         return isKnownNonZero(RP, Depth, Q);
-    } else if (const Value *RV = cast<CallBase>(I)->getReturnedArgOperand()) {
-      if (RV->getType() == I->getType() && isKnownNonZero(RV, Depth, Q))
-        return true;
+    } else {
+      if (MDNode *Ranges = Q.IIQ.getMetadata(Call, LLVMContext::MD_range))
+        return rangeMetadataExcludesValue(Ranges, APInt::getZero(BitWidth));
+      if (const Value *RV = Call->getReturnedArgOperand())
+        if (RV->getType() == I->getType() && isKnownNonZero(RV, Depth, Q))
+          return true;
     }
 
     if (auto *II = dyn_cast<IntrinsicInst>(I)) {
@@ -2849,6 +2855,7 @@ static bool isKnownNonZeroFromOperator(const Operator *I,
 
     return false;
   }
+  }
 
   KnownBits Known(BitWidth);
   computeKnownBits(I, DemandedElts, Known, Depth, Q);
@@ -2863,9 +2870,9 @@ static bool isKnownNonZeroFromOperator(const Operator *I,
 /// Supports values with integer or pointer type and vectors of integers.
 bool isKnownNonZero(const Value *V, const APInt &DemandedElts, unsigned Depth,
                     const SimplifyQuery &Q) {
+  Type *Ty = V->getType();
 
 #ifndef NDEBUG
-  Type *Ty = V->getType();
   assert(Depth <= MaxAnalysisRecursionDepth && "Limit Search Depth");
 
   if (auto *FVTy = dyn_cast<FixedVectorType>(Ty)) {
@@ -2887,7 +2894,7 @@ bool isKnownNonZero(const Value *V, const APInt &DemandedElts, unsigned Depth,
 
     // For constant vectors, check that all elements are undefined or known
     // non-zero to determine that the whole vector is known non-zero.
-    if (auto *VecTy = dyn_cast<FixedVectorType>(C->getType())) {
+    if (auto *VecTy = dyn_cast<FixedVectorType>(Ty)) {
       for (unsigned i = 0, e = VecTy->getNumElements(); i != e; ++i) {
         if (!DemandedElts[i])
           continue;
@@ -2914,18 +2921,6 @@ bool isKnownNonZero(const Value *V, const APInt &DemandedElts, unsigned Depth,
       return false;
   }
 
-  if (auto *I = dyn_cast<Instruction>(V)) {
-    if (MDNode *Ranges = Q.IIQ.getMetadata(I, LLVMContext::MD_range)) {
-      // If the possible ranges don't contain zero, then the value is
-      // definitely non-zero.
-      if (auto *Ty = dyn_cast<IntegerType>(V->getType())) {
-        const APInt ZeroValue(Ty->getBitWidth(), 0);
-        if (rangeMetadataExcludesValue(Ranges, ZeroValue))
-          return true;
-      }
-    }
-  }
-
   if (!isa<Constant>(V) && isKnownNonZeroFromAssume(V, Q))
     return true;
 
@@ -2935,7 +2930,7 @@ bool isKnownNonZero(const Value *V, const APInt &DemandedElts, unsigned Depth,
 
   // Check for pointer simplifications.
 
-  if (PointerType *PtrTy = dyn_cast<PointerType>(V->getType())) {
+  if (PointerType *PtrTy = dyn_cast<PointerType>(Ty)) {
     // A byval, inalloca may not be null in a non-default addres space. A
     // nonnull argument is assumed never 0.
     if (const Argument *A = dyn_cast<Argument>(V)) {
@@ -3972,10 +3967,16 @@ std::tuple<Value *, FPClassTest, FPClassTest>
 llvm::fcmpImpliesClass(CmpInst::Predicate Pred, const Function &F, Value *LHS,
                        FPClassTest RHSClass, bool LookThroughSrc) {
   assert(RHSClass != fcNone);
+  Value *Src = LHS;
+
+  if (Pred == FCmpInst::FCMP_TRUE)
+    return exactClass(Src, fcAllFlags);
+
+  if (Pred == FCmpInst::FCMP_FALSE)
+    return exactClass(Src, fcNone);
 
   const FPClassTest OrigClass = RHSClass;
 
-  Value *Src = LHS;
   const bool IsNegativeRHS = (RHSClass & fcNegative) == RHSClass;
   const bool IsPositiveRHS = (RHSClass & fcPositive) == RHSClass;
   const bool IsNaN = (RHSClass & ~fcNan) == fcNone;
@@ -3994,12 +3995,6 @@ llvm::fcmpImpliesClass(CmpInst::Predicate Pred, const Function &F, Value *LHS,
   if (Pred == FCmpInst::FCMP_UNO)
     return exactClass(Src, fcNan);
 
-  if (Pred == FCmpInst::FCMP_TRUE)
-    return exactClass(Src, fcAllFlags);
-
-  if (Pred == FCmpInst::FCMP_FALSE)
-    return exactClass(Src, fcNone);
-
   const bool IsFabs = LookThroughSrc && match(LHS, m_FAbs(m_Value(Src)));
   if (IsFabs)
     RHSClass = llvm::inverse_fabs(RHSClass);
@@ -8471,26 +8466,12 @@ isImpliedCondOperands(CmpInst::Predicate Pred, const Value *ALHS,
   }
 }
 
-/// Return true if the operands of two compares (expanded as "L0 pred L1" and
-/// "R0 pred R1") match. IsSwappedOps is true when the operands match, but are
-/// swapped.
-static bool areMatchingOperands(const Value *L0, const Value *L1, const Value *R0,
-                           const Value *R1, bool &AreSwappedOps) {
-  bool AreMatchingOps = (L0 == R0 && L1 == R1);
-  AreSwappedOps = (L0 == R1 && L1 == R0);
-  return AreMatchingOps || AreSwappedOps;
-}
-
 /// Return true if "icmp1 LPred X, Y" implies "icmp2 RPred X, Y" is true.
 /// Return false if "icmp1 LPred X, Y" implies "icmp2 RPred X, Y" is false.
 /// Otherwise, return std::nullopt if we can't infer anything.
 static std::optional<bool>
 isImpliedCondMatchingOperands(CmpInst::Predicate LPred,
-                              CmpInst::Predicate RPred, bool AreSwappedOps) {
-  // Canonicalize the predicate as if the operands were not commuted.
-  if (AreSwappedOps)
-    RPred = ICmpInst::getSwappedPredicate(RPred);
-
+                              CmpInst::Predicate RPred) {
   if (CmpInst::isImpliedTrueByMatchingCmp(LPred, RPred))
     return true;
   if (CmpInst::isImpliedFalseByMatchingCmp(LPred, RPred))
@@ -8532,6 +8513,26 @@ static std::optional<bool> isImpliedCondICmps(const ICmpInst *LHS,
   CmpInst::Predicate LPred =
       LHSIsTrue ? LHS->getPredicate() : LHS->getInversePredicate();
 
+  // We can have non-canonical operands, so try to normalize any common operand
+  // to L0/R0.
+  if (L0 == R1) {
+    std::swap(R0, R1);
+    RPred = ICmpInst::getSwappedPredicate(RPred);
+  }
+  if (R0 == L1) {
+    std::swap(L0, L1);
+    LPred = ICmpInst::getSwappedPredicate(LPred);
+  }
+  if (L1 == R1) {
+    // If we have L0 == R0 and L1 == R1, then make L1/R1 the constants.
+    if (L0 != R0 || match(L0, m_ImmConstant())) {
+      std::swap(L0, L1);
+      LPred = ICmpInst::getSwappedPredicate(LPred);
+      std::swap(R0, R1);
+      RPred = ICmpInst::getSwappedPredicate(RPred);
+    }
+  }
+
   // Can we infer anything when the 0-operands match and the 1-operands are
   // constants (not necessarily matching)?
   const APInt *LC, *RC;
@@ -8539,32 +8540,15 @@ static std::optional<bool> isImpliedCondICmps(const ICmpInst *LHS,
     return isImpliedCondCommonOperandWithConstants(LPred, *LC, RPred, *RC);
 
   // Can we infer anything when the two compares have matching operands?
-  bool AreSwappedOps;
-  if (areMatchingOperands(L0, L1, R0, R1, AreSwappedOps))
-    return isImpliedCondMatchingOperands(LPred, RPred, AreSwappedOps);
+  if (L0 == R0 && L1 == R1)
+    return isImpliedCondMatchingOperands(LPred, RPred);
 
   // L0 = R0 = L1 + R1, L0 >=u L1 implies R0 >=u R1, L0 <u L1 implies R0 <u R1
-  if (ICmpInst::isUnsigned(LPred) && ICmpInst::isUnsigned(RPred)) {
-    if (L0 == R1) {
-      std::swap(R0, R1);
-      RPred = ICmpInst::getSwappedPredicate(RPred);
-    }
-    if (L1 == R0) {
-      std::swap(L0, L1);
-      LPred = ICmpInst::getSwappedPredicate(LPred);
-    }
-    if (L1 == R1) {
-      std::swap(L0, L1);
-      LPred = ICmpInst::getSwappedPredicate(LPred);
-      std::swap(R0, R1);
-      RPred = ICmpInst::getSwappedPredicate(RPred);
-    }
-    if (L0 == R0 &&
-        (LPred == ICmpInst::ICMP_ULT || LPred == ICmpInst::ICMP_UGE) &&
-        (RPred == ICmpInst::ICMP_ULT || RPred == ICmpInst::ICMP_UGE) &&
-        match(L0, m_c_Add(m_Specific(L1), m_Specific(R1))))
-      return LPred == RPred;
-  }
+  if (L0 == R0 &&
+      (LPred == ICmpInst::ICMP_ULT || LPred == ICmpInst::ICMP_UGE) &&
+      (RPred == ICmpInst::ICMP_ULT || RPred == ICmpInst::ICMP_UGE) &&
+      match(L0, m_c_Add(m_Specific(L1), m_Specific(R1))))
+    return LPred == RPred;
 
   if (LPred == RPred)
     return isImpliedCondOperands(LPred, L0, L1, R0, R1, DL, Depth);
@@ -8622,6 +8606,10 @@ llvm::isImpliedCondition(const Value *LHS, CmpInst::Predicate RHSPred,
   assert(LHS->getType()->isIntOrIntVectorTy(1) &&
          "Expected integer type only!");
 
+  // Match not
+  if (match(LHS, m_Not(m_Value(LHS))))
+    LHSIsTrue = !LHSIsTrue;
+
   // Both LHS and RHS are icmps.
   const ICmpInst *LHSCmp = dyn_cast<ICmpInst>(LHS);
   if (LHSCmp)
@@ -8648,10 +8636,21 @@ std::optional<bool> llvm::isImpliedCondition(const Value *LHS, const Value *RHS,
   if (LHS == RHS)
     return LHSIsTrue;
 
-  if (const ICmpInst *RHSCmp = dyn_cast<ICmpInst>(RHS))
-    return isImpliedCondition(LHS, RHSCmp->getPredicate(),
-                              RHSCmp->getOperand(0), RHSCmp->getOperand(1), DL,
-                              LHSIsTrue, Depth);
+  // Match not
+  bool InvertRHS = false;
+  if (match(RHS, m_Not(m_Value(RHS)))) {
+    if (LHS == RHS)
+      return !LHSIsTrue;
+    InvertRHS = true;
+  }
+
+  if (const ICmpInst *RHSCmp = dyn_cast<ICmpInst>(RHS)) {
+    if (auto Implied = isImpliedCondition(
+            LHS, RHSCmp->getPredicate(), RHSCmp->getOperand(0),
+            RHSCmp->getOperand(1), DL, LHSIsTrue, Depth))
+      return InvertRHS ? !*Implied : *Implied;
+    return std::nullopt;
+  }
 
   if (Depth == MaxAnalysisRecursionDepth)
     return std::nullopt;
@@ -8663,21 +8662,21 @@ std::optional<bool> llvm::isImpliedCondition(const Value *LHS, const Value *RHS,
     if (std::optional<bool> Imp =
             isImpliedCondition(LHS, RHS1, DL, LHSIsTrue, Depth + 1))
       if (*Imp == true)
-        return true;
+        return !InvertRHS;
     if (std::optional<bool> Imp =
             isImpliedCondition(LHS, RHS2, DL, LHSIsTrue, Depth + 1))
       if (*Imp == true)
-        return true;
+        return !InvertRHS;
   }
   if (match(RHS, m_LogicalAnd(m_Value(RHS1), m_Value(RHS2)))) {
     if (std::optional<bool> Imp =
             isImpliedCondition(LHS, RHS1, DL, LHSIsTrue, Depth + 1))
       if (*Imp == false)
-        return false;
+        return InvertRHS;
     if (std::optional<bool> Imp =
             isImpliedCondition(LHS, RHS2, DL, LHSIsTrue, Depth + 1))
       if (*Imp == false)
-        return false;
+        return InvertRHS;
   }
 
   return std::nullopt;
diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp
index 2e0f5ba82220c9..8738cb47dd9ccb 100644
--- a/llvm/lib/AsmParser/LLParser.cpp
+++ b/llvm/lib/AsmParser/LLParser.cpp
@@ -4169,11 +4169,32 @@ bool LLParser::parseValID(ValID &ID, PerFunctionState *PFS, Type *ExpectedTy) {
     unsigned Opc = Lex.getUIntVal();
     SmallVector<Constant*, 16> Elts;
     bool InBounds = false;
+    bool HasInRange = false;
+    APSInt InRangeStart;
+    APSInt InRangeEnd;
     Type *Ty;
     Lex.Lex();
 
-    if (Opc == Instruction::GetElementPtr)
+    if (Opc == Instruction::GetElementPtr) {
       InBounds = EatIfPresent(lltok::kw_inbounds);
+      if (EatIfPresent(lltok::kw_inrange)) {
+        if (parseToken(lltok::lparen, "expected '('"))
+          return true;
+        if (Lex.getKind() != lltok::APSInt)
+          return tokError("expected integer");
+        InRangeStart = Lex.getAPSIntVal();
+        Lex.Lex();
+        if (parseToken(lltok::comma, "expected ','"))
+          return true;
+        if (Lex.getKind() != lltok::APSInt)
+          return tokError("expected integer");
+        InRangeEnd = Lex.getAPSIntVal();
+        Lex.Lex();
+        if (parseToken(lltok::rparen, "expected ')'"))
+          return true;
+        HasInRange = true;
+      }
+    }
 
     if (parseToken(lltok::lparen, "expected '(' in constantexpr"))
       return true;
@@ -4184,9 +4205,7 @@ bool LLParser::parseValID(ValID &ID, PerFunctionState *PFS, Type *ExpectedTy) {
         return true;
     }
 
-    std::optional<unsigned> InRangeOp;
-    if (parseGlobalValueVector(
-            Elts, Opc == Instruction::GetElementPtr ? &InRangeOp : nullptr) ||
+    if (parseGlobalValueVector(Elts) ||
         parseToken(lltok::rparen, "expected ')' in constantexpr"))
       return true;
 
@@ -4196,6 +4215,17 @@ bool LLParser::parseValID(ValID &ID, PerFunctionState *PFS, Type *ExpectedTy) {
         return error(ID.Loc, "base of getelementptr must be a pointer");
 
       Type *BaseType = Elts[0]->getType();
+      std::optional<ConstantRange> InRange;
+      if (HasInRange) {
+        unsigned IndexWidth =
+            M->getDataLayout().getIndexTypeSizeInBits(BaseType);
+        InRangeStart = InRangeStart.extOrTrunc(IndexWidth);
+        InRangeEnd = InRangeEnd.extOrTrunc(IndexWidth);
+        if (InRangeStart.sge(InRangeEnd))
+          return error(ID.Loc, "expected end to be larger than start");
+        InRange = ConstantRange::getNonEmpty(InRangeStart, InRangeEnd);
+      }
+
       unsigned GEPWidth =
           BaseType->isVectorTy()
               ? cast<FixedVectorType>(BaseType)->getNumElements()
@@ -4225,15 +4255,8 @@ bool LLParser::parseValID(ValID &ID, PerFunctionState *PFS, Type *ExpectedTy) {
       if (!GetElementPtrInst::getIndexedType(Ty, Indices))
         return error(ID.Loc, "invalid getelementptr indices");
 
-      if (InRangeOp) {
-        if (*InRangeOp == 0)
-          return error(ID.Loc,
-                       "inrange keyword may not appear on pointer operand");
-        --*InRangeOp;
-      }
-
       ID.ConstantVal = ConstantExpr::getGetElementPtr(Ty, Elts[0], Indices,
-                                                      InBounds, InRangeOp);
+                                                      InBounds, InRange);
     } else if (Opc == Instruction::ShuffleVector) {
       if (Elts.size() != 3)
         return error(ID.Loc, "expected three operands to shufflevector");
@@ -4309,9 +4332,8 @@ bool LLParser::parseOptionalComdat(StringRef GlobalName, Comdat *&C) {
 
 /// parseGlobalValueVector
 ///   ::= /*empty*/
-///   ::= [inrange] TypeAndValue (',' [inrange] TypeAndValue)*
-bool LLParser::parseGlobalValueVector(SmallVectorImpl<Constant *> &Elts,
-                                      std::optional<unsigned> *InRangeOp) {
+///   ::= TypeAndValue (',' TypeAndValue)*
+bool LLParser::parseGlobalValueVector(SmallVectorImpl<Constant *> &Elts) {
   // Empty list.
   if (Lex.getKind() == lltok::rbrace ||
       Lex.getKind() == lltok::rsquare ||
@@ -4320,8 +4342,9 @@ bool LLParser::parseGlobalValueVector(SmallVectorImpl<Constant *> &Elts,
     return false;
 
   do {
-    if (InRangeOp && !*InRangeOp && EatIfPresent(lltok::kw_inrange))
-      *InRangeOp = Elts.size();
+    // Let the caller deal with inrange.
+    if (Lex.getKind() == lltok::kw_inrange)
+      return false;
 
     Constant *C;
     if (parseGlobalTypeAndValue(C))
@@ -5184,7 +5207,11 @@ bool LLParser::parseDIStringType(MDNode *&Result, bool IsDistinct) {
 ///   ::= !DIDerivedType(tag: DW_TAG_pointer_type, name: "int", file: !0,
 ///                      line: 7, scope: !1, baseType: !2, size: 32,
 ///                      align: 32, offset: 0, flags: 0, extraData: !3,
-///                      dwarfAddressSpace: 3)
+///                      dwarfAddressSpace: 3, ptrAuthKey: 1,
+///                      ptrAuthIsAddressDiscriminated: true,
+///                      ptrAuthExtraDiscriminator: 0x1234,
+///                      ptrAuthIsaPointer: 1, ptrAuthAuthenticatesNullValues:1
+///                      )
 bool LLParser::parseDIDerivedType(MDNode *&Result, bool IsDistinct) {
 #define VISIT_MD_FIELDS(OPTIONAL, REQUIRED)                                    \
   REQUIRED(tag, DwarfTagField, );                                              \
@@ -5199,19 +5226,30 @@ bool LLParser::parseDIDerivedType(MDNode *&Result, bool IsDistinct) {
   OPTIONAL(flags, DIFlagField, );                                              \
   OPTIONAL(extraData, MDField, );                                              \
   OPTIONAL(dwarfAddressSpace, MDUnsignedField, (UINT32_MAX, UINT32_MAX));      \
-  OPTIONAL(annotations, MDField, );
+  OPTIONAL(annotations, MDField, );                                            \
+  OPTIONAL(ptrAuthKey, MDUnsignedField, (0, 7));                               \
+  OPTIONAL(ptrAuthIsAddressDiscriminated, MDBoolField, );                      \
+  OPTIONAL(ptrAuthExtraDiscriminator, MDUnsignedField, (0, 0xffff));           \
+  OPTIONAL(ptrAuthIsaPointer, MDBoolField, );                                  \
+  OPTIONAL(ptrAuthAuthenticatesNullValues, MDBoolField, );
   PARSE_MD_FIELDS();
 #undef VISIT_MD_FIELDS
 
   std::optional<unsigned> DWARFAddressSpace;
   if (dwarfAddressSpace.Val != UINT32_MAX)
     DWARFAddressSpace = dwarfAddressSpace.Val;
+  std::optional<DIDerivedType::PtrAuthData> PtrAuthData;
+  if (ptrAuthKey.Val)
+    PtrAuthData.emplace(
+        (unsigned)ptrAuthKey.Val, ptrAuthIsAddressDiscriminated.Val,
+        (unsigned)ptrAuthExtraDiscriminator.Val, ptrAuthIsaPointer.Val,
+        ptrAuthAuthenticatesNullValues.Val);
 
   Result = GET_OR_DISTINCT(DIDerivedType,
                            (Context, tag.Val, name.Val, file.Val, line.Val,
                             scope.Val, baseType.Val, size.Val, align.Val,
-                            offset.Val, DWARFAddressSpace, flags.Val,
-                            extraData.Val, annotations.Val));
+                            offset.Val, DWARFAddressSpace, PtrAuthData,
+                            flags.Val, extraData.Val, annotations.Val));
   return false;
 }
 
@@ -6543,10 +6581,10 @@ bool LLParser::parseBasicBlock(PerFunctionState &PFS) {
 ///                 (MDNode ',' Metadata ',' Metadata ',')? MDNode ')'
 bool LLParser::parseDebugRecord(DbgRecord *&DR, PerFunctionState &PFS) {
   using RecordKind = DbgRecord::Kind;
-  using LocType = DPValue::LocationType;
-  LocTy DPVLoc = Lex.getLoc();
+  using LocType = DbgVariableRecord::LocationType;
+  LocTy DVRLoc = Lex.getLoc();
   if (Lex.getKind() != lltok::DbgRecordType)
-    return error(DPVLoc, "expected debug record type here");
+    return error(DVRLoc, "expected debug record type here");
   RecordKind RecordType = StringSwitch<RecordKind>(Lex.getStrVal())
                               .Case("declare", RecordKind::ValueKind)
                               .Case("value", RecordKind::ValueKind)
@@ -6554,7 +6592,7 @@ bool LLParser::parseDebugRecord(DbgRecord *&DR, PerFunctionState &PFS) {
                               .Case("label", RecordKind::LabelKind);
 
   // Parsing labels is trivial; parse here and early exit, otherwise go into the
-  // full DPValue processing stage.
+  // full DbgVariableRecord processing stage.
   if (RecordType == RecordKind::LabelKind) {
     Lex.Lex();
     if (parseToken(lltok::lparen, "Expected '(' here"))
@@ -6634,9 +6672,9 @@ bool LLParser::parseDebugRecord(DbgRecord *&DR, PerFunctionState &PFS) {
 
   if (parseToken(lltok::rparen, "Expected ')' here"))
     return true;
-  DR = DPValue::createUnresolvedDPValue(ValueType, ValLocMD, Variable,
-                                        Expression, AssignID, AddressLocation,
-                                        AddressExpression, DebugLoc);
+  DR = DbgVariableRecord::createUnresolvedDbgVariableRecord(
+      ValueType, ValLocMD, Variable, Expression, AssignID, AddressLocation,
+      AddressExpression, DebugLoc);
   return false;
 }
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp b/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp
index 7005011980ebc9..c085c715179ba6 100644
--- a/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp
+++ b/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp
@@ -270,6 +270,11 @@ GetCodeName(unsigned CodeID, unsigned BlockID,
       STRINGIFY_CODE(FUNC_CODE, INST_CMPXCHG)
       STRINGIFY_CODE(FUNC_CODE, INST_CALLBR)
       STRINGIFY_CODE(FUNC_CODE, BLOCKADDR_USERS)
+      STRINGIFY_CODE(FUNC_CODE, DEBUG_RECORD_DECLARE)
+      STRINGIFY_CODE(FUNC_CODE, DEBUG_RECORD_VALUE)
+      STRINGIFY_CODE(FUNC_CODE, DEBUG_RECORD_ASSIGN)
+      STRINGIFY_CODE(FUNC_CODE, DEBUG_RECORD_VALUE_SIMPLE)
+      STRINGIFY_CODE(FUNC_CODE, DEBUG_RECORD_LABEL)
     }
   case bitc::VALUE_SYMTAB_BLOCK_ID:
     switch (CodeID) {
diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
index 9c63116114f3c5..fa7038acc5245b 100644
--- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
+++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
@@ -100,8 +100,14 @@ static cl::opt<bool> ExpandConstantExprs(
     cl::desc(
         "Expand constant expressions to instructions for testing purposes"));
 
-// Declare external flag for whether we're using the new debug-info format.
-extern llvm::cl::opt<bool> UseNewDbgInfoFormat;
+/// Load bitcode directly into RemoveDIs format (use debug records instead
+/// of debug intrinsics). UNSET is treated as FALSE, so the default action
+/// is to do nothing. Individual tools can override this to incrementally add
+/// support for the RemoveDIs format.
+cl::opt<cl::boolOrDefault> LoadBitcodeIntoNewDbgInforFormat(
+    "load-bitcode-into-experimental-debuginfo-iterators", cl::Hidden,
+    cl::desc("Load bitcode directly into the new debug info format (regardless "
+             "of input format)"));
 
 namespace {
 
@@ -514,25 +520,31 @@ class BitcodeConstant final : public Value,
   struct ExtraInfo {
     uint8_t Opcode;
     uint8_t Flags;
-    unsigned Extra;
-    Type *SrcElemTy;
+    unsigned BlockAddressBB = 0;
+    Type *SrcElemTy = nullptr;
+    std::optional<ConstantRange> InRange;
 
-    ExtraInfo(uint8_t Opcode, uint8_t Flags = 0, unsigned Extra = 0,
-              Type *SrcElemTy = nullptr)
-        : Opcode(Opcode), Flags(Flags), Extra(Extra), SrcElemTy(SrcElemTy) {}
+    ExtraInfo(uint8_t Opcode, uint8_t Flags = 0, Type *SrcElemTy = nullptr,
+              std::optional<ConstantRange> InRange = std::nullopt)
+        : Opcode(Opcode), Flags(Flags), SrcElemTy(SrcElemTy),
+          InRange(std::move(InRange)) {}
+
+    ExtraInfo(uint8_t Opcode, uint8_t Flags, unsigned BlockAddressBB)
+        : Opcode(Opcode), Flags(Flags), BlockAddressBB(BlockAddressBB) {}
   };
 
   uint8_t Opcode;
   uint8_t Flags;
   unsigned NumOperands;
-  unsigned Extra;  // GEP inrange index or blockaddress BB id.
+  unsigned BlockAddressBB;
   Type *SrcElemTy; // GEP source element type.
+  std::optional<ConstantRange> InRange; // GEP inrange attribute.
 
 private:
   BitcodeConstant(Type *Ty, const ExtraInfo &Info, ArrayRef<unsigned> OpIDs)
       : Value(Ty, SubclassID), Opcode(Info.Opcode), Flags(Info.Flags),
-        NumOperands(OpIDs.size()), Extra(Info.Extra),
-        SrcElemTy(Info.SrcElemTy) {
+        NumOperands(OpIDs.size()), BlockAddressBB(Info.BlockAddressBB),
+        SrcElemTy(Info.SrcElemTy), InRange(Info.InRange) {
     std::uninitialized_copy(OpIDs.begin(), OpIDs.end(),
                             getTrailingObjects<unsigned>());
   }
@@ -554,11 +566,9 @@ class BitcodeConstant final : public Value,
     return ArrayRef(getTrailingObjects<unsigned>(), NumOperands);
   }
 
-  std::optional<unsigned> getInRangeIndex() const {
+  std::optional<ConstantRange> getInRange() const {
     assert(Opcode == Instruction::GetElementPtr);
-    if (Extra == (unsigned)-1)
-      return std::nullopt;
-    return Extra;
+    return InRange;
   }
 
   const char *getOpcodeName() const {
@@ -1553,7 +1563,7 @@ Expected<Value *> BitcodeReader::materializeValue(unsigned StartValID,
           // If the function is already parsed we can insert the block address
           // right away.
           BasicBlock *BB;
-          unsigned BBID = BC->Extra;
+          unsigned BBID = BC->BlockAddressBB;
           if (!BBID)
             // Invalid reference to entry block.
             return error("Invalid ID");
@@ -1596,7 +1606,7 @@ Expected<Value *> BitcodeReader::materializeValue(unsigned StartValID,
         case Instruction::GetElementPtr:
           C = ConstantExpr::getGetElementPtr(BC->SrcElemTy, ConstOps[0],
                                              ArrayRef(ConstOps).drop_front(),
-                                             BC->Flags, BC->getInRangeIndex());
+                                             BC->Flags, BC->getInRange());
           break;
         case Instruction::ExtractElement:
           C = ConstantExpr::getExtractElement(ConstOps[0], ConstOps[1]);
@@ -3302,22 +3312,34 @@ Error BitcodeReader::parseConstants() {
     }
     case bitc::CST_CODE_CE_INBOUNDS_GEP: // [ty, n x operands]
     case bitc::CST_CODE_CE_GEP: // [ty, n x operands]
-    case bitc::CST_CODE_CE_GEP_WITH_INRANGE_INDEX: { // [ty, flags, n x
-                                                     // operands]
+    case bitc::CST_CODE_CE_GEP_WITH_INRANGE_INDEX_OLD: // [ty, flags, n x
+                                                       // operands]
+    case bitc::CST_CODE_CE_GEP_WITH_INRANGE: { // [ty, flags, start, end, n x
+                                               // operands]
       if (Record.size() < 2)
         return error("Constant GEP record must have at least two elements");
       unsigned OpNum = 0;
       Type *PointeeType = nullptr;
-      if (BitCode == bitc::CST_CODE_CE_GEP_WITH_INRANGE_INDEX ||
-          Record.size() % 2)
+      if (BitCode == bitc::CST_CODE_CE_GEP_WITH_INRANGE_INDEX_OLD ||
+          BitCode == bitc::CST_CODE_CE_GEP_WITH_INRANGE || Record.size() % 2)
         PointeeType = getTypeByID(Record[OpNum++]);
 
       bool InBounds = false;
-      std::optional<unsigned> InRangeIndex;
-      if (BitCode == bitc::CST_CODE_CE_GEP_WITH_INRANGE_INDEX) {
+      std::optional<ConstantRange> InRange;
+      if (BitCode == bitc::CST_CODE_CE_GEP_WITH_INRANGE_INDEX_OLD) {
+        uint64_t Op = Record[OpNum++];
+        InBounds = Op & 1;
+        unsigned InRangeIndex = Op >> 1;
+        // "Upgrade" inrange by dropping it. The feature is too niche to
+        // bother.
+        (void)InRangeIndex;
+      } else if (BitCode == bitc::CST_CODE_CE_GEP_WITH_INRANGE) {
         uint64_t Op = Record[OpNum++];
         InBounds = Op & 1;
-        InRangeIndex = Op >> 1;
+        Expected<ConstantRange> MaybeInRange = readConstantRange(Record, OpNum);
+        if (!MaybeInRange)
+          return MaybeInRange.takeError();
+        InRange = MaybeInRange.get();
       } else if (BitCode == bitc::CST_CODE_CE_INBOUNDS_GEP)
         InBounds = true;
 
@@ -3350,10 +3372,9 @@ Error BitcodeReader::parseConstants() {
           return error("Missing element type for old-style constant GEP");
       }
 
-      V = BitcodeConstant::create(Alloc, CurTy,
-                                  {Instruction::GetElementPtr, InBounds,
-                                   InRangeIndex.value_or(-1), PointeeType},
-                                  Elts);
+      V = BitcodeConstant::create(
+          Alloc, CurTy,
+          {Instruction::GetElementPtr, InBounds, PointeeType, InRange}, Elts);
       break;
     }
     case bitc::CST_CODE_CE_SELECT: {  // CE_SELECT: [opval#, opval#, opval#]
@@ -4279,6 +4300,12 @@ Error BitcodeReader::parseGlobalIndirectSymbolRecord(
 Error BitcodeReader::parseModule(uint64_t ResumeBit,
                                  bool ShouldLazyLoadMetadata,
                                  ParserCallbacks Callbacks) {
+  // Load directly into RemoveDIs format if LoadBitcodeIntoNewDbgInforFormat
+  // has been set to true (default action: load into the old debug format).
+  TheModule->IsNewDbgInfoFormat =
+      UseNewDbgInfoFormat &&
+      LoadBitcodeIntoNewDbgInforFormat == cl::boolOrDefault::BOU_TRUE;
+
   this->ValueTypeCallback = std::move(Callbacks.ValueType);
   if (ResumeBit) {
     if (Error JumpFailed = Stream.JumpToBit(ResumeBit))
@@ -6398,6 +6425,91 @@ Error BitcodeReader::parseFunctionBody(Function *F) {
       InstructionList.push_back(I);
       break;
     }
+    case bitc::FUNC_CODE_DEBUG_RECORD_LABEL: {
+      // DPLabels are placed after the Instructions that they are attached to.
+      Instruction *Inst = getLastInstruction();
+      if (!Inst)
+        return error("Invalid dbg record: missing instruction");
+      DILocation *DIL = cast<DILocation>(getFnMetadataByID(Record[0]));
+      DILabel *Label = cast<DILabel>(getFnMetadataByID(Record[1]));
+      Inst->getParent()->insertDbgRecordBefore(
+          new DPLabel(Label, DebugLoc(DIL)), Inst->getIterator());
+      continue; // This isn't an instruction.
+    }
+    case bitc::FUNC_CODE_DEBUG_RECORD_VALUE_SIMPLE:
+    case bitc::FUNC_CODE_DEBUG_RECORD_VALUE:
+    case bitc::FUNC_CODE_DEBUG_RECORD_DECLARE:
+    case bitc::FUNC_CODE_DEBUG_RECORD_ASSIGN: {
+      // DbgVariableRecords are placed after the Instructions that they are
+      // attached to.
+      Instruction *Inst = getLastInstruction();
+      if (!Inst)
+        return error("Invalid dbg record: missing instruction");
+
+      // First 3 fields are common to all kinds:
+      //   DILocation, DILocalVariable, DIExpression
+      // dbg_value (FUNC_CODE_DEBUG_RECORD_VALUE)
+      //   ..., LocationMetadata
+      // dbg_value (FUNC_CODE_DEBUG_RECORD_VALUE_SIMPLE - abbrev'd)
+      //   ..., Value
+      // dbg_declare (FUNC_CODE_DEBUG_RECORD_DECLARE)
+      //   ..., LocationMetadata
+      // dbg_assign (FUNC_CODE_DEBUG_RECORD_ASSIGN)
+      //   ..., LocationMetadata, DIAssignID, DIExpression, LocationMetadata
+      unsigned Slot = 0;
+      // Common fields (0-2).
+      DILocation *DIL = cast<DILocation>(getFnMetadataByID(Record[Slot++]));
+      DILocalVariable *Var =
+          cast<DILocalVariable>(getFnMetadataByID(Record[Slot++]));
+      DIExpression *Expr =
+          cast<DIExpression>(getFnMetadataByID(Record[Slot++]));
+
+      // Union field (3: LocationMetadata | Value).
+      Metadata *RawLocation = nullptr;
+      if (BitCode == bitc::FUNC_CODE_DEBUG_RECORD_VALUE_SIMPLE) {
+        Value *V = nullptr;
+        unsigned TyID = 0;
+        // We never expect to see a fwd reference value here because
+        // use-before-defs are encoded with the standard non-abbrev record
+        // type (they'd require encoding the type too, and they're rare). As a
+        // result, getValueTypePair only ever increments Slot by one here (once
+        // for the value, never twice for value and type).
+        unsigned SlotBefore = Slot;
+        if (getValueTypePair(Record, Slot, NextValueNo, V, TyID, CurBB))
+          return error("Invalid dbg record: invalid value");
+        (void)SlotBefore;
+        assert((SlotBefore == Slot - 1) && "unexpected fwd ref");
+        RawLocation = ValueAsMetadata::get(V);
+      } else {
+        RawLocation = getFnMetadataByID(Record[Slot++]);
+      }
+
+      DbgVariableRecord *DVR = nullptr;
+      switch (BitCode) {
+      case bitc::FUNC_CODE_DEBUG_RECORD_VALUE:
+      case bitc::FUNC_CODE_DEBUG_RECORD_VALUE_SIMPLE:
+        DVR = new DbgVariableRecord(RawLocation, Var, Expr, DIL,
+                                    DbgVariableRecord::LocationType::Value);
+        break;
+      case bitc::FUNC_CODE_DEBUG_RECORD_DECLARE:
+        DVR = new DbgVariableRecord(RawLocation, Var, Expr, DIL,
+                                    DbgVariableRecord::LocationType::Declare);
+        break;
+      case bitc::FUNC_CODE_DEBUG_RECORD_ASSIGN: {
+        DIAssignID *ID = cast<DIAssignID>(getFnMetadataByID(Record[Slot++]));
+        DIExpression *AddrExpr =
+            cast<DIExpression>(getFnMetadataByID(Record[Slot++]));
+        Metadata *Addr = getFnMetadataByID(Record[Slot++]);
+        DVR = new DbgVariableRecord(RawLocation, Var, Expr, ID, Addr, AddrExpr,
+                                    DIL);
+        break;
+      }
+      default:
+        llvm_unreachable("Unknown DbgVariableRecord bitcode");
+      }
+      Inst->getParent()->insertDbgRecordBefore(DVR, Inst->getIterator());
+      continue; // This isn't an instruction.
+    }
     case bitc::FUNC_CODE_INST_CALL: {
       // CALL: [paramattrs, cc, fmf, fnty, fnid, arg0, arg1...]
       if (Record.size() < 3)
@@ -6677,10 +6789,22 @@ Error BitcodeReader::materialize(GlobalValue *GV) {
   // Move the bit stream to the saved position of the deferred function body.
   if (Error JumpFailed = Stream.JumpToBit(DFII->second))
     return JumpFailed;
+
+  // Set the debug info mode to "new", possibly creating a mismatch between
+  // module and function debug modes. This is okay because we'll convert
+  // everything back to the old mode after parsing if needed.
+  // FIXME: Remove this once all tools support RemoveDIs.
+  F->IsNewDbgInfoFormat = true;
+
   if (Error Err = parseFunctionBody(F))
     return Err;
   F->setIsMaterializable(false);
 
+  // Convert new debug info records into intrinsics.
+  // FIXME: Remove this once all tools support RemoveDIs.
+  if (!F->getParent()->IsNewDbgInfoFormat)
+    F->convertFromNewDbgValues();
+
   if (StripDebugInfo)
     stripDebugInfo(*F);
 
diff --git a/llvm/lib/Bitcode/Reader/MetadataLoader.cpp b/llvm/lib/Bitcode/Reader/MetadataLoader.cpp
index 770eb83af17f9b..9102f3a60cffc2 100644
--- a/llvm/lib/Bitcode/Reader/MetadataLoader.cpp
+++ b/llvm/lib/Bitcode/Reader/MetadataLoader.cpp
@@ -609,17 +609,25 @@ class MetadataLoader::MetadataLoaderImpl {
     if (!NeedDeclareExpressionUpgrade)
       return;
 
+    auto UpdateDeclareIfNeeded = [&](auto *Declare) {
+      auto *DIExpr = Declare->getExpression();
+      if (!DIExpr || !DIExpr->startsWithDeref() ||
+          !isa_and_nonnull<Argument>(Declare->getAddress()))
+        return;
+      SmallVector<uint64_t, 8> Ops;
+      Ops.append(std::next(DIExpr->elements_begin()), DIExpr->elements_end());
+      Declare->setExpression(DIExpression::get(Context, Ops));
+    };
+
     for (auto &BB : F)
-      for (auto &I : BB)
+      for (auto &I : BB) {
+        for (DbgVariableRecord &DVR : filterDbgVars(I.getDbgRecordRange())) {
+          if (DVR.isDbgDeclare())
+            UpdateDeclareIfNeeded(&DVR);
+        }
         if (auto *DDI = dyn_cast<DbgDeclareInst>(&I))
-          if (auto *DIExpr = DDI->getExpression())
-            if (DIExpr->startsWithDeref() &&
-                isa_and_nonnull<Argument>(DDI->getAddress())) {
-              SmallVector<uint64_t, 8> Ops;
-              Ops.append(std::next(DIExpr->elements_begin()),
-                         DIExpr->elements_end());
-              DDI->setExpression(DIExpression::get(Context, Ops));
-            }
+          UpdateDeclareIfNeeded(DDI);
+      }
   }
 
   /// Upgrade the expression from previous versions.
@@ -1556,7 +1564,7 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
     break;
   }
   case bitc::METADATA_DERIVED_TYPE: {
-    if (Record.size() < 12 || Record.size() > 14)
+    if (Record.size() < 12 || Record.size() > 15)
       return error("Invalid record");
 
     // DWARF address space is encoded as N->getDWARFAddressSpace() + 1. 0 means
@@ -1566,8 +1574,17 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
       DWARFAddressSpace = Record[12] - 1;
 
     Metadata *Annotations = nullptr;
-    if (Record.size() > 13 && Record[13])
-      Annotations = getMDOrNull(Record[13]);
+    std::optional<DIDerivedType::PtrAuthData> PtrAuthData;
+
+    // Only look for annotations/ptrauth if both are allocated.
+    // If not, we can't tell which was intended to be embedded, as both ptrauth
+    // and annotations have been expected at Record[13] at various times.
+    if (Record.size() > 14) {
+      if (Record[13])
+        Annotations = getMDOrNull(Record[13]);
+      if (Record[14])
+        PtrAuthData.emplace(Record[14]);
+    }
 
     IsDistinct = Record[0];
     DINode::DIFlags Flags = static_cast<DINode::DIFlags>(Record[10]);
@@ -1577,7 +1594,7 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
                          getMDOrNull(Record[3]), Record[4],
                          getDITypeRefOrNull(Record[5]),
                          getDITypeRefOrNull(Record[6]), Record[7], Record[8],
-                         Record[9], DWARFAddressSpace, Flags,
+                         Record[9], DWARFAddressSpace, PtrAuthData, Flags,
                          getDITypeRefOrNull(Record[11]), Annotations)),
         NextMetadataNo);
     NextMetadataNo++;
diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
index 597f49332fad25..a1ee02918dfa67 100644
--- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
+++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
@@ -99,6 +99,9 @@ namespace llvm {
 extern FunctionSummary::ForceSummaryHotnessType ForceSummaryEdgesCold;
 }
 
+extern bool WriteNewDbgInfoFormatToBitcode;
+extern llvm::cl::opt<bool> UseNewDbgInfoFormat;
+
 namespace {
 
 /// These are manifest constants used by the bitcode writer. They do not need to
@@ -128,6 +131,7 @@ enum {
   FUNCTION_INST_RET_VAL_ABBREV,
   FUNCTION_INST_UNREACHABLE_ABBREV,
   FUNCTION_INST_GEP_ABBREV,
+  FUNCTION_DEBUG_RECORD_VALUE_ABBREV,
 };
 
 /// Abstract class to manage the bitcode writing, subclassed for each bitcode
@@ -1825,6 +1829,11 @@ void ModuleBitcodeWriter::writeDIDerivedType(const DIDerivedType *N,
 
   Record.push_back(VE.getMetadataOrNullID(N->getAnnotations().get()));
 
+  if (auto PtrAuthData = N->getPtrAuthData())
+    Record.push_back(PtrAuthData->RawData);
+  else
+    Record.push_back(0);
+
   Stream.EmitRecord(bitc::METADATA_DERIVED_TYPE, Record, Abbrev);
   Record.clear();
 }
@@ -2742,9 +2751,10 @@ void ModuleBitcodeWriter::writeConstants(unsigned FirstVal, unsigned LastVal,
         Code = bitc::CST_CODE_CE_GEP;
         const auto *GO = cast<GEPOperator>(C);
         Record.push_back(VE.getTypeID(GO->getSourceElementType()));
-        if (std::optional<unsigned> Idx = GO->getInRangeIndex()) {
-          Code = bitc::CST_CODE_CE_GEP_WITH_INRANGE_INDEX;
-          Record.push_back((*Idx << 1) | GO->isInBounds());
+        if (std::optional<ConstantRange> Range = GO->getInRange()) {
+          Code = bitc::CST_CODE_CE_GEP_WITH_INRANGE;
+          Record.push_back(GO->isInBounds());
+          emitConstantRange(Record, *Range);
         } else if (GO->isInBounds())
           Code = bitc::CST_CODE_CE_INBOUNDS_GEP;
         for (unsigned i = 0, e = CE->getNumOperands(); i != e; ++i) {
@@ -3512,25 +3522,96 @@ void ModuleBitcodeWriter::writeFunction(
       NeedsMetadataAttachment |= I.hasMetadataOtherThanDebugLoc();
 
       // If the instruction has a debug location, emit it.
-      DILocation *DL = I.getDebugLoc();
-      if (!DL)
-        continue;
-
-      if (DL == LastDL) {
-        // Just repeat the same debug loc as last time.
-        Stream.EmitRecord(bitc::FUNC_CODE_DEBUG_LOC_AGAIN, Vals);
-        continue;
+      if (DILocation *DL = I.getDebugLoc()) {
+        if (DL == LastDL) {
+          // Just repeat the same debug loc as last time.
+          Stream.EmitRecord(bitc::FUNC_CODE_DEBUG_LOC_AGAIN, Vals);
+        } else {
+          Vals.push_back(DL->getLine());
+          Vals.push_back(DL->getColumn());
+          Vals.push_back(VE.getMetadataOrNullID(DL->getScope()));
+          Vals.push_back(VE.getMetadataOrNullID(DL->getInlinedAt()));
+          Vals.push_back(DL->isImplicitCode());
+          Stream.EmitRecord(bitc::FUNC_CODE_DEBUG_LOC, Vals);
+          Vals.clear();
+          LastDL = DL;
+        }
       }
 
-      Vals.push_back(DL->getLine());
-      Vals.push_back(DL->getColumn());
-      Vals.push_back(VE.getMetadataOrNullID(DL->getScope()));
-      Vals.push_back(VE.getMetadataOrNullID(DL->getInlinedAt()));
-      Vals.push_back(DL->isImplicitCode());
-      Stream.EmitRecord(bitc::FUNC_CODE_DEBUG_LOC, Vals);
-      Vals.clear();
-
-      LastDL = DL;
+      // If the instruction has DbgRecords attached to it, emit them. Note that
+      // they come after the instruction so that it's easy to attach them again
+      // when reading the bitcode, even though conceptually the debug locations
+      // start "before" the instruction.
+      if (I.hasDbgRecords() && WriteNewDbgInfoFormatToBitcode) {
+        /// Try to push the value only (unwrapped), otherwise push the
+        /// metadata wrapped value. Returns true if the value was pushed
+        /// without the ValueAsMetadata wrapper.
+        auto PushValueOrMetadata = [&Vals, InstID,
+                                    this](Metadata *RawLocation) {
+          assert(RawLocation &&
+                 "RawLocation unexpectedly null in DbgVariableRecord");
+          if (ValueAsMetadata *VAM = dyn_cast<ValueAsMetadata>(RawLocation)) {
+            SmallVector<unsigned, 2> ValAndType;
+            // If the value is a fwd-ref the type is also pushed. We don't
+            // want the type, so fwd-refs are kept wrapped (pushValueAndType
+            // returns false if the value is pushed without type).
+            if (!pushValueAndType(VAM->getValue(), InstID, ValAndType)) {
+              Vals.push_back(ValAndType[0]);
+              return true;
+            }
+          }
+          // The metadata is a DIArgList, or ValueAsMetadata wrapping a
+          // fwd-ref. Push the metadata ID.
+          Vals.push_back(VE.getMetadataID(RawLocation));
+          return false;
+        };
+
+        // Write out non-instruction debug information attached to this
+        // instruction. Write it after the instruction so that it's easy to
+        // re-attach to the instruction reading the records in.
+        for (DbgRecord &DR : I.DbgMarker->getDbgRecordRange()) {
+          if (DPLabel *DPL = dyn_cast<DPLabel>(&DR)) {
+            Vals.push_back(VE.getMetadataID(&*DPL->getDebugLoc()));
+            Vals.push_back(VE.getMetadataID(DPL->getLabel()));
+            Stream.EmitRecord(bitc::FUNC_CODE_DEBUG_RECORD_LABEL, Vals);
+            Vals.clear();
+            continue;
+          }
+
+          // First 3 fields are common to all kinds:
+          //   DILocation, DILocalVariable, DIExpression
+          // dbg_value (FUNC_CODE_DEBUG_RECORD_VALUE)
+          //   ..., LocationMetadata
+          // dbg_value (FUNC_CODE_DEBUG_RECORD_VALUE_SIMPLE - abbrev'd)
+          //   ..., Value
+          // dbg_declare (FUNC_CODE_DEBUG_RECORD_DECLARE)
+          //   ..., LocationMetadata
+          // dbg_assign (FUNC_CODE_DEBUG_RECORD_ASSIGN)
+          //   ..., LocationMetadata, DIAssignID, DIExpression, LocationMetadata
+          DbgVariableRecord &DVR = cast<DbgVariableRecord>(DR);
+          Vals.push_back(VE.getMetadataID(&*DVR.getDebugLoc()));
+          Vals.push_back(VE.getMetadataID(DVR.getVariable()));
+          Vals.push_back(VE.getMetadataID(DVR.getExpression()));
+          if (DVR.isDbgValue()) {
+            if (PushValueOrMetadata(DVR.getRawLocation()))
+              Stream.EmitRecord(bitc::FUNC_CODE_DEBUG_RECORD_VALUE_SIMPLE, Vals,
+                                FUNCTION_DEBUG_RECORD_VALUE_ABBREV);
+            else
+              Stream.EmitRecord(bitc::FUNC_CODE_DEBUG_RECORD_VALUE, Vals);
+          } else if (DVR.isDbgDeclare()) {
+            Vals.push_back(VE.getMetadataID(DVR.getRawLocation()));
+            Stream.EmitRecord(bitc::FUNC_CODE_DEBUG_RECORD_DECLARE, Vals);
+          } else {
+            assert(DVR.isDbgAssign() && "Unexpected DbgRecord kind");
+            Vals.push_back(VE.getMetadataID(DVR.getRawLocation()));
+            Vals.push_back(VE.getMetadataID(DVR.getAssignID()));
+            Vals.push_back(VE.getMetadataID(DVR.getAddressExpression()));
+            Vals.push_back(VE.getMetadataID(DVR.getRawAddress()));
+            Stream.EmitRecord(bitc::FUNC_CODE_DEBUG_RECORD_ASSIGN, Vals);
+          }
+          Vals.clear();
+        }
+      }
     }
 
     if (BlockAddress *BA = BlockAddress::lookup(&BB)) {
@@ -3771,7 +3852,17 @@ void ModuleBitcodeWriter::writeBlockInfo() {
         FUNCTION_INST_GEP_ABBREV)
       llvm_unreachable("Unexpected abbrev ordering!");
   }
-
+  {
+    auto Abbv = std::make_shared<BitCodeAbbrev>();
+    Abbv->Add(BitCodeAbbrevOp(bitc::FUNC_CODE_DEBUG_RECORD_VALUE_SIMPLE));
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 7)); // dbgloc
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 7)); // var
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 7)); // expr
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // val
+    if (Stream.EmitBlockInfoAbbrev(bitc::FUNCTION_BLOCK_ID, Abbv) !=
+        FUNCTION_DEBUG_RECORD_VALUE_ABBREV)
+      llvm_unreachable("Unexpected abbrev ordering! 1");
+  }
   Stream.ExitBlock();
 }
 
diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriterPass.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriterPass.cpp
index 0eb9c246f2a9b6..de2396f31f6669 100644
--- a/llvm/lib/Bitcode/Writer/BitcodeWriterPass.cpp
+++ b/llvm/lib/Bitcode/Writer/BitcodeWriterPass.cpp
@@ -18,11 +18,12 @@
 #include "llvm/Pass.h"
 using namespace llvm;
 
+extern bool WriteNewDbgInfoFormatToBitcode;
+
 PreservedAnalyses BitcodeWriterPass::run(Module &M, ModuleAnalysisManager &AM) {
-  // RemoveDIs: there's no bitcode representation of the DbgRecord debug-info,
-  // convert to dbg.values before writing out.
-  bool IsNewDbgInfoFormat = M.IsNewDbgInfoFormat;
-  if (IsNewDbgInfoFormat)
+  bool ConvertToOldDbgFormatForWrite =
+      M.IsNewDbgInfoFormat && !WriteNewDbgInfoFormatToBitcode;
+  if (ConvertToOldDbgFormatForWrite)
     M.convertFromNewDbgValues();
 
   const ModuleSummaryIndex *Index =
@@ -30,7 +31,7 @@ PreservedAnalyses BitcodeWriterPass::run(Module &M, ModuleAnalysisManager &AM) {
                        : nullptr;
   WriteBitcodeToFile(M, OS, ShouldPreserveUseListOrder, Index, EmitModuleHash);
 
-  if (IsNewDbgInfoFormat)
+  if (ConvertToOldDbgFormatForWrite)
     M.convertToNewDbgValues();
 
   return PreservedAnalyses::all();
@@ -56,16 +57,15 @@ namespace {
     StringRef getPassName() const override { return "Bitcode Writer"; }
 
     bool runOnModule(Module &M) override {
-      // RemoveDIs: there's no bitcode representation of the DbgRecord
-      // debug-info, convert to dbg.values before writing out.
-      bool IsNewDbgInfoFormat = M.IsNewDbgInfoFormat;
-      if (IsNewDbgInfoFormat)
+      bool ConvertToOldDbgFormatForWrite =
+          M.IsNewDbgInfoFormat && !WriteNewDbgInfoFormatToBitcode;
+      if (ConvertToOldDbgFormatForWrite)
         M.convertFromNewDbgValues();
 
       WriteBitcodeToFile(M, OS, ShouldPreserveUseListOrder, /*Index=*/nullptr,
                          /*EmitModuleHash=*/false);
 
-      if (IsNewDbgInfoFormat)
+      if (ConvertToOldDbgFormatForWrite)
         M.convertToNewDbgValues();
       return false;
     }
diff --git a/llvm/lib/Bitcode/Writer/ValueEnumerator.cpp b/llvm/lib/Bitcode/Writer/ValueEnumerator.cpp
index fccb2a606f7ed9..3209dca253febd 100644
--- a/llvm/lib/Bitcode/Writer/ValueEnumerator.cpp
+++ b/llvm/lib/Bitcode/Writer/ValueEnumerator.cpp
@@ -134,20 +134,28 @@ static OrderMap orderModule(const Module &M) {
     // Metadata used by instructions is decoded before the actual instructions,
     // so visit any constants used by it beforehand.
     for (const BasicBlock &BB : F)
-      for (const Instruction &I : BB)
-        for (const Value *V : I.operands()) {
-          if (const auto *MAV = dyn_cast<MetadataAsValue>(V)) {
-            if (const auto *VAM =
-                    dyn_cast<ValueAsMetadata>(MAV->getMetadata())) {
+      for (const Instruction &I : BB) {
+        auto OrderConstantFromMetadata = [&](Metadata *MD) {
+          if (const auto *VAM = dyn_cast<ValueAsMetadata>(MD)) {
+            orderConstantValue(VAM->getValue());
+          } else if (const auto *AL = dyn_cast<DIArgList>(MD)) {
+            for (const auto *VAM : AL->getArgs())
               orderConstantValue(VAM->getValue());
-            } else if (const auto *AL =
-                           dyn_cast<DIArgList>(MAV->getMetadata())) {
-              for (const auto *VAM : AL->getArgs())
-                orderConstantValue(VAM->getValue());
-            }
           }
+        };
+
+        for (DbgVariableRecord &DVR : filterDbgVars(I.getDbgRecordRange())) {
+          OrderConstantFromMetadata(DVR.getRawLocation());
+          if (DVR.isDbgAssign())
+            OrderConstantFromMetadata(DVR.getRawAddress());
         }
 
+        for (const Value *V : I.operands()) {
+          if (const auto *MAV = dyn_cast<MetadataAsValue>(V))
+            OrderConstantFromMetadata(MAV->getMetadata());
+        }
+      }
+
     for (const Argument &A : F.args())
       orderValue(&A, OM);
     for (const BasicBlock &BB : F)
@@ -261,33 +269,39 @@ static UseListOrderStack predictUseListOrder(const Module &M) {
   // constants in the last Function they're used in.  Module-level constants
   // have already been visited above.
   for (const Function &F : llvm::reverse(M)) {
+    auto PredictValueOrderFromMetadata = [&](Metadata *MD) {
+      if (const auto *VAM = dyn_cast<ValueAsMetadata>(MD)) {
+        predictValueUseListOrder(VAM->getValue(), &F, OM, Stack);
+      } else if (const auto *AL = dyn_cast<DIArgList>(MD)) {
+        for (const auto *VAM : AL->getArgs())
+          predictValueUseListOrder(VAM->getValue(), &F, OM, Stack);
+      }
+    };
     if (F.isDeclaration())
       continue;
     for (const BasicBlock &BB : F)
       predictValueUseListOrder(&BB, &F, OM, Stack);
     for (const Argument &A : F.args())
       predictValueUseListOrder(&A, &F, OM, Stack);
-    for (const BasicBlock &BB : F)
+    for (const BasicBlock &BB : F) {
       for (const Instruction &I : BB) {
+        for (DbgVariableRecord &DVR : filterDbgVars(I.getDbgRecordRange())) {
+          PredictValueOrderFromMetadata(DVR.getRawLocation());
+          if (DVR.isDbgAssign())
+            PredictValueOrderFromMetadata(DVR.getRawAddress());
+        }
         for (const Value *Op : I.operands()) {
           if (isa<Constant>(*Op) || isa<InlineAsm>(*Op)) // Visit GlobalValues.
             predictValueUseListOrder(Op, &F, OM, Stack);
-          if (const auto *MAV = dyn_cast<MetadataAsValue>(Op)) {
-            if (const auto *VAM =
-                    dyn_cast<ValueAsMetadata>(MAV->getMetadata())) {
-              predictValueUseListOrder(VAM->getValue(), &F, OM, Stack);
-            } else if (const auto *AL =
-                           dyn_cast<DIArgList>(MAV->getMetadata())) {
-              for (const auto *VAM : AL->getArgs())
-                predictValueUseListOrder(VAM->getValue(), &F, OM, Stack);
-            }
-          }
+          if (const auto *MAV = dyn_cast<MetadataAsValue>(Op))
+            PredictValueOrderFromMetadata(MAV->getMetadata());
         }
         if (auto *SVI = dyn_cast<ShuffleVectorInst>(&I))
           predictValueUseListOrder(SVI->getShuffleMaskForBitcode(), &F, OM,
                                    Stack);
         predictValueUseListOrder(&I, &F, OM, Stack);
       }
+    }
   }
 
   // Visit globals last, since the module-level use-list block will be seen
@@ -409,6 +423,41 @@ ValueEnumerator::ValueEnumerator(const Module &M,
 
     for (const BasicBlock &BB : F)
       for (const Instruction &I : BB) {
+        // Local metadata is enumerated during function-incorporation, but
+        // any ConstantAsMetadata arguments in a DIArgList should be examined
+        // now.
+        auto EnumerateNonLocalValuesFromMetadata = [&](Metadata *MD) {
+          assert(MD && "Metadata unexpectedly null");
+          if (const auto *AL = dyn_cast<DIArgList>(MD)) {
+            for (const auto *VAM : AL->getArgs()) {
+              if (isa<ConstantAsMetadata>(VAM))
+                EnumerateMetadata(&F, VAM);
+            }
+            return;
+          }
+
+          if (!isa<LocalAsMetadata>(MD))
+            EnumerateMetadata(&F, MD);
+        };
+
+        for (DbgRecord &DR : I.getDbgRecordRange()) {
+          if (DPLabel *DPL = dyn_cast<DPLabel>(&DR)) {
+            EnumerateMetadata(&F, DPL->getLabel());
+            EnumerateMetadata(&F, &*DPL->getDebugLoc());
+            continue;
+          }
+          // Enumerate non-local location metadata.
+          DbgVariableRecord &DVR = cast<DbgVariableRecord>(DR);
+          EnumerateNonLocalValuesFromMetadata(DVR.getRawLocation());
+          EnumerateMetadata(&F, DVR.getExpression());
+          EnumerateMetadata(&F, DVR.getVariable());
+          EnumerateMetadata(&F, &*DVR.getDebugLoc());
+          if (DVR.isDbgAssign()) {
+            EnumerateNonLocalValuesFromMetadata(DVR.getRawAddress());
+            EnumerateMetadata(&F, DVR.getAssignID());
+            EnumerateMetadata(&F, DVR.getAddressExpression());
+          }
+        }
         for (const Use &Op : I.operands()) {
           auto *MD = dyn_cast<MetadataAsValue>(&Op);
           if (!MD) {
@@ -416,19 +465,7 @@ ValueEnumerator::ValueEnumerator(const Module &M,
             continue;
           }
 
-          // Local metadata is enumerated during function-incorporation, but
-          // any ConstantAsMetadata arguments in a DIArgList should be examined
-          // now.
-          if (isa<LocalAsMetadata>(MD->getMetadata()))
-            continue;
-          if (auto *AL = dyn_cast<DIArgList>(MD->getMetadata())) {
-            for (auto *VAM : AL->getArgs())
-              if (isa<ConstantAsMetadata>(VAM))
-                EnumerateMetadata(&F, VAM);
-            continue;
-          }
-
-          EnumerateMetadata(&F, MD->getMetadata());
+          EnumerateNonLocalValuesFromMetadata(MD->getMetadata());
         }
         if (auto *SVI = dyn_cast<ShuffleVectorInst>(&I))
           EnumerateType(SVI->getShuffleMaskForBitcode()->getType());
@@ -1064,27 +1101,43 @@ void ValueEnumerator::incorporateFunction(const Function &F) {
 
   SmallVector<LocalAsMetadata *, 8> FnLocalMDVector;
   SmallVector<DIArgList *, 8> ArgListMDVector;
+
+  auto AddFnLocalMetadata = [&](Metadata *MD) {
+    if (!MD)
+      return;
+    if (auto *Local = dyn_cast<LocalAsMetadata>(MD)) {
+      // Enumerate metadata after the instructions they might refer to.
+      FnLocalMDVector.push_back(Local);
+    } else if (auto *ArgList = dyn_cast<DIArgList>(MD)) {
+      ArgListMDVector.push_back(ArgList);
+      for (ValueAsMetadata *VMD : ArgList->getArgs()) {
+        if (auto *Local = dyn_cast<LocalAsMetadata>(VMD)) {
+          // Enumerate metadata after the instructions they might refer
+          // to.
+          FnLocalMDVector.push_back(Local);
+        }
+      }
+    }
+  };
+
   // Add all of the instructions.
   for (const BasicBlock &BB : F) {
     for (const Instruction &I : BB) {
       for (const Use &OI : I.operands()) {
-        if (auto *MD = dyn_cast<MetadataAsValue>(&OI)) {
-          if (auto *Local = dyn_cast<LocalAsMetadata>(MD->getMetadata())) {
-            // Enumerate metadata after the instructions they might refer to.
-            FnLocalMDVector.push_back(Local);
-          } else if (auto *ArgList = dyn_cast<DIArgList>(MD->getMetadata())) {
-            ArgListMDVector.push_back(ArgList);
-            for (ValueAsMetadata *VMD : ArgList->getArgs()) {
-              if (auto *Local = dyn_cast<LocalAsMetadata>(VMD)) {
-                // Enumerate metadata after the instructions they might refer
-                // to.
-                FnLocalMDVector.push_back(Local);
-              }
-            }
-          }
+        if (auto *MD = dyn_cast<MetadataAsValue>(&OI))
+          AddFnLocalMetadata(MD->getMetadata());
+      }
+      /// RemoveDIs: Add non-instruction function-local metadata uses.
+      for (DbgVariableRecord &DVR : filterDbgVars(I.getDbgRecordRange())) {
+        assert(DVR.getRawLocation() &&
+               "DbgVariableRecord location unexpectedly null");
+        AddFnLocalMetadata(DVR.getRawLocation());
+        if (DVR.isDbgAssign()) {
+          assert(DVR.getRawAddress() &&
+                 "DbgVariableRecord location unexpectedly null");
+          AddFnLocalMetadata(DVR.getRawAddress());
         }
       }
-
       if (!I.getType()->isVoidTy())
         EnumerateValue(&I);
     }
diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
index 0efe7a0e733672..a15538755d73b3 100644
--- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
@@ -538,8 +538,10 @@ bool AsmPrinter::doInitialization(Module &M) {
   if (!M.getModuleInlineAsm().empty()) {
     OutStreamer->AddComment("Start of file scope inline assembly");
     OutStreamer->addBlankLine();
-    emitInlineAsm(M.getModuleInlineAsm() + "\n", *TM.getMCSubtargetInfo(),
-                  TM.Options.MCOptions);
+    emitInlineAsm(
+        M.getModuleInlineAsm() + "\n", *TM.getMCSubtargetInfo(),
+        TM.Options.MCOptions, nullptr,
+        InlineAsm::AsmDialect(TM.getMCAsmInfo()->getAssemblerDialect()));
     OutStreamer->AddComment("End of file scope inline assembly");
     OutStreamer->addBlankLine();
   }
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
index d462859e489465..c40beeeb925e0e 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
@@ -803,6 +803,18 @@ void DwarfUnit::constructTypeDIE(DIE &Buffer, const DIDerivedType *DTy) {
   if (DTy->getDWARFAddressSpace())
     addUInt(Buffer, dwarf::DW_AT_address_class, dwarf::DW_FORM_data4,
             *DTy->getDWARFAddressSpace());
+  if (auto PtrAuthData = DTy->getPtrAuthData()) {
+    addUInt(Buffer, dwarf::DW_AT_LLVM_ptrauth_key, dwarf::DW_FORM_data1,
+            PtrAuthData->key());
+    if (PtrAuthData->isAddressDiscriminated())
+      addFlag(Buffer, dwarf::DW_AT_LLVM_ptrauth_address_discriminated);
+    addUInt(Buffer, dwarf::DW_AT_LLVM_ptrauth_extra_discriminator,
+            dwarf::DW_FORM_data2, PtrAuthData->extraDiscriminator());
+    if (PtrAuthData->isaPointer())
+      addFlag(Buffer, dwarf::DW_AT_LLVM_ptrauth_isa_pointer);
+    if (PtrAuthData->authenticatesNullValues())
+      addFlag(Buffer, dwarf::DW_AT_LLVM_ptrauth_authenticates_null_values);
+  }
 }
 
 void DwarfUnit::constructSubprogramArguments(DIE &Buffer, DITypeRefArray Args) {
diff --git a/llvm/lib/CodeGen/AssignmentTrackingAnalysis.cpp b/llvm/lib/CodeGen/AssignmentTrackingAnalysis.cpp
index 52774c7d73225f..09177950fc8246 100644
--- a/llvm/lib/CodeGen/AssignmentTrackingAnalysis.cpp
+++ b/llvm/lib/CodeGen/AssignmentTrackingAnalysis.cpp
@@ -215,7 +215,8 @@ void FunctionVarLocs::init(FunctionVarLocsBuilder &Builder) {
 
   // Insert a contiguous block of VarLocInfos for each instruction, mapping it
   // to the start and end position in the vector with VarLocsBeforeInst. This
-  // block includes VarLocs for any DPValues attached to that instruction.
+  // block includes VarLocs for any DbgVariableRecords attached to that
+  // instruction.
   for (auto &P : Builder.VarLocsBeforeInst) {
     // Process VarLocs attached to a DbgRecord alongside their marker
     // Instruction.
@@ -226,12 +227,12 @@ void FunctionVarLocs::init(FunctionVarLocsBuilder &Builder) {
     // Any VarLocInfos attached to a DbgRecord should now be remapped to their
     // marker Instruction, in order of DbgRecord appearance and prior to any
     // VarLocInfos attached directly to that instruction.
-    for (const DPValue &DPV : filterDbgVars(I->getDbgRecordRange())) {
-      // Even though DPV defines a variable location, VarLocsBeforeInst can
+    for (const DbgVariableRecord &DVR : filterDbgVars(I->getDbgRecordRange())) {
+      // Even though DVR defines a variable location, VarLocsBeforeInst can
       // still be empty if that VarLoc was redundant.
-      if (!Builder.VarLocsBeforeInst.count(&DPV))
+      if (!Builder.VarLocsBeforeInst.count(&DVR))
         continue;
-      for (const VarLocInfo &VarLoc : Builder.VarLocsBeforeInst[&DPV])
+      for (const VarLocInfo &VarLoc : Builder.VarLocsBeforeInst[&DVR])
         VarLocRecords.emplace_back(VarLoc);
     }
     for (const VarLocInfo &VarLoc : P.second)
@@ -830,10 +831,10 @@ class MemLocFragmentFill {
   void process(BasicBlock &BB, VarFragMap &LiveSet) {
     BBInsertBeforeMap[&BB].clear();
     for (auto &I : BB) {
-      for (DPValue &DPV : filterDbgVars(I.getDbgRecordRange())) {
-        if (const auto *Locs = FnVarLocs->getWedge(&DPV)) {
+      for (DbgVariableRecord &DVR : filterDbgVars(I.getDbgRecordRange())) {
+        if (const auto *Locs = FnVarLocs->getWedge(&DVR)) {
           for (const VarLocInfo &Loc : *Locs) {
-            addDef(Loc, &DPV, *I.getParent(), LiveSet);
+            addDef(Loc, &DVR, *I.getParent(), LiveSet);
           }
         }
       }
@@ -1027,7 +1028,7 @@ class AssignmentTrackingLowering {
   /// i.e. for all values x and y where x != y:
   /// join(x, x) = x
   /// join(x, y) = NoneOrPhi
-  using AssignRecord = PointerUnion<DbgAssignIntrinsic *, DPValue *>;
+  using AssignRecord = PointerUnion<DbgAssignIntrinsic *, DbgVariableRecord *>;
   struct Assignment {
     enum S { Known, NoneOrPhi } Status;
     /// ID of the assignment. nullptr if Status is not Known.
@@ -1054,16 +1055,16 @@ class AssignmentTrackingLowering {
       else if (isa<DbgAssignIntrinsic *>(Source))
         OS << Source.get<DbgAssignIntrinsic *>();
       else
-        OS << Source.get<DPValue *>();
+        OS << Source.get<DbgVariableRecord *>();
       OS << ")";
     }
 
     static Assignment make(DIAssignID *ID, DbgAssignIntrinsic *Source) {
       return Assignment(Known, ID, Source);
     }
-    static Assignment make(DIAssignID *ID, DPValue *Source) {
+    static Assignment make(DIAssignID *ID, DbgVariableRecord *Source) {
       assert(Source->isDbgAssign() &&
-             "Cannot make an assignment from a non-assign DPValue");
+             "Cannot make an assignment from a non-assign DbgVariableRecord");
       return Assignment(Known, ID, Source);
     }
     static Assignment make(DIAssignID *ID, AssignRecord Source) {
@@ -1084,7 +1085,7 @@ class AssignmentTrackingLowering {
       // If the Status is Known then we expect there to be an assignment ID.
       assert(Status == NoneOrPhi || ID);
     }
-    Assignment(S Status, DIAssignID *ID, DPValue *Source)
+    Assignment(S Status, DIAssignID *ID, DbgVariableRecord *Source)
         : Status(Status), ID(ID), Source(Source) {
       // If the Status is Known then we expect there to be an assignment ID.
       assert(Status == NoneOrPhi || ID);
@@ -1119,10 +1120,10 @@ class AssignmentTrackingLowering {
   /// Clear the location definitions currently cached for insertion after /p
   /// After.
   void resetInsertionPoint(Instruction &After);
-  void resetInsertionPoint(DPValue &After);
+  void resetInsertionPoint(DbgVariableRecord &After);
 
   // emitDbgValue can be called with:
-  //   Source=[AssignRecord|DbgValueInst*|DbgAssignIntrinsic*|DPValue*]
+  //   Source=[AssignRecord|DbgValueInst*|DbgAssignIntrinsic*|DbgVariableRecord*]
   // Since AssignRecord can be cast to one of the latter two types, and all
   // other types have a shared interface, we use a template to handle the latter
   // three types, and an explicit overload for AssignRecord that forwards to
@@ -1355,9 +1356,10 @@ class AssignmentTrackingLowering {
   /// attachment, \p I.
   void processUntaggedInstruction(Instruction &I, BlockInfo *LiveSet);
   void processDbgAssign(AssignRecord Assign, BlockInfo *LiveSet);
-  void processDPValue(DPValue &DPV, BlockInfo *LiveSet);
-  void processDbgValue(PointerUnion<DbgValueInst *, DPValue *> DbgValueRecord,
-                       BlockInfo *LiveSet);
+  void processDbgVariableRecord(DbgVariableRecord &DVR, BlockInfo *LiveSet);
+  void processDbgValue(
+      PointerUnion<DbgValueInst *, DbgVariableRecord *> DbgValueRecord,
+      BlockInfo *LiveSet);
   /// Add an assignment to memory for the variable /p Var.
   void addMemDef(BlockInfo *LiveSet, VariableID Var, const Assignment &AV);
   /// Add an assignment to the variable /p Var.
@@ -1457,10 +1459,10 @@ static DIAssignID *getIDFromMarker(const DbgAssignIntrinsic &DAI) {
   return cast<DIAssignID>(DAI.getAssignID());
 }
 
-static DIAssignID *getIDFromMarker(const DPValue &DPV) {
-  assert(DPV.isDbgAssign() &&
-         "Cannot get a DIAssignID from a non-assign DPValue!");
-  return DPV.getAssignID();
+static DIAssignID *getIDFromMarker(const DbgVariableRecord &DVR) {
+  assert(DVR.isDbgAssign() &&
+         "Cannot get a DIAssignID from a non-assign DbgVariableRecord!");
+  return DVR.getAssignID();
 }
 
 /// Return true if \p Var has an assignment in \p M matching \p AV.
@@ -1493,10 +1495,10 @@ const char *locStr(AssignmentTrackingLowering::LocKind Loc) {
 }
 #endif
 
-VarLocInsertPt getNextNode(const DbgRecord *DPV) {
-  auto NextIt = ++(DPV->getIterator());
-  if (NextIt == DPV->getMarker()->getDbgRecordRange().end())
-    return DPV->getMarker()->MarkedInstr;
+VarLocInsertPt getNextNode(const DbgRecord *DVR) {
+  auto NextIt = ++(DVR->getIterator());
+  if (NextIt == DVR->getMarker()->getDbgRecordRange().end())
+    return DVR->getMarker()->MarkedInstr;
   return &*NextIt;
 }
 VarLocInsertPt getNextNode(const Instruction *Inst) {
@@ -1515,10 +1517,10 @@ DbgAssignIntrinsic *CastToDbgAssign(DbgVariableIntrinsic *DVI) {
   return cast<DbgAssignIntrinsic>(DVI);
 }
 
-DPValue *CastToDbgAssign(DPValue *DPV) {
-  assert(DPV->isDbgAssign() &&
-         "Attempted to cast non-assign DPValue to DPVAssign.");
-  return DPV;
+DbgVariableRecord *CastToDbgAssign(DbgVariableRecord *DVR) {
+  assert(DVR->isDbgAssign() &&
+         "Attempted to cast non-assign DbgVariableRecord to DVRAssign.");
+  return DVR;
 }
 
 void AssignmentTrackingLowering::emitDbgValue(
@@ -1527,7 +1529,7 @@ void AssignmentTrackingLowering::emitDbgValue(
   if (isa<DbgAssignIntrinsic *>(Source))
     emitDbgValue(Kind, cast<DbgAssignIntrinsic *>(Source), After);
   else
-    emitDbgValue(Kind, cast<DPValue *>(Source), After);
+    emitDbgValue(Kind, cast<DbgVariableRecord *>(Source), After);
 }
 template <typename T>
 void AssignmentTrackingLowering::emitDbgValue(
@@ -1674,7 +1676,7 @@ void AssignmentTrackingLowering::processUntaggedInstruction(
 void AssignmentTrackingLowering::processTaggedInstruction(
     Instruction &I, AssignmentTrackingLowering::BlockInfo *LiveSet) {
   auto Linked = at::getAssignmentMarkers(&I);
-  auto LinkedDPAssigns = at::getDPVAssignmentMarkers(&I);
+  auto LinkedDPAssigns = at::getDVRAssignmentMarkers(&I);
   // No dbg.assign intrinsics linked.
   // FIXME: All vars that have a stack slot this store modifies that don't have
   // a dbg.assign linked to it should probably treat this like an untagged
@@ -1757,8 +1759,8 @@ void AssignmentTrackingLowering::processTaggedInstruction(
   };
   for (DbgAssignIntrinsic *DAI : Linked)
     ProcessLinkedAssign(DAI);
-  for (DPValue *DPV : LinkedDPAssigns)
-    ProcessLinkedAssign(DPV);
+  for (DbgVariableRecord *DVR : LinkedDPAssigns)
+    ProcessLinkedAssign(DVR);
 }
 
 void AssignmentTrackingLowering::processDbgAssign(AssignRecord Assign,
@@ -1803,13 +1805,13 @@ void AssignmentTrackingLowering::processDbgAssign(AssignRecord Assign,
       emitDbgValue(LocKind::Val, DbgAssign, DbgAssign);
     }
   };
-  if (isa<DPValue *>(Assign))
-    return ProcessDbgAssignImpl(cast<DPValue *>(Assign));
+  if (isa<DbgVariableRecord *>(Assign))
+    return ProcessDbgAssignImpl(cast<DbgVariableRecord *>(Assign));
   return ProcessDbgAssignImpl(cast<DbgAssignIntrinsic *>(Assign));
 }
 
 void AssignmentTrackingLowering::processDbgValue(
-    PointerUnion<DbgValueInst *, DPValue *> DbgValueRecord,
+    PointerUnion<DbgValueInst *, DbgVariableRecord *> DbgValueRecord,
     BlockInfo *LiveSet) {
   auto ProcessDbgValueImpl = [&](auto *DbgValue) {
     // Only other tracking variables that are at some point stack homed.
@@ -1834,8 +1836,8 @@ void AssignmentTrackingLowering::processDbgValue(
     setLocKind(LiveSet, Var, LocKind::Val);
     emitDbgValue(LocKind::Val, DbgValue, DbgValue);
   };
-  if (isa<DPValue *>(DbgValueRecord))
-    return ProcessDbgValueImpl(cast<DPValue *>(DbgValueRecord));
+  if (isa<DbgVariableRecord *>(DbgValueRecord))
+    return ProcessDbgValueImpl(cast<DbgVariableRecord *>(DbgValueRecord));
   return ProcessDbgValueImpl(cast<DbgValueInst *>(DbgValueRecord));
 }
 
@@ -1860,16 +1862,16 @@ void AssignmentTrackingLowering::processDbgInstruction(
   else if (auto *DVI = dyn_cast<DbgValueInst>(&I))
     processDbgValue(DVI, LiveSet);
 }
-void AssignmentTrackingLowering::processDPValue(
-    DPValue &DPV, AssignmentTrackingLowering::BlockInfo *LiveSet) {
+void AssignmentTrackingLowering::processDbgVariableRecord(
+    DbgVariableRecord &DVR, AssignmentTrackingLowering::BlockInfo *LiveSet) {
   // Ignore assignments to zero bits of the variable.
-  if (hasZeroSizedFragment(DPV))
+  if (hasZeroSizedFragment(DVR))
     return;
 
-  if (DPV.isDbgAssign())
-    processDbgAssign(&DPV, LiveSet);
-  else if (DPV.isDbgValue())
-    processDbgValue(&DPV, LiveSet);
+  if (DVR.isDbgAssign())
+    processDbgAssign(&DVR, LiveSet);
+  else if (DVR.isDbgValue())
+    processDbgValue(&DVR, LiveSet);
 }
 
 void AssignmentTrackingLowering::resetInsertionPoint(Instruction &After) {
@@ -1879,7 +1881,7 @@ void AssignmentTrackingLowering::resetInsertionPoint(Instruction &After) {
     return;
   R->second.clear();
 }
-void AssignmentTrackingLowering::resetInsertionPoint(DPValue &After) {
+void AssignmentTrackingLowering::resetInsertionPoint(DbgVariableRecord &After) {
   auto *R = InsertBeforeMap.find(getNextNode(&After));
   if (R == InsertBeforeMap.end())
     return;
@@ -1919,9 +1921,9 @@ void AssignmentTrackingLowering::process(BasicBlock &BB, BlockInfo *LiveSet) {
       // Skip over non-variable debug records (i.e., labels). They're going to
       // be read from IR (possibly re-ordering them within the debug record
       // range) rather than from the analysis results.
-      for (DPValue &DPV : filterDbgVars(II->getDbgRecordRange())) {
-        resetInsertionPoint(DPV);
-        processDPValue(DPV, LiveSet);
+      for (DbgVariableRecord &DVR : filterDbgVars(II->getDbgRecordRange())) {
+        resetInsertionPoint(DVR);
+        processDbgVariableRecord(DVR, LiveSet);
         assert(LiveSet->isValid());
       }
     }
@@ -2000,9 +2002,11 @@ AssignmentTrackingLowering::joinAssignment(const Assignment &A,
       return A.Source;
     if (!A.Source || !B.Source)
       return AssignRecord();
-    assert(isa<DPValue *>(A.Source) == isa<DPValue *>(B.Source));
-    if (isa<DPValue *>(A.Source) &&
-        cast<DPValue *>(A.Source)->isEquivalentTo(*cast<DPValue *>(B.Source)))
+    assert(isa<DbgVariableRecord *>(A.Source) ==
+           isa<DbgVariableRecord *>(B.Source));
+    if (isa<DbgVariableRecord *>(A.Source) &&
+        cast<DbgVariableRecord *>(A.Source)->isEquivalentTo(
+            *cast<DbgVariableRecord *>(B.Source)))
       return A.Source;
     if (isa<DbgAssignIntrinsic *>(A.Source) &&
         cast<DbgAssignIntrinsic *>(A.Source)->isIdenticalTo(
@@ -2123,8 +2127,8 @@ DbgDeclareInst *DynCastToDbgDeclare(DbgVariableIntrinsic *DVI) {
   return dyn_cast<DbgDeclareInst>(DVI);
 }
 
-DPValue *DynCastToDbgDeclare(DPValue *DPV) {
-  return DPV->isDbgDeclare() ? DPV : nullptr;
+DbgVariableRecord *DynCastToDbgDeclare(DbgVariableRecord *DVR) {
+  return DVR->isDbgDeclare() ? DVR : nullptr;
 }
 
 /// Build a map of {Variable x: Variables y} where all variable fragments
@@ -2161,7 +2165,7 @@ static AssignmentTrackingLowering::OverlapMap buildOverlapMapAndRecordDeclares(
   // We need to add fragments for untagged stores too so that we can correctly
   // clobber overlapped fragment locations later.
   SmallVector<DbgDeclareInst *> InstDeclares;
-  SmallVector<DPValue *> DPDeclares;
+  SmallVector<DbgVariableRecord *> DPDeclares;
   auto ProcessDbgRecord = [&](auto *Record, auto &DeclareList) {
     if (auto *Declare = DynCastToDbgDeclare(Record)) {
       DeclareList.push_back(Declare);
@@ -2176,8 +2180,8 @@ static AssignmentTrackingLowering::OverlapMap buildOverlapMapAndRecordDeclares(
   };
   for (auto &BB : Fn) {
     for (auto &I : BB) {
-      for (DPValue &DPV : filterDbgVars(I.getDbgRecordRange()))
-        ProcessDbgRecord(&DPV, DPDeclares);
+      for (DbgVariableRecord &DVR : filterDbgVars(I.getDbgRecordRange()))
+        ProcessDbgRecord(&DVR, DPDeclares);
       if (auto *DII = dyn_cast<DbgVariableIntrinsic>(&I)) {
         ProcessDbgRecord(DII, InstDeclares);
       } else if (auto Info = getUntaggedStoreAssignmentInfo(
@@ -2218,8 +2222,8 @@ static AssignmentTrackingLowering::OverlapMap buildOverlapMapAndRecordDeclares(
         };
         for (DbgAssignIntrinsic *DAI : at::getAssignmentMarkers(Info->Base))
           HandleDbgAssignForStore(DAI);
-        for (DPValue *DPV : at::getDPVAssignmentMarkers(Info->Base))
-          HandleDbgAssignForStore(DPV);
+        for (DbgVariableRecord *DVR : at::getDVRAssignmentMarkers(Info->Base))
+          HandleDbgAssignForStore(DVR);
       }
     }
   }
@@ -2269,10 +2273,10 @@ static AssignmentTrackingLowering::OverlapMap buildOverlapMapAndRecordDeclares(
   for (auto *DDI : InstDeclares)
     FnVarLocs->addSingleLocVar(DebugVariable(DDI), DDI->getExpression(),
                                DDI->getDebugLoc(), DDI->getWrappedLocation());
-  for (auto *DPV : DPDeclares)
-    FnVarLocs->addSingleLocVar(DebugVariable(DPV), DPV->getExpression(),
-                               DPV->getDebugLoc(),
-                               RawLocationWrapper(DPV->getRawLocation()));
+  for (auto *DVR : DPDeclares)
+    FnVarLocs->addSingleLocVar(DebugVariable(DVR), DVR->getExpression(),
+                               DVR->getDebugLoc(),
+                               RawLocationWrapper(DVR->getRawLocation()));
   return Map;
 }
 
@@ -2466,9 +2470,9 @@ bool AssignmentTrackingLowering::emitPromotedVarLocs(
   for (auto &BB : Fn) {
     for (auto &I : BB) {
       // Skip instructions other than dbg.values and dbg.assigns.
-      for (DPValue &DPV : filterDbgVars(I.getDbgRecordRange()))
-        if (DPV.isDbgValue() || DPV.isDbgAssign())
-          TranslateDbgRecord(&DPV);
+      for (DbgVariableRecord &DVR : filterDbgVars(I.getDbgRecordRange()))
+        if (DVR.isDbgValue() || DVR.isDbgAssign())
+          TranslateDbgRecord(&DVR);
       auto *DVI = dyn_cast<DbgValueInst>(&I);
       if (DVI)
         TranslateDbgRecord(DVI);
@@ -2568,8 +2572,8 @@ removeRedundantDbgLocsUsingBackwardScan(const BasicBlock *BB,
       }
     };
     HandleLocsForWedge(&I);
-    for (DPValue &DPV : reverse(filterDbgVars(I.getDbgRecordRange())))
-      HandleLocsForWedge(&DPV);
+    for (DbgVariableRecord &DVR : reverse(filterDbgVars(I.getDbgRecordRange())))
+      HandleLocsForWedge(&DVR);
   }
 
   return Changed;
@@ -2633,8 +2637,8 @@ removeRedundantDbgLocsUsingForwardScan(const BasicBlock *BB,
       }
     };
 
-    for (DPValue &DPV : filterDbgVars(I.getDbgRecordRange()))
-      HandleLocsForWedge(&DPV);
+    for (DbgVariableRecord &DVR : filterDbgVars(I.getDbgRecordRange()))
+      HandleLocsForWedge(&DVR);
     HandleLocsForWedge(&I);
   }
 
@@ -2719,8 +2723,8 @@ removeUndefDbgLocsFromEntryBlock(const BasicBlock *BB,
         Changed = true;
       }
     };
-    for (DPValue &DPV : filterDbgVars(I.getDbgRecordRange()))
-      HandleLocsForWedge(&DPV);
+    for (DbgVariableRecord &DVR : filterDbgVars(I.getDbgRecordRange()))
+      HandleLocsForWedge(&DVR);
     HandleLocsForWedge(&I);
   }
 
@@ -2753,8 +2757,8 @@ static DenseSet<DebugAggregate> findVarsWithStackSlot(Function &Fn) {
       for (DbgAssignIntrinsic *DAI : at::getAssignmentMarkers(&I)) {
         Result.insert({DAI->getVariable(), DAI->getDebugLoc().getInlinedAt()});
       }
-      for (DPValue *DPV : at::getDPVAssignmentMarkers(&I)) {
-        Result.insert({DPV->getVariable(), DPV->getDebugLoc().getInlinedAt()});
+      for (DbgVariableRecord *DVR : at::getDVRAssignmentMarkers(&I)) {
+        Result.insert({DVR->getVariable(), DVR->getDebugLoc().getInlinedAt()});
       }
     }
   }
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index bece70a6e008bc..9f99bb7e693f7e 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -445,8 +445,8 @@ class CodeGenPrepare {
   bool optimizeExtractElementInst(Instruction *Inst);
   bool dupRetToEnableTailCallOpts(BasicBlock *BB, ModifyDT &ModifiedDT);
   bool fixupDbgValue(Instruction *I);
-  bool fixupDPValue(DPValue &I);
-  bool fixupDPValuesOnInst(Instruction &I);
+  bool fixupDbgVariableRecord(DbgVariableRecord &I);
+  bool fixupDbgVariableRecordsOnInst(Instruction &I);
   bool placeDbgValues(Function &F);
   bool placePseudoProbes(Function &F);
   bool canFormExtLd(const SmallVectorImpl<Instruction *> &MovedExts,
@@ -1943,6 +1943,39 @@ static bool swapICmpOperandsToExposeCSEOpportunities(CmpInst *Cmp) {
   return false;
 }
 
+static bool foldFCmpToFPClassTest(CmpInst *Cmp, const TargetLowering &TLI,
+                                  const DataLayout &DL) {
+  FCmpInst *FCmp = dyn_cast<FCmpInst>(Cmp);
+  if (!FCmp)
+    return false;
+
+  // Don't fold if the target offers free fabs and the predicate is legal.
+  EVT VT = TLI.getValueType(DL, Cmp->getOperand(0)->getType());
+  if (TLI.isFAbsFree(VT) &&
+      TLI.isCondCodeLegal(getFCmpCondCode(FCmp->getPredicate()),
+                          VT.getSimpleVT()))
+    return false;
+
+  // Reverse the canonicalization if it is a FP class test
+  auto ShouldReverseTransform = [](FPClassTest ClassTest) {
+    return ClassTest == fcInf || ClassTest == (fcInf | fcNan);
+  };
+  auto [ClassVal, ClassTest] =
+      fcmpToClassTest(FCmp->getPredicate(), *FCmp->getParent()->getParent(),
+                      FCmp->getOperand(0), FCmp->getOperand(1));
+  if (!ClassVal)
+    return false;
+
+  if (!ShouldReverseTransform(ClassTest) && !ShouldReverseTransform(~ClassTest))
+    return false;
+
+  IRBuilder<> Builder(Cmp);
+  Value *IsFPClass = Builder.createIsFPClass(ClassVal, ClassTest);
+  Cmp->replaceAllUsesWith(IsFPClass);
+  RecursivelyDeleteTriviallyDeadInstructions(Cmp);
+  return true;
+}
+
 bool CodeGenPrepare::optimizeCmp(CmpInst *Cmp, ModifyDT &ModifiedDT) {
   if (sinkCmpExpression(Cmp, *TLI))
     return true;
@@ -1959,6 +1992,9 @@ bool CodeGenPrepare::optimizeCmp(CmpInst *Cmp, ModifyDT &ModifiedDT) {
   if (swapICmpOperandsToExposeCSEOpportunities(Cmp))
     return true;
 
+  if (foldFCmpToFPClassTest(Cmp, *TLI, *DL))
+    return true;
+
   return false;
 }
 
@@ -2021,9 +2057,9 @@ static bool sinkAndCmp0Expression(Instruction *AndI, const TargetLowering &TLI,
     // Keep the 'and' in the same place if the use is already in the same block.
     Instruction *InsertPt =
         User->getParent() == AndI->getParent() ? AndI : User;
-    Instruction *InsertedAnd =
-        BinaryOperator::Create(Instruction::And, AndI->getOperand(0),
-                               AndI->getOperand(1), "", InsertPt);
+    Instruction *InsertedAnd = BinaryOperator::Create(
+        Instruction::And, AndI->getOperand(0), AndI->getOperand(1), "",
+        InsertPt->getIterator());
     // Propagate the debug info.
     InsertedAnd->setDebugLoc(AndI->getDebugLoc());
 
@@ -3187,7 +3223,7 @@ class TypePromotionTransaction {
     /// Keep track of the debug users.
     SmallVector<DbgValueInst *, 1> DbgValues;
     /// And non-instruction debug-users too.
-    SmallVector<DPValue *, 1> DPValues;
+    SmallVector<DbgVariableRecord *, 1> DbgVariableRecords;
 
     /// Keep track of the new value so that we can undo it by replacing
     /// instances of the new value with the original value.
@@ -3208,7 +3244,7 @@ class TypePromotionTransaction {
       }
       // Record the debug uses separately. They are not in the instruction's
       // use list, but they are replaced by RAUW.
-      findDbgValues(DbgValues, Inst, &DPValues);
+      findDbgValues(DbgValues, Inst, &DbgVariableRecords);
 
       // Now, we can replace the uses.
       Inst->replaceAllUsesWith(New);
@@ -3225,10 +3261,10 @@ class TypePromotionTransaction {
       // correctness and utility of debug value instructions.
       for (auto *DVI : DbgValues)
         DVI->replaceVariableLocationOp(New, Inst);
-      // Similar story with DPValues, the non-instruction representation of
-      // dbg.values.
-      for (DPValue *DPV : DPValues) // tested by transaction-test I'm adding
-        DPV->replaceVariableLocationOp(New, Inst);
+      // Similar story with DbgVariableRecords, the non-instruction
+      // representation of dbg.values.
+      for (DbgVariableRecord *DVR : DbgVariableRecords)
+        DVR->replaceVariableLocationOp(New, Inst);
     }
   };
 
@@ -4117,9 +4153,10 @@ class AddressingModeCombiner {
       if (SelectInst *CurrentSelect = dyn_cast<SelectInst>(Current)) {
         // Is it OK to get metadata from OrigSelect?!
         // Create a Select placeholder with dummy value.
-        SelectInst *Select = SelectInst::Create(
-            CurrentSelect->getCondition(), Dummy, Dummy,
-            CurrentSelect->getName(), CurrentSelect, CurrentSelect);
+        SelectInst *Select =
+            SelectInst::Create(CurrentSelect->getCondition(), Dummy, Dummy,
+                               CurrentSelect->getName(),
+                               CurrentSelect->getIterator(), CurrentSelect);
         Map[Current] = Select;
         ST.insertNewSelect(Select);
         // We are interested in True and False values.
@@ -6430,8 +6467,8 @@ bool CodeGenPrepare::optimizePhiType(
       ValMap[D] = D->getOperand(0);
       DeletedInstrs.insert(D);
     } else {
-      ValMap[D] =
-          new BitCastInst(D, ConvertTy, D->getName() + ".bc", D->getNextNode());
+      BasicBlock::iterator insertPt = std::next(D->getIterator());
+      ValMap[D] = new BitCastInst(D, ConvertTy, D->getName() + ".bc", insertPt);
     }
   }
   for (PHINode *Phi : PhiNodes)
@@ -6451,8 +6488,8 @@ bool CodeGenPrepare::optimizePhiType(
       DeletedInstrs.insert(U);
       replaceAllUsesWith(U, ValMap[U->getOperand(0)], FreshBBs, IsHugeFunc);
     } else {
-      U->setOperand(0,
-                    new BitCastInst(ValMap[U->getOperand(0)], PhiTy, "bc", U));
+      U->setOperand(0, new BitCastInst(ValMap[U->getOperand(0)], PhiTy, "bc",
+                                       U->getIterator()));
     }
   }
 
@@ -7080,9 +7117,9 @@ bool CodeGenPrepare::optimizeSelectInst(SelectInst *SI) {
   CurInstIterator = std::next(LastSI->getIterator());
   // Examine debug-info attached to the consecutive select instructions. They
   // won't be individually optimised by optimizeInst, so we need to perform
-  // DPValue maintenence here instead.
+  // DbgVariableRecord maintenence here instead.
   for (SelectInst *SI : ArrayRef(ASI).drop_front())
-    fixupDPValuesOnInst(*SI);
+    fixupDbgVariableRecordsOnInst(*SI);
 
   bool VectorCond = !SI->getCondition()->getType()->isIntegerTy(1);
 
@@ -8238,7 +8275,7 @@ static bool optimizeBranch(BranchInst *Branch, const TargetLowering &TLI,
 
 bool CodeGenPrepare::optimizeInst(Instruction *I, ModifyDT &ModifiedDT) {
   bool AnyChange = false;
-  AnyChange = fixupDPValuesOnInst(*I);
+  AnyChange = fixupDbgVariableRecordsOnInst(*I);
 
   // Bail out if we inserted the instruction to prevent optimizations from
   // stepping on each other's toes.
@@ -8348,7 +8385,7 @@ bool CodeGenPrepare::optimizeInst(Instruction *I, ModifyDT &ModifiedDT) {
     if (GEPI->hasAllZeroIndices()) {
       /// The GEP operand must be a pointer, so must its result -> BitCast
       Instruction *NC = new BitCastInst(GEPI->getOperand(0), GEPI->getType(),
-                                        GEPI->getName(), GEPI);
+                                        GEPI->getName(), GEPI->getIterator());
       NC->setDebugLoc(GEPI->getDebugLoc());
       replaceAllUsesWith(GEPI, NC, FreshBBs, IsHugeFunc);
       RecursivelyDeleteTriviallyDeadInstructions(
@@ -8380,7 +8417,7 @@ bool CodeGenPrepare::optimizeInst(Instruction *I, ModifyDT &ModifiedDT) {
                     isa<ConstantPointerNull>(Op1);
       if (Const0 || Const1) {
         if (!Const0 || !Const1) {
-          auto *F = new FreezeInst(Const0 ? Op1 : Op0, "", CmpI);
+          auto *F = new FreezeInst(Const0 ? Op1 : Op0, "", CmpI->getIterator());
           F->takeName(FI);
           CmpI->setOperand(Const0 ? 1 : 0, F);
         }
@@ -8504,24 +8541,24 @@ bool CodeGenPrepare::fixupDbgValue(Instruction *I) {
   return AnyChange;
 }
 
-bool CodeGenPrepare::fixupDPValuesOnInst(Instruction &I) {
+bool CodeGenPrepare::fixupDbgVariableRecordsOnInst(Instruction &I) {
   bool AnyChange = false;
-  for (DPValue &DPV : filterDbgVars(I.getDbgRecordRange()))
-    AnyChange |= fixupDPValue(DPV);
+  for (DbgVariableRecord &DVR : filterDbgVars(I.getDbgRecordRange()))
+    AnyChange |= fixupDbgVariableRecord(DVR);
   return AnyChange;
 }
 
 // FIXME: should updating debug-info really cause the "changed" flag to fire,
 // which can cause a function to be reprocessed?
-bool CodeGenPrepare::fixupDPValue(DPValue &DPV) {
-  if (DPV.Type != DPValue::LocationType::Value &&
-      DPV.Type != DPValue::LocationType::Assign)
+bool CodeGenPrepare::fixupDbgVariableRecord(DbgVariableRecord &DVR) {
+  if (DVR.Type != DbgVariableRecord::LocationType::Value &&
+      DVR.Type != DbgVariableRecord::LocationType::Assign)
     return false;
 
-  // Does this DPValue refer to a sunk address calculation?
+  // Does this DbgVariableRecord refer to a sunk address calculation?
   bool AnyChange = false;
-  SmallDenseSet<Value *> LocationOps(DPV.location_ops().begin(),
-                                     DPV.location_ops().end());
+  SmallDenseSet<Value *> LocationOps(DVR.location_ops().begin(),
+                                     DVR.location_ops().end());
   for (Value *Location : LocationOps) {
     WeakTrackingVH SunkAddrVH = SunkAddrs[Location];
     Value *SunkAddr = SunkAddrVH.pointsToAliveValue() ? SunkAddrVH : nullptr;
@@ -8531,7 +8568,7 @@ bool CodeGenPrepare::fixupDPValue(DPValue &DPV) {
       // of pointer being referred to; however this makes no difference to
       // debugging information, and we can't generate bitcasts that may affect
       // codegen.
-      DPV.replaceVariableLocationOp(Location, SunkAddr);
+      DVR.replaceVariableLocationOp(Location, SunkAddr);
       AnyChange = true;
     }
   }
@@ -8546,13 +8583,13 @@ static void DbgInserterHelper(DbgValueInst *DVI, Instruction *VI) {
     DVI->insertAfter(VI);
 }
 
-static void DbgInserterHelper(DPValue *DPV, Instruction *VI) {
-  DPV->removeFromParent();
+static void DbgInserterHelper(DbgVariableRecord *DVR, Instruction *VI) {
+  DVR->removeFromParent();
   BasicBlock *VIBB = VI->getParent();
   if (isa<PHINode>(VI))
-    VIBB->insertDbgRecordBefore(DPV, VIBB->getFirstInsertionPt());
+    VIBB->insertDbgRecordBefore(DVR, VIBB->getFirstInsertionPt());
   else
-    VIBB->insertDbgRecordAfter(DPV, VI);
+    VIBB->insertDbgRecordAfter(DVR, VI);
 }
 
 // A llvm.dbg.value may be using a value before its definition, due to
@@ -8617,13 +8654,13 @@ bool CodeGenPrepare::placeDbgValues(Function &F) {
         continue;
       }
 
-      // If this isn't a dbg.value, process any attached DPValue records
-      // attached to this instruction.
-      for (DPValue &DPV : llvm::make_early_inc_range(
+      // If this isn't a dbg.value, process any attached DbgVariableRecord
+      // records attached to this instruction.
+      for (DbgVariableRecord &DVR : llvm::make_early_inc_range(
                filterDbgVars(Insn.getDbgRecordRange()))) {
-        if (DPV.Type != DPValue::LocationType::Value)
+        if (DVR.Type != DbgVariableRecord::LocationType::Value)
           continue;
-        DbgProcessor(&DPV, &Insn);
+        DbgProcessor(&DVR, &Insn);
       }
     }
   }
diff --git a/llvm/lib/CodeGen/DFAPacketizer.cpp b/llvm/lib/CodeGen/DFAPacketizer.cpp
index 48bb4a07662e10..c16166a1d5e1c5 100644
--- a/llvm/lib/CodeGen/DFAPacketizer.cpp
+++ b/llvm/lib/CodeGen/DFAPacketizer.cpp
@@ -252,12 +252,13 @@ void VLIWPacketizerList::PacketizeMIs(MachineBasicBlock *MBB,
 bool VLIWPacketizerList::alias(const MachineMemOperand &Op1,
                                const MachineMemOperand &Op2,
                                bool UseTBAA) const {
-  if (!Op1.getValue() || !Op2.getValue())
+  if (!Op1.getValue() || !Op2.getValue() || !Op1.getSize().hasValue() ||
+      !Op2.getSize().hasValue())
     return true;
 
   int64_t MinOffset = std::min(Op1.getOffset(), Op2.getOffset());
-  int64_t Overlapa = Op1.getSize() + Op1.getOffset() - MinOffset;
-  int64_t Overlapb = Op2.getSize() + Op2.getOffset() - MinOffset;
+  int64_t Overlapa = Op1.getSize().getValue() + Op1.getOffset() - MinOffset;
+  int64_t Overlapb = Op2.getSize().getValue() + Op2.getOffset() - MinOffset;
 
   AliasResult AAResult =
       AA->alias(MemoryLocation(Op1.getValue(), Overlapa,
diff --git a/llvm/lib/CodeGen/DwarfEHPrepare.cpp b/llvm/lib/CodeGen/DwarfEHPrepare.cpp
index e7eb34d8e6518c..09e7cfb12bdbad 100644
--- a/llvm/lib/CodeGen/DwarfEHPrepare.cpp
+++ b/llvm/lib/CodeGen/DwarfEHPrepare.cpp
@@ -111,7 +111,8 @@ Value *DwarfEHPrepare::GetExceptionObject(ResumeInst *RI) {
   }
 
   if (!ExnObj)
-    ExnObj = ExtractValueInst::Create(RI->getOperand(0), 0, "exn.obj", RI);
+    ExnObj = ExtractValueInst::Create(RI->getOperand(0), 0, "exn.obj",
+                                      RI->getIterator());
 
   RI->eraseFromParent();
 
@@ -158,7 +159,7 @@ size_t DwarfEHPrepare::pruneUnreachableResumes(
       Resumes[ResumesLeft++] = RI;
     } else {
       BasicBlock *BB = RI->getParent();
-      new UnreachableInst(Ctx, RI);
+      new UnreachableInst(Ctx, RI->getIterator());
       RI->eraseFromParent();
       simplifyCFG(BB, *TTI, DTU);
     }
diff --git a/llvm/lib/CodeGen/ExpandLargeFpConvert.cpp b/llvm/lib/CodeGen/ExpandLargeFpConvert.cpp
index 78ad2a25d0e47a..308f13c19f7564 100644
--- a/llvm/lib/CodeGen/ExpandLargeFpConvert.cpp
+++ b/llvm/lib/CodeGen/ExpandLargeFpConvert.cpp
@@ -375,7 +375,7 @@ static void expandIToFP(Instruction *IToFP) {
   Value *Sub2 = Builder.CreateSub(Builder.getIntN(BitWidthNew, BitWidth - 1),
                                   FloatWidth == 128 ? Call : Cast);
   Value *Cmp3 = Builder.CreateICmpSGT(
-      Sub2, Builder.getIntN(BitWidthNew, FPMantissaWidth + 1));
+      Sub1, Builder.getIntN(BitWidthNew, FPMantissaWidth + 1));
   Builder.CreateCondBr(Cmp3, IfThen4, IfElse);
 
   // if.then4:
diff --git a/llvm/lib/CodeGen/FinalizeISel.cpp b/llvm/lib/CodeGen/FinalizeISel.cpp
index 329c9587e32122..978355f8eb1bbf 100644
--- a/llvm/lib/CodeGen/FinalizeISel.cpp
+++ b/llvm/lib/CodeGen/FinalizeISel.cpp
@@ -14,8 +14,10 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/CodeGen/TargetLowering.h"
 #include "llvm/CodeGen/TargetSubtargetInfo.h"
 #include "llvm/InitializePasses.h"
@@ -45,6 +47,7 @@ INITIALIZE_PASS(FinalizeISel, DEBUG_TYPE,
 
 bool FinalizeISel::runOnMachineFunction(MachineFunction &MF) {
   bool Changed = false;
+  const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
   const TargetLowering *TLI = MF.getSubtarget().getTargetLowering();
 
   // Iterate through each instruction in the function, looking for pseudos.
@@ -54,6 +57,12 @@ bool FinalizeISel::runOnMachineFunction(MachineFunction &MF) {
          MBBI != MBBE; ) {
       MachineInstr &MI = *MBBI++;
 
+      // Set AdjustsStack to true if the instruction selector emits a stack
+      // frame setup instruction or a stack aligning inlineasm.
+      if (MI.getOpcode() == TII->getCallFrameSetupOpcode() ||
+          MI.isStackAligningInlineAsm())
+        MF.getFrameInfo().setAdjustsStack(true);
+
       // If MI is a pseudo, expand it.
       if (MI.usesCustomInsertionHook()) {
         Changed = true;
diff --git a/llvm/lib/CodeGen/GCRootLowering.cpp b/llvm/lib/CodeGen/GCRootLowering.cpp
index 0d82f2bab8960b..700714d5398470 100644
--- a/llvm/lib/CodeGen/GCRootLowering.cpp
+++ b/llvm/lib/CodeGen/GCRootLowering.cpp
@@ -181,7 +181,7 @@ static bool InsertRootInitializers(Function &F, ArrayRef<AllocaInst *> Roots) {
     if (!InitedRoots.count(Root)) {
       new StoreInst(
           ConstantPointerNull::get(cast<PointerType>(Root->getAllocatedType())),
-          Root, Root->getNextNode());
+          Root, std::next(Root->getIterator()));
       MadeChange = true;
     }
 
@@ -216,8 +216,8 @@ bool DoLowering(Function &F, GCStrategy &S) {
       default: break;
       case Intrinsic::gcwrite: {
         // Replace a write barrier with a simple store.
-        Value *St = new StoreInst(CI->getArgOperand(0),
-                                  CI->getArgOperand(2), CI);
+        Value *St = new StoreInst(CI->getArgOperand(0), CI->getArgOperand(2),
+                                  CI->getIterator());
         CI->replaceAllUsesWith(St);
         CI->eraseFromParent();
         MadeChange = true;
@@ -225,7 +225,8 @@ bool DoLowering(Function &F, GCStrategy &S) {
       }
       case Intrinsic::gcread: {
         // Replace a read barrier with a simple load.
-        Value *Ld = new LoadInst(CI->getType(), CI->getArgOperand(1), "", CI);
+        Value *Ld = new LoadInst(CI->getType(), CI->getArgOperand(1), "",
+                                 CI->getIterator());
         Ld->takeName(CI);
         CI->replaceAllUsesWith(Ld);
         CI->eraseFromParent();
diff --git a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
index 3a37bf3cd7a8a3..363fad53b76c35 100644
--- a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
@@ -21,6 +21,7 @@
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/TargetLowering.h"
 #include "llvm/IR/DataLayout.h"
+#include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Module.h"
 #include "llvm/Target/TargetMachine.h"
@@ -91,6 +92,7 @@ bool CallLowering::lowerCall(MachineIRBuilder &MIRBuilder, const CallBase &CB,
                              ArrayRef<Register> ResRegs,
                              ArrayRef<ArrayRef<Register>> ArgRegs,
                              Register SwiftErrorVReg,
+                             Register ConvergenceCtrlToken,
                              std::function<unsigned()> GetCalleeReg) const {
   CallLoweringInfo Info;
   const DataLayout &DL = MIRBuilder.getDataLayout();
@@ -121,7 +123,6 @@ bool CallLowering::lowerCall(MachineIRBuilder &MIRBuilder, const CallBase &CB,
     CanBeTailCalled = false;
   }
 
-
   // First step is to marshall all the function's parameters into the correct
   // physregs and memory locations. Gather the sequence of argument types that
   // we'll pass to the assigner function.
@@ -187,6 +188,7 @@ bool CallLowering::lowerCall(MachineIRBuilder &MIRBuilder, const CallBase &CB,
   Info.KnownCallees = CB.getMetadata(LLVMContext::MD_callees);
   Info.CallConv = CallConv;
   Info.SwiftErrorVReg = SwiftErrorVReg;
+  Info.ConvergenceCtrlToken = ConvergenceCtrlToken;
   Info.IsMustTailCall = CB.isMustTailCall();
   Info.IsTailCall = CanBeTailCalled;
   Info.IsVarArg = IsVarArg;
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index bee49dbd0f8380..d3f86af1e2908e 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -770,12 +770,12 @@ bool CombinerHelper::matchCombineLoadWithAndMask(MachineInstr &MI,
   LLT RegTy = MRI.getType(LoadReg);
   Register PtrReg = LoadMI->getPointerReg();
   unsigned RegSize = RegTy.getSizeInBits();
-  uint64_t LoadSizeBits = LoadMI->getMemSizeInBits();
+  LocationSize LoadSizeBits = LoadMI->getMemSizeInBits();
   unsigned MaskSizeBits = MaskVal.countr_one();
 
   // The mask may not be larger than the in-memory type, as it might cover sign
   // extended bits
-  if (MaskSizeBits > LoadSizeBits)
+  if (MaskSizeBits > LoadSizeBits.getValue())
     return false;
 
   // If the mask covers the whole destination register, there's nothing to
@@ -795,7 +795,8 @@ bool CombinerHelper::matchCombineLoadWithAndMask(MachineInstr &MI,
   // still adjust the opcode to indicate the high bit behavior.
   if (LoadMI->isSimple())
     MemDesc.MemoryTy = LLT::scalar(MaskSizeBits);
-  else if (LoadSizeBits > MaskSizeBits || LoadSizeBits == RegSize)
+  else if (LoadSizeBits.getValue() > MaskSizeBits ||
+           LoadSizeBits.getValue() == RegSize)
     return false;
 
   // TODO: Could check if it's legal with the reduced or original memory size.
@@ -860,7 +861,8 @@ bool CombinerHelper::matchSextTruncSextLoad(MachineInstr &MI) {
   if (auto *LoadMI = getOpcodeDef<GSExtLoad>(LoadUser, MRI)) {
     // If truncating more than the original extended value, abort.
     auto LoadSizeBits = LoadMI->getMemSizeInBits();
-    if (TruncSrc && MRI.getType(TruncSrc).getSizeInBits() < LoadSizeBits)
+    if (TruncSrc &&
+        MRI.getType(TruncSrc).getSizeInBits() < LoadSizeBits.getValue())
       return false;
     if (LoadSizeBits == SizeInBits)
       return true;
@@ -891,7 +893,7 @@ bool CombinerHelper::matchSextInRegOfLoad(
   if (!LoadDef || !MRI.hasOneNonDBGUse(DstReg))
     return false;
 
-  uint64_t MemBits = LoadDef->getMemSizeInBits();
+  uint64_t MemBits = LoadDef->getMemSizeInBits().getValue();
 
   // If the sign extend extends from a narrower width than the load's width,
   // then we can narrow the load width when we combine to a G_SEXTLOAD.
diff --git a/llvm/lib/CodeGen/GlobalISel/GISelKnownBits.cpp b/llvm/lib/CodeGen/GlobalISel/GISelKnownBits.cpp
index 099bf45b2734cb..2e2cc9a95bd95c 100644
--- a/llvm/lib/CodeGen/GlobalISel/GISelKnownBits.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/GISelKnownBits.cpp
@@ -415,7 +415,8 @@ void GISelKnownBits::computeKnownBitsImpl(Register R, KnownBits &Known,
     if (DstTy.isVector())
       break;
     // Everything above the retrieved bits is zero
-    Known.Zero.setBitsFrom((*MI.memoperands_begin())->getSizeInBits());
+    Known.Zero.setBitsFrom(
+        (*MI.memoperands_begin())->getSizeInBits().getValue());
     break;
   }
   case TargetOpcode::G_ASHR: {
@@ -666,7 +667,7 @@ unsigned GISelKnownBits::computeNumSignBits(Register R,
 
     // e.g. i16->i32 = '17' bits known.
     const MachineMemOperand *MMO = *MI.memoperands_begin();
-    return TyBits - MMO->getSizeInBits() + 1;
+    return TyBits - MMO->getSizeInBits().getValue() + 1;
   }
   case TargetOpcode::G_ZEXTLOAD: {
     // FIXME: We need an in-memory type representation.
@@ -675,7 +676,7 @@ unsigned GISelKnownBits::computeNumSignBits(Register R,
 
     // e.g. i16->i32 = '16' bits known.
     const MachineMemOperand *MMO = *MI.memoperands_begin();
-    return TyBits - MMO->getSizeInBits();
+    return TyBits - MMO->getSizeInBits().getValue();
   }
   case TargetOpcode::G_TRUNC: {
     Register Src = MI.getOperand(1).getReg();
diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
index 94fdb37e283bb4..c18574071bfb6c 100644
--- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
@@ -21,6 +21,7 @@
 #include "llvm/Analysis/Loads.h"
 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
 #include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Analysis/VectorUtils.h"
 #include "llvm/CodeGen/Analysis.h"
 #include "llvm/CodeGen/GlobalISel/CSEInfo.h"
 #include "llvm/CodeGen/GlobalISel/CSEMIRBuilder.h"
@@ -213,8 +214,9 @@ ArrayRef<Register> IRTranslator::getOrCreateVRegs(const Value &Val) {
   auto *VRegs = VMap.getVRegs(Val);
   auto *Offsets = VMap.getOffsets(Val);
 
-  assert(Val.getType()->isSized() &&
-         "Don't know how to create an empty vreg");
+  if (!Val.getType()->isTokenTy())
+    assert(Val.getType()->isSized() &&
+           "Don't know how to create an empty vreg");
 
   SmallVector<LLT, 4> SplitTys;
   computeValueLLTs(*DL, *Val.getType(), SplitTys,
@@ -1361,9 +1363,8 @@ static bool isSwiftError(const Value *V) {
 
 bool IRTranslator::translateLoad(const User &U, MachineIRBuilder &MIRBuilder) {
   const LoadInst &LI = cast<LoadInst>(U);
-
-  unsigned StoreSize = DL->getTypeStoreSize(LI.getType());
-  if (StoreSize == 0)
+  TypeSize StoreSize = DL->getTypeStoreSize(LI.getType());
+  if (StoreSize.isZero())
     return true;
 
   ArrayRef<Register> Regs = getOrCreateVRegs(LI);
@@ -1770,6 +1771,41 @@ bool IRTranslator::translateMemFunc(const CallInst &CI,
   return true;
 }
 
+bool IRTranslator::translateVectorInterleave2Intrinsic(
+    const CallInst &CI, MachineIRBuilder &MIRBuilder) {
+  assert(CI.getIntrinsicID() == Intrinsic::experimental_vector_interleave2 &&
+         "This function can only be called on the interleave2 intrinsic!");
+  // Canonicalize interleave2 to G_SHUFFLE_VECTOR (similar to SelectionDAG).
+  Register Op0 = getOrCreateVReg(*CI.getOperand(0));
+  Register Op1 = getOrCreateVReg(*CI.getOperand(1));
+  Register Res = getOrCreateVReg(CI);
+
+  LLT OpTy = MRI->getType(Op0);
+  MIRBuilder.buildShuffleVector(Res, Op0, Op1,
+                                createInterleaveMask(OpTy.getNumElements(), 2));
+
+  return true;
+}
+
+bool IRTranslator::translateVectorDeinterleave2Intrinsic(
+    const CallInst &CI, MachineIRBuilder &MIRBuilder) {
+  assert(CI.getIntrinsicID() == Intrinsic::experimental_vector_deinterleave2 &&
+         "This function can only be called on the deinterleave2 intrinsic!");
+  // Canonicalize deinterleave2 to shuffles that extract sub-vectors (similar to
+  // SelectionDAG).
+  Register Op = getOrCreateVReg(*CI.getOperand(0));
+  auto Undef = MIRBuilder.buildUndef(MRI->getType(Op));
+  ArrayRef<Register> Res = getOrCreateVRegs(CI);
+
+  LLT ResTy = MRI->getType(Res[0]);
+  MIRBuilder.buildShuffleVector(Res[0], Op, Undef,
+                                createStrideMask(0, 2, ResTy.getNumElements()));
+  MIRBuilder.buildShuffleVector(Res[1], Op, Undef,
+                                createStrideMask(1, 2, ResTy.getNumElements()));
+
+  return true;
+}
+
 void IRTranslator::getStackGuard(Register DstReg,
                                  MachineIRBuilder &MIRBuilder) {
   const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
@@ -2038,6 +2074,36 @@ bool IRTranslator::translateIfEntryValueArgument(bool isDeclare, Value *Val,
   return true;
 }
 
+static unsigned getConvOpcode(Intrinsic::ID ID) {
+  switch (ID) {
+  default:
+    llvm_unreachable("Unexpected intrinsic");
+  case Intrinsic::experimental_convergence_anchor:
+    return TargetOpcode::CONVERGENCECTRL_ANCHOR;
+  case Intrinsic::experimental_convergence_entry:
+    return TargetOpcode::CONVERGENCECTRL_ENTRY;
+  case Intrinsic::experimental_convergence_loop:
+    return TargetOpcode::CONVERGENCECTRL_LOOP;
+  }
+}
+
+bool IRTranslator::translateConvergenceControlIntrinsic(
+    const CallInst &CI, Intrinsic::ID ID, MachineIRBuilder &MIRBuilder) {
+  MachineInstrBuilder MIB = MIRBuilder.buildInstr(getConvOpcode(ID));
+  Register OutputReg = getOrCreateConvergenceTokenVReg(CI);
+  MIB.addDef(OutputReg);
+
+  if (ID == Intrinsic::experimental_convergence_loop) {
+    auto Bundle = CI.getOperandBundle(LLVMContext::OB_convergencectrl);
+    assert(Bundle && "Expected a convergence control token.");
+    Register InputReg =
+        getOrCreateConvergenceTokenVReg(*Bundle->Inputs[0].get());
+    MIB.addUse(InputReg);
+  }
+
+  return true;
+}
+
 bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID,
                                            MachineIRBuilder &MIRBuilder) {
   if (auto *MI = dyn_cast<AnyMemIntrinsic>(&CI)) {
@@ -2474,12 +2540,30 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID,
 
     return true;
   }
+
+  case Intrinsic::experimental_vector_interleave2:
+  case Intrinsic::experimental_vector_deinterleave2: {
+    // Both intrinsics have at least one operand.
+    Value *Op0 = CI.getOperand(0);
+    LLT ResTy = getLLTForType(*Op0->getType(), MIRBuilder.getDataLayout());
+    if (!ResTy.isFixedVector())
+      return false;
+
+    if (CI.getIntrinsicID() == Intrinsic::experimental_vector_interleave2)
+      return translateVectorInterleave2Intrinsic(CI, MIRBuilder);
+
+    return translateVectorDeinterleave2Intrinsic(CI, MIRBuilder);
+  }
+
 #define INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC)  \
   case Intrinsic::INTRINSIC:
 #include "llvm/IR/ConstrainedOps.def"
     return translateConstrainedFPIntrinsic(cast<ConstrainedFPIntrinsic>(CI),
                                            MIRBuilder);
-
+  case Intrinsic::experimental_convergence_anchor:
+  case Intrinsic::experimental_convergence_entry:
+  case Intrinsic::experimental_convergence_loop:
+    return translateConvergenceControlIntrinsic(CI, ID, MIRBuilder);
   }
   return false;
 }
@@ -2530,12 +2614,18 @@ bool IRTranslator::translateCallBase(const CallBase &CB,
     }
   }
 
+  Register ConvergenceCtrlToken = 0;
+  if (auto Bundle = CB.getOperandBundle(LLVMContext::OB_convergencectrl)) {
+    const auto &Token = *Bundle->Inputs[0].get();
+    ConvergenceCtrlToken = getOrCreateConvergenceTokenVReg(Token);
+  }
+
   // We don't set HasCalls on MFI here yet because call lowering may decide to
   // optimize into tail calls. Instead, we defer that to selection where a final
   // scan is done to check if any instructions are calls.
-  bool Success =
-      CLI->lowerCall(MIRBuilder, CB, Res, Args, SwiftErrorVReg,
-                     [&]() { return getOrCreateVReg(*CB.getCalledOperand()); });
+  bool Success = CLI->lowerCall(
+      MIRBuilder, CB, Res, Args, SwiftErrorVReg, ConvergenceCtrlToken,
+      [&]() { return getOrCreateVReg(*CB.getCalledOperand()); });
 
   // Check if we just inserted a tail call.
   if (Success) {
@@ -2649,6 +2739,14 @@ bool IRTranslator::translateCall(const User &U, MachineIRBuilder &MIRBuilder) {
         MF->getMachineMemOperand(MPI, Info.flags, MemTy, Alignment, CI.getAAMetadata()));
   }
 
+  if (CI.isConvergent()) {
+    if (auto Bundle = CI.getOperandBundle(LLVMContext::OB_convergencectrl)) {
+      auto *Token = Bundle->Inputs[0].get();
+      Register TokenReg = getOrCreateVReg(*Token);
+      MIB.addUse(TokenReg, RegState::Implicit);
+    }
+  }
+
   return true;
 }
 
@@ -3287,16 +3385,16 @@ void IRTranslator::translateDbgInfo(const Instruction &Inst,
       MIRBuilder.buildDbgLabel(DPL->getLabel());
       continue;
     }
-    DPValue &DPV = cast<DPValue>(DR);
-    const DILocalVariable *Variable = DPV.getVariable();
-    const DIExpression *Expression = DPV.getExpression();
-    Value *V = DPV.getVariableLocationOp(0);
-    if (DPV.isDbgDeclare())
-      translateDbgDeclareRecord(V, DPV.hasArgList(), Variable,
-                         Expression, DPV.getDebugLoc(), MIRBuilder);
+    DbgVariableRecord &DVR = cast<DbgVariableRecord>(DR);
+    const DILocalVariable *Variable = DVR.getVariable();
+    const DIExpression *Expression = DVR.getExpression();
+    Value *V = DVR.getVariableLocationOp(0);
+    if (DVR.isDbgDeclare())
+      translateDbgDeclareRecord(V, DVR.hasArgList(), Variable, Expression,
+                                DVR.getDebugLoc(), MIRBuilder);
     else
-      translateDbgValueRecord(V, DPV.hasArgList(), Variable,
-                         Expression, DPV.getDebugLoc(), MIRBuilder);
+      translateDbgValueRecord(V, DVR.hasArgList(), Variable, Expression,
+                              DVR.getDebugLoc(), MIRBuilder);
   }
 }
 
diff --git a/llvm/lib/CodeGen/GlobalISel/InlineAsmLowering.cpp b/llvm/lib/CodeGen/GlobalISel/InlineAsmLowering.cpp
index 4089a5e941b051..14e1e1fdf01dea 100644
--- a/llvm/lib/CodeGen/GlobalISel/InlineAsmLowering.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/InlineAsmLowering.cpp
@@ -538,6 +538,14 @@ bool InlineAsmLowering::lowerInlineAsm(
     }
   }
 
+  if (auto Bundle = Call.getOperandBundle(LLVMContext::OB_convergencectrl)) {
+    auto *Token = Bundle->Inputs[0].get();
+    ArrayRef<Register> SourceRegs = GetOrCreateVRegs(*Token);
+    assert(SourceRegs.size() == 1 &&
+           "Expected the control token to fit into a single virtual register");
+    Inst.addUse(SourceRegs[0], RegState::Implicit);
+  }
+
   if (const MDNode *SrcLoc = Call.getMetadata("srcloc"))
     Inst.addMetadata(SrcLoc);
 
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index bd3ff7265d51f9..abe23af00a7890 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -1317,7 +1317,7 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI,
     if (DstTy.isVector())
       return UnableToLegalize;
 
-    if (8 * LoadMI.getMemSize() != DstTy.getSizeInBits()) {
+    if (8 * LoadMI.getMemSize().getValue() != DstTy.getSizeInBits()) {
       Register TmpReg = MRI.createGenericVirtualRegister(NarrowTy);
       MIRBuilder.buildLoad(TmpReg, LoadMI.getPointerReg(), LoadMI.getMMO());
       MIRBuilder.buildAnyExt(DstReg, TmpReg);
@@ -1335,7 +1335,7 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI,
 
     Register TmpReg = MRI.createGenericVirtualRegister(NarrowTy);
     auto &MMO = LoadMI.getMMO();
-    unsigned MemSize = MMO.getSizeInBits();
+    unsigned MemSize = MMO.getSizeInBits().getValue();
 
     if (MemSize == NarrowSize) {
       MIRBuilder.buildLoad(TmpReg, PtrReg, MMO);
@@ -1368,7 +1368,7 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI,
     if (SrcTy.isVector() && LeftoverBits != 0)
       return UnableToLegalize;
 
-    if (8 * StoreMI.getMemSize() != SrcTy.getSizeInBits()) {
+    if (8 * StoreMI.getMemSize().getValue() != SrcTy.getSizeInBits()) {
       Register TmpReg = MRI.createGenericVirtualRegister(NarrowTy);
       MIRBuilder.buildTrunc(TmpReg, SrcReg);
       MIRBuilder.buildStore(TmpReg, StoreMI.getPointerReg(), StoreMI.getMMO());
@@ -4456,7 +4456,7 @@ LegalizerHelper::reduceLoadStoreWidth(GLoadStore &LdStMI, unsigned TypeIdx,
   LLT ValTy = MRI.getType(ValReg);
 
   // FIXME: Do we need a distinct NarrowMemory legalize action?
-  if (ValTy.getSizeInBits() != 8 * LdStMI.getMemSize()) {
+  if (ValTy.getSizeInBits() != 8 * LdStMI.getMemSize().getValue()) {
     LLVM_DEBUG(dbgs() << "Can't narrow extload/truncstore\n");
     return UnableToLegalize;
   }
@@ -5411,6 +5411,9 @@ LegalizerHelper::moreElementsVector(MachineInstr &MI, unsigned TypeIdx,
     MI.eraseFromParent();
     return Legalized;
   }
+  case TargetOpcode::G_SEXT:
+  case TargetOpcode::G_ZEXT:
+  case TargetOpcode::G_ANYEXT:
   case TargetOpcode::G_TRUNC:
   case TargetOpcode::G_FPTRUNC:
   case TargetOpcode::G_FPEXT:
diff --git a/llvm/lib/CodeGen/GlobalISel/LoadStoreOpt.cpp b/llvm/lib/CodeGen/GlobalISel/LoadStoreOpt.cpp
index b5c9d3e912cc20..9fc8ecd60b03ff 100644
--- a/llvm/lib/CodeGen/GlobalISel/LoadStoreOpt.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LoadStoreOpt.cpp
@@ -117,12 +117,8 @@ bool GISelAddressing::aliasIsKnownForLoadStore(const MachineInstr &MI1,
   if (!BasePtr0.BaseReg.isValid() || !BasePtr1.BaseReg.isValid())
     return false;
 
-  LocationSize Size1 = LdSt1->getMemSize() != MemoryLocation::UnknownSize
-                           ? LdSt1->getMemSize()
-                           : LocationSize::beforeOrAfterPointer();
-  LocationSize Size2 = LdSt2->getMemSize() != MemoryLocation::UnknownSize
-                           ? LdSt2->getMemSize()
-                           : LocationSize::beforeOrAfterPointer();
+  LocationSize Size1 = LdSt1->getMemSize();
+  LocationSize Size2 = LdSt2->getMemSize();
 
   int64_t PtrDiff;
   if (BasePtr0.BaseReg == BasePtr1.BaseReg) {
@@ -214,14 +210,9 @@ bool GISelAddressing::instMayAlias(const MachineInstr &MI,
         Offset = 0;
       }
 
-      TypeSize Size = LS->getMMO().getMemoryType().getSizeInBytes();
-      return {LS->isVolatile(),
-              LS->isAtomic(),
-              BaseReg,
-              Offset /*base offset*/,
-              Size.isScalable() ? LocationSize::beforeOrAfterPointer()
-                                : LocationSize::precise(Size),
-              &LS->getMMO()};
+      LocationSize Size = LS->getMMO().getSize();
+      return {LS->isVolatile(),       LS->isAtomic(), BaseReg,
+              Offset /*base offset*/, Size,           &LS->getMMO()};
     }
     // FIXME: support recognizing lifetime instructions.
     // Default.
diff --git a/llvm/lib/CodeGen/GlobalMerge.cpp b/llvm/lib/CodeGen/GlobalMerge.cpp
index 4941d5b01ae0f3..545ee1741834a1 100644
--- a/llvm/lib/CodeGen/GlobalMerge.cpp
+++ b/llvm/lib/CodeGen/GlobalMerge.cpp
@@ -309,10 +309,9 @@ bool GlobalMergeImpl::doMerge(SmallVectorImpl<GlobalVariable *> &Globals,
   for (size_t GI = 0, GE = Globals.size(); GI != GE; ++GI) {
     GlobalVariable *GV = Globals[GI];
 
-    // Reset the encountered sets for this global...
-    std::fill(EncounteredUGS.begin(), EncounteredUGS.end(), 0);
-    // ...and grow it in case we created new sets for the previous global.
-    EncounteredUGS.resize(UsedGlobalSets.size());
+    // Reset the encountered sets for this global and grow it in case we created
+    // new sets for the previous global.
+    EncounteredUGS.assign(UsedGlobalSets.size(), 0);
 
     // We might need to create a set that only consists of the current global.
     // Keep track of its index into UsedGlobalSets.
diff --git a/llvm/lib/CodeGen/HardwareLoops.cpp b/llvm/lib/CodeGen/HardwareLoops.cpp
index c536ec9f79d625..cc5aad14e1b56f 100644
--- a/llvm/lib/CodeGen/HardwareLoops.cpp
+++ b/llvm/lib/CodeGen/HardwareLoops.cpp
@@ -580,7 +580,7 @@ PHINode* HardwareLoop::InsertPHICounter(Value *NumElts, Value *EltsRem) {
   BasicBlock *Preheader = L->getLoopPreheader();
   BasicBlock *Header = L->getHeader();
   BasicBlock *Latch = ExitBranch->getParent();
-  IRBuilder<> Builder(Header->getFirstNonPHI());
+  IRBuilder<> Builder(Header, Header->getFirstNonPHIIt());
   PHINode *Index = Builder.CreatePHI(NumElts->getType(), 2);
   Index->addIncoming(NumElts, Preheader);
   Index->addIncoming(EltsRem, Latch);
diff --git a/llvm/lib/CodeGen/IndirectBrExpandPass.cpp b/llvm/lib/CodeGen/IndirectBrExpandPass.cpp
index f7b931a3bdac2d..13f595bef8eeb8 100644
--- a/llvm/lib/CodeGen/IndirectBrExpandPass.cpp
+++ b/llvm/lib/CodeGen/IndirectBrExpandPass.cpp
@@ -113,7 +113,7 @@ bool runImpl(Function &F, const TargetLowering *TLI, DomTreeUpdater *DTU) {
       // Handle the degenerate case of no successors by replacing the indirectbr
       // with unreachable as there is no successor available.
       if (IBr->getNumSuccessors() == 0) {
-        (void)new UnreachableInst(F.getContext(), IBr);
+        (void)new UnreachableInst(F.getContext(), IBr->getIterator());
         IBr->eraseFromParent();
         continue;
       }
@@ -183,7 +183,7 @@ bool runImpl(Function &F, const TargetLowering *TLI, DomTreeUpdater *DTU) {
         for (BasicBlock *SuccBB : IBr->successors())
           Updates.push_back({DominatorTree::Delete, IBr->getParent(), SuccBB});
       }
-      (void)new UnreachableInst(F.getContext(), IBr);
+      (void)new UnreachableInst(F.getContext(), IBr->getIterator());
       IBr->eraseFromParent();
     }
     if (DTU) {
@@ -207,9 +207,10 @@ bool runImpl(Function &F, const TargetLowering *TLI, DomTreeUpdater *DTU) {
   }
 
   auto GetSwitchValue = [CommonITy](IndirectBrInst *IBr) {
-    return CastInst::CreatePointerCast(
-        IBr->getAddress(), CommonITy,
-        Twine(IBr->getAddress()->getName()) + ".switch_cast", IBr);
+    return CastInst::CreatePointerCast(IBr->getAddress(), CommonITy,
+                                       Twine(IBr->getAddress()->getName()) +
+                                           ".switch_cast",
+                                       IBr->getIterator());
   };
 
   SmallVector<DominatorTree::UpdateType, 8> Updates;
@@ -243,7 +244,7 @@ bool runImpl(Function &F, const TargetLowering *TLI, DomTreeUpdater *DTU) {
       Updates.reserve(IndirectBrs.size() + 2 * IndirectBrSuccs.size());
     for (auto *IBr : IndirectBrs) {
       SwitchPN->addIncoming(GetSwitchValue(IBr), IBr->getParent());
-      BranchInst::Create(SwitchBB, IBr);
+      BranchInst::Create(SwitchBB, IBr->getIterator());
       if (DTU) {
         Updates.push_back({DominatorTree::Insert, IBr->getParent(), SwitchBB});
         for (BasicBlock *SuccBB : IBr->successors())
diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
index 2a0daf404c9783..438ac1c3cc6e2c 100644
--- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp
+++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
@@ -388,14 +388,15 @@ bool InterleavedAccessImpl::replaceBinOpShuffles(
       return Idx < (int)cast<FixedVectorType>(BIOp0Ty)->getNumElements();
     }));
 
+    BasicBlock::iterator insertPos = SVI->getIterator();
     auto *NewSVI1 =
         new ShuffleVectorInst(BI->getOperand(0), PoisonValue::get(BIOp0Ty),
-                              Mask, SVI->getName(), SVI);
+                              Mask, SVI->getName(), insertPos);
     auto *NewSVI2 = new ShuffleVectorInst(
         BI->getOperand(1), PoisonValue::get(BI->getOperand(1)->getType()), Mask,
-        SVI->getName(), SVI);
+        SVI->getName(), insertPos);
     BinaryOperator *NewBI = BinaryOperator::CreateWithCopiedFlags(
-        BI->getOpcode(), NewSVI1, NewSVI2, BI, BI->getName(), SVI);
+        BI->getOpcode(), NewSVI1, NewSVI2, BI, BI->getName(), insertPos);
     SVI->replaceAllUsesWith(NewBI);
     LLVM_DEBUG(dbgs() << "  Replaced: " << *BI << "\n    And   : " << *SVI
                       << "\n  With    : " << *NewSVI1 << "\n    And   : "
diff --git a/llvm/lib/CodeGen/IntrinsicLowering.cpp b/llvm/lib/CodeGen/IntrinsicLowering.cpp
index fe450cba4a3332..09d282d12c5fb1 100644
--- a/llvm/lib/CodeGen/IntrinsicLowering.cpp
+++ b/llvm/lib/CodeGen/IntrinsicLowering.cpp
@@ -472,7 +472,7 @@ bool IntrinsicLowering::LowerToByteSwap(CallInst *CI) {
   Function *Int = Intrinsic::getDeclaration(M, Intrinsic::bswap, Ty);
 
   Value *Op = CI->getArgOperand(0);
-  Op = CallInst::Create(Int, Op, CI->getName(), CI);
+  Op = CallInst::Create(Int, Op, CI->getName(), CI->getIterator());
 
   CI->replaceAllUsesWith(Op);
   CI->eraseFromParent();
diff --git a/llvm/lib/CodeGen/JMCInstrumenter.cpp b/llvm/lib/CodeGen/JMCInstrumenter.cpp
index 62a38191887524..e2aaebedf5a4fd 100644
--- a/llvm/lib/CodeGen/JMCInstrumenter.cpp
+++ b/llvm/lib/CodeGen/JMCInstrumenter.cpp
@@ -227,7 +227,7 @@ bool runImpl(Module &M) {
     // FIXME: it would be nice to make CI scheduling boundary, although in
     //        practice it does not matter much.
     auto *CI = CallInst::Create(getCheckFunctionType(Ctx), CheckFunction,
-                                {Flag}, "", &*F.begin()->getFirstInsertionPt());
+                                {Flag}, "", F.begin()->getFirstInsertionPt());
     CI->addParamAttr(0, Attribute::NoUndef);
     if (UseX86FastCall) {
       CI->setCallingConv(CallingConv::X86_FastCall);
diff --git a/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp b/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp
index cfc8c28b99e562..481d9e341da377 100644
--- a/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp
+++ b/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp
@@ -1356,10 +1356,11 @@ InstrRefBasedLDV::findLocationForMemOperand(const MachineInstr &MI) {
   // from the stack at some point. Happily the memory operand will tell us
   // the size written to the stack.
   auto *MemOperand = *MI.memoperands_begin();
-  unsigned SizeInBits = MemOperand->getSizeInBits();
+  LocationSize SizeInBits = MemOperand->getSizeInBits();
+  assert(SizeInBits.hasValue() && "Expected to find a valid size!");
 
   // Find that position in the stack indexes we're tracking.
-  auto IdxIt = MTracker->StackSlotIdxes.find({SizeInBits, 0});
+  auto IdxIt = MTracker->StackSlotIdxes.find({SizeInBits.getValue(), 0});
   if (IdxIt == MTracker->StackSlotIdxes.end())
     // That index is not tracked. This is suprising, and unlikely to ever
     // occur, but the safe action is to indicate the variable is optimised out.
diff --git a/llvm/lib/CodeGen/LowLevelTypeUtils.cpp b/llvm/lib/CodeGen/LowLevelTypeUtils.cpp
index bc2ea3f05b6da6..5caf20add2a11e 100644
--- a/llvm/lib/CodeGen/LowLevelTypeUtils.cpp
+++ b/llvm/lib/CodeGen/LowLevelTypeUtils.cpp
@@ -39,6 +39,9 @@ LLT llvm::getLLTForType(Type &Ty, const DataLayout &DL) {
     return LLT::scalar(SizeInBits);
   }
 
+  if (Ty.isTokenTy())
+    return LLT::token();
+
   return LLT();
 }
 
diff --git a/llvm/lib/CodeGen/MIRParser/MIParser.cpp b/llvm/lib/CodeGen/MIRParser/MIParser.cpp
index d41fc97cb80603..691c60d22724f3 100644
--- a/llvm/lib/CodeGen/MIRParser/MIParser.cpp
+++ b/llvm/lib/CodeGen/MIRParser/MIParser.cpp
@@ -1919,10 +1919,13 @@ bool MIParser::parseLowLevelType(StringRef::iterator Loc, LLT &Ty) {
 
   if (Token.range().front() == 's') {
     auto ScalarSize = APSInt(Token.range().drop_front()).getZExtValue();
-    if (!verifyScalarSize(ScalarSize))
-      return error("invalid size for scalar type");
-
-    Ty = LLT::scalar(ScalarSize);
+    if (ScalarSize) {
+      if (!verifyScalarSize(ScalarSize))
+        return error("invalid size for scalar type");
+      Ty = LLT::scalar(ScalarSize);
+    } else {
+      Ty = LLT::token();
+    }
     lex();
     return false;
   } else if (Token.range().front() == 'p') {
@@ -1980,7 +1983,7 @@ bool MIParser::parseLowLevelType(StringRef::iterator Loc, LLT &Ty) {
   if (Token.range().front() == 's') {
     auto ScalarSize = APSInt(Token.range().drop_front()).getZExtValue();
     if (!verifyScalarSize(ScalarSize))
-      return error("invalid size for scalar type");
+      return error("invalid size for scalar element in vector");
     Ty = LLT::scalar(ScalarSize);
   } else if (Token.range().front() == 'p') {
     const DataLayout &DL = MF.getDataLayout();
diff --git a/llvm/lib/CodeGen/MIRVRegNamerUtils.cpp b/llvm/lib/CodeGen/MIRVRegNamerUtils.cpp
index 812d57984e6cae..ccfc4565d3a9bc 100644
--- a/llvm/lib/CodeGen/MIRVRegNamerUtils.cpp
+++ b/llvm/lib/CodeGen/MIRVRegNamerUtils.cpp
@@ -123,7 +123,7 @@ std::string VRegRenamer::getInstructionOpcodeHash(MachineInstr &MI) {
   llvm::transform(MI.uses(), std::back_inserter(MIOperands), GetHashableMO);
 
   for (const auto *Op : MI.memoperands()) {
-    MIOperands.push_back((unsigned)Op->getSize());
+    MIOperands.push_back((unsigned)Op->getSize().getValue());
     MIOperands.push_back((unsigned)Op->getFlags());
     MIOperands.push_back((unsigned)Op->getOffset());
     MIOperands.push_back((unsigned)Op->getSuccessOrdering());
diff --git a/llvm/lib/CodeGen/MachineCombiner.cpp b/llvm/lib/CodeGen/MachineCombiner.cpp
index c65937935ed820..a4c87a7678bd8d 100644
--- a/llvm/lib/CodeGen/MachineCombiner.cpp
+++ b/llvm/lib/CodeGen/MachineCombiner.cpp
@@ -155,9 +155,6 @@ MachineCombiner::getOperandDef(const MachineOperand &MO) {
   // We need a virtual register definition.
   if (MO.isReg() && MO.getReg().isVirtual())
     DefInstr = MRI->getUniqueVRegDef(MO.getReg());
-  // PHI's have no depth etc.
-  if (DefInstr && DefInstr->isPHI())
-    DefInstr = nullptr;
   return DefInstr;
 }
 
diff --git a/llvm/lib/CodeGen/MachineFrameInfo.cpp b/llvm/lib/CodeGen/MachineFrameInfo.cpp
index 280d3a6a41edc9..853de4c88caeb7 100644
--- a/llvm/lib/CodeGen/MachineFrameInfo.cpp
+++ b/llvm/lib/CodeGen/MachineFrameInfo.cpp
@@ -184,7 +184,8 @@ uint64_t MachineFrameInfo::estimateStackSize(const MachineFunction &MF) const {
   return alignTo(Offset, StackAlign);
 }
 
-void MachineFrameInfo::computeMaxCallFrameSize(const MachineFunction &MF) {
+void MachineFrameInfo::computeMaxCallFrameSize(
+    MachineFunction &MF, std::vector<MachineBasicBlock::iterator> *FrameSDOps) {
   const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
   unsigned FrameSetupOpcode = TII.getCallFrameSetupOpcode();
   unsigned FrameDestroyOpcode = TII.getCallFrameDestroyOpcode();
@@ -192,18 +193,14 @@ void MachineFrameInfo::computeMaxCallFrameSize(const MachineFunction &MF) {
          "Can only compute MaxCallFrameSize if Setup/Destroy opcode are known");
 
   MaxCallFrameSize = 0;
-  for (const MachineBasicBlock &MBB : MF) {
-    for (const MachineInstr &MI : MBB) {
+  for (MachineBasicBlock &MBB : MF) {
+    for (MachineInstr &MI : MBB) {
       unsigned Opcode = MI.getOpcode();
       if (Opcode == FrameSetupOpcode || Opcode == FrameDestroyOpcode) {
         unsigned Size = TII.getFrameSize(MI);
         MaxCallFrameSize = std::max(MaxCallFrameSize, Size);
-        AdjustsStack = true;
-      } else if (MI.isInlineAsm()) {
-        // Some inline asm's need a stack frame, as indicated by operand 1.
-        unsigned ExtraInfo = MI.getOperand(InlineAsm::MIOp_ExtraInfo).getImm();
-        if (ExtraInfo & InlineAsm::Extra_IsAlignStack)
-          AdjustsStack = true;
+        if (FrameSDOps != nullptr)
+          FrameSDOps->push_back(&MI);
       }
     }
   }
diff --git a/llvm/lib/CodeGen/MachineFunction.cpp b/llvm/lib/CodeGen/MachineFunction.cpp
index 323f1a6a1b2bd7..ad532149926670 100644
--- a/llvm/lib/CodeGen/MachineFunction.cpp
+++ b/llvm/lib/CodeGen/MachineFunction.cpp
@@ -484,13 +484,17 @@ void MachineFunction::deleteMachineBasicBlock(MachineBasicBlock *MBB) {
 }
 
 MachineMemOperand *MachineFunction::getMachineMemOperand(
-    MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, uint64_t s,
-    Align base_alignment, const AAMDNodes &AAInfo, const MDNode *Ranges,
+    MachinePointerInfo PtrInfo, MachineMemOperand::Flags F, LocationSize Size,
+    Align BaseAlignment, const AAMDNodes &AAInfo, const MDNode *Ranges,
     SyncScope::ID SSID, AtomicOrdering Ordering,
     AtomicOrdering FailureOrdering) {
+  assert((!Size.hasValue() ||
+          Size.getValue().getKnownMinValue() != ~UINT64_C(0)) &&
+         "Unexpected an unknown size to be represented using "
+         "LocationSize::beforeOrAfter()");
   return new (Allocator)
-      MachineMemOperand(PtrInfo, f, s, base_alignment, AAInfo, Ranges,
-                        SSID, Ordering, FailureOrdering);
+      MachineMemOperand(PtrInfo, F, Size, BaseAlignment, AAInfo, Ranges, SSID,
+                        Ordering, FailureOrdering);
 }
 
 MachineMemOperand *MachineFunction::getMachineMemOperand(
@@ -503,8 +507,14 @@ MachineMemOperand *MachineFunction::getMachineMemOperand(
                         Ordering, FailureOrdering);
 }
 
-MachineMemOperand *MachineFunction::getMachineMemOperand(
-    const MachineMemOperand *MMO, const MachinePointerInfo &PtrInfo, uint64_t Size) {
+MachineMemOperand *
+MachineFunction::getMachineMemOperand(const MachineMemOperand *MMO,
+                                      const MachinePointerInfo &PtrInfo,
+                                      LocationSize Size) {
+  assert((!Size.hasValue() ||
+          Size.getValue().getKnownMinValue() != ~UINT64_C(0)) &&
+         "Unexpected an unknown size to be represented using "
+         "LocationSize::beforeOrAfter()");
   return new (Allocator)
       MachineMemOperand(PtrInfo, MMO->getFlags(), Size, MMO->getBaseAlign(),
                         AAMDNodes(), nullptr, MMO->getSyncScopeID(),
diff --git a/llvm/lib/CodeGen/MachineInstr.cpp b/llvm/lib/CodeGen/MachineInstr.cpp
index 6654e1d6ceceea..fe2f9ccd33a330 100644
--- a/llvm/lib/CodeGen/MachineInstr.cpp
+++ b/llvm/lib/CodeGen/MachineInstr.cpp
@@ -1302,10 +1302,10 @@ static bool MemOperandsHaveAlias(const MachineFrameInfo &MFI, AAResults *AA,
   int64_t OffsetB = MMOb->getOffset();
   int64_t MinOffset = std::min(OffsetA, OffsetB);
 
-  uint64_t WidthA = MMOa->getSize();
-  uint64_t WidthB = MMOb->getSize();
-  bool KnownWidthA = WidthA != MemoryLocation::UnknownSize;
-  bool KnownWidthB = WidthB != MemoryLocation::UnknownSize;
+  LocationSize WidthA = MMOa->getSize();
+  LocationSize WidthB = MMOb->getSize();
+  bool KnownWidthA = WidthA.hasValue();
+  bool KnownWidthB = WidthB.hasValue();
 
   const Value *ValA = MMOa->getValue();
   const Value *ValB = MMOb->getValue();
@@ -1325,8 +1325,8 @@ static bool MemOperandsHaveAlias(const MachineFrameInfo &MFI, AAResults *AA,
     if (!KnownWidthA || !KnownWidthB)
       return true;
     int64_t MaxOffset = std::max(OffsetA, OffsetB);
-    int64_t LowWidth = (MinOffset == OffsetA) ? WidthA : WidthB;
-    return (MinOffset + LowWidth > MaxOffset);
+    LocationSize LowWidth = (MinOffset == OffsetA) ? WidthA : WidthB;
+    return (MinOffset + (int)LowWidth.getValue() > MaxOffset);
   }
 
   if (!AA)
@@ -1338,10 +1338,10 @@ static bool MemOperandsHaveAlias(const MachineFrameInfo &MFI, AAResults *AA,
   assert((OffsetA >= 0) && "Negative MachineMemOperand offset");
   assert((OffsetB >= 0) && "Negative MachineMemOperand offset");
 
-  int64_t OverlapA =
-      KnownWidthA ? WidthA + OffsetA - MinOffset : MemoryLocation::UnknownSize;
-  int64_t OverlapB =
-      KnownWidthB ? WidthB + OffsetB - MinOffset : MemoryLocation::UnknownSize;
+  int64_t OverlapA = KnownWidthA ? WidthA.getValue() + OffsetA - MinOffset
+                                 : MemoryLocation::UnknownSize;
+  int64_t OverlapB = KnownWidthB ? WidthB.getValue() + OffsetB - MinOffset
+                                 : MemoryLocation::UnknownSize;
 
   return !AA->isNoAlias(
       MemoryLocation(ValA, OverlapA, UseTBAA ? MMOa->getAAInfo() : AAMDNodes()),
@@ -2357,15 +2357,16 @@ using MMOList = SmallVector<const MachineMemOperand *, 2>;
 static LocationSize getSpillSlotSize(const MMOList &Accesses,
                                      const MachineFrameInfo &MFI) {
   uint64_t Size = 0;
-  for (const auto *A : Accesses)
+  for (const auto *A : Accesses) {
     if (MFI.isSpillSlotObjectIndex(
             cast<FixedStackPseudoSourceValue>(A->getPseudoValue())
                 ->getFrameIndex())) {
-      uint64_t S = A->getSize();
-      if (S == ~UINT64_C(0))
+      LocationSize S = A->getSize();
+      if (!S.hasValue())
         return LocationSize::beforeOrAfterPointer();
-      Size += S;
+      Size += S.getValue();
     }
+  }
   return Size;
 }
 
@@ -2374,10 +2375,8 @@ MachineInstr::getSpillSize(const TargetInstrInfo *TII) const {
   int FI;
   if (TII->isStoreToStackSlotPostFE(*this, FI)) {
     const MachineFrameInfo &MFI = getMF()->getFrameInfo();
-    if (MFI.isSpillSlotObjectIndex(FI)) {
-      uint64_t Size = (*memoperands_begin())->getSize();
-      return Size == ~UINT64_C(0) ? LocationSize::beforeOrAfterPointer() : Size;
-    }
+    if (MFI.isSpillSlotObjectIndex(FI))
+      return (*memoperands_begin())->getSize();
   }
   return std::nullopt;
 }
@@ -2395,10 +2394,8 @@ MachineInstr::getRestoreSize(const TargetInstrInfo *TII) const {
   int FI;
   if (TII->isLoadFromStackSlotPostFE(*this, FI)) {
     const MachineFrameInfo &MFI = getMF()->getFrameInfo();
-    if (MFI.isSpillSlotObjectIndex(FI)) {
-      uint64_t Size = (*memoperands_begin())->getSize();
-      return Size == ~UINT64_C(0) ? LocationSize::beforeOrAfterPointer() : Size;
-    }
+    if (MFI.isSpillSlotObjectIndex(FI))
+      return (*memoperands_begin())->getSize();
   }
   return std::nullopt;
 }
diff --git a/llvm/lib/CodeGen/MachineLoopInfo.cpp b/llvm/lib/CodeGen/MachineLoopInfo.cpp
index 1492c8c366fb41..1019c53e57c6fb 100644
--- a/llvm/lib/CodeGen/MachineLoopInfo.cpp
+++ b/llvm/lib/CodeGen/MachineLoopInfo.cpp
@@ -198,6 +198,23 @@ MDNode *MachineLoop::getLoopID() const {
   return LoopID;
 }
 
+bool MachineLoop::isLoopInvariantImplicitPhysReg(Register Reg) const {
+  MachineFunction *MF = getHeader()->getParent();
+  MachineRegisterInfo *MRI = &MF->getRegInfo();
+
+  if (MRI->isConstantPhysReg(Reg))
+    return true;
+
+  if (!MF->getSubtarget()
+           .getRegisterInfo()
+           ->shouldAnalyzePhysregInMachineLoopInfo(Reg))
+    return false;
+
+  return !llvm::any_of(
+      MRI->def_instructions(Reg),
+      [this](const MachineInstr &MI) { return this->contains(&MI); });
+}
+
 bool MachineLoop::isLoopInvariant(MachineInstr &I,
                                   const Register ExcludeReg) const {
   MachineFunction *MF = I.getParent()->getParent();
@@ -226,7 +243,7 @@ bool MachineLoop::isLoopInvariant(MachineInstr &I,
         // it could get allocated to something with a def during allocation.
         // However, if the physreg is known to always be caller saved/restored
         // then this use is safe to hoist.
-        if (!MRI->isConstantPhysReg(Reg) &&
+        if (!isLoopInvariantImplicitPhysReg(Reg) &&
             !(TRI->isCallerPreservedPhysReg(Reg.asMCReg(), *I.getMF())) &&
             !TII->isIgnorableUse(MO))
           return false;
diff --git a/llvm/lib/CodeGen/MachineOperand.cpp b/llvm/lib/CodeGen/MachineOperand.cpp
index c7c0a1c20d57f4..937ca539513afd 100644
--- a/llvm/lib/CodeGen/MachineOperand.cpp
+++ b/llvm/lib/CodeGen/MachineOperand.cpp
@@ -1101,24 +1101,26 @@ MachineMemOperand::MachineMemOperand(MachinePointerInfo ptrinfo, Flags f,
   assert(getFailureOrdering() == FailureOrdering && "Value truncated");
 }
 
-MachineMemOperand::MachineMemOperand(MachinePointerInfo ptrinfo, Flags f,
-                                     uint64_t s, Align a,
+MachineMemOperand::MachineMemOperand(MachinePointerInfo ptrinfo, Flags F,
+                                     LocationSize TS, Align BaseAlignment,
                                      const AAMDNodes &AAInfo,
                                      const MDNode *Ranges, SyncScope::ID SSID,
                                      AtomicOrdering Ordering,
                                      AtomicOrdering FailureOrdering)
-    : MachineMemOperand(ptrinfo, f,
-                        s == ~UINT64_C(0) ? LLT() : LLT::scalar(8 * s), a,
-                        AAInfo, Ranges, SSID, Ordering, FailureOrdering) {}
+    : MachineMemOperand(ptrinfo, F,
+                        !TS.hasValue() || TS.isScalable()
+                            ? LLT()
+                            : LLT::scalar(8 * TS.getValue().getKnownMinValue()),
+                        BaseAlignment, AAInfo, Ranges, SSID, Ordering,
+                        FailureOrdering) {}
 
 void MachineMemOperand::refineAlignment(const MachineMemOperand *MMO) {
   // The Value and Offset may differ due to CSE. But the flags and size
   // should be the same.
   assert(MMO->getFlags() == getFlags() && "Flags mismatch!");
-  assert((MMO->getSize() == ~UINT64_C(0) || getSize() == ~UINT64_C(0) ||
+  assert((!MMO->getSize().hasValue() || !getSize().hasValue() ||
           MMO->getSize() == getSize()) &&
          "Size mismatch!");
-
   if (MMO->getBaseAlign() >= getBaseAlign()) {
     // Update the alignment value.
     BaseAlign = MMO->getBaseAlign();
@@ -1240,7 +1242,8 @@ void MachineMemOperand::print(raw_ostream &OS, ModuleSlotTracker &MST,
        << "unknown-address";
   }
   MachineOperand::printOperandOffset(OS, getOffset());
-  if (getSize() > 0 && getAlign() != getSize())
+  if (!getSize().hasValue() ||
+      getAlign() != getSize().getValue().getKnownMinValue())
     OS << ", align " << getAlign().value();
   if (getAlign() != getBaseAlign())
     OS << ", basealign " << getBaseAlign().value();
diff --git a/llvm/lib/CodeGen/MachinePipeliner.cpp b/llvm/lib/CodeGen/MachinePipeliner.cpp
index d8cb6816883394..eb42a78603d407 100644
--- a/llvm/lib/CodeGen/MachinePipeliner.cpp
+++ b/llvm/lib/CodeGen/MachinePipeliner.cpp
@@ -2732,19 +2732,20 @@ bool SwingSchedulerDAG::isLoopCarriedDep(SUnit *Source, const SDep &Dep,
   if (!LoopDefS || !TII->getIncrementValue(*LoopDefS, D))
     return true;
 
-  uint64_t AccessSizeS = (*SI->memoperands_begin())->getSize();
-  uint64_t AccessSizeD = (*DI->memoperands_begin())->getSize();
+  LocationSize AccessSizeS = (*SI->memoperands_begin())->getSize();
+  LocationSize AccessSizeD = (*DI->memoperands_begin())->getSize();
 
   // This is the main test, which checks the offset values and the loop
   // increment value to determine if the accesses may be loop carried.
-  if (AccessSizeS == MemoryLocation::UnknownSize ||
-      AccessSizeD == MemoryLocation::UnknownSize)
+  if (!AccessSizeS.hasValue() || !AccessSizeD.hasValue())
     return true;
 
-  if (DeltaS != DeltaD || DeltaS < AccessSizeS || DeltaD < AccessSizeD)
+  if (DeltaS != DeltaD || DeltaS < AccessSizeS.getValue() ||
+      DeltaD < AccessSizeD.getValue())
     return true;
 
-  return (OffsetS + (int64_t)AccessSizeS < OffsetD + (int64_t)AccessSizeD);
+  return (OffsetS + (int64_t)AccessSizeS.getValue() <
+          OffsetD + (int64_t)AccessSizeD.getValue());
 }
 
 void SwingSchedulerDAG::postProcessDAG() {
diff --git a/llvm/lib/CodeGen/MachineStableHash.cpp b/llvm/lib/CodeGen/MachineStableHash.cpp
index 1cd90474898e77..5abfbd5981fba8 100644
--- a/llvm/lib/CodeGen/MachineStableHash.cpp
+++ b/llvm/lib/CodeGen/MachineStableHash.cpp
@@ -200,7 +200,7 @@ stable_hash llvm::stableHashValue(const MachineInstr &MI, bool HashVRegs,
   for (const auto *Op : MI.memoperands()) {
     if (!HashMemOperands)
       break;
-    HashComponents.push_back(static_cast<unsigned>(Op->getSize()));
+    HashComponents.push_back(static_cast<unsigned>(Op->getSize().getValue()));
     HashComponents.push_back(static_cast<unsigned>(Op->getFlags()));
     HashComponents.push_back(static_cast<unsigned>(Op->getOffset()));
     HashComponents.push_back(static_cast<unsigned>(Op->getSuccessOrdering()));
diff --git a/llvm/lib/CodeGen/MachineVerifier.cpp b/llvm/lib/CodeGen/MachineVerifier.cpp
index c2d6dd35e1cb21..c69d36fc7fdd60 100644
--- a/llvm/lib/CodeGen/MachineVerifier.cpp
+++ b/llvm/lib/CodeGen/MachineVerifier.cpp
@@ -1195,13 +1195,16 @@ void MachineVerifier::verifyPreISelGenericInstruction(const MachineInstr *MI) {
       const MachineMemOperand &MMO = **MI->memoperands_begin();
       if (MI->getOpcode() == TargetOpcode::G_ZEXTLOAD ||
           MI->getOpcode() == TargetOpcode::G_SEXTLOAD) {
-        if (MMO.getSizeInBits() >= ValTy.getSizeInBits())
+        if (TypeSize::isKnownGE(MMO.getSizeInBits().getValue(),
+                                ValTy.getSizeInBits()))
           report("Generic extload must have a narrower memory type", MI);
       } else if (MI->getOpcode() == TargetOpcode::G_LOAD) {
-        if (MMO.getSize() > ValTy.getSizeInBytes())
+        if (TypeSize::isKnownGT(MMO.getSize().getValue(),
+                                ValTy.getSizeInBytes()))
           report("load memory size cannot exceed result size", MI);
       } else if (MI->getOpcode() == TargetOpcode::G_STORE) {
-        if (ValTy.getSizeInBytes() < MMO.getSize())
+        if (TypeSize::isKnownLT(ValTy.getSizeInBytes(),
+                                MMO.getSize().getValue()))
           report("store memory size cannot exceed value size", MI);
       }
 
diff --git a/llvm/lib/CodeGen/ModuloSchedule.cpp b/llvm/lib/CodeGen/ModuloSchedule.cpp
index 0bef513342ff12..bdae94c4e6f885 100644
--- a/llvm/lib/CodeGen/ModuloSchedule.cpp
+++ b/llvm/lib/CodeGen/ModuloSchedule.cpp
@@ -979,8 +979,8 @@ void ModuloScheduleExpander::updateMemOperands(MachineInstr &NewMI,
       NewMMOs.push_back(
           MF.getMachineMemOperand(MMO, AdjOffset, MMO->getSize()));
     } else {
-      NewMMOs.push_back(
-          MF.getMachineMemOperand(MMO, 0, MemoryLocation::UnknownSize));
+      NewMMOs.push_back(MF.getMachineMemOperand(
+          MMO, 0, LocationSize::beforeOrAfterPointer()));
     }
   }
   NewMI.setMemRefs(MF, NewMMOs);
diff --git a/llvm/lib/CodeGen/NonRelocatableStringpool.cpp b/llvm/lib/CodeGen/NonRelocatableStringpool.cpp
index e8391afb8e3f8d..26857c6a40889d 100644
--- a/llvm/lib/CodeGen/NonRelocatableStringpool.cpp
+++ b/llvm/lib/CodeGen/NonRelocatableStringpool.cpp
@@ -12,8 +12,6 @@
 namespace llvm {
 
 DwarfStringPoolEntryRef NonRelocatableStringpool::getEntry(StringRef S) {
-  if (Translator)
-    S = Translator(S);
   auto I = Strings.insert({S, DwarfStringPoolEntry()});
   auto &Entry = I.first->second;
   if (I.second || !Entry.isIndexed()) {
@@ -28,9 +26,6 @@ DwarfStringPoolEntryRef NonRelocatableStringpool::getEntry(StringRef S) {
 StringRef NonRelocatableStringpool::internString(StringRef S) {
   DwarfStringPoolEntry Entry{nullptr, 0, DwarfStringPoolEntry::NotIndexed};
 
-  if (Translator)
-    S = Translator(S);
-
   auto InsertResult = Strings.insert({S, Entry});
   return InsertResult.first->getKey();
 }
diff --git a/llvm/lib/CodeGen/PrologEpilogInserter.cpp b/llvm/lib/CodeGen/PrologEpilogInserter.cpp
index 8af17e63e25c75..eaf96ec5cbde8c 100644
--- a/llvm/lib/CodeGen/PrologEpilogInserter.cpp
+++ b/llvm/lib/CodeGen/PrologEpilogInserter.cpp
@@ -228,9 +228,8 @@ bool PEI::runOnMachineFunction(MachineFunction &MF) {
   FrameIndexVirtualScavenging = TRI->requiresFrameIndexScavenging(MF);
   ORE = &getAnalysis<MachineOptimizationRemarkEmitterPass>().getORE();
 
-  // Calculate the MaxCallFrameSize and AdjustsStack variables for the
-  // function's frame information. Also eliminates call frame pseudo
-  // instructions.
+  // Calculate the MaxCallFrameSize value for the function's frame
+  // information. Also eliminates call frame pseudo instructions.
   calculateCallFrameInfo(MF);
 
   // Determine placement of CSR spill/restore code and prolog/epilog code:
@@ -350,17 +349,13 @@ bool PEI::runOnMachineFunction(MachineFunction &MF) {
   return true;
 }
 
-/// Calculate the MaxCallFrameSize and AdjustsStack
-/// variables for the function's frame information and eliminate call frame
-/// pseudo instructions.
+/// Calculate the MaxCallFrameSize variable for the function's frame
+/// information and eliminate call frame pseudo instructions.
 void PEI::calculateCallFrameInfo(MachineFunction &MF) {
   const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
   const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
   MachineFrameInfo &MFI = MF.getFrameInfo();
 
-  unsigned MaxCallFrameSize = 0;
-  bool AdjustsStack = MFI.adjustsStack();
-
   // Get the function call frame set-up and tear-down instruction opcode
   unsigned FrameSetupOpcode = TII.getCallFrameSetupOpcode();
   unsigned FrameDestroyOpcode = TII.getCallFrameDestroyOpcode();
@@ -370,26 +365,15 @@ void PEI::calculateCallFrameInfo(MachineFunction &MF) {
   if (FrameSetupOpcode == ~0u && FrameDestroyOpcode == ~0u)
     return;
 
+  // (Re-)Compute the MaxCallFrameSize.
+  [[maybe_unused]] uint32_t MaxCFSIn =
+      MFI.isMaxCallFrameSizeComputed() ? MFI.getMaxCallFrameSize() : UINT32_MAX;
   std::vector<MachineBasicBlock::iterator> FrameSDOps;
-  for (MachineBasicBlock &BB : MF)
-    for (MachineBasicBlock::iterator I = BB.begin(); I != BB.end(); ++I)
-      if (TII.isFrameInstr(*I)) {
-        unsigned Size = TII.getFrameSize(*I);
-        if (Size > MaxCallFrameSize) MaxCallFrameSize = Size;
-        AdjustsStack = true;
-        FrameSDOps.push_back(I);
-      } else if (I->isInlineAsm()) {
-        // Some inline asm's need a stack frame, as indicated by operand 1.
-        unsigned ExtraInfo = I->getOperand(InlineAsm::MIOp_ExtraInfo).getImm();
-        if (ExtraInfo & InlineAsm::Extra_IsAlignStack)
-          AdjustsStack = true;
-      }
-
-  assert(!MFI.isMaxCallFrameSizeComputed() ||
-         (MFI.getMaxCallFrameSize() >= MaxCallFrameSize &&
-          !(AdjustsStack && !MFI.adjustsStack())));
-  MFI.setAdjustsStack(AdjustsStack);
-  MFI.setMaxCallFrameSize(MaxCallFrameSize);
+  MFI.computeMaxCallFrameSize(MF, &FrameSDOps);
+  assert(MFI.getMaxCallFrameSize() <= MaxCFSIn &&
+         "Recomputing MaxCFS gave a larger value.");
+  assert((FrameSDOps.empty() || MF.getFrameInfo().adjustsStack()) &&
+         "AdjustsStack not set in presence of a frame pseudo instruction.");
 
   if (TFI->canSimplifyCallFramePseudos(MF)) {
     // If call frames are not being included as part of the stack frame, and
diff --git a/llvm/lib/CodeGen/SelectOptimize.cpp b/llvm/lib/CodeGen/SelectOptimize.cpp
index f65d5320bab90d..2e03ae6aec94d3 100644
--- a/llvm/lib/CodeGen/SelectOptimize.cpp
+++ b/llvm/lib/CodeGen/SelectOptimize.cpp
@@ -624,8 +624,8 @@ void SelectOptimizeImpl::convertProfitableSIGroups(SelectGroups &ProfSIGroups) {
     // With RemoveDIs turned off, SplitPt can be a dbg.* intrinsic. With
     // RemoveDIs turned on, SplitPt would instead point to the next
     // instruction. To match existing dbg.* intrinsic behaviour with RemoveDIs,
-    // tell splitBasicBlock that we want to include any DPValues attached to
-    // SplitPt in the splice.
+    // tell splitBasicBlock that we want to include any DbgVariableRecords
+    // attached to SplitPt in the splice.
     SplitPt.setHeadBit(true);
     BasicBlock *EndBlock = StartBlock->splitBasicBlock(SplitPt, "select.end");
     BFI->setBlockFreq(EndBlock, BFI->getBlockFreq(StartBlock));
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index deda5a3215f1fc..f199625bf67ad3 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -597,8 +597,8 @@ namespace {
     SDValue foldSextSetcc(SDNode *N);
     SDValue foldLogicOfSetCCs(bool IsAnd, SDValue N0, SDValue N1,
                               const SDLoc &DL);
-    SDValue foldSubToUSubSat(EVT DstVT, SDNode *N);
-    SDValue foldABSToABD(SDNode *N);
+    SDValue foldSubToUSubSat(EVT DstVT, SDNode *N, const SDLoc &DL);
+    SDValue foldABSToABD(SDNode *N, const SDLoc &DL);
     SDValue unfoldMaskedMerge(SDNode *N);
     SDValue unfoldExtremeBitClearingToShifts(SDNode *N);
     SDValue SimplifySetCC(EVT VT, SDValue N0, SDValue N1, ISD::CondCode Cond,
@@ -2529,6 +2529,23 @@ static SDValue foldAddSubBoolOfMaskedVal(SDNode *N, SelectionDAG &DAG) {
   return DAG.getNode(IsAdd ? ISD::SUB : ISD::ADD, DL, VT, C1, LowBit);
 }
 
+// Attempt to form avgceilu(A, B) from (A | B) - ((A ^ B) >> 1)
+static SDValue combineFixedwidthToAVGCEILU(SDNode *N, SelectionDAG &DAG) {
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  SDValue N0 = N->getOperand(0);
+  EVT VT = N0.getValueType();
+  SDLoc DL(N);
+  if (TLI.isOperationLegal(ISD::AVGCEILU, VT)) {
+    SDValue A, B;
+    if (sd_match(N, m_Sub(m_Or(m_Value(A), m_Value(B)),
+                          m_Srl(m_Xor(m_Deferred(A), m_Deferred(B)),
+                                m_SpecificInt(1))))) {
+      return DAG.getNode(ISD::AVGCEILU, DL, VT, A, B);
+    }
+  }
+  return SDValue();
+}
+
 /// Try to fold a 'not' shifted sign-bit with add/sub with constant operand into
 /// a shift and add with a different constant.
 static SDValue foldAddSubOfSignBit(SDNode *N, SelectionDAG &DAG) {
@@ -2703,11 +2720,11 @@ SDValue DAGCombiner::visitADDLike(SDNode *N) {
   SDValue A, B, C;
 
   // fold ((0-A) + B) -> B-A
-  if (sd_match(N0, m_Sub(m_Zero(), m_Value(A))))
+  if (sd_match(N0, m_Neg(m_Value(A))))
     return DAG.getNode(ISD::SUB, DL, VT, N1, A);
 
   // fold (A + (0-B)) -> A-B
-  if (sd_match(N1, m_Sub(m_Zero(), m_Value(B))))
+  if (sd_match(N1, m_Neg(m_Value(B))))
     return DAG.getNode(ISD::SUB, DL, VT, N0, B);
 
   // fold (A+(B-A)) -> B
@@ -2820,6 +2837,23 @@ SDValue DAGCombiner::visitADDLike(SDNode *N) {
   return SDValue();
 }
 
+// Attempt to form avgflooru(A, B) from (A & B) + ((A ^ B) >> 1)
+static SDValue combineFixedwidthToAVGFLOORU(SDNode *N, SelectionDAG &DAG) {
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  SDValue N0 = N->getOperand(0);
+  EVT VT = N0.getValueType();
+  SDLoc DL(N);
+  if (TLI.isOperationLegal(ISD::AVGFLOORU, VT)) {
+    SDValue A, B;
+    if (sd_match(N, m_Add(m_And(m_Value(A), m_Value(B)),
+                          m_Srl(m_Xor(m_Deferred(A), m_Deferred(B)),
+                                m_SpecificInt(1))))) {
+      return DAG.getNode(ISD::AVGFLOORU, DL, VT, A, B);
+    }
+  }
+  return SDValue();
+}
+
 SDValue DAGCombiner::visitADD(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
@@ -2835,6 +2869,10 @@ SDValue DAGCombiner::visitADD(SDNode *N) {
   if (SDValue V = foldAddSubOfSignBit(N, DAG))
     return V;
 
+  // Try to match AVGFLOORU fixedwidth pattern
+  if (SDValue V = combineFixedwidthToAVGFLOORU(N, DAG))
+    return V;
+
   // fold (a+b) -> (a|b) iff a and b share no bits.
   if ((!LegalOperations || TLI.isOperationLegal(ISD::OR, VT)) &&
       DAG.haveNoCommonBitsSet(N0, N1))
@@ -3596,7 +3634,7 @@ static SDValue getTruncatedUSUBSAT(EVT DstVT, EVT SrcVT, SDValue LHS,
 
 // Try to find umax(a,b) - b or a - umin(a,b) patterns that may be converted to
 // usubsat(a,b), optionally as a truncated type.
-SDValue DAGCombiner::foldSubToUSubSat(EVT DstVT, SDNode *N) {
+SDValue DAGCombiner::foldSubToUSubSat(EVT DstVT, SDNode *N, const SDLoc &DL) {
   if (N->getOpcode() != ISD::SUB ||
       !(!LegalOperations || hasOperation(ISD::USUBSAT, DstVT)))
     return SDValue();
@@ -3611,18 +3649,18 @@ SDValue DAGCombiner::foldSubToUSubSat(EVT DstVT, SDNode *N) {
     SDValue MaxLHS = Op0.getOperand(0);
     SDValue MaxRHS = Op0.getOperand(1);
     if (MaxLHS == Op1)
-      return getTruncatedUSUBSAT(DstVT, SubVT, MaxRHS, Op1, DAG, SDLoc(N));
+      return getTruncatedUSUBSAT(DstVT, SubVT, MaxRHS, Op1, DAG, DL);
     if (MaxRHS == Op1)
-      return getTruncatedUSUBSAT(DstVT, SubVT, MaxLHS, Op1, DAG, SDLoc(N));
+      return getTruncatedUSUBSAT(DstVT, SubVT, MaxLHS, Op1, DAG, DL);
   }
 
   if (Op1.getOpcode() == ISD::UMIN && Op1.hasOneUse()) {
     SDValue MinLHS = Op1.getOperand(0);
     SDValue MinRHS = Op1.getOperand(1);
     if (MinLHS == Op0)
-      return getTruncatedUSUBSAT(DstVT, SubVT, Op0, MinRHS, DAG, SDLoc(N));
+      return getTruncatedUSUBSAT(DstVT, SubVT, Op0, MinRHS, DAG, DL);
     if (MinRHS == Op0)
-      return getTruncatedUSUBSAT(DstVT, SubVT, Op0, MinLHS, DAG, SDLoc(N));
+      return getTruncatedUSUBSAT(DstVT, SubVT, Op0, MinLHS, DAG, DL);
   }
 
   // sub(a,trunc(umin(zext(a),b))) -> usubsat(a,trunc(umin(b,SatLimit)))
@@ -3633,10 +3671,10 @@ SDValue DAGCombiner::foldSubToUSubSat(EVT DstVT, SDNode *N) {
     SDValue MinRHS = Op1.getOperand(0).getOperand(1);
     if (MinLHS.getOpcode() == ISD::ZERO_EXTEND && MinLHS.getOperand(0) == Op0)
       return getTruncatedUSUBSAT(DstVT, MinLHS.getValueType(), MinLHS, MinRHS,
-                                 DAG, SDLoc(N));
+                                 DAG, DL);
     if (MinRHS.getOpcode() == ISD::ZERO_EXTEND && MinRHS.getOperand(0) == Op0)
       return getTruncatedUSUBSAT(DstVT, MinLHS.getValueType(), MinRHS, MinLHS,
-                                 DAG, SDLoc(N));
+                                 DAG, DL);
   }
 
   return SDValue();
@@ -3812,7 +3850,7 @@ SDValue DAGCombiner::visitSUB(SDNode *N) {
     return DAG.getNode(ISD::AND, DL, VT, N0, DAG.getNOT(DL, B, VT));
 
   // fold (A - (-B * C)) -> (A + (B * C))
-  if (sd_match(N1, m_OneUse(m_Mul(m_Sub(m_Zero(), m_Value(B)), m_Value(C)))))
+  if (sd_match(N1, m_OneUse(m_Mul(m_Neg(m_Value(B)), m_Value(C)))))
     return DAG.getNode(ISD::ADD, DL, VT, N0,
                        DAG.getNode(ISD::MUL, DL, VT, B, C));
 
@@ -3828,10 +3866,14 @@ SDValue DAGCombiner::visitSUB(SDNode *N) {
   if (SDValue V = foldAddSubOfSignBit(N, DAG))
     return V;
 
+  // Try to match AVGCEILU fixedwidth pattern
+  if (SDValue V = combineFixedwidthToAVGCEILU(N, DAG))
+    return V;
+
   if (SDValue V = foldAddSubMasked1(false, N0, N1, DAG, SDLoc(N)))
     return V;
 
-  if (SDValue V = foldSubToUSubSat(VT, N))
+  if (SDValue V = foldSubToUSubSat(VT, N, DL))
     return V;
 
   // (A - B) - 1  ->  add (xor B, -1), A
@@ -6655,7 +6697,7 @@ static SDValue combineShiftAnd1ToBitTest(SDNode *And, SelectionDAG &DAG) {
 
 /// For targets that support usubsat, match a bit-hack form of that operation
 /// that ends in 'and' and convert it.
-static SDValue foldAndToUsubsat(SDNode *N, SelectionDAG &DAG) {
+static SDValue foldAndToUsubsat(SDNode *N, SelectionDAG &DAG, const SDLoc &DL) {
   EVT VT = N->getValueType(0);
   unsigned BitWidth = VT.getScalarSizeInBits();
   APInt SignMask = APInt::getSignMask(BitWidth);
@@ -6672,7 +6714,6 @@ static SDValue foldAndToUsubsat(SDNode *N, SelectionDAG &DAG) {
                                         m_SpecificInt(BitWidth - 1))))))
     return SDValue();
 
-  SDLoc DL(N);
   return DAG.getNode(ISD::USUBSAT, DL, VT, X,
                      DAG.getConstant(SignMask, DL, VT));
 }
@@ -6764,34 +6805,34 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
   EVT VT = N1.getValueType();
+  SDLoc DL(N);
 
   // x & x --> x
   if (N0 == N1)
     return N0;
 
   // fold (and c1, c2) -> c1&c2
-  if (SDValue C = DAG.FoldConstantArithmetic(ISD::AND, SDLoc(N), VT, {N0, N1}))
+  if (SDValue C = DAG.FoldConstantArithmetic(ISD::AND, DL, VT, {N0, N1}))
     return C;
 
   // canonicalize constant to RHS
   if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
       !DAG.isConstantIntBuildVectorOrConstantInt(N1))
-    return DAG.getNode(ISD::AND, SDLoc(N), VT, N1, N0);
+    return DAG.getNode(ISD::AND, DL, VT, N1, N0);
 
   if (areBitwiseNotOfEachother(N0, N1))
-    return DAG.getConstant(APInt::getZero(VT.getScalarSizeInBits()), SDLoc(N),
-                           VT);
+    return DAG.getConstant(APInt::getZero(VT.getScalarSizeInBits()), DL, VT);
 
   // fold vector ops
   if (VT.isVector()) {
-    if (SDValue FoldedVOp = SimplifyVBinOp(N, SDLoc(N)))
+    if (SDValue FoldedVOp = SimplifyVBinOp(N, DL))
       return FoldedVOp;
 
     // fold (and x, 0) -> 0, vector edition
     if (ISD::isConstantSplatVectorAllZeros(N1.getNode()))
       // do not return N1, because undef node may exist in N1
-      return DAG.getConstant(APInt::getZero(N1.getScalarValueSizeInBits()),
-                             SDLoc(N), N1.getValueType());
+      return DAG.getConstant(APInt::getZero(N1.getScalarValueSizeInBits()), DL,
+                             N1.getValueType());
 
     // fold (and x, -1) -> x, vector edition
     if (ISD::isConstantSplatVectorAllOnes(N1.getNode()))
@@ -6811,8 +6852,8 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
         uint64_t ElementSize =
             LoadVT.getVectorElementType().getScalarSizeInBits();
         if (Splat->getAPIntValue().isMask(ElementSize)) {
-          auto NewLoad = DAG.getMaskedLoad(
-              ExtVT, SDLoc(N), MLoad->getChain(), MLoad->getBasePtr(),
+          SDValue NewLoad = DAG.getMaskedLoad(
+              ExtVT, DL, MLoad->getChain(), MLoad->getBasePtr(),
               MLoad->getOffset(), MLoad->getMask(), MLoad->getPassThru(),
               LoadVT, MLoad->getMemOperand(), MLoad->getAddressingMode(),
               ISD::ZEXTLOAD, MLoad->isExpandingLoad());
@@ -6834,7 +6875,7 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
   unsigned BitWidth = VT.getScalarSizeInBits();
   ConstantSDNode *N1C = isConstOrConstSplat(N1);
   if (N1C && DAG.MaskedValueIsZero(SDValue(N, 0), APInt::getAllOnes(BitWidth)))
-    return DAG.getConstant(0, SDLoc(N), VT);
+    return DAG.getConstant(0, DL, VT);
 
   if (SDValue R = foldAndOrOfSETCC(N, DAG))
     return R;
@@ -6843,12 +6884,12 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
     return NewSel;
 
   // reassociate and
-  if (SDValue RAND = reassociateOps(ISD::AND, SDLoc(N), N0, N1, N->getFlags()))
+  if (SDValue RAND = reassociateOps(ISD::AND, DL, N0, N1, N->getFlags()))
     return RAND;
 
   // Fold and(vecreduce(x), vecreduce(y)) -> vecreduce(and(x, y))
-  if (SDValue SD = reassociateReduction(ISD::VECREDUCE_AND, ISD::AND, SDLoc(N),
-                                        VT, N0, N1))
+  if (SDValue SD =
+          reassociateReduction(ISD::VECREDUCE_AND, ISD::AND, DL, VT, N0, N1))
     return SD;
 
   // fold (and (or x, C), D) -> D if (C & D) == D
@@ -6868,18 +6909,16 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
 
     // fold (and (any_ext V), c) -> (zero_ext V) if 'and' only clears top bits.
     if (DAG.MaskedValueIsZero(N0Op0, Mask))
-      return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, N0Op0);
+      return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N0Op0);
 
     // fold (and (any_ext V), c) -> (zero_ext (and (trunc V), c)) if profitable.
     if (N1C->getAPIntValue().countLeadingZeros() >= (BitWidth - SrcBitWidth) &&
         TLI.isTruncateFree(VT, SrcVT) && TLI.isZExtFree(SrcVT, VT) &&
         TLI.isTypeDesirableForOp(ISD::AND, SrcVT) &&
-        TLI.isNarrowingProfitable(VT, SrcVT)) {
-      SDLoc DL(N);
+        TLI.isNarrowingProfitable(VT, SrcVT))
       return DAG.getNode(ISD::ZERO_EXTEND, DL, VT,
                          DAG.getNode(ISD::AND, DL, SrcVT, N0Op0,
                                      DAG.getZExtOrTrunc(N1, DL, SrcVT)));
-    }
   }
 
   // fold (and (ext (and V, c1)), c2) -> (and (ext V), (and c1, (ext c2)))
@@ -6891,7 +6930,6 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
         DAG.isConstantIntBuildVectorOrConstantInt(N1) &&
         DAG.isConstantIntBuildVectorOrConstantInt(N0Op0.getOperand(1)) &&
         N0->hasOneUse() && N0Op0->hasOneUse()) {
-      SDLoc DL(N);
       SDValue NewMask =
           DAG.getNode(ISD::AND, DL, VT, N1,
                       DAG.getNode(ExtOpc, DL, VT, N0Op0.getOperand(1)));
@@ -6912,8 +6950,8 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
        N0.getOperand(0).getOpcode() == ISD::LOAD &&
        N0.getOperand(0).getResNo() == 0) ||
       (N0.getOpcode() == ISD::LOAD && N0.getResNo() == 0)) {
-    LoadSDNode *Load = cast<LoadSDNode>( (N0.getOpcode() == ISD::LOAD) ?
-                                         N0 : N0.getOperand(0) );
+    auto *Load =
+        cast<LoadSDNode>((N0.getOpcode() == ISD::LOAD) ? N0 : N0.getOperand(0));
 
     // Get the constant (if applicable) the zero'th operand is being ANDed with.
     // This can be a pure constant or a vector splat, in which case we treat the
@@ -7023,9 +7061,9 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
       //    (and (extract_subvector (zext|anyext|sext v) _) iN_mask)
       // => (extract_subvector (iN_zeroext v))
       SDValue ZeroExtExtendee =
-          DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), ExtVT, Extendee);
+          DAG.getNode(ISD::ZERO_EXTEND, DL, ExtVT, Extendee);
 
-      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), VT, ZeroExtExtendee,
+      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, ZeroExtExtendee,
                          N0.getOperand(1));
     }
   }
@@ -7042,8 +7080,8 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
                        GN0->getBasePtr(), GN0->getIndex(),    GN0->getScale()};
 
       SDValue ZExtLoad = DAG.getMaskedGather(
-          DAG.getVTList(VT, MVT::Other), MemVT, SDLoc(N), Ops,
-          GN0->getMemOperand(), GN0->getIndexType(), ISD::ZEXTLOAD);
+          DAG.getVTList(VT, MVT::Other), MemVT, DL, Ops, GN0->getMemOperand(),
+          GN0->getIndexType(), ISD::ZEXTLOAD);
 
       CombineTo(N, ZExtLoad);
       AddToWorklist(ZExtLoad.getNode());
@@ -7095,7 +7133,7 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
         return SubRHS;
       if (SubRHS.getOpcode() == ISD::SIGN_EXTEND &&
           SubRHS.getOperand(0).getScalarValueSizeInBits() == 1)
-        return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, SubRHS.getOperand(0));
+        return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, SubRHS.getOperand(0));
     }
   }
 
@@ -7109,7 +7147,7 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
   if (ISD::isUNINDEXEDLoad(N0.getNode()) &&
       (ISD::isEXTLoad(N0.getNode()) ||
        (ISD::isSEXTLoad(N0.getNode()) && N0.hasOneUse()))) {
-    LoadSDNode *LN0 = cast<LoadSDNode>(N0);
+    auto *LN0 = cast<LoadSDNode>(N0);
     EVT MemVT = LN0->getMemoryVT();
     // If we zero all the possible extended bits, then we can turn this into
     // a zextload if we are running before legalize or the operation is legal.
@@ -7164,10 +7202,10 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
 
   // Replace (and (sign_extend ...) #bitmask) with (zero_extend ...).
   if (IsAndZeroExtMask(N0, N1))
-    return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, N0.getOperand(0));
+    return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N0.getOperand(0));
 
   if (hasOperation(ISD::USUBSAT, VT))
-    if (SDValue V = foldAndToUsubsat(N, DAG))
+    if (SDValue V = foldAndToUsubsat(N, DAG, DL))
       return V;
 
   // Postpone until legalization completed to avoid interference with bswap
@@ -10724,7 +10762,7 @@ SDValue DAGCombiner::visitSHLSAT(SDNode *N) {
 // (ABS (SUB (EXTEND a), (EXTEND b))).
 // (TRUNC (ABS (SUB (EXTEND a), (EXTEND b)))).
 // Generates UABD/SABD instruction.
-SDValue DAGCombiner::foldABSToABD(SDNode *N) {
+SDValue DAGCombiner::foldABSToABD(SDNode *N, const SDLoc &DL) {
   EVT SrcVT = N->getValueType(0);
 
   if (N->getOpcode() == ISD::TRUNCATE)
@@ -10736,7 +10774,6 @@ SDValue DAGCombiner::foldABSToABD(SDNode *N) {
   EVT VT = N->getValueType(0);
   SDValue AbsOp1 = N->getOperand(0);
   SDValue Op0, Op1;
-  SDLoc DL(N);
 
   if (AbsOp1.getOpcode() != ISD::SUB)
     return SDValue();
@@ -10795,9 +10832,10 @@ SDValue DAGCombiner::foldABSToABD(SDNode *N) {
 SDValue DAGCombiner::visitABS(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   EVT VT = N->getValueType(0);
+  SDLoc DL(N);
 
   // fold (abs c1) -> c2
-  if (SDValue C = DAG.FoldConstantArithmetic(ISD::ABS, SDLoc(N), VT, {N0}))
+  if (SDValue C = DAG.FoldConstantArithmetic(ISD::ABS, DL, VT, {N0}))
     return C;
   // fold (abs (abs x)) -> (abs x)
   if (N0.getOpcode() == ISD::ABS)
@@ -10806,7 +10844,7 @@ SDValue DAGCombiner::visitABS(SDNode *N) {
   if (DAG.SignBitIsZero(N0))
     return N0;
 
-  if (SDValue ABD = foldABSToABD(N))
+  if (SDValue ABD = foldABSToABD(N, DL))
     return ABD;
 
   // fold (abs (sign_extend_inreg x)) -> (zero_extend (abs (truncate x)))
@@ -10816,7 +10854,6 @@ SDValue DAGCombiner::visitABS(SDNode *N) {
     if (TLI.isTruncateFree(VT, ExtVT) && TLI.isZExtFree(ExtVT, VT) &&
         TLI.isTypeDesirableForOp(ISD::ABS, ExtVT) &&
         hasOperation(ISD::ABS, ExtVT)) {
-      SDLoc DL(N);
       return DAG.getNode(
           ISD::ZERO_EXTEND, DL, VT,
           DAG.getNode(ISD::ABS, DL, ExtVT,
@@ -14612,6 +14649,7 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
   EVT VT = N->getValueType(0);
   EVT SrcVT = N0.getValueType();
   bool isLE = DAG.getDataLayout().isLittleEndian();
+  SDLoc DL(N);
 
   // trunc(undef) = undef
   if (N0.isUndef())
@@ -14619,10 +14657,10 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
 
   // fold (truncate (truncate x)) -> (truncate x)
   if (N0.getOpcode() == ISD::TRUNCATE)
-    return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, N0.getOperand(0));
+    return DAG.getNode(ISD::TRUNCATE, DL, VT, N0.getOperand(0));
 
   // fold (truncate c1) -> c1
-  if (SDValue C = DAG.FoldConstantArithmetic(ISD::TRUNCATE, SDLoc(N), VT, {N0}))
+  if (SDValue C = DAG.FoldConstantArithmetic(ISD::TRUNCATE, DL, VT, {N0}))
     return C;
 
   // fold (truncate (ext x)) -> (ext x) or (truncate x) or x
@@ -14631,10 +14669,10 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
       N0.getOpcode() == ISD::ANY_EXTEND) {
     // if the source is smaller than the dest, we still need an extend.
     if (N0.getOperand(0).getValueType().bitsLT(VT))
-      return DAG.getNode(N0.getOpcode(), SDLoc(N), VT, N0.getOperand(0));
+      return DAG.getNode(N0.getOpcode(), DL, VT, N0.getOperand(0));
     // if the source is larger than the dest, than we just need the truncate.
     if (N0.getOperand(0).getValueType().bitsGT(VT))
-      return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, N0.getOperand(0));
+      return DAG.getNode(ISD::TRUNCATE, DL, VT, N0.getOperand(0));
     // if the source and dest are the same type, we can drop both the extend
     // and the truncate.
     return N0.getOperand(0);
@@ -14648,8 +14686,8 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
     SDValue ExtVal = N0.getOperand(1);
     EVT ExtVT = cast<VTSDNode>(ExtVal)->getVT();
     if (ExtVT.bitsLT(VT) && TLI.preferSextInRegOfTruncate(VT, SrcVT, ExtVT)) {
-      SDValue TrX = DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, X);
-      return DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), VT, TrX, ExtVal);
+      SDValue TrX = DAG.getNode(ISD::TRUNCATE, DL, VT, X);
+      return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, TrX, ExtVal);
     }
   }
 
@@ -14684,8 +14722,6 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
     if (isa<ConstantSDNode>(EltNo) && isTypeLegal(NVT)) {
       int Elt = EltNo->getAsZExtVal();
       int Index = isLE ? (Elt*SizeRatio) : (Elt*SizeRatio + (SizeRatio-1));
-
-      SDLoc DL(N);
       return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, TrTy,
                          DAG.getBitcast(NVT, N0.getOperand(0)),
                          DAG.getVectorIdxConstant(Index, DL));
@@ -14700,7 +14736,7 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
       SDValue Cond = N0.getOperand(0);
       SDValue TruncOp0 = DAG.getNode(ISD::TRUNCATE, SL, VT, N0.getOperand(1));
       SDValue TruncOp1 = DAG.getNode(ISD::TRUNCATE, SL, VT, N0.getOperand(2));
-      return DAG.getNode(ISD::SELECT, SDLoc(N), VT, Cond, TruncOp0, TruncOp1);
+      return DAG.getNode(ISD::SELECT, DL, VT, Cond, TruncOp0, TruncOp1);
     }
   }
 
@@ -14712,22 +14748,20 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
     KnownBits Known = DAG.computeKnownBits(Amt);
     unsigned Size = VT.getScalarSizeInBits();
     if (Known.countMaxActiveBits() <= Log2_32(Size)) {
-      SDLoc SL(N);
       EVT AmtVT = TLI.getShiftAmountTy(VT, DAG.getDataLayout());
-
-      SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, VT, N0.getOperand(0));
+      SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, VT, N0.getOperand(0));
       if (AmtVT != Amt.getValueType()) {
-        Amt = DAG.getZExtOrTrunc(Amt, SL, AmtVT);
+        Amt = DAG.getZExtOrTrunc(Amt, DL, AmtVT);
         AddToWorklist(Amt.getNode());
       }
-      return DAG.getNode(ISD::SHL, SL, VT, Trunc, Amt);
+      return DAG.getNode(ISD::SHL, DL, VT, Trunc, Amt);
     }
   }
 
-  if (SDValue V = foldSubToUSubSat(VT, N0.getNode()))
+  if (SDValue V = foldSubToUSubSat(VT, N0.getNode(), DL))
     return V;
 
-  if (SDValue ABD = foldABSToABD(N))
+  if (SDValue ABD = foldABSToABD(N, DL))
     return ABD;
 
   // Attempt to pre-truncate BUILD_VECTOR sources.
@@ -14736,7 +14770,6 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
       TLI.isTruncateFree(SrcVT.getScalarType(), VT.getScalarType()) &&
       // Avoid creating illegal types if running after type legalizer.
       (!LegalTypes || TLI.isTypeLegal(VT.getScalarType()))) {
-    SDLoc DL(N);
     EVT SVT = VT.getScalarType();
     SmallVector<SDValue, 8> TruncOps;
     for (const SDValue &Op : N0->op_values()) {
@@ -14750,7 +14783,6 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
   if (N0.getOpcode() == ISD::SPLAT_VECTOR &&
       (!LegalTypes || TLI.isTypeLegal(VT.getScalarType())) &&
       (!LegalOperations || TLI.isOperationLegal(ISD::SPLAT_VECTOR, VT))) {
-    SDLoc DL(N);
     EVT SVT = VT.getScalarType();
     return DAG.getSplatVector(
         VT, DL, DAG.getNode(ISD::TRUNCATE, DL, SVT, N0->getOperand(0)));
@@ -14782,7 +14814,7 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
       for (unsigned i = 0, e = BuildVecNumElts; i != e; i += TruncEltOffset)
         Opnds.push_back(BuildVect.getOperand(i));
 
-      return DAG.getBuildVector(VT, SDLoc(N), Opnds);
+      return DAG.getBuildVector(VT, DL, Opnds);
     }
   }
 
@@ -14845,7 +14877,7 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
         AddToWorklist(NV.getNode());
         Opnds.push_back(NV);
       }
-      return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Opnds);
+      return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Opnds);
     }
   }
 
@@ -14859,11 +14891,9 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
     if (VecSrcVT.isVector() && VecSrcVT.getScalarType() == VT &&
         (!LegalOperations ||
          TLI.isOperationLegal(ISD::EXTRACT_VECTOR_ELT, VecSrcVT))) {
-      SDLoc SL(N);
-
       unsigned Idx = isLE ? 0 : VecSrcVT.getVectorNumElements() - 1;
-      return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, VT, VecSrc,
-                         DAG.getVectorIdxConstant(Idx, SL));
+      return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VecSrc,
+                         DAG.getVectorIdxConstant(Idx, DL));
     }
   }
 
@@ -14908,7 +14938,6 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
       // we are extra cautious to not create an unsupported operation.
       // Target-specific changes are likely needed to avoid regressions here.
       if (VT.isScalarInteger() || TLI.isOperationLegal(N0.getOpcode(), VT)) {
-        SDLoc DL(N);
         SDValue NarrowL = DAG.getNode(ISD::TRUNCATE, DL, VT, N0.getOperand(0));
         SDValue NarrowR = DAG.getNode(ISD::TRUNCATE, DL, VT, N0.getOperand(1));
         return DAG.getNode(N0.getOpcode(), DL, VT, NarrowL, NarrowR);
@@ -14925,7 +14954,6 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
     if (((!LegalOperations && N0.getOpcode() == ISD::UADDO_CARRY) ||
          TLI.isOperationLegal(N0.getOpcode(), VT)) &&
         N0.hasOneUse() && !N0->hasAnyUseOfValue(1)) {
-      SDLoc DL(N);
       SDValue X = DAG.getNode(ISD::TRUNCATE, DL, VT, N0.getOperand(0));
       SDValue Y = DAG.getNode(ISD::TRUNCATE, DL, VT, N0.getOperand(1));
       SDVTList VTs = DAG.getVTList(VT, N0->getValueType(1));
@@ -14942,7 +14970,7 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
             VT.getScalarSizeInBits() &&
         hasOperation(N0.getOpcode(), VT)) {
       return getTruncatedUSUBSAT(VT, SrcVT, N0.getOperand(0), N0.getOperand(1),
-                                 DAG, SDLoc(N));
+                                 DAG, DL);
     }
     break;
   }
@@ -22436,17 +22464,16 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) {
     //      -> extract_vector_elt b, 0
     // extract_vector_elt (concat_vectors v2i16:a, v2i16:b), 3
     //      -> extract_vector_elt b, 1
-    SDLoc SL(N);
     EVT ConcatVT = VecOp.getOperand(0).getValueType();
     unsigned ConcatNumElts = ConcatVT.getVectorNumElements();
-    SDValue NewIdx = DAG.getConstant(Elt % ConcatNumElts, SL,
+    SDValue NewIdx = DAG.getConstant(Elt % ConcatNumElts, DL,
                                      Index.getValueType());
 
     SDValue ConcatOp = VecOp.getOperand(Elt / ConcatNumElts);
-    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL,
+    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
                               ConcatVT.getVectorElementType(),
                               ConcatOp, NewIdx);
-    return DAG.getNode(ISD::BITCAST, SL, ScalarVT, Elt);
+    return DAG.getNode(ISD::BITCAST, DL, ScalarVT, Elt);
   }
 
   // Make sure we found a non-volatile load and the extractelement is
@@ -24174,7 +24201,7 @@ static SDValue narrowExtractedVectorLoad(SDNode *Extract, SelectionDAG &DAG) {
   // TODO: Use "BaseIndexOffset" to make this more effective.
   SDValue NewAddr = DAG.getMemBasePlusOffset(Ld->getBasePtr(), Offset, DL);
 
-  uint64_t StoreSize = MemoryLocation::getSizeOrUnknown(VT.getStoreSize());
+  LocationSize StoreSize = MemoryLocation::getSizeOrUnknown(VT.getStoreSize());
   MachineFunction &MF = DAG.getMachineFunction();
   MachineMemOperand *MMO;
   if (Offset.isScalable()) {
@@ -27819,14 +27846,13 @@ bool DAGCombiner::mayAlias(SDNode *Op0, SDNode *Op1) const {
                  : (LSN->getAddressingMode() == ISD::PRE_DEC)
                      ? -1 * C->getSExtValue()
                      : 0;
-      uint64_t Size =
+      LocationSize Size =
           MemoryLocation::getSizeOrUnknown(LSN->getMemoryVT().getStoreSize());
       return {LSN->isVolatile(),
               LSN->isAtomic(),
               LSN->getBasePtr(),
               Offset /*base offset*/,
-              Size != ~UINT64_C(0) ? LocationSize::precise(Size)
-                                   : LocationSize::beforeOrAfterPointer(),
+              Size,
               LSN->getMemOperand()};
     }
     if (const auto *LN = cast<LifetimeSDNode>(N))
diff --git a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp
index cce91dbd9531c9..8b834862fb4d89 100644
--- a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp
@@ -1205,27 +1205,27 @@ void FastISel::handleDbgInfo(const Instruction *II) {
       continue;
     }
 
-    DPValue &DPV = cast<DPValue>(DR);
+    DbgVariableRecord &DVR = cast<DbgVariableRecord>(DR);
 
     Value *V = nullptr;
-    if (!DPV.hasArgList())
-      V = DPV.getVariableLocationOp(0);
+    if (!DVR.hasArgList())
+      V = DVR.getVariableLocationOp(0);
 
     bool Res = false;
-    if (DPV.getType() == DPValue::LocationType::Value ||
-        DPV.getType() == DPValue::LocationType::Assign) {
-      Res = lowerDbgValue(V, DPV.getExpression(), DPV.getVariable(),
-                          DPV.getDebugLoc());
+    if (DVR.getType() == DbgVariableRecord::LocationType::Value ||
+        DVR.getType() == DbgVariableRecord::LocationType::Assign) {
+      Res = lowerDbgValue(V, DVR.getExpression(), DVR.getVariable(),
+                          DVR.getDebugLoc());
     } else {
-      assert(DPV.getType() == DPValue::LocationType::Declare);
-      if (FuncInfo.PreprocessedDPVDeclares.contains(&DPV))
+      assert(DVR.getType() == DbgVariableRecord::LocationType::Declare);
+      if (FuncInfo.PreprocessedDVRDeclares.contains(&DVR))
         continue;
-      Res = lowerDbgDeclare(V, DPV.getExpression(), DPV.getVariable(),
-                            DPV.getDebugLoc());
+      Res = lowerDbgDeclare(V, DVR.getExpression(), DVR.getVariable(),
+                            DVR.getDebugLoc());
     }
 
     if (!Res)
-      LLVM_DEBUG(dbgs() << "Dropping debug-info for " << DPV << "\n";);
+      LLVM_DEBUG(dbgs() << "Dropping debug-info for " << DVR << "\n";);
   }
 }
 
diff --git a/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp b/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
index e01cd8cbf925a2..8fb6b11b8805c5 100644
--- a/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
@@ -358,7 +358,7 @@ void FunctionLoweringInfo::clear() {
   StatepointRelocationMaps.clear();
   PreferredExtendType.clear();
   PreprocessedDbgDeclares.clear();
-  PreprocessedDPVDeclares.clear();
+  PreprocessedDVRDeclares.clear();
 }
 
 /// CreateReg - Allocate a single virtual register for the given type.
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index 2dccc45c803a05..808e3c622033e0 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -267,7 +267,9 @@ static MachineMemOperand *getStackAlignedMMO(SDValue StackPtr,
   auto &MFI = MF.getFrameInfo();
   int FI = cast<FrameIndexSDNode>(StackPtr)->getIndex();
   MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
-  uint64_t ObjectSize = isObjectScalable ? ~UINT64_C(0) : MFI.getObjectSize(FI);
+  LocationSize ObjectSize = isObjectScalable
+                                ? LocationSize::beforeOrAfterPointer()
+                                : LocationSize::precise(MFI.getObjectSize(FI));
   return MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
                                  ObjectSize, MFI.getObjectAlign(FI));
 }
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index 52e12cf364066b..93ce9c22af5525 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -349,6 +349,26 @@ SDValue DAGTypeLegalizer::PromoteIntRes_Atomic0(AtomicSDNode *N) {
                               N->getMemoryVT(), ResVT,
                               N->getChain(), N->getBasePtr(),
                               N->getMemOperand());
+  if (N->getOpcode() == ISD::ATOMIC_LOAD) {
+    ISD::LoadExtType ETy = cast<AtomicSDNode>(N)->getExtensionType();
+    if (ETy == ISD::NON_EXTLOAD) {
+      switch (TLI.getExtendForAtomicOps()) {
+      case ISD::SIGN_EXTEND:
+        ETy = ISD::SEXTLOAD;
+        break;
+      case ISD::ZERO_EXTEND:
+        ETy = ISD::ZEXTLOAD;
+        break;
+      case ISD::ANY_EXTEND:
+        ETy = ISD::EXTLOAD;
+        break;
+      default:
+        llvm_unreachable("Invalid atomic op extension");
+      }
+    }
+    cast<AtomicSDNode>(Res)->setExtensionType(ETy);
+  }
+
   // Legalize the chain result - switch anything that used the old chain to
   // use the new one.
   ReplaceValueWith(SDValue(N, 1), Res.getValue(1));
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 5fb9d8d07d1514..1f6e0097f31ab4 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -1967,7 +1967,8 @@ void DAGTypeLegalizer::SplitVecRes_VP_LOAD(VPLoadSDNode *LD, SDValue &Lo,
 
   MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
       LD->getPointerInfo(), MachineMemOperand::MOLoad,
-      MemoryLocation::UnknownSize, Alignment, LD->getAAInfo(), LD->getRanges());
+      LocationSize::beforeOrAfterPointer(), Alignment, LD->getAAInfo(),
+      LD->getRanges());
 
   Lo =
       DAG.getLoadVP(LD->getAddressingMode(), ExtType, LoVT, dl, Ch, Ptr, Offset,
@@ -1990,8 +1991,8 @@ void DAGTypeLegalizer::SplitVecRes_VP_LOAD(VPLoadSDNode *LD, SDValue &Lo,
           LoMemVT.getStoreSize().getFixedValue());
 
     MMO = DAG.getMachineFunction().getMachineMemOperand(
-        MPI, MachineMemOperand::MOLoad, MemoryLocation::UnknownSize, Alignment,
-        LD->getAAInfo(), LD->getRanges());
+        MPI, MachineMemOperand::MOLoad, LocationSize::beforeOrAfterPointer(),
+        Alignment, LD->getAAInfo(), LD->getRanges());
 
     Hi = DAG.getLoadVP(LD->getAddressingMode(), ExtType, HiVT, dl, Ch, Ptr,
                        Offset, MaskHi, EVLHi, HiMemVT, MMO,
@@ -2070,8 +2071,8 @@ void DAGTypeLegalizer::SplitVecRes_VP_STRIDED_LOAD(VPStridedLoadSDNode *SLD,
 
     MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
         MachinePointerInfo(SLD->getPointerInfo().getAddrSpace()),
-        MachineMemOperand::MOLoad, MemoryLocation::UnknownSize, Alignment,
-        SLD->getAAInfo(), SLD->getRanges());
+        MachineMemOperand::MOLoad, LocationSize::beforeOrAfterPointer(),
+        Alignment, SLD->getAAInfo(), SLD->getRanges());
 
     Hi = DAG.getStridedLoadVP(SLD->getAddressingMode(), SLD->getExtensionType(),
                               HiVT, DL, SLD->getChain(), Ptr, SLD->getOffset(),
@@ -2130,7 +2131,7 @@ void DAGTypeLegalizer::SplitVecRes_MLOAD(MaskedLoadSDNode *MLD,
 
   MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
       MLD->getPointerInfo(), MachineMemOperand::MOLoad,
-      MemoryLocation::UnknownSize, Alignment, MLD->getAAInfo(),
+      LocationSize::beforeOrAfterPointer(), Alignment, MLD->getAAInfo(),
       MLD->getRanges());
 
   Lo = DAG.getMaskedLoad(LoVT, dl, Ch, Ptr, Offset, MaskLo, PassThruLo, LoMemVT,
@@ -2154,8 +2155,8 @@ void DAGTypeLegalizer::SplitVecRes_MLOAD(MaskedLoadSDNode *MLD,
           LoMemVT.getStoreSize().getFixedValue());
 
     MMO = DAG.getMachineFunction().getMachineMemOperand(
-        MPI, MachineMemOperand::MOLoad, MemoryLocation::UnknownSize, Alignment,
-        MLD->getAAInfo(), MLD->getRanges());
+        MPI, MachineMemOperand::MOLoad, LocationSize::beforeOrAfterPointer(),
+        Alignment, MLD->getAAInfo(), MLD->getRanges());
 
     Hi = DAG.getMaskedLoad(HiVT, dl, Ch, Ptr, Offset, MaskHi, PassThruHi,
                            HiMemVT, MMO, MLD->getAddressingMode(), ExtType,
@@ -2217,7 +2218,8 @@ void DAGTypeLegalizer::SplitVecRes_Gather(MemSDNode *N, SDValue &Lo,
 
   MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
       N->getPointerInfo(), MachineMemOperand::MOLoad,
-      MemoryLocation::UnknownSize, Alignment, N->getAAInfo(), N->getRanges());
+      LocationSize::beforeOrAfterPointer(), Alignment, N->getAAInfo(),
+      N->getRanges());
 
   if (auto *MGT = dyn_cast<MaskedGatherSDNode>(N)) {
     SDValue PassThru = MGT->getPassThru();
@@ -2884,10 +2886,10 @@ void DAGTypeLegalizer::SplitVecRes_VP_REVERSE(SDNode *N, SDValue &Lo,
   auto PtrInfo = MachinePointerInfo::getFixedStack(MF, FrameIndex);
 
   MachineMemOperand *StoreMMO = DAG.getMachineFunction().getMachineMemOperand(
-      PtrInfo, MachineMemOperand::MOStore, MemoryLocation::UnknownSize,
+      PtrInfo, MachineMemOperand::MOStore, LocationSize::beforeOrAfterPointer(),
       Alignment);
   MachineMemOperand *LoadMMO = DAG.getMachineFunction().getMachineMemOperand(
-      PtrInfo, MachineMemOperand::MOLoad, MemoryLocation::UnknownSize,
+      PtrInfo, MachineMemOperand::MOLoad, LocationSize::beforeOrAfterPointer(),
       Alignment);
 
   unsigned EltWidth = VT.getScalarSizeInBits() / 8;
@@ -3478,7 +3480,8 @@ SDValue DAGTypeLegalizer::SplitVecOp_VP_STORE(VPStoreSDNode *N, unsigned OpNo) {
   SDValue Lo, Hi;
   MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
       N->getPointerInfo(), MachineMemOperand::MOStore,
-      MemoryLocation::UnknownSize, Alignment, N->getAAInfo(), N->getRanges());
+      LocationSize::beforeOrAfterPointer(), Alignment, N->getAAInfo(),
+      N->getRanges());
 
   Lo = DAG.getStoreVP(Ch, DL, DataLo, Ptr, Offset, MaskLo, EVLLo, LoMemVT, MMO,
                       N->getAddressingMode(), N->isTruncatingStore(),
@@ -3501,8 +3504,8 @@ SDValue DAGTypeLegalizer::SplitVecOp_VP_STORE(VPStoreSDNode *N, unsigned OpNo) {
         LoMemVT.getStoreSize().getFixedValue());
 
   MMO = DAG.getMachineFunction().getMachineMemOperand(
-      MPI, MachineMemOperand::MOStore, MemoryLocation::UnknownSize, Alignment,
-      N->getAAInfo(), N->getRanges());
+      MPI, MachineMemOperand::MOStore, LocationSize::beforeOrAfterPointer(),
+      Alignment, N->getAAInfo(), N->getRanges());
 
   Hi = DAG.getStoreVP(Ch, DL, DataHi, Ptr, Offset, MaskHi, EVLHi, HiMemVT, MMO,
                       N->getAddressingMode(), N->isTruncatingStore(),
@@ -3574,8 +3577,8 @@ SDValue DAGTypeLegalizer::SplitVecOp_VP_STRIDED_STORE(VPStridedStoreSDNode *N,
 
   MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
       MachinePointerInfo(N->getPointerInfo().getAddrSpace()),
-      MachineMemOperand::MOStore, MemoryLocation::UnknownSize, Alignment,
-      N->getAAInfo(), N->getRanges());
+      MachineMemOperand::MOStore, LocationSize::beforeOrAfterPointer(),
+      Alignment, N->getAAInfo(), N->getRanges());
 
   SDValue Hi = DAG.getStridedStoreVP(
       N->getChain(), DL, HiData, Ptr, N->getOffset(), N->getStride(), HiMask,
@@ -3626,7 +3629,8 @@ SDValue DAGTypeLegalizer::SplitVecOp_MSTORE(MaskedStoreSDNode *N,
   SDValue Lo, Hi, Res;
   MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
       N->getPointerInfo(), MachineMemOperand::MOStore,
-      MemoryLocation::UnknownSize, Alignment, N->getAAInfo(), N->getRanges());
+      LocationSize::beforeOrAfterPointer(), Alignment, N->getAAInfo(),
+      N->getRanges());
 
   Lo = DAG.getMaskedStore(Ch, DL, DataLo, Ptr, Offset, MaskLo, LoMemVT, MMO,
                           N->getAddressingMode(), N->isTruncatingStore(),
@@ -3651,8 +3655,8 @@ SDValue DAGTypeLegalizer::SplitVecOp_MSTORE(MaskedStoreSDNode *N,
           LoMemVT.getStoreSize().getFixedValue());
 
     MMO = DAG.getMachineFunction().getMachineMemOperand(
-        MPI, MachineMemOperand::MOStore, MemoryLocation::UnknownSize, Alignment,
-        N->getAAInfo(), N->getRanges());
+        MPI, MachineMemOperand::MOStore, LocationSize::beforeOrAfterPointer(),
+        Alignment, N->getAAInfo(), N->getRanges());
 
     Hi = DAG.getMaskedStore(Ch, DL, DataHi, Ptr, Offset, MaskHi, HiMemVT, MMO,
                             N->getAddressingMode(), N->isTruncatingStore(),
@@ -3716,7 +3720,8 @@ SDValue DAGTypeLegalizer::SplitVecOp_Scatter(MemSDNode *N, unsigned OpNo) {
   SDValue Lo;
   MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
       N->getPointerInfo(), MachineMemOperand::MOStore,
-      MemoryLocation::UnknownSize, Alignment, N->getAAInfo(), N->getRanges());
+      LocationSize::beforeOrAfterPointer(), Alignment, N->getAAInfo(),
+      N->getRanges());
 
   if (auto *MSC = dyn_cast<MaskedScatterSDNode>(N)) {
     SDValue OpsLo[] = {Ch, DataLo, MaskLo, Ptr, IndexLo, Ops.Scale};
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 19f9354c23da42..9d73a42df2a479 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -4070,6 +4070,9 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts,
     if (Op.getResNo() == 0) {
       if (TLI->getExtendForAtomicOps() == ISD::ZERO_EXTEND)
         Known.Zero.setBitsFrom(MemBits);
+      else if (Op->getOpcode() == ISD::ATOMIC_LOAD &&
+               cast<AtomicSDNode>(Op)->getExtensionType() == ISD::ZEXTLOAD)
+        Known.Zero.setBitsFrom(MemBits);
     }
     break;
   }
@@ -4875,6 +4878,13 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, const APInt &DemandedElts,
         return VTBits - Tmp + 1;
       if (TLI->getExtendForAtomicOps() == ISD::ZERO_EXTEND)
         return VTBits - Tmp;
+      if (Op->getOpcode() == ISD::ATOMIC_LOAD) {
+        ISD::LoadExtType ETy = cast<AtomicSDNode>(Op)->getExtensionType();
+        if (ETy == ISD::SEXTLOAD)
+          return VTBits - Tmp + 1;
+        if (ETy == ISD::ZEXTLOAD)
+          return VTBits - Tmp;
+      }
     }
     break;
   }
@@ -8392,11 +8402,12 @@ SDValue SelectionDAG::getMergeValues(ArrayRef<SDValue> Ops, const SDLoc &dl) {
 SDValue SelectionDAG::getMemIntrinsicNode(
     unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef<SDValue> Ops,
     EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment,
-    MachineMemOperand::Flags Flags, uint64_t Size, const AAMDNodes &AAInfo) {
-  if (!Size && MemVT.isScalableVector())
-    Size = MemoryLocation::UnknownSize;
-  else if (!Size)
-    Size = MemVT.getStoreSize();
+    MachineMemOperand::Flags Flags, LocationSize Size,
+    const AAMDNodes &AAInfo) {
+  if (Size.hasValue() && MemVT.isScalableVector())
+    Size = LocationSize::beforeOrAfterPointer();
+  else if (Size.hasValue() && !Size.getValue())
+    Size = LocationSize::precise(MemVT.getStoreSize());
 
   MachineFunction &MF = getMachineFunction();
   MachineMemOperand *MMO =
@@ -8558,7 +8569,7 @@ SDValue SelectionDAG::getLoad(ISD::MemIndexedMode AM, ISD::LoadExtType ExtType,
   if (PtrInfo.V.isNull())
     PtrInfo = InferPointerInfo(PtrInfo, *this, Ptr, Offset);
 
-  uint64_t Size = MemoryLocation::getSizeOrUnknown(MemVT.getStoreSize());
+  LocationSize Size = MemoryLocation::getSizeOrUnknown(MemVT.getStoreSize());
   MachineFunction &MF = getMachineFunction();
   MachineMemOperand *MMO = MF.getMachineMemOperand(PtrInfo, MMOFlags, Size,
                                                    Alignment, AAInfo, Ranges);
@@ -8679,7 +8690,7 @@ SDValue SelectionDAG::getStore(SDValue Chain, const SDLoc &dl, SDValue Val,
     PtrInfo = InferPointerInfo(PtrInfo, *this, Ptr);
 
   MachineFunction &MF = getMachineFunction();
-  uint64_t Size =
+  LocationSize Size =
       MemoryLocation::getSizeOrUnknown(Val.getValueType().getStoreSize());
   MachineMemOperand *MMO =
       MF.getMachineMemOperand(PtrInfo, MMOFlags, Size, Alignment, AAInfo);
@@ -8828,7 +8839,7 @@ SDValue SelectionDAG::getLoadVP(
   if (PtrInfo.V.isNull())
     PtrInfo = InferPointerInfo(PtrInfo, *this, Ptr, Offset);
 
-  uint64_t Size = MemoryLocation::getSizeOrUnknown(MemVT.getStoreSize());
+  LocationSize Size = MemoryLocation::getSizeOrUnknown(MemVT.getStoreSize());
   MachineFunction &MF = getMachineFunction();
   MachineMemOperand *MMO = MF.getMachineMemOperand(PtrInfo, MMOFlags, Size,
                                                    Alignment, AAInfo, Ranges);
@@ -9584,6 +9595,10 @@ SDValue SelectionDAG::simplifyShift(SDValue X, SDValue Y) {
   if (ISD::matchUnaryPredicate(Y, isShiftTooBig, true))
     return getUNDEF(X.getValueType());
 
+  // shift i1/vXi1 X, Y --> X (any non-zero shift amount is undefined).
+  if (X.getValueType().getScalarType() == MVT::i1)
+    return X;
+
   return SDValue();
 }
 
@@ -11718,8 +11733,10 @@ MemSDNode::MemSDNode(unsigned Opc, unsigned Order, const DebugLoc &dl,
   // the MMO. This is because the MMO might indicate only a possible address
   // range instead of specifying the affected memory addresses precisely.
   // TODO: Make MachineMemOperands aware of scalable vectors.
-  assert(memvt.getStoreSize().getKnownMinValue() <= MMO->getSize() &&
-         "Size mismatch!");
+  assert(
+      (!MMO->getType().isValid() ||
+       memvt.getStoreSize().getKnownMinValue() <= MMO->getSize().getValue()) &&
+      "Size mismatch!");
 }
 
 /// Profile - Gather unique data for the node.
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index b6a35f7ad4c4c6..dd19ee16d1d656 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -1245,14 +1245,14 @@ void SelectionDAGBuilder::visitDbgInfo(const Instruction &I) {
     }
   }
 
-  // We must skip DPValues if they've already been processed above as we
-  // have just emitted the debug values resulting from assignment tracking
-  // analysis, making any existing DPValues redundant (and probably less
-  // correct). We still need to process DPLabels. This does sink DPLabels
+  // We must skip DbgVariableRecords if they've already been processed above as
+  // we have just emitted the debug values resulting from assignment tracking
+  // analysis, making any existing DbgVariableRecords redundant (and probably
+  // less correct). We still need to process DPLabels. This does sink DPLabels
   // to the bottom of the group of debug records. That sholdn't be important
-  // as it does so deterministcally and ordering between DPLabels and DPValues
-  // is immaterial (other than for MIR/IR printing).
-  bool SkipDPValues = DAG.getFunctionVarLocs();
+  // as it does so deterministcally and ordering between DPLabels and
+  // DbgVariableRecords is immaterial (other than for MIR/IR printing).
+  bool SkipDbgVariableRecords = DAG.getFunctionVarLocs();
   // Is there is any debug-info attached to this instruction, in the form of
   // DbgRecord non-instruction debug-info records.
   for (DbgRecord &DR : I.getDbgRecordRange()) {
@@ -1264,44 +1264,45 @@ void SelectionDAGBuilder::visitDbgInfo(const Instruction &I) {
       continue;
     }
 
-    if (SkipDPValues)
+    if (SkipDbgVariableRecords)
       continue;
-    DPValue &DPV = cast<DPValue>(DR);
-    DILocalVariable *Variable = DPV.getVariable();
-    DIExpression *Expression = DPV.getExpression();
+    DbgVariableRecord &DVR = cast<DbgVariableRecord>(DR);
+    DILocalVariable *Variable = DVR.getVariable();
+    DIExpression *Expression = DVR.getExpression();
     dropDanglingDebugInfo(Variable, Expression);
 
-    if (DPV.getType() == DPValue::LocationType::Declare) {
-      if (FuncInfo.PreprocessedDPVDeclares.contains(&DPV))
+    if (DVR.getType() == DbgVariableRecord::LocationType::Declare) {
+      if (FuncInfo.PreprocessedDVRDeclares.contains(&DVR))
         continue;
-      LLVM_DEBUG(dbgs() << "SelectionDAG visiting dbg_declare: " << DPV
+      LLVM_DEBUG(dbgs() << "SelectionDAG visiting dbg_declare: " << DVR
                         << "\n");
-      handleDebugDeclare(DPV.getVariableLocationOp(0), Variable, Expression,
-                         DPV.getDebugLoc());
+      handleDebugDeclare(DVR.getVariableLocationOp(0), Variable, Expression,
+                         DVR.getDebugLoc());
       continue;
     }
 
-    // A DPValue with no locations is a kill location.
-    SmallVector<Value *, 4> Values(DPV.location_ops());
+    // A DbgVariableRecord with no locations is a kill location.
+    SmallVector<Value *, 4> Values(DVR.location_ops());
     if (Values.empty()) {
-      handleKillDebugValue(Variable, Expression, DPV.getDebugLoc(),
+      handleKillDebugValue(Variable, Expression, DVR.getDebugLoc(),
                            SDNodeOrder);
       continue;
     }
 
-    // A DPValue with an undef or absent location is also a kill location.
+    // A DbgVariableRecord with an undef or absent location is also a kill
+    // location.
     if (llvm::any_of(Values,
                      [](Value *V) { return !V || isa<UndefValue>(V); })) {
-      handleKillDebugValue(Variable, Expression, DPV.getDebugLoc(),
+      handleKillDebugValue(Variable, Expression, DVR.getDebugLoc(),
                            SDNodeOrder);
       continue;
     }
 
-    bool IsVariadic = DPV.hasArgList();
-    if (!handleDebugValue(Values, Variable, Expression, DPV.getDebugLoc(),
+    bool IsVariadic = DVR.hasArgList();
+    if (!handleDebugValue(Values, Variable, Expression, DVR.getDebugLoc(),
                           SDNodeOrder, IsVariadic)) {
       addDanglingDebugInfo(Values, Variable, Expression, IsVariadic,
-                           DPV.getDebugLoc(), SDNodeOrder);
+                           DVR.getDebugLoc(), SDNodeOrder);
     }
   }
 }
@@ -3037,7 +3038,8 @@ static SDValue getLoadStackGuard(SelectionDAG &DAG, const SDLoc &DL,
     auto Flags = MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant |
                  MachineMemOperand::MODereferenceable;
     MachineMemOperand *MemRef = MF.getMachineMemOperand(
-        MPInfo, Flags, PtrTy.getSizeInBits() / 8, DAG.getEVTAlign(PtrTy));
+        MPInfo, Flags, LocationSize::precise(PtrTy.getSizeInBits() / 8),
+        DAG.getEVTAlign(PtrTy));
     DAG.setNodeMemRefs(Node, {MemRef});
   }
   if (PtrTy != PtrMemTy)
@@ -4753,7 +4755,7 @@ void SelectionDAGBuilder::visitMaskedStore(const CallInst &I,
 
   MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
       MachinePointerInfo(PtrOperand), MachineMemOperand::MOStore,
-      MemoryLocation::UnknownSize, Alignment, I.getAAMetadata());
+      LocationSize::beforeOrAfterPointer(), Alignment, I.getAAMetadata());
   SDValue StoreNode =
       DAG.getMaskedStore(getMemoryRoot(), sdl, Src0, Ptr, Offset, Mask, VT, MMO,
                          ISD::UNINDEXED, false /* Truncating */, IsCompressing);
@@ -4857,9 +4859,7 @@ void SelectionDAGBuilder::visitMaskedScatter(const CallInst &I) {
   unsigned AS = Ptr->getType()->getScalarType()->getPointerAddressSpace();
   MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
       MachinePointerInfo(AS), MachineMemOperand::MOStore,
-      // TODO: Make MachineMemOperands aware of scalable
-      // vectors.
-      MemoryLocation::UnknownSize, Alignment, I.getAAMetadata());
+      LocationSize::beforeOrAfterPointer(), Alignment, I.getAAMetadata());
   if (!UniformBase) {
     Base = DAG.getConstant(0, sdl, TLI.getPointerTy(DAG.getDataLayout()));
     Index = getValue(Ptr);
@@ -4925,7 +4925,7 @@ void SelectionDAGBuilder::visitMaskedLoad(const CallInst &I, bool IsExpanding) {
 
   MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
       MachinePointerInfo(PtrOperand), MachineMemOperand::MOLoad,
-      MemoryLocation::UnknownSize, Alignment, AAInfo, Ranges);
+      LocationSize::beforeOrAfterPointer(), Alignment, AAInfo, Ranges);
 
   SDValue Load =
       DAG.getMaskedLoad(VT, sdl, InChain, Ptr, Offset, Mask, Src0, VT, MMO,
@@ -4961,9 +4961,7 @@ void SelectionDAGBuilder::visitMaskedGather(const CallInst &I) {
   unsigned AS = Ptr->getType()->getScalarType()->getPointerAddressSpace();
   MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
       MachinePointerInfo(AS), MachineMemOperand::MOLoad,
-      // TODO: Make MachineMemOperands aware of scalable
-      // vectors.
-      MemoryLocation::UnknownSize, Alignment, I.getAAMetadata(), Ranges);
+      LocationSize::beforeOrAfterPointer(), Alignment, I.getAAMetadata(), Ranges);
 
   if (!UniformBase) {
     Base = DAG.getConstant(0, sdl, TLI.getPointerTy(DAG.getDataLayout()));
@@ -5003,9 +5001,9 @@ void SelectionDAGBuilder::visitAtomicCmpXchg(const AtomicCmpXchgInst &I) {
 
   MachineFunction &MF = DAG.getMachineFunction();
   MachineMemOperand *MMO = MF.getMachineMemOperand(
-      MachinePointerInfo(I.getPointerOperand()), Flags, MemVT.getStoreSize(),
-      DAG.getEVTAlign(MemVT), AAMDNodes(), nullptr, SSID, SuccessOrdering,
-      FailureOrdering);
+      MachinePointerInfo(I.getPointerOperand()), Flags,
+      LocationSize::precise(MemVT.getStoreSize()), DAG.getEVTAlign(MemVT),
+      AAMDNodes(), nullptr, SSID, SuccessOrdering, FailureOrdering);
 
   SDValue L = DAG.getAtomicCmpSwap(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS,
                                    dl, MemVT, VTs, InChain,
@@ -5057,8 +5055,9 @@ void SelectionDAGBuilder::visitAtomicRMW(const AtomicRMWInst &I) {
 
   MachineFunction &MF = DAG.getMachineFunction();
   MachineMemOperand *MMO = MF.getMachineMemOperand(
-      MachinePointerInfo(I.getPointerOperand()), Flags, MemVT.getStoreSize(),
-      DAG.getEVTAlign(MemVT), AAMDNodes(), nullptr, SSID, Ordering);
+      MachinePointerInfo(I.getPointerOperand()), Flags,
+      LocationSize::precise(MemVT.getStoreSize()), DAG.getEVTAlign(MemVT),
+      AAMDNodes(), nullptr, SSID, Ordering);
 
   SDValue L =
     DAG.getAtomic(NT, dl, MemVT, InChain,
@@ -5103,8 +5102,9 @@ void SelectionDAGBuilder::visitAtomicLoad(const LoadInst &I) {
   auto Flags = TLI.getLoadMemOperandFlags(I, DAG.getDataLayout(), AC, LibInfo);
 
   MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
-      MachinePointerInfo(I.getPointerOperand()), Flags, MemVT.getStoreSize(),
-      I.getAlign(), AAMDNodes(), nullptr, SSID, Order);
+      MachinePointerInfo(I.getPointerOperand()), Flags,
+      LocationSize::precise(MemVT.getStoreSize()), I.getAlign(), AAMDNodes(),
+      nullptr, SSID, Order);
 
   InChain = TLI.prepareVolatileOrAtomicLoad(InChain, dl, DAG);
 
@@ -5140,8 +5140,9 @@ void SelectionDAGBuilder::visitAtomicStore(const StoreInst &I) {
 
   MachineFunction &MF = DAG.getMachineFunction();
   MachineMemOperand *MMO = MF.getMachineMemOperand(
-      MachinePointerInfo(I.getPointerOperand()), Flags, MemVT.getStoreSize(),
-      I.getAlign(), AAMDNodes(), nullptr, SSID, Ordering);
+      MachinePointerInfo(I.getPointerOperand()), Flags,
+      LocationSize::precise(MemVT.getStoreSize()), I.getAlign(), AAMDNodes(),
+      nullptr, SSID, Ordering);
 
   SDValue Val = getValue(I.getValueOperand());
   if (Val.getValueType() != MemVT)
@@ -6904,7 +6905,7 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
       auto MPI =
           MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
       MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
-          MPI, MachineMemOperand::MOStore, MemoryLocation::UnknownSize,
+          MPI, MachineMemOperand::MOStore, LocationSize::beforeOrAfterPointer(),
           TempAlign);
       Chain = DAG.getGetFPEnv(Chain, sdl, Temp, EnvVT, MMO);
       Res = DAG.getLoad(EnvVT, sdl, Chain, Temp, MPI);
@@ -6933,7 +6934,7 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
       Chain = DAG.getStore(Chain, sdl, Env, Temp, MPI, TempAlign,
                            MachineMemOperand::MOStore);
       MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
-          MPI, MachineMemOperand::MOLoad, MemoryLocation::UnknownSize,
+          MPI, MachineMemOperand::MOLoad, LocationSize::beforeOrAfterPointer(),
           TempAlign);
       Chain = DAG.getSetFPEnv(Chain, sdl, Temp, EnvVT, MMO);
     }
@@ -8087,7 +8088,7 @@ void SelectionDAGBuilder::visitVPLoad(
   SDValue InChain = AddToChain ? DAG.getRoot() : DAG.getEntryNode();
   MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
       MachinePointerInfo(PtrOperand), MachineMemOperand::MOLoad,
-      MemoryLocation::UnknownSize, *Alignment, AAInfo, Ranges);
+      LocationSize::beforeOrAfterPointer(), *Alignment, AAInfo, Ranges);
   LD = DAG.getLoadVP(VT, DL, InChain, OpValues[0], OpValues[1], OpValues[2],
                      MMO, false /*IsExpanding */);
   if (AddToChain)
@@ -8110,8 +8111,8 @@ void SelectionDAGBuilder::visitVPGather(
   unsigned AS =
     PtrOperand->getType()->getScalarType()->getPointerAddressSpace();
   MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
-     MachinePointerInfo(AS), MachineMemOperand::MOLoad,
-     MemoryLocation::UnknownSize, *Alignment, AAInfo, Ranges);
+      MachinePointerInfo(AS), MachineMemOperand::MOLoad,
+      LocationSize::beforeOrAfterPointer(), *Alignment, AAInfo, Ranges);
   SDValue Base, Index, Scale;
   ISD::MemIndexType IndexType;
   bool UniformBase = getUniformBase(PtrOperand, Base, Index, IndexType, Scale,
@@ -8151,7 +8152,7 @@ void SelectionDAGBuilder::visitVPStore(
   SDValue Offset = DAG.getUNDEF(Ptr.getValueType());
   MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
       MachinePointerInfo(PtrOperand), MachineMemOperand::MOStore,
-      MemoryLocation::UnknownSize, *Alignment, AAInfo);
+      LocationSize::beforeOrAfterPointer(), *Alignment, AAInfo);
   ST = DAG.getStoreVP(getMemoryRoot(), DL, OpValues[0], Ptr, Offset,
                       OpValues[2], OpValues[3], VT, MMO, ISD::UNINDEXED,
                       /* IsTruncating */ false, /*IsCompressing*/ false);
@@ -8174,7 +8175,7 @@ void SelectionDAGBuilder::visitVPScatter(
       PtrOperand->getType()->getScalarType()->getPointerAddressSpace();
   MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
       MachinePointerInfo(AS), MachineMemOperand::MOStore,
-      MemoryLocation::UnknownSize, *Alignment, AAInfo);
+      LocationSize::beforeOrAfterPointer(), *Alignment, AAInfo);
   SDValue Base, Index, Scale;
   ISD::MemIndexType IndexType;
   bool UniformBase = getUniformBase(PtrOperand, Base, Index, IndexType, Scale,
@@ -8217,7 +8218,7 @@ void SelectionDAGBuilder::visitVPStridedLoad(
   unsigned AS = PtrOperand->getType()->getPointerAddressSpace();
   MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
       MachinePointerInfo(AS), MachineMemOperand::MOLoad,
-      MemoryLocation::UnknownSize, *Alignment, AAInfo, Ranges);
+      LocationSize::beforeOrAfterPointer(), *Alignment, AAInfo, Ranges);
 
   SDValue LD = DAG.getStridedLoadVP(VT, DL, InChain, OpValues[0], OpValues[1],
                                     OpValues[2], OpValues[3], MMO,
@@ -8240,7 +8241,7 @@ void SelectionDAGBuilder::visitVPStridedStore(
   unsigned AS = PtrOperand->getType()->getPointerAddressSpace();
   MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
       MachinePointerInfo(AS), MachineMemOperand::MOStore,
-      MemoryLocation::UnknownSize, *Alignment, AAInfo);
+      LocationSize::beforeOrAfterPointer(), *Alignment, AAInfo);
 
   SDValue ST = DAG.getStridedStoreVP(
       getMemoryRoot(), DL, OpValues[0], OpValues[1],
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
index fa71adc8da3f38..20375a0f92b238 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@@ -841,6 +841,18 @@ void SDNode::print_details(raw_ostream &OS, const SelectionDAG *G) const {
   } else if (const MemSDNode *M = dyn_cast<MemSDNode>(this)) {
     OS << "<";
     printMemOperand(OS, *M->getMemOperand(), G);
+    if (auto *A = dyn_cast<AtomicSDNode>(M))
+      if (A->getOpcode() == ISD::ATOMIC_LOAD) {
+        bool doExt = true;
+        switch (A->getExtensionType()) {
+        default: doExt = false; break;
+        case ISD::EXTLOAD:  OS << ", anyext"; break;
+        case ISD::SEXTLOAD: OS << ", sext"; break;
+        case ISD::ZEXTLOAD: OS << ", zext"; break;
+        }
+        if (doExt)
+          OS << " from " << A->getMemoryVT();
+      }
     OS << ">";
   } else if (const BlockAddressSDNode *BA =
                dyn_cast<BlockAddressSDNode>(this)) {
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
index 30f67d37e2e25e..d629c36bc792e3 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
@@ -1461,12 +1461,12 @@ static void processDbgDeclares(FunctionLoweringInfo &FuncInfo) {
     if (DI && processDbgDeclare(FuncInfo, DI->getAddress(), DI->getExpression(),
                                 DI->getVariable(), DI->getDebugLoc()))
       FuncInfo.PreprocessedDbgDeclares.insert(DI);
-    for (const DPValue &DPV : filterDbgVars(I.getDbgRecordRange())) {
-      if (DPV.Type == DPValue::LocationType::Declare &&
-          processDbgDeclare(FuncInfo, DPV.getVariableLocationOp(0),
-                            DPV.getExpression(), DPV.getVariable(),
-                            DPV.getDebugLoc()))
-        FuncInfo.PreprocessedDPVDeclares.insert(&DPV);
+    for (const DbgVariableRecord &DVR : filterDbgVars(I.getDbgRecordRange())) {
+      if (DVR.Type == DbgVariableRecord::LocationType::Declare &&
+          processDbgDeclare(FuncInfo, DVR.getVariableLocationOp(0),
+                            DVR.getExpression(), DVR.getVariable(),
+                            DVR.getDebugLoc()))
+        FuncInfo.PreprocessedDVRDeclares.insert(&DVR);
     }
   }
 }
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index b3dc9de7137311..16069c6c0dc317 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -611,6 +611,25 @@ bool TargetLowering::ShrinkDemandedOp(SDValue Op, unsigned BitWidth,
   return false;
 }
 
+static SDValue simplifyUseOfIntToFP(SDValue Op, const APInt &DemandedBits,
+                                    SelectionDAG &DAG) {
+  unsigned Opc = Op.getOpcode();
+  assert((Opc == ISD::SINT_TO_FP || Opc == ISD::UINT_TO_FP) &&
+         "Invalid Int -> FP Opcode");
+  if (!DemandedBits.isSignMask())
+    return SDValue();
+
+  EVT VT = Op.getValueType();
+  if (Opc == ISD::UINT_TO_FP)
+    return DAG.getConstant(0, SDLoc(Op), VT);
+
+  EVT InnerVT = Op.getOperand(0).getValueType();
+  if (VT.getScalarSizeInBits() == InnerVT.getScalarSizeInBits())
+    return DAG.getBitcast(VT, Op.getOperand(0));
+
+  return SDValue();
+}
+
 bool TargetLowering::SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits,
                                           DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
@@ -816,6 +835,11 @@ SDValue TargetLowering::SimplifyMultipleUseDemandedBits(
     }
     break;
   }
+  case ISD::UINT_TO_FP:
+  case ISD::SINT_TO_FP:
+    if (SDValue R = simplifyUseOfIntToFP(Op, DemandedBits, DAG))
+      return R;
+    break;
   case ISD::SIGN_EXTEND_INREG: {
     // If none of the extended bits are demanded, eliminate the sextinreg.
     SDValue Op0 = Op.getOperand(0);
@@ -2313,6 +2337,12 @@ bool TargetLowering::SimplifyDemandedBits(
     Known = TLO.DAG.computeKnownBits(Op, DemandedElts, Depth);
     break;
   }
+  case ISD::UINT_TO_FP:
+  case ISD::SINT_TO_FP:
+    if (SDValue R = simplifyUseOfIntToFP(Op, DemandedBits, TLO.DAG))
+      return TLO.CombineTo(Op, R);
+    Known = TLO.DAG.computeKnownBits(Op, DemandedElts, Depth);
+    break;
   case ISD::SIGN_EXTEND_INREG: {
     SDValue Op0 = Op.getOperand(0);
     EVT ExVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
@@ -10694,7 +10724,7 @@ SDValue TargetLowering::expandVecReduce(SDNode *Node, SelectionDAG &DAG) const {
 
       SDValue Lo, Hi;
       std::tie(Lo, Hi) = DAG.SplitVector(Op, dl);
-      Op = DAG.getNode(BaseOpcode, dl, HalfVT, Lo, Hi);
+      Op = DAG.getNode(BaseOpcode, dl, HalfVT, Lo, Hi, Node->getFlags());
       VT = HalfVT;
     }
   }
diff --git a/llvm/lib/CodeGen/SjLjEHPrepare.cpp b/llvm/lib/CodeGen/SjLjEHPrepare.cpp
index 4bad57d279e9e4..20c827ce08d845 100644
--- a/llvm/lib/CodeGen/SjLjEHPrepare.cpp
+++ b/llvm/lib/CodeGen/SjLjEHPrepare.cpp
@@ -202,7 +202,7 @@ SjLjEHPrepareImpl::setupFunctionContext(Function &F,
   auto &DL = F.getParent()->getDataLayout();
   const Align Alignment = DL.getPrefTypeAlign(FunctionContextTy);
   FuncCtx = new AllocaInst(FunctionContextTy, DL.getAllocaAddrSpace(), nullptr,
-                           Alignment, "fn_context", &EntryBB->front());
+                           Alignment, "fn_context", EntryBB->begin());
 
   // Fill in the function context structure.
   for (LandingPadInst *LPI : LPads) {
@@ -271,7 +271,7 @@ void SjLjEHPrepareImpl::lowerIncomingArguments(Function &F) {
     Value *TrueValue = ConstantInt::getTrue(F.getContext());
     Value *UndefValue = UndefValue::get(Ty);
     Instruction *SI = SelectInst::Create(
-        TrueValue, &AI, UndefValue, AI.getName() + ".tmp", &*AfterAllocaInsPt);
+        TrueValue, &AI, UndefValue, AI.getName() + ".tmp", AfterAllocaInsPt);
     AI.replaceAllUsesWith(SI);
 
     // Reset the operand, because it  was clobbered by the RAUW above.
@@ -386,7 +386,7 @@ bool SjLjEHPrepareImpl::setupEntryBlockAndCallSites(Function &F) {
       if (Function *Callee = II->getCalledFunction())
         if (Callee->getIntrinsicID() == Intrinsic::donothing) {
           // Remove the NOP invoke.
-          BranchInst::Create(II->getNormalDest(), II);
+          BranchInst::Create(II->getNormalDest(), II->getIterator());
           II->eraseFromParent();
           continue;
         }
@@ -445,7 +445,7 @@ bool SjLjEHPrepareImpl::setupEntryBlockAndCallSites(Function &F) {
 
     // Record the call site value for the back end so it stays associated with
     // the invoke.
-    CallInst::Create(CallSiteFn, CallSiteNum, "", Invokes[I]);
+    CallInst::Create(CallSiteFn, CallSiteNum, "", Invokes[I]->getIterator());
   }
 
   // Mark call instructions that aren't nounwind as no-action (call_site ==
@@ -462,8 +462,8 @@ bool SjLjEHPrepareImpl::setupEntryBlockAndCallSites(Function &F) {
   }
 
   // Register the function context and make sure it's known to not throw
-  CallInst *Register =
-      CallInst::Create(RegisterFn, FuncCtx, "", EntryBB->getTerminator());
+  CallInst *Register = CallInst::Create(
+      RegisterFn, FuncCtx, "", EntryBB->getTerminator()->getIterator());
   Register->setDoesNotThrow();
 
   // Following any allocas not in the entry block, update the saved SP in the
@@ -480,7 +480,8 @@ bool SjLjEHPrepareImpl::setupEntryBlockAndCallSites(Function &F) {
       }
       Instruction *StackAddr = CallInst::Create(StackAddrFn, "sp");
       StackAddr->insertAfter(&I);
-      new StoreInst(StackAddr, StackPtr, true, StackAddr->getNextNode());
+      new StoreInst(StackAddr, StackPtr, true,
+                    std::next(StackAddr->getIterator()));
     }
   }
 
@@ -490,7 +491,7 @@ bool SjLjEHPrepareImpl::setupEntryBlockAndCallSites(Function &F) {
     Instruction *InsertPoint = Return;
     if (CallInst *CI = Return->getParent()->getTerminatingMustTailCall())
       InsertPoint = CI;
-    CallInst::Create(UnregisterFn, FuncCtx, "", InsertPoint);
+    CallInst::Create(UnregisterFn, FuncCtx, "", InsertPoint->getIterator());
   }
 
   return true;
diff --git a/llvm/lib/CodeGen/TargetInstrInfo.cpp b/llvm/lib/CodeGen/TargetInstrInfo.cpp
index 5b02c1bc39c0a7..9fbd516acea8e1 100644
--- a/llvm/lib/CodeGen/TargetInstrInfo.cpp
+++ b/llvm/lib/CodeGen/TargetInstrInfo.cpp
@@ -1554,7 +1554,8 @@ TargetInstrInfo::describeLoadedValue(const MachineInstr &MI,
     SmallVector<uint64_t, 8> Ops;
     DIExpression::appendOffset(Ops, Offset);
     Ops.push_back(dwarf::DW_OP_deref_size);
-    Ops.push_back(MMO->getSize());
+    Ops.push_back(MMO->getSize().hasValue() ? MMO->getSize().getValue()
+                                            : ~UINT64_C(0));
     Expr = DIExpression::prependOpcodes(Expr, Ops);
     return ParamLoadedValue(*BaseOp, Expr);
   }
diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
index 8ac55ee6a5d0c1..9990556f89ed8b 100644
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -2011,6 +2011,10 @@ bool TargetLoweringBase::isLegalAddressingMode(const DataLayout &DL,
   // The default implementation of this implements a conservative RISCy, r+r and
   // r+i addr mode.
 
+  // Scalable offsets not supported
+  if (AM.ScalableOffset)
+    return false;
+
   // Allows a sign-extended 16-bit immediate field.
   if (AM.BaseOffs <= -(1LL << 16) || AM.BaseOffs >= (1LL << 16)-1)
     return false;
diff --git a/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp b/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
index 6943ce261d9d9c..15b59421a0f44a 100644
--- a/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
@@ -2680,21 +2680,34 @@ MCSection *TargetLoweringObjectFileXCOFF::getSectionForFunctionDescriptor(
 
 MCSection *TargetLoweringObjectFileXCOFF::getSectionForTOCEntry(
     const MCSymbol *Sym, const TargetMachine &TM) const {
-  // Use TE storage-mapping class when large code model is enabled so that
-  // the chance of needing -bbigtoc is decreased. Also, the toc-entry for
-  // EH info is never referenced directly using instructions so it can be
-  // allocated with TE storage-mapping class.
-  // The "_$TLSML" symbol for TLS local-dynamic mode requires XMC_TC, otherwise
-  // the AIX assembler will complain.
+  const XCOFF::StorageMappingClass SMC = [](const MCSymbol *Sym,
+                                            const TargetMachine &TM) {
+    const MCSymbolXCOFF *XSym = cast<MCSymbolXCOFF>(Sym);
+
+    // The "_$TLSML" symbol for TLS local-dynamic mode requires XMC_TC,
+    // otherwise the AIX assembler will complain.
+    if (XSym->getSymbolTableName() == "_$TLSML")
+      return XCOFF::XMC_TC;
+
+    // Use large code model toc entries for ehinfo symbols as they are
+    // never referenced directly. The runtime loads their TOC entry
+    // addresses from the trace-back table.
+    if (XSym->isEHInfo())
+      return XCOFF::XMC_TE;
+
+    // If the symbol does not have a code model specified use the module value.
+    if (!XSym->hasPerSymbolCodeModel())
+      return TM.getCodeModel() == CodeModel::Large ? XCOFF::XMC_TE
+                                                   : XCOFF::XMC_TC;
+
+    return XSym->getPerSymbolCodeModel() == MCSymbolXCOFF::CM_Large
+               ? XCOFF::XMC_TE
+               : XCOFF::XMC_TC;
+  }(Sym, TM);
+
   return getContext().getXCOFFSection(
       cast<MCSymbolXCOFF>(Sym)->getSymbolTableName(), SectionKind::getData(),
-      XCOFF::CsectProperties(
-          ((TM.getCodeModel() == CodeModel::Large &&
-            cast<MCSymbolXCOFF>(Sym)->getSymbolTableName() != "_$TLSML") ||
-           cast<MCSymbolXCOFF>(Sym)->isEHInfo())
-              ? XCOFF::XMC_TE
-              : XCOFF::XMC_TC,
-          XCOFF::XTY_SD));
+      XCOFF::CsectProperties(SMC, XCOFF::XTY_SD));
 }
 
 MCSection *TargetLoweringObjectFileXCOFF::getSectionForLSDA(
diff --git a/llvm/lib/CodeGen/WinEHPrepare.cpp b/llvm/lib/CodeGen/WinEHPrepare.cpp
index 95976c218c2f14..93a18f5bc9e8d4 100644
--- a/llvm/lib/CodeGen/WinEHPrepare.cpp
+++ b/llvm/lib/CodeGen/WinEHPrepare.cpp
@@ -1234,10 +1234,10 @@ AllocaInst *WinEHPrepareImpl::insertPHILoads(PHINode *PN, Function &F) {
     // that will dominate all uses.
     SpillSlot = new AllocaInst(PN->getType(), DL->getAllocaAddrSpace(), nullptr,
                                Twine(PN->getName(), ".wineh.spillslot"),
-                               &F.getEntryBlock().front());
+                               F.getEntryBlock().begin());
     Value *V = new LoadInst(PN->getType(), SpillSlot,
                             Twine(PN->getName(), ".wineh.reload"),
-                            &*PHIBlock->getFirstInsertionPt());
+                            PHIBlock->getFirstInsertionPt());
     PN->replaceAllUsesWith(V);
     return SpillSlot;
   }
@@ -1309,7 +1309,7 @@ void WinEHPrepareImpl::insertPHIStore(
   }
 
   // Otherwise, insert the store at the end of the basic block.
-  new StoreInst(PredVal, SpillSlot, PredBlock->getTerminator());
+  new StoreInst(PredVal, SpillSlot, PredBlock->getTerminator()->getIterator());
 }
 
 void WinEHPrepareImpl::replaceUseWithLoad(
@@ -1319,7 +1319,7 @@ void WinEHPrepareImpl::replaceUseWithLoad(
   if (!SpillSlot)
     SpillSlot = new AllocaInst(V->getType(), DL->getAllocaAddrSpace(), nullptr,
                                Twine(V->getName(), ".wineh.spillslot"),
-                               &F.getEntryBlock().front());
+                               F.getEntryBlock().begin());
 
   auto *UsingInst = cast<Instruction>(U.getUser());
   if (auto *UsingPHI = dyn_cast<PHINode>(UsingInst)) {
@@ -1376,16 +1376,16 @@ void WinEHPrepareImpl::replaceUseWithLoad(
     Value *&Load = Loads[IncomingBlock];
     // Insert the load into the predecessor block
     if (!Load)
-      Load = new LoadInst(V->getType(), SpillSlot,
-                          Twine(V->getName(), ".wineh.reload"),
-                          /*isVolatile=*/false, IncomingBlock->getTerminator());
+      Load = new LoadInst(
+          V->getType(), SpillSlot, Twine(V->getName(), ".wineh.reload"),
+          /*isVolatile=*/false, IncomingBlock->getTerminator()->getIterator());
 
     U.set(Load);
   } else {
     // Reload right before the old use.
     auto *Load = new LoadInst(V->getType(), SpillSlot,
                               Twine(V->getName(), ".wineh.reload"),
-                              /*isVolatile=*/false, UsingInst);
+                              /*isVolatile=*/false, UsingInst->getIterator());
     U.set(Load);
   }
 }
diff --git a/llvm/lib/DWARFLinker/Classic/DWARFLinker.cpp b/llvm/lib/DWARFLinker/Classic/DWARFLinker.cpp
index 9b581a6c9ab774..60f664ece7eef9 100644
--- a/llvm/lib/DWARFLinker/Classic/DWARFLinker.cpp
+++ b/llvm/lib/DWARFLinker/Classic/DWARFLinker.cpp
@@ -2701,8 +2701,8 @@ Error DWARFLinker::link() {
   // This Dwarf string pool which is used for emission. It must be used
   // serially as the order of calling getStringOffset matters for
   // reproducibility.
-  OffsetsStringPool DebugStrPool(StringsTranslator, true);
-  OffsetsStringPool DebugLineStrPool(StringsTranslator, false);
+  OffsetsStringPool DebugStrPool(true);
+  OffsetsStringPool DebugLineStrPool(false);
   DebugDieValuePool StringOffsetPool;
 
   // ODR Contexts for the optimize.
diff --git a/llvm/lib/DWARFLinker/Classic/DWARFStreamer.cpp b/llvm/lib/DWARFLinker/Classic/DWARFStreamer.cpp
index 6d522370e440d1..8b31b5ead29ad0 100644
--- a/llvm/lib/DWARFLinker/Classic/DWARFStreamer.cpp
+++ b/llvm/lib/DWARFLinker/Classic/DWARFStreamer.cpp
@@ -32,10 +32,9 @@ using namespace dwarf_linker::classic;
 
 Expected<std::unique_ptr<DwarfStreamer>> DwarfStreamer::createStreamer(
     const Triple &TheTriple, DWARFLinkerBase::OutputFileType FileType,
-    raw_pwrite_stream &OutFile, DWARFLinkerBase::TranslatorFuncTy Translator,
-    DWARFLinkerBase::MessageHandlerTy Warning) {
+    raw_pwrite_stream &OutFile, DWARFLinkerBase::MessageHandlerTy Warning) {
   std::unique_ptr<DwarfStreamer> Streamer =
-      std::make_unique<DwarfStreamer>(FileType, OutFile, Translator, Warning);
+      std::make_unique<DwarfStreamer>(FileType, OutFile, Warning);
   if (Error Err = Streamer->init(TheTriple, "__DWARF"))
     return std::move(Err);
 
@@ -977,11 +976,10 @@ void DwarfStreamer::emitLineTableString(const DWARFDebugLine::Prologue &P,
 
   switch (String.getForm()) {
   case dwarf::DW_FORM_string: {
-    StringRef TranslatedString =
-        (Translator) ? Translator(*StringVal) : *StringVal;
-    Asm->OutStreamer->emitBytes(TranslatedString.data());
+    StringRef Str = *StringVal;
+    Asm->OutStreamer->emitBytes(Str.data());
     Asm->emitInt8(0);
-    LineSectionSize += TranslatedString.size() + 1;
+    LineSectionSize += Str.size() + 1;
   } break;
   case dwarf::DW_FORM_strp:
   case dwarf::DW_FORM_line_strp: {
diff --git a/llvm/lib/DWARFLinker/Parallel/DWARFLinker.cpp b/llvm/lib/DWARFLinker/Parallel/DWARFLinker.cpp
index ad8d28a643174a..31c7e867767a0f 100644
--- a/llvm/lib/DWARFLinker/Parallel/DWARFLinker.cpp
+++ b/llvm/lib/DWARFLinker/Parallel/DWARFLinker.cpp
@@ -15,8 +15,6 @@ using namespace dwarf_linker::parallel;
 
 std::unique_ptr<DWARFLinker>
 DWARFLinker::createLinker(MessageHandlerTy ErrorHandler,
-                          MessageHandlerTy WarningHandler,
-                          TranslatorFuncTy StringsTranslator) {
-  return std::make_unique<DWARFLinkerImpl>(ErrorHandler, WarningHandler,
-                                           StringsTranslator);
+                          MessageHandlerTy WarningHandler) {
+  return std::make_unique<DWARFLinkerImpl>(ErrorHandler, WarningHandler);
 }
diff --git a/llvm/lib/DWARFLinker/Parallel/DWARFLinkerGlobalData.h b/llvm/lib/DWARFLinker/Parallel/DWARFLinkerGlobalData.h
index 38c261a8106fcd..7ca81eb34f005e 100644
--- a/llvm/lib/DWARFLinker/Parallel/DWARFLinkerGlobalData.h
+++ b/llvm/lib/DWARFLinker/Parallel/DWARFLinkerGlobalData.h
@@ -21,7 +21,6 @@ class DWARFDie;
 namespace dwarf_linker {
 namespace parallel {
 
-using TranslatorFuncTy = std::function<StringRef(StringRef)>;
 using MessageHandlerTy = std::function<void(
     const Twine &Warning, StringRef Context, const DWARFDie *DIE)>;
 
@@ -95,19 +94,6 @@ class LinkingGlobalData {
   /// Returns global string pool.
   StringPool &getStringPool() { return Strings; }
 
-  /// Set translation function.
-  void setTranslator(TranslatorFuncTy Translator) {
-    this->Translator = Translator;
-  }
-
-  /// Translate specified string.
-  StringRef translateString(StringRef String) {
-    if (Translator)
-      return Translator(String);
-
-    return String;
-  }
-
   /// Returns linking options.
   const DWARFLinkerOptions &getOptions() const { return Options; }
 
@@ -161,7 +147,6 @@ class LinkingGlobalData {
 protected:
   llvm::parallel::PerThreadBumpPtrAllocator Allocator;
   StringPool Strings;
-  TranslatorFuncTy Translator;
   DWARFLinkerOptions Options;
   MessageHandlerTy WarningHandler;
   MessageHandlerTy ErrorHandler;
diff --git a/llvm/lib/DWARFLinker/Parallel/DWARFLinkerImpl.cpp b/llvm/lib/DWARFLinker/Parallel/DWARFLinkerImpl.cpp
index 49b08997eb9c1c..e68bf0c227a0a0 100644
--- a/llvm/lib/DWARFLinker/Parallel/DWARFLinkerImpl.cpp
+++ b/llvm/lib/DWARFLinker/Parallel/DWARFLinkerImpl.cpp
@@ -20,11 +20,9 @@ using namespace dwarf_linker;
 using namespace dwarf_linker::parallel;
 
 DWARFLinkerImpl::DWARFLinkerImpl(MessageHandlerTy ErrorHandler,
-                                 MessageHandlerTy WarningHandler,
-                                 TranslatorFuncTy StringsTranslator)
+                                 MessageHandlerTy WarningHandler)
     : UniqueUnitID(0), DebugStrStrings(GlobalData),
       DebugLineStrStrings(GlobalData), CommonSections(GlobalData) {
-  GlobalData.setTranslator(StringsTranslator);
   GlobalData.setErrorHandler(ErrorHandler);
   GlobalData.setWarningHandler(WarningHandler);
 }
diff --git a/llvm/lib/DWARFLinker/Parallel/DWARFLinkerImpl.h b/llvm/lib/DWARFLinker/Parallel/DWARFLinkerImpl.h
index 7c17c5b79c7c18..597bb1b4da59a8 100644
--- a/llvm/lib/DWARFLinker/Parallel/DWARFLinkerImpl.h
+++ b/llvm/lib/DWARFLinker/Parallel/DWARFLinkerImpl.h
@@ -26,8 +26,7 @@ namespace parallel {
 class DWARFLinkerImpl : public DWARFLinker {
 public:
   DWARFLinkerImpl(MessageHandlerTy ErrorHandler,
-                  MessageHandlerTy WarningHandler,
-                  TranslatorFuncTy StringsTranslator);
+                  MessageHandlerTy WarningHandler);
 
   /// Add object file to be linked. Pre-load compile unit die. Call
   /// \p OnCUDieLoaded for each compile unit die. If specified \p File
diff --git a/llvm/lib/DWARFLinker/Parallel/OutputSections.h b/llvm/lib/DWARFLinker/Parallel/OutputSections.h
index a21e4b2b75a503..0e1f2dae54bcc2 100644
--- a/llvm/lib/DWARFLinker/Parallel/OutputSections.h
+++ b/llvm/lib/DWARFLinker/Parallel/OutputSections.h
@@ -253,7 +253,7 @@ struct SectionDescriptor : SectionDescriptorBase {
 
   /// Emit specified inplace string value into the current section contents.
   void emitInplaceString(StringRef String) {
-    OS << GlobalData.translateString(String);
+    OS << String;
     emitIntVal(0, 1);
   }
 
diff --git a/llvm/lib/DWARFLinker/Parallel/StringEntryToDwarfStringPoolEntryMap.h b/llvm/lib/DWARFLinker/Parallel/StringEntryToDwarfStringPoolEntryMap.h
index 858f224777dba7..f67536ef7a1a84 100644
--- a/llvm/lib/DWARFLinker/Parallel/StringEntryToDwarfStringPoolEntryMap.h
+++ b/llvm/lib/DWARFLinker/Parallel/StringEntryToDwarfStringPoolEntryMap.h
@@ -33,7 +33,7 @@ class StringEntryToDwarfStringPoolEntryMap {
       DwarfStringPoolEntryWithExtString *DataPtr =
           GlobalData.getAllocator()
               .Allocate<DwarfStringPoolEntryWithExtString>();
-      DataPtr->String = GlobalData.translateString(String->getKey());
+      DataPtr->String = String->getKey();
       DataPtr->Index = DwarfStringPoolEntry::NotIndexed;
       DataPtr->Offset = 0;
       DataPtr->Symbol = nullptr;
diff --git a/llvm/lib/DebugInfo/LogicalView/CMakeLists.txt b/llvm/lib/DebugInfo/LogicalView/CMakeLists.txt
index 38a174661b4f3f..f5adab723927a9 100644
--- a/llvm/lib/DebugInfo/LogicalView/CMakeLists.txt
+++ b/llvm/lib/DebugInfo/LogicalView/CMakeLists.txt
@@ -24,7 +24,7 @@ add_lv_impl_folder(Readers
   Readers/LVBinaryReader.cpp
   Readers/LVCodeViewReader.cpp
   Readers/LVCodeViewVisitor.cpp
-  Readers/LVELFReader.cpp
+  Readers/LVDWARFReader.cpp
   )
 
 list(APPEND LIBLV_ADDITIONAL_HEADER_DIRS
diff --git a/llvm/lib/DebugInfo/LogicalView/LVReaderHandler.cpp b/llvm/lib/DebugInfo/LogicalView/LVReaderHandler.cpp
index 16c4fbed1423fb..c8410154059535 100644
--- a/llvm/lib/DebugInfo/LogicalView/LVReaderHandler.cpp
+++ b/llvm/lib/DebugInfo/LogicalView/LVReaderHandler.cpp
@@ -14,7 +14,7 @@
 #include "llvm/DebugInfo/CodeView/LazyRandomTypeCollection.h"
 #include "llvm/DebugInfo/LogicalView/Core/LVCompare.h"
 #include "llvm/DebugInfo/LogicalView/Readers/LVCodeViewReader.h"
-#include "llvm/DebugInfo/LogicalView/Readers/LVELFReader.h"
+#include "llvm/DebugInfo/LogicalView/Readers/LVDWARFReader.h"
 #include "llvm/DebugInfo/PDB/Native/NativeSession.h"
 #include "llvm/DebugInfo/PDB/PDB.h"
 #include "llvm/Object/COFF.h"
@@ -49,7 +49,8 @@ Error LVReaderHandler::createReader(StringRef Filename, LVReaders &Readers,
                                                   *COFF, W, ExePath);
       }
       if (Obj.isELF() || Obj.isMachO() || Obj.isWasm())
-        return std::make_unique<LVELFReader>(Filename, FileFormatName, Obj, W);
+        return std::make_unique<LVDWARFReader>(Filename, FileFormatName, Obj,
+                                               W);
     }
     if (isa<PDBFile *>(Input)) {
       PDBFile &Pdb = *cast<PDBFile *>(Input);
diff --git a/llvm/lib/DebugInfo/LogicalView/Readers/LVELFReader.cpp b/llvm/lib/DebugInfo/LogicalView/Readers/LVDWARFReader.cpp
similarity index 95%
rename from llvm/lib/DebugInfo/LogicalView/Readers/LVELFReader.cpp
rename to llvm/lib/DebugInfo/LogicalView/Readers/LVDWARFReader.cpp
index 6bdcbc0ab641b0..91e5a037054da0 100644
--- a/llvm/lib/DebugInfo/LogicalView/Readers/LVELFReader.cpp
+++ b/llvm/lib/DebugInfo/LogicalView/Readers/LVDWARFReader.cpp
@@ -1,4 +1,4 @@
-//===-- LVELFReader.cpp ---------------------------------------------------===//
+//===-- LVDWARFReader.cpp -------------------------------------------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,12 +6,12 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This implements the LVELFReader class.
+// This implements the LVDWARFReader class.
 // It supports ELF, Mach-O and Wasm binary formats.
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/DebugInfo/LogicalView/Readers/LVELFReader.h"
+#include "llvm/DebugInfo/LogicalView/Readers/LVDWARFReader.h"
 #include "llvm/DebugInfo/DIContext.h"
 #include "llvm/DebugInfo/DWARF/DWARFDebugLoc.h"
 #include "llvm/DebugInfo/DWARF/DWARFExpression.h"
@@ -27,9 +27,9 @@ using namespace llvm;
 using namespace llvm::object;
 using namespace llvm::logicalview;
 
-#define DEBUG_TYPE "ElfReader"
+#define DEBUG_TYPE "DWARFReader"
 
-LVElement *LVELFReader::createElement(dwarf::Tag Tag) {
+LVElement *LVDWARFReader::createElement(dwarf::Tag Tag) {
   CurrentScope = nullptr;
   CurrentSymbol = nullptr;
   CurrentType = nullptr;
@@ -243,8 +243,9 @@ LVElement *LVELFReader::createElement(dwarf::Tag Tag) {
   return nullptr;
 }
 
-void LVELFReader::processOneAttribute(const DWARFDie &Die, LVOffset *OffsetPtr,
-                                      const AttributeSpec &AttrSpec) {
+void LVDWARFReader::processOneAttribute(const DWARFDie &Die,
+                                        LVOffset *OffsetPtr,
+                                        const AttributeSpec &AttrSpec) {
   uint64_t OffsetOnEntry = *OffsetPtr;
   DWARFUnit *U = Die.getDwarfUnit();
   const DWARFFormValue &FormValue =
@@ -515,8 +516,8 @@ void LVELFReader::processOneAttribute(const DWARFDie &Die, LVOffset *OffsetPtr,
   }
 }
 
-LVScope *LVELFReader::processOneDie(const DWARFDie &InputDIE, LVScope *Parent,
-                                    DWARFDie &SkeletonDie) {
+LVScope *LVDWARFReader::processOneDie(const DWARFDie &InputDIE, LVScope *Parent,
+                                      DWARFDie &SkeletonDie) {
   // If the input DIE corresponds to the compile unit, it can be:
   // a) Simple DWARF: a standard DIE. Ignore the skeleton DIE (is empty).
   // b) Split DWARF: the DIE for the split DWARF. The skeleton is the DIE
@@ -688,8 +689,8 @@ LVScope *LVELFReader::processOneDie(const DWARFDie &InputDIE, LVScope *Parent,
   return CurrentScope;
 }
 
-void LVELFReader::traverseDieAndChildren(DWARFDie &DIE, LVScope *Parent,
-                                         DWARFDie &SkeletonDie) {
+void LVDWARFReader::traverseDieAndChildren(DWARFDie &DIE, LVScope *Parent,
+                                           DWARFDie &SkeletonDie) {
   // Process the current DIE.
   LVScope *Scope = processOneDie(DIE, Parent, SkeletonDie);
   if (Scope) {
@@ -709,13 +710,13 @@ void LVELFReader::traverseDieAndChildren(DWARFDie &DIE, LVScope *Parent,
   }
 }
 
-void LVELFReader::processLocationGaps() {
+void LVDWARFReader::processLocationGaps() {
   if (options().getAttributeAnyLocation())
     for (LVSymbol *Symbol : SymbolsWithLocations)
       Symbol->fillLocationGaps();
 }
 
-void LVELFReader::createLineAndFileRecords(
+void LVDWARFReader::createLineAndFileRecords(
     const DWARFDebugLine::LineTable *Lines) {
   if (!Lines)
     return;
@@ -772,8 +773,8 @@ void LVELFReader::createLineAndFileRecords(
     }
 }
 
-std::string LVELFReader::getRegisterName(LVSmall Opcode,
-                                         ArrayRef<uint64_t> Operands) {
+std::string LVDWARFReader::getRegisterName(LVSmall Opcode,
+                                           ArrayRef<uint64_t> Operands) {
   // The 'prettyPrintRegisterOp' function uses the DWARFUnit to support
   // DW_OP_regval_type. At this point we are operating on a logical view
   // item, with no access to the underlying DWARF data used by LLVM.
@@ -800,7 +801,7 @@ std::string LVELFReader::getRegisterName(LVSmall Opcode,
   return Stream.str();
 }
 
-Error LVELFReader::createScopes() {
+Error LVDWARFReader::createScopes() {
   LLVM_DEBUG({
     W.startLine() << "\n";
     W.printString("File", Obj.getFileName().str());
@@ -980,11 +981,11 @@ Error LVELFReader::createScopes() {
 }
 
 // Get the location information for the associated attribute.
-void LVELFReader::processLocationList(dwarf::Attribute Attr,
-                                      const DWARFFormValue &FormValue,
-                                      const DWARFDie &Die,
-                                      uint64_t OffsetOnEntry,
-                                      bool CallSiteLocation) {
+void LVDWARFReader::processLocationList(dwarf::Attribute Attr,
+                                        const DWARFFormValue &FormValue,
+                                        const DWARFDie &Die,
+                                        uint64_t OffsetOnEntry,
+                                        bool CallSiteLocation) {
 
   auto ProcessLocationExpression = [&](const DWARFExpression &Expression) {
     for (const DWARFExpression::Operation &Op : Expression)
@@ -1061,10 +1062,10 @@ void LVELFReader::processLocationList(dwarf::Attribute Attr,
   }
 }
 
-void LVELFReader::processLocationMember(dwarf::Attribute Attr,
-                                        const DWARFFormValue &FormValue,
-                                        const DWARFDie &Die,
-                                        uint64_t OffsetOnEntry) {
+void LVDWARFReader::processLocationMember(dwarf::Attribute Attr,
+                                          const DWARFFormValue &FormValue,
+                                          const DWARFDie &Die,
+                                          uint64_t OffsetOnEntry) {
   // Check if the value is an integer constant.
   if (FormValue.isFormClass(DWARFFormValue::FC_Constant))
     // Add a record to hold a constant as location.
@@ -1076,8 +1077,8 @@ void LVELFReader::processLocationMember(dwarf::Attribute Attr,
 }
 
 // Update the current element with the reference.
-void LVELFReader::updateReference(dwarf::Attribute Attr,
-                                  const DWARFFormValue &FormValue) {
+void LVDWARFReader::updateReference(dwarf::Attribute Attr,
+                                    const DWARFFormValue &FormValue) {
   // FIXME: We are assuming that at most one Reference (DW_AT_specification,
   // DW_AT_abstract_origin, ...) and at most one Type (DW_AT_import, DW_AT_type)
   // appear in any single DIE, but this may not be true.
@@ -1129,8 +1130,8 @@ void LVELFReader::updateReference(dwarf::Attribute Attr,
 }
 
 // Get an element given the DIE offset.
-LVElement *LVELFReader::getElementForOffset(LVOffset Offset, LVElement *Element,
-                                            bool IsType) {
+LVElement *LVDWARFReader::getElementForOffset(LVOffset Offset,
+                                              LVElement *Element, bool IsType) {
   auto Iter = ElementTable.try_emplace(Offset).first;
   // Update the element and all the references pointing to this element.
   LVElementEntry &Entry = Iter->second;
@@ -1143,7 +1144,7 @@ LVElement *LVELFReader::getElementForOffset(LVOffset Offset, LVElement *Element,
   return Entry.Element;
 }
 
-Error LVELFReader::loadTargetInfo(const ObjectFile &Obj) {
+Error LVDWARFReader::loadTargetInfo(const ObjectFile &Obj) {
   // Detect the architecture from the object file. We usually don't need OS
   // info to lookup a target and create register info.
   Triple TT;
@@ -1162,7 +1163,7 @@ Error LVELFReader::loadTargetInfo(const ObjectFile &Obj) {
   return loadGenericTargetInfo(TT.str(), FeaturesValue.getString());
 }
 
-void LVELFReader::mapRangeAddress(const ObjectFile &Obj) {
+void LVDWARFReader::mapRangeAddress(const ObjectFile &Obj) {
   for (auto Iter = Obj.symbol_begin(); Iter != Obj.symbol_end(); ++Iter) {
     const SymbolRef &Symbol = *Iter;
 
@@ -1238,9 +1239,9 @@ void LVELFReader::mapRangeAddress(const ObjectFile &Obj) {
   }
 }
 
-void LVELFReader::sortScopes() { Root->sort(); }
+void LVDWARFReader::sortScopes() { Root->sort(); }
 
-void LVELFReader::print(raw_ostream &OS) const {
+void LVDWARFReader::print(raw_ostream &OS) const {
   OS << "LVType\n";
   LLVM_DEBUG(dbgs() << "CreateReaders\n");
 }
diff --git a/llvm/lib/DebugInfo/Symbolize/Symbolize.cpp b/llvm/lib/DebugInfo/Symbolize/Symbolize.cpp
index 5f29226c14b705..32c5e3251df62f 100644
--- a/llvm/lib/DebugInfo/Symbolize/Symbolize.cpp
+++ b/llvm/lib/DebugInfo/Symbolize/Symbolize.cpp
@@ -628,13 +628,20 @@ LLVMSymbolizer::getOrCreateModuleInfo(const std::string &ModuleName) {
   ObjectPair Objects = ObjectsOrErr.get();
 
   std::unique_ptr<DIContext> Context;
-  // If this is a COFF object containing PDB info, use a PDBContext to
-  // symbolize. Otherwise, use DWARF.
+  // If this is a COFF object containing PDB info and not containing DWARF
+  // section, use a PDBContext to symbolize. Otherwise, use DWARF.
   if (auto CoffObject = dyn_cast<COFFObjectFile>(Objects.first)) {
     const codeview::DebugInfo *DebugInfo;
     StringRef PDBFileName;
     auto EC = CoffObject->getDebugPDBInfo(DebugInfo, PDBFileName);
-    if (!EC && DebugInfo != nullptr && !PDBFileName.empty()) {
+    // Use DWARF if there're DWARF sections.
+    bool HasDwarf =
+        llvm::any_of(Objects.first->sections(), [](SectionRef Section) -> bool {
+          if (Expected<StringRef> SectionName = Section.getName())
+            return SectionName.get() == ".debug_info";
+          return false;
+        });
+    if (!EC && !HasDwarf && DebugInfo != nullptr && !PDBFileName.empty()) {
       using namespace pdb;
       std::unique_ptr<IPDBSession> Session;
 
diff --git a/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp b/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp
index 994acf5843642a..1fa8a1274911ca 100644
--- a/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp
@@ -255,7 +255,7 @@ struct ObjCImageInfoFlags {
 namespace llvm {
 namespace orc {
 
-MachOPlatform::HeaderOptions::BuildVersionOpts
+std::optional<MachOPlatform::HeaderOptions::BuildVersionOpts>
 MachOPlatform::HeaderOptions::BuildVersionOpts::fromTriple(const Triple &TT,
                                                            uint32_t MinOS,
                                                            uint32_t SDK) {
@@ -263,26 +263,25 @@ MachOPlatform::HeaderOptions::BuildVersionOpts::fromTriple(const Triple &TT,
   uint32_t Platform;
   switch (TT.getOS()) {
   case Triple::IOS:
-    Platform = TT.isSimulatorEnvironment() ? MachO::PLATFORM_IOS
-                                           : MachO::PLATFORM_IOSSIMULATOR;
+    Platform = TT.isSimulatorEnvironment() ? MachO::PLATFORM_IOSSIMULATOR
+                                           : MachO::PLATFORM_IOS;
     break;
   case Triple::MacOSX:
     Platform = MachO::PLATFORM_MACOS;
     break;
   case Triple::TvOS:
-    Platform = TT.isSimulatorEnvironment() ? MachO::PLATFORM_TVOS
-                                           : MachO::PLATFORM_TVOSSIMULATOR;
+    Platform = TT.isSimulatorEnvironment() ? MachO::PLATFORM_TVOSSIMULATOR
+                                           : MachO::PLATFORM_TVOS;
     break;
   case Triple::WatchOS:
-    Platform = TT.isSimulatorEnvironment() ? MachO::PLATFORM_WATCHOS
-                                           : MachO::PLATFORM_WATCHOSSIMULATOR;
+    Platform = TT.isSimulatorEnvironment() ? MachO::PLATFORM_WATCHOSSIMULATOR
+                                           : MachO::PLATFORM_WATCHOS;
     break;
   default:
-    Platform = MachO::PLATFORM_UNKNOWN;
-    break;
+    return std::nullopt;
   }
 
-  return {Platform, MinOS, SDK};
+  return MachOPlatform::HeaderOptions::BuildVersionOpts{Platform, MinOS, SDK};
 }
 
 Expected<std::unique_ptr<MachOPlatform>> MachOPlatform::Create(
@@ -1725,11 +1724,9 @@ jitlink::Block &createHeaderBlock(MachOPlatform &MOP,
   else
     B.template addLoadCommand<MachO::LC_ID_DYLIB>(JD.getName(), 0, 0, 0);
 
-  if (Opts.BuildVersion)
+  for (auto &BV : Opts.BuildVersions)
     B.template addLoadCommand<MachO::LC_BUILD_VERSION>(
-        Opts.BuildVersion->Platform, Opts.BuildVersion->MinOS,
-        Opts.BuildVersion->SDK, static_cast<uint32_t>(0));
-
+        BV.Platform, BV.MinOS, BV.SDK, static_cast<uint32_t>(0));
   for (auto &D : Opts.LoadDylibs)
     B.template addLoadCommand<MachO::LC_LOAD_DYLIB>(
         D.Name, D.Timestamp, D.CurrentVersion, D.CompatibilityVersion);
diff --git a/llvm/lib/Frontend/OpenMP/OMPContext.cpp b/llvm/lib/Frontend/OpenMP/OMPContext.cpp
index e870c5aa2ba6b8..37936d6000c891 100644
--- a/llvm/lib/Frontend/OpenMP/OMPContext.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPContext.cpp
@@ -44,6 +44,7 @@ OMPContext::OMPContext(bool IsDeviceCompilation, Triple TargetTriple) {
   case Triple::ppcle:
   case Triple::ppc64:
   case Triple::ppc64le:
+  case Triple::systemz:
   case Triple::x86:
   case Triple::x86_64:
     ActiveTraits.set(unsigned(TraitProperty::device_kind_cpu));
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index c74c898e1e882d..16507a69ea8502 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -4883,8 +4883,11 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createTargetData(
     return InsertPointTy();
 
   // Disable TargetData CodeGen on Device pass.
-  if (Config.IsTargetDevice.value_or(false))
+  if (Config.IsTargetDevice.value_or(false)) {
+    if (BodyGenCB)
+      Builder.restoreIP(BodyGenCB(Builder.saveIP(), BodyGenTy::NoPriv));
     return Builder.saveIP();
+  }
 
   Builder.restoreIP(CodeGenIP);
   bool IsStandAlone = !BodyGenCB;
@@ -5070,10 +5073,15 @@ FunctionCallee OpenMPIRBuilder::createDispatchFiniFunction(unsigned IVSize,
 
 static void replaceConstatExprUsesInFuncWithInstr(ConstantExpr *ConstExpr,
                                                   Function *Func) {
-  for (User *User : make_early_inc_range(ConstExpr->users()))
-    if (auto *Instr = dyn_cast<Instruction>(User))
-      if (Instr->getFunction() == Func)
-        Instr->replaceUsesOfWith(ConstExpr, ConstExpr->getAsInstruction(Instr));
+  for (User *User : make_early_inc_range(ConstExpr->users())) {
+    if (auto *Instr = dyn_cast<Instruction>(User)) {
+      if (Instr->getFunction() == Func) {
+        Instruction *ConstInst = ConstExpr->getAsInstruction();
+        ConstInst->insertBefore(*Instr->getParent(), Instr->getIterator());
+        Instr->replaceUsesOfWith(ConstExpr, ConstInst);
+      }
+    }
+  }
 }
 
 static void replaceConstantValueUsesInFuncWithInstr(llvm::Value *Input,
diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp
index 11383ea6214bf3..5caed518e265b7 100644
--- a/llvm/lib/IR/AsmWriter.cpp
+++ b/llvm/lib/IR/AsmWriter.cpp
@@ -862,7 +862,7 @@ class SlotTracker : public AbstractSlotTrackerStorage {
   void processInstructionMetadata(const Instruction &I);
 
   /// Add all of the metadata from a DbgRecord.
-  void processDbgRecordMetadata(const DbgRecord &DPV);
+  void processDbgRecordMetadata(const DbgRecord &DVR);
 };
 
 } // end namespace llvm
@@ -1139,13 +1139,19 @@ void SlotTracker::processFunctionMetadata(const Function &F) {
 }
 
 void SlotTracker::processDbgRecordMetadata(const DbgRecord &DR) {
-  if (const DPValue *DPV = dyn_cast<const DPValue>(&DR)) {
+  if (const DbgVariableRecord *DVR = dyn_cast<const DbgVariableRecord>(&DR)) {
     // Process metadata used by DbgRecords; we only specifically care about the
     // DILocalVariable, DILocation, and DIAssignID fields, as the Value and
     // Expression fields should only be printed inline and so do not use a slot.
-    CreateMetadataSlot(DPV->getRawVariable());
-    if (DPV->isDbgAssign())
-      CreateMetadataSlot(cast<MDNode>(DPV->getRawAssignID()));
+    // Note: The above doesn't apply for empty-metadata operands.
+    if (auto *Empty = dyn_cast<MDNode>(DVR->getRawLocation()))
+      CreateMetadataSlot(Empty);
+    CreateMetadataSlot(DVR->getRawVariable());
+    if (DVR->isDbgAssign()) {
+      CreateMetadataSlot(cast<MDNode>(DVR->getRawAssignID()));
+      if (auto *Empty = dyn_cast<MDNode>(DVR->getRawAddress()))
+        CreateMetadataSlot(Empty);
+    }
   } else if (const DPLabel *DPL = dyn_cast<const DPLabel>(&DR)) {
     CreateMetadataSlot(DPL->getRawLabel());
   } else {
@@ -1408,6 +1414,10 @@ static void WriteOptimizationInfo(raw_ostream &Out, const User *U) {
   } else if (const GEPOperator *GEP = dyn_cast<GEPOperator>(U)) {
     if (GEP->isInBounds())
       Out << " inbounds";
+    if (auto InRange = GEP->getInRange()) {
+      Out << " inrange(" << InRange->getLower() << ", " << InRange->getUpper()
+          << ")";
+    }
   } else if (const auto *NNI = dyn_cast<PossiblyNonNegInst>(U)) {
     if (NNI->hasNonNeg())
       Out << " nneg";
@@ -1685,18 +1695,13 @@ static void WriteConstantInternal(raw_ostream &Out, const Constant *CV,
       Out << ' ' << static_cast<CmpInst::Predicate>(CE->getPredicate());
     Out << " (";
 
-    std::optional<unsigned> InRangeOp;
     if (const GEPOperator *GEP = dyn_cast<GEPOperator>(CE)) {
       WriterCtx.TypePrinter->print(GEP->getSourceElementType(), Out);
       Out << ", ";
-      InRangeOp = GEP->getInRangeIndex();
-      if (InRangeOp)
-        ++*InRangeOp;
     }
 
-    for (User::const_op_iterator OI=CE->op_begin(); OI != CE->op_end(); ++OI) {
-      if (InRangeOp && unsigned(OI - CE->op_begin()) == *InRangeOp)
-        Out << "inrange ";
+    for (User::const_op_iterator OI = CE->op_begin(); OI != CE->op_end();
+         ++OI) {
       WriterCtx.TypePrinter->print((*OI)->getType(), Out);
       Out << ' ';
       WriteAsOperandInternal(Out, *OI, WriterCtx);
@@ -2135,6 +2140,16 @@ static void writeDIDerivedType(raw_ostream &Out, const DIDerivedType *N,
     Printer.printInt("dwarfAddressSpace", *DWARFAddressSpace,
                      /* ShouldSkipZero */ false);
   Printer.printMetadata("annotations", N->getRawAnnotations());
+  if (auto PtrAuthData = N->getPtrAuthData()) {
+    Printer.printInt("ptrAuthKey", PtrAuthData->key());
+    Printer.printBool("ptrAuthIsAddressDiscriminated",
+                      PtrAuthData->isAddressDiscriminated());
+    Printer.printInt("ptrAuthExtraDiscriminator",
+                     PtrAuthData->extraDiscriminator());
+    Printer.printBool("ptrAuthIsaPointer", PtrAuthData->isaPointer());
+    Printer.printBool("ptrAuthAuthenticatesNullValues",
+                      PtrAuthData->authenticatesNullValues());
+  }
   Out << ")";
 }
 
@@ -2703,10 +2718,10 @@ class AssemblyWriter {
   void printInstructionLine(const Instruction &I);
   void printInstruction(const Instruction &I);
   void printDPMarker(const DPMarker &DPI);
-  void printDPValue(const DPValue &DPI);
+  void printDbgVariableRecord(const DbgVariableRecord &DVR);
   void printDPLabel(const DPLabel &DPL);
-  void printDbgRecord(const DbgRecord &DPI);
-  void printDbgRecordLine(const DbgRecord &DPI);
+  void printDbgRecord(const DbgRecord &DR);
+  void printDbgRecordLine(const DbgRecord &DR);
 
   void printUseListOrder(const Value *V, const std::vector<unsigned> &Shuffle);
   void printUseLists(const Function *F);
@@ -4604,46 +4619,47 @@ void AssemblyWriter::printDPMarker(const DPMarker &Marker) {
 }
 
 void AssemblyWriter::printDbgRecord(const DbgRecord &DR) {
-  if (auto *DPV = dyn_cast<DPValue>(&DR))
-    printDPValue(*DPV);
+  if (auto *DVR = dyn_cast<DbgVariableRecord>(&DR))
+    printDbgVariableRecord(*DVR);
   else if (auto *DPL = dyn_cast<DPLabel>(&DR))
     printDPLabel(*DPL);
   else
     llvm_unreachable("Unexpected DbgRecord kind");
 }
 
-void AssemblyWriter::printDPValue(const DPValue &DPV) {
+void AssemblyWriter::printDbgVariableRecord(const DbgVariableRecord &DVR) {
   auto WriterCtx = getContext();
   Out << "#dbg_";
-  switch (DPV.getType()) {
-  case DPValue::LocationType::Value:
+  switch (DVR.getType()) {
+  case DbgVariableRecord::LocationType::Value:
     Out << "value";
     break;
-  case DPValue::LocationType::Declare:
+  case DbgVariableRecord::LocationType::Declare:
     Out << "declare";
     break;
-  case DPValue::LocationType::Assign:
+  case DbgVariableRecord::LocationType::Assign:
     Out << "assign";
     break;
   default:
-    llvm_unreachable("Tried to print a DPValue with an invalid LocationType!");
+    llvm_unreachable(
+        "Tried to print a DbgVariableRecord with an invalid LocationType!");
   }
   Out << "(";
-  WriteAsOperandInternal(Out, DPV.getRawLocation(), WriterCtx, true);
+  WriteAsOperandInternal(Out, DVR.getRawLocation(), WriterCtx, true);
   Out << ", ";
-  WriteAsOperandInternal(Out, DPV.getRawVariable(), WriterCtx, true);
+  WriteAsOperandInternal(Out, DVR.getRawVariable(), WriterCtx, true);
   Out << ", ";
-  WriteAsOperandInternal(Out, DPV.getRawExpression(), WriterCtx, true);
+  WriteAsOperandInternal(Out, DVR.getRawExpression(), WriterCtx, true);
   Out << ", ";
-  if (DPV.isDbgAssign()) {
-    WriteAsOperandInternal(Out, DPV.getRawAssignID(), WriterCtx, true);
+  if (DVR.isDbgAssign()) {
+    WriteAsOperandInternal(Out, DVR.getRawAssignID(), WriterCtx, true);
     Out << ", ";
-    WriteAsOperandInternal(Out, DPV.getRawAddress(), WriterCtx, true);
+    WriteAsOperandInternal(Out, DVR.getRawAddress(), WriterCtx, true);
     Out << ", ";
-    WriteAsOperandInternal(Out, DPV.getRawAddressExpression(), WriterCtx, true);
+    WriteAsOperandInternal(Out, DVR.getRawAddressExpression(), WriterCtx, true);
     Out << ", ";
   }
-  WriteAsOperandInternal(Out, DPV.getDebugLoc().getAsMDNode(), WriterCtx, true);
+  WriteAsOperandInternal(Out, DVR.getDebugLoc().getAsMDNode(), WriterCtx, true);
   Out << ")";
 }
 
@@ -4897,7 +4913,7 @@ void DPMarker::print(raw_ostream &ROS, bool IsForDebug) const {
   print(ROS, MST, IsForDebug);
 }
 
-void DPValue::print(raw_ostream &ROS, bool IsForDebug) const {
+void DbgVariableRecord::print(raw_ostream &ROS, bool IsForDebug) const {
 
   ModuleSlotTracker MST(getModuleFromDPI(this), true);
   print(ROS, MST, IsForDebug);
@@ -4924,8 +4940,8 @@ void DPLabel::print(raw_ostream &ROS, bool IsForDebug) const {
   print(ROS, MST, IsForDebug);
 }
 
-void DPValue::print(raw_ostream &ROS, ModuleSlotTracker &MST,
-                    bool IsForDebug) const {
+void DbgVariableRecord::print(raw_ostream &ROS, ModuleSlotTracker &MST,
+                              bool IsForDebug) const {
   formatted_raw_ostream OS(ROS);
   SlotTracker EmptySlotTable(static_cast<const Module *>(nullptr));
   SlotTracker &SlotTable =
@@ -4938,7 +4954,7 @@ void DPValue::print(raw_ostream &ROS, ModuleSlotTracker &MST,
                           ? Marker->getParent()->getParent()
                           : nullptr);
   AssemblyWriter W(OS, SlotTable, getModuleFromDPI(this), nullptr, IsForDebug);
-  W.printDPValue(*this);
+  W.printDbgVariableRecord(*this);
 }
 
 void DPLabel::print(raw_ostream &ROS, ModuleSlotTracker &MST,
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index 25992395e471b3..7d954f9d09ad62 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -647,11 +647,12 @@ static bool upgradeArmOrAarch64IntrinsicFunction(bool IsArm, Function *F,
     // v16i8 respectively.
     if (Name.consume_front("bfdot.")) {
       // (arm|aarch64).neon.bfdot.*'.
-      Intrinsic::ID ID = StringSwitch<Intrinsic::ID>(Name)
-                             .Cases("v2f32.v8i8", "v4f32.v16i8",
-                                    IsArm ? Intrinsic::arm_neon_bfdot
-                                          : Intrinsic::aarch64_neon_bfdot)
-                             .Default(Intrinsic::not_intrinsic);
+      Intrinsic::ID ID =
+          StringSwitch<Intrinsic::ID>(Name)
+              .Cases("v2f32.v8i8", "v4f32.v16i8",
+                     IsArm ? (Intrinsic::ID)Intrinsic::arm_neon_bfdot
+                           : (Intrinsic::ID)Intrinsic::aarch64_neon_bfdot)
+              .Default(Intrinsic::not_intrinsic);
       if (ID != Intrinsic::not_intrinsic) {
         size_t OperandWidth = F->getReturnType()->getPrimitiveSizeInBits();
         assert((OperandWidth == 64 || OperandWidth == 128) &&
@@ -674,12 +675,15 @@ static bool upgradeArmOrAarch64IntrinsicFunction(bool IsArm, Function *F,
         // (arm|aarch64).neon.bfm*.v4f32.v16i8'.
         Intrinsic::ID ID =
             StringSwitch<Intrinsic::ID>(Name)
-                .Case("mla", IsArm ? Intrinsic::arm_neon_bfmmla
-                                   : Intrinsic::aarch64_neon_bfmmla)
-                .Case("lalb", IsArm ? Intrinsic::arm_neon_bfmlalb
-                                    : Intrinsic::aarch64_neon_bfmlalb)
-                .Case("lalt", IsArm ? Intrinsic::arm_neon_bfmlalt
-                                    : Intrinsic::aarch64_neon_bfmlalt)
+                .Case("mla",
+                      IsArm ? (Intrinsic::ID)Intrinsic::arm_neon_bfmmla
+                            : (Intrinsic::ID)Intrinsic::aarch64_neon_bfmmla)
+                .Case("lalb",
+                      IsArm ? (Intrinsic::ID)Intrinsic::arm_neon_bfmlalb
+                            : (Intrinsic::ID)Intrinsic::aarch64_neon_bfmlalb)
+                .Case("lalt",
+                      IsArm ? (Intrinsic::ID)Intrinsic::arm_neon_bfmlalt
+                            : (Intrinsic::ID)Intrinsic::aarch64_neon_bfmlalt)
                 .Default(Intrinsic::not_intrinsic);
         if (ID != Intrinsic::not_intrinsic) {
           NewFn = Intrinsic::getDeclaration(F->getParent(), ID);
@@ -1052,6 +1056,18 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn) {
   }
   case 'd':
     if (Name.consume_front("dbg.")) {
+      // Mark debug intrinsics for upgrade to new debug format.
+      if (F->getParent()->IsNewDbgInfoFormat) {
+        if (Name == "addr" || Name == "value" || Name == "assign" ||
+            Name == "declare" || Name == "label") {
+          // There's no function to replace these with.
+          NewFn = nullptr;
+          // But we do want these to get upgraded.
+          return true;
+        }
+      }
+      // Update llvm.dbg.addr intrinsics even in "new debug mode"; they'll get
+      // converted to DbgVariableRecords later.
       if (Name == "addr" || (Name == "value" && F->arg_size() == 4)) {
         rename(F);
         NewFn = Intrinsic::getDeclaration(F->getParent(), Intrinsic::dbg_value);
@@ -2328,6 +2344,59 @@ static Value *upgradeAMDGCNIntrinsicCall(StringRef Name, CallBase *CI,
   llvm_unreachable("Unknown function for AMDGPU intrinsic upgrade.");
 }
 
+/// Helper to unwrap intrinsic call MetadataAsValue operands.
+template <typename MDType>
+static MDType *unwrapMAVOp(CallBase *CI, unsigned Op) {
+  if (MetadataAsValue *MAV = dyn_cast<MetadataAsValue>(CI->getArgOperand(Op)))
+    return dyn_cast<MDType>(MAV->getMetadata());
+  return nullptr;
+}
+
+/// Convert debug intrinsic calls to non-instruction debug records.
+/// \p Name - Final part of the intrinsic name, e.g. 'value' in llvm.dbg.value.
+/// \p CI - The debug intrinsic call.
+static void upgradeDbgIntrinsicToDbgRecord(StringRef Name, CallBase *CI) {
+  DbgRecord *DR = nullptr;
+  if (Name == "label") {
+    DR = new DPLabel(unwrapMAVOp<DILabel>(CI, 0), CI->getDebugLoc());
+  } else if (Name == "assign") {
+    DR = new DbgVariableRecord(
+        unwrapMAVOp<Metadata>(CI, 0), unwrapMAVOp<DILocalVariable>(CI, 1),
+        unwrapMAVOp<DIExpression>(CI, 2), unwrapMAVOp<DIAssignID>(CI, 3),
+        unwrapMAVOp<Metadata>(CI, 4), unwrapMAVOp<DIExpression>(CI, 5),
+        CI->getDebugLoc());
+  } else if (Name == "declare") {
+    DR = new DbgVariableRecord(
+        unwrapMAVOp<Metadata>(CI, 0), unwrapMAVOp<DILocalVariable>(CI, 1),
+        unwrapMAVOp<DIExpression>(CI, 2), CI->getDebugLoc(),
+        DbgVariableRecord::LocationType::Declare);
+  } else if (Name == "addr") {
+    // Upgrade dbg.addr to dbg.value with DW_OP_deref.
+    DIExpression *Expr = unwrapMAVOp<DIExpression>(CI, 2);
+    Expr = DIExpression::append(Expr, dwarf::DW_OP_deref);
+    DR = new DbgVariableRecord(unwrapMAVOp<Metadata>(CI, 0),
+                               unwrapMAVOp<DILocalVariable>(CI, 1), Expr,
+                               CI->getDebugLoc());
+  } else if (Name == "value") {
+    // An old version of dbg.value had an extra offset argument.
+    unsigned VarOp = 1;
+    unsigned ExprOp = 2;
+    if (CI->arg_size() == 4) {
+      auto *Offset = dyn_cast_or_null<Constant>(CI->getArgOperand(1));
+      // Nonzero offset dbg.values get dropped without a replacement.
+      if (!Offset || !Offset->isZeroValue())
+        return;
+      VarOp = 2;
+      ExprOp = 3;
+    }
+    DR = new DbgVariableRecord(
+        unwrapMAVOp<Metadata>(CI, 0), unwrapMAVOp<DILocalVariable>(CI, VarOp),
+        unwrapMAVOp<DIExpression>(CI, ExprOp), CI->getDebugLoc());
+  }
+  assert(DR && "Unhandled intrinsic kind in upgrade to DbgRecord");
+  CI->getParent()->insertDbgRecordBefore(DR, CI->getIterator());
+}
+
 /// Upgrade a call to an old intrinsic. All argument and return casting must be
 /// provided to seamlessly integrate with existing context.
 void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) {
@@ -2353,6 +2422,7 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) {
     bool IsNVVM = Name.consume_front("nvvm.");
     bool IsARM = Name.consume_front("arm.");
     bool IsAMDGCN = Name.consume_front("amdgcn.");
+    bool IsDbg = Name.consume_front("dbg.");
 
     if (IsX86 && Name.starts_with("sse4a.movnt.")) {
       SmallVector<Metadata *, 1> Elts;
@@ -2457,7 +2527,7 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) {
       return;
     }
 
-    Value *Rep;
+    Value *Rep = nullptr;
     // Upgrade packed integer vector compare intrinsics to compare instructions.
     if (IsX86 && (Name.starts_with("sse2.pcmp") ||
                   Name.starts_with("avx2.pcmp"))) {
@@ -4192,6 +4262,8 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) {
       Rep = upgradeARMIntrinsicCall(Name, CI, F, Builder);
     } else if (IsAMDGCN) {
       Rep = upgradeAMDGCNIntrinsicCall(Name, CI, F, Builder);
+    } else if (IsDbg && CI->getModule()->IsNewDbgInfoFormat) {
+      upgradeDbgIntrinsicToDbgRecord(Name, CI);
     } else {
       llvm_unreachable("Unknown function for CallBase upgrade.");
     }
@@ -5178,72 +5250,6 @@ void llvm::UpgradeFunctionAttributes(Function &F) {
     Arg.removeAttrs(AttributeFuncs::typeIncompatible(Arg.getType()));
 }
 
-// Check if the module attribute is present and not zero.
-static bool isModuleAttributeSet(Module &M, const StringRef &ModAttr) {
-  const auto *Attr =
-      mdconst::extract_or_null<ConstantInt>(M.getModuleFlag(ModAttr));
-  return Attr && Attr->getZExtValue();
-}
-
-// Copy an attribute from module to the function if exists.
-// First value of the pair is used when the module attribute is not zero
-// the second otherwise.
-static void
-CopyModuleAttributeToFunction(Function &F, StringRef FnAttrName,
-                              StringRef ModAttrName,
-                              std::pair<StringRef, StringRef> Values) {
-  if (F.hasFnAttribute(FnAttrName))
-    return;
-  F.addFnAttr(FnAttrName, isModuleAttributeSet(*F.getParent(), ModAttrName)
-                              ? Values.first
-                              : Values.second);
-}
-
-// Copy a boolean attribute from module to the function if exists.
-// Module attribute treated false if zero otherwise true.
-static void CopyModuleAttributeToFunction(Function &F, StringRef AttrName) {
-  CopyModuleAttributeToFunction(
-      F, AttrName, AttrName,
-      std::make_pair<StringRef, StringRef>("true", "false"));
-}
-
-// Copy an attribute from module to the function if exists.
-// First value of the pair is used when the module attribute is not zero
-// the second otherwise.
-static void
-CopyModuleAttributeToFunction(Function &F, StringRef AttrName,
-                              std::pair<StringRef, StringRef> Values) {
-  CopyModuleAttributeToFunction(F, AttrName, AttrName, Values);
-}
-
-void llvm::CopyModuleAttrToFunctions(Module &M) {
-  Triple T(M.getTargetTriple());
-  if (!T.isThumb() && !T.isARM() && !T.isAArch64())
-    return;
-
-  for (Function &F : M.getFunctionList()) {
-    if (F.isDeclaration())
-      continue;
-
-    if (!F.hasFnAttribute("sign-return-address")) {
-      StringRef SignType = "none";
-      if (isModuleAttributeSet(M, "sign-return-address"))
-        SignType = "non-leaf";
-
-      if (isModuleAttributeSet(M, "sign-return-address-all"))
-        SignType = "all";
-
-      F.addFnAttr("sign-return-address", SignType);
-    }
-    CopyModuleAttributeToFunction(F, "branch-target-enforcement");
-    CopyModuleAttributeToFunction(F, "branch-protection-pauth-lr");
-    CopyModuleAttributeToFunction(F, "guarded-control-stack");
-    CopyModuleAttributeToFunction(
-        F, "sign-return-address-key",
-        std::make_pair<StringRef, StringRef>("b_key", "a_key"));
-  }
-}
-
 static bool isOldLoopArgument(Metadata *MD) {
   auto *T = dyn_cast_or_null<MDTuple>(MD);
   if (!T)
diff --git a/llvm/lib/IR/BasicBlock.cpp b/llvm/lib/IR/BasicBlock.cpp
index 4dd1bdd6e2f4ad..2dff6e4d6d9c36 100644
--- a/llvm/lib/IR/BasicBlock.cpp
+++ b/llvm/lib/IR/BasicBlock.cpp
@@ -36,6 +36,11 @@ cl::opt<bool>
                                  "through iterators, eliminating intrinsics"),
                         cl::init(true));
 
+bool WriteNewDbgInfoFormatToBitcode /*set default value in cl::init() below*/;
+cl::opt<bool, true> WriteNewDbgInfoFormatToBitcode2(
+    "write-experimental-debuginfo-iterators-to-bitcode", cl::Hidden,
+    cl::location(WriteNewDbgInfoFormatToBitcode), cl::init(false));
+
 DPMarker *BasicBlock::createMarker(Instruction *I) {
   assert(IsNewDbgInfoFormat &&
          "Tried to create a marker in a non new debug-info block!");
@@ -66,34 +71,34 @@ void BasicBlock::convertToNewDbgValues() {
   // Iterate over all instructions in the instruction list, collecting debug
   // info intrinsics and converting them to DbgRecords. Once we find a "real"
   // instruction, attach all those DbgRecords to a DPMarker in that instruction.
-  SmallVector<DbgRecord *, 4> DPVals;
+  SmallVector<DbgRecord *, 4> DbgVarRecs;
   for (Instruction &I : make_early_inc_range(InstList)) {
     assert(!I.DbgMarker && "DbgMarker already set on old-format instrs?");
     if (DbgVariableIntrinsic *DVI = dyn_cast<DbgVariableIntrinsic>(&I)) {
-      // Convert this dbg.value to a DPValue.
-      DPValue *Value = new DPValue(DVI);
-      DPVals.push_back(Value);
+      // Convert this dbg.value to a DbgVariableRecord.
+      DbgVariableRecord *Value = new DbgVariableRecord(DVI);
+      DbgVarRecs.push_back(Value);
       DVI->eraseFromParent();
       continue;
     }
 
     if (DbgLabelInst *DLI = dyn_cast<DbgLabelInst>(&I)) {
-      DPVals.push_back(new DPLabel(DLI->getLabel(), DLI->getDebugLoc()));
+      DbgVarRecs.push_back(new DPLabel(DLI->getLabel(), DLI->getDebugLoc()));
       DLI->eraseFromParent();
       continue;
     }
 
-    if (DPVals.empty())
+    if (DbgVarRecs.empty())
       continue;
 
     // Create a marker to store DbgRecords in.
     createMarker(&I);
     DPMarker *Marker = I.DbgMarker;
 
-    for (DbgRecord *DPV : DPVals)
-      Marker->insertDbgRecord(DPV, false);
+    for (DbgRecord *DVR : DbgVarRecs)
+      Marker->insertDbgRecord(DVR, false);
 
-    DPVals.clear();
+    DbgVarRecs.clear();
   }
 }
 
@@ -1029,21 +1034,21 @@ void BasicBlock::splice(iterator Dest, BasicBlock *Src, iterator First,
   flushTerminatorDbgRecords();
 }
 
-void BasicBlock::insertDbgRecordAfter(DbgRecord *DPV, Instruction *I) {
+void BasicBlock::insertDbgRecordAfter(DbgRecord *DR, Instruction *I) {
   assert(IsNewDbgInfoFormat);
   assert(I->getParent() == this);
 
   iterator NextIt = std::next(I->getIterator());
   DPMarker *NextMarker = createMarker(NextIt);
-  NextMarker->insertDbgRecord(DPV, true);
+  NextMarker->insertDbgRecord(DR, true);
 }
 
-void BasicBlock::insertDbgRecordBefore(DbgRecord *DPV,
+void BasicBlock::insertDbgRecordBefore(DbgRecord *DR,
                                        InstListType::iterator Where) {
   assert(Where == end() || Where->getParent() == this);
   bool InsertAtHead = Where.getHeadBit();
   DPMarker *M = createMarker(Where);
-  M->insertDbgRecord(DPV, InsertAtHead);
+  M->insertDbgRecord(DR, InsertAtHead);
 }
 
 DPMarker *BasicBlock::getNextMarker(Instruction *I) {
diff --git a/llvm/lib/IR/ConstantFold.cpp b/llvm/lib/IR/ConstantFold.cpp
index 034e397bc69fce..a766b1fe601823 100644
--- a/llvm/lib/IR/ConstantFold.cpp
+++ b/llvm/lib/IR/ConstantFold.cpp
@@ -1154,10 +1154,9 @@ static ICmpInst::Predicate evaluateICmpRelation(Constant *V1, Constant *V2) {
                                 GV->getType()->getAddressSpace()))
         return ICmpInst::ICMP_UGT;
     }
-  } else {
+  } else if (auto *CE1 = dyn_cast<ConstantExpr>(V1)) {
     // Ok, the LHS is known to be a constantexpr.  The RHS can be any of a
     // constantexpr, a global, block address, or a simple constant.
-    ConstantExpr *CE1 = cast<ConstantExpr>(V1);
     Constant *CE1Op0 = CE1->getOperand(0);
 
     switch (CE1->getOpcode()) {
@@ -1469,6 +1468,10 @@ static Constant *foldGEPOfGEP(GEPOperator *GEP, Type *PointeeTy, bool InBounds,
   if (PointeeTy != GEP->getResultElementType())
     return nullptr;
 
+  // Leave inrange handling to DL-aware constant folding.
+  if (GEP->getInRange())
+    return nullptr;
+
   Constant *Idx0 = cast<Constant>(Idxs[0]);
   if (Idx0->isNullValue()) {
     // Handle the simple case of a zero index.
@@ -1478,7 +1481,7 @@ static Constant *foldGEPOfGEP(GEPOperator *GEP, Type *PointeeTy, bool InBounds,
     NewIndices.append(Idxs.begin() + 1, Idxs.end());
     return ConstantExpr::getGetElementPtr(
         GEP->getSourceElementType(), cast<Constant>(GEP->getPointerOperand()),
-        NewIndices, InBounds && GEP->isInBounds(), GEP->getInRangeIndex());
+        NewIndices, InBounds && GEP->isInBounds());
   }
 
   gep_type_iterator LastI = gep_type_end(GEP);
@@ -1527,21 +1530,14 @@ static Constant *foldGEPOfGEP(GEPOperator *GEP, Type *PointeeTy, bool InBounds,
   NewIndices.push_back(ConstantExpr::get(Instruction::Add, Idx0, LastIdx));
   NewIndices.append(Idxs.begin() + 1, Idxs.end());
 
-  // The combined GEP normally inherits its index inrange attribute from
-  // the inner GEP, but if the inner GEP's last index was adjusted by the
-  // outer GEP, any inbounds attribute on that index is invalidated.
-  std::optional<unsigned> IRIndex = GEP->getInRangeIndex();
-  if (IRIndex && *IRIndex == GEP->getNumIndices() - 1)
-    IRIndex = std::nullopt;
-
   return ConstantExpr::getGetElementPtr(
       GEP->getSourceElementType(), cast<Constant>(GEP->getPointerOperand()),
-      NewIndices, InBounds && GEP->isInBounds(), IRIndex);
+      NewIndices, InBounds && GEP->isInBounds());
 }
 
 Constant *llvm::ConstantFoldGetElementPtr(Type *PointeeTy, Constant *C,
                                           bool InBounds,
-                                          std::optional<unsigned> InRangeIndex,
+                                          std::optional<ConstantRange> InRange,
                                           ArrayRef<Value *> Idxs) {
   if (Idxs.empty()) return C;
 
@@ -1557,7 +1553,7 @@ Constant *llvm::ConstantFoldGetElementPtr(Type *PointeeTy, Constant *C,
 
   auto IsNoOp = [&]() {
     // Avoid losing inrange information.
-    if (InRangeIndex)
+    if (InRange)
       return false;
 
     return all_of(Idxs, [](Value *Idx) {
@@ -1595,12 +1591,6 @@ Constant *llvm::ConstantFoldGetElementPtr(Type *PointeeTy, Constant *C,
     if (!isa<ConstantInt>(Idxs[i - 1]) && !isa<ConstantDataVector>(Idxs[i - 1]))
       // Skip if the type of the previous index is not supported.
       continue;
-    if (InRangeIndex && i == *InRangeIndex + 1) {
-      // If an index is marked inrange, we cannot apply this canonicalization to
-      // the following index, as that will cause the inrange index to point to
-      // the wrong element.
-      continue;
-    }
     if (isa<StructType>(Ty)) {
       // The verify makes sure that GEPs into a struct are in range.
       continue;
@@ -1622,16 +1612,16 @@ Constant *llvm::ConstantFoldGetElementPtr(Type *PointeeTy, Constant *C,
       }
     } else {
       auto *CV = cast<ConstantDataVector>(Idxs[i]);
-      bool InRange = true;
+      bool IsInRange = true;
       for (unsigned I = 0, E = CV->getNumElements(); I != E; ++I) {
         auto *CI = cast<ConstantInt>(CV->getElementAsConstant(I));
-        InRange &= isIndexInRangeOfArrayType(STy->getNumElements(), CI);
+        IsInRange &= isIndexInRangeOfArrayType(STy->getNumElements(), CI);
         if (CI->isNegative()) {
           Unknown = true;
           break;
         }
       }
-      if (InRange || Unknown)
+      if (IsInRange || Unknown)
         // It's in range, skip to the next index.
         // It's out of range and negative, don't try to factor it.
         continue;
@@ -1721,7 +1711,7 @@ Constant *llvm::ConstantFoldGetElementPtr(Type *PointeeTy, Constant *C,
     for (unsigned i = 0, e = Idxs.size(); i != e; ++i)
       if (!NewIdxs[i]) NewIdxs[i] = cast<Constant>(Idxs[i]);
     return ConstantExpr::getGetElementPtr(PointeeTy, C, NewIdxs, InBounds,
-                                          InRangeIndex);
+                                          InRange);
   }
 
   // If all indices are known integers and normalized, we can do a simple
@@ -1731,7 +1721,7 @@ Constant *llvm::ConstantFoldGetElementPtr(Type *PointeeTy, Constant *C,
       if (!GV->hasExternalWeakLinkage() && GV->getValueType() == PointeeTy &&
           isInBoundsIndices(Idxs))
         return ConstantExpr::getGetElementPtr(PointeeTy, C, Idxs,
-                                              /*InBounds=*/true, InRangeIndex);
+                                              /*InBounds=*/true, InRange);
 
   return nullptr;
 }
diff --git a/llvm/lib/IR/Constants.cpp b/llvm/lib/IR/Constants.cpp
index e6b92aad392f66..c17419b529ac00 100644
--- a/llvm/lib/IR/Constants.cpp
+++ b/llvm/lib/IR/Constants.cpp
@@ -1568,7 +1568,7 @@ Constant *ConstantExpr::getWithOperands(ArrayRef<Constant *> Ops, Type *Ty,
     assert(SrcTy || (Ops[0]->getType() == getOperand(0)->getType()));
     return ConstantExpr::getGetElementPtr(
         SrcTy ? SrcTy : GEPO->getSourceElementType(), Ops[0], Ops.slice(1),
-        GEPO->isInBounds(), GEPO->getInRangeIndex(), OnlyIfReducedTy);
+        GEPO->isInBounds(), GEPO->getInRange(), OnlyIfReducedTy);
   }
   case Instruction::ICmp:
   case Instruction::FCmp:
@@ -2349,17 +2349,16 @@ Constant *ConstantExpr::getCompare(unsigned short Predicate, Constant *C1,
 
 Constant *ConstantExpr::getGetElementPtr(Type *Ty, Constant *C,
                                          ArrayRef<Value *> Idxs, bool InBounds,
-                                         std::optional<unsigned> InRangeIndex,
+                                         std::optional<ConstantRange> InRange,
                                          Type *OnlyIfReducedTy) {
   assert(Ty && "Must specify element type");
   assert(isSupportedGetElementPtr(Ty) && "Element type is unsupported!");
 
-  if (Constant *FC =
-          ConstantFoldGetElementPtr(Ty, C, InBounds, InRangeIndex, Idxs))
-    return FC;          // Fold a few common cases.
+  if (Constant *FC = ConstantFoldGetElementPtr(Ty, C, InBounds, InRange, Idxs))
+    return FC; // Fold a few common cases.
 
-  assert(GetElementPtrInst::getIndexedType(Ty, Idxs) &&
-         "GEP indices invalid!");;
+  assert(GetElementPtrInst::getIndexedType(Ty, Idxs) && "GEP indices invalid!");
+  ;
 
   // Get the result type of the getelementptr!
   Type *ReqTy = GetElementPtrInst::getGEPReturnType(C, Idxs);
@@ -2392,10 +2391,9 @@ Constant *ConstantExpr::getGetElementPtr(Type *Ty, Constant *C,
   }
 
   unsigned SubClassOptionalData = InBounds ? GEPOperator::IsInBounds : 0;
-  if (InRangeIndex && *InRangeIndex < 63)
-    SubClassOptionalData |= (*InRangeIndex + 1) << 1;
   const ConstantExprKeyType Key(Instruction::GetElementPtr, ArgVec, 0,
-                                SubClassOptionalData, std::nullopt, Ty);
+                                SubClassOptionalData, std::nullopt, Ty,
+                                InRange);
 
   LLVMContextImpl *pImpl = C->getContext().pImpl;
   return pImpl->ExprConstants.getOrCreate(ReqTy, Key);
@@ -2691,13 +2689,15 @@ const char *ConstantExpr::getOpcodeName() const {
 }
 
 GetElementPtrConstantExpr::GetElementPtrConstantExpr(
-    Type *SrcElementTy, Constant *C, ArrayRef<Constant *> IdxList, Type *DestTy)
+    Type *SrcElementTy, Constant *C, ArrayRef<Constant *> IdxList, Type *DestTy,
+    std::optional<ConstantRange> InRange)
     : ConstantExpr(DestTy, Instruction::GetElementPtr,
                    OperandTraits<GetElementPtrConstantExpr>::op_end(this) -
                        (IdxList.size() + 1),
                    IdxList.size() + 1),
       SrcElementTy(SrcElementTy),
-      ResElementTy(GetElementPtrInst::getIndexedType(SrcElementTy, IdxList)) {
+      ResElementTy(GetElementPtrInst::getIndexedType(SrcElementTy, IdxList)),
+      InRange(std::move(InRange)) {
   Op<0>() = C;
   Use *OperandList = getOperandList();
   for (unsigned i = 0, E = IdxList.size(); i != E; ++i)
@@ -2712,6 +2712,10 @@ Type *GetElementPtrConstantExpr::getResultElementType() const {
   return ResElementTy;
 }
 
+std::optional<ConstantRange> GetElementPtrConstantExpr::getInRange() const {
+  return InRange;
+}
+
 //===----------------------------------------------------------------------===//
 //                       ConstantData* implementations
 
@@ -3303,7 +3307,7 @@ Value *ConstantExpr::handleOperandChangeImpl(Value *From, Value *ToV) {
       NewOps, this, From, To, NumUpdated, OperandNo);
 }
 
-Instruction *ConstantExpr::getAsInstruction(Instruction *InsertBefore) const {
+Instruction *ConstantExpr::getAsInstruction() const {
   SmallVector<Value *, 4> ValueOperands(operands());
   ArrayRef<Value*> Ops(ValueOperands);
 
@@ -3322,32 +3326,31 @@ Instruction *ConstantExpr::getAsInstruction(Instruction *InsertBefore) const {
   case Instruction::BitCast:
   case Instruction::AddrSpaceCast:
     return CastInst::Create((Instruction::CastOps)getOpcode(), Ops[0],
-                            getType(), "", InsertBefore);
+                            getType(), "");
   case Instruction::InsertElement:
-    return InsertElementInst::Create(Ops[0], Ops[1], Ops[2], "", InsertBefore);
+    return InsertElementInst::Create(Ops[0], Ops[1], Ops[2], "");
   case Instruction::ExtractElement:
-    return ExtractElementInst::Create(Ops[0], Ops[1], "", InsertBefore);
+    return ExtractElementInst::Create(Ops[0], Ops[1], "");
   case Instruction::ShuffleVector:
-    return new ShuffleVectorInst(Ops[0], Ops[1], getShuffleMask(), "",
-                                 InsertBefore);
+    return new ShuffleVectorInst(Ops[0], Ops[1], getShuffleMask(), "");
 
   case Instruction::GetElementPtr: {
     const auto *GO = cast<GEPOperator>(this);
     if (GO->isInBounds())
-      return GetElementPtrInst::CreateInBounds(
-          GO->getSourceElementType(), Ops[0], Ops.slice(1), "", InsertBefore);
+      return GetElementPtrInst::CreateInBounds(GO->getSourceElementType(),
+                                               Ops[0], Ops.slice(1), "");
     return GetElementPtrInst::Create(GO->getSourceElementType(), Ops[0],
-                                     Ops.slice(1), "", InsertBefore);
+                                     Ops.slice(1), "");
   }
   case Instruction::ICmp:
   case Instruction::FCmp:
     return CmpInst::Create((Instruction::OtherOps)getOpcode(),
                            (CmpInst::Predicate)getPredicate(), Ops[0], Ops[1],
-                           "", InsertBefore);
+                           "");
   default:
     assert(getNumOperands() == 2 && "Must be binary operator?");
     BinaryOperator *BO = BinaryOperator::Create(
-        (Instruction::BinaryOps)getOpcode(), Ops[0], Ops[1], "", InsertBefore);
+        (Instruction::BinaryOps)getOpcode(), Ops[0], Ops[1], "");
     if (isa<OverflowingBinaryOperator>(BO)) {
       BO->setHasNoUnsignedWrap(SubclassOptionalData &
                                OverflowingBinaryOperator::NoUnsignedWrap);
diff --git a/llvm/lib/IR/ConstantsContext.h b/llvm/lib/IR/ConstantsContext.h
index 44a926b5dc58e0..7067d0d121117b 100644
--- a/llvm/lib/IR/ConstantsContext.h
+++ b/llvm/lib/IR/ConstantsContext.h
@@ -183,25 +183,29 @@ class ShuffleVectorConstantExpr final : public ConstantExpr {
 
 /// GetElementPtrConstantExpr - This class is private to Constants.cpp, and is
 /// used behind the scenes to implement getelementptr constant exprs.
-class GetElementPtrConstantExpr final : public ConstantExpr {
+class GetElementPtrConstantExpr : public ConstantExpr {
   Type *SrcElementTy;
   Type *ResElementTy;
+  std::optional<ConstantRange> InRange;
 
   GetElementPtrConstantExpr(Type *SrcElementTy, Constant *C,
-                            ArrayRef<Constant *> IdxList, Type *DestTy);
+                            ArrayRef<Constant *> IdxList, Type *DestTy,
+                            std::optional<ConstantRange> InRange);
 
 public:
-  static GetElementPtrConstantExpr *Create(Type *SrcElementTy, Constant *C,
-                                           ArrayRef<Constant *> IdxList,
-                                           Type *DestTy, unsigned Flags) {
+  static GetElementPtrConstantExpr *
+  Create(Type *SrcElementTy, Constant *C, ArrayRef<Constant *> IdxList,
+         Type *DestTy, unsigned Flags, std::optional<ConstantRange> InRange) {
     GetElementPtrConstantExpr *Result = new (IdxList.size() + 1)
-        GetElementPtrConstantExpr(SrcElementTy, C, IdxList, DestTy);
+        GetElementPtrConstantExpr(SrcElementTy, C, IdxList, DestTy,
+                                  std::move(InRange));
     Result->SubclassOptionalData = Flags;
     return Result;
   }
 
   Type *getSourceElementType() const;
   Type *getResultElementType() const;
+  std::optional<ConstantRange> getInRange() const;
 
   /// Transparently provide more efficient getOperand methods.
   DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value);
@@ -405,6 +409,7 @@ struct ConstantExprKeyType {
   ArrayRef<Constant *> Ops;
   ArrayRef<int> ShuffleMask;
   Type *ExplicitTy;
+  std::optional<ConstantRange> InRange;
 
   static ArrayRef<int> getShuffleMaskIfValid(const ConstantExpr *CE) {
     if (CE->getOpcode() == Instruction::ShuffleVector)
@@ -418,22 +423,31 @@ struct ConstantExprKeyType {
     return nullptr;
   }
 
+  static std::optional<ConstantRange>
+  getInRangeIfValid(const ConstantExpr *CE) {
+    if (auto *GEPCE = dyn_cast<GetElementPtrConstantExpr>(CE))
+      return GEPCE->getInRange();
+    return std::nullopt;
+  }
+
 public:
   ConstantExprKeyType(unsigned Opcode, ArrayRef<Constant *> Ops,
                       unsigned short SubclassData = 0,
                       unsigned short SubclassOptionalData = 0,
                       ArrayRef<int> ShuffleMask = std::nullopt,
-                      Type *ExplicitTy = nullptr)
+                      Type *ExplicitTy = nullptr,
+                      std::optional<ConstantRange> InRange = std::nullopt)
       : Opcode(Opcode), SubclassOptionalData(SubclassOptionalData),
         SubclassData(SubclassData), Ops(Ops), ShuffleMask(ShuffleMask),
-        ExplicitTy(ExplicitTy) {}
+        ExplicitTy(ExplicitTy), InRange(std::move(InRange)) {}
 
   ConstantExprKeyType(ArrayRef<Constant *> Operands, const ConstantExpr *CE)
       : Opcode(CE->getOpcode()),
         SubclassOptionalData(CE->getRawSubclassOptionalData()),
         SubclassData(CE->isCompare() ? CE->getPredicate() : 0), Ops(Operands),
         ShuffleMask(getShuffleMaskIfValid(CE)),
-        ExplicitTy(getSourceElementTypeIfValid(CE)) {}
+        ExplicitTy(getSourceElementTypeIfValid(CE)),
+        InRange(getInRangeIfValid(CE)) {}
 
   ConstantExprKeyType(const ConstantExpr *CE,
                       SmallVectorImpl<Constant *> &Storage)
@@ -441,17 +455,26 @@ struct ConstantExprKeyType {
         SubclassOptionalData(CE->getRawSubclassOptionalData()),
         SubclassData(CE->isCompare() ? CE->getPredicate() : 0),
         ShuffleMask(getShuffleMaskIfValid(CE)),
-        ExplicitTy(getSourceElementTypeIfValid(CE)) {
+        ExplicitTy(getSourceElementTypeIfValid(CE)),
+        InRange(getInRangeIfValid(CE)) {
     assert(Storage.empty() && "Expected empty storage");
     for (unsigned I = 0, E = CE->getNumOperands(); I != E; ++I)
       Storage.push_back(CE->getOperand(I));
     Ops = Storage;
   }
 
+  static bool rangesEqual(const std::optional<ConstantRange> &A,
+                          const std::optional<ConstantRange> &B) {
+    if (!A.has_value() || !B.has_value())
+      return A.has_value() == B.has_value();
+    return A->getBitWidth() == B->getBitWidth() && A == B;
+  }
+
   bool operator==(const ConstantExprKeyType &X) const {
     return Opcode == X.Opcode && SubclassData == X.SubclassData &&
            SubclassOptionalData == X.SubclassOptionalData && Ops == X.Ops &&
-           ShuffleMask == X.ShuffleMask && ExplicitTy == X.ExplicitTy;
+           ShuffleMask == X.ShuffleMask && ExplicitTy == X.ExplicitTy &&
+           rangesEqual(InRange, X.InRange);
   }
 
   bool operator==(const ConstantExpr *CE) const {
@@ -470,6 +493,8 @@ struct ConstantExprKeyType {
       return false;
     if (ExplicitTy != getSourceElementTypeIfValid(CE))
       return false;
+    if (!rangesEqual(InRange, getInRangeIfValid(CE)))
+      return false;
     return true;
   }
 
@@ -499,8 +524,8 @@ struct ConstantExprKeyType {
     case Instruction::ShuffleVector:
       return new ShuffleVectorConstantExpr(Ops[0], Ops[1], ShuffleMask);
     case Instruction::GetElementPtr:
-      return GetElementPtrConstantExpr::Create(ExplicitTy, Ops[0], Ops.slice(1),
-                                               Ty, SubclassOptionalData);
+      return GetElementPtrConstantExpr::Create(
+          ExplicitTy, Ops[0], Ops.slice(1), Ty, SubclassOptionalData, InRange);
     case Instruction::ICmp:
       return new CompareConstantExpr(Ty, Instruction::ICmp, SubclassData,
                                      Ops[0], Ops[1]);
diff --git a/llvm/lib/IR/ConvergenceVerifier.cpp b/llvm/lib/IR/ConvergenceVerifier.cpp
index e73aeaade5f714..6708f85819df7f 100644
--- a/llvm/lib/IR/ConvergenceVerifier.cpp
+++ b/llvm/lib/IR/ConvergenceVerifier.cpp
@@ -75,14 +75,14 @@ GenericConvergenceVerifier<SSAContext>::findAndCheckConvergenceTokenUsed(
 
 template <>
 bool GenericConvergenceVerifier<SSAContext>::isInsideConvergentFunction(
-    const InstructionT &I) {
+    const Instruction &I) {
   auto *F = I.getFunction();
   return F->isConvergent();
 }
 
 template <>
 bool GenericConvergenceVerifier<SSAContext>::isConvergent(
-    const InstructionT &I) {
+    const Instruction &I) {
   if (auto *CB = dyn_cast<CallBase>(&I)) {
     return CB->isConvergent();
   }
diff --git a/llvm/lib/IR/Core.cpp b/llvm/lib/IR/Core.cpp
index 4b804a41a1676f..023cabc46911e5 100644
--- a/llvm/lib/IR/Core.cpp
+++ b/llvm/lib/IR/Core.cpp
@@ -404,6 +404,14 @@ void LLVMAddModuleFlag(LLVMModuleRef M, LLVMModuleFlagBehavior Behavior,
                            {Key, KeyLen}, unwrap(Val));
 }
 
+LLVMBool LLVMIsNewDbgInfoFormat(LLVMModuleRef M) {
+  return unwrap(M)->IsNewDbgInfoFormat;
+}
+
+void LLVMSetIsNewDbgInfoFormat(LLVMModuleRef M, LLVMBool UseNewFormat) {
+  unwrap(M)->setIsNewDbgInfoFormat(UseNewFormat);
+}
+
 /*--.. Printing modules ....................................................--*/
 
 void LLVMDumpModule(LLVMModuleRef M) {
@@ -2414,6 +2422,38 @@ void LLVMSetGC(LLVMValueRef Fn, const char *GC) {
     F->clearGC();
 }
 
+LLVMValueRef LLVMGetPrefixData(LLVMValueRef Fn) {
+  Function *F = unwrap<Function>(Fn);
+  return wrap(F->getPrefixData());
+}
+
+LLVMBool LLVMHasPrefixData(LLVMValueRef Fn) {
+  Function *F = unwrap<Function>(Fn);
+  return F->hasPrefixData();
+}
+
+void LLVMSetPrefixData(LLVMValueRef Fn, LLVMValueRef prefixData) {
+  Function *F = unwrap<Function>(Fn);
+  Constant *prefix = unwrap<Constant>(prefixData);
+  F->setPrefixData(prefix);
+}
+
+LLVMValueRef LLVMGetPrologueData(LLVMValueRef Fn) {
+  Function *F = unwrap<Function>(Fn);
+  return wrap(F->getPrologueData());
+}
+
+LLVMBool LLVMHasPrologueData(LLVMValueRef Fn) {
+  Function *F = unwrap<Function>(Fn);
+  return F->hasPrologueData();
+}
+
+void LLVMSetPrologueData(LLVMValueRef Fn, LLVMValueRef prologueData) {
+  Function *F = unwrap<Function>(Fn);
+  Constant *prologue = unwrap<Constant>(prologueData);
+  F->setPrologueData(prologue);
+}
+
 void LLVMAddAttributeAtIndex(LLVMValueRef F, LLVMAttributeIndex Idx,
                              LLVMAttributeRef A) {
   unwrap<Function>(F)->addAttributeAtIndex(Idx, unwrap(A));
diff --git a/llvm/lib/IR/DIBuilder.cpp b/llvm/lib/IR/DIBuilder.cpp
index c673abd8bc30d0..f10b5acb980d6e 100644
--- a/llvm/lib/IR/DIBuilder.cpp
+++ b/llvm/lib/IR/DIBuilder.cpp
@@ -296,7 +296,20 @@ DIStringType *DIBuilder::createStringType(StringRef Name,
 
 DIDerivedType *DIBuilder::createQualifiedType(unsigned Tag, DIType *FromTy) {
   return DIDerivedType::get(VMContext, Tag, "", nullptr, 0, nullptr, FromTy, 0,
-                            0, 0, std::nullopt, DINode::FlagZero);
+                            0, 0, std::nullopt, std::nullopt, DINode::FlagZero);
+}
+
+DIDerivedType *DIBuilder::createPtrAuthQualifiedType(
+    DIType *FromTy, unsigned Key, bool IsAddressDiscriminated,
+    unsigned ExtraDiscriminator, bool IsaPointer,
+    bool AuthenticatesNullValues) {
+  return DIDerivedType::get(VMContext, dwarf::DW_TAG_LLVM_ptrauth_type, "",
+                            nullptr, 0, nullptr, FromTy, 0, 0, 0, std::nullopt,
+                            std::optional<DIDerivedType::PtrAuthData>(
+                                std::in_place, Key, IsAddressDiscriminated,
+                                ExtraDiscriminator, IsaPointer,
+                                AuthenticatesNullValues),
+                            DINode::FlagZero);
 }
 
 DIDerivedType *
@@ -307,8 +320,8 @@ DIBuilder::createPointerType(DIType *PointeeTy, uint64_t SizeInBits,
   // FIXME: Why is there a name here?
   return DIDerivedType::get(VMContext, dwarf::DW_TAG_pointer_type, Name,
                             nullptr, 0, nullptr, PointeeTy, SizeInBits,
-                            AlignInBits, 0, DWARFAddressSpace, DINode::FlagZero,
-                            nullptr, Annotations);
+                            AlignInBits, 0, DWARFAddressSpace, std::nullopt,
+                            DINode::FlagZero, nullptr, Annotations);
 }
 
 DIDerivedType *DIBuilder::createMemberPointerType(DIType *PointeeTy,
@@ -318,7 +331,8 @@ DIDerivedType *DIBuilder::createMemberPointerType(DIType *PointeeTy,
                                                   DINode::DIFlags Flags) {
   return DIDerivedType::get(VMContext, dwarf::DW_TAG_ptr_to_member_type, "",
                             nullptr, 0, nullptr, PointeeTy, SizeInBits,
-                            AlignInBits, 0, std::nullopt, Flags, Base);
+                            AlignInBits, 0, std::nullopt, std::nullopt, Flags,
+                            Base);
 }
 
 DIDerivedType *
@@ -327,7 +341,7 @@ DIBuilder::createReferenceType(unsigned Tag, DIType *RTy, uint64_t SizeInBits,
                                std::optional<unsigned> DWARFAddressSpace) {
   assert(RTy && "Unable to create reference type");
   return DIDerivedType::get(VMContext, Tag, "", nullptr, 0, nullptr, RTy,
-                            SizeInBits, AlignInBits, 0, DWARFAddressSpace,
+                            SizeInBits, AlignInBits, 0, DWARFAddressSpace, {},
                             DINode::FlagZero);
 }
 
@@ -338,15 +352,16 @@ DIDerivedType *DIBuilder::createTypedef(DIType *Ty, StringRef Name,
                                         DINodeArray Annotations) {
   return DIDerivedType::get(VMContext, dwarf::DW_TAG_typedef, Name, File,
                             LineNo, getNonCompileUnitScope(Context), Ty, 0,
-                            AlignInBits, 0, std::nullopt, Flags, nullptr,
-                            Annotations);
+                            AlignInBits, 0, std::nullopt, std::nullopt, Flags,
+                            nullptr, Annotations);
 }
 
 DIDerivedType *DIBuilder::createFriend(DIType *Ty, DIType *FriendTy) {
   assert(Ty && "Invalid type!");
   assert(FriendTy && "Invalid friend type!");
   return DIDerivedType::get(VMContext, dwarf::DW_TAG_friend, "", nullptr, 0, Ty,
-                            FriendTy, 0, 0, 0, std::nullopt, DINode::FlagZero);
+                            FriendTy, 0, 0, 0, std::nullopt, std::nullopt,
+                            DINode::FlagZero);
 }
 
 DIDerivedType *DIBuilder::createInheritance(DIType *Ty, DIType *BaseTy,
@@ -358,7 +373,7 @@ DIDerivedType *DIBuilder::createInheritance(DIType *Ty, DIType *BaseTy,
       ConstantInt::get(IntegerType::get(VMContext, 32), VBPtrOffset));
   return DIDerivedType::get(VMContext, dwarf::DW_TAG_inheritance, "", nullptr,
                             0, Ty, BaseTy, 0, 0, BaseOffset, std::nullopt,
-                            Flags, ExtraData);
+                            std::nullopt, Flags, ExtraData);
 }
 
 DIDerivedType *DIBuilder::createMemberType(
@@ -368,7 +383,7 @@ DIDerivedType *DIBuilder::createMemberType(
   return DIDerivedType::get(VMContext, dwarf::DW_TAG_member, Name, File,
                             LineNumber, getNonCompileUnitScope(Scope), Ty,
                             SizeInBits, AlignInBits, OffsetInBits, std::nullopt,
-                            Flags, nullptr, Annotations);
+                            std::nullopt, Flags, nullptr, Annotations);
 }
 
 static ConstantAsMetadata *getConstantOrNull(Constant *C) {
@@ -381,10 +396,10 @@ DIDerivedType *DIBuilder::createVariantMemberType(
     DIScope *Scope, StringRef Name, DIFile *File, unsigned LineNumber,
     uint64_t SizeInBits, uint32_t AlignInBits, uint64_t OffsetInBits,
     Constant *Discriminant, DINode::DIFlags Flags, DIType *Ty) {
-  return DIDerivedType::get(VMContext, dwarf::DW_TAG_member, Name, File,
-                            LineNumber, getNonCompileUnitScope(Scope), Ty,
-                            SizeInBits, AlignInBits, OffsetInBits, std::nullopt,
-                            Flags, getConstantOrNull(Discriminant));
+  return DIDerivedType::get(
+      VMContext, dwarf::DW_TAG_member, Name, File, LineNumber,
+      getNonCompileUnitScope(Scope), Ty, SizeInBits, AlignInBits, OffsetInBits,
+      std::nullopt, std::nullopt, Flags, getConstantOrNull(Discriminant));
 }
 
 DIDerivedType *DIBuilder::createBitFieldMemberType(
@@ -395,7 +410,7 @@ DIDerivedType *DIBuilder::createBitFieldMemberType(
   return DIDerivedType::get(
       VMContext, dwarf::DW_TAG_member, Name, File, LineNumber,
       getNonCompileUnitScope(Scope), Ty, SizeInBits, /*AlignInBits=*/0,
-      OffsetInBits, std::nullopt, Flags,
+      OffsetInBits, std::nullopt, std::nullopt, Flags,
       ConstantAsMetadata::get(ConstantInt::get(IntegerType::get(VMContext, 64),
                                                StorageOffsetInBits)),
       Annotations);
@@ -409,7 +424,8 @@ DIBuilder::createStaticMemberType(DIScope *Scope, StringRef Name, DIFile *File,
   Flags |= DINode::FlagStaticMember;
   return DIDerivedType::get(VMContext, Tag, Name, File, LineNumber,
                             getNonCompileUnitScope(Scope), Ty, 0, AlignInBits,
-                            0, std::nullopt, Flags, getConstantOrNull(Val));
+                            0, std::nullopt, std::nullopt, Flags,
+                            getConstantOrNull(Val));
 }
 
 DIDerivedType *
@@ -420,7 +436,7 @@ DIBuilder::createObjCIVar(StringRef Name, DIFile *File, unsigned LineNumber,
   return DIDerivedType::get(VMContext, dwarf::DW_TAG_member, Name, File,
                             LineNumber, getNonCompileUnitScope(File), Ty,
                             SizeInBits, AlignInBits, OffsetInBits, std::nullopt,
-                            Flags, PropertyNode);
+                            std::nullopt, Flags, PropertyNode);
 }
 
 DIObjCProperty *
@@ -555,10 +571,10 @@ DIDerivedType *DIBuilder::createSetType(DIScope *Scope, StringRef Name,
                                         DIFile *File, unsigned LineNo,
                                         uint64_t SizeInBits,
                                         uint32_t AlignInBits, DIType *Ty) {
-  auto *R =
-      DIDerivedType::get(VMContext, dwarf::DW_TAG_set_type, Name, File, LineNo,
-                         getNonCompileUnitScope(Scope), Ty, SizeInBits,
-                         AlignInBits, 0, std::nullopt, DINode::FlagZero);
+  auto *R = DIDerivedType::get(VMContext, dwarf::DW_TAG_set_type, Name, File,
+                               LineNo, getNonCompileUnitScope(Scope), Ty,
+                               SizeInBits, AlignInBits, 0, std::nullopt,
+                               std::nullopt, DINode::FlagZero);
   trackIfUnresolved(R);
   return R;
 }
@@ -951,14 +967,14 @@ DbgInstPtr DIBuilder::insertDbgAssign(Instruction *LinkedInstr, Value *Val,
   assert(Link && "Linked instruction must have DIAssign metadata attached");
 
   if (M.IsNewDbgInfoFormat) {
-    DPValue *DPV = DPValue::createDPVAssign(Val, SrcVar, ValExpr, Link, Addr,
-                                            AddrExpr, DL);
+    DbgVariableRecord *DVR = DbgVariableRecord::createDVRAssign(
+        Val, SrcVar, ValExpr, Link, Addr, AddrExpr, DL);
     BasicBlock *InsertBB = LinkedInstr->getParent();
     // Insert after LinkedInstr.
     BasicBlock::iterator NextIt = std::next(LinkedInstr->getIterator());
     Instruction *InsertBefore = NextIt == InsertBB->end() ? nullptr : &*NextIt;
-    insertDPValue(DPV, InsertBB, InsertBefore, true);
-    return DPV;
+    insertDbgVariableRecord(DVR, InsertBB, InsertBefore, true);
+    return DVR;
   }
 
   LLVMContext &Ctx = LinkedInstr->getContext();
@@ -1040,9 +1056,10 @@ DbgInstPtr DIBuilder::insertDbgValueIntrinsic(
     llvm::Value *Val, DILocalVariable *VarInfo, DIExpression *Expr,
     const DILocation *DL, BasicBlock *InsertBB, Instruction *InsertBefore) {
   if (M.IsNewDbgInfoFormat) {
-    DPValue *DPV = DPValue::createDPValue(Val, VarInfo, Expr, DL);
-    insertDPValue(DPV, InsertBB, InsertBefore);
-    return DPV;
+    DbgVariableRecord *DVR =
+        DbgVariableRecord::createDbgVariableRecord(Val, VarInfo, Expr, DL);
+    insertDbgVariableRecord(DVR, InsertBB, InsertBefore);
+    return DVR;
   }
 
   if (!ValueFn)
@@ -1062,9 +1079,10 @@ DbgInstPtr DIBuilder::insertDeclare(Value *Storage, DILocalVariable *VarInfo,
          "Expected matching subprograms");
 
   if (M.IsNewDbgInfoFormat) {
-    DPValue *DPV = DPValue::createDPVDeclare(Storage, VarInfo, Expr, DL);
-    insertDPValue(DPV, InsertBB, InsertBefore);
-    return DPV;
+    DbgVariableRecord *DVR =
+        DbgVariableRecord::createDVRDeclare(Storage, VarInfo, Expr, DL);
+    insertDbgVariableRecord(DVR, InsertBB, InsertBefore);
+    return DVR;
   }
 
   if (!DeclareFn)
@@ -1081,13 +1099,15 @@ DbgInstPtr DIBuilder::insertDeclare(Value *Storage, DILocalVariable *VarInfo,
   return B.CreateCall(DeclareFn, Args);
 }
 
-void DIBuilder::insertDPValue(DPValue *DPV, BasicBlock *InsertBB,
-                              Instruction *InsertBefore, bool InsertAtHead) {
+void DIBuilder::insertDbgVariableRecord(DbgVariableRecord *DVR,
+                                        BasicBlock *InsertBB,
+                                        Instruction *InsertBefore,
+                                        bool InsertAtHead) {
   assert(InsertBefore || InsertBB);
-  trackIfUnresolved(DPV->getVariable());
-  trackIfUnresolved(DPV->getExpression());
-  if (DPV->isDbgAssign())
-    trackIfUnresolved(DPV->getAddressExpression());
+  trackIfUnresolved(DVR->getVariable());
+  trackIfUnresolved(DVR->getExpression());
+  if (DVR->isDbgAssign())
+    trackIfUnresolved(DVR->getAddressExpression());
 
   BasicBlock::iterator InsertPt;
   if (InsertBB && InsertBefore)
@@ -1095,7 +1115,7 @@ void DIBuilder::insertDPValue(DPValue *DPV, BasicBlock *InsertBB,
   else if (InsertBB)
     InsertPt = InsertBB->end();
   InsertPt.setHeadBit(InsertAtHead);
-  InsertBB->insertDbgRecordBefore(DPV, InsertPt);
+  InsertBB->insertDbgRecordBefore(DVR, InsertPt);
 }
 
 Instruction *DIBuilder::insertDbgIntrinsic(llvm::Function *IntrinsicFn,
diff --git a/llvm/lib/IR/DebugInfo.cpp b/llvm/lib/IR/DebugInfo.cpp
index 8587f17148d286..09bce9df1f3328 100644
--- a/llvm/lib/IR/DebugInfo.cpp
+++ b/llvm/lib/IR/DebugInfo.cpp
@@ -63,7 +63,7 @@ TinyPtrVector<DbgDeclareInst *> llvm::findDbgDeclares(Value *V) {
 
   return Declares;
 }
-TinyPtrVector<DPValue *> llvm::findDPVDeclares(Value *V) {
+TinyPtrVector<DbgVariableRecord *> llvm::findDVRDeclares(Value *V) {
   // This function is hot. Check whether the value has any metadata to avoid a
   // DenseMap lookup.
   if (!V->isUsedByMetadata())
@@ -72,18 +72,19 @@ TinyPtrVector<DPValue *> llvm::findDPVDeclares(Value *V) {
   if (!L)
     return {};
 
-  TinyPtrVector<DPValue *> Declares;
-  for (DPValue *DPV : L->getAllDPValueUsers())
-    if (DPV->getType() == DPValue::LocationType::Declare)
-      Declares.push_back(DPV);
+  TinyPtrVector<DbgVariableRecord *> Declares;
+  for (DbgVariableRecord *DVR : L->getAllDbgVariableRecordUsers())
+    if (DVR->getType() == DbgVariableRecord::LocationType::Declare)
+      Declares.push_back(DVR);
 
   return Declares;
 }
 
-template <typename IntrinsicT,
-          DPValue::LocationType Type = DPValue::LocationType::Any>
-static void findDbgIntrinsics(SmallVectorImpl<IntrinsicT *> &Result, Value *V,
-                              SmallVectorImpl<DPValue *> *DPValues) {
+template <typename IntrinsicT, DbgVariableRecord::LocationType Type =
+                                   DbgVariableRecord::LocationType::Any>
+static void
+findDbgIntrinsics(SmallVectorImpl<IntrinsicT *> &Result, Value *V,
+                  SmallVectorImpl<DbgVariableRecord *> *DbgVariableRecords) {
   // This function is hot. Check whether the value has any metadata to avoid a
   // DenseMap lookup.
   if (!V->isUsedByMetadata())
@@ -96,25 +97,27 @@ static void findDbgIntrinsics(SmallVectorImpl<IntrinsicT *> &Result, Value *V,
   // V will also appear twice in a dbg.assign if its used in the both the value
   // and address components.
   SmallPtrSet<IntrinsicT *, 4> EncounteredIntrinsics;
-  SmallPtrSet<DPValue *, 4> EncounteredDPValues;
+  SmallPtrSet<DbgVariableRecord *, 4> EncounteredDbgVariableRecords;
 
   /// Append IntrinsicT users of MetadataAsValue(MD).
-  auto AppendUsers = [&Ctx, &EncounteredIntrinsics, &EncounteredDPValues,
-                      &Result, DPValues](Metadata *MD) {
+  auto AppendUsers = [&Ctx, &EncounteredIntrinsics,
+                      &EncounteredDbgVariableRecords, &Result,
+                      DbgVariableRecords](Metadata *MD) {
     if (auto *MDV = MetadataAsValue::getIfExists(Ctx, MD)) {
       for (User *U : MDV->users())
         if (IntrinsicT *DVI = dyn_cast<IntrinsicT>(U))
           if (EncounteredIntrinsics.insert(DVI).second)
             Result.push_back(DVI);
     }
-    if (!DPValues)
+    if (!DbgVariableRecords)
       return;
-    // Get DPValues that use this as a single value.
+    // Get DbgVariableRecords that use this as a single value.
     if (LocalAsMetadata *L = dyn_cast<LocalAsMetadata>(MD)) {
-      for (DPValue *DPV : L->getAllDPValueUsers()) {
-        if (Type == DPValue::LocationType::Any || DPV->getType() == Type)
-          if (EncounteredDPValues.insert(DPV).second)
-            DPValues->push_back(DPV);
+      for (DbgVariableRecord *DVR : L->getAllDbgVariableRecordUsers()) {
+        if (Type == DbgVariableRecord::LocationType::Any ||
+            DVR->getType() == Type)
+          if (EncounteredDbgVariableRecords.insert(DVR).second)
+            DbgVariableRecords->push_back(DVR);
       }
     }
   };
@@ -123,27 +126,30 @@ static void findDbgIntrinsics(SmallVectorImpl<IntrinsicT *> &Result, Value *V,
     AppendUsers(L);
     for (Metadata *AL : L->getAllArgListUsers()) {
       AppendUsers(AL);
-      if (!DPValues)
+      if (!DbgVariableRecords)
         continue;
       DIArgList *DI = cast<DIArgList>(AL);
-      for (DPValue *DPV : DI->getAllDPValueUsers())
-        if (Type == DPValue::LocationType::Any || DPV->getType() == Type)
-          if (EncounteredDPValues.insert(DPV).second)
-            DPValues->push_back(DPV);
+      for (DbgVariableRecord *DVR : DI->getAllDbgVariableRecordUsers())
+        if (Type == DbgVariableRecord::LocationType::Any ||
+            DVR->getType() == Type)
+          if (EncounteredDbgVariableRecords.insert(DVR).second)
+            DbgVariableRecords->push_back(DVR);
     }
   }
 }
 
-void llvm::findDbgValues(SmallVectorImpl<DbgValueInst *> &DbgValues,
-                         Value *V, SmallVectorImpl<DPValue *> *DPValues) {
-  findDbgIntrinsics<DbgValueInst, DPValue::LocationType::Value>(DbgValues, V,
-                                                                DPValues);
+void llvm::findDbgValues(
+    SmallVectorImpl<DbgValueInst *> &DbgValues, Value *V,
+    SmallVectorImpl<DbgVariableRecord *> *DbgVariableRecords) {
+  findDbgIntrinsics<DbgValueInst, DbgVariableRecord::LocationType::Value>(
+      DbgValues, V, DbgVariableRecords);
 }
 
-void llvm::findDbgUsers(SmallVectorImpl<DbgVariableIntrinsic *> &DbgUsers,
-                        Value *V, SmallVectorImpl<DPValue *> *DPValues) {
-  findDbgIntrinsics<DbgVariableIntrinsic, DPValue::LocationType::Any>(
-      DbgUsers, V, DPValues);
+void llvm::findDbgUsers(
+    SmallVectorImpl<DbgVariableIntrinsic *> &DbgUsers, Value *V,
+    SmallVectorImpl<DbgVariableRecord *> *DbgVariableRecords) {
+  findDbgIntrinsics<DbgVariableIntrinsic, DbgVariableRecord::LocationType::Any>(
+      DbgUsers, V, DbgVariableRecords);
 }
 
 DISubprogram *llvm::getDISubprogram(const MDNode *Scope) {
@@ -164,16 +170,16 @@ DebugLoc llvm::getDebugValueLoc(DbgVariableIntrinsic *DII) {
   return DILocation::get(DII->getContext(), 0, 0, Scope, InlinedAt);
 }
 
-DebugLoc llvm::getDebugValueLoc(DPValue *DPV) {
+DebugLoc llvm::getDebugValueLoc(DbgVariableRecord *DVR) {
   // Original dbg.declare must have a location.
-  const DebugLoc &DeclareLoc = DPV->getDebugLoc();
+  const DebugLoc &DeclareLoc = DVR->getDebugLoc();
   MDNode *Scope = DeclareLoc.getScope();
   DILocation *InlinedAt = DeclareLoc.getInlinedAt();
   // Because no machine insts can come from debug intrinsics, only the scope
   // and inlinedAt is significant. Zero line numbers are used in case this
   // DebugLoc leaks into any adjacent instructions. Produce an unknown location
   // with the correct scope / inlinedAt fields.
-  return DILocation::get(DPV->getContext(), 0, 0, Scope, InlinedAt);
+  return DILocation::get(DVR->getContext(), 0, 0, Scope, InlinedAt);
 }
 
 //===----------------------------------------------------------------------===//
@@ -253,8 +259,8 @@ void DebugInfoFinder::processLocation(const Module &M, const DILocation *Loc) {
 }
 
 void DebugInfoFinder::processDbgRecord(const Module &M, const DbgRecord &DR) {
-  if (const DPValue *DPV = dyn_cast<const DPValue>(&DR))
-    processVariable(M, DPV->getVariable());
+  if (const DbgVariableRecord *DVR = dyn_cast<const DbgVariableRecord>(&DR))
+    processVariable(M, DVR->getVariable());
   processLocation(M, DR.getDebugLoc().get());
 }
 
@@ -1336,9 +1342,9 @@ LLVMMetadataRef LLVMDIBuilderCreatePointerType(
     LLVMDIBuilderRef Builder, LLVMMetadataRef PointeeTy,
     uint64_t SizeInBits, uint32_t AlignInBits, unsigned AddressSpace,
     const char *Name, size_t NameLen) {
-  return wrap(unwrap(Builder)->createPointerType(unwrapDI<DIType>(PointeeTy),
-                                         SizeInBits, AlignInBits,
-                                         AddressSpace, {Name, NameLen}));
+  return wrap(unwrap(Builder)->createPointerType(
+      unwrapDI<DIType>(PointeeTy), SizeInBits, AlignInBits, AddressSpace,
+      {Name, NameLen}));
 }
 
 LLVMMetadataRef LLVMDIBuilderCreateStructType(
@@ -1663,6 +1669,12 @@ LLVMValueRef
 LLVMDIBuilderInsertDeclareBefore(LLVMDIBuilderRef Builder, LLVMValueRef Storage,
                                  LLVMMetadataRef VarInfo, LLVMMetadataRef Expr,
                                  LLVMMetadataRef DL, LLVMValueRef Instr) {
+  return LLVMDIBuilderInsertDeclareIntrinsicBefore(Builder, Storage, VarInfo,
+                                                   Expr, DL, Instr);
+}
+LLVMValueRef LLVMDIBuilderInsertDeclareIntrinsicBefore(
+    LLVMDIBuilderRef Builder, LLVMValueRef Storage, LLVMMetadataRef VarInfo,
+    LLVMMetadataRef Expr, LLVMMetadataRef DL, LLVMValueRef Instr) {
   DbgInstPtr DbgInst = unwrap(Builder)->insertDeclare(
       unwrap(Storage), unwrap<DILocalVariable>(VarInfo),
       unwrap<DIExpression>(Expr), unwrap<DILocation>(DL),
@@ -1671,11 +1683,27 @@ LLVMDIBuilderInsertDeclareBefore(LLVMDIBuilderRef Builder, LLVMValueRef Storage,
          "Inserted a DbgRecord into function using old debug info mode");
   return wrap(cast<Instruction *>(DbgInst));
 }
+LLVMDbgRecordRef LLVMDIBuilderInsertDeclareRecordBefore(
+    LLVMDIBuilderRef Builder, LLVMValueRef Storage, LLVMMetadataRef VarInfo,
+    LLVMMetadataRef Expr, LLVMMetadataRef DL, LLVMValueRef Instr) {
+  return wrap(
+      unwrap(Builder)
+          ->insertDeclare(unwrap(Storage), unwrap<DILocalVariable>(VarInfo),
+                          unwrap<DIExpression>(Expr), unwrap<DILocation>(DL),
+                          unwrap<Instruction>(Instr))
+          .get<DbgRecord *>());
+}
 
 LLVMValueRef
 LLVMDIBuilderInsertDeclareAtEnd(LLVMDIBuilderRef Builder, LLVMValueRef Storage,
                                 LLVMMetadataRef VarInfo, LLVMMetadataRef Expr,
                                 LLVMMetadataRef DL, LLVMBasicBlockRef Block) {
+  return LLVMDIBuilderInsertDeclareIntrinsicAtEnd(Builder, Storage, VarInfo,
+                                                  Expr, DL, Block);
+}
+LLVMValueRef LLVMDIBuilderInsertDeclareIntrinsicAtEnd(
+    LLVMDIBuilderRef Builder, LLVMValueRef Storage, LLVMMetadataRef VarInfo,
+    LLVMMetadataRef Expr, LLVMMetadataRef DL, LLVMBasicBlockRef Block) {
   DbgInstPtr DbgInst = unwrap(Builder)->insertDeclare(
       unwrap(Storage), unwrap<DILocalVariable>(VarInfo),
       unwrap<DIExpression>(Expr), unwrap<DILocation>(DL), unwrap(Block));
@@ -1683,10 +1711,26 @@ LLVMDIBuilderInsertDeclareAtEnd(LLVMDIBuilderRef Builder, LLVMValueRef Storage,
          "Inserted a DbgRecord into function using old debug info mode");
   return wrap(cast<Instruction *>(DbgInst));
 }
+LLVMDbgRecordRef LLVMDIBuilderInsertDeclareRecordAtEnd(
+    LLVMDIBuilderRef Builder, LLVMValueRef Storage, LLVMMetadataRef VarInfo,
+    LLVMMetadataRef Expr, LLVMMetadataRef DL, LLVMBasicBlockRef Block) {
+  return wrap(unwrap(Builder)
+                  ->insertDeclare(unwrap(Storage),
+                                  unwrap<DILocalVariable>(VarInfo),
+                                  unwrap<DIExpression>(Expr),
+                                  unwrap<DILocation>(DL), unwrap(Block))
+                  .get<DbgRecord *>());
+}
 
 LLVMValueRef LLVMDIBuilderInsertDbgValueBefore(
     LLVMDIBuilderRef Builder, LLVMValueRef Val, LLVMMetadataRef VarInfo,
     LLVMMetadataRef Expr, LLVMMetadataRef DebugLoc, LLVMValueRef Instr) {
+  return LLVMDIBuilderInsertDbgValueIntrinsicBefore(Builder, Val, VarInfo, Expr,
+                                                    DebugLoc, Instr);
+}
+LLVMValueRef LLVMDIBuilderInsertDbgValueIntrinsicBefore(
+    LLVMDIBuilderRef Builder, LLVMValueRef Val, LLVMMetadataRef VarInfo,
+    LLVMMetadataRef Expr, LLVMMetadataRef DebugLoc, LLVMValueRef Instr) {
   DbgInstPtr DbgInst = unwrap(Builder)->insertDbgValueIntrinsic(
       unwrap(Val), unwrap<DILocalVariable>(VarInfo), unwrap<DIExpression>(Expr),
       unwrap<DILocation>(DebugLoc), unwrap<Instruction>(Instr));
@@ -1694,10 +1738,26 @@ LLVMValueRef LLVMDIBuilderInsertDbgValueBefore(
          "Inserted a DbgRecord into function using old debug info mode");
   return wrap(cast<Instruction *>(DbgInst));
 }
+LLVMDbgRecordRef LLVMDIBuilderInsertDbgValueRecordBefore(
+    LLVMDIBuilderRef Builder, LLVMValueRef Val, LLVMMetadataRef VarInfo,
+    LLVMMetadataRef Expr, LLVMMetadataRef DebugLoc, LLVMValueRef Instr) {
+  return wrap(unwrap(Builder)
+                  ->insertDbgValueIntrinsic(
+                      unwrap(Val), unwrap<DILocalVariable>(VarInfo),
+                      unwrap<DIExpression>(Expr), unwrap<DILocation>(DebugLoc),
+                      unwrap<Instruction>(Instr))
+                  .get<DbgRecord *>());
+}
 
 LLVMValueRef LLVMDIBuilderInsertDbgValueAtEnd(
     LLVMDIBuilderRef Builder, LLVMValueRef Val, LLVMMetadataRef VarInfo,
     LLVMMetadataRef Expr, LLVMMetadataRef DebugLoc, LLVMBasicBlockRef Block) {
+  return LLVMDIBuilderInsertDbgValueIntrinsicAtEnd(Builder, Val, VarInfo, Expr,
+                                                   DebugLoc, Block);
+}
+LLVMValueRef LLVMDIBuilderInsertDbgValueIntrinsicAtEnd(
+    LLVMDIBuilderRef Builder, LLVMValueRef Val, LLVMMetadataRef VarInfo,
+    LLVMMetadataRef Expr, LLVMMetadataRef DebugLoc, LLVMBasicBlockRef Block) {
   DbgInstPtr DbgInst = unwrap(Builder)->insertDbgValueIntrinsic(
       unwrap(Val), unwrap<DILocalVariable>(VarInfo), unwrap<DIExpression>(Expr),
       unwrap<DILocation>(DebugLoc), unwrap(Block));
@@ -1705,6 +1765,16 @@ LLVMValueRef LLVMDIBuilderInsertDbgValueAtEnd(
          "Inserted a DbgRecord into function using old debug info mode");
   return wrap(cast<Instruction *>(DbgInst));
 }
+LLVMDbgRecordRef LLVMDIBuilderInsertDbgValueRecordAtEnd(
+    LLVMDIBuilderRef Builder, LLVMValueRef Val, LLVMMetadataRef VarInfo,
+    LLVMMetadataRef Expr, LLVMMetadataRef DebugLoc, LLVMBasicBlockRef Block) {
+  return wrap(unwrap(Builder)
+                  ->insertDbgValueIntrinsic(
+                      unwrap(Val), unwrap<DILocalVariable>(VarInfo),
+                      unwrap<DIExpression>(Expr), unwrap<DILocation>(DebugLoc),
+                      unwrap(Block))
+                  .get<DbgRecord *>());
+}
 
 LLVMMetadataRef LLVMDIBuilderCreateAutoVariable(
     LLVMDIBuilderRef Builder, LLVMMetadataRef Scope, const char *Name,
@@ -1800,14 +1870,14 @@ AssignmentMarkerRange at::getAssignmentMarkers(DIAssignID *ID) {
 
 void at::deleteAssignmentMarkers(const Instruction *Inst) {
   auto Range = getAssignmentMarkers(Inst);
-  SmallVector<DPValue *> DPVAssigns = getDPVAssignmentMarkers(Inst);
-  if (Range.empty() && DPVAssigns.empty())
+  SmallVector<DbgVariableRecord *> DVRAssigns = getDVRAssignmentMarkers(Inst);
+  if (Range.empty() && DVRAssigns.empty())
     return;
   SmallVector<DbgAssignIntrinsic *> ToDelete(Range.begin(), Range.end());
   for (auto *DAI : ToDelete)
     DAI->eraseFromParent();
-  for (auto *DPV : DPVAssigns)
-    DPV->eraseFromParent();
+  for (auto *DVR : DVRAssigns)
+    DVR->eraseFromParent();
 }
 
 void at::RAUW(DIAssignID *Old, DIAssignID *New) {
@@ -1825,12 +1895,12 @@ void at::RAUW(DIAssignID *Old, DIAssignID *New) {
 
 void at::deleteAll(Function *F) {
   SmallVector<DbgAssignIntrinsic *, 12> ToDelete;
-  SmallVector<DPValue *, 12> DPToDelete;
+  SmallVector<DbgVariableRecord *, 12> DPToDelete;
   for (BasicBlock &BB : *F) {
     for (Instruction &I : BB) {
-      for (DPValue &DPV : filterDbgVars(I.getDbgRecordRange()))
-        if (DPV.isDbgAssign())
-          DPToDelete.push_back(&DPV);
+      for (DbgVariableRecord &DVR : filterDbgVars(I.getDbgRecordRange()))
+        if (DVR.isDbgAssign())
+          DPToDelete.push_back(&DVR);
       if (auto *DAI = dyn_cast<DbgAssignIntrinsic>(&I))
         ToDelete.push_back(DAI);
       else
@@ -1839,20 +1909,20 @@ void at::deleteAll(Function *F) {
   }
   for (auto *DAI : ToDelete)
     DAI->eraseFromParent();
-  for (auto *DPV : DPToDelete)
-    DPV->eraseFromParent();
+  for (auto *DVR : DPToDelete)
+    DVR->eraseFromParent();
 }
 
 /// Get the FragmentInfo for the variable if it exists, otherwise return a
 /// FragmentInfo that covers the entire variable if the variable size is
 /// known, otherwise return a zero-sized fragment.
 static DIExpression::FragmentInfo
-getFragmentOrEntireVariable(const DPValue *DPV) {
+getFragmentOrEntireVariable(const DbgVariableRecord *DVR) {
   DIExpression::FragmentInfo VariableSlice(0, 0);
   // Get the fragment or variable size, or zero.
-  if (auto Sz = DPV->getFragmentSizeInBits())
+  if (auto Sz = DVR->getFragmentSizeInBits())
     VariableSlice.SizeInBits = *Sz;
-  if (auto Frag = DPV->getExpression()->getFragmentInfo())
+  if (auto Frag = DVR->getExpression()->getFragmentInfo())
     VariableSlice.OffsetInBits = Frag->OffsetInBits;
   return VariableSlice;
 }
@@ -2016,10 +2086,10 @@ bool at::calculateFragmentIntersect(
 }
 bool at::calculateFragmentIntersect(
     const DataLayout &DL, const Value *Dest, uint64_t SliceOffsetInBits,
-    uint64_t SliceSizeInBits, const DPValue *DPVAssign,
+    uint64_t SliceSizeInBits, const DbgVariableRecord *DVRAssign,
     std::optional<DIExpression::FragmentInfo> &Result) {
   return calculateFragmentIntersectImpl(DL, Dest, SliceOffsetInBits,
-                                        SliceSizeInBits, DPVAssign, Result);
+                                        SliceSizeInBits, DVRAssign, Result);
 }
 
 /// Collect constant properies (base, size, offset) of \p StoreDest.
@@ -2113,7 +2183,7 @@ static void emitDbgAssign(AssignmentInfo Info, Value *Val, Value *Dest,
   DIExpression *AddrExpr =
       DIExpression::get(StoreLikeInst.getContext(), std::nullopt);
   if (StoreLikeInst.getParent()->IsNewDbgInfoFormat) {
-    auto *Assign = DPValue::createLinkedDPVAssign(
+    auto *Assign = DbgVariableRecord::createLinkedDVRAssign(
         &StoreLikeInst, Val, VarRec.Var, Expr, Dest, AddrExpr, VarRec.DL);
     (void)Assign;
     LLVM_DEBUG(if (Assign) errs() << " > INSERT: " << *Assign << "\n");
@@ -2231,7 +2301,7 @@ bool AssignmentTrackingPass::runOnFunction(Function &F) {
   // storage" is limited to Allocas). We'll use this to find dbg.declares to
   // delete after running `trackAssignments`.
   DenseMap<const AllocaInst *, SmallPtrSet<DbgDeclareInst *, 2>> DbgDeclares;
-  DenseMap<const AllocaInst *, SmallPtrSet<DPValue *, 2>> DPVDeclares;
+  DenseMap<const AllocaInst *, SmallPtrSet<DbgVariableRecord *, 2>> DVRDeclares;
   // Create another similar map of {storage : variables} that we'll pass to
   // trackAssignments.
   StorageToVarsMap Vars;
@@ -2257,9 +2327,9 @@ bool AssignmentTrackingPass::runOnFunction(Function &F) {
   };
   for (auto &BB : F) {
     for (auto &I : BB) {
-      for (DPValue &DPV : filterDbgVars(I.getDbgRecordRange())) {
-        if (DPV.isDbgDeclare())
-          ProcessDeclare(&DPV, DPVDeclares);
+      for (DbgVariableRecord &DVR : filterDbgVars(I.getDbgRecordRange())) {
+        if (DVR.isDbgDeclare())
+          ProcessDeclare(&DVR, DVRDeclares);
       }
       if (DbgDeclareInst *DDI = dyn_cast<DbgDeclareInst>(&I))
         ProcessDeclare(DDI, DbgDeclares);
@@ -2300,8 +2370,8 @@ bool AssignmentTrackingPass::runOnFunction(Function &F) {
   };
   for (auto &P : DbgDeclares)
     DeleteSubsumedDeclare(at::getAssignmentMarkers(P.first), P.second);
-  for (auto &P : DPVDeclares)
-    DeleteSubsumedDeclare(at::getDPVAssignmentMarkers(P.first), P.second);
+  for (auto &P : DVRDeclares)
+    DeleteSubsumedDeclare(at::getDVRAssignmentMarkers(P.first), P.second);
   return Changed;
 }
 
diff --git a/llvm/lib/IR/DebugInfoMetadata.cpp b/llvm/lib/IR/DebugInfoMetadata.cpp
index 28f96653d815b5..570515505607fb 100644
--- a/llvm/lib/IR/DebugInfoMetadata.cpp
+++ b/llvm/lib/IR/DebugInfoMetadata.cpp
@@ -34,6 +34,10 @@ cl::opt<bool> EnableFSDiscriminator(
     cl::desc("Enable adding flow sensitive discriminators"));
 } // namespace llvm
 
+uint32_t DIType::getAlignInBits() const {
+  return (getTag() == dwarf::DW_TAG_LLVM_ptrauth_type ? 0 : SubclassData32);
+}
+
 const DIExpression::FragmentInfo DebugVariable::DefaultFragment = {
     std::numeric_limits<uint64_t>::max(), std::numeric_limits<uint64_t>::min()};
 
@@ -42,10 +46,10 @@ DebugVariable::DebugVariable(const DbgVariableIntrinsic *DII)
       Fragment(DII->getExpression()->getFragmentInfo()),
       InlinedAt(DII->getDebugLoc().getInlinedAt()) {}
 
-DebugVariable::DebugVariable(const DPValue *DPV)
-    : Variable(DPV->getVariable()),
-      Fragment(DPV->getExpression()->getFragmentInfo()),
-      InlinedAt(DPV->getDebugLoc().getInlinedAt()) {}
+DebugVariable::DebugVariable(const DbgVariableRecord *DVR)
+    : Variable(DVR->getVariable()),
+      Fragment(DVR->getExpression()->getFragmentInfo()),
+      InlinedAt(DVR->getDebugLoc().getInlinedAt()) {}
 
 DebugVariableAggregate::DebugVariableAggregate(const DbgVariableIntrinsic *DVI)
     : DebugVariable(DVI->getVariable(), std::nullopt,
@@ -731,26 +735,32 @@ Constant *DIDerivedType::getDiscriminantValue() const {
   return nullptr;
 }
 
-DIDerivedType *
-DIDerivedType::getImpl(LLVMContext &Context, unsigned Tag, MDString *Name,
-                       Metadata *File, unsigned Line, Metadata *Scope,
-                       Metadata *BaseType, uint64_t SizeInBits,
-                       uint32_t AlignInBits, uint64_t OffsetInBits,
-                       std::optional<unsigned> DWARFAddressSpace, DIFlags Flags,
-                       Metadata *ExtraData, Metadata *Annotations,
-                       StorageType Storage, bool ShouldCreate) {
+DIDerivedType *DIDerivedType::getImpl(
+    LLVMContext &Context, unsigned Tag, MDString *Name, Metadata *File,
+    unsigned Line, Metadata *Scope, Metadata *BaseType, uint64_t SizeInBits,
+    uint32_t AlignInBits, uint64_t OffsetInBits,
+    std::optional<unsigned> DWARFAddressSpace,
+    std::optional<PtrAuthData> PtrAuthData, DIFlags Flags, Metadata *ExtraData,
+    Metadata *Annotations, StorageType Storage, bool ShouldCreate) {
   assert(isCanonical(Name) && "Expected canonical MDString");
   DEFINE_GETIMPL_LOOKUP(DIDerivedType,
                         (Tag, Name, File, Line, Scope, BaseType, SizeInBits,
-                         AlignInBits, OffsetInBits, DWARFAddressSpace, Flags,
-                         ExtraData, Annotations));
+                         AlignInBits, OffsetInBits, DWARFAddressSpace,
+                         PtrAuthData, Flags, ExtraData, Annotations));
   Metadata *Ops[] = {File, Scope, Name, BaseType, ExtraData, Annotations};
   DEFINE_GETIMPL_STORE(DIDerivedType,
                        (Tag, Line, SizeInBits, AlignInBits, OffsetInBits,
-                        DWARFAddressSpace, Flags),
+                        DWARFAddressSpace, PtrAuthData, Flags),
                        Ops);
 }
 
+std::optional<DIDerivedType::PtrAuthData>
+DIDerivedType::getPtrAuthData() const {
+  return getTag() == dwarf::DW_TAG_LLVM_ptrauth_type
+             ? std::optional<PtrAuthData>(PtrAuthData(SubclassData32))
+             : std::nullopt;
+}
+
 DICompositeType *DICompositeType::getImpl(
     LLVMContext &Context, unsigned Tag, MDString *Name, Metadata *File,
     unsigned Line, Metadata *Scope, Metadata *BaseType, uint64_t SizeInBits,
@@ -1880,11 +1890,12 @@ DIExpression *DIExpression::append(const DIExpression *Expr,
 DIExpression *DIExpression::appendToStack(const DIExpression *Expr,
                                           ArrayRef<uint64_t> Ops) {
   assert(Expr && !Ops.empty() && "Can't append ops to this expression");
-  assert(none_of(Ops,
-                 [](uint64_t Op) {
-                   return Op == dwarf::DW_OP_stack_value ||
-                          Op == dwarf::DW_OP_LLVM_fragment;
-                 }) &&
+  assert(std::none_of(expr_op_iterator(Ops.begin()),
+                      expr_op_iterator(Ops.end()),
+                      [](auto Op) {
+                        return Op.getOp() == dwarf::DW_OP_stack_value ||
+                               Op.getOp() == dwarf::DW_OP_LLVM_fragment;
+                      }) &&
          "Can't append this op");
 
   // Append a DW_OP_deref after Expr's current op list if it's non-empty and
diff --git a/llvm/lib/IR/DebugProgramInstruction.cpp b/llvm/lib/IR/DebugProgramInstruction.cpp
index f34d3aecfa0a38..1fb435f46a5fc0 100644
--- a/llvm/lib/IR/DebugProgramInstruction.cpp
+++ b/llvm/lib/IR/DebugProgramInstruction.cpp
@@ -28,7 +28,7 @@ template class DbgRecordParamRef<DIExpression>;
 template class DbgRecordParamRef<DILabel>;
 template class DbgRecordParamRef<DILocalVariable>;
 
-DPValue::DPValue(const DbgVariableIntrinsic *DVI)
+DbgVariableRecord::DbgVariableRecord(const DbgVariableIntrinsic *DVI)
     : DbgRecord(ValueKind, DVI->getDebugLoc()),
       DebugValueUser({DVI->getRawLocation(), nullptr, nullptr}),
       Variable(DVI->getVariable()), Expression(DVI->getExpression()),
@@ -51,25 +51,27 @@ DPValue::DPValue(const DbgVariableIntrinsic *DVI)
   }
   default:
     llvm_unreachable(
-        "Trying to create a DPValue with an invalid intrinsic type!");
+        "Trying to create a DbgVariableRecord with an invalid intrinsic type!");
   }
 }
 
-DPValue::DPValue(const DPValue &DPV)
-    : DbgRecord(ValueKind, DPV.getDebugLoc()), DebugValueUser(DPV.DebugValues),
-      Type(DPV.getType()), Variable(DPV.getVariable()),
-      Expression(DPV.getExpression()),
-      AddressExpression(DPV.AddressExpression) {}
+DbgVariableRecord::DbgVariableRecord(const DbgVariableRecord &DVR)
+    : DbgRecord(ValueKind, DVR.getDebugLoc()), DebugValueUser(DVR.DebugValues),
+      Type(DVR.getType()), Variable(DVR.getVariable()),
+      Expression(DVR.getExpression()),
+      AddressExpression(DVR.AddressExpression) {}
 
-DPValue::DPValue(Metadata *Location, DILocalVariable *DV, DIExpression *Expr,
-                 const DILocation *DI, LocationType Type)
+DbgVariableRecord::DbgVariableRecord(Metadata *Location, DILocalVariable *DV,
+                                     DIExpression *Expr, const DILocation *DI,
+                                     LocationType Type)
     : DbgRecord(ValueKind, DI), DebugValueUser({Location, nullptr, nullptr}),
       Type(Type), Variable(DV), Expression(Expr) {}
 
-DPValue::DPValue(Metadata *Value, DILocalVariable *Variable,
-                 DIExpression *Expression, DIAssignID *AssignID,
-                 Metadata *Address, DIExpression *AddressExpression,
-                 const DILocation *DI)
+DbgVariableRecord::DbgVariableRecord(Metadata *Value, DILocalVariable *Variable,
+                                     DIExpression *Expression,
+                                     DIAssignID *AssignID, Metadata *Address,
+                                     DIExpression *AddressExpression,
+                                     const DILocation *DI)
     : DbgRecord(ValueKind, DI), DebugValueUser({Value, Address, AssignID}),
       Type(LocationType::Assign), Variable(Variable), Expression(Expression),
       AddressExpression(AddressExpression) {}
@@ -77,7 +79,7 @@ DPValue::DPValue(Metadata *Value, DILocalVariable *Variable,
 void DbgRecord::deleteRecord() {
   switch (RecordKind) {
   case ValueKind:
-    delete cast<DPValue>(this);
+    delete cast<DbgVariableRecord>(this);
     return;
   case LabelKind:
     delete cast<DPLabel>(this);
@@ -89,7 +91,7 @@ void DbgRecord::deleteRecord() {
 void DbgRecord::print(raw_ostream &O, bool IsForDebug) const {
   switch (RecordKind) {
   case ValueKind:
-    cast<DPValue>(this)->print(O, IsForDebug);
+    cast<DbgVariableRecord>(this)->print(O, IsForDebug);
     return;
   case LabelKind:
     cast<DPLabel>(this)->print(O, IsForDebug);
@@ -102,7 +104,7 @@ void DbgRecord::print(raw_ostream &O, ModuleSlotTracker &MST,
                       bool IsForDebug) const {
   switch (RecordKind) {
   case ValueKind:
-    cast<DPValue>(this)->print(O, MST, IsForDebug);
+    cast<DbgVariableRecord>(this)->print(O, MST, IsForDebug);
     return;
   case LabelKind:
     cast<DPLabel>(this)->print(O, MST, IsForDebug);
@@ -116,7 +118,8 @@ bool DbgRecord::isIdenticalToWhenDefined(const DbgRecord &R) const {
     return false;
   switch (RecordKind) {
   case ValueKind:
-    return cast<DPValue>(this)->isIdenticalToWhenDefined(*cast<DPValue>(&R));
+    return cast<DbgVariableRecord>(this)->isIdenticalToWhenDefined(
+        *cast<DbgVariableRecord>(&R));
   case LabelKind:
     return cast<DPLabel>(this)->getLabel() == cast<DPLabel>(R).getLabel();
   };
@@ -131,7 +134,7 @@ DbgInfoIntrinsic *
 DbgRecord::createDebugIntrinsic(Module *M, Instruction *InsertBefore) const {
   switch (RecordKind) {
   case ValueKind:
-    return cast<DPValue>(this)->createDebugIntrinsic(M, InsertBefore);
+    return cast<DbgVariableRecord>(this)->createDebugIntrinsic(M, InsertBefore);
   case LabelKind:
     return cast<DPLabel>(this)->createDebugIntrinsic(M, InsertBefore);
   };
@@ -153,79 +156,83 @@ DPLabel *DPLabel::createUnresolvedDPLabel(MDNode *Label, MDNode *DL) {
   return new DPLabel(Label, DL);
 }
 
-DPValue::DPValue(DPValue::LocationType Type, Metadata *Val, MDNode *Variable,
-                 MDNode *Expression, MDNode *AssignID, Metadata *Address,
-                 MDNode *AddressExpression, MDNode *DI)
+DbgVariableRecord::DbgVariableRecord(DbgVariableRecord::LocationType Type,
+                                     Metadata *Val, MDNode *Variable,
+                                     MDNode *Expression, MDNode *AssignID,
+                                     Metadata *Address,
+                                     MDNode *AddressExpression, MDNode *DI)
     : DbgRecord(ValueKind, DebugLoc(DI)),
       DebugValueUser({Val, Address, AssignID}), Type(Type), Variable(Variable),
       Expression(Expression), AddressExpression(AddressExpression) {}
 
-DPValue *DPValue::createUnresolvedDPValue(DPValue::LocationType Type,
-                                          Metadata *Val, MDNode *Variable,
-                                          MDNode *Expression, MDNode *AssignID,
-                                          Metadata *Address,
-                                          MDNode *AddressExpression,
-                                          MDNode *DI) {
-  return new DPValue(Type, Val, Variable, Expression, AssignID, Address,
-                     AddressExpression, DI);
+DbgVariableRecord *DbgVariableRecord::createUnresolvedDbgVariableRecord(
+    DbgVariableRecord::LocationType Type, Metadata *Val, MDNode *Variable,
+    MDNode *Expression, MDNode *AssignID, Metadata *Address,
+    MDNode *AddressExpression, MDNode *DI) {
+  return new DbgVariableRecord(Type, Val, Variable, Expression, AssignID,
+                               Address, AddressExpression, DI);
 }
 
-DPValue *DPValue::createDPValue(Value *Location, DILocalVariable *DV,
-                                DIExpression *Expr, const DILocation *DI) {
-  return new DPValue(ValueAsMetadata::get(Location), DV, Expr, DI,
-                     LocationType::Value);
+DbgVariableRecord *
+DbgVariableRecord::createDbgVariableRecord(Value *Location, DILocalVariable *DV,
+                                           DIExpression *Expr,
+                                           const DILocation *DI) {
+  return new DbgVariableRecord(ValueAsMetadata::get(Location), DV, Expr, DI,
+                               LocationType::Value);
 }
 
-DPValue *DPValue::createDPValue(Value *Location, DILocalVariable *DV,
-                                DIExpression *Expr, const DILocation *DI,
-                                DPValue &InsertBefore) {
-  auto *NewDPValue = createDPValue(Location, DV, Expr, DI);
-  NewDPValue->insertBefore(&InsertBefore);
-  return NewDPValue;
+DbgVariableRecord *DbgVariableRecord::createDbgVariableRecord(
+    Value *Location, DILocalVariable *DV, DIExpression *Expr,
+    const DILocation *DI, DbgVariableRecord &InsertBefore) {
+  auto *NewDbgVariableRecord = createDbgVariableRecord(Location, DV, Expr, DI);
+  NewDbgVariableRecord->insertBefore(&InsertBefore);
+  return NewDbgVariableRecord;
 }
 
-DPValue *DPValue::createDPVDeclare(Value *Address, DILocalVariable *DV,
-                                   DIExpression *Expr, const DILocation *DI) {
-  return new DPValue(ValueAsMetadata::get(Address), DV, Expr, DI,
-                     LocationType::Declare);
+DbgVariableRecord *DbgVariableRecord::createDVRDeclare(Value *Address,
+                                                       DILocalVariable *DV,
+                                                       DIExpression *Expr,
+                                                       const DILocation *DI) {
+  return new DbgVariableRecord(ValueAsMetadata::get(Address), DV, Expr, DI,
+                               LocationType::Declare);
 }
 
-DPValue *DPValue::createDPVDeclare(Value *Address, DILocalVariable *DV,
-                                   DIExpression *Expr, const DILocation *DI,
-                                   DPValue &InsertBefore) {
-  auto *NewDPVDeclare = createDPVDeclare(Address, DV, Expr, DI);
-  NewDPVDeclare->insertBefore(&InsertBefore);
-  return NewDPVDeclare;
+DbgVariableRecord *
+DbgVariableRecord::createDVRDeclare(Value *Address, DILocalVariable *DV,
+                                    DIExpression *Expr, const DILocation *DI,
+                                    DbgVariableRecord &InsertBefore) {
+  auto *NewDVRDeclare = createDVRDeclare(Address, DV, Expr, DI);
+  NewDVRDeclare->insertBefore(&InsertBefore);
+  return NewDVRDeclare;
 }
 
-DPValue *DPValue::createDPVAssign(Value *Val, DILocalVariable *Variable,
-                                  DIExpression *Expression,
-                                  DIAssignID *AssignID, Value *Address,
-                                  DIExpression *AddressExpression,
-                                  const DILocation *DI) {
-  return new DPValue(ValueAsMetadata::get(Val), Variable, Expression, AssignID,
-                     ValueAsMetadata::get(Address), AddressExpression, DI);
+DbgVariableRecord *DbgVariableRecord::createDVRAssign(
+    Value *Val, DILocalVariable *Variable, DIExpression *Expression,
+    DIAssignID *AssignID, Value *Address, DIExpression *AddressExpression,
+    const DILocation *DI) {
+  return new DbgVariableRecord(ValueAsMetadata::get(Val), Variable, Expression,
+                               AssignID, ValueAsMetadata::get(Address),
+                               AddressExpression, DI);
 }
 
-DPValue *DPValue::createLinkedDPVAssign(Instruction *LinkedInstr, Value *Val,
-                                        DILocalVariable *Variable,
-                                        DIExpression *Expression,
-                                        Value *Address,
-                                        DIExpression *AddressExpression,
-                                        const DILocation *DI) {
+DbgVariableRecord *DbgVariableRecord::createLinkedDVRAssign(
+    Instruction *LinkedInstr, Value *Val, DILocalVariable *Variable,
+    DIExpression *Expression, Value *Address, DIExpression *AddressExpression,
+    const DILocation *DI) {
   auto *Link = LinkedInstr->getMetadata(LLVMContext::MD_DIAssignID);
   assert(Link && "Linked instruction must have DIAssign metadata attached");
-  auto *NewDPVAssign = DPValue::createDPVAssign(Val, Variable, Expression,
-                                                cast<DIAssignID>(Link), Address,
-                                                AddressExpression, DI);
-  LinkedInstr->getParent()->insertDbgRecordAfter(NewDPVAssign, LinkedInstr);
-  return NewDPVAssign;
+  auto *NewDVRAssign = DbgVariableRecord::createDVRAssign(
+      Val, Variable, Expression, cast<DIAssignID>(Link), Address,
+      AddressExpression, DI);
+  LinkedInstr->getParent()->insertDbgRecordAfter(NewDVRAssign, LinkedInstr);
+  return NewDVRAssign;
 }
 
-iterator_range<DPValue::location_op_iterator> DPValue::location_ops() const {
+iterator_range<DbgVariableRecord::location_op_iterator>
+DbgVariableRecord::location_ops() const {
   auto *MD = getRawLocation();
-  // If a Value has been deleted, the "location" for this DPValue will be
-  // replaced by nullptr. Return an empty range.
+  // If a Value has been deleted, the "location" for this DbgVariableRecord will
+  // be replaced by nullptr. Return an empty range.
   if (!MD)
     return {location_op_iterator(static_cast<ValueAsMetadata *>(nullptr)),
             location_op_iterator(static_cast<ValueAsMetadata *>(nullptr))};
@@ -245,13 +252,13 @@ iterator_range<DPValue::location_op_iterator> DPValue::location_ops() const {
           location_op_iterator(static_cast<ValueAsMetadata *>(nullptr))};
 }
 
-unsigned DPValue::getNumVariableLocationOps() const {
+unsigned DbgVariableRecord::getNumVariableLocationOps() const {
   if (hasArgList())
     return cast<DIArgList>(getRawLocation())->getArgs().size();
   return 1;
 }
 
-Value *DPValue::getVariableLocationOp(unsigned OpIdx) const {
+Value *DbgVariableRecord::getVariableLocationOp(unsigned OpIdx) const {
   auto *MD = getRawLocation();
   if (!MD)
     return nullptr;
@@ -261,7 +268,7 @@ Value *DPValue::getVariableLocationOp(unsigned OpIdx) const {
   if (isa<MDNode>(MD))
     return nullptr;
   assert(isa<ValueAsMetadata>(MD) &&
-         "Attempted to get location operand from DPValue with none.");
+         "Attempted to get location operand from DbgVariableRecord with none.");
   auto *V = cast<ValueAsMetadata>(MD);
   assert(OpIdx == 0 && "Operand Index must be 0 for a debug intrinsic with a "
                        "single location operand.");
@@ -274,8 +281,9 @@ static ValueAsMetadata *getAsMetadata(Value *V) {
                                  : ValueAsMetadata::get(V);
 }
 
-void DPValue::replaceVariableLocationOp(Value *OldValue, Value *NewValue,
-                                        bool AllowEmpty) {
+void DbgVariableRecord::replaceVariableLocationOp(Value *OldValue,
+                                                  Value *NewValue,
+                                                  bool AllowEmpty) {
   assert(NewValue && "Values must be non-null");
 
   bool DbgAssignAddrReplaced = isDbgAssign() && OldValue == getAddress();
@@ -307,7 +315,8 @@ void DPValue::replaceVariableLocationOp(Value *OldValue, Value *NewValue,
   setRawLocation(DIArgList::get(getVariableLocationOp(0)->getContext(), MDs));
 }
 
-void DPValue::replaceVariableLocationOp(unsigned OpIdx, Value *NewValue) {
+void DbgVariableRecord::replaceVariableLocationOp(unsigned OpIdx,
+                                                  Value *NewValue) {
   assert(OpIdx < getNumVariableLocationOps() && "Invalid Operand Index");
 
   if (!hasArgList()) {
@@ -326,8 +335,8 @@ void DPValue::replaceVariableLocationOp(unsigned OpIdx, Value *NewValue) {
   setRawLocation(DIArgList::get(getVariableLocationOp(0)->getContext(), MDs));
 }
 
-void DPValue::addVariableLocationOps(ArrayRef<Value *> NewValues,
-                                     DIExpression *NewExpr) {
+void DbgVariableRecord::addVariableLocationOps(ArrayRef<Value *> NewValues,
+                                               DIExpression *NewExpr) {
   assert(NewExpr->hasAllLocationOps(getNumVariableLocationOps() +
                                     NewValues.size()) &&
          "NewExpr for debug variable intrinsic does not reference every "
@@ -342,7 +351,7 @@ void DPValue::addVariableLocationOps(ArrayRef<Value *> NewValues,
   setRawLocation(DIArgList::get(getVariableLocationOp(0)->getContext(), MDs));
 }
 
-void DPValue::setKillLocation() {
+void DbgVariableRecord::setKillLocation() {
   // TODO: When/if we remove duplicate values from DIArgLists, we don't need
   // this set anymore.
   SmallPtrSet<Value *, 4> RemovedValues;
@@ -354,13 +363,13 @@ void DPValue::setKillLocation() {
   }
 }
 
-bool DPValue::isKillLocation() const {
+bool DbgVariableRecord::isKillLocation() const {
   return (getNumVariableLocationOps() == 0 &&
           !getExpression()->isComplex()) ||
          any_of(location_ops(), [](Value *V) { return isa<UndefValue>(V); });
 }
 
-std::optional<uint64_t> DPValue::getFragmentSizeInBits() const {
+std::optional<uint64_t> DbgVariableRecord::getFragmentSizeInBits() const {
   if (auto Fragment = getExpression()->getFragmentInfo())
     return Fragment->SizeInBits;
   return getVariable()->getSizeInBits();
@@ -369,21 +378,24 @@ std::optional<uint64_t> DPValue::getFragmentSizeInBits() const {
 DbgRecord *DbgRecord::clone() const {
   switch (RecordKind) {
   case ValueKind:
-    return cast<DPValue>(this)->clone();
+    return cast<DbgVariableRecord>(this)->clone();
   case LabelKind:
     return cast<DPLabel>(this)->clone();
   };
   llvm_unreachable("unsupported DbgRecord kind");
 }
 
-DPValue *DPValue::clone() const { return new DPValue(*this); }
+DbgVariableRecord *DbgVariableRecord::clone() const {
+  return new DbgVariableRecord(*this);
+}
 
 DPLabel *DPLabel::clone() const {
   return new DPLabel(getLabel(), getDebugLoc());
 }
 
 DbgVariableIntrinsic *
-DPValue::createDebugIntrinsic(Module *M, Instruction *InsertBefore) const {
+DbgVariableRecord::createDebugIntrinsic(Module *M,
+                                        Instruction *InsertBefore) const {
   [[maybe_unused]] DICompileUnit *Unit =
       getDebugLoc().get()->getScope()->getSubprogram()->getUnit();
   assert(M && Unit &&
@@ -394,24 +406,25 @@ DPValue::createDebugIntrinsic(Module *M, Instruction *InsertBefore) const {
 
   // Work out what sort of intrinsic we're going to produce.
   switch (getType()) {
-  case DPValue::LocationType::Declare:
+  case DbgVariableRecord::LocationType::Declare:
     IntrinsicFn = Intrinsic::getDeclaration(M, Intrinsic::dbg_declare);
     break;
-  case DPValue::LocationType::Value:
+  case DbgVariableRecord::LocationType::Value:
     IntrinsicFn = Intrinsic::getDeclaration(M, Intrinsic::dbg_value);
     break;
-  case DPValue::LocationType::Assign:
+  case DbgVariableRecord::LocationType::Assign:
     IntrinsicFn = Intrinsic::getDeclaration(M, Intrinsic::dbg_assign);
     break;
-  case DPValue::LocationType::End:
-  case DPValue::LocationType::Any:
+  case DbgVariableRecord::LocationType::End:
+  case DbgVariableRecord::LocationType::Any:
     llvm_unreachable("Invalid LocationType");
   }
 
-  // Create the intrinsic from this DPValue's information, optionally insert
-  // into the target location.
+  // Create the intrinsic from this DbgVariableRecord's information, optionally
+  // insert into the target location.
   DbgVariableIntrinsic *DVI;
-  assert(getRawLocation() && "DPValue's RawLocation should be non-null.");
+  assert(getRawLocation() &&
+         "DbgVariableRecord's RawLocation should be non-null.");
   if (isDbgAssign()) {
     Value *AssignArgs[] = {
         MetadataAsValue::get(Context, getRawLocation()),
@@ -451,7 +464,7 @@ DbgLabelInst *DPLabel::createDebugIntrinsic(Module *M,
   return DbgLabel;
 }
 
-Value *DPValue::getAddress() const {
+Value *DbgVariableRecord::getAddress() const {
   auto *MD = getRawAddress();
   if (auto *V = dyn_cast<ValueAsMetadata>(MD))
     return V->getValue();
@@ -461,18 +474,20 @@ Value *DPValue::getAddress() const {
   return nullptr;
 }
 
-DIAssignID *DPValue::getAssignID() const {
+DIAssignID *DbgVariableRecord::getAssignID() const {
   return cast<DIAssignID>(DebugValues[2]);
 }
 
-void DPValue::setAssignId(DIAssignID *New) { resetDebugValue(2, New); }
+void DbgVariableRecord::setAssignId(DIAssignID *New) {
+  resetDebugValue(2, New);
+}
 
-void DPValue::setKillAddress() {
+void DbgVariableRecord::setKillAddress() {
   resetDebugValue(
       1, ValueAsMetadata::get(UndefValue::get(getAddress()->getType())));
 }
 
-bool DPValue::isKillAddress() const {
+bool DbgVariableRecord::isKillAddress() const {
   Value *Addr = getAddress();
   return !Addr || isa<UndefValue>(Addr);
 }
@@ -647,8 +662,8 @@ void DPMarker::insertDbgRecordAfter(DbgRecord *New, DbgRecord *InsertAfter) {
 
 void DPMarker::absorbDebugValues(DPMarker &Src, bool InsertAtHead) {
   auto It = InsertAtHead ? StoredDbgRecords.begin() : StoredDbgRecords.end();
-  for (DbgRecord &DPV : Src.StoredDbgRecords)
-    DPV.setMarker(this);
+  for (DbgRecord &DVR : Src.StoredDbgRecords)
+    DVR.setMarker(this);
 
   StoredDbgRecords.splice(It, Src.StoredDbgRecords);
 }
@@ -677,8 +692,8 @@ iterator_range<simple_ilist<DbgRecord>::iterator> DPMarker::cloneDebugInfoFrom(
   if (from_here.has_value())
     Range = make_range(*from_here, From->StoredDbgRecords.end());
 
-  // Clone each DPValue and insert into StoreDPValues; optionally place them at
-  // the start or the end of the list.
+  // Clone each DbgVariableRecord and insert into StoreDbgVariableRecords;
+  // optionally place them at the start or the end of the list.
   auto Pos = (InsertAtHead) ? StoredDbgRecords.begin() : StoredDbgRecords.end();
   for (DbgRecord &DR : Range) {
     DbgRecord *New = DR.clone();
diff --git a/llvm/lib/IR/IRBuilder.cpp b/llvm/lib/IR/IRBuilder.cpp
index b09b80f95871a1..d6746d1d438242 100644
--- a/llvm/lib/IR/IRBuilder.cpp
+++ b/llvm/lib/IR/IRBuilder.cpp
@@ -918,12 +918,14 @@ CallInst *IRBuilderBase::CreateUnaryIntrinsic(Intrinsic::ID ID, Value *V,
   return createCallHelper(Fn, {V}, Name, FMFSource);
 }
 
-CallInst *IRBuilderBase::CreateBinaryIntrinsic(Intrinsic::ID ID, Value *LHS,
-                                               Value *RHS,
-                                               Instruction *FMFSource,
-                                               const Twine &Name) {
+Value *IRBuilderBase::CreateBinaryIntrinsic(Intrinsic::ID ID, Value *LHS,
+                                            Value *RHS, Instruction *FMFSource,
+                                            const Twine &Name) {
   Module *M = BB->getModule();
   Function *Fn = Intrinsic::getDeclaration(M, ID, { LHS->getType() });
+  if (Value *V = Folder.FoldBinaryIntrinsic(ID, LHS, RHS, Fn->getReturnType(),
+                                            FMFSource))
+    return V;
   return createCallHelper(Fn, {LHS, RHS}, Name, FMFSource);
 }
 
diff --git a/llvm/lib/IR/Instruction.cpp b/llvm/lib/IR/Instruction.cpp
index 7a677d7f3c2be8..9744eb32d27a5e 100644
--- a/llvm/lib/IR/Instruction.cpp
+++ b/llvm/lib/IR/Instruction.cpp
@@ -143,8 +143,8 @@ void Instruction::insertBefore(BasicBlock &BB,
     return;
 
   // We've inserted "this": if InsertAtHead is set then it comes before any
-  // DPValues attached to InsertPos. But if it's not set, then any DbgRecords
-  // should now come before "this".
+  // DbgVariableRecords attached to InsertPos. But if it's not set, then any
+  // DbgRecords should now come before "this".
   bool InsertAtHead = InsertPos.getHeadBit();
   if (!InsertAtHead) {
     DPMarker *SrcMarker = BB.getMarker(InsertPos);
@@ -217,7 +217,7 @@ void Instruction::moveBeforeImpl(BasicBlock &BB, InstListType::iterator I,
   if (BB.IsNewDbgInfoFormat && DbgMarker && !Preserve) {
     if (I != this->getIterator() || InsertAtHead) {
       // "this" is definitely moving in the list, or it's moving ahead of its
-      // attached DPValues. Detach any existing DbgRecords.
+      // attached DbgVariableRecords. Detach any existing DbgRecords.
       handleMarkerRemoval();
     }
   }
@@ -320,8 +320,8 @@ void Instruction::dropDbgRecords() {
     DbgMarker->dropDbgRecords();
 }
 
-void Instruction::dropOneDbgRecord(DbgRecord *DPV) {
-  DbgMarker->dropOneDbgRecord(DPV);
+void Instruction::dropOneDbgRecord(DbgRecord *DVR) {
+  DbgMarker->dropOneDbgRecord(DVR);
 }
 
 bool Instruction::comesBefore(const Instruction *Other) const {
diff --git a/llvm/lib/IR/LLVMContextImpl.cpp b/llvm/lib/IR/LLVMContextImpl.cpp
index a471314ccb940a..72fedd8d673980 100644
--- a/llvm/lib/IR/LLVMContextImpl.cpp
+++ b/llvm/lib/IR/LLVMContextImpl.cpp
@@ -48,8 +48,8 @@ LLVMContextImpl::~LLVMContextImpl() {
 #ifndef NDEBUG
   // Check that any variable location records that fell off the end of a block
   // when it's terminator was removed were eventually replaced. This assertion
-  // firing indicates that DPValues went missing during the lifetime of the
-  // LLVMContext.
+  // firing indicates that DbgVariableRecords went missing during the lifetime
+  // of the LLVMContext.
   assert(TrailingDbgRecords.empty() && "DbgRecords in blocks not cleaned");
 #endif
 
diff --git a/llvm/lib/IR/LLVMContextImpl.h b/llvm/lib/IR/LLVMContextImpl.h
index b1dcb262fb657f..4542e16e59fd69 100644
--- a/llvm/lib/IR/LLVMContextImpl.h
+++ b/llvm/lib/IR/LLVMContextImpl.h
@@ -540,6 +540,7 @@ template <> struct MDNodeKeyImpl<DIDerivedType> {
   uint64_t OffsetInBits;
   uint32_t AlignInBits;
   std::optional<unsigned> DWARFAddressSpace;
+  std::optional<DIDerivedType::PtrAuthData> PtrAuthData;
   unsigned Flags;
   Metadata *ExtraData;
   Metadata *Annotations;
@@ -547,18 +548,21 @@ template <> struct MDNodeKeyImpl<DIDerivedType> {
   MDNodeKeyImpl(unsigned Tag, MDString *Name, Metadata *File, unsigned Line,
                 Metadata *Scope, Metadata *BaseType, uint64_t SizeInBits,
                 uint32_t AlignInBits, uint64_t OffsetInBits,
-                std::optional<unsigned> DWARFAddressSpace, unsigned Flags,
-                Metadata *ExtraData, Metadata *Annotations)
+                std::optional<unsigned> DWARFAddressSpace,
+                std::optional<DIDerivedType::PtrAuthData> PtrAuthData,
+                unsigned Flags, Metadata *ExtraData, Metadata *Annotations)
       : Tag(Tag), Name(Name), File(File), Line(Line), Scope(Scope),
         BaseType(BaseType), SizeInBits(SizeInBits), OffsetInBits(OffsetInBits),
         AlignInBits(AlignInBits), DWARFAddressSpace(DWARFAddressSpace),
-        Flags(Flags), ExtraData(ExtraData), Annotations(Annotations) {}
+        PtrAuthData(PtrAuthData), Flags(Flags), ExtraData(ExtraData),
+        Annotations(Annotations) {}
   MDNodeKeyImpl(const DIDerivedType *N)
       : Tag(N->getTag()), Name(N->getRawName()), File(N->getRawFile()),
         Line(N->getLine()), Scope(N->getRawScope()),
         BaseType(N->getRawBaseType()), SizeInBits(N->getSizeInBits()),
         OffsetInBits(N->getOffsetInBits()), AlignInBits(N->getAlignInBits()),
-        DWARFAddressSpace(N->getDWARFAddressSpace()), Flags(N->getFlags()),
+        DWARFAddressSpace(N->getDWARFAddressSpace()),
+        PtrAuthData(N->getPtrAuthData()), Flags(N->getFlags()),
         ExtraData(N->getRawExtraData()), Annotations(N->getRawAnnotations()) {}
 
   bool isKeyOf(const DIDerivedType *RHS) const {
@@ -569,7 +573,8 @@ template <> struct MDNodeKeyImpl<DIDerivedType> {
            AlignInBits == RHS->getAlignInBits() &&
            OffsetInBits == RHS->getOffsetInBits() &&
            DWARFAddressSpace == RHS->getDWARFAddressSpace() &&
-           Flags == RHS->getFlags() && ExtraData == RHS->getRawExtraData() &&
+           PtrAuthData == RHS->getPtrAuthData() && Flags == RHS->getFlags() &&
+           ExtraData == RHS->getRawExtraData() &&
            Annotations == RHS->getRawAnnotations();
   }
 
@@ -1670,20 +1675,20 @@ class LLVMContextImpl {
   /// LLVMContext is used by compilation.
   void setOptPassGate(OptPassGate &);
 
-  /// Mapping of blocks to collections of "trailing" DPValues. As part of the
-  /// "RemoveDIs" project, debug-info variable location records are going to
-  /// cease being instructions... which raises the problem of where should they
-  /// be recorded when we remove the terminator of a blocks, such as:
+  /// Mapping of blocks to collections of "trailing" DbgVariableRecords. As part
+  /// of the "RemoveDIs" project, debug-info variable location records are going
+  /// to cease being instructions... which raises the problem of where should
+  /// they be recorded when we remove the terminator of a blocks, such as:
   ///
   ///    %foo = add i32 0, 0
   ///    br label %bar
   ///
   /// If the branch is removed, a legitimate transient state while editing a
   /// block, any debug-records between those two instructions will not have a
-  /// location. Each block thus records any DPValue records that "trail" in
-  /// such a way. These are stored in LLVMContext because typically LLVM only
-  /// edits a small number of blocks at a time, so there's no need to bloat
-  /// BasicBlock with such a data structure.
+  /// location. Each block thus records any DbgVariableRecord records that
+  /// "trail" in such a way. These are stored in LLVMContext because typically
+  /// LLVM only edits a small number of blocks at a time, so there's no need to
+  /// bloat BasicBlock with such a data structure.
   SmallDenseMap<BasicBlock *, DPMarker *> TrailingDbgRecords;
 
   // Set, get and delete operations for TrailingDbgRecords.
diff --git a/llvm/lib/IR/LegacyPassManager.cpp b/llvm/lib/IR/LegacyPassManager.cpp
index 8945c6dbc8d848..953f21ce740590 100644
--- a/llvm/lib/IR/LegacyPassManager.cpp
+++ b/llvm/lib/IR/LegacyPassManager.cpp
@@ -528,8 +528,9 @@ bool PassManagerImpl::run(Module &M) {
   dumpArguments();
   dumpPasses();
 
-  // RemoveDIs: if a command line flag is given, convert to the DPValue
-  // representation of debug-info for the duration of these passes.
+  // RemoveDIs: if a command line flag is given, convert to the
+  // DbgVariableRecord representation of debug-info for the duration of these
+  // passes.
   bool shouldConvertDbgInfo = UseNewDbgInfoFormat && !M.IsNewDbgInfoFormat;
   if (shouldConvertDbgInfo)
     M.convertToNewDbgValues();
diff --git a/llvm/lib/IR/Metadata.cpp b/llvm/lib/IR/Metadata.cpp
index 06db91fe4f8e74..4472bf128c323f 100644
--- a/llvm/lib/IR/Metadata.cpp
+++ b/llvm/lib/IR/Metadata.cpp
@@ -148,9 +148,11 @@ void MetadataAsValue::untrack() {
     MetadataTracking::untrack(MD);
 }
 
-DPValue *DebugValueUser::getUser() { return static_cast<DPValue *>(this); }
-const DPValue *DebugValueUser::getUser() const {
-  return static_cast<const DPValue *>(this);
+DbgVariableRecord *DebugValueUser::getUser() {
+  return static_cast<DbgVariableRecord *>(this);
+}
+const DbgVariableRecord *DebugValueUser::getUser() const {
+  return static_cast<const DbgVariableRecord *>(this);
 }
 
 void DebugValueUser::handleChangedValue(void *Old, Metadata *New) {
@@ -266,28 +268,29 @@ SmallVector<Metadata *> ReplaceableMetadataImpl::getAllArgListUsers() {
   return MDUsers;
 }
 
-SmallVector<DPValue *> ReplaceableMetadataImpl::getAllDPValueUsers() {
-  SmallVector<std::pair<OwnerTy, uint64_t> *> DPVUsersWithID;
+SmallVector<DbgVariableRecord *>
+ReplaceableMetadataImpl::getAllDbgVariableRecordUsers() {
+  SmallVector<std::pair<OwnerTy, uint64_t> *> DVRUsersWithID;
   for (auto Pair : UseMap) {
     OwnerTy Owner = Pair.second.first;
     if (Owner.isNull())
       continue;
     if (!Owner.is<DebugValueUser *>())
       continue;
-    DPVUsersWithID.push_back(&UseMap[Pair.first]);
+    DVRUsersWithID.push_back(&UseMap[Pair.first]);
   }
-  // Order DPValue users in reverse-creation order. Normal dbg.value users
-  // of MetadataAsValues are ordered by their UseList, i.e. reverse order of
-  // when they were added: we need to replicate that here. The structure of
+  // Order DbgVariableRecord users in reverse-creation order. Normal dbg.value
+  // users of MetadataAsValues are ordered by their UseList, i.e. reverse order
+  // of when they were added: we need to replicate that here. The structure of
   // debug-info output depends on the ordering of intrinsics, thus we need
   // to keep them consistent for comparisons sake.
-  llvm::sort(DPVUsersWithID, [](auto UserA, auto UserB) {
+  llvm::sort(DVRUsersWithID, [](auto UserA, auto UserB) {
     return UserA->second > UserB->second;
   });
-  SmallVector<DPValue *> DPVUsers;
-  for (auto UserWithID : DPVUsersWithID)
-    DPVUsers.push_back(UserWithID->first.get<DebugValueUser *>()->getUser());
-  return DPVUsers;
+  SmallVector<DbgVariableRecord *> DVRUsers;
+  for (auto UserWithID : DVRUsersWithID)
+    DVRUsers.push_back(UserWithID->first.get<DebugValueUser *>()->getUser());
+  return DVRUsers;
 }
 
 void ReplaceableMetadataImpl::addRef(void *Ref, OwnerTy Owner) {
diff --git a/llvm/lib/IR/Operator.cpp b/llvm/lib/IR/Operator.cpp
index caf8fe654a36dc..b9cd219d94dc8a 100644
--- a/llvm/lib/IR/Operator.cpp
+++ b/llvm/lib/IR/Operator.cpp
@@ -37,7 +37,7 @@ bool Operator::hasPoisonGeneratingFlags() const {
   case Instruction::GetElementPtr: {
     auto *GEP = cast<GEPOperator>(this);
     // Note: inrange exists on constexpr only
-    return GEP->isInBounds() || GEP->getInRangeIndex() != std::nullopt;
+    return GEP->isInBounds() || GEP->getInRange() != std::nullopt;
   }
   case Instruction::ZExt:
     if (auto *NNI = dyn_cast<PossiblyNonNegInst>(this))
@@ -69,6 +69,12 @@ Type *GEPOperator::getResultElementType() const {
   return cast<GetElementPtrConstantExpr>(this)->getResultElementType();
 }
 
+std::optional<ConstantRange> GEPOperator::getInRange() const {
+  if (auto *CE = dyn_cast<GetElementPtrConstantExpr>(this))
+    return CE->getInRange();
+  return std::nullopt;
+}
+
 Align GEPOperator::getMaxPreservedAlignment(const DataLayout &DL) const {
   /// compute the worse possible offset for every level of the GEP et accumulate
   /// the minimum alignment into Result.
diff --git a/llvm/lib/IR/ReplaceConstant.cpp b/llvm/lib/IR/ReplaceConstant.cpp
index 42dec7c72328ea..9b07bd8040492a 100644
--- a/llvm/lib/IR/ReplaceConstant.cpp
+++ b/llvm/lib/IR/ReplaceConstant.cpp
@@ -22,11 +22,13 @@ static bool isExpandableUser(User *U) {
   return isa<ConstantExpr>(U) || isa<ConstantAggregate>(U);
 }
 
-static SmallVector<Instruction *, 4> expandUser(Instruction *InsertPt,
+static SmallVector<Instruction *, 4> expandUser(BasicBlock::iterator InsertPt,
                                                 Constant *C) {
   SmallVector<Instruction *, 4> NewInsts;
   if (auto *CE = dyn_cast<ConstantExpr>(C)) {
-    NewInsts.push_back(CE->getAsInstruction(InsertPt));
+    Instruction *ConstInst = CE->getAsInstruction();
+    ConstInst->insertBefore(*InsertPt->getParent(), InsertPt);
+    NewInsts.push_back(ConstInst);
   } else if (isa<ConstantStruct>(C) || isa<ConstantArray>(C)) {
     Value *V = PoisonValue::get(C->getType());
     for (auto [Idx, Op] : enumerate(C->operands())) {
@@ -80,12 +82,11 @@ bool convertUsersOfConstantsToInstructions(ArrayRef<Constant *> Consts) {
     Instruction *I = InstructionWorklist.pop_back_val();
     DebugLoc Loc = I->getDebugLoc();
     for (Use &U : I->operands()) {
-      auto *BI = I;
+      BasicBlock::iterator BI = I->getIterator();
       if (auto *Phi = dyn_cast<PHINode>(I)) {
         BasicBlock *BB = Phi->getIncomingBlock(U);
-        BasicBlock::iterator It = BB->getFirstInsertionPt();
-        assert(It != BB->end() && "Unexpected empty basic block");
-        BI = &*It;
+        BI = BB->getFirstInsertionPt();
+        assert(BI != BB->end() && "Unexpected empty basic block");
       }
 
       if (auto *C = dyn_cast<Constant>(U.get())) {
diff --git a/llvm/lib/IR/Value.cpp b/llvm/lib/IR/Value.cpp
index c4e47975b18221..61e1c35ba4a3a2 100644
--- a/llvm/lib/IR/Value.cpp
+++ b/llvm/lib/IR/Value.cpp
@@ -574,16 +574,16 @@ void Value::replaceUsesWithIf(Value *New,
 /// with New.
 static void replaceDbgUsesOutsideBlock(Value *V, Value *New, BasicBlock *BB) {
   SmallVector<DbgVariableIntrinsic *> DbgUsers;
-  SmallVector<DPValue *> DPUsers;
+  SmallVector<DbgVariableRecord *> DPUsers;
   findDbgUsers(DbgUsers, V, &DPUsers);
   for (auto *DVI : DbgUsers) {
     if (DVI->getParent() != BB)
       DVI->replaceVariableLocationOp(V, New);
   }
-  for (auto *DPV : DPUsers) {
-    DPMarker *Marker = DPV->getMarker();
+  for (auto *DVR : DPUsers) {
+    DPMarker *Marker = DVR->getMarker();
     if (Marker->getParent() != BB)
-      DPV->replaceVariableLocationOp(V, New);
+      DVR->replaceVariableLocationOp(V, New);
   }
 }
 
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index 2b9dc745d7bfc5..1e16e864846241 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -180,21 +180,21 @@ struct VerifierSupport {
     }
   }
 
-  void Write(DPValue::LocationType Type) {
+  void Write(DbgVariableRecord::LocationType Type) {
     switch (Type) {
-    case DPValue::LocationType::Value:
+    case DbgVariableRecord::LocationType::Value:
       *OS << "value";
       break;
-    case DPValue::LocationType::Declare:
+    case DbgVariableRecord::LocationType::Declare:
       *OS << "declare";
       break;
-    case DPValue::LocationType::Assign:
+    case DbgVariableRecord::LocationType::Assign:
       *OS << "assign";
       break;
-    case DPValue::LocationType::End:
+    case DbgVariableRecord::LocationType::End:
       *OS << "end";
       break;
-    case DPValue::LocationType::Any:
+    case DbgVariableRecord::LocationType::Any:
       *OS << "any";
       break;
     };
@@ -545,7 +545,7 @@ class Verifier : public InstVisitor<Verifier>, VerifierSupport {
   void visitTemplateParams(const MDNode &N, const Metadata &RawParams);
 
   void visit(DPLabel &DPL);
-  void visit(DPValue &DPV);
+  void visit(DbgVariableRecord &DVR);
   // InstVisitor overrides...
   using InstVisitor<Verifier>::visit;
   void visitDbgRecords(Instruction &I);
@@ -632,15 +632,15 @@ class Verifier : public InstVisitor<Verifier>, VerifierSupport {
   void verifySiblingFuncletUnwinds();
 
   void verifyFragmentExpression(const DbgVariableIntrinsic &I);
-  void verifyFragmentExpression(const DPValue &I);
+  void verifyFragmentExpression(const DbgVariableRecord &I);
   template <typename ValueOrMetadata>
   void verifyFragmentExpression(const DIVariable &V,
                                 DIExpression::FragmentInfo Fragment,
                                 ValueOrMetadata *Desc);
   void verifyFnArgs(const DbgVariableIntrinsic &I);
-  void verifyFnArgs(const DPValue &DPV);
+  void verifyFnArgs(const DbgVariableRecord &DVR);
   void verifyNotEntryValue(const DbgVariableIntrinsic &I);
-  void verifyNotEntryValue(const DPValue &I);
+  void verifyNotEntryValue(const DbgVariableRecord &I);
 
   /// Module-level debug info verification...
   void verifyCompileUnits();
@@ -690,12 +690,12 @@ void Verifier::visitDbgRecords(Instruction &I) {
     if (auto *Loc =
             dyn_cast_or_null<DILocation>(DR.getDebugLoc().getAsMDNode()))
       visitMDNode(*Loc, AreDebugLocsAllowed::Yes);
-    if (auto *DPV = dyn_cast<DPValue>(&DR)) {
-      visit(*DPV);
+    if (auto *DVR = dyn_cast<DbgVariableRecord>(&DR)) {
+      visit(*DVR);
       // These have to appear after `visit` for consistency with existing
       // intrinsic behaviour.
-      verifyFragmentExpression(*DPV);
-      verifyNotEntryValue(*DPV);
+      verifyFragmentExpression(*DVR);
+      verifyNotEntryValue(*DVR);
     } else if (auto *DPL = dyn_cast<DPLabel>(&DR)) {
       visit(*DPL);
     }
@@ -1235,6 +1235,7 @@ void Verifier::visitDIDerivedType(const DIDerivedType &N) {
               N.getTag() == dwarf::DW_TAG_volatile_type ||
               N.getTag() == dwarf::DW_TAG_restrict_type ||
               N.getTag() == dwarf::DW_TAG_atomic_type ||
+              N.getTag() == dwarf::DW_TAG_LLVM_ptrauth_type ||
               N.getTag() == dwarf::DW_TAG_member ||
               (N.getTag() == dwarf::DW_TAG_variable && N.isStaticMember()) ||
               N.getTag() == dwarf::DW_TAG_inheritance ||
@@ -4807,11 +4808,12 @@ void Verifier::visitDIAssignIDMetadata(Instruction &I, MDNode *MD) {
                 "dbg.assign not in same function as inst", DAI, &I);
     }
   }
-  for (DPValue *DPV : cast<DIAssignID>(MD)->getAllDPValueUsers()) {
-    CheckDI(DPV->isDbgAssign(),
-            "!DIAssignID should only be used by Assign DPVs.", MD, DPV);
-    CheckDI(DPV->getFunction() == I.getFunction(),
-            "DPVAssign not in same function as inst", DPV, &I);
+  for (DbgVariableRecord *DVR :
+       cast<DIAssignID>(MD)->getAllDbgVariableRecordUsers()) {
+    CheckDI(DVR->isDbgAssign(),
+            "!DIAssignID should only be used by Assign DVRs.", MD, DVR);
+    CheckDI(DVR->getFunction() == I.getFunction(),
+            "DVRAssign not in same function as inst", DVR, &I);
   }
 }
 
@@ -5265,6 +5267,29 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
     }
     break;
   }
+  case Intrinsic::ucmp:
+  case Intrinsic::scmp: {
+    Type *SrcTy = Call.getOperand(0)->getType();
+    Type *DestTy = Call.getType();
+
+    Check(DestTy->getScalarSizeInBits() >= 2,
+          "result type must be at least 2 bits wide", Call);
+
+    bool IsDestTypeVector = DestTy->isVectorTy();
+    Check(SrcTy->isVectorTy() == IsDestTypeVector,
+          "ucmp/scmp argument and result types must both be either vector or "
+          "scalar types",
+          Call);
+    if (IsDestTypeVector) {
+      auto SrcVecLen = cast<VectorType>(SrcTy)->getElementCount();
+      auto DestVecLen = cast<VectorType>(DestTy)->getElementCount();
+      Check(SrcVecLen == DestVecLen,
+            "return type and arguments must have the same number of "
+            "elements",
+            Call);
+    }
+    break;
+  }
   case Intrinsic::coro_id: {
     auto *InfoArg = Call.getArgOperand(3)->stripPointerCasts();
     if (isa<ConstantPointerNull>(InfoArg))
@@ -6247,71 +6272,72 @@ void Verifier::visit(DPLabel &DPL) {
           Loc->getScope()->getSubprogram());
 }
 
-void Verifier::visit(DPValue &DPV) {
-  BasicBlock *BB = DPV.getParent();
+void Verifier::visit(DbgVariableRecord &DVR) {
+  BasicBlock *BB = DVR.getParent();
   Function *F = BB->getParent();
 
-  CheckDI(DPV.getType() == DPValue::LocationType::Value ||
-              DPV.getType() == DPValue::LocationType::Declare ||
-              DPV.getType() == DPValue::LocationType::Assign,
-          "invalid #dbg record type", &DPV, DPV.getType());
+  CheckDI(DVR.getType() == DbgVariableRecord::LocationType::Value ||
+              DVR.getType() == DbgVariableRecord::LocationType::Declare ||
+              DVR.getType() == DbgVariableRecord::LocationType::Assign,
+          "invalid #dbg record type", &DVR, DVR.getType());
 
-  // The location for a DPValue must be either a ValueAsMetadata, DIArgList, or
-  // an empty MDNode (which is a legacy representation for an "undef" location).
-  auto *MD = DPV.getRawLocation();
+  // The location for a DbgVariableRecord must be either a ValueAsMetadata,
+  // DIArgList, or an empty MDNode (which is a legacy representation for an
+  // "undef" location).
+  auto *MD = DVR.getRawLocation();
   CheckDI(MD && (isa<ValueAsMetadata>(MD) || isa<DIArgList>(MD) ||
                  (isa<MDNode>(MD) && !cast<MDNode>(MD)->getNumOperands())),
-          "invalid #dbg record address/value", &DPV, MD);
+          "invalid #dbg record address/value", &DVR, MD);
   if (auto *VAM = dyn_cast<ValueAsMetadata>(MD))
     visitValueAsMetadata(*VAM, F);
   else if (auto *AL = dyn_cast<DIArgList>(MD))
     visitDIArgList(*AL, F);
 
-  CheckDI(isa_and_nonnull<DILocalVariable>(DPV.getRawVariable()),
-          "invalid #dbg record variable", &DPV, DPV.getRawVariable());
-  visitMDNode(*DPV.getRawVariable(), AreDebugLocsAllowed::No);
+  CheckDI(isa_and_nonnull<DILocalVariable>(DVR.getRawVariable()),
+          "invalid #dbg record variable", &DVR, DVR.getRawVariable());
+  visitMDNode(*DVR.getRawVariable(), AreDebugLocsAllowed::No);
 
-  CheckDI(isa_and_nonnull<DIExpression>(DPV.getRawExpression()),
-          "invalid #dbg record expression", &DPV, DPV.getRawExpression());
-  visitMDNode(*DPV.getExpression(), AreDebugLocsAllowed::No);
+  CheckDI(isa_and_nonnull<DIExpression>(DVR.getRawExpression()),
+          "invalid #dbg record expression", &DVR, DVR.getRawExpression());
+  visitMDNode(*DVR.getExpression(), AreDebugLocsAllowed::No);
 
-  if (DPV.isDbgAssign()) {
-    CheckDI(isa_and_nonnull<DIAssignID>(DPV.getRawAssignID()),
-            "invalid #dbg_assign DIAssignID", &DPV, DPV.getRawAssignID());
-    visitMDNode(*cast<DIAssignID>(DPV.getRawAssignID()),
+  if (DVR.isDbgAssign()) {
+    CheckDI(isa_and_nonnull<DIAssignID>(DVR.getRawAssignID()),
+            "invalid #dbg_assign DIAssignID", &DVR, DVR.getRawAssignID());
+    visitMDNode(*cast<DIAssignID>(DVR.getRawAssignID()),
                 AreDebugLocsAllowed::No);
 
-    const auto *RawAddr = DPV.getRawAddress();
-    // Similarly to the location above, the address for an assign DPValue must
-    // be a ValueAsMetadata or an empty MDNode, which represents an undef
-    // address.
+    const auto *RawAddr = DVR.getRawAddress();
+    // Similarly to the location above, the address for an assign
+    // DbgVariableRecord must be a ValueAsMetadata or an empty MDNode, which
+    // represents an undef address.
     CheckDI(
         isa<ValueAsMetadata>(RawAddr) ||
             (isa<MDNode>(RawAddr) && !cast<MDNode>(RawAddr)->getNumOperands()),
-        "invalid #dbg_assign address", &DPV, DPV.getRawAddress());
+        "invalid #dbg_assign address", &DVR, DVR.getRawAddress());
     if (auto *VAM = dyn_cast<ValueAsMetadata>(RawAddr))
       visitValueAsMetadata(*VAM, F);
 
-    CheckDI(isa_and_nonnull<DIExpression>(DPV.getRawAddressExpression()),
-            "invalid #dbg_assign address expression", &DPV,
-            DPV.getRawAddressExpression());
-    visitMDNode(*DPV.getAddressExpression(), AreDebugLocsAllowed::No);
+    CheckDI(isa_and_nonnull<DIExpression>(DVR.getRawAddressExpression()),
+            "invalid #dbg_assign address expression", &DVR,
+            DVR.getRawAddressExpression());
+    visitMDNode(*DVR.getAddressExpression(), AreDebugLocsAllowed::No);
 
-    // All of the linked instructions should be in the same function as DPV.
-    for (Instruction *I : at::getAssignmentInsts(&DPV))
-      CheckDI(DPV.getFunction() == I->getFunction(),
-              "inst not in same function as #dbg_assign", I, &DPV);
+    // All of the linked instructions should be in the same function as DVR.
+    for (Instruction *I : at::getAssignmentInsts(&DVR))
+      CheckDI(DVR.getFunction() == I->getFunction(),
+              "inst not in same function as #dbg_assign", I, &DVR);
   }
 
   // This check is redundant with one in visitLocalVariable().
-  DILocalVariable *Var = DPV.getVariable();
+  DILocalVariable *Var = DVR.getVariable();
   CheckDI(isType(Var->getRawType()), "invalid type ref", Var,
           Var->getRawType());
 
-  auto *DLNode = DPV.getDebugLoc().getAsMDNode();
+  auto *DLNode = DVR.getDebugLoc().getAsMDNode();
   CheckDI(isa_and_nonnull<DILocation>(DLNode), "invalid #dbg record DILocation",
-          &DPV, DLNode);
-  DILocation *Loc = DPV.getDebugLoc();
+          &DVR, DLNode);
+  DILocation *Loc = DVR.getDebugLoc();
 
   // The scopes for variables and !dbg attachments must agree.
   DISubprogram *VarSP = getSubprogram(Var->getRawScope());
@@ -6321,10 +6347,10 @@ void Verifier::visit(DPValue &DPV) {
 
   CheckDI(VarSP == LocSP,
           "mismatched subprogram between #dbg record variable and DILocation",
-          &DPV, BB, F, Var, Var->getScope()->getSubprogram(), Loc,
+          &DVR, BB, F, Var, Var->getScope()->getSubprogram(), Loc,
           Loc->getScope()->getSubprogram());
 
-  verifyFnArgs(DPV);
+  verifyFnArgs(DVR);
 }
 
 void Verifier::visitVPIntrinsic(VPIntrinsic &VPI) {
@@ -6685,9 +6711,9 @@ void Verifier::verifyFragmentExpression(const DbgVariableIntrinsic &I) {
 
   verifyFragmentExpression(*V, *Fragment, &I);
 }
-void Verifier::verifyFragmentExpression(const DPValue &DPV) {
-  DILocalVariable *V = dyn_cast_or_null<DILocalVariable>(DPV.getRawVariable());
-  DIExpression *E = dyn_cast_or_null<DIExpression>(DPV.getRawExpression());
+void Verifier::verifyFragmentExpression(const DbgVariableRecord &DVR) {
+  DILocalVariable *V = dyn_cast_or_null<DILocalVariable>(DVR.getRawVariable());
+  DIExpression *E = dyn_cast_or_null<DIExpression>(DVR.getRawExpression());
 
   // We don't know whether this intrinsic verified correctly.
   if (!V || !E || !E->isValid())
@@ -6707,7 +6733,7 @@ void Verifier::verifyFragmentExpression(const DPValue &DPV) {
   if (V->isArtificial())
     return;
 
-  verifyFragmentExpression(*V, *Fragment, &DPV);
+  verifyFragmentExpression(*V, *Fragment, &DVR);
 }
 
 template <typename ValueOrMetadata>
@@ -6755,7 +6781,7 @@ void Verifier::verifyFnArgs(const DbgVariableIntrinsic &I) {
   CheckDI(!Prev || (Prev == Var), "conflicting debug info for argument", &I,
           Prev, Var);
 }
-void Verifier::verifyFnArgs(const DPValue &DPV) {
+void Verifier::verifyFnArgs(const DbgVariableRecord &DVR) {
   // This function does not take the scope of noninlined function arguments into
   // account. Don't run it if current function is nodebug, because it may
   // contain inlined debug intrinsics.
@@ -6763,10 +6789,10 @@ void Verifier::verifyFnArgs(const DPValue &DPV) {
     return;
 
   // For performance reasons only check non-inlined ones.
-  if (DPV.getDebugLoc()->getInlinedAt())
+  if (DVR.getDebugLoc()->getInlinedAt())
     return;
 
-  DILocalVariable *Var = DPV.getVariable();
+  DILocalVariable *Var = DVR.getVariable();
   CheckDI(Var, "#dbg record without variable");
 
   unsigned ArgNo = Var->getArg();
@@ -6780,7 +6806,7 @@ void Verifier::verifyFnArgs(const DPValue &DPV) {
 
   auto *Prev = DebugFnArgs[ArgNo - 1];
   DebugFnArgs[ArgNo - 1] = Var;
-  CheckDI(!Prev || (Prev == Var), "conflicting debug info for argument", &DPV,
+  CheckDI(!Prev || (Prev == Var), "conflicting debug info for argument", &DVR,
           Prev, Var);
 }
 
@@ -6807,15 +6833,15 @@ void Verifier::verifyNotEntryValue(const DbgVariableIntrinsic &I) {
           "swiftasync Argument",
           &I);
 }
-void Verifier::verifyNotEntryValue(const DPValue &DPV) {
-  DIExpression *E = dyn_cast_or_null<DIExpression>(DPV.getRawExpression());
+void Verifier::verifyNotEntryValue(const DbgVariableRecord &DVR) {
+  DIExpression *E = dyn_cast_or_null<DIExpression>(DVR.getRawExpression());
 
   // We don't know whether this intrinsic verified correctly.
   if (!E || !E->isValid())
     return;
 
-  if (isa<ValueAsMetadata>(DPV.getRawLocation())) {
-    Value *VarValue = DPV.getVariableLocationOp(0);
+  if (isa<ValueAsMetadata>(DVR.getRawLocation())) {
+    Value *VarValue = DVR.getVariableLocationOp(0);
     if (isa<UndefValue>(VarValue) || isa<PoisonValue>(VarValue))
       return;
     // We allow EntryValues for swift async arguments, as they have an
@@ -6828,7 +6854,7 @@ void Verifier::verifyNotEntryValue(const DPValue &DPV) {
   CheckDI(!E->isEntryValue(),
           "Entry values are only allowed in MIR unless they target a "
           "swiftasync Argument",
-          &DPV);
+          &DVR);
 }
 
 void Verifier::verifyCompileUnits() {
diff --git a/llvm/lib/Linker/IRMover.cpp b/llvm/lib/Linker/IRMover.cpp
index a9afb6f7348a8f..a7e6db82e5c23c 100644
--- a/llvm/lib/Linker/IRMover.cpp
+++ b/llvm/lib/Linker/IRMover.cpp
@@ -8,6 +8,7 @@
 
 #include "llvm/Linker/IRMover.h"
 #include "LinkDiagnosticInfo.h"
+#include "llvm/ADT/ScopeExit.h"
 #include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallString.h"
@@ -1546,7 +1547,23 @@ Error IRLinker::run() {
     if (Error Err = SrcM->getMaterializer()->materializeMetadata())
       return Err;
 
-  DstM.IsNewDbgInfoFormat = SrcM->IsNewDbgInfoFormat;
+  // Convert source module to match dest for the duration of the link.
+  bool SrcModuleNewDbgFormat = SrcM->IsNewDbgInfoFormat;
+  if (DstM.IsNewDbgInfoFormat != SrcM->IsNewDbgInfoFormat) {
+    if (DstM.IsNewDbgInfoFormat)
+      SrcM->convertToNewDbgValues();
+    else
+      SrcM->convertFromNewDbgValues();
+  }
+  // Undo debug mode conversion afterwards.
+  auto Cleanup = make_scope_exit([&]() {
+    if (SrcModuleNewDbgFormat != SrcM->IsNewDbgInfoFormat) {
+      if (SrcModuleNewDbgFormat)
+        SrcM->convertToNewDbgValues();
+      else
+        SrcM->convertFromNewDbgValues();
+    }
+  });
 
   // Inherit the target data from the source module if the destination module
   // doesn't have one already.
@@ -1606,11 +1623,6 @@ Error IRLinker::run() {
   // Loop over all of the linked values to compute type mappings.
   computeTypeMapping();
 
-  // Convert module level attributes to function level attributes because
-  // after merging modules the attributes might change and would have different
-  // effect on the functions as the original module would have.
-  CopyModuleAttrToFunctions(*SrcM);
-
   std::reverse(Worklist.begin(), Worklist.end());
   while (!Worklist.empty()) {
     GlobalValue *GV = Worklist.back();
@@ -1780,8 +1792,6 @@ IRMover::IRMover(Module &M) : Composite(M) {
 Error IRMover::move(std::unique_ptr<Module> Src,
                     ArrayRef<GlobalValue *> ValuesToLink,
                     LazyCallback AddLazyFor, bool IsPerformingImport) {
-  if (getModule().IsNewDbgInfoFormat)
-    Src->convertToNewDbgValues();
   IRLinker TheIRLinker(Composite, SharedMDs, IdentifiedStructTypes,
                        std::move(Src), ValuesToLink, std::move(AddLazyFor),
                        IsPerformingImport);
diff --git a/llvm/lib/MC/SPIRVObjectWriter.cpp b/llvm/lib/MC/SPIRVObjectWriter.cpp
index 39856e96e9be51..d72d6e07f2e6fd 100644
--- a/llvm/lib/MC/SPIRVObjectWriter.cpp
+++ b/llvm/lib/MC/SPIRVObjectWriter.cpp
@@ -43,18 +43,14 @@ class SPIRVObjectWriter : public MCObjectWriter {
 
 void SPIRVObjectWriter::writeHeader(const MCAssembler &Asm) {
   constexpr uint32_t MagicNumber = 0x07230203;
-
-  // TODO: set the version on a min-necessary basis (just like the translator
-  // does) requires some refactoring of MCAssembler::VersionInfoType.
-  constexpr uint32_t Major = 1;
-  constexpr uint32_t Minor = 0;
-  constexpr uint32_t VersionNumber = 0 | (Major << 16) | (Minor << 8);
-  // TODO: check if we could use anything other than 0 (spec allows).
   constexpr uint32_t GeneratorMagicNumber = 0;
-  // TODO: do not hardcode this as well.
-  constexpr uint32_t Bound = 900;
   constexpr uint32_t Schema = 0;
 
+  // Construct SPIR-V version and Bound
+  const MCAssembler::VersionInfoType &VIT = Asm.getVersionInfo();
+  uint32_t VersionNumber = 0 | (VIT.Major << 16) | (VIT.Minor << 8);
+  uint32_t Bound = VIT.Update;
+
   W.write<uint32_t>(MagicNumber);
   W.write<uint32_t>(VersionNumber);
   W.write<uint32_t>(GeneratorMagicNumber);
diff --git a/llvm/lib/Object/ArchiveWriter.cpp b/llvm/lib/Object/ArchiveWriter.cpp
index aa57e55de70c8d..97d0e4f75be8e0 100644
--- a/llvm/lib/Object/ArchiveWriter.cpp
+++ b/llvm/lib/Object/ArchiveWriter.cpp
@@ -795,23 +795,31 @@ computeMemberData(raw_ostream &StringTable, raw_ostream &SymNames,
       Entry.second = Entry.second > 1 ? 1 : 0;
   }
 
+  std::vector<std::unique_ptr<SymbolicFile>> SymFiles;
+
+  if (NeedSymbols != SymtabWritingMode::NoSymtab || isAIXBigArchive(Kind)) {
+    for (const NewArchiveMember &M : NewMembers) {
+      Expected<std::unique_ptr<SymbolicFile>> SymFileOrErr =
+          getSymbolicFile(M.Buf->getMemBufferRef(), Context);
+      if (!SymFileOrErr)
+        return createFileError(M.MemberName, SymFileOrErr.takeError());
+      SymFiles.push_back(std::move(*SymFileOrErr));
+    }
+  }
+
   // The big archive format needs to know the offset of the previous member
   // header.
   uint64_t PrevOffset = 0;
   uint64_t NextMemHeadPadSize = 0;
-  std::unique_ptr<SymbolicFile> CurSymFile;
-  std::unique_ptr<SymbolicFile> NextSymFile;
-  uint16_t Index = 0;
 
-  for (auto M = NewMembers.begin(); M < NewMembers.end(); ++M) {
+  for (uint32_t Index = 0; Index < NewMembers.size(); ++Index) {
+    const NewArchiveMember *M = &NewMembers[Index];
     std::string Header;
     raw_string_ostream Out(Header);
 
     MemoryBufferRef Buf = M->Buf->getMemBufferRef();
     StringRef Data = Thin ? "" : Buf.getBuffer();
 
-    Index++;
-
     // ld64 expects the members to be 8-byte aligned for 64-bit content and at
     // least 4-byte aligned for 32-bit content.  Opt for the larger encoding
     // uniformly.  This matches the behaviour with cctools and ensures that ld64
@@ -837,29 +845,9 @@ computeMemberData(raw_ostream &StringTable, raw_ostream &SymNames,
           std::move(StringMsg), object::object_error::parse_failed);
     }
 
-    if (NeedSymbols != SymtabWritingMode::NoSymtab || isAIXBigArchive(Kind)) {
-      auto SetNextSymFile = [&NextSymFile,
-                             &Context](MemoryBufferRef Buf,
-                                       StringRef MemberName) -> Error {
-        Expected<std::unique_ptr<SymbolicFile>> SymFileOrErr =
-            getSymbolicFile(Buf, Context);
-        if (!SymFileOrErr)
-          return createFileError(MemberName, SymFileOrErr.takeError());
-        NextSymFile = std::move(*SymFileOrErr);
-        return Error::success();
-      };
-
-      if (M == NewMembers.begin())
-        if (Error Err = SetNextSymFile(Buf, M->MemberName))
-          return std::move(Err);
-
-      CurSymFile = std::move(NextSymFile);
-
-      if ((M + 1) != NewMembers.end())
-        if (Error Err = SetNextSymFile((M + 1)->Buf->getMemBufferRef(),
-                                       (M + 1)->MemberName))
-          return std::move(Err);
-    }
+    std::unique_ptr<SymbolicFile> CurSymFile;
+    if (!SymFiles.empty())
+      CurSymFile = std::move(SymFiles[Index]);
 
     // In the big archive file format, we need to calculate and include the next
     // member offset and previous member offset in the file member header.
@@ -880,13 +868,13 @@ computeMemberData(raw_ostream &StringTable, raw_ostream &SymNames,
 
       // If there is another member file after this, we need to calculate the
       // padding before the header.
-      if ((M + 1) != NewMembers.end()) {
-        uint64_t OffsetToNextMemData = NextOffset +
-                                       sizeof(object::BigArMemHdrType) +
-                                       alignTo((M + 1)->MemberName.size(), 2);
+      if (Index + 1 != SymFiles.size()) {
+        uint64_t OffsetToNextMemData =
+            NextOffset + sizeof(object::BigArMemHdrType) +
+            alignTo(NewMembers[Index + 1].MemberName.size(), 2);
         NextMemHeadPadSize =
             alignToPowerOf2(OffsetToNextMemData,
-                            getMemberAlignment(NextSymFile.get())) -
+                            getMemberAlignment(SymFiles[Index + 1].get())) -
             OffsetToNextMemData;
         NextOffset += NextMemHeadPadSize;
       }
@@ -902,7 +890,7 @@ computeMemberData(raw_ostream &StringTable, raw_ostream &SymNames,
     std::vector<unsigned> Symbols;
     if (NeedSymbols != SymtabWritingMode::NoSymtab) {
       Expected<std::vector<unsigned>> SymbolsOrErr =
-          getSymbols(CurSymFile.get(), Index, SymNames, SymMap);
+          getSymbols(CurSymFile.get(), Index + 1, SymNames, SymMap);
       if (!SymbolsOrErr)
         return createFileError(M->MemberName, SymbolsOrErr.takeError());
       Symbols = std::move(*SymbolsOrErr);
diff --git a/llvm/lib/Object/ELF.cpp b/llvm/lib/Object/ELF.cpp
index 137f606dd2d46b..55dd0c8e06c092 100644
--- a/llvm/lib/Object/ELF.cpp
+++ b/llvm/lib/Object/ELF.cpp
@@ -251,7 +251,10 @@ StringRef llvm::object::getELFSectionTypeName(uint32_t Machine, unsigned Type) {
     }
     break;
   case ELF::EM_HEXAGON:
-    switch (Type) { STRINGIFY_ENUM_CASE(ELF, SHT_HEX_ORDERED); }
+    switch (Type) {
+      STRINGIFY_ENUM_CASE(ELF, SHT_HEX_ORDERED);
+      STRINGIFY_ENUM_CASE(ELF, SHT_HEXAGON_ATTRIBUTES);
+    }
     break;
   case ELF::EM_X86_64:
     switch (Type) { STRINGIFY_ENUM_CASE(ELF, SHT_X86_64_UNWIND); }
diff --git a/llvm/lib/Object/ELFObjectFile.cpp b/llvm/lib/Object/ELFObjectFile.cpp
index 33be48196ae7d2..efec612957de33 100644
--- a/llvm/lib/Object/ELFObjectFile.cpp
+++ b/llvm/lib/Object/ELFObjectFile.cpp
@@ -20,6 +20,7 @@
 #include "llvm/Support/ARMAttributeParser.h"
 #include "llvm/Support/ARMBuildAttributes.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/HexagonAttributeParser.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/RISCVAttributeParser.h"
 #include "llvm/Support/RISCVAttributes.h"
@@ -287,6 +288,81 @@ SubtargetFeatures ELFObjectFileBase::getARMFeatures() const {
   return Features;
 }
 
+static std::optional<std::string> hexagonAttrToFeatureString(unsigned Attr) {
+  switch (Attr) {
+  case 5:
+    return "v5";
+  case 55:
+    return "v55";
+  case 60:
+    return "v60";
+  case 62:
+    return "v62";
+  case 65:
+    return "v65";
+  case 67:
+    return "v67";
+  case 68:
+    return "v68";
+  case 69:
+    return "v69";
+  case 71:
+    return "v71";
+  case 73:
+    return "v73";
+  default:
+    return {};
+  }
+}
+
+SubtargetFeatures ELFObjectFileBase::getHexagonFeatures() const {
+  SubtargetFeatures Features;
+  HexagonAttributeParser Parser;
+  if (Error E = getBuildAttributes(Parser)) {
+    // Return no attributes if none can be read.
+    // This behavior is important for backwards compatibility.
+    consumeError(std::move(E));
+    return Features;
+  }
+  std::optional<unsigned> Attr;
+
+  if ((Attr = Parser.getAttributeValue(HexagonAttrs::ARCH))) {
+    if (std::optional<std::string> FeatureString =
+            hexagonAttrToFeatureString(*Attr))
+      Features.AddFeature(*FeatureString);
+  }
+
+  if ((Attr = Parser.getAttributeValue(HexagonAttrs::HVXARCH))) {
+    std::optional<std::string> FeatureString =
+        hexagonAttrToFeatureString(*Attr);
+    // There is no corresponding hvx arch for v5 and v55.
+    if (FeatureString && *Attr >= 60)
+      Features.AddFeature("hvx" + *FeatureString);
+  }
+
+  if ((Attr = Parser.getAttributeValue(HexagonAttrs::HVXIEEEFP)))
+    if (*Attr)
+      Features.AddFeature("hvx-ieee-fp");
+
+  if ((Attr = Parser.getAttributeValue(HexagonAttrs::HVXQFLOAT)))
+    if (*Attr)
+      Features.AddFeature("hvx-qfloat");
+
+  if ((Attr = Parser.getAttributeValue(HexagonAttrs::ZREG)))
+    if (*Attr)
+      Features.AddFeature("zreg");
+
+  if ((Attr = Parser.getAttributeValue(HexagonAttrs::AUDIO)))
+    if (*Attr)
+      Features.AddFeature("audio");
+
+  if ((Attr = Parser.getAttributeValue(HexagonAttrs::CABAC)))
+    if (*Attr)
+      Features.AddFeature("cabac");
+
+  return Features;
+}
+
 Expected<SubtargetFeatures> ELFObjectFileBase::getRISCVFeatures() const {
   SubtargetFeatures Features;
   unsigned PlatformFlags = getPlatformFlags();
@@ -349,6 +425,8 @@ Expected<SubtargetFeatures> ELFObjectFileBase::getFeatures() const {
     return getRISCVFeatures();
   case ELF::EM_LOONGARCH:
     return getLoongArchFeatures();
+  case ELF::EM_HEXAGON:
+    return getHexagonFeatures();
   default:
     return SubtargetFeatures();
   }
diff --git a/llvm/lib/ObjectYAML/DXContainerYAML.cpp b/llvm/lib/ObjectYAML/DXContainerYAML.cpp
index 7dc9822bdd221d..a6871e7855e4e7 100644
--- a/llvm/lib/ObjectYAML/DXContainerYAML.cpp
+++ b/llvm/lib/ObjectYAML/DXContainerYAML.cpp
@@ -24,14 +24,14 @@ static_assert((uint64_t)dxbc::FeatureFlags::NextUnusedBit <= 1ull << 63,
               "Shader flag bits exceed enum size.");
 
 DXContainerYAML::ShaderFeatureFlags::ShaderFeatureFlags(uint64_t FlagData) {
-#define SHADER_FEATURE_FLAG(Num, Val, Str)                                     \
+#define SHADER_FEATURE_FLAG(Num, DxilModuleNum, Val, Str)                      \
   Val = (FlagData & (uint64_t)dxbc::FeatureFlags::Val) > 0;
 #include "llvm/BinaryFormat/DXContainerConstants.def"
 }
 
 uint64_t DXContainerYAML::ShaderFeatureFlags::getEncodedFlags() {
   uint64_t Flag = 0;
-#define SHADER_FEATURE_FLAG(Num, Val, Str)                                     \
+#define SHADER_FEATURE_FLAG(Num, DxilModuleNum, Val, Str)                      \
   if (Val)                                                                     \
     Flag |= (uint64_t)dxbc::FeatureFlags::Val;
 #include "llvm/BinaryFormat/DXContainerConstants.def"
@@ -105,7 +105,8 @@ void MappingTraits<DXContainerYAML::DXILProgram>::mapping(
 
 void MappingTraits<DXContainerYAML::ShaderFeatureFlags>::mapping(
     IO &IO, DXContainerYAML::ShaderFeatureFlags &Flags) {
-#define SHADER_FEATURE_FLAG(Num, Val, Str) IO.mapRequired(#Val, Flags.Val);
+#define SHADER_FEATURE_FLAG(Num, DxilModuleNum, Val, Str)                      \
+  IO.mapRequired(#Val, Flags.Val);
 #include "llvm/BinaryFormat/DXContainerConstants.def"
 }
 
diff --git a/llvm/lib/ObjectYAML/ELFYAML.cpp b/llvm/lib/ObjectYAML/ELFYAML.cpp
index 9c1a28db592a1c..045211c44b9079 100644
--- a/llvm/lib/ObjectYAML/ELFYAML.cpp
+++ b/llvm/lib/ObjectYAML/ELFYAML.cpp
@@ -716,6 +716,7 @@ void ScalarEnumerationTraits<ELFYAML::ELF_SHT>::enumeration(
     break;
   case ELF::EM_HEXAGON:
     ECase(SHT_HEX_ORDERED);
+    ECase(SHT_HEXAGON_ATTRIBUTES);
     break;
   case ELF::EM_X86_64:
     ECase(SHT_X86_64_UNWIND);
diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
index 4d1eb10d2d41c6..9d98ae7dde520f 100644
--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@@ -358,7 +358,7 @@ class TriggerVerifierErrorPass
     // Intentionally break the Function by inserting a terminator
     // instruction in the middle of a basic block.
     BasicBlock &BB = F.getEntryBlock();
-    new UnreachableInst(F.getContext(), BB.getTerminator());
+    new UnreachableInst(F.getContext(), BB.getTerminator()->getIterator());
     return PreservedAnalyses::none();
   }
 
@@ -503,15 +503,6 @@ static std::optional<int> parseDevirtPassName(StringRef Name) {
   return Count;
 }
 
-static bool checkParametrizedPassName(StringRef Name, StringRef PassName) {
-  if (!Name.consume_front(PassName))
-    return false;
-  // normal pass name w/o parameters == default parameters
-  if (Name.empty())
-    return true;
-  return Name.starts_with("<") && Name.ends_with(">");
-}
-
 static std::optional<OptimizationLevel> parseOptLevel(StringRef S) {
   return StringSwitch<std::optional<OptimizationLevel>>(S)
       .Case("O0", OptimizationLevel::O0)
@@ -525,42 +516,6 @@ static std::optional<OptimizationLevel> parseOptLevel(StringRef S) {
 
 namespace {
 
-/// This performs customized parsing of pass name with parameters.
-///
-/// We do not need parametrization of passes in textual pipeline very often,
-/// yet on a rare occasion ability to specify parameters right there can be
-/// useful.
-///
-/// \p Name - parameterized specification of a pass from a textual pipeline
-/// is a string in a form of :
-///      PassName '<' parameter-list '>'
-///
-/// Parameter list is being parsed by the parser callable argument, \p Parser,
-/// It takes a string-ref of parameters and returns either StringError or a
-/// parameter list in a form of a custom parameters type, all wrapped into
-/// Expected<> template class.
-///
-template <typename ParametersParseCallableT>
-auto parsePassParameters(ParametersParseCallableT &&Parser, StringRef Name,
-                         StringRef PassName) -> decltype(Parser(StringRef{})) {
-  using ParametersT = typename decltype(Parser(StringRef{}))::value_type;
-
-  StringRef Params = Name;
-  if (!Params.consume_front(PassName)) {
-    assert(false &&
-           "unable to strip pass name from parametrized pass specification");
-  }
-  if (!Params.empty() &&
-      (!Params.consume_front("<") || !Params.consume_back(">"))) {
-    assert(false && "invalid format for parametrized pass name");
-  }
-
-  Expected<ParametersT> Result = Parser(Params);
-  assert((Result || Result.template errorIsA<StringError>()) &&
-         "Pass parameter parser can only return StringErrors.");
-  return Result;
-}
-
 /// Parser of parameters for HardwareLoops  pass.
 Expected<HardwareLoopOptions> parseHardwareLoopOptions(StringRef Params) {
   HardwareLoopOptions HardwareLoopOpts;
@@ -1196,7 +1151,7 @@ static bool isModulePassName(StringRef Name, CallbacksT &Callbacks) {
   if (Name == NAME)                                                            \
     return true;
 #define MODULE_PASS_WITH_PARAMS(NAME, CLASS, CREATE_PASS, PARSER, PARAMS)      \
-  if (checkParametrizedPassName(Name, NAME))                                   \
+  if (PassBuilder::checkParametrizedPassName(Name, NAME))                      \
     return true;
 #define MODULE_ANALYSIS(NAME, CREATE_PASS)                                     \
   if (Name == "require<" NAME ">" || Name == "invalidate<" NAME ">")           \
@@ -1225,7 +1180,7 @@ static bool isCGSCCPassName(StringRef Name, CallbacksT &Callbacks) {
   if (Name == NAME)                                                            \
     return true;
 #define CGSCC_PASS_WITH_PARAMS(NAME, CLASS, CREATE_PASS, PARSER, PARAMS)       \
-  if (checkParametrizedPassName(Name, NAME))                                   \
+  if (PassBuilder::checkParametrizedPassName(Name, NAME))                      \
     return true;
 #define CGSCC_ANALYSIS(NAME, CREATE_PASS)                                      \
   if (Name == "require<" NAME ">" || Name == "invalidate<" NAME ">")           \
@@ -1252,7 +1207,7 @@ static bool isFunctionPassName(StringRef Name, CallbacksT &Callbacks) {
   if (Name == NAME)                                                            \
     return true;
 #define FUNCTION_PASS_WITH_PARAMS(NAME, CLASS, CREATE_PASS, PARSER, PARAMS)    \
-  if (checkParametrizedPassName(Name, NAME))                                   \
+  if (PassBuilder::checkParametrizedPassName(Name, NAME))                      \
     return true;
 #define FUNCTION_ANALYSIS(NAME, CREATE_PASS)                                   \
   if (Name == "require<" NAME ">" || Name == "invalidate<" NAME ">")           \
@@ -1293,7 +1248,7 @@ static bool isLoopNestPassName(StringRef Name, CallbacksT &Callbacks,
   if (parseRepeatPassName(Name))
     return true;
 
-  if (checkParametrizedPassName(Name, "lnicm")) {
+  if (PassBuilder::checkParametrizedPassName(Name, "lnicm")) {
     UseMemorySSA = true;
     return true;
   }
@@ -1315,7 +1270,7 @@ static bool isLoopPassName(StringRef Name, CallbacksT &Callbacks,
   if (parseRepeatPassName(Name))
     return true;
 
-  if (checkParametrizedPassName(Name, "licm")) {
+  if (PassBuilder::checkParametrizedPassName(Name, "licm")) {
     UseMemorySSA = true;
     return true;
   }
@@ -1324,7 +1279,7 @@ static bool isLoopPassName(StringRef Name, CallbacksT &Callbacks,
   if (Name == NAME)                                                            \
     return true;
 #define LOOP_PASS_WITH_PARAMS(NAME, CLASS, CREATE_PASS, PARSER, PARAMS)        \
-  if (checkParametrizedPassName(Name, NAME))                                   \
+  if (PassBuilder::checkParametrizedPassName(Name, NAME))                      \
     return true;
 #define LOOP_ANALYSIS(NAME, CREATE_PASS)                                       \
   if (Name == "require<" NAME ">" || Name == "invalidate<" NAME ">")           \
diff --git a/llvm/lib/ProfileData/InstrProfWriter.cpp b/llvm/lib/ProfileData/InstrProfWriter.cpp
index e757c735719b1f..d9fe88a00bdfc4 100644
--- a/llvm/lib/ProfileData/InstrProfWriter.cpp
+++ b/llvm/lib/ProfileData/InstrProfWriter.cpp
@@ -60,16 +60,16 @@ class ProfOStream {
   // \c patch can only be called when all data is written and flushed.
   // For raw_string_ostream, the patch is done on the target string
   // directly and it won't be reflected in the stream's internal buffer.
-  void patch(PatchItem *P, int NItems) {
+  void patch(ArrayRef<PatchItem> P) {
     using namespace support;
 
     if (IsFDOStream) {
       raw_fd_ostream &FDOStream = static_cast<raw_fd_ostream &>(OS);
       const uint64_t LastPos = FDOStream.tell();
-      for (int K = 0; K < NItems; K++) {
-        FDOStream.seek(P[K].Pos);
-        for (int I = 0; I < P[K].N; I++)
-          write(P[K].D[I]);
+      for (const auto &K : P) {
+        FDOStream.seek(K.Pos);
+        for (int I = 0; I < K.N; I++)
+          write(K.D[I]);
       }
       // Reset the stream to the last position after patching so that users
       // don't accidentally overwrite data. This makes it consistent with
@@ -78,11 +78,11 @@ class ProfOStream {
     } else {
       raw_string_ostream &SOStream = static_cast<raw_string_ostream &>(OS);
       std::string &Data = SOStream.str(); // with flush
-      for (int K = 0; K < NItems; K++) {
-        for (int I = 0; I < P[K].N; I++) {
+      for (const auto &K : P) {
+        for (int I = 0; I < K.N; I++) {
           uint64_t Bytes =
-              endian::byte_swap<uint64_t, llvm::endianness::little>(P[K].D[I]);
-          Data.replace(P[K].Pos + I * sizeof(uint64_t), sizeof(uint64_t),
+              endian::byte_swap<uint64_t, llvm::endianness::little>(K.D[I]);
+          Data.replace(K.Pos + I * sizeof(uint64_t), sizeof(uint64_t),
                        (const char *)&Bytes, sizeof(uint64_t));
         }
       }
@@ -575,7 +575,7 @@ Error InstrProfWriter::writeImpl(ProfOStream &OS) {
         {MemProfSectionStart + sizeof(uint64_t), &FramePayloadOffset, 1},
         {MemProfSectionStart + 2 * sizeof(uint64_t), &FrameTableOffset, 1},
     };
-    OS.patch(PatchItems, 3);
+    OS.patch(PatchItems);
   }
 
   // BinaryIdSection has two parts:
@@ -693,7 +693,7 @@ Error InstrProfWriter::writeImpl(ProfOStream &OS) {
         {CSSummaryOffset, reinterpret_cast<uint64_t *>(TheCSSummary.get()),
          (int)CSSummarySize}};
 
-    OS.patch(PatchItems, std::size(PatchItems));
+    OS.patch(PatchItems);
   } else {
     // Now do the final patch:
     PatchItem PatchItems[] = {
@@ -713,7 +713,7 @@ Error InstrProfWriter::writeImpl(ProfOStream &OS) {
         {CSSummaryOffset, reinterpret_cast<uint64_t *>(TheCSSummary.get()),
          (int)CSSummarySize}};
 
-    OS.patch(PatchItems, std::size(PatchItems));
+    OS.patch(PatchItems);
   }
 
   for (const auto &I : FunctionData)
diff --git a/llvm/lib/Support/APInt.cpp b/llvm/lib/Support/APInt.cpp
index 7053f3b87682f8..7a383c3236d9b1 100644
--- a/llvm/lib/Support/APInt.cpp
+++ b/llvm/lib/Support/APInt.cpp
@@ -2022,6 +2022,13 @@ APInt APInt::ushl_ov(unsigned ShAmt, bool &Overflow) const {
   return *this << ShAmt;
 }
 
+APInt APInt::sfloordiv_ov(const APInt &RHS, bool &Overflow) const {
+  APInt quotient = sdiv_ov(RHS, Overflow);
+  if ((quotient * RHS != *this) && (isNegative() != RHS.isNegative()))
+    return quotient - 1;
+  return quotient;
+}
+
 APInt APInt::sadd_sat(const APInt &RHS) const {
   bool Overflow;
   APInt Res = sadd_ov(RHS, Overflow);
@@ -3096,37 +3103,21 @@ void llvm::LoadIntFromMemory(APInt &IntVal, const uint8_t *Src,
 }
 
 APInt APIntOps::avgFloorS(const APInt &C1, const APInt &C2) {
-  // Return floor((C1 + C2)/2)
-  assert(C1.getBitWidth() == C2.getBitWidth() && "Unequal bitwidths");
-  unsigned FullWidth = C1.getBitWidth() + 1;
-  APInt C1Ext = C1.sext(FullWidth);
-  APInt C2Ext = C2.sext(FullWidth);
-  return (C1Ext + C2Ext).extractBits(C1.getBitWidth(), 1);
+  // Return floor((C1 + C2) / 2)
+  return (C1 & C2) + (C1 ^ C2).ashr(1);
 }
 
 APInt APIntOps::avgFloorU(const APInt &C1, const APInt &C2) {
-  // Return floor((C1 + C2)/2)
-  assert(C1.getBitWidth() == C2.getBitWidth() && "Unequal bitwidths");
-  unsigned FullWidth = C1.getBitWidth() + 1;
-  APInt C1Ext = C1.zext(FullWidth);
-  APInt C2Ext = C2.zext(FullWidth);
-  return (C1Ext + C2Ext).extractBits(C1.getBitWidth(), 1);
+  // Return floor((C1 + C2) / 2)
+  return (C1 & C2) + (C1 ^ C2).lshr(1);
 }
 
 APInt APIntOps::avgCeilS(const APInt &C1, const APInt &C2) {
-  // Return ceil((C1 + C2)/2)
-  assert(C1.getBitWidth() == C2.getBitWidth() && "Unequal bitwidths");
-  unsigned FullWidth = C1.getBitWidth() + 1;
-  APInt C1Ext = C1.sext(FullWidth);
-  APInt C2Ext = C2.sext(FullWidth);
-  return (C1Ext + C2Ext + 1).extractBits(C1.getBitWidth(), 1);
+  // Return ceil((C1 + C2) / 2)
+  return (C1 | C2) - (C1 ^ C2).ashr(1);
 }
 
 APInt APIntOps::avgCeilU(const APInt &C1, const APInt &C2) {
-  // Return ceil((C1 + C2)/2)
-  assert(C1.getBitWidth() == C2.getBitWidth() && "Unequal bitwidths");
-  unsigned FullWidth = C1.getBitWidth() + 1;
-  APInt C1Ext = C1.zext(FullWidth);
-  APInt C2Ext = C2.zext(FullWidth);
-  return (C1Ext + C2Ext + 1).extractBits(C1.getBitWidth(), 1);
+  // Return ceil((C1 + C2) / 2)
+  return (C1 | C2) - (C1 ^ C2).lshr(1);
 }
diff --git a/llvm/lib/Support/CMakeLists.txt b/llvm/lib/Support/CMakeLists.txt
index b9c13c43e9a7c5..e18beddf7bc5b7 100644
--- a/llvm/lib/Support/CMakeLists.txt
+++ b/llvm/lib/Support/CMakeLists.txt
@@ -188,6 +188,8 @@ add_llvm_component_library(LLVMSupport
   GlobPattern.cpp
   GraphWriter.cpp
   Hashing.cpp
+  HexagonAttributeParser.cpp
+  HexagonAttributes.cpp
   InitLLVM.cpp
   InstructionCost.cpp
   IntEqClasses.cpp
@@ -286,9 +288,6 @@ add_llvm_component_library(LLVMSupport
   ${LLVM_MAIN_INCLUDE_DIR}/llvm/Support
   ${Backtrace_INCLUDE_DIRS}
 
-  DEPENDS
-  llvm_vcsrevision_h
-
   LINK_LIBS
   ${system_libs} ${imported_libs} ${delayload_flags}
 
diff --git a/llvm/lib/Support/CommandLine.cpp b/llvm/lib/Support/CommandLine.cpp
index 42dbc4de200303..c076ae8b843179 100644
--- a/llvm/lib/Support/CommandLine.cpp
+++ b/llvm/lib/Support/CommandLine.cpp
@@ -39,7 +39,6 @@
 #include "llvm/Support/Path.h"
 #include "llvm/Support/Process.h"
 #include "llvm/Support/StringSaver.h"
-#include "llvm/Support/VCSRevision.h"
 #include "llvm/Support/VirtualFileSystem.h"
 #include "llvm/Support/raw_ostream.h"
 #include <cstdlib>
@@ -2539,15 +2538,7 @@ class VersionPrinter {
 #else
     OS << "LLVM (http://llvm.org/):\n  ";
 #endif
-    OS << PACKAGE_NAME << " version " << PACKAGE_VERSION;
-#ifdef LLVM_REPOSITORY
-    OS << " (" << LLVM_REPOSITORY;
-#ifdef LLVM_REVISION
-    OS << ' ' << LLVM_REVISION;
-#endif
-    OS << ')';
-#endif
-    OS << "\n  ";
+    OS << PACKAGE_NAME << " version " << PACKAGE_VERSION << "\n  ";
 #if LLVM_IS_DEBUG_BUILD
     OS << "DEBUG build";
 #else
diff --git a/llvm/lib/Support/HexagonAttributeParser.cpp b/llvm/lib/Support/HexagonAttributeParser.cpp
new file mode 100644
index 00000000000000..2143162d11c79c
--- /dev/null
+++ b/llvm/lib/Support/HexagonAttributeParser.cpp
@@ -0,0 +1,55 @@
+//===-- HexagonAttributeParser.cpp - Hexagon Attribute Parser -------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/HexagonAttributeParser.h"
+
+using namespace llvm;
+
+const HexagonAttributeParser::DisplayHandler
+    HexagonAttributeParser::DisplayRoutines[] = {
+        {
+            HexagonAttrs::ARCH,
+            &ELFAttributeParser::integerAttribute,
+        },
+        {
+            HexagonAttrs::HVXARCH,
+            &ELFAttributeParser::integerAttribute,
+        },
+        {
+            HexagonAttrs::HVXIEEEFP,
+            &ELFAttributeParser::integerAttribute,
+        },
+        {
+            HexagonAttrs::HVXQFLOAT,
+            &ELFAttributeParser::integerAttribute,
+        },
+        {
+            HexagonAttrs::ZREG,
+            &ELFAttributeParser::integerAttribute,
+        },
+        {
+            HexagonAttrs::AUDIO,
+            &ELFAttributeParser::integerAttribute,
+        },
+        {
+            HexagonAttrs::CABAC,
+            &ELFAttributeParser::integerAttribute,
+        }};
+
+Error HexagonAttributeParser::handler(uint64_t Tag, bool &Handled) {
+  Handled = false;
+  for (const auto &R : DisplayRoutines) {
+    if (uint64_t(R.Attribute) == Tag) {
+      if (Error E = (this->*R.Routine)(Tag))
+        return E;
+      Handled = true;
+      break;
+    }
+  }
+  return Error::success();
+}
diff --git a/llvm/lib/Support/HexagonAttributes.cpp b/llvm/lib/Support/HexagonAttributes.cpp
new file mode 100644
index 00000000000000..165215c8fb676a
--- /dev/null
+++ b/llvm/lib/Support/HexagonAttributes.cpp
@@ -0,0 +1,27 @@
+//===-- HexagonAttributes.cpp - Qualcomm Hexagon Attributes ---------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/HexagonAttributes.h"
+
+using namespace llvm;
+using namespace llvm::HexagonAttrs;
+
+static constexpr TagNameItem TagData[] = {
+    {ARCH, "Tag_arch"},
+    {HVXARCH, "Tag_hvx_arch"},
+    {HVXIEEEFP, "Tag_hvx_ieeefp"},
+    {HVXQFLOAT, "Tag_hvx_qfloat"},
+    {ZREG, "Tag_zreg"},
+    {AUDIO, "Tag_audio"},
+    {CABAC, "Tag_cabac"},
+};
+
+constexpr TagNameMap HexagonAttributeTags{TagData};
+const TagNameMap &llvm::HexagonAttrs::getHexagonAttributeTags() {
+  return HexagonAttributeTags;
+}
diff --git a/llvm/lib/Support/TimeProfiler.cpp b/llvm/lib/Support/TimeProfiler.cpp
index 4d625b3eb5b170..092028dd2a5b34 100644
--- a/llvm/lib/Support/TimeProfiler.cpp
+++ b/llvm/lib/Support/TimeProfiler.cpp
@@ -11,6 +11,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Support/TimeProfiler.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/STLFunctionalExtras.h"
 #include "llvm/ADT/StringMap.h"
 #include "llvm/Support/JSON.h"
@@ -20,6 +21,7 @@
 #include <algorithm>
 #include <cassert>
 #include <chrono>
+#include <memory>
 #include <mutex>
 #include <string>
 #include <vector>
@@ -64,17 +66,19 @@ using CountAndDurationType = std::pair<size_t, DurationType>;
 using NameAndCountAndDurationType =
     std::pair<std::string, CountAndDurationType>;
 
+} // anonymous namespace
+
 /// Represents an open or completed time section entry to be captured.
-struct TimeTraceProfilerEntry {
+struct llvm::TimeTraceProfilerEntry {
   const TimePointType Start;
   TimePointType End;
   const std::string Name;
   const std::string Detail;
-
+  const bool AsyncEvent = false;
   TimeTraceProfilerEntry(TimePointType &&S, TimePointType &&E, std::string &&N,
-                         std::string &&Dt)
+                         std::string &&Dt, bool Ae)
       : Start(std::move(S)), End(std::move(E)), Name(std::move(N)),
-        Detail(std::move(Dt)) {}
+        Detail(std::move(Dt)), AsyncEvent(Ae) {}
 
   // Calculate timings for FlameGraph. Cast time points to microsecond precision
   // rather than casting duration. This avoids truncation issues causing inner
@@ -92,8 +96,6 @@ struct TimeTraceProfilerEntry {
   }
 };
 
-} // anonymous namespace
-
 struct llvm::TimeTraceProfiler {
   TimeTraceProfiler(unsigned TimeTraceGranularity = 0, StringRef ProcName = "")
       : BeginningOfTime(system_clock::now()), StartTime(ClockType::now()),
@@ -102,22 +104,23 @@ struct llvm::TimeTraceProfiler {
     llvm::get_thread_name(ThreadName);
   }
 
-  void begin(std::string Name, llvm::function_ref<std::string()> Detail) {
-    Stack.emplace_back(ClockType::now(), TimePointType(), std::move(Name),
-                       Detail());
+  TimeTraceProfilerEntry *begin(std::string Name,
+                                llvm::function_ref<std::string()> Detail,
+                                bool AsyncEvent = false) {
+    Stack.emplace_back(std::make_unique<TimeTraceProfilerEntry>(
+        ClockType::now(), TimePointType(), std::move(Name), Detail(),
+        AsyncEvent));
+    return Stack.back().get();
   }
 
   void end() {
     assert(!Stack.empty() && "Must call begin() first");
-    TimeTraceProfilerEntry &E = Stack.back();
-    E.End = ClockType::now();
+    end(*Stack.back().get());
+  }
 
-    // Check that end times monotonically increase.
-    assert((Entries.empty() ||
-            (E.getFlameGraphStartUs(StartTime) + E.getFlameGraphDurUs() >=
-             Entries.back().getFlameGraphStartUs(StartTime) +
-                 Entries.back().getFlameGraphDurUs())) &&
-           "TimeProfiler scope ended earlier than previous scope");
+  void end(TimeTraceProfilerEntry &E) {
+    assert(!Stack.empty() && "Must call begin() first");
+    E.End = ClockType::now();
 
     // Calculate duration at full precision for overall counts.
     DurationType Duration = E.End - E.Start;
@@ -132,15 +135,18 @@ struct llvm::TimeTraceProfiler {
     // happens to be the ones that don't have any currently open entries above
     // itself.
     if (llvm::none_of(llvm::drop_begin(llvm::reverse(Stack)),
-                      [&](const TimeTraceProfilerEntry &Val) {
-                        return Val.Name == E.Name;
+                      [&](const std::unique_ptr<TimeTraceProfilerEntry> &Val) {
+                        return Val->Name == E.Name;
                       })) {
       auto &CountAndTotal = CountAndTotalPerName[E.Name];
       CountAndTotal.first++;
       CountAndTotal.second += Duration;
-    }
+    };
 
-    Stack.pop_back();
+    llvm::erase_if(Stack,
+                   [&](const std::unique_ptr<TimeTraceProfilerEntry> &Val) {
+                     return Val.get() == &E;
+                   });
   }
 
   // Write events from this TimeTraceProfilerInstance and
@@ -168,14 +174,32 @@ struct llvm::TimeTraceProfiler {
       J.object([&] {
         J.attribute("pid", Pid);
         J.attribute("tid", int64_t(Tid));
-        J.attribute("ph", "X");
         J.attribute("ts", StartUs);
-        J.attribute("dur", DurUs);
+        if (E.AsyncEvent) {
+          J.attribute("cat", E.Name);
+          J.attribute("ph", "b");
+          J.attribute("id", 0);
+        } else {
+          J.attribute("ph", "X");
+          J.attribute("dur", DurUs);
+        }
         J.attribute("name", E.Name);
         if (!E.Detail.empty()) {
           J.attributeObject("args", [&] { J.attribute("detail", E.Detail); });
         }
       });
+
+      if (E.AsyncEvent) {
+        J.object([&] {
+          J.attribute("pid", Pid);
+          J.attribute("tid", int64_t(Tid));
+          J.attribute("ts", StartUs + DurUs);
+          J.attribute("cat", E.Name);
+          J.attribute("ph", "e");
+          J.attribute("id", 0);
+          J.attribute("name", E.Name);
+        });
+      }
     };
     for (const TimeTraceProfilerEntry &E : Entries)
       writeEvent(E, this->Tid);
@@ -269,7 +293,7 @@ struct llvm::TimeTraceProfiler {
     J.objectEnd();
   }
 
-  SmallVector<TimeTraceProfilerEntry, 16> Stack;
+  SmallVector<std::unique_ptr<TimeTraceProfilerEntry>, 16> Stack;
   SmallVector<TimeTraceProfilerEntry, 128> Entries;
   StringMap<CountAndDurationType> CountAndTotalPerName;
   // System clock time when the session was begun.
@@ -341,19 +365,36 @@ Error llvm::timeTraceProfilerWrite(StringRef PreferredFileName,
   return Error::success();
 }
 
-void llvm::timeTraceProfilerBegin(StringRef Name, StringRef Detail) {
+TimeTraceProfilerEntry *llvm::timeTraceProfilerBegin(StringRef Name,
+                                                     StringRef Detail) {
   if (TimeTraceProfilerInstance != nullptr)
-    TimeTraceProfilerInstance->begin(std::string(Name),
-                                     [&]() { return std::string(Detail); });
+    return TimeTraceProfilerInstance->begin(
+        std::string(Name), [&]() { return std::string(Detail); }, false);
+  return nullptr;
 }
 
-void llvm::timeTraceProfilerBegin(StringRef Name,
-                                  llvm::function_ref<std::string()> Detail) {
+TimeTraceProfilerEntry *
+llvm::timeTraceProfilerBegin(StringRef Name,
+                             llvm::function_ref<std::string()> Detail) {
   if (TimeTraceProfilerInstance != nullptr)
-    TimeTraceProfilerInstance->begin(std::string(Name), Detail);
+    return TimeTraceProfilerInstance->begin(std::string(Name), Detail, false);
+  return nullptr;
+}
+
+TimeTraceProfilerEntry *llvm::timeTraceAsyncProfilerBegin(StringRef Name,
+                                                          StringRef Detail) {
+  if (TimeTraceProfilerInstance != nullptr)
+    return TimeTraceProfilerInstance->begin(
+        std::string(Name), [&]() { return std::string(Detail); }, true);
+  return nullptr;
 }
 
 void llvm::timeTraceProfilerEnd() {
   if (TimeTraceProfilerInstance != nullptr)
     TimeTraceProfilerInstance->end();
 }
+
+void llvm::timeTraceProfilerEnd(TimeTraceProfilerEntry *E) {
+  if (TimeTraceProfilerInstance != nullptr)
+    TimeTraceProfilerInstance->end(*E);
+}
diff --git a/llvm/lib/Support/VirtualFileSystem.cpp b/llvm/lib/Support/VirtualFileSystem.cpp
index 051dd2a67d120f..057f8eae0552c6 100644
--- a/llvm/lib/Support/VirtualFileSystem.cpp
+++ b/llvm/lib/Support/VirtualFileSystem.cpp
@@ -1344,7 +1344,7 @@ std::error_code RedirectingFileSystem::isLocal(const Twine &Path_,
   SmallString<256> Path;
   Path_.toVector(Path);
 
-  if (makeCanonical(Path))
+  if (makeAbsolute(Path))
     return {};
 
   return ExternalFS->isLocal(Path, Result);
@@ -1411,7 +1411,7 @@ directory_iterator RedirectingFileSystem::dir_begin(const Twine &Dir,
   SmallString<256> Path;
   Dir.toVector(Path);
 
-  EC = makeCanonical(Path);
+  EC = makeAbsolute(Path);
   if (EC)
     return {};
 
@@ -2261,8 +2261,8 @@ void RedirectingFileSystem::LookupResult::getPath(
   llvm::sys::path::append(Result, E->getName());
 }
 
-std::error_code
-RedirectingFileSystem::makeCanonical(SmallVectorImpl<char> &Path) const {
+std::error_code RedirectingFileSystem::makeCanonicalForLookup(
+    SmallVectorImpl<char> &Path) const {
   if (std::error_code EC = makeAbsolute(Path))
     return EC;
 
@@ -2277,12 +2277,16 @@ RedirectingFileSystem::makeCanonical(SmallVectorImpl<char> &Path) const {
 
 ErrorOr<RedirectingFileSystem::LookupResult>
 RedirectingFileSystem::lookupPath(StringRef Path) const {
+  llvm::SmallString<128> CanonicalPath(Path);
+  if (std::error_code EC = makeCanonicalForLookup(CanonicalPath))
+    return EC;
+
   // RedirectOnly means the VFS is always used.
   if (UsageTrackingActive && Redirection == RedirectKind::RedirectOnly)
     HasBeenUsed = true;
 
-  sys::path::const_iterator Start = sys::path::begin(Path);
-  sys::path::const_iterator End = sys::path::end(Path);
+  sys::path::const_iterator Start = sys::path::begin(CanonicalPath);
+  sys::path::const_iterator End = sys::path::end(CanonicalPath);
   llvm::SmallVector<Entry *, 32> Entries;
   for (const auto &Root : Roots) {
     ErrorOr<RedirectingFileSystem::LookupResult> Result =
@@ -2358,14 +2362,14 @@ static Status getRedirectedFileStatus(const Twine &OriginalPath,
 }
 
 ErrorOr<Status> RedirectingFileSystem::status(
-    const Twine &CanonicalPath, const Twine &OriginalPath,
+    const Twine &LookupPath, const Twine &OriginalPath,
     const RedirectingFileSystem::LookupResult &Result) {
   if (std::optional<StringRef> ExtRedirect = Result.getExternalRedirect()) {
-    SmallString<256> CanonicalRemappedPath((*ExtRedirect).str());
-    if (std::error_code EC = makeCanonical(CanonicalRemappedPath))
+    SmallString<256> RemappedPath((*ExtRedirect).str());
+    if (std::error_code EC = makeAbsolute(RemappedPath))
       return EC;
 
-    ErrorOr<Status> S = ExternalFS->status(CanonicalRemappedPath);
+    ErrorOr<Status> S = ExternalFS->status(RemappedPath);
     if (!S)
       return S;
     S = Status::copyWithNewName(*S, *ExtRedirect);
@@ -2375,13 +2379,13 @@ ErrorOr<Status> RedirectingFileSystem::status(
   }
 
   auto *DE = cast<RedirectingFileSystem::DirectoryEntry>(Result.E);
-  return Status::copyWithNewName(DE->getStatus(), CanonicalPath);
+  return Status::copyWithNewName(DE->getStatus(), LookupPath);
 }
 
 ErrorOr<Status>
-RedirectingFileSystem::getExternalStatus(const Twine &CanonicalPath,
+RedirectingFileSystem::getExternalStatus(const Twine &LookupPath,
                                          const Twine &OriginalPath) const {
-  auto Result = ExternalFS->status(CanonicalPath);
+  auto Result = ExternalFS->status(LookupPath);
 
   // The path has been mapped by some nested VFS, don't override it with the
   // original path.
@@ -2391,38 +2395,37 @@ RedirectingFileSystem::getExternalStatus(const Twine &CanonicalPath,
 }
 
 ErrorOr<Status> RedirectingFileSystem::status(const Twine &OriginalPath) {
-  SmallString<256> CanonicalPath;
-  OriginalPath.toVector(CanonicalPath);
+  SmallString<256> Path;
+  OriginalPath.toVector(Path);
 
-  if (std::error_code EC = makeCanonical(CanonicalPath))
+  if (std::error_code EC = makeAbsolute(Path))
     return EC;
 
   if (Redirection == RedirectKind::Fallback) {
     // Attempt to find the original file first, only falling back to the
     // mapped file if that fails.
-    ErrorOr<Status> S = getExternalStatus(CanonicalPath, OriginalPath);
+    ErrorOr<Status> S = getExternalStatus(Path, OriginalPath);
     if (S)
       return S;
   }
 
-  ErrorOr<RedirectingFileSystem::LookupResult> Result =
-      lookupPath(CanonicalPath);
+  ErrorOr<RedirectingFileSystem::LookupResult> Result = lookupPath(Path);
   if (!Result) {
     // Was not able to map file, fallthrough to using the original path if
     // that was the specified redirection type.
     if (Redirection == RedirectKind::Fallthrough &&
         isFileNotFound(Result.getError()))
-      return getExternalStatus(CanonicalPath, OriginalPath);
+      return getExternalStatus(Path, OriginalPath);
     return Result.getError();
   }
 
-  ErrorOr<Status> S = status(CanonicalPath, OriginalPath, *Result);
+  ErrorOr<Status> S = status(Path, OriginalPath, *Result);
   if (!S && Redirection == RedirectKind::Fallthrough &&
       isFileNotFound(S.getError(), Result->E)) {
     // Mapped the file but it wasn't found in the underlying filesystem,
     // fallthrough to using the original path if that was the specified
     // redirection type.
-    return getExternalStatus(CanonicalPath, OriginalPath);
+    return getExternalStatus(Path, OriginalPath);
   }
 
   return S;
@@ -2471,30 +2474,27 @@ File::getWithPath(ErrorOr<std::unique_ptr<File>> Result, const Twine &P) {
 
 ErrorOr<std::unique_ptr<File>>
 RedirectingFileSystem::openFileForRead(const Twine &OriginalPath) {
-  SmallString<256> CanonicalPath;
-  OriginalPath.toVector(CanonicalPath);
+  SmallString<256> Path;
+  OriginalPath.toVector(Path);
 
-  if (std::error_code EC = makeCanonical(CanonicalPath))
+  if (std::error_code EC = makeAbsolute(Path))
     return EC;
 
   if (Redirection == RedirectKind::Fallback) {
     // Attempt to find the original file first, only falling back to the
     // mapped file if that fails.
-    auto F = File::getWithPath(ExternalFS->openFileForRead(CanonicalPath),
-                               OriginalPath);
+    auto F = File::getWithPath(ExternalFS->openFileForRead(Path), OriginalPath);
     if (F)
       return F;
   }
 
-  ErrorOr<RedirectingFileSystem::LookupResult> Result =
-      lookupPath(CanonicalPath);
+  ErrorOr<RedirectingFileSystem::LookupResult> Result = lookupPath(Path);
   if (!Result) {
     // Was not able to map file, fallthrough to using the original path if
     // that was the specified redirection type.
     if (Redirection == RedirectKind::Fallthrough &&
         isFileNotFound(Result.getError()))
-      return File::getWithPath(ExternalFS->openFileForRead(CanonicalPath),
-                               OriginalPath);
+      return File::getWithPath(ExternalFS->openFileForRead(Path), OriginalPath);
     return Result.getError();
   }
 
@@ -2502,22 +2502,21 @@ RedirectingFileSystem::openFileForRead(const Twine &OriginalPath) {
     return make_error_code(llvm::errc::invalid_argument);
 
   StringRef ExtRedirect = *Result->getExternalRedirect();
-  SmallString<256> CanonicalRemappedPath(ExtRedirect.str());
-  if (std::error_code EC = makeCanonical(CanonicalRemappedPath))
+  SmallString<256> RemappedPath(ExtRedirect.str());
+  if (std::error_code EC = makeAbsolute(RemappedPath))
     return EC;
 
   auto *RE = cast<RedirectingFileSystem::RemapEntry>(Result->E);
 
-  auto ExternalFile = File::getWithPath(
-      ExternalFS->openFileForRead(CanonicalRemappedPath), ExtRedirect);
+  auto ExternalFile =
+      File::getWithPath(ExternalFS->openFileForRead(RemappedPath), ExtRedirect);
   if (!ExternalFile) {
     if (Redirection == RedirectKind::Fallthrough &&
         isFileNotFound(ExternalFile.getError(), Result->E)) {
       // Mapped the file but it wasn't found in the underlying filesystem,
       // fallthrough to using the original path if that was the specified
       // redirection type.
-      return File::getWithPath(ExternalFS->openFileForRead(CanonicalPath),
-                               OriginalPath);
+      return File::getWithPath(ExternalFS->openFileForRead(Path), OriginalPath);
     }
     return ExternalFile;
   }
@@ -2537,28 +2536,27 @@ RedirectingFileSystem::openFileForRead(const Twine &OriginalPath) {
 std::error_code
 RedirectingFileSystem::getRealPath(const Twine &OriginalPath,
                                    SmallVectorImpl<char> &Output) const {
-  SmallString<256> CanonicalPath;
-  OriginalPath.toVector(CanonicalPath);
+  SmallString<256> Path;
+  OriginalPath.toVector(Path);
 
-  if (std::error_code EC = makeCanonical(CanonicalPath))
+  if (std::error_code EC = makeAbsolute(Path))
     return EC;
 
   if (Redirection == RedirectKind::Fallback) {
     // Attempt to find the original file first, only falling back to the
     // mapped file if that fails.
-    std::error_code EC = ExternalFS->getRealPath(CanonicalPath, Output);
+    std::error_code EC = ExternalFS->getRealPath(Path, Output);
     if (!EC)
       return EC;
   }
 
-  ErrorOr<RedirectingFileSystem::LookupResult> Result =
-      lookupPath(CanonicalPath);
+  ErrorOr<RedirectingFileSystem::LookupResult> Result = lookupPath(Path);
   if (!Result) {
     // Was not able to map file, fallthrough to using the original path if
     // that was the specified redirection type.
     if (Redirection == RedirectKind::Fallthrough &&
         isFileNotFound(Result.getError()))
-      return ExternalFS->getRealPath(CanonicalPath, Output);
+      return ExternalFS->getRealPath(Path, Output);
     return Result.getError();
   }
 
@@ -2571,7 +2569,7 @@ RedirectingFileSystem::getRealPath(const Twine &OriginalPath,
       // Mapped the file but it wasn't found in the underlying filesystem,
       // fallthrough to using the original path if that was the specified
       // redirection type.
-      return ExternalFS->getRealPath(CanonicalPath, Output);
+      return ExternalFS->getRealPath(Path, Output);
     }
     return P;
   }
diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td
index 402c7292d7f81c..6425aa9b091f7f 100644
--- a/llvm/lib/Target/AArch64/AArch64.td
+++ b/llvm/lib/Target/AArch64/AArch64.td
@@ -873,6 +873,12 @@ def TuneA520    : SubtargetFeature<"a520", "ARMProcFamily", "CortexA520",
                                    FeatureFuseAdrpAdd,
                                    FeaturePostRAScheduler]>;
 
+def TuneA520AE  : SubtargetFeature<"a520ae", "ARMProcFamily", "CortexA520",
+                                   "Cortex-A520AE ARM processors", [
+                                   FeatureFuseAES,
+                                   FeatureFuseAdrpAdd,
+                                   FeaturePostRAScheduler]>;
+
 def TuneA57     : SubtargetFeature<"a57", "ARMProcFamily", "CortexA57",
                                    "Cortex-A57 ARM processors", [
                                    FeatureFuseAES,
@@ -1001,6 +1007,17 @@ def TuneA720 : SubtargetFeature<"a720", "ARMProcFamily", "CortexA720",
                                  FeatureEnableSelectOptimize,
                                  FeaturePredictableSelectIsExpensive]>;
 
+def TuneA720AE : SubtargetFeature<"a720ae", "ARMProcFamily", "CortexA720",
+                                "Cortex-A720AE ARM processors", [
+                                 FeatureFuseAES,
+                                 FeaturePostRAScheduler,
+                                 FeatureCmpBccFusion,
+                                 FeatureAddrLSLFast,
+                                 FeatureALULSLFast,
+                                 FeatureFuseAdrpAdd,
+                                 FeatureEnableSelectOptimize,
+                                 FeaturePredictableSelectIsExpensive]>;
+
 def TuneR82 : SubtargetFeature<"cortex-r82", "ARMProcFamily",
                                "CortexR82",
                                "Cortex-R82 ARM processors", [
@@ -1423,6 +1440,9 @@ def ProcessorFeatures {
   list<SubtargetFeature> A520 = [HasV9_2aOps, FeaturePerfMon, FeatureAM,
                                  FeatureMTE, FeatureETE, FeatureSVE2BitPerm,
                                  FeatureFP16FML];
+  list<SubtargetFeature> A520AE = [HasV9_2aOps, FeaturePerfMon, FeatureAM,
+                                 FeatureMTE, FeatureETE, FeatureSVE2BitPerm,
+                                 FeatureFP16FML];
   list<SubtargetFeature> A65  = [HasV8_2aOps, FeatureCrypto, FeatureFPARMv8,
                                  FeatureNEON, FeatureFullFP16, FeatureDotProd,
                                  FeatureRCPC, FeatureSSBS, FeatureRAS,
@@ -1456,6 +1476,9 @@ def ProcessorFeatures {
   list<SubtargetFeature> A720 = [HasV9_2aOps, FeatureMTE, FeatureFP16FML,
                                  FeatureTRBE, FeatureSVE2BitPerm, FeatureETE,
                                  FeaturePerfMon, FeatureSPE, FeatureSPE_EEF];
+  list<SubtargetFeature> A720AE = [HasV9_2aOps, FeatureMTE, FeatureFP16FML,
+                                 FeatureTRBE, FeatureSVE2BitPerm, FeatureETE,
+                                 FeaturePerfMon, FeatureSPE, FeatureSPE_EEF];
   list<SubtargetFeature> R82  = [HasV8_0rOps, FeaturePerfMon, FeatureFullFP16,
                                  FeatureFP16FML, FeatureSSBS, FeaturePredRes,
                                  FeatureSB, FeatureRDM, FeatureDotProd,
@@ -1598,6 +1621,8 @@ def : ProcessorModel<"cortex-a510", CortexA510Model, ProcessorFeatures.A510,
                      [TuneA510]>;
 def : ProcessorModel<"cortex-a520", CortexA510Model, ProcessorFeatures.A520,
                      [TuneA520]>;
+def : ProcessorModel<"cortex-a520ae", CortexA510Model, ProcessorFeatures.A520AE,
+                     [TuneA520AE]>;
 def : ProcessorModel<"cortex-a57", CortexA57Model, ProcessorFeatures.A53,
                      [TuneA57]>;
 def : ProcessorModel<"cortex-a65", CortexA53Model, ProcessorFeatures.A65,
@@ -1628,6 +1653,8 @@ def : ProcessorModel<"cortex-a715", NeoverseN2Model, ProcessorFeatures.A715,
                      [TuneA715]>;
 def : ProcessorModel<"cortex-a720", NeoverseN2Model, ProcessorFeatures.A720,
                      [TuneA720]>;
+def : ProcessorModel<"cortex-a720ae", NeoverseN2Model, ProcessorFeatures.A720AE,
+                     [TuneA720AE]>;
 def : ProcessorModel<"cortex-r82", CortexA55Model, ProcessorFeatures.R82,
                      [TuneR82]>;
 def : ProcessorModel<"cortex-x1", CortexA57Model, ProcessorFeatures.X1,
diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
index b2c52b443753dc..03f0778bae59d5 100644
--- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
@@ -987,7 +987,7 @@ AArch64ExpandPseudo::expandCondSMToggle(MachineBasicBlock &MBB,
   // Expand the pseudo into smstart or smstop instruction. The pseudo has the
   // following operands:
   //
-  //   MSRpstatePseudo <za|sm|both>, <0|1>, pstate.sm, expectedval, <regmask>
+  //   MSRpstatePseudo <za|sm|both>, <0|1>, condition[, pstate.sm], <regmask>
   //
   // The pseudo is expanded into a conditional smstart/smstop, with a
   // check if pstate.sm (register) equals the expected value, and if not,
@@ -997,9 +997,9 @@ AArch64ExpandPseudo::expandCondSMToggle(MachineBasicBlock &MBB,
   // streaming-compatible function:
   //
   // OrigBB:
-  //   MSRpstatePseudo 3, 0, %0, 0, <regmask>             <- Conditional SMSTOP
+  //   MSRpstatePseudo 3, 0, IfCallerIsStreaming, %0, <regmask>  <- Cond SMSTOP
   //   bl @normal_callee
-  //   MSRpstatePseudo 3, 1, %0, 0, <regmask>             <- Conditional SMSTART
+  //   MSRpstatePseudo 3, 1, IfCallerIsStreaming, %0, <regmask>  <- Cond SMSTART
   //
   // ...which will be transformed into:
   //
@@ -1022,11 +1022,20 @@ AArch64ExpandPseudo::expandCondSMToggle(MachineBasicBlock &MBB,
   // We test the live value of pstate.sm and toggle pstate.sm if this is not the
   // expected value for the callee (0 for a normal callee and 1 for a streaming
   // callee).
-  auto PStateSM = MI.getOperand(2).getReg();
+  unsigned Opc;
+  switch (MI.getOperand(2).getImm()) {
+  case AArch64SME::Always:
+    llvm_unreachable("Should have matched to instruction directly");
+  case AArch64SME::IfCallerIsStreaming:
+    Opc = AArch64::TBNZW;
+    break;
+  case AArch64SME::IfCallerIsNonStreaming:
+    Opc = AArch64::TBZW;
+    break;
+  }
+  auto PStateSM = MI.getOperand(3).getReg();
   auto TRI = MBB.getParent()->getSubtarget().getRegisterInfo();
   unsigned SMReg32 = TRI->getSubReg(PStateSM, AArch64::sub_32);
-  bool IsStreamingCallee = MI.getOperand(3).getImm();
-  unsigned Opc = IsStreamingCallee ? AArch64::TBZW : AArch64::TBNZW;
   MachineInstrBuilder Tbx =
       BuildMI(MBB, MBBI, DL, TII->get(Opc)).addReg(SMReg32).addImm(0);
 
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 9665ae5ceb903f..7fab274ab957c8 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1131,6 +1131,9 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
           ISD::FMUL,           ISD::FDIV,           ISD::FMA,
           ISD::FNEG,           ISD::FABS,           ISD::FCEIL,
           ISD::FSQRT,          ISD::FFLOOR,         ISD::FNEARBYINT,
+          ISD::FSIN,           ISD::FCOS,           ISD::FPOW,
+          ISD::FLOG,           ISD::FLOG2,          ISD::FLOG10,
+          ISD::FEXP,           ISD::FEXP2,          ISD::FEXP10,
           ISD::FRINT,          ISD::FROUND,         ISD::FROUNDEVEN,
           ISD::FTRUNC,         ISD::FMINNUM,        ISD::FMAXNUM,
           ISD::FMINIMUM,       ISD::FMAXIMUM,       ISD::STRICT_FADD,
@@ -5270,13 +5273,13 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_VOID(SDValue Op,
         AArch64ISD::SMSTART, DL, MVT::Other,
         Op->getOperand(0), // Chain
         DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32),
-        DAG.getConstant(0, DL, MVT::i64), DAG.getConstant(1, DL, MVT::i64));
+        DAG.getConstant(AArch64SME::Always, DL, MVT::i64));
   case Intrinsic::aarch64_sme_za_disable:
     return DAG.getNode(
         AArch64ISD::SMSTOP, DL, MVT::Other,
         Op->getOperand(0), // Chain
         DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32),
-        DAG.getConstant(0, DL, MVT::i64), DAG.getConstant(1, DL, MVT::i64));
+        DAG.getConstant(AArch64SME::Always, DL, MVT::i64));
   }
 }
 
@@ -7197,11 +7200,11 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
           getRegClassFor(PStateSM.getValueType().getSimpleVT()));
       FuncInfo->setPStateSMReg(Reg);
       Chain = DAG.getCopyToReg(Chain, DL, Reg, PStateSM);
-    } else {
-      PStateSM = DAG.getConstant(0, DL, MVT::i64);
-    }
-    Chain = changeStreamingMode(DAG, DL, /*Enable*/ true, Chain, Glue, PStateSM,
-                                /*Entry*/ true);
+      Chain = changeStreamingMode(DAG, DL, /*Enable*/ true, Chain, Glue,
+                                  AArch64SME::IfCallerIsNonStreaming, PStateSM);
+    } else
+      Chain = changeStreamingMode(DAG, DL, /*Enable*/ true, Chain, Glue,
+                                  AArch64SME::Always);
 
     // Ensure that the SMSTART happens after the CopyWithChain such that its
     // chain result is used.
@@ -7786,9 +7789,11 @@ void AArch64TargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
   }
 }
 
-SDValue AArch64TargetLowering::changeStreamingMode(
-    SelectionDAG &DAG, SDLoc DL, bool Enable,
-    SDValue Chain, SDValue InGlue, SDValue PStateSM, bool Entry) const {
+SDValue AArch64TargetLowering::changeStreamingMode(SelectionDAG &DAG, SDLoc DL,
+                                                   bool Enable, SDValue Chain,
+                                                   SDValue InGlue,
+                                                   unsigned Condition,
+                                                   SDValue PStateSM) const {
   MachineFunction &MF = DAG.getMachineFunction();
   AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
   FuncInfo->setHasStreamingModeChanges(true);
@@ -7797,10 +7802,13 @@ SDValue AArch64TargetLowering::changeStreamingMode(
   SDValue RegMask = DAG.getRegisterMask(TRI->getSMStartStopCallPreservedMask());
   SDValue MSROp =
       DAG.getTargetConstant((int32_t)AArch64SVCR::SVCRSM, DL, MVT::i32);
-
-  SDValue ExpectedSMVal =
-      DAG.getTargetConstant(Entry ? Enable : !Enable, DL, MVT::i64);
-  SmallVector<SDValue> Ops = {Chain, MSROp, PStateSM, ExpectedSMVal, RegMask};
+  SDValue ConditionOp = DAG.getTargetConstant(Condition, DL, MVT::i64);
+  SmallVector<SDValue> Ops = {Chain, MSROp, ConditionOp};
+  if (Condition != AArch64SME::Always) {
+    assert(PStateSM && "PStateSM should be defined");
+    Ops.push_back(PStateSM);
+  }
+  Ops.push_back(RegMask);
 
   if (InGlue)
     Ops.push_back(InGlue);
@@ -7809,6 +7817,19 @@ SDValue AArch64TargetLowering::changeStreamingMode(
   return DAG.getNode(Opcode, DL, DAG.getVTList(MVT::Other, MVT::Glue), Ops);
 }
 
+static unsigned getSMCondition(const SMEAttrs &CallerAttrs,
+                               const SMEAttrs &CalleeAttrs) {
+  if (!CallerAttrs.hasStreamingCompatibleInterface() ||
+      CallerAttrs.hasStreamingBody())
+    return AArch64SME::Always;
+  if (CalleeAttrs.hasNonStreamingInterface())
+    return AArch64SME::IfCallerIsStreaming;
+  if (CalleeAttrs.hasStreamingInterface())
+    return AArch64SME::IfCallerIsNonStreaming;
+
+  llvm_unreachable("Unsupported attributes");
+}
+
 /// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain,
 /// and add input and output parameter nodes.
 SDValue
@@ -8028,7 +8049,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
     Chain = DAG.getNode(
         AArch64ISD::SMSTOP, DL, MVT::Other, Chain,
         DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32),
-        DAG.getConstant(0, DL, MVT::i64), DAG.getConstant(1, DL, MVT::i64));
+        DAG.getConstant(AArch64SME::Always, DL, MVT::i64));
 
   // Adjust the stack pointer for the new arguments...
   // These operations are automatically eliminated by the prolog/epilog pass
@@ -8299,9 +8320,9 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
 
   SDValue InGlue;
   if (RequiresSMChange) {
-    SDValue NewChain =
-        changeStreamingMode(DAG, DL, CalleeAttrs.hasStreamingInterface(), Chain,
-                            InGlue, PStateSM, true);
+    SDValue NewChain = changeStreamingMode(
+        DAG, DL, CalleeAttrs.hasStreamingInterface(), Chain, InGlue,
+        getSMCondition(CallerAttrs, CalleeAttrs), PStateSM);
     Chain = NewChain.getValue(0);
     InGlue = NewChain.getValue(1);
   }
@@ -8455,8 +8476,9 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
 
   if (RequiresSMChange) {
     assert(PStateSM && "Expected a PStateSM to be set");
-    Result = changeStreamingMode(DAG, DL, !CalleeAttrs.hasStreamingInterface(),
-                                 Result, InGlue, PStateSM, false);
+    Result = changeStreamingMode(
+        DAG, DL, !CalleeAttrs.hasStreamingInterface(), Result, InGlue,
+        getSMCondition(CallerAttrs, CalleeAttrs), PStateSM);
   }
 
   if (CallerAttrs.requiresEnablingZAAfterCall(CalleeAttrs))
@@ -8464,7 +8486,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
     Result = DAG.getNode(
         AArch64ISD::SMSTART, DL, MVT::Other, Result,
         DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32),
-        DAG.getConstant(0, DL, MVT::i64), DAG.getConstant(1, DL, MVT::i64));
+        DAG.getConstant(AArch64SME::Always, DL, MVT::i64));
 
   if (ShouldPreserveZT0)
     Result =
@@ -8599,13 +8621,12 @@ AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
       Register Reg = FuncInfo->getPStateSMReg();
       assert(Reg.isValid() && "PStateSM Register is invalid");
       SDValue PStateSM = DAG.getCopyFromReg(Chain, DL, Reg, MVT::i64);
-      Chain =
-          changeStreamingMode(DAG, DL, /*Enable*/ false, Chain,
-                              /*Glue*/ SDValue(), PStateSM, /*Entry*/ false);
+      Chain = changeStreamingMode(DAG, DL, /*Enable*/ false, Chain,
+                                  /*Glue*/ SDValue(),
+                                  AArch64SME::IfCallerIsNonStreaming, PStateSM);
     } else
-      Chain = changeStreamingMode(
-          DAG, DL, /*Enable*/ false, Chain,
-          /*Glue*/ SDValue(), DAG.getConstant(1, DL, MVT::i64), /*Entry*/ true);
+      Chain = changeStreamingMode(DAG, DL, /*Enable*/ false, Chain,
+                                  /*Glue*/ SDValue(), AArch64SME::Always);
     Glue = Chain.getValue(1);
   }
 
@@ -16595,6 +16616,38 @@ bool AArch64TargetLowering::isLegalAddImmediate(int64_t Immed) const {
   return IsLegal;
 }
 
+bool AArch64TargetLowering::isLegalAddScalableImmediate(int64_t Imm) const {
+  // We will only emit addvl/inc* instructions for SVE2
+  if (!Subtarget->hasSVE2())
+    return false;
+
+  // addvl's immediates are in terms of the number of bytes in a register.
+  // Since there are 16 in the base supported size (128bits), we need to
+  // divide the immediate by that much to give us a useful immediate to
+  // multiply by vscale. We can't have a remainder as a result of this.
+  if (Imm % 16 == 0)
+    return isInt<6>(Imm / 16);
+
+  // Inc[b|h|w|d] instructions take a pattern and a positive immediate
+  // multiplier. For now, assume a pattern of 'all'. Incb would be a subset
+  // of addvl as a result, so only take h|w|d into account.
+  // Dec[h|w|d] will cover subtractions.
+  // Immediates are in the range [1,16], so we can't do a 2's complement check.
+  // FIXME: Can we make use of other patterns to cover other immediates?
+
+  // inch|dech
+  if (Imm % 8 == 0)
+    return std::labs(Imm / 8) <= 16;
+  // incw|decw
+  if (Imm % 4 == 0)
+    return std::labs(Imm / 4) <= 16;
+  // incd|decd
+  if (Imm % 2 == 0)
+    return std::labs(Imm / 2) <= 16;
+
+  return false;
+}
+
 // Return false to prevent folding
 // (mul (add x, c1), c2) -> (add (mul x, c2), c2*c1) in DAGCombine,
 // if the folding leads to worse code.
@@ -16671,15 +16724,29 @@ bool AArch64TargetLowering::isLegalAddressingMode(const DataLayout &DL,
 
   if (Ty->isScalableTy()) {
     if (isa<ScalableVectorType>(Ty)) {
+      // See if we have a foldable vscale-based offset, for vector types which
+      // are either legal or smaller than the minimum; more work will be
+      // required if we need to consider addressing for types which need
+      // legalization by splitting.
+      uint64_t VecNumBytes = DL.getTypeSizeInBits(Ty).getKnownMinValue() / 8;
+      if (AM.HasBaseReg && !AM.BaseOffs && AM.ScalableOffset && !AM.Scale &&
+          (AM.ScalableOffset % VecNumBytes == 0) && VecNumBytes <= 16 &&
+          isPowerOf2_64(VecNumBytes))
+        return isInt<4>(AM.ScalableOffset / (int64_t)VecNumBytes);
+
       uint64_t VecElemNumBytes =
           DL.getTypeSizeInBits(cast<VectorType>(Ty)->getElementType()) / 8;
-      return AM.HasBaseReg && !AM.BaseOffs &&
+      return AM.HasBaseReg && !AM.BaseOffs && !AM.ScalableOffset &&
              (AM.Scale == 0 || (uint64_t)AM.Scale == VecElemNumBytes);
     }
 
-    return AM.HasBaseReg && !AM.BaseOffs && !AM.Scale;
+    return AM.HasBaseReg && !AM.BaseOffs && !AM.ScalableOffset && !AM.Scale;
   }
 
+  // No scalable offsets allowed for non-scalable types.
+  if (AM.ScalableOffset)
+    return false;
+
   // check reg + imm case:
   // i.e., reg + 0, reg + imm9, reg + SIZE_IN_BYTES * uimm12
   uint64_t NumBytes = 0;
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 68341c199e0a2a..3465f3be887543 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -689,6 +689,7 @@ class AArch64TargetLowering : public TargetLowering {
                                        StoreInst *SI) const override;
 
   bool isLegalAddImmediate(int64_t) const override;
+  bool isLegalAddScalableImmediate(int64_t) const override;
   bool isLegalICmpImmediate(int64_t) const override;
 
   bool isMulAddWithConstProfitable(SDValue AddNode,
@@ -968,12 +969,12 @@ class AArch64TargetLowering : public TargetLowering {
   bool shouldExpandCttzElements(EVT VT) const override;
 
   /// If a change in streaming mode is required on entry to/return from a
-  /// function call it emits and returns the corresponding SMSTART or SMSTOP node.
-  /// \p Entry tells whether this is before/after the Call, which is necessary
-  /// because PSTATE.SM is only queried once.
+  /// function call it emits and returns the corresponding SMSTART or SMSTOP
+  /// node. \p Condition should be one of the enum values from
+  /// AArch64SME::ToggleCondition.
   SDValue changeStreamingMode(SelectionDAG &DAG, SDLoc DL, bool Enable,
-                              SDValue Chain, SDValue InGlue,
-                              SDValue PStateSM, bool Entry) const;
+                              SDValue Chain, SDValue InGlue, unsigned Condition,
+                              SDValue PStateSM = SDValue()) const;
 
   bool isVScaleKnownToBeAPowerOfTwo() const override { return true; }
 
diff --git a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
index 0ab2c401b1749d..d0adb78b231a76 100644
--- a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
@@ -2362,7 +2362,7 @@ bool AArch64LoadStoreOpt::tryToPairLdStInst(MachineBasicBlock::iterator &MBBI) {
       // Get the needed alignments to check them if
       // ldp-aligned-only/stp-aligned-only features are opted.
       uint64_t MemAlignment = MemOp->getAlign().value();
-      uint64_t TypeAlignment = Align(MemOp->getSize()).value();
+      uint64_t TypeAlignment = Align(MemOp->getSize().getValue()).value();
 
       if (MemAlignment < 2 * TypeAlignment) {
         NumFailedAlignmentCheck++;
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
index ec59778cbb0efa..ad29003f1e8173 100644
--- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
@@ -1072,3 +1072,8 @@ bool AArch64RegisterInfo::shouldCoalesce(
 
   return true;
 }
+
+bool AArch64RegisterInfo::shouldAnalyzePhysregInMachineLoopInfo(
+    MCRegister R) const {
+  return R == AArch64::VG;
+}
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.h b/llvm/lib/Target/AArch64/AArch64RegisterInfo.h
index e93352c13b3c72..5c8a5e029584fc 100644
--- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.h
@@ -145,6 +145,8 @@ class AArch64RegisterInfo final : public AArch64GenRegisterInfo {
 
   void getOffsetOpcodes(const StackOffset &Offset,
                         SmallVectorImpl<uint64_t> &Ops) const override;
+
+  bool shouldAnalyzePhysregInMachineLoopInfo(MCRegister R) const override;
 };
 
 } // end namespace llvm
diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
index 2907ba74ff8108..b65c67e70a4e0f 100644
--- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
@@ -10,12 +10,12 @@
 //
 //===----------------------------------------------------------------------===//
 
-def AArch64_smstart : SDNode<"AArch64ISD::SMSTART", SDTypeProfile<0, 3,
-                             [SDTCisInt<0>, SDTCisInt<0>, SDTCisInt<0>]>,
+def AArch64_smstart : SDNode<"AArch64ISD::SMSTART", SDTypeProfile<0, 2,
+                             [SDTCisInt<0>, SDTCisInt<0>]>,
                              [SDNPHasChain, SDNPSideEffect, SDNPVariadic,
                               SDNPOptInGlue, SDNPOutGlue]>;
-def AArch64_smstop  : SDNode<"AArch64ISD::SMSTOP", SDTypeProfile<0, 3,
-                             [SDTCisInt<0>, SDTCisInt<0>, SDTCisInt<0>]>,
+def AArch64_smstop  : SDNode<"AArch64ISD::SMSTOP", SDTypeProfile<0, 2,
+                             [SDTCisInt<0>, SDTCisInt<0>]>,
                              [SDNPHasChain, SDNPSideEffect, SDNPVariadic,
                               SDNPOptInGlue, SDNPOutGlue]>;
 def AArch64_restore_za : SDNode<"AArch64ISD::RESTORE_ZA", SDTypeProfile<0, 3,
@@ -158,40 +158,14 @@ def : Pat<(AArch64_restore_za
             (i64 GPR64:$tpidr2_el0), (i64 GPR64sp:$tpidr2obj), (i64 texternalsym:$restore_routine)),
           (RestoreZAPseudo GPR64:$tpidr2_el0, GPR64sp:$tpidr2obj, texternalsym:$restore_routine)>;
 
-// Scenario A:
-//
-//   %pstate.before.call = 1
-//   if (%pstate.before.call != 0)
-//     smstop (pstate_za|pstate_sm)
-//   call fn()
-//   if (%pstate.before.call != 0)
-//     smstart (pstate_za|pstate_sm)
-//
-def : Pat<(AArch64_smstop (i32 svcr_op:$pstate), (i64 1), (i64 0)),   // before call
-          (MSRpstatesvcrImm1 svcr_op:$pstate, 0b0)>;
-def : Pat<(AArch64_smstart (i32 svcr_op:$pstate), (i64 1), (i64 0)),  // after call
-          (MSRpstatesvcrImm1 svcr_op:$pstate, 0b1)>;
-
-// Scenario B:
-//
-//   %pstate.before.call = 0
-//   if (%pstate.before.call != 1)
-//     smstart (pstate_za|pstate_sm)
-//   call fn()
-//   if (%pstate.before.call != 1)
-//     smstop (pstate_za|pstate_sm)
-//
-def : Pat<(AArch64_smstart (i32 svcr_op:$pstate), (i64 0), (i64 1)),  // before call
-          (MSRpstatesvcrImm1 svcr_op:$pstate, 0b1)>;
-def : Pat<(AArch64_smstop (i32 svcr_op:$pstate), (i64 0), (i64 1)),   // after call
-          (MSRpstatesvcrImm1 svcr_op:$pstate, 0b0)>;
-
 // Read and write TPIDR2_EL0
 def : Pat<(int_aarch64_sme_set_tpidr2 i64:$val),
           (MSR 0xde85, GPR64:$val)>;
 def : Pat<(i64 (int_aarch64_sme_get_tpidr2)),
           (MRS 0xde85)>;
 
+} // End let Predicates = [HasSME]
+
 multiclass CoalescerBarrierPseudo<RegisterClass rc, list<ValueType> vts> {
   def NAME : Pseudo<(outs rc:$dst), (ins rc:$src), []>, Sched<[]> {
     let Constraints = "$dst = $src";
@@ -211,8 +185,6 @@ multiclass CoalescerBarriers {
 
 defm COALESCER_BARRIER : CoalescerBarriers;
 
-} // End let Predicates = [HasSME]
-
 // Pseudo to match to smstart/smstop. This expands:
 //
 //  pseudonode (pstate_za|pstate_sm), before_call, expected_value
@@ -230,17 +202,24 @@ defm COALESCER_BARRIER : CoalescerBarriers;
 // SME instructions.
 def MSRpstatePseudo :
   Pseudo<(outs),
-           (ins svcr_op:$pstatefield, timm0_1:$imm, GPR64:$rtpstate, timm0_1:$expected_pstate, variable_ops), []>,
+           (ins svcr_op:$pstatefield, timm0_1:$imm, timm0_31:$condition, variable_ops), []>,
     Sched<[WriteSys]> {
   let hasPostISelHook = 1;
   let Uses = [VG];
   let Defs = [VG];
 }
 
-def : Pat<(AArch64_smstart (i32 svcr_op:$pstate), (i64 GPR64:$rtpstate), (i64 timm0_1:$expected_pstate)),
-          (MSRpstatePseudo svcr_op:$pstate, 0b1, GPR64:$rtpstate, timm0_1:$expected_pstate)>;
-def : Pat<(AArch64_smstop (i32 svcr_op:$pstate), (i64 GPR64:$rtpstate), (i64 timm0_1:$expected_pstate)),
-          (MSRpstatePseudo svcr_op:$pstate, 0b0, GPR64:$rtpstate, timm0_1:$expected_pstate)>;
+def : Pat<(AArch64_smstart (i32 svcr_op:$pstate), (i64 timm0_31:$condition)),
+          (MSRpstatePseudo svcr_op:$pstate, 0b1, timm0_31:$condition)>;
+def : Pat<(AArch64_smstop (i32 svcr_op:$pstate), (i64 timm0_31:$condition)),
+          (MSRpstatePseudo svcr_op:$pstate, 0b0, timm0_31:$condition)>;
+
+// Unconditional start/stop
+def : Pat<(AArch64_smstart (i32 svcr_op:$pstate), (i64 /*AArch64SME::Always*/0)),
+          (MSRpstatesvcrImm1 svcr_op:$pstate, 0b1)>;
+def : Pat<(AArch64_smstop (i32 svcr_op:$pstate), (i64 /*AArch64SME::Always*/0)),
+          (MSRpstatesvcrImm1 svcr_op:$pstate, 0b0)>;
+
 
 //===----------------------------------------------------------------------===//
 // SME2 Instructions
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index e0a010af415534..dd5e11c0f5e35d 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -2042,9 +2042,8 @@ let Predicates = [HasSVEorSME] in {
   defm CNTP_XPP : sve_int_pcount_pred<0b0000, "cntp", int_aarch64_sve_cntp>;
 
   def : Pat<(i64 (AArch64CttzElts nxv16i1:$Op1)),
-                 (i64 (!cast<Instruction>(CNTP_XPP_B)
-                        (nxv16i1 (!cast<Instruction>(BRKB_PPzP) (PTRUE_B 31), nxv16i1:$Op1)),
-                        (nxv16i1 (!cast<Instruction>(BRKB_PPzP) (PTRUE_B 31), nxv16i1:$Op1))))>;
+            (CNTP_XPP_B (BRKB_PPzP (PTRUE_B 31), PPR:$Op1),
+                        (BRKB_PPzP (PTRUE_B 31), PPR:$Op1))>;
 }
 
   defm INCB_XPiI : sve_int_pred_pattern_a<0b000, "incb", add, int_aarch64_sve_cntb>;
@@ -2131,15 +2130,12 @@ let Predicates = [HasSVEorSME] in {
   defm DECP_ZP     : sve_int_count_v<0b10100, "decp">;
 
   def : Pat<(i64 (add GPR64:$Op1, (i64 (AArch64CttzElts nxv16i1:$Op2)))),
-                 (i64 (!cast<Instruction>(INCP_XP_B)
-                        (nxv16i1 (!cast<Instruction>(BRKB_PPzP) (PTRUE_B 31), nxv16i1:$Op2)),
-                        GPR64:$Op1))>;
+            (INCP_XP_B (BRKB_PPzP (PTRUE_B 31), PPR:$Op2), GPR64:$Op1)>;
 
   def : Pat<(i32 (add GPR32:$Op1, (trunc (i64 (AArch64CttzElts nxv16i1:$Op2))))),
-                 (i32 (EXTRACT_SUBREG (i64 (!cast<Instruction>(INCP_XP_B)
-                        (nxv16i1 (!cast<Instruction>(BRKB_PPzP) (PTRUE_B 31), nxv16i1:$Op2)),
-                        (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$Op1, sub_32))),
-                      sub_32))>;
+            (EXTRACT_SUBREG (INCP_XP_B (BRKB_PPzP (PTRUE_B 31), PPR:$Op2),
+                                       (INSERT_SUBREG (IMPLICIT_DEF), GPR32:$Op1, sub_32)),
+                            sub_32)>;
 
   defm INDEX_RR : sve_int_index_rr<"index", AArch64mul_p_oneuse>;
   defm INDEX_IR : sve_int_index_ir<"index", AArch64mul_p, AArch64mul_p_oneuse>;
@@ -2521,9 +2517,9 @@ let Predicates = [HasSVEorSME] in {
               (ADDVL_XXI GPR64:$op, $imm)>;
 
     def : Pat<(add GPR32:$op, (i32 (trunc (vscale (sve_rdvl_imm i32:$imm))))),
-              (i32 (EXTRACT_SUBREG (ADDVL_XXI (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
-                                             GPR32:$op, sub_32), $imm),
-                                   sub_32))>;
+              (EXTRACT_SUBREG (ADDVL_XXI (INSERT_SUBREG (IMPLICIT_DEF),
+                                        GPR32:$op, sub_32), $imm),
+                              sub_32)>;
 
     def : Pat<(add GPR64:$op, (vscale (sve_cnth_imm i32:$imm))),
               (INCH_XPiI GPR64:$op, 31, $imm)>;
@@ -2540,30 +2536,30 @@ let Predicates = [HasSVEorSME] in {
               (DECD_XPiI GPR64:$op, 31, $imm)>;
 
     def : Pat<(add GPR32:$op, (i32 (trunc (vscale (sve_cnth_imm i32:$imm))))),
-              (i32 (EXTRACT_SUBREG (INCH_XPiI (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
-                                               GPR32:$op, sub_32), 31, $imm),
-                                    sub_32))>;
+              (EXTRACT_SUBREG (INCH_XPiI (INSERT_SUBREG (IMPLICIT_DEF),
+                                          GPR32:$op, sub_32), 31, $imm),
+                               sub_32)>;
     def : Pat<(add GPR32:$op, (i32 (trunc (vscale (sve_cntw_imm i32:$imm))))),
-              (i32 (EXTRACT_SUBREG (INCW_XPiI (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
-                                               GPR32:$op, sub_32), 31, $imm),
-                                    sub_32))>;
+              (EXTRACT_SUBREG (INCW_XPiI (INSERT_SUBREG (IMPLICIT_DEF),
+                                          GPR32:$op, sub_32), 31, $imm),
+                               sub_32)>;
     def : Pat<(add GPR32:$op, (i32 (trunc (vscale (sve_cntd_imm i32:$imm))))),
-              (i32 (EXTRACT_SUBREG (INCD_XPiI (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
-                                               GPR32:$op, sub_32), 31, $imm),
-                                    sub_32))>;
+              (EXTRACT_SUBREG (INCD_XPiI (INSERT_SUBREG (IMPLICIT_DEF),
+                                          GPR32:$op, sub_32), 31, $imm),
+                               sub_32)>;
 
     def : Pat<(add GPR32:$op, (i32 (trunc (vscale (sve_cnth_imm_neg i32:$imm))))),
-              (i32 (EXTRACT_SUBREG (DECH_XPiI (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
-                                               GPR32:$op, sub_32), 31, $imm),
-                                    sub_32))>;
+              (EXTRACT_SUBREG (DECH_XPiI (INSERT_SUBREG (IMPLICIT_DEF),
+                                          GPR32:$op, sub_32), 31, $imm),
+                               sub_32)>;
     def : Pat<(add GPR32:$op, (i32 (trunc (vscale (sve_cntw_imm_neg i32:$imm))))),
-              (i32 (EXTRACT_SUBREG (DECW_XPiI (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
-                                               GPR32:$op, sub_32), 31, $imm),
-                                    sub_32))>;
+              (EXTRACT_SUBREG (DECW_XPiI (INSERT_SUBREG (IMPLICIT_DEF),
+                                          GPR32:$op, sub_32), 31, $imm),
+                               sub_32)>;
     def : Pat<(add GPR32:$op, (i32 (trunc (vscale (sve_cntd_imm_neg i32:$imm))))),
-              (i32 (EXTRACT_SUBREG (DECD_XPiI (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
-                                               GPR32:$op, sub_32), 31, $imm),
-                                    sub_32))>;
+              (EXTRACT_SUBREG (DECD_XPiI (INSERT_SUBREG (IMPLICIT_DEF),
+                                          GPR32:$op, sub_32), 31, $imm),
+                               sub_32)>;
   }
 
   // FIXME: BigEndian requires an additional REV instruction to satisfy the
@@ -3266,58 +3262,58 @@ let Predicates = [HasSVEorSME] in {
   // Extract element from vector with immediate index that's within the bottom 128-bits.
   let Predicates = [IsNeonAvailable], AddedComplexity = 1 in {
   def : Pat<(i32 (vector_extract (nxv16i8 ZPR:$vec), VectorIndexB:$index)),
-            (i32 (UMOVvi8 (v16i8 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexB:$index))>;
+            (UMOVvi8 (v16i8 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexB:$index)>;
   def : Pat<(i32 (vector_extract (nxv8i16 ZPR:$vec), VectorIndexH:$index)),
-            (i32 (UMOVvi16 (v8i16 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexH:$index))>;
+            (UMOVvi16 (v8i16 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexH:$index)>;
   def : Pat<(i32 (vector_extract (nxv4i32 ZPR:$vec), VectorIndexS:$index)),
-            (i32 (UMOVvi32 (v4i32 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexS:$index))>;
+            (UMOVvi32 (v4i32 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexS:$index)>;
   def : Pat<(i64 (vector_extract (nxv2i64 ZPR:$vec), VectorIndexD:$index)),
-            (i64 (UMOVvi64 (v2i64 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexD:$index))>;
+            (UMOVvi64 (v2i64 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexD:$index)>;
   } // End IsNeonAvailable
 
   let Predicates = [IsNeonAvailable] in {
   def : Pat<(sext_inreg (vector_extract (nxv16i8 ZPR:$vec), VectorIndexB:$index), i8),
-            (i32 (SMOVvi8to32 (v16i8 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexB:$index))>;
+            (SMOVvi8to32 (v16i8 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexB:$index)>;
   def : Pat<(sext_inreg (anyext (i32 (vector_extract (nxv16i8 ZPR:$vec), VectorIndexB:$index))), i8),
-            (i64 (SMOVvi8to64 (v16i8 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexB:$index))>;
+            (SMOVvi8to64 (v16i8 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexB:$index)>;
 
   def : Pat<(sext_inreg (vector_extract (nxv8i16 ZPR:$vec), VectorIndexH:$index), i16),
-            (i32 (SMOVvi16to32 (v8i16 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexH:$index))>;
+            (SMOVvi16to32 (v8i16 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexH:$index)>;
   def : Pat<(sext_inreg (anyext (i32 (vector_extract (nxv8i16 ZPR:$vec), VectorIndexH:$index))), i16),
-            (i64 (SMOVvi16to64 (v8i16 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexH:$index))>;
+            (SMOVvi16to64 (v8i16 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexH:$index)>;
 
   def : Pat<(sext (i32 (vector_extract (nxv4i32 ZPR:$vec), VectorIndexS:$index))),
-            (i64 (SMOVvi32to64 (v4i32 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexS:$index))>;
+            (SMOVvi32to64 (v4i32 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexS:$index)>;
   } // End IsNeonAvailable
 
   // Extract first element from vector.
   let AddedComplexity = 2 in {
-  def : Pat<(vector_extract (nxv16i8 ZPR:$Zs), (i64 0)),
-            (i32 (EXTRACT_SUBREG ZPR:$Zs, ssub))>;
-  def : Pat<(vector_extract (nxv8i16 ZPR:$Zs), (i64 0)),
-            (i32 (EXTRACT_SUBREG ZPR:$Zs, ssub))>;
-  def : Pat<(vector_extract (nxv4i32 ZPR:$Zs), (i64 0)),
-            (i32 (EXTRACT_SUBREG ZPR:$Zs, ssub))>;
-  def : Pat<(vector_extract (nxv2i64 ZPR:$Zs), (i64 0)),
-            (i64 (EXTRACT_SUBREG ZPR:$Zs, dsub))>;
-  def : Pat<(vector_extract (nxv8f16 ZPR:$Zs), (i64 0)),
-            (f16 (EXTRACT_SUBREG ZPR:$Zs, hsub))>;
-  def : Pat<(vector_extract (nxv4f16 ZPR:$Zs), (i64 0)),
-            (f16 (EXTRACT_SUBREG ZPR:$Zs, hsub))>;
-  def : Pat<(vector_extract (nxv2f16 ZPR:$Zs), (i64 0)),
-            (f16 (EXTRACT_SUBREG ZPR:$Zs, hsub))>;
-  def : Pat<(vector_extract (nxv8bf16 ZPR:$Zs), (i64 0)),
-            (bf16 (EXTRACT_SUBREG ZPR:$Zs, hsub))>;
-  def : Pat<(vector_extract (nxv4bf16 ZPR:$Zs), (i64 0)),
-            (bf16 (EXTRACT_SUBREG ZPR:$Zs, hsub))>;
-  def : Pat<(vector_extract (nxv2bf16 ZPR:$Zs), (i64 0)),
-            (bf16 (EXTRACT_SUBREG ZPR:$Zs, hsub))>;
-  def : Pat<(vector_extract (nxv4f32 ZPR:$Zs), (i64 0)),
-            (f32 (EXTRACT_SUBREG ZPR:$Zs, ssub))>;
-  def : Pat<(vector_extract (nxv2f32 ZPR:$Zs), (i64 0)),
-            (f32 (EXTRACT_SUBREG ZPR:$Zs, ssub))>;
-  def : Pat<(vector_extract (nxv2f64 ZPR:$Zs), (i64 0)),
-            (f64 (EXTRACT_SUBREG ZPR:$Zs, dsub))>;
+  def : Pat<(i32 (vector_extract (nxv16i8 ZPR:$Zs), (i64 0))),
+            (EXTRACT_SUBREG ZPR:$Zs, ssub)>;
+  def : Pat<(i32 (vector_extract (nxv8i16 ZPR:$Zs), (i64 0))),
+            (EXTRACT_SUBREG ZPR:$Zs, ssub)>;
+  def : Pat<(i32 (vector_extract (nxv4i32 ZPR:$Zs), (i64 0))),
+            (EXTRACT_SUBREG ZPR:$Zs, ssub)>;
+  def : Pat<(i64 (vector_extract (nxv2i64 ZPR:$Zs), (i64 0))),
+            (EXTRACT_SUBREG ZPR:$Zs, dsub)>;
+  def : Pat<(f16 (vector_extract (nxv8f16 ZPR:$Zs), (i64 0))),
+            (EXTRACT_SUBREG ZPR:$Zs, hsub)>;
+  def : Pat<(f16 (vector_extract (nxv4f16 ZPR:$Zs), (i64 0))),
+            (EXTRACT_SUBREG ZPR:$Zs, hsub)>;
+  def : Pat<(f16 (vector_extract (nxv2f16 ZPR:$Zs), (i64 0))),
+            (EXTRACT_SUBREG ZPR:$Zs, hsub)>;
+  def : Pat<(bf16 (vector_extract (nxv8bf16 ZPR:$Zs), (i64 0))),
+            (EXTRACT_SUBREG ZPR:$Zs, hsub)>;
+  def : Pat<(bf16 (vector_extract (nxv4bf16 ZPR:$Zs), (i64 0))),
+            (EXTRACT_SUBREG ZPR:$Zs, hsub)>;
+  def : Pat<(bf16 (vector_extract (nxv2bf16 ZPR:$Zs), (i64 0))),
+            (EXTRACT_SUBREG ZPR:$Zs, hsub)>;
+  def : Pat<(f32 (vector_extract (nxv4f32 ZPR:$Zs), (i64 0))),
+            (EXTRACT_SUBREG ZPR:$Zs, ssub)>;
+  def : Pat<(f32 (vector_extract (nxv2f32 ZPR:$Zs), (i64 0))),
+            (EXTRACT_SUBREG ZPR:$Zs, ssub)>;
+  def : Pat<(f64 (vector_extract (nxv2f64 ZPR:$Zs), (i64 0))),
+            (EXTRACT_SUBREG ZPR:$Zs, dsub)>;
   }
 
   multiclass sve_predicated_add<SDNode extend, int value> {
diff --git a/llvm/lib/Target/AArch64/AArch64StackTagging.cpp b/llvm/lib/Target/AArch64/AArch64StackTagging.cpp
index ef7c517732ef3f..aabc5d5d22e2d3 100644
--- a/llvm/lib/Target/AArch64/AArch64StackTagging.cpp
+++ b/llvm/lib/Target/AArch64/AArch64StackTagging.cpp
@@ -532,8 +532,10 @@ bool AArch64StackTagging::runOnFunction(Function &Fn) {
                               ConstantInt::get(IRB.getInt64Ty(), Tag)});
     if (Info.AI->hasName())
       TagPCall->setName(Info.AI->getName() + ".tag");
-    // Does not replace metadata, so we don't have to handle DPValues.
-    Info.AI->replaceNonMetadataUsesWith(TagPCall);
+    // Does not replace metadata, so we don't have to handle DbgVariableRecords.
+    Info.AI->replaceUsesWithIf(TagPCall, [&](const Use &U) {
+      return !memtag::isLifetimeIntrinsic(U.getUser());
+    });
     TagPCall->setOperand(0, Info.AI);
 
     // Calls to functions that may return twice (e.g. setjmp) confuse the
@@ -550,7 +552,7 @@ bool AArch64StackTagging::runOnFunction(Function &Fn) {
       uint64_t Size =
           cast<ConstantInt>(Start->getArgOperand(0))->getZExtValue();
       Size = alignTo(Size, kTagGranuleSize);
-      tagAlloca(AI, Start->getNextNode(), Start->getArgOperand(1), Size);
+      tagAlloca(AI, Start->getNextNode(), TagPCall, Size);
 
       auto TagEnd = [&](Instruction *Node) { untagAlloca(AI, Node, Size); };
       if (!DT || !PDT ||
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 755b034764ed2d..7a86c5c6088120 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -1129,7 +1129,7 @@ static std::optional<Instruction *> instCombineSVELast(InstCombiner &IC,
       auto *NewRHS =
           IC.Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, RHS});
       auto *NewBinOp = BinaryOperator::CreateWithCopiedFlags(
-          OpC, NewLHS, NewRHS, OldBinOp, OldBinOp->getName(), &II);
+          OpC, NewLHS, NewRHS, OldBinOp, OldBinOp->getName(), II.getIterator());
       return IC.replaceInstUsesWith(II, NewBinOp);
     }
   }
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
index 7a49422c064b7c..677dd0b502b956 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
@@ -2852,8 +2852,8 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
       return false;
     }
 
-    uint64_t MemSizeInBytes = LdSt.getMemSize();
-    unsigned MemSizeInBits = LdSt.getMemSizeInBits();
+    uint64_t MemSizeInBytes = LdSt.getMemSize().getValue();
+    unsigned MemSizeInBits = LdSt.getMemSizeInBits().getValue();
     AtomicOrdering Order = LdSt.getMMO().getSuccessOrdering();
 
     // Need special instructions for atomics that affect ordering.
@@ -3276,7 +3276,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
           RBI.getRegBank(SrcReg, MRI, TRI)->getID() == AArch64::GPRRegBankID;
       if (LoadMI && IsGPR) {
         const MachineMemOperand *MemOp = *LoadMI->memoperands_begin();
-        unsigned BytesLoaded = MemOp->getSize();
+        unsigned BytesLoaded = MemOp->getSize().getValue();
         if (BytesLoaded < 4 && SrcTy.getSizeInBytes() == BytesLoaded)
           return selectCopy(I, TII, MRI, TRI, RBI);
       }
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index 36adada2796531..996abe8e47396b 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -373,6 +373,11 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
       .legalForTypesWithMemDesc(
           {{s32, p0, s8, 8}, {s32, p0, s16, 8}, {s64, p0, s32, 8}})
       .widenScalarToNextPow2(0, /* MinSize = */ 8)
+      .clampMaxNumElements(0, s8, 16)
+      .clampMaxNumElements(0, s16, 8)
+      .clampMaxNumElements(0, s32, 4)
+      .clampMaxNumElements(0, s64, 2)
+      .clampMaxNumElements(0, p0, 2)
       .lowerIfMemSizeNotByteSizePow2()
       .clampScalar(0, s8, s64)
       .narrowScalarIf(
@@ -383,11 +388,6 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
                    Query.Types[0].getSizeInBits() > 32;
           },
           changeTo(0, s32))
-      .clampMaxNumElements(0, s8, 16)
-      .clampMaxNumElements(0, s16, 8)
-      .clampMaxNumElements(0, s32, 4)
-      .clampMaxNumElements(0, s64, 2)
-      .clampMaxNumElements(0, p0, 2)
       // TODO: Use BITCAST for v2i8, v2i16 after G_TRUNC gets sorted out
       .bitcastIf(typeInSet(0, {v4s8}),
                  [=](const LegalityQuery &Query) {
@@ -600,7 +600,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
       .legalIf(ExtLegalFunc)
       .legalFor({{v2s64, v2s32}, {v4s32, v4s16}, {v8s16, v8s8}})
       .clampScalar(0, s64, s64) // Just for s128, others are handled above.
-      .moreElementsToNextPow2(1)
+      .moreElementsToNextPow2(0)
       .clampMaxNumElements(1, s8, 8)
       .clampMaxNumElements(1, s16, 4)
       .clampMaxNumElements(1, s32, 2)
@@ -628,7 +628,9 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
         return DstTy.isVector() && SrcTy.getSizeInBits() > 128 &&
                DstTy.getScalarSizeInBits() * 2 <= SrcTy.getScalarSizeInBits();
       })
-
+      .clampMinNumElements(0, s8, 8)
+      .clampMinNumElements(0, s16, 4)
+      .clampMinNumElements(0, s32, 2)
       .alwaysLegal();
 
   getActionDefinitionsBuilder(G_SEXT_INREG)
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp
index 51c52aad359497..d8ca5494ba50a4 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp
@@ -311,7 +311,7 @@ bool matchSplitStoreZero128(MachineInstr &MI, MachineRegisterInfo &MRI) {
   LLT ValTy = MRI.getType(Store.getValueReg());
   if (!ValTy.isVector() || ValTy.getSizeInBits() != 128)
     return false;
-  if (ValTy.getSizeInBits() != Store.getMemSizeInBits())
+  if (Store.getMemSizeInBits() != ValTy.getSizeInBits())
     return false; // Don't split truncating stores.
   if (!MRI.hasOneNonDBGUse(Store.getValueReg()))
     return false;
@@ -658,7 +658,7 @@ bool AArch64PostLegalizerCombiner::optimizeConsecutiveMemOpAddressing(
         APInt Offset;
         LLT StoredValTy = MRI.getType(St->getValueReg());
         unsigned ValSize = StoredValTy.getSizeInBits();
-        if (ValSize < 32 || ValSize != St->getMMO().getSizeInBits())
+        if (ValSize < 32 || St->getMMO().getSizeInBits() != ValSize)
           continue;
 
         Register PtrReg = St->getPointerReg();
diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td
index c19e02bb03d1f9..1e76d58669da11 100644
--- a/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -958,39 +958,39 @@ multiclass sve_int_count_r_x64<bits<5> opc, string asm,
 
   // combine_op(x, trunc(cntp(all_active, p))) ==> inst p, x
   def : Pat<(i32 (combine_op GPR32:$Rn, (trunc (int_aarch64_sve_cntp_oneuse (nxv16i1 (SVEAllActive)), (nxv16i1 PPRAny:$pred))))),
-            (i32 (EXTRACT_SUBREG (!cast<Instruction>(NAME # _B) PPRAny:$pred,
-                                     (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$Rn, sub_32)),
-                                 sub_32))>;
+            (EXTRACT_SUBREG (!cast<Instruction>(NAME # _B) PPRAny:$pred,
+                                     (INSERT_SUBREG (IMPLICIT_DEF), GPR32:$Rn, sub_32)),
+                                 sub_32)>;
   def : Pat<(i32 (combine_op GPR32:$Rn, (trunc (int_aarch64_sve_cntp_oneuse (nxv8i1 (SVEAllActive)), (nxv8i1 PPRAny:$pred))))),
-            (i32 (EXTRACT_SUBREG (!cast<Instruction>(NAME # _H) PPRAny:$pred,
-                                     (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$Rn, sub_32)),
-                                 sub_32))>;
+            (EXTRACT_SUBREG (!cast<Instruction>(NAME # _H) PPRAny:$pred,
+                                     (INSERT_SUBREG (IMPLICIT_DEF), GPR32:$Rn, sub_32)),
+                                 sub_32)>;
   def : Pat<(i32 (combine_op GPR32:$Rn, (trunc (int_aarch64_sve_cntp_oneuse (nxv4i1 (SVEAllActive)), (nxv4i1 PPRAny:$pred))))),
-            (i32 (EXTRACT_SUBREG (!cast<Instruction>(NAME # _S) PPRAny:$pred,
-                                     (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$Rn, sub_32)),
-                                 sub_32))>;
+            (EXTRACT_SUBREG (!cast<Instruction>(NAME # _S) PPRAny:$pred,
+                                     (INSERT_SUBREG (IMPLICIT_DEF), GPR32:$Rn, sub_32)),
+                                 sub_32)>;
   def : Pat<(i32 (combine_op GPR32:$Rn, (trunc (int_aarch64_sve_cntp_oneuse (nxv2i1 (SVEAllActive)), (nxv2i1 PPRAny:$pred))))),
-            (i32 (EXTRACT_SUBREG (!cast<Instruction>(NAME # _D) PPRAny:$pred,
-                                     (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$Rn, sub_32)),
-                                 sub_32))>;
+            (EXTRACT_SUBREG (!cast<Instruction>(NAME # _D) PPRAny:$pred,
+                                     (INSERT_SUBREG (IMPLICIT_DEF), GPR32:$Rn, sub_32)),
+                                 sub_32)>;
 
   // combine_op(x, trunc(cntp(p, p))) ==> inst p, x
   def : Pat<(i32 (combine_op GPR32:$Rn, (trunc (int_aarch64_sve_cntp_oneuse (nxv16i1 PPRAny:$pred), (nxv16i1 PPRAny:$pred))))),
-            (i32 (EXTRACT_SUBREG (!cast<Instruction>(NAME # _B) PPRAny:$pred,
-                                     (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$Rn, sub_32)),
-                                 sub_32))>;
+            (EXTRACT_SUBREG (!cast<Instruction>(NAME # _B) PPRAny:$pred,
+                                     (INSERT_SUBREG (IMPLICIT_DEF), GPR32:$Rn, sub_32)),
+                                 sub_32)>;
   def : Pat<(i32 (combine_op GPR32:$Rn, (trunc (int_aarch64_sve_cntp_oneuse (nxv8i1 PPRAny:$pred), (nxv8i1 PPRAny:$pred))))),
-            (i32 (EXTRACT_SUBREG (!cast<Instruction>(NAME # _H) PPRAny:$pred,
-                                     (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$Rn, sub_32)),
-                                 sub_32))>;
+            (EXTRACT_SUBREG (!cast<Instruction>(NAME # _H) PPRAny:$pred,
+                                     (INSERT_SUBREG (IMPLICIT_DEF), GPR32:$Rn, sub_32)),
+                                 sub_32)>;
   def : Pat<(i32 (combine_op GPR32:$Rn, (trunc (int_aarch64_sve_cntp_oneuse (nxv4i1 PPRAny:$pred), (nxv4i1 PPRAny:$pred))))),
-            (i32 (EXTRACT_SUBREG (!cast<Instruction>(NAME # _S) PPRAny:$pred,
-                                     (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$Rn, sub_32)),
-                                 sub_32))>;
+            (EXTRACT_SUBREG (!cast<Instruction>(NAME # _S) PPRAny:$pred,
+                                     (INSERT_SUBREG (IMPLICIT_DEF), GPR32:$Rn, sub_32)),
+                                 sub_32)>;
   def : Pat<(i32 (combine_op GPR32:$Rn, (trunc (int_aarch64_sve_cntp_oneuse (nxv2i1 PPRAny:$pred), (nxv2i1 PPRAny:$pred))))),
-            (i32 (EXTRACT_SUBREG (!cast<Instruction>(NAME # _D) PPRAny:$pred,
-                                     (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$Rn, sub_32)),
-                                 sub_32))>;
+            (EXTRACT_SUBREG (!cast<Instruction>(NAME # _D) PPRAny:$pred,
+                                     (INSERT_SUBREG (IMPLICIT_DEF), GPR32:$Rn, sub_32)),
+                                 sub_32)>;
 }
 
 class sve_int_count_v<bits<2> sz8_64, bits<5> opc, string asm,
@@ -1195,19 +1195,19 @@ multiclass sve_int_pred_pattern_a<bits<3> opc, string asm,
               (!cast<Instruction>(NAME) GPR64:$Rdn, sve_pred_enum:$pattern, $imm)>;
 
     def : Pat<(i32 (op GPR32:$Rdn, (i32 (trunc (opcnt (sve_pred_enum:$pattern)))))),
-              (i32 (EXTRACT_SUBREG (!cast<Instruction>(NAME) (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
+              (EXTRACT_SUBREG (!cast<Instruction>(NAME) (INSERT_SUBREG (IMPLICIT_DEF),
                                                GPR32:$Rdn, sub_32), sve_pred_enum:$pattern, 1),
-                                    sub_32))>;
+                                    sub_32)>;
 
     def : Pat<(i32 (op GPR32:$Rdn, (mul (i32 (trunc (opcnt (sve_pred_enum:$pattern)))), (sve_cnt_mul_imm_i32 i32:$imm)))),
-              (i32 (EXTRACT_SUBREG (!cast<Instruction>(NAME) (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
+              (EXTRACT_SUBREG (!cast<Instruction>(NAME) (INSERT_SUBREG (IMPLICIT_DEF),
                                                GPR32:$Rdn, sub_32), sve_pred_enum:$pattern, $imm),
-                                    sub_32))>;
+                                    sub_32)>;
 
     def : Pat<(i32 (op GPR32:$Rdn, (shl (i32 (trunc (opcnt (sve_pred_enum:$pattern)))), (sve_cnt_shl_imm i32:$imm)))),
-              (i32 (EXTRACT_SUBREG (!cast<Instruction>(NAME) (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
+              (EXTRACT_SUBREG (!cast<Instruction>(NAME) (INSERT_SUBREG (IMPLICIT_DEF),
                                                GPR32:$Rdn, sub_32), sve_pred_enum:$pattern, $imm),
-                                    sub_32))>;
+                                    sub_32)>;
   }
 }
 
@@ -5160,9 +5160,9 @@ multiclass SVE_SETCC_Pat<CondCode cc, CondCode invcc, ValueType predvt,
             (cmp $Op1, $Op2, $Op3)>;
   def : Pat<(predvt (AArch64setcc_z predvt:$Op1, intvt:$Op2, intvt:$Op3, invcc)),
             (cmp $Op1, $Op3, $Op2)>;
-  def : Pat<(predvt (and predvt:$Pg, (AArch64setcc_z_oneuse (predvt (AArch64ptrue 31)), intvt:$Op2, intvt:$Op3, cc))),
+  def : Pat<(predvt (and predvt:$Pg, (AArch64setcc_z_oneuse (predvt (SVEAllActive)), intvt:$Op2, intvt:$Op3, cc))),
             (cmp $Pg, $Op2, $Op3)>;
-  def : Pat<(predvt (and predvt:$Pg, (AArch64setcc_z_oneuse (predvt (AArch64ptrue 31)), intvt:$Op2, intvt:$Op3, invcc))),
+  def : Pat<(predvt (and predvt:$Pg, (AArch64setcc_z_oneuse (predvt (SVEAllActive)), intvt:$Op2, intvt:$Op3, invcc))),
             (cmp $Pg, $Op3, $Op2)>;
 }
 
@@ -5172,9 +5172,9 @@ multiclass SVE_SETCC_Pat_With_Zero<CondCode cc, CondCode invcc, ValueType predvt
             (cmp $Op1, $Op2)>;
   def : Pat<(predvt (AArch64setcc_z predvt:$Op1, (SVEDup0), intvt:$Op2, invcc)),
             (cmp $Op1, $Op2)>;
-  def : Pat<(predvt (and predvt:$Pg, (AArch64setcc_z_oneuse (predvt (AArch64ptrue 31)), intvt:$Op1, (SVEDup0), cc))),
+  def : Pat<(predvt (and predvt:$Pg, (AArch64setcc_z_oneuse (predvt (SVEAllActive)), intvt:$Op1, (SVEDup0), cc))),
             (cmp $Pg, $Op1)>;
-  def : Pat<(predvt (and predvt:$Pg, (AArch64setcc_z_oneuse (predvt (AArch64ptrue 31)), (SVEDup0), intvt:$Op1, invcc))),
+  def : Pat<(predvt (and predvt:$Pg, (AArch64setcc_z_oneuse (predvt (SVEAllActive)), (SVEDup0), intvt:$Op1, invcc))),
             (cmp $Pg, $Op1)>;
 }
 
@@ -5258,13 +5258,13 @@ multiclass SVE_SETCC_Imm_Pat<CondCode cc, CondCode commuted_cc,
                                     commuted_cc)),
             (cmp $Pg, $Zs1, immtype:$imm)>;
   def : Pat<(predvt (and predvt:$Pg,
-                         (AArch64setcc_z_oneuse (predvt (AArch64ptrue 31)),
+                         (AArch64setcc_z_oneuse (predvt (SVEAllActive)),
                                          (intvt ZPR:$Zs1),
                                          (intvt (splat_vector (immtype:$imm))),
                                          cc))),
             (cmp $Pg, $Zs1, immtype:$imm)>;
   def : Pat<(predvt (and predvt:$Pg,
-                         (AArch64setcc_z_oneuse (predvt (AArch64ptrue 31)),
+                         (AArch64setcc_z_oneuse (predvt (SVEAllActive)),
                                          (intvt (splat_vector (immtype:$imm))),
                                          (intvt ZPR:$Zs1),
                                          commuted_cc))),
@@ -5743,23 +5743,23 @@ multiclass sve_int_index_ir<string asm, SDPatternOperator mulop, SDPatternOperat
             (!cast<Instruction>(NAME # "_D") simm5_64b:$imm5, (SUBREG_TO_REG (i64 0), (!cast<Instruction>("MOVi32imm") (!cast<SDNodeXForm>("trunc_imm") $imm)), sub_32))>;
 
   // mul(step_vector(1), dup(Y)) -> index(0, Y).
-  def : Pat<(mulop (nxv16i1 (AArch64ptrue 31)), (nxv16i8 (step_vector_oneuse (i8 1))), (nxv16i8 (splat_vector(i32 GPR32:$Rm)))),
+  def : Pat<(mulop (nxv16i1 (SVEAllActive)), (nxv16i8 (step_vector_oneuse (i8 1))), (nxv16i8 (splat_vector(i32 GPR32:$Rm)))),
             (!cast<Instruction>(NAME # "_B") (i32 0), GPR32:$Rm)>;
-  def : Pat<(mulop (nxv8i1 (AArch64ptrue 31)), (nxv8i16 (step_vector_oneuse (i16 1))), (nxv8i16 (splat_vector(i32 GPR32:$Rm)))),
+  def : Pat<(mulop (nxv8i1 (SVEAllActive)), (nxv8i16 (step_vector_oneuse (i16 1))), (nxv8i16 (splat_vector(i32 GPR32:$Rm)))),
             (!cast<Instruction>(NAME # "_H") (i32 0), GPR32:$Rm)>;
-  def : Pat<(mulop (nxv4i1 (AArch64ptrue 31)), (nxv4i32 (step_vector_oneuse (i32 1))), (nxv4i32 (splat_vector(i32 GPR32:$Rm)))),
+  def : Pat<(mulop (nxv4i1 (SVEAllActive)), (nxv4i32 (step_vector_oneuse (i32 1))), (nxv4i32 (splat_vector(i32 GPR32:$Rm)))),
             (!cast<Instruction>(NAME # "_S") (i32 0), GPR32:$Rm)>;
-  def : Pat<(mulop (nxv2i1 (AArch64ptrue 31)), (nxv2i64 (step_vector_oneuse (i64 1))), (nxv2i64 (splat_vector(i64 GPR64:$Rm)))),
+  def : Pat<(mulop (nxv2i1 (SVEAllActive)), (nxv2i64 (step_vector_oneuse (i64 1))), (nxv2i64 (splat_vector(i64 GPR64:$Rm)))),
             (!cast<Instruction>(NAME # "_D") (i64 0), GPR64:$Rm)>;
 
   // add(mul(step_vector(1), dup(Y)), dup(X)) -> index(X, Y).
-  def : Pat<(add (muloneuseop (nxv16i1 (AArch64ptrue 31)), (nxv16i8 (step_vector_oneuse (i8 1))), (nxv16i8 (splat_vector(i32 GPR32:$Rm)))), (nxv16i8 (splat_vector(simm5_8b:$imm5)))),
+  def : Pat<(add (muloneuseop (nxv16i1 (SVEAllActive)), (nxv16i8 (step_vector_oneuse (i8 1))), (nxv16i8 (splat_vector(i32 GPR32:$Rm)))), (nxv16i8 (splat_vector(simm5_8b:$imm5)))),
             (!cast<Instruction>(NAME # "_B") simm5_8b:$imm5, GPR32:$Rm)>;
-  def : Pat<(add (muloneuseop (nxv8i1 (AArch64ptrue 31)), (nxv8i16 (step_vector_oneuse (i16 1))), (nxv8i16 (splat_vector(i32 GPR32:$Rm)))), (nxv8i16 (splat_vector(simm5_16b:$imm5)))),
+  def : Pat<(add (muloneuseop (nxv8i1 (SVEAllActive)), (nxv8i16 (step_vector_oneuse (i16 1))), (nxv8i16 (splat_vector(i32 GPR32:$Rm)))), (nxv8i16 (splat_vector(simm5_16b:$imm5)))),
             (!cast<Instruction>(NAME # "_H") simm5_16b:$imm5, GPR32:$Rm)>;
-  def : Pat<(add (muloneuseop (nxv4i1 (AArch64ptrue 31)), (nxv4i32 (step_vector_oneuse (i32 1))), (nxv4i32 (splat_vector(i32 GPR32:$Rm)))), (nxv4i32 (splat_vector(simm5_32b:$imm5)))),
+  def : Pat<(add (muloneuseop (nxv4i1 (SVEAllActive)), (nxv4i32 (step_vector_oneuse (i32 1))), (nxv4i32 (splat_vector(i32 GPR32:$Rm)))), (nxv4i32 (splat_vector(simm5_32b:$imm5)))),
             (!cast<Instruction>(NAME # "_S") simm5_32b:$imm5, GPR32:$Rm)>;
-  def : Pat<(add (muloneuseop (nxv2i1 (AArch64ptrue 31)), (nxv2i64 (step_vector_oneuse (i64 1))), (nxv2i64 (splat_vector(i64 GPR64:$Rm)))), (nxv2i64 (splat_vector(simm5_64b:$imm5)))),
+  def : Pat<(add (muloneuseop (nxv2i1 (SVEAllActive)), (nxv2i64 (step_vector_oneuse (i64 1))), (nxv2i64 (splat_vector(i64 GPR64:$Rm)))), (nxv2i64 (splat_vector(simm5_64b:$imm5)))),
             (!cast<Instruction>(NAME # "_D") simm5_64b:$imm5, GPR64:$Rm)>;
 }
 
@@ -5837,13 +5837,13 @@ multiclass sve_int_index_rr<string asm, SDPatternOperator mulop> {
             (!cast<Instruction>(NAME # "_D") GPR64:$Rn, (SUBREG_TO_REG (i64 0), (!cast<Instruction>("MOVi32imm") (!cast<SDNodeXForm>("trunc_imm") $imm)), sub_32))>;
 
   // add(mul(step_vector(1), dup(Y)), dup(X)) -> index(X, Y).
-  def : Pat<(add (mulop (nxv16i1 (AArch64ptrue 31)), (nxv16i8 (step_vector_oneuse (i8 1))), (nxv16i8 (splat_vector(i32 GPR32:$Rm)))), (nxv16i8 (splat_vector(i32 GPR32:$Rn)))),
+  def : Pat<(add (mulop (nxv16i1 (SVEAllActive)), (nxv16i8 (step_vector_oneuse (i8 1))), (nxv16i8 (splat_vector(i32 GPR32:$Rm)))), (nxv16i8 (splat_vector(i32 GPR32:$Rn)))),
             (!cast<Instruction>(NAME # "_B") GPR32:$Rn, GPR32:$Rm)>;
-  def : Pat<(add (mulop (nxv8i1 (AArch64ptrue 31)), (nxv8i16 (step_vector_oneuse (i16 1))), (nxv8i16 (splat_vector(i32 GPR32:$Rm)))),(nxv8i16 (splat_vector(i32 GPR32:$Rn)))),
+  def : Pat<(add (mulop (nxv8i1 (SVEAllActive)), (nxv8i16 (step_vector_oneuse (i16 1))), (nxv8i16 (splat_vector(i32 GPR32:$Rm)))),(nxv8i16 (splat_vector(i32 GPR32:$Rn)))),
             (!cast<Instruction>(NAME # "_H") GPR32:$Rn, GPR32:$Rm)>;
-  def : Pat<(add (mulop (nxv4i1 (AArch64ptrue 31)), (nxv4i32 (step_vector_oneuse (i32 1))), (nxv4i32 (splat_vector(i32 GPR32:$Rm)))),(nxv4i32 (splat_vector(i32 GPR32:$Rn)))),
+  def : Pat<(add (mulop (nxv4i1 (SVEAllActive)), (nxv4i32 (step_vector_oneuse (i32 1))), (nxv4i32 (splat_vector(i32 GPR32:$Rm)))),(nxv4i32 (splat_vector(i32 GPR32:$Rn)))),
             (!cast<Instruction>(NAME # "_S") GPR32:$Rn, GPR32:$Rm)>;
-  def : Pat<(add (mulop (nxv2i1 (AArch64ptrue 31)), (nxv2i64 (step_vector_oneuse (i64 1))), (nxv2i64 (splat_vector(i64 GPR64:$Rm)))),(nxv2i64 (splat_vector(i64 GPR64:$Rn)))),
+  def : Pat<(add (mulop (nxv2i1 (SVEAllActive)), (nxv2i64 (step_vector_oneuse (i64 1))), (nxv2i64 (splat_vector(i64 GPR64:$Rm)))),(nxv2i64 (splat_vector(i64 GPR64:$Rn)))),
             (!cast<Instruction>(NAME # "_D") GPR64:$Rn, GPR64:$Rm)>;
 }
 
diff --git a/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h b/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h
index ed8336a2e8ad34..f821bb527aedb8 100644
--- a/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h
+++ b/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h
@@ -591,6 +591,14 @@ namespace AArch64BTIHint {
   #include "AArch64GenSystemOperands.inc"
 }
 
+namespace AArch64SME {
+enum ToggleCondition : unsigned {
+  Always,
+  IfCallerIsStreaming,
+  IfCallerIsNonStreaming
+};
+}
+
 namespace AArch64SE {
     enum ShiftExtSpecifiers {
         Invalid = -1,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
index 6d05c3678bf09e..7e1f041fa10933 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -1301,6 +1301,9 @@ bool AMDGPUCallLowering::lowerTailCall(
   if (!handleAssignments(Handler, OutArgs, CCInfo, ArgLocs, MIRBuilder))
     return false;
 
+  if (Info.ConvergenceCtrlToken) {
+    MIB.addUse(Info.ConvergenceCtrlToken, RegState::Implicit);
+  }
   handleImplicitCallArguments(MIRBuilder, MIB, ST, *FuncInfo, CalleeCC,
                               ImplicitArgRegs);
 
@@ -1483,6 +1486,9 @@ bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
 
   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
 
+  if (Info.ConvergenceCtrlToken) {
+    MIB.addUse(Info.ConvergenceCtrlToken, RegState::Implicit);
+  }
   handleImplicitCallArguments(MIRBuilder, MIB, ST, *MFI, Info.CallConv,
                               ImplicitArgRegs);
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index 0edbbf7cb0af54..bddf3d958a1ae6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -1749,7 +1749,7 @@ static bool isInterestingPHIIncomingValue(const Value *V) {
     // Non constant index/out of bounds index -> folding is unlikely.
     // The latter is more of a sanity check because canonical IR should just
     // have replaced those with poison.
-    if (!Idx || Idx->getSExtValue() >= FVT->getNumElements())
+    if (!Idx || Idx->getZExtValue() >= FVT->getNumElements())
       return false;
 
     const auto *VecSrc = IE->getOperand(0);
@@ -1761,7 +1761,7 @@ static bool isInterestingPHIIncomingValue(const Value *V) {
       return false;
 
     CurVal = VecSrc;
-    EltsCovered.set(Idx->getSExtValue());
+    EltsCovered.set(Idx->getZExtValue());
 
     // All elements covered.
     if (EltsCovered.all())
@@ -1987,7 +1987,7 @@ bool AMDGPUCodeGenPrepareImpl::visitPHINode(PHINode &I) {
   for (VectorSlice &S : Slices) {
     // We need to reset the build on each iteration, because getSlicedVal may
     // have inserted something into I's BB.
-    B.SetInsertPoint(I.getParent()->getFirstNonPHI());
+    B.SetInsertPoint(I.getParent()->getFirstNonPHIIt());
     S.NewPHI = B.CreatePHI(S.Ty, I.getNumIncomingValues());
 
     for (const auto &[Idx, BB] : enumerate(I.blocks())) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index c99e490014668b..bba7682cd7a0d1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -3556,7 +3556,10 @@ bool AMDGPUDAGToDAGISel::isUniformLoad(const SDNode *N) const {
   if (N->isDivergent() && !AMDGPUInstrInfo::isUniformMMO(MMO))
     return false;
 
-  return Ld->getAlign() >= Align(std::min(MMO->getSize(), uint64_t(4))) &&
+  return MMO->getSize().hasValue() &&
+         Ld->getAlign() >=
+             Align(std::min(MMO->getSize().getValue().getKnownMinValue(),
+                            uint64_t(4))) &&
          ((Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
            Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) ||
           (Subtarget->getScalarizeGlobalBehavior() &&
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 8a71550e5532cd..bee43b6c18c880 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -582,6 +582,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
 
   setMaxAtomicSizeInBitsSupported(64);
   setMaxDivRemBitWidthSupported(64);
+  setMaxLargeFPConvertBitWidthSupported(64);
 }
 
 bool AMDGPUTargetLowering::mayIgnoreSignedZero(SDValue Op) const {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
index fb829fab0a2c19..5b7fa13f2e835f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -767,19 +767,21 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
     // Checking for NaN before canonicalization provides better fidelity when
     // mapping other operations onto fmed3 since the order of operands is
     // unchanged.
-    CallInst *NewCall = nullptr;
+    Value *V = nullptr;
     if (match(Src0, PatternMatch::m_NaN()) || isa<UndefValue>(Src0)) {
-      NewCall = IC.Builder.CreateMinNum(Src1, Src2);
+      V = IC.Builder.CreateMinNum(Src1, Src2);
     } else if (match(Src1, PatternMatch::m_NaN()) || isa<UndefValue>(Src1)) {
-      NewCall = IC.Builder.CreateMinNum(Src0, Src2);
+      V = IC.Builder.CreateMinNum(Src0, Src2);
     } else if (match(Src2, PatternMatch::m_NaN()) || isa<UndefValue>(Src2)) {
-      NewCall = IC.Builder.CreateMaxNum(Src0, Src1);
+      V = IC.Builder.CreateMaxNum(Src0, Src1);
     }
 
-    if (NewCall) {
-      NewCall->copyFastMathFlags(&II);
-      NewCall->takeName(&II);
-      return IC.replaceInstUsesWith(II, NewCall);
+    if (V) {
+      if (auto *CI = dyn_cast<CallInst>(V)) {
+        CI->copyFastMathFlags(&II);
+        CI->takeName(&II);
+      }
+      return IC.replaceInstUsesWith(II, V);
     }
 
     bool Swap = false;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 0029c51231f286..90872516dd6db1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -5661,7 +5661,7 @@ bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
   Register RSrc = MI.getOperand(2).getReg();
 
   MachineMemOperand *MMO = *MI.memoperands_begin();
-  const int MemSize = MMO->getSize();
+  const int MemSize = MMO->getSize().getValue();
 
   unsigned ImmOffset;
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
index 2da896edc79a7f..84b4ccc1ae7ba7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
@@ -470,9 +470,11 @@ bool AMDGPULibCalls::sincosUseNative(CallInst *aCI, const FuncInfo &FInfo) {
     nf.setId(AMDGPULibFunc::EI_COS);
     FunctionCallee cosExpr = getFunction(M, nf);
     if (sinExpr && cosExpr) {
-      Value *sinval = CallInst::Create(sinExpr, opr0, "splitsin", aCI);
-      Value *cosval = CallInst::Create(cosExpr, opr0, "splitcos", aCI);
-      new StoreInst(cosval, aCI->getArgOperand(1), aCI);
+      Value *sinval =
+          CallInst::Create(sinExpr, opr0, "splitsin", aCI->getIterator());
+      Value *cosval =
+          CallInst::Create(cosExpr, opr0, "splitcos", aCI->getIterator());
+      new StoreInst(cosval, aCI->getArgOperand(1), aCI->getIterator());
 
       DEBUG_WITH_TYPE("usenative", dbgs() << "<useNative> replace " << *aCI
                                           << " with native version of sin/cos");
@@ -1655,7 +1657,7 @@ bool AMDGPULibCalls::evaluateCall(CallInst *aCI, const FuncInfo &FInfo) {
     // sincos
     assert(FInfo.getId() == AMDGPULibFunc::EI_SINCOS &&
            "math function with ptr arg not supported yet");
-    new StoreInst(nval1, aCI->getArgOperand(1), aCI);
+    new StoreInst(nval1, aCI->getArgOperand(1), aCI->getIterator());
   }
 
   replaceCall(aCI, nval0);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
index 2cfd1de93a04f9..9083150b338488 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
@@ -801,7 +801,7 @@ Value *FatPtrConstMaterializer::materialize(Value *V) {
         Ops.push_back(cast<Constant>(U.get()));
       auto *NewGEP = ConstantExpr::getGetElementPtr(
           NewSrcTy, Ops[0], ArrayRef<Constant *>(Ops).slice(1),
-          GEPO->isInBounds(), GEPO->getInRangeIndex());
+          GEPO->isInBounds(), GEPO->getInRange());
       LLVM_DEBUG(dbgs() << "p7-getting GEP: " << *GEPO << " becomes " << *NewGEP
                         << "\n");
       Value *FurtherMap = materialize(NewGEP);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp
index 7b5dc3795b0227..aef0ade6d9be63 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp
@@ -290,8 +290,8 @@ bool AMDGPUPrintfRuntimeBindingImpl::lowerPrintfForGpu(Module &M) {
     Value *sumC = ConstantInt::get(SizetTy, Sum, false);
     SmallVector<Value *, 1> alloc_args;
     alloc_args.push_back(sumC);
-    CallInst *pcall =
-        CallInst::Create(PrintfAllocFn, alloc_args, "printf_alloc_fn", CI);
+    CallInst *pcall = CallInst::Create(PrintfAllocFn, alloc_args,
+                                       "printf_alloc_fn", CI->getIterator());
 
     //
     // Insert code to split basicblock with a
@@ -309,25 +309,27 @@ bool AMDGPUPrintfRuntimeBindingImpl::lowerPrintfForGpu(Module &M) {
     SplitBlock(CI->getParent(), cmp);
     Instruction *Brnch =
         SplitBlockAndInsertIfThen(cmp, cmp->getNextNode(), false);
+    BasicBlock::iterator BrnchPoint = Brnch->getIterator();
 
     Builder.SetInsertPoint(Brnch);
 
     // store unique printf id in the buffer
     //
     GetElementPtrInst *BufferIdx = GetElementPtrInst::Create(
-        I8Ty, pcall, ConstantInt::get(Ctx, APInt(32, 0)), "PrintBuffID", Brnch);
+        I8Ty, pcall, ConstantInt::get(Ctx, APInt(32, 0)), "PrintBuffID",
+        BrnchPoint);
 
     Type *idPointer = PointerType::get(I32Ty, AMDGPUAS::GLOBAL_ADDRESS);
     Value *id_gep_cast =
-        new BitCastInst(BufferIdx, idPointer, "PrintBuffIdCast", Brnch);
+        new BitCastInst(BufferIdx, idPointer, "PrintBuffIdCast", BrnchPoint);
 
-    new StoreInst(ConstantInt::get(I32Ty, UniqID), id_gep_cast, Brnch);
+    new StoreInst(ConstantInt::get(I32Ty, UniqID), id_gep_cast, BrnchPoint);
 
     // 1st 4 bytes hold the printf_id
     // the following GEP is the buffer pointer
     BufferIdx = GetElementPtrInst::Create(I8Ty, pcall,
                                           ConstantInt::get(Ctx, APInt(32, 4)),
-                                          "PrintBuffGep", Brnch);
+                                          "PrintBuffGep", BrnchPoint);
 
     Type *Int32Ty = Type::getInt32Ty(Ctx);
     for (unsigned ArgCount = 1;
@@ -405,7 +407,7 @@ bool AMDGPUPrintfRuntimeBindingImpl::lowerPrintfForGpu(Module &M) {
       for (unsigned I = 0, E = WhatToStore.size(); I != E; ++I) {
         Value *TheBtCast = WhatToStore[I];
         unsigned ArgSize = TD->getTypeAllocSize(TheBtCast->getType());
-        StoreInst *StBuff = new StoreInst(TheBtCast, BufferIdx, Brnch);
+        StoreInst *StBuff = new StoreInst(TheBtCast, BufferIdx, BrnchPoint);
         LLVM_DEBUG(dbgs() << "inserting store to printf buffer:\n"
                           << *StBuff << '\n');
         (void)StBuff;
@@ -413,7 +415,7 @@ bool AMDGPUPrintfRuntimeBindingImpl::lowerPrintfForGpu(Module &M) {
           break;
         BufferIdx = GetElementPtrInst::Create(
             I8Ty, BufferIdx, {ConstantInt::get(I32Ty, ArgSize)},
-            "PrintBuffNextPtr", Brnch);
+            "PrintBuffNextPtr", BrnchPoint);
         LLVM_DEBUG(dbgs() << "inserting gep to the printf buffer:\n"
                           << *BufferIdx << '\n');
       }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index b1b15e9915aea3..6f3cdf54dceec7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -32,6 +32,7 @@
 #include "llvm/Analysis/CaptureTracking.h"
 #include "llvm/Analysis/InstSimplifyFolder.h"
 #include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/IR/IRBuilder.h"
@@ -39,6 +40,7 @@
 #include "llvm/IR/IntrinsicsAMDGPU.h"
 #include "llvm/IR/IntrinsicsR600.h"
 #include "llvm/IR/PatternMatch.h"
+#include "llvm/InitializePasses.h"
 #include "llvm/Pass.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Transforms/Utils/SSAUpdater.h"
@@ -64,10 +66,17 @@ static cl::opt<unsigned> PromoteAllocaToVectorLimit(
     cl::desc("Maximum byte size to consider promote alloca to vector"),
     cl::init(0));
 
+static cl::opt<unsigned>
+    LoopUserWeight("promote-alloca-vector-loop-user-weight",
+                   cl::desc("The bonus weight of users of allocas within loop "
+                            "when sorting profitable allocas"),
+                   cl::init(4));
+
 // Shared implementation which can do both promotion to vector and to LDS.
 class AMDGPUPromoteAllocaImpl {
 private:
   const TargetMachine &TM;
+  LoopInfo &LI;
   Module *Mod = nullptr;
   const DataLayout *DL = nullptr;
 
@@ -101,8 +110,11 @@ class AMDGPUPromoteAllocaImpl {
   bool tryPromoteAllocaToVector(AllocaInst &I);
   bool tryPromoteAllocaToLDS(AllocaInst &I, bool SufficientLDS);
 
+  void sortAllocasToPromote(SmallVectorImpl<AllocaInst *> &Allocas);
+
 public:
-  AMDGPUPromoteAllocaImpl(TargetMachine &TM) : TM(TM) {
+  AMDGPUPromoteAllocaImpl(TargetMachine &TM, LoopInfo &LI) : TM(TM), LI(LI) {
+
     const Triple &TT = TM.getTargetTriple();
     IsAMDGCN = TT.getArch() == Triple::amdgcn;
     IsAMDHSA = TT.getOS() == Triple::AMDHSA;
@@ -122,7 +134,9 @@ class AMDGPUPromoteAlloca : public FunctionPass {
     if (skipFunction(F))
       return false;
     if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>())
-      return AMDGPUPromoteAllocaImpl(TPC->getTM<TargetMachine>())
+      return AMDGPUPromoteAllocaImpl(
+                 TPC->getTM<TargetMachine>(),
+                 getAnalysis<LoopInfoWrapperPass>().getLoopInfo())
           .run(F, /*PromoteToLDS*/ true);
     return false;
   }
@@ -131,6 +145,7 @@ class AMDGPUPromoteAlloca : public FunctionPass {
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.setPreservesCFG();
+    AU.addRequired<LoopInfoWrapperPass>();
     FunctionPass::getAnalysisUsage(AU);
   }
 };
@@ -145,7 +160,9 @@ class AMDGPUPromoteAllocaToVector : public FunctionPass {
     if (skipFunction(F))
       return false;
     if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>())
-      return AMDGPUPromoteAllocaImpl(TPC->getTM<TargetMachine>())
+      return AMDGPUPromoteAllocaImpl(
+                 TPC->getTM<TargetMachine>(),
+                 getAnalysis<LoopInfoWrapperPass>().getLoopInfo())
           .run(F, /*PromoteToLDS*/ false);
     return false;
   }
@@ -156,6 +173,7 @@ class AMDGPUPromoteAllocaToVector : public FunctionPass {
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.setPreservesCFG();
+    AU.addRequired<LoopInfoWrapperPass>();
     FunctionPass::getAnalysisUsage(AU);
   }
 };
@@ -186,18 +204,23 @@ INITIALIZE_PASS_BEGIN(AMDGPUPromoteAlloca, DEBUG_TYPE,
 // Move LDS uses from functions to kernels before promote alloca for accurate
 // estimation of LDS available
 INITIALIZE_PASS_DEPENDENCY(AMDGPULowerModuleLDSLegacy)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
 INITIALIZE_PASS_END(AMDGPUPromoteAlloca, DEBUG_TYPE,
                     "AMDGPU promote alloca to vector or LDS", false, false)
 
-INITIALIZE_PASS(AMDGPUPromoteAllocaToVector, DEBUG_TYPE "-to-vector",
-                "AMDGPU promote alloca to vector", false, false)
+INITIALIZE_PASS_BEGIN(AMDGPUPromoteAllocaToVector, DEBUG_TYPE "-to-vector",
+                      "AMDGPU promote alloca to vector", false, false)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_END(AMDGPUPromoteAllocaToVector, DEBUG_TYPE "-to-vector",
+                    "AMDGPU promote alloca to vector", false, false)
 
 char &llvm::AMDGPUPromoteAllocaID = AMDGPUPromoteAlloca::ID;
 char &llvm::AMDGPUPromoteAllocaToVectorID = AMDGPUPromoteAllocaToVector::ID;
 
 PreservedAnalyses AMDGPUPromoteAllocaPass::run(Function &F,
                                                FunctionAnalysisManager &AM) {
-  bool Changed = AMDGPUPromoteAllocaImpl(TM).run(F, /*PromoteToLDS*/ true);
+  auto &LI = AM.getResult<LoopAnalysis>(F);
+  bool Changed = AMDGPUPromoteAllocaImpl(TM, LI).run(F, /*PromoteToLDS=*/true);
   if (Changed) {
     PreservedAnalyses PA;
     PA.preserveSet<CFGAnalyses>();
@@ -208,7 +231,8 @@ PreservedAnalyses AMDGPUPromoteAllocaPass::run(Function &F,
 
 PreservedAnalyses
 AMDGPUPromoteAllocaToVectorPass::run(Function &F, FunctionAnalysisManager &AM) {
-  bool Changed = AMDGPUPromoteAllocaImpl(TM).run(F, /*PromoteToLDS*/ false);
+  auto &LI = AM.getResult<LoopAnalysis>(F);
+  bool Changed = AMDGPUPromoteAllocaImpl(TM, LI).run(F, /*PromoteToLDS=*/false);
   if (Changed) {
     PreservedAnalyses PA;
     PA.preserveSet<CFGAnalyses>();
@@ -225,6 +249,55 @@ FunctionPass *llvm::createAMDGPUPromoteAllocaToVector() {
   return new AMDGPUPromoteAllocaToVector();
 }
 
+static void collectAllocaUses(AllocaInst &Alloca,
+                              SmallVectorImpl<Use *> &Uses) {
+  SmallVector<Instruction *, 4> WorkList({&Alloca});
+  while (!WorkList.empty()) {
+    auto *Cur = WorkList.pop_back_val();
+    for (auto &U : Cur->uses()) {
+      Uses.push_back(&U);
+
+      if (isa<GetElementPtrInst>(U.getUser()))
+        WorkList.push_back(cast<Instruction>(U.getUser()));
+    }
+  }
+}
+
+void AMDGPUPromoteAllocaImpl::sortAllocasToPromote(
+    SmallVectorImpl<AllocaInst *> &Allocas) {
+  DenseMap<AllocaInst *, unsigned> Scores;
+
+  for (auto *Alloca : Allocas) {
+    LLVM_DEBUG(dbgs() << "Scoring: " << *Alloca << "\n");
+    unsigned &Score = Scores[Alloca];
+    // Increment score by one for each user + a bonus for users within loops.
+    SmallVector<Use *, 8> Uses;
+    collectAllocaUses(*Alloca, Uses);
+    for (auto *U : Uses) {
+      Instruction *Inst = cast<Instruction>(U->getUser());
+      if (isa<GetElementPtrInst>(Inst))
+        continue;
+      unsigned UserScore =
+          1 + (LoopUserWeight * LI.getLoopDepth(Inst->getParent()));
+      LLVM_DEBUG(dbgs() << "  [+" << UserScore << "]:\t" << *Inst << "\n");
+      Score += UserScore;
+    }
+    LLVM_DEBUG(dbgs() << "  => Final Score:" << Score << "\n");
+  }
+
+  stable_sort(Allocas, [&](AllocaInst *A, AllocaInst *B) {
+    return Scores.at(A) > Scores.at(B);
+  });
+
+  // clang-format off
+  LLVM_DEBUG(
+    dbgs() << "Sorted Worklist:\n";
+    for (auto *A: Allocas)
+      dbgs() << "  " << *A << "\n";
+  );
+  // clang-format on
+}
+
 bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
   Mod = F.getParent();
   DL = &Mod->getDataLayout();
@@ -237,6 +310,13 @@ bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
 
   bool SufficientLDS = PromoteToLDS ? hasSufficientLocalMem(F) : false;
 
+  // Use up to 1/4 of available register budget for vectorization.
+  // FIXME: Increase the limit for whole function budgets? Perhaps x2?
+  unsigned VectorizationBudget =
+      (PromoteAllocaToVectorLimit ? PromoteAllocaToVectorLimit * 8
+                                  : (MaxVGPRs * 32)) /
+      4;
+
   SmallVector<AllocaInst *, 16> Allocas;
   for (Instruction &I : F.getEntryBlock()) {
     if (AllocaInst *AI = dyn_cast<AllocaInst>(&I)) {
@@ -248,11 +328,27 @@ bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
     }
   }
 
+  sortAllocasToPromote(Allocas);
+
   bool Changed = false;
   for (AllocaInst *AI : Allocas) {
-    if (tryPromoteAllocaToVector(*AI))
+    const unsigned AllocaCost = DL->getTypeSizeInBits(AI->getAllocatedType());
+    if (AllocaCost > VectorizationBudget) {
+      LLVM_DEBUG(dbgs() << "  Alloca too big for vectorization: " << *AI
+                        << "\n");
+      return false;
+    }
+
+    if (tryPromoteAllocaToVector(*AI)) {
       Changed = true;
-    else if (PromoteToLDS && tryPromoteAllocaToLDS(*AI, SufficientLDS))
+      assert((VectorizationBudget - AllocaCost) < VectorizationBudget &&
+             "Underflow!");
+      VectorizationBudget -= AllocaCost;
+      LLVM_DEBUG(dbgs() << "  Remaining vectorization budget:"
+                        << VectorizationBudget << "\n");
+      if (VectorizationBudget == 0)
+        break;
+    } else if (PromoteToLDS && tryPromoteAllocaToLDS(*AI, SufficientLDS))
       Changed = true;
   }
 
@@ -641,16 +737,6 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
                                       ArrayTy->getNumElements());
   }
 
-  // Use up to 1/4 of available register budget for vectorization.
-  unsigned Limit = PromoteAllocaToVectorLimit ? PromoteAllocaToVectorLimit * 8
-                                              : (MaxVGPRs * 32);
-
-  if (DL->getTypeSizeInBits(AllocaTy) * 4 > Limit) {
-    LLVM_DEBUG(dbgs() << "  Alloca too big for vectorization with " << MaxVGPRs
-                      << " registers available\n");
-    return false;
-  }
-
   // FIXME: There is no reason why we can't support larger arrays, we
   // are just being conservative for now.
   // FIXME: We also reject alloca's of the form [ 2 x [ 2 x i32 ]] or
@@ -671,7 +757,6 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
   SmallVector<Instruction *> WorkList;
   SmallVector<Instruction *> UsersToRemove;
   SmallVector<Instruction *> DeferredInsts;
-  SmallVector<Use *, 8> Uses;
   DenseMap<MemTransferInst *, MemTransferInfo> TransferInfo;
 
   const auto RejectUser = [&](Instruction *Inst, Twine Msg) {
@@ -680,15 +765,14 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
     return false;
   };
 
-  for (Use &U : Alloca.uses())
-    Uses.push_back(&U);
+  SmallVector<Use *, 8> Uses;
+  collectAllocaUses(Alloca, Uses);
 
   LLVM_DEBUG(dbgs() << "  Attempting promotion to: " << *VectorTy << "\n");
 
   Type *VecEltTy = VectorTy->getElementType();
   unsigned ElementSize = DL->getTypeSizeInBits(VecEltTy) / 8;
-  while (!Uses.empty()) {
-    Use *U = Uses.pop_back_val();
+  for (auto *U : Uses) {
     Instruction *Inst = cast<Instruction>(U->getUser());
 
     if (Value *Ptr = getLoadStorePointerOperand(Inst)) {
@@ -724,14 +808,6 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
       continue;
     }
 
-    if (isa<BitCastInst>(Inst)) {
-      // Look through bitcasts.
-      for (Use &U : Inst->uses())
-        Uses.push_back(&U);
-      UsersToRemove.push_back(Inst);
-      continue;
-    }
-
     if (auto *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
       // If we can't compute a vector index from this GEP, then we can't
       // promote this alloca to vector.
@@ -740,8 +816,6 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
         return RejectUser(Inst, "cannot compute vector index for GEP");
 
       GEPVectorIdx[GEP] = Index;
-      for (Use &U : Inst->uses())
-        Uses.push_back(&U);
       UsersToRemove.push_back(Inst);
       continue;
     }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index b174d57bd57656..0037825ce08938 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -449,7 +449,7 @@ bool AMDGPURegisterBankInfo::isScalarLoadLegal(const MachineInstr &MI) const {
   const unsigned AS = MMO->getAddrSpace();
   const bool IsConst = AS == AMDGPUAS::CONSTANT_ADDRESS ||
                        AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT;
-  const unsigned MemSize = 8 * MMO->getSize();
+  const unsigned MemSize = 8 * MMO->getSize().getValue();
 
   // Require 4-byte alignment.
   return (MMO->getAlign() >= Align(4) ||
@@ -1070,7 +1070,7 @@ bool AMDGPURegisterBankInfo::applyMappingLoad(
       return false;
 
     MachineMemOperand *MMO = *MI.memoperands_begin();
-    const unsigned MemSize = 8 * MMO->getSize();
+    const unsigned MemSize = 8 * MMO->getSize().getValue();
     // Scalar loads of size 8 or 16 bit with proper alignment may be widened to
     // 32 bit. Check to see if we need to widen the memory access, 8 or 16 bit
     // scalar loads should have a load size of 32 but memory access size of less
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 027dd0f2c22448..529705479646fc 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "AMDKernelCodeT.h"
+#include "MCTargetDesc/AMDGPUMCExpr.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
 #include "MCTargetDesc/AMDGPUTargetStreamer.h"
 #include "SIDefines.h"
@@ -1816,6 +1817,7 @@ class AMDGPUAsmParser : public MCTargetAsmParser {
 
 public:
   void onBeginOfFile() override;
+  bool parsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc) override;
 
   ParseStatus parseCustomOperand(OperandVector &Operands, unsigned MCK);
 
@@ -8330,6 +8332,59 @@ void AMDGPUAsmParser::onBeginOfFile() {
     getTargetStreamer().EmitDirectiveAMDGCNTarget();
 }
 
+/// Parse AMDGPU specific expressions.
+///
+///  expr ::= or(expr, ...) |
+///           max(expr, ...)
+///
+bool AMDGPUAsmParser::parsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc) {
+  using AGVK = AMDGPUVariadicMCExpr::VariadicKind;
+
+  if (isToken(AsmToken::Identifier)) {
+    StringRef TokenId = getTokenStr();
+    AGVK VK = StringSwitch<AGVK>(TokenId)
+                  .Case("max", AGVK::AGVK_Max)
+                  .Case("or", AGVK::AGVK_Or)
+                  .Default(AGVK::AGVK_None);
+
+    if (VK != AGVK::AGVK_None && peekToken().is(AsmToken::LParen)) {
+      SmallVector<const MCExpr *, 4> Exprs;
+      uint64_t CommaCount = 0;
+      lex(); // Eat 'max'/'or'
+      lex(); // Eat '('
+      while (true) {
+        if (trySkipToken(AsmToken::RParen)) {
+          if (Exprs.empty()) {
+            Error(getToken().getLoc(),
+                  "empty " + Twine(TokenId) + " expression");
+            return true;
+          }
+          if (CommaCount + 1 != Exprs.size()) {
+            Error(getToken().getLoc(),
+                  "mismatch of commas in " + Twine(TokenId) + " expression");
+            return true;
+          }
+          Res = AMDGPUVariadicMCExpr::create(VK, Exprs, getContext());
+          return false;
+        }
+        const MCExpr *Expr;
+        if (getParser().parseExpression(Expr, EndLoc))
+          return true;
+        Exprs.push_back(Expr);
+        bool LastTokenWasComma = trySkipToken(AsmToken::Comma);
+        if (LastTokenWasComma)
+          CommaCount++;
+        if (!LastTokenWasComma && !isToken(AsmToken::RParen)) {
+          Error(getToken().getLoc(),
+                "unexpected token in " + Twine(TokenId) + " expression");
+          return true;
+        }
+      }
+    }
+  }
+  return getParser().parsePrimaryExpr(Res, EndLoc, nullptr);
+}
+
 ParseStatus AMDGPUAsmParser::parseOModSI(OperandVector &Operands) {
   StringRef Name = getTokenStr();
   if (Name == "mul") {
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index f42d4ae416bd76..6526a7f9fc4182 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -115,8 +115,7 @@ class FLAT_Real <bits<7> op, FLAT_Pseudo ps, string opName = ps.Mnemonic> :
   bits<1> lds = ps.lds; // LDS DMA for global and scratch
 
   // Segment, 00=flat, 01=scratch, 10=global, 11=reserved
-  bits<2> seg = !if(ps.is_flat_global, 0b10,
-                  !if(ps.is_flat_scratch, 0b01, 0));
+  bits<2> seg = {ps.is_flat_global, ps.is_flat_scratch};
 
   // Signed offset. Highest bit ignored for flat and treated as 12-bit
   // unsigned for flat accesses.
@@ -174,7 +173,7 @@ class VFLAT_Real <bits<8> op, FLAT_Pseudo ps, string opName = ps.Mnemonic> :
   bits<8> vaddr;
   bits<24> offset;
 
-  let Inst{6-0} = !if(ps.has_saddr, !if(ps.enabled_saddr, saddr, 0x7f), 0);
+  let Inst{6-0} = !if(ps.enabled_saddr, saddr, SGPR_NULL_gfx11plus.Index);
   let Inst{21-14} = op;
   let Inst{31-26} = 0x3b;
   let Inst{39-32} = !if(ps.has_vdst, vdst, ?);
@@ -240,7 +239,7 @@ class FLAT_Store_Pseudo <string opName, RegisterClass vdataClass,
 }
 
 multiclass FLAT_Global_Load_Pseudo<string opName, RegisterClass regClass, bit HasTiedInput = 0> {
-  let is_flat_global = 1, SubtargetPredicate = HasFlatGlobalInsts in {
+  let is_flat_global = 1 in {
     def "" : FLAT_Load_Pseudo<opName, regClass, HasTiedInput, 1>,
       GlobalSaddrTable<0, opName>;
     def _SADDR : FLAT_Load_Pseudo<opName, regClass, HasTiedInput, 1, 1>,
@@ -277,7 +276,7 @@ multiclass FLAT_Global_Load_AddTid_Pseudo<string opName, RegisterClass regClass,
 }
 
 multiclass FLAT_Global_Store_Pseudo<string opName, RegisterClass regClass> {
-  let is_flat_global = 1, SubtargetPredicate = HasFlatGlobalInsts in {
+  let is_flat_global = 1 in {
     def "" : FLAT_Store_Pseudo<opName, regClass, 1>,
       GlobalSaddrTable<0, opName>;
     def _SADDR : FLAT_Store_Pseudo<opName, regClass, 1, 1>,
@@ -390,6 +389,7 @@ class FLAT_Scratch_Load_Pseudo <string opName, RegisterClass regClass,
      !if(HasTiedOutput, (ins CPol:$cpol, getLdStRegisterOperand<regClass>.ret:$vdst_in),
                         (ins CPol_0:$cpol))),
   " $vdst, "#!if(EnableVaddr, "$vaddr, ", "off, ")#!if(EnableSaddr, "$saddr", "off")#"$offset$cpol"> {
+  let is_flat_scratch = 1;
   let has_data = 0;
   let mayLoad = 1;
   let has_saddr = 1;
@@ -417,6 +417,7 @@ class FLAT_Scratch_Store_Pseudo <string opName, RegisterClass vdataClass, bit En
         (ins vdata_op:$vdata, VGPR_32:$vaddr, flat_offset:$offset, CPol_0:$cpol),
         (ins vdata_op:$vdata, flat_offset:$offset, CPol_0:$cpol)))),
   " "#!if(EnableVaddr, "$vaddr", "off")#", $vdata, "#!if(EnableSaddr, "$saddr", "off")#"$offset$cpol"> {
+  let is_flat_scratch = 1;
   let mayLoad  = 0;
   let mayStore = 1;
   let has_vdst = 0;
@@ -429,37 +430,33 @@ class FLAT_Scratch_Store_Pseudo <string opName, RegisterClass vdataClass, bit En
 }
 
 multiclass FLAT_Scratch_Load_Pseudo<string opName, RegisterClass regClass, bit HasTiedOutput = 0> {
-  let is_flat_scratch = 1 in {
-    def "" : FLAT_Scratch_Load_Pseudo<opName, regClass, HasTiedOutput>,
-             FlatScratchInst<opName, "SV">;
-    def _SADDR : FLAT_Scratch_Load_Pseudo<opName, regClass, HasTiedOutput, 1>,
-                 FlatScratchInst<opName, "SS">;
-
-    let SubtargetPredicate = HasFlatScratchSVSMode in
-    def _SVS : FLAT_Scratch_Load_Pseudo<opName, regClass, HasTiedOutput, 1, 1>,
-               FlatScratchInst<opName, "SVS">;
+  def "" : FLAT_Scratch_Load_Pseudo<opName, regClass, HasTiedOutput>,
+           FlatScratchInst<opName, "SV">;
+  def _SADDR : FLAT_Scratch_Load_Pseudo<opName, regClass, HasTiedOutput, 1>,
+               FlatScratchInst<opName, "SS">;
 
-    let SubtargetPredicate = HasFlatScratchSTMode in
-    def _ST  : FLAT_Scratch_Load_Pseudo<opName, regClass, HasTiedOutput, 0, 0, 0>,
-               FlatScratchInst<opName, "ST">;
-  }
+  let SubtargetPredicate = HasFlatScratchSVSMode in
+  def _SVS : FLAT_Scratch_Load_Pseudo<opName, regClass, HasTiedOutput, 1, 1>,
+             FlatScratchInst<opName, "SVS">;
+
+  let SubtargetPredicate = HasFlatScratchSTMode in
+  def _ST  : FLAT_Scratch_Load_Pseudo<opName, regClass, HasTiedOutput, 0, 0, 0>,
+             FlatScratchInst<opName, "ST">;
 }
 
 multiclass FLAT_Scratch_Store_Pseudo<string opName, RegisterClass regClass> {
-  let is_flat_scratch = 1 in {
-    def "" : FLAT_Scratch_Store_Pseudo<opName, regClass>,
-             FlatScratchInst<opName, "SV">;
-    def _SADDR : FLAT_Scratch_Store_Pseudo<opName, regClass, 1>,
-                 FlatScratchInst<opName, "SS">;
-
-    let SubtargetPredicate = HasFlatScratchSVSMode in
-    def _SVS : FLAT_Scratch_Store_Pseudo<opName, regClass, 1, 1>,
-               FlatScratchInst<opName, "SVS">;
+  def "" : FLAT_Scratch_Store_Pseudo<opName, regClass>,
+           FlatScratchInst<opName, "SV">;
+  def _SADDR : FLAT_Scratch_Store_Pseudo<opName, regClass, 1>,
+               FlatScratchInst<opName, "SS">;
 
-    let SubtargetPredicate = HasFlatScratchSTMode in
-    def _ST  : FLAT_Scratch_Store_Pseudo<opName, regClass, 0, 0, 0>,
-               FlatScratchInst<opName, "ST">;
-  }
+  let SubtargetPredicate = HasFlatScratchSVSMode in
+  def _SVS : FLAT_Scratch_Store_Pseudo<opName, regClass, 1, 1>,
+             FlatScratchInst<opName, "SVS">;
+
+  let SubtargetPredicate = HasFlatScratchSTMode in
+  def _ST  : FLAT_Scratch_Store_Pseudo<opName, regClass, 0, 0, 0>,
+             FlatScratchInst<opName, "ST">;
 }
 
 class FLAT_Scratch_Load_LDS_Pseudo <string opName, bit EnableSaddr = 0,
@@ -584,25 +581,27 @@ multiclass FLAT_Global_Atomic_Pseudo_NO_RTN<
   RegisterClass data_rc = vdst_rc,
   RegisterOperand data_op = getLdStRegisterOperand<data_rc>.ret> {
 
-  def "" : FLAT_AtomicNoRet_Pseudo <opName,
-    (outs),
-    (ins VReg_64:$vaddr, data_op:$vdata, flat_offset:$offset, CPol_0:$cpol),
-    " $vaddr, $vdata, off$offset$cpol">,
-    GlobalSaddrTable<0, opName> {
-    let has_saddr = 1;
-    let PseudoInstr = NAME;
-    let FPAtomic = data_vt.isFP;
-  }
+  let is_flat_global = 1 in {
+    def "" : FLAT_AtomicNoRet_Pseudo <opName,
+      (outs),
+      (ins VReg_64:$vaddr, data_op:$vdata, flat_offset:$offset, CPol_0:$cpol),
+      " $vaddr, $vdata, off$offset$cpol">,
+      GlobalSaddrTable<0, opName> {
+      let has_saddr = 1;
+      let PseudoInstr = NAME;
+      let FPAtomic = data_vt.isFP;
+    }
 
-  def _SADDR : FLAT_AtomicNoRet_Pseudo <opName,
-    (outs),
-    (ins VGPR_32:$vaddr, data_op:$vdata, SReg_64:$saddr, flat_offset:$offset, CPol_0:$cpol),
-    " $vaddr, $vdata, $saddr$offset$cpol">,
-    GlobalSaddrTable<1, opName> {
-    let has_saddr = 1;
-    let enabled_saddr = 1;
-    let PseudoInstr = NAME#"_SADDR";
-    let FPAtomic = data_vt.isFP;
+    def _SADDR : FLAT_AtomicNoRet_Pseudo <opName,
+      (outs),
+      (ins VGPR_32:$vaddr, data_op:$vdata, SReg_64:$saddr, flat_offset:$offset, CPol_0:$cpol),
+      " $vaddr, $vdata, $saddr$offset$cpol">,
+      GlobalSaddrTable<1, opName> {
+      let has_saddr = 1;
+      let enabled_saddr = 1;
+      let PseudoInstr = NAME#"_SADDR";
+      let FPAtomic = data_vt.isFP;
+    }
   }
 }
 
@@ -615,24 +614,26 @@ multiclass FLAT_Global_Atomic_Pseudo_RTN<
   RegisterOperand data_op = getLdStRegisterOperand<data_rc>.ret,
   RegisterOperand vdst_op = getLdStRegisterOperand<vdst_rc>.ret> {
 
-  def _RTN : FLAT_AtomicRet_Pseudo <opName,
-    (outs vdst_op:$vdst),
-      (ins VReg_64:$vaddr, data_op:$vdata, flat_offset:$offset, CPol_GLC1:$cpol),
-    " $vdst, $vaddr, $vdata, off$offset$cpol">,
-    GlobalSaddrTable<0, opName#"_rtn"> {
-    let has_saddr = 1;
-    let FPAtomic = data_vt.isFP;
-  }
+  let is_flat_global = 1 in {
+    def _RTN : FLAT_AtomicRet_Pseudo <opName,
+      (outs vdst_op:$vdst),
+        (ins VReg_64:$vaddr, data_op:$vdata, flat_offset:$offset, CPol_GLC1:$cpol),
+      " $vdst, $vaddr, $vdata, off$offset$cpol">,
+      GlobalSaddrTable<0, opName#"_rtn"> {
+      let has_saddr = 1;
+      let FPAtomic = data_vt.isFP;
+    }
 
-  def _SADDR_RTN : FLAT_AtomicRet_Pseudo <opName,
-    (outs vdst_op:$vdst),
-      (ins VGPR_32:$vaddr, data_op:$vdata, SReg_64:$saddr, flat_offset:$offset, CPol_GLC1:$cpol),
-    " $vdst, $vaddr, $vdata, $saddr$offset$cpol">,
-    GlobalSaddrTable<1, opName#"_rtn"> {
-     let has_saddr = 1;
-     let enabled_saddr = 1;
-     let PseudoInstr = NAME#"_SADDR_RTN";
-     let FPAtomic = data_vt.isFP;
+    def _SADDR_RTN : FLAT_AtomicRet_Pseudo <opName,
+      (outs vdst_op:$vdst),
+        (ins VGPR_32:$vaddr, data_op:$vdata, SReg_64:$saddr, flat_offset:$offset, CPol_GLC1:$cpol),
+      " $vdst, $vaddr, $vdata, $saddr$offset$cpol">,
+      GlobalSaddrTable<1, opName#"_rtn"> {
+       let has_saddr = 1;
+       let enabled_saddr = 1;
+       let PseudoInstr = NAME#"_SADDR_RTN";
+       let FPAtomic = data_vt.isFP;
+    }
   }
 }
 
@@ -642,10 +643,8 @@ multiclass FLAT_Global_Atomic_Pseudo<
   ValueType vt,
   ValueType data_vt = vt,
   RegisterClass data_rc = vdst_rc> {
-  let is_flat_global = 1, SubtargetPredicate = HasFlatGlobalInsts in {
-    defm "" : FLAT_Global_Atomic_Pseudo_NO_RTN<opName, vdst_rc, vt, data_vt, data_rc>;
-    defm "" : FLAT_Global_Atomic_Pseudo_RTN<opName, vdst_rc, vt, data_vt, data_rc>;
-  }
+  defm "" : FLAT_Global_Atomic_Pseudo_NO_RTN<opName, vdst_rc, vt, data_vt, data_rc>;
+  defm "" : FLAT_Global_Atomic_Pseudo_RTN<opName, vdst_rc, vt, data_vt, data_rc>;
 }
 
 //===----------------------------------------------------------------------===//
@@ -849,7 +848,6 @@ defm GLOBAL_STORE_DWORD_ADDTID : FLAT_Global_Store_AddTid_Pseudo <"global_store_
 defm GLOBAL_STORE_BYTE_D16_HI  : FLAT_Global_Store_Pseudo <"global_store_byte_d16_hi", VGPR_32>;
 defm GLOBAL_STORE_SHORT_D16_HI : FLAT_Global_Store_Pseudo <"global_store_short_d16_hi", VGPR_32>;
 
-let is_flat_global = 1 in {
 defm GLOBAL_ATOMIC_CMPSWAP : FLAT_Global_Atomic_Pseudo <"global_atomic_cmpswap",
                                VGPR_32, i32, v2i32, VReg_64>;
 
@@ -948,9 +946,6 @@ let SubtargetPredicate = isGFX12Plus in {
   def GLOBAL_WBINV  : FLAT_Global_Invalidate_Writeback<"global_wbinv">;
 } // End SubtargetPredicate = isGFX12Plus
 
-} // End is_flat_global = 1
-
-let SubtargetPredicate = HasFlatScratchInsts in {
 defm SCRATCH_LOAD_UBYTE    : FLAT_Scratch_Load_Pseudo <"scratch_load_ubyte", VGPR_32>;
 defm SCRATCH_LOAD_SBYTE    : FLAT_Scratch_Load_Pseudo <"scratch_load_sbyte", VGPR_32>;
 defm SCRATCH_LOAD_USHORT   : FLAT_Scratch_Load_Pseudo <"scratch_load_ushort", VGPR_32>;
@@ -985,8 +980,6 @@ defm SCRATCH_LOAD_LDS_USHORT : FLAT_Scratch_Load_LDS_Pseudo <"scratch_load_lds_u
 defm SCRATCH_LOAD_LDS_SSHORT : FLAT_Scratch_Load_LDS_Pseudo <"scratch_load_lds_sshort">;
 defm SCRATCH_LOAD_LDS_DWORD  : FLAT_Scratch_Load_LDS_Pseudo <"scratch_load_lds_dword">;
 
-} // End SubtargetPredicate = HasFlatScratchInsts
-
 let SubtargetPredicate = isGFX12Plus in {
   let WaveSizePredicate = isWave32 in {
     defm GLOBAL_LOAD_TR_B128_w32  : FLAT_Global_Load_Pseudo <"global_load_tr_b128_w32", VReg_128>;
@@ -998,7 +991,7 @@ let SubtargetPredicate = isGFX12Plus in {
   }
 } // End SubtargetPredicate = isGFX12Plus
 
-let SubtargetPredicate = isGFX10Plus, is_flat_global = 1 in {
+let SubtargetPredicate = isGFX10Plus in {
   defm GLOBAL_ATOMIC_FCMPSWAP :
     FLAT_Global_Atomic_Pseudo<"global_atomic_fcmpswap", VGPR_32, f32, v2f32, VReg_64>;
   defm GLOBAL_ATOMIC_FMIN :
@@ -1011,9 +1004,8 @@ let SubtargetPredicate = isGFX10Plus, is_flat_global = 1 in {
     FLAT_Global_Atomic_Pseudo<"global_atomic_fmin_x2", VReg_64, f64>;
   defm GLOBAL_ATOMIC_FMAX_X2 :
     FLAT_Global_Atomic_Pseudo<"global_atomic_fmax_x2", VReg_64, f64>;
-} // End SubtargetPredicate = isGFX10Plus, is_flat_global = 1
+} // End SubtargetPredicate = isGFX10Plus
 
-let is_flat_global = 1 in {
 let OtherPredicates = [HasAtomicFaddNoRtnInsts] in
   defm GLOBAL_ATOMIC_ADD_F32 : FLAT_Global_Atomic_Pseudo_NO_RTN <
     "global_atomic_add_f32", VGPR_32, f32
@@ -1030,7 +1022,6 @@ let OtherPredicates = [HasAtomicBufferGlobalPkAddF16Insts] in
   defm GLOBAL_ATOMIC_PK_ADD_F16 : FLAT_Global_Atomic_Pseudo_RTN <
     "global_atomic_pk_add_f16", VGPR_32, v2f16
   >;
-} // End is_flat_global = 1
 
 //===----------------------------------------------------------------------===//
 // Flat Patterns
@@ -1787,45 +1778,46 @@ def FLAT_STORE_DWORDX2_ci      : FLAT_Real_ci <0x1d, FLAT_STORE_DWORDX2>;
 def FLAT_STORE_DWORDX4_ci      : FLAT_Real_ci <0x1e, FLAT_STORE_DWORDX4>;
 def FLAT_STORE_DWORDX3_ci      : FLAT_Real_ci <0x1f, FLAT_STORE_DWORDX3>;
 
-multiclass FLAT_Real_Atomics_ci <bits<7> op, FLAT_Pseudo ps> {
+multiclass FLAT_Real_Atomics_ci <bits<7> op> {
+  defvar ps = !cast<FLAT_Pseudo>(NAME);
   def _ci     : FLAT_Real_ci<op, !cast<FLAT_Pseudo>(ps.PseudoInstr)>;
   def _RTN_ci : FLAT_Real_ci<op, !cast<FLAT_Pseudo>(ps.PseudoInstr # "_RTN")>;
 }
 
-defm FLAT_ATOMIC_SWAP          : FLAT_Real_Atomics_ci <0x30, FLAT_ATOMIC_SWAP>;
-defm FLAT_ATOMIC_CMPSWAP       : FLAT_Real_Atomics_ci <0x31, FLAT_ATOMIC_CMPSWAP>;
-defm FLAT_ATOMIC_ADD           : FLAT_Real_Atomics_ci <0x32, FLAT_ATOMIC_ADD>;
-defm FLAT_ATOMIC_SUB           : FLAT_Real_Atomics_ci <0x33, FLAT_ATOMIC_SUB>;
-defm FLAT_ATOMIC_SMIN          : FLAT_Real_Atomics_ci <0x35, FLAT_ATOMIC_SMIN>;
-defm FLAT_ATOMIC_UMIN          : FLAT_Real_Atomics_ci <0x36, FLAT_ATOMIC_UMIN>;
-defm FLAT_ATOMIC_SMAX          : FLAT_Real_Atomics_ci <0x37, FLAT_ATOMIC_SMAX>;
-defm FLAT_ATOMIC_UMAX          : FLAT_Real_Atomics_ci <0x38, FLAT_ATOMIC_UMAX>;
-defm FLAT_ATOMIC_AND           : FLAT_Real_Atomics_ci <0x39, FLAT_ATOMIC_AND>;
-defm FLAT_ATOMIC_OR            : FLAT_Real_Atomics_ci <0x3a, FLAT_ATOMIC_OR>;
-defm FLAT_ATOMIC_XOR           : FLAT_Real_Atomics_ci <0x3b, FLAT_ATOMIC_XOR>;
-defm FLAT_ATOMIC_INC           : FLAT_Real_Atomics_ci <0x3c, FLAT_ATOMIC_INC>;
-defm FLAT_ATOMIC_DEC           : FLAT_Real_Atomics_ci <0x3d, FLAT_ATOMIC_DEC>;
-defm FLAT_ATOMIC_SWAP_X2       : FLAT_Real_Atomics_ci <0x50, FLAT_ATOMIC_SWAP_X2>;
-defm FLAT_ATOMIC_CMPSWAP_X2    : FLAT_Real_Atomics_ci <0x51, FLAT_ATOMIC_CMPSWAP_X2>;
-defm FLAT_ATOMIC_ADD_X2        : FLAT_Real_Atomics_ci <0x52, FLAT_ATOMIC_ADD_X2>;
-defm FLAT_ATOMIC_SUB_X2        : FLAT_Real_Atomics_ci <0x53, FLAT_ATOMIC_SUB_X2>;
-defm FLAT_ATOMIC_SMIN_X2       : FLAT_Real_Atomics_ci <0x55, FLAT_ATOMIC_SMIN_X2>;
-defm FLAT_ATOMIC_UMIN_X2       : FLAT_Real_Atomics_ci <0x56, FLAT_ATOMIC_UMIN_X2>;
-defm FLAT_ATOMIC_SMAX_X2       : FLAT_Real_Atomics_ci <0x57, FLAT_ATOMIC_SMAX_X2>;
-defm FLAT_ATOMIC_UMAX_X2       : FLAT_Real_Atomics_ci <0x58, FLAT_ATOMIC_UMAX_X2>;
-defm FLAT_ATOMIC_AND_X2        : FLAT_Real_Atomics_ci <0x59, FLAT_ATOMIC_AND_X2>;
-defm FLAT_ATOMIC_OR_X2         : FLAT_Real_Atomics_ci <0x5a, FLAT_ATOMIC_OR_X2>;
-defm FLAT_ATOMIC_XOR_X2        : FLAT_Real_Atomics_ci <0x5b, FLAT_ATOMIC_XOR_X2>;
-defm FLAT_ATOMIC_INC_X2        : FLAT_Real_Atomics_ci <0x5c, FLAT_ATOMIC_INC_X2>;
-defm FLAT_ATOMIC_DEC_X2        : FLAT_Real_Atomics_ci <0x5d, FLAT_ATOMIC_DEC_X2>;
+defm FLAT_ATOMIC_SWAP          : FLAT_Real_Atomics_ci <0x30>;
+defm FLAT_ATOMIC_CMPSWAP       : FLAT_Real_Atomics_ci <0x31>;
+defm FLAT_ATOMIC_ADD           : FLAT_Real_Atomics_ci <0x32>;
+defm FLAT_ATOMIC_SUB           : FLAT_Real_Atomics_ci <0x33>;
+defm FLAT_ATOMIC_SMIN          : FLAT_Real_Atomics_ci <0x35>;
+defm FLAT_ATOMIC_UMIN          : FLAT_Real_Atomics_ci <0x36>;
+defm FLAT_ATOMIC_SMAX          : FLAT_Real_Atomics_ci <0x37>;
+defm FLAT_ATOMIC_UMAX          : FLAT_Real_Atomics_ci <0x38>;
+defm FLAT_ATOMIC_AND           : FLAT_Real_Atomics_ci <0x39>;
+defm FLAT_ATOMIC_OR            : FLAT_Real_Atomics_ci <0x3a>;
+defm FLAT_ATOMIC_XOR           : FLAT_Real_Atomics_ci <0x3b>;
+defm FLAT_ATOMIC_INC           : FLAT_Real_Atomics_ci <0x3c>;
+defm FLAT_ATOMIC_DEC           : FLAT_Real_Atomics_ci <0x3d>;
+defm FLAT_ATOMIC_SWAP_X2       : FLAT_Real_Atomics_ci <0x50>;
+defm FLAT_ATOMIC_CMPSWAP_X2    : FLAT_Real_Atomics_ci <0x51>;
+defm FLAT_ATOMIC_ADD_X2        : FLAT_Real_Atomics_ci <0x52>;
+defm FLAT_ATOMIC_SUB_X2        : FLAT_Real_Atomics_ci <0x53>;
+defm FLAT_ATOMIC_SMIN_X2       : FLAT_Real_Atomics_ci <0x55>;
+defm FLAT_ATOMIC_UMIN_X2       : FLAT_Real_Atomics_ci <0x56>;
+defm FLAT_ATOMIC_SMAX_X2       : FLAT_Real_Atomics_ci <0x57>;
+defm FLAT_ATOMIC_UMAX_X2       : FLAT_Real_Atomics_ci <0x58>;
+defm FLAT_ATOMIC_AND_X2        : FLAT_Real_Atomics_ci <0x59>;
+defm FLAT_ATOMIC_OR_X2         : FLAT_Real_Atomics_ci <0x5a>;
+defm FLAT_ATOMIC_XOR_X2        : FLAT_Real_Atomics_ci <0x5b>;
+defm FLAT_ATOMIC_INC_X2        : FLAT_Real_Atomics_ci <0x5c>;
+defm FLAT_ATOMIC_DEC_X2        : FLAT_Real_Atomics_ci <0x5d>;
 
 // CI Only flat instructions
-defm FLAT_ATOMIC_FCMPSWAP      : FLAT_Real_Atomics_ci <0x3e, FLAT_ATOMIC_FCMPSWAP>;
-defm FLAT_ATOMIC_FMIN          : FLAT_Real_Atomics_ci <0x3f, FLAT_ATOMIC_FMIN>;
-defm FLAT_ATOMIC_FMAX          : FLAT_Real_Atomics_ci <0x40, FLAT_ATOMIC_FMAX>;
-defm FLAT_ATOMIC_FCMPSWAP_X2   : FLAT_Real_Atomics_ci <0x5e, FLAT_ATOMIC_FCMPSWAP_X2>;
-defm FLAT_ATOMIC_FMIN_X2       : FLAT_Real_Atomics_ci <0x5f, FLAT_ATOMIC_FMIN_X2>;
-defm FLAT_ATOMIC_FMAX_X2       : FLAT_Real_Atomics_ci <0x60, FLAT_ATOMIC_FMAX_X2>;
+defm FLAT_ATOMIC_FCMPSWAP      : FLAT_Real_Atomics_ci <0x3e>;
+defm FLAT_ATOMIC_FMIN          : FLAT_Real_Atomics_ci <0x3f>;
+defm FLAT_ATOMIC_FMAX          : FLAT_Real_Atomics_ci <0x40>;
+defm FLAT_ATOMIC_FCMPSWAP_X2   : FLAT_Real_Atomics_ci <0x5e>;
+defm FLAT_ATOMIC_FMIN_X2       : FLAT_Real_Atomics_ci <0x5f>;
+defm FLAT_ATOMIC_FMAX_X2       : FLAT_Real_Atomics_ci <0x60>;
 
 
 //===----------------------------------------------------------------------===//
@@ -1925,8 +1917,9 @@ def FLAT_LOAD_SBYTE_D16_HI_vi : FLAT_Real_vi <0x23, FLAT_LOAD_SBYTE_D16_HI>;
 def FLAT_LOAD_SHORT_D16_vi    : FLAT_Real_vi <0x24, FLAT_LOAD_SHORT_D16>;
 def FLAT_LOAD_SHORT_D16_HI_vi : FLAT_Real_vi <0x25, FLAT_LOAD_SHORT_D16_HI>;
 
-multiclass FLAT_Real_Atomics_vi <bits<7> op, FLAT_Pseudo ps,
+multiclass FLAT_Real_Atomics_vi <bits<7> op,
   bit has_sccb = !cast<FLAT_Pseudo>(NAME).has_sccb> {
+  defvar ps = !cast<FLAT_Pseudo>(NAME);
   def _vi     : FLAT_Real_vi<op, !cast<FLAT_Pseudo>(ps.PseudoInstr), has_sccb>;
   def _RTN_vi : FLAT_Real_vi<op, !cast<FLAT_Pseudo>(ps.PseudoInstr # "_RTN"), has_sccb>;
 }
@@ -1939,32 +1932,32 @@ multiclass FLAT_Global_Real_Atomics_vi<bits<7> op,
 }
 
 
-defm FLAT_ATOMIC_SWAP       : FLAT_Real_Atomics_vi <0x40, FLAT_ATOMIC_SWAP>;
-defm FLAT_ATOMIC_CMPSWAP    : FLAT_Real_Atomics_vi <0x41, FLAT_ATOMIC_CMPSWAP>;
-defm FLAT_ATOMIC_ADD        : FLAT_Real_Atomics_vi <0x42, FLAT_ATOMIC_ADD>;
-defm FLAT_ATOMIC_SUB        : FLAT_Real_Atomics_vi <0x43, FLAT_ATOMIC_SUB>;
-defm FLAT_ATOMIC_SMIN       : FLAT_Real_Atomics_vi <0x44, FLAT_ATOMIC_SMIN>;
-defm FLAT_ATOMIC_UMIN       : FLAT_Real_Atomics_vi <0x45, FLAT_ATOMIC_UMIN>;
-defm FLAT_ATOMIC_SMAX       : FLAT_Real_Atomics_vi <0x46, FLAT_ATOMIC_SMAX>;
-defm FLAT_ATOMIC_UMAX       : FLAT_Real_Atomics_vi <0x47, FLAT_ATOMIC_UMAX>;
-defm FLAT_ATOMIC_AND        : FLAT_Real_Atomics_vi <0x48, FLAT_ATOMIC_AND>;
-defm FLAT_ATOMIC_OR         : FLAT_Real_Atomics_vi <0x49, FLAT_ATOMIC_OR>;
-defm FLAT_ATOMIC_XOR        : FLAT_Real_Atomics_vi <0x4a, FLAT_ATOMIC_XOR>;
-defm FLAT_ATOMIC_INC        : FLAT_Real_Atomics_vi <0x4b, FLAT_ATOMIC_INC>;
-defm FLAT_ATOMIC_DEC        : FLAT_Real_Atomics_vi <0x4c, FLAT_ATOMIC_DEC>;
-defm FLAT_ATOMIC_SWAP_X2    : FLAT_Real_Atomics_vi <0x60, FLAT_ATOMIC_SWAP_X2>;
-defm FLAT_ATOMIC_CMPSWAP_X2 : FLAT_Real_Atomics_vi <0x61, FLAT_ATOMIC_CMPSWAP_X2>;
-defm FLAT_ATOMIC_ADD_X2     : FLAT_Real_Atomics_vi <0x62, FLAT_ATOMIC_ADD_X2>;
-defm FLAT_ATOMIC_SUB_X2     : FLAT_Real_Atomics_vi <0x63, FLAT_ATOMIC_SUB_X2>;
-defm FLAT_ATOMIC_SMIN_X2    : FLAT_Real_Atomics_vi <0x64, FLAT_ATOMIC_SMIN_X2>;
-defm FLAT_ATOMIC_UMIN_X2    : FLAT_Real_Atomics_vi <0x65, FLAT_ATOMIC_UMIN_X2>;
-defm FLAT_ATOMIC_SMAX_X2    : FLAT_Real_Atomics_vi <0x66, FLAT_ATOMIC_SMAX_X2>;
-defm FLAT_ATOMIC_UMAX_X2    : FLAT_Real_Atomics_vi <0x67, FLAT_ATOMIC_UMAX_X2>;
-defm FLAT_ATOMIC_AND_X2     : FLAT_Real_Atomics_vi <0x68, FLAT_ATOMIC_AND_X2>;
-defm FLAT_ATOMIC_OR_X2      : FLAT_Real_Atomics_vi <0x69, FLAT_ATOMIC_OR_X2>;
-defm FLAT_ATOMIC_XOR_X2     : FLAT_Real_Atomics_vi <0x6a, FLAT_ATOMIC_XOR_X2>;
-defm FLAT_ATOMIC_INC_X2     : FLAT_Real_Atomics_vi <0x6b, FLAT_ATOMIC_INC_X2>;
-defm FLAT_ATOMIC_DEC_X2     : FLAT_Real_Atomics_vi <0x6c, FLAT_ATOMIC_DEC_X2>;
+defm FLAT_ATOMIC_SWAP       : FLAT_Real_Atomics_vi <0x40>;
+defm FLAT_ATOMIC_CMPSWAP    : FLAT_Real_Atomics_vi <0x41>;
+defm FLAT_ATOMIC_ADD        : FLAT_Real_Atomics_vi <0x42>;
+defm FLAT_ATOMIC_SUB        : FLAT_Real_Atomics_vi <0x43>;
+defm FLAT_ATOMIC_SMIN       : FLAT_Real_Atomics_vi <0x44>;
+defm FLAT_ATOMIC_UMIN       : FLAT_Real_Atomics_vi <0x45>;
+defm FLAT_ATOMIC_SMAX       : FLAT_Real_Atomics_vi <0x46>;
+defm FLAT_ATOMIC_UMAX       : FLAT_Real_Atomics_vi <0x47>;
+defm FLAT_ATOMIC_AND        : FLAT_Real_Atomics_vi <0x48>;
+defm FLAT_ATOMIC_OR         : FLAT_Real_Atomics_vi <0x49>;
+defm FLAT_ATOMIC_XOR        : FLAT_Real_Atomics_vi <0x4a>;
+defm FLAT_ATOMIC_INC        : FLAT_Real_Atomics_vi <0x4b>;
+defm FLAT_ATOMIC_DEC        : FLAT_Real_Atomics_vi <0x4c>;
+defm FLAT_ATOMIC_SWAP_X2    : FLAT_Real_Atomics_vi <0x60>;
+defm FLAT_ATOMIC_CMPSWAP_X2 : FLAT_Real_Atomics_vi <0x61>;
+defm FLAT_ATOMIC_ADD_X2     : FLAT_Real_Atomics_vi <0x62>;
+defm FLAT_ATOMIC_SUB_X2     : FLAT_Real_Atomics_vi <0x63>;
+defm FLAT_ATOMIC_SMIN_X2    : FLAT_Real_Atomics_vi <0x64>;
+defm FLAT_ATOMIC_UMIN_X2    : FLAT_Real_Atomics_vi <0x65>;
+defm FLAT_ATOMIC_SMAX_X2    : FLAT_Real_Atomics_vi <0x66>;
+defm FLAT_ATOMIC_UMAX_X2    : FLAT_Real_Atomics_vi <0x67>;
+defm FLAT_ATOMIC_AND_X2     : FLAT_Real_Atomics_vi <0x68>;
+defm FLAT_ATOMIC_OR_X2      : FLAT_Real_Atomics_vi <0x69>;
+defm FLAT_ATOMIC_XOR_X2     : FLAT_Real_Atomics_vi <0x6a>;
+defm FLAT_ATOMIC_INC_X2     : FLAT_Real_Atomics_vi <0x6b>;
+defm FLAT_ATOMIC_DEC_X2     : FLAT_Real_Atomics_vi <0x6c>;
 
 defm GLOBAL_LOAD_UBYTE : FLAT_Real_AllAddr_vi <0x10>;
 defm GLOBAL_LOAD_SBYTE : FLAT_Real_AllAddr_vi <0x11>;
@@ -2060,9 +2053,9 @@ let SubtargetPredicate = isGFX8GFX9NotGFX940 in {
 }
 
 let SubtargetPredicate = isGFX90AOnly in {
-  defm FLAT_ATOMIC_ADD_F64   : FLAT_Real_Atomics_vi<0x4f, FLAT_ATOMIC_ADD_F64, 0>;
-  defm FLAT_ATOMIC_MIN_F64   : FLAT_Real_Atomics_vi<0x50, FLAT_ATOMIC_MIN_F64, 0>;
-  defm FLAT_ATOMIC_MAX_F64   : FLAT_Real_Atomics_vi<0x51, FLAT_ATOMIC_MAX_F64, 0>;
+  defm FLAT_ATOMIC_ADD_F64   : FLAT_Real_Atomics_vi<0x4f, 0>;
+  defm FLAT_ATOMIC_MIN_F64   : FLAT_Real_Atomics_vi<0x50, 0>;
+  defm FLAT_ATOMIC_MAX_F64   : FLAT_Real_Atomics_vi<0x51, 0>;
   defm GLOBAL_ATOMIC_ADD_F64 : FLAT_Global_Real_Atomics_vi<0x4f, 0>;
   defm GLOBAL_ATOMIC_MIN_F64 : FLAT_Global_Real_Atomics_vi<0x50, 0>;
   defm GLOBAL_ATOMIC_MAX_F64 : FLAT_Global_Real_Atomics_vi<0x51, 0>;
@@ -2073,7 +2066,8 @@ multiclass FLAT_Real_AllAddr_gfx940<bits<7> op> {
   def _SADDR_gfx940 : FLAT_Real_gfx940<op, !cast<FLAT_Pseudo>(NAME#"_SADDR")>;
 }
 
-multiclass FLAT_Real_Atomics_gfx940 <bits<7> op, FLAT_Pseudo ps> {
+multiclass FLAT_Real_Atomics_gfx940 <bits<7> op> {
+  defvar ps = !cast<FLAT_Pseudo>(NAME);
   def _gfx940     : FLAT_Real_gfx940<op, !cast<FLAT_Pseudo>(ps.PseudoInstr)>;
   def _RTN_gfx940 : FLAT_Real_gfx940<op, !cast<FLAT_Pseudo>(ps.PseudoInstr # "_RTN")>;
 }
@@ -2089,15 +2083,15 @@ let SubtargetPredicate = isGFX940Plus in {
   defm GLOBAL_ATOMIC_ADD_F32     : FLAT_Global_Real_Atomics_gfx940 <0x04d>;
   defm GLOBAL_ATOMIC_PK_ADD_F16  : FLAT_Global_Real_Atomics_gfx940 <0x04e>;
 
-  defm FLAT_ATOMIC_ADD_F64       : FLAT_Real_Atomics_gfx940<0x4f, FLAT_ATOMIC_ADD_F64>;
-  defm FLAT_ATOMIC_MIN_F64       : FLAT_Real_Atomics_gfx940<0x50, FLAT_ATOMIC_MIN_F64>;
-  defm FLAT_ATOMIC_MAX_F64       : FLAT_Real_Atomics_gfx940<0x51, FLAT_ATOMIC_MAX_F64>;
+  defm FLAT_ATOMIC_ADD_F64       : FLAT_Real_Atomics_gfx940<0x4f>;
+  defm FLAT_ATOMIC_MIN_F64       : FLAT_Real_Atomics_gfx940<0x50>;
+  defm FLAT_ATOMIC_MAX_F64       : FLAT_Real_Atomics_gfx940<0x51>;
   defm GLOBAL_ATOMIC_ADD_F64     : FLAT_Global_Real_Atomics_gfx940<0x4f>;
   defm GLOBAL_ATOMIC_MIN_F64     : FLAT_Global_Real_Atomics_gfx940<0x50>;
   defm GLOBAL_ATOMIC_MAX_F64     : FLAT_Global_Real_Atomics_gfx940<0x51>;
-  defm FLAT_ATOMIC_ADD_F32       : FLAT_Real_Atomics_vi<0x4d, FLAT_ATOMIC_ADD_F32>;
-  defm FLAT_ATOMIC_PK_ADD_F16    : FLAT_Real_Atomics_vi<0x4e, FLAT_ATOMIC_PK_ADD_F16>;
-  defm FLAT_ATOMIC_PK_ADD_BF16   : FLAT_Real_Atomics_vi<0x52, FLAT_ATOMIC_PK_ADD_BF16>;
+  defm FLAT_ATOMIC_ADD_F32       : FLAT_Real_Atomics_vi<0x4d>;
+  defm FLAT_ATOMIC_PK_ADD_F16    : FLAT_Real_Atomics_vi<0x4e>;
+  defm FLAT_ATOMIC_PK_ADD_BF16   : FLAT_Real_Atomics_vi<0x52>;
   defm GLOBAL_ATOMIC_PK_ADD_BF16 : FLAT_Global_Real_Atomics_vi<0x52>;
 } // End SubtargetPredicate = isGFX940Plus
 
@@ -2112,7 +2106,9 @@ class FLAT_Real_gfx10<bits<7> op, FLAT_Pseudo ps> :
 
   let Inst{11-0}  = offset{11-0};
   let Inst{12}    = !if(ps.has_dlc, cpol{CPolBit.DLC}, ps.dlcValue);
-  let Inst{54-48} = !if(ps.has_saddr, !if(ps.enabled_saddr, saddr, 0x7d), 0x7d);
+  let Inst{54-48} = !cond(ps.enabled_saddr : saddr,
+                          !and(ps.is_flat_scratch, !not(ps.has_vaddr)) : EXEC_HI.Index{6-0}, // ST mode
+                          true : SGPR_NULL_gfxpre11.Index{6-0});
   let Inst{55}    = 0;
 }
 
@@ -2139,10 +2135,7 @@ multiclass FLAT_Real_SADDR_RTN_gfx10<bits<7> op> {
 
 multiclass FLAT_Real_ST_gfx10<bits<7> op> {
   def _ST_gfx10 :
-    FLAT_Real_gfx10<op, !cast<FLAT_Pseudo>(NAME#"_ST")> {
-      let Inst{54-48} = EXEC_HI.Index;
-      let OtherPredicates = [HasFlatScratchSTMode];
-    }
+    FLAT_Real_gfx10<op, !cast<FLAT_Pseudo>(NAME#"_ST")>;
 }
 
 multiclass FLAT_Real_AllAddr_gfx10<bits<7> op> :
@@ -2340,421 +2333,398 @@ defm SCRATCH_LOAD_LDS_DWORD     : FLAT_Real_ScratchAllAddr_LDS_gfx10 <0x00c>;
 // GFX11
 //===----------------------------------------------------------------------===//
 
-class FLAT_Real_gfx11 <bits<7> op, FLAT_Pseudo ps, string opName = ps.Mnemonic> :
-  FLAT_Real <op, ps, opName>,
-  SIMCInstr <ps.PseudoInstr, SIEncodingFamily.GFX11> {
-  let AssemblerPredicate = isGFX11Only;
-  let DecoderNamespace = "GFX11";
-
-  let Inst{13}    = !if(ps.has_dlc, cpol{CPolBit.DLC}, ps.dlcValue);
-  let Inst{14}    = !if(ps.has_glc, cpol{CPolBit.GLC}, ps.glcValue);
-  let Inst{15}    = cpol{CPolBit.SLC};
-  let Inst{17-16} = seg;
-  let Inst{55}    = ps.sve;
+class get_FLAT_ps<string name> {
+  string Mnemonic = !cast<FLAT_Pseudo>(name).Mnemonic;
 }
 
-multiclass FLAT_Aliases_gfx11<string ps, string opName, int renamed> {
-  if renamed then
-    def _renamed_gfx11 : MnemonicAlias<!cast<FLAT_Pseudo>(ps).Mnemonic, opName>, Requires<[isGFX11Only]>;
-}
+multiclass FLAT_Real_gfx11 <bits<7> op,
+                            string name = get_FLAT_ps<NAME>.Mnemonic> {
+  defvar ps = !cast<FLAT_Pseudo>(NAME);
+  def _gfx11 : FLAT_Real <op, ps, name>,
+               SIMCInstr <ps.PseudoInstr, SIEncodingFamily.GFX11> {
+    let AssemblerPredicate = isGFX11Only;
+    let DecoderNamespace = "GFX11";
 
-multiclass FLAT_Real_Base_gfx11<bits<7> op, string ps, string opName, int renamed = false> :
-  FLAT_Aliases_gfx11<ps, opName, renamed> {
-  def _gfx11 : FLAT_Real_gfx11<op, !cast<FLAT_Pseudo>(ps), opName> {
-    let Inst{54-48} = SGPR_NULL_gfx11plus.Index;
+    let Inst{13}    = !if(ps.has_dlc, cpol{CPolBit.DLC}, ps.dlcValue);
+    let Inst{14}    = !if(ps.has_glc, cpol{CPolBit.GLC}, ps.glcValue);
+    let Inst{15}    = cpol{CPolBit.SLC};
+    let Inst{17-16} = seg;
+    let Inst{54-48} = !if(ps.enabled_saddr, saddr, SGPR_NULL_gfx11plus.Index);
+    let Inst{55}    = ps.sve;
   }
 }
 
-multiclass FLAT_Real_RTN_gfx11<bits<7> op, string ps, string opName> {
-  def _RTN_gfx11 : FLAT_Real_gfx11<op, !cast<FLAT_Pseudo>(ps#"_RTN"), opName> {
-    let Inst{54-48} = SGPR_NULL_gfx11plus.Index;
-  }
+multiclass FLAT_Aliases_gfx11<string name> {
+  defvar ps = get_FLAT_ps<NAME>;
+  if !ne(ps.Mnemonic, name) then
+    def : MnemonicAlias<ps.Mnemonic, name>, Requires<[isGFX11Only]>;
 }
 
-multiclass FLAT_Real_SADDR_gfx11<bits<7> op, string ps, string opName> {
-  def _SADDR_gfx11 : FLAT_Real_gfx11<op, !cast<FLAT_Pseudo>(ps#"_SADDR"), opName>;
-}
+multiclass FLAT_Real_Base_gfx11<bits<7> op,
+                                string name = get_FLAT_ps<NAME>.Mnemonic> :
+  FLAT_Aliases_gfx11<name>,
+  FLAT_Real_gfx11<op, name>;
 
-multiclass FLAT_Real_SADDR_RTN_gfx11<bits<7> op, string ps, string opName> {
-  def _SADDR_RTN_gfx11 : FLAT_Real_gfx11<op, !cast<FLAT_Pseudo>(ps#"_SADDR_RTN"), opName>;
+multiclass FLAT_Real_AllAddr_gfx11<bits<7> op,
+                                   string name = get_FLAT_ps<NAME>.Mnemonic> :
+  FLAT_Real_Base_gfx11<op, name> {
+  defm _SADDR : FLAT_Real_gfx11<op, name>;
 }
 
-multiclass FLAT_Real_ST_gfx11<bits<7> op, string ps, string opName> {
-  def _ST_gfx11 : FLAT_Real_gfx11<op, !cast<FLAT_Pseudo>(ps#"_ST"), opName> {
-    let Inst{54-48} = SGPR_NULL_gfx11plus.Index;
-    let OtherPredicates = [HasFlatScratchSTMode];
-  }
+multiclass FLAT_Real_Atomics_gfx11<bits<7> op,
+                                   string name = get_FLAT_ps<NAME>.Mnemonic> :
+  FLAT_Real_Base_gfx11<op, name> {
+  defm _RTN : FLAT_Real_gfx11<op, name>;
 }
 
-multiclass FLAT_Real_SVS_gfx11<bits<7> op, string ps, string opName> {
-  def _SVS_gfx11 : FLAT_Real_gfx11<op, !cast<FLAT_Pseudo>(ps#"_SVS"), opName> {
-    let OtherPredicates = [HasFlatScratchSVSMode];
-  }
+multiclass GLOBAL_Real_AllAddr_gfx11<bits<7> op,
+                                     string name = get_FLAT_ps<NAME>.Mnemonic> :
+  FLAT_Aliases_gfx11<name>,
+  FLAT_Real_gfx11<op, name> {
+  defm _SADDR : FLAT_Real_gfx11<op, name>;
 }
 
-multiclass FLAT_Real_AllAddr_gfx11<bits<7> op, string ps, string opName, int renamed = false> :
-  FLAT_Real_Base_gfx11<op, ps, opName, renamed>,
-  FLAT_Real_SADDR_gfx11<op, ps, opName>;
-
-multiclass FLAT_Real_Atomics_gfx11<bits<7> op, string ps, string opName, int renamed = false> :
-  FLAT_Real_Base_gfx11<op, ps, opName, renamed>,
-  FLAT_Real_RTN_gfx11<op, ps, opName>;
-
-multiclass FLAT_Real_GlblAtomics_gfx11<bits<7> op, string ps, string opName, int renamed = false> :
-  FLAT_Real_AllAddr_gfx11<op, ps, opName, renamed>,
-  FLAT_Real_RTN_gfx11<op, ps, opName>,
-  FLAT_Real_SADDR_RTN_gfx11<op, ps, opName>;
-
-multiclass FLAT_Real_GlblAtomics_RTN_gfx11<bits<7> op, string ps, string opName, int renamed = false> :
-  FLAT_Aliases_gfx11<ps#"_RTN", opName, renamed>,
-  FLAT_Real_RTN_gfx11<op, ps, opName>,
-  FLAT_Real_SADDR_RTN_gfx11<op, ps, opName>;
+multiclass GLOBAL_Real_Atomics_gfx11<bits<7> op,
+                                     string name = get_FLAT_ps<NAME>.Mnemonic> :
+  GLOBAL_Real_AllAddr_gfx11<op, name> {
+  defm _RTN : FLAT_Real_gfx11<op, name>;
+  defm _SADDR_RTN : FLAT_Real_gfx11<op, name>;
+}
 
-multiclass FLAT_Real_ScratchAllAddr_gfx11<bits<7> op, string ps, string opName, int renamed = false> :
-  FLAT_Real_Base_gfx11<op, ps, opName, renamed>,
-  FLAT_Real_SADDR_gfx11<op, ps, opName>,
-  FLAT_Real_ST_gfx11<op, ps, opName>,
-  FLAT_Real_SVS_gfx11<op, ps, opName>;
+multiclass SCRATCH_Real_AllAddr_gfx11<bits<7> op,
+                                     string name = get_FLAT_ps<NAME>.Mnemonic> :
+  FLAT_Aliases_gfx11<name>,
+  FLAT_Real_gfx11<op, name> {
+  defm _SADDR : FLAT_Real_gfx11<op, name>;
+  defm _ST : FLAT_Real_gfx11<op, name>;
+  defm _SVS : FLAT_Real_gfx11<op, name>;
+}
 
 // ENC_FLAT.
-defm FLAT_LOAD_U8               : FLAT_Real_Base_gfx11<0x010, "FLAT_LOAD_UBYTE", "flat_load_u8", true>;
-defm FLAT_LOAD_I8               : FLAT_Real_Base_gfx11<0x011, "FLAT_LOAD_SBYTE", "flat_load_i8", true>;
-defm FLAT_LOAD_U16              : FLAT_Real_Base_gfx11<0x012, "FLAT_LOAD_USHORT", "flat_load_u16", true>;
-defm FLAT_LOAD_I16              : FLAT_Real_Base_gfx11<0x013, "FLAT_LOAD_SSHORT", "flat_load_i16", true>;
-defm FLAT_LOAD_B32              : FLAT_Real_Base_gfx11<0x014, "FLAT_LOAD_DWORD", "flat_load_b32", true>;
-defm FLAT_LOAD_B64              : FLAT_Real_Base_gfx11<0x015, "FLAT_LOAD_DWORDX2", "flat_load_b64", true>;
-defm FLAT_LOAD_B96              : FLAT_Real_Base_gfx11<0x016, "FLAT_LOAD_DWORDX3", "flat_load_b96", true>;
-defm FLAT_LOAD_B128             : FLAT_Real_Base_gfx11<0x017, "FLAT_LOAD_DWORDX4", "flat_load_b128", true>;
-defm FLAT_STORE_B8              : FLAT_Real_Base_gfx11<0x018, "FLAT_STORE_BYTE", "flat_store_b8", true>;
-defm FLAT_STORE_B16             : FLAT_Real_Base_gfx11<0x019, "FLAT_STORE_SHORT", "flat_store_b16", true>;
-defm FLAT_STORE_B32             : FLAT_Real_Base_gfx11<0x01a, "FLAT_STORE_DWORD", "flat_store_b32", true>;
-defm FLAT_STORE_B64             : FLAT_Real_Base_gfx11<0x01b, "FLAT_STORE_DWORDX2", "flat_store_b64", true>;
-defm FLAT_STORE_B96             : FLAT_Real_Base_gfx11<0x01c, "FLAT_STORE_DWORDX3", "flat_store_b96", true>;
-defm FLAT_STORE_B128            : FLAT_Real_Base_gfx11<0x01d, "FLAT_STORE_DWORDX4", "flat_store_b128", true>;
-defm FLAT_LOAD_D16_U8           : FLAT_Real_Base_gfx11<0x01e, "FLAT_LOAD_UBYTE_D16", "flat_load_d16_u8">;
-defm FLAT_LOAD_D16_I8           : FLAT_Real_Base_gfx11<0x01f, "FLAT_LOAD_SBYTE_D16", "flat_load_d16_i8">;
-defm FLAT_LOAD_D16_B16          : FLAT_Real_Base_gfx11<0x020, "FLAT_LOAD_SHORT_D16", "flat_load_d16_b16">;
-defm FLAT_LOAD_D16_HI_U8        : FLAT_Real_Base_gfx11<0x021, "FLAT_LOAD_UBYTE_D16_HI", "flat_load_d16_hi_u8">;
-defm FLAT_LOAD_D16_HI_I8        : FLAT_Real_Base_gfx11<0x022, "FLAT_LOAD_SBYTE_D16_HI", "flat_load_d16_hi_i8">;
-defm FLAT_LOAD_D16_HI_B16       : FLAT_Real_Base_gfx11<0x023, "FLAT_LOAD_SHORT_D16_HI", "flat_load_d16_hi_b16">;
-defm FLAT_STORE_D16_HI_B8       : FLAT_Real_Base_gfx11<0x024, "FLAT_STORE_BYTE_D16_HI", "flat_store_d16_hi_b8">;
-defm FLAT_STORE_D16_HI_B16      : FLAT_Real_Base_gfx11<0x025, "FLAT_STORE_SHORT_D16_HI", "flat_store_d16_hi_b16">;
-defm FLAT_ATOMIC_SWAP_B32       : FLAT_Real_Atomics_gfx11<0x033, "FLAT_ATOMIC_SWAP", "flat_atomic_swap_b32", true>;
-defm FLAT_ATOMIC_CMPSWAP_B32    : FLAT_Real_Atomics_gfx11<0x034, "FLAT_ATOMIC_CMPSWAP", "flat_atomic_cmpswap_b32", true>;
-defm FLAT_ATOMIC_ADD_U32        : FLAT_Real_Atomics_gfx11<0x035, "FLAT_ATOMIC_ADD", "flat_atomic_add_u32", true>;
-defm FLAT_ATOMIC_SUB_U32        : FLAT_Real_Atomics_gfx11<0x036, "FLAT_ATOMIC_SUB", "flat_atomic_sub_u32", true>;
-defm FLAT_ATOMIC_MIN_I32        : FLAT_Real_Atomics_gfx11<0x038, "FLAT_ATOMIC_SMIN", "flat_atomic_min_i32", true>;
-defm FLAT_ATOMIC_MIN_U32        : FLAT_Real_Atomics_gfx11<0x039, "FLAT_ATOMIC_UMIN", "flat_atomic_min_u32", true>;
-defm FLAT_ATOMIC_MAX_I32        : FLAT_Real_Atomics_gfx11<0x03a, "FLAT_ATOMIC_SMAX", "flat_atomic_max_i32", true>;
-defm FLAT_ATOMIC_MAX_U32        : FLAT_Real_Atomics_gfx11<0x03b, "FLAT_ATOMIC_UMAX", "flat_atomic_max_u32", true>;
-defm FLAT_ATOMIC_AND_B32        : FLAT_Real_Atomics_gfx11<0x03c, "FLAT_ATOMIC_AND", "flat_atomic_and_b32", true>;
-defm FLAT_ATOMIC_OR_B32         : FLAT_Real_Atomics_gfx11<0x03d, "FLAT_ATOMIC_OR", "flat_atomic_or_b32", true>;
-defm FLAT_ATOMIC_XOR_B32        : FLAT_Real_Atomics_gfx11<0x03e, "FLAT_ATOMIC_XOR", "flat_atomic_xor_b32", true>;
-defm FLAT_ATOMIC_INC_U32        : FLAT_Real_Atomics_gfx11<0x03f, "FLAT_ATOMIC_INC", "flat_atomic_inc_u32", true>;
-defm FLAT_ATOMIC_DEC_U32        : FLAT_Real_Atomics_gfx11<0x040, "FLAT_ATOMIC_DEC", "flat_atomic_dec_u32", true>;
-defm FLAT_ATOMIC_SWAP_B64       : FLAT_Real_Atomics_gfx11<0x041, "FLAT_ATOMIC_SWAP_X2", "flat_atomic_swap_b64", true>;
-defm FLAT_ATOMIC_CMPSWAP_B64    : FLAT_Real_Atomics_gfx11<0x042, "FLAT_ATOMIC_CMPSWAP_X2", "flat_atomic_cmpswap_b64", true>;
-defm FLAT_ATOMIC_ADD_U64        : FLAT_Real_Atomics_gfx11<0x043, "FLAT_ATOMIC_ADD_X2", "flat_atomic_add_u64", true>;
-defm FLAT_ATOMIC_SUB_U64        : FLAT_Real_Atomics_gfx11<0x044, "FLAT_ATOMIC_SUB_X2", "flat_atomic_sub_u64", true>;
-defm FLAT_ATOMIC_MIN_I64        : FLAT_Real_Atomics_gfx11<0x045, "FLAT_ATOMIC_SMIN_X2", "flat_atomic_min_i64", true>;
-defm FLAT_ATOMIC_MIN_U64        : FLAT_Real_Atomics_gfx11<0x046, "FLAT_ATOMIC_UMIN_X2", "flat_atomic_min_u64", true>;
-defm FLAT_ATOMIC_MAX_I64        : FLAT_Real_Atomics_gfx11<0x047, "FLAT_ATOMIC_SMAX_X2", "flat_atomic_max_i64", true>;
-defm FLAT_ATOMIC_MAX_U64        : FLAT_Real_Atomics_gfx11<0x048, "FLAT_ATOMIC_UMAX_X2", "flat_atomic_max_u64", true>;
-defm FLAT_ATOMIC_AND_B64        : FLAT_Real_Atomics_gfx11<0x049, "FLAT_ATOMIC_AND_X2", "flat_atomic_and_b64", true>;
-defm FLAT_ATOMIC_OR_B64         : FLAT_Real_Atomics_gfx11<0x04a, "FLAT_ATOMIC_OR_X2", "flat_atomic_or_b64", true>;
-defm FLAT_ATOMIC_XOR_B64        : FLAT_Real_Atomics_gfx11<0x04b, "FLAT_ATOMIC_XOR_X2", "flat_atomic_xor_b64", true>;
-defm FLAT_ATOMIC_INC_U64        : FLAT_Real_Atomics_gfx11<0x04c, "FLAT_ATOMIC_INC_X2", "flat_atomic_inc_u64", true>;
-defm FLAT_ATOMIC_DEC_U64        : FLAT_Real_Atomics_gfx11<0x04d, "FLAT_ATOMIC_DEC_X2", "flat_atomic_dec_u64", true>;
-defm FLAT_ATOMIC_CMPSWAP_F32    : FLAT_Real_Atomics_gfx11<0x050, "FLAT_ATOMIC_FCMPSWAP", "flat_atomic_cmpswap_f32">;
-defm FLAT_ATOMIC_MIN_F32        : FLAT_Real_Atomics_gfx11<0x051, "FLAT_ATOMIC_FMIN", "flat_atomic_min_f32">;
-defm FLAT_ATOMIC_MAX_F32        : FLAT_Real_Atomics_gfx11<0x052, "FLAT_ATOMIC_FMAX", "flat_atomic_max_f32">;
-defm FLAT_ATOMIC_ADD_F32        : FLAT_Real_Atomics_gfx11<0x056, "FLAT_ATOMIC_ADD_F32", "flat_atomic_add_f32">;
+defm FLAT_LOAD_UBYTE            : FLAT_Real_Base_gfx11<0x010, "flat_load_u8">;
+defm FLAT_LOAD_SBYTE            : FLAT_Real_Base_gfx11<0x011, "flat_load_i8">;
+defm FLAT_LOAD_USHORT           : FLAT_Real_Base_gfx11<0x012, "flat_load_u16">;
+defm FLAT_LOAD_SSHORT           : FLAT_Real_Base_gfx11<0x013, "flat_load_i16">;
+defm FLAT_LOAD_DWORD            : FLAT_Real_Base_gfx11<0x014, "flat_load_b32">;
+defm FLAT_LOAD_DWORDX2          : FLAT_Real_Base_gfx11<0x015, "flat_load_b64">;
+defm FLAT_LOAD_DWORDX3          : FLAT_Real_Base_gfx11<0x016, "flat_load_b96">;
+defm FLAT_LOAD_DWORDX4          : FLAT_Real_Base_gfx11<0x017, "flat_load_b128">;
+defm FLAT_STORE_BYTE            : FLAT_Real_Base_gfx11<0x018, "flat_store_b8">;
+defm FLAT_STORE_SHORT           : FLAT_Real_Base_gfx11<0x019, "flat_store_b16">;
+defm FLAT_STORE_DWORD           : FLAT_Real_Base_gfx11<0x01a, "flat_store_b32">;
+defm FLAT_STORE_DWORDX2         : FLAT_Real_Base_gfx11<0x01b, "flat_store_b64">;
+defm FLAT_STORE_DWORDX3         : FLAT_Real_Base_gfx11<0x01c, "flat_store_b96">;
+defm FLAT_STORE_DWORDX4         : FLAT_Real_Base_gfx11<0x01d, "flat_store_b128">;
+defm FLAT_LOAD_UBYTE_D16        : FLAT_Real_Base_gfx11<0x01e, "flat_load_d16_u8">;
+defm FLAT_LOAD_SBYTE_D16        : FLAT_Real_Base_gfx11<0x01f, "flat_load_d16_i8">;
+defm FLAT_LOAD_SHORT_D16        : FLAT_Real_Base_gfx11<0x020, "flat_load_d16_b16">;
+defm FLAT_LOAD_UBYTE_D16_HI     : FLAT_Real_Base_gfx11<0x021, "flat_load_d16_hi_u8">;
+defm FLAT_LOAD_SBYTE_D16_HI     : FLAT_Real_Base_gfx11<0x022, "flat_load_d16_hi_i8">;
+defm FLAT_LOAD_SHORT_D16_HI     : FLAT_Real_Base_gfx11<0x023, "flat_load_d16_hi_b16">;
+defm FLAT_STORE_BYTE_D16_HI     : FLAT_Real_Base_gfx11<0x024, "flat_store_d16_hi_b8">;
+defm FLAT_STORE_SHORT_D16_HI    : FLAT_Real_Base_gfx11<0x025, "flat_store_d16_hi_b16">;
+defm FLAT_ATOMIC_SWAP           : FLAT_Real_Atomics_gfx11<0x033, "flat_atomic_swap_b32">;
+defm FLAT_ATOMIC_CMPSWAP        : FLAT_Real_Atomics_gfx11<0x034, "flat_atomic_cmpswap_b32">;
+defm FLAT_ATOMIC_ADD            : FLAT_Real_Atomics_gfx11<0x035, "flat_atomic_add_u32">;
+defm FLAT_ATOMIC_SUB            : FLAT_Real_Atomics_gfx11<0x036, "flat_atomic_sub_u32">;
+defm FLAT_ATOMIC_SMIN           : FLAT_Real_Atomics_gfx11<0x038, "flat_atomic_min_i32">;
+defm FLAT_ATOMIC_UMIN           : FLAT_Real_Atomics_gfx11<0x039, "flat_atomic_min_u32">;
+defm FLAT_ATOMIC_SMAX           : FLAT_Real_Atomics_gfx11<0x03a, "flat_atomic_max_i32">;
+defm FLAT_ATOMIC_UMAX           : FLAT_Real_Atomics_gfx11<0x03b, "flat_atomic_max_u32">;
+defm FLAT_ATOMIC_AND            : FLAT_Real_Atomics_gfx11<0x03c, "flat_atomic_and_b32">;
+defm FLAT_ATOMIC_OR             : FLAT_Real_Atomics_gfx11<0x03d, "flat_atomic_or_b32">;
+defm FLAT_ATOMIC_XOR            : FLAT_Real_Atomics_gfx11<0x03e, "flat_atomic_xor_b32">;
+defm FLAT_ATOMIC_INC            : FLAT_Real_Atomics_gfx11<0x03f, "flat_atomic_inc_u32">;
+defm FLAT_ATOMIC_DEC            : FLAT_Real_Atomics_gfx11<0x040, "flat_atomic_dec_u32">;
+defm FLAT_ATOMIC_SWAP_X2        : FLAT_Real_Atomics_gfx11<0x041, "flat_atomic_swap_b64">;
+defm FLAT_ATOMIC_CMPSWAP_X2     : FLAT_Real_Atomics_gfx11<0x042, "flat_atomic_cmpswap_b64">;
+defm FLAT_ATOMIC_ADD_X2         : FLAT_Real_Atomics_gfx11<0x043, "flat_atomic_add_u64">;
+defm FLAT_ATOMIC_SUB_X2         : FLAT_Real_Atomics_gfx11<0x044, "flat_atomic_sub_u64">;
+defm FLAT_ATOMIC_SMIN_X2        : FLAT_Real_Atomics_gfx11<0x045, "flat_atomic_min_i64">;
+defm FLAT_ATOMIC_UMIN_X2        : FLAT_Real_Atomics_gfx11<0x046, "flat_atomic_min_u64">;
+defm FLAT_ATOMIC_SMAX_X2        : FLAT_Real_Atomics_gfx11<0x047, "flat_atomic_max_i64">;
+defm FLAT_ATOMIC_UMAX_X2        : FLAT_Real_Atomics_gfx11<0x048, "flat_atomic_max_u64">;
+defm FLAT_ATOMIC_AND_X2         : FLAT_Real_Atomics_gfx11<0x049, "flat_atomic_and_b64">;
+defm FLAT_ATOMIC_OR_X2          : FLAT_Real_Atomics_gfx11<0x04a, "flat_atomic_or_b64">;
+defm FLAT_ATOMIC_XOR_X2         : FLAT_Real_Atomics_gfx11<0x04b, "flat_atomic_xor_b64">;
+defm FLAT_ATOMIC_INC_X2         : FLAT_Real_Atomics_gfx11<0x04c, "flat_atomic_inc_u64">;
+defm FLAT_ATOMIC_DEC_X2         : FLAT_Real_Atomics_gfx11<0x04d, "flat_atomic_dec_u64">;
+defm FLAT_ATOMIC_FCMPSWAP       : FLAT_Real_Atomics_gfx11<0x050, "flat_atomic_cmpswap_f32">;
+defm FLAT_ATOMIC_FMIN           : FLAT_Real_Atomics_gfx11<0x051, "flat_atomic_min_f32">;
+defm FLAT_ATOMIC_FMAX           : FLAT_Real_Atomics_gfx11<0x052, "flat_atomic_max_f32">;
+defm FLAT_ATOMIC_ADD_F32        : FLAT_Real_Atomics_gfx11<0x056>;
 
 // ENC_FLAT_GLBL.
-defm GLOBAL_LOAD_U8             : FLAT_Real_AllAddr_gfx11<0x010, "GLOBAL_LOAD_UBYTE", "global_load_u8", true>;
-defm GLOBAL_LOAD_I8             : FLAT_Real_AllAddr_gfx11<0x011, "GLOBAL_LOAD_SBYTE", "global_load_i8", true>;
-defm GLOBAL_LOAD_U16            : FLAT_Real_AllAddr_gfx11<0x012, "GLOBAL_LOAD_USHORT", "global_load_u16", true>;
-defm GLOBAL_LOAD_I16            : FLAT_Real_AllAddr_gfx11<0x013, "GLOBAL_LOAD_SSHORT", "global_load_i16", true>;
-defm GLOBAL_LOAD_B32            : FLAT_Real_AllAddr_gfx11<0x014, "GLOBAL_LOAD_DWORD", "global_load_b32", true>;
-defm GLOBAL_LOAD_B64            : FLAT_Real_AllAddr_gfx11<0x015, "GLOBAL_LOAD_DWORDX2", "global_load_b64", true>;
-defm GLOBAL_LOAD_B96            : FLAT_Real_AllAddr_gfx11<0x016, "GLOBAL_LOAD_DWORDX3", "global_load_b96", true>;
-defm GLOBAL_LOAD_B128           : FLAT_Real_AllAddr_gfx11<0x017, "GLOBAL_LOAD_DWORDX4", "global_load_b128", true>;
-defm GLOBAL_STORE_B8            : FLAT_Real_AllAddr_gfx11<0x018, "GLOBAL_STORE_BYTE", "global_store_b8", true>;
-defm GLOBAL_STORE_B16           : FLAT_Real_AllAddr_gfx11<0x019, "GLOBAL_STORE_SHORT", "global_store_b16", true>;
-defm GLOBAL_STORE_B32           : FLAT_Real_AllAddr_gfx11<0x01a, "GLOBAL_STORE_DWORD", "global_store_b32", true>;
-defm GLOBAL_STORE_B64           : FLAT_Real_AllAddr_gfx11<0x01b, "GLOBAL_STORE_DWORDX2", "global_store_b64", true>;
-defm GLOBAL_STORE_B96           : FLAT_Real_AllAddr_gfx11<0x01c, "GLOBAL_STORE_DWORDX3", "global_store_b96", true>;
-defm GLOBAL_STORE_B128          : FLAT_Real_AllAddr_gfx11<0x01d, "GLOBAL_STORE_DWORDX4", "global_store_b128", true>;
-defm GLOBAL_LOAD_D16_U8         : FLAT_Real_AllAddr_gfx11<0x01e, "GLOBAL_LOAD_UBYTE_D16", "global_load_d16_u8">;
-defm GLOBAL_LOAD_D16_I8         : FLAT_Real_AllAddr_gfx11<0x01f, "GLOBAL_LOAD_SBYTE_D16", "global_load_d16_i8">;
-defm GLOBAL_LOAD_D16_B16        : FLAT_Real_AllAddr_gfx11<0x020, "GLOBAL_LOAD_SHORT_D16", "global_load_d16_b16">;
-defm GLOBAL_LOAD_D16_HI_U8      : FLAT_Real_AllAddr_gfx11<0x021, "GLOBAL_LOAD_UBYTE_D16_HI", "global_load_d16_hi_u8">;
-defm GLOBAL_LOAD_D16_HI_I8      : FLAT_Real_AllAddr_gfx11<0x022, "GLOBAL_LOAD_SBYTE_D16_HI", "global_load_d16_hi_i8">;
-defm GLOBAL_LOAD_D16_HI_B16     : FLAT_Real_AllAddr_gfx11<0x023, "GLOBAL_LOAD_SHORT_D16_HI", "global_load_d16_hi_b16">;
-defm GLOBAL_STORE_D16_HI_B8     : FLAT_Real_AllAddr_gfx11<0x024, "GLOBAL_STORE_BYTE_D16_HI", "global_store_d16_hi_b8">;
-defm GLOBAL_STORE_D16_HI_B16    : FLAT_Real_AllAddr_gfx11<0x025, "GLOBAL_STORE_SHORT_D16_HI", "global_store_d16_hi_b16">;
-defm GLOBAL_LOAD_ADDTID_B32     : FLAT_Real_AllAddr_gfx11<0x028, "GLOBAL_LOAD_DWORD_ADDTID", "global_load_addtid_b32">;
-defm GLOBAL_STORE_ADDTID_B32    : FLAT_Real_AllAddr_gfx11<0x029, "GLOBAL_STORE_DWORD_ADDTID", "global_store_addtid_b32">;
-defm GLOBAL_ATOMIC_SWAP_B32     : FLAT_Real_GlblAtomics_gfx11<0x033, "GLOBAL_ATOMIC_SWAP", "global_atomic_swap_b32", true>;
-defm GLOBAL_ATOMIC_CMPSWAP_B32  : FLAT_Real_GlblAtomics_gfx11<0x034, "GLOBAL_ATOMIC_CMPSWAP", "global_atomic_cmpswap_b32", true>;
-defm GLOBAL_ATOMIC_ADD_U32      : FLAT_Real_GlblAtomics_gfx11<0x035, "GLOBAL_ATOMIC_ADD", "global_atomic_add_u32", true>;
-defm GLOBAL_ATOMIC_SUB_U32      : FLAT_Real_GlblAtomics_gfx11<0x036, "GLOBAL_ATOMIC_SUB", "global_atomic_sub_u32", true>;
-defm GLOBAL_ATOMIC_CSUB_U32     : FLAT_Real_GlblAtomics_gfx11<0x037, "GLOBAL_ATOMIC_CSUB", "global_atomic_csub_u32", true>;
-defm GLOBAL_ATOMIC_MIN_I32      : FLAT_Real_GlblAtomics_gfx11<0x038, "GLOBAL_ATOMIC_SMIN", "global_atomic_min_i32", true>;
-defm GLOBAL_ATOMIC_MIN_U32      : FLAT_Real_GlblAtomics_gfx11<0x039, "GLOBAL_ATOMIC_UMIN", "global_atomic_min_u32", true>;
-defm GLOBAL_ATOMIC_MAX_I32      : FLAT_Real_GlblAtomics_gfx11<0x03a, "GLOBAL_ATOMIC_SMAX", "global_atomic_max_i32", true>;
-defm GLOBAL_ATOMIC_MAX_U32      : FLAT_Real_GlblAtomics_gfx11<0x03b, "GLOBAL_ATOMIC_UMAX", "global_atomic_max_u32", true>;
-defm GLOBAL_ATOMIC_AND_B32      : FLAT_Real_GlblAtomics_gfx11<0x03c, "GLOBAL_ATOMIC_AND", "global_atomic_and_b32", true>;
-defm GLOBAL_ATOMIC_OR_B32       : FLAT_Real_GlblAtomics_gfx11<0x03d, "GLOBAL_ATOMIC_OR", "global_atomic_or_b32", true>;
-defm GLOBAL_ATOMIC_XOR_B32      : FLAT_Real_GlblAtomics_gfx11<0x03e, "GLOBAL_ATOMIC_XOR", "global_atomic_xor_b32", true>;
-defm GLOBAL_ATOMIC_INC_U32      : FLAT_Real_GlblAtomics_gfx11<0x03f, "GLOBAL_ATOMIC_INC", "global_atomic_inc_u32", true>;
-defm GLOBAL_ATOMIC_DEC_U32      : FLAT_Real_GlblAtomics_gfx11<0x040, "GLOBAL_ATOMIC_DEC", "global_atomic_dec_u32", true>;
-defm GLOBAL_ATOMIC_SWAP_B64     : FLAT_Real_GlblAtomics_gfx11<0x041, "GLOBAL_ATOMIC_SWAP_X2", "global_atomic_swap_b64", true>;
-defm GLOBAL_ATOMIC_CMPSWAP_B64  : FLAT_Real_GlblAtomics_gfx11<0x042, "GLOBAL_ATOMIC_CMPSWAP_X2", "global_atomic_cmpswap_b64", true>;
-defm GLOBAL_ATOMIC_ADD_U64      : FLAT_Real_GlblAtomics_gfx11<0x043, "GLOBAL_ATOMIC_ADD_X2", "global_atomic_add_u64", true>;
-defm GLOBAL_ATOMIC_SUB_U64      : FLAT_Real_GlblAtomics_gfx11<0x044, "GLOBAL_ATOMIC_SUB_X2", "global_atomic_sub_u64", true>;
-defm GLOBAL_ATOMIC_MIN_I64      : FLAT_Real_GlblAtomics_gfx11<0x045, "GLOBAL_ATOMIC_SMIN_X2", "global_atomic_min_i64", true>;
-defm GLOBAL_ATOMIC_MIN_U64      : FLAT_Real_GlblAtomics_gfx11<0x046, "GLOBAL_ATOMIC_UMIN_X2", "global_atomic_min_u64", true>;
-defm GLOBAL_ATOMIC_MAX_I64      : FLAT_Real_GlblAtomics_gfx11<0x047, "GLOBAL_ATOMIC_SMAX_X2", "global_atomic_max_i64", true>;
-defm GLOBAL_ATOMIC_MAX_U64      : FLAT_Real_GlblAtomics_gfx11<0x048, "GLOBAL_ATOMIC_UMAX_X2", "global_atomic_max_u64", true>;
-defm GLOBAL_ATOMIC_AND_B64      : FLAT_Real_GlblAtomics_gfx11<0x049, "GLOBAL_ATOMIC_AND_X2", "global_atomic_and_b64", true>;
-defm GLOBAL_ATOMIC_OR_B64       : FLAT_Real_GlblAtomics_gfx11<0x04a, "GLOBAL_ATOMIC_OR_X2", "global_atomic_or_b64", true>;
-defm GLOBAL_ATOMIC_XOR_B64      : FLAT_Real_GlblAtomics_gfx11<0x04b, "GLOBAL_ATOMIC_XOR_X2", "global_atomic_xor_b64", true>;
-defm GLOBAL_ATOMIC_INC_U64      : FLAT_Real_GlblAtomics_gfx11<0x04c, "GLOBAL_ATOMIC_INC_X2", "global_atomic_inc_u64", true>;
-defm GLOBAL_ATOMIC_DEC_U64      : FLAT_Real_GlblAtomics_gfx11<0x04d, "GLOBAL_ATOMIC_DEC_X2", "global_atomic_dec_u64", true>;
-defm GLOBAL_ATOMIC_CMPSWAP_F32  : FLAT_Real_GlblAtomics_gfx11<0x050, "GLOBAL_ATOMIC_FCMPSWAP", "global_atomic_cmpswap_f32">;
-defm GLOBAL_ATOMIC_MIN_F32      : FLAT_Real_GlblAtomics_gfx11<0x051, "GLOBAL_ATOMIC_FMIN", "global_atomic_min_f32">;
-defm GLOBAL_ATOMIC_MAX_F32      : FLAT_Real_GlblAtomics_gfx11<0x052, "GLOBAL_ATOMIC_FMAX", "global_atomic_max_f32">;
-defm GLOBAL_ATOMIC_ADD_F32      : FLAT_Real_GlblAtomics_gfx11<0x056, "GLOBAL_ATOMIC_ADD_F32", "global_atomic_add_f32">;
+defm GLOBAL_LOAD_UBYTE          : FLAT_Real_AllAddr_gfx11<0x010, "global_load_u8">;
+defm GLOBAL_LOAD_SBYTE          : FLAT_Real_AllAddr_gfx11<0x011, "global_load_i8">;
+defm GLOBAL_LOAD_USHORT         : FLAT_Real_AllAddr_gfx11<0x012, "global_load_u16">;
+defm GLOBAL_LOAD_SSHORT         : FLAT_Real_AllAddr_gfx11<0x013, "global_load_i16">;
+defm GLOBAL_LOAD_DWORD          : FLAT_Real_AllAddr_gfx11<0x014, "global_load_b32">;
+defm GLOBAL_LOAD_DWORDX2        : FLAT_Real_AllAddr_gfx11<0x015, "global_load_b64">;
+defm GLOBAL_LOAD_DWORDX3        : FLAT_Real_AllAddr_gfx11<0x016, "global_load_b96">;
+defm GLOBAL_LOAD_DWORDX4        : FLAT_Real_AllAddr_gfx11<0x017, "global_load_b128">;
+defm GLOBAL_STORE_BYTE          : FLAT_Real_AllAddr_gfx11<0x018, "global_store_b8">;
+defm GLOBAL_STORE_SHORT         : FLAT_Real_AllAddr_gfx11<0x019, "global_store_b16">;
+defm GLOBAL_STORE_DWORD         : FLAT_Real_AllAddr_gfx11<0x01a, "global_store_b32">;
+defm GLOBAL_STORE_DWORDX2       : FLAT_Real_AllAddr_gfx11<0x01b, "global_store_b64">;
+defm GLOBAL_STORE_DWORDX3       : FLAT_Real_AllAddr_gfx11<0x01c, "global_store_b96">;
+defm GLOBAL_STORE_DWORDX4       : FLAT_Real_AllAddr_gfx11<0x01d, "global_store_b128">;
+defm GLOBAL_LOAD_UBYTE_D16      : FLAT_Real_AllAddr_gfx11<0x01e, "global_load_d16_u8">;
+defm GLOBAL_LOAD_SBYTE_D16      : FLAT_Real_AllAddr_gfx11<0x01f, "global_load_d16_i8">;
+defm GLOBAL_LOAD_SHORT_D16      : FLAT_Real_AllAddr_gfx11<0x020, "global_load_d16_b16">;
+defm GLOBAL_LOAD_UBYTE_D16_HI   : FLAT_Real_AllAddr_gfx11<0x021, "global_load_d16_hi_u8">;
+defm GLOBAL_LOAD_SBYTE_D16_HI   : FLAT_Real_AllAddr_gfx11<0x022, "global_load_d16_hi_i8">;
+defm GLOBAL_LOAD_SHORT_D16_HI   : FLAT_Real_AllAddr_gfx11<0x023, "global_load_d16_hi_b16">;
+defm GLOBAL_STORE_BYTE_D16_HI   : FLAT_Real_AllAddr_gfx11<0x024, "global_store_d16_hi_b8">;
+defm GLOBAL_STORE_SHORT_D16_HI  : FLAT_Real_AllAddr_gfx11<0x025, "global_store_d16_hi_b16">;
+defm GLOBAL_LOAD_DWORD_ADDTID   : FLAT_Real_AllAddr_gfx11<0x028, "global_load_addtid_b32">;
+defm GLOBAL_STORE_DWORD_ADDTID  : FLAT_Real_AllAddr_gfx11<0x029, "global_store_addtid_b32">;
+defm GLOBAL_ATOMIC_SWAP         : GLOBAL_Real_Atomics_gfx11<0x033, "global_atomic_swap_b32">;
+defm GLOBAL_ATOMIC_CMPSWAP      : GLOBAL_Real_Atomics_gfx11<0x034, "global_atomic_cmpswap_b32">;
+defm GLOBAL_ATOMIC_ADD          : GLOBAL_Real_Atomics_gfx11<0x035, "global_atomic_add_u32">;
+defm GLOBAL_ATOMIC_SUB          : GLOBAL_Real_Atomics_gfx11<0x036, "global_atomic_sub_u32">;
+defm GLOBAL_ATOMIC_CSUB         : GLOBAL_Real_Atomics_gfx11<0x037, "global_atomic_csub_u32">;
+defm GLOBAL_ATOMIC_SMIN         : GLOBAL_Real_Atomics_gfx11<0x038, "global_atomic_min_i32">;
+defm GLOBAL_ATOMIC_UMIN         : GLOBAL_Real_Atomics_gfx11<0x039, "global_atomic_min_u32">;
+defm GLOBAL_ATOMIC_SMAX         : GLOBAL_Real_Atomics_gfx11<0x03a, "global_atomic_max_i32">;
+defm GLOBAL_ATOMIC_UMAX         : GLOBAL_Real_Atomics_gfx11<0x03b, "global_atomic_max_u32">;
+defm GLOBAL_ATOMIC_AND          : GLOBAL_Real_Atomics_gfx11<0x03c, "global_atomic_and_b32">;
+defm GLOBAL_ATOMIC_OR           : GLOBAL_Real_Atomics_gfx11<0x03d, "global_atomic_or_b32">;
+defm GLOBAL_ATOMIC_XOR          : GLOBAL_Real_Atomics_gfx11<0x03e, "global_atomic_xor_b32">;
+defm GLOBAL_ATOMIC_INC          : GLOBAL_Real_Atomics_gfx11<0x03f, "global_atomic_inc_u32">;
+defm GLOBAL_ATOMIC_DEC          : GLOBAL_Real_Atomics_gfx11<0x040, "global_atomic_dec_u32">;
+defm GLOBAL_ATOMIC_SWAP_X2      : GLOBAL_Real_Atomics_gfx11<0x041, "global_atomic_swap_b64">;
+defm GLOBAL_ATOMIC_CMPSWAP_X2   : GLOBAL_Real_Atomics_gfx11<0x042, "global_atomic_cmpswap_b64">;
+defm GLOBAL_ATOMIC_ADD_X2       : GLOBAL_Real_Atomics_gfx11<0x043, "global_atomic_add_u64">;
+defm GLOBAL_ATOMIC_SUB_X2       : GLOBAL_Real_Atomics_gfx11<0x044, "global_atomic_sub_u64">;
+defm GLOBAL_ATOMIC_SMIN_X2      : GLOBAL_Real_Atomics_gfx11<0x045, "global_atomic_min_i64">;
+defm GLOBAL_ATOMIC_UMIN_X2      : GLOBAL_Real_Atomics_gfx11<0x046, "global_atomic_min_u64">;
+defm GLOBAL_ATOMIC_SMAX_X2      : GLOBAL_Real_Atomics_gfx11<0x047, "global_atomic_max_i64">;
+defm GLOBAL_ATOMIC_UMAX_X2      : GLOBAL_Real_Atomics_gfx11<0x048, "global_atomic_max_u64">;
+defm GLOBAL_ATOMIC_AND_X2       : GLOBAL_Real_Atomics_gfx11<0x049, "global_atomic_and_b64">;
+defm GLOBAL_ATOMIC_OR_X2        : GLOBAL_Real_Atomics_gfx11<0x04a, "global_atomic_or_b64">;
+defm GLOBAL_ATOMIC_XOR_X2       : GLOBAL_Real_Atomics_gfx11<0x04b, "global_atomic_xor_b64">;
+defm GLOBAL_ATOMIC_INC_X2       : GLOBAL_Real_Atomics_gfx11<0x04c, "global_atomic_inc_u64">;
+defm GLOBAL_ATOMIC_DEC_X2       : GLOBAL_Real_Atomics_gfx11<0x04d, "global_atomic_dec_u64">;
+defm GLOBAL_ATOMIC_FCMPSWAP     : GLOBAL_Real_Atomics_gfx11<0x050, "global_atomic_cmpswap_f32">;
+defm GLOBAL_ATOMIC_FMIN         : GLOBAL_Real_Atomics_gfx11<0x051, "global_atomic_min_f32">;
+defm GLOBAL_ATOMIC_FMAX         : GLOBAL_Real_Atomics_gfx11<0x052, "global_atomic_max_f32">;
+defm GLOBAL_ATOMIC_ADD_F32      : GLOBAL_Real_Atomics_gfx11<0x056>;
 
 // ENC_FLAT_SCRATCH.
-defm SCRATCH_LOAD_U8            : FLAT_Real_ScratchAllAddr_gfx11<0x10, "SCRATCH_LOAD_UBYTE", "scratch_load_u8", true>;
-defm SCRATCH_LOAD_I8            : FLAT_Real_ScratchAllAddr_gfx11<0x11, "SCRATCH_LOAD_SBYTE", "scratch_load_i8", true>;
-defm SCRATCH_LOAD_U16           : FLAT_Real_ScratchAllAddr_gfx11<0x12, "SCRATCH_LOAD_USHORT", "scratch_load_u16", true>;
-defm SCRATCH_LOAD_I16           : FLAT_Real_ScratchAllAddr_gfx11<0x13, "SCRATCH_LOAD_SSHORT", "scratch_load_i16", true>;
-defm SCRATCH_LOAD_B32           : FLAT_Real_ScratchAllAddr_gfx11<0x14, "SCRATCH_LOAD_DWORD", "scratch_load_b32", true>;
-defm SCRATCH_LOAD_B64           : FLAT_Real_ScratchAllAddr_gfx11<0x15, "SCRATCH_LOAD_DWORDX2", "scratch_load_b64", true>;
-defm SCRATCH_LOAD_B96           : FLAT_Real_ScratchAllAddr_gfx11<0x16, "SCRATCH_LOAD_DWORDX3", "scratch_load_b96", true>;
-defm SCRATCH_LOAD_B128          : FLAT_Real_ScratchAllAddr_gfx11<0x17, "SCRATCH_LOAD_DWORDX4", "scratch_load_b128", true>;
-defm SCRATCH_STORE_B8           : FLAT_Real_ScratchAllAddr_gfx11<0x18, "SCRATCH_STORE_BYTE", "scratch_store_b8", true>;
-defm SCRATCH_STORE_B16          : FLAT_Real_ScratchAllAddr_gfx11<0x19, "SCRATCH_STORE_SHORT", "scratch_store_b16", true>;
-defm SCRATCH_STORE_B32          : FLAT_Real_ScratchAllAddr_gfx11<0x1a, "SCRATCH_STORE_DWORD", "scratch_store_b32", true>;
-defm SCRATCH_STORE_B64          : FLAT_Real_ScratchAllAddr_gfx11<0x1b, "SCRATCH_STORE_DWORDX2", "scratch_store_b64", true>;
-defm SCRATCH_STORE_B96          : FLAT_Real_ScratchAllAddr_gfx11<0x1c, "SCRATCH_STORE_DWORDX3", "scratch_store_b96", true>;
-defm SCRATCH_STORE_B128         : FLAT_Real_ScratchAllAddr_gfx11<0x1d, "SCRATCH_STORE_DWORDX4", "scratch_store_b128", true>;
-defm SCRATCH_LOAD_D16_U8        : FLAT_Real_ScratchAllAddr_gfx11<0x1e, "SCRATCH_LOAD_UBYTE_D16", "scratch_load_d16_u8">;
-defm SCRATCH_LOAD_D16_I8        : FLAT_Real_ScratchAllAddr_gfx11<0x1f, "SCRATCH_LOAD_SBYTE_D16", "scratch_load_d16_i8">;
-defm SCRATCH_LOAD_D16_B16       : FLAT_Real_ScratchAllAddr_gfx11<0x20, "SCRATCH_LOAD_SHORT_D16", "scratch_load_d16_b16">;
-defm SCRATCH_LOAD_D16_HI_U8     : FLAT_Real_ScratchAllAddr_gfx11<0x21, "SCRATCH_LOAD_UBYTE_D16_HI", "scratch_load_d16_hi_u8">;
-defm SCRATCH_LOAD_D16_HI_I8     : FLAT_Real_ScratchAllAddr_gfx11<0x22, "SCRATCH_LOAD_SBYTE_D16_HI", "scratch_load_d16_hi_i8">;
-defm SCRATCH_LOAD_D16_HI_B16    : FLAT_Real_ScratchAllAddr_gfx11<0x23, "SCRATCH_LOAD_SHORT_D16_HI", "scratch_load_d16_hi_b16">;
-defm SCRATCH_STORE_D16_HI_B8    : FLAT_Real_ScratchAllAddr_gfx11<0x24, "SCRATCH_STORE_BYTE_D16_HI", "scratch_store_d16_hi_b8">;
-defm SCRATCH_STORE_D16_HI_B16   : FLAT_Real_ScratchAllAddr_gfx11<0x25, "SCRATCH_STORE_SHORT_D16_HI", "scratch_store_d16_hi_b16">;
+defm SCRATCH_LOAD_UBYTE         : SCRATCH_Real_AllAddr_gfx11<0x10, "scratch_load_u8">;
+defm SCRATCH_LOAD_SBYTE         : SCRATCH_Real_AllAddr_gfx11<0x11, "scratch_load_i8">;
+defm SCRATCH_LOAD_USHORT        : SCRATCH_Real_AllAddr_gfx11<0x12, "scratch_load_u16">;
+defm SCRATCH_LOAD_SSHORT        : SCRATCH_Real_AllAddr_gfx11<0x13, "scratch_load_i16">;
+defm SCRATCH_LOAD_DWORD         : SCRATCH_Real_AllAddr_gfx11<0x14, "scratch_load_b32">;
+defm SCRATCH_LOAD_DWORDX2       : SCRATCH_Real_AllAddr_gfx11<0x15, "scratch_load_b64">;
+defm SCRATCH_LOAD_DWORDX3       : SCRATCH_Real_AllAddr_gfx11<0x16, "scratch_load_b96">;
+defm SCRATCH_LOAD_DWORDX4       : SCRATCH_Real_AllAddr_gfx11<0x17, "scratch_load_b128">;
+defm SCRATCH_STORE_BYTE         : SCRATCH_Real_AllAddr_gfx11<0x18, "scratch_store_b8">;
+defm SCRATCH_STORE_SHORT        : SCRATCH_Real_AllAddr_gfx11<0x19, "scratch_store_b16">;
+defm SCRATCH_STORE_DWORD        : SCRATCH_Real_AllAddr_gfx11<0x1a, "scratch_store_b32">;
+defm SCRATCH_STORE_DWORDX2      : SCRATCH_Real_AllAddr_gfx11<0x1b, "scratch_store_b64">;
+defm SCRATCH_STORE_DWORDX3      : SCRATCH_Real_AllAddr_gfx11<0x1c, "scratch_store_b96">;
+defm SCRATCH_STORE_DWORDX4      : SCRATCH_Real_AllAddr_gfx11<0x1d, "scratch_store_b128">;
+defm SCRATCH_LOAD_UBYTE_D16     : SCRATCH_Real_AllAddr_gfx11<0x1e, "scratch_load_d16_u8">;
+defm SCRATCH_LOAD_SBYTE_D16     : SCRATCH_Real_AllAddr_gfx11<0x1f, "scratch_load_d16_i8">;
+defm SCRATCH_LOAD_SHORT_D16     : SCRATCH_Real_AllAddr_gfx11<0x20, "scratch_load_d16_b16">;
+defm SCRATCH_LOAD_UBYTE_D16_HI  : SCRATCH_Real_AllAddr_gfx11<0x21, "scratch_load_d16_hi_u8">;
+defm SCRATCH_LOAD_SBYTE_D16_HI  : SCRATCH_Real_AllAddr_gfx11<0x22, "scratch_load_d16_hi_i8">;
+defm SCRATCH_LOAD_SHORT_D16_HI  : SCRATCH_Real_AllAddr_gfx11<0x23, "scratch_load_d16_hi_b16">;
+defm SCRATCH_STORE_BYTE_D16_HI  : SCRATCH_Real_AllAddr_gfx11<0x24, "scratch_store_d16_hi_b8">;
+defm SCRATCH_STORE_SHORT_D16_HI : SCRATCH_Real_AllAddr_gfx11<0x25, "scratch_store_d16_hi_b16">;
 
 //===----------------------------------------------------------------------===//
 // GFX12
 //===----------------------------------------------------------------------===//
 
-class VFLAT_Real_gfx12 <bits<8> op, FLAT_Pseudo ps,
-                        string opName = ps.Mnemonic> :
-  VFLAT_Real <op, ps, opName>,
-  SIMCInstr <ps.PseudoInstr, SIEncodingFamily.GFX12> {
-  let AssemblerPredicate = isGFX12Plus;
-  let DecoderNamespace = "GFX12";
-
-  let Inst{25-24} = !if(ps.is_flat_scratch, 0b01,
-                        !if(ps.is_flat_global, 0b10, 0b00));
-}
+multiclass VFLAT_Real_gfx12 <bits<8> op, string name = get_FLAT_ps<NAME>.Mnemonic> {
+  defvar ps = !cast<FLAT_Pseudo>(NAME);
+  def _gfx12 : VFLAT_Real <op, ps, name>,
+               SIMCInstr <ps.PseudoInstr, SIEncodingFamily.GFX12> {
+    let AssemblerPredicate = isGFX12Only;
+    let DecoderNamespace = "GFX12";
 
-multiclass VFLAT_Aliases_gfx12<string ps, string opName, int renamed, string alias> {
-  if renamed then
-    def _renamed_gfx12 : MnemonicAlias<!cast<FLAT_Pseudo>(ps).Mnemonic, opName>, Requires<[isGFX12Plus]>;
-  if !not(!empty(alias)) then
-    def _alias_gfx12 : MnemonicAlias<alias, opName>, Requires<[isGFX12Plus]>;
-}
-
-multiclass VFLAT_Real_Base_gfx12<bits<8> op, string ps = NAME, string opName = !tolower(NAME),
-                                 int renamed = false, string alias = ""> :
-  VFLAT_Aliases_gfx12<ps, opName, renamed, alias> {
-  def _gfx12 : VFLAT_Real_gfx12<op, !cast<FLAT_Pseudo>(ps), opName> {
-    let Inst{6-0} = !cast<int>(SGPR_NULL_gfx11plus.HWEncoding);
+    let Inst{25-24} = {ps.is_flat_global, ps.is_flat_scratch};
   }
 }
 
-multiclass VFLAT_Real_RTN_gfx12<bits<8> op, string ps, string opName> {
-  def _RTN_gfx12 : VFLAT_Real_gfx12<op, !cast<FLAT_Pseudo>(ps#"_RTN"), opName> {
-    let Inst{6-0} = !cast<int>(SGPR_NULL_gfx11plus.HWEncoding);
-  }
+multiclass VFLAT_Aliases_gfx12<string name, string alias = name> {
+  defvar ps = get_FLAT_ps<NAME>;
+  if !ne(ps.Mnemonic, name) then
+    def : MnemonicAlias<ps.Mnemonic, name>, Requires<[isGFX12Only]>;
+  if !ne(alias, name) then
+    def : MnemonicAlias<alias, name>, Requires<[isGFX12Only]>;
 }
 
-multiclass VFLAT_Real_SADDR_gfx12<bits<8> op, string ps, string opName> {
-  def _SADDR_gfx12 : VFLAT_Real_gfx12<op, !cast<FLAT_Pseudo>(ps#"_SADDR"), opName>;
-}
+multiclass VFLAT_Real_Base_gfx12<bits<8> op,
+                                 string name = get_FLAT_ps<NAME>.Mnemonic,
+                                 string alias = name> :
+  VFLAT_Aliases_gfx12<name, alias>,
+  VFLAT_Real_gfx12<op, name>;
 
-multiclass VFLAT_Real_SADDR_RTN_gfx12<bits<8> op, string ps, string opName> {
-  def _SADDR_RTN_gfx12 : VFLAT_Real_gfx12<op, !cast<FLAT_Pseudo>(ps#"_SADDR_RTN"), opName>;
+multiclass VFLAT_Real_Atomics_gfx12<bits<8> op,
+                                    string name = get_FLAT_ps<NAME>.Mnemonic,
+                                    string alias = ""> :
+  VFLAT_Real_Base_gfx12<op, name, alias> {
+  defm _RTN : VFLAT_Real_gfx12<op, name>;
 }
 
-multiclass VFLAT_Real_ST_gfx12<bits<8> op, string ps, string opName> {
-  def _ST_gfx12 : VFLAT_Real_gfx12<op, !cast<FLAT_Pseudo>(ps#"_ST"), opName> {
-    let Inst{6-0} = !cast<int>(SGPR_NULL_gfx11plus.HWEncoding);
-    let OtherPredicates = [HasFlatScratchSTMode];
-  }
+multiclass VGLOBAL_Real_AllAddr_gfx12<bits<8> op,
+                                      string name = get_FLAT_ps<NAME>.Mnemonic,
+                                      string alias = name> :
+  VFLAT_Aliases_gfx12<name, alias>,
+  VFLAT_Real_gfx12<op, name> {
+  defm _SADDR : VFLAT_Real_gfx12<op, name>;
 }
 
-multiclass VFLAT_Real_SVS_gfx12<bits<8> op, string ps, string opName> {
-  def _SVS_gfx12 : VFLAT_Real_gfx12<op, !cast<FLAT_Pseudo>(ps#"_SVS"), opName> {
-    let OtherPredicates = [HasFlatScratchSVSMode];
+multiclass VGLOBAL_Real_AllAddr_gfx12_w64<bits<8> op,
+                                       string name = get_FLAT_ps<NAME>.Mnemonic> :
+  VFLAT_Aliases_gfx12<name> {
+  let DecoderNamespace = "GFX12W64" in {
+    defm "" : VFLAT_Real_gfx12<op, name>;
+    defm _SADDR : VFLAT_Real_gfx12<op, name>;
   }
 }
 
-multiclass VFLAT_Real_Atomics_gfx12<bits<8> op, string ps = NAME, string opName = !tolower(NAME),
-                                    int renamed = false, string alias = ""> :
-  VFLAT_Real_Base_gfx12<op, ps, opName, renamed, alias>,
-  VFLAT_Real_RTN_gfx12<op, ps, opName>;
-
-multiclass VGLOBAL_Real_AllAddr_gfx12<bits<8> op, string ps = NAME, string opName = !tolower(NAME),
-                                      int renamed = false, string alias = ""> :
-  VFLAT_Real_Base_gfx12<op, ps, opName, renamed, alias>,
-  VFLAT_Real_SADDR_gfx12<op, ps, opName>;
-
-multiclass VGLOBAL_Real_Atomics_gfx12<bits<8> op, string ps = NAME, string opName = !tolower(NAME),
-                                      int renamed = false, string alias = ""> :
-  VGLOBAL_Real_AllAddr_gfx12<op, ps, opName, renamed, alias>,
-  VFLAT_Real_RTN_gfx12<op, ps, opName>,
-  VFLAT_Real_SADDR_RTN_gfx12<op, ps, opName>;
+multiclass VGLOBAL_Real_Atomics_gfx12<bits<8> op,
+                                      string name = get_FLAT_ps<NAME>.Mnemonic,
+                                      string alias = ""> :
+  VGLOBAL_Real_AllAddr_gfx12<op, name, alias> {
+  defm _RTN : VFLAT_Real_gfx12<op, name>;
+  defm _SADDR_RTN : VFLAT_Real_gfx12<op, name>;
+}
 
-multiclass VSCRATCH_Real_AllAddr_gfx12<bits<8> op, string ps = NAME, string opName = !tolower(NAME),
-                                       int renamed = false> :
-  VFLAT_Real_Base_gfx12<op, ps, opName, renamed>,
-  VFLAT_Real_SADDR_gfx12<op, ps, opName>,
-  VFLAT_Real_ST_gfx12<op, ps, opName>,
-  VFLAT_Real_SVS_gfx12<op, ps, opName>;
+multiclass VSCRATCH_Real_AllAddr_gfx12<bits<8> op,
+                                       string name = get_FLAT_ps<NAME>.Mnemonic> :
+  VFLAT_Aliases_gfx12<name>,
+  VFLAT_Real_gfx12<op, name> {
+  defm _SADDR : VFLAT_Real_gfx12<op, name>;
+  defm _ST : VFLAT_Real_gfx12<op, name>;
+  defm _SVS : VFLAT_Real_gfx12<op, name>;
+}
 
 // ENC_VFLAT.
-defm FLAT_LOAD_U8                  : VFLAT_Real_Base_gfx12<0x010, "FLAT_LOAD_UBYTE", "flat_load_u8", true>;
-defm FLAT_LOAD_I8                  : VFLAT_Real_Base_gfx12<0x011, "FLAT_LOAD_SBYTE", "flat_load_i8", true>;
-defm FLAT_LOAD_U16                 : VFLAT_Real_Base_gfx12<0x012, "FLAT_LOAD_USHORT", "flat_load_u16", true>;
-defm FLAT_LOAD_I16                 : VFLAT_Real_Base_gfx12<0x013, "FLAT_LOAD_SSHORT", "flat_load_i16", true>;
-defm FLAT_LOAD_B32                 : VFLAT_Real_Base_gfx12<0x014, "FLAT_LOAD_DWORD", "flat_load_b32", true>;
-defm FLAT_LOAD_B64                 : VFLAT_Real_Base_gfx12<0x015, "FLAT_LOAD_DWORDX2", "flat_load_b64", true>;
-defm FLAT_LOAD_B96                 : VFLAT_Real_Base_gfx12<0x016, "FLAT_LOAD_DWORDX3", "flat_load_b96", true>;
-defm FLAT_LOAD_B128                : VFLAT_Real_Base_gfx12<0x017, "FLAT_LOAD_DWORDX4", "flat_load_b128", true>;
-defm FLAT_STORE_B8                 : VFLAT_Real_Base_gfx12<0x018, "FLAT_STORE_BYTE", "flat_store_b8", true>;
-defm FLAT_STORE_B16                : VFLAT_Real_Base_gfx12<0x019, "FLAT_STORE_SHORT", "flat_store_b16", true>;
-defm FLAT_STORE_B32                : VFLAT_Real_Base_gfx12<0x01a, "FLAT_STORE_DWORD", "flat_store_b32", true>;
-defm FLAT_STORE_B64                : VFLAT_Real_Base_gfx12<0x01b, "FLAT_STORE_DWORDX2", "flat_store_b64", true>;
-defm FLAT_STORE_B96                : VFLAT_Real_Base_gfx12<0x01c, "FLAT_STORE_DWORDX3", "flat_store_b96", true>;
-defm FLAT_STORE_B128               : VFLAT_Real_Base_gfx12<0x01d, "FLAT_STORE_DWORDX4", "flat_store_b128", true>;
-defm FLAT_LOAD_D16_U8              : VFLAT_Real_Base_gfx12<0x01e, "FLAT_LOAD_UBYTE_D16">;
-defm FLAT_LOAD_D16_I8              : VFLAT_Real_Base_gfx12<0x01f, "FLAT_LOAD_SBYTE_D16">;
-defm FLAT_LOAD_D16_B16             : VFLAT_Real_Base_gfx12<0x020, "FLAT_LOAD_SHORT_D16">;
-defm FLAT_LOAD_D16_HI_U8           : VFLAT_Real_Base_gfx12<0x021, "FLAT_LOAD_UBYTE_D16_HI">;
-defm FLAT_LOAD_D16_HI_I8           : VFLAT_Real_Base_gfx12<0x022, "FLAT_LOAD_SBYTE_D16_HI">;
-defm FLAT_LOAD_D16_HI_B16          : VFLAT_Real_Base_gfx12<0x023, "FLAT_LOAD_SHORT_D16_HI">;
-defm FLAT_STORE_D16_HI_B8          : VFLAT_Real_Base_gfx12<0x024, "FLAT_STORE_BYTE_D16_HI">;
-defm FLAT_STORE_D16_HI_B16         : VFLAT_Real_Base_gfx12<0x025, "FLAT_STORE_SHORT_D16_HI">;
-defm FLAT_ATOMIC_SWAP_B32          : VFLAT_Real_Atomics_gfx12<0x033, "FLAT_ATOMIC_SWAP", "flat_atomic_swap_b32", true>;
-defm FLAT_ATOMIC_CMPSWAP_B32       : VFLAT_Real_Atomics_gfx12<0x034, "FLAT_ATOMIC_CMPSWAP", "flat_atomic_cmpswap_b32", true>;
-defm FLAT_ATOMIC_ADD_U32           : VFLAT_Real_Atomics_gfx12<0x035, "FLAT_ATOMIC_ADD", "flat_atomic_add_u32", true>;
-defm FLAT_ATOMIC_SUB_U32           : VFLAT_Real_Atomics_gfx12<0x036, "FLAT_ATOMIC_SUB", "flat_atomic_sub_u32", true>;
-defm FLAT_ATOMIC_SUB_CLAMP_U32     : VFLAT_Real_Atomics_gfx12<0x037, "FLAT_ATOMIC_CSUB_U32", "flat_atomic_sub_clamp_u32", true>;
-defm FLAT_ATOMIC_MIN_I32           : VFLAT_Real_Atomics_gfx12<0x038, "FLAT_ATOMIC_SMIN", "flat_atomic_min_i32", true>;
-defm FLAT_ATOMIC_MIN_U32           : VFLAT_Real_Atomics_gfx12<0x039, "FLAT_ATOMIC_UMIN", "flat_atomic_min_u32", true>;
-defm FLAT_ATOMIC_MAX_I32           : VFLAT_Real_Atomics_gfx12<0x03a, "FLAT_ATOMIC_SMAX", "flat_atomic_max_i32", true>;
-defm FLAT_ATOMIC_MAX_U32           : VFLAT_Real_Atomics_gfx12<0x03b, "FLAT_ATOMIC_UMAX", "flat_atomic_max_u32", true>;
-defm FLAT_ATOMIC_AND_B32           : VFLAT_Real_Atomics_gfx12<0x03c, "FLAT_ATOMIC_AND", "flat_atomic_and_b32", true>;
-defm FLAT_ATOMIC_OR_B32            : VFLAT_Real_Atomics_gfx12<0x03d, "FLAT_ATOMIC_OR", "flat_atomic_or_b32", true>;
-defm FLAT_ATOMIC_XOR_B32           : VFLAT_Real_Atomics_gfx12<0x03e, "FLAT_ATOMIC_XOR", "flat_atomic_xor_b32", true>;
-defm FLAT_ATOMIC_INC_U32           : VFLAT_Real_Atomics_gfx12<0x03f, "FLAT_ATOMIC_INC", "flat_atomic_inc_u32", true>;
-defm FLAT_ATOMIC_DEC_U32           : VFLAT_Real_Atomics_gfx12<0x040, "FLAT_ATOMIC_DEC", "flat_atomic_dec_u32", true>;
-defm FLAT_ATOMIC_SWAP_B64          : VFLAT_Real_Atomics_gfx12<0x041, "FLAT_ATOMIC_SWAP_X2", "flat_atomic_swap_b64", true>;
-defm FLAT_ATOMIC_CMPSWAP_B64       : VFLAT_Real_Atomics_gfx12<0x042, "FLAT_ATOMIC_CMPSWAP_X2", "flat_atomic_cmpswap_b64", true>;
-defm FLAT_ATOMIC_ADD_U64           : VFLAT_Real_Atomics_gfx12<0x043, "FLAT_ATOMIC_ADD_X2", "flat_atomic_add_u64", true>;
-defm FLAT_ATOMIC_SUB_U64           : VFLAT_Real_Atomics_gfx12<0x044, "FLAT_ATOMIC_SUB_X2", "flat_atomic_sub_u64", true>;
-defm FLAT_ATOMIC_MIN_I64           : VFLAT_Real_Atomics_gfx12<0x045, "FLAT_ATOMIC_SMIN_X2", "flat_atomic_min_i64", true>;
-defm FLAT_ATOMIC_MIN_U64           : VFLAT_Real_Atomics_gfx12<0x046, "FLAT_ATOMIC_UMIN_X2", "flat_atomic_min_u64", true>;
-defm FLAT_ATOMIC_MAX_I64           : VFLAT_Real_Atomics_gfx12<0x047, "FLAT_ATOMIC_SMAX_X2", "flat_atomic_max_i64", true>;
-defm FLAT_ATOMIC_MAX_U64           : VFLAT_Real_Atomics_gfx12<0x048, "FLAT_ATOMIC_UMAX_X2", "flat_atomic_max_u64", true>;
-defm FLAT_ATOMIC_AND_B64           : VFLAT_Real_Atomics_gfx12<0x049, "FLAT_ATOMIC_AND_X2", "flat_atomic_and_b64", true>;
-defm FLAT_ATOMIC_OR_B64            : VFLAT_Real_Atomics_gfx12<0x04a, "FLAT_ATOMIC_OR_X2", "flat_atomic_or_b64", true>;
-defm FLAT_ATOMIC_XOR_B64           : VFLAT_Real_Atomics_gfx12<0x04b, "FLAT_ATOMIC_XOR_X2", "flat_atomic_xor_b64", true>;
-defm FLAT_ATOMIC_INC_U64           : VFLAT_Real_Atomics_gfx12<0x04c, "FLAT_ATOMIC_INC_X2", "flat_atomic_inc_u64", true>;
-defm FLAT_ATOMIC_DEC_U64           : VFLAT_Real_Atomics_gfx12<0x04d, "FLAT_ATOMIC_DEC_X2", "flat_atomic_dec_u64", true>;
-defm FLAT_ATOMIC_COND_SUB_U32      : VFLAT_Real_Atomics_gfx12<0x050, "FLAT_ATOMIC_COND_SUB_U32", "flat_atomic_cond_sub_u32">;
-defm FLAT_ATOMIC_MIN_NUM_F32       : VFLAT_Real_Atomics_gfx12<0x051, "FLAT_ATOMIC_FMIN", "flat_atomic_min_num_f32", true, "flat_atomic_min_f32">;
-defm FLAT_ATOMIC_MAX_NUM_F32       : VFLAT_Real_Atomics_gfx12<0x052, "FLAT_ATOMIC_FMAX", "flat_atomic_max_num_f32", true, "flat_atomic_max_f32">;
+defm FLAT_LOAD_UBYTE               : VFLAT_Real_Base_gfx12<0x010, "flat_load_u8">;
+defm FLAT_LOAD_SBYTE               : VFLAT_Real_Base_gfx12<0x011, "flat_load_i8">;
+defm FLAT_LOAD_USHORT              : VFLAT_Real_Base_gfx12<0x012, "flat_load_u16">;
+defm FLAT_LOAD_SSHORT              : VFLAT_Real_Base_gfx12<0x013, "flat_load_i16">;
+defm FLAT_LOAD_DWORD               : VFLAT_Real_Base_gfx12<0x014, "flat_load_b32">;
+defm FLAT_LOAD_DWORDX2             : VFLAT_Real_Base_gfx12<0x015, "flat_load_b64">;
+defm FLAT_LOAD_DWORDX3             : VFLAT_Real_Base_gfx12<0x016, "flat_load_b96">;
+defm FLAT_LOAD_DWORDX4             : VFLAT_Real_Base_gfx12<0x017, "flat_load_b128">;
+defm FLAT_STORE_BYTE               : VFLAT_Real_Base_gfx12<0x018, "flat_store_b8">;
+defm FLAT_STORE_SHORT              : VFLAT_Real_Base_gfx12<0x019, "flat_store_b16">;
+defm FLAT_STORE_DWORD              : VFLAT_Real_Base_gfx12<0x01a, "flat_store_b32">;
+defm FLAT_STORE_DWORDX2            : VFLAT_Real_Base_gfx12<0x01b, "flat_store_b64">;
+defm FLAT_STORE_DWORDX3            : VFLAT_Real_Base_gfx12<0x01c, "flat_store_b96">;
+defm FLAT_STORE_DWORDX4            : VFLAT_Real_Base_gfx12<0x01d, "flat_store_b128">;
+defm FLAT_LOAD_UBYTE_D16           : VFLAT_Real_Base_gfx12<0x01e, "flat_load_d16_u8">;
+defm FLAT_LOAD_SBYTE_D16           : VFLAT_Real_Base_gfx12<0x01f, "flat_load_d16_i8">;
+defm FLAT_LOAD_SHORT_D16           : VFLAT_Real_Base_gfx12<0x020, "flat_load_d16_b16">;
+defm FLAT_LOAD_UBYTE_D16_HI        : VFLAT_Real_Base_gfx12<0x021, "flat_load_d16_hi_u8">;
+defm FLAT_LOAD_SBYTE_D16_HI        : VFLAT_Real_Base_gfx12<0x022, "flat_load_d16_hi_i8">;
+defm FLAT_LOAD_SHORT_D16_HI        : VFLAT_Real_Base_gfx12<0x023, "flat_load_d16_hi_b16">;
+defm FLAT_STORE_BYTE_D16_HI        : VFLAT_Real_Base_gfx12<0x024, "flat_store_d16_hi_b8">;
+defm FLAT_STORE_SHORT_D16_HI       : VFLAT_Real_Base_gfx12<0x025, "flat_store_d16_hi_b16">;
+defm FLAT_ATOMIC_SWAP              : VFLAT_Real_Atomics_gfx12<0x033, "flat_atomic_swap_b32">;
+defm FLAT_ATOMIC_CMPSWAP           : VFLAT_Real_Atomics_gfx12<0x034, "flat_atomic_cmpswap_b32">;
+defm FLAT_ATOMIC_ADD               : VFLAT_Real_Atomics_gfx12<0x035, "flat_atomic_add_u32">;
+defm FLAT_ATOMIC_SUB               : VFLAT_Real_Atomics_gfx12<0x036, "flat_atomic_sub_u32">;
+defm FLAT_ATOMIC_CSUB_U32          : VFLAT_Real_Atomics_gfx12<0x037, "flat_atomic_sub_clamp_u32">;
+defm FLAT_ATOMIC_SMIN              : VFLAT_Real_Atomics_gfx12<0x038, "flat_atomic_min_i32">;
+defm FLAT_ATOMIC_UMIN              : VFLAT_Real_Atomics_gfx12<0x039, "flat_atomic_min_u32">;
+defm FLAT_ATOMIC_SMAX              : VFLAT_Real_Atomics_gfx12<0x03a, "flat_atomic_max_i32">;
+defm FLAT_ATOMIC_UMAX              : VFLAT_Real_Atomics_gfx12<0x03b, "flat_atomic_max_u32">;
+defm FLAT_ATOMIC_AND               : VFLAT_Real_Atomics_gfx12<0x03c, "flat_atomic_and_b32">;
+defm FLAT_ATOMIC_OR                : VFLAT_Real_Atomics_gfx12<0x03d, "flat_atomic_or_b32">;
+defm FLAT_ATOMIC_XOR               : VFLAT_Real_Atomics_gfx12<0x03e, "flat_atomic_xor_b32">;
+defm FLAT_ATOMIC_INC               : VFLAT_Real_Atomics_gfx12<0x03f, "flat_atomic_inc_u32">;
+defm FLAT_ATOMIC_DEC               : VFLAT_Real_Atomics_gfx12<0x040, "flat_atomic_dec_u32">;
+defm FLAT_ATOMIC_SWAP_X2           : VFLAT_Real_Atomics_gfx12<0x041, "flat_atomic_swap_b64">;
+defm FLAT_ATOMIC_CMPSWAP_X2        : VFLAT_Real_Atomics_gfx12<0x042, "flat_atomic_cmpswap_b64">;
+defm FLAT_ATOMIC_ADD_X2            : VFLAT_Real_Atomics_gfx12<0x043, "flat_atomic_add_u64">;
+defm FLAT_ATOMIC_SUB_X2            : VFLAT_Real_Atomics_gfx12<0x044, "flat_atomic_sub_u64">;
+defm FLAT_ATOMIC_SMIN_X2           : VFLAT_Real_Atomics_gfx12<0x045, "flat_atomic_min_i64">;
+defm FLAT_ATOMIC_UMIN_X2           : VFLAT_Real_Atomics_gfx12<0x046, "flat_atomic_min_u64">;
+defm FLAT_ATOMIC_SMAX_X2           : VFLAT_Real_Atomics_gfx12<0x047, "flat_atomic_max_i64">;
+defm FLAT_ATOMIC_UMAX_X2           : VFLAT_Real_Atomics_gfx12<0x048, "flat_atomic_max_u64">;
+defm FLAT_ATOMIC_AND_X2            : VFLAT_Real_Atomics_gfx12<0x049, "flat_atomic_and_b64">;
+defm FLAT_ATOMIC_OR_X2             : VFLAT_Real_Atomics_gfx12<0x04a, "flat_atomic_or_b64">;
+defm FLAT_ATOMIC_XOR_X2            : VFLAT_Real_Atomics_gfx12<0x04b, "flat_atomic_xor_b64">;
+defm FLAT_ATOMIC_INC_X2            : VFLAT_Real_Atomics_gfx12<0x04c, "flat_atomic_inc_u64">;
+defm FLAT_ATOMIC_DEC_X2            : VFLAT_Real_Atomics_gfx12<0x04d, "flat_atomic_dec_u64">;
+defm FLAT_ATOMIC_COND_SUB_U32      : VFLAT_Real_Atomics_gfx12<0x050>;
+defm FLAT_ATOMIC_FMIN              : VFLAT_Real_Atomics_gfx12<0x051, "flat_atomic_min_num_f32", "flat_atomic_min_f32">;
+defm FLAT_ATOMIC_FMAX              : VFLAT_Real_Atomics_gfx12<0x052, "flat_atomic_max_num_f32", "flat_atomic_max_f32">;
 defm FLAT_ATOMIC_ADD_F32           : VFLAT_Real_Atomics_gfx12<0x056>;
 defm FLAT_ATOMIC_PK_ADD_F16        : VFLAT_Real_Atomics_gfx12<0x059>;
 defm FLAT_ATOMIC_PK_ADD_BF16       : VFLAT_Real_Atomics_gfx12<0x05a>;
 
 // ENC_VGLOBAL.
-defm GLOBAL_LOAD_U8                : VGLOBAL_Real_AllAddr_gfx12<0x010, "GLOBAL_LOAD_UBYTE", "global_load_u8", true>;
-defm GLOBAL_LOAD_I8                : VGLOBAL_Real_AllAddr_gfx12<0x011, "GLOBAL_LOAD_SBYTE", "global_load_i8", true>;
-defm GLOBAL_LOAD_U16               : VGLOBAL_Real_AllAddr_gfx12<0x012, "GLOBAL_LOAD_USHORT", "global_load_u16", true>;
-defm GLOBAL_LOAD_I16               : VGLOBAL_Real_AllAddr_gfx12<0x013, "GLOBAL_LOAD_SSHORT", "global_load_i16", true>;
-defm GLOBAL_LOAD_B32               : VGLOBAL_Real_AllAddr_gfx12<0x014, "GLOBAL_LOAD_DWORD", "global_load_b32", true>;
-defm GLOBAL_LOAD_B64               : VGLOBAL_Real_AllAddr_gfx12<0x015, "GLOBAL_LOAD_DWORDX2", "global_load_b64", true>;
-defm GLOBAL_LOAD_B96               : VGLOBAL_Real_AllAddr_gfx12<0x016, "GLOBAL_LOAD_DWORDX3", "global_load_b96", true>;
-defm GLOBAL_LOAD_B128              : VGLOBAL_Real_AllAddr_gfx12<0x017, "GLOBAL_LOAD_DWORDX4", "global_load_b128", true>;
-defm GLOBAL_STORE_B8               : VGLOBAL_Real_AllAddr_gfx12<0x018, "GLOBAL_STORE_BYTE", "global_store_b8", true>;
-defm GLOBAL_STORE_B16              : VGLOBAL_Real_AllAddr_gfx12<0x019, "GLOBAL_STORE_SHORT", "global_store_b16", true>;
-defm GLOBAL_STORE_B32              : VGLOBAL_Real_AllAddr_gfx12<0x01a, "GLOBAL_STORE_DWORD", "global_store_b32", true>;
-defm GLOBAL_STORE_B64              : VGLOBAL_Real_AllAddr_gfx12<0x01b, "GLOBAL_STORE_DWORDX2", "global_store_b64", true>;
-defm GLOBAL_STORE_B96              : VGLOBAL_Real_AllAddr_gfx12<0x01c, "GLOBAL_STORE_DWORDX3", "global_store_b96", true>;
-defm GLOBAL_STORE_B128             : VGLOBAL_Real_AllAddr_gfx12<0x01d, "GLOBAL_STORE_DWORDX4", "global_store_b128", true>;
-defm GLOBAL_LOAD_D16_U8            : VGLOBAL_Real_AllAddr_gfx12<0x01e, "GLOBAL_LOAD_UBYTE_D16">;
-defm GLOBAL_LOAD_D16_I8            : VGLOBAL_Real_AllAddr_gfx12<0x01f, "GLOBAL_LOAD_SBYTE_D16">;
-defm GLOBAL_LOAD_D16_B16           : VGLOBAL_Real_AllAddr_gfx12<0x020, "GLOBAL_LOAD_SHORT_D16">;
-defm GLOBAL_LOAD_D16_HI_U8         : VGLOBAL_Real_AllAddr_gfx12<0x021, "GLOBAL_LOAD_UBYTE_D16_HI">;
-defm GLOBAL_LOAD_D16_HI_I8         : VGLOBAL_Real_AllAddr_gfx12<0x022, "GLOBAL_LOAD_SBYTE_D16_HI">;
-defm GLOBAL_LOAD_D16_HI_B16        : VGLOBAL_Real_AllAddr_gfx12<0x023, "GLOBAL_LOAD_SHORT_D16_HI">;
-defm GLOBAL_STORE_D16_HI_B8        : VGLOBAL_Real_AllAddr_gfx12<0x024, "GLOBAL_STORE_BYTE_D16_HI">;
-defm GLOBAL_STORE_D16_HI_B16       : VGLOBAL_Real_AllAddr_gfx12<0x025, "GLOBAL_STORE_SHORT_D16_HI">;
-defm GLOBAL_LOAD_ADDTID_B32        : VGLOBAL_Real_AllAddr_gfx12<0x028, "GLOBAL_LOAD_DWORD_ADDTID">;
-defm GLOBAL_STORE_ADDTID_B32       : VGLOBAL_Real_AllAddr_gfx12<0x029, "GLOBAL_STORE_DWORD_ADDTID">;
-
-defm GLOBAL_ATOMIC_SWAP_B32        : VGLOBAL_Real_Atomics_gfx12<0x033, "GLOBAL_ATOMIC_SWAP", "global_atomic_swap_b32", true>;
-defm GLOBAL_ATOMIC_CMPSWAP_B32     : VGLOBAL_Real_Atomics_gfx12<0x034, "GLOBAL_ATOMIC_CMPSWAP", "global_atomic_cmpswap_b32", true>;
-defm GLOBAL_ATOMIC_ADD_U32         : VGLOBAL_Real_Atomics_gfx12<0x035, "GLOBAL_ATOMIC_ADD", "global_atomic_add_u32", true>;
-defm GLOBAL_ATOMIC_SUB_U32         : VGLOBAL_Real_Atomics_gfx12<0x036, "GLOBAL_ATOMIC_SUB", "global_atomic_sub_u32", true>;
-defm GLOBAL_ATOMIC_SUB_CLAMP_U32   : VGLOBAL_Real_Atomics_gfx12<0x037, "GLOBAL_ATOMIC_CSUB", "global_atomic_sub_clamp_u32", true, "global_atomic_csub_u32">;
-defm GLOBAL_ATOMIC_MIN_I32         : VGLOBAL_Real_Atomics_gfx12<0x038, "GLOBAL_ATOMIC_SMIN", "global_atomic_min_i32", true>;
-defm GLOBAL_ATOMIC_MIN_U32         : VGLOBAL_Real_Atomics_gfx12<0x039, "GLOBAL_ATOMIC_UMIN", "global_atomic_min_u32", true>;
-defm GLOBAL_ATOMIC_MAX_I32         : VGLOBAL_Real_Atomics_gfx12<0x03a, "GLOBAL_ATOMIC_SMAX", "global_atomic_max_i32", true>;
-defm GLOBAL_ATOMIC_MAX_U32         : VGLOBAL_Real_Atomics_gfx12<0x03b, "GLOBAL_ATOMIC_UMAX", "global_atomic_max_u32", true>;
-defm GLOBAL_ATOMIC_AND_B32         : VGLOBAL_Real_Atomics_gfx12<0x03c, "GLOBAL_ATOMIC_AND", "global_atomic_and_b32", true>;
-defm GLOBAL_ATOMIC_OR_B32          : VGLOBAL_Real_Atomics_gfx12<0x03d, "GLOBAL_ATOMIC_OR", "global_atomic_or_b32", true>;
-defm GLOBAL_ATOMIC_XOR_B32         : VGLOBAL_Real_Atomics_gfx12<0x03e, "GLOBAL_ATOMIC_XOR", "global_atomic_xor_b32", true>;
-defm GLOBAL_ATOMIC_INC_U32         : VGLOBAL_Real_Atomics_gfx12<0x03f, "GLOBAL_ATOMIC_INC", "global_atomic_inc_u32", true>;
-defm GLOBAL_ATOMIC_DEC_U32         : VGLOBAL_Real_Atomics_gfx12<0x040, "GLOBAL_ATOMIC_DEC", "global_atomic_dec_u32", true>;
-defm GLOBAL_ATOMIC_SWAP_B64        : VGLOBAL_Real_Atomics_gfx12<0x041, "GLOBAL_ATOMIC_SWAP_X2", "global_atomic_swap_b64", true>;
-defm GLOBAL_ATOMIC_CMPSWAP_B64     : VGLOBAL_Real_Atomics_gfx12<0x042, "GLOBAL_ATOMIC_CMPSWAP_X2", "global_atomic_cmpswap_b64", true>;
-defm GLOBAL_ATOMIC_ADD_U64         : VGLOBAL_Real_Atomics_gfx12<0x043, "GLOBAL_ATOMIC_ADD_X2", "global_atomic_add_u64", true>;
-defm GLOBAL_ATOMIC_SUB_U64         : VGLOBAL_Real_Atomics_gfx12<0x044, "GLOBAL_ATOMIC_SUB_X2", "global_atomic_sub_u64", true>;
-defm GLOBAL_ATOMIC_MIN_I64         : VGLOBAL_Real_Atomics_gfx12<0x045, "GLOBAL_ATOMIC_SMIN_X2", "global_atomic_min_i64", true>;
-defm GLOBAL_ATOMIC_MIN_U64         : VGLOBAL_Real_Atomics_gfx12<0x046, "GLOBAL_ATOMIC_UMIN_X2", "global_atomic_min_u64", true>;
-defm GLOBAL_ATOMIC_MAX_I64         : VGLOBAL_Real_Atomics_gfx12<0x047, "GLOBAL_ATOMIC_SMAX_X2", "global_atomic_max_i64", true>;
-defm GLOBAL_ATOMIC_MAX_U64         : VGLOBAL_Real_Atomics_gfx12<0x048, "GLOBAL_ATOMIC_UMAX_X2", "global_atomic_max_u64", true>;
-defm GLOBAL_ATOMIC_AND_B64         : VGLOBAL_Real_Atomics_gfx12<0x049, "GLOBAL_ATOMIC_AND_X2", "global_atomic_and_b64", true>;
-defm GLOBAL_ATOMIC_OR_B64          : VGLOBAL_Real_Atomics_gfx12<0x04a, "GLOBAL_ATOMIC_OR_X2", "global_atomic_or_b64", true>;
-defm GLOBAL_ATOMIC_XOR_B64         : VGLOBAL_Real_Atomics_gfx12<0x04b, "GLOBAL_ATOMIC_XOR_X2", "global_atomic_xor_b64", true>;
-defm GLOBAL_ATOMIC_INC_U64         : VGLOBAL_Real_Atomics_gfx12<0x04c, "GLOBAL_ATOMIC_INC_X2", "global_atomic_inc_u64", true>;
-defm GLOBAL_ATOMIC_DEC_U64         : VGLOBAL_Real_Atomics_gfx12<0x04d, "GLOBAL_ATOMIC_DEC_X2", "global_atomic_dec_u64", true>;
-defm GLOBAL_ATOMIC_COND_SUB_U32    : VGLOBAL_Real_Atomics_gfx12<0x050, "GLOBAL_ATOMIC_COND_SUB_U32", "global_atomic_cond_sub_u32">;
-defm GLOBAL_ATOMIC_MIN_NUM_F32     : VGLOBAL_Real_Atomics_gfx12<0x051, "GLOBAL_ATOMIC_FMIN", "global_atomic_min_num_f32", true, "global_atomic_min_f32">;
-defm GLOBAL_ATOMIC_MAX_NUM_F32     : VGLOBAL_Real_Atomics_gfx12<0x052, "GLOBAL_ATOMIC_FMAX", "global_atomic_max_num_f32", true, "global_atomic_max_f32">;
+defm GLOBAL_LOAD_UBYTE             : VGLOBAL_Real_AllAddr_gfx12<0x010, "global_load_u8">;
+defm GLOBAL_LOAD_SBYTE             : VGLOBAL_Real_AllAddr_gfx12<0x011, "global_load_i8">;
+defm GLOBAL_LOAD_USHORT            : VGLOBAL_Real_AllAddr_gfx12<0x012, "global_load_u16">;
+defm GLOBAL_LOAD_SSHORT            : VGLOBAL_Real_AllAddr_gfx12<0x013, "global_load_i16">;
+defm GLOBAL_LOAD_DWORD             : VGLOBAL_Real_AllAddr_gfx12<0x014, "global_load_b32">;
+defm GLOBAL_LOAD_DWORDX2           : VGLOBAL_Real_AllAddr_gfx12<0x015, "global_load_b64">;
+defm GLOBAL_LOAD_DWORDX3           : VGLOBAL_Real_AllAddr_gfx12<0x016, "global_load_b96">;
+defm GLOBAL_LOAD_DWORDX4           : VGLOBAL_Real_AllAddr_gfx12<0x017, "global_load_b128">;
+defm GLOBAL_STORE_BYTE             : VGLOBAL_Real_AllAddr_gfx12<0x018, "global_store_b8">;
+defm GLOBAL_STORE_SHORT            : VGLOBAL_Real_AllAddr_gfx12<0x019, "global_store_b16">;
+defm GLOBAL_STORE_DWORD            : VGLOBAL_Real_AllAddr_gfx12<0x01a, "global_store_b32">;
+defm GLOBAL_STORE_DWORDX2          : VGLOBAL_Real_AllAddr_gfx12<0x01b, "global_store_b64">;
+defm GLOBAL_STORE_DWORDX3          : VGLOBAL_Real_AllAddr_gfx12<0x01c, "global_store_b96">;
+defm GLOBAL_STORE_DWORDX4          : VGLOBAL_Real_AllAddr_gfx12<0x01d, "global_store_b128">;
+defm GLOBAL_LOAD_UBYTE_D16         : VGLOBAL_Real_AllAddr_gfx12<0x01e, "global_load_d16_u8">;
+defm GLOBAL_LOAD_SBYTE_D16         : VGLOBAL_Real_AllAddr_gfx12<0x01f, "global_load_d16_i8">;
+defm GLOBAL_LOAD_SHORT_D16         : VGLOBAL_Real_AllAddr_gfx12<0x020, "global_load_d16_b16">;
+defm GLOBAL_LOAD_UBYTE_D16_HI      : VGLOBAL_Real_AllAddr_gfx12<0x021, "global_load_d16_hi_u8">;
+defm GLOBAL_LOAD_SBYTE_D16_HI      : VGLOBAL_Real_AllAddr_gfx12<0x022, "global_load_d16_hi_i8">;
+defm GLOBAL_LOAD_SHORT_D16_HI      : VGLOBAL_Real_AllAddr_gfx12<0x023, "global_load_d16_hi_b16">;
+defm GLOBAL_STORE_BYTE_D16_HI      : VGLOBAL_Real_AllAddr_gfx12<0x024, "global_store_d16_hi_b8">;
+defm GLOBAL_STORE_SHORT_D16_HI     : VGLOBAL_Real_AllAddr_gfx12<0x025, "global_store_d16_hi_b16">;
+defm GLOBAL_LOAD_DWORD_ADDTID      : VGLOBAL_Real_AllAddr_gfx12<0x028, "global_load_addtid_b32">;
+defm GLOBAL_STORE_DWORD_ADDTID     : VGLOBAL_Real_AllAddr_gfx12<0x029, "global_store_addtid_b32">;
+
+defm GLOBAL_ATOMIC_SWAP            : VGLOBAL_Real_Atomics_gfx12<0x033, "global_atomic_swap_b32">;
+defm GLOBAL_ATOMIC_CMPSWAP         : VGLOBAL_Real_Atomics_gfx12<0x034, "global_atomic_cmpswap_b32">;
+defm GLOBAL_ATOMIC_ADD             : VGLOBAL_Real_Atomics_gfx12<0x035, "global_atomic_add_u32">;
+defm GLOBAL_ATOMIC_SUB             : VGLOBAL_Real_Atomics_gfx12<0x036, "global_atomic_sub_u32">;
+defm GLOBAL_ATOMIC_CSUB            : VGLOBAL_Real_Atomics_gfx12<0x037, "global_atomic_sub_clamp_u32", "global_atomic_csub_u32">;
+defm GLOBAL_ATOMIC_SMIN            : VGLOBAL_Real_Atomics_gfx12<0x038, "global_atomic_min_i32">;
+defm GLOBAL_ATOMIC_UMIN            : VGLOBAL_Real_Atomics_gfx12<0x039, "global_atomic_min_u32">;
+defm GLOBAL_ATOMIC_SMAX            : VGLOBAL_Real_Atomics_gfx12<0x03a, "global_atomic_max_i32">;
+defm GLOBAL_ATOMIC_UMAX            : VGLOBAL_Real_Atomics_gfx12<0x03b, "global_atomic_max_u32">;
+defm GLOBAL_ATOMIC_AND             : VGLOBAL_Real_Atomics_gfx12<0x03c, "global_atomic_and_b32">;
+defm GLOBAL_ATOMIC_OR              : VGLOBAL_Real_Atomics_gfx12<0x03d, "global_atomic_or_b32">;
+defm GLOBAL_ATOMIC_XOR             : VGLOBAL_Real_Atomics_gfx12<0x03e, "global_atomic_xor_b32">;
+defm GLOBAL_ATOMIC_INC             : VGLOBAL_Real_Atomics_gfx12<0x03f, "global_atomic_inc_u32">;
+defm GLOBAL_ATOMIC_DEC             : VGLOBAL_Real_Atomics_gfx12<0x040, "global_atomic_dec_u32">;
+defm GLOBAL_ATOMIC_SWAP_X2         : VGLOBAL_Real_Atomics_gfx12<0x041, "global_atomic_swap_b64">;
+defm GLOBAL_ATOMIC_CMPSWAP_X2      : VGLOBAL_Real_Atomics_gfx12<0x042, "global_atomic_cmpswap_b64">;
+defm GLOBAL_ATOMIC_ADD_X2          : VGLOBAL_Real_Atomics_gfx12<0x043, "global_atomic_add_u64">;
+defm GLOBAL_ATOMIC_SUB_X2          : VGLOBAL_Real_Atomics_gfx12<0x044, "global_atomic_sub_u64">;
+defm GLOBAL_ATOMIC_SMIN_X2         : VGLOBAL_Real_Atomics_gfx12<0x045, "global_atomic_min_i64">;
+defm GLOBAL_ATOMIC_UMIN_X2         : VGLOBAL_Real_Atomics_gfx12<0x046, "global_atomic_min_u64">;
+defm GLOBAL_ATOMIC_SMAX_X2         : VGLOBAL_Real_Atomics_gfx12<0x047, "global_atomic_max_i64">;
+defm GLOBAL_ATOMIC_UMAX_X2         : VGLOBAL_Real_Atomics_gfx12<0x048, "global_atomic_max_u64">;
+defm GLOBAL_ATOMIC_AND_X2          : VGLOBAL_Real_Atomics_gfx12<0x049, "global_atomic_and_b64">;
+defm GLOBAL_ATOMIC_OR_X2           : VGLOBAL_Real_Atomics_gfx12<0x04a, "global_atomic_or_b64">;
+defm GLOBAL_ATOMIC_XOR_X2          : VGLOBAL_Real_Atomics_gfx12<0x04b, "global_atomic_xor_b64">;
+defm GLOBAL_ATOMIC_INC_X2          : VGLOBAL_Real_Atomics_gfx12<0x04c, "global_atomic_inc_u64">;
+defm GLOBAL_ATOMIC_DEC_X2          : VGLOBAL_Real_Atomics_gfx12<0x04d, "global_atomic_dec_u64">;
+defm GLOBAL_ATOMIC_COND_SUB_U32    : VGLOBAL_Real_Atomics_gfx12<0x050>;
+defm GLOBAL_ATOMIC_FMIN            : VGLOBAL_Real_Atomics_gfx12<0x051, "global_atomic_min_num_f32", "global_atomic_min_f32">;
+defm GLOBAL_ATOMIC_FMAX            : VGLOBAL_Real_Atomics_gfx12<0x052, "global_atomic_max_num_f32", "global_atomic_max_f32">;
 defm GLOBAL_ATOMIC_ADD_F32         : VGLOBAL_Real_Atomics_gfx12<0x056>;
 
-let DecoderNamespace = "GFX12" in {
-  defm GLOBAL_LOAD_TR_B128_w32     : VGLOBAL_Real_AllAddr_gfx12<0x057, "GLOBAL_LOAD_TR_B128_w32", "global_load_tr_b128">;
-  defm GLOBAL_LOAD_TR_B64_w32      : VGLOBAL_Real_AllAddr_gfx12<0x058, "GLOBAL_LOAD_TR_B64_w32", "global_load_tr_b64">;
-}
+defm GLOBAL_LOAD_TR_B128_w32       : VGLOBAL_Real_AllAddr_gfx12<0x057, "global_load_tr_b128">;
+defm GLOBAL_LOAD_TR_B64_w32        : VGLOBAL_Real_AllAddr_gfx12<0x058, "global_load_tr_b64">;
 
-let DecoderNamespace = "GFX12W64" in {
-  defm GLOBAL_LOAD_TR_B128_w64     : VGLOBAL_Real_AllAddr_gfx12<0x057, "GLOBAL_LOAD_TR_B128_w64", "global_load_tr_b128">;
-  defm GLOBAL_LOAD_TR_B64_w64      : VGLOBAL_Real_AllAddr_gfx12<0x058, "GLOBAL_LOAD_TR_B64_w64", "global_load_tr_b64">;
-}
+defm GLOBAL_LOAD_TR_B128_w64       : VGLOBAL_Real_AllAddr_gfx12_w64<0x057, "global_load_tr_b128">;
+defm GLOBAL_LOAD_TR_B64_w64        : VGLOBAL_Real_AllAddr_gfx12_w64<0x058, "global_load_tr_b64">;
 
 defm GLOBAL_ATOMIC_ORDERED_ADD_B64 : VGLOBAL_Real_Atomics_gfx12<0x073>;
 defm GLOBAL_ATOMIC_PK_ADD_F16      : VGLOBAL_Real_Atomics_gfx12<0x059>;
@@ -2765,25 +2735,25 @@ defm GLOBAL_WB                     : VFLAT_Real_Base_gfx12<0x02c>;
 defm GLOBAL_WBINV                  : VFLAT_Real_Base_gfx12<0x04f>;
 
 // ENC_VSCRATCH.
-defm SCRATCH_LOAD_U8               : VSCRATCH_Real_AllAddr_gfx12<0x10, "SCRATCH_LOAD_UBYTE", "scratch_load_u8", true>;
-defm SCRATCH_LOAD_I8               : VSCRATCH_Real_AllAddr_gfx12<0x11, "SCRATCH_LOAD_SBYTE", "scratch_load_i8", true>;
-defm SCRATCH_LOAD_U16              : VSCRATCH_Real_AllAddr_gfx12<0x12, "SCRATCH_LOAD_USHORT", "scratch_load_u16", true>;
-defm SCRATCH_LOAD_I16              : VSCRATCH_Real_AllAddr_gfx12<0x13, "SCRATCH_LOAD_SSHORT", "scratch_load_i16", true>;
-defm SCRATCH_LOAD_B32              : VSCRATCH_Real_AllAddr_gfx12<0x14, "SCRATCH_LOAD_DWORD", "scratch_load_b32", true>;
-defm SCRATCH_LOAD_B64              : VSCRATCH_Real_AllAddr_gfx12<0x15, "SCRATCH_LOAD_DWORDX2", "scratch_load_b64", true>;
-defm SCRATCH_LOAD_B96              : VSCRATCH_Real_AllAddr_gfx12<0x16, "SCRATCH_LOAD_DWORDX3", "scratch_load_b96", true>;
-defm SCRATCH_LOAD_B128             : VSCRATCH_Real_AllAddr_gfx12<0x17, "SCRATCH_LOAD_DWORDX4", "scratch_load_b128", true>;
-defm SCRATCH_STORE_B8              : VSCRATCH_Real_AllAddr_gfx12<0x18, "SCRATCH_STORE_BYTE", "scratch_store_b8", true>;
-defm SCRATCH_STORE_B16             : VSCRATCH_Real_AllAddr_gfx12<0x19, "SCRATCH_STORE_SHORT", "scratch_store_b16", true>;
-defm SCRATCH_STORE_B32             : VSCRATCH_Real_AllAddr_gfx12<0x1a, "SCRATCH_STORE_DWORD", "scratch_store_b32", true>;
-defm SCRATCH_STORE_B64             : VSCRATCH_Real_AllAddr_gfx12<0x1b, "SCRATCH_STORE_DWORDX2", "scratch_store_b64", true>;
-defm SCRATCH_STORE_B96             : VSCRATCH_Real_AllAddr_gfx12<0x1c, "SCRATCH_STORE_DWORDX3", "scratch_store_b96", true>;
-defm SCRATCH_STORE_B128            : VSCRATCH_Real_AllAddr_gfx12<0x1d, "SCRATCH_STORE_DWORDX4", "scratch_store_b128", true>;
-defm SCRATCH_LOAD_D16_U8           : VSCRATCH_Real_AllAddr_gfx12<0x1e, "SCRATCH_LOAD_UBYTE_D16">;
-defm SCRATCH_LOAD_D16_I8           : VSCRATCH_Real_AllAddr_gfx12<0x1f, "SCRATCH_LOAD_SBYTE_D16">;
-defm SCRATCH_LOAD_D16_B16          : VSCRATCH_Real_AllAddr_gfx12<0x20, "SCRATCH_LOAD_SHORT_D16">;
-defm SCRATCH_LOAD_D16_HI_U8        : VSCRATCH_Real_AllAddr_gfx12<0x21, "SCRATCH_LOAD_UBYTE_D16_HI">;
-defm SCRATCH_LOAD_D16_HI_I8        : VSCRATCH_Real_AllAddr_gfx12<0x22, "SCRATCH_LOAD_SBYTE_D16_HI">;
-defm SCRATCH_LOAD_D16_HI_B16       : VSCRATCH_Real_AllAddr_gfx12<0x23, "SCRATCH_LOAD_SHORT_D16_HI">;
-defm SCRATCH_STORE_D16_HI_B8       : VSCRATCH_Real_AllAddr_gfx12<0x24, "SCRATCH_STORE_BYTE_D16_HI">;
-defm SCRATCH_STORE_D16_HI_B16      : VSCRATCH_Real_AllAddr_gfx12<0x25, "SCRATCH_STORE_SHORT_D16_HI">;
+defm SCRATCH_LOAD_UBYTE            : VSCRATCH_Real_AllAddr_gfx12<0x10, "scratch_load_u8">;
+defm SCRATCH_LOAD_SBYTE            : VSCRATCH_Real_AllAddr_gfx12<0x11, "scratch_load_i8">;
+defm SCRATCH_LOAD_USHORT           : VSCRATCH_Real_AllAddr_gfx12<0x12, "scratch_load_u16">;
+defm SCRATCH_LOAD_SSHORT           : VSCRATCH_Real_AllAddr_gfx12<0x13, "scratch_load_i16">;
+defm SCRATCH_LOAD_DWORD            : VSCRATCH_Real_AllAddr_gfx12<0x14, "scratch_load_b32">;
+defm SCRATCH_LOAD_DWORDX2          : VSCRATCH_Real_AllAddr_gfx12<0x15, "scratch_load_b64">;
+defm SCRATCH_LOAD_DWORDX3          : VSCRATCH_Real_AllAddr_gfx12<0x16, "scratch_load_b96">;
+defm SCRATCH_LOAD_DWORDX4          : VSCRATCH_Real_AllAddr_gfx12<0x17, "scratch_load_b128">;
+defm SCRATCH_STORE_BYTE            : VSCRATCH_Real_AllAddr_gfx12<0x18, "scratch_store_b8">;
+defm SCRATCH_STORE_SHORT           : VSCRATCH_Real_AllAddr_gfx12<0x19, "scratch_store_b16">;
+defm SCRATCH_STORE_DWORD           : VSCRATCH_Real_AllAddr_gfx12<0x1a, "scratch_store_b32">;
+defm SCRATCH_STORE_DWORDX2         : VSCRATCH_Real_AllAddr_gfx12<0x1b, "scratch_store_b64">;
+defm SCRATCH_STORE_DWORDX3         : VSCRATCH_Real_AllAddr_gfx12<0x1c, "scratch_store_b96">;
+defm SCRATCH_STORE_DWORDX4         : VSCRATCH_Real_AllAddr_gfx12<0x1d, "scratch_store_b128">;
+defm SCRATCH_LOAD_UBYTE_D16        : VSCRATCH_Real_AllAddr_gfx12<0x1e, "scratch_load_d16_u8">;
+defm SCRATCH_LOAD_SBYTE_D16        : VSCRATCH_Real_AllAddr_gfx12<0x1f, "scratch_load_d16_i8">;
+defm SCRATCH_LOAD_SHORT_D16        : VSCRATCH_Real_AllAddr_gfx12<0x20, "scratch_load_d16_b16">;
+defm SCRATCH_LOAD_UBYTE_D16_HI     : VSCRATCH_Real_AllAddr_gfx12<0x21, "scratch_load_d16_hi_u8">;
+defm SCRATCH_LOAD_SBYTE_D16_HI     : VSCRATCH_Real_AllAddr_gfx12<0x22, "scratch_load_d16_hi_i8">;
+defm SCRATCH_LOAD_SHORT_D16_HI     : VSCRATCH_Real_AllAddr_gfx12<0x23, "scratch_load_d16_hi_b16">;
+defm SCRATCH_STORE_BYTE_D16_HI     : VSCRATCH_Real_AllAddr_gfx12<0x24, "scratch_store_d16_hi_b8">;
+defm SCRATCH_STORE_SHORT_D16_HI    : VSCRATCH_Real_AllAddr_gfx12<0x25, "scratch_store_d16_hi_b16">;
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp
new file mode 100644
index 00000000000000..4578c33d92dce1
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp
@@ -0,0 +1,115 @@
+//===- AMDGPUMCExpr.cpp - AMDGPU specific MC expression classes -----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUMCExpr.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/Allocator.h"
+#include "llvm/Support/raw_ostream.h"
+#include <optional>
+
+using namespace llvm;
+
+AMDGPUVariadicMCExpr::AMDGPUVariadicMCExpr(VariadicKind Kind,
+                                           ArrayRef<const MCExpr *> Args,
+                                           MCContext &Ctx)
+    : Kind(Kind), Ctx(Ctx) {
+  assert(Args.size() >= 1 && "Needs a minimum of one expression.");
+  assert(Kind != AGVK_None &&
+         "Cannot construct AMDGPUVariadicMCExpr of kind none.");
+
+  // Allocating the variadic arguments through the same allocation mechanism
+  // that the object itself is allocated with so they end up in the same memory.
+  //
+  // Will result in an asan failure if allocated on the heap through standard
+  // allocation (e.g., through SmallVector's grow).
+  RawArgs = static_cast<const MCExpr **>(
+      Ctx.allocate(sizeof(const MCExpr *) * Args.size()));
+  std::uninitialized_copy(Args.begin(), Args.end(), RawArgs);
+  this->Args = ArrayRef<const MCExpr *>(RawArgs, Args.size());
+}
+
+AMDGPUVariadicMCExpr::~AMDGPUVariadicMCExpr() { Ctx.deallocate(RawArgs); }
+
+const AMDGPUVariadicMCExpr *
+AMDGPUVariadicMCExpr::create(VariadicKind Kind, ArrayRef<const MCExpr *> Args,
+                             MCContext &Ctx) {
+  return new (Ctx) AMDGPUVariadicMCExpr(Kind, Args, Ctx);
+}
+
+const MCExpr *AMDGPUVariadicMCExpr::getSubExpr(size_t Index) const {
+  assert(Index < Args.size() &&
+         "Indexing out of bounds AMDGPUVariadicMCExpr sub-expr");
+  return Args[Index];
+}
+
+void AMDGPUVariadicMCExpr::printImpl(raw_ostream &OS,
+                                     const MCAsmInfo *MAI) const {
+  switch (Kind) {
+  default:
+    llvm_unreachable("Unknown AMDGPUVariadicMCExpr kind.");
+  case AGVK_Or:
+    OS << "or(";
+    break;
+  case AGVK_Max:
+    OS << "max(";
+    break;
+  }
+  for (auto It = Args.begin(); It != Args.end(); ++It) {
+    (*It)->print(OS, MAI, /*InParens=*/false);
+    if ((It + 1) != Args.end())
+      OS << ", ";
+  }
+  OS << ')';
+}
+
+static int64_t op(AMDGPUVariadicMCExpr::VariadicKind Kind, int64_t Arg1,
+                  int64_t Arg2) {
+  switch (Kind) {
+  default:
+    llvm_unreachable("Unknown AMDGPUVariadicMCExpr kind.");
+  case AMDGPUVariadicMCExpr::AGVK_Max:
+    return std::max(Arg1, Arg2);
+  case AMDGPUVariadicMCExpr::AGVK_Or:
+    return Arg1 | Arg2;
+  }
+}
+
+bool AMDGPUVariadicMCExpr::evaluateAsRelocatableImpl(
+    MCValue &Res, const MCAsmLayout *Layout, const MCFixup *Fixup) const {
+  std::optional<int64_t> Total;
+
+  for (const MCExpr *Arg : Args) {
+    MCValue ArgRes;
+    if (!Arg->evaluateAsRelocatable(ArgRes, Layout, Fixup) ||
+        !ArgRes.isAbsolute())
+      return false;
+
+    if (!Total.has_value())
+      Total = ArgRes.getConstant();
+    Total = op(Kind, *Total, ArgRes.getConstant());
+  }
+
+  Res = MCValue::get(*Total);
+  return true;
+}
+
+void AMDGPUVariadicMCExpr::visitUsedExpr(MCStreamer &Streamer) const {
+  for (const MCExpr *Arg : Args)
+    Streamer.visitUsedExpr(*Arg);
+}
+
+MCFragment *AMDGPUVariadicMCExpr::findAssociatedFragment() const {
+  for (const MCExpr *Arg : Args) {
+    if (Arg->findAssociatedFragment())
+      return Arg->findAssociatedFragment();
+  }
+  return nullptr;
+}
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h
new file mode 100644
index 00000000000000..238e0dea791b24
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h
@@ -0,0 +1,72 @@
+//===- AMDGPUMCExpr.h - AMDGPU specific MC expression classes ---*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUMCEXPR_H
+#define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUMCEXPR_H
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/MC/MCExpr.h"
+
+namespace llvm {
+
+/// AMDGPU target specific variadic MCExpr operations.
+///
+/// Takes in a minimum of 1 argument to be used with an operation. The supported
+/// operations are:
+///   - (bitwise) or
+///   - max
+///
+/// \note If the 'or'/'max' operations are provided only a single argument, the
+/// operation will act as a no-op and simply resolve as the provided argument.
+///
+class AMDGPUVariadicMCExpr : public MCTargetExpr {
+public:
+  enum VariadicKind { AGVK_None, AGVK_Or, AGVK_Max };
+
+private:
+  VariadicKind Kind;
+  MCContext &Ctx;
+  const MCExpr **RawArgs;
+  ArrayRef<const MCExpr *> Args;
+
+  AMDGPUVariadicMCExpr(VariadicKind Kind, ArrayRef<const MCExpr *> Args,
+                       MCContext &Ctx);
+  ~AMDGPUVariadicMCExpr();
+
+public:
+  static const AMDGPUVariadicMCExpr *
+  create(VariadicKind Kind, ArrayRef<const MCExpr *> Args, MCContext &Ctx);
+
+  static const AMDGPUVariadicMCExpr *createOr(ArrayRef<const MCExpr *> Args,
+                                              MCContext &Ctx) {
+    return create(VariadicKind::AGVK_Or, Args, Ctx);
+  }
+
+  static const AMDGPUVariadicMCExpr *createMax(ArrayRef<const MCExpr *> Args,
+                                               MCContext &Ctx) {
+    return create(VariadicKind::AGVK_Max, Args, Ctx);
+  }
+
+  VariadicKind getKind() const { return Kind; }
+  const MCExpr *getSubExpr(size_t Index) const;
+
+  void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const override;
+  bool evaluateAsRelocatableImpl(MCValue &Res, const MCAsmLayout *Layout,
+                                 const MCFixup *Fixup) const override;
+  void visitUsedExpr(MCStreamer &Streamer) const override;
+  MCFragment *findAssociatedFragment() const override;
+  void fixELFSymbolsInTLSFixups(MCAssembler &) const override{};
+
+  static bool classof(const MCExpr *E) {
+    return E->getKind() == MCExpr::Target;
+  }
+};
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUMCEXPR_H
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/CMakeLists.txt b/llvm/lib/Target/AMDGPU/MCTargetDesc/CMakeLists.txt
index 5dc76071b0594e..0842a58f794b32 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/CMakeLists.txt
@@ -5,6 +5,7 @@ add_llvm_component_library(LLVMAMDGPUDesc
   AMDGPUInstPrinter.cpp
   AMDGPUMCAsmInfo.cpp
   AMDGPUMCCodeEmitter.cpp
+  AMDGPUMCExpr.cpp
   AMDGPUMCTargetDesc.cpp
   AMDGPUTargetStreamer.cpp
   R600InstPrinter.cpp
diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h
index 0b516bfffb9b66..1f0207ddb0ebd2 100644
--- a/llvm/lib/Target/AMDGPU/SIDefines.h
+++ b/llvm/lib/Target/AMDGPU/SIDefines.h
@@ -460,7 +460,8 @@ enum Id { // Message ID, width(4) [3:0].
   ID_RTN_GET_REALTIME = 131,
   ID_RTN_SAVE_WAVE = 132,
   ID_RTN_GET_TBA = 133,
-  ID_RTN_GET_SE_AID_ID = 134,
+  ID_RTN_GET_TBA_TO_PC = 134,
+  ID_RTN_GET_SE_AID_ID = 135,
 
   ID_MASK_PreGFX11_ = 0xF,
   ID_MASK_GFX11Plus_ = 0xFF
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index a6184c5e1e0487..27621906e4c5fd 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -485,6 +485,16 @@ class WaitcntGenerator {
   virtual AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const = 0;
 
   virtual ~WaitcntGenerator() = default;
+
+  // Create a mask value from the initializer list of wait event types.
+  static constexpr unsigned
+  eventMask(std::initializer_list<WaitEventType> Events) {
+    unsigned Mask = 0;
+    for (auto &E : Events)
+      Mask |= 1 << E;
+
+    return Mask;
+  }
 };
 
 class WaitcntGeneratorPreGFX12 : public WaitcntGenerator {
@@ -506,14 +516,12 @@ class WaitcntGeneratorPreGFX12 : public WaitcntGenerator {
     assert(ST);
 
     static const unsigned WaitEventMaskForInstPreGFX12[NUM_INST_CNTS] = {
-        (1 << VMEM_ACCESS) | (1 << VMEM_READ_ACCESS) |
-            (1 << VMEM_SAMPLER_READ_ACCESS) | (1 << VMEM_BVH_READ_ACCESS),
-        (1 << SMEM_ACCESS) | (1 << LDS_ACCESS) | (1 << GDS_ACCESS) |
-            (1 << SQ_MESSAGE),
-        (1 << EXP_GPR_LOCK) | (1 << GDS_GPR_LOCK) | (1 << VMW_GPR_LOCK) |
-            (1 << EXP_PARAM_ACCESS) | (1 << EXP_POS_ACCESS) |
-            (1 << EXP_LDS_ACCESS),
-        (1 << VMEM_WRITE_ACCESS) | (1 << SCRATCH_WRITE_ACCESS),
+        eventMask({VMEM_ACCESS, VMEM_READ_ACCESS, VMEM_SAMPLER_READ_ACCESS,
+                   VMEM_BVH_READ_ACCESS}),
+        eventMask({SMEM_ACCESS, LDS_ACCESS, GDS_ACCESS, SQ_MESSAGE}),
+        eventMask({EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK, EXP_PARAM_ACCESS,
+                   EXP_POS_ACCESS, EXP_LDS_ACCESS}),
+        eventMask({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
         0,
         0,
         0};
@@ -543,15 +551,14 @@ class WaitcntGeneratorGFX12Plus : public WaitcntGenerator {
     assert(ST);
 
     static const unsigned WaitEventMaskForInstGFX12Plus[NUM_INST_CNTS] = {
-        (1 << VMEM_ACCESS) | (1 << VMEM_READ_ACCESS),
-        (1 << LDS_ACCESS) | (1 << GDS_ACCESS),
-        (1 << EXP_GPR_LOCK) | (1 << GDS_GPR_LOCK) | (1 << VMW_GPR_LOCK) |
-            (1 << EXP_PARAM_ACCESS) | (1 << EXP_POS_ACCESS) |
-            (1 << EXP_LDS_ACCESS),
-        (1 << VMEM_WRITE_ACCESS) | (1 << SCRATCH_WRITE_ACCESS),
-        (1 << VMEM_SAMPLER_READ_ACCESS),
-        (1 << VMEM_BVH_READ_ACCESS),
-        (1 << SMEM_ACCESS) | (1 << SQ_MESSAGE)};
+        eventMask({VMEM_ACCESS, VMEM_READ_ACCESS}),
+        eventMask({LDS_ACCESS, GDS_ACCESS}),
+        eventMask({EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK, EXP_PARAM_ACCESS,
+                   EXP_POS_ACCESS, EXP_LDS_ACCESS}),
+        eventMask({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
+        eventMask({VMEM_SAMPLER_READ_ACCESS}),
+        eventMask({VMEM_BVH_READ_ACCESS}),
+        eventMask({SMEM_ACCESS, SQ_MESSAGE})};
 
     return WaitEventMaskForInstGFX12Plus;
   }
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index c19c3c6017a7c8..f4b21b7dfac391 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -3635,12 +3635,13 @@ memOpsHaveSameBaseOperands(ArrayRef<const MachineOperand *> BaseOps1,
   return true;
 }
 
-static bool offsetsDoNotOverlap(int WidthA, int OffsetA,
-                                int WidthB, int OffsetB) {
+static bool offsetsDoNotOverlap(LocationSize WidthA, int OffsetA,
+                                LocationSize WidthB, int OffsetB) {
   int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
   int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
-  int LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
-  return LowOffset + LowWidth <= HighOffset;
+  LocationSize LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
+  return LowWidth.hasValue() &&
+         LowOffset + (int)LowWidth.getValue() <= HighOffset;
 }
 
 bool SIInstrInfo::checkInstOffsetsDoNotOverlap(const MachineInstr &MIa,
@@ -3662,8 +3663,8 @@ bool SIInstrInfo::checkInstOffsetsDoNotOverlap(const MachineInstr &MIa,
     // FIXME: Handle ds_read2 / ds_write2.
     return false;
   }
-  unsigned Width0 = MIa.memoperands().front()->getSize();
-  unsigned Width1 = MIb.memoperands().front()->getSize();
+  LocationSize Width0 = MIa.memoperands().front()->getSize();
+  LocationSize Width1 = MIb.memoperands().front()->getSize();
   return offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1);
 }
 
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index a62bf779fe2e2d..4c5978cdc6665c 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -815,7 +815,7 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
   }
 
   static bool isMFMAorWMMA(const MachineInstr &MI) {
-    return isMFMA(MI) || isWMMA(MI);
+    return isMFMA(MI) || isWMMA(MI) || isSWMMAC(MI);
   }
 
   static bool isSWMMAC(const MachineInstr &MI) {
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 835a5a24723154..1694436bad15ce 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -835,6 +835,19 @@ def fp16_zeros_high_16bits : PatLeaf<(f16 VGPR_32:$src), [{
   return fp16SrcZerosHighBits(N->getOpcode());
 }]>;
 
+def is_canonicalized : PatLeaf<(fAny srcvalue:$src), [{
+  const SITargetLowering &Lowering =
+      *static_cast<const SITargetLowering *>(getTargetLowering());
+  return Lowering.isCanonicalized(*CurDAG, SDValue(N, 0));
+}]> {
+  let GISelPredicateCode = [{
+    const SITargetLowering *TLI = static_cast<const SITargetLowering *>(
+        MF.getSubtarget().getTargetLowering());
+    const MachineOperand &Dst = MI.getOperand(0);
+    assert(Dst.isDef());
+    return TLI->isCanonicalized(Dst.getReg(), MF);
+   }];
+}
 
 //===----------------------------------------------------------------------===//
 // MUBUF/SMEM Patterns
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 3ab788406ecb28..1c942dcefdacea 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -2946,30 +2946,12 @@ def : GCNPat<
 
 // If fcanonicalize's operand is implicitly canonicalized, we only need a copy.
 let AddedComplexity = 1000 in {
-def : GCNPat<
-  (is_canonicalized_1<fcanonicalize> f16:$src),
-  (COPY f16:$src)
->;
-
-def : GCNPat<
-  (is_canonicalized_1<fcanonicalize> v2f16:$src),
-  (COPY v2f16:$src)
->;
-
-def : GCNPat<
-  (is_canonicalized_1<fcanonicalize> f32:$src),
-  (COPY f32:$src)
->;
-
-def : GCNPat<
-  (is_canonicalized_1<fcanonicalize> v2f32:$src),
-  (COPY v2f32:$src)
->;
-
-def : GCNPat<
-  (is_canonicalized_1<fcanonicalize> f64:$src),
-  (COPY f64:$src)
->;
+foreach vt = [f16, v2f16, f32, v2f32, f64] in {
+  def : GCNPat<
+    (fcanonicalize (vt is_canonicalized:$src)),
+    (COPY vt:$src)
+  >;
+}
 }
 
 // Prefer selecting to max when legal, but using mul is always valid.
diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index 9c85ff3c43e22f..4ddee2f6d5befa 100644
--- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -862,7 +862,7 @@ SILoadStoreOptimizer::combineKnownAdjacentMMOs(const CombineInfo &CI,
   const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
   const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
 
-  unsigned Size = MMOa->getSize() + MMOb->getSize();
+  unsigned Size = MMOa->getSize().getValue() + MMOb->getSize().getValue();
 
   // A base pointer for the combined operation is the same as the leading
   // operation's pointer.
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 5c64c6bcd1968c..79a7d1cf66c4d3 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -1753,12 +1753,12 @@ void SIRegisterInfo::buildVGPRSpillLoadStore(SGPRSpillBuilder &SB, int Index,
     unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR
                                           : AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
     buildSpillLoadStore(*SB.MBB, SB.MI, SB.DL, Opc, Index, SB.TmpVGPR, false,
-                        FrameReg, Offset * SB.EltSize, MMO, SB.RS);
+                        FrameReg, (int64_t)Offset * SB.EltSize, MMO, SB.RS);
   } else {
     unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
                                           : AMDGPU::BUFFER_STORE_DWORD_OFFSET;
     buildSpillLoadStore(*SB.MBB, SB.MI, SB.DL, Opc, Index, SB.TmpVGPR, IsKill,
-                        FrameReg, Offset * SB.EltSize, MMO, SB.RS);
+                        FrameReg, (int64_t)Offset * SB.EltSize, MMO, SB.RS);
     // This only ever adds one VGPR spill
     SB.MFI.addToSpilledVGPRs(1);
   }
@@ -2265,10 +2265,10 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
           *MBB, MI, DL, Opc, Index, VData->getReg(), VData->isKill(), FrameReg,
           TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
           *MI->memoperands_begin(), RS);
-      
-      if (IsWWMRegSpill) 
+
+      if (IsWWMRegSpill)
         TII->restoreExec(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy());
-  
+
       MI->eraseFromParent();
       return true;
     }
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index bccdf4b5ca6374..1159c4e0fc2ed5 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -744,19 +744,19 @@ def S_NOR_B64 : SOP2_64 <"s_nor_b64",
 
 // There are also separate patterns for types other than i32
 def S_ANDN2_B32 : SOP2_32 <"s_andn2_b32",
-  [(set i32:$sdst, (UniformBinFrag<and> i32:$src0, (UniformUnaryFrag<not> i32:$src1)))]
+  [(set i32:$sdst, (UniformBinFrag<and> i32:$src0, (not i32:$src1)))]
 >;
 
 def S_ANDN2_B64 : SOP2_64 <"s_andn2_b64",
-  [(set i64:$sdst, (UniformBinFrag<and> i64:$src0, (UniformUnaryFrag<not> i64:$src1)))]
+  [(set i64:$sdst, (UniformBinFrag<and> i64:$src0, (not i64:$src1)))]
 >;
 
 def S_ORN2_B32 : SOP2_32 <"s_orn2_b32",
-  [(set i32:$sdst, (UniformBinFrag<or> i32:$src0, (UniformUnaryFrag<not> i32:$src1)))]
+  [(set i32:$sdst, (UniformBinFrag<or> i32:$src0, (not i32:$src1)))]
 >;
 
 def S_ORN2_B64 : SOP2_64 <"s_orn2_b64",
-  [(set i64:$sdst, (UniformBinFrag<or> i64:$src0, (UniformUnaryFrag<not> i64:$src1)))]
+  [(set i64:$sdst, (UniformBinFrag<or> i64:$src0, (not i64:$src1)))]
 >;
 } // End Defs = [SCC]
 
@@ -1905,21 +1905,20 @@ def : GCNPat<
   (S_AND_B32 (S_MOV_B32 (i32 0xffff)), $src)
 >;
 
-// FIXME: ValueType should have isVector field
 class ScalarNot2Pat<Instruction inst, SDPatternOperator op, ValueType vt,
-                    bit isVector = 1> : GCNPat<
-  (UniformBinFrag<op> vt:$src0, (UniformUnaryFrag<!if(isVector, vnot, not)> vt:$src1)),
+                    SDPatternOperator notnode = !if(vt.isVector, vnot, not)> : GCNPat<
+  (UniformBinFrag<op> vt:$src0, (notnode vt:$src1)),
   (inst getSOPSrcForVT<vt>.ret:$src0, getSOPSrcForVT<vt>.ret:$src1)
 >;
 
 // Match these for some more types
 // TODO: i1
-def : ScalarNot2Pat<S_ANDN2_B32, and, i16, 0>;
+def : ScalarNot2Pat<S_ANDN2_B32, and, i16>;
 def : ScalarNot2Pat<S_ANDN2_B32, and, v2i16>;
 def : ScalarNot2Pat<S_ANDN2_B64, and, v4i16>;
 def : ScalarNot2Pat<S_ANDN2_B64, and, v2i32>;
 
-def : ScalarNot2Pat<S_ORN2_B32, or, i16, 0>;
+def : ScalarNot2Pat<S_ORN2_B32, or, i16>;
 def : ScalarNot2Pat<S_ORN2_B32, or, v2i16>;
 def : ScalarNot2Pat<S_ORN2_B64, or, v4i16>;
 def : ScalarNot2Pat<S_ORN2_B64, or, v2i32>;
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp
index 23434d2de0fc66..d468b14d54d3fc 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp
@@ -60,6 +60,7 @@ const CustomOperand<const MCSubtargetInfo &> Msg[] = {
   {{"MSG_RTN_GET_REALTIME"},    ID_RTN_GET_REALTIME,        isGFX11Plus},
   {{"MSG_RTN_SAVE_WAVE"},       ID_RTN_SAVE_WAVE,           isGFX11Plus},
   {{"MSG_RTN_GET_TBA"},         ID_RTN_GET_TBA,             isGFX11Plus},
+  {{"MSG_RTN_GET_TBA_TO_PC"},   ID_RTN_GET_TBA_TO_PC,       isGFX11Plus},
   {{"MSG_RTN_GET_SE_AID_ID"},   ID_RTN_GET_SE_AID_ID,       isGFX12Plus},
 };
 // clang-format on
diff --git a/llvm/lib/Target/ARM/ARM.td b/llvm/lib/Target/ARM/ARM.td
index 4510c7cf4f42e2..66596dbda83c95 100644
--- a/llvm/lib/Target/ARM/ARM.td
+++ b/llvm/lib/Target/ARM/ARM.td
@@ -1742,6 +1742,7 @@ def ARMAsmWriter : AsmWriter {
 
 def ARMAsmParser : AsmParser {
   bit ReportMultipleNearMisses = 1;
+  let PreferSmallerInstructions = true;
 }
 
 def ARMAsmParserVariant : AsmParserVariant {
diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
index dd63ca17e5b9f1..5d0468948dfb61 100644
--- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -3710,7 +3710,7 @@ unsigned ARMBaseInstrInfo::getNumLDMAddresses(const MachineInstr &MI) const {
   for (MachineInstr::mmo_iterator I = MI.memoperands_begin(),
                                   E = MI.memoperands_end();
        I != E; ++I) {
-    Size += (*I)->getSize();
+    Size += (*I)->getSize().getValue();
   }
   // FIXME: The scheduler currently can't handle values larger than 16. But
   // the values can actually go up to 32 for floating-point load/store
diff --git a/llvm/lib/Target/ARM/ARMHazardRecognizer.cpp b/llvm/lib/Target/ARM/ARMHazardRecognizer.cpp
index 9b26aac6c0b71e..34b6f0575f7276 100644
--- a/llvm/lib/Target/ARM/ARMHazardRecognizer.cpp
+++ b/llvm/lib/Target/ARM/ARMHazardRecognizer.cpp
@@ -191,7 +191,7 @@ ARMBankConflictHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
   auto BasePseudoVal0 = MO0->getPseudoValue();
   int64_t Offset0 = 0;
 
-  if (MO0->getSize() > 4)
+  if (!MO0->getSize().hasValue() || MO0->getSize().getValue() > 4)
     return NoHazard;
 
   bool SPvalid = false;
@@ -259,8 +259,8 @@ void ARMBankConflictHazardRecognizer::EmitInstruction(SUnit *SU) {
     return;
 
   auto MO = *MI.memoperands().begin();
-  uint64_t Size1 = MO->getSize();
-  if (Size1 > 4)
+  LocationSize Size1 = MO->getSize();
+  if (Size1.hasValue() && Size1.getValue() > 4)
     return;
   Accesses.push_back(&MI);
 }
diff --git a/llvm/lib/Target/ARM/ARMInstrFormats.td b/llvm/lib/Target/ARM/ARMInstrFormats.td
index 404085820a6660..d0678f378da1ea 100644
--- a/llvm/lib/Target/ARM/ARMInstrFormats.td
+++ b/llvm/lib/Target/ARM/ARMInstrFormats.td
@@ -155,7 +155,11 @@ def iflags_op : Operand<i32> {
 
 // ARM Predicate operand. Default to 14 = always (AL). Second part is CC
 // register whose default is 0 (no register).
-def CondCodeOperand : AsmOperandClass { let Name = "CondCode"; }
+def CondCodeOperand : AsmOperandClass {
+  let Name = "CondCode";
+  let DefaultMethod = "defaultCondCodeOp";
+  let IsOptional = true;
+}
 def pred : PredicateOperand<OtherVT, (ops i32imm, i32imm),
                                      (ops (i32 14), (i32 zero_reg))> {
   let PrintMethod = "printPredicateOperand";
@@ -174,7 +178,11 @@ def cmovpred : Operand<i32>, PredicateOp,
 }
 
 // Conditional code result for instructions whose 's' bit is set, e.g. subs.
-def CCOutOperand : AsmOperandClass { let Name = "CCOut"; }
+def CCOutOperand : AsmOperandClass {
+  let Name = "CCOut";
+  let DefaultMethod = "defaultCCOutOp";
+  let IsOptional = true;
+}
 def cc_out : OptionalDefOperand<OtherVT, (ops CCR), (ops (i32 zero_reg))> {
   let EncoderMethod = "getCCOutOpValue";
   let PrintMethod = "printSBitModifierOperand";
@@ -202,10 +210,14 @@ def inv_cond_XFORM : SDNodeXForm<imm, [{
 def VPTPredNOperand : AsmOperandClass {
   let Name = "VPTPredN";
   let PredicateMethod = "isVPTPred";
+  let DefaultMethod = "defaultVPTPredOp";
+  let IsOptional = true;
 }
 def VPTPredROperand : AsmOperandClass {
   let Name = "VPTPredR";
   let PredicateMethod = "isVPTPred";
+  let DefaultMethod = "defaultVPTPredOp";
+  let IsOptional = true;
 }
 
 // Operand classes for the cluster of MC operands describing a
@@ -468,7 +480,7 @@ class InstThumb<AddrMode am, int sz, IndexMode im,
 // These are aliases that require C++ handling to convert to the target
 // instruction, while InstAliases can be handled directly by tblgen.
 class AsmPseudoInst<string asm, dag iops, dag oops = (outs)>
-  : InstTemplate<AddrModeNone, 0, IndexModeNone, Pseudo, GenericDomain,
+  : InstTemplate<AddrModeNone, 4, IndexModeNone, Pseudo, GenericDomain,
                  "", NoItinerary> {
   let OutOperandList = oops;
   let InOperandList = iops;
diff --git a/llvm/lib/Target/ARM/ARMInstrThumb.td b/llvm/lib/Target/ARM/ARMInstrThumb.td
index be0ca964d3f912..e7f4059935138a 100644
--- a/llvm/lib/Target/ARM/ARMInstrThumb.td
+++ b/llvm/lib/Target/ARM/ARMInstrThumb.td
@@ -1210,7 +1210,7 @@ def tMOVi8 : T1sI<(outs tGPR:$Rd), (ins imm0_255_expr:$imm8), IIC_iMOVi,
 // Because we have an explicit tMOVSr below, we need an alias to handle
 // the immediate "movs" form here. Blech.
 def : tInstAlias <"movs $Rdn, $imm8",
-                 (tMOVi8 tGPR:$Rdn, CPSR, imm0_255_expr:$imm8, 14, 0)>;
+                 (tMOVi8 tGPR:$Rdn, CPSR, imm0_255_expr:$imm8, 14, zero_reg)>;
 
 // A7-73: MOV(2) - mov setting flag.
 
@@ -1768,7 +1768,7 @@ let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in {
 
 // In Thumb1, "nop" is encoded as a "mov r8, r8". Technically, the bf00
 // encoding is available on ARMv6K, but we don't differentiate that finely.
-def : InstAlias<"nop", (tMOVr R8, R8, 14, 0), 0>, Requires<[IsThumb, IsThumb1Only]>;
+def : InstAlias<"nop", (tMOVr R8, R8, 14, zero_reg), 0>, Requires<[IsThumb, IsThumb1Only]>;
 
 
 // "neg" is and alias for "rsb rd, rn, #0"
diff --git a/llvm/lib/Target/ARM/ARMInstrThumb2.td b/llvm/lib/Target/ARM/ARMInstrThumb2.td
index acd46e8093aa78..f227d68deeb8bc 100644
--- a/llvm/lib/Target/ARM/ARMInstrThumb2.td
+++ b/llvm/lib/Target/ARM/ARMInstrThumb2.td
@@ -5092,14 +5092,14 @@ def : InstAlias<"dmb${p}.w", (t2DMB 0xf, pred:$p), 0>, Requires<[HasDB]>;
 def : InstAlias<"dsb${p}.w\t$opt", (t2DSB memb_opt:$opt, pred:$p), 0>, Requires<[HasDB]>;
 def : InstAlias<"dsb${p}", (t2DSB 0xf, pred:$p), 0>, Requires<[HasDB]>;
 def : InstAlias<"dsb${p}.w", (t2DSB 0xf, pred:$p), 0>, Requires<[HasDB]>;
-def : InstAlias<"isb${p}.w\t$opt", (t2ISB memb_opt:$opt, pred:$p), 0>, Requires<[HasDB]>;
+def : InstAlias<"isb${p}.w\t$opt", (t2ISB instsyncb_opt:$opt, pred:$p), 0>, Requires<[HasDB]>;
 def : InstAlias<"isb${p}", (t2ISB 0xf, pred:$p), 0>, Requires<[HasDB]>;
 def : InstAlias<"isb${p}.w", (t2ISB 0xf, pred:$p), 0>, Requires<[HasDB]>;
 
-// Non-predicable aliases of a predicable DSB: the predicate is (14, 0) where
-// 14 = AL (always execute) and 0 = "instruction doesn't read the CPSR".
-def : InstAlias<"ssbb", (t2DSB 0x0, 14, 0), 1>, Requires<[HasDB, IsThumb2]>;
-def : InstAlias<"pssbb", (t2DSB 0x4, 14, 0), 1>, Requires<[HasDB, IsThumb2]>;
+// Non-predicable aliases of a predicable DSB: the predicate is (14, zero_reg) where
+// 14 = AL (always execute) and zero_reg = "instruction doesn't read the CPSR".
+def : InstAlias<"ssbb", (t2DSB 0x0, 14, zero_reg), 1>, Requires<[HasDB, IsThumb2]>;
+def : InstAlias<"pssbb", (t2DSB 0x4, 14, zero_reg), 1>, Requires<[HasDB, IsThumb2]>;
 
 // Armv8-R 'Data Full Barrier'
 def : InstAlias<"dfb${p}", (t2DSB 0xc, pred:$p), 1>, Requires<[HasDFB]>;
diff --git a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
index c320bf723c88bb..9cfdb15a0f43d3 100644
--- a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
+++ b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
@@ -18,6 +18,7 @@
 #include "llvm/ADT/APFloat.h"
 #include "llvm/ADT/APInt.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallBitVector.h"
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringMap.h"
@@ -47,6 +48,7 @@
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/SMLoc.h"
@@ -61,6 +63,7 @@
 #include <iterator>
 #include <limits>
 #include <memory>
+#include <optional>
 #include <string>
 #include <utility>
 #include <vector>
@@ -79,6 +82,7 @@ extern const ARMInstrTable ARMDescs;
 } // end namespace llvm
 
 namespace {
+class ARMOperand;
 
 enum class ImplicitItModeTy { Always, Never, ARMOnly, ThumbOnly };
 
@@ -446,13 +450,15 @@ class ARMAsmParser : public MCTargetAsmParser {
   }
 
   bool validatetLDMRegList(const MCInst &Inst, const OperandVector &Operands,
-                           unsigned ListNo, bool IsARPop = false);
+                           unsigned MnemonicOpsEndInd, unsigned ListIndex,
+                           bool IsARPop = false);
   bool validatetSTMRegList(const MCInst &Inst, const OperandVector &Operands,
-                           unsigned ListNo);
+                           unsigned MnemonicOpsEndInd, unsigned ListIndex);
 
   int tryParseRegister(bool AllowOutofBoundReg = false);
   bool tryParseRegisterWithWriteBack(OperandVector &);
   int tryParseShiftRegister(OperandVector &);
+  std::optional<ARM_AM::ShiftOpc> tryParseShiftToken();
   bool parseRegisterList(OperandVector &, bool EnforceOrder = true,
                          bool AllowRAAC = false,
                          bool AllowOutOfBoundReg = false);
@@ -505,6 +511,10 @@ class ARMAsmParser : public MCTargetAsmParser {
   bool parseDirectiveSEHEpilogEnd(SMLoc L);
   bool parseDirectiveSEHCustom(SMLoc L);
 
+  std::unique_ptr<ARMOperand> defaultCondCodeOp();
+  std::unique_ptr<ARMOperand> defaultCCOutOp();
+  std::unique_ptr<ARMOperand> defaultVPTPredOp();
+
   bool isMnemonicVPTPredicable(StringRef Mnemonic, StringRef ExtraToken);
   StringRef splitMnemonic(StringRef Mnemonic, StringRef ExtraToken,
                           ARMCC::CondCodes &PredicationCode,
@@ -517,9 +527,13 @@ class ARMAsmParser : public MCTargetAsmParser {
                              bool &CanAcceptVPTPredicationCode);
   bool enableArchExtFeature(StringRef Name, SMLoc &ExtLoc);
 
-  void tryConvertingToTwoOperandForm(StringRef Mnemonic, bool CarrySetting,
-                                     OperandVector &Operands);
-  bool CDEConvertDualRegOperand(StringRef Mnemonic, OperandVector &Operands);
+  void tryConvertingToTwoOperandForm(StringRef Mnemonic,
+                                     ARMCC::CondCodes PredicationCode,
+                                     bool CarrySetting, OperandVector &Operands,
+                                     unsigned MnemonicOpsEndInd);
+
+  bool CDEConvertDualRegOperand(StringRef Mnemonic, OperandVector &Operands,
+                                unsigned MnemonicOpsEndInd);
 
   bool isThumb() const {
     // FIXME: Can tablegen auto-generate this?
@@ -635,12 +649,13 @@ class ARMAsmParser : public MCTargetAsmParser {
   ParseStatus parseProcIFlagsOperand(OperandVector &);
   ParseStatus parseMSRMaskOperand(OperandVector &);
   ParseStatus parseBankedRegOperand(OperandVector &);
-  ParseStatus parsePKHImm(OperandVector &O, StringRef Op, int Low, int High);
+  ParseStatus parsePKHImm(OperandVector &O, ARM_AM::ShiftOpc, int Low,
+                          int High);
   ParseStatus parsePKHLSLImm(OperandVector &O) {
-    return parsePKHImm(O, "lsl", 0, 31);
+    return parsePKHImm(O, ARM_AM::lsl, 0, 31);
   }
   ParseStatus parsePKHASRImm(OperandVector &O) {
-    return parsePKHImm(O, "asr", 1, 32);
+    return parsePKHImm(O, ARM_AM::asr, 1, 32);
   }
   ParseStatus parseSetEndImm(OperandVector &);
   ParseStatus parseShifterImm(OperandVector &);
@@ -659,15 +674,20 @@ class ARMAsmParser : public MCTargetAsmParser {
   void cvtThumbBranches(MCInst &Inst, const OperandVector &);
   void cvtMVEVMOVQtoDReg(MCInst &Inst, const OperandVector &);
 
-  bool validateInstruction(MCInst &Inst, const OperandVector &Ops);
-  bool processInstruction(MCInst &Inst, const OperandVector &Ops, MCStreamer &Out);
-  bool shouldOmitCCOutOperand(StringRef Mnemonic, OperandVector &Operands);
-  bool shouldOmitPredicateOperand(StringRef Mnemonic, OperandVector &Operands);
-  bool shouldOmitVectorPredicateOperand(StringRef Mnemonic, OperandVector &Operands);
+  bool validateInstruction(MCInst &Inst, const OperandVector &Ops,
+                           unsigned MnemonicOpsEndInd);
+  bool processInstruction(MCInst &Inst, const OperandVector &Ops,
+                          unsigned MnemonicOpsEndInd, MCStreamer &Out);
+  bool shouldOmitVectorPredicateOperand(StringRef Mnemonic,
+                                        OperandVector &Operands,
+                                        unsigned MnemonicOpsEndInd);
   bool isITBlockTerminator(MCInst &Inst) const;
-  void fixupGNULDRDAlias(StringRef Mnemonic, OperandVector &Operands);
-  bool validateLDRDSTRD(MCInst &Inst, const OperandVector &Operands,
-                        bool Load, bool ARMMode, bool Writeback);
+
+  void fixupGNULDRDAlias(StringRef Mnemonic, OperandVector &Operands,
+                         unsigned MnemonicOpsEndInd);
+  bool validateLDRDSTRD(MCInst &Inst, const OperandVector &Operands, bool Load,
+                        bool ARMMode, bool Writeback,
+                        unsigned MnemonicOpsEndInd);
 
 public:
   enum ARMMatchResultTy {
@@ -716,6 +736,9 @@ class ARMAsmParser : public MCTargetAsmParser {
   unsigned validateTargetOperandClass(MCParsedAsmOperand &Op,
                                       unsigned Kind) override;
   unsigned checkTargetMatchPredicate(MCInst &Inst) override;
+  unsigned
+  checkEarlyTargetMatchPredicate(MCInst &Inst,
+                                 const OperandVector &Operands) override;
 
   bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
                                OperandVector &Operands, MCStreamer &Out,
@@ -1347,6 +1370,14 @@ class ARMOperand : public MCParsedAsmOperand {
   bool isRegListWithAPSR() const {
     return Kind == k_RegisterListWithAPSR || Kind == k_RegisterList;
   }
+  bool isDReg() const {
+    return isReg() &&
+           ARMMCRegisterClasses[ARM::DPRRegClassID].contains(Reg.RegNum);
+  }
+  bool isQReg() const {
+    return isReg() &&
+           ARMMCRegisterClasses[ARM::QPRRegClassID].contains(Reg.RegNum);
+  }
   bool isDPRRegList() const { return Kind == k_DPRRegisterList; }
   bool isSPRRegList() const { return Kind == k_SPRRegisterList; }
   bool isFPSRegListWithVPR() const { return Kind == k_FPSRegisterListWithVPR; }
@@ -2012,6 +2043,8 @@ class ARMOperand : public MCParsedAsmOperand {
   bool isProcIFlags() const { return Kind == k_ProcIFlags; }
 
   // NEON operands.
+  bool isVectorList() const { return Kind == k_VectorList; }
+
   bool isSingleSpacedVectorList() const {
     return Kind == k_VectorList && !VectorList.isDoubleSpaced;
   }
@@ -2452,6 +2485,20 @@ class ARMOperand : public MCParsedAsmOperand {
            CC == ARMCC::GT || CC == ARMCC::LE || CC == ARMCC::GE;
   }
 
+  void setVecListDPair(unsigned int DPair) {
+    Kind = k_VectorList;
+    VectorList.RegNum = DPair;
+    VectorList.Count = 2;
+    VectorList.isDoubleSpaced = false;
+  }
+
+  void setVecListOneD(unsigned int DReg) {
+    Kind = k_VectorList;
+    VectorList.RegNum = DReg;
+    VectorList.Count = 1;
+    VectorList.isDoubleSpaced = false;
+  }
+
   void addExpr(MCInst &Inst, const MCExpr *Expr) const {
     // Add as immediates when possible.  Null MCExpr = 0.
     if (!Expr)
@@ -4054,6 +4101,63 @@ static MCRegister MatchRegisterName(StringRef Name);
 
 /// }
 
+static bool isDataTypeToken(StringRef Tok) {
+  static const DenseSet<StringRef> DataTypes{
+      ".8",  ".16",  ".32",  ".64",  ".i8", ".i16", ".i32", ".i64",
+      ".u8", ".u16", ".u32", ".u64", ".s8", ".s16", ".s32", ".s64",
+      ".p8", ".p16", ".f32", ".f64", ".f",  ".d"};
+  return DataTypes.contains(Tok);
+}
+
+static unsigned getMnemonicOpsEndInd(const OperandVector &Operands) {
+  unsigned MnemonicOpsEndInd = 1;
+  // Special case for CPS which has a Mnemonic side token for possibly storing
+  // ie/id variant
+  if (Operands[0]->isToken() &&
+      static_cast<ARMOperand &>(*Operands[0]).getToken() == "cps") {
+    if (Operands.size() > 1 && Operands[1]->isImm() &&
+        static_cast<ARMOperand &>(*Operands[1]).getImm()->getKind() ==
+            llvm::MCExpr::Constant &&
+        (dyn_cast<MCConstantExpr>(
+             static_cast<ARMOperand &>(*Operands[1]).getImm())
+                 ->getValue() == ARM_PROC::IE ||
+         dyn_cast<MCConstantExpr>(
+             static_cast<ARMOperand &>(*Operands[1]).getImm())
+                 ->getValue() == ARM_PROC::ID))
+      ++MnemonicOpsEndInd;
+  }
+
+  // In some circumstances the condition code moves to the right
+  bool RHSCondCode = false;
+  while (MnemonicOpsEndInd < Operands.size()) {
+    auto Op = static_cast<ARMOperand &>(*Operands[MnemonicOpsEndInd]);
+    // Special case for it instructions which have a condition code on the RHS
+    if (Op.isITMask()) {
+      RHSCondCode = true;
+      MnemonicOpsEndInd++;
+    } else if (Op.isToken() &&
+               (
+                   // There are several special cases not covered by
+                   // isDataTypeToken
+                   Op.getToken() == ".w" || Op.getToken() == ".bf16" ||
+                   Op.getToken() == ".p64" || Op.getToken() == ".f16" ||
+                   isDataTypeToken(Op.getToken()))) {
+      // In the mnemonic operators the cond code must always precede the data
+      // type. So we can now safely assume any subsequent cond code is on the
+      // RHS. As is the case for VCMP and VPT.
+      RHSCondCode = true;
+      MnemonicOpsEndInd++;
+    }
+    // Skip all mnemonic operator types
+    else if (Op.isCCOut() || (Op.isCondCode() && !RHSCondCode) ||
+             Op.isVPTPred() || (Op.isToken() && Op.getToken() == ".w"))
+      MnemonicOpsEndInd++;
+    else
+      break;
+  }
+  return MnemonicOpsEndInd;
+}
+
 bool ARMAsmParser::parseRegister(MCRegister &Reg, SMLoc &StartLoc,
                                  SMLoc &EndLoc) {
   const AsmToken &Tok = getParser().getTok();
@@ -4127,30 +4231,36 @@ int ARMAsmParser::tryParseRegister(bool AllowOutOfBoundReg) {
   return RegNum;
 }
 
-// Try to parse a shifter  (e.g., "lsl <amt>"). On success, return 0.
-// If a recoverable error occurs, return 1. If an irrecoverable error
-// occurs, return -1. An irrecoverable error is one where tokens have been
-// consumed in the process of trying to parse the shifter (i.e., when it is
-// indeed a shifter operand, but malformed).
-int ARMAsmParser::tryParseShiftRegister(OperandVector &Operands) {
+std::optional<ARM_AM::ShiftOpc> ARMAsmParser::tryParseShiftToken() {
   MCAsmParser &Parser = getParser();
-  SMLoc S = Parser.getTok().getLoc();
   const AsmToken &Tok = Parser.getTok();
   if (Tok.isNot(AsmToken::Identifier))
-    return -1;
+    return std::nullopt;
 
   std::string lowerCase = Tok.getString().lower();
-  ARM_AM::ShiftOpc ShiftTy = StringSwitch<ARM_AM::ShiftOpc>(lowerCase)
+  return StringSwitch<std::optional<ARM_AM::ShiftOpc>>(lowerCase)
       .Case("asl", ARM_AM::lsl)
       .Case("lsl", ARM_AM::lsl)
       .Case("lsr", ARM_AM::lsr)
       .Case("asr", ARM_AM::asr)
       .Case("ror", ARM_AM::ror)
       .Case("rrx", ARM_AM::rrx)
-      .Default(ARM_AM::no_shift);
+      .Default(std::nullopt);
+}
 
-  if (ShiftTy == ARM_AM::no_shift)
+// Try to parse a shifter  (e.g., "lsl <amt>"). On success, return 0.
+// If a recoverable error occurs, return 1. If an irrecoverable error
+// occurs, return -1. An irrecoverable error is one where tokens have been
+// consumed in the process of trying to parse the shifter (i.e., when it is
+// indeed a shifter operand, but malformed).
+int ARMAsmParser::tryParseShiftRegister(OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
+  SMLoc S = Parser.getTok().getLoc();
+
+  auto ShiftTyOpt = tryParseShiftToken();
+  if (ShiftTyOpt == std::nullopt)
     return 1;
+  auto ShiftTy = ShiftTyOpt.value();
 
   Parser.Lex(); // Eat the operator.
 
@@ -4680,6 +4790,8 @@ ParseStatus ARMAsmParser::parseVectorList(OperandVector &Operands) {
   // As an extension (to match gas), support a plain D register or Q register
   // (without encosing curly braces) as a single or double entry list,
   // respectively.
+  // If there is no lane supplied, just parse as a register and
+  // use the custom matcher to convert to list if necessary
   if (!hasMVE() && Parser.getTok().is(AsmToken::Identifier)) {
     SMLoc E = Parser.getTok().getEndLoc();
     int Reg = tryParseRegister();
@@ -4691,7 +4803,7 @@ ParseStatus ARMAsmParser::parseVectorList(OperandVector &Operands) {
         return Res;
       switch (LaneKind) {
       case NoLanes:
-        Operands.push_back(ARMOperand::CreateVectorList(Reg, 1, false, S, E));
+        Operands.push_back(ARMOperand::CreateReg(Reg, S, E));
         break;
       case AllLanes:
         Operands.push_back(ARMOperand::CreateVectorListAllLanes(Reg, 1, false,
@@ -4712,9 +4824,7 @@ ParseStatus ARMAsmParser::parseVectorList(OperandVector &Operands) {
         return Res;
       switch (LaneKind) {
       case NoLanes:
-        Reg = MRI->getMatchingSuperReg(Reg, ARM::dsub_0,
-                                   &ARMMCRegisterClasses[ARM::DPairRegClassID]);
-        Operands.push_back(ARMOperand::CreateVectorList(Reg, 2, false, S, E));
+        Operands.push_back(ARMOperand::CreateReg(Reg, S, E));
         break;
       case AllLanes:
         Reg = MRI->getMatchingSuperReg(Reg, ARM::dsub_0,
@@ -4730,7 +4840,8 @@ ParseStatus ARMAsmParser::parseVectorList(OperandVector &Operands) {
       }
       return ParseStatus::Success;
     }
-    return Error(S, "vector register expected");
+    Operands.push_back(ARMOperand::CreateReg(Reg, S, E));
+    return ParseStatus::Success;
   }
 
   if (Parser.getTok().isNot(AsmToken::LCurly))
@@ -5060,6 +5171,10 @@ ParseStatus ARMAsmParser::parseProcIFlagsOperand(OperandVector &Operands) {
 
 /// parseMSRMaskOperand - Try to parse mask flags from MSR instruction.
 ParseStatus ARMAsmParser::parseMSRMaskOperand(OperandVector &Operands) {
+  // Don't parse two MSR registers in a row
+  if (static_cast<ARMOperand &>(*Operands.back()).isMSRMask() ||
+      static_cast<ARMOperand &>(*Operands.back()).isBankedReg())
+    return ParseStatus::NoMatch;
   MCAsmParser &Parser = getParser();
   SMLoc S = Parser.getTok().getLoc();
   const AsmToken &Tok = Parser.getTok();
@@ -5157,6 +5272,10 @@ ParseStatus ARMAsmParser::parseMSRMaskOperand(OperandVector &Operands) {
 /// parseBankedRegOperand - Try to parse a banked register (e.g. "lr_irq") for
 /// use in the MRS/MSR instructions added to support virtualization.
 ParseStatus ARMAsmParser::parseBankedRegOperand(OperandVector &Operands) {
+  // Don't parse two Banked registers in a row
+  if (static_cast<ARMOperand &>(*Operands.back()).isBankedReg() ||
+      static_cast<ARMOperand &>(*Operands.back()).isMSRMask())
+    return ParseStatus::NoMatch;
   MCAsmParser &Parser = getParser();
   SMLoc S = Parser.getTok().getLoc();
   const AsmToken &Tok = Parser.getTok();
@@ -5174,23 +5293,30 @@ ParseStatus ARMAsmParser::parseBankedRegOperand(OperandVector &Operands) {
   return ParseStatus::Success;
 }
 
-ParseStatus ARMAsmParser::parsePKHImm(OperandVector &Operands, StringRef Op,
-                                      int Low, int High) {
+// FIXME: Unify the different methods for handling shift operators
+// and use TableGen matching mechanisms to do the validation rather than
+// separate parsing paths.
+ParseStatus ARMAsmParser::parsePKHImm(OperandVector &Operands,
+                                      ARM_AM::ShiftOpc Op, int Low, int High) {
   MCAsmParser &Parser = getParser();
-  const AsmToken &Tok = Parser.getTok();
-  if (Tok.isNot(AsmToken::Identifier))
-    return Error(Parser.getTok().getLoc(), Op + " operand expected.");
-  StringRef ShiftName = Tok.getString();
-  std::string LowerOp = Op.lower();
-  std::string UpperOp = Op.upper();
-  if (ShiftName != LowerOp && ShiftName != UpperOp)
-    return Error(Parser.getTok().getLoc(), Op + " operand expected.");
+  auto ShiftCodeOpt = tryParseShiftToken();
+
+  if (!ShiftCodeOpt.has_value())
+    return ParseStatus::NoMatch;
+  auto ShiftCode = ShiftCodeOpt.value();
+
+  // The wrong shift code has been provided. Can error here as has matched the
+  // correct operand in this case.
+  if (ShiftCode != Op)
+    return Error(Parser.getTok().getLoc(),
+                 ARM_AM::getShiftOpcStr(Op) + " operand expected.");
+
   Parser.Lex(); // Eat shift type token.
 
   // There must be a '#' and a shift amount.
   if (Parser.getTok().isNot(AsmToken::Hash) &&
       Parser.getTok().isNot(AsmToken::Dollar))
-    return Error(Parser.getTok().getLoc(), "'#' expected");
+    return ParseStatus::NoMatch;
   Parser.Lex(); // Eat hash token.
 
   const MCExpr *ShiftAmount;
@@ -5240,7 +5366,7 @@ ParseStatus ARMAsmParser::parseShifterImm(OperandVector &Operands) {
   const AsmToken &Tok = Parser.getTok();
   SMLoc S = Tok.getLoc();
   if (Tok.isNot(AsmToken::Identifier))
-    return Error(S, "shift operator 'asr' or 'lsl' expected");
+    return ParseStatus::NoMatch;
   StringRef ShiftName = Tok.getString();
   bool isASR;
   if (ShiftName == "lsl" || ShiftName == "LSL")
@@ -5248,7 +5374,7 @@ ParseStatus ARMAsmParser::parseShifterImm(OperandVector &Operands) {
   else if (ShiftName == "asr" || ShiftName == "ASR")
     isASR = true;
   else
-    return Error(S, "shift operator 'asr' or 'lsl' expected");
+    return ParseStatus::NoMatch;
   Parser.Lex(); // Eat the operator.
 
   // A '#' and a shift amount.
@@ -5441,7 +5567,7 @@ ParseStatus ARMAsmParser::parseBitfield(OperandVector &Operands) {
   // The bitfield descriptor is really two operands, the LSB and the width.
   if (Parser.getTok().isNot(AsmToken::Hash) &&
       Parser.getTok().isNot(AsmToken::Dollar))
-    return Error(Parser.getTok().getLoc(), "'#' expected");
+    return ParseStatus::NoMatch;
   Parser.Lex(); // Eat hash token.
 
   const MCExpr *LSBExpr;
@@ -5600,37 +5726,86 @@ ParseStatus ARMAsmParser::parseAM3Offset(OperandVector &Operands) {
   return ParseStatus::Success;
 }
 
+// Finds the index of the first CondCode operator, if there is none returns 0
+unsigned findCondCodeInd(const OperandVector &Operands,
+                         unsigned MnemonicOpsEndInd) {
+  for (unsigned I = 1; I < MnemonicOpsEndInd; ++I) {
+    auto Op = static_cast<ARMOperand &>(*Operands[I]);
+    if (Op.isCondCode())
+      return I;
+  }
+  return 0;
+}
+
+unsigned findCCOutInd(const OperandVector &Operands,
+                      unsigned MnemonicOpsEndInd) {
+  for (unsigned I = 1; I < MnemonicOpsEndInd; ++I) {
+    auto Op = static_cast<ARMOperand &>(*Operands[I]);
+    if (Op.isCCOut())
+      return I;
+  }
+  return 0;
+}
+
 /// Convert parsed operands to MCInst.  Needed here because this instruction
 /// only has two register operands, but multiplication is commutative so
 /// assemblers should accept both "mul rD, rN, rD" and "mul rD, rD, rN".
 void ARMAsmParser::cvtThumbMultiply(MCInst &Inst,
                                     const OperandVector &Operands) {
-  ((ARMOperand &)*Operands[3]).addRegOperands(Inst, 1);
-  ((ARMOperand &)*Operands[1]).addCCOutOperands(Inst, 1);
-  // If we have a three-operand form, make sure to set Rn to be the operand
-  // that isn't the same as Rd.
-  unsigned RegOp = 4;
-  if (Operands.size() == 6 &&
-      ((ARMOperand &)*Operands[4]).getReg() ==
-          ((ARMOperand &)*Operands[3]).getReg())
-    RegOp = 5;
-  ((ARMOperand &)*Operands[RegOp]).addRegOperands(Inst, 1);
-  Inst.addOperand(Inst.getOperand(0));
-  ((ARMOperand &)*Operands[2]).addCondCodeOperands(Inst, 2);
+  unsigned MnemonicOpsEndInd = getMnemonicOpsEndInd(Operands);
+  unsigned CondI = findCondCodeInd(Operands, MnemonicOpsEndInd);
+  unsigned CondOutI = findCCOutInd(Operands, MnemonicOpsEndInd);
+
+  // 2 operand form
+  unsigned RegRd = MnemonicOpsEndInd;
+  unsigned RegRn = MnemonicOpsEndInd + 1;
+  unsigned RegRm = MnemonicOpsEndInd;
+
+  if (Operands.size() == MnemonicOpsEndInd + 3) {
+    // If we have a three-operand form, make sure to set Rn to be the operand
+    // that isn't the same as Rd.
+    if (((ARMOperand &)*Operands[RegRd]).getReg() ==
+        ((ARMOperand &)*Operands[MnemonicOpsEndInd + 1]).getReg()) {
+      RegRn = MnemonicOpsEndInd + 2;
+      RegRm = MnemonicOpsEndInd + 1;
+    } else {
+      RegRn = MnemonicOpsEndInd + 1;
+      RegRm = MnemonicOpsEndInd + 2;
+    }
+  }
+
+  // Rd
+  ((ARMOperand &)*Operands[RegRd]).addRegOperands(Inst, 1);
+  // CCOut
+  if (CondOutI != 0) {
+    ((ARMOperand &)*Operands[CondOutI]).addCCOutOperands(Inst, 1);
+  } else {
+    ARMOperand Op = *ARMOperand::CreateCCOut(0, Operands[0]->getEndLoc());
+    Op.addCCOutOperands(Inst, 1);
+  }
+  // Rn
+  ((ARMOperand &)*Operands[RegRn]).addRegOperands(Inst, 1);
+  // Rm
+  ((ARMOperand &)*Operands[RegRm]).addRegOperands(Inst, 1);
+
+  // Cond code
+  if (CondI != 0) {
+    ((ARMOperand &)*Operands[CondI]).addCondCodeOperands(Inst, 2);
+  } else {
+    ARMOperand Op =
+        *ARMOperand::CreateCondCode(llvm::ARMCC::AL, Operands[0]->getEndLoc());
+    Op.addCondCodeOperands(Inst, 2);
+  }
 }
 
 void ARMAsmParser::cvtThumbBranches(MCInst &Inst,
                                     const OperandVector &Operands) {
-  int CondOp = -1, ImmOp = -1;
-  switch(Inst.getOpcode()) {
-    case ARM::tB:
-    case ARM::tBcc:  CondOp = 1; ImmOp = 2; break;
-
-    case ARM::t2B:
-    case ARM::t2Bcc: CondOp = 1; ImmOp = 3; break;
+  unsigned MnemonicOpsEndInd = getMnemonicOpsEndInd(Operands);
+  unsigned CondI = findCondCodeInd(Operands, MnemonicOpsEndInd);
+  unsigned Cond =
+      (CondI == 0 ? ARMCC::AL
+                  : static_cast<ARMOperand &>(*Operands[CondI]).getCondCode());
 
-    default: llvm_unreachable("Unexpected instruction in cvtThumbBranches");
-  }
   // first decide whether or not the branch should be conditional
   // by looking at it's location relative to an IT block
   if(inITBlock()) {
@@ -5641,9 +5816,6 @@ void ARMAsmParser::cvtThumbBranches(MCInst &Inst,
       case ARM::t2Bcc: Inst.setOpcode(ARM::t2B); break;
     }
   } else {
-    // outside IT blocks we can only have unconditional branches with AL
-    // condition code or conditional branches with non-AL condition code
-    unsigned Cond = static_cast<ARMOperand &>(*Operands[CondOp]).getCondCode();
     switch(Inst.getOpcode()) {
       case ARM::tB:
       case ARM::tBcc:
@@ -5660,36 +5832,56 @@ void ARMAsmParser::cvtThumbBranches(MCInst &Inst,
   switch(Inst.getOpcode()) {
     // classify tB as either t2B or t1B based on range of immediate operand
     case ARM::tB: {
-      ARMOperand &op = static_cast<ARMOperand &>(*Operands[ImmOp]);
+      ARMOperand &op = static_cast<ARMOperand &>(*Operands[MnemonicOpsEndInd]);
       if (!op.isSignedOffset<11, 1>() && isThumb() && hasV8MBaseline())
         Inst.setOpcode(ARM::t2B);
       break;
     }
     // classify tBcc as either t2Bcc or t1Bcc based on range of immediate operand
     case ARM::tBcc: {
-      ARMOperand &op = static_cast<ARMOperand &>(*Operands[ImmOp]);
+      ARMOperand &op = static_cast<ARMOperand &>(*Operands[MnemonicOpsEndInd]);
       if (!op.isSignedOffset<8, 1>() && isThumb() && hasV8MBaseline())
         Inst.setOpcode(ARM::t2Bcc);
       break;
     }
   }
-  ((ARMOperand &)*Operands[ImmOp]).addImmOperands(Inst, 1);
-  ((ARMOperand &)*Operands[CondOp]).addCondCodeOperands(Inst, 2);
+  ((ARMOperand &)*Operands[MnemonicOpsEndInd]).addImmOperands(Inst, 1);
+  if (CondI != 0) {
+    ((ARMOperand &)*Operands[CondI]).addCondCodeOperands(Inst, 2);
+  } else {
+    ARMOperand Op =
+        *ARMOperand::CreateCondCode(llvm::ARMCC::AL, Operands[0]->getEndLoc());
+    Op.addCondCodeOperands(Inst, 2);
+  }
 }
 
 void ARMAsmParser::cvtMVEVMOVQtoDReg(
   MCInst &Inst, const OperandVector &Operands) {
 
-  // mnemonic, condition code, Rt, Rt2, Qd, idx, Qd again, idx2
-  assert(Operands.size() == 8);
+  unsigned MnemonicOpsEndInd = getMnemonicOpsEndInd(Operands);
+  unsigned CondI = findCondCodeInd(Operands, MnemonicOpsEndInd);
 
-  ((ARMOperand &)*Operands[2]).addRegOperands(Inst, 1); // Rt
-  ((ARMOperand &)*Operands[3]).addRegOperands(Inst, 1); // Rt2
-  ((ARMOperand &)*Operands[4]).addRegOperands(Inst, 1); // Qd
-  ((ARMOperand &)*Operands[5]).addMVEPairVectorIndexOperands(Inst, 1); // idx
+  // mnemonic, condition code, Rt, Rt2, Qd, idx, Qd again, idx2
+  assert(Operands.size() == MnemonicOpsEndInd + 6);
+
+  ((ARMOperand &)*Operands[MnemonicOpsEndInd]).addRegOperands(Inst, 1); // Rt
+  ((ARMOperand &)*Operands[MnemonicOpsEndInd + 1])
+      .addRegOperands(Inst, 1); // Rt2
+  ((ARMOperand &)*Operands[MnemonicOpsEndInd + 2])
+      .addRegOperands(Inst, 1); // Qd
+  ((ARMOperand &)*Operands[MnemonicOpsEndInd + 3])
+      .addMVEPairVectorIndexOperands(Inst, 1); // idx
   // skip second copy of Qd in Operands[6]
-  ((ARMOperand &)*Operands[7]).addMVEPairVectorIndexOperands(Inst, 1); // idx2
-  ((ARMOperand &)*Operands[1]).addCondCodeOperands(Inst, 2); // condition code
+  ((ARMOperand &)*Operands[MnemonicOpsEndInd + 5])
+      .addMVEPairVectorIndexOperands(Inst, 1); // idx2
+  if (CondI != 0) {
+    ((ARMOperand &)*Operands[CondI])
+        .addCondCodeOperands(Inst, 2); // condition code
+  } else {
+    ARMOperand Op =
+        *ARMOperand::CreateCondCode(ARMCC::AL, Operands[0]->getEndLoc());
+    Op.addCondCodeOperands(Inst, 2);
+  }
 }
 
 /// Parse an ARM memory expression, return false if successful else return true
@@ -5948,6 +6140,8 @@ bool ARMAsmParser::parseMemRegOffsetShift(ARM_AM::ShiftOpc &St,
 
 /// parseFPImm - A floating point immediate expression operand.
 ParseStatus ARMAsmParser::parseFPImm(OperandVector &Operands) {
+  LLVM_DEBUG(dbgs() << "PARSE FPImm, Ops: " << Operands.size());
+
   MCAsmParser &Parser = getParser();
   // Anything that can accept a floating point constant as an operand
   // needs to go through here, as the regular parseExpression is
@@ -5974,10 +6168,19 @@ ParseStatus ARMAsmParser::parseFPImm(OperandVector &Operands) {
   // integer constant. Make sure we don't try to parse an FPImm
   // for these:
   // vmov.i{8|16|32|64} <dreg|qreg>, #imm
-  ARMOperand &TyOp = static_cast<ARMOperand &>(*Operands[2]);
-  bool isVmovf = TyOp.isToken() &&
-                 (TyOp.getToken() == ".f32" || TyOp.getToken() == ".f64" ||
-                  TyOp.getToken() == ".f16");
+
+  bool isVmovf = false;
+  unsigned MnemonicOpsEndInd = getMnemonicOpsEndInd(Operands);
+  for (unsigned I = 1; I < MnemonicOpsEndInd; ++I) {
+    ARMOperand &TyOp = static_cast<ARMOperand &>(*Operands[I]);
+    if (TyOp.isToken() &&
+        (TyOp.getToken() == ".f32" || TyOp.getToken() == ".f64" ||
+         TyOp.getToken() == ".f16")) {
+      isVmovf = true;
+      break;
+    }
+  }
+
   ARMOperand &Mnemonic = static_cast<ARMOperand &>(*Operands[0]);
   bool isFconst = Mnemonic.isToken() && (Mnemonic.getToken() == "fconstd" ||
                                          Mnemonic.getToken() == "fconsts");
@@ -6493,18 +6696,30 @@ void ARMAsmParser::getMnemonicAcceptInfo(StringRef Mnemonic,
     CanAcceptPredicationCode = true;
 }
 
+bool operandsContainWide(OperandVector &Operands, unsigned MnemonicOpsEndInd) {
+  for (unsigned I = 0; I < MnemonicOpsEndInd; ++I) {
+    auto &Op = static_cast<ARMOperand &>(*Operands[I]);
+    if (Op.isToken() && Op.getToken() == ".w")
+      return true;
+  }
+  return false;
+}
+
 // Some Thumb instructions have two operand forms that are not
 // available as three operand, convert to two operand form if possible.
 //
 // FIXME: We would really like to be able to tablegen'erate this.
-void ARMAsmParser::tryConvertingToTwoOperandForm(StringRef Mnemonic,
-                                                 bool CarrySetting,
-                                                 OperandVector &Operands) {
-  if (Operands.size() != 6)
+void ARMAsmParser::tryConvertingToTwoOperandForm(
+    StringRef Mnemonic, ARMCC::CondCodes PredicationCode, bool CarrySetting,
+    OperandVector &Operands, unsigned MnemonicOpsEndInd) {
+
+  if (operandsContainWide(Operands, MnemonicOpsEndInd))
+    return;
+  if (Operands.size() != MnemonicOpsEndInd + 3)
     return;
 
-  const auto &Op3 = static_cast<ARMOperand &>(*Operands[3]);
-        auto &Op4 = static_cast<ARMOperand &>(*Operands[4]);
+  const auto &Op3 = static_cast<ARMOperand &>(*Operands[MnemonicOpsEndInd]);
+  auto &Op4 = static_cast<ARMOperand &>(*Operands[MnemonicOpsEndInd + 1]);
   if (!Op3.isReg() || !Op4.isReg())
     return;
 
@@ -6515,7 +6730,7 @@ void ARMAsmParser::tryConvertingToTwoOperandForm(StringRef Mnemonic,
   // it in processInstruction(), but the 3 operand form of ADD (t2ADDrr)
   // won't accept SP or PC so we do the transformation here taking care
   // with immediate range in the 'add sp, sp #imm' case.
-  auto &Op5 = static_cast<ARMOperand &>(*Operands[5]);
+  auto &Op5 = static_cast<ARMOperand &>(*Operands[MnemonicOpsEndInd + 2]);
   if (isThumbTwo()) {
     if (Mnemonic != "add")
       return;
@@ -6575,7 +6790,7 @@ void ARMAsmParser::tryConvertingToTwoOperandForm(StringRef Mnemonic,
   if (Transform) {
     if (Swap)
       std::swap(Op4, Op5);
-    Operands.erase(Operands.begin() + 3);
+    Operands.erase(Operands.begin() + MnemonicOpsEndInd);
   }
 }
 
@@ -6600,183 +6815,9 @@ static bool isThumbI8Relocation(MCParsedAsmOperand &MCOp) {
   return false;
 }
 
-bool ARMAsmParser::shouldOmitCCOutOperand(StringRef Mnemonic,
-                                          OperandVector &Operands) {
-  // FIXME: This is all horribly hacky. We really need a better way to deal
-  // with optional operands like this in the matcher table.
-
-  // The 'mov' mnemonic is special. One variant has a cc_out operand, while
-  // another does not. Specifically, the MOVW instruction does not. So we
-  // special case it here and remove the defaulted (non-setting) cc_out
-  // operand if that's the instruction we're trying to match.
-  //
-  // We do this as post-processing of the explicit operands rather than just
-  // conditionally adding the cc_out in the first place because we need
-  // to check the type of the parsed immediate operand.
-  if (Mnemonic == "mov" && Operands.size() > 4 && !isThumb() &&
-      !static_cast<ARMOperand &>(*Operands[4]).isModImm() &&
-      static_cast<ARMOperand &>(*Operands[4]).isImm0_65535Expr() &&
-      static_cast<ARMOperand &>(*Operands[1]).getReg() == 0)
-    return true;
-
-  if (Mnemonic == "movs" && Operands.size() > 3 && isThumb() &&
-      isThumbI8Relocation(*Operands[3]))
-    return true;
-
-  // Register-register 'add' for thumb does not have a cc_out operand
-  // when there are only two register operands.
-  if (isThumb() && Mnemonic == "add" && Operands.size() == 5 &&
-      static_cast<ARMOperand &>(*Operands[3]).isReg() &&
-      static_cast<ARMOperand &>(*Operands[4]).isReg() &&
-      static_cast<ARMOperand &>(*Operands[1]).getReg() == 0)
-    return true;
-  // Register-register 'add' for thumb does not have a cc_out operand
-  // when it's an ADD Rdm, SP, {Rdm|#imm0_255} instruction. We do
-  // have to check the immediate range here since Thumb2 has a variant
-  // that can handle a different range and has a cc_out operand.
-  if (((isThumb() && Mnemonic == "add") ||
-       (isThumbTwo() && Mnemonic == "sub")) &&
-      Operands.size() == 6 && static_cast<ARMOperand &>(*Operands[3]).isReg() &&
-      static_cast<ARMOperand &>(*Operands[4]).isReg() &&
-      static_cast<ARMOperand &>(*Operands[4]).getReg() == ARM::SP &&
-      static_cast<ARMOperand &>(*Operands[1]).getReg() == 0 &&
-      ((Mnemonic == "add" && static_cast<ARMOperand &>(*Operands[5]).isReg()) ||
-       static_cast<ARMOperand &>(*Operands[5]).isImm0_1020s4()))
-    return true;
-  // For Thumb2, add/sub immediate does not have a cc_out operand for the
-  // imm0_4095 variant. That's the least-preferred variant when
-  // selecting via the generic "add" mnemonic, so to know that we
-  // should remove the cc_out operand, we have to explicitly check that
-  // it's not one of the other variants. Ugh.
-  if (isThumbTwo() && (Mnemonic == "add" || Mnemonic == "sub") &&
-      Operands.size() == 6 && static_cast<ARMOperand &>(*Operands[3]).isReg() &&
-      static_cast<ARMOperand &>(*Operands[4]).isReg() &&
-      static_cast<ARMOperand &>(*Operands[5]).isImm()) {
-    // Nest conditions rather than one big 'if' statement for readability.
-    //
-    // If both registers are low, we're in an IT block, and the immediate is
-    // in range, we should use encoding T1 instead, which has a cc_out.
-    if (inITBlock() &&
-        isARMLowRegister(static_cast<ARMOperand &>(*Operands[3]).getReg()) &&
-        isARMLowRegister(static_cast<ARMOperand &>(*Operands[4]).getReg()) &&
-        static_cast<ARMOperand &>(*Operands[5]).isImm0_7())
-      return false;
-    // Check against T3. If the second register is the PC, this is an
-    // alternate form of ADR, which uses encoding T4, so check for that too.
-    if (static_cast<ARMOperand &>(*Operands[4]).getReg() != ARM::PC &&
-        (static_cast<ARMOperand &>(*Operands[5]).isT2SOImm() ||
-         static_cast<ARMOperand &>(*Operands[5]).isT2SOImmNeg()))
-      return false;
-
-    // Otherwise, we use encoding T4, which does not have a cc_out
-    // operand.
-    return true;
-  }
-
-  // The thumb2 multiply instruction doesn't have a CCOut register, so
-  // if we have a "mul" mnemonic in Thumb mode, check if we'll be able to
-  // use the 16-bit encoding or not.
-  if (isThumbTwo() && Mnemonic == "mul" && Operands.size() == 6 &&
-      static_cast<ARMOperand &>(*Operands[1]).getReg() == 0 &&
-      static_cast<ARMOperand &>(*Operands[3]).isReg() &&
-      static_cast<ARMOperand &>(*Operands[4]).isReg() &&
-      static_cast<ARMOperand &>(*Operands[5]).isReg() &&
-      // If the registers aren't low regs, the destination reg isn't the
-      // same as one of the source regs, or the cc_out operand is zero
-      // outside of an IT block, we have to use the 32-bit encoding, so
-      // remove the cc_out operand.
-      (!isARMLowRegister(static_cast<ARMOperand &>(*Operands[3]).getReg()) ||
-       !isARMLowRegister(static_cast<ARMOperand &>(*Operands[4]).getReg()) ||
-       !isARMLowRegister(static_cast<ARMOperand &>(*Operands[5]).getReg()) ||
-       !inITBlock() || (static_cast<ARMOperand &>(*Operands[3]).getReg() !=
-                            static_cast<ARMOperand &>(*Operands[5]).getReg() &&
-                        static_cast<ARMOperand &>(*Operands[3]).getReg() !=
-                            static_cast<ARMOperand &>(*Operands[4]).getReg())))
-    return true;
-
-  // Also check the 'mul' syntax variant that doesn't specify an explicit
-  // destination register.
-  if (isThumbTwo() && Mnemonic == "mul" && Operands.size() == 5 &&
-      static_cast<ARMOperand &>(*Operands[1]).getReg() == 0 &&
-      static_cast<ARMOperand &>(*Operands[3]).isReg() &&
-      static_cast<ARMOperand &>(*Operands[4]).isReg() &&
-      // If the registers aren't low regs  or the cc_out operand is zero
-      // outside of an IT block, we have to use the 32-bit encoding, so
-      // remove the cc_out operand.
-      (!isARMLowRegister(static_cast<ARMOperand &>(*Operands[3]).getReg()) ||
-       !isARMLowRegister(static_cast<ARMOperand &>(*Operands[4]).getReg()) ||
-       !inITBlock()))
-    return true;
-
-  // Register-register 'add/sub' for thumb does not have a cc_out operand
-  // when it's an ADD/SUB SP, #imm. Be lenient on count since there's also
-  // the "add/sub SP, SP, #imm" version. If the follow-up operands aren't
-  // right, this will result in better diagnostics (which operand is off)
-  // anyway.
-  if (isThumb() && (Mnemonic == "add" || Mnemonic == "sub") &&
-      (Operands.size() == 5 || Operands.size() == 6) &&
-      static_cast<ARMOperand &>(*Operands[3]).isReg() &&
-      static_cast<ARMOperand &>(*Operands[3]).getReg() == ARM::SP &&
-      static_cast<ARMOperand &>(*Operands[1]).getReg() == 0 &&
-      (static_cast<ARMOperand &>(*Operands[4]).isImm() ||
-       (Operands.size() == 6 &&
-        static_cast<ARMOperand &>(*Operands[5]).isImm()))) {
-    // Thumb2 (add|sub){s}{p}.w GPRnopc, sp, #{T2SOImm} has cc_out
-    return (!(isThumbTwo() &&
-              (static_cast<ARMOperand &>(*Operands[4]).isT2SOImm() ||
-               static_cast<ARMOperand &>(*Operands[4]).isT2SOImmNeg())));
-  }
-  // Fixme: Should join all the thumb+thumb2 (add|sub) in a single if case
-  // Thumb2 ADD r0, #4095 -> ADDW r0, r0, #4095 (T4)
-  // Thumb2 SUB r0, #4095 -> SUBW r0, r0, #4095
-  if (isThumbTwo() && (Mnemonic == "add" || Mnemonic == "sub") &&
-      (Operands.size() == 5) &&
-      static_cast<ARMOperand &>(*Operands[3]).isReg() &&
-      static_cast<ARMOperand &>(*Operands[3]).getReg() != ARM::SP &&
-      static_cast<ARMOperand &>(*Operands[3]).getReg() != ARM::PC &&
-      static_cast<ARMOperand &>(*Operands[1]).getReg() == 0 &&
-      static_cast<ARMOperand &>(*Operands[4]).isImm()) {
-    const ARMOperand &IMM = static_cast<ARMOperand &>(*Operands[4]);
-    if (IMM.isT2SOImm() || IMM.isT2SOImmNeg())
-      return false; // add.w / sub.w
-    if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(IMM.getImm())) {
-      const int64_t Value = CE->getValue();
-      // Thumb1 imm8 sub / add
-      if ((Value < ((1 << 7) - 1) << 2) && inITBlock() && (!(Value & 3)) &&
-          isARMLowRegister(static_cast<ARMOperand &>(*Operands[3]).getReg()))
-        return false;
-      return true; // Thumb2 T4 addw / subw
-    }
-  }
-  return false;
-}
-
-bool ARMAsmParser::shouldOmitPredicateOperand(StringRef Mnemonic,
-                                              OperandVector &Operands) {
-  // VRINT{Z, X} have a predicate operand in VFP, but not in NEON
-  unsigned RegIdx = 3;
-  if ((((Mnemonic == "vrintz" || Mnemonic == "vrintx") && !hasMVE()) ||
-      Mnemonic == "vrintr") &&
-      (static_cast<ARMOperand &>(*Operands[2]).getToken() == ".f32" ||
-       static_cast<ARMOperand &>(*Operands[2]).getToken() == ".f16")) {
-    if (static_cast<ARMOperand &>(*Operands[3]).isToken() &&
-        (static_cast<ARMOperand &>(*Operands[3]).getToken() == ".f32" ||
-         static_cast<ARMOperand &>(*Operands[3]).getToken() == ".f16"))
-      RegIdx = 4;
-
-    if (static_cast<ARMOperand &>(*Operands[RegIdx]).isReg() &&
-        (ARMMCRegisterClasses[ARM::DPRRegClassID].contains(
-             static_cast<ARMOperand &>(*Operands[RegIdx]).getReg()) ||
-         ARMMCRegisterClasses[ARM::QPRRegClassID].contains(
-             static_cast<ARMOperand &>(*Operands[RegIdx]).getReg())))
-      return true;
-  }
-  return false;
-}
-
-bool ARMAsmParser::shouldOmitVectorPredicateOperand(StringRef Mnemonic,
-                                                    OperandVector &Operands) {
-  if (!hasMVE() || Operands.size() < 3)
+bool ARMAsmParser::shouldOmitVectorPredicateOperand(
+    StringRef Mnemonic, OperandVector &Operands, unsigned MnemonicOpsEndInd) {
+  if (!hasMVE() || Operands.size() <= MnemonicOpsEndInd)
     return true;
 
   if (Mnemonic.starts_with("vld2") || Mnemonic.starts_with("vld4") ||
@@ -6806,24 +6847,13 @@ bool ARMAsmParser::shouldOmitVectorPredicateOperand(StringRef Mnemonic,
       // MQPR, to more accurately report errors when using Q registers
       // outside of the allowed range.
       if (static_cast<ARMOperand &>(*Operand).isVectorIndex() ||
-          (Operand->isReg() &&
-           (ARMMCRegisterClasses[ARM::QPRRegClassID].contains(
-             Operand->getReg()))))
+          static_cast<ARMOperand &>(*Operand).isQReg())
         return false;
     }
     return true;
   }
 }
 
-static bool isDataTypeToken(StringRef Tok) {
-  return Tok == ".8" || Tok == ".16" || Tok == ".32" || Tok == ".64" ||
-    Tok == ".i8" || Tok == ".i16" || Tok == ".i32" || Tok == ".i64" ||
-    Tok == ".u8" || Tok == ".u16" || Tok == ".u32" || Tok == ".u64" ||
-    Tok == ".s8" || Tok == ".s16" || Tok == ".s32" || Tok == ".s64" ||
-    Tok == ".p8" || Tok == ".p16" || Tok == ".f32" || Tok == ".f64" ||
-    Tok == ".f" || Tok == ".d";
-}
-
 // FIXME: This bit should probably be handled via an explicit match class
 // in the .td files that matches the suffix instead of having it be
 // a literal string token the way it is now.
@@ -6844,14 +6874,15 @@ static void applyMnemonicAliases(StringRef &Mnemonic,
 // bail out, and let the assembly parser report an error on the instruction as
 // it is written.
 void ARMAsmParser::fixupGNULDRDAlias(StringRef Mnemonic,
-                                     OperandVector &Operands) {
+                                     OperandVector &Operands,
+                                     unsigned MnemonicOpsEndInd) {
   if (Mnemonic != "ldrd" && Mnemonic != "strd")
     return;
-  if (Operands.size() < 4)
+  if (Operands.size() < MnemonicOpsEndInd + 2)
     return;
 
-  ARMOperand &Op2 = static_cast<ARMOperand &>(*Operands[2]);
-  ARMOperand &Op3 = static_cast<ARMOperand &>(*Operands[3]);
+  ARMOperand &Op2 = static_cast<ARMOperand &>(*Operands[MnemonicOpsEndInd]);
+  ARMOperand &Op3 = static_cast<ARMOperand &>(*Operands[MnemonicOpsEndInd + 1]);
 
   if (!Op2.isReg())
     return;
@@ -6876,7 +6907,7 @@ void ARMAsmParser::fixupGNULDRDAlias(StringRef Mnemonic,
     return;
 
   Operands.insert(
-      Operands.begin() + 3,
+      Operands.begin() + MnemonicOpsEndInd + 1,
       ARMOperand::CreateReg(PairedReg, Op2.getStartLoc(), Op2.getEndLoc()));
 }
 
@@ -6886,19 +6917,17 @@ void ARMAsmParser::fixupGNULDRDAlias(StringRef Mnemonic,
 // operand. If the conversion fails an error is diagnosed, and the function
 // returns true.
 bool ARMAsmParser::CDEConvertDualRegOperand(StringRef Mnemonic,
-                                            OperandVector &Operands) {
+                                            OperandVector &Operands,
+                                            unsigned MnemonicOpsEndInd) {
   assert(MS.isCDEDualRegInstr(Mnemonic));
-  bool isPredicable =
-      Mnemonic == "cx1da" || Mnemonic == "cx2da" || Mnemonic == "cx3da";
-  size_t NumPredOps = isPredicable ? 1 : 0;
 
-  if (Operands.size() <= 3 + NumPredOps)
+  if (Operands.size() < 3 + MnemonicOpsEndInd)
     return false;
 
   StringRef Op2Diag(
       "operand must be an even-numbered register in the range [r0, r10]");
 
-  const MCParsedAsmOperand &Op2 = *Operands[2 + NumPredOps];
+  const MCParsedAsmOperand &Op2 = *Operands[MnemonicOpsEndInd + 1];
   if (!Op2.isReg())
     return Error(Op2.getStartLoc(), Op2Diag);
 
@@ -6933,16 +6962,43 @@ bool ARMAsmParser::CDEConvertDualRegOperand(StringRef Mnemonic,
     break;
   }
 
-  const MCParsedAsmOperand &Op3 = *Operands[3 + NumPredOps];
+  const MCParsedAsmOperand &Op3 = *Operands[MnemonicOpsEndInd + 2];
   if (!Op3.isReg() || Op3.getReg() != RNext)
     return Error(Op3.getStartLoc(), "operand must be a consecutive register");
 
-  Operands.erase(Operands.begin() + 3 + NumPredOps);
-  Operands[2 + NumPredOps] =
+  Operands.erase(Operands.begin() + MnemonicOpsEndInd + 2);
+  Operands[MnemonicOpsEndInd + 1] =
       ARMOperand::CreateReg(RPair, Op2.getStartLoc(), Op2.getEndLoc());
   return false;
 }
 
+void removeCondCode(OperandVector &Operands, unsigned &MnemonicOpsEndInd) {
+  for (unsigned I = 0; I < MnemonicOpsEndInd; ++I)
+    if (static_cast<ARMOperand &>(*Operands[I]).isCondCode()) {
+      Operands.erase(Operands.begin() + I);
+      --MnemonicOpsEndInd;
+      break;
+    }
+}
+
+void removeCCOut(OperandVector &Operands, unsigned &MnemonicOpsEndInd) {
+  for (unsigned I = 0; I < MnemonicOpsEndInd; ++I)
+    if (static_cast<ARMOperand &>(*Operands[I]).isCCOut()) {
+      Operands.erase(Operands.begin() + I);
+      --MnemonicOpsEndInd;
+      break;
+    }
+}
+
+void removeVPTCondCode(OperandVector &Operands, unsigned &MnemonicOpsEndInd) {
+  for (unsigned I = 0; I < MnemonicOpsEndInd; ++I)
+    if (static_cast<ARMOperand &>(*Operands[I]).isVPTPred()) {
+      Operands.erase(Operands.begin() + I);
+      --MnemonicOpsEndInd;
+      break;
+    }
+}
+
 /// Parse an arm instruction mnemonic followed by its operands.
 bool ARMAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
                                     SMLoc NameLoc, OperandVector &Operands) {
@@ -7055,14 +7111,14 @@ bool ARMAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
   }
 
   // Add the carry setting operand, if necessary.
-  if (CanAcceptCarrySet) {
+  if (CanAcceptCarrySet && CarrySetting) {
     SMLoc Loc = SMLoc::getFromPointer(NameLoc.getPointer() + Mnemonic.size());
     Operands.push_back(ARMOperand::CreateCCOut(CarrySetting ? ARM::CPSR : 0,
                                                Loc));
   }
 
   // Add the predication code operand, if necessary.
-  if (CanAcceptPredicationCode) {
+  if (CanAcceptPredicationCode && PredicationCode != llvm::ARMCC::AL) {
     SMLoc Loc = SMLoc::getFromPointer(NameLoc.getPointer() + Mnemonic.size() +
                                       CarrySetting);
     Operands.push_back(ARMOperand::CreateCondCode(
@@ -7070,14 +7126,9 @@ bool ARMAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
   }
 
   // Add the VPT predication code operand, if necessary.
-  // FIXME: We don't add them for the instructions filtered below as these can
-  // have custom operands which need special parsing.  This parsing requires
-  // the operand to be in the same place in the OperandVector as their
-  // definition in tblgen.  Since these instructions may also have the
-  // scalar predication operand we do not add the vector one and leave until
-  // now to fix it up.
-  if (CanAcceptVPTPredicationCode && Mnemonic != "vmov" &&
-      !Mnemonic.starts_with("vcmp") &&
+  // Dont add in certain cases of VCVT as this needs to be disambiguated
+  // after operand parsing.
+  if (CanAcceptVPTPredicationCode && VPTPredicationCode != llvm::ARMVCC::None &&
       !(Mnemonic.starts_with("vcvt") && Mnemonic != "vcvta" &&
         Mnemonic != "vcvtn" && Mnemonic != "vcvtp" && Mnemonic != "vcvtm")) {
     SMLoc Loc = SMLoc::getFromPointer(NameLoc.getPointer() + Mnemonic.size() +
@@ -7123,6 +7174,11 @@ bool ARMAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
     }
   }
 
+  // This marks the end of the LHS Mnemonic operators.
+  // This is used for indexing into the non-menmonic operators as some of the
+  // mnemonic operators are optional and therfore indexes can differ.
+  unsigned MnemonicOpsEndInd = Operands.size();
+
   // Read the remaining operands.
   if (getLexer().isNot(AsmToken::EndOfStatement)) {
     // Read the first operand.
@@ -7141,7 +7197,8 @@ bool ARMAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
   if (parseToken(AsmToken::EndOfStatement, "unexpected token in argument list"))
     return true;
 
-  tryConvertingToTwoOperandForm(Mnemonic, CarrySetting, Operands);
+  tryConvertingToTwoOperandForm(Mnemonic, PredicationCode, CarrySetting,
+                                Operands, MnemonicOpsEndInd);
 
   if (hasCDE() && MS.isCDEInstr(Mnemonic)) {
     // Dual-register instructions use even-odd register pairs as their
@@ -7152,33 +7209,16 @@ bool ARMAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
     // returns false, the function either succeeded or an error (e.g. missing
     // operand) will be diagnosed elsewhere.
     if (MS.isCDEDualRegInstr(Mnemonic)) {
-      bool GotError = CDEConvertDualRegOperand(Mnemonic, Operands);
+      bool GotError =
+          CDEConvertDualRegOperand(Mnemonic, Operands, MnemonicOpsEndInd);
       if (GotError)
         return GotError;
     }
   }
 
-  // Some instructions, mostly Thumb, have forms for the same mnemonic that
-  // do and don't have a cc_out optional-def operand. With some spot-checks
-  // of the operand list, we can figure out which variant we're trying to
-  // parse and adjust accordingly before actually matching. We shouldn't ever
-  // try to remove a cc_out operand that was explicitly set on the
-  // mnemonic, of course (CarrySetting == true). Reason number #317 the
-  // table driven matcher doesn't fit well with the ARM instruction set.
-  if (!CarrySetting && shouldOmitCCOutOperand(Mnemonic, Operands))
-    Operands.erase(Operands.begin() + 1);
-
-  // Some instructions have the same mnemonic, but don't always
-  // have a predicate. Distinguish them here and delete the
-  // appropriate predicate if needed.  This could be either the scalar
-  // predication code or the vector predication code.
-  if (PredicationCode == ARMCC::AL &&
-      shouldOmitPredicateOperand(Mnemonic, Operands))
-    Operands.erase(Operands.begin() + 1);
-
-
   if (hasMVE()) {
-    if (!shouldOmitVectorPredicateOperand(Mnemonic, Operands) &&
+    if (!shouldOmitVectorPredicateOperand(Mnemonic, Operands,
+                                          MnemonicOpsEndInd) &&
         Mnemonic == "vmov" && PredicationCode == ARMCC::LT) {
       // Very nasty hack to deal with the vector predicated variant of vmovlt
       // the scalar predicated vmov with condition 'lt'.  We can not tell them
@@ -7193,7 +7233,8 @@ bool ARMAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
       Operands.insert(Operands.begin(),
                       ARMOperand::CreateToken(StringRef("vmovlt"), MLoc));
     } else if (Mnemonic == "vcvt" && PredicationCode == ARMCC::NE &&
-               !shouldOmitVectorPredicateOperand(Mnemonic, Operands)) {
+               !shouldOmitVectorPredicateOperand(Mnemonic, Operands,
+                                                 MnemonicOpsEndInd)) {
       // Another nasty hack to deal with the ambiguity between vcvt with scalar
       // predication 'ne' and vcvtn with vector predication 'e'.  As above we
       // can only distinguish between the two after we have parsed their
@@ -7208,36 +7249,33 @@ bool ARMAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
       Operands.insert(Operands.begin(),
                       ARMOperand::CreateToken(StringRef("vcvtn"), MLoc));
     } else if (Mnemonic == "vmul" && PredicationCode == ARMCC::LT &&
-               !shouldOmitVectorPredicateOperand(Mnemonic, Operands)) {
+               !shouldOmitVectorPredicateOperand(Mnemonic, Operands,
+                                                 MnemonicOpsEndInd)) {
       // Another hack, this time to distinguish between scalar predicated vmul
       // with 'lt' predication code and the vector instruction vmullt with
       // vector predication code "none"
-      Operands.erase(Operands.begin() + 1);
+      removeCondCode(Operands, MnemonicOpsEndInd);
       Operands.erase(Operands.begin());
       SMLoc MLoc = SMLoc::getFromPointer(NameLoc.getPointer());
       Operands.insert(Operands.begin(),
                       ARMOperand::CreateToken(StringRef("vmullt"), MLoc));
-    }
-    // For vmov and vcmp, as mentioned earlier, we did not add the vector
-    // predication code, since these may contain operands that require
-    // special parsing.  So now we have to see if they require vector
-    // predication and replace the scalar one with the vector predication
-    // operand if that is the case.
-    else if (Mnemonic == "vmov" || Mnemonic.starts_with("vcmp") ||
-             (Mnemonic.starts_with("vcvt") && !Mnemonic.starts_with("vcvta") &&
-              !Mnemonic.starts_with("vcvtn") &&
-              !Mnemonic.starts_with("vcvtp") &&
-              !Mnemonic.starts_with("vcvtm"))) {
-      if (!shouldOmitVectorPredicateOperand(Mnemonic, Operands)) {
+    } else if (Mnemonic.starts_with("vcvt") && !Mnemonic.starts_with("vcvta") &&
+               !Mnemonic.starts_with("vcvtn") &&
+               !Mnemonic.starts_with("vcvtp") &&
+               !Mnemonic.starts_with("vcvtm")) {
+      if (!shouldOmitVectorPredicateOperand(Mnemonic, Operands,
+                                            MnemonicOpsEndInd)) {
         // We could not split the vector predicate off vcvt because it might
         // have been the scalar vcvtt instruction.  Now we know its a vector
         // instruction, we still need to check whether its the vector
         // predicated vcvt with 'Then' predication or the vector vcvtt.  We can
         // distinguish the two based on the suffixes, if it is any of
         // ".f16.f32", ".f32.f16", ".f16.f64" or ".f64.f16" then it is the vcvtt.
-        if (Mnemonic.starts_with("vcvtt") && Operands.size() >= 4) {
-          auto Sz1 = static_cast<ARMOperand &>(*Operands[2]);
-          auto Sz2 = static_cast<ARMOperand &>(*Operands[3]);
+        if (Mnemonic.starts_with("vcvtt") && MnemonicOpsEndInd > 2) {
+          auto Sz1 =
+              static_cast<ARMOperand &>(*Operands[MnemonicOpsEndInd - 2]);
+          auto Sz2 =
+              static_cast<ARMOperand &>(*Operands[MnemonicOpsEndInd - 1]);
           if (!(Sz1.isToken() && Sz1.getToken().starts_with(".f") &&
                 Sz2.isToken() && Sz2.getToken().starts_with(".f"))) {
             Operands.erase(Operands.begin());
@@ -7249,24 +7287,21 @@ bool ARMAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
                             ARMOperand::CreateToken(Mnemonic, MLoc));
           }
         }
-        Operands.erase(Operands.begin() + 1);
         SMLoc PLoc = SMLoc::getFromPointer(NameLoc.getPointer() +
                                           Mnemonic.size() + CarrySetting);
+        // Add VPTPred
         Operands.insert(Operands.begin() + 1,
                         ARMOperand::CreateVPTPred(
                             ARMVCC::VPTCodes(VPTPredicationCode), PLoc));
+        ++MnemonicOpsEndInd;
       }
     } else if (CanAcceptVPTPredicationCode) {
       // For all other instructions, make sure only one of the two
       // predication operands is left behind, depending on whether we should
       // use the vector predication.
-      if (shouldOmitVectorPredicateOperand(Mnemonic, Operands)) {
-        if (CanAcceptPredicationCode)
-          Operands.erase(Operands.begin() + 2);
-        else
-          Operands.erase(Operands.begin() + 1);
-      } else if (CanAcceptPredicationCode && PredicationCode == ARMCC::AL) {
-        Operands.erase(Operands.begin() + 1);
+      if (shouldOmitVectorPredicateOperand(Mnemonic, Operands,
+                                           MnemonicOpsEndInd)) {
+        removeVPTCondCode(Operands, MnemonicOpsEndInd);
       }
     }
   }
@@ -7291,69 +7326,73 @@ bool ARMAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
     }
   }
 
-    // ARM mode 'blx' need special handling, as the register operand version
-    // is predicable, but the label operand version is not. So, we can't rely
-    // on the Mnemonic based checking to correctly figure out when to put
-    // a k_CondCode operand in the list. If we're trying to match the label
-    // version, remove the k_CondCode operand here.
-    if (!isThumb() && Mnemonic == "blx" && Operands.size() == 3 &&
-        static_cast<ARMOperand &>(*Operands[2]).isImm())
-      Operands.erase(Operands.begin() + 1);
-
-    // Adjust operands of ldrexd/strexd to MCK_GPRPair.
-    // ldrexd/strexd require even/odd GPR pair. To enforce this constraint,
-    // a single GPRPair reg operand is used in the .td file to replace the two
-    // GPRs. However, when parsing from asm, the two GRPs cannot be
-    // automatically
-    // expressed as a GPRPair, so we have to manually merge them.
-    // FIXME: We would really like to be able to tablegen'erate this.
-    if (!isThumb() && Operands.size() > 4 &&
-        (Mnemonic == "ldrexd" || Mnemonic == "strexd" || Mnemonic == "ldaexd" ||
-         Mnemonic == "stlexd")) {
-      bool isLoad = (Mnemonic == "ldrexd" || Mnemonic == "ldaexd");
-      unsigned Idx = isLoad ? 2 : 3;
-      ARMOperand &Op1 = static_cast<ARMOperand &>(*Operands[Idx]);
-      ARMOperand &Op2 = static_cast<ARMOperand &>(*Operands[Idx + 1]);
-
-      const MCRegisterClass &MRC = MRI->getRegClass(ARM::GPRRegClassID);
-      // Adjust only if Op1 and Op2 are GPRs.
-      if (Op1.isReg() && Op2.isReg() && MRC.contains(Op1.getReg()) &&
-          MRC.contains(Op2.getReg())) {
-        unsigned Reg1 = Op1.getReg();
-        unsigned Reg2 = Op2.getReg();
-        unsigned Rt = MRI->getEncodingValue(Reg1);
-        unsigned Rt2 = MRI->getEncodingValue(Reg2);
-
-        // Rt2 must be Rt + 1 and Rt must be even.
-        if (Rt + 1 != Rt2 || (Rt & 1)) {
-          return Error(Op2.getStartLoc(),
-                       isLoad ? "destination operands must be sequential"
-                              : "source operands must be sequential");
-        }
-        unsigned NewReg = MRI->getMatchingSuperReg(
-            Reg1, ARM::gsub_0, &(MRI->getRegClass(ARM::GPRPairRegClassID)));
-        Operands[Idx] =
-            ARMOperand::CreateReg(NewReg, Op1.getStartLoc(), Op2.getEndLoc());
-        Operands.erase(Operands.begin() + Idx + 1);
+  // ARM mode 'blx' need special handling, as the register operand version
+  // is predicable, but the label operand version is not. So, we can't rely
+  // on the Mnemonic based checking to correctly figure out when to put
+  // a k_CondCode operand in the list. If we're trying to match the label
+  // version, remove the k_CondCode operand here.
+  if (!isThumb() && Mnemonic == "blx" &&
+      Operands.size() == MnemonicOpsEndInd + 1 &&
+      static_cast<ARMOperand &>(*Operands[MnemonicOpsEndInd]).isImm())
+    removeCondCode(Operands, MnemonicOpsEndInd);
+
+  // Adjust operands of ldrexd/strexd to MCK_GPRPair.
+  // ldrexd/strexd require even/odd GPR pair. To enforce this constraint,
+  // a single GPRPair reg operand is used in the .td file to replace the two
+  // GPRs. However, when parsing from asm, the two GRPs cannot be
+  // automatically
+  // expressed as a GPRPair, so we have to manually merge them.
+  // FIXME: We would really like to be able to tablegen'erate this.
+  if (!isThumb() && Operands.size() > MnemonicOpsEndInd + 1 &&
+      (Mnemonic == "ldrexd" || Mnemonic == "strexd" || Mnemonic == "ldaexd" ||
+       Mnemonic == "stlexd")) {
+    bool isLoad = (Mnemonic == "ldrexd" || Mnemonic == "ldaexd");
+    unsigned Idx = isLoad ? MnemonicOpsEndInd : MnemonicOpsEndInd + 1;
+    ARMOperand &Op1 = static_cast<ARMOperand &>(*Operands[Idx]);
+    ARMOperand &Op2 = static_cast<ARMOperand &>(*Operands[Idx + 1]);
+
+    const MCRegisterClass &MRC = MRI->getRegClass(ARM::GPRRegClassID);
+    // Adjust only if Op1 and Op2 are GPRs.
+    if (Op1.isReg() && Op2.isReg() && MRC.contains(Op1.getReg()) &&
+        MRC.contains(Op2.getReg())) {
+      unsigned Reg1 = Op1.getReg();
+      unsigned Reg2 = Op2.getReg();
+      unsigned Rt = MRI->getEncodingValue(Reg1);
+      unsigned Rt2 = MRI->getEncodingValue(Reg2);
+
+      // Rt2 must be Rt + 1 and Rt must be even.
+      if (Rt + 1 != Rt2 || (Rt & 1)) {
+        return Error(Op2.getStartLoc(),
+                     isLoad ? "destination operands must be sequential"
+                            : "source operands must be sequential");
       }
+      unsigned NewReg = MRI->getMatchingSuperReg(
+          Reg1, ARM::gsub_0, &(MRI->getRegClass(ARM::GPRPairRegClassID)));
+      Operands[Idx] =
+          ARMOperand::CreateReg(NewReg, Op1.getStartLoc(), Op2.getEndLoc());
+      Operands.erase(Operands.begin() + Idx + 1);
+    }
   }
 
   // GNU Assembler extension (compatibility).
-  fixupGNULDRDAlias(Mnemonic, Operands);
+  fixupGNULDRDAlias(Mnemonic, Operands, MnemonicOpsEndInd);
 
   // FIXME: As said above, this is all a pretty gross hack.  This instruction
   // does not fit with other "subs" and tblgen.
   // Adjust operands of B9.3.19 SUBS PC, LR, #imm (Thumb2) system instruction
   // so the Mnemonic is the original name "subs" and delete the predicate
   // operand so it will match the table entry.
-  if (isThumbTwo() && Mnemonic == "sub" && Operands.size() == 6 &&
-      static_cast<ARMOperand &>(*Operands[3]).isReg() &&
-      static_cast<ARMOperand &>(*Operands[3]).getReg() == ARM::PC &&
-      static_cast<ARMOperand &>(*Operands[4]).isReg() &&
-      static_cast<ARMOperand &>(*Operands[4]).getReg() == ARM::LR &&
-      static_cast<ARMOperand &>(*Operands[5]).isImm()) {
+  if (isThumbTwo() && Mnemonic == "sub" &&
+      Operands.size() == MnemonicOpsEndInd + 3 &&
+      static_cast<ARMOperand &>(*Operands[MnemonicOpsEndInd]).isReg() &&
+      static_cast<ARMOperand &>(*Operands[MnemonicOpsEndInd]).getReg() ==
+          ARM::PC &&
+      static_cast<ARMOperand &>(*Operands[MnemonicOpsEndInd + 1]).isReg() &&
+      static_cast<ARMOperand &>(*Operands[MnemonicOpsEndInd + 1]).getReg() ==
+          ARM::LR &&
+      static_cast<ARMOperand &>(*Operands[MnemonicOpsEndInd + 2]).isImm()) {
     Operands.front() = ARMOperand::CreateToken(Name, NameLoc);
-    Operands.erase(Operands.begin() + 1);
+    removeCCOut(Operands, MnemonicOpsEndInd);
   }
   return false;
 }
@@ -7398,49 +7437,61 @@ static bool instIsBreakpoint(const MCInst &Inst) {
            Inst.getOpcode() == ARM::HLT;
 }
 
+unsigned getRegListInd(const OperandVector &Operands,
+                       unsigned MnemonicOpsEndInd) {
+  for (unsigned I = MnemonicOpsEndInd; I < Operands.size(); ++I) {
+    const ARMOperand &Op = static_cast<const ARMOperand &>(*Operands[I]);
+    if (Op.isRegList()) {
+      return I;
+    }
+  }
+  return 0;
+}
+
 bool ARMAsmParser::validatetLDMRegList(const MCInst &Inst,
                                        const OperandVector &Operands,
-                                       unsigned ListNo, bool IsARPop) {
-  const ARMOperand &Op = static_cast<const ARMOperand &>(*Operands[ListNo]);
-  bool HasWritebackToken = Op.isToken() && Op.getToken() == "!";
-
-  bool ListContainsSP = listContainsReg(Inst, ListNo, ARM::SP);
-  bool ListContainsLR = listContainsReg(Inst, ListNo, ARM::LR);
-  bool ListContainsPC = listContainsReg(Inst, ListNo, ARM::PC);
+                                       unsigned MnemonicOpsEndInd,
+                                       unsigned ListIndex, bool IsARPop) {
+  bool ListContainsSP = listContainsReg(Inst, ListIndex, ARM::SP);
+  bool ListContainsLR = listContainsReg(Inst, ListIndex, ARM::LR);
+  bool ListContainsPC = listContainsReg(Inst, ListIndex, ARM::PC);
 
   if (!IsARPop && ListContainsSP)
-    return Error(Operands[ListNo + HasWritebackToken]->getStartLoc(),
-                 "SP may not be in the register list");
-  else if (ListContainsPC && ListContainsLR)
-    return Error(Operands[ListNo + HasWritebackToken]->getStartLoc(),
-                 "PC and LR may not be in the register list simultaneously");
+    return Error(
+        Operands[getRegListInd(Operands, MnemonicOpsEndInd)]->getStartLoc(),
+        "SP may not be in the register list");
+  if (ListContainsPC && ListContainsLR)
+    return Error(
+        Operands[getRegListInd(Operands, MnemonicOpsEndInd)]->getStartLoc(),
+        "PC and LR may not be in the register list simultaneously");
   return false;
 }
 
 bool ARMAsmParser::validatetSTMRegList(const MCInst &Inst,
                                        const OperandVector &Operands,
-                                       unsigned ListNo) {
-  const ARMOperand &Op = static_cast<const ARMOperand &>(*Operands[ListNo]);
-  bool HasWritebackToken = Op.isToken() && Op.getToken() == "!";
-
-  bool ListContainsSP = listContainsReg(Inst, ListNo, ARM::SP);
-  bool ListContainsPC = listContainsReg(Inst, ListNo, ARM::PC);
+                                       unsigned MnemonicOpsEndInd,
+                                       unsigned ListIndex) {
+  bool ListContainsSP = listContainsReg(Inst, ListIndex, ARM::SP);
+  bool ListContainsPC = listContainsReg(Inst, ListIndex, ARM::PC);
 
   if (ListContainsSP && ListContainsPC)
-    return Error(Operands[ListNo + HasWritebackToken]->getStartLoc(),
-                 "SP and PC may not be in the register list");
-  else if (ListContainsSP)
-    return Error(Operands[ListNo + HasWritebackToken]->getStartLoc(),
-                 "SP may not be in the register list");
-  else if (ListContainsPC)
-    return Error(Operands[ListNo + HasWritebackToken]->getStartLoc(),
-                 "PC may not be in the register list");
+    return Error(
+        Operands[getRegListInd(Operands, MnemonicOpsEndInd)]->getStartLoc(),
+        "SP and PC may not be in the register list");
+  if (ListContainsSP)
+    return Error(
+        Operands[getRegListInd(Operands, MnemonicOpsEndInd)]->getStartLoc(),
+        "SP may not be in the register list");
+  if (ListContainsPC)
+    return Error(
+        Operands[getRegListInd(Operands, MnemonicOpsEndInd)]->getStartLoc(),
+        "PC may not be in the register list");
   return false;
 }
 
-bool ARMAsmParser::validateLDRDSTRD(MCInst &Inst,
-                                    const OperandVector &Operands,
-                                    bool Load, bool ARMMode, bool Writeback) {
+bool ARMAsmParser::validateLDRDSTRD(MCInst &Inst, const OperandVector &Operands,
+                                    bool Load, bool ARMMode, bool Writeback,
+                                    unsigned MnemonicOpsEndInd) {
   unsigned RtIndex = Load || !Writeback ? 0 : 1;
   unsigned Rt = MRI->getEncodingValue(Inst.getOperand(RtIndex).getReg());
   unsigned Rt2 = MRI->getEncodingValue(Inst.getOperand(RtIndex + 1).getReg());
@@ -7448,21 +7499,21 @@ bool ARMAsmParser::validateLDRDSTRD(MCInst &Inst,
   if (ARMMode) {
     // Rt can't be R14.
     if (Rt == 14)
-      return Error(Operands[3]->getStartLoc(),
-                  "Rt can't be R14");
+      return Error(Operands[MnemonicOpsEndInd]->getStartLoc(),
+                   "Rt can't be R14");
 
     // Rt must be even-numbered.
     if ((Rt & 1) == 1)
-      return Error(Operands[3]->getStartLoc(),
+      return Error(Operands[MnemonicOpsEndInd]->getStartLoc(),
                    "Rt must be even-numbered");
 
     // Rt2 must be Rt + 1.
     if (Rt2 != Rt + 1) {
       if (Load)
-        return Error(Operands[3]->getStartLoc(),
+        return Error(Operands[MnemonicOpsEndInd]->getStartLoc(),
                      "destination operands must be sequential");
       else
-        return Error(Operands[3]->getStartLoc(),
+        return Error(Operands[MnemonicOpsEndInd]->getStartLoc(),
                      "source operands must be sequential");
     }
 
@@ -7472,7 +7523,7 @@ bool ARMAsmParser::validateLDRDSTRD(MCInst &Inst,
 
   if (!ARMMode && Load) {
     if (Rt2 == Rt)
-      return Error(Operands[3]->getStartLoc(),
+      return Error(Operands[MnemonicOpsEndInd]->getStartLoc(),
                    "destination operands can't be identical");
   }
 
@@ -7481,11 +7532,11 @@ bool ARMAsmParser::validateLDRDSTRD(MCInst &Inst,
 
     if (Rn == Rt || Rn == Rt2) {
       if (Load)
-        return Error(Operands[3]->getStartLoc(),
+        return Error(Operands[MnemonicOpsEndInd]->getStartLoc(),
                      "base register needs to be different from destination "
                      "registers");
       else
-        return Error(Operands[3]->getStartLoc(),
+        return Error(Operands[MnemonicOpsEndInd + 2]->getStartLoc(),
                      "source register and base register can't be identical");
     }
 
@@ -7523,7 +7574,8 @@ static bool isARMMCExpr(MCParsedAsmOperand &MCOp) {
 
 // FIXME: We would really like to be able to tablegen'erate this.
 bool ARMAsmParser::validateInstruction(MCInst &Inst,
-                                       const OperandVector &Operands) {
+                                       const OperandVector &Operands,
+                                       unsigned MnemonicOpsEndInd) {
   const MCInstrDesc &MCID = MII.get(Inst.getOpcode());
   SMLoc Loc = Operands[0]->getStartLoc();
 
@@ -7538,7 +7590,7 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst,
         Inst.getOperand(MCID.findFirstPredOperandIdx()).getImm());
     if (Cond != currentITCond()) {
       // Find the condition code Operand to get its SMLoc information.
-      SMLoc CondLoc;
+      SMLoc CondLoc = Operands[0]->getEndLoc();
       for (unsigned I = 1; I < Operands.size(); ++I)
         if (static_cast<ARMOperand &>(*Operands[I]).isCondCode())
           CondLoc = Operands[I]->getStartLoc();
@@ -7608,9 +7660,10 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst,
   case ARM::VLSTM_T2: {
     // Since in some cases both T1 and T2 are valid, tablegen can not always
     // pick the correct instruction.
-    if (Operands.size() == 4) { // a register list has been provided
+    if (Operands.size() ==
+        MnemonicOpsEndInd + 2) { // a register list has been provided
       ARMOperand &Op = static_cast<ARMOperand &>(
-          *Operands[3]); // the register list, a dpr_reglist
+          *Operands[MnemonicOpsEndInd + 1]); // the register list, a dpr_reglist
       assert(Op.isDPRRegList());
       auto &RegList = Op.getRegList();
       // T2 requires v8.1-M.Main (cannot be handled by tablegen)
@@ -7644,50 +7697,50 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst,
     break;
   }
   case ARM::LDRD:
-    if (validateLDRDSTRD(Inst, Operands, /*Load*/true, /*ARMMode*/true,
-                         /*Writeback*/false))
+    if (validateLDRDSTRD(Inst, Operands, /*Load*/ true, /*ARMMode*/ true,
+                         /*Writeback*/ false, MnemonicOpsEndInd))
       return true;
     break;
   case ARM::LDRD_PRE:
   case ARM::LDRD_POST:
-    if (validateLDRDSTRD(Inst, Operands, /*Load*/true, /*ARMMode*/true,
-                         /*Writeback*/true))
+    if (validateLDRDSTRD(Inst, Operands, /*Load*/ true, /*ARMMode*/ true,
+                         /*Writeback*/ true, MnemonicOpsEndInd))
       return true;
     break;
   case ARM::t2LDRDi8:
-    if (validateLDRDSTRD(Inst, Operands, /*Load*/true, /*ARMMode*/false,
-                         /*Writeback*/false))
+    if (validateLDRDSTRD(Inst, Operands, /*Load*/ true, /*ARMMode*/ false,
+                         /*Writeback*/ false, MnemonicOpsEndInd))
       return true;
     break;
   case ARM::t2LDRD_PRE:
   case ARM::t2LDRD_POST:
-    if (validateLDRDSTRD(Inst, Operands, /*Load*/true, /*ARMMode*/false,
-                         /*Writeback*/true))
+    if (validateLDRDSTRD(Inst, Operands, /*Load*/ true, /*ARMMode*/ false,
+                         /*Writeback*/ true, MnemonicOpsEndInd))
       return true;
     break;
   case ARM::t2BXJ: {
     const unsigned RmReg = Inst.getOperand(0).getReg();
     // Rm = SP is no longer unpredictable in v8-A
     if (RmReg == ARM::SP && !hasV8Ops())
-      return Error(Operands[2]->getStartLoc(),
+      return Error(Operands[MnemonicOpsEndInd]->getStartLoc(),
                    "r13 (SP) is an unpredictable operand to BXJ");
     return false;
   }
   case ARM::STRD:
-    if (validateLDRDSTRD(Inst, Operands, /*Load*/false, /*ARMMode*/true,
-                         /*Writeback*/false))
+    if (validateLDRDSTRD(Inst, Operands, /*Load*/ false, /*ARMMode*/ true,
+                         /*Writeback*/ false, MnemonicOpsEndInd))
       return true;
     break;
   case ARM::STRD_PRE:
   case ARM::STRD_POST:
-    if (validateLDRDSTRD(Inst, Operands, /*Load*/false, /*ARMMode*/true,
-                         /*Writeback*/true))
+    if (validateLDRDSTRD(Inst, Operands, /*Load*/ false, /*ARMMode*/ true,
+                         /*Writeback*/ true, MnemonicOpsEndInd))
       return true;
     break;
   case ARM::t2STRD_PRE:
   case ARM::t2STRD_POST:
-    if (validateLDRDSTRD(Inst, Operands, /*Load*/false, /*ARMMode*/false,
-                         /*Writeback*/true))
+    if (validateLDRDSTRD(Inst, Operands, /*Load*/ false, /*ARMMode*/ false,
+                         /*Writeback*/ true, MnemonicOpsEndInd))
       return true;
     break;
   case ARM::STR_PRE_IMM:
@@ -7711,7 +7764,7 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst,
     const unsigned Rn = MRI->getEncodingValue(Inst.getOperand(2).getReg());
 
     if (Rt == Rn)
-      return Error(Operands[3]->getStartLoc(),
+      return Error(Operands[MnemonicOpsEndInd + 1]->getStartLoc(),
                    "source register and base register can't be identical");
     return false;
   }
@@ -7724,19 +7777,19 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst,
     const unsigned Rn = MRI->getEncodingValue(Inst.getOperand(1).getReg());
 
     if (Rt == Rn)
-      return Error(Operands[3]->getStartLoc(),
+      return Error(Operands[MnemonicOpsEndInd]->getStartLoc(),
                    "destination register and base register can't be identical");
     if (Inst.getOpcode() == ARM::t2LDR_POST_imm ||
         Inst.getOpcode() == ARM::t2STR_POST_imm) {
       int Imm = Inst.getOperand(2).getImm();
       if (Imm > 255 || Imm < -255)
-        return Error(Operands[5]->getStartLoc(),
+        return Error(Operands[MnemonicOpsEndInd + 2]->getStartLoc(),
                      "operand must be in range [-255, 255]");
     }
     if (Inst.getOpcode() == ARM::t2STR_PRE_imm ||
         Inst.getOpcode() == ARM::t2STR_POST_imm) {
       if (Inst.getOperand(0).getReg() == ARM::PC) {
-        return Error(Operands[3]->getStartLoc(),
+        return Error(Operands[MnemonicOpsEndInd]->getStartLoc(),
                      "operand must be a register in range [r0, r14]");
       }
     }
@@ -7755,17 +7808,17 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst,
         Inst.getOpcode() == ARM::t2STRB_PRE_imm) {
       int Imm = Inst.getOperand(2).getImm();
       if (Imm > 255 || Imm < -255)
-        return Error(Operands[5]->getStartLoc(),
+        return Error(Operands[MnemonicOpsEndInd + 2]->getStartLoc(),
                      "operand must be in range [-255, 255]");
     } else if (Inst.getOpcode() == ARM::t2LDRB_OFFSET_imm ||
                Inst.getOpcode() == ARM::t2STRB_OFFSET_imm) {
       int Imm = Inst.getOperand(2).getImm();
       if (Imm > 0 || Imm < -255)
-        return Error(Operands[5]->getStartLoc(),
+        return Error(Operands[MnemonicOpsEndInd + 2]->getStartLoc(),
                      "operand must be in range [0, 255] with a negative sign");
     }
     if (Inst.getOperand(0).getReg() == ARM::PC) {
-      return Error(Operands[3]->getStartLoc(),
+      return Error(Operands[MnemonicOpsEndInd]->getStartLoc(),
                    "if operand is PC, should call the LDRB (literal)");
     }
     return false;
@@ -7783,17 +7836,17 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst,
         Inst.getOpcode() == ARM::t2STRH_PRE_imm) {
       int Imm = Inst.getOperand(2).getImm();
       if (Imm > 255 || Imm < -255)
-        return Error(Operands[5]->getStartLoc(),
+        return Error(Operands[MnemonicOpsEndInd + 2]->getStartLoc(),
                      "operand must be in range [-255, 255]");
     } else if (Inst.getOpcode() == ARM::t2LDRH_OFFSET_imm ||
                Inst.getOpcode() == ARM::t2STRH_OFFSET_imm) {
       int Imm = Inst.getOperand(2).getImm();
       if (Imm > 0 || Imm < -255)
-        return Error(Operands[5]->getStartLoc(),
+        return Error(Operands[MnemonicOpsEndInd + 2]->getStartLoc(),
                      "operand must be in range [0, 255] with a negative sign");
     }
     if (Inst.getOperand(0).getReg() == ARM::PC) {
-      return Error(Operands[3]->getStartLoc(),
+      return Error(Operands[MnemonicOpsEndInd]->getStartLoc(),
                    "if operand is PC, should call the LDRH (literal)");
     }
     return false;
@@ -7806,16 +7859,16 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst,
         Inst.getOpcode() == ARM::t2LDRSB_PRE_imm) {
       int Imm = Inst.getOperand(2).getImm();
       if (Imm > 255 || Imm < -255)
-        return Error(Operands[5]->getStartLoc(),
+        return Error(Operands[MnemonicOpsEndInd + 2]->getStartLoc(),
                      "operand must be in range [-255, 255]");
     } else if (Inst.getOpcode() == ARM::t2LDRSB_OFFSET_imm) {
       int Imm = Inst.getOperand(2).getImm();
       if (Imm > 0 || Imm < -255)
-        return Error(Operands[5]->getStartLoc(),
+        return Error(Operands[MnemonicOpsEndInd + 2]->getStartLoc(),
                      "operand must be in range [0, 255] with a negative sign");
     }
     if (Inst.getOperand(0).getReg() == ARM::PC) {
-      return Error(Operands[3]->getStartLoc(),
+      return Error(Operands[MnemonicOpsEndInd + 2]->getStartLoc(),
                    "if operand is PC, should call the LDRH (literal)");
     }
     return false;
@@ -7828,16 +7881,16 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst,
         Inst.getOpcode() == ARM::t2LDRSH_PRE_imm) {
       int Imm = Inst.getOperand(2).getImm();
       if (Imm > 255 || Imm < -255)
-        return Error(Operands[5]->getStartLoc(),
+        return Error(Operands[MnemonicOpsEndInd + 2]->getStartLoc(),
                      "operand must be in range [-255, 255]");
     } else if (Inst.getOpcode() == ARM::t2LDRSH_OFFSET_imm) {
       int Imm = Inst.getOperand(2).getImm();
       if (Imm > 0 || Imm < -255)
-        return Error(Operands[5]->getStartLoc(),
+        return Error(Operands[MnemonicOpsEndInd + 2]->getStartLoc(),
                      "operand must be in range [0, 255] with a negative sign");
     }
     if (Inst.getOperand(0).getReg() == ARM::PC) {
-      return Error(Operands[3]->getStartLoc(),
+      return Error(Operands[MnemonicOpsEndInd]->getStartLoc(),
                    "if operand is PC, should call the LDRH (literal)");
     }
     return false;
@@ -7872,7 +7925,7 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst,
     const unsigned Rn = MRI->getEncodingValue(Inst.getOperand(2).getReg());
 
     if (Rt == Rn)
-      return Error(Operands[3]->getStartLoc(),
+      return Error(Operands[MnemonicOpsEndInd]->getStartLoc(),
                    "destination register and base register can't be identical");
     return false;
   }
@@ -7916,10 +7969,10 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst,
     const unsigned Qm = MRI->getEncodingValue(Inst.getOperand(QmIdx).getReg());
 
     if (Qd == Qm) {
-      return Error(Operands[3]->getStartLoc(),
+      return Error(Operands[MnemonicOpsEndInd]->getStartLoc(),
                    Twine("destination vector register and vector ") +
-                   (QmIsPointer ? "pointer" : "offset") +
-                   " register can't be identical");
+                       (QmIsPointer ? "pointer" : "offset") +
+                       " register can't be identical");
     }
     return false;
   }
@@ -7932,7 +7985,7 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst,
     unsigned LSB = Inst.getOperand(2).getImm();
     unsigned Widthm1 = Inst.getOperand(3).getImm();
     if (Widthm1 >= 32 - LSB)
-      return Error(Operands[5]->getStartLoc(),
+      return Error(Operands[MnemonicOpsEndInd + 2]->getStartLoc(),
                    "bitfield width must be in range [1,32-lsb]");
     return false;
   }
@@ -7946,24 +7999,29 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst,
     // in the register list.
     unsigned Rn = Inst.getOperand(0).getReg();
     bool HasWritebackToken =
-        (static_cast<ARMOperand &>(*Operands[3]).isToken() &&
-         static_cast<ARMOperand &>(*Operands[3]).getToken() == "!");
+        (static_cast<ARMOperand &>(*Operands[MnemonicOpsEndInd + 1])
+             .isToken() &&
+         static_cast<ARMOperand &>(*Operands[MnemonicOpsEndInd + 1])
+                 .getToken() == "!");
+
     bool ListContainsBase;
     if (checkLowRegisterList(Inst, 3, Rn, 0, ListContainsBase) && !isThumbTwo())
-      return Error(Operands[3 + HasWritebackToken]->getStartLoc(),
-                   "registers must be in range r0-r7");
+      return Error(
+          Operands[getRegListInd(Operands, MnemonicOpsEndInd)]->getStartLoc(),
+          "registers must be in range r0-r7");
     // If we should have writeback, then there should be a '!' token.
     if (!ListContainsBase && !HasWritebackToken && !isThumbTwo())
-      return Error(Operands[2]->getStartLoc(),
-                   "writeback operator '!' expected");
+      return Error(
+          Operands[getRegListInd(Operands, MnemonicOpsEndInd)]->getStartLoc(),
+          "writeback operator '!' expected");
     // If we should not have writeback, there must not be a '!'. This is
     // true even for the 32-bit wide encodings.
     if (ListContainsBase && HasWritebackToken)
-      return Error(Operands[3]->getStartLoc(),
+      return Error(Operands[MnemonicOpsEndInd + 1]->getStartLoc(),
                    "writeback operator '!' not allowed when base register "
                    "in register list");
 
-    if (validatetLDMRegList(Inst, Operands, 3))
+    if (validatetLDMRegList(Inst, Operands, MnemonicOpsEndInd, 3))
       return true;
     break;
   }
@@ -7981,12 +8039,12 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst,
     break;
   case ARM::t2LDMIA:
   case ARM::t2LDMDB:
-    if (validatetLDMRegList(Inst, Operands, 3))
+    if (validatetLDMRegList(Inst, Operands, MnemonicOpsEndInd, 3))
       return true;
     break;
   case ARM::t2STMIA:
   case ARM::t2STMDB:
-    if (validatetSTMRegList(Inst, Operands, 3))
+    if (validatetSTMRegList(Inst, Operands, MnemonicOpsEndInd, 3))
       return true;
     break;
   case ARM::t2LDMIA_UPD:
@@ -7998,10 +8056,10 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst,
                    "writeback register not allowed in register list");
 
     if (Opcode == ARM::t2LDMIA_UPD || Opcode == ARM::t2LDMDB_UPD) {
-      if (validatetLDMRegList(Inst, Operands, 3))
+      if (validatetLDMRegList(Inst, Operands, MnemonicOpsEndInd, 3))
         return true;
     } else {
-      if (validatetSTMRegList(Inst, Operands, 3))
+      if (validatetSTMRegList(Inst, Operands, MnemonicOpsEndInd, 3))
         return true;
     }
     break;
@@ -8011,7 +8069,7 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst,
   case ARM::sysLDMDB_UPD:
   case ARM::sysLDMIB_UPD:
     if (!listContainsReg(Inst, 3, ARM::PC))
-      return Error(Operands[4]->getStartLoc(),
+      return Error(Operands[MnemonicOpsEndInd + 1]->getStartLoc(),
                    "writeback register only allowed on system LDM "
                    "if PC in register-list");
     break;
@@ -8019,26 +8077,8 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst,
   case ARM::sysSTMDA_UPD:
   case ARM::sysSTMDB_UPD:
   case ARM::sysSTMIB_UPD:
-    return Error(Operands[2]->getStartLoc(),
+    return Error(Operands[MnemonicOpsEndInd]->getStartLoc(),
                  "system STM cannot have writeback register");
-  case ARM::tMUL:
-    // The second source operand must be the same register as the destination
-    // operand.
-    //
-    // In this case, we must directly check the parsed operands because the
-    // cvtThumbMultiply() function is written in such a way that it guarantees
-    // this first statement is always true for the new Inst.  Essentially, the
-    // destination is unconditionally copied into the second source operand
-    // without checking to see if it matches what we actually parsed.
-    if (Operands.size() == 6 && (((ARMOperand &)*Operands[3]).getReg() !=
-                                 ((ARMOperand &)*Operands[5]).getReg()) &&
-        (((ARMOperand &)*Operands[3]).getReg() !=
-         ((ARMOperand &)*Operands[4]).getReg())) {
-      return Error(Operands[3]->getStartLoc(),
-                   "destination register must match source register");
-    }
-    break;
-
   // Like for ldm/stm, push and pop have hi-reg handling version in Thumb2,
   // so only issue a diagnostic for thumb1. The instructions will be
   // switched to the t2 encodings in processInstruction() if necessary.
@@ -8046,9 +8086,9 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst,
     bool ListContainsBase;
     if (checkLowRegisterList(Inst, 2, 0, ARM::PC, ListContainsBase) &&
         !isThumbTwo())
-      return Error(Operands[2]->getStartLoc(),
+      return Error(Operands[MnemonicOpsEndInd]->getStartLoc(),
                    "registers must be in range r0-r7 or pc");
-    if (validatetLDMRegList(Inst, Operands, 2, !isMClass()))
+    if (validatetLDMRegList(Inst, Operands, MnemonicOpsEndInd, 2, !isMClass()))
       return true;
     break;
   }
@@ -8056,9 +8096,9 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst,
     bool ListContainsBase;
     if (checkLowRegisterList(Inst, 2, 0, ARM::LR, ListContainsBase) &&
         !isThumbTwo())
-      return Error(Operands[2]->getStartLoc(),
+      return Error(Operands[MnemonicOpsEndInd]->getStartLoc(),
                    "registers must be in range r0-r7 or lr");
-    if (validatetSTMRegList(Inst, Operands, 2))
+    if (validatetSTMRegList(Inst, Operands, MnemonicOpsEndInd, 2))
       return true;
     break;
   }
@@ -8067,17 +8107,17 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst,
     InvalidLowList = checkLowRegisterList(Inst, 4, Inst.getOperand(0).getReg(),
                                           0, ListContainsBase);
     if (InvalidLowList && !isThumbTwo())
-      return Error(Operands[4]->getStartLoc(),
+      return Error(Operands[MnemonicOpsEndInd + 2]->getStartLoc(),
                    "registers must be in range r0-r7");
 
     // This would be converted to a 32-bit stm, but that's not valid if the
     // writeback register is in the list.
     if (InvalidLowList && ListContainsBase)
-      return Error(Operands[4]->getStartLoc(),
+      return Error(Operands[MnemonicOpsEndInd]->getStartLoc(),
                    "writeback operator '!' not allowed when base register "
                    "in register list");
 
-    if (validatetSTMRegList(Inst, Operands, 4))
+    if (validatetSTMRegList(Inst, Operands, MnemonicOpsEndInd, 4))
       return true;
     break;
   }
@@ -8086,7 +8126,7 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst,
     // same, we need thumb2 (for the wide encoding), or we have an error.
     if (!isThumbTwo() &&
         Inst.getOperand(0).getReg() != Inst.getOperand(2).getReg()) {
-      return Error(Operands[4]->getStartLoc(),
+      return Error(Operands[MnemonicOpsEndInd + 2]->getStartLoc(),
                    "source register must be the same as destination");
     }
     break;
@@ -8097,17 +8137,20 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst,
   case ARM::t2SUBrs:
     if (Inst.getOperand(0).getReg() == ARM::SP &&
         Inst.getOperand(1).getReg() != ARM::SP)
-      return Error(Operands[4]->getStartLoc(),
+      return Error(Operands[MnemonicOpsEndInd + 1]->getStartLoc(),
                    "source register must be sp if destination is sp");
     break;
 
   // Final range checking for Thumb unconditional branch instructions.
   case ARM::tB:
-    if (!(static_cast<ARMOperand &>(*Operands[2])).isSignedOffset<11, 1>())
-      return Error(Operands[2]->getStartLoc(), "branch target out of range");
+    if (!(static_cast<ARMOperand &>(*Operands[MnemonicOpsEndInd]))
+             .isSignedOffset<11, 1>())
+      return Error(Operands[MnemonicOpsEndInd]->getStartLoc(),
+                   "branch target out of range");
     break;
   case ARM::t2B: {
-    int op = (Operands[2]->isImm()) ? 2 : 3;
+    int op = (Operands[MnemonicOpsEndInd]->isImm()) ? MnemonicOpsEndInd
+                                                    : MnemonicOpsEndInd + 1;
     ARMOperand &Operand = static_cast<ARMOperand &>(*Operands[op]);
     // Delay the checks of symbolic expressions until they are resolved.
     if (!isa<MCBinaryExpr>(Operand.getImm()) &&
@@ -8117,19 +8160,24 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst,
   }
   // Final range checking for Thumb conditional branch instructions.
   case ARM::tBcc:
-    if (!static_cast<ARMOperand &>(*Operands[2]).isSignedOffset<8, 1>())
-      return Error(Operands[2]->getStartLoc(), "branch target out of range");
+    if (!static_cast<ARMOperand &>(*Operands[MnemonicOpsEndInd])
+             .isSignedOffset<8, 1>())
+      return Error(Operands[MnemonicOpsEndInd]->getStartLoc(),
+                   "branch target out of range");
     break;
   case ARM::t2Bcc: {
-    int Op = (Operands[2]->isImm()) ? 2 : 3;
+    int Op = (Operands[MnemonicOpsEndInd]->isImm()) ? MnemonicOpsEndInd
+                                                    : MnemonicOpsEndInd + 1;
     if (!static_cast<ARMOperand &>(*Operands[Op]).isSignedOffset<20, 1>())
       return Error(Operands[Op]->getStartLoc(), "branch target out of range");
     break;
   }
   case ARM::tCBZ:
   case ARM::tCBNZ: {
-    if (!static_cast<ARMOperand &>(*Operands[2]).isUnsignedOffset<6, 1>())
-      return Error(Operands[2]->getStartLoc(), "branch target out of range");
+    if (!static_cast<ARMOperand &>(*Operands[MnemonicOpsEndInd + 1])
+             .isUnsignedOffset<6, 1>())
+      return Error(Operands[MnemonicOpsEndInd + 1]->getStartLoc(),
+                   "branch target out of range");
     break;
   }
   case ARM::MOVi16:
@@ -8143,7 +8191,8 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst,
     // want the behavior of silently truncating, which can be unexpected and
     // lead to bugs that are difficult to find since this is an easy mistake
     // to make.
-    int i = (Operands[3]->isImm()) ? 3 : 4;
+    int i = (Operands[MnemonicOpsEndInd]->isImm()) ? MnemonicOpsEndInd
+                                                   : MnemonicOpsEndInd + 1;
     ARMOperand &Op = static_cast<ARMOperand &>(*Operands[i]);
     const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Op.getImm());
     if (CE) break;
@@ -8158,7 +8207,7 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst,
     break;
   }
   case ARM::tADDi8: {
-    MCParsedAsmOperand &Op = *Operands[4];
+    MCParsedAsmOperand &Op = *Operands[MnemonicOpsEndInd + 1];
     if (isARMMCExpr(Op) && !isThumbI8Relocation(Op))
       return Error(Op.getStartLoc(),
                    "Immediate expression for Thumb adds requires :lower0_7:,"
@@ -8166,7 +8215,7 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst,
     break;
   }
   case ARM::tMOVi8: {
-    MCParsedAsmOperand &Op = *Operands[2];
+    MCParsedAsmOperand &Op = *Operands[MnemonicOpsEndInd];
     if (isARMMCExpr(Op) && !isThumbI8Relocation(Op))
       return Error(Op.getStartLoc(),
                    "Immediate expression for Thumb movs requires :lower0_7:,"
@@ -8193,30 +8242,36 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst,
   case ARM::t2BFr:
   case ARM::t2BFLi:
   case ARM::t2BFLr: {
-    if (!static_cast<ARMOperand &>(*Operands[2]).isUnsignedOffset<4, 1>() ||
-        (Inst.getOperand(0).isImm() && Inst.getOperand(0).getImm() == 0))
-      return Error(Operands[2]->getStartLoc(),
+    if (!static_cast<ARMOperand &>(*Operands[MnemonicOpsEndInd])
+             .isUnsignedOffset<4, 1>() ||
+        (Inst.getOperand(0).isImm() && Inst.getOperand(0).getImm() == 0)) {
+      return Error(Operands[MnemonicOpsEndInd]->getStartLoc(),
                    "branch location out of range or not a multiple of 2");
+    }
 
     if (Opcode == ARM::t2BFi) {
-      if (!static_cast<ARMOperand &>(*Operands[3]).isSignedOffset<16, 1>())
-        return Error(Operands[3]->getStartLoc(),
+      if (!static_cast<ARMOperand &>(*Operands[MnemonicOpsEndInd + 1])
+               .isSignedOffset<16, 1>())
+        return Error(Operands[MnemonicOpsEndInd]->getStartLoc(),
                      "branch target out of range or not a multiple of 2");
     } else if (Opcode == ARM::t2BFLi) {
-      if (!static_cast<ARMOperand &>(*Operands[3]).isSignedOffset<18, 1>())
-        return Error(Operands[3]->getStartLoc(),
+      if (!static_cast<ARMOperand &>(*Operands[MnemonicOpsEndInd + 1])
+               .isSignedOffset<18, 1>())
+        return Error(Operands[MnemonicOpsEndInd]->getStartLoc(),
                      "branch target out of range or not a multiple of 2");
     }
     break;
   }
   case ARM::t2BFic: {
-    if (!static_cast<ARMOperand &>(*Operands[1]).isUnsignedOffset<4, 1>() ||
+    if (!static_cast<ARMOperand &>(*Operands[MnemonicOpsEndInd])
+             .isUnsignedOffset<4, 1>() ||
         (Inst.getOperand(0).isImm() && Inst.getOperand(0).getImm() == 0))
       return Error(Operands[1]->getStartLoc(),
                    "branch location out of range or not a multiple of 2");
 
-    if (!static_cast<ARMOperand &>(*Operands[2]).isSignedOffset<16, 1>())
-      return Error(Operands[2]->getStartLoc(),
+    if (!static_cast<ARMOperand &>(*Operands[MnemonicOpsEndInd + 1])
+             .isSignedOffset<16, 1>())
+      return Error(Operands[MnemonicOpsEndInd + 1]->getStartLoc(),
                    "branch target out of range or not a multiple of 2");
 
     assert(Inst.getOperand(0).isImm() == Inst.getOperand(2).isImm() &&
@@ -8237,7 +8292,7 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst,
       if (Inst.getOperand(i).isReg() &&
           !ARMMCRegisterClasses[ARM::GPRwithAPSRnospRegClassID].contains(
               Inst.getOperand(i).getReg())) {
-        return Error(Operands[2]->getStartLoc(),
+        return Error(Operands[MnemonicOpsEndInd]->getStartLoc(),
                      "invalid register in register list. Valid registers are "
                      "r0-r12, lr/r14 and APSR.");
       }
@@ -8269,7 +8324,7 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst,
     const unsigned Sm = MRI->getEncodingValue(Inst.getOperand(2).getReg());
     const unsigned Sm1 = MRI->getEncodingValue(Inst.getOperand(3).getReg());
     if (Sm1 != Sm + 1)
-      return Error(Operands[5]->getStartLoc(),
+      return Error(Operands[MnemonicOpsEndInd + 2]->getStartLoc(),
                    "source operands must be sequential");
     break;
   }
@@ -8278,16 +8333,17 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst,
     const unsigned Sm = MRI->getEncodingValue(Inst.getOperand(0).getReg());
     const unsigned Sm1 = MRI->getEncodingValue(Inst.getOperand(1).getReg());
     if (Sm1 != Sm + 1)
-      return Error(Operands[3]->getStartLoc(),
+      return Error(Operands[MnemonicOpsEndInd]->getStartLoc(),
                    "destination operands must be sequential");
     break;
   }
   case ARM::VLDMDIA:
   case ARM::VSTMDIA: {
-    ARMOperand &Op = static_cast<ARMOperand&>(*Operands[3]);
+    ARMOperand &Op =
+        static_cast<ARMOperand &>(*Operands[MnemonicOpsEndInd + 1]);
     auto &RegList = Op.getRegList();
     if (RegList.size() < 1 || RegList.size() > 16)
-      return Error(Operands[3]->getStartLoc(),
+      return Error(Operands[MnemonicOpsEndInd + 1]->getStartLoc(),
                    "list of registers must be at least 1 and at most 16");
     break;
   }
@@ -8298,13 +8354,15 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst,
   case ARM::MVE_VMULLTs32:
   case ARM::MVE_VMULLBu32:
   case ARM::MVE_VMULLTu32: {
-    if (Operands[3]->getReg() == Operands[4]->getReg()) {
-      return Error (Operands[3]->getStartLoc(),
-                    "Qd register and Qn register can't be identical");
+    if (Operands[MnemonicOpsEndInd]->getReg() ==
+        Operands[MnemonicOpsEndInd + 1]->getReg()) {
+      return Error(Operands[MnemonicOpsEndInd]->getStartLoc(),
+                   "Qd register and Qn register can't be identical");
     }
-    if (Operands[3]->getReg() == Operands[5]->getReg()) {
-      return Error (Operands[3]->getStartLoc(),
-                    "Qd register and Qm register can't be identical");
+    if (Operands[MnemonicOpsEndInd]->getReg() ==
+        Operands[MnemonicOpsEndInd + 2]->getReg()) {
+      return Error(Operands[MnemonicOpsEndInd]->getStartLoc(),
+                   "Qd register and Qm register can't be identical");
     }
     break;
   }
@@ -8313,41 +8371,56 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst,
   case ARM::MVE_VREV64_32:
   case ARM::MVE_VQDMULL_qr_s32bh:
   case ARM::MVE_VQDMULL_qr_s32th: {
-    if (Operands[3]->getReg() == Operands[4]->getReg()) {
-      return Error (Operands[3]->getStartLoc(),
-                    "Qd register and Qn register can't be identical");
+    if (Operands[MnemonicOpsEndInd]->getReg() ==
+        Operands[MnemonicOpsEndInd + 1]->getReg()) {
+      return Error(Operands[MnemonicOpsEndInd]->getStartLoc(),
+                   "Qd register and Qn register can't be identical");
     }
     break;
   }
   case ARM::MVE_VCADDi32:
   case ARM::MVE_VCADDf32:
   case ARM::MVE_VHCADDs32: {
-    if (Operands[3]->getReg() == Operands[5]->getReg()) {
-      return Error (Operands[3]->getStartLoc(),
-                    "Qd register and Qm register can't be identical");
+    if (Operands[MnemonicOpsEndInd]->getReg() ==
+        Operands[MnemonicOpsEndInd + 2]->getReg()) {
+      return Error(Operands[MnemonicOpsEndInd]->getStartLoc(),
+                   "Qd register and Qm register can't be identical");
     }
     break;
   }
   case ARM::MVE_VMOV_rr_q: {
-    if (Operands[4]->getReg() != Operands[6]->getReg())
-      return Error (Operands[4]->getStartLoc(), "Q-registers must be the same");
-    if (static_cast<ARMOperand &>(*Operands[5]).getVectorIndex() !=
-        static_cast<ARMOperand &>(*Operands[7]).getVectorIndex() + 2)
-      return Error (Operands[5]->getStartLoc(), "Q-register indexes must be 2 and 0 or 3 and 1");
+    if (Operands[MnemonicOpsEndInd + 2]->getReg() !=
+        Operands[MnemonicOpsEndInd + 4]->getReg())
+      return Error(Operands[MnemonicOpsEndInd + 2]->getStartLoc(),
+                   "Q-registers must be the same");
+    if (static_cast<ARMOperand &>(*Operands[MnemonicOpsEndInd + 3])
+            .getVectorIndex() !=
+        static_cast<ARMOperand &>(*Operands[MnemonicOpsEndInd + 5])
+                .getVectorIndex() +
+            2)
+      return Error(Operands[MnemonicOpsEndInd + 3]->getStartLoc(),
+                   "Q-register indexes must be 2 and 0 or 3 and 1");
     break;
   }
   case ARM::MVE_VMOV_q_rr: {
-    if (Operands[2]->getReg() != Operands[4]->getReg())
-      return Error (Operands[2]->getStartLoc(), "Q-registers must be the same");
-    if (static_cast<ARMOperand &>(*Operands[3]).getVectorIndex() !=
-        static_cast<ARMOperand &>(*Operands[5]).getVectorIndex() + 2)
-      return Error (Operands[3]->getStartLoc(), "Q-register indexes must be 2 and 0 or 3 and 1");
+    if (Operands[MnemonicOpsEndInd]->getReg() !=
+        Operands[MnemonicOpsEndInd + 2]->getReg())
+      return Error(Operands[MnemonicOpsEndInd]->getStartLoc(),
+                   "Q-registers must be the same");
+    if (static_cast<ARMOperand &>(*Operands[MnemonicOpsEndInd + 1])
+            .getVectorIndex() !=
+        static_cast<ARMOperand &>(*Operands[MnemonicOpsEndInd + 3])
+                .getVectorIndex() +
+            2)
+      return Error(Operands[MnemonicOpsEndInd + 1]->getStartLoc(),
+                   "Q-register indexes must be 2 and 0 or 3 and 1");
     break;
   }
   case ARM::MVE_SQRSHR:
   case ARM::MVE_UQRSHL: {
-    if (Operands[2]->getReg() == Operands[3]->getReg()) {
-      return Error(Operands[2]->getStartLoc(),
+    if (Operands[MnemonicOpsEndInd]->getReg() ==
+        Operands[MnemonicOpsEndInd + 1]->getReg()) {
+      return Error(Operands[MnemonicOpsEndInd]->getStartLoc(),
                    "Rda register and Rm register can't be identical");
     }
     break;
@@ -8751,6 +8824,7 @@ static unsigned getRealVLDOpcode(unsigned Opc, unsigned &Spacing) {
 
 bool ARMAsmParser::processInstruction(MCInst &Inst,
                                       const OperandVector &Operands,
+                                      unsigned MnemonicOpsEndInd,
                                       MCStreamer &Out) {
   // Check if we have the wide qualifier, because if it's present we
   // must avoid selecting a 16-bit thumb instruction.
@@ -8768,9 +8842,10 @@ bool ARMAsmParser::processInstruction(MCInst &Inst,
   case ARM::VLSTM: {
     // In some cases both T1 and T2 are valid, causing tablegen pick T1 instead
     // of T2
-    if (Operands.size() == 4) { // a register list has been provided
+    if (Operands.size() ==
+        MnemonicOpsEndInd + 2) { // a register list has been provided
       ARMOperand &Op = static_cast<ARMOperand &>(
-          *Operands[3]); // the register list, a dpr_reglist
+          *Operands[MnemonicOpsEndInd + 1]); // the register list, a dpr_reglist
       assert(Op.isDPRRegList());
       auto &RegList = Op.getRegList();
       // When the register list is {d0-d31} the instruction has to be the T2
@@ -9097,9 +9172,7 @@ bool ARMAsmParser::processInstruction(MCInst &Inst,
     else if (Inst.getOpcode() == ARM::t2LDRConstPool)
       TmpInst.setOpcode(ARM::t2LDRpci);
     const ARMOperand &PoolOperand =
-      (HasWideQualifier ?
-       static_cast<ARMOperand &>(*Operands[4]) :
-       static_cast<ARMOperand &>(*Operands[3]));
+        static_cast<ARMOperand &>(*Operands[MnemonicOpsEndInd + 1]);
     const MCExpr *SubExprVal = PoolOperand.getConstantPoolImm();
     // If SubExprVal is a constant we may be able to use a MOV
     if (isa<MCConstantExpr>(SubExprVal) &&
@@ -10526,7 +10599,8 @@ bool ARMAsmParser::processInstruction(MCInst &Inst,
     // to encoding T2 if <Rd> is specified and encoding T2 is preferred
     // to encoding T1 if <Rd> is omitted."
     if (Inst.getOperand(3).isImm() &&
-        (unsigned)Inst.getOperand(3).getImm() < 8 && Operands.size() == 6) {
+        (unsigned)Inst.getOperand(3).getImm() < 8 &&
+        Operands.size() == MnemonicOpsEndInd + 3) {
       Inst.setOpcode(ARM::tADDi3);
       return true;
     }
@@ -10536,7 +10610,8 @@ bool ARMAsmParser::processInstruction(MCInst &Inst,
     // explicitly specified. From the ARM ARM: "Encoding T1 is preferred
     // to encoding T2 if <Rd> is specified and encoding T2 is preferred
     // to encoding T1 if <Rd> is omitted."
-    if ((unsigned)Inst.getOperand(3).getImm() < 8 && Operands.size() == 6) {
+    if ((unsigned)Inst.getOperand(3).getImm() < 8 &&
+        Operands.size() == MnemonicOpsEndInd + 3) {
       Inst.setOpcode(ARM::tSUBi3);
       return true;
     }
@@ -10656,8 +10731,10 @@ bool ARMAsmParser::processInstruction(MCInst &Inst,
     // an error in validateInstruction().
     unsigned Rn = Inst.getOperand(0).getReg();
     bool hasWritebackToken =
-        (static_cast<ARMOperand &>(*Operands[3]).isToken() &&
-         static_cast<ARMOperand &>(*Operands[3]).getToken() == "!");
+        (static_cast<ARMOperand &>(*Operands[MnemonicOpsEndInd + 1])
+             .isToken() &&
+         static_cast<ARMOperand &>(*Operands[MnemonicOpsEndInd + 1])
+                 .getToken() == "!");
     bool listContainsBase;
     if (checkLowRegisterList(Inst, 3, Rn, 0, listContainsBase) ||
         (!listContainsBase && !hasWritebackToken) ||
@@ -10956,6 +11033,26 @@ bool ARMAsmParser::processInstruction(MCInst &Inst,
   return false;
 }
 
+unsigned
+ARMAsmParser::checkEarlyTargetMatchPredicate(MCInst &Inst,
+                                             const OperandVector &Operands) {
+  unsigned Opc = Inst.getOpcode();
+  switch (Opc) {
+  // Prevent the mov r8 r8 encoding for nop being selected when the v6/thumb 2
+  // encoding is available.
+  case ARM::tMOVr: {
+    if (Operands[0]->isToken() &&
+        static_cast<ARMOperand &>(*Operands[0]).getToken() == "nop" &&
+        ((isThumb() && !isThumbOne()) || hasV6MOps())) {
+      return Match_MnemonicFail;
+    }
+  }
+    LLVM_FALLTHROUGH;
+  default:
+    return Match_Success;
+  }
+}
+
 unsigned ARMAsmParser::checkTargetMatchPredicate(MCInst &Inst) {
   // 16-bit thumb arithmetic instructions either require or preclude the 'S'
   // suffix depending on whether they're in an IT block or not.
@@ -10966,22 +11063,23 @@ unsigned ARMAsmParser::checkTargetMatchPredicate(MCInst &Inst) {
            "optionally flag setting instruction missing optional def operand");
     assert(MCID.NumOperands == Inst.getNumOperands() &&
            "operand count mismatch!");
-    // Find the optional-def operand (cc_out).
-    unsigned OpNo;
-    for (OpNo = 0;
-         OpNo < MCID.NumOperands && !MCID.operands()[OpNo].isOptionalDef();
-         ++OpNo)
-      ;
+    bool IsCPSR = false;
+    // Check if the instruction has CPSR set.
+    for (unsigned OpNo = 0; OpNo < MCID.NumOperands; ++OpNo) {
+      if (MCID.operands()[OpNo].isOptionalDef() &&
+          Inst.getOperand(OpNo).isReg() &&
+          Inst.getOperand(OpNo).getReg() == ARM::CPSR)
+        IsCPSR = true;
+    }
+
     // If we're parsing Thumb1, reject it completely.
-    if (isThumbOne() && Inst.getOperand(OpNo).getReg() != ARM::CPSR)
+    if (isThumbOne() && !IsCPSR)
       return Match_RequiresFlagSetting;
     // If we're parsing Thumb2, which form is legal depends on whether we're
     // in an IT block.
-    if (isThumbTwo() && Inst.getOperand(OpNo).getReg() != ARM::CPSR &&
-        !inITBlock())
+    if (isThumbTwo() && !IsCPSR && !inITBlock())
       return Match_RequiresITBlock;
-    if (isThumbTwo() && Inst.getOperand(OpNo).getReg() == ARM::CPSR &&
-        inITBlock())
+    if (isThumbTwo() && IsCPSR && inITBlock())
       return Match_RequiresNotITBlock;
     // LSL with zero immediate is not allowed in an IT block
     if (Opc == ARM::tLSLri && Inst.getOperand(3).getImm() == 0 && inITBlock())
@@ -11042,6 +11140,14 @@ unsigned ARMAsmParser::checkTargetMatchPredicate(MCInst &Inst) {
     if (!hasV8Ops() && (Inst.getOperand(0).getReg() == ARM::SP))
       return Match_RequiresV8;
     break;
+  case ARM::tMUL:
+    // The second source operand must be the same register as the destination
+    // operand.
+    // FIXME: Ideally this would be handled by ARMGenAsmMatcher and
+    // emitAsmTiedOperandConstraints.
+    if (Inst.getOperand(0).getReg() != Inst.getOperand(3).getReg())
+      return Match_InvalidTiedOperand;
+    break;
   default:
     break;
   }
@@ -11200,6 +11306,9 @@ bool ARMAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
   MatchResult = MatchInstruction(Operands, Inst, NearMisses, MatchingInlineAsm,
                                  PendConditionalInstruction, Out);
 
+  // Find the number of operators that are part of the Mnumonic (LHS).
+  unsigned MnemonicOpsEndInd = getMnemonicOpsEndInd(Operands);
+
   switch (MatchResult) {
   case Match_Success:
     LLVM_DEBUG(dbgs() << "Parsed as: ";
@@ -11208,7 +11317,7 @@ bool ARMAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
 
     // Context sensitive operand constraints aren't handled by the matcher,
     // so check them here.
-    if (validateInstruction(Inst, Operands)) {
+    if (validateInstruction(Inst, Operands, MnemonicOpsEndInd)) {
       // Still progress the IT block, otherwise one wrong condition causes
       // nasty cascading errors.
       forwardITPosition();
@@ -11221,7 +11330,7 @@ bool ARMAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
       // encoding is selected. Loop on it while changes happen so the
       // individual transformations can chain off each other. E.g.,
       // tPOP(r8)->t2LDMIA_UPD(sp,r8)->t2STR_POST(sp,r8)
-      while (processInstruction(Inst, Operands, Out))
+      while (processInstruction(Inst, Operands, MnemonicOpsEndInd, Out))
         LLVM_DEBUG(dbgs() << "Changed to: ";
                    Inst.dump_pretty(dbgs(), MII.getName(Inst.getOpcode()));
                    dbgs() << "\n");
@@ -12562,6 +12671,8 @@ ARMAsmParser::FilterNearMisses(SmallVectorImpl<NearMissInfo> &NearMissesIn,
   SmallSet<FeatureBitset, 4> FeatureMissesSeen;
   bool ReportedTooFewOperands = false;
 
+  unsigned MnemonicOpsEndInd = getMnemonicOpsEndInd(Operands);
+
   // Process the near-misses in reverse order, so that we see more general ones
   // first, and so can avoid emitting more specific ones.
   for (NearMissInfo &I : reverse(NearMissesIn)) {
@@ -12670,6 +12781,16 @@ ARMAsmParser::FilterNearMisses(SmallVectorImpl<NearMissInfo> &NearMissesIn,
       case Match_RequiresFlagSetting:
         Message.Message = "no flag-preserving variant of this instruction available";
         break;
+      case Match_InvalidTiedOperand: {
+        ARMOperand &Op = static_cast<ARMOperand &>(*Operands[0]);
+        if (Op.isToken() && Op.getToken() == "mul") {
+          Message.Message = "destination register must match a source register";
+          Message.Loc = Operands[MnemonicOpsEndInd]->getStartLoc();
+        } else {
+          llvm_unreachable("Match_InvalidTiedOperand only used for tMUL.");
+        }
+        break;
+      }
       case Match_InvalidOperand:
         Message.Message = "invalid operand for instruction";
         break;
@@ -12869,11 +12990,29 @@ unsigned ARMAsmParser::validateTargetOperandClass(MCParsedAsmOperand &AsmOp,
     if (hasV8Ops() && Op.isReg() && Op.getReg() == ARM::SP)
       return Match_Success;
     return Match_rGPR;
-  case MCK_GPRPair:
-    if (Op.isReg() &&
-        MRI->getRegClass(ARM::GPRRegClassID).contains(Op.getReg()))
+  // Note: This mutates the operand which could cause issues for future
+  // matches if this one fails later.
+  // It would be better to do this in addVecList but as this doesn't have access
+  // to MRI this isn't possible.
+  // If trying to match a VecListDPair with a Q register, convert Q to list.
+  case MCK_VecListDPair:
+    if (Op.isQReg() && !hasMVE()) {
+      auto DPair = getDRegFromQReg(Op.getReg());
+      DPair = MRI->getMatchingSuperReg(
+          DPair, ARM::dsub_0, &ARMMCRegisterClasses[ARM::DPairRegClassID]);
+      Op.setVecListDPair(DPair);
       return Match_Success;
-    break;
+    }
+    return Match_InvalidOperand;
+  // Note: This mutates the operand (see above).
+  // If trying to match a VecListDPair with a D register, convert D singleton
+  // list.
+  case MCK_VecListOneD:
+    if (Op.isDReg() && !hasMVE()) {
+      Op.setVecListOneD(Op.getReg());
+      return Match_Success;
+    }
+    return Match_InvalidOperand;
   }
   return Match_InvalidOperand;
 }
@@ -12921,3 +13060,15 @@ bool ARMAsmParser::isMnemonicVPTPredicable(StringRef Mnemonic,
       std::begin(predicable_prefixes), std::end(predicable_prefixes),
       [&Mnemonic](const char *prefix) { return Mnemonic.starts_with(prefix); });
 }
+
+std::unique_ptr<ARMOperand> ARMAsmParser::defaultCondCodeOp() {
+  return ARMOperand::CreateCondCode(ARMCC::AL, SMLoc());
+}
+
+std::unique_ptr<ARMOperand> ARMAsmParser::defaultCCOutOp() {
+  return ARMOperand::CreateCCOut(0, SMLoc());
+}
+
+std::unique_ptr<ARMOperand> ARMAsmParser::defaultVPTPredOp() {
+  return ARMOperand::CreateVPTPred(ARMVCC::None, SMLoc());
+}
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h b/llvm/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h
index 163360c08ffba0..28e5840fdde5b4 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h
@@ -41,7 +41,7 @@ namespace ARM_AM {
 
   inline const char *getAddrOpcStr(AddrOpc Op) { return Op == sub ? "-" : ""; }
 
-  inline const char *getShiftOpcStr(ShiftOpc Op) {
+  inline const StringRef getShiftOpcStr(ShiftOpc Op) {
     switch (Op) {
     default: llvm_unreachable("Unknown shift opc!");
     case ARM_AM::asr: return "asr";
diff --git a/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp b/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp
index 404c280ab97991..924a4580aa5fdf 100644
--- a/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp
+++ b/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp
@@ -780,8 +780,9 @@ Instruction *MVEGatherScatterLowering::tryCreateIncrementingGatScat(
   // Make sure the offsets are scaled correctly
   Instruction *ScaledOffsets = BinaryOperator::Create(
       Instruction::Shl, OffsetsIncoming,
-      Builder.CreateVectorSplat(Ty->getNumElements(), Builder.getInt32(TypeScale)),
-      "ScaledIndex", I);
+      Builder.CreateVectorSplat(Ty->getNumElements(),
+                                Builder.getInt32(TypeScale)),
+      "ScaledIndex", I->getIterator());
   // Add the base to the offsets
   OffsetsIncoming = BinaryOperator::Create(
       Instruction::Add, ScaledOffsets,
@@ -790,7 +791,7 @@ Instruction *MVEGatherScatterLowering::tryCreateIncrementingGatScat(
           Builder.CreatePtrToInt(
               BasePtr,
               cast<VectorType>(ScaledOffsets->getType())->getElementType())),
-      "StartIndex", I);
+      "StartIndex", I->getIterator());
 
   if (I->getIntrinsicID() == Intrinsic::masked_gather)
     return tryCreateMaskedGatherBase(I, OffsetsIncoming, Builder, Immediate);
@@ -838,7 +839,8 @@ Instruction *MVEGatherScatterLowering::tryCreateIncrementingWBGatScat(
   Instruction *ScaledOffsets = BinaryOperator::Create(
       Instruction::Shl, Phi->getIncomingValue(1 - IncrementIndex),
       Builder.CreateVectorSplat(NumElems, Builder.getInt32(TypeScale)),
-      "ScaledIndex", &Phi->getIncomingBlock(1 - IncrementIndex)->back());
+      "ScaledIndex",
+      Phi->getIncomingBlock(1 - IncrementIndex)->back().getIterator());
   // Add the base to the offsets
   OffsetsIncoming = BinaryOperator::Create(
       Instruction::Add, ScaledOffsets,
@@ -847,13 +849,14 @@ Instruction *MVEGatherScatterLowering::tryCreateIncrementingWBGatScat(
           Builder.CreatePtrToInt(
               BasePtr,
               cast<VectorType>(ScaledOffsets->getType())->getElementType())),
-      "StartIndex", &Phi->getIncomingBlock(1 - IncrementIndex)->back());
+      "StartIndex",
+      Phi->getIncomingBlock(1 - IncrementIndex)->back().getIterator());
   // The gather is pre-incrementing
   OffsetsIncoming = BinaryOperator::Create(
       Instruction::Sub, OffsetsIncoming,
       Builder.CreateVectorSplat(NumElems, Builder.getInt32(Immediate)),
       "PreIncrementStartIndex",
-      &Phi->getIncomingBlock(1 - IncrementIndex)->back());
+      Phi->getIncomingBlock(1 - IncrementIndex)->back().getIterator());
   Phi->setIncomingValue(1 - IncrementIndex, OffsetsIncoming);
 
   Builder.SetInsertPoint(I);
@@ -886,8 +889,8 @@ void MVEGatherScatterLowering::pushOutAdd(PHINode *&Phi,
                                           Value *OffsSecondOperand,
                                           unsigned StartIndex) {
   LLVM_DEBUG(dbgs() << "masked gathers/scatters: optimising add instruction\n");
-  Instruction *InsertionPoint =
-        &cast<Instruction>(Phi->getIncomingBlock(StartIndex)->back());
+  BasicBlock::iterator InsertionPoint =
+      Phi->getIncomingBlock(StartIndex)->back().getIterator();
   // Initialize the phi with a vector that contains a sum of the constants
   Instruction *NewIndex = BinaryOperator::Create(
       Instruction::Add, Phi->getIncomingValue(StartIndex), OffsSecondOperand,
@@ -911,8 +914,8 @@ void MVEGatherScatterLowering::pushOutMulShl(unsigned Opcode, PHINode *&Phi,
 
   // Create a new scalar add outside of the loop and transform it to a splat
   // by which loop variable can be incremented
-  Instruction *InsertionPoint = &cast<Instruction>(
-        Phi->getIncomingBlock(LoopIncrement == 1 ? 0 : 1)->back());
+  BasicBlock::iterator InsertionPoint =
+      Phi->getIncomingBlock(LoopIncrement == 1 ? 0 : 1)->back().getIterator();
 
   // Create a new index
   Value *StartIndex =
@@ -923,11 +926,14 @@ void MVEGatherScatterLowering::pushOutMulShl(unsigned Opcode, PHINode *&Phi,
   Instruction *Product =
       BinaryOperator::Create((Instruction::BinaryOps)Opcode, IncrementPerRound,
                              OffsSecondOperand, "Product", InsertionPoint);
+
+  BasicBlock::iterator NewIncrInsertPt =
+      Phi->getIncomingBlock(LoopIncrement)->back().getIterator();
+  NewIncrInsertPt = std::prev(NewIncrInsertPt);
+
   // Increment NewIndex by Product instead of the multiplication
   Instruction *NewIncrement = BinaryOperator::Create(
-      Instruction::Add, Phi, Product, "IncrementPushedOutMul",
-      cast<Instruction>(Phi->getIncomingBlock(LoopIncrement)->back())
-          .getPrevNode());
+      Instruction::Add, Phi, Product, "IncrementPushedOutMul", NewIncrInsertPt);
 
   Phi->addIncoming(StartIndex,
                    Phi->getIncomingBlock(LoopIncrement == 1 ? 0 : 1));
@@ -1054,7 +1060,7 @@ bool MVEGatherScatterLowering::optimiseOffsets(Value *Offsets, BasicBlock *BB,
       // our phi, we need to copy it
       IncInstruction = BinaryOperator::Create(
           Instruction::BinaryOps(IncInstruction->getOpcode()), Phi,
-          IncrementPerRound, "LoopIncrement", IncInstruction);
+          IncrementPerRound, "LoopIncrement", IncInstruction->getIterator());
       Phi->setIncomingValue(IncrementingBlock, IncInstruction);
     }
     NewPhi = Phi;
@@ -1066,7 +1072,7 @@ bool MVEGatherScatterLowering::optimiseOffsets(Value *Offsets, BasicBlock *BB,
                         Phi->getIncomingBlock(IncrementingBlock == 1 ? 0 : 1));
     IncInstruction = BinaryOperator::Create(
         Instruction::BinaryOps(IncInstruction->getOpcode()), NewPhi,
-        IncrementPerRound, "LoopIncrement", IncInstruction);
+        IncrementPerRound, "LoopIncrement", IncInstruction->getIterator());
     NewPhi->addIncoming(IncInstruction,
                         Phi->getIncomingBlock(IncrementingBlock));
     IncrementingBlock = 1;
@@ -1229,7 +1235,7 @@ bool MVEGatherScatterLowering::optimiseAddress(Value *Address, BasicBlock *BB,
         BaseTy = FixedVectorType::get(BaseTy, VecTy);
       GetElementPtrInst *NewAddress = GetElementPtrInst::Create(
           Builder.getInt8Ty(), Builder.CreateBitCast(Base, BaseTy), Offsets,
-          "gep.merged", GEP);
+          "gep.merged", GEP->getIterator());
       LLVM_DEBUG(dbgs() << "Folded GEP: " << *GEP
                         << "\n      new :  " << *NewAddress << "\n");
       GEP->replaceAllUsesWith(
diff --git a/llvm/lib/Target/AVR/AVRInstrInfo.td b/llvm/lib/Target/AVR/AVRInstrInfo.td
index 68e14f7dc92031..fe0d3b6c8189b7 100644
--- a/llvm/lib/Target/AVR/AVRInstrInfo.td
+++ b/llvm/lib/Target/AVR/AVRInstrInfo.td
@@ -1387,7 +1387,7 @@ let mayLoad = 1, hasSideEffects = 0,
 
 // Load indirect with displacement operations.
 let canFoldAsLoad = 1, isReMaterializable = 1 in {
-  let Constraints = "@earlyclobber $reg" in def LDDRdPtrQ
+  def LDDRdPtrQ
       : FSTDLDD<0,
                 (outs GPR8
                  : $reg),
diff --git a/llvm/lib/Target/BPF/AsmParser/BPFAsmParser.cpp b/llvm/lib/Target/BPF/AsmParser/BPFAsmParser.cpp
index 3145bc3d19f5dc..1688355f427cc7 100644
--- a/llvm/lib/Target/BPF/AsmParser/BPFAsmParser.cpp
+++ b/llvm/lib/Target/BPF/AsmParser/BPFAsmParser.cpp
@@ -232,6 +232,7 @@ struct BPFOperand : public MCParsedAsmOperand {
         .Case("callx", true)
         .Case("goto", true)
         .Case("gotol", true)
+        .Case("may_goto", true)
         .Case("*", true)
         .Case("exit", true)
         .Case("lock", true)
diff --git a/llvm/lib/Target/BPF/BPFAbstractMemberAccess.cpp b/llvm/lib/Target/BPF/BPFAbstractMemberAccess.cpp
index f2d1206d02316a..4be6220b358ba3 100644
--- a/llvm/lib/Target/BPF/BPFAbstractMemberAccess.cpp
+++ b/llvm/lib/Target/BPF/BPFAbstractMemberAccess.cpp
@@ -426,8 +426,9 @@ static void replaceWithGEP(CallInst *Call, uint32_t DimensionIndex,
     IdxList.push_back(Zero);
   IdxList.push_back(Call->getArgOperand(GEPIndex));
 
-  auto *GEP = GetElementPtrInst::CreateInBounds(
-      getBaseElementType(Call), Call->getArgOperand(0), IdxList, "", Call);
+  auto *GEP = GetElementPtrInst::CreateInBounds(getBaseElementType(Call),
+                                                Call->getArgOperand(0), IdxList,
+                                                "", Call->getIterator());
   Call->replaceAllUsesWith(GEP);
   Call->eraseFromParent();
 }
@@ -1091,9 +1092,11 @@ bool BPFAbstractMemberAccess::transformGEPChain(CallInst *Call,
     // Load the global variable which represents the returned field info.
     LoadInst *LDInst;
     if (IsInt32Ret)
-      LDInst = new LoadInst(Type::getInt32Ty(BB->getContext()), GV, "", Call);
+      LDInst = new LoadInst(Type::getInt32Ty(BB->getContext()), GV, "",
+                            Call->getIterator());
     else
-      LDInst = new LoadInst(Type::getInt64Ty(BB->getContext()), GV, "", Call);
+      LDInst = new LoadInst(Type::getInt64Ty(BB->getContext()), GV, "",
+                            Call->getIterator());
 
     Instruction *PassThroughInst =
         BPFCoreSharedInfo::insertPassThrough(M, BB, LDInst, Call);
@@ -1113,7 +1116,8 @@ bool BPFAbstractMemberAccess::transformGEPChain(CallInst *Call,
   // The original Call inst is removed.
 
   // Load the global variable.
-  auto *LDInst = new LoadInst(Type::getInt64Ty(BB->getContext()), GV, "", Call);
+  auto *LDInst = new LoadInst(Type::getInt64Ty(BB->getContext()), GV, "",
+                              Call->getIterator());
 
   // Generate a BitCast
   auto *BCInst =
diff --git a/llvm/lib/Target/BPF/BPFCheckAndAdjustIR.cpp b/llvm/lib/Target/BPF/BPFCheckAndAdjustIR.cpp
index edd59aaa6d01d2..2b78ed7134c92f 100644
--- a/llvm/lib/Target/BPF/BPFCheckAndAdjustIR.cpp
+++ b/llvm/lib/Target/BPF/BPFCheckAndAdjustIR.cpp
@@ -15,7 +15,7 @@
 //   - remove llvm.bpf.getelementptr.and.load builtins.
 //   - remove llvm.bpf.getelementptr.and.store builtins.
 //   - for loads and stores with base addresses from non-zero address space
-//     cast base address to zero address space (support for BPF arenas).
+//     cast base address to zero address space (support for BPF address spaces).
 //
 //===----------------------------------------------------------------------===//
 
@@ -482,7 +482,7 @@ static void aspaceWrapOperand(DenseMap<Value *, Value *> &Cache, Instruction *I,
   }
 }
 
-// Support for BPF arenas:
+// Support for BPF address spaces:
 // - for each function in the module M, update pointer operand of
 //   each memory access instruction (load/store/cmpxchg/atomicrmw)
 //   by casting it from non-zero address space to zero address space, e.g:
@@ -490,7 +490,7 @@ static void aspaceWrapOperand(DenseMap<Value *, Value *> &Cache, Instruction *I,
 //   (load (ptr addrspace (N) %p) ...)
 //     -> (load (addrspacecast ptr addrspace (N) %p to ptr))
 //
-// - assign section with name .arena.N for globals defined in
+// - assign section with name .addr_space.N for globals defined in
 //   non-zero address space N
 bool BPFCheckAndAdjustIR::insertASpaceCasts(Module &M) {
   bool Changed = false;
@@ -517,13 +517,13 @@ bool BPFCheckAndAdjustIR::insertASpaceCasts(Module &M) {
     Changed |= !CastsCache.empty();
   }
   // Merge all globals within same address space into single
-  // .arena.<addr space no> section
+  // .addr_space.<addr space no> section
   for (GlobalVariable &G : M.globals()) {
     if (G.getAddressSpace() == 0 || G.hasSection())
       continue;
     SmallString<16> SecName;
     raw_svector_ostream OS(SecName);
-    OS << ".arena." << G.getAddressSpace();
+    OS << ".addr_space." << G.getAddressSpace();
     G.setSection(SecName);
     // Prevent having separate section for constants
     G.setConstant(false);
diff --git a/llvm/lib/Target/BPF/BPFISelDAGToDAG.cpp b/llvm/lib/Target/BPF/BPFISelDAGToDAG.cpp
index d8139958e9fcf2..7b8bcb2c5866da 100644
--- a/llvm/lib/Target/BPF/BPFISelDAGToDAG.cpp
+++ b/llvm/lib/Target/BPF/BPFISelDAGToDAG.cpp
@@ -242,7 +242,9 @@ void BPFDAGToDAGISel::PreprocessLoad(SDNode *Node,
   bool to_replace = false;
   SDLoc DL(Node);
   const LoadSDNode *LD = cast<LoadSDNode>(Node);
-  uint64_t size = LD->getMemOperand()->getSize();
+  if (!LD->getMemOperand()->getSize().hasValue())
+    return;
+  uint64_t size = LD->getMemOperand()->getSize().getValue();
 
   if (!size || size > 8 || (size & (size - 1)) || !LD->isSimple())
     return;
diff --git a/llvm/lib/Target/BPF/BPFInstrFormats.td b/llvm/lib/Target/BPF/BPFInstrFormats.td
index 6ed83d877ac0f2..feffdbc69465ea 100644
--- a/llvm/lib/Target/BPF/BPFInstrFormats.td
+++ b/llvm/lib/Target/BPF/BPFInstrFormats.td
@@ -73,6 +73,7 @@ def BPF_JLT  : BPFJumpOp<0xa>;
 def BPF_JLE  : BPFJumpOp<0xb>;
 def BPF_JSLT : BPFJumpOp<0xc>;
 def BPF_JSLE : BPFJumpOp<0xd>;
+def BPF_JCOND : BPFJumpOp<0xe>;
 
 class BPFWidthModifer<bits<2> val> {
   bits<2> Value = val;
diff --git a/llvm/lib/Target/BPF/BPFInstrInfo.td b/llvm/lib/Target/BPF/BPFInstrInfo.td
index 7198e9499bc32a..66c57952a7f10e 100644
--- a/llvm/lib/Target/BPF/BPFInstrInfo.td
+++ b/llvm/lib/Target/BPF/BPFInstrInfo.td
@@ -215,6 +215,18 @@ class JMP_RI<BPFJumpOp Opc, string OpcodeStr, PatLeaf Cond>
   let BPFClass = BPF_JMP;
 }
 
+class JMP_JCOND<BPFJumpOp Opc, string OpcodeStr, list<dag> Pattern>
+    : TYPE_ALU_JMP<Opc.Value, BPF_K.Value,
+                   (outs),
+                   (ins brtarget:$BrDst),
+                   !strconcat(OpcodeStr, " $BrDst"),
+                   Pattern> {
+  bits<16> BrDst;
+
+  let Inst{47-32} = BrDst;
+  let BPFClass = BPF_JMP;
+}
+
 class JMP_RR_32<BPFJumpOp Opc, string OpcodeStr, PatLeaf Cond>
     : TYPE_ALU_JMP<Opc.Value, BPF_X.Value,
                    (outs),
@@ -267,6 +279,7 @@ defm JULE : J<BPF_JLE, "<=", BPF_CC_LEU, BPF_CC_LEU_32>;
 defm JSLT : J<BPF_JSLT, "s<", BPF_CC_LT, BPF_CC_LT_32>;
 defm JSLE : J<BPF_JSLE, "s<=", BPF_CC_LE, BPF_CC_LE_32>;
 defm JSET : J<BPF_JSET, "&", NoCond, NoCond>;
+def JCOND : JMP_JCOND<BPF_JCOND, "may_goto", []>;
 }
 
 // ALU instructions
diff --git a/llvm/lib/Target/BPF/BPFPreserveDIType.cpp b/llvm/lib/Target/BPF/BPFPreserveDIType.cpp
index dae1aeea352189..afc154988050c8 100644
--- a/llvm/lib/Target/BPF/BPFPreserveDIType.cpp
+++ b/llvm/lib/Target/BPF/BPFPreserveDIType.cpp
@@ -116,8 +116,8 @@ static bool BPFPreserveDITypeImpl(Function &F) {
     GV->setMetadata(LLVMContext::MD_preserve_access_index, MD);
 
     // Load the global variable which represents the type info.
-    auto *LDInst =
-        new LoadInst(Type::getInt64Ty(BB->getContext()), GV, "", Call);
+    auto *LDInst = new LoadInst(Type::getInt64Ty(BB->getContext()), GV, "",
+                                Call->getIterator());
     Instruction *PassThroughInst =
         BPFCoreSharedInfo::insertPassThrough(M, BB, LDInst, Call);
     Call->replaceAllUsesWith(PassThroughInst);
diff --git a/llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp b/llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp
index 7dad40803d4770..44932383fb43e9 100644
--- a/llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp
+++ b/llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp
@@ -81,7 +81,10 @@ class BPFMCInstrAnalysis : public MCInstrAnalysis {
     // The target is the 3rd operand of cond inst and the 1st of uncond inst.
     int32_t Imm;
     if (isConditionalBranch(Inst)) {
-      Imm = (short)Inst.getOperand(2).getImm();
+      if (Inst.getOpcode() == BPF::JCOND)
+        Imm = (short)Inst.getOperand(0).getImm();
+      else
+        Imm = (short)Inst.getOperand(2).getImm();
     } else if (isUnconditionalBranch(Inst)) {
       if (Inst.getOpcode() == BPF::JMP)
         Imm = (short)Inst.getOperand(0).getImm();
diff --git a/llvm/lib/Target/DirectX/CMakeLists.txt b/llvm/lib/Target/DirectX/CMakeLists.txt
index bf93280779bf8b..4c70b3f9230edb 100644
--- a/llvm/lib/Target/DirectX/CMakeLists.txt
+++ b/llvm/lib/Target/DirectX/CMakeLists.txt
@@ -19,6 +19,7 @@ add_llvm_target(DirectXCodeGen
   DirectXSubtarget.cpp
   DirectXTargetMachine.cpp
   DXContainerGlobals.cpp
+  DXILIntrinsicExpansion.cpp
   DXILMetadata.cpp
   DXILOpBuilder.cpp
   DXILOpLowering.cpp
diff --git a/llvm/lib/Target/DirectX/DXContainerGlobals.cpp b/llvm/lib/Target/DirectX/DXContainerGlobals.cpp
index 56063b487f6847..65cf1dfdb4031b 100644
--- a/llvm/lib/Target/DirectX/DXContainerGlobals.cpp
+++ b/llvm/lib/Target/DirectX/DXContainerGlobals.cpp
@@ -28,7 +28,7 @@ using namespace llvm::dxil;
 namespace {
 class DXContainerGlobals : public llvm::ModulePass {
 
-  GlobalVariable *getShaderFlags(Module &M);
+  GlobalVariable *getFeatureFlags(Module &M);
   GlobalVariable *computeShaderHash(Module &M);
 
 public:
@@ -53,21 +53,24 @@ class DXContainerGlobals : public llvm::ModulePass {
 
 bool DXContainerGlobals::runOnModule(Module &M) {
   llvm::SmallVector<GlobalValue *> Globals;
-  Globals.push_back(getShaderFlags(M));
+  Globals.push_back(getFeatureFlags(M));
   Globals.push_back(computeShaderHash(M));
 
   appendToCompilerUsed(M, Globals);
   return true;
 }
 
-GlobalVariable *DXContainerGlobals::getShaderFlags(Module &M) {
-  const uint64_t Flags =
-      (uint64_t)(getAnalysis<ShaderFlagsAnalysisWrapper>().getShaderFlags());
+GlobalVariable *DXContainerGlobals::getFeatureFlags(Module &M) {
+  const uint64_t FeatureFlags =
+      static_cast<uint64_t>(getAnalysis<ShaderFlagsAnalysisWrapper>()
+                                .getShaderFlags()
+                                .getFeatureFlags());
 
-  Constant *FlagsConstant = ConstantInt::get(M.getContext(), APInt(64, Flags));
-  auto *GV = new llvm::GlobalVariable(M, FlagsConstant->getType(), true,
+  Constant *FeatureFlagsConstant =
+      ConstantInt::get(M.getContext(), APInt(64, FeatureFlags));
+  auto *GV = new llvm::GlobalVariable(M, FeatureFlagsConstant->getType(), true,
                                       GlobalValue::PrivateLinkage,
-                                      FlagsConstant, "dx.sfi0");
+                                      FeatureFlagsConstant, "dx.sfi0");
   GV->setSection("SFI0");
   GV->setAlignment(Align(4));
   return GV;
diff --git a/llvm/lib/Target/DirectX/DXIL.td b/llvm/lib/Target/DirectX/DXIL.td
index 66b0ef24332c25..216fa5b10c8f4d 100644
--- a/llvm/lib/Target/DirectX/DXIL.td
+++ b/llvm/lib/Target/DirectX/DXIL.td
@@ -266,12 +266,26 @@ def Frac : DXILOpMapping<22, unary, int_dx_frac,
                          "Returns a fraction from 0 to 1 that represents the "
                          "decimal part of the input.",
                          [llvm_halforfloat_ty, LLVMMatchType<0>]>;
+def RSqrt : DXILOpMapping<25, unary, int_dx_rsqrt,
+                         "Returns the reciprocal of the square root of the specified value."
+                         "rsqrt(x) = 1 / sqrt(x).",
+                         [llvm_halforfloat_ty, LLVMMatchType<0>]>;
 def Round : DXILOpMapping<26, unary, int_round,
                          "Returns the input rounded to the nearest integer"
                          "within a floating-point type.",
                          [llvm_halforfloat_ty, LLVMMatchType<0>]>;
+def FMax : DXILOpMapping<35, binary, int_maxnum,
+                         "Float maximum. FMax(a,b) = a > b ? a : b">;
+def FMin : DXILOpMapping<36, binary, int_minnum,
+                         "Float minimum. FMin(a,b) = a < b ? a : b">;
+def SMax : DXILOpMapping<37, binary, int_smax,
+                         "Signed integer maximum. SMax(a,b) = a > b ? a : b">;
+def SMin : DXILOpMapping<38, binary, int_smin,
+                         "Signed integer minimum. SMin(a,b) = a < b ? a : b">;
 def UMax : DXILOpMapping<39, binary, int_umax,
                          "Unsigned integer maximum. UMax(a,b) = a > b ? a : b">;
+def UMin : DXILOpMapping<40, binary, int_umin,
+                         "Unsigned integer minimum. UMin(a,b) = a < b ? a : b">;
 def FMad : DXILOpMapping<46, tertiary, int_fmuladd,
                          "Floating point arithmetic multiply/add operation. fmad(m,a,b) = m * a + b.">;
 def IMad : DXILOpMapping<48, tertiary, int_dx_imad,
diff --git a/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp b/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp
new file mode 100644
index 00000000000000..0db42bc0a0fb64
--- /dev/null
+++ b/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp
@@ -0,0 +1,271 @@
+//===- DXILIntrinsicExpansion.cpp - Prepare LLVM Module for DXIL encoding--===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file This file contains DXIL intrinsic expansions for those that don't have
+//  opcodes in DirectX Intermediate Language (DXIL).
+//===----------------------------------------------------------------------===//
+
+#include "DXILIntrinsicExpansion.h"
+#include "DirectX.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IntrinsicsDirectX.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+
+#define DEBUG_TYPE "dxil-intrinsic-expansion"
+
+using namespace llvm;
+
+static bool isIntrinsicExpansion(Function &F) {
+  switch (F.getIntrinsicID()) {
+  case Intrinsic::exp:
+  case Intrinsic::dx_any:
+  case Intrinsic::dx_clamp:
+  case Intrinsic::dx_uclamp:
+  case Intrinsic::dx_lerp:
+  case Intrinsic::dx_rcp:
+  case Intrinsic::dx_sdot:
+  case Intrinsic::dx_udot:
+    return true;
+  }
+  return false;
+}
+
+static bool expandIntegerDot(CallInst *Orig, Intrinsic::ID DotIntrinsic) {
+  assert(DotIntrinsic == Intrinsic::dx_sdot ||
+         DotIntrinsic == Intrinsic::dx_udot);
+  Intrinsic::ID MadIntrinsic = DotIntrinsic == Intrinsic::dx_sdot
+                                   ? Intrinsic::dx_imad
+                                   : Intrinsic::dx_umad;
+  Value *A = Orig->getOperand(0);
+  Value *B = Orig->getOperand(1);
+  Type *ATy = A->getType();
+  Type *BTy = B->getType();
+  assert(ATy->isVectorTy() && BTy->isVectorTy());
+
+  IRBuilder<> Builder(Orig->getParent());
+  Builder.SetInsertPoint(Orig);
+
+  auto *AVec = dyn_cast<FixedVectorType>(A->getType());
+  Value *Elt0 = Builder.CreateExtractElement(A, (uint64_t)0);
+  Value *Elt1 = Builder.CreateExtractElement(B, (uint64_t)0);
+  Value *Result = Builder.CreateMul(Elt0, Elt1);
+  for (unsigned I = 1; I < AVec->getNumElements(); I++) {
+    Elt0 = Builder.CreateExtractElement(A, I);
+    Elt1 = Builder.CreateExtractElement(B, I);
+    Result = Builder.CreateIntrinsic(Result->getType(), MadIntrinsic,
+                                     ArrayRef<Value *>{Elt0, Elt1, Result},
+                                     nullptr, "dx.mad");
+  }
+  Orig->replaceAllUsesWith(Result);
+  Orig->eraseFromParent();
+  return true;
+}
+
+static bool expandExpIntrinsic(CallInst *Orig) {
+  Value *X = Orig->getOperand(0);
+  IRBuilder<> Builder(Orig->getParent());
+  Builder.SetInsertPoint(Orig);
+  Type *Ty = X->getType();
+  Type *EltTy = Ty->getScalarType();
+  Constant *Log2eConst =
+      Ty->isVectorTy() ? ConstantVector::getSplat(
+                             ElementCount::getFixed(
+                                 cast<FixedVectorType>(Ty)->getNumElements()),
+                             ConstantFP::get(EltTy, numbers::log2e))
+                       : ConstantFP::get(EltTy, numbers::log2e);
+  Value *NewX = Builder.CreateFMul(Log2eConst, X);
+  auto *Exp2Call =
+      Builder.CreateIntrinsic(Ty, Intrinsic::exp2, {NewX}, nullptr, "dx.exp2");
+  Exp2Call->setTailCall(Orig->isTailCall());
+  Exp2Call->setAttributes(Orig->getAttributes());
+  Orig->replaceAllUsesWith(Exp2Call);
+  Orig->eraseFromParent();
+  return true;
+}
+
+static bool expandAnyIntrinsic(CallInst *Orig) {
+  Value *X = Orig->getOperand(0);
+  IRBuilder<> Builder(Orig->getParent());
+  Builder.SetInsertPoint(Orig);
+  Type *Ty = X->getType();
+  Type *EltTy = Ty->getScalarType();
+
+  if (!Ty->isVectorTy()) {
+    Value *Cond = EltTy->isFloatingPointTy()
+                      ? Builder.CreateFCmpUNE(X, ConstantFP::get(EltTy, 0))
+                      : Builder.CreateICmpNE(X, ConstantInt::get(EltTy, 0));
+    Orig->replaceAllUsesWith(Cond);
+  } else {
+    auto *XVec = dyn_cast<FixedVectorType>(Ty);
+    Value *Cond =
+        EltTy->isFloatingPointTy()
+            ? Builder.CreateFCmpUNE(
+                  X, ConstantVector::getSplat(
+                         ElementCount::getFixed(XVec->getNumElements()),
+                         ConstantFP::get(EltTy, 0)))
+            : Builder.CreateICmpNE(
+                  X, ConstantVector::getSplat(
+                         ElementCount::getFixed(XVec->getNumElements()),
+                         ConstantInt::get(EltTy, 0)));
+    Value *Result = Builder.CreateExtractElement(Cond, (uint64_t)0);
+    for (unsigned I = 1; I < XVec->getNumElements(); I++) {
+      Value *Elt = Builder.CreateExtractElement(Cond, I);
+      Result = Builder.CreateOr(Result, Elt);
+    }
+    Orig->replaceAllUsesWith(Result);
+  }
+  Orig->eraseFromParent();
+  return true;
+}
+
+static bool expandLerpIntrinsic(CallInst *Orig) {
+  Value *X = Orig->getOperand(0);
+  Value *Y = Orig->getOperand(1);
+  Value *S = Orig->getOperand(2);
+  IRBuilder<> Builder(Orig->getParent());
+  Builder.SetInsertPoint(Orig);
+  auto *V = Builder.CreateFSub(Y, X);
+  V = Builder.CreateFMul(S, V);
+  auto *Result = Builder.CreateFAdd(X, V, "dx.lerp");
+  Orig->replaceAllUsesWith(Result);
+  Orig->eraseFromParent();
+  return true;
+}
+
+static bool expandRcpIntrinsic(CallInst *Orig) {
+  Value *X = Orig->getOperand(0);
+  IRBuilder<> Builder(Orig->getParent());
+  Builder.SetInsertPoint(Orig);
+  Type *Ty = X->getType();
+  Type *EltTy = Ty->getScalarType();
+  Constant *One =
+      Ty->isVectorTy()
+          ? ConstantVector::getSplat(
+                ElementCount::getFixed(
+                    dyn_cast<FixedVectorType>(Ty)->getNumElements()),
+                ConstantFP::get(EltTy, 1.0))
+          : ConstantFP::get(EltTy, 1.0);
+  auto *Result = Builder.CreateFDiv(One, X, "dx.rcp");
+  Orig->replaceAllUsesWith(Result);
+  Orig->eraseFromParent();
+  return true;
+}
+
+static Intrinsic::ID getMaxForClamp(Type *ElemTy,
+                                    Intrinsic::ID ClampIntrinsic) {
+  if (ClampIntrinsic == Intrinsic::dx_uclamp)
+    return Intrinsic::umax;
+  assert(ClampIntrinsic == Intrinsic::dx_clamp);
+  if (ElemTy->isVectorTy())
+    ElemTy = ElemTy->getScalarType();
+  if (ElemTy->isIntegerTy())
+    return Intrinsic::smax;
+  assert(ElemTy->isFloatingPointTy());
+  return Intrinsic::maxnum;
+}
+
+static Intrinsic::ID getMinForClamp(Type *ElemTy,
+                                    Intrinsic::ID ClampIntrinsic) {
+  if (ClampIntrinsic == Intrinsic::dx_uclamp)
+    return Intrinsic::umin;
+  assert(ClampIntrinsic == Intrinsic::dx_clamp);
+  if (ElemTy->isVectorTy())
+    ElemTy = ElemTy->getScalarType();
+  if (ElemTy->isIntegerTy())
+    return Intrinsic::smin;
+  assert(ElemTy->isFloatingPointTy());
+  return Intrinsic::minnum;
+}
+
+static bool expandClampIntrinsic(CallInst *Orig, Intrinsic::ID ClampIntrinsic) {
+  Value *X = Orig->getOperand(0);
+  Value *Min = Orig->getOperand(1);
+  Value *Max = Orig->getOperand(2);
+  Type *Ty = X->getType();
+  IRBuilder<> Builder(Orig->getParent());
+  Builder.SetInsertPoint(Orig);
+  auto *MaxCall = Builder.CreateIntrinsic(
+      Ty, getMaxForClamp(Ty, ClampIntrinsic), {X, Min}, nullptr, "dx.max");
+  auto *MinCall =
+      Builder.CreateIntrinsic(Ty, getMinForClamp(Ty, ClampIntrinsic),
+                              {MaxCall, Max}, nullptr, "dx.min");
+
+  Orig->replaceAllUsesWith(MinCall);
+  Orig->eraseFromParent();
+  return true;
+}
+
+static bool expandIntrinsic(Function &F, CallInst *Orig) {
+  switch (F.getIntrinsicID()) {
+  case Intrinsic::exp:
+    return expandExpIntrinsic(Orig);
+  case Intrinsic::dx_any:
+    return expandAnyIntrinsic(Orig);
+  case Intrinsic::dx_uclamp:
+  case Intrinsic::dx_clamp:
+    return expandClampIntrinsic(Orig, F.getIntrinsicID());
+  case Intrinsic::dx_lerp:
+    return expandLerpIntrinsic(Orig);
+  case Intrinsic::dx_rcp:
+    return expandRcpIntrinsic(Orig);
+  case Intrinsic::dx_sdot:
+  case Intrinsic::dx_udot:
+    return expandIntegerDot(Orig, F.getIntrinsicID());
+  }
+  return false;
+}
+
+static bool expansionIntrinsics(Module &M) {
+  for (auto &F : make_early_inc_range(M.functions())) {
+    if (!isIntrinsicExpansion(F))
+      continue;
+    bool IntrinsicExpanded = false;
+    for (User *U : make_early_inc_range(F.users())) {
+      auto *IntrinsicCall = dyn_cast<CallInst>(U);
+      if (!IntrinsicCall)
+        continue;
+      IntrinsicExpanded = expandIntrinsic(F, IntrinsicCall);
+    }
+    if (F.user_empty() && IntrinsicExpanded)
+      F.eraseFromParent();
+  }
+  return true;
+}
+
+PreservedAnalyses DXILIntrinsicExpansion::run(Module &M,
+                                              ModuleAnalysisManager &) {
+  if (expansionIntrinsics(M))
+    return PreservedAnalyses::none();
+  return PreservedAnalyses::all();
+}
+
+bool DXILIntrinsicExpansionLegacy::runOnModule(Module &M) {
+  return expansionIntrinsics(M);
+}
+
+char DXILIntrinsicExpansionLegacy::ID = 0;
+
+INITIALIZE_PASS_BEGIN(DXILIntrinsicExpansionLegacy, DEBUG_TYPE,
+                      "DXIL Intrinsic Expansion", false, false)
+INITIALIZE_PASS_END(DXILIntrinsicExpansionLegacy, DEBUG_TYPE,
+                    "DXIL Intrinsic Expansion", false, false)
+
+ModulePass *llvm::createDXILIntrinsicExpansionLegacyPass() {
+  return new DXILIntrinsicExpansionLegacy();
+}
diff --git a/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.h b/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.h
new file mode 100644
index 00000000000000..c86681af7a3712
--- /dev/null
+++ b/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.h
@@ -0,0 +1,33 @@
+//===- DXILIntrinsicExpansion.h - Prepare LLVM Module for DXIL encoding----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#ifndef LLVM_TARGET_DIRECTX_DXILINTRINSICEXPANSION_H
+#define LLVM_TARGET_DIRECTX_DXILINTRINSICEXPANSION_H
+
+#include "DXILResource.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/Pass.h"
+
+namespace llvm {
+
+/// A pass that transforms DXIL Intrinsics that don't have DXIL opCodes
+class DXILIntrinsicExpansion : public PassInfoMixin<DXILIntrinsicExpansion> {
+public:
+  PreservedAnalyses run(Module &M, ModuleAnalysisManager &);
+};
+
+class DXILIntrinsicExpansionLegacy : public ModulePass {
+
+public:
+  bool runOnModule(Module &M) override;
+  DXILIntrinsicExpansionLegacy() : ModulePass(ID) {}
+
+  static char ID; // Pass identification.
+};
+} // namespace llvm
+
+#endif // LLVM_TARGET_DIRECTX_DXILINTRINSICEXPANSION_H
diff --git a/llvm/lib/Target/DirectX/DXILOpLowering.cpp b/llvm/lib/Target/DirectX/DXILOpLowering.cpp
index 6b649b76beecdf..e5c2042e7d16ae 100644
--- a/llvm/lib/Target/DirectX/DXILOpLowering.cpp
+++ b/llvm/lib/Target/DirectX/DXILOpLowering.cpp
@@ -11,6 +11,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "DXILConstants.h"
+#include "DXILIntrinsicExpansion.h"
 #include "DXILOpBuilder.h"
 #include "DirectX.h"
 #include "llvm/ADT/SmallVector.h"
@@ -94,9 +95,12 @@ class DXILOpLoweringLegacy : public ModulePass {
   DXILOpLoweringLegacy() : ModulePass(ID) {}
 
   static char ID; // Pass identification.
+  void getAnalysisUsage(llvm::AnalysisUsage &AU) const override {
+    // Specify the passes that your pass depends on
+    AU.addRequired<DXILIntrinsicExpansionLegacy>();
+  }
 };
 char DXILOpLoweringLegacy::ID = 0;
-
 } // end anonymous namespace
 
 INITIALIZE_PASS_BEGIN(DXILOpLoweringLegacy, DEBUG_TYPE, "DXIL Op Lowering",
diff --git a/llvm/lib/Target/DirectX/DXILShaderFlags.cpp b/llvm/lib/Target/DirectX/DXILShaderFlags.cpp
index 66a9dc46bcbfbf..9fa137b4c025e1 100644
--- a/llvm/lib/Target/DirectX/DXILShaderFlags.cpp
+++ b/llvm/lib/Target/DirectX/DXILShaderFlags.cpp
@@ -51,9 +51,14 @@ void ComputedShaderFlags::print(raw_ostream &OS) const {
   if (FlagVal == 0)
     return;
   OS << "; Note: shader requires additional functionality:\n";
-#define SHADER_FEATURE_FLAG(bit, FlagName, Str)                                \
+#define SHADER_FEATURE_FLAG(FeatureBit, DxilModuleNum, FlagName, Str)          \
   if (FlagName)                                                                \
-    OS << ";       " Str "\n";
+    (OS << ";").indent(7) << Str << "\n";
+#include "llvm/BinaryFormat/DXContainerConstants.def"
+  OS << "; Note: extra DXIL module flags:\n";
+#define DXIL_MODULE_FLAG(DxilModuleBit, FlagName, Str)                         \
+  if (FlagName)                                                                \
+    (OS << ";").indent(7) << Str << "\n";
 #include "llvm/BinaryFormat/DXContainerConstants.def"
   OS << ";\n";
 }
diff --git a/llvm/lib/Target/DirectX/DXILShaderFlags.h b/llvm/lib/Target/DirectX/DXILShaderFlags.h
index 574a7b090f5281..1df7d27de13d3c 100644
--- a/llvm/lib/Target/DirectX/DXILShaderFlags.h
+++ b/llvm/lib/Target/DirectX/DXILShaderFlags.h
@@ -14,7 +14,6 @@
 #ifndef LLVM_TARGET_DIRECTX_DXILSHADERFLAGS_H
 #define LLVM_TARGET_DIRECTX_DXILSHADERFLAGS_H
 
-#include "llvm/BinaryFormat/DXContainer.h"
 #include "llvm/IR/PassManager.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/Compiler.h"
@@ -29,22 +28,37 @@ class GlobalVariable;
 namespace dxil {
 
 struct ComputedShaderFlags {
-#define SHADER_FEATURE_FLAG(bit, FlagName, Str) bool FlagName : 1;
+#define SHADER_FEATURE_FLAG(FeatureBit, DxilModuleBit, FlagName, Str)          \
+  bool FlagName : 1;
+#define DXIL_MODULE_FLAG(DxilModuleBit, FlagName, Str) bool FlagName : 1;
 #include "llvm/BinaryFormat/DXContainerConstants.def"
 
-#define SHADER_FEATURE_FLAG(bit, FlagName, Str) FlagName = false;
+#define SHADER_FEATURE_FLAG(FeatureBit, DxilModuleBit, FlagName, Str)          \
+  FlagName = false;
+#define DXIL_MODULE_FLAG(DxilModuleBit, FlagName, Str) FlagName = false;
   ComputedShaderFlags() {
 #include "llvm/BinaryFormat/DXContainerConstants.def"
   }
 
+  constexpr uint64_t getMask(int Bit) const {
+    return Bit != -1 ? 1ull << Bit : 0;
+  }
   operator uint64_t() const {
     uint64_t FlagValue = 0;
-#define SHADER_FEATURE_FLAG(bit, FlagName, Str)                                \
-  FlagValue |=                                                                 \
-      FlagName ? static_cast<uint64_t>(dxbc::FeatureFlags::FlagName) : 0ull;
+#define SHADER_FEATURE_FLAG(FeatureBit, DxilModuleBit, FlagName, Str)          \
+  FlagValue |= FlagName ? getMask(DxilModuleBit) : 0ull;
+#define DXIL_MODULE_FLAG(DxilModuleBit, FlagName, Str)                         \
+  FlagValue |= FlagName ? getMask(DxilModuleBit) : 0ull;
 #include "llvm/BinaryFormat/DXContainerConstants.def"
     return FlagValue;
   }
+  uint64_t getFeatureFlags() const {
+    uint64_t FeatureFlags = 0;
+#define SHADER_FEATURE_FLAG(FeatureBit, DxilModuleBit, FlagName, Str)          \
+  FeatureFlags |= FlagName ? getMask(FeatureBit) : 0ull;
+#include "llvm/BinaryFormat/DXContainerConstants.def"
+    return FeatureFlags;
+  }
 
   static ComputedShaderFlags computeFlags(Module &M);
   void print(raw_ostream &OS = dbgs()) const;
diff --git a/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp b/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp
index 9c959c66be8bdd..80d94bf0c9d488 100644
--- a/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp
+++ b/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp
@@ -53,8 +53,8 @@ bool DXILTranslateMetadata::runOnModule(Module &M) {
       getAnalysis<DXILResourceWrapper>().getDXILResource();
   Res.write(M);
 
-  const uint64_t Flags =
-      (uint64_t)(getAnalysis<ShaderFlagsAnalysisWrapper>().getShaderFlags());
+  const uint64_t Flags = static_cast<uint64_t>(
+      getAnalysis<ShaderFlagsAnalysisWrapper>().getShaderFlags());
   dxil::createEntryMD(M, Flags);
 
   return false;
diff --git a/llvm/lib/Target/DirectX/DirectX.h b/llvm/lib/Target/DirectX/DirectX.h
index eaecc3ac280c4c..11b5412c21d783 100644
--- a/llvm/lib/Target/DirectX/DirectX.h
+++ b/llvm/lib/Target/DirectX/DirectX.h
@@ -28,6 +28,12 @@ void initializeDXILPrepareModulePass(PassRegistry &);
 /// Pass to convert modules into DXIL-compatable modules
 ModulePass *createDXILPrepareModulePass();
 
+/// Initializer for DXIL Intrinsic Expansion
+void initializeDXILIntrinsicExpansionLegacyPass(PassRegistry &);
+
+/// Pass to expand intrinsic operations that lack DXIL opCodes
+ModulePass *createDXILIntrinsicExpansionLegacyPass();
+
 /// Initializer for DXILOpLowering
 void initializeDXILOpLoweringLegacyPass(PassRegistry &);
 
diff --git a/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp b/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp
index 06938f8c74f155..03c825b3977db3 100644
--- a/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp
+++ b/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp
@@ -39,6 +39,7 @@ using namespace llvm;
 extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeDirectXTarget() {
   RegisterTargetMachine<DirectXTargetMachine> X(getTheDirectXTarget());
   auto *PR = PassRegistry::getPassRegistry();
+  initializeDXILIntrinsicExpansionLegacyPass(*PR);
   initializeDXILPrepareModulePass(*PR);
   initializeEmbedDXILPassPass(*PR);
   initializeWriteDXILPassPass(*PR);
@@ -76,6 +77,7 @@ class DirectXPassConfig : public TargetPassConfig {
 
   FunctionPass *createTargetRegisterAllocator(bool) override { return nullptr; }
   void addCodeGenPrepare() override {
+    addPass(createDXILIntrinsicExpansionLegacyPass());
     addPass(createDXILOpLoweringLegacyPass());
     addPass(createDXILPrepareModulePass());
     addPass(createDXILTranslateMetadataPass());
diff --git a/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp b/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp
index fd7d25fa16d1da..864591d4eb9552 100644
--- a/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp
+++ b/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp
@@ -43,6 +43,7 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/Format.h"
+#include "llvm/Support/HexagonAttributes.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/SMLoc.h"
 #include "llvm/Support/SourceMgr.h"
@@ -79,7 +80,7 @@ static cl::opt<bool> ErrorNoncontigiousRegister(
     "merror-noncontigious-register",
     cl::desc("Error for register names that aren't contigious"),
     cl::init(false));
-
+static cl::opt<bool> AddBuildAttributes("hexagon-add-build-attributes");
 namespace {
 
 struct HexagonOperand;
@@ -120,6 +121,9 @@ class HexagonAsmParser : public MCTargetAsmParser {
                                SMLoc &EndLoc) override;
   bool ParseDirectiveSubsection(SMLoc L);
   bool ParseDirectiveComm(bool IsLocal, SMLoc L);
+
+  bool parseDirectiveAttribute(SMLoc L);
+
   bool RegisterMatchesArch(unsigned MatchNum) const;
 
   bool matchBundleOptions();
@@ -164,6 +168,9 @@ class HexagonAsmParser : public MCTargetAsmParser {
     Parser.addAliasForDirective(".word", ".4byte");
 
     MCAsmParserExtension::Initialize(_Parser);
+
+    if (AddBuildAttributes)
+      getTargetStreamer().emitTargetAttributes(*STI);
   }
 
   bool splitIdentifier(OperandVector &Operands);
@@ -652,6 +659,56 @@ bool HexagonAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
     return finishBundle(IDLoc, Out);
   return false;
 }
+/// parseDirectiveAttribute
+///  ::= .attribute int, int
+///  ::= .attribute Tag_name, int
+bool HexagonAsmParser::parseDirectiveAttribute(SMLoc L) {
+  MCAsmParser &Parser = getParser();
+  int64_t Tag;
+  SMLoc TagLoc = Parser.getTok().getLoc();
+  if (Parser.getTok().is(AsmToken::Identifier)) {
+    StringRef Name = Parser.getTok().getIdentifier();
+    std::optional<unsigned> Ret = ELFAttrs::attrTypeFromString(
+        Name, HexagonAttrs::getHexagonAttributeTags());
+    if (!Ret)
+      return Error(TagLoc, "attribute name not recognized: " + Name);
+    Tag = *Ret;
+    Parser.Lex();
+  } else {
+    const MCExpr *AttrExpr;
+
+    TagLoc = Parser.getTok().getLoc();
+    if (Parser.parseExpression(AttrExpr))
+      return true;
+
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(AttrExpr);
+    if (check(!CE, TagLoc, "expected numeric constant"))
+      return true;
+
+    Tag = CE->getValue();
+  }
+
+  if (Parser.parseComma())
+    return true;
+
+  // We currently only have integer values.
+  int64_t IntegerValue = 0;
+  SMLoc ValueExprLoc = Parser.getTok().getLoc();
+  const MCExpr *ValueExpr;
+  if (Parser.parseExpression(ValueExpr))
+    return true;
+
+  const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(ValueExpr);
+  if (!CE)
+    return Error(ValueExprLoc, "expected numeric constant");
+  IntegerValue = CE->getValue();
+
+  if (Parser.parseEOL())
+    return true;
+
+  getTargetStreamer().emitAttribute(Tag, IntegerValue);
+  return false;
+}
 
 /// ParseDirective parses the Hexagon specific directives
 bool HexagonAsmParser::ParseDirective(AsmToken DirectiveID) {
@@ -664,6 +721,8 @@ bool HexagonAsmParser::ParseDirective(AsmToken DirectiveID) {
     return ParseDirectiveComm(false, DirectiveID.getLoc());
   if (IDVal.lower() == ".subsection")
     return ParseDirectiveSubsection(DirectiveID.getLoc());
+  if (IDVal == ".attribute")
+    return parseDirectiveAttribute(DirectiveID.getLoc());
 
   return true;
 }
diff --git a/llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp b/llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp
index 4ee67cb05d4953..d2f64ac9e90b0b 100644
--- a/llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp
@@ -17,6 +17,7 @@
 #include "HexagonInstrInfo.h"
 #include "HexagonRegisterInfo.h"
 #include "HexagonSubtarget.h"
+#include "HexagonTargetStreamer.h"
 #include "MCTargetDesc/HexagonInstPrinter.h"
 #include "MCTargetDesc/HexagonMCExpr.h"
 #include "MCTargetDesc/HexagonMCInstrInfo.h"
@@ -46,6 +47,7 @@
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
 #include <algorithm>
 #include <cassert>
 #include <cstdint>
@@ -775,6 +777,24 @@ void HexagonAsmPrinter::emitInstruction(const MachineInstr *MI) {
   OutStreamer->emitInstruction(MCB, getSubtargetInfo());
 }
 
+void HexagonAsmPrinter::emitStartOfAsmFile(Module &M) {
+  if (TM.getTargetTriple().isOSBinFormatELF())
+    emitAttributes();
+}
+
+void HexagonAsmPrinter::emitEndOfAsmFile(Module &M) {
+  HexagonTargetStreamer &HTS =
+      static_cast<HexagonTargetStreamer &>(*OutStreamer->getTargetStreamer());
+  if (TM.getTargetTriple().isOSBinFormatELF())
+    HTS.finishAttributeSection();
+}
+
+void HexagonAsmPrinter::emitAttributes() {
+  HexagonTargetStreamer &HTS =
+      static_cast<HexagonTargetStreamer &>(*OutStreamer->getTargetStreamer());
+  HTS.emitTargetAttributes(*TM.getMCSubtargetInfo());
+}
+
 void HexagonAsmPrinter::EmitSled(const MachineInstr &MI, SledKind Kind) {
   static const int8_t NoopsInSledCount = 4;
   // We want to emit the following pattern:
diff --git a/llvm/lib/Target/Hexagon/HexagonAsmPrinter.h b/llvm/lib/Target/Hexagon/HexagonAsmPrinter.h
index 5cb1edf54d1a76..b555c885965036 100644
--- a/llvm/lib/Target/Hexagon/HexagonAsmPrinter.h
+++ b/llvm/lib/Target/Hexagon/HexagonAsmPrinter.h
@@ -29,6 +29,8 @@ class TargetMachine;
   class HexagonAsmPrinter : public AsmPrinter {
     const HexagonSubtarget *Subtarget = nullptr;
 
+    void emitAttributes();
+
   public:
     explicit HexagonAsmPrinter(TargetMachine &TM,
                                std::unique_ptr<MCStreamer> Streamer)
@@ -68,6 +70,8 @@ class TargetMachine;
                          const char *ExtraCode, raw_ostream &OS) override;
     bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
                                const char *ExtraCode, raw_ostream &OS) override;
+    void emitStartOfAsmFile(Module &M) override;
+    void emitEndOfAsmFile(Module &M) override;
   };
 
 } // end namespace llvm
diff --git a/llvm/lib/Target/Hexagon/HexagonCommonGEP.cpp b/llvm/lib/Target/Hexagon/HexagonCommonGEP.cpp
index 4ead8c68cb9014..b0de0405ce9619 100644
--- a/llvm/lib/Target/Hexagon/HexagonCommonGEP.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonCommonGEP.cpp
@@ -1109,7 +1109,7 @@ Value *HexagonCommonGEP::fabricateGEP(NodeVect &NA, BasicBlock::iterator At,
           break;
       }
     }
-    NewInst = GetElementPtrInst::Create(InpTy, Input, IdxList, "cgep", &*At);
+    NewInst = GetElementPtrInst::Create(InpTy, Input, IdxList, "cgep", At);
     NewInst->setIsInBounds(RN->Flags & GepNode::InBounds);
     LLVM_DEBUG(dbgs() << "new GEP: " << *NewInst << '\n');
     if (Idx < Num) {
diff --git a/llvm/lib/Target/Hexagon/HexagonStoreWidening.cpp b/llvm/lib/Target/Hexagon/HexagonStoreWidening.cpp
index a647e699a8f07a..9d8e5c53b8227a 100644
--- a/llvm/lib/Target/Hexagon/HexagonStoreWidening.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonStoreWidening.cpp
@@ -292,8 +292,8 @@ bool HexagonStoreWidening::storesAreAdjacent(const MachineInstr *S1,
   int Off1 = S1->getOperand(1).getImm();
   int Off2 = S2->getOperand(1).getImm();
 
-  return (Off1 >= 0) ? Off1+S1MO.getSize() == unsigned(Off2)
-                     : int(Off1+S1MO.getSize()) == Off2;
+  return (Off1 >= 0) ? Off1 + S1MO.getSize().getValue() == unsigned(Off2)
+                     : int(Off1 + S1MO.getSize().getValue()) == Off2;
 }
 
 /// Given a sequence of adjacent stores, and a maximum size of a single wide
@@ -315,7 +315,7 @@ bool HexagonStoreWidening::selectStores(InstrGroup::iterator Begin,
   assert(!FirstMI->memoperands_empty() && "Expecting some memory operands");
   const MachineMemOperand &FirstMMO = getStoreTarget(FirstMI);
   unsigned Alignment = FirstMMO.getAlign().value();
-  unsigned SizeAccum = FirstMMO.getSize();
+  unsigned SizeAccum = FirstMMO.getSize().getValue();
   unsigned FirstOffset = getStoreOffset(FirstMI);
 
   // The initial value of SizeAccum should always be a power of 2.
@@ -357,7 +357,7 @@ bool HexagonStoreWidening::selectStores(InstrGroup::iterator Begin,
     if (!storesAreAdjacent(S1, S2))
       break;
 
-    unsigned S2Size = getStoreTarget(S2).getSize();
+    unsigned S2Size = getStoreTarget(S2).getSize().getValue();
     if (SizeAccum + S2Size > std::min(MaxSize, Alignment))
       break;
 
@@ -405,7 +405,7 @@ bool HexagonStoreWidening::createWideStores(InstrGroup &OG, InstrGroup &NG,
     MachineOperand &SO = MI->getOperand(2);  // Source.
     assert(SO.isImm() && "Expecting an immediate operand");
 
-    unsigned NBits = MMO.getSize()*8;
+    unsigned NBits = MMO.getSize().getValue() * 8;
     unsigned Mask = (0xFFFFFFFFU >> (32-NBits));
     unsigned Val = (SO.getImm() & Mask) << Shift;
     Acc |= Val;
diff --git a/llvm/lib/Target/Hexagon/HexagonTargetStreamer.h b/llvm/lib/Target/Hexagon/HexagonTargetStreamer.h
index ff84363106629f..a2f93d476bfe1b 100644
--- a/llvm/lib/Target/Hexagon/HexagonTargetStreamer.h
+++ b/llvm/lib/Target/Hexagon/HexagonTargetStreamer.h
@@ -24,6 +24,15 @@ class HexagonTargetStreamer : public MCTargetStreamer {
   virtual void emitLocalCommonSymbolSorted(MCSymbol *Symbol, uint64_t Size,
                                            unsigned ByteAlign,
                                            unsigned AccessGranularity){};
+  void finish() override {}
+
+  virtual void finishAttributeSection() {}
+
+  virtual void emitAttribute(unsigned Attribute, unsigned Value) {}
+
+  void emitTargetAttributes(const MCSubtargetInfo &STI);
+
+  virtual void reset() {}
 };
 }
 
diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
index cf4b66f8bf8619..458b8717256f24 100644
--- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
@@ -138,20 +138,6 @@ ElementCount HexagonTTIImpl::getMinimumVF(unsigned ElemWidth,
   return ElementCount::getFixed((8 * ST.getVectorLength()) / ElemWidth);
 }
 
-InstructionCost HexagonTTIImpl::getScalarizationOverhead(
-    VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
-    TTI::TargetCostKind CostKind) {
-  return BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
-                                         CostKind);
-}
-
-InstructionCost
-HexagonTTIImpl::getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
-                                                 ArrayRef<Type *> Tys,
-                                                 TTI::TargetCostKind CostKind) {
-  return BaseT::getOperandsScalarizationOverhead(Args, Tys, CostKind);
-}
-
 InstructionCost HexagonTTIImpl::getCallInstrCost(Function *F, Type *RetTy,
                                                  ArrayRef<Type *> Tys,
                                                  TTI::TargetCostKind CostKind) {
diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
index ec0fd454c80847..fdb34f308e641e 100644
--- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
+++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
@@ -103,14 +103,6 @@ class HexagonTTIImpl : public BasicTTIImplBase<HexagonTTIImpl> {
     return true;
   }
 
-  InstructionCost getScalarizationOverhead(VectorType *Ty,
-                                           const APInt &DemandedElts,
-                                           bool Insert, bool Extract,
-                                           TTI::TargetCostKind CostKind);
-  InstructionCost
-  getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
-                                   ArrayRef<Type *> Tys,
-                                   TTI::TargetCostKind CostKind);
   InstructionCost getCallInstrCost(Function *F, Type *RetTy,
                                    ArrayRef<Type *> Tys,
                                    TTI::TargetCostKind CostKind);
diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp
index d1678fa0241ca8..22c82bf5ac1b40 100644
--- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp
+++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp
@@ -12,6 +12,8 @@
 //===----------------------------------------------------------------------===//
 
 #include "MCTargetDesc/HexagonMCELFStreamer.h"
+#include "HexagonTargetStreamer.h"
+#include "MCTargetDesc/HexagonMCChecker.h"
 #include "MCTargetDesc/HexagonMCInstrInfo.h"
 #include "MCTargetDesc/HexagonMCShuffler.h"
 #include "llvm/ADT/StringRef.h"
@@ -27,11 +29,13 @@
 #include "llvm/MC/MCSection.h"
 #include "llvm/MC/MCSectionELF.h"
 #include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/MC/MCSymbolELF.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/HexagonAttributes.h"
 #include "llvm/Support/MathExtras.h"
 #include <cassert>
 #include <cstdint>
@@ -147,6 +151,63 @@ void HexagonMCELFStreamer::HexagonMCEmitLocalCommonSymbol(MCSymbol *Symbol,
   HexagonMCEmitCommonSymbol(Symbol, Size, ByteAlignment, AccessSize);
 }
 
+static unsigned featureToArchVersion(unsigned Feature) {
+  switch (Feature) {
+  case Hexagon::ArchV5:
+    return 5;
+  case Hexagon::ArchV55:
+    return 55;
+  case Hexagon::ArchV60:
+  case Hexagon::ExtensionHVXV60:
+    return 60;
+  case Hexagon::ArchV62:
+  case Hexagon::ExtensionHVXV62:
+    return 62;
+  case Hexagon::ArchV65:
+  case Hexagon::ExtensionHVXV65:
+    return 65;
+  case Hexagon::ArchV66:
+  case Hexagon::ExtensionHVXV66:
+    return 66;
+  case Hexagon::ArchV67:
+  case Hexagon::ExtensionHVXV67:
+    return 67;
+  case Hexagon::ArchV68:
+  case Hexagon::ExtensionHVXV68:
+    return 68;
+  case Hexagon::ArchV69:
+  case Hexagon::ExtensionHVXV69:
+    return 69;
+  case Hexagon::ArchV71:
+  case Hexagon::ExtensionHVXV71:
+    return 71;
+  case Hexagon::ArchV73:
+  case Hexagon::ExtensionHVXV73:
+    return 73;
+  }
+  llvm_unreachable("Expected valid arch feature");
+  return 0;
+}
+
+void HexagonTargetStreamer::emitTargetAttributes(const MCSubtargetInfo &STI) {
+  auto Features = STI.getFeatureBits();
+  unsigned Arch = featureToArchVersion(Hexagon_MC::getArchVersion(Features));
+  std::optional<unsigned> HVXArch = Hexagon_MC::getHVXVersion(Features);
+  emitAttribute(HexagonAttrs::ARCH, Arch);
+  if (HVXArch)
+    emitAttribute(HexagonAttrs::HVXARCH, featureToArchVersion(*HVXArch));
+  if (Features.test(Hexagon::ExtensionHVXIEEEFP))
+    emitAttribute(HexagonAttrs::HVXIEEEFP, 1);
+  if (Features.test(Hexagon::ExtensionHVXQFloat))
+    emitAttribute(HexagonAttrs::HVXQFLOAT, 1);
+  if (Features.test(Hexagon::ExtensionZReg))
+    emitAttribute(HexagonAttrs::ZREG, 1);
+  if (Features.test(Hexagon::ExtensionAudio))
+    emitAttribute(HexagonAttrs::AUDIO, 1);
+  if (Features.test(Hexagon::FeatureCabac))
+    emitAttribute(HexagonAttrs::CABAC, 1);
+}
+
 namespace llvm {
 MCStreamer *createHexagonELFStreamer(Triple const &TT, MCContext &Context,
                                      std::unique_ptr<MCAsmBackend> MAB,
diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
index 0740ac58a33813..dc8328a6705da8 100644
--- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
+++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
@@ -35,6 +35,7 @@
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/MC/TargetRegistry.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/HexagonAttributes.h"
 #include "llvm/Support/raw_ostream.h"
 #include <cassert>
 #include <cstdint>
@@ -224,12 +225,13 @@ bool isSlot0Only(unsigned units) {
 namespace {
 
 class HexagonTargetAsmStreamer : public HexagonTargetStreamer {
+  formatted_raw_ostream &OS;
+  bool IsVerboseAsm;
+
 public:
-  HexagonTargetAsmStreamer(MCStreamer &S,
-                           formatted_raw_ostream &OS,
-                           bool isVerboseAsm,
-                           MCInstPrinter &IP)
-      : HexagonTargetStreamer(S) {}
+  HexagonTargetAsmStreamer(MCStreamer &S, formatted_raw_ostream &OS,
+                           bool IsVerboseAsm, MCInstPrinter &IP)
+      : HexagonTargetStreamer(S), OS(OS), IsVerboseAsm(IsVerboseAsm) {}
 
   void prettyPrintAsm(MCInstPrinter &InstPrinter, uint64_t Address,
                       const MCInst &Inst, const MCSubtargetInfo &STI,
@@ -266,6 +268,21 @@ class HexagonTargetAsmStreamer : public HexagonTargetStreamer {
     else
       OS << "\t}" << PacketBundle.second;
   }
+
+  void finish() override { finishAttributeSection(); }
+
+  void finishAttributeSection() override {}
+
+  void emitAttribute(unsigned Attribute, unsigned Value) override {
+    OS << "\t.attribute\t" << Attribute << ", " << Twine(Value);
+    if (IsVerboseAsm) {
+      StringRef Name = ELFAttrs::attrTypeAsString(
+          Attribute, HexagonAttrs::getHexagonAttributeTags());
+      if (!Name.empty())
+        OS << "\t// " << Name;
+    }
+    OS << "\n";
+  }
 };
 
 class HexagonTargetELFStreamer : public HexagonTargetStreamer {
@@ -279,7 +296,6 @@ class HexagonTargetELFStreamer : public HexagonTargetStreamer {
     MCA.setELFHeaderEFlags(Hexagon_MC::GetELFFlags(STI));
   }
 
-
   void emitCommonSymbolSorted(MCSymbol *Symbol, uint64_t Size,
                               unsigned ByteAlignment,
                               unsigned AccessSize) override {
@@ -297,6 +313,27 @@ class HexagonTargetELFStreamer : public HexagonTargetStreamer {
     HexagonELFStreamer.HexagonMCEmitLocalCommonSymbol(
         Symbol, Size, Align(ByteAlignment), AccessSize);
   }
+
+  void finish() override { finishAttributeSection(); }
+
+  void reset() override { AttributeSection = nullptr; }
+
+private:
+  MCSection *AttributeSection = nullptr;
+
+  void finishAttributeSection() override {
+    MCELFStreamer &S = getStreamer();
+    if (S.Contents.empty())
+      return;
+
+    S.emitAttributesSection("hexagon", ".hexagon.attributes",
+                            ELF::SHT_HEXAGON_ATTRIBUTES, AttributeSection);
+  }
+
+  void emitAttribute(uint32_t Attribute, uint32_t Value) override {
+    getStreamer().setAttributeItem(Attribute, Value,
+                                   /*OverwriteExisting=*/true);
+  }
 };
 
 } // end anonymous namespace
@@ -591,6 +628,29 @@ void Hexagon_MC::addArchSubtarget(MCSubtargetInfo const *STI, StringRef FS) {
   }
 }
 
+std::optional<unsigned>
+Hexagon_MC::getHVXVersion(const FeatureBitset &Features) {
+  for (auto Arch : {Hexagon::ExtensionHVXV73, Hexagon::ExtensionHVXV71,
+                    Hexagon::ExtensionHVXV69, Hexagon::ExtensionHVXV68,
+                    Hexagon::ExtensionHVXV67, Hexagon::ExtensionHVXV66,
+                    Hexagon::ExtensionHVXV65, Hexagon::ExtensionHVXV62,
+                    Hexagon::ExtensionHVXV60})
+    if (Features.test(Arch))
+      return Arch;
+  return {};
+}
+
+unsigned Hexagon_MC::getArchVersion(const FeatureBitset &Features) {
+  for (auto Arch :
+       {Hexagon::ArchV73, Hexagon::ArchV71, Hexagon::ArchV69, Hexagon::ArchV68,
+        Hexagon::ArchV67, Hexagon::ArchV66, Hexagon::ArchV65, Hexagon::ArchV62,
+        Hexagon::ArchV60, Hexagon::ArchV55, Hexagon::ArchV5})
+    if (Features.test(Arch))
+      return Arch;
+  llvm_unreachable("Expected arch v5-v73");
+  return 0;
+}
+
 unsigned Hexagon_MC::GetELFFlags(const MCSubtargetInfo &STI) {
   return StringSwitch<unsigned>(STI.getCPU())
       .Case("generic", llvm::ELF::EF_HEXAGON_MACH_V5)
diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h
index ffb81bca208df1..6110c7a93982ca 100644
--- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h
+++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h
@@ -81,7 +81,11 @@ namespace Hexagon_MC {
   unsigned GetELFFlags(const MCSubtargetInfo &STI);
 
   llvm::ArrayRef<MCPhysReg> GetVectRegRev();
-}
+
+  std::optional<unsigned> getHVXVersion(const FeatureBitset &Features);
+
+  unsigned getArchVersion(const FeatureBitset &Features);
+  } // namespace Hexagon_MC
 
 MCCodeEmitter *createHexagonMCCodeEmitter(const MCInstrInfo &MCII,
                                           MCContext &MCT);
diff --git a/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp b/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp
index 46f63a4103f9f9..cf163e4e12001c 100644
--- a/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp
+++ b/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp
@@ -453,6 +453,14 @@ class LoongArchOperand : public MCParsedAsmOperand {
   }
 
   bool isImm32() const { return isSImm<32>() || isUImm<32>(); }
+  bool isImm64() const {
+    if (!isImm())
+      return false;
+    int64_t Imm;
+    LoongArchMCExpr::VariantKind VK = LoongArchMCExpr::VK_LoongArch_None;
+    bool IsConstantImm = evaluateConstantImm(getImm(), Imm, VK);
+    return IsConstantImm && VK == LoongArchMCExpr::VK_LoongArch_None;
+  }
 
   /// Gets location of the first token of this operand.
   SMLoc getStartLoc() const override { return StartLoc; }
@@ -1514,6 +1522,10 @@ bool LoongArchAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
     SMLoc ErrorLoc = ((LoongArchOperand &)*Operands[ErrorInfo]).getStartLoc();
     return Error(ErrorLoc, "operand must be a 32 bit immediate");
   }
+  case Match_InvalidImm64: {
+    SMLoc ErrorLoc = ((LoongArchOperand &)*Operands[ErrorInfo]).getStartLoc();
+    return Error(ErrorLoc, "operand must be a 64 bit immediate");
+  }
   case Match_InvalidBareSymbol: {
     SMLoc ErrorLoc = ((LoongArchOperand &)*Operands[ErrorInfo]).getStartLoc();
     return Error(ErrorLoc, "operand must be a bare symbol name");
diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td
index fcfff08d1a9f54..80429bc45be14c 100644
--- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td
@@ -219,6 +219,9 @@ def grlenimm : Operand<GRLenVT>;
 def imm32 : Operand<GRLenVT> {
   let ParserMatchClass = ImmAsmOperand<"", 32, "">;
 }
+def imm64 : Operand<i64> {
+  let ParserMatchClass = ImmAsmOperand<"", 64, "">;
+}
 
 def uimm1 : Operand<GRLenVT>, ImmLeaf<GRLenVT, [{return isUInt<1>(Imm);}]>{
   let ParserMatchClass = UImmAsmOperand<1>;
@@ -2179,7 +2182,7 @@ let hasSideEffects = 0, mayLoad = 0, mayStore = 0, isCodeGenOnly = 0,
     isAsmParserOnly = 1 in {
 def PseudoLI_W : Pseudo<(outs GPR:$rd), (ins imm32:$imm), [],
                         "li.w", "$rd, $imm">;
-def PseudoLI_D : Pseudo<(outs GPR:$rd), (ins grlenimm:$imm), [],
+def PseudoLI_D : Pseudo<(outs GPR:$rd), (ins imm64:$imm), [],
                         "li.d", "$rd, $imm">, Requires<[IsLA64]>;
 }
 
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
index adfcea73615831..bdf299ae8ac112 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
@@ -1255,28 +1255,6 @@ void MipsTargetELFStreamer::emitDirectiveCpsetup(unsigned RegNo,
     emitRRI(Mips::SD, GPReg, Mips::SP, RegOrOffset, SMLoc(), &STI);
   }
 
-#if 0
-  // We haven't support -mabicalls -mno-shared yet.
-  if (-mno-shared) {
-    MCSymbol *GPSym = MCA.getContext().getOrCreateSymbol("__gnu_local_gp");
-    const MipsMCExpr *HiExpr = MipsMCExpr::create(
-        MipsMCExpr::MEK_HI, MCSymbolRefExpr::create(GPSym, MCA.getContext()),
-        MCA.getContext());
-    const MipsMCExpr *LoExpr = MipsMCExpr::create(
-        MipsMCExpr::MEK_LO, MCSymbolRefExpr::create(GPSym, MCA.getContext()),
-        MCA.getContext());
-
-    // lui $gp, %hi(__gnu_local_gp)
-    emitRX(Mips::LUi, GPReg, MCOperand::createExpr(HiExpr), SMLoc(), &STI);
-
-    // addiu  $gp, $gp, %lo(__gnu_local_gp)
-    emitRRX(Mips::ADDiu, GPReg, GPReg, MCOperand::createExpr(LoExpr), SMLoc(),
-            &STI);
-
-    return;
-  }
-#endif
-
   const MipsMCExpr *HiExpr = MipsMCExpr::createGpOff(
       MipsMCExpr::MEK_HI, MCSymbolRefExpr::create(&Sym, MCA.getContext()),
       MCA.getContext());
diff --git a/llvm/lib/Target/Mips/Mips.td b/llvm/lib/Target/Mips/Mips.td
index dbff25dfa0f36c..923e0c9cdde758 100644
--- a/llvm/lib/Target/Mips/Mips.td
+++ b/llvm/lib/Target/Mips/Mips.td
@@ -208,6 +208,10 @@ def FeatureUseIndirectJumpsHazard : SubtargetFeature<"use-indirect-jump-hazard",
                                                     "true", "Use indirect jump"
                         " guards to prevent certain speculation based attacks">;
 
+def FeatureStrictAlign
+    : SubtargetFeature<"strict-align", "StrictAlign", "true",
+                       "Disable unaligned load store for r6">;
+
 //===----------------------------------------------------------------------===//
 // Register File, Calling Conv, Instruction Descriptions
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/Mips/Mips16HardFloat.cpp b/llvm/lib/Target/Mips/Mips16HardFloat.cpp
index 5c96044b1f0b85..f7b623785b70c2 100644
--- a/llvm/lib/Target/Mips/Mips16HardFloat.cpp
+++ b/llvm/lib/Target/Mips/Mips16HardFloat.cpp
@@ -414,7 +414,7 @@ static bool fixupFPReturnAndCall(Function &F, Module *M,
             C, Attribute::getWithMemoryEffects(C, MemoryEffects::none()));
         A = A.addFnAttribute(C, Attribute::NoInline);
         FunctionCallee F = (M->getOrInsertFunction(Name, A, MyVoid, T));
-        CallInst::Create(F, Params, "", &I);
+        CallInst::Create(F, Params, "", I.getIterator());
       } else if (const CallInst *CI = dyn_cast<CallInst>(&I)) {
         FunctionType *FT = CI->getFunctionType();
         Function *F_ =  CI->getCalledFunction();
diff --git a/llvm/lib/Target/Mips/MipsISelLowering.cpp b/llvm/lib/Target/Mips/MipsISelLowering.cpp
index 7e5f148e7bf4e7..0a0d40751fcf05 100644
--- a/llvm/lib/Target/Mips/MipsISelLowering.cpp
+++ b/llvm/lib/Target/Mips/MipsISelLowering.cpp
@@ -365,8 +365,13 @@ MipsTargetLowering::MipsTargetLowering(const MipsTargetMachine &TM,
     setOperationAction(ISD::JumpTable,          MVT::i64,   Custom);
     setOperationAction(ISD::ConstantPool,       MVT::i64,   Custom);
     setOperationAction(ISD::SELECT,             MVT::i64,   Custom);
-    setOperationAction(ISD::LOAD,               MVT::i64,   Custom);
-    setOperationAction(ISD::STORE,              MVT::i64,   Custom);
+    if (Subtarget.hasMips64r6()) {
+      setOperationAction(ISD::LOAD,               MVT::i64,   Legal);
+      setOperationAction(ISD::STORE,              MVT::i64,   Legal);
+    } else {
+      setOperationAction(ISD::LOAD,               MVT::i64,   Custom);
+      setOperationAction(ISD::STORE,              MVT::i64,   Custom);
+    }
     setOperationAction(ISD::FP_TO_SINT,         MVT::i64,   Custom);
     setOperationAction(ISD::SHL_PARTS,          MVT::i64,   Custom);
     setOperationAction(ISD::SRA_PARTS,          MVT::i64,   Custom);
@@ -481,7 +486,12 @@ MipsTargetLowering::MipsTargetLowering(const MipsTargetMachine &TM,
   if (!Subtarget.hasMips64r2())
     setOperationAction(ISD::BSWAP, MVT::i64, Expand);
 
-  if (Subtarget.isGP64bit()) {
+  if (Subtarget.isGP64bit() && Subtarget.hasMips64r6()) {
+    setLoadExtAction(ISD::SEXTLOAD, MVT::i64, MVT::i32, Legal);
+    setLoadExtAction(ISD::ZEXTLOAD, MVT::i64, MVT::i32, Legal);
+    setLoadExtAction(ISD::EXTLOAD, MVT::i64, MVT::i32, Legal);
+    setTruncStoreAction(MVT::i64, MVT::i32, Legal);
+  } else if (Subtarget.isGP64bit()) {
     setLoadExtAction(ISD::SEXTLOAD, MVT::i64, MVT::i32, Custom);
     setLoadExtAction(ISD::ZEXTLOAD, MVT::i64, MVT::i32, Custom);
     setLoadExtAction(ISD::EXTLOAD, MVT::i64, MVT::i32, Custom);
diff --git a/llvm/lib/Target/Mips/MipsInstructionSelector.cpp b/llvm/lib/Target/Mips/MipsInstructionSelector.cpp
index 654f29d08af0b5..4e1e27088cce42 100644
--- a/llvm/lib/Target/Mips/MipsInstructionSelector.cpp
+++ b/llvm/lib/Target/Mips/MipsInstructionSelector.cpp
@@ -184,7 +184,8 @@ MipsInstructionSelector::selectLoadStoreOpCode(MachineInstr &I,
   const Register ValueReg = I.getOperand(0).getReg();
   const LLT Ty = MRI.getType(ValueReg);
   const unsigned TySize = Ty.getSizeInBits();
-  const unsigned MemSizeInBytes = (*I.memoperands_begin())->getSize();
+  const unsigned MemSizeInBytes =
+      (*I.memoperands_begin())->getSize().getValue();
   unsigned Opc = I.getOpcode();
   const bool isStore = Opc == TargetOpcode::G_STORE;
 
@@ -455,7 +456,8 @@ bool MipsInstructionSelector::select(MachineInstr &I) {
     }
 
     // Unaligned memory access
-    if (MMO->getAlign() < MMO->getSize() &&
+    if ((!MMO->getSize().hasValue() ||
+         MMO->getAlign() < MMO->getSize().getValue()) &&
         !STI.systemSupportsUnalignedAccess()) {
       if (MMO->getSize() != 4 || !isRegInGprb(I.getOperand(0).getReg(), MRI))
         return false;
diff --git a/llvm/lib/Target/Mips/MipsLegalizerInfo.cpp b/llvm/lib/Target/Mips/MipsLegalizerInfo.cpp
index f5e94235859a06..3307e840a2afd6 100644
--- a/llvm/lib/Target/Mips/MipsLegalizerInfo.cpp
+++ b/llvm/lib/Target/Mips/MipsLegalizerInfo.cpp
@@ -344,7 +344,7 @@ bool MipsLegalizerInfo::legalizeCustom(
   switch (MI.getOpcode()) {
   case G_LOAD:
   case G_STORE: {
-    unsigned MemSize = (**MI.memoperands_begin()).getSize();
+    unsigned MemSize = (**MI.memoperands_begin()).getSize().getValue();
     Register Val = MI.getOperand(0).getReg();
     unsigned Size = MRI.getType(Val).getSizeInBits();
 
diff --git a/llvm/lib/Target/Mips/MipsPreLegalizerCombiner.cpp b/llvm/lib/Target/Mips/MipsPreLegalizerCombiner.cpp
index acf0d6312ef51b..639e623954ffc6 100644
--- a/llvm/lib/Target/Mips/MipsPreLegalizerCombiner.cpp
+++ b/llvm/lib/Target/Mips/MipsPreLegalizerCombiner.cpp
@@ -70,9 +70,10 @@ class MipsPreLegalizerCombinerImpl : public Combiner {
       // subtarget doesn't support them.
       auto MMO = *MI.memoperands_begin();
       const MipsSubtarget &STI = MI.getMF()->getSubtarget<MipsSubtarget>();
-      if (!isPowerOf2_64(MMO->getSize()))
+      if (!MMO->getSize().hasValue() ||
+          !isPowerOf2_64(MMO->getSize().getValue()))
         return false;
-      bool isUnaligned = MMO->getAlign() < MMO->getSize();
+      bool isUnaligned = MMO->getAlign() < MMO->getSize().getValue();
       if (!STI.systemSupportsUnalignedAccess() && isUnaligned)
         return false;
 
diff --git a/llvm/lib/Target/Mips/MipsRegisterBankInfo.cpp b/llvm/lib/Target/Mips/MipsRegisterBankInfo.cpp
index db7afc3c86c57e..6af1fd8c88e570 100644
--- a/llvm/lib/Target/Mips/MipsRegisterBankInfo.cpp
+++ b/llvm/lib/Target/Mips/MipsRegisterBankInfo.cpp
@@ -155,7 +155,8 @@ static bool isGprbTwoInstrUnalignedLoadOrStore(const MachineInstr *MI) {
     auto MMO = *MI->memoperands_begin();
     const MipsSubtarget &STI = MI->getMF()->getSubtarget<MipsSubtarget>();
     if (MMO->getSize() == 4 && (!STI.systemSupportsUnalignedAccess() &&
-                                MMO->getAlign() < MMO->getSize()))
+                                (!MMO->getSize().hasValue() ||
+                                 MMO->getAlign() < MMO->getSize().getValue())))
       return true;
   }
   return false;
diff --git a/llvm/lib/Target/Mips/MipsSEISelLowering.cpp b/llvm/lib/Target/Mips/MipsSEISelLowering.cpp
index b87d13cea787dd..5c8d64e3b6e3cc 100644
--- a/llvm/lib/Target/Mips/MipsSEISelLowering.cpp
+++ b/llvm/lib/Target/Mips/MipsSEISelLowering.cpp
@@ -197,8 +197,13 @@ MipsSETargetLowering::MipsSETargetLowering(const MipsTargetMachine &TM,
   setOperationAction(ISD::SDIVREM, MVT::i32, Custom);
   setOperationAction(ISD::UDIVREM, MVT::i32, Custom);
   setOperationAction(ISD::ATOMIC_FENCE,       MVT::Other, Custom);
-  setOperationAction(ISD::LOAD,               MVT::i32, Custom);
-  setOperationAction(ISD::STORE,              MVT::i32, Custom);
+  if (Subtarget.hasMips32r6()) {
+    setOperationAction(ISD::LOAD,               MVT::i32, Legal);
+    setOperationAction(ISD::STORE,              MVT::i32, Legal);
+  } else {
+    setOperationAction(ISD::LOAD,               MVT::i32, Custom);
+    setOperationAction(ISD::STORE,              MVT::i32, Custom);
+  }
 
   setTargetDAGCombine(ISD::MUL);
 
@@ -425,6 +430,8 @@ bool MipsSETargetLowering::allowsMisalignedMemoryAccesses(
     if (Fast)
       *Fast = 1;
     return true;
+  } else if (Subtarget.hasMips32r6()) {
+    return false;
   }
 
   switch (SVT) {
diff --git a/llvm/lib/Target/Mips/MipsSubtarget.cpp b/llvm/lib/Target/Mips/MipsSubtarget.cpp
index 0134fcb341f187..1bd35ca4a41914 100644
--- a/llvm/lib/Target/Mips/MipsSubtarget.cpp
+++ b/llvm/lib/Target/Mips/MipsSubtarget.cpp
@@ -82,7 +82,7 @@ MipsSubtarget::MipsSubtarget(const Triple &TT, StringRef CPU, StringRef FS,
       HasDSPR2(false), HasDSPR3(false), AllowMixed16_32(Mixed16_32 || Mips_Os16),
       Os16(Mips_Os16), HasMSA(false), UseTCCInDIV(false), HasSym32(false),
       HasEVA(false), DisableMadd4(false), HasMT(false), HasCRC(false),
-      HasVirt(false), HasGINV(false), UseIndirectJumpsHazard(false),
+      HasVirt(false), HasGINV(false), UseIndirectJumpsHazard(false), StrictAlign(false),
       StackAlignOverride(StackAlignOverride), TM(TM), TargetTriple(TT),
       TSInfo(), InstrInfo(MipsInstrInfo::create(
                     initializeSubtargetDependencies(CPU, FS, TM))),
diff --git a/llvm/lib/Target/Mips/MipsSubtarget.h b/llvm/lib/Target/Mips/MipsSubtarget.h
index 225ee139d036f3..fea7f11fd07054 100644
--- a/llvm/lib/Target/Mips/MipsSubtarget.h
+++ b/llvm/lib/Target/Mips/MipsSubtarget.h
@@ -200,6 +200,9 @@ class MipsSubtarget : public MipsGenSubtargetInfo {
   // Assume 32-bit GOT.
   bool UseXGOT = false;
 
+  // Disable unaligned load store for r6.
+  bool StrictAlign;
+
   /// The minimum alignment known to hold of the stack frame on
   /// entry to the function and which must be maintained by every function.
   Align stackAlignment;
@@ -372,7 +375,9 @@ class MipsSubtarget : public MipsGenSubtargetInfo {
   /// MIPS32r6/MIPS64r6 require full unaligned access support but does not
   /// specify which component of the system provides it. Hardware, software, and
   /// hybrid implementations are all valid.
-  bool systemSupportsUnalignedAccess() const { return hasMips32r6(); }
+  bool systemSupportsUnalignedAccess() const {
+    return hasMips32r6() && !StrictAlign;
+  }
 
   // Set helper classes
   void setHelperClassesMips16();
diff --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
index 2219d9f6619aaf..2783b1e0ec73eb 100644
--- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
@@ -489,8 +489,11 @@ void NVPTXAsmPrinter::emitFunctionEntryLabel() {
   OutStreamer->emitRawText(StringRef("{\n"));
   setAndEmitFunctionVirtualRegisters(*MF);
   // Emit initial .loc debug directive for correct relocation symbol data.
-  if (MMI && MMI->hasDebugInfo())
-    emitInitialRawDwarfLocDirective(*MF);
+  if (const DISubprogram *SP = MF->getFunction().getSubprogram()) {
+    assert(SP->getUnit());
+    if (!SP->getUnit()->isDebugDirectivesOnly() && MMI && MMI->hasDebugInfo())
+      emitInitialRawDwarfLocDirective(*MF);
+  }
 }
 
 bool NVPTXAsmPrinter::runOnMachineFunction(MachineFunction &F) {
diff --git a/llvm/lib/Target/NVPTX/NVPTXFrameLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXFrameLowering.cpp
index 86fb367780dc1a..c34472c21ccbe9 100644
--- a/llvm/lib/Target/NVPTX/NVPTXFrameLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXFrameLowering.cpp
@@ -60,10 +60,12 @@ void NVPTXFrameLowering::emitPrologue(MachineFunction &MF,
                      NRI->getFrameRegister(MF))
                  .addReg(NRI->getFrameLocalRegister(MF));
     }
-    BuildMI(MBB, MBBI, dl,
-            MF.getSubtarget().getInstrInfo()->get(MovDepotOpcode),
-            NRI->getFrameLocalRegister(MF))
-        .addImm(MF.getFunctionNumber());
+    if (!MR.use_empty(NRI->getFrameLocalRegister(MF))) {
+      BuildMI(MBB, MBBI, dl,
+              MF.getSubtarget().getInstrInfo()->get(MovDepotOpcode),
+              NRI->getFrameLocalRegister(MF))
+          .addImm(MF.getFunctionNumber());
+    }
   }
 }
 
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index c979c03dc1b835..e1cdbd6a4b4dc6 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -580,9 +580,6 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
   setOperationAction(ISD::ROTL, MVT::i8, Expand);
   setOperationAction(ISD::ROTR, MVT::i8, Expand);
   setOperationAction(ISD::BSWAP, MVT::i16, Expand);
-  setOperationAction(ISD::BSWAP, MVT::v2i16, Expand);
-  setOperationAction(ISD::BSWAP, MVT::i32, Expand);
-  setOperationAction(ISD::BSWAP, MVT::i64, Expand);
 
   // Indirect branch is not supported.
   // This also disables Jump Table creation.
@@ -645,8 +642,6 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
   setOperationAction(ISD::ConstantFP, MVT::f16, Legal);
   setOperationAction(ISD::ConstantFP, MVT::bf16, Legal);
 
-  // Lowering of DYNAMIC_STACKALLOC is unsupported.
-  // Custom lower to produce an error.
   setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
   setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom);
 
@@ -937,6 +932,7 @@ const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
     MAKE_CASE(NVPTXISD::BFE)
     MAKE_CASE(NVPTXISD::BFI)
     MAKE_CASE(NVPTXISD::PRMT)
+    MAKE_CASE(NVPTXISD::DYNAMIC_STACKALLOC)
     MAKE_CASE(NVPTXISD::SETP_F16X2)
     MAKE_CASE(NVPTXISD::SETP_BF16X2)
     MAKE_CASE(NVPTXISD::Dummy)
@@ -2211,14 +2207,39 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
 
 SDValue NVPTXTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
                                                      SelectionDAG &DAG) const {
-  const Function &Fn = DAG.getMachineFunction().getFunction();
-
-  DiagnosticInfoUnsupported NoDynamicAlloca(
-      Fn, "dynamic alloca unsupported by NVPTX backend",
-      SDLoc(Op).getDebugLoc());
-  DAG.getContext()->diagnose(NoDynamicAlloca);
-  auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()), Op.getOperand(0)};
-  return DAG.getMergeValues(Ops, SDLoc());
+
+  if (STI.getPTXVersion() < 73 || STI.getSmVersion() < 52) {
+    const Function &Fn = DAG.getMachineFunction().getFunction();
+
+    DiagnosticInfoUnsupported NoDynamicAlloca(
+        Fn,
+        "Support for dynamic alloca introduced in PTX ISA version 7.3 and "
+        "requires target sm_52.",
+        SDLoc(Op).getDebugLoc());
+    DAG.getContext()->diagnose(NoDynamicAlloca);
+    auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()),
+                Op.getOperand(0)};
+    return DAG.getMergeValues(Ops, SDLoc());
+  }
+
+  SDValue Chain = Op.getOperand(0);
+  SDValue Size = Op.getOperand(1);
+  uint64_t Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
+  SDLoc DL(Op.getNode());
+
+  // The size for ptx alloca instruction is 64-bit for m64 and 32-bit for m32.
+  if (nvTM->is64Bit())
+    Size = DAG.getZExtOrTrunc(Size, DL, MVT::i64);
+  else
+    Size = DAG.getZExtOrTrunc(Size, DL, MVT::i32);
+
+  SDValue AllocOps[] = {Chain, Size,
+                        DAG.getTargetConstant(Align, DL, MVT::i32)};
+  SDValue Alloca = DAG.getNode(NVPTXISD::DYNAMIC_STACKALLOC, DL,
+                               nvTM->is64Bit() ? MVT::i64 : MVT::i32, AllocOps);
+
+  SDValue MergeOps[] = {Alloca, Chain};
+  return DAG.getMergeValues(MergeOps, DL);
 }
 
 // By default CONCAT_VECTORS is lowered by ExpandVectorBuildThroughStack()
@@ -6100,6 +6121,9 @@ NVPTXTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
 
   if (AI->isFloatingPointOperation()) {
     if (AI->getOperation() == AtomicRMWInst::BinOp::FAdd) {
+      if (Ty->isHalfTy() && STI.getSmVersion() >= 70 &&
+          STI.getPTXVersion() >= 63)
+        return AtomicExpansionKind::None;
       if (Ty->isFloatTy())
         return AtomicExpansionKind::None;
       if (Ty->isDoubleTy() && STI.hasAtomAddF64())
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
index cf1d4580766918..c9db10e555cefe 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -61,6 +61,7 @@ enum NodeType : unsigned {
   BFE,
   BFI,
   PRMT,
+  DYNAMIC_STACKALLOC,
   Dummy,
 
   LoadV2 = ISD::FIRST_TARGET_MEMORY_OPCODE,
diff --git a/llvm/lib/Target/NVPTX/NVPTXImageOptimizer.cpp b/llvm/lib/Target/NVPTX/NVPTXImageOptimizer.cpp
index 202134ed7035aa..c3fce14df06884 100644
--- a/llvm/lib/Target/NVPTX/NVPTXImageOptimizer.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXImageOptimizer.cpp
@@ -157,7 +157,7 @@ void NVPTXImageOptimizer::replaceWith(Instruction *From, ConstantInt *To) {
       else
         // Get true block
         Dest = BI->getSuccessor(0);
-      BranchInst::Create(Dest, BI);
+      BranchInst::Create(Dest, BI->getIterator());
       InstrToDelete.push_back(BI);
     }
   }
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 3dc5b450cbf5cf..4daf47cfd4df8a 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -3549,6 +3549,11 @@ let hasSideEffects = false in {
                              (ins Int64Regs:$s),
                              "{{ .reg .b32 tmp; mov.b64 {tmp, $high}, $s; }}",
                              []>;
+  def I64toI32L  : NVPTXInst<(outs Int32Regs:$low),
+                             (ins Int64Regs:$s),
+                             "{{ .reg .b32 tmp; mov.b64 {$low, tmp}, $s; }}",
+                             []>;
+
 }
 
 // Using partial vectorized move produces better SASS code for extraction of
@@ -3805,6 +3810,28 @@ def CALL_PROTOTYPE :
   NVPTXInst<(outs), (ins ProtoIdent:$ident),
             "$ident", [(CallPrototype (i32 texternalsym:$ident))]>;
 
+def SDTDynAllocaOp :
+  SDTypeProfile<1, 2, [SDTCisSameAs<0, 1>, SDTCisInt<1>, SDTCisInt<2>]>;
+
+def dyn_alloca :
+  SDNode<"NVPTXISD::DYNAMIC_STACKALLOC", SDTDynAllocaOp,
+         [SDNPHasChain, SDNPSideEffect]>;
+
+def DYNAMIC_STACKALLOC32 :
+  NVPTXInst<(outs Int32Regs:$ptr),
+            (ins Int32Regs:$size, i32imm:$align),
+            "alloca.u32 \t$ptr, $size, $align;\n\t"
+            "cvta.local.u32 \t$ptr, $ptr;",
+            [(set (i32 Int32Regs:$ptr), (dyn_alloca Int32Regs:$size, (i32 timm:$align)))]>,
+            Requires<[hasPTX<73>, hasSM<52>]>;
+
+def DYNAMIC_STACKALLOC64 :
+  NVPTXInst<(outs Int64Regs:$ptr),
+            (ins Int64Regs:$size, i32imm:$align),
+            "alloca.u64 \t$ptr, $size, $align;\n\t"
+            "cvta.local.u64 \t$ptr, $ptr;",
+            [(set Int64Regs:$ptr, (dyn_alloca Int64Regs:$size, (i32 timm:$align)))]>,
+            Requires<[hasPTX<73>, hasSM<52>]>;
 
 include "NVPTXIntrinsics.td"
 
@@ -3816,3 +3843,17 @@ include "NVPTXIntrinsics.td"
 // - for sm_20, use pmpt (use vector scalar mov to get the pack and
 //   unpack). sm_20 supports native 32-bit register, but not native 16-bit
 // register.
+
+def : Pat <
+  (i32 (bswap i32:$a)),
+  (INT_NVVM_PRMT Int32Regs:$a, (i32 0), (i32 0x0123))>;
+
+def : Pat <
+  (v2i16 (bswap v2i16:$a)),
+  (INT_NVVM_PRMT Int32Regs:$a, (i32 0), (i32 0x2301))>;
+
+def : Pat <
+  (i64 (bswap i64:$a)),
+  (V2I32toI64
+    (INT_NVVM_PRMT (I64toI32H Int64Regs:$a), (i32 0), (i32 0x0123)),
+    (INT_NVVM_PRMT (I64toI32L Int64Regs:$a), (i32 0), (i32 0x0123)))>;
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index 477789a164ead2..c0c53380a13e9b 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -1520,7 +1520,7 @@ multiclass F_ATOMIC_2_imp<ValueType ptrT, NVPTXRegClass ptrclass,
   def imm : NVPTXInst<(outs regclass:$dst), (ins ptrclass:$addr, IMMType:$b),
     !strconcat("atom", SpaceStr, OpcStr, TypeStr, " \t$dst, [$addr], $b;", ""),
     [(set (regT regclass:$dst), (IntOp (ptrT ptrclass:$addr), IMM:$b))]>,
-  Requires<Pred>;
+  Requires<!if(!eq(TypeStr, ".f16"), [Predicate<"false">], Pred)>;
 }
 multiclass F_ATOMIC_2<ValueType regT, NVPTXRegClass regclass, string SpaceStr, string TypeStr,
   string OpcStr, PatFrag IntOp, Operand IMMType, SDNode IMM,
@@ -1630,6 +1630,13 @@ defm INT_PTX_ATOM_ADD_GEN_64 : F_ATOMIC_2<i64, Int64Regs, "", ".u64", ".add",
 defm INT_PTX_ATOM_ADD_GEN_64_USE_G : F_ATOMIC_2<i64, Int64Regs, ".global", ".u64",
   ".add", atomic_load_add_64_gen, i64imm, imm>;
 
+defm INT_PTX_ATOM_ADD_G_F16 : F_ATOMIC_2<f16, Int16Regs, ".global", ".f16", ".add.noftz",
+  atomic_load_add_g, f16imm, fpimm, [hasSM<70>, hasPTX<63>]>;
+defm INT_PTX_ATOM_ADD_S_F16 : F_ATOMIC_2<f16, Int16Regs, ".shared", ".f16", ".add.noftz",
+  atomic_load_add_s, f16imm, fpimm, [hasSM<70>, hasPTX<63>]>;
+defm INT_PTX_ATOM_ADD_GEN_F16 : F_ATOMIC_2<f16, Int16Regs, "", ".f16", ".add.noftz",
+  atomic_load_add_gen, f16imm, fpimm, [hasSM<70>, hasPTX<63>]>;
+
 defm INT_PTX_ATOM_ADD_G_F32 : F_ATOMIC_2<f32, Float32Regs, ".global", ".f32", ".add",
   atomic_load_add_g, f32imm, fpimm>;
 defm INT_PTX_ATOM_ADD_S_F32 : F_ATOMIC_2<f32, Float32Regs, ".shared", ".f32", ".add",
@@ -2007,6 +2014,9 @@ multiclass ATOM2P_impl<string AsmStr,  Intrinsic Intr,
                        SDNode Imm, ValueType ImmTy,
                        list<Predicate> Preds> {
   let AddedComplexity = 1 in {
+    def : ATOM23_impl<AsmStr, regT, regclass, Preds,
+                      (ins Int16Regs:$src, regclass:$b),
+                      (Intr (i16 Int16Regs:$src), (regT regclass:$b))>;
     def : ATOM23_impl<AsmStr, regT, regclass, Preds,
                       (ins Int32Regs:$src, regclass:$b),
                       (Intr (i32 Int32Regs:$src), (regT regclass:$b))>;
@@ -2017,6 +2027,9 @@ multiclass ATOM2P_impl<string AsmStr,  Intrinsic Intr,
   // tablegen can't infer argument types from Intrinsic (though it can
   // from Instruction) so we have to enforce specific type on
   // immediates via explicit cast to ImmTy.
+  def : ATOM23_impl<AsmStr, regT, regclass, Preds,
+                    (ins Int16Regs:$src, ImmType:$b),
+                    (Intr (i16 Int16Regs:$src), (ImmTy Imm:$b))>;
   def : ATOM23_impl<AsmStr, regT, regclass, Preds,
                     (ins Int32Regs:$src, ImmType:$b),
                     (Intr (i32 Int32Regs:$src), (ImmTy Imm:$b))>;
@@ -2136,6 +2149,8 @@ multiclass ATOM2_add_impl<string OpStr> {
    defm _s32  : ATOM2S_impl<OpStr, "i", "s32", i32, Int32Regs, i32imm, imm, i32, []>;
    defm _u32  : ATOM2S_impl<OpStr, "i", "u32", i32, Int32Regs, i32imm, imm, i32, []>;
    defm _u64  : ATOM2S_impl<OpStr, "i", "u64", i64, Int64Regs, i64imm, imm, i64, []>;
+   defm _f16  : ATOM2S_impl<OpStr, "f", "f16", f16, Int16Regs, f16imm, fpimm, f16,
+                            [hasSM<70>, hasPTX<63>]>;
    defm _f32  : ATOM2S_impl<OpStr, "f", "f32", f32, Float32Regs, f32imm, fpimm, f32,
                             []>;
    defm _f64  : ATOM2S_impl<OpStr, "f", "f64", f64, Float64Regs, f64imm, fpimm, f64,
diff --git a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
index 9e06f4689304ba..cde02c25c48342 100644
--- a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
@@ -183,16 +183,16 @@ static void convertToParamAS(Value *OldUser, Value *Param) {
     }
     if (auto *GEP = dyn_cast<GetElementPtrInst>(I.OldInstruction)) {
       SmallVector<Value *, 4> Indices(GEP->indices());
-      auto *NewGEP = GetElementPtrInst::Create(GEP->getSourceElementType(),
-                                               I.NewParam, Indices,
-                                               GEP->getName(), GEP);
+      auto *NewGEP = GetElementPtrInst::Create(
+          GEP->getSourceElementType(), I.NewParam, Indices, GEP->getName(),
+          GEP->getIterator());
       NewGEP->setIsInBounds(GEP->isInBounds());
       return NewGEP;
     }
     if (auto *BC = dyn_cast<BitCastInst>(I.OldInstruction)) {
       auto *NewBCType = PointerType::get(BC->getContext(), ADDRESS_SPACE_PARAM);
       return BitCastInst::Create(BC->getOpcode(), I.NewParam, NewBCType,
-                                 BC->getName(), BC);
+                                 BC->getName(), BC->getIterator());
     }
     if (auto *ASC = dyn_cast<AddrSpaceCastInst>(I.OldInstruction)) {
       assert(ASC->getDestAddressSpace() == ADDRESS_SPACE_PARAM);
@@ -316,7 +316,7 @@ static void adjustByValArgAlignment(Argument *Arg, Value *ArgInParamAS,
 void NVPTXLowerArgs::handleByValParam(const NVPTXTargetMachine &TM,
                                       Argument *Arg) {
   Function *Func = Arg->getParent();
-  Instruction *FirstInst = &(Func->getEntryBlock().front());
+  BasicBlock::iterator FirstInst = Func->getEntryBlock().begin();
   Type *StructType = Arg->getParamByValType();
   assert(StructType && "Missing byval type");
 
@@ -407,9 +407,9 @@ void NVPTXLowerArgs::markPointerAsGlobal(Value *Ptr) {
 
   Instruction *PtrInGlobal = new AddrSpaceCastInst(
       Ptr, PointerType::get(Ptr->getContext(), ADDRESS_SPACE_GLOBAL),
-      Ptr->getName(), &*InsertPt);
+      Ptr->getName(), InsertPt);
   Value *PtrInGeneric = new AddrSpaceCastInst(PtrInGlobal, Ptr->getType(),
-                                              Ptr->getName(), &*InsertPt);
+                                              Ptr->getName(), InsertPt);
   // Replace with PtrInGeneric all uses of Ptr except PtrInGlobal.
   Ptr->replaceAllUsesWith(PtrInGeneric);
   PtrInGlobal->setOperand(0, Ptr);
diff --git a/llvm/lib/Target/NVPTX/NVPTXLowerUnreachable.cpp b/llvm/lib/Target/NVPTX/NVPTXLowerUnreachable.cpp
index 34f06b548db260..92b90e2559154e 100644
--- a/llvm/lib/Target/NVPTX/NVPTXLowerUnreachable.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXLowerUnreachable.cpp
@@ -143,7 +143,7 @@ bool NVPTXLowerUnreachable::runOnFunction(Function &F) {
       if (auto unreachableInst = dyn_cast<UnreachableInst>(&I)) {
         if (isLoweredToTrap(*unreachableInst))
           continue; // trap is emitted as `trap; exit;`.
-        CallInst::Create(ExitFTy, Exit, "", unreachableInst);
+        CallInst::Create(ExitFTy, Exit, "", unreachableInst->getIterator());
         Changed = true;
       }
     }
diff --git a/llvm/lib/Target/PowerPC/GISel/PPCInstructionSelector.cpp b/llvm/lib/Target/PowerPC/GISel/PPCInstructionSelector.cpp
index 98cd3a82a6e05d..3283a5bb69404c 100644
--- a/llvm/lib/Target/PowerPC/GISel/PPCInstructionSelector.cpp
+++ b/llvm/lib/Target/PowerPC/GISel/PPCInstructionSelector.cpp
@@ -739,7 +739,7 @@ bool PPCInstructionSelector::select(MachineInstr &I) {
     auto SelectLoadStoreAddressingMode = [&]() -> MachineInstr * {
       const unsigned NewOpc = selectLoadStoreOp(
           I.getOpcode(), RBI.getRegBank(LdSt.getReg(0), MRI, TRI)->getID(),
-          LdSt.getMemSizeInBits());
+          LdSt.getMemSizeInBits().getValue());
 
       if (NewOpc == I.getOpcode())
         return nullptr;
diff --git a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
index 542854ec9b99fd..64cae1caa6437d 100644
--- a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
+++ b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
@@ -474,6 +474,35 @@ static void collectTOCStats(PPCAsmPrinter::TOCEntryType Type) {
   }
 }
 
+static CodeModel::Model getCodeModel(const PPCSubtarget &S,
+                                     const TargetMachine &TM,
+                                     const MachineOperand &MO) {
+  CodeModel::Model ModuleModel = TM.getCodeModel();
+
+  // If the operand is not a global address then there is no
+  // global variable to carry an attribute.
+  if (!(MO.getType() == MachineOperand::MO_GlobalAddress))
+    return ModuleModel;
+
+  const GlobalValue *GV = MO.getGlobal();
+  assert(GV && "expected global for MO_GlobalAddress");
+
+  return S.getCodeModel(TM, GV);
+}
+
+static void setOptionalCodeModel(MCSymbolXCOFF *XSym, CodeModel::Model CM) {
+  switch (CM) {
+  case CodeModel::Large:
+    XSym->setPerSymbolCodeModel(MCSymbolXCOFF::CM_Large);
+    return;
+  case CodeModel::Small:
+    XSym->setPerSymbolCodeModel(MCSymbolXCOFF::CM_Small);
+    return;
+  default:
+    report_fatal_error("Invalid code model for AIX");
+  }
+}
+
 /// lookUpOrCreateTOCEntry -- Given a symbol, look up whether a TOC entry
 /// exists for it.  If not, create one.  Then return a symbol that references
 /// the TOC entry.
@@ -1014,7 +1043,7 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
     // relative to the toc-base.
     if (IsAIX) {
       assert(
-          TM.getCodeModel() == CodeModel::Small &&
+          getCodeModel(*Subtarget, TM, MO) == CodeModel::Small &&
           "This pseudo should only be selected for 32-bit small code model.");
       Exp = getTOCEntryLoadingExprForXCOFF(MOSymbol, Exp, VK);
       TmpInst.getOperand(1) = MCOperand::createExpr(Exp);
@@ -1098,7 +1127,12 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
     return;
   }
   case PPC::ADDIStocHA: {
-    assert((IsAIX && !IsPPC64 && TM.getCodeModel() == CodeModel::Large) &&
+    const MachineOperand &MO = MI->getOperand(2);
+
+    assert((MO.isGlobal() || MO.isCPI() || MO.isJTI() || MO.isBlockAddress()) &&
+           "Invalid operand for ADDIStocHA.");
+    assert((IsAIX && !IsPPC64 &&
+            getCodeModel(*Subtarget, TM, MO) == CodeModel::Large) &&
            "This pseudo should only be selected for 32-bit large code model on"
            " AIX.");
 
@@ -1108,10 +1142,6 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
     // Change the opcode to ADDIS.
     TmpInst.setOpcode(PPC::ADDIS);
 
-    const MachineOperand &MO = MI->getOperand(2);
-    assert((MO.isGlobal() || MO.isCPI() || MO.isJTI() || MO.isBlockAddress()) &&
-           "Invalid operand for ADDIStocHA.");
-
     // Map the machine operand to its corresponding MCSymbol.
     MCSymbol *MOSymbol = getMCSymbolForTOCPseudoMO(MO, *this);
 
@@ -1131,7 +1161,12 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
     return;
   }
   case PPC::LWZtocL: {
-    assert(IsAIX && !IsPPC64 && TM.getCodeModel() == CodeModel::Large &&
+    const MachineOperand &MO = MI->getOperand(1);
+
+    assert((MO.isGlobal() || MO.isCPI() || MO.isJTI() || MO.isBlockAddress()) &&
+           "Invalid operand for LWZtocL.");
+    assert(IsAIX && !IsPPC64 &&
+           getCodeModel(*Subtarget, TM, MO) == CodeModel::Large &&
            "This pseudo should only be selected for 32-bit large code model on"
            " AIX.");
 
@@ -1141,10 +1176,6 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
     // Change the opcode to lwz.
     TmpInst.setOpcode(PPC::LWZ);
 
-    const MachineOperand &MO = MI->getOperand(1);
-    assert((MO.isGlobal() || MO.isCPI() || MO.isJTI() || MO.isBlockAddress()) &&
-           "Invalid operand for LWZtocL.");
-
     // Map the machine operand to its corresponding MCSymbol.
     MCSymbol *MOSymbol = getMCSymbolForTOCPseudoMO(MO, *this);
 
@@ -1183,8 +1214,12 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
 
     const bool GlobalToc =
         MO.isGlobal() && Subtarget->isGVIndirectSymbol(MO.getGlobal());
+
+    const CodeModel::Model CM =
+        IsAIX ? getCodeModel(*Subtarget, TM, MO) : TM.getCodeModel();
+
     if (GlobalToc || MO.isJTI() || MO.isBlockAddress() ||
-        (MO.isCPI() && TM.getCodeModel() == CodeModel::Large))
+        (MO.isCPI() && CM == CodeModel::Large))
       MOSymbol = lookUpOrCreateTOCEntry(MOSymbol, getTOCEntryTypeForMO(MO), VK);
 
     VK = IsAIX ? MCSymbolRefExpr::VK_PPC_U : MCSymbolRefExpr::VK_PPC_TOC_HA;
@@ -1225,8 +1260,9 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
     const MCSymbol *MOSymbol = getMCSymbolForTOCPseudoMO(MO, *this);
 
     MCSymbolRefExpr::VariantKind VK = GetVKForMO(MO);
-
-    if (!MO.isCPI() || TM.getCodeModel() == CodeModel::Large)
+    CodeModel::Model CM =
+        IsAIX ? getCodeModel(*Subtarget, TM, MO) : TM.getCodeModel();
+    if (!MO.isCPI() || CM == CodeModel::Large)
       MOSymbol = lookUpOrCreateTOCEntry(MOSymbol, getTOCEntryTypeForMO(MO), VK);
 
     VK = IsAIX ? MCSymbolRefExpr::VK_PPC_L : MCSymbolRefExpr::VK_PPC_TOC_LO;
@@ -2651,6 +2687,27 @@ uint64_t PPCAIXAsmPrinter::getAliasOffset(const Constant *C) {
   return 0;
 }
 
+static void tocDataChecks(unsigned PointerSize, const GlobalVariable *GV) {
+  // TODO: These asserts should be updated as more support for the toc data
+  // transformation is added (struct support, etc.).
+  assert(
+      PointerSize >= GV->getAlign().valueOrOne().value() &&
+      "GlobalVariables with an alignment requirement stricter than TOC entry "
+      "size not supported by the toc data transformation.");
+
+  Type *GVType = GV->getValueType();
+  assert(GVType->isSized() && "A GlobalVariable's size must be known to be "
+                              "supported by the toc data transformation.");
+  if (GV->getParent()->getDataLayout().getTypeSizeInBits(GVType) >
+      PointerSize * 8)
+    report_fatal_error(
+        "A GlobalVariable with size larger than a TOC entry is not currently "
+        "supported by the toc data transformation.");
+  if (GV->hasPrivateLinkage())
+    report_fatal_error("A GlobalVariable with private linkage is not "
+                       "currently supported by the toc data transformation.");
+}
+
 void PPCAIXAsmPrinter::emitGlobalVariable(const GlobalVariable *GV) {
   // Special LLVM global arrays have been handled at the initialization.
   if (isSpecialLLVMGlobalArrayToSkip(GV) || isSpecialLLVMGlobalArrayForStaticInit(GV))
@@ -2660,7 +2717,7 @@ void PPCAIXAsmPrinter::emitGlobalVariable(const GlobalVariable *GV) {
   // when we emit the .toc section.
   if (GV->hasAttribute("toc-data")) {
     unsigned PointerSize = GV->getParent()->getDataLayout().getPointerSize();
-    Subtarget->tocDataChecks(PointerSize, GV);
+    tocDataChecks(PointerSize, GV);
     TOCDataGlobalVars.push_back(GV);
     return;
   }
@@ -2963,6 +3020,10 @@ bool PPCAIXAsmPrinter::doInitialization(Module &M) {
     }
 
     setCsectAlignment(&G);
+    std::optional<CodeModel::Model> OptionalCodeModel = G.getCodeModel();
+    if (OptionalCodeModel)
+      setOptionalCodeModel(cast<MCSymbolXCOFF>(getSymbol(&G)),
+                           *OptionalCodeModel);
   }
 
   for (const auto &F : M)
@@ -2984,6 +3045,15 @@ bool PPCAIXAsmPrinter::doInitialization(Module &M) {
                          false);
     }
 
+    const GlobalVariable *GVar =
+        dyn_cast_or_null<GlobalVariable>(Alias.getAliaseeObject());
+    if (GVar) {
+      std::optional<CodeModel::Model> OptionalCodeModel = GVar->getCodeModel();
+      if (OptionalCodeModel)
+        setOptionalCodeModel(cast<MCSymbolXCOFF>(getSymbol(&Alias)),
+                             *OptionalCodeModel);
+    }
+
     GOAliasMap[Aliasee].push_back(&Alias);
   }
 
diff --git a/llvm/lib/Target/PowerPC/PPCBoolRetToInt.cpp b/llvm/lib/Target/PowerPC/PPCBoolRetToInt.cpp
index f1abb78d4dd343..4a3b64f30d8d49 100644
--- a/llvm/lib/Target/PowerPC/PPCBoolRetToInt.cpp
+++ b/llvm/lib/Target/PowerPC/PPCBoolRetToInt.cpp
@@ -263,7 +263,8 @@ class PPCBoolRetToInt : public FunctionPass {
     Value *IntRetVal = BoolToIntMap[U];
     Type *Int1Ty = Type::getInt1Ty(U->getContext());
     auto *I = cast<Instruction>(U.getUser());
-    Value *BackToBool = new TruncInst(IntRetVal, Int1Ty, "backToBool", I);
+    Value *BackToBool =
+        new TruncInst(IntRetVal, Int1Ty, "backToBool", I->getIterator());
     U.set(BackToBool);
 
     return true;
diff --git a/llvm/lib/Target/PowerPC/PPCHazardRecognizers.cpp b/llvm/lib/Target/PowerPC/PPCHazardRecognizers.cpp
index ffaa3e05c84790..a942d2f9c7e8e5 100644
--- a/llvm/lib/Target/PowerPC/PPCHazardRecognizers.cpp
+++ b/llvm/lib/Target/PowerPC/PPCHazardRecognizers.cpp
@@ -374,8 +374,9 @@ getHazardType(SUnit *SU, int Stalls) {
   // overlapping address.
   if (isLoad && NumStores && !MI->memoperands_empty()) {
     MachineMemOperand *MO = *MI->memoperands_begin();
-    if (isLoadOfStoredAddress(MO->getSize(),
-                              MO->getOffset(), MO->getValue()))
+    if (MO->getSize().hasValue() &&
+        isLoadOfStoredAddress(MO->getSize().getValue(), MO->getOffset(),
+                              MO->getValue()))
       return NoopHazard;
   }
 
@@ -399,9 +400,10 @@ void PPCHazardRecognizer970::EmitInstruction(SUnit *SU) {
   if (Opcode == PPC::MTCTR || Opcode == PPC::MTCTR8) HasCTRSet = true;
 
   // Track the address stored to.
-  if (isStore && NumStores < 4 && !MI->memoperands_empty()) {
+  if (isStore && NumStores < 4 && !MI->memoperands_empty() &&
+      (*MI->memoperands_begin())->getSize().hasValue()) {
     MachineMemOperand *MO = *MI->memoperands_begin();
-    StoreSize[NumStores] = MO->getSize();
+    StoreSize[NumStores] = MO->getSize().getValue();
     StoreOffset[NumStores] = MO->getOffset();
     StoreValue[NumStores] = MO->getValue();
     ++NumStores;
diff --git a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
index 0c25accd1d6ce6..dfea9e7709240c 100644
--- a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
@@ -524,6 +524,24 @@ static bool hasTocDataAttr(SDValue Val, unsigned PointerSize) {
   return true;
 }
 
+static CodeModel::Model getCodeModel(const PPCSubtarget &Subtarget,
+                                     const TargetMachine &TM,
+                                     const SDNode *Node) {
+  // If there isn't an attribute to override the module code model
+  // this will be the effective code model.
+  CodeModel::Model ModuleModel = TM.getCodeModel();
+
+  GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Node->getOperand(0));
+  if (!GA)
+    return ModuleModel;
+
+  const GlobalValue *GV = GA->getGlobal();
+  if (!GV)
+    return ModuleModel;
+
+  return Subtarget.getCodeModel(TM, GV);
+}
+
 /// isInt32Immediate - This method tests to see if the node is a 32-bit constant
 /// operand. If so Imm will receive the 32-bit value.
 static bool isInt32Immediate(SDNode *N, unsigned &Imm) {
@@ -6059,7 +6077,8 @@ void PPCDAGToDAGISel::Select(SDNode *N) {
     const bool isAIXABI = Subtarget->isAIXABI();
 
     // PowerPC only support small, medium and large code model.
-    const CodeModel::Model CModel = TM.getCodeModel();
+    const CodeModel::Model CModel = getCodeModel(*Subtarget, TM, N);
+
     assert(!(CModel == CodeModel::Tiny || CModel == CodeModel::Kernel) &&
            "PowerPC doesn't support tiny or kernel code models.");
 
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index aef2d483c6df1e..cce0efad39c75b 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -8519,7 +8519,8 @@ bool PPCTargetLowering::directMoveIsProfitable(const SDValue &Op) const {
   // If there is no LXSIBZX/LXSIHZX, like Power8,
   // prefer direct move if the memory size is 1 or 2 bytes.
   MachineMemOperand *MMO = cast<LoadSDNode>(Origin)->getMemOperand();
-  if (!Subtarget.hasP9Vector() && MMO->getSize() <= 2)
+  if (!Subtarget.hasP9Vector() &&
+      (!MMO->getSize().hasValue() || MMO->getSize().getValue() <= 2))
     return true;
 
   for (SDNode::use_iterator UI = Origin->use_begin(),
@@ -10763,30 +10764,54 @@ SDValue PPCTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
     return DAG.getRegister(PPC::R2, MVT::i32);
 
   case Intrinsic::ppc_rldimi: {
+    assert(Subtarget.isPPC64() && "rldimi is only available in 64-bit!");
+    SDValue Src = Op.getOperand(1);
+    APInt Mask = Op.getConstantOperandAPInt(4);
+    if (Mask.isZero())
+      return Op.getOperand(2);
+    if (Mask.isAllOnes())
+      return DAG.getNode(ISD::ROTL, dl, MVT::i64, Src, Op.getOperand(3));
     uint64_t SH = Op.getConstantOperandVal(3);
     unsigned MB = 0, ME = 0;
-    if (!isRunOfOnes64(Op.getConstantOperandVal(4), MB, ME) || ME != 63 - SH)
+    if (!isRunOfOnes64(Mask.getZExtValue(), MB, ME))
       report_fatal_error("invalid rldimi mask!");
-    return SDValue(DAG.getMachineNode(
-                       PPC::RLDIMI, dl, MVT::i64,
-                       {Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
-                        DAG.getTargetConstant(MB, dl, MVT::i32)}),
-                   0);
+    // rldimi requires ME=63-SH, otherwise rotation is needed before rldimi.
+    if (ME < 63 - SH) {
+      Src = DAG.getNode(ISD::ROTL, dl, MVT::i64, Src,
+                        DAG.getConstant(ME + SH + 1, dl, MVT::i32));
+    } else if (ME > 63 - SH) {
+      Src = DAG.getNode(ISD::ROTL, dl, MVT::i64, Src,
+                        DAG.getConstant(ME + SH - 63, dl, MVT::i32));
+    }
+    return SDValue(
+        DAG.getMachineNode(PPC::RLDIMI, dl, MVT::i64,
+                           {Op.getOperand(2), Src,
+                            DAG.getTargetConstant(63 - ME, dl, MVT::i32),
+                            DAG.getTargetConstant(MB, dl, MVT::i32)}),
+        0);
   }
 
   case Intrinsic::ppc_rlwimi: {
+    APInt Mask = Op.getConstantOperandAPInt(4);
+    if (Mask.isZero())
+      return Op.getOperand(2);
+    if (Mask.isAllOnes())
+      return DAG.getNode(ISD::ROTL, dl, MVT::i32, Op.getOperand(1),
+                         Op.getOperand(3));
     unsigned MB = 0, ME = 0;
-    if (!isRunOfOnes(Op.getConstantOperandVal(4), MB, ME))
+    if (!isRunOfOnes(Mask.getZExtValue(), MB, ME))
       report_fatal_error("invalid rlwimi mask!");
     return SDValue(DAG.getMachineNode(
                        PPC::RLWIMI, dl, MVT::i32,
-                       {Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
+                       {Op.getOperand(2), Op.getOperand(1), Op.getOperand(3),
                         DAG.getTargetConstant(MB, dl, MVT::i32),
                         DAG.getTargetConstant(ME, dl, MVT::i32)}),
                    0);
   }
 
   case Intrinsic::ppc_rlwnm: {
+    if (Op.getConstantOperandVal(3) == 0)
+      return DAG.getConstant(0, dl, MVT::i32);
     unsigned MB = 0, ME = 0;
     if (!isRunOfOnes(Op.getConstantOperandVal(3), MB, ME))
       report_fatal_error("invalid rlwnm mask!");
@@ -15023,6 +15048,7 @@ SDValue PPCTargetLowering::combineFPToIntToFP(SDNode *N,
     SDValue Ld = DAG.getMemIntrinsicNode(PPCISD::LXSIZX, dl,
                                          DAG.getVTList(MVT::f64, MVT::Other),
                                          Ops, MVT::i8, LDN->getMemOperand());
+    DAG.makeEquivalentMemoryOrdering(LDN, Ld);
 
     // For signed conversion, we need to sign-extend the value in the VSR
     if (Signed) {
@@ -15114,7 +15140,7 @@ SDValue PPCTargetLowering::expandVSXLoadForLE(SDNode *N,
     // If the MMO suggests this isn't a load of a full vector, leave
     // things alone.  For a built-in, we have to make the change for
     // correctness, so if there is a size problem that will be a bug.
-    if (MMO->getSize() < 16)
+    if (!MMO->getSize().hasValue() || MMO->getSize().getValue() < 16)
       return SDValue();
     break;
   }
@@ -15182,7 +15208,7 @@ SDValue PPCTargetLowering::expandVSXStoreForLE(SDNode *N,
     // If the MMO suggests this isn't a store of a full vector, leave
     // things alone.  For a built-in, we have to make the change for
     // correctness, so if there is a size problem that will be a bug.
-    if (MMO->getSize() < 16)
+    if (!MMO->getSize().hasValue() || MMO->getSize().getValue() < 16)
       return SDValue();
     break;
   }
diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.td b/llvm/lib/Target/PowerPC/PPCInstrInfo.td
index 82da1a3c305983..6423e692d88c37 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrInfo.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.td
@@ -5045,12 +5045,12 @@ defm : TrapExtendedMnemonic<"lng", 6>;
 defm : TrapExtendedMnemonic<"u", 31>;
 
 // Atomic loads
-def : Pat<(atomic_load_8  DForm:$src), (LBZ  memri:$src)>;
-def : Pat<(atomic_load_16 DForm:$src), (LHZ  memri:$src)>;
-def : Pat<(atomic_load_32 DForm:$src), (LWZ  memri:$src)>;
-def : Pat<(atomic_load_8  XForm:$src), (LBZX memrr:$src)>;
-def : Pat<(atomic_load_16 XForm:$src), (LHZX memrr:$src)>;
-def : Pat<(atomic_load_32 XForm:$src), (LWZX memrr:$src)>;
+def : Pat<(i32 (atomic_load_8  DForm:$src)), (LBZ  memri:$src)>;
+def : Pat<(i32 (atomic_load_16 DForm:$src)), (LHZ  memri:$src)>;
+def : Pat<(i32 (atomic_load_32 DForm:$src)), (LWZ  memri:$src)>;
+def : Pat<(i32 (atomic_load_8  XForm:$src)), (LBZX memrr:$src)>;
+def : Pat<(i32 (atomic_load_16 XForm:$src)), (LHZX memrr:$src)>;
+def : Pat<(i32 (atomic_load_32 XForm:$src)), (LWZX memrr:$src)>;
 
 // Atomic stores
 def : Pat<(atomic_store_8  i32:$val, DForm:$ptr), (STB  gprc:$val, memri:$ptr)>;
diff --git a/llvm/lib/Target/PowerPC/PPCInstrP10.td b/llvm/lib/Target/PowerPC/PPCInstrP10.td
index d5a372e4dc1010..5f2937d47a5195 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrP10.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrP10.td
@@ -1289,13 +1289,13 @@ let Predicates = [PCRelativeMemops] in {
             (PSTXVpc $XS, $ga, 0)>;
 
   // Atomic Load
-  def : Pat<(atomic_load_8 (PPCmatpcreladdr PCRelForm:$ga)),
+  def : Pat<(i32 (atomic_load_8 (PPCmatpcreladdr PCRelForm:$ga))),
             (PLBZpc $ga, 0)>;
-  def : Pat<(atomic_load_16 (PPCmatpcreladdr PCRelForm:$ga)),
+  def : Pat<(i32 (atomic_load_16 (PPCmatpcreladdr PCRelForm:$ga))),
             (PLHZpc $ga, 0)>;
-  def : Pat<(atomic_load_32 (PPCmatpcreladdr PCRelForm:$ga)),
+  def : Pat<(i32 (atomic_load_32 (PPCmatpcreladdr PCRelForm:$ga))),
             (PLWZpc $ga, 0)>;
-  def : Pat<(atomic_load_64 (PPCmatpcreladdr PCRelForm:$ga)),
+  def : Pat<(i64 (atomic_load_64 (PPCmatpcreladdr PCRelForm:$ga))),
             (PLDpc $ga, 0)>;
 
   // Atomic Store
@@ -2347,10 +2347,10 @@ let Predicates = [PrefixInstrs] in {
   def : Pat<(store f64:$FRS, PDForm:$dst), (PSTFD $FRS, memri34:$dst)>;
 
   // Atomic Load
-  def : Pat<(atomic_load_8 PDForm:$src), (PLBZ memri34:$src)>;
-  def : Pat<(atomic_load_16 PDForm:$src), (PLHZ memri34:$src)>;
-  def : Pat<(atomic_load_32 PDForm:$src), (PLWZ memri34:$src)>;
-  def : Pat<(atomic_load_64 PDForm:$src), (PLD memri34:$src)>;
+  def : Pat<(i32 (atomic_load_8 PDForm:$src)), (PLBZ memri34:$src)>;
+  def : Pat<(i32 (atomic_load_16 PDForm:$src)), (PLHZ memri34:$src)>;
+  def : Pat<(i32 (atomic_load_32 PDForm:$src)), (PLWZ memri34:$src)>;
+  def : Pat<(i64 (atomic_load_64 PDForm:$src)), (PLD memri34:$src)>;
 
   // Atomic Store
   def : Pat<(atomic_store_8 i32:$RS, PDForm:$dst), (PSTB $RS, memri34:$dst)>;
diff --git a/llvm/lib/Target/PowerPC/PPCInstrVSX.td b/llvm/lib/Target/PowerPC/PPCInstrVSX.td
index 0e5f6b773bb544..dd07892794d599 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrVSX.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrVSX.td
@@ -1596,7 +1596,7 @@ let Predicates = [HasVSX, HasP9Vector] in {
   // FIXME: Setting the hasSideEffects flag here to match current behaviour.
   let hasSideEffects = 1 in {
     def XSTSTDCSP : XX2_BF3_DCMX7_RS6<60, 298,
-                                (outs crrc:$BF), (ins u7imm:$DCMX, vsfrc:$XB),
+                                (outs crrc:$BF), (ins u7imm:$DCMX, vssrc:$XB),
                                 "xststdcsp $BF, $XB, $DCMX", IIC_VecFP, []>;
     def XSTSTDCDP : XX2_BF3_DCMX7_RS6<60, 362,
                                 (outs crrc:$BF), (ins u7imm:$DCMX, vsfrc:$XB),
diff --git a/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp b/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp
index dc739a2c7a4d0a..f19eb2af5e0709 100644
--- a/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp
+++ b/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp
@@ -725,7 +725,7 @@ PPCLoopInstrFormPrep::rewriteForBase(Loop *L, const SCEVAddRecExpr *BasePtrSCEV,
   Instruction *PtrInc = nullptr;
   Instruction *NewBasePtr = nullptr;
   if (CanPreInc) {
-    Instruction *InsPoint = &*Header->getFirstInsertionPt();
+    BasicBlock::iterator InsPoint = Header->getFirstInsertionPt();
     PtrInc = GetElementPtrInst::Create(
         I8Ty, NewPHI, IncNode, getInstrName(BaseMemI, GEPNodeIncNameSuffix),
         InsPoint);
@@ -752,7 +752,7 @@ PPCLoopInstrFormPrep::rewriteForBase(Loop *L, const SCEVAddRecExpr *BasePtrSCEV,
       // For the latch predecessor, we need to insert a GEP just before the
       // terminator to increase the address.
       BasicBlock *BB = PI;
-      Instruction *InsPoint = BB->getTerminator();
+      BasicBlock::iterator InsPoint = BB->getTerminator()->getIterator();
       PtrInc = GetElementPtrInst::Create(
           I8Ty, NewPHI, IncNode, getInstrName(BaseMemI, GEPNodeIncNameSuffix),
           InsPoint);
@@ -764,7 +764,7 @@ PPCLoopInstrFormPrep::rewriteForBase(Loop *L, const SCEVAddRecExpr *BasePtrSCEV,
     if (NewPHI->getType() != BasePtr->getType())
       NewBasePtr = new BitCastInst(NewPHI, BasePtr->getType(),
                                    getInstrName(NewPHI, CastNodeNameSuffix),
-                                   &*Header->getFirstInsertionPt());
+                                   Header->getFirstInsertionPt());
     else
       NewBasePtr = NewPHI;
   }
@@ -794,20 +794,25 @@ Instruction *PPCLoopInstrFormPrep::rewriteForBucketElement(
        cast<SCEVConstant>(Element.Offset)->getValue()->isZero())) {
     RealNewPtr = NewBasePtr;
   } else {
-    Instruction *PtrIP = dyn_cast<Instruction>(Ptr);
+    std::optional<BasicBlock::iterator> PtrIP = std::nullopt;
+    if (Instruction *I = dyn_cast<Instruction>(Ptr))
+      PtrIP = I->getIterator();
+
     if (PtrIP && isa<Instruction>(NewBasePtr) &&
-        cast<Instruction>(NewBasePtr)->getParent() == PtrIP->getParent())
-      PtrIP = nullptr;
-    else if (PtrIP && isa<PHINode>(PtrIP))
-      PtrIP = &*PtrIP->getParent()->getFirstInsertionPt();
+        cast<Instruction>(NewBasePtr)->getParent() == (*PtrIP)->getParent())
+      PtrIP = std::nullopt;
+    else if (PtrIP && isa<PHINode>(*PtrIP))
+      PtrIP = (*PtrIP)->getParent()->getFirstInsertionPt();
     else if (!PtrIP)
-      PtrIP = Element.Instr;
+      PtrIP = Element.Instr->getIterator();
 
     assert(OffToBase && "There should be an offset for non base element!\n");
     GetElementPtrInst *NewPtr = GetElementPtrInst::Create(
         I8Ty, PtrInc, OffToBase,
-        getInstrName(Element.Instr, GEPNodeOffNameSuffix), PtrIP);
-    if (!PtrIP)
+        getInstrName(Element.Instr, GEPNodeOffNameSuffix));
+    if (PtrIP)
+      NewPtr->insertBefore(*(*PtrIP)->getParent(), *PtrIP);
+    else
       NewPtr->insertAfter(cast<Instruction>(PtrInc));
     NewPtr->setIsInBounds(IsPtrInBounds(Ptr));
     RealNewPtr = NewPtr;
diff --git a/llvm/lib/Target/PowerPC/PPCSubtarget.cpp b/llvm/lib/Target/PowerPC/PPCSubtarget.cpp
index 884f2f5c57b258..653d9bda99192a 100644
--- a/llvm/lib/Target/PowerPC/PPCSubtarget.cpp
+++ b/llvm/lib/Target/PowerPC/PPCSubtarget.cpp
@@ -185,37 +185,64 @@ bool PPCSubtarget::enableSubRegLiveness() const {
   return UseSubRegLiveness;
 }
 
-void PPCSubtarget::tocDataChecks(unsigned PointerSize,
-                                 const GlobalVariable *GV) const {
-  // TODO: These asserts should be updated as more support for the toc data
-  // transformation is added (struct support, etc.).
-  assert(
-      PointerSize >= GV->getAlign().valueOrOne().value() &&
-      "GlobalVariables with an alignment requirement stricter than TOC entry "
-      "size not supported by the toc data transformation.");
-
-  Type *GVType = GV->getValueType();
-  assert(GVType->isSized() && "A GlobalVariable's size must be known to be "
-                              "supported by the toc data transformation.");
-  if (GV->getParent()->getDataLayout().getTypeSizeInBits(GVType) >
-      PointerSize * 8)
-    report_fatal_error(
-        "A GlobalVariable with size larger than a TOC entry is not currently "
-        "supported by the toc data transformation.");
-  if (GV->hasPrivateLinkage())
-    report_fatal_error("A GlobalVariable with private linkage is not "
-                       "currently supported by the toc data transformation.");
-}
-
 bool PPCSubtarget::isGVIndirectSymbol(const GlobalValue *GV) const {
+  if (isAIXABI()) {
+    if (const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV))
+      // On AIX the only symbols that aren't indirect are toc-data.
+      return !GVar->hasAttribute("toc-data");
+
+    return true;
+  }
+
   // Large code model always uses the TOC even for local symbols.
   if (TM.getCodeModel() == CodeModel::Large)
     return true;
+
   if (TM.shouldAssumeDSOLocal(GV))
     return false;
   return true;
 }
 
+CodeModel::Model PPCSubtarget::getCodeModel(const TargetMachine &TM,
+                                            const GlobalValue *GV) const {
+  // If there isn't an attribute to override the module code model
+  // this will be the effective code model.
+  CodeModel::Model ModuleModel = TM.getCodeModel();
+
+  // Initially support per global code model for AIX only.
+  if (!isAIXABI())
+    return ModuleModel;
+
+  // Only GlobalVariables carry an attribute which can override the module code
+  // model.
+  assert(GV && "Unexpected NULL GlobalValue");
+  const GlobalVariable *GlobalVar =
+      [](const GlobalValue *GV) -> const GlobalVariable * {
+    const GlobalVariable *Var = dyn_cast<GlobalVariable>(GV);
+    if (Var)
+      return Var;
+
+    const GlobalAlias *Alias = dyn_cast<GlobalAlias>(GV);
+    if (Alias)
+      return dyn_cast<GlobalVariable>(Alias->getAliaseeObject());
+
+    return nullptr;
+  }(GV);
+
+  if (!GlobalVar)
+    return ModuleModel;
+
+  std::optional<CodeModel::Model> MaybeCodeModel = GlobalVar->getCodeModel();
+  if (MaybeCodeModel) {
+    CodeModel::Model CM = *MaybeCodeModel;
+    assert((CM == CodeModel::Small || CM == CodeModel::Large) &&
+           "invalid code model for AIX");
+    return CM;
+  }
+
+  return ModuleModel;
+}
+
 bool PPCSubtarget::isELFv2ABI() const { return TM.isELFv2ABI(); }
 bool PPCSubtarget::isPPC64() const { return TM.isPPC64(); }
 
diff --git a/llvm/lib/Target/PowerPC/PPCSubtarget.h b/llvm/lib/Target/PowerPC/PPCSubtarget.h
index d913f22bd5ba9d..bf35f8ec151b10 100644
--- a/llvm/lib/Target/PowerPC/PPCSubtarget.h
+++ b/llvm/lib/Target/PowerPC/PPCSubtarget.h
@@ -245,7 +245,9 @@ class PPCSubtarget : public PPCGenSubtargetInfo {
   /// True if the GV will be accessed via an indirect symbol.
   bool isGVIndirectSymbol(const GlobalValue *GV) const;
 
-  void tocDataChecks(unsigned PointerSize, const GlobalVariable *GV) const;
+  /// Calculates the effective code model for argument GV.
+  CodeModel::Model getCodeModel(const TargetMachine &TM,
+                                const GlobalValue *GV) const;
 
   /// True if the ABI is descriptor based.
   bool usesFunctionDescriptors() const {
diff --git a/llvm/lib/Target/PowerPC/PPCTLSDynamicCall.cpp b/llvm/lib/Target/PowerPC/PPCTLSDynamicCall.cpp
index 147438dfedd87d..9f680ef5046dad 100644
--- a/llvm/lib/Target/PowerPC/PPCTLSDynamicCall.cpp
+++ b/llvm/lib/Target/PowerPC/PPCTLSDynamicCall.cpp
@@ -25,6 +25,7 @@
 #include "PPCInstrInfo.h"
 #include "PPCTargetMachine.h"
 #include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/InitializePasses.h"
@@ -159,9 +160,11 @@ namespace {
         // We don't really need to save data to the stack - the clobbered
         // registers are already saved when the SDNode (e.g. PPCaddiTlsgdLAddr)
         // gets translated to the pseudo instruction (e.g. ADDItlsgdLADDR).
-        if (NeedFence)
+        if (NeedFence) {
+          MBB.getParent()->getFrameInfo().setAdjustsStack(true);
           BuildMI(MBB, I, DL, TII->get(PPC::ADJCALLSTACKDOWN)).addImm(0)
                                                               .addImm(0);
+        }
 
         if (IsAIX) {
           if (IsTLSLDAIXMI) {
diff --git a/llvm/lib/Target/RISCV/RISCV.td b/llvm/lib/Target/RISCV/RISCV.td
index 575bd4c9d3561d..22736edc5f0720 100644
--- a/llvm/lib/Target/RISCV/RISCV.td
+++ b/llvm/lib/Target/RISCV/RISCV.td
@@ -43,6 +43,7 @@ include "RISCVMacroFusion.td"
 include "RISCVSchedRocket.td"
 include "RISCVSchedSiFive7.td"
 include "RISCVSchedSiFiveP400.td"
+include "RISCVSchedSiFiveP600.td"
 include "RISCVSchedSyntacoreSCR1.td"
 include "RISCVSchedXiangShanNanHu.td"
 
diff --git a/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp b/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp
index 0530e3d08be821..f0bd25f167d80d 100644
--- a/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp
@@ -231,7 +231,7 @@ bool RISCVGatherScatterLowering::matchStridedRecurrence(Value *Index, Loop *L,
     BasePtr =
         PHINode::Create(Start->getType(), 2, Phi->getName() + ".scalar", Phi->getIterator());
     Inc = BinaryOperator::CreateAdd(BasePtr, Step, Inc->getName() + ".scalar",
-                                    Inc);
+                                    Inc->getIterator());
     BasePtr->addIncoming(Start, Phi->getIncomingBlock(1 - IncrementingBlock));
     BasePtr->addIncoming(Inc, Phi->getIncomingBlock(IncrementingBlock));
 
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 803774fd16dbf0..4bfd4d0386a86c 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -717,7 +717,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
 
     static const unsigned FloatingPointVecReduceOps[] = {
         ISD::VECREDUCE_FADD, ISD::VECREDUCE_SEQ_FADD, ISD::VECREDUCE_FMIN,
-        ISD::VECREDUCE_FMAX};
+        ISD::VECREDUCE_FMAX, ISD::VECREDUCE_FMINIMUM, ISD::VECREDUCE_FMAXIMUM};
 
     if (!Subtarget.is64Bit()) {
       // We must custom-lower certain vXi64 operations on RV32 due to the vector
@@ -6541,6 +6541,8 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
   case ISD::VECREDUCE_SEQ_FADD:
   case ISD::VECREDUCE_FMIN:
   case ISD::VECREDUCE_FMAX:
+  case ISD::VECREDUCE_FMAXIMUM:
+  case ISD::VECREDUCE_FMINIMUM:
     return lowerFPVECREDUCE(Op, DAG);
   case ISD::VP_REDUCE_ADD:
   case ISD::VP_REDUCE_UMAX:
@@ -9541,14 +9543,17 @@ getRVVFPReductionOpAndOperands(SDValue Op, SelectionDAG &DAG, EVT EltVT,
   case ISD::VECREDUCE_SEQ_FADD:
     return std::make_tuple(RISCVISD::VECREDUCE_SEQ_FADD_VL, Op.getOperand(1),
                            Op.getOperand(0));
+  case ISD::VECREDUCE_FMINIMUM:
+  case ISD::VECREDUCE_FMAXIMUM:
   case ISD::VECREDUCE_FMIN:
   case ISD::VECREDUCE_FMAX: {
     SDValue Front =
         DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Op.getOperand(0),
                     DAG.getVectorIdxConstant(0, DL));
-    unsigned RVVOpc = (Opcode == ISD::VECREDUCE_FMIN)
-                          ? RISCVISD::VECREDUCE_FMIN_VL
-                          : RISCVISD::VECREDUCE_FMAX_VL;
+    unsigned RVVOpc =
+        (Opcode == ISD::VECREDUCE_FMIN || Opcode == ISD::VECREDUCE_FMINIMUM)
+            ? RISCVISD::VECREDUCE_FMIN_VL
+            : RISCVISD::VECREDUCE_FMAX_VL;
     return std::make_tuple(RVVOpc, Op.getOperand(0), Front);
   }
   }
@@ -9571,9 +9576,30 @@ SDValue RISCVTargetLowering::lowerFPVECREDUCE(SDValue Op,
     VectorVal = convertToScalableVector(ContainerVT, VectorVal, DAG, Subtarget);
   }
 
+  MVT ResVT = Op.getSimpleValueType();
   auto [Mask, VL] = getDefaultVLOps(VecVT, ContainerVT, DL, DAG, Subtarget);
-  return lowerReductionSeq(RVVOpcode, Op.getSimpleValueType(), ScalarVal,
-                           VectorVal, Mask, VL, DL, DAG, Subtarget);
+  SDValue Res = lowerReductionSeq(RVVOpcode, ResVT, ScalarVal, VectorVal, Mask,
+                                  VL, DL, DAG, Subtarget);
+  if (Op.getOpcode() != ISD::VECREDUCE_FMINIMUM &&
+      Op.getOpcode() != ISD::VECREDUCE_FMAXIMUM)
+    return Res;
+
+  if (Op->getFlags().hasNoNaNs())
+    return Res;
+
+  // Force output to NaN if any element is Nan.
+  SDValue IsNan =
+      DAG.getNode(RISCVISD::SETCC_VL, DL, Mask.getValueType(),
+                  {VectorVal, VectorVal, DAG.getCondCode(ISD::SETNE),
+                   DAG.getUNDEF(Mask.getValueType()), Mask, VL});
+  MVT XLenVT = Subtarget.getXLenVT();
+  SDValue CPop = DAG.getNode(RISCVISD::VCPOP_VL, DL, XLenVT, IsNan, Mask, VL);
+  SDValue NoNaNs = DAG.getSetCC(DL, XLenVT, CPop,
+                                DAG.getConstant(0, DL, XLenVT), ISD::SETEQ);
+  return DAG.getSelect(
+      DL, ResVT, NoNaNs, Res,
+      DAG.getConstantFP(APFloat::getNaN(DAG.EVTToAPFloatSemantics(ResVT)), DL,
+                        ResVT));
 }
 
 SDValue RISCVTargetLowering::lowerVPREDUCE(SDValue Op,
@@ -15297,62 +15323,13 @@ static SDValue performINSERT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
   return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ConcatOps);
 }
 
-// Recursively split up concat_vectors with more than 2 operands:
-//
-// concat_vector op1, op2, op3, op4
-// ->
-// concat_vector (concat_vector op1, op2), (concat_vector op3, op4)
-//
-// This reduces the length of the chain of vslideups and allows us to perform
-// the vslideups at a smaller LMUL, limited to MF2.
-//
-// We do this as a DAG combine rather than during lowering so that any undef
-// operands can get combined away.
-static SDValue
-performCONCAT_VECTORSSplitCombine(SDNode *N, SelectionDAG &DAG,
-                                  const RISCVTargetLowering &TLI) {
-  SDLoc DL(N);
-
-  if (N->getNumOperands() <= 2)
-    return SDValue();
-
-  if (!TLI.isTypeLegal(N->getValueType(0)))
-    return SDValue();
-  MVT VT = N->getSimpleValueType(0);
-
-  // Don't split any further than MF2.
-  MVT ContainerVT = VT;
-  if (VT.isFixedLengthVector())
-    ContainerVT = getContainerForFixedLengthVector(DAG, VT, TLI.getSubtarget());
-  if (ContainerVT.bitsLT(getLMUL1VT(ContainerVT)))
-    return SDValue();
-
-  MVT HalfVT = VT.getHalfNumVectorElementsVT();
-  assert(isPowerOf2_32(N->getNumOperands()));
-  size_t HalfNumOps = N->getNumOperands() / 2;
-  SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, DL, HalfVT,
-                           N->ops().take_front(HalfNumOps));
-  SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, DL, HalfVT,
-                           N->ops().drop_front(HalfNumOps));
-
-  // Lower to an insert_subvector directly so the concat_vectors don't get
-  // recombined.
-  SDValue Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Lo,
-                            DAG.getVectorIdxConstant(0, DL));
-  Vec = DAG.getNode(
-      ISD::INSERT_SUBVECTOR, DL, VT, Vec, Hi,
-      DAG.getVectorIdxConstant(HalfVT.getVectorMinNumElements(), DL));
-  return Vec;
-}
-
 // If we're concatenating a series of vector loads like
 // concat_vectors (load v4i8, p+0), (load v4i8, p+n), (load v4i8, p+n*2) ...
 // Then we can turn this into a strided load by widening the vector elements
 // vlse32 p, stride=n
-static SDValue
-performCONCAT_VECTORSStridedLoadCombine(SDNode *N, SelectionDAG &DAG,
-                                        const RISCVSubtarget &Subtarget,
-                                        const RISCVTargetLowering &TLI) {
+static SDValue performCONCAT_VECTORSCombine(SDNode *N, SelectionDAG &DAG,
+                                            const RISCVSubtarget &Subtarget,
+                                            const RISCVTargetLowering &TLI) {
   SDLoc DL(N);
   EVT VT = N->getValueType(0);
 
@@ -16457,10 +16434,7 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
       return V;
     break;
   case ISD::CONCAT_VECTORS:
-    if (SDValue V =
-            performCONCAT_VECTORSStridedLoadCombine(N, DAG, Subtarget, *this))
-      return V;
-    if (SDValue V = performCONCAT_VECTORSSplitCombine(N, DAG, *this))
+    if (SDValue V = performCONCAT_VECTORSCombine(N, DAG, Subtarget, *this))
       return V;
     break;
   case ISD::INSERT_VECTOR_ELT:
@@ -20935,7 +20909,7 @@ bool RISCVTargetLowering::fallBackToDAGISel(const Instruction &Inst) const {
   if (Op == Instruction::Add || Op == Instruction::Sub ||
       Op == Instruction::And || Op == Instruction::Or ||
       Op == Instruction::Xor || Op == Instruction::InsertElement ||
-      Op == Instruction::Xor || Op == Instruction::ShuffleVector)
+      Op == Instruction::ShuffleVector || Op == Instruction::Load)
     return false;
 
   if (Inst.getType()->isScalableTy())
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
index 37a8079dcbf10d..4ca300f9151e1a 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
@@ -656,7 +656,7 @@ void RISCVInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
   if (IsScalableVector) {
     MachineMemOperand *MMO = MF->getMachineMemOperand(
         MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOStore,
-        MemoryLocation::UnknownSize, MFI.getObjectAlign(FI));
+        LocationSize::beforeOrAfterPointer(), MFI.getObjectAlign(FI));
 
     MFI.setStackID(FI, TargetStackID::ScalableVector);
     BuildMI(MBB, I, DebugLoc(), get(Opcode))
@@ -739,7 +739,7 @@ void RISCVInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
   if (IsScalableVector) {
     MachineMemOperand *MMO = MF->getMachineMemOperand(
         MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOLoad,
-        MemoryLocation::UnknownSize, MFI.getObjectAlign(FI));
+        LocationSize::beforeOrAfterPointer(), MFI.getObjectAlign(FI));
 
     MFI.setStackID(FI, TargetStackID::ScalableVector);
     BuildMI(MBB, I, DebugLoc(), get(Opcode), DstReg)
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td
index f0f8494dd9a313..a882b208a76889 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td
@@ -424,13 +424,13 @@ def CLMULH : ALU_rr<0b0000101, 0b011, "clmulh", Commutable=1>,
 
 let Predicates = [HasStdExtZbb] in {
 def MIN  : ALU_rr<0b0000101, 0b100, "min", Commutable=1>,
-           Sched<[WriteIALU, ReadIALU, ReadIALU]>;
+           Sched<[WriteIMinMax, ReadIMinMax, ReadIMinMax]>;
 def MINU : ALU_rr<0b0000101, 0b101, "minu", Commutable=1>,
-           Sched<[WriteIALU, ReadIALU, ReadIALU]>;
+           Sched<[WriteIMinMax, ReadIMinMax, ReadIMinMax]>;
 def MAX  : ALU_rr<0b0000101, 0b110, "max", Commutable=1>,
-           Sched<[WriteIALU, ReadIALU, ReadIALU]>;
+           Sched<[WriteIMinMax, ReadIMinMax, ReadIMinMax]>;
 def MAXU : ALU_rr<0b0000101, 0b111, "maxu", Commutable=1>,
-           Sched<[WriteIALU, ReadIALU, ReadIALU]>;
+           Sched<[WriteIMinMax, ReadIMinMax, ReadIMinMax]>;
 } // Predicates = [HasStdExtZbb]
 
 let Predicates = [HasStdExtZbkb] in {
diff --git a/llvm/lib/Target/RISCV/RISCVProcessors.td b/llvm/lib/Target/RISCV/RISCVProcessors.td
index 8c75df41f5e395..fd6d6078ec238b 100644
--- a/llvm/lib/Target/RISCV/RISCVProcessors.td
+++ b/llvm/lib/Target/RISCV/RISCVProcessors.td
@@ -245,7 +245,7 @@ def SIFIVE_P450 : RISCVProcessorModel<"sifive-p450", SiFiveP400Model,
                                        TuneLUIADDIFusion,
                                        TuneAUIPCADDIFusion]>;
 
-def SIFIVE_P670 : RISCVProcessorModel<"sifive-p670", NoSchedModel,
+def SIFIVE_P670 : RISCVProcessorModel<"sifive-p670", SiFiveP600Model,
                                       [Feature64Bit,
                                        FeatureStdExtZifencei,
                                        FeatureStdExtM,
diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
index a68674b221d38e..10bf1e88d74146 100644
--- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
@@ -431,29 +431,28 @@ bool RISCVRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
   }
 
   if (!IsRVVSpill) {
-    if (MI.getOpcode() == RISCV::ADDI && !isInt<12>(Offset.getFixed())) {
+    int64_t Val = Offset.getFixed();
+    int64_t Lo12 = SignExtend64<12>(Val);
+    unsigned Opc = MI.getOpcode();
+    if (Opc == RISCV::ADDI && !isInt<12>(Val)) {
       // We chose to emit the canonical immediate sequence rather than folding
       // the offset into the using add under the theory that doing so doesn't
       // save dynamic instruction count and some target may fuse the canonical
       // 32 bit immediate sequence.  We still need to clear the portion of the
       // offset encoded in the immediate.
       MI.getOperand(FIOperandNum + 1).ChangeToImmediate(0);
+    } else if ((Opc == RISCV::PREFETCH_I || Opc == RISCV::PREFETCH_R ||
+                Opc == RISCV::PREFETCH_W) &&
+               (Lo12 & 0b11111) != 0) {
+      // Prefetch instructions require the offset to be 32 byte aligned.
+      MI.getOperand(FIOperandNum + 1).ChangeToImmediate(0);
     } else {
       // We can encode an add with 12 bit signed immediate in the immediate
       // operand of our user instruction.  As a result, the remaining
       // offset can by construction, at worst, a LUI and a ADD.
-      int64_t Val = Offset.getFixed();
-      int64_t Lo12 = SignExtend64<12>(Val);
-      if ((MI.getOpcode() == RISCV::PREFETCH_I ||
-           MI.getOpcode() == RISCV::PREFETCH_R ||
-           MI.getOpcode() == RISCV::PREFETCH_W) &&
-          (Lo12 & 0b11111) != 0)
-        MI.getOperand(FIOperandNum + 1).ChangeToImmediate(0);
-      else {
-        MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Lo12);
-        Offset = StackOffset::get((uint64_t)Val - (uint64_t)Lo12,
-                                  Offset.getScalable());
-      }
+      MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Lo12);
+      Offset = StackOffset::get((uint64_t)Val - (uint64_t)Lo12,
+                                Offset.getScalable());
     }
   }
 
diff --git a/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td b/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td
index 240d170bfcf6f9..3586d235bdbbb9 100644
--- a/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td
+++ b/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td
@@ -186,7 +186,7 @@ class SiFive7AnyToGPRBypass<SchedRead read, int cycles = 2>
                                  WriteBEXT, WriteBEXTI,
                                  WriteCLZ, WriteCLZ32, WriteCTZ, WriteCTZ32,
                                  WriteCPOP, WriteCPOP32,
-                                 WriteREV8, WriteORCB, WriteSFB,
+                                 WriteREV8, WriteORCB, WriteIMinMax, WriteSFB,
                                  WriteIMul, WriteIMul32,
                                  WriteIDiv, WriteIDiv32,
                                  WriteIRem, WriteIRem32,
@@ -305,6 +305,9 @@ def : WriteRes<WriteCPOP32, [SiFive7PipeB]>;
 // orc.b is in the late-B ALU.
 def : WriteRes<WriteORCB, [SiFive7PipeB]>;
 
+// min/max are in the late-B ALU
+def : WriteRes<WriteIMinMax, [SiFive7PipeB]>;
+
 // rev8 is in the late-A and late-B ALUs.
 def : WriteRes<WriteREV8, [SiFive7PipeAB]>;
 
@@ -1041,6 +1044,7 @@ def : SiFive7AnyToGPRBypass<ReadCTZ32>;
 def : ReadAdvance<ReadCPOP, 0>;
 def : ReadAdvance<ReadCPOP32, 0>;
 def : SiFive7AnyToGPRBypass<ReadORCB>;
+def : SiFive7AnyToGPRBypass<ReadIMinMax>;
 def : SiFive7AnyToGPRBypass<ReadREV8>;
 def : SiFive7AnyToGPRBypass<ReadSHXADD>;
 def : SiFive7AnyToGPRBypass<ReadSHXADD32>;
diff --git a/llvm/lib/Target/RISCV/RISCVSchedSiFiveP400.td b/llvm/lib/Target/RISCV/RISCVSchedSiFiveP400.td
index d02d34a0fb9c58..8ec2e4ff885ebb 100644
--- a/llvm/lib/Target/RISCV/RISCVSchedSiFiveP400.td
+++ b/llvm/lib/Target/RISCV/RISCVSchedSiFiveP400.td
@@ -109,6 +109,7 @@ def : WriteRes<WriteCTZ, [SiFiveP400IntArith]>;
 def : WriteRes<WriteCTZ32, [SiFiveP400IntArith]>;
 
 def : WriteRes<WriteORCB, [SiFiveP400IntArith]>;
+def : WriteRes<WriteIMinMax, [SiFiveP400IntArith]>;
 
 def : WriteRes<WriteREV8, [SiFiveP400IntArith]>;
 
@@ -349,6 +350,7 @@ def : ReadAdvance<ReadCTZ32, 0>;
 def : ReadAdvance<ReadCPOP, 0>;
 def : ReadAdvance<ReadCPOP32, 0>;
 def : ReadAdvance<ReadORCB, 0>;
+def : ReadAdvance<ReadIMinMax, 0>;
 def : ReadAdvance<ReadREV8, 0>;
 def : ReadAdvance<ReadSHXADD, 0>;
 def : ReadAdvance<ReadSHXADD32, 0>;
diff --git a/llvm/lib/Target/RISCV/RISCVSchedSiFiveP600.td b/llvm/lib/Target/RISCV/RISCVSchedSiFiveP600.td
new file mode 100644
index 00000000000000..54016959d348e3
--- /dev/null
+++ b/llvm/lib/Target/RISCV/RISCVSchedSiFiveP600.td
@@ -0,0 +1,1024 @@
+//==- RISCVSchedSiFiveP600.td - SiFiveP600 Scheduling Defs ---*- tablegen -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+
+/// c is true if mx has the worst case behavior compared to LMULs in MxList.
+/// On the SiFiveP600, the worst case LMUL is the Largest LMUL
+/// and the worst case sew is the smallest SEW for that LMUL.
+class SiFiveP600IsWorstCaseMX<string mx, list<string> MxList> {
+  string LLMUL = LargestLMUL<MxList>.r;
+  bit c = !eq(mx, LLMUL);
+}
+
+class SiFiveP600IsWorstCaseMXSEW<string mx, int sew, list<string> MxList, bit isF = 0> {
+  string LLMUL = LargestLMUL<MxList>.r;
+  int SSEW = SmallestSEW<mx, isF>.r;
+  bit c = !and(!eq(mx, LLMUL), !eq(sew, SSEW));
+}
+
+// 1 Micro-Op per cycle.
+class SiFiveP600GetLMulCycles<string mx> {
+  int c = !cond(
+    !eq(mx, "M1") : 1,
+    !eq(mx, "M2") : 2,
+    !eq(mx, "M4") : 4,
+    !eq(mx, "M8") : 8,
+    !eq(mx, "MF2") : 1,
+    !eq(mx, "MF4") : 1,
+    !eq(mx, "MF8") : 1
+  );
+}
+
+// Latency for segmented loads and stores are calculated as vl * nf.
+class SiFiveP600GetCyclesSegmented<string mx, int sew, int nf> {
+  defvar VLEN = 128;
+  defvar VLUpperBound = !cond(
+    !eq(mx, "M1") : !div(VLEN, sew),
+    !eq(mx, "M2") : !div(!mul(VLEN, 2), sew),
+    !eq(mx, "M4") : !div(!mul(VLEN, 4), sew),
+    !eq(mx, "M8") : !div(!mul(VLEN, 8), sew),
+    !eq(mx, "MF2") : !div(!div(VLEN, 2), sew),
+    !eq(mx, "MF4") : !div(!div(VLEN, 4), sew),
+    !eq(mx, "MF8") : !div(!div(VLEN, 8), sew),
+  );
+  int c = !mul(VLUpperBound, nf);
+}
+
+// SiFiveP600 machine model for scheduling and other instruction cost heuristics.
+def SiFiveP600Model : SchedMachineModel {
+  let IssueWidth = 4;         // 4 micro-ops are dispatched per cycle.
+  let MicroOpBufferSize = 160; // Max micro-ops that can be buffered.
+  let LoadLatency = 4;        // Cycles for loads to access the cache.
+  let MispredictPenalty = 9;  // Extra cycles for a mispredicted branch.
+  let PostRAScheduler = true;
+  let UnsupportedFeatures = [HasStdExtZbkb, HasStdExtZbkc, HasStdExtZbkx,
+                             HasStdExtZknd, HasStdExtZkne, HasStdExtZknh,
+                             HasStdExtZksed, HasStdExtZksh, HasStdExtZkr,
+                             HasVendorXSfvqmaccqoq];
+  let CompleteModel = false;
+}
+
+let SchedModel = SiFiveP600Model in {
+
+def SiFiveP600IEXQ0       : ProcResource<1>;
+def SiFiveP600IEXQ1       : ProcResource<1>;
+def SiFiveP600IEXQ2       : ProcResource<1>;
+def SiFiveP600IEXQ3       : ProcResource<1>;
+def SiFiveP600FEXQ0       : ProcResource<1>;
+def SiFiveP600FEXQ1       : ProcResource<1>;
+
+// Two Load/Store ports that can issue either two loads, two stores, or one load
+// and one store (P550 has one load and one separate store pipe).
+def SiFiveP600LDST       : ProcResource<2>;
+
+// 4-wide pipeline with 4 ALU pipes.
+def SiFiveP600IntArith    : ProcResGroup<[SiFiveP600IEXQ0, SiFiveP600IEXQ1, SiFiveP600IEXQ2, SiFiveP600IEXQ3]>;
+defvar SiFiveP600SYS      = SiFiveP600IEXQ0;
+defvar SiFiveP600CMOV     = SiFiveP600IEXQ0;
+defvar SiFiveP600MulI2F   = SiFiveP600IEXQ1;
+def SiFiveP600Branch      : ProcResGroup<[SiFiveP600IEXQ2, SiFiveP600IEXQ3]>;
+def SiFiveP600Div         : ProcResource<1>;
+
+def SiFiveP600FloatArith  : ProcResGroup<[SiFiveP600FEXQ0, SiFiveP600FEXQ1]>;
+defvar SiFiveP600F2I      = SiFiveP600FEXQ0;
+def SiFiveP600FloatDiv    : ProcResource<1>;
+
+// Vector pipeline
+// VEXQ0 handle Mask, Simple Slide instructions,
+// VEXQ1 handle Complex Slide, Permutation, Reductions, Divide instructions.
+// Other vector instructions can be done in VEXQ0 and VEXQ1.
+def SiFiveP600VEXQ0        : ProcResource<1>;
+def SiFiveP600VEXQ1        : ProcResource<1>;
+def SiFiveP600VectorArith  : ProcResGroup<[SiFiveP600VEXQ0, SiFiveP600VEXQ1]>;
+def SiFiveP600VLD          : ProcResource<1>;
+def SiFiveP600VST          : ProcResource<1>;
+def SiFiveP600VDiv         : ProcResource<1>;
+def SiFiveP600VFloatDiv    : ProcResource<1>;
+
+// Integer arithmetic and logic
+def : WriteRes<WriteIALU, [SiFiveP600IntArith]>;
+def : WriteRes<WriteIALU32, [SiFiveP600IntArith]>;
+def : WriteRes<WriteShiftImm, [SiFiveP600IntArith]>;
+def : WriteRes<WriteShiftImm32, [SiFiveP600IntArith]>;
+def : WriteRes<WriteShiftReg, [SiFiveP600IntArith]>;
+def : WriteRes<WriteShiftReg32, [SiFiveP600IntArith]>;
+// Branching
+def : WriteRes<WriteJmp, [SiFiveP600Branch]>;
+def : WriteRes<WriteJal, [SiFiveP600Branch]>;
+def : WriteRes<WriteJalr, [SiFiveP600Branch]>;
+
+// CMOV
+def P600WriteCMOV : SchedWriteRes<[SiFiveP600Branch, SiFiveP600CMOV]> {
+  let Latency = 2;
+  let NumMicroOps = 2;
+}
+def : InstRW<[P600WriteCMOV], (instrs PseudoCCMOVGPRNoX0)>;
+
+let Latency = 3 in {
+// Integer multiplication
+def : WriteRes<WriteIMul, [SiFiveP600MulI2F]>;
+def : WriteRes<WriteIMul32, [SiFiveP600MulI2F]>;
+// cpop[w] look exactly like multiply.
+def : WriteRes<WriteCPOP, [SiFiveP600MulI2F]>;
+def : WriteRes<WriteCPOP32, [SiFiveP600MulI2F]>;
+}
+
+// Integer division
+def : WriteRes<WriteIDiv, [SiFiveP600MulI2F, SiFiveP600Div]> {
+  let Latency = 35;
+  let ReleaseAtCycles = [1, 34];
+}
+def : WriteRes<WriteIDiv32,  [SiFiveP600MulI2F, SiFiveP600Div]> {
+  let Latency = 20;
+  let ReleaseAtCycles = [1, 19];
+}
+
+// Integer remainder
+def : WriteRes<WriteIRem, [SiFiveP600MulI2F, SiFiveP600Div]> {
+  let Latency = 35;
+  let ReleaseAtCycles = [1, 34];
+}
+def : WriteRes<WriteIRem32, [SiFiveP600MulI2F, SiFiveP600Div]> {
+  let Latency = 20;
+  let ReleaseAtCycles = [1, 19];
+}
+
+// Bitmanip
+def : WriteRes<WriteRotateImm, [SiFiveP600IntArith]>;
+def : WriteRes<WriteRotateImm32, [SiFiveP600IntArith]>;
+def : WriteRes<WriteRotateReg, [SiFiveP600IntArith]>;
+def : WriteRes<WriteRotateReg32, [SiFiveP600IntArith]>;
+
+def : WriteRes<WriteCLZ, [SiFiveP600IntArith]>;
+def : WriteRes<WriteCLZ32, [SiFiveP600IntArith]>;
+def : WriteRes<WriteCTZ, [SiFiveP600IntArith]>;
+def : WriteRes<WriteCTZ32, [SiFiveP600IntArith]>;
+
+def : WriteRes<WriteORCB, [SiFiveP600IntArith]>;
+def : WriteRes<WriteIMinMax, [SiFiveP600IntArith]>;
+
+def : WriteRes<WriteREV8, [SiFiveP600IntArith]>;
+
+def : WriteRes<WriteSHXADD, [SiFiveP600IntArith]>;
+def : WriteRes<WriteSHXADD32, [SiFiveP600IntArith]>;
+
+def : WriteRes<WriteSingleBit, [SiFiveP600IntArith]>;
+def : WriteRes<WriteSingleBitImm, [SiFiveP600IntArith]>;
+def : WriteRes<WriteBEXT, [SiFiveP600IntArith]>;
+def : WriteRes<WriteBEXTI, [SiFiveP600IntArith]>;
+
+// Memory
+def : WriteRes<WriteSTB, [SiFiveP600LDST]>;
+def : WriteRes<WriteSTH, [SiFiveP600LDST]>;
+def : WriteRes<WriteSTW, [SiFiveP600LDST]>;
+def : WriteRes<WriteSTD, [SiFiveP600LDST]>;
+def : WriteRes<WriteFST16, [SiFiveP600LDST]>;
+def : WriteRes<WriteFST32, [SiFiveP600LDST]>;
+def : WriteRes<WriteFST64, [SiFiveP600LDST]>;
+
+let Latency = 4 in {
+def : WriteRes<WriteLDB, [SiFiveP600LDST]>;
+def : WriteRes<WriteLDH, [SiFiveP600LDST]>;
+}
+let Latency = 4 in {
+def : WriteRes<WriteLDW, [SiFiveP600LDST]>;
+def : WriteRes<WriteLDD, [SiFiveP600LDST]>;
+}
+
+let Latency = 6 in {
+def : WriteRes<WriteFLD16, [SiFiveP600LDST]>;
+def : WriteRes<WriteFLD32, [SiFiveP600LDST]>;
+def : WriteRes<WriteFLD64, [SiFiveP600LDST]>;
+}
+
+// Atomic memory
+let Latency = 3 in {
+def : WriteRes<WriteAtomicSTW, [SiFiveP600LDST]>;
+def : WriteRes<WriteAtomicSTD, [SiFiveP600LDST]>;
+def : WriteRes<WriteAtomicW, [SiFiveP600LDST]>;
+def : WriteRes<WriteAtomicD, [SiFiveP600LDST]>;
+def : WriteRes<WriteAtomicLDW, [SiFiveP600LDST]>;
+def : WriteRes<WriteAtomicLDD, [SiFiveP600LDST]>;
+}
+
+// Floating point
+let Latency = 2 in {
+def : WriteRes<WriteFAdd16, [SiFiveP600FloatArith]>;
+def : WriteRes<WriteFAdd32, [SiFiveP600FloatArith]>;
+def : WriteRes<WriteFAdd64, [SiFiveP600FloatArith]>;
+}
+let Latency = 3 in {
+def : WriteRes<WriteFMul16, [SiFiveP600FloatArith]>;
+def : WriteRes<WriteFMul32, [SiFiveP600FloatArith]>;
+def : WriteRes<WriteFMul64, [SiFiveP600FloatArith]>;
+}
+let Latency = 4 in {
+def : WriteRes<WriteFMA16, [SiFiveP600FloatArith]>;
+def : WriteRes<WriteFMA32, [SiFiveP600FloatArith]>;
+def : WriteRes<WriteFMA64, [SiFiveP600FloatArith]>;
+}
+
+let Latency = 2 in {
+def : WriteRes<WriteFSGNJ16, [SiFiveP600FloatArith]>;
+def : WriteRes<WriteFSGNJ32, [SiFiveP600FloatArith]>;
+def : WriteRes<WriteFSGNJ64, [SiFiveP600FloatArith]>;
+
+def : WriteRes<WriteFMinMax16, [SiFiveP600FloatArith]>;
+def : WriteRes<WriteFMinMax32, [SiFiveP600FloatArith]>;
+def : WriteRes<WriteFMinMax64, [SiFiveP600FloatArith]>;
+}
+
+// Half precision.
+def : WriteRes<WriteFDiv16, [SiFiveP600FEXQ1, SiFiveP600FloatDiv]> {
+  let Latency = 4;
+  let ReleaseAtCycles = [1, 4];
+}
+def : WriteRes<WriteFSqrt16, [SiFiveP600FEXQ1, SiFiveP600FloatDiv]> {
+  let Latency = 18;
+  let ReleaseAtCycles = [1, 17];
+}
+
+// Single precision.
+def : WriteRes<WriteFDiv32, [SiFiveP600FEXQ1, SiFiveP600FloatDiv]> {
+  let Latency = 6;
+  let ReleaseAtCycles = [1, 6];
+}
+def : WriteRes<WriteFSqrt32, [SiFiveP600FEXQ1, SiFiveP600FloatDiv]> {
+  let Latency = 18;
+  let ReleaseAtCycles = [1, 17];
+}
+
+// Double precision
+def : WriteRes<WriteFDiv64, [SiFiveP600FEXQ1, SiFiveP600FloatDiv]> {
+  let Latency = 11;
+  let ReleaseAtCycles = [1, 11];
+}
+def : WriteRes<WriteFSqrt64, [SiFiveP600FEXQ1, SiFiveP600FloatDiv]> {
+  let Latency = 33;
+  let ReleaseAtCycles = [1, 32];
+}
+
+// Conversions
+let Latency = 2 in {
+def : WriteRes<WriteFCvtI32ToF16, [SiFiveP600MulI2F]>;
+def : WriteRes<WriteFCvtI32ToF32, [SiFiveP600MulI2F]>;
+def : WriteRes<WriteFCvtI32ToF64, [SiFiveP600MulI2F]>;
+def : WriteRes<WriteFCvtI64ToF16, [SiFiveP600MulI2F]>;
+def : WriteRes<WriteFCvtI64ToF32, [SiFiveP600MulI2F]>;
+def : WriteRes<WriteFCvtI64ToF64, [SiFiveP600MulI2F]>;
+def : WriteRes<WriteFCvtF16ToI32, [SiFiveP600F2I]>;
+def : WriteRes<WriteFCvtF16ToI64, [SiFiveP600F2I]>;
+def : WriteRes<WriteFCvtF16ToF32, [SiFiveP600FloatArith]>;
+def : WriteRes<WriteFCvtF16ToF64, [SiFiveP600FloatArith]>;
+def : WriteRes<WriteFCvtF32ToI32, [SiFiveP600F2I]>;
+def : WriteRes<WriteFCvtF32ToI64, [SiFiveP600F2I]>;
+def : WriteRes<WriteFCvtF32ToF16, [SiFiveP600FloatArith]>;
+def : WriteRes<WriteFCvtF32ToF64, [SiFiveP600FloatArith]>;
+def : WriteRes<WriteFCvtF64ToI32, [SiFiveP600F2I]>;
+def : WriteRes<WriteFCvtF64ToI64, [SiFiveP600F2I]>;
+def : WriteRes<WriteFCvtF64ToF16, [SiFiveP600FloatArith]>;
+def : WriteRes<WriteFCvtF64ToF32, [SiFiveP600FloatArith]>;
+
+def : WriteRes<WriteFClass16, [SiFiveP600F2I]>;
+def : WriteRes<WriteFClass32, [SiFiveP600F2I]>;
+def : WriteRes<WriteFClass64, [SiFiveP600F2I]>;
+def : WriteRes<WriteFCmp16, [SiFiveP600F2I]>;
+def : WriteRes<WriteFCmp32, [SiFiveP600F2I]>;
+def : WriteRes<WriteFCmp64, [SiFiveP600F2I]>;
+def : WriteRes<WriteFMovI16ToF16, [SiFiveP600MulI2F]>;
+def : WriteRes<WriteFMovF16ToI16, [SiFiveP600F2I]>;
+def : WriteRes<WriteFMovI32ToF32, [SiFiveP600MulI2F]>;
+def : WriteRes<WriteFMovF32ToI32, [SiFiveP600F2I]>;
+def : WriteRes<WriteFMovI64ToF64, [SiFiveP600MulI2F]>;
+def : WriteRes<WriteFMovF64ToI64, [SiFiveP600F2I]>;
+}
+
+// 6. Configuration-Setting Instructions
+def : WriteRes<WriteVSETVLI, [SiFiveP600SYS]>;
+def : WriteRes<WriteVSETIVLI, [SiFiveP600SYS]>;
+def : WriteRes<WriteVSETVL, [SiFiveP600SYS]>;
+
+// 7. Vector Loads and Stores
+// FIXME: This unit is still being improved, currently
+// it is based on stage numbers. Estimates are optimistic,
+// latency may be longer.
+foreach mx = SchedMxList in {
+  defvar LMulLat = SiFiveP600GetLMulCycles<mx>.c;
+  defvar IsWorstCase = SiFiveP600IsWorstCaseMX<mx, SchedMxList>.c;
+  let Latency = 8, ReleaseAtCycles = [LMulLat] in {
+    defm "" : LMULWriteResMX<"WriteVLDE",    [SiFiveP600VLD], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVLDM",    [SiFiveP600VLD], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVLDFF",   [SiFiveP600VLD], mx, IsWorstCase>;
+  }
+  let Latency = 12, ReleaseAtCycles = [LMulLat] in {
+    defm "" : LMULWriteResMX<"WriteVLDS8",   [SiFiveP600VLD], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVLDS16",  [SiFiveP600VLD], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVLDS32",  [SiFiveP600VLD], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVLDS64",  [SiFiveP600VLD], mx, IsWorstCase>;
+  }
+  let Latency = 12, ReleaseAtCycles = [LMulLat] in {
+    defm "" : LMULWriteResMX<"WriteVLDUX8",  [SiFiveP600VLD], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVLDUX16", [SiFiveP600VLD], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVLDUX32", [SiFiveP600VLD], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVLDUX64", [SiFiveP600VLD], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVLDOX8",  [SiFiveP600VLD], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVLDOX16", [SiFiveP600VLD], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVLDOX32", [SiFiveP600VLD], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVLDOX64", [SiFiveP600VLD], mx, IsWorstCase>;
+  }
+}
+
+foreach mx = SchedMxList in {
+  defvar LMulLat = SiFiveP600GetLMulCycles<mx>.c;
+  defvar IsWorstCase = SiFiveP600IsWorstCaseMX<mx, SchedMxList>.c;
+  let Latency = 8, ReleaseAtCycles = [LMulLat] in {
+    defm "" : LMULWriteResMX<"WriteVSTE",    [SiFiveP600VST], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVSTM",    [SiFiveP600VST], mx, IsWorstCase>;
+  }
+  let Latency = 12, ReleaseAtCycles = [LMulLat] in {
+    defm "" : LMULWriteResMX<"WriteVSTS8",   [SiFiveP600VST], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVSTS16",  [SiFiveP600VST], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVSTS32",  [SiFiveP600VST], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVSTS64",  [SiFiveP600VST], mx, IsWorstCase>;
+  }
+  let Latency = 12, ReleaseAtCycles = [LMulLat] in {
+    defm "" : LMULWriteResMX<"WriteVSTUX8",  [SiFiveP600VST], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVSTUX16", [SiFiveP600VST], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVSTUX32", [SiFiveP600VST], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVSTUX64", [SiFiveP600VST], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVSTOX8",  [SiFiveP600VST], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVSTOX16", [SiFiveP600VST], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVSTOX32", [SiFiveP600VST], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVSTOX64", [SiFiveP600VST], mx, IsWorstCase>;
+  }
+}
+
+foreach mx = SchedMxList in {
+  foreach nf=2-8 in {
+    foreach eew = [8, 16, 32, 64] in {
+      defvar LMulLat = SiFiveP600GetCyclesSegmented<mx, eew, nf>.c;
+      defvar IsWorstCase = SiFiveP600IsWorstCaseMX<mx, SchedMxList>.c;
+      let Latency = !add(12, LMulLat), ReleaseAtCycles = [!add(12, LMulLat)] in {
+        defm "" : LMULWriteResMX<"WriteVLSEG" # nf # "e" # eew,   [SiFiveP600VLD], mx, IsWorstCase>;
+        defm "" : LMULWriteResMX<"WriteVLSEGFF" # nf # "e" # eew, [SiFiveP600VLD], mx, IsWorstCase>;
+        defm "" : LMULWriteResMX<"WriteVLSSEG" # nf # "e" # eew,  [SiFiveP600VLD], mx, IsWorstCase>;
+        defm "" : LMULWriteResMX<"WriteVLUXSEG" # nf # "e" # eew, [SiFiveP600VLD], mx, IsWorstCase>;
+        defm "" : LMULWriteResMX<"WriteVLOXSEG" # nf # "e" # eew, [SiFiveP600VLD], mx, IsWorstCase>;
+      }
+      let Latency = !add(1, LMulLat), ReleaseAtCycles = [!add(12, LMulLat)] in {
+        defm "" : LMULWriteResMX<"WriteVSSEG" # nf # "e" # eew,   [SiFiveP600VST], mx, IsWorstCase>;
+        defm "" : LMULWriteResMX<"WriteVSSSEG" # nf # "e" # eew,  [SiFiveP600VST], mx, IsWorstCase>;
+        defm "" : LMULWriteResMX<"WriteVSUXSEG" # nf # "e" # eew, [SiFiveP600VST], mx, IsWorstCase>;
+        defm "" : LMULWriteResMX<"WriteVSOXSEG" # nf # "e" # eew, [SiFiveP600VST], mx, IsWorstCase>;
+      }
+    }
+  }
+}
+
+// Whole register move/load/store
+foreach LMul = [1, 2, 4, 8] in {
+  let Latency = 8, ReleaseAtCycles = [LMul] in {
+    def : WriteRes<!cast<SchedWrite>("WriteVLD" # LMul # "R"), [SiFiveP600VLD]>;
+    def : WriteRes<!cast<SchedWrite>("WriteVST" # LMul # "R"), [SiFiveP600VST]>;
+  }
+  let Latency = LMul, ReleaseAtCycles = [LMul] in {
+    def : WriteRes<!cast<SchedWrite>("WriteVMov" # LMul # "V"), [SiFiveP600VectorArith]>;
+  }
+}
+
+// 11. Vector Integer Arithmetic Instructions
+foreach mx = SchedMxList in {
+  defvar LMulLat = SiFiveP600GetLMulCycles<mx>.c;
+  defvar IsWorstCase = SiFiveP600IsWorstCaseMX<mx, SchedMxList>.c;
+  let Latency = 1, ReleaseAtCycles = [LMulLat] in {
+    defm "" : LMULWriteResMX<"WriteVIALUV",   [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVIALUX",   [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVIALUI",   [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVExtV",    [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVICALUV",  [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVICALUX",  [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVICALUI",  [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVICmpV",   [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVICmpX",   [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVICmpI",   [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVIMergeV", [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVIMergeX", [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVIMergeI", [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVIMovV",   [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVIMovX",   [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVIMovI",   [SiFiveP600VectorArith], mx, IsWorstCase>;
+  }
+  let Latency = 6, ReleaseAtCycles = [LMulLat] in {
+    defm "" : LMULWriteResMX<"WriteVShiftV",   [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVShiftX",   [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVShiftI",   [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVIMinMaxV", [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVIMinMaxX", [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVIMulV",    [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVIMulX",    [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVIMulAddV", [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVIMulAddX", [SiFiveP600VectorArith], mx, IsWorstCase>;
+  }
+}
+// Widening
+foreach mx = SchedMxListW in {
+  defvar LMulLat = SiFiveP600GetLMulCycles<mx>.c;
+  defvar IsWorstCase = SiFiveP600IsWorstCaseMX<mx, SchedMxListW>.c;
+  let Latency = 6, ReleaseAtCycles = [LMulLat] in {
+    defm "" : LMULWriteResMX<"WriteVIWALUV",    [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVIWALUX",    [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVIWALUI",    [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVIWMulV",    [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVIWMulX",    [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVIWMulAddV", [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVIWMulAddX", [SiFiveP600VectorArith], mx, IsWorstCase>;
+  }
+}
+
+// Worst case needs 64 cycles if SEW is equal to 64.
+foreach mx = SchedMxList in {
+  foreach sew = SchedSEWSet<mx>.val in {
+    defvar LMulLat = SiFiveP600GetLMulCycles<mx>.c;
+    defvar IsWorstCase = SiFiveP600IsWorstCaseMXSEW<mx, sew, SchedMxList>.c;
+    let Latency = 64, ReleaseAtCycles = [LMulLat, !mul(63, LMulLat)] in {
+      defm "" : LMULSEWWriteResMXSEW<"WriteVIDivV", [SiFiveP600VEXQ1, SiFiveP600VDiv], mx, sew, IsWorstCase>;
+      defm "" : LMULSEWWriteResMXSEW<"WriteVIDivX", [SiFiveP600VEXQ1, SiFiveP600VDiv], mx, sew, IsWorstCase>;
+    }
+  }
+}
+
+// Narrowing Shift and Clips
+foreach mx = SchedMxListW in {
+  defvar LMulLat = SiFiveP600GetLMulCycles<mx>.c;
+  defvar IsWorstCase = SiFiveP600IsWorstCaseMX<mx, SchedMxListW>.c;
+  let Latency = 2, ReleaseAtCycles = [LMulLat] in {
+    defm "" : LMULWriteResMX<"WriteVNShiftV", [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVNShiftX", [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVNShiftI", [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVNClipV",  [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVNClipX",  [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVNClipI",  [SiFiveP600VectorArith], mx, IsWorstCase>;
+  }
+}
+
+// 12. Vector Fixed-Point Arithmetic Instructions
+foreach mx = SchedMxList in {
+  defvar LMulLat = SiFiveP600GetLMulCycles<mx>.c;
+  defvar IsWorstCase = SiFiveP600IsWorstCaseMX<mx, SchedMxList>.c;
+  let Latency = 6, ReleaseAtCycles = [LMulLat] in {
+    defm "" : LMULWriteResMX<"WriteVSALUV",   [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVSALUX",   [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVSALUI",   [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVAALUV",   [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVAALUX",   [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVSMulV",   [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVSMulX",   [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVSShiftV", [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVSShiftX", [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVSShiftI", [SiFiveP600VectorArith], mx, IsWorstCase>;
+  }
+}
+
+// 13. Vector Floating-Point Instructions
+foreach mx = SchedMxList in {
+  defvar LMulLat = SiFiveP600GetLMulCycles<mx>.c;
+  defvar IsWorstCase = SiFiveP600IsWorstCaseMX<mx, SchedMxList>.c;
+  let Latency = 6, ReleaseAtCycles = [LMulLat] in {
+    defm "" : LMULWriteResMX<"WriteVFALUV",    [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVFALUF",    [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVFMulV",    [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVFMulF",    [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVFMulAddV", [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVFMulAddF", [SiFiveP600VectorArith], mx, IsWorstCase>;
+  }
+  let Latency = 3, ReleaseAtCycles = [LMulLat] in {
+    defm "" : LMULWriteResMX<"WriteVFCvtIToFV", [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVFCvtFToIV", [SiFiveP600VectorArith], mx, IsWorstCase>;
+  }
+  let Latency = 2, ReleaseAtCycles = [LMulLat] in {
+    defm "" : LMULWriteResMX<"WriteVFCmpV",  [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVFCmpF",  [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVFRecpV", [SiFiveP600VectorArith], mx, IsWorstCase>;
+  }
+  let Latency = 1, ReleaseAtCycles = [LMulLat] in {
+    defm "" : LMULWriteResMX<"WriteVFSgnjV",   [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVFSgnjF",   [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVFMinMaxV", [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVFMinMaxF", [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVFClassV",  [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVFMergeV",  [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVFMovV",    [SiFiveP600VectorArith], mx, IsWorstCase>;
+  }
+}
+
+// Widening
+foreach mx = SchedMxListW in {
+  defvar LMulLat = SiFiveP600GetLMulCycles<mx>.c;
+  defvar IsWorstCase = SiFiveP600IsWorstCaseMX<mx, SchedMxListW>.c;
+  let Latency = 3, ReleaseAtCycles = [LMulLat] in {
+    defm "" : LMULWriteResMX<"WriteVFWCvtIToFV", [SiFiveP600VectorArith], mx, IsWorstCase>;
+  }
+}
+foreach mx = SchedMxListFW in {
+  defvar LMulLat = SiFiveP600GetLMulCycles<mx>.c;
+  defvar IsWorstCase = SiFiveP600IsWorstCaseMX<mx, SchedMxListFW>.c;
+  let Latency = 6, ReleaseAtCycles = [LMulLat] in {
+    defm "" : LMULWriteResMX<"WriteVFWCvtFToIV", [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVFWCvtFToFV", [SiFiveP600VectorArith], mx, IsWorstCase>;
+  }
+}
+foreach mx = SchedMxListFW in {
+  defvar LMulLat = SiFiveP600GetLMulCycles<mx>.c;
+  defvar IsWorstCase = SiFiveP600IsWorstCaseMX<mx, SchedMxListFW>.c;
+  let Latency = 6, ReleaseAtCycles = [LMulLat] in {
+    defm "" : LMULWriteResMX<"WriteVFWALUV",    [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVFWMulV",    [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVFWMulAddV", [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVFWMulAddF", [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVFWMulF",    [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVFWALUF",    [SiFiveP600VectorArith], mx, IsWorstCase>;
+  }
+}
+// Narrowing
+foreach mx = SchedMxListW in {
+  defvar LMulLat = SiFiveP600GetLMulCycles<mx>.c;
+  defvar IsWorstCase = SiFiveP600IsWorstCaseMX<mx, SchedMxListW>.c;
+  let Latency = 3, ReleaseAtCycles = [LMulLat] in {
+    defm "" : LMULWriteResMX<"WriteVFNCvtFToIV", [SiFiveP600VectorArith], mx, IsWorstCase>;
+  }
+}
+foreach mx = SchedMxListFW in {
+  defvar LMulLat = SiFiveP600GetLMulCycles<mx>.c;
+  defvar IsWorstCase = SiFiveP600IsWorstCaseMX<mx, SchedMxListFW>.c;
+  let Latency = 3, ReleaseAtCycles = [LMulLat] in {
+    defm "" : LMULWriteResMX<"WriteVFNCvtIToFV", [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVFNCvtFToFV", [SiFiveP600VectorArith], mx, IsWorstCase>;
+  }
+}
+
+// Worst case needs 76 cycles if SEW is equal to 64.
+foreach mx = SchedMxListF in {
+  foreach sew = SchedSEWSet<mx, 1>.val in {
+    defvar LMulLat = SiFiveP600GetLMulCycles<mx>.c;
+    defvar IsWorstCase = SiFiveP600IsWorstCaseMXSEW<mx, sew, SchedMxListF, 1>.c;
+    let Latency = 76, ReleaseAtCycles = [LMulLat, !mul(76, LMulLat)] in {
+      defm "" : LMULSEWWriteResMXSEW<"WriteVFDivV",  [SiFiveP600VEXQ1, SiFiveP600VFloatDiv], mx, sew, IsWorstCase>;
+      defm "" : LMULSEWWriteResMXSEW<"WriteVFDivF",  [SiFiveP600VEXQ1, SiFiveP600VFloatDiv], mx, sew, IsWorstCase>;
+      defm "" : LMULSEWWriteResMXSEW<"WriteVFSqrtV", [SiFiveP600VEXQ1, SiFiveP600VFloatDiv], mx, sew, IsWorstCase>;
+    }
+  }
+}
+
+// 14. Vector Reduction Operations
+foreach mx = SchedMxList in {
+  foreach sew = SchedSEWSet<mx>.val in {
+    defvar LMulLat = SiFiveP600GetLMulCycles<mx>.c;
+    defvar IsWorstCase = SiFiveP600IsWorstCaseMXSEW<mx, sew, SchedMxList>.c;
+    let Latency = !add(2, !mul(2, LMulLat)), ReleaseAtCycles = [LMulLat] in {
+      defm "" : LMULSEWWriteResMXSEW<"WriteVIRedV_From", [SiFiveP600VEXQ1],
+                                     mx, sew, IsWorstCase>;
+      defm "" : LMULSEWWriteResMXSEW<"WriteVIRedMinMaxV_From", [SiFiveP600VEXQ1],
+                                     mx, sew, IsWorstCase>;
+    }
+  }
+}
+
+foreach mx = SchedMxListWRed in {
+  foreach sew = SchedSEWSet<mx, 0, 1>.val in {
+    defvar LMulLat = SiFiveP600GetLMulCycles<mx>.c;
+    defvar IsWorstCase = SiFiveP600IsWorstCaseMXSEW<mx, sew, SchedMxListWRed>.c;
+    let Latency = !add(2, !mul(2, LMulLat)), ReleaseAtCycles = [LMulLat] in {
+      defm "" : LMULSEWWriteResMXSEW<"WriteVIWRedV_From", [SiFiveP600VEXQ1],
+                                     mx, sew, IsWorstCase>;
+    }
+  }
+}
+
+foreach mx = SchedMxListF in {
+  foreach sew = SchedSEWSet<mx, 1>.val in {
+    defvar LMulLat = SiFiveP600GetLMulCycles<mx>.c;
+    defvar IsWorstCase = SiFiveP600IsWorstCaseMXSEW<mx, sew, SchedMxListF, 1>.c;
+    let Latency = !add(6, !mul(6, LMulLat)), ReleaseAtCycles = [LMulLat] in {
+      defm "" : LMULSEWWriteResMXSEW<"WriteVFRedV_From", [SiFiveP600VEXQ1],
+                                     mx, sew, IsWorstCase>;
+      defm "" : LMULSEWWriteResMXSEW<"WriteVFRedMinMaxV_From",
+                                     [SiFiveP600VEXQ1], mx, sew, IsWorstCase>;
+      defm "" : LMULSEWWriteResMXSEW<"WriteVFRedOV_From", [SiFiveP600VEXQ1],
+                                     mx, sew, IsWorstCase>;
+    }
+  }
+}
+
+foreach mx = SchedMxListFWRed in {
+  foreach sew = SchedSEWSet<mx, 1, 1>.val in {
+    defvar LMulLat = SiFiveP600GetLMulCycles<mx>.c;
+    defvar IsWorstCase = SiFiveP600IsWorstCaseMXSEW<mx, sew, SchedMxListFWRed, 1>.c;
+    let Latency = !add(6, !mul(6, LMulLat)), ReleaseAtCycles = [LMulLat] in {
+      defm "" : LMULSEWWriteResMXSEW<"WriteVFWRedV_From",  [SiFiveP600VEXQ1],
+                                     mx, sew, IsWorstCase>;
+      defm "" : LMULSEWWriteResMXSEW<"WriteVFWRedOV_From", [SiFiveP600VEXQ1],
+                                     mx, sew, IsWorstCase>;
+    }
+  }
+}
+
+// 15. Vector Mask Instructions
+foreach mx = SchedMxList in {
+  defvar IsWorstCase = SiFiveP600IsWorstCaseMX<mx, SchedMxList>.c;
+  let Latency = 1, ReleaseAtCycles = [1] in {
+    defm "" : LMULWriteResMX<"WriteVMALUV", [SiFiveP600VEXQ0], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVMPopV", [SiFiveP600VEXQ0], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVMFFSV", [SiFiveP600VEXQ0], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVMSFSV", [SiFiveP600VEXQ0], mx, IsWorstCase>;
+  }
+  defvar LMulLat = SiFiveP600GetLMulCycles<mx>.c;
+  let Latency = 1, ReleaseAtCycles = [LMulLat] in {
+    defm "" : LMULWriteResMX<"WriteVIotaV", [SiFiveP600VEXQ0], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVIdxV", [SiFiveP600VEXQ0], mx, IsWorstCase>;
+  }
+}
+
+// 16. Vector Permutation Instructions
+// Simple Slide
+foreach mx = SchedMxList in {
+  defvar LMulLat = SiFiveP600GetLMulCycles<mx>.c;
+  defvar IsWorstCase = SiFiveP600IsWorstCaseMX<mx, SchedMxList>.c;
+  let Latency = 2, ReleaseAtCycles = [LMulLat] in {
+    defm "" : LMULWriteResMX<"WriteVISlideI",  [SiFiveP600VEXQ0], mx, IsWorstCase>;
+  }
+  let Latency = 1, ReleaseAtCycles = [LMulLat] in {
+    defm "" : LMULWriteResMX<"WriteVISlide1X", [SiFiveP600VEXQ0], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVFSlide1F", [SiFiveP600VEXQ0], mx, IsWorstCase>;
+  }
+}
+foreach mx = ["MF8", "MF4", "MF2", "M1"] in {
+  defvar IsWorstCase = SiFiveP600IsWorstCaseMX<mx, SchedMxList>.c;
+  let Latency = 2, ReleaseAtCycles = [1] in {
+    defm "" : LMULWriteResMX<"WriteVISlideX", [SiFiveP600VEXQ0], mx, IsWorstCase>;
+  }
+}
+
+// Complex Slide
+foreach mx = ["M8", "M4", "M2"] in {
+  defvar LMulLat = SiFiveP600GetLMulCycles<mx>.c;
+  defvar IsWorstCase = SiFiveP600IsWorstCaseMX<mx, SchedMxList>.c;
+  let Latency = !add(4, LMulLat), ReleaseAtCycles = [LMulLat] in {
+    defm "" : LMULWriteResMX<"WriteVISlideX", [SiFiveP600VEXQ1], mx, IsWorstCase>;
+  }
+}
+
+let Latency = 2, ReleaseAtCycles = [1] in {
+  def : WriteRes<WriteVMovSX, [SiFiveP600VectorArith]>;
+  def : WriteRes<WriteVMovXS, [SiFiveP600VectorArith]>;
+}
+let Latency = 6, ReleaseAtCycles = [1] in {
+  def : WriteRes<WriteVMovSF, [SiFiveP600VectorArith]>;
+  def : WriteRes<WriteVMovFS, [SiFiveP600VectorArith]>;
+}
+
+// Simple Gather and Compress
+foreach mx = ["MF8", "MF4", "MF2", "M1"] in {
+  defvar IsWorstCase = SiFiveP600IsWorstCaseMX<mx, SchedMxList>.c;
+  let Latency = 3, ReleaseAtCycles = [1] in {
+    defm "" : LMULWriteResMX<"WriteVRGatherVX", [SiFiveP600VEXQ1], mx, IsWorstCase>;
+  }
+}
+
+foreach mx = ["MF8", "MF4", "MF2", "M1"] in {
+  foreach sew = SchedSEWSet<mx>.val in {
+    defvar IsWorstCase = SiFiveP600IsWorstCaseMX<mx, SchedMxList>.c;
+    let Latency = 3, ReleaseAtCycles = [1] in {
+      defm "" : LMULSEWWriteResMXSEW<"WriteVRGatherVV", [SiFiveP600VEXQ1], mx, sew, IsWorstCase>;
+      defm "" : LMULSEWWriteResMXSEW<"WriteVCompressV", [SiFiveP600VEXQ1], mx, sew, IsWorstCase>;
+    }
+  }
+}
+
+// Complex Gather and Compress
+foreach mx = ["M2", "M4", "M8"] in {
+  defvar LMulLat = SiFiveP600GetLMulCycles<mx>.c;
+  defvar IsWorstCase = SiFiveP600IsWorstCaseMX<mx, SchedMxList>.c;
+  let Latency = 6, ReleaseAtCycles = [LMulLat] in {
+    defm "" : LMULWriteResMX<"WriteVRGatherVX", [SiFiveP600VEXQ1], mx, IsWorstCase>;
+  }
+}
+
+foreach mx = ["M2", "M4", "M8"] in {
+  foreach sew = SchedSEWSet<mx>.val in {
+    defvar LMulLat = SiFiveP600GetLMulCycles<mx>.c;
+    defvar IsWorstCase = SiFiveP600IsWorstCaseMXSEW<mx, sew, SchedMxList>.c;
+    let Latency = 6, ReleaseAtCycles = [LMulLat] in {
+      defm "" : LMULSEWWriteResMXSEW<"WriteVRGatherVV", [SiFiveP600VEXQ1], mx, sew, IsWorstCase>;
+      defm "" : LMULSEWWriteResMXSEW<"WriteVCompressV", [SiFiveP600VEXQ1], mx, sew, IsWorstCase>;
+    }
+  }
+}
+
+// Simple Vrgather.vi
+foreach mx = SchedMxList in {
+  defvar LMulLat = SiFiveP600GetLMulCycles<mx>.c;
+  defvar IsWorstCase = SiFiveP600IsWorstCaseMX<mx, SchedMxList>.c;
+  let Latency = 3, ReleaseAtCycles = [LMulLat] in {
+    defm "" : LMULWriteResMX<"WriteVRGatherVI", [SiFiveP600VEXQ1], mx, IsWorstCase>;
+  }
+}
+
+// Others
+def : WriteRes<WriteCSR, [SiFiveP600SYS]>;
+def : WriteRes<WriteNop, []>;
+def : WriteRes<WriteRdVLENB, [SiFiveP600SYS]>;
+
+// FIXME: This could be better modeled by looking at the regclasses of the operands.
+def : InstRW<[WriteIALU, ReadIALU], (instrs COPY)>;
+
+//===----------------------------------------------------------------------===//
+// Bypass and advance
+def : ReadAdvance<ReadJmp, 0>;
+def : ReadAdvance<ReadJalr, 0>;
+def : ReadAdvance<ReadCSR, 0>;
+def : ReadAdvance<ReadStoreData, 0>;
+def : ReadAdvance<ReadMemBase, 0>;
+def : ReadAdvance<ReadIALU, 0>;
+def : ReadAdvance<ReadIALU32, 0>;
+def : ReadAdvance<ReadShiftImm, 0>;
+def : ReadAdvance<ReadShiftImm32, 0>;
+def : ReadAdvance<ReadShiftReg, 0>;
+def : ReadAdvance<ReadShiftReg32, 0>;
+def : ReadAdvance<ReadIDiv, 0>;
+def : ReadAdvance<ReadIDiv32, 0>;
+def : ReadAdvance<ReadIRem, 0>;
+def : ReadAdvance<ReadIRem32, 0>;
+def : ReadAdvance<ReadIMul, 0>;
+def : ReadAdvance<ReadIMul32, 0>;
+def : ReadAdvance<ReadAtomicWA, 0>;
+def : ReadAdvance<ReadAtomicWD, 0>;
+def : ReadAdvance<ReadAtomicDA, 0>;
+def : ReadAdvance<ReadAtomicDD, 0>;
+def : ReadAdvance<ReadAtomicLDW, 0>;
+def : ReadAdvance<ReadAtomicLDD, 0>;
+def : ReadAdvance<ReadAtomicSTW, 0>;
+def : ReadAdvance<ReadAtomicSTD, 0>;
+def : ReadAdvance<ReadFStoreData, 0>;
+def : ReadAdvance<ReadFMemBase, 0>;
+def : ReadAdvance<ReadFAdd16, 0>;
+def : ReadAdvance<ReadFAdd32, 0>;
+def : ReadAdvance<ReadFAdd64, 0>;
+def : ReadAdvance<ReadFMul16, 0>;
+def : ReadAdvance<ReadFMA16, 0>;
+def : ReadAdvance<ReadFMA16Addend, 0>;
+def : ReadAdvance<ReadFMul32, 0>;
+def : ReadAdvance<ReadFMA32, 0>;
+def : ReadAdvance<ReadFMA32Addend, 0>;
+def : ReadAdvance<ReadFMul64, 0>;
+def : ReadAdvance<ReadFMA64, 0>;
+def : ReadAdvance<ReadFMA64Addend, 0>;
+def : ReadAdvance<ReadFDiv16, 0>;
+def : ReadAdvance<ReadFDiv32, 0>;
+def : ReadAdvance<ReadFDiv64, 0>;
+def : ReadAdvance<ReadFSqrt16, 0>;
+def : ReadAdvance<ReadFSqrt32, 0>;
+def : ReadAdvance<ReadFSqrt64, 0>;
+def : ReadAdvance<ReadFCmp16, 0>;
+def : ReadAdvance<ReadFCmp32, 0>;
+def : ReadAdvance<ReadFCmp64, 0>;
+def : ReadAdvance<ReadFSGNJ16, 0>;
+def : ReadAdvance<ReadFSGNJ32, 0>;
+def : ReadAdvance<ReadFSGNJ64, 0>;
+def : ReadAdvance<ReadFMinMax16, 0>;
+def : ReadAdvance<ReadFMinMax32, 0>;
+def : ReadAdvance<ReadFMinMax64, 0>;
+def : ReadAdvance<ReadFCvtF16ToI32, 0>;
+def : ReadAdvance<ReadFCvtF16ToI64, 0>;
+def : ReadAdvance<ReadFCvtF32ToI32, 0>;
+def : ReadAdvance<ReadFCvtF32ToI64, 0>;
+def : ReadAdvance<ReadFCvtF64ToI32, 0>;
+def : ReadAdvance<ReadFCvtF64ToI64, 0>;
+def : ReadAdvance<ReadFCvtI32ToF16, 0>;
+def : ReadAdvance<ReadFCvtI32ToF32, 0>;
+def : ReadAdvance<ReadFCvtI32ToF64, 0>;
+def : ReadAdvance<ReadFCvtI64ToF16, 0>;
+def : ReadAdvance<ReadFCvtI64ToF32, 0>;
+def : ReadAdvance<ReadFCvtI64ToF64, 0>;
+def : ReadAdvance<ReadFCvtF32ToF64, 0>;
+def : ReadAdvance<ReadFCvtF64ToF32, 0>;
+def : ReadAdvance<ReadFCvtF16ToF32, 0>;
+def : ReadAdvance<ReadFCvtF32ToF16, 0>;
+def : ReadAdvance<ReadFCvtF16ToF64, 0>;
+def : ReadAdvance<ReadFCvtF64ToF16, 0>;
+def : ReadAdvance<ReadFMovF16ToI16, 0>;
+def : ReadAdvance<ReadFMovI16ToF16, 0>;
+def : ReadAdvance<ReadFMovF32ToI32, 0>;
+def : ReadAdvance<ReadFMovI32ToF32, 0>;
+def : ReadAdvance<ReadFMovF64ToI64, 0>;
+def : ReadAdvance<ReadFMovI64ToF64, 0>;
+def : ReadAdvance<ReadFClass16, 0>;
+def : ReadAdvance<ReadFClass32, 0>;
+def : ReadAdvance<ReadFClass64, 0>;
+
+// Bitmanip
+def : ReadAdvance<ReadRotateImm, 0>;
+def : ReadAdvance<ReadRotateImm32, 0>;
+def : ReadAdvance<ReadRotateReg, 0>;
+def : ReadAdvance<ReadRotateReg32, 0>;
+def : ReadAdvance<ReadCLZ, 0>;
+def : ReadAdvance<ReadCLZ32, 0>;
+def : ReadAdvance<ReadCTZ, 0>;
+def : ReadAdvance<ReadCTZ32, 0>;
+def : ReadAdvance<ReadCPOP, 0>;
+def : ReadAdvance<ReadCPOP32, 0>;
+def : ReadAdvance<ReadORCB, 0>;
+def : ReadAdvance<ReadIMinMax, 0>;
+def : ReadAdvance<ReadREV8, 0>;
+def : ReadAdvance<ReadSHXADD, 0>;
+def : ReadAdvance<ReadSHXADD32, 0>;
+def : ReadAdvance<ReadSingleBit, 0>;
+def : ReadAdvance<ReadSingleBitImm, 0>;
+
+// 6. Configuration-Setting Instructions
+def : ReadAdvance<ReadVSETVLI, 0>;
+def : ReadAdvance<ReadVSETVL, 0>;
+
+// 7. Vector Loads and Stores
+def : ReadAdvance<ReadVLDX, 0>;
+def : ReadAdvance<ReadVSTX, 0>;
+defm "" : LMULReadAdvance<"ReadVSTEV", 0>;
+defm "" : LMULReadAdvance<"ReadVSTM", 0>;
+def : ReadAdvance<ReadVLDSX, 0>;
+def : ReadAdvance<ReadVSTSX, 0>;
+defm "" : LMULReadAdvance<"ReadVSTS8V", 0>;
+defm "" : LMULReadAdvance<"ReadVSTS16V", 0>;
+defm "" : LMULReadAdvance<"ReadVSTS32V", 0>;
+defm "" : LMULReadAdvance<"ReadVSTS64V", 0>;
+defm "" : LMULReadAdvance<"ReadVLDUXV", 0>;
+defm "" : LMULReadAdvance<"ReadVLDOXV", 0>;
+defm "" : LMULReadAdvance<"ReadVSTUX8", 0>;
+defm "" : LMULReadAdvance<"ReadVSTUX16", 0>;
+defm "" : LMULReadAdvance<"ReadVSTUX32", 0>;
+defm "" : LMULReadAdvance<"ReadVSTUX64", 0>;
+defm "" : LMULReadAdvance<"ReadVSTUXV", 0>;
+defm "" : LMULReadAdvance<"ReadVSTUX8V", 0>;
+defm "" : LMULReadAdvance<"ReadVSTUX16V", 0>;
+defm "" : LMULReadAdvance<"ReadVSTUX32V", 0>;
+defm "" : LMULReadAdvance<"ReadVSTUX64V", 0>;
+defm "" : LMULReadAdvance<"ReadVSTOX8", 0>;
+defm "" : LMULReadAdvance<"ReadVSTOX16", 0>;
+defm "" : LMULReadAdvance<"ReadVSTOX32", 0>;
+defm "" : LMULReadAdvance<"ReadVSTOX64", 0>;
+defm "" : LMULReadAdvance<"ReadVSTOXV", 0>;
+defm "" : LMULReadAdvance<"ReadVSTOX8V", 0>;
+defm "" : LMULReadAdvance<"ReadVSTOX16V", 0>;
+defm "" : LMULReadAdvance<"ReadVSTOX32V", 0>;
+defm "" : LMULReadAdvance<"ReadVSTOX64V", 0>;
+// LMUL Aware
+def : ReadAdvance<ReadVST1R, 0>;
+def : ReadAdvance<ReadVST2R, 0>;
+def : ReadAdvance<ReadVST4R, 0>;
+def : ReadAdvance<ReadVST8R, 0>;
+
+// 12. Vector Integer Arithmetic Instructions
+defm : LMULReadAdvance<"ReadVIALUV", 0>;
+defm : LMULReadAdvance<"ReadVIALUX", 0>;
+defm : LMULReadAdvanceW<"ReadVIWALUV", 0>;
+defm : LMULReadAdvanceW<"ReadVIWALUX", 0>;
+defm : LMULReadAdvance<"ReadVExtV", 0>;
+defm : LMULReadAdvance<"ReadVICALUV", 0>;
+defm : LMULReadAdvance<"ReadVICALUX", 0>;
+defm : LMULReadAdvance<"ReadVShiftV", 0>;
+defm : LMULReadAdvance<"ReadVShiftX", 0>;
+defm : LMULReadAdvanceW<"ReadVNShiftV", 0>;
+defm : LMULReadAdvanceW<"ReadVNShiftX", 0>;
+defm : LMULReadAdvance<"ReadVICmpV", 0>;
+defm : LMULReadAdvance<"ReadVICmpX", 0>;
+defm : LMULReadAdvance<"ReadVIMinMaxV", 0>;
+defm : LMULReadAdvance<"ReadVIMinMaxX", 0>;
+defm : LMULReadAdvance<"ReadVIMulV", 0>;
+defm : LMULReadAdvance<"ReadVIMulX", 0>;
+defm : LMULSEWReadAdvance<"ReadVIDivV", 0>;
+defm : LMULSEWReadAdvance<"ReadVIDivX", 0>;
+defm : LMULReadAdvanceW<"ReadVIWMulV", 0>;
+defm : LMULReadAdvanceW<"ReadVIWMulX", 0>;
+defm : LMULReadAdvance<"ReadVIMulAddV", 0>;
+defm : LMULReadAdvance<"ReadVIMulAddX", 0>;
+defm : LMULReadAdvanceW<"ReadVIWMulAddV", 0>;
+defm : LMULReadAdvanceW<"ReadVIWMulAddX", 0>;
+defm : LMULReadAdvance<"ReadVIMergeV", 0>;
+defm : LMULReadAdvance<"ReadVIMergeX", 0>;
+defm : LMULReadAdvance<"ReadVIMovV", 0>;
+defm : LMULReadAdvance<"ReadVIMovX", 0>;
+
+// 13. Vector Fixed-Point Arithmetic Instructions
+defm "" : LMULReadAdvance<"ReadVSALUV", 0>;
+defm "" : LMULReadAdvance<"ReadVSALUX", 0>;
+defm "" : LMULReadAdvance<"ReadVAALUV", 0>;
+defm "" : LMULReadAdvance<"ReadVAALUX", 0>;
+defm "" : LMULReadAdvance<"ReadVSMulV", 0>;
+defm "" : LMULReadAdvance<"ReadVSMulX", 0>;
+defm "" : LMULReadAdvance<"ReadVSShiftV", 0>;
+defm "" : LMULReadAdvance<"ReadVSShiftX", 0>;
+defm "" : LMULReadAdvanceW<"ReadVNClipV", 0>;
+defm "" : LMULReadAdvanceW<"ReadVNClipX", 0>;
+
+// 14. Vector Floating-Point Instructions
+defm "" : LMULReadAdvance<"ReadVFALUV", 0>;
+defm "" : LMULReadAdvance<"ReadVFALUF", 0>;
+defm "" : LMULReadAdvanceFW<"ReadVFWALUV", 0>;
+defm "" : LMULReadAdvanceFW<"ReadVFWALUF", 0>;
+defm "" : LMULReadAdvance<"ReadVFMulV", 0>;
+defm "" : LMULReadAdvance<"ReadVFMulF", 0>;
+defm "" : LMULSEWReadAdvanceF<"ReadVFDivV", 0>;
+defm "" : LMULSEWReadAdvanceF<"ReadVFDivF", 0>;
+defm "" : LMULReadAdvanceFW<"ReadVFWMulV", 0>;
+defm "" : LMULReadAdvanceFW<"ReadVFWMulF", 0>;
+defm "" : LMULReadAdvance<"ReadVFMulAddV", 0>;
+defm "" : LMULReadAdvance<"ReadVFMulAddF", 0>;
+defm "" : LMULReadAdvanceFW<"ReadVFWMulAddV", 0>;
+defm "" : LMULReadAdvanceFW<"ReadVFWMulAddF", 0>;
+defm "" : LMULSEWReadAdvanceF<"ReadVFSqrtV", 0>;
+defm "" : LMULReadAdvance<"ReadVFRecpV", 0>;
+defm "" : LMULReadAdvance<"ReadVFCmpV", 0>;
+defm "" : LMULReadAdvance<"ReadVFCmpF", 0>;
+defm "" : LMULReadAdvance<"ReadVFMinMaxV", 0>;
+defm "" : LMULReadAdvance<"ReadVFMinMaxF", 0>;
+defm "" : LMULReadAdvance<"ReadVFSgnjV", 0>;
+defm "" : LMULReadAdvance<"ReadVFSgnjF", 0>;
+defm "" : LMULReadAdvance<"ReadVFClassV", 0>;
+defm "" : LMULReadAdvance<"ReadVFMergeV", 0>;
+defm "" : LMULReadAdvance<"ReadVFMergeF", 0>;
+defm "" : LMULReadAdvance<"ReadVFMovF", 0>;
+defm "" : LMULReadAdvance<"ReadVFCvtIToFV", 0>;
+defm "" : LMULReadAdvance<"ReadVFCvtFToIV", 0>;
+defm "" : LMULReadAdvanceW<"ReadVFWCvtIToFV", 0>;
+defm "" : LMULReadAdvanceFW<"ReadVFWCvtFToIV", 0>;
+defm "" : LMULReadAdvanceFW<"ReadVFWCvtFToFV", 0>;
+defm "" : LMULReadAdvanceFW<"ReadVFNCvtIToFV", 0>;
+defm "" : LMULReadAdvanceW<"ReadVFNCvtFToIV", 0>;
+defm "" : LMULReadAdvanceFW<"ReadVFNCvtFToFV", 0>;
+
+// 15. Vector Reduction Operations
+def : ReadAdvance<ReadVIRedV, 0>;
+def : ReadAdvance<ReadVIRedV0, 0>;
+def : ReadAdvance<ReadVIWRedV, 0>;
+def : ReadAdvance<ReadVIWRedV0, 0>;
+def : ReadAdvance<ReadVFRedV, 0>;
+def : ReadAdvance<ReadVFRedV0, 0>;
+def : ReadAdvance<ReadVFRedOV, 0>;
+def : ReadAdvance<ReadVFRedOV0, 0>;
+def : ReadAdvance<ReadVFWRedV, 0>;
+def : ReadAdvance<ReadVFWRedV0, 0>;
+def : ReadAdvance<ReadVFWRedOV, 0>;
+def : ReadAdvance<ReadVFWRedOV0, 0>;
+
+// 16. Vector Mask Instructions
+defm "" : LMULReadAdvance<"ReadVMALUV", 0>;
+defm "" : LMULReadAdvance<"ReadVMPopV", 0>;
+defm "" : LMULReadAdvance<"ReadVMFFSV", 0>;
+defm "" : LMULReadAdvance<"ReadVMSFSV", 0>;
+defm "" : LMULReadAdvance<"ReadVIotaV", 0>;
+
+// 17. Vector Permutation Instructions
+def : ReadAdvance<ReadVMovXS, 0>;
+def : ReadAdvance<ReadVMovSX_V, 0>;
+def : ReadAdvance<ReadVMovSX_X, 0>;
+def : ReadAdvance<ReadVMovFS, 0>;
+def : ReadAdvance<ReadVMovSF_V, 0>;
+def : ReadAdvance<ReadVMovSF_F, 0>;
+defm "" : LMULReadAdvance<"ReadVISlideV", 0>;
+defm "" : LMULReadAdvance<"ReadVISlideX", 0>;
+defm "" : LMULReadAdvance<"ReadVFSlideV", 0>;
+defm "" : LMULReadAdvance<"ReadVFSlideF", 0>;
+defm "" : LMULSEWReadAdvance<"ReadVRGatherVV_data", 0>;
+defm "" : LMULSEWReadAdvance<"ReadVRGatherVV_index", 0>;
+defm "" : LMULReadAdvance<"ReadVRGatherVX_data", 0>;
+defm "" : LMULReadAdvance<"ReadVRGatherVX_index", 0>;
+defm "" : LMULReadAdvance<"ReadVRGatherVI_data", 0>;
+defm "" : LMULSEWReadAdvance<"ReadVCompressV", 0>;
+// LMUL Aware
+def : ReadAdvance<ReadVMov1V, 0>;
+def : ReadAdvance<ReadVMov2V, 0>;
+def : ReadAdvance<ReadVMov4V, 0>;
+def : ReadAdvance<ReadVMov8V, 0>;
+
+// Others
+def : ReadAdvance<ReadVMask, 0>;
+def : ReadAdvance<ReadVMergeOp_WorstCase, 0>;
+foreach mx = SchedMxList in {
+  def : ReadAdvance<!cast<SchedRead>("ReadVMergeOp_" # mx), 0>;
+  foreach sew = SchedSEWSet<mx>.val in
+    def : ReadAdvance<!cast<SchedRead>("ReadVMergeOp_" # mx  # "_E" # sew), 0>;
+}
+
+//===----------------------------------------------------------------------===//
+// Unsupported extensions
+defm : UnsupportedSchedZabha;
+defm : UnsupportedSchedZbc;
+defm : UnsupportedSchedZbkb;
+defm : UnsupportedSchedZbkx;
+defm : UnsupportedSchedSFB;
+defm : UnsupportedSchedZfa;
+}
diff --git a/llvm/lib/Target/RISCV/RISCVSchedXiangShanNanHu.td b/llvm/lib/Target/RISCV/RISCVSchedXiangShanNanHu.td
index ef491edf3671f8..4fc7b0335af538 100644
--- a/llvm/lib/Target/RISCV/RISCVSchedXiangShanNanHu.td
+++ b/llvm/lib/Target/RISCV/RISCVSchedXiangShanNanHu.td
@@ -85,6 +85,7 @@ def : WriteRes<WriteRotateImm32, [XS2ALU]>;
 def : WriteRes<WriteRotateReg, [XS2ALU]>;
 def : WriteRes<WriteRotateReg32, [XS2ALU]>;
 def : WriteRes<WriteORCB, [XS2ALU]>;
+def : WriteRes<WriteIMinMax, [XS2ALU]>;
 def : WriteRes<WriteREV8, [XS2ALU]>;
 
 // Zbkb
@@ -288,6 +289,7 @@ def : ReadAdvance<ReadCTZ32, 0>;
 def : ReadAdvance<ReadCPOP, 0>;
 def : ReadAdvance<ReadCPOP32, 0>;
 def : XS2LoadToALUBypass<ReadORCB>;
+def : XS2LoadToALUBypass<ReadIMinMax>;
 def : XS2LoadToALUBypass<ReadREV8>;
 // Zbkc
 def : ReadAdvance<ReadCLMUL, 0>;
diff --git a/llvm/lib/Target/RISCV/RISCVScheduleZb.td b/llvm/lib/Target/RISCV/RISCVScheduleZb.td
index 0a16390e505356..93381f439e0dfc 100644
--- a/llvm/lib/Target/RISCV/RISCVScheduleZb.td
+++ b/llvm/lib/Target/RISCV/RISCVScheduleZb.td
@@ -25,6 +25,7 @@ def WriteCPOP        : SchedWrite;
 def WriteCPOP32      : SchedWrite;
 def WriteREV8        : SchedWrite;
 def WriteORCB        : SchedWrite;
+def WriteIMinMax     : SchedWrite;
 
 // Zbc extension
 def WriteCLMUL       : SchedWrite; // CLMUL/CLMULR/CLMULH
@@ -63,6 +64,7 @@ def ReadCPOP        : SchedRead;
 def ReadCPOP32      : SchedRead;
 def ReadREV8        : SchedRead;
 def ReadORCB        : SchedRead;
+def ReadIMinMax     : SchedRead;
 
 // Zbc extension
 def ReadCLMUL       : SchedRead; // CLMUL/CLMULR/CLMULH
@@ -106,6 +108,7 @@ def : WriteRes<WriteCPOP, []>;
 def : WriteRes<WriteCPOP32, []>;
 def : WriteRes<WriteREV8, []>;
 def : WriteRes<WriteORCB, []>;
+def : WriteRes<WriteIMinMax, []>;
 
 def : ReadAdvance<ReadRotateImm, 0>;
 def : ReadAdvance<ReadRotateImm32, 0>;
@@ -119,6 +122,7 @@ def : ReadAdvance<ReadCPOP, 0>;
 def : ReadAdvance<ReadCPOP32, 0>;
 def : ReadAdvance<ReadREV8, 0>;
 def : ReadAdvance<ReadORCB, 0>;
+def : ReadAdvance<ReadIMinMax, 0>;
 }
 }
 
diff --git a/llvm/lib/Target/RISCV/RISCVSystemOperands.td b/llvm/lib/Target/RISCV/RISCVSystemOperands.td
index 79f977e5b32266..01c2767119502c 100644
--- a/llvm/lib/Target/RISCV/RISCVSystemOperands.td
+++ b/llvm/lib/Target/RISCV/RISCVSystemOperands.td
@@ -418,7 +418,16 @@ def : SysReg<"vsieh", 0x214>;
 def : SysReg<"vsiph", 0x254>;
 } // isRV32Only
 
+//===-----------------------------------------------
 // Jump Vector Table CSR
 //===-----------------------------------------------
 
 def : SysReg<"jvt", 0x017>;
+
+//===-----------------------------------------------
+// Resumable Non-Maskable Interrupts(Smrnmi) CSRs
+//===-----------------------------------------------
+def : SysReg<"mnscratch", 0x740>;
+def : SysReg<"mnepc", 0x741>;
+def : SysReg<"mncause", 0x742>;
+def : SysReg<"mnstatus", 0x744>;
diff --git a/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp b/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp
index 1fbf3c3e11aedc..30c67d3fde6338 100644
--- a/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp
@@ -29,7 +29,9 @@
 #include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
 #include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCAssembler.h"
 #include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCObjectStreamer.h"
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/MC/TargetRegistry.h"
@@ -101,6 +103,21 @@ void SPIRVAsmPrinter::emitEndOfAsmFile(Module &M) {
   if (ModuleSectionsEmitted == false) {
     outputModuleSections();
     ModuleSectionsEmitted = true;
+  } else {
+    ST = static_cast<const SPIRVTargetMachine &>(TM).getSubtargetImpl();
+    uint32_t DecSPIRVVersion = ST->getSPIRVVersion();
+    uint32_t Major = DecSPIRVVersion / 10;
+    uint32_t Minor = DecSPIRVVersion - Major * 10;
+    // TODO: calculate Bound more carefully from maximum used register number,
+    // accounting for generated OpLabels and other related instructions if
+    // needed.
+    unsigned Bound = 2 * (ST->getBound() + 1);
+    bool FlagToRestore = OutStreamer->getUseAssemblerInfoForParsing();
+    OutStreamer->setUseAssemblerInfoForParsing(true);
+    if (MCAssembler *Asm = OutStreamer->getAssemblerPtr())
+      Asm->setBuildVersion(static_cast<MachO::PlatformType>(0), Major, Minor,
+                           Bound, VersionTuple(Major, Minor, 0, Bound));
+    OutStreamer->setUseAssemblerInfoForParsing(FlagToRestore);
   }
 }
 
@@ -507,6 +524,13 @@ void SPIRVAsmPrinter::outputAnnotations(const Module &M) {
         report_fatal_error("Unsupported value in llvm.global.annotations");
       Function *Func = cast<Function>(AnnotatedVar);
       Register Reg = MAI->getFuncReg(Func);
+      if (!Reg.isValid()) {
+        std::string DiagMsg;
+        raw_string_ostream OS(DiagMsg);
+        AnnotatedVar->print(OS);
+        DiagMsg = "Unknown function in llvm.global.annotations: " + DiagMsg;
+        report_fatal_error(DiagMsg.c_str());
+      }
 
       // The second field contains a pointer to a global annotation string.
       GlobalVariable *GV =
diff --git a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp
index bda9c57e534c3a..42f8397a3023b1 100644
--- a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp
@@ -24,7 +24,7 @@
 
 using namespace llvm;
 SPIRVGlobalRegistry::SPIRVGlobalRegistry(unsigned PointerSize)
-    : PointerSize(PointerSize) {}
+    : PointerSize(PointerSize), Bound(0) {}
 
 SPIRVType *SPIRVGlobalRegistry::assignIntTypeToVReg(unsigned BitWidth,
                                                     Register VReg,
@@ -896,6 +896,15 @@ bool SPIRVGlobalRegistry::isScalarOrVectorSigned(const SPIRVType *Type) const {
   return IntType && IntType->getOperand(2).getImm() != 0;
 }
 
+unsigned SPIRVGlobalRegistry::getPointeeTypeOp(Register PtrReg) {
+  SPIRVType *PtrType = getSPIRVTypeForVReg(PtrReg);
+  SPIRVType *ElemType =
+      PtrType && PtrType->getOpcode() == SPIRV::OpTypePointer
+          ? getSPIRVTypeForVReg(PtrType->getOperand(2).getReg())
+          : nullptr;
+  return ElemType ? ElemType->getOpcode() : 0;
+}
+
 bool SPIRVGlobalRegistry::isBitcastCompatible(const SPIRVType *Type1,
                                               const SPIRVType *Type2) const {
   if (!Type1 || !Type2)
diff --git a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h
index 25d82ebf9bc79b..da480b22a525f2 100644
--- a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h
+++ b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h
@@ -56,6 +56,9 @@ class SPIRVGlobalRegistry {
   // Number of bits pointers and size_t integers require.
   const unsigned PointerSize;
 
+  // Holds the maximum ID we have in the module.
+  unsigned Bound;
+
   // Add a new OpTypeXXX instruction without checking for duplicates.
   SPIRVType *createSPIRVType(const Type *Type, MachineIRBuilder &MIRBuilder,
                              SPIRV::AccessQualifier::AccessQualifier AQ =
@@ -108,6 +111,9 @@ class SPIRVGlobalRegistry {
     DT.buildDepsGraph(Graph, MMI);
   }
 
+  void setBound(unsigned V) { Bound = V; }
+  unsigned getBound() { return Bound; }
+
   // Map a machine operand that represents a use of a function via function
   // pointer to a machine operand that represents the function definition.
   // Return either the register or invalid value, because we have no context for
@@ -166,6 +172,9 @@ class SPIRVGlobalRegistry {
     return Res->second;
   }
 
+  // Return a pointee's type op code, or 0 otherwise.
+  unsigned getPointeeTypeOp(Register PtrReg);
+
   // Either generate a new OpTypeXXX instruction or return an existing one
   // corresponding to the given string containing the name of the builtin type.
   // Return nullptr if unable to recognize SPIRV type name from `TypeStr`.
diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
index fd19b7412c4c9c..0fef19c2d53419 100644
--- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
@@ -1567,7 +1567,8 @@ bool SPIRVInstructionSelector::selectIntrinsic(Register ResVReg,
                                                const SPIRVType *ResType,
                                                MachineInstr &I) const {
   MachineBasicBlock &BB = *I.getParent();
-  switch (cast<GIntrinsic>(I).getIntrinsicID()) {
+  Intrinsic::ID IID = cast<GIntrinsic>(I).getIntrinsicID();
+  switch (IID) {
   case Intrinsic::spv_load:
     return selectLoad(ResVReg, ResType, I);
   case Intrinsic::spv_store:
@@ -1661,8 +1662,25 @@ bool SPIRVInstructionSelector::selectIntrinsic(Register ResVReg,
     break;
   case Intrinsic::spv_thread_id:
     return selectSpvThreadId(ResVReg, ResType, I);
-  default:
-    llvm_unreachable("Intrinsic selection not implemented");
+  case Intrinsic::spv_lifetime_start:
+  case Intrinsic::spv_lifetime_end: {
+    unsigned Op = IID == Intrinsic::spv_lifetime_start ? SPIRV::OpLifetimeStart
+                                                       : SPIRV::OpLifetimeStop;
+    int64_t Size = I.getOperand(I.getNumExplicitDefs() + 1).getImm();
+    Register PtrReg = I.getOperand(I.getNumExplicitDefs() + 2).getReg();
+    unsigned PonteeOpType = GR.getPointeeTypeOp(PtrReg);
+    bool IsNonvoidPtr = PonteeOpType != 0 && PonteeOpType != SPIRV::OpTypeVoid;
+    if (Size == -1 || IsNonvoidPtr)
+      Size = 0;
+    BuildMI(BB, I, I.getDebugLoc(), TII.get(Op)).addUse(PtrReg).addImm(Size);
+  } break;
+  default: {
+    std::string DiagMsg;
+    raw_string_ostream OS(DiagMsg);
+    I.print(OS);
+    DiagMsg = "Intrinsic selection not implemented: " + DiagMsg;
+    report_fatal_error(DiagMsg.c_str(), false);
+  }
   }
   return true;
 }
diff --git a/llvm/lib/Target/SPIRV/SPIRVMCInstLower.cpp b/llvm/lib/Target/SPIRV/SPIRVMCInstLower.cpp
index 8c6649bf628265..afa550d6dd424e 100644
--- a/llvm/lib/Target/SPIRV/SPIRVMCInstLower.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVMCInstLower.cpp
@@ -34,7 +34,13 @@ void SPIRVMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI,
       llvm_unreachable("unknown operand type");
     case MachineOperand::MO_GlobalAddress: {
       Register FuncReg = MAI->getFuncReg(dyn_cast<Function>(MO.getGlobal()));
-      assert(FuncReg.isValid() && "Cannot find function Id");
+      if (!FuncReg.isValid()) {
+        std::string DiagMsg;
+        raw_string_ostream OS(DiagMsg);
+        MI->print(OS);
+        DiagMsg = "Unknown function in:" + DiagMsg;
+        report_fatal_error(DiagMsg.c_str());
+      }
       MCOp = MCOperand::createReg(FuncReg);
       break;
     }
diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
index 2b4cb5ccc7b1eb..00d0cbd763736d 100644
--- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
@@ -1309,5 +1309,8 @@ bool SPIRVModuleAnalysis::runOnModule(Module &M) {
   if (MAI.MS[SPIRV::MB_EntryPoints].empty())
     MAI.Reqs.addCapability(SPIRV::Capability::Linkage);
 
+  // Set maximum ID used.
+  GR->setBound(MAI.MaxID);
+
   return false;
 }
diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.h b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.h
index 708384fc55f525..6e86eed30c5dc1 100644
--- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.h
+++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.h
@@ -163,8 +163,8 @@ struct ModuleAnalysisInfo {
   Register getFuncReg(const Function *F) {
     assert(F && "Function is null");
     auto FuncPtrRegPair = FuncMap.find(F);
-    assert(FuncPtrRegPair != FuncMap.end() && "Cannot find function ID");
-    return FuncPtrRegPair->second;
+    return FuncPtrRegPair == FuncMap.end() ? Register(0)
+                                           : FuncPtrRegPair->second;
   }
   Register getExtInstSetReg(unsigned SetNum) { return ExtInstSetMap[SetNum]; }
   InstrList &getMSInstrs(unsigned MSType) { return MS[MSType]; }
diff --git a/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp b/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp
index c376497469ce33..a8a0577f60564c 100644
--- a/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp
@@ -263,6 +263,21 @@ static void lowerExpectAssume(IntrinsicInst *II) {
   return;
 }
 
+static bool toSpvOverloadedIntrinsic(IntrinsicInst *II, Intrinsic::ID NewID,
+                                     ArrayRef<unsigned> OpNos) {
+  Function *F = nullptr;
+  if (OpNos.empty()) {
+    F = Intrinsic::getDeclaration(II->getModule(), NewID);
+  } else {
+    SmallVector<Type *, 4> Tys;
+    for (unsigned OpNo : OpNos)
+      Tys.push_back(II->getOperand(OpNo)->getType());
+    F = Intrinsic::getDeclaration(II->getModule(), NewID, Tys);
+  }
+  II->setCalledFunction(F);
+  return true;
+}
+
 static void lowerUMulWithOverflow(IntrinsicInst *UMulIntrinsic) {
   // Get a separate function - otherwise, we'd have to rework the CFG of the
   // current one. Then simply replace the intrinsic uses with a call to the new
@@ -290,22 +305,35 @@ bool SPIRVPrepareFunctions::substituteIntrinsicCalls(Function *F) {
       if (!CF || !CF->isIntrinsic())
         continue;
       auto *II = cast<IntrinsicInst>(Call);
-      if (II->getIntrinsicID() == Intrinsic::memset ||
-          II->getIntrinsicID() == Intrinsic::bswap)
+      switch (II->getIntrinsicID()) {
+      case Intrinsic::memset:
+      case Intrinsic::bswap:
         Changed |= lowerIntrinsicToFunction(II);
-      else if (II->getIntrinsicID() == Intrinsic::fshl ||
-               II->getIntrinsicID() == Intrinsic::fshr) {
+        break;
+      case Intrinsic::fshl:
+      case Intrinsic::fshr:
         lowerFunnelShifts(II);
         Changed = true;
-      } else if (II->getIntrinsicID() == Intrinsic::umul_with_overflow) {
+        break;
+      case Intrinsic::umul_with_overflow:
         lowerUMulWithOverflow(II);
         Changed = true;
-      } else if (II->getIntrinsicID() == Intrinsic::assume ||
-                 II->getIntrinsicID() == Intrinsic::expect) {
+        break;
+      case Intrinsic::assume:
+      case Intrinsic::expect: {
         const SPIRVSubtarget &STI = TM.getSubtarget<SPIRVSubtarget>(*F);
         if (STI.canUseExtension(SPIRV::Extension::SPV_KHR_expect_assume))
           lowerExpectAssume(II);
         Changed = true;
+      } break;
+      case Intrinsic::lifetime_start:
+        Changed |= toSpvOverloadedIntrinsic(
+            II, Intrinsic::SPVIntrinsics::spv_lifetime_start, {1});
+        break;
+      case Intrinsic::lifetime_end:
+        Changed |= toSpvOverloadedIntrinsic(
+            II, Intrinsic::SPVIntrinsics::spv_lifetime_end, {1});
+        break;
       }
     }
   }
diff --git a/llvm/lib/Target/SPIRV/SPIRVSubtarget.h b/llvm/lib/Target/SPIRV/SPIRVSubtarget.h
index 62524ebfc9bf8c..3b486226a93931 100644
--- a/llvm/lib/Target/SPIRV/SPIRVSubtarget.h
+++ b/llvm/lib/Target/SPIRV/SPIRVSubtarget.h
@@ -71,6 +71,7 @@ class SPIRVSubtarget : public SPIRVGenSubtargetInfo {
   // The definition of this function is auto generated by tblgen.
   void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS);
   unsigned getPointerSize() const { return PointerSize; }
+  unsigned getBound() const { return GR->getBound(); }
   bool canDirectlyComparePointers() const;
   // TODO: this environment is not implemented in Triple, we need to decide
   // how to standardize its support. For now, let's assume SPIR-V with physical
diff --git a/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp b/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
index 815eca1240d827..deaf3dcaeb92a4 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
@@ -344,6 +344,9 @@ class SystemZDAGToDAGISel : public SelectionDAGISel {
   // requirements for a PC-relative access.
   bool storeLoadIsAligned(SDNode *N) const;
 
+  // Return the load extension type of a load or atomic load.
+  ISD::LoadExtType getLoadExtType(SDNode *N) const;
+
   // Try to expand a boolean SELECT_CCMASK using an IPM sequence.
   SDValue expandSelectBoolean(SDNode *Node);
 
@@ -1507,15 +1510,17 @@ bool SystemZDAGToDAGISel::storeLoadCanUseBlockBinary(SDNode *N,
 
 bool SystemZDAGToDAGISel::storeLoadIsAligned(SDNode *N) const {
 
-  auto *MemAccess = cast<LSBaseSDNode>(N);
+  auto *MemAccess = cast<MemSDNode>(N);
+  auto *LdSt = dyn_cast<LSBaseSDNode>(MemAccess);
   TypeSize StoreSize = MemAccess->getMemoryVT().getStoreSize();
   SDValue BasePtr = MemAccess->getBasePtr();
   MachineMemOperand *MMO = MemAccess->getMemOperand();
   assert(MMO && "Expected a memory operand.");
 
   // The memory access must have a proper alignment and no index register.
+  // Only load and store nodes have the offset operand (atomic loads do not).
   if (MemAccess->getAlign().value() < StoreSize ||
-      !MemAccess->getOffset().isUndef())
+      (LdSt && !LdSt->getOffset().isUndef()))
     return false;
 
   // The MMO must not have an unaligned offset.
@@ -1545,6 +1550,17 @@ bool SystemZDAGToDAGISel::storeLoadIsAligned(SDNode *N) const {
   return true;
 }
 
+ISD::LoadExtType SystemZDAGToDAGISel::getLoadExtType(SDNode *N) const {
+  ISD::LoadExtType ETy;
+  if (auto *L = dyn_cast<LoadSDNode>(N))
+    ETy = L->getExtensionType();
+  else if (auto *AL = dyn_cast<AtomicSDNode>(N))
+    ETy = AL->getExtensionType();
+  else
+    llvm_unreachable("Unkown load node type.");
+  return ETy;
+}
+
 void SystemZDAGToDAGISel::Select(SDNode *Node) {
   // If we have a custom node, we already have selected!
   if (Node->isMachineOpcode()) {
@@ -1742,6 +1758,26 @@ void SystemZDAGToDAGISel::Select(SDNode *Node) {
     }
     break;
   }
+
+  case ISD::ATOMIC_STORE: {
+    auto *AtomOp = cast<AtomicSDNode>(Node);
+    // Replace the atomic_store with a regular store and select it. This is
+    // ok since we know all store instructions <= 8 bytes are atomic, and the
+    // 16 byte case is already handled during lowering.
+    StoreSDNode *St = cast<StoreSDNode>(CurDAG->getTruncStore(
+         AtomOp->getChain(), SDLoc(AtomOp), AtomOp->getVal(),
+         AtomOp->getBasePtr(), AtomOp->getMemoryVT(), AtomOp->getMemOperand()));
+    assert(St->getMemOperand()->isAtomic() && "Broken MMO.");
+    SDNode *Chain = St;
+    // We have to enforce sequential consistency by performing a
+    // serialization operation after the store.
+    if (AtomOp->getSuccessOrdering() == AtomicOrdering::SequentiallyConsistent)
+      Chain = CurDAG->getMachineNode(SystemZ::Serialize, SDLoc(AtomOp),
+                                     MVT::Other, SDValue(Chain, 0));
+    ReplaceNode(Node, Chain);
+    SelectCode(St);
+    return;
+  }
   }
 
   SelectCode(Node);
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
index 3b85a6ac0371ed..bc60d14ee700fe 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -194,11 +194,6 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
       setOperationAction(ISD::UADDO_CARRY, VT, Custom);
       setOperationAction(ISD::USUBO_CARRY, VT, Custom);
 
-      // Lower ATOMIC_LOAD and ATOMIC_STORE into normal volatile loads and
-      // stores, putting a serialization instruction after the stores.
-      setOperationAction(ISD::ATOMIC_LOAD,  VT, Custom);
-      setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
-
       // Lower ATOMIC_LOAD_SUB into ATOMIC_LOAD_ADD if LAA and LAAG are
       // available, or if the operand is constant.
       setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
@@ -920,6 +915,22 @@ bool SystemZTargetLowering::hasInlineStackProbe(const MachineFunction &MF) const
   return false;
 }
 
+TargetLowering::AtomicExpansionKind
+SystemZTargetLowering::shouldCastAtomicLoadInIR(LoadInst *LI) const {
+  // Lower fp128 the same way as i128.
+  if (LI->getType()->isFP128Ty())
+    return AtomicExpansionKind::CastToInteger;
+  return AtomicExpansionKind::None;
+}
+
+TargetLowering::AtomicExpansionKind
+SystemZTargetLowering::shouldCastAtomicStoreInIR(StoreInst *SI) const {
+  // Lower fp128 the same way as i128.
+  if (SI->getValueOperand()->getType()->isFP128Ty())
+    return AtomicExpansionKind::CastToInteger;
+  return AtomicExpansionKind::None;
+}
+
 TargetLowering::AtomicExpansionKind
 SystemZTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
   // Don't expand subword operations as they require special treatment.
@@ -4252,6 +4263,7 @@ SDValue SystemZTargetLowering::lowerXALUO(SDValue Op,
   if (N->getValueType(0) == MVT::i128) {
     unsigned BaseOp = 0;
     unsigned FlagOp = 0;
+    bool IsBorrow = false;
     switch (Op.getOpcode()) {
     default: llvm_unreachable("Unknown instruction!");
     case ISD::UADDO:
@@ -4261,6 +4273,7 @@ SDValue SystemZTargetLowering::lowerXALUO(SDValue Op,
     case ISD::USUBO:
       BaseOp = ISD::SUB;
       FlagOp = SystemZISD::VSCBI;
+      IsBorrow = true;
       break;
     }
     SDValue Result = DAG.getNode(BaseOp, DL, MVT::i128, LHS, RHS);
@@ -4268,6 +4281,9 @@ SDValue SystemZTargetLowering::lowerXALUO(SDValue Op,
     Flag = DAG.getNode(ISD::AssertZext, DL, MVT::i128, Flag,
                        DAG.getValueType(MVT::i1));
     Flag = DAG.getZExtOrTrunc(Flag, DL, N->getValueType(1));
+    if (IsBorrow)
+      Flag = DAG.getNode(ISD::XOR, DL, Flag.getValueType(),
+                         Flag, DAG.getConstant(1, DL, Flag.getValueType()));
     return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Result, Flag);
   }
 
@@ -4340,6 +4356,7 @@ SDValue SystemZTargetLowering::lowerUADDSUBO_CARRY(SDValue Op,
   if (VT == MVT::i128) {
     unsigned BaseOp = 0;
     unsigned FlagOp = 0;
+    bool IsBorrow = false;
     switch (Op.getOpcode()) {
     default: llvm_unreachable("Unknown instruction!");
     case ISD::UADDO_CARRY:
@@ -4349,14 +4366,21 @@ SDValue SystemZTargetLowering::lowerUADDSUBO_CARRY(SDValue Op,
     case ISD::USUBO_CARRY:
       BaseOp = SystemZISD::VSBI;
       FlagOp = SystemZISD::VSBCBI;
+      IsBorrow = true;
       break;
     }
+    if (IsBorrow)
+      Carry = DAG.getNode(ISD::XOR, DL, Carry.getValueType(),
+                          Carry, DAG.getConstant(1, DL, Carry.getValueType()));
     Carry = DAG.getZExtOrTrunc(Carry, DL, MVT::i128);
     SDValue Result = DAG.getNode(BaseOp, DL, MVT::i128, LHS, RHS, Carry);
     SDValue Flag = DAG.getNode(FlagOp, DL, MVT::i128, LHS, RHS, Carry);
     Flag = DAG.getNode(ISD::AssertZext, DL, MVT::i128, Flag,
                        DAG.getValueType(MVT::i1));
     Flag = DAG.getZExtOrTrunc(Flag, DL, N->getValueType(1));
+    if (IsBorrow)
+      Flag = DAG.getNode(ISD::XOR, DL, Flag.getValueType(),
+                         Flag, DAG.getConstant(1, DL, Flag.getValueType()));
     return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Result, Flag);
   }
 
@@ -4503,40 +4527,14 @@ SDValue SystemZTargetLowering::lowerATOMIC_FENCE(SDValue Op,
   return DAG.getNode(ISD::MEMBARRIER, DL, MVT::Other, Op.getOperand(0));
 }
 
-// Op is an atomic load.  Lower it into a normal volatile load.
-SDValue SystemZTargetLowering::lowerATOMIC_LOAD(SDValue Op,
-                                                SelectionDAG &DAG) const {
-  auto *Node = cast<AtomicSDNode>(Op.getNode());
-  if (Node->getMemoryVT() == MVT::i128) {
-    // Use same code to handle both legal and non-legal i128 types.
-    SmallVector<SDValue, 2> Results;
-    LowerOperationWrapper(Node, Results, DAG);
-    return DAG.getMergeValues(Results, SDLoc(Op));
-  }
-  return DAG.getExtLoad(ISD::EXTLOAD, SDLoc(Op), Op.getValueType(),
-                        Node->getChain(), Node->getBasePtr(),
-                        Node->getMemoryVT(), Node->getMemOperand());
-}
-
-// Op is an atomic store.  Lower it into a normal volatile store.
-SDValue SystemZTargetLowering::lowerATOMIC_STORE(SDValue Op,
-                                                 SelectionDAG &DAG) const {
+SDValue SystemZTargetLowering::lowerATOMIC_LDST_I128(SDValue Op,
+                                                     SelectionDAG &DAG) const {
   auto *Node = cast<AtomicSDNode>(Op.getNode());
-  if (Node->getMemoryVT() == MVT::i128) {
-    // Use same code to handle both legal and non-legal i128 types.
-    SmallVector<SDValue, 1> Results;
-    LowerOperationWrapper(Node, Results, DAG);
-    return DAG.getMergeValues(Results, SDLoc(Op));
-  }
-  SDValue Chain = DAG.getTruncStore(Node->getChain(), SDLoc(Op), Node->getVal(),
-                                    Node->getBasePtr(), Node->getMemoryVT(),
-                                    Node->getMemOperand());
-  // We have to enforce sequential consistency by performing a
-  // serialization operation after the store.
-  if (Node->getSuccessOrdering() == AtomicOrdering::SequentiallyConsistent)
-    Chain = SDValue(DAG.getMachineNode(SystemZ::Serialize, SDLoc(Op),
-                                       MVT::Other, Chain), 0);
-  return Chain;
+  assert(Node->getMemoryVT() == MVT::i128 && "Only custom lowering i128.");
+  // Use same code to handle both legal and non-legal i128 types.
+  SmallVector<SDValue, 2> Results;
+  LowerOperationWrapper(Node, Results, DAG);
+  return DAG.getMergeValues(Results, SDLoc(Op));
 }
 
 // Prepare for a Compare And Swap for a subword operation. This needs to be
@@ -5662,6 +5660,9 @@ static SDValue tryBuildVectorShuffle(SelectionDAG &DAG,
 bool SystemZTargetLowering::isVectorElementLoad(SDValue Op) const {
   if (Op.getOpcode() == ISD::LOAD && cast<LoadSDNode>(Op)->isUnindexed())
     return true;
+  if (auto *AL = dyn_cast<AtomicSDNode>(Op))
+    if (AL->getOpcode() == ISD::ATOMIC_LOAD)
+      return true;
   if (Subtarget.hasVectorEnhancements2() && Op.getOpcode() == SystemZISD::LRV)
     return true;
   return false;
@@ -6138,9 +6139,8 @@ SDValue SystemZTargetLowering::LowerOperation(SDValue Op,
   case ISD::ATOMIC_SWAP:
     return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_SWAPW);
   case ISD::ATOMIC_STORE:
-    return lowerATOMIC_STORE(Op, DAG);
   case ISD::ATOMIC_LOAD:
-    return lowerATOMIC_LOAD(Op, DAG);
+    return lowerATOMIC_LDST_I128(Op, DAG);
   case ISD::ATOMIC_LOAD_ADD:
     return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_LOADW_ADD);
   case ISD::ATOMIC_LOAD_SUB:
@@ -6587,6 +6587,27 @@ SDValue SystemZTargetLowering::combineTruncateExtract(
   return SDValue();
 }
 
+// Replace ALoad with a new ATOMIC_LOAD with a result that is extended to VT
+// per ETy.
+static SDValue extendAtomicLoad(AtomicSDNode *ALoad, EVT VT, SelectionDAG &DAG,
+                                ISD::LoadExtType ETy) {
+  if (VT.getSizeInBits() > 64)
+    return SDValue();
+  EVT OrigVT = ALoad->getValueType(0);
+  assert(OrigVT.getSizeInBits() < VT.getSizeInBits() && "VT should be wider.");
+  EVT MemoryVT = ALoad->getMemoryVT();
+  auto *NewALoad = dyn_cast<AtomicSDNode>(DAG.getAtomic(
+      ISD::ATOMIC_LOAD, SDLoc(ALoad), MemoryVT, VT, ALoad->getChain(),
+      ALoad->getBasePtr(), ALoad->getMemOperand()));
+  NewALoad->setExtensionType(ETy);
+  DAG.ReplaceAllUsesOfValueWith(
+      SDValue(ALoad, 0),
+      DAG.getNode(ISD::TRUNCATE, SDLoc(ALoad), OrigVT, SDValue(NewALoad, 0)));
+  // Update the chain uses.
+  DAG.ReplaceAllUsesOfValueWith(SDValue(ALoad, 1), SDValue(NewALoad, 1));
+  return SDValue(NewALoad, 0);
+}
+
 SDValue SystemZTargetLowering::combineZERO_EXTEND(
     SDNode *N, DAGCombinerInfo &DCI) const {
   // Convert (zext (select_ccmask C1, C2)) into (select_ccmask C1', C2')
@@ -6611,6 +6632,34 @@ SDValue SystemZTargetLowering::combineZERO_EXTEND(
       return NewSelect;
     }
   }
+  // Convert (zext (xor (trunc X), C)) into (xor (trunc X), C') if the size
+  // of the result is smaller than the size of X and all the truncated bits
+  // of X are already zero.
+  if (N0.getOpcode() == ISD::XOR &&
+      N0.hasOneUse() && N0.getOperand(0).hasOneUse() &&
+      N0.getOperand(0).getOpcode() == ISD::TRUNCATE &&
+      N0.getOperand(1).getOpcode() == ISD::Constant) {
+    SDValue X = N0.getOperand(0).getOperand(0);
+    if (VT.isScalarInteger() && VT.getSizeInBits() < X.getValueSizeInBits()) {
+      KnownBits Known = DAG.computeKnownBits(X);
+      APInt TruncatedBits = APInt::getBitsSet(X.getValueSizeInBits(),
+                                              N0.getValueSizeInBits(),
+                                              VT.getSizeInBits());
+      if (TruncatedBits.isSubsetOf(Known.Zero)) {
+        X = DAG.getNode(ISD::TRUNCATE, SDLoc(X), VT, X);
+        APInt Mask = N0.getConstantOperandAPInt(1).zext(VT.getSizeInBits());
+        return DAG.getNode(ISD::XOR, SDLoc(N0), VT,
+                           X, DAG.getConstant(Mask, SDLoc(N0), VT));
+      }
+    }
+  }
+
+  // Fold into ATOMIC_LOAD unless it is already sign extending.
+  if (auto *ALoad = dyn_cast<AtomicSDNode>(N0))
+    if (ALoad->getOpcode() == ISD::ATOMIC_LOAD &&
+        ALoad->getExtensionType() != ISD::SEXTLOAD)
+      return extendAtomicLoad(ALoad, VT, DAG, ISD::ZEXTLOAD);
+
   return SDValue();
 }
 
@@ -6662,6 +6711,13 @@ SDValue SystemZTargetLowering::combineSIGN_EXTEND(
       }
     }
   }
+
+  // Fold into ATOMIC_LOAD unless it is already zero extending.
+  if (auto *ALoad = dyn_cast<AtomicSDNode>(N0))
+    if (ALoad->getOpcode() == ISD::ATOMIC_LOAD &&
+        ALoad->getExtensionType() != ISD::ZEXTLOAD)
+      return extendAtomicLoad(ALoad, VT, DAG, ISD::SEXTLOAD);
+
   return SDValue();
 }
 
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.h b/llvm/lib/Target/SystemZ/SystemZISelLowering.h
index baf4ba41654879..406a13b9281ca8 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.h
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.h
@@ -474,6 +474,8 @@ class SystemZTargetLowering : public TargetLowering {
     return VT != MVT::f64;
   }
   bool hasInlineStackProbe(const MachineFunction &MF) const override;
+  AtomicExpansionKind shouldCastAtomicLoadInIR(LoadInst *LI) const override;
+  AtomicExpansionKind shouldCastAtomicStoreInIR(StoreInst *SI) const override;
   AtomicExpansionKind
   shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const override;
   bool isLegalICmpImmediate(int64_t Imm) const override;
@@ -692,8 +694,7 @@ class SystemZTargetLowering : public TargetLowering {
   SDValue lowerOR(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerCTPOP(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG) const;
-  SDValue lowerATOMIC_LOAD(SDValue Op, SelectionDAG &DAG) const;
-  SDValue lowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerATOMIC_LDST_I128(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerATOMIC_LOAD_OP(SDValue Op, SelectionDAG &DAG,
                               unsigned Opcode) const;
   SDValue lowerATOMIC_LOAD_SUB(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrFP.td b/llvm/lib/Target/SystemZ/SystemZInstrFP.td
index 6e67425c1e788b..f4b5aeaebef923 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrFP.td
+++ b/llvm/lib/Target/SystemZ/SystemZInstrFP.td
@@ -129,8 +129,8 @@ defm LoadStoreF128 : MVCLoadStore<load, f128, MVCImm, 15>;
 //===----------------------------------------------------------------------===//
 
 let canFoldAsLoad = 1, SimpleBDXLoad = 1, mayLoad = 1 in {
-  defm LE : UnaryRXPair<"le", 0x78, 0xED64, load, FP32, 4>;
-  defm LD : UnaryRXPair<"ld", 0x68, 0xED65, load, FP64, 8>;
+  defm LE : UnaryRXPair<"le", 0x78, 0xED64, z_load, FP32, 4>;
+  defm LD : UnaryRXPair<"ld", 0x68, 0xED65, z_load, FP64, 8>;
 
   // For z13 we prefer LDE over LE to avoid partial register dependencies.
   let isCodeGenOnly = 1 in
@@ -200,14 +200,14 @@ let Predicates = [FeatureNoVectorEnhancements1] in {
 
 // Extend memory floating-point values to wider representations.
 let Uses = [FPC], mayRaiseFPException = 1 in {
-  def LDEB : UnaryRXE<"ldeb", 0xED04, any_extloadf32, FP64, 4>;
+  def LDEB : UnaryRXE<"ldeb", 0xED04, z_any_extloadf32, FP64, 4>;
   def LXEB : UnaryRXE<"lxeb", 0xED06, null_frag, FP128, 4>;
   def LXDB : UnaryRXE<"lxdb", 0xED05, null_frag, FP128, 8>;
 }
 let Predicates = [FeatureNoVectorEnhancements1] in {
-  def : Pat<(f128 (any_extloadf32 bdxaddr12only:$src)),
+  def : Pat<(f128 (z_any_extloadf32 bdxaddr12only:$src)),
             (LXEB bdxaddr12only:$src)>;
-  def : Pat<(f128 (any_extloadf64 bdxaddr12only:$src)),
+  def : Pat<(f128 (z_any_extloadf64 bdxaddr12only:$src)),
             (LXDB bdxaddr12only:$src)>;
 }
 
@@ -430,8 +430,8 @@ let Uses = [FPC], mayRaiseFPException = 1,
     def ADBR : BinaryRRE<"adbr", 0xB31A, any_fadd, FP64,  FP64>;
     def AXBR : BinaryRRE<"axbr", 0xB34A, any_fadd, FP128, FP128>;
   }
-  defm AEB : BinaryRXEAndPseudo<"aeb", 0xED0A, any_fadd, FP32, load, 4>;
-  defm ADB : BinaryRXEAndPseudo<"adb", 0xED1A, any_fadd, FP64, load, 8>;
+  defm AEB : BinaryRXEAndPseudo<"aeb", 0xED0A, any_fadd, FP32, z_load, 4>;
+  defm ADB : BinaryRXEAndPseudo<"adb", 0xED1A, any_fadd, FP64, z_load, 8>;
 }
 
 // Subtraction.
@@ -441,8 +441,8 @@ let Uses = [FPC], mayRaiseFPException = 1,
   def SDBR : BinaryRRE<"sdbr", 0xB31B, any_fsub, FP64,  FP64>;
   def SXBR : BinaryRRE<"sxbr", 0xB34B, any_fsub, FP128, FP128>;
 
-  defm SEB : BinaryRXEAndPseudo<"seb",  0xED0B, any_fsub, FP32, load, 4>;
-  defm SDB : BinaryRXEAndPseudo<"sdb",  0xED1B, any_fsub, FP64, load, 8>;
+  defm SEB : BinaryRXEAndPseudo<"seb",  0xED0B, any_fsub, FP32, z_load, 4>;
+  defm SDB : BinaryRXEAndPseudo<"sdb",  0xED1B, any_fsub, FP64, z_load, 8>;
 }
 
 // Multiplication.
@@ -452,8 +452,8 @@ let Uses = [FPC], mayRaiseFPException = 1 in {
     def MDBR  : BinaryRRE<"mdbr",  0xB31C, any_fmul, FP64,  FP64>;
     def MXBR  : BinaryRRE<"mxbr",  0xB34C, any_fmul, FP128, FP128>;
   }
-  defm MEEB : BinaryRXEAndPseudo<"meeb", 0xED17, any_fmul, FP32, load, 4>;
-  defm MDB  : BinaryRXEAndPseudo<"mdb",  0xED1C, any_fmul, FP64, load, 8>;
+  defm MEEB : BinaryRXEAndPseudo<"meeb", 0xED17, any_fmul, FP32, z_load, 4>;
+  defm MDB  : BinaryRXEAndPseudo<"mdb",  0xED1C, any_fmul, FP64, z_load, 8>;
 }
 
 // f64 multiplication of two FP32 registers.
@@ -466,7 +466,7 @@ def : Pat<(any_fmul (f64 (any_fpextend FP32:$src1)),
 
 // f64 multiplication of an FP32 register and an f32 memory.
 let Uses = [FPC], mayRaiseFPException = 1 in
-  def MDEB : BinaryRXE<"mdeb", 0xED0C, null_frag, FP64, load, 4>;
+  def MDEB : BinaryRXE<"mdeb", 0xED0C, null_frag, FP64, z_load, 4>;
 def : Pat<(any_fmul (f64 (any_fpextend FP32:$src1)),
                     (f64 (any_extloadf32 bdxaddr12only:$addr))),
           (MDEB (INSERT_SUBREG (f64 (IMPLICIT_DEF)), FP32:$src1, subreg_h32),
@@ -483,7 +483,7 @@ let Predicates = [FeatureNoVectorEnhancements1] in
 
 // f128 multiplication of an FP64 register and an f64 memory.
 let Uses = [FPC], mayRaiseFPException = 1 in
-  def MXDB : BinaryRXE<"mxdb", 0xED07, null_frag, FP128, load, 8>;
+  def MXDB : BinaryRXE<"mxdb", 0xED07, null_frag, FP128, z_load, 8>;
 let Predicates = [FeatureNoVectorEnhancements1] in
   def : Pat<(any_fmul (f128 (any_fpextend FP64:$src1)),
                       (f128 (any_extloadf64 bdxaddr12only:$addr))),
@@ -495,8 +495,8 @@ let Uses = [FPC], mayRaiseFPException = 1 in {
   def MAEBR : TernaryRRD<"maebr", 0xB30E, z_any_fma, FP32, FP32>;
   def MADBR : TernaryRRD<"madbr", 0xB31E, z_any_fma, FP64, FP64>;
 
-  defm MAEB : TernaryRXFAndPseudo<"maeb", 0xED0E, z_any_fma, FP32, FP32, load, 4>;
-  defm MADB : TernaryRXFAndPseudo<"madb", 0xED1E, z_any_fma, FP64, FP64, load, 8>;
+  defm MAEB : TernaryRXFAndPseudo<"maeb", 0xED0E, z_any_fma, FP32, FP32, z_load, 4>;
+  defm MADB : TernaryRXFAndPseudo<"madb", 0xED1E, z_any_fma, FP64, FP64, z_load, 8>;
 }
 
 // Fused multiply-subtract.
@@ -504,8 +504,8 @@ let Uses = [FPC], mayRaiseFPException = 1 in {
   def MSEBR : TernaryRRD<"msebr", 0xB30F, z_any_fms, FP32, FP32>;
   def MSDBR : TernaryRRD<"msdbr", 0xB31F, z_any_fms, FP64, FP64>;
 
-  defm MSEB : TernaryRXFAndPseudo<"mseb", 0xED0F, z_any_fms, FP32, FP32, load, 4>;
-  defm MSDB : TernaryRXFAndPseudo<"msdb", 0xED1F, z_any_fms, FP64, FP64, load, 8>;
+  defm MSEB : TernaryRXFAndPseudo<"mseb", 0xED0F, z_any_fms, FP32, FP32, z_load, 4>;
+  defm MSDB : TernaryRXFAndPseudo<"msdb", 0xED1F, z_any_fms, FP64, FP64, z_load, 8>;
 }
 
 // Division.
@@ -514,8 +514,8 @@ let Uses = [FPC], mayRaiseFPException = 1 in {
   def DDBR : BinaryRRE<"ddbr", 0xB31D, any_fdiv, FP64,  FP64>;
   def DXBR : BinaryRRE<"dxbr", 0xB34D, any_fdiv, FP128, FP128>;
 
-  defm DEB : BinaryRXEAndPseudo<"deb", 0xED0D, any_fdiv, FP32, load, 4>;
-  defm DDB : BinaryRXEAndPseudo<"ddb", 0xED1D, any_fdiv, FP64, load, 8>;
+  defm DEB : BinaryRXEAndPseudo<"deb", 0xED0D, any_fdiv, FP32, z_load, 4>;
+  defm DDB : BinaryRXEAndPseudo<"ddb", 0xED1D, any_fdiv, FP64, z_load, 8>;
 }
 
 // Divide to integer.
@@ -533,15 +533,15 @@ let Uses = [FPC], mayRaiseFPException = 1, Defs = [CC], CCValues = 0xF in {
   def CDBR : CompareRRE<"cdbr", 0xB319, z_any_fcmp, FP64,  FP64>;
   def CXBR : CompareRRE<"cxbr", 0xB349, z_any_fcmp, FP128, FP128>;
 
-  def CEB : CompareRXE<"ceb", 0xED09, z_any_fcmp, FP32, load, 4>;
-  def CDB : CompareRXE<"cdb", 0xED19, z_any_fcmp, FP64, load, 8>;
+  def CEB : CompareRXE<"ceb", 0xED09, z_any_fcmp, FP32, z_load, 4>;
+  def CDB : CompareRXE<"cdb", 0xED19, z_any_fcmp, FP64, z_load, 8>;
 
   def KEBR : CompareRRE<"kebr", 0xB308, z_strict_fcmps, FP32,  FP32>;
   def KDBR : CompareRRE<"kdbr", 0xB318, z_strict_fcmps, FP64,  FP64>;
   def KXBR : CompareRRE<"kxbr", 0xB348, z_strict_fcmps, FP128, FP128>;
 
-  def KEB : CompareRXE<"keb", 0xED08, z_strict_fcmps, FP32, load, 4>;
-  def KDB : CompareRXE<"kdb", 0xED18, z_strict_fcmps, FP64, load, 8>;
+  def KEB : CompareRXE<"keb", 0xED08, z_strict_fcmps, FP32, z_load, 4>;
+  def KDB : CompareRXE<"kdb", 0xED18, z_strict_fcmps, FP64, z_load, 8>;
 }
 
 // Test Data Class.
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrFormats.td b/llvm/lib/Target/SystemZ/SystemZInstrFormats.td
index bb9fa0fc33ffa0..3dba33b66bf4f4 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrFormats.td
+++ b/llvm/lib/Target/SystemZ/SystemZInstrFormats.td
@@ -3777,7 +3777,7 @@ class BinarySI<string mnemonic, bits<8> opcode, SDPatternOperator operator,
                Operand imm, AddressingMode mode = bdaddr12only>
   : InstSI<opcode, (outs), (ins (mode $B1, $D1):$BD1, imm:$I2),
            mnemonic#"\t$BD1, $I2",
-           [(store (operator (load mode:$BD1), imm:$I2), mode:$BD1)]> {
+           [(store (operator (z_load mode:$BD1), imm:$I2), mode:$BD1)]> {
   let mayLoad = 1;
   let mayStore = 1;
 }
@@ -3786,7 +3786,7 @@ class BinarySIY<string mnemonic, bits<16> opcode, SDPatternOperator operator,
                 Operand imm, AddressingMode mode = bdaddr20only>
   : InstSIY<opcode, (outs), (ins (mode $B1, $D1):$BD1, imm:$I2),
             mnemonic#"\t$BD1, $I2",
-            [(store (operator (load mode:$BD1), imm:$I2), mode:$BD1)]> {
+            [(store (operator (z_load mode:$BD1), imm:$I2), mode:$BD1)]> {
   let mayLoad = 1;
   let mayStore = 1;
 }
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrHFP.td b/llvm/lib/Target/SystemZ/SystemZInstrHFP.td
index 2e3c9932d62142..d2e05b63c6c630 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrHFP.td
+++ b/llvm/lib/Target/SystemZ/SystemZInstrHFP.td
@@ -134,8 +134,8 @@ let Defs = [CC] in {
     def ADR : BinaryRR<"adr", 0x2A, null_frag, FP64,  FP64>;
     def AXR : BinaryRR<"axr", 0x36, null_frag, FP128, FP128>;
   }
-  def AE : BinaryRX<"ae", 0x7A, null_frag, FP32, load, 4>;
-  def AD : BinaryRX<"ad", 0x6A, null_frag, FP64, load, 8>;
+  def AE : BinaryRX<"ae", 0x7A, null_frag, FP32, z_load, 4>;
+  def AD : BinaryRX<"ad", 0x6A, null_frag, FP64, z_load, 8>;
 }
 
 // Addition (unnormalized).
@@ -144,8 +144,8 @@ let Defs = [CC] in {
     def AUR : BinaryRR<"aur", 0x3E, null_frag, FP32, FP32>;
     def AWR : BinaryRR<"awr", 0x2E, null_frag, FP64, FP64>;
   }
-  def AU : BinaryRX<"au", 0x7E, null_frag, FP32, load, 4>;
-  def AW : BinaryRX<"aw", 0x6E, null_frag, FP64, load, 8>;
+  def AU : BinaryRX<"au", 0x7E, null_frag, FP32, z_load, 4>;
+  def AW : BinaryRX<"aw", 0x6E, null_frag, FP64, z_load, 8>;
 }
 
 // Subtraction.
@@ -154,8 +154,8 @@ let Defs = [CC] in {
   def SDR : BinaryRR<"sdr", 0x2B, null_frag, FP64,  FP64>;
   def SXR : BinaryRR<"sxr", 0x37, null_frag, FP128, FP128>;
 
-  def SE : BinaryRX<"se", 0x7B, null_frag, FP32, load, 4>;
-  def SD : BinaryRX<"sd", 0x6B, null_frag, FP64, load, 8>;
+  def SE : BinaryRX<"se", 0x7B, null_frag, FP32, z_load, 4>;
+  def SD : BinaryRX<"sd", 0x6B, null_frag, FP64, z_load, 8>;
 }
 
 // Subtraction (unnormalized).
@@ -163,8 +163,8 @@ let Defs = [CC] in {
   def SUR : BinaryRR<"sur", 0x3F, null_frag, FP32, FP32>;
   def SWR : BinaryRR<"swr", 0x2F, null_frag, FP64, FP64>;
 
-  def SU : BinaryRX<"su", 0x7F, null_frag, FP32, load, 4>;
-  def SW : BinaryRX<"sw", 0x6F, null_frag, FP64, load, 8>;
+  def SU : BinaryRX<"su", 0x7F, null_frag, FP32, z_load, 4>;
+  def SW : BinaryRX<"sw", 0x6F, null_frag, FP64, z_load, 8>;
 }
 
 // Multiplication.
@@ -173,55 +173,55 @@ let isCommutable = 1 in {
   def MDR  : BinaryRR <"mdr",  0x2C,   null_frag, FP64,  FP64>;
   def MXR  : BinaryRR <"mxr",  0x26,   null_frag, FP128, FP128>;
 }
-def MEE : BinaryRXE<"mee", 0xED37, null_frag, FP32, load, 4>;
-def MD  : BinaryRX <"md",  0x6C,   null_frag, FP64, load, 8>;
+def MEE : BinaryRXE<"mee", 0xED37, null_frag, FP32, z_load, 4>;
+def MD  : BinaryRX <"md",  0x6C,   null_frag, FP64, z_load, 8>;
 
 // Extending multiplication (f32 x f32 -> f64).
 def MDER : BinaryRR<"mder", 0x3C, null_frag, FP64, FP32>;
-def MDE  : BinaryRX<"mde",  0x7C, null_frag, FP64, load, 4>;
+def MDE  : BinaryRX<"mde",  0x7C, null_frag, FP64, z_load, 4>;
 let isAsmParserOnly = 1 in {
   def MER : BinaryRR<"mer", 0x3C, null_frag, FP64, FP32>;
-  def ME  : BinaryRX<"me",  0x7C, null_frag, FP64, load, 4>;
+  def ME  : BinaryRX<"me",  0x7C, null_frag, FP64, z_load, 4>;
 }
 
 // Extending multiplication (f64 x f64 -> f128).
 def MXDR : BinaryRR<"mxdr", 0x27, null_frag, FP128, FP64>;
-def MXD  : BinaryRX<"mxd",  0x67, null_frag, FP128, load, 8>;
+def MXD  : BinaryRX<"mxd",  0x67, null_frag, FP128, z_load, 8>;
 
 // Fused multiply-add.
 def MAER : TernaryRRD<"maer", 0xB32E, null_frag, FP32, FP32>;
 def MADR : TernaryRRD<"madr", 0xB33E, null_frag, FP64, FP64>;
-def MAE  : TernaryRXF<"mae",  0xED2E, null_frag, FP32, FP32, load, 4>;
-def MAD  : TernaryRXF<"mad",  0xED3E, null_frag, FP64, FP64, load, 8>;
+def MAE  : TernaryRXF<"mae",  0xED2E, null_frag, FP32, FP32, z_load, 4>;
+def MAD  : TernaryRXF<"mad",  0xED3E, null_frag, FP64, FP64, z_load, 8>;
 
 // Fused multiply-subtract.
 def MSER : TernaryRRD<"mser", 0xB32F, null_frag, FP32, FP32>;
 def MSDR : TernaryRRD<"msdr", 0xB33F, null_frag, FP64, FP64>;
-def MSE  : TernaryRXF<"mse",  0xED2F, null_frag, FP32, FP32, load, 4>;
-def MSD  : TernaryRXF<"msd",  0xED3F, null_frag, FP64, FP64, load, 8>;
+def MSE  : TernaryRXF<"mse",  0xED2F, null_frag, FP32, FP32, z_load, 4>;
+def MSD  : TernaryRXF<"msd",  0xED3F, null_frag, FP64, FP64, z_load, 8>;
 
 // Multiplication (unnormalized).
 def MYR  : BinaryRRD<"myr",  0xB33B, null_frag, FP128, FP64>;
 def MYHR : BinaryRRD<"myhr", 0xB33D, null_frag, FP64,  FP64>;
 def MYLR : BinaryRRD<"mylr", 0xB339, null_frag, FP64,  FP64>;
-def MY   : BinaryRXF<"my",   0xED3B, null_frag, FP128, FP64, load, 8>;
-def MYH  : BinaryRXF<"myh",  0xED3D, null_frag, FP64,  FP64, load, 8>;
-def MYL  : BinaryRXF<"myl",  0xED39, null_frag, FP64,  FP64, load, 8>;
+def MY   : BinaryRXF<"my",   0xED3B, null_frag, FP128, FP64, z_load, 8>;
+def MYH  : BinaryRXF<"myh",  0xED3D, null_frag, FP64,  FP64, z_load, 8>;
+def MYL  : BinaryRXF<"myl",  0xED39, null_frag, FP64,  FP64, z_load, 8>;
 
 // Fused multiply-add (unnormalized).
 def MAYR  : TernaryRRD<"mayr",  0xB33A, null_frag, FP128, FP64>;
 def MAYHR : TernaryRRD<"mayhr", 0xB33C, null_frag, FP64,  FP64>;
 def MAYLR : TernaryRRD<"maylr", 0xB338, null_frag, FP64,  FP64>;
-def MAY   : TernaryRXF<"may",   0xED3A, null_frag, FP128, FP64, load, 8>;
-def MAYH  : TernaryRXF<"mayh",  0xED3C, null_frag, FP64,  FP64, load, 8>;
-def MAYL  : TernaryRXF<"mayl",  0xED38, null_frag, FP64,  FP64, load, 8>;
+def MAY   : TernaryRXF<"may",   0xED3A, null_frag, FP128, FP64, z_load, 8>;
+def MAYH  : TernaryRXF<"mayh",  0xED3C, null_frag, FP64,  FP64, z_load, 8>;
+def MAYL  : TernaryRXF<"mayl",  0xED38, null_frag, FP64,  FP64, z_load, 8>;
 
 // Division.
 def DER : BinaryRR <"der", 0x3D,   null_frag, FP32,  FP32>;
 def DDR : BinaryRR <"ddr", 0x2D,   null_frag, FP64,  FP64>;
 def DXR : BinaryRRE<"dxr", 0xB22D, null_frag, FP128, FP128>;
-def DE  : BinaryRX <"de",  0x7D,   null_frag, FP32, load, 4>;
-def DD  : BinaryRX <"dd",  0x6D,   null_frag, FP64, load, 8>;
+def DE  : BinaryRX <"de",  0x7D,   null_frag, FP32, z_load, 4>;
+def DD  : BinaryRX <"dd",  0x6D,   null_frag, FP64, z_load, 8>;
 
 
 //===----------------------------------------------------------------------===//
@@ -233,7 +233,7 @@ let Defs = [CC] in {
   def CDR : CompareRR <"cdr", 0x29,   null_frag, FP64,  FP64>;
   def CXR : CompareRRE<"cxr", 0xB369, null_frag, FP128, FP128>;
 
-  def CE : CompareRX<"ce", 0x79, null_frag, FP32, load, 4>;
-  def CD : CompareRX<"cd", 0x69, null_frag, FP64, load, 8>;
+  def CE : CompareRX<"ce", 0x79, null_frag, FP32, z_load, 4>;
+  def CD : CompareRX<"cd", 0x69, null_frag, FP64, z_load, 8>;
 }
 
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp b/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
index 53e9bf9a9d1bb0..2a6dce863c28f1 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
@@ -2020,11 +2020,12 @@ areMemAccessesTriviallyDisjoint(const MachineInstr &MIa,
   }
   if (SameVal) {
     int OffsetA = MMOa->getOffset(), OffsetB = MMOb->getOffset();
-    int WidthA = MMOa->getSize(), WidthB = MMOb->getSize();
+    LocationSize WidthA = MMOa->getSize(), WidthB = MMOb->getSize();
     int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
     int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
-    int LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
-    if (LowOffset + LowWidth <= HighOffset)
+    LocationSize LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
+    if (LowWidth.hasValue() &&
+        LowOffset + (int)LowWidth.getValue() <= HighOffset)
       return true;
   }
 
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrInfo.td b/llvm/lib/Target/SystemZ/SystemZInstrInfo.td
index 937e36057a6ede..96ea65b6c3d881 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrInfo.td
+++ b/llvm/lib/Target/SystemZ/SystemZInstrInfo.td
@@ -306,7 +306,7 @@ let Predicates = [IsTargetXPLINK64] in {
   let mayLoad = 1, AddedComplexity = 20, hasNoSchedulingInfo = 1, Defs = [CC] in {
     def ADA_ENTRY_VALUE : Alias<12, (outs GR64:$Reg), (ins adasym:$addr,
                               ADDR64:$ADA, imm64:$Offset),
-                            [(set i64:$Reg, (load (z_ada_entry
+                            [(set i64:$Reg, (z_load (z_ada_entry
                               iPTR:$addr, iPTR:$ADA, i64:$Offset)))]>;
  }
 }
@@ -468,12 +468,12 @@ let isAsCheapAsAMove = 1, isMoveImm = 1, isReMaterializable = 1 in {
 // Register loads.
 let canFoldAsLoad = 1, SimpleBDXLoad = 1, mayLoad = 1 in {
   // Expands to L, LY or LFH, depending on the choice of register.
-  def LMux : UnaryRXYPseudo<"l", load, GRX32, 4>,
+  def LMux : UnaryRXYPseudo<"l", z_load, GRX32, 4>,
              Requires<[FeatureHighWord]>;
-  defm L : UnaryRXPair<"l", 0x58, 0xE358, load, GR32, 4>;
-  def LFH : UnaryRXY<"lfh", 0xE3CA, load, GRH32, 4>,
+  defm L : UnaryRXPair<"l", 0x58, 0xE358, z_load, GR32, 4>;
+  def LFH : UnaryRXY<"lfh", 0xE3CA, z_load, GRH32, 4>,
             Requires<[FeatureHighWord]>;
-  def LG : UnaryRXY<"lg", 0xE304, load, GR64, 8>;
+  def LG : UnaryRXY<"lg", 0xE304, z_load, GR64, 8>;
 
   // These instructions are split after register allocation, so we don't
   // want a custom inserter.
@@ -483,22 +483,22 @@ let canFoldAsLoad = 1, SimpleBDXLoad = 1, mayLoad = 1 in {
   }
 }
 let Defs = [CC], CCValues = 0xE, CompareZeroCCMask = 0xE in {
-  def LT  : UnaryRXY<"lt",  0xE312, load, GR32, 4>;
-  def LTG : UnaryRXY<"ltg", 0xE302, load, GR64, 8>;
+  def LT  : UnaryRXY<"lt",  0xE312, z_load, GR32, 4>;
+  def LTG : UnaryRXY<"ltg", 0xE302, z_load, GR64, 8>;
 }
 
 let canFoldAsLoad = 1 in {
-  def LRL  : UnaryRILPC<"lrl",  0xC4D, aligned_load, GR32>;
-  def LGRL : UnaryRILPC<"lgrl", 0xC48, aligned_load, GR64>;
+  def LRL  : UnaryRILPC<"lrl",  0xC4D, aligned_z_load, GR32>;
+  def LGRL : UnaryRILPC<"lgrl", 0xC48, aligned_z_load, GR64>;
 }
 
 // Load and zero rightmost byte.
 let Predicates = [FeatureLoadAndZeroRightmostByte] in {
   def LZRF : UnaryRXY<"lzrf", 0xE33B, null_frag, GR32, 4>;
   def LZRG : UnaryRXY<"lzrg", 0xE32A, null_frag, GR64, 8>;
-  def : Pat<(and (i32 (load bdxaddr20only:$src)), 0xffffff00),
+  def : Pat<(and (i32 (z_load bdxaddr20only:$src)), 0xffffff00),
             (LZRF bdxaddr20only:$src)>;
-  def : Pat<(and (i64 (load bdxaddr20only:$src)), 0xffffffffffffff00),
+  def : Pat<(and (i64 (z_load bdxaddr20only:$src)), 0xffffffffffffff00),
             (LZRG bdxaddr20only:$src)>;
 }
 
@@ -689,29 +689,29 @@ def : Pat<(sext_inreg GR64:$src, i32),
 
 // 32-bit extensions from 8-bit memory.  LBMux expands to LB or LBH,
 // depending on the choice of register.
-def LBMux : UnaryRXYPseudo<"lb", asextloadi8, GRX32, 1>,
+def LBMux : UnaryRXYPseudo<"lb", z_asextloadi8, GRX32, 1>,
             Requires<[FeatureHighWord]>;
-def LB  : UnaryRXY<"lb", 0xE376, asextloadi8, GR32, 1>;
-def LBH : UnaryRXY<"lbh", 0xE3C0, asextloadi8, GRH32, 1>,
+def LB  : UnaryRXY<"lb", 0xE376, z_asextloadi8, GR32, 1>;
+def LBH : UnaryRXY<"lbh", 0xE3C0, z_asextloadi8, GRH32, 1>,
           Requires<[FeatureHighWord]>;
 
 // 32-bit extensions from 16-bit memory.  LHMux expands to LH or LHH,
 // depending on the choice of register.
-def LHMux : UnaryRXYPseudo<"lh", asextloadi16, GRX32, 2>,
+def LHMux : UnaryRXYPseudo<"lh", z_asextloadi16, GRX32, 2>,
             Requires<[FeatureHighWord]>;
-defm LH   : UnaryRXPair<"lh", 0x48, 0xE378, asextloadi16, GR32, 2>;
-def  LHH  : UnaryRXY<"lhh", 0xE3C4, asextloadi16, GRH32, 2>,
+defm LH   : UnaryRXPair<"lh", 0x48, 0xE378, z_asextloadi16, GR32, 2>;
+def  LHH  : UnaryRXY<"lhh", 0xE3C4, z_asextloadi16, GRH32, 2>,
             Requires<[FeatureHighWord]>;
-def  LHRL : UnaryRILPC<"lhrl", 0xC45, aligned_asextloadi16, GR32>;
+def  LHRL : UnaryRILPC<"lhrl", 0xC45, aligned_z_asextloadi16, GR32>;
 
 // 64-bit extensions from memory.
-def LGB   : UnaryRXY<"lgb", 0xE377, asextloadi8,  GR64, 1>;
-def LGH   : UnaryRXY<"lgh", 0xE315, asextloadi16, GR64, 2>;
-def LGF   : UnaryRXY<"lgf", 0xE314, asextloadi32, GR64, 4>;
-def LGHRL : UnaryRILPC<"lghrl", 0xC44, aligned_asextloadi16, GR64>;
-def LGFRL : UnaryRILPC<"lgfrl", 0xC4C, aligned_asextloadi32, GR64>;
+def LGB   : UnaryRXY<"lgb", 0xE377, z_asextloadi8,  GR64, 1>;
+def LGH   : UnaryRXY<"lgh", 0xE315, z_asextloadi16, GR64, 2>;
+def LGF   : UnaryRXY<"lgf", 0xE314, z_asextloadi32, GR64, 4>;
+def LGHRL : UnaryRILPC<"lghrl", 0xC44, aligned_z_asextloadi16, GR64>;
+def LGFRL : UnaryRILPC<"lgfrl", 0xC4C, aligned_z_asextloadi32, GR64>;
 let Defs = [CC], CCValues = 0xE, CompareZeroCCMask = 0xE in
-  def LTGF : UnaryRXY<"ltgf", 0xE332, asextloadi32, GR64, 4>;
+  def LTGF : UnaryRXY<"ltgf", 0xE332, z_asextloadi32, GR64, 4>;
 
 //===----------------------------------------------------------------------===//
 // Zero extensions
@@ -740,40 +740,40 @@ def : Pat<(and GR64:$src, 0xffffffff),
 
 // 32-bit extensions from 8-bit memory.  LLCMux expands to LLC or LLCH,
 // depending on the choice of register.
-def LLCMux : UnaryRXYPseudo<"llc", azextloadi8, GRX32, 1>,
+def LLCMux : UnaryRXYPseudo<"llc", z_azextloadi8, GRX32, 1>,
              Requires<[FeatureHighWord]>;
-def LLC  : UnaryRXY<"llc", 0xE394, azextloadi8, GR32, 1>;
-def LLCH : UnaryRXY<"llch", 0xE3C2, azextloadi8, GRH32, 1>,
+def LLC  : UnaryRXY<"llc", 0xE394, z_azextloadi8, GR32, 1>;
+def LLCH : UnaryRXY<"llch", 0xE3C2, z_azextloadi8, GRH32, 1>,
            Requires<[FeatureHighWord]>;
 
 // 32-bit extensions from 16-bit memory.  LLHMux expands to LLH or LLHH,
 // depending on the choice of register.
-def LLHMux : UnaryRXYPseudo<"llh", azextloadi16, GRX32, 2>,
+def LLHMux : UnaryRXYPseudo<"llh", z_azextloadi16, GRX32, 2>,
              Requires<[FeatureHighWord]>;
-def LLH   : UnaryRXY<"llh", 0xE395, azextloadi16, GR32, 2>;
-def LLHH  : UnaryRXY<"llhh", 0xE3C6, azextloadi16, GRH32, 2>,
+def LLH   : UnaryRXY<"llh", 0xE395, z_azextloadi16, GR32, 2>;
+def LLHH  : UnaryRXY<"llhh", 0xE3C6, z_azextloadi16, GRH32, 2>,
             Requires<[FeatureHighWord]>;
-def LLHRL : UnaryRILPC<"llhrl", 0xC42, aligned_azextloadi16, GR32>;
+def LLHRL : UnaryRILPC<"llhrl", 0xC42, aligned_z_azextloadi16, GR32>;
 
 // 64-bit extensions from memory.
-def LLGC   : UnaryRXY<"llgc", 0xE390, azextloadi8,  GR64, 1>;
-def LLGH   : UnaryRXY<"llgh", 0xE391, azextloadi16, GR64, 2>;
-def LLGF   : UnaryRXY<"llgf", 0xE316, azextloadi32, GR64, 4>;
-def LLGHRL : UnaryRILPC<"llghrl", 0xC46, aligned_azextloadi16, GR64>;
-def LLGFRL : UnaryRILPC<"llgfrl", 0xC4E, aligned_azextloadi32, GR64>;
+def LLGC   : UnaryRXY<"llgc", 0xE390, z_azextloadi8,  GR64, 1>;
+def LLGH   : UnaryRXY<"llgh", 0xE391, z_azextloadi16, GR64, 2>;
+def LLGF   : UnaryRXY<"llgf", 0xE316, z_azextloadi32, GR64, 4>;
+def LLGHRL : UnaryRILPC<"llghrl", 0xC46, aligned_z_azextloadi16, GR64>;
+def LLGFRL : UnaryRILPC<"llgfrl", 0xC4E, aligned_z_azextloadi32, GR64>;
 
 // 31-to-64-bit zero extensions.
 def LLGTR : UnaryRRE<"llgtr", 0xB917, null_frag, GR64, GR64>;
 def LLGT  : UnaryRXY<"llgt",  0xE317, null_frag, GR64, 4>;
 def : Pat<(and GR64:$src, 0x7fffffff),
           (LLGTR GR64:$src)>;
-def : Pat<(and (i64 (azextloadi32 bdxaddr20only:$src)), 0x7fffffff),
+def : Pat<(and (i64 (z_azextloadi32 bdxaddr20only:$src)), 0x7fffffff),
           (LLGT bdxaddr20only:$src)>;
 
 // Load and zero rightmost byte.
 let Predicates = [FeatureLoadAndZeroRightmostByte] in {
   def LLZRGF : UnaryRXY<"llzrgf", 0xE33A, null_frag, GR64, 4>;
-  def : Pat<(and (i64 (azextloadi32 bdxaddr20only:$src)), 0xffffff00),
+  def : Pat<(and (i64 (z_azextloadi32 bdxaddr20only:$src)), 0xffffff00),
             (LLZRGF bdxaddr20only:$src)>;
 }
 
@@ -930,14 +930,14 @@ defm : SXU<ineg, LCGFR>;
 //===----------------------------------------------------------------------===//
 
 let isCodeGenOnly = 1 in
-  defm IC32 : BinaryRXPair<"ic", 0x43, 0xE373, inserti8, GR32, azextloadi8, 1>;
-defm IC : BinaryRXPair<"ic", 0x43, 0xE373, inserti8, GR64, azextloadi8, 1>;
+  defm IC32 : BinaryRXPair<"ic", 0x43, 0xE373, inserti8, GR32, z_azextloadi8, 1>;
+defm IC : BinaryRXPair<"ic", 0x43, 0xE373, inserti8, GR64, z_azextloadi8, 1>;
 
-defm : InsertMem<"inserti8", IC32,  GR32, azextloadi8, bdxaddr12pair>;
-defm : InsertMem<"inserti8", IC32Y, GR32, azextloadi8, bdxaddr20pair>;
+defm : InsertMem<"inserti8", IC32,  GR32, z_azextloadi8, bdxaddr12pair>;
+defm : InsertMem<"inserti8", IC32Y, GR32, z_azextloadi8, bdxaddr20pair>;
 
-defm : InsertMem<"inserti8", IC,  GR64, azextloadi8, bdxaddr12pair>;
-defm : InsertMem<"inserti8", ICY, GR64, azextloadi8, bdxaddr20pair>;
+defm : InsertMem<"inserti8", IC,  GR64, z_azextloadi8, bdxaddr12pair>;
+defm : InsertMem<"inserti8", ICY, GR64, z_azextloadi8, bdxaddr20pair>;
 
 // Insert characters under mask -- not (yet) used for codegen.
 let Defs = [CC] in {
@@ -1015,12 +1015,12 @@ let Defs = [CC], CCValues = 0xF, CCIfNoSignedWrap = 1 in {
   def AGFI : BinaryRIL<"agfi", 0xC28, z_sadd, GR64, imm64sx32>;
 
   // Addition of memory.
-  defm AH  : BinaryRXPair<"ah", 0x4A, 0xE37A, z_sadd, GR32, asextloadi16, 2>;
-  defm A   : BinaryRXPairAndPseudo<"a",  0x5A, 0xE35A, z_sadd, GR32, load, 4>;
-  def  AGH : BinaryRXY<"agh", 0xE338, z_sadd, GR64, asextloadi16, 2>,
+  defm AH  : BinaryRXPair<"ah", 0x4A, 0xE37A, z_sadd, GR32, z_asextloadi16, 2>;
+  defm A   : BinaryRXPairAndPseudo<"a",  0x5A, 0xE35A, z_sadd, GR32, z_load, 4>;
+  def  AGH : BinaryRXY<"agh", 0xE338, z_sadd, GR64, z_asextloadi16, 2>,
              Requires<[FeatureMiscellaneousExtensions2]>;
-  def  AGF : BinaryRXY<"agf", 0xE318, z_sadd, GR64, asextloadi32, 4>;
-  defm AG  : BinaryRXYAndPseudo<"ag",  0xE308, z_sadd, GR64, load, 8>;
+  def  AGF : BinaryRXY<"agf", 0xE318, z_sadd, GR64, z_asextloadi32, 4>;
+  defm AG  : BinaryRXYAndPseudo<"ag",  0xE308, z_sadd, GR64, z_load, 8>;
 
   // Addition to memory.
   def ASI  : BinarySIY<"asi",  0xEB6A, add, imm32sx8>;
@@ -1058,9 +1058,9 @@ let Defs = [CC], CCValues = 0xF, IsLogical = 1 in {
               Requires<[FeatureHighWord]>;
 
   // Addition of memory.
-  defm AL   : BinaryRXPairAndPseudo<"al", 0x5E, 0xE35E, z_uadd, GR32, load, 4>;
-  def  ALGF : BinaryRXY<"algf", 0xE31A, z_uadd, GR64, azextloadi32, 4>;
-  defm ALG  : BinaryRXYAndPseudo<"alg",  0xE30A, z_uadd, GR64, load, 8>;
+  defm AL   : BinaryRXPairAndPseudo<"al", 0x5E, 0xE35E, z_uadd, GR32, z_load, 4>;
+  def  ALGF : BinaryRXY<"algf", 0xE31A, z_uadd, GR64, z_azextloadi32, 4>;
+  defm ALG  : BinaryRXYAndPseudo<"alg",  0xE30A, z_uadd, GR64, z_load, 8>;
 
   // Addition to memory.
   def ALSI  : BinarySIY<"alsi",  0xEB6E, null_frag, imm32sx8>;
@@ -1075,8 +1075,8 @@ let Defs = [CC], Uses = [CC], CCValues = 0xF, IsLogical = 1 in {
   def ALCGR : BinaryRRE<"alcgr", 0xB988, z_addcarry, GR64, GR64>;
 
   // Addition of memory.
-  def ALC  : BinaryRXY<"alc",  0xE398, z_addcarry, GR32, load, 4>;
-  def ALCG : BinaryRXY<"alcg", 0xE388, z_addcarry, GR64, load, 8>;
+  def ALC  : BinaryRXY<"alc",  0xE398, z_addcarry, GR32, z_load, 4>;
+  def ALCG : BinaryRXY<"alcg", 0xE388, z_addcarry, GR64, z_load, 8>;
 }
 
 // Addition that does not modify the condition code.
@@ -1103,12 +1103,12 @@ let Defs = [CC], CCValues = 0xF, CompareZeroCCMask = 0x8,
               Requires<[FeatureHighWord]>;
 
   // Subtraction of memory.
-  defm SH  : BinaryRXPair<"sh", 0x4B, 0xE37B, z_ssub, GR32, asextloadi16, 2>;
-  defm S   : BinaryRXPairAndPseudo<"s", 0x5B, 0xE35B, z_ssub, GR32, load, 4>;
-  def  SGH : BinaryRXY<"sgh", 0xE339, z_ssub, GR64, asextloadi16, 2>,
+  defm SH  : BinaryRXPair<"sh", 0x4B, 0xE37B, z_ssub, GR32, z_asextloadi16, 2>;
+  defm S   : BinaryRXPairAndPseudo<"s", 0x5B, 0xE35B, z_ssub, GR32, z_load, 4>;
+  def  SGH : BinaryRXY<"sgh", 0xE339, z_ssub, GR64, z_asextloadi16, 2>,
              Requires<[FeatureMiscellaneousExtensions2]>;
-  def  SGF : BinaryRXY<"sgf", 0xE319, z_ssub, GR64, asextloadi32, 4>;
-  defm SG  : BinaryRXYAndPseudo<"sg",  0xE309, z_ssub, GR64, load, 8>;
+  def  SGF : BinaryRXY<"sgf", 0xE319, z_ssub, GR64, z_asextloadi32, 4>;
+  defm SG  : BinaryRXYAndPseudo<"sg",  0xE309, z_ssub, GR64, z_load, 8>;
 }
 defm : SXB<z_ssub, GR64, SGFR>;
 
@@ -1156,9 +1156,9 @@ let Defs = [CC], CCValues = 0x7, IsLogical = 1 in {
   def SLGFI : BinaryRIL<"slgfi", 0xC24, z_usub, GR64, imm64zx32>;
 
   // Subtraction of memory.
-  defm SL   : BinaryRXPairAndPseudo<"sl", 0x5F, 0xE35F, z_usub, GR32, load, 4>;
-  def  SLGF : BinaryRXY<"slgf", 0xE31B, z_usub, GR64, azextloadi32, 4>;
-  defm SLG  : BinaryRXYAndPseudo<"slg",  0xE30B, z_usub, GR64, load, 8>;
+  defm SL   : BinaryRXPairAndPseudo<"sl", 0x5F, 0xE35F, z_usub, GR32, z_load, 4>;
+  def  SLGF : BinaryRXY<"slgf", 0xE31B, z_usub, GR64, z_azextloadi32, 4>;
+  defm SLG  : BinaryRXYAndPseudo<"slg",  0xE30B, z_usub, GR64, z_load, 8>;
 }
 defm : ZXB<z_usub, GR64, SLGFR>;
 
@@ -1183,8 +1183,8 @@ let Defs = [CC], Uses = [CC], CCValues = 0xF, IsLogical = 1 in {
   def SLBGR : BinaryRRE<"slbgr", 0xB989, z_subcarry, GR64, GR64>;
 
   // Subtraction of memory.
-  def SLB  : BinaryRXY<"slb",  0xE399, z_subcarry, GR32, load, 4>;
-  def SLBG : BinaryRXY<"slbg", 0xE389, z_subcarry, GR64, load, 8>;
+  def SLB  : BinaryRXY<"slb",  0xE399, z_subcarry, GR32, z_load, 4>;
+  def SLBG : BinaryRXY<"slbg", 0xE389, z_subcarry, GR64, z_load, 8>;
 }
 
 
@@ -1233,8 +1233,8 @@ let Defs = [CC] in {
 
   // ANDs of memory.
   let CCValues = 0xC, CompareZeroCCMask = 0x8 in {
-    defm N  : BinaryRXPairAndPseudo<"n", 0x54, 0xE354, and, GR32, load, 4>;
-    defm NG : BinaryRXYAndPseudo<"ng", 0xE380, and, GR64, load, 8>;
+    defm N  : BinaryRXPairAndPseudo<"n", 0x54, 0xE354, and, GR32, z_load, 4>;
+    defm NG : BinaryRXYAndPseudo<"ng", 0xE380, and, GR64, z_load, 8>;
   }
 
   // AND to memory
@@ -1290,8 +1290,8 @@ let Defs = [CC] in {
 
   // ORs of memory.
   let CCValues = 0xC, CompareZeroCCMask = 0x8 in {
-    defm O  : BinaryRXPairAndPseudo<"o", 0x56, 0xE356, or, GR32, load, 4>;
-    defm OG : BinaryRXYAndPseudo<"og", 0xE381, or, GR64, load, 8>;
+    defm O  : BinaryRXPairAndPseudo<"o", 0x56, 0xE356, or, GR32, z_load, 4>;
+    defm OG : BinaryRXYAndPseudo<"og", 0xE381, or, GR64, z_load, 8>;
   }
 
   // OR to memory
@@ -1330,8 +1330,8 @@ let Defs = [CC] in {
 
   // XORs of memory.
   let CCValues = 0xC, CompareZeroCCMask = 0x8 in {
-    defm X  : BinaryRXPairAndPseudo<"x",0x57, 0xE357, xor, GR32, load, 4>;
-    defm XG : BinaryRXYAndPseudo<"xg", 0xE382, xor, GR64, load, 8>;
+    defm X  : BinaryRXPairAndPseudo<"x",0x57, 0xE357, xor, GR32, z_load, 4>;
+    defm XG : BinaryRXYAndPseudo<"xg", 0xE382, xor, GR64, z_load, 8>;
   }
 
   // XOR to memory
@@ -1411,17 +1411,17 @@ def MSFI  : BinaryRIL<"msfi",  0xC21, mul, GR32, simm32>;
 def MSGFI : BinaryRIL<"msgfi", 0xC20, mul, GR64, imm64sx32>;
 
 // Multiplication of memory.
-defm MH   : BinaryRXPair<"mh", 0x4C, 0xE37C, mul, GR32, asextloadi16, 2>;
-defm MS   : BinaryRXPair<"ms", 0x71, 0xE351, mul, GR32, load, 4>;
-def  MGH  : BinaryRXY<"mgh", 0xE33C, mul, GR64, asextloadi16, 2>,
+defm MH   : BinaryRXPair<"mh", 0x4C, 0xE37C, mul, GR32, z_asextloadi16, 2>;
+defm MS   : BinaryRXPair<"ms", 0x71, 0xE351, mul, GR32, z_load, 4>;
+def  MGH  : BinaryRXY<"mgh", 0xE33C, mul, GR64, z_asextloadi16, 2>,
             Requires<[FeatureMiscellaneousExtensions2]>;
-def  MSGF : BinaryRXY<"msgf", 0xE31C, mul, GR64, asextloadi32, 4>;
-def  MSG  : BinaryRXY<"msg",  0xE30C, mul, GR64, load, 8>;
+def  MSGF : BinaryRXY<"msgf", 0xE31C, mul, GR64, z_asextloadi32, 4>;
+def  MSG  : BinaryRXY<"msg",  0xE30C, mul, GR64, z_load, 8>;
 
 // Multiplication of memory, setting the condition code.
 let Predicates = [FeatureMiscellaneousExtensions2], Defs = [CC] in {
-  defm MSC  : BinaryRXYAndPseudo<"msc",  0xE353, null_frag, GR32, load, 4>;
-  defm MSGC : BinaryRXYAndPseudo<"msgc", 0xE383, null_frag, GR64, load, 8>;
+  defm MSC  : BinaryRXYAndPseudo<"msc",  0xE353, null_frag, GR32, z_load, 4>;
+  defm MSGC : BinaryRXYAndPseudo<"msgc", 0xE383, null_frag, GR64, z_load, 8>;
 }
 
 // Multiplication of a register, producing two results.
@@ -1437,16 +1437,16 @@ def : Pat<(z_umul_lohi GR64:$src1, GR64:$src2),
           (MLGR (AEXT128 GR64:$src1), GR64:$src2)>;
 
 // Multiplication of memory, producing two results.
-def M   : BinaryRX <"m",   0x5C,   null_frag, GR128, load, 4>;
-def MFY : BinaryRXY<"mfy", 0xE35C, null_frag, GR128, load, 4>;
-def MG  : BinaryRXY<"mg",  0xE384, null_frag, GR128, load, 8>,
+def M   : BinaryRX <"m",   0x5C,   null_frag, GR128, z_load, 4>;
+def MFY : BinaryRXY<"mfy", 0xE35C, null_frag, GR128, z_load, 4>;
+def MG  : BinaryRXY<"mg",  0xE384, null_frag, GR128, z_load, 8>,
           Requires<[FeatureMiscellaneousExtensions2]>;
-def ML  : BinaryRXY<"ml",  0xE396, null_frag, GR128, load, 4>;
-def MLG : BinaryRXY<"mlg", 0xE386, null_frag, GR128, load, 8>;
+def ML  : BinaryRXY<"ml",  0xE396, null_frag, GR128, z_load, 4>;
+def MLG : BinaryRXY<"mlg", 0xE386, null_frag, GR128, z_load, 8>;
 
-def : Pat<(z_smul_lohi GR64:$src1, (i64 (load bdxaddr20only:$src2))),
+def : Pat<(z_smul_lohi GR64:$src1, (i64 (z_load bdxaddr20only:$src2))),
           (MG (AEXT128 GR64:$src1), bdxaddr20only:$src2)>;
-def : Pat<(z_umul_lohi GR64:$src1, (i64 (load bdxaddr20only:$src2))),
+def : Pat<(z_umul_lohi GR64:$src1, (i64 (z_load bdxaddr20only:$src2))),
           (MLG (AEXT128 GR64:$src1), bdxaddr20only:$src2)>;
 
 //===----------------------------------------------------------------------===//
@@ -1462,30 +1462,30 @@ let hasSideEffects = 1 in {  // Do not speculatively execute.
   def DLGR  : BinaryRRE<"dlgr",  0xB987, null_frag, GR128, GR64>;
 
   // Division and remainder, from memory.
-  def D    : BinaryRX <"d",    0x5D,   null_frag, GR128, load, 4>;
-  def DSGF : BinaryRXY<"dsgf", 0xE31D, null_frag, GR128, load, 4>;
-  def DSG  : BinaryRXY<"dsg",  0xE30D, null_frag, GR128, load, 8>;
-  def DL   : BinaryRXY<"dl",   0xE397, null_frag, GR128, load, 4>;
-  def DLG  : BinaryRXY<"dlg",  0xE387, null_frag, GR128, load, 8>;
+  def D    : BinaryRX <"d",    0x5D,   null_frag, GR128, z_load, 4>;
+  def DSGF : BinaryRXY<"dsgf", 0xE31D, null_frag, GR128, z_load, 4>;
+  def DSG  : BinaryRXY<"dsg",  0xE30D, null_frag, GR128, z_load, 8>;
+  def DL   : BinaryRXY<"dl",   0xE397, null_frag, GR128, z_load, 4>;
+  def DLG  : BinaryRXY<"dlg",  0xE387, null_frag, GR128, z_load, 8>;
 }
 def : Pat<(z_sdivrem GR64:$src1, GR32:$src2),
           (DSGFR (AEXT128 GR64:$src1), GR32:$src2)>;
-def : Pat<(z_sdivrem GR64:$src1, (i32 (load bdxaddr20only:$src2))),
+def : Pat<(z_sdivrem GR64:$src1, (i32 (z_load bdxaddr20only:$src2))),
           (DSGF (AEXT128 GR64:$src1), bdxaddr20only:$src2)>;
 def : Pat<(z_sdivrem GR64:$src1, GR64:$src2),
           (DSGR (AEXT128 GR64:$src1), GR64:$src2)>;
-def : Pat<(z_sdivrem GR64:$src1, (i64 (load bdxaddr20only:$src2))),
+def : Pat<(z_sdivrem GR64:$src1, (i64 (z_load bdxaddr20only:$src2))),
           (DSG (AEXT128 GR64:$src1), bdxaddr20only:$src2)>;
 
 def : Pat<(z_udivrem GR32:$src1, GR32:$src2),
           (DLR (ZEXT128 (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$src1,
                                        subreg_l32)), GR32:$src2)>;
-def : Pat<(z_udivrem GR32:$src1, (i32 (load bdxaddr20only:$src2))),
+def : Pat<(z_udivrem GR32:$src1, (i32 (z_load bdxaddr20only:$src2))),
           (DL (ZEXT128 (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$src1,
                                       subreg_l32)), bdxaddr20only:$src2)>;
 def : Pat<(z_udivrem GR64:$src1, GR64:$src2),
           (DLGR (ZEXT128 GR64:$src1), GR64:$src2)>;
-def : Pat<(z_udivrem GR64:$src1, (i64 (load bdxaddr20only:$src2))),
+def : Pat<(z_udivrem GR64:$src1, (i64 (z_load bdxaddr20only:$src2))),
           (DLG (ZEXT128 GR64:$src1), bdxaddr20only:$src2)>;
 
 //===----------------------------------------------------------------------===//
@@ -1591,25 +1591,25 @@ let Defs = [CC], CCValues = 0xE in {
   def CGFI : CompareRIL<"cgfi", 0xC2C, z_scmp, GR64, imm64sx32>;
 
   // Comparison with memory.
-  defm CH    : CompareRXPair<"ch", 0x49, 0xE379, z_scmp, GR32, asextloadi16, 2>;
-  def  CMux  : CompareRXYPseudo<z_scmp, GRX32, load, 4>,
+  defm CH    : CompareRXPair<"ch", 0x49, 0xE379, z_scmp, GR32, z_asextloadi16, 2>;
+  def  CMux  : CompareRXYPseudo<z_scmp, GRX32, z_load, 4>,
                Requires<[FeatureHighWord]>;
-  defm C     : CompareRXPair<"c",  0x59, 0xE359, z_scmp, GR32, load, 4>;
-  def  CHF   : CompareRXY<"chf", 0xE3CD, z_scmp, GRH32, load, 4>,
+  defm C     : CompareRXPair<"c",  0x59, 0xE359, z_scmp, GR32, z_load, 4>;
+  def  CHF   : CompareRXY<"chf", 0xE3CD, z_scmp, GRH32, z_load, 4>,
                Requires<[FeatureHighWord]>;
-  def  CGH   : CompareRXY<"cgh", 0xE334, z_scmp, GR64, asextloadi16, 2>;
-  def  CGF   : CompareRXY<"cgf", 0xE330, z_scmp, GR64, asextloadi32, 4>;
-  def  CG    : CompareRXY<"cg",  0xE320, z_scmp, GR64, load, 8>;
-  def  CHRL  : CompareRILPC<"chrl",  0xC65, z_scmp, GR32, aligned_asextloadi16>;
-  def  CRL   : CompareRILPC<"crl",   0xC6D, z_scmp, GR32, aligned_load>;
-  def  CGHRL : CompareRILPC<"cghrl", 0xC64, z_scmp, GR64, aligned_asextloadi16>;
-  def  CGFRL : CompareRILPC<"cgfrl", 0xC6C, z_scmp, GR64, aligned_asextloadi32>;
-  def  CGRL  : CompareRILPC<"cgrl",  0xC68, z_scmp, GR64, aligned_load>;
+  def  CGH   : CompareRXY<"cgh", 0xE334, z_scmp, GR64, z_asextloadi16, 2>;
+  def  CGF   : CompareRXY<"cgf", 0xE330, z_scmp, GR64, z_asextloadi32, 4>;
+  def  CG    : CompareRXY<"cg",  0xE320, z_scmp, GR64, z_load, 8>;
+  def  CHRL  : CompareRILPC<"chrl",  0xC65, z_scmp, GR32, aligned_z_asextloadi16>;
+  def  CRL   : CompareRILPC<"crl",   0xC6D, z_scmp, GR32, aligned_z_load>;
+  def  CGHRL : CompareRILPC<"cghrl", 0xC64, z_scmp, GR64, aligned_z_asextloadi16>;
+  def  CGFRL : CompareRILPC<"cgfrl", 0xC6C, z_scmp, GR64, aligned_z_asextloadi32>;
+  def  CGRL  : CompareRILPC<"cgrl",  0xC68, z_scmp, GR64, aligned_z_load>;
 
   // Comparison between memory and a signed 16-bit immediate.
-  def CHHSI : CompareSIL<"chhsi", 0xE554, z_scmp, asextloadi16, imm32sx16>;
-  def CHSI  : CompareSIL<"chsi",  0xE55C, z_scmp, load, imm32sx16>;
-  def CGHSI : CompareSIL<"cghsi", 0xE558, z_scmp, load, imm64sx16>;
+  def CHHSI : CompareSIL<"chhsi", 0xE554, z_scmp, z_asextloadi16, imm32sx16>;
+  def CHSI  : CompareSIL<"chsi",  0xE55C, z_scmp, z_load, imm32sx16>;
+  def CGHSI : CompareSIL<"cghsi", 0xE558, z_scmp, z_load, imm64sx16>;
 }
 defm : SXB<z_scmp, GR64, CGFR>;
 
@@ -1636,31 +1636,31 @@ let Defs = [CC], CCValues = 0xE, IsLogical = 1 in {
   def CLGFI : CompareRIL<"clgfi", 0xC2E, z_ucmp, GR64, imm64zx32>;
 
   // Comparison with memory.
-  def  CLMux  : CompareRXYPseudo<z_ucmp, GRX32, load, 4>,
+  def  CLMux  : CompareRXYPseudo<z_ucmp, GRX32, z_load, 4>,
                 Requires<[FeatureHighWord]>;
-  defm CL     : CompareRXPair<"cl", 0x55, 0xE355, z_ucmp, GR32, load, 4>;
-  def  CLHF   : CompareRXY<"clhf", 0xE3CF, z_ucmp, GRH32, load, 4>,
+  defm CL     : CompareRXPair<"cl", 0x55, 0xE355, z_ucmp, GR32, z_load, 4>;
+  def  CLHF   : CompareRXY<"clhf", 0xE3CF, z_ucmp, GRH32, z_load, 4>,
                 Requires<[FeatureHighWord]>;
-  def  CLGF   : CompareRXY<"clgf", 0xE331, z_ucmp, GR64, azextloadi32, 4>;
-  def  CLG    : CompareRXY<"clg",  0xE321, z_ucmp, GR64, load, 8>;
+  def  CLGF   : CompareRXY<"clgf", 0xE331, z_ucmp, GR64, z_azextloadi32, 4>;
+  def  CLG    : CompareRXY<"clg",  0xE321, z_ucmp, GR64, z_load, 8>;
   def  CLHRL  : CompareRILPC<"clhrl",  0xC67, z_ucmp, GR32,
-                             aligned_azextloadi16>;
+                             aligned_z_azextloadi16>;
   def  CLRL   : CompareRILPC<"clrl",   0xC6F, z_ucmp, GR32,
-                             aligned_load>;
+                             aligned_z_load>;
   def  CLGHRL : CompareRILPC<"clghrl", 0xC66, z_ucmp, GR64,
-                             aligned_azextloadi16>;
+                             aligned_z_azextloadi16>;
   def  CLGFRL : CompareRILPC<"clgfrl", 0xC6E, z_ucmp, GR64,
-                             aligned_azextloadi32>;
+                             aligned_z_azextloadi32>;
   def  CLGRL  : CompareRILPC<"clgrl",  0xC6A, z_ucmp, GR64,
-                             aligned_load>;
+                             aligned_z_load>;
 
   // Comparison between memory and an unsigned 8-bit immediate.
-  defm CLI : CompareSIPair<"cli", 0x95, 0xEB55, z_ucmp, azextloadi8, imm32zx8>;
+  defm CLI : CompareSIPair<"cli", 0x95, 0xEB55, z_ucmp, z_azextloadi8, imm32zx8>;
 
   // Comparison between memory and an unsigned 16-bit immediate.
-  def CLHHSI : CompareSIL<"clhhsi", 0xE555, z_ucmp, azextloadi16, imm32zx16>;
-  def CLFHSI : CompareSIL<"clfhsi", 0xE55D, z_ucmp, load, imm32zx16>;
-  def CLGHSI : CompareSIL<"clghsi", 0xE559, z_ucmp, load, imm64zx16>;
+  def CLHHSI : CompareSIL<"clhhsi", 0xE555, z_ucmp, z_azextloadi16, imm32zx16>;
+  def CLFHSI : CompareSIL<"clfhsi", 0xE55D, z_ucmp, z_load, imm32zx16>;
+  def CLGHSI : CompareSIL<"clghsi", 0xE559, z_ucmp, z_load, imm64zx16>;
 }
 defm : ZXB<z_ucmp, GR64, CLGFR>;
 
@@ -1693,7 +1693,7 @@ let Defs = [CC] in {
   def TMHL64 : CompareAliasRI<z_tm_reg, GR64, imm64hl16>;
   def TMHH64 : CompareAliasRI<z_tm_reg, GR64, imm64hh16>;
 
-  defm TM : CompareSIPair<"tm", 0x91, 0xEB51, z_tm_mem, anyextloadi8, imm32zx8>;
+  defm TM : CompareSIPair<"tm", 0x91, 0xEB51, z_tm_mem, z_anyextloadi8, imm32zx8>;
 }
 
 def TML : InstAlias<"tml\t$R, $I", (TMLL GR32:$R, imm32ll16:$I), 0>;
@@ -1914,8 +1914,8 @@ let Predicates = [FeatureGuardedStorage], hasSideEffects = 1 in {
 // Decimal arithmetic
 //===----------------------------------------------------------------------===//
 
-defm CVB  : BinaryRXPair<"cvb",0x4F, 0xE306, null_frag, GR32, load, 4>;
-def  CVBG : BinaryRXY<"cvbg", 0xE30E, null_frag, GR64, load, 8>;
+defm CVB  : BinaryRXPair<"cvb",0x4F, 0xE306, null_frag, GR32, z_load, 4>;
+def  CVBG : BinaryRXY<"cvbg", 0xE30E, null_frag, GR64, z_load, 8>;
 
 defm CVD  : StoreRXPair<"cvd", 0x4E, 0xE326, null_frag, GR32, 4>;
 def  CVDG : StoreRXY<"cvdg", 0xE32E, null_frag, GR64, 8>;
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrVector.td b/llvm/lib/Target/SystemZ/SystemZInstrVector.td
index 799b27d74414d5..245e3c3399a986 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrVector.td
+++ b/llvm/lib/Target/SystemZ/SystemZInstrVector.td
@@ -140,8 +140,8 @@ let Predicates = [FeatureVector] in {
   // to use those instructions rather than force a 20-bit displacement
   // into a GPR temporary.
   let mayLoad = 1 in {
-    def VL32 : UnaryAliasVRX<load, v32sb, bdxaddr12pair>;
-    def VL64 : UnaryAliasVRX<load, v64db, bdxaddr12pair>;
+    def VL32 : UnaryAliasVRX<z_load, v32sb, bdxaddr12pair>;
+    def VL64 : UnaryAliasVRX<z_load, v64db, bdxaddr12pair>;
   }
 
   // Load logical element and zero.
@@ -198,12 +198,12 @@ multiclass ReplicatePeephole<Instruction vlrep, ValueType vectype,
                       (scalartype (load bdxaddr12only:$addr)))),
             (vlrep bdxaddr12only:$addr)>;
 }
-defm : ReplicatePeephole<VLREPB, v16i8, anyextloadi8, i32>;
-defm : ReplicatePeephole<VLREPH, v8i16, anyextloadi16, i32>;
-defm : ReplicatePeephole<VLREPF, v4i32, load, i32>;
-defm : ReplicatePeephole<VLREPG, v2i64, load, i64>;
-defm : ReplicatePeephole<VLREPF, v4f32, load, f32>;
-defm : ReplicatePeephole<VLREPG, v2f64, load, f64>;
+defm : ReplicatePeephole<VLREPB, v16i8, z_anyextloadi8, i32>;
+defm : ReplicatePeephole<VLREPH, v8i16, z_anyextloadi16, i32>;
+defm : ReplicatePeephole<VLREPF, v4i32, z_load, i32>;
+defm : ReplicatePeephole<VLREPG, v2i64, z_load, i64>;
+defm : ReplicatePeephole<VLREPF, v4f32, z_load, f32>;
+defm : ReplicatePeephole<VLREPG, v2f64, z_load, f64>;
 
 //===----------------------------------------------------------------------===//
 // Stores
@@ -1561,13 +1561,13 @@ let Predicates = [FeatureVector] in {
 
 // Any-extending loads into i128.
 let Predicates = [FeatureVector] in {
-  def : Pat<(i128 (extloadi8 bdxaddr12only:$addr)),
+  def : Pat<(i128 (z_extloadi8 bdxaddr12only:$addr)),
             (VLREPB bdxaddr12only:$addr)>;
-  def : Pat<(i128 (extloadi16 bdxaddr12only:$addr)),
+  def : Pat<(i128 (z_extloadi16 bdxaddr12only:$addr)),
             (VLREPH bdxaddr12only:$addr)>;
-  def : Pat<(i128 (extloadi32 bdxaddr12only:$addr)),
+  def : Pat<(i128 (z_extloadi32 bdxaddr12only:$addr)),
             (VLREPF bdxaddr12only:$addr)>;
-  def : Pat<(i128 (extloadi64 bdxaddr12only:$addr)),
+  def : Pat<(i128 (z_extloadi64 bdxaddr12only:$addr)),
             (VLREPG bdxaddr12only:$addr)>;
 }
 
@@ -1621,13 +1621,13 @@ let Predicates = [FeatureVector] in {
 
 // Zero-extending loads into i128.
 let Predicates = [FeatureVector] in {
-  def : Pat<(i128 (zextloadi8 bdxaddr12only:$addr)),
+  def : Pat<(i128 (z_zextloadi8 bdxaddr12only:$addr)),
             (VLEB (VGBM 0), bdxaddr12only:$addr, 15)>;
-  def : Pat<(i128 (zextloadi16 bdxaddr12only:$addr)),
+  def : Pat<(i128 (z_zextloadi16 bdxaddr12only:$addr)),
             (VLEH (VGBM 0), bdxaddr12only:$addr, 7)>;
-  def : Pat<(i128 (zextloadi32 bdxaddr12only:$addr)),
+  def : Pat<(i128 (z_zextloadi32 bdxaddr12only:$addr)),
             (VLEF (VGBM 0), bdxaddr12only:$addr, 3)>;
-  def : Pat<(i128 (zextloadi64 bdxaddr12only:$addr)),
+  def : Pat<(i128 (z_zextloadi64 bdxaddr12only:$addr)),
             (VLEG (VGBM 0), bdxaddr12only:$addr, 1)>;
 }
 
@@ -1663,13 +1663,13 @@ let Predicates = [FeatureVector] in {
 
 // Sign-extending loads into i128.
 let Predicates = [FeatureVector] in {
-  def : Pat<(i128 (sextloadi8 bdxaddr12only:$addr)),
+  def : Pat<(i128 (z_sextloadi8 bdxaddr12only:$addr)),
             (VSRAB (VLREPB bdxaddr12only:$addr), (VREPIB 120))>;
-  def : Pat<(i128 (sextloadi16 bdxaddr12only:$addr)),
+  def : Pat<(i128 (z_sextloadi16 bdxaddr12only:$addr)),
             (VSRAB (VLREPH bdxaddr12only:$addr), (VREPIB 112))>;
-  def : Pat<(i128 (sextloadi32 bdxaddr12only:$addr)),
+  def : Pat<(i128 (z_sextloadi32 bdxaddr12only:$addr)),
             (VSRAB (VLREPF bdxaddr12only:$addr), (VREPIB 96))>;
-  def : Pat<(i128 (sextloadi64 bdxaddr12only:$addr)),
+  def : Pat<(i128 (z_sextloadi64 bdxaddr12only:$addr)),
             (VSRAB (VLREPG bdxaddr12only:$addr), (VREPIB 64))>;
 }
 
diff --git a/llvm/lib/Target/SystemZ/SystemZOperators.td b/llvm/lib/Target/SystemZ/SystemZOperators.td
index d98bb886c18506..6c4e33a6aa7fa8 100644
--- a/llvm/lib/Target/SystemZ/SystemZOperators.td
+++ b/llvm/lib/Target/SystemZ/SystemZOperators.td
@@ -534,37 +534,108 @@ def zext8  : PatFrag<(ops node:$src), (and node:$src, 0xff)>;
 def zext16 : PatFrag<(ops node:$src), (and node:$src, 0xffff)>;
 def zext32 : PatFrag<(ops node:$src), (zext (i32 node:$src))>;
 
-// Extending loads in which the extension type can be signed.
-def asextload : PatFrag<(ops node:$ptr), (unindexedload node:$ptr), [{
-  unsigned Type = cast<LoadSDNode>(N)->getExtensionType();
-  return Type == ISD::EXTLOAD || Type == ISD::SEXTLOAD;
+// Match a load or a non-extending atomic load.
+def z_load : PatFrags<(ops node:$ptr),
+                      [(load node:$ptr),
+                       (atomic_load node:$ptr)], [{
+  if (auto *AL = dyn_cast<AtomicSDNode>(N))
+    if (AL->getExtensionType() != ISD::NON_EXTLOAD)
+      return false;
+  return true;
 }]>;
-def asextloadi8 : PatFrag<(ops node:$ptr), (asextload node:$ptr), [{
-  return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i8;
+
+// Sign extending (atomic) loads.
+def z_sextload : PatFrags<(ops node:$ptr),
+                          [(unindexedload node:$ptr),
+                           (atomic_load node:$ptr)], [{
+  return getLoadExtType(N) == ISD::SEXTLOAD;
 }]>;
-def asextloadi16 : PatFrag<(ops node:$ptr), (asextload node:$ptr), [{
-  return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i16;
+def z_sextloadi8 : PatFrag<(ops node:$ptr), (z_sextload node:$ptr), [{
+  return cast<MemSDNode>(N)->getMemoryVT() == MVT::i8;
 }]>;
-def asextloadi32 : PatFrag<(ops node:$ptr), (asextload node:$ptr), [{
-  return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i32;
+def z_sextloadi16 : PatFrag<(ops node:$ptr), (z_sextload node:$ptr), [{
+  return cast<MemSDNode>(N)->getMemoryVT() == MVT::i16;
+}]>;
+def z_sextloadi32 : PatFrag<(ops node:$ptr), (z_sextload node:$ptr), [{
+  return cast<MemSDNode>(N)->getMemoryVT() == MVT::i32;
+}]>;
+def z_sextloadi64 : PatFrag<(ops node:$ptr), (z_sextload node:$ptr), [{
+  return cast<MemSDNode>(N)->getMemoryVT() == MVT::i64;
 }]>;
 
-// Extending loads in which the extension type can be unsigned.
-def azextload : PatFrag<(ops node:$ptr), (unindexedload node:$ptr), [{
-  unsigned Type = cast<LoadSDNode>(N)->getExtensionType();
-  return Type == ISD::EXTLOAD || Type == ISD::ZEXTLOAD;
+// Zero extending (atomic) loads.
+def z_zextload : PatFrags<(ops node:$ptr),
+                          [(unindexedload node:$ptr),
+                           (atomic_load node:$ptr)], [{
+  return getLoadExtType(N) == ISD::ZEXTLOAD;
 }]>;
-def azextloadi8 : PatFrag<(ops node:$ptr), (azextload node:$ptr), [{
-  return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i8;
+def z_zextloadi8 : PatFrag<(ops node:$ptr), (z_zextload node:$ptr), [{
+  return cast<MemSDNode>(N)->getMemoryVT() == MVT::i8;
 }]>;
-def azextloadi16 : PatFrag<(ops node:$ptr), (azextload node:$ptr), [{
-  return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i16;
+def z_zextloadi16 : PatFrag<(ops node:$ptr), (z_zextload node:$ptr), [{
+  return cast<MemSDNode>(N)->getMemoryVT() == MVT::i16;
 }]>;
-def azextloadi32 : PatFrag<(ops node:$ptr), (azextload node:$ptr), [{
-  return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i32;
+def z_zextloadi32 : PatFrag<(ops node:$ptr), (z_zextload node:$ptr), [{
+  return cast<MemSDNode>(N)->getMemoryVT() == MVT::i32;
+}]>;
+def z_zextloadi64 : PatFrag<(ops node:$ptr), (z_zextload node:$ptr), [{
+  return cast<MemSDNode>(N)->getMemoryVT() == MVT::i64;
+}]>;
+
+// Extending (atomic) loads in which the extension type can be signed.
+def z_asextload : PatFrags<(ops node:$ptr),
+                           [(unindexedload node:$ptr),
+                            (atomic_load node:$ptr)], [{
+  ISD::LoadExtType ETy = getLoadExtType(N);
+  return ETy == ISD::EXTLOAD || ETy == ISD::SEXTLOAD;
+}]>;
+def z_asextloadi8 : PatFrag<(ops node:$ptr), (z_asextload node:$ptr), [{
+  return cast<MemSDNode>(N)->getMemoryVT() == MVT::i8;
+}]>;
+def z_asextloadi16 : PatFrag<(ops node:$ptr), (z_asextload node:$ptr), [{
+  return cast<MemSDNode>(N)->getMemoryVT() == MVT::i16;
+}]>;
+def z_asextloadi32 : PatFrag<(ops node:$ptr), (z_asextload node:$ptr), [{
+  return cast<MemSDNode>(N)->getMemoryVT() == MVT::i32;
 }]>;
 
-// Extending loads in which the extension type doesn't matter.
+// Extending (atomic) loads in which the extension type can be unsigned.
+def z_azextload : PatFrags<(ops node:$ptr),
+                           [(unindexedload node:$ptr),
+                           (atomic_load node:$ptr)], [{
+  ISD::LoadExtType ETy = getLoadExtType(N);
+  return ETy == ISD::EXTLOAD || ETy == ISD::ZEXTLOAD;
+}]>;
+def z_azextloadi8 : PatFrag<(ops node:$ptr), (z_azextload node:$ptr), [{
+  return cast<MemSDNode>(N)->getMemoryVT() == MVT::i8;
+}]>;
+def z_azextloadi16 : PatFrag<(ops node:$ptr), (z_azextload node:$ptr), [{
+  return cast<MemSDNode>(N)->getMemoryVT() == MVT::i16;
+}]>;
+def z_azextloadi32 : PatFrag<(ops node:$ptr), (z_azextload node:$ptr), [{
+  return cast<MemSDNode>(N)->getMemoryVT() == MVT::i32;
+}]>;
+
+// Extending (atomic) loads in which the extension type doesn't matter.
+def z_anyextload : PatFrags<(ops node:$ptr),
+                            [(unindexedload node:$ptr),
+                             (atomic_load node:$ptr)], [{
+  return getLoadExtType(N) != ISD::NON_EXTLOAD;
+}]>;
+def z_anyextloadi8 : PatFrag<(ops node:$ptr), (z_anyextload node:$ptr), [{
+  return cast<MemSDNode>(N)->getMemoryVT() == MVT::i8;
+}]>;
+def z_anyextloadi16 : PatFrag<(ops node:$ptr), (z_anyextload node:$ptr), [{
+  return cast<MemSDNode>(N)->getMemoryVT() == MVT::i16;
+}]>;
+def z_anyextloadi32 : PatFrag<(ops node:$ptr), (z_anyextload node:$ptr), [{
+  return cast<MemSDNode>(N)->getMemoryVT() == MVT::i32;
+}]>;
+def z_anyextloadi64 : PatFrag<(ops node:$ptr), (z_anyextload node:$ptr), [{
+  return cast<MemSDNode>(N)->getMemoryVT() == MVT::i64;
+}]>;
+
+// Extending non-atomic loads in which the extension type doesn't matter.
 def anyextload : PatFrag<(ops node:$ptr), (unindexedload node:$ptr), [{
   return cast<LoadSDNode>(N)->getExtensionType() != ISD::NON_EXTLOAD;
 }]>;
@@ -578,15 +649,42 @@ def anyextloadi32 : PatFrag<(ops node:$ptr), (anyextload node:$ptr), [{
   return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i32;
 }]>;
 
+// Extending (atomic) loads that are not sign/zero extending.
+def z_extload : PatFrags<(ops node:$ptr),
+                         [(extload node:$ptr),
+                          (atomic_load node:$ptr)], [{
+  return getLoadExtType(N) == ISD::EXTLOAD;
+}]>;
+def z_extloadi8 : PatFrag<(ops node:$ptr), (z_extload node:$ptr), [{
+  return cast<MemSDNode>(N)->getMemoryVT() == MVT::i8;
+}]>;
+def z_extloadi16 : PatFrag<(ops node:$ptr), (z_extload node:$ptr), [{
+  return cast<MemSDNode>(N)->getMemoryVT() == MVT::i16;
+}]>;
+def z_extloadi32 : PatFrag<(ops node:$ptr), (z_extload node:$ptr), [{
+  return cast<MemSDNode>(N)->getMemoryVT() == MVT::i32;
+}]>;
+def z_extloadi64 : PatFrag<(ops node:$ptr), (z_extload node:$ptr), [{
+  return cast<MemSDNode>(N)->getMemoryVT() == MVT::i64;
+}]>;
+
+// Extending atomic FP loads.
+def z_any_extloadf32 : PatFrags<(ops node:$ptr),
+                                [(any_extloadf32 node:$ptr),
+                                 (any_fpextend (f32 (atomic_load node:$ptr)))]>;
+def z_any_extloadf64 : PatFrags<(ops node:$ptr),
+                                [(any_extloadf64 node:$ptr),
+                                 (any_fpextend (f64 (atomic_load node:$ptr)))]>;
+
 // Aligned loads.
 class AlignedLoad<SDPatternOperator load>
   : PatFrag<(ops node:$addr), (load node:$addr),
   [{ return storeLoadIsAligned(N); }]>;
-def aligned_load         : AlignedLoad<load>;
-def aligned_asextloadi16 : AlignedLoad<asextloadi16>;
-def aligned_asextloadi32 : AlignedLoad<asextloadi32>;
-def aligned_azextloadi16 : AlignedLoad<azextloadi16>;
-def aligned_azextloadi32 : AlignedLoad<azextloadi32>;
+def aligned_z_load         : AlignedLoad<z_load>;
+def aligned_z_asextloadi16 : AlignedLoad<z_asextloadi16>;
+def aligned_z_asextloadi32 : AlignedLoad<z_asextloadi32>;
+def aligned_z_azextloadi16 : AlignedLoad<z_azextloadi16>;
+def aligned_z_azextloadi32 : AlignedLoad<z_azextloadi32>;
 
 // Aligned stores.
 class AlignedStore<SDPatternOperator store>
@@ -749,7 +847,7 @@ def z_any_vround  : PatFrags<(ops node:$src),
 
 // Create a unary operator that loads from memory and then performs
 // the given operation on it.
-class loadu<SDPatternOperator operator, SDPatternOperator load = load>
+class loadu<SDPatternOperator operator, SDPatternOperator load = z_load>
   : PatFrag<(ops node:$addr), (operator (load node:$addr))>;
 
 // Create a store operator that performs the given unary operation
@@ -799,12 +897,12 @@ def imm32nobytes : PatLeaf<(i32 imm), [{
 class z_replicate_load<ValueType scalartype, SDPatternOperator load>
   : PatFrag<(ops node:$addr),
             (z_replicate (scalartype (load node:$addr)))>;
-def z_replicate_loadi8  : z_replicate_load<i32, anyextloadi8>;
-def z_replicate_loadi16 : z_replicate_load<i32, anyextloadi16>;
-def z_replicate_loadi32 : z_replicate_load<i32, load>;
-def z_replicate_loadi64 : z_replicate_load<i64, load>;
-def z_replicate_loadf32 : z_replicate_load<f32, load>;
-def z_replicate_loadf64 : z_replicate_load<f64, load>;
+def z_replicate_loadi8  : z_replicate_load<i32, z_anyextloadi8>;
+def z_replicate_loadi16 : z_replicate_load<i32, z_anyextloadi16>;
+def z_replicate_loadi32 : z_replicate_load<i32, z_load>;
+def z_replicate_loadi64 : z_replicate_load<i64, z_load>;
+def z_replicate_loadf32 : z_replicate_load<f32, z_load>;
+def z_replicate_loadf64 : z_replicate_load<f64, z_load>;
 // Byte-swapped replicated vector element loads.
 def z_replicate_loadbswapi16 : z_replicate_load<i32, z_loadbswap16>;
 def z_replicate_loadbswapi32 : z_replicate_load<i32, z_loadbswap32>;
@@ -815,12 +913,12 @@ class z_vle<ValueType scalartype, SDPatternOperator load>
   : PatFrag<(ops node:$vec, node:$addr, node:$index),
             (z_vector_insert node:$vec, (scalartype (load node:$addr)),
                              node:$index)>;
-def z_vlei8  : z_vle<i32, anyextloadi8>;
-def z_vlei16 : z_vle<i32, anyextloadi16>;
-def z_vlei32 : z_vle<i32, load>;
-def z_vlei64 : z_vle<i64, load>;
-def z_vlef32 : z_vle<f32, load>;
-def z_vlef64 : z_vle<f64, load>;
+def z_vlei8  : z_vle<i32, z_anyextloadi8>;
+def z_vlei16 : z_vle<i32, z_anyextloadi16>;
+def z_vlei32 : z_vle<i32, z_load>;
+def z_vlei64 : z_vle<i64, z_load>;
+def z_vlef32 : z_vle<f32, z_load>;
+def z_vlef64 : z_vle<f64, z_load>;
 // Byte-swapped vector element loads.
 def z_vlebri16 : z_vle<i32, z_loadbswap16>;
 def z_vlebri32 : z_vle<i32, z_loadbswap32>;
@@ -832,13 +930,13 @@ class z_vllez<ValueType scalartype, SDPatternOperator load, int index>
   : PatFrag<(ops node:$addr),
             (z_vector_insert immAllZerosV,
                              (scalartype (load node:$addr)), (i32 index))>;
-def z_vllezi8  : z_vllez<i32, anyextloadi8, 7>;
-def z_vllezi16 : z_vllez<i32, anyextloadi16, 3>;
-def z_vllezi32 : z_vllez<i32, load, 1>;
+def z_vllezi8  : z_vllez<i32, z_anyextloadi8, 7>;
+def z_vllezi16 : z_vllez<i32, z_anyextloadi16, 3>;
+def z_vllezi32 : z_vllez<i32, z_load, 1>;
 def z_vllezi64 : PatFrags<(ops node:$addr),
                           [(z_vector_insert immAllZerosV,
-                                            (i64 (load node:$addr)), (i32 0)),
-                           (z_join_dwords (i64 (load node:$addr)), (i64 0))]>;
+                                            (i64 (z_load node:$addr)), (i32 0)),
+                           (z_join_dwords (i64 (z_load node:$addr)), (i64 0))]>;
 // We use high merges to form a v4f32 from four f32s.  Propagating zero
 // into all elements but index 1 gives this expression.
 def z_vllezf32 : PatFrag<(ops node:$addr),
@@ -848,23 +946,23 @@ def z_vllezf32 : PatFrag<(ops node:$addr),
                             (v4i32
                              (bitconvert
                               (v4f32 (scalar_to_vector
-                                      (f32 (load node:$addr)))))))),
+                                      (f32 (z_load node:$addr)))))))),
                           (v2i64
                            (bitconvert (v4f32 immAllZerosV))))>;
 def z_vllezf64 : PatFrag<(ops node:$addr),
                          (z_merge_high
-                          (v2f64 (scalar_to_vector (f64 (load node:$addr)))),
+                          (v2f64 (scalar_to_vector (f64 (z_load node:$addr)))),
                           immAllZerosV)>;
 
 // Similarly for the high element of a zeroed vector.
-def z_vllezli32 : z_vllez<i32, load, 0>;
+def z_vllezli32 : z_vllez<i32, z_load, 0>;
 def z_vllezlf32 : PatFrag<(ops node:$addr),
                           (z_merge_high
                            (v2i64
                             (bitconvert
                              (z_merge_high
                               (v4f32 (scalar_to_vector
-                                      (f32 (load node:$addr)))),
+                                      (f32 (z_load node:$addr)))),
                               (v4f32 immAllZerosV)))),
                            (v2i64
                             (bitconvert (v4f32 immAllZerosV))))>;
diff --git a/llvm/lib/Target/SystemZ/SystemZPatterns.td b/llvm/lib/Target/SystemZ/SystemZPatterns.td
index 5e5dca77e9553b..4d6bc68e9a7edd 100644
--- a/llvm/lib/Target/SystemZ/SystemZPatterns.td
+++ b/llvm/lib/Target/SystemZ/SystemZPatterns.td
@@ -49,8 +49,8 @@ class RMWI<SDPatternOperator load, SDPatternOperator operator,
 // memory location.  IMM is the type of the second operand.
 multiclass RMWIByte<SDPatternOperator operator, AddressingMode mode,
                     Instruction insn> {
-  def : RMWI<anyextloadi8, operator, truncstorei8, mode, imm32, insn>;
-  def : RMWI<anyextloadi8, operator, truncstorei8, mode, imm64, insn>;
+  def : RMWI<z_anyextloadi8, operator, truncstorei8, mode, imm32, insn>;
+  def : RMWI<z_anyextloadi8, operator, truncstorei8, mode, imm64, insn>;
 }
 
 // Record that INSN performs insertion TYPE into a register of class CLS.
diff --git a/llvm/lib/Target/VE/VEInstrInfo.td b/llvm/lib/Target/VE/VEInstrInfo.td
index 1e548d7c101a7a..cbad5a0eafb27f 100644
--- a/llvm/lib/Target/VE/VEInstrInfo.td
+++ b/llvm/lib/Target/VE/VEInstrInfo.td
@@ -1785,14 +1785,14 @@ defm : TRUNC64m<truncstorei8, ST1Brri, ST1Brii, ST1Bzri, ST1Bzii>;
 defm : TRUNC64m<truncstorei16, ST2Brri, ST2Brii, ST2Bzri, ST2Bzii>;
 defm : TRUNC64m<truncstorei32, STLrri, STLrii, STLzri, ST1Bzii>;
 
-// Atomic loads
+// Atomic loads (FIXME: replace iAny with the correct integer VT:)
 multiclass ATMLDm<SDPatternOperator from,
                   RM torri, RM torii,
                   RM tozri, RM tozii> {
-  def : Pat<(from ADDRrri:$addr), (torri MEMrri:$addr)>;
-  def : Pat<(from ADDRrii:$addr), (torii MEMrii:$addr)>;
-  def : Pat<(from ADDRzri:$addr), (tozri MEMzri:$addr)>;
-  def : Pat<(from ADDRzii:$addr), (tozii MEMzii:$addr)>;
+  def : Pat<(iAny (from ADDRrri:$addr)), (torri MEMrri:$addr)>;
+  def : Pat<(iAny (from ADDRrii:$addr)), (torii MEMrii:$addr)>;
+  def : Pat<(iAny (from ADDRzri:$addr)), (tozri MEMzri:$addr)>;
+  def : Pat<(iAny (from ADDRzii:$addr)), (tozii MEMzii:$addr)>;
 }
 defm : ATMLDm<atomic_load_8, LD1BZXrri, LD1BZXrii, LD1BZXzri, LD1BZXzii>;
 defm : ATMLDm<atomic_load_16, LD2BZXrri, LD2BZXrii, LD2BZXzri, LD2BZXzii>;
diff --git a/llvm/lib/Target/WebAssembly/Utils/WebAssemblyTypeUtilities.cpp b/llvm/lib/Target/WebAssembly/Utils/WebAssemblyTypeUtilities.cpp
index 86fb99cc98a905..fac2e0d935f5ab 100644
--- a/llvm/lib/Target/WebAssembly/Utils/WebAssemblyTypeUtilities.cpp
+++ b/llvm/lib/Target/WebAssembly/Utils/WebAssemblyTypeUtilities.cpp
@@ -63,7 +63,7 @@ wasm::ValType WebAssembly::toValType(MVT Type) {
 }
 
 void WebAssembly::wasmSymbolSetType(MCSymbolWasm *Sym, const Type *GlobalVT,
-                                    const ArrayRef<MVT> &VTs) {
+                                    ArrayRef<MVT> VTs) {
   assert(!Sym->getType());
 
   // Tables are represented as Arrays in LLVM IR therefore
diff --git a/llvm/lib/Target/WebAssembly/Utils/WebAssemblyTypeUtilities.h b/llvm/lib/Target/WebAssembly/Utils/WebAssemblyTypeUtilities.h
index a184ecb9a465a5..87660947e7de11 100644
--- a/llvm/lib/Target/WebAssembly/Utils/WebAssemblyTypeUtilities.h
+++ b/llvm/lib/Target/WebAssembly/Utils/WebAssemblyTypeUtilities.h
@@ -60,7 +60,7 @@ wasm::ValType toValType(MVT Type);
 
 /// Sets a Wasm Symbol Type.
 void wasmSymbolSetType(MCSymbolWasm *Sym, const Type *GlobalVT,
-                       const ArrayRef<MVT> &VTs);
+                       ArrayRef<MVT> VTs);
 
 } // end namespace WebAssembly
 } // end namespace llvm
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
index fb949d4b19a3e1..03897b551ecaee 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
@@ -264,7 +264,7 @@ MCSymbol *WebAssemblyAsmPrinter::getOrCreateWasmSymbol(StringRef Name) {
     Params.push_back(AddrType);
   } else { // Function symbols
     WasmSym->setType(wasm::WASM_SYMBOL_TYPE_FUNCTION);
-    getLibcallSignature(Subtarget, Name, Returns, Params);
+    WebAssembly::getLibcallSignature(Subtarget, Name, Returns, Params);
   }
   auto Signature = std::make_unique<wasm::WasmSignature>(std::move(Returns),
                                                          std::move(Params));
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp
index 77e6640d5a8224..0427fe473f8e16 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp
@@ -1283,9 +1283,9 @@ bool WebAssemblyLowerEmscriptenEHSjLj::runSjLjOnFunction(Function &F) {
   DebugLoc FirstDL = getOrCreateDebugLoc(&*Entry->begin(), F.getSubprogram());
   SplitBlock(Entry, &*Entry->getFirstInsertionPt());
 
-  BinaryOperator *SetjmpTableSize =
-      BinaryOperator::Create(Instruction::Add, IRB.getInt32(4), IRB.getInt32(0),
-                             "setjmpTableSize", Entry->getTerminator());
+  BinaryOperator *SetjmpTableSize = BinaryOperator::Create(
+      Instruction::Add, IRB.getInt32(4), IRB.getInt32(0), "setjmpTableSize",
+      Entry->getTerminator()->getIterator());
   SetjmpTableSize->setDebugLoc(FirstDL);
   // setjmpTable = (int *) malloc(40);
   Type *IntPtrTy = getAddrIntType(&M);
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyLowerRefTypesIntPtrConv.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyLowerRefTypesIntPtrConv.cpp
index e0a21921122858..2594430d1d8f3a 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyLowerRefTypesIntPtrConv.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyLowerRefTypesIntPtrConv.cpp
@@ -73,7 +73,7 @@ bool WebAssemblyLowerRefTypesIntPtrConv::runOnFunction(Function &F) {
 
     Function *TrapIntrin =
         Intrinsic::getDeclaration(F.getParent(), Intrinsic::debugtrap);
-    CallInst::Create(TrapIntrin, {}, "", &*I);
+    CallInst::Create(TrapIntrin, {}, "", I->getIterator());
 
     worklist.insert(&*I);
   }
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp
index 1e959111a4dbcb..d17394eede7725 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp
@@ -105,7 +105,7 @@ void llvm::computeSignatureVTs(const FunctionType *Ty,
   }
 }
 
-void llvm::valTypesFromMVTs(const ArrayRef<MVT> &In,
+void llvm::valTypesFromMVTs(ArrayRef<MVT> In,
                             SmallVectorImpl<wasm::ValType> &Out) {
   for (MVT Ty : In)
     Out.push_back(WebAssembly::toValType(Ty));
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h b/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h
index fe18347ad8c1da..37059188a7614e 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h
@@ -169,8 +169,7 @@ void computeSignatureVTs(const FunctionType *Ty, const Function *TargetFunc,
                          SmallVectorImpl<MVT> &Params,
                          SmallVectorImpl<MVT> &Results);
 
-void valTypesFromMVTs(const ArrayRef<MVT> &In,
-                      SmallVectorImpl<wasm::ValType> &Out);
+void valTypesFromMVTs(ArrayRef<MVT> In, SmallVectorImpl<wasm::ValType> &Out);
 
 std::unique_ptr<wasm::WasmSignature>
 signatureFromMVTs(const SmallVectorImpl<MVT> &Results,
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyRegColoring.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyRegColoring.cpp
index c9ef17f928144d..acd984f2f8f882 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyRegColoring.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyRegColoring.cpp
@@ -129,7 +129,7 @@ buildVRegToDbgValueMap(MachineFunction &MF, const LiveIntervals *Liveness) {
 // changes.
 static void undefInvalidDbgValues(
     const LiveIntervals *Liveness,
-    const ArrayRef<SmallVector<LiveInterval *, 4>> &Assignments,
+    ArrayRef<SmallVector<LiveInterval *, 4>> Assignments,
     DenseMap<Register, std::vector<std::pair<SlotIndex, MachineInstr *>>>
         &DbgVRegToValues) {
 #ifndef NDEBUG
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp
index 3e2e029695ab6f..1896ac631d96e5 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp
@@ -524,10 +524,10 @@ struct StaticLibcallNameMap {
 
 } // end anonymous namespace
 
-void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget,
-                               RTLIB::Libcall LC,
-                               SmallVectorImpl<wasm::ValType> &Rets,
-                               SmallVectorImpl<wasm::ValType> &Params) {
+void WebAssembly::getLibcallSignature(const WebAssemblySubtarget &Subtarget,
+                                      RTLIB::Libcall LC,
+                                      SmallVectorImpl<wasm::ValType> &Rets,
+                                      SmallVectorImpl<wasm::ValType> &Params) {
   assert(Rets.empty());
   assert(Params.empty());
 
@@ -889,10 +889,10 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget,
 
 // TODO: If the RTLIB::Libcall-taking flavor of GetSignature remains unused
 // other than here, just roll its logic into this version.
-void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget,
-                               StringRef Name,
-                               SmallVectorImpl<wasm::ValType> &Rets,
-                               SmallVectorImpl<wasm::ValType> &Params) {
+void WebAssembly::getLibcallSignature(const WebAssemblySubtarget &Subtarget,
+                                      StringRef Name,
+                                      SmallVectorImpl<wasm::ValType> &Rets,
+                                      SmallVectorImpl<wasm::ValType> &Params) {
   static StaticLibcallNameMap LibcallNameMap;
   auto &Map = LibcallNameMap.Map;
   auto Val = Map.find(Name);
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.h b/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.h
index f7a94aa20bd463..ff6515d5bf4e67 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.h
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.h
@@ -22,16 +22,18 @@ namespace llvm {
 
 class WebAssemblySubtarget;
 
-extern void getLibcallSignature(const WebAssemblySubtarget &Subtarget,
-                                RTLIB::Libcall LC,
-                                SmallVectorImpl<wasm::ValType> &Rets,
-                                SmallVectorImpl<wasm::ValType> &Params);
+namespace WebAssembly {
 
-extern void getLibcallSignature(const WebAssemblySubtarget &Subtarget,
-                                StringRef Name,
-                                SmallVectorImpl<wasm::ValType> &Rets,
-                                SmallVectorImpl<wasm::ValType> &Params);
+void getLibcallSignature(const WebAssemblySubtarget &Subtarget,
+                         RTLIB::Libcall LC,
+                         SmallVectorImpl<wasm::ValType> &Rets,
+                         SmallVectorImpl<wasm::ValType> &Params);
 
+void getLibcallSignature(const WebAssemblySubtarget &Subtarget, StringRef Name,
+                         SmallVectorImpl<wasm::ValType> &Rets,
+                         SmallVectorImpl<wasm::ValType> &Params);
+
+} // end namespace WebAssembly
 } // end namespace llvm
 
 #endif
diff --git a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
index e3701d69934c08..6401df9f49f033 100644
--- a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
+++ b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
@@ -4068,7 +4068,7 @@ unsigned X86AsmParser::checkTargetMatchPredicate(MCInst &Inst) {
 
   if (UseApxExtendedReg && !X86II::canUseApxExtendedReg(MCID))
     return Match_Unsupported;
-  if (ForcedNoFlag != !!(MCID.TSFlags & X86II::EVEX_NF))
+  if (ForcedNoFlag == !(MCID.TSFlags & X86II::EVEX_NF) && !X86::isCFCMOVCC(Opc))
     return Match_Unsupported;
 
   if (ForcedVEXEncoding == VEXEncoding_EVEX &&
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h b/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
index 65b8ebc0b9b994..c0a75e215a40b5 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
@@ -545,6 +545,14 @@ enum : uint64_t {
   /// PrefixByte - This form is used for instructions that represent a prefix
   /// byte like data16 or rep.
   PrefixByte = 10,
+  /// MRMDestRegCC - This form is used for the cfcmov instructions, which use
+  /// the Mod/RM byte to specify the operands reg(r/m) and reg(reg) and also
+  /// encodes a condition code.
+  MRMDestRegCC = 18,
+  /// MRMDestMemCC - This form is used for the cfcmov instructions, which use
+  /// the Mod/RM byte to specify the operands mem(r/m) and reg(reg) and also
+  /// encodes a condition code.
+  MRMDestMemCC = 19,
   /// MRMDestMem4VOp3CC - This form is used for instructions that use the Mod/RM
   /// byte to specify a destination which in this case is memory and operand 3
   /// with VEX.VVVV, and also encodes a condition code.
@@ -1032,6 +1040,7 @@ inline int getMemoryOperandNo(uint64_t TSFlags) {
     return -1;
   case X86II::MRMDestMem:
   case X86II::MRMDestMemFSIB:
+  case X86II::MRMDestMemCC:
     return hasNewDataDest(TSFlags);
   case X86II::MRMSrcMem:
   case X86II::MRMSrcMemFSIB:
@@ -1045,11 +1054,13 @@ inline int getMemoryOperandNo(uint64_t TSFlags) {
     // Skip registers encoded in reg, VEX_VVVV, and I8IMM.
     return 3;
   case X86II::MRMSrcMemCC:
+    return 1 + hasNewDataDest(TSFlags);
   case X86II::MRMDestMem4VOp3CC:
     // Start from 1, skip any registers encoded in VEX_VVVV or I8IMM, or a
     // mask register.
     return 1;
   case X86II::MRMDestReg:
+  case X86II::MRMDestRegCC:
   case X86II::MRMSrcReg:
   case X86II::MRMSrcReg4VOp3:
   case X86II::MRMSrcRegOp4:
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp
index 29a1866bf01ab0..21c1556d1d8ed2 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp
@@ -394,7 +394,7 @@ void X86InstPrinterCommon::printInstFlags(const MCInst *MI, raw_ostream &O,
   else if (Flags & X86::IP_HAS_REPEAT)
     O << "\trep\t";
 
-  if (TSFlags & X86II::EVEX_NF)
+  if (TSFlags & X86II::EVEX_NF && !X86::isCFCMOVCC(MI->getOpcode()))
     O << "\t{nf}";
 
   // These all require a pseudo prefix
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
index dcd0551b7b7653..92a14226a0dc05 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
@@ -1071,6 +1071,7 @@ X86MCCodeEmitter::emitVEXOpcodePrefix(int MemOperand, const MCInst &MI,
   case X86II::MRM_C0:
   case X86II::RawFrm:
     break;
+  case X86II::MRMDestMemCC:
   case X86II::MRMDestMemFSIB:
   case X86II::MRMDestMem: {
     // MRMDestMem instructions forms:
@@ -1102,6 +1103,7 @@ X86MCCodeEmitter::emitVEXOpcodePrefix(int MemOperand, const MCInst &MI,
     }
     break;
   }
+  case X86II::MRMSrcMemCC:
   case X86II::MRMSrcMemFSIB:
   case X86II::MRMSrcMem: {
     // MRMSrcMem instructions forms:
@@ -1180,6 +1182,7 @@ X86MCCodeEmitter::emitVEXOpcodePrefix(int MemOperand, const MCInst &MI,
     }
     break;
   }
+  case X86II::MRMSrcRegCC:
   case X86II::MRMSrcReg: {
     // MRMSrcReg instructions forms:
     //  dst(ModR/M), src1(VEX_4V), src2(ModR/M), src3(Imm[7:4])
@@ -1242,6 +1245,7 @@ X86MCCodeEmitter::emitVEXOpcodePrefix(int MemOperand, const MCInst &MI,
     ++CurOp;
     break;
   }
+  case X86II::MRMDestRegCC:
   case X86II::MRMDestReg: {
     // MRMDestReg instructions forms:
     //  dst(ModR/M), src(ModR/M)
@@ -1638,6 +1642,15 @@ void X86MCCodeEmitter::encodeInstruction(const MCInst &MI,
     CurOp = SrcRegNum + 1;
     break;
   }
+  case X86II::MRMDestRegCC: {
+    unsigned FirstOp = CurOp++;
+    unsigned SecondOp = CurOp++;
+    unsigned CC = MI.getOperand(CurOp++).getImm();
+    emitByte(BaseOpcode + CC, CB);
+    emitRegModRMByte(MI.getOperand(FirstOp),
+                     getX86RegNum(MI.getOperand(SecondOp)), CB);
+    break;
+  }
   case X86II::MRMDestMem4VOp3CC: {
     unsigned CC = MI.getOperand(8).getImm();
     emitByte(BaseOpcode + CC, CB);
@@ -1667,6 +1680,16 @@ void X86MCCodeEmitter::encodeInstruction(const MCInst &MI,
     CurOp = SrcRegNum + 1;
     break;
   }
+  case X86II::MRMDestMemCC: {
+    unsigned MemOp = CurOp;
+    CurOp = MemOp + X86::AddrNumOperands;
+    unsigned RegOp = CurOp++;
+    unsigned CC = MI.getOperand(CurOp++).getImm();
+    emitByte(BaseOpcode + CC, CB);
+    emitMemModRMByte(MI, MemOp, getX86RegNum(MI.getOperand(RegOp)), TSFlags,
+                     Kind, StartByte, CB, Fixups, STI);
+    break;
+  }
   case X86II::MRMSrcReg: {
     emitByte(BaseOpcode, CB);
     unsigned SrcRegNum = CurOp + 1;
@@ -1717,6 +1740,8 @@ void X86MCCodeEmitter::encodeInstruction(const MCInst &MI,
     break;
   }
   case X86II::MRMSrcRegCC: {
+    if (IsND) // Skip new data destination
+      ++CurOp;
     unsigned FirstOp = CurOp++;
     unsigned SecondOp = CurOp++;
 
@@ -1778,6 +1803,8 @@ void X86MCCodeEmitter::encodeInstruction(const MCInst &MI,
     break;
   }
   case X86II::MRMSrcMemCC: {
+    if (IsND) // Skip new data destination
+      ++CurOp;
     unsigned RegOp = CurOp++;
     unsigned FirstMemOp = CurOp;
     CurOp = FirstMemOp + X86::AddrNumOperands;
diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
index 8367f938c0ddfa..78bc043911f2fc 100644
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -683,6 +683,12 @@ def TuningFastGather
     : SubtargetFeature<"fast-gather", "HasFastGather", "true",
                        "Indicates if gather is reasonably fast (this is true for Skylake client and all AVX-512 CPUs)">;
 
+// Generate vpdpwssd instead of vpmaddwd+vpaddd sequence.
+def TuningFastDPWSSD
+    : SubtargetFeature<
+          "fast-dpwssd", "HasFastDPWSSD", "true",
+          "Prefer vpdpwssd instruction over vpmaddwd+vpaddd instruction sequence">;
+
 def TuningPreferNoGather
     : SubtargetFeature<"prefer-no-gather", "PreferGather", "false",
                        "Prefer no gather instructions">;
@@ -1502,7 +1508,11 @@ def ProcessorFeatures {
     !listconcat(ZN2Tuning, ZN3AdditionalTuning);
   list<SubtargetFeature> ZN3Features =
     !listconcat(ZN2Features, ZN3AdditionalFeatures);
-  list<SubtargetFeature> ZN4Tuning = ZN3Tuning;
+
+
+  list<SubtargetFeature> ZN4AdditionalTuning = [TuningFastDPWSSD];
+  list<SubtargetFeature> ZN4Tuning =
+    !listconcat(ZN3Tuning, ZN4AdditionalTuning);
   list<SubtargetFeature> ZN4AdditionalFeatures = [FeatureAVX512,
                                                   FeatureEVEX512,
                                                   FeatureCDI,
diff --git a/llvm/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp b/llvm/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp
index 04931afdec51c3..a72eeb53915d65 100644
--- a/llvm/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp
+++ b/llvm/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp
@@ -521,8 +521,8 @@ bool X86AvoidSFBPass::alias(const MachineMemOperand &Op1,
     return true;
 
   int64_t MinOffset = std::min(Op1.getOffset(), Op2.getOffset());
-  int64_t Overlapa = Op1.getSize() + Op1.getOffset() - MinOffset;
-  int64_t Overlapb = Op2.getSize() + Op2.getOffset() - MinOffset;
+  int64_t Overlapa = Op1.getSize().getValue() + Op1.getOffset() - MinOffset;
+  int64_t Overlapb = Op2.getSize().getValue() + Op2.getOffset() - MinOffset;
 
   return !AA->isNoAlias(
       MemoryLocation(Op1.getValue(), Overlapa, Op1.getAAInfo()),
@@ -688,7 +688,7 @@ bool X86AvoidSFBPass::runOnMachineFunction(MachineFunction &MF) {
           !isRelevantAddressingMode(PBInst) || !PBInst->hasOneMemOperand())
         continue;
       int64_t PBstDispImm = getDispOperand(PBInst).getImm();
-      unsigned PBstSize = (*PBInst->memoperands_begin())->getSize();
+      unsigned PBstSize = (*PBInst->memoperands_begin())->getSize().getValue();
       // This check doesn't cover all cases, but it will suffice for now.
       // TODO: take branch probability into consideration, if the blocking
       // store is in an unreached block, breaking the memcopy could lose
diff --git a/llvm/lib/Target/X86/X86FastISel.cpp b/llvm/lib/Target/X86/X86FastISel.cpp
index 48d3b68b1823a5..2eae155956368f 100644
--- a/llvm/lib/Target/X86/X86FastISel.cpp
+++ b/llvm/lib/Target/X86/X86FastISel.cpp
@@ -2133,7 +2133,8 @@ bool X86FastISel::X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I) {
     return false;
 
   const TargetRegisterInfo &TRI = *Subtarget->getRegisterInfo();
-  unsigned Opc = X86::getCMovOpcode(TRI.getRegSizeInBits(*RC)/8);
+  unsigned Opc = X86::getCMovOpcode(TRI.getRegSizeInBits(*RC) / 8, false,
+                                    Subtarget->hasNDD());
   Register ResultReg = fastEmitInst_rri(Opc, RC, RHSReg, LHSReg, CC);
   updateValueMap(I, ResultReg);
   return true;
diff --git a/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp b/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp
index af25b34fbab995..d96613d7bb7efc 100644
--- a/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp
+++ b/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp
@@ -604,7 +604,8 @@ bool X86FlagsCopyLoweringPass::runOnMachineFunction(MachineFunction &MF) {
         }
 
         // Otherwise we can just rewrite in-place.
-        if (X86::getCondFromCMov(MI) != X86::COND_INVALID) {
+        if (X86::getCondFromCMov(MI) != X86::COND_INVALID ||
+            X86::getCondFromCFCMov(MI) != X86::COND_INVALID) {
           rewriteCMov(*TestMBB, TestPos, TestLoc, MI, *FlagUse, CondRegs);
         } else if (getCondFromFCMOV(MI.getOpcode()) != X86::COND_INVALID) {
           rewriteFCMov(*TestMBB, TestPos, TestLoc, MI, *FlagUse, CondRegs);
@@ -829,7 +830,9 @@ void X86FlagsCopyLoweringPass::rewriteCMov(MachineBasicBlock &TestMBB,
                                            MachineOperand &FlagUse,
                                            CondRegArray &CondRegs) {
   // First get the register containing this specific condition.
-  X86::CondCode Cond = X86::getCondFromCMov(CMovI);
+  X86::CondCode Cond = X86::getCondFromCMov(CMovI) == X86::COND_INVALID
+                           ? X86::getCondFromCFCMov(CMovI)
+                           : X86::getCondFromCMov(CMovI);
   unsigned CondReg;
   bool Inverted;
   std::tie(CondReg, Inverted) =
diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
index 76c6c1645239ab..4e4241efd63d6b 100644
--- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -3090,13 +3090,19 @@ bool X86DAGToDAGISel::selectLEAAddr(SDValue N,
 bool X86DAGToDAGISel::selectTLSADDRAddr(SDValue N, SDValue &Base,
                                         SDValue &Scale, SDValue &Index,
                                         SDValue &Disp, SDValue &Segment) {
-  assert(N.getOpcode() == ISD::TargetGlobalTLSAddress);
-  auto *GA = cast<GlobalAddressSDNode>(N);
+  assert(N.getOpcode() == ISD::TargetGlobalTLSAddress ||
+         N.getOpcode() == ISD::TargetExternalSymbol);
 
   X86ISelAddressMode AM;
-  AM.GV = GA->getGlobal();
-  AM.Disp += GA->getOffset();
-  AM.SymbolFlags = GA->getTargetFlags();
+  if (auto *GA = dyn_cast<GlobalAddressSDNode>(N)) {
+    AM.GV = GA->getGlobal();
+    AM.Disp += GA->getOffset();
+    AM.SymbolFlags = GA->getTargetFlags();
+  } else {
+    auto *SA = cast<ExternalSymbolSDNode>(N);
+    AM.ES = SA->getSymbol();
+    AM.SymbolFlags = SA->getTargetFlags();
+  }
 
   if (Subtarget->is32Bit()) {
     AM.Scale = 1;
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 2b5e3c0379a138..8a6594230f0a2c 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -18592,13 +18592,22 @@ GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
   MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
   SDLoc dl(GA);
-  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
-                                           GA->getValueType(0),
-                                           GA->getOffset(),
-                                           OperandFlags);
+  SDValue TGA;
+  bool UseTLSDESC = DAG.getTarget().useTLSDESC();
+  if (LocalDynamic && UseTLSDESC) {
+    TGA = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT, OperandFlags);
+    auto UI = TGA->use_begin();
+    // Reuse existing GetTLSADDR node if we can find it.
+    if (UI != TGA->use_end())
+      return SDValue(*UI->use_begin()->use_begin(), 0);
+  } else {
+    TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
+                                     GA->getOffset(), OperandFlags);
+  }
 
-  X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR
-                                           : X86ISD::TLSADDR;
+  X86ISD::NodeType CallType = UseTLSDESC     ? X86ISD::TLSDESC
+                              : LocalDynamic ? X86ISD::TLSBASEADDR
+                                             : X86ISD::TLSADDR;
 
   if (InGlue) {
     SDValue Ops[] = { Chain,  TGA, *InGlue };
@@ -18613,7 +18622,19 @@ GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
   MFI.setHasCalls(true);
 
   SDValue Glue = Chain.getValue(1);
-  return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Glue);
+  SDValue Ret = DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Glue);
+
+  if (!UseTLSDESC)
+    return Ret;
+
+  const X86Subtarget &Subtarget = DAG.getSubtarget<X86Subtarget>();
+  unsigned Seg = Subtarget.is64Bit() ? X86AS::FS : X86AS::GS;
+
+  Value *Ptr = Constant::getNullValue(PointerType::get(*DAG.getContext(), Seg));
+  SDValue Offset =
+      DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
+                  MachinePointerInfo(Ptr));
+  return DAG.getNode(ISD::ADD, dl, PtrVT, Ret, Offset);
 }
 
 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
@@ -33426,6 +33447,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   NODE_NAME_CASE(TLSADDR)
   NODE_NAME_CASE(TLSBASEADDR)
   NODE_NAME_CASE(TLSCALL)
+  NODE_NAME_CASE(TLSDESC)
   NODE_NAME_CASE(EH_SJLJ_SETJMP)
   NODE_NAME_CASE(EH_SJLJ_LONGJMP)
   NODE_NAME_CASE(EH_SJLJ_SETUP_DISPATCH)
@@ -35206,6 +35228,7 @@ X86TargetLowering::EmitLoweredTLSAddr(MachineInstr &MI,
   MachineFunction &MF = *BB->getParent();
 
   // Emit CALLSEQ_START right before the instruction.
+  BB->getParent()->getFrameInfo().setAdjustsStack(true);
   unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
   MachineInstrBuilder CallseqStart =
       BuildMI(MF, MIMD, TII.get(AdjStackDown)).addImm(0).addImm(0).addImm(0);
@@ -36206,6 +36229,8 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
   case X86::TLS_base_addr32:
   case X86::TLS_base_addr64:
   case X86::TLS_base_addrX32:
+  case X86::TLS_desc32:
+  case X86::TLS_desc64:
     return EmitLoweredTLSAddr(MI, BB);
   case X86::INDIRECT_THUNK_CALL32:
   case X86::INDIRECT_THUNK_CALL64:
@@ -42641,7 +42666,6 @@ SDValue X86TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
 bool X86TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode(
     SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
     bool PoisonOnly, unsigned Depth) const {
-  unsigned EltsBits = Op.getScalarValueSizeInBits();
   unsigned NumElts = DemandedElts.getBitWidth();
 
   // TODO: Add more target shuffles.
@@ -42649,15 +42673,28 @@ bool X86TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode(
   case X86ISD::PSHUFD:
   case X86ISD::VPERMILPI: {
     SmallVector<int, 8> Mask;
-    DecodePSHUFMask(NumElts, EltsBits, Op.getConstantOperandVal(1), Mask);
-
-    APInt DemandedSrcElts = APInt::getZero(NumElts);
-    for (unsigned I = 0; I != NumElts; ++I)
-      if (DemandedElts[I])
-        DemandedSrcElts.setBit(Mask[I]);
-
-    return DAG.isGuaranteedNotToBeUndefOrPoison(
-        Op.getOperand(0), DemandedSrcElts, PoisonOnly, Depth + 1);
+    SmallVector<SDValue, 2> Ops;
+    if (getTargetShuffleMask(Op.getNode(), Op.getSimpleValueType(), true, Ops,
+                             Mask)) {
+      SmallVector<APInt, 2> DemandedSrcElts(Ops.size(),
+                                            APInt::getZero(NumElts));
+      for (auto M : enumerate(Mask)) {
+        if (!DemandedElts[M.index()] || M.value() == SM_SentinelZero)
+          continue;
+        if (M.value() == SM_SentinelUndef)
+          return false;
+        assert(0 <= M.value() && M.value() < (int)(Ops.size() * NumElts) &&
+               "Shuffle mask index out of range");
+        DemandedSrcElts[M.value() / NumElts].setBit(M.value() % NumElts);
+      }
+      for (auto Op : enumerate(Ops))
+        if (!DemandedSrcElts[Op.index()].isZero() &&
+            !DAG.isGuaranteedNotToBeUndefOrPoison(
+                Op.value(), DemandedSrcElts[Op.index()], PoisonOnly, Depth + 1))
+          return false;
+      return true;
+    }
+    break;
   }
   }
   return TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode(
@@ -47351,6 +47388,16 @@ static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG,
   if (SDValue V = combineShiftToPMULH(N, DAG, Subtarget))
     return V;
 
+  APInt ShiftAmt;
+  if (supportedVectorVarShift(VT, Subtarget, ISD::SRA) &&
+      N1.getOpcode() == ISD::UMIN &&
+      ISD::isConstantSplatVector(N1.getOperand(1).getNode(), ShiftAmt) &&
+      ShiftAmt == VT.getScalarSizeInBits() - 1) {
+    SDValue ShrAmtVal = N1.getOperand(0);
+    SDLoc DL(N);
+    return DAG.getNode(X86ISD::VSRAV, DL, N->getVTList(), N0, ShrAmtVal);
+  }
+
   // fold (ashr (shl, a, [56,48,32,24,16]), SarConst)
   // into (shl, (sext (a), [56,48,32,24,16] - SarConst)) or
   // into (lshr, (sext (a), SarConst - [56,48,32,24,16]))
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index fe1943b5760844..0a1e8ca4427314 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -295,6 +295,10 @@ namespace llvm {
     // thunk at the address from an earlier relocation.
     TLSCALL,
 
+    // Thread Local Storage. A descriptor containing pointer to
+    // code and to argument to get the TLS offset for the symbol.
+    TLSDESC,
+
     // Exception Handling helpers.
     EH_RETURN,
 
diff --git a/llvm/lib/Target/X86/X86InstrAsmAlias.td b/llvm/lib/Target/X86/X86InstrAsmAlias.td
index fab058dc390bb3..6b15213a2e6833 100644
--- a/llvm/lib/Target/X86/X86InstrAsmAlias.td
+++ b/llvm/lib/Target/X86/X86InstrAsmAlias.td
@@ -402,6 +402,12 @@ defm : IntegerCondCodeMnemonicAlias<"cmov", "q", "att">;
 // No size suffix for intel-style asm.
 defm : IntegerCondCodeMnemonicAlias<"cmov", "", "intel">;
 
+// Aliases for cfcmov<CC>{w,l,q}
+defm : IntegerCondCodeMnemonicAlias<"cfcmov", "w", "att">;
+defm : IntegerCondCodeMnemonicAlias<"cfcmov", "l", "att">;
+defm : IntegerCondCodeMnemonicAlias<"cfcmov", "q", "att">;
+// No size suffix for intel-style asm.
+defm : IntegerCondCodeMnemonicAlias<"cfcmov", "", "intel">;
 //===----------------------------------------------------------------------===//
 // Assembler Instruction Aliases
 //===----------------------------------------------------------------------===//
@@ -768,6 +774,20 @@ multiclass CMOV_SETCC_Aliases<string Cond, int CC> {
                   (CMOV64rr GR64:$dst, GR64:$src, CC), 0>;
   def : InstAlias<"cmov"#Cond#"{q}\t{$src, $dst|$dst, $src}",
                   (CMOV64rm GR64:$dst, i64mem:$src, CC), 0>;
+let Predicates = [In64BitMode] in {
+  def : InstAlias<"cmov"#Cond#"{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                  (CMOV16rr_ND GR16:$dst, GR16:$src1, GR16:$src2, CC), 0>;
+  def : InstAlias<"cmov"#Cond#"{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                  (CMOV16rm_ND GR16:$dst, GR16:$src1, i16mem:$src2, CC), 0>;
+  def : InstAlias<"cmov"#Cond#"{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                  (CMOV32rr_ND GR32:$dst, GR32:$src1, GR32:$src2, CC), 0>;
+  def : InstAlias<"cmov"#Cond#"{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                  (CMOV32rm_ND GR32:$dst, GR32:$src1, i32mem:$src2, CC), 0>;
+  def : InstAlias<"cmov"#Cond#"{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                  (CMOV64rr_ND GR64:$dst, GR64:$src1, GR64:$src2, CC), 0>;
+  def : InstAlias<"cmov"#Cond#"{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                  (CMOV64rm_ND GR64:$dst, GR64:$src1, i64mem:$src2, CC), 0>;
+}
 
   def : InstAlias<"set"#Cond#"\t$dst", (SETCCr GR8:$dst, CC), 0>;
   def : InstAlias<"set"#Cond#"\t$dst", (SETCCm i8mem:$dst, CC), 0>;
@@ -790,6 +810,57 @@ defm : CMOV_SETCC_Aliases<"ge", 13>;
 defm : CMOV_SETCC_Aliases<"le", 14>;
 defm : CMOV_SETCC_Aliases<"g" , 15>;
 
+multiclass CFCMOV_Aliases<string Cond, int CC> {
+let Predicates = [In64BitMode] in {
+  def : InstAlias<"cfcmov"#Cond#"{w}\t{$src, $dst|$dst, $src}",
+                  (CFCMOV16rr GR16:$dst, GR16:$src, CC), 0>;
+  def : InstAlias<"cfcmov"#Cond#"{l}\t{$src, $dst|$dst, $src}",
+                  (CFCMOV32rr GR32:$dst, GR32:$src, CC), 0>;
+  def : InstAlias<"cfcmov"#Cond#"{q}\t{$src, $dst|$dst, $src}",
+                  (CFCMOV64rr GR64:$dst, GR64:$src, CC), 0>;
+  def : InstAlias<"cfcmov"#Cond#"{w}\t{$src, $dst|$dst, $src}",
+                  (CFCMOV16rm GR16:$dst, i16mem:$src, CC), 0>;
+  def : InstAlias<"cfcmov"#Cond#"{l}\t{$src, $dst|$dst, $src}",
+                  (CFCMOV32rm GR32:$dst, i32mem:$src, CC), 0>;
+  def : InstAlias<"cfcmov"#Cond#"{q}\t{$src, $dst|$dst, $src}",
+                  (CFCMOV64rm GR64:$dst, i64mem:$src, CC), 0>;
+  def : InstAlias<"cfcmov"#Cond#"{w}\t{$src, $dst|$dst, $src}",
+                  (CFCMOV16mr i16mem:$dst, GR16:$src, CC), 0>;
+  def : InstAlias<"cfcmov"#Cond#"{l}\t{$src, $dst|$dst, $src}",
+                  (CFCMOV32mr i32mem:$dst, GR32:$src, CC), 0>;
+  def : InstAlias<"cfcmov"#Cond#"{q}\t{$src, $dst|$dst, $src}",
+                  (CFCMOV64mr i64mem:$dst, GR64:$src, CC), 0>;
+  def : InstAlias<"cfcmov"#Cond#"{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                  (CFCMOV16rr_ND GR16:$dst, GR16:$src1, GR16:$src2, CC), 0>;
+  def : InstAlias<"cfcmov"#Cond#"{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                  (CFCMOV32rr_ND GR32:$dst, GR32:$src1, GR32:$src2, CC), 0>;
+  def : InstAlias<"cfcmov"#Cond#"{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                  (CFCMOV64rr_ND GR64:$dst, GR64:$src1, GR64:$src2, CC), 0>;
+  def : InstAlias<"cfcmov"#Cond#"{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                  (CFCMOV16rm_ND GR16:$dst, GR16:$src1, i16mem:$src2, CC), 0>;
+  def : InstAlias<"cfcmov"#Cond#"{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                  (CFCMOV32rm_ND GR32:$dst, GR32:$src1, i32mem:$src2, CC), 0>;
+  def : InstAlias<"cfcmov"#Cond#"{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                  (CFCMOV64rm_ND GR64:$dst, GR64:$src1, i64mem:$src2, CC), 0>;
+}
+}
+defm : CFCMOV_Aliases<"o" ,  0>;
+defm : CFCMOV_Aliases<"no",  1>;
+defm : CFCMOV_Aliases<"b" ,  2>;
+defm : CFCMOV_Aliases<"ae",  3>;
+defm : CFCMOV_Aliases<"e" ,  4>;
+defm : CFCMOV_Aliases<"ne",  5>;
+defm : CFCMOV_Aliases<"be",  6>;
+defm : CFCMOV_Aliases<"a" ,  7>;
+defm : CFCMOV_Aliases<"s" ,  8>;
+defm : CFCMOV_Aliases<"ns",  9>;
+defm : CFCMOV_Aliases<"p" , 10>;
+defm : CFCMOV_Aliases<"np", 11>;
+defm : CFCMOV_Aliases<"l" , 12>;
+defm : CFCMOV_Aliases<"ge", 13>;
+defm : CFCMOV_Aliases<"le", 14>;
+defm : CFCMOV_Aliases<"g" , 15>;
+
 // Condition dump instructions Alias
 def : InstAlias<"jo\t$dst",  (JCC_1 brtarget8:$dst,  0), 0>;
 def : InstAlias<"jno\t$dst", (JCC_1 brtarget8:$dst,  1), 0>;
diff --git a/llvm/lib/Target/X86/X86InstrCMovSetCC.td b/llvm/lib/Target/X86/X86InstrCMovSetCC.td
index 2e31c05cd687d3..d41591f68a6050 100644
--- a/llvm/lib/Target/X86/X86InstrCMovSetCC.td
+++ b/llvm/lib/Target/X86/X86InstrCMovSetCC.td
@@ -13,46 +13,72 @@
 
 
 // CMOV instructions.
-let isCodeGenOnly = 1, ForceDisassemble = 1 in {
-let Uses = [EFLAGS], Predicates = [HasCMOV], Constraints = "$src1 = $dst",
-    isCommutable = 1, SchedRW = [WriteCMOV] in {
-  def CMOV16rr
-    : I<0x40, MRMSrcRegCC, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2, ccode:$cond),
-        "cmov${cond}{w}\t{$src2, $dst|$dst, $src2}",
-        [(set GR16:$dst,
-              (X86cmov GR16:$src1, GR16:$src2, timm:$cond, EFLAGS))]>,
-              TB, OpSize16;
-  def CMOV32rr
-    : I<0x40, MRMSrcRegCC, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2, ccode:$cond),
-        "cmov${cond}{l}\t{$src2, $dst|$dst, $src2}",
-        [(set GR32:$dst,
-              (X86cmov GR32:$src1, GR32:$src2, timm:$cond, EFLAGS))]>,
-              TB, OpSize32;
-  def CMOV64rr
-    :RI<0x40, MRMSrcRegCC, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2, ccode:$cond),
-        "cmov${cond}{q}\t{$src2, $dst|$dst, $src2}",
-        [(set GR64:$dst,
-              (X86cmov GR64:$src1, GR64:$src2, timm:$cond, EFLAGS))]>, TB;
+multiclass Cmov<X86TypeInfo t, string args, bit ndd = 0, string suffix = ""> {
+let isCommutable = 1, SchedRW = [WriteCMOV] in
+  def rr#suffix : ITy<0x40, MRMSrcRegCC, t, (outs t.RegClass:$dst),
+                      (ins t.RegClass:$src1, t.RegClass:$src2, ccode:$cond),
+                      "cmov${cond}", args,
+                      [(set t.RegClass:$dst, (X86cmov t.RegClass:$src1,
+                                        t.RegClass:$src2, timm:$cond, EFLAGS))]>, UseEFLAGS, NDD<ndd>;
+let SchedRW = [WriteCMOV.Folded, WriteCMOV.ReadAfterFold] in
+  def rm#suffix : ITy<0x40, MRMSrcMemCC, t, (outs t.RegClass:$dst),
+                      (ins t.RegClass:$src1, t.MemOperand:$src2, ccode:$cond),
+                      "cmov${cond}", args,
+                      [(set t.RegClass:$dst, (X86cmov t.RegClass:$src1,
+                                    (t.LoadNode addr:$src2), timm:$cond, EFLAGS))]>, UseEFLAGS, NDD<ndd>;
+}
+
+multiclass Cfcmov<X86TypeInfo t> {
+let isCommutable = 1, SchedRW = [WriteCMOV] in {
+let Predicates = [HasCMOV, HasCF, In64BitMode] in {
+  def rr : ITy<0x40, MRMDestRegCC, t, (outs t.RegClass:$dst),
+               (ins t.RegClass:$src1, ccode:$cond),
+               "cfcmov${cond}", unaryop_ndd_args,
+               [(set t.RegClass:$dst,
+                 (X86cmov 0, t.RegClass:$src1, timm:$cond, EFLAGS))]>, UseEFLAGS, NF;
+  def rr_REV : ITy<0x40, MRMSrcRegCC, t, (outs t.RegClass:$dst),
+                   (ins t.RegClass:$src1, ccode:$cond),
+                   "cfcmov${cond}", unaryop_ndd_args,
+                   []>, UseEFLAGS, EVEX, T_MAP4;
+}
+let Predicates = [HasCMOV, HasCF, HasNDD, In64BitMode] in
+  def rr_ND : ITy<0x40, MRMSrcRegCC, t, (outs t.RegClass:$dst),
+                  (ins t.RegClass:$src1, t.RegClass:$src2, ccode:$cond),
+                  "cfcmov${cond}", binop_ndd_args, []>, UseEFLAGS, NDD<1>, NF;
 }
+let SchedRW = [WriteCMOV.Folded, WriteCMOV.ReadAfterFold] in {
+  let Predicates = [HasCMOV, HasCF, In64BitMode], mayLoad = 1 in
+    def rm : ITy<0x40, MRMSrcMemCC, t, (outs t.RegClass:$dst),
+                 (ins t.MemOperand:$src1, ccode:$cond),
+                 "cfcmov${cond}", unaryop_ndd_args, []>, UseEFLAGS, EVEX, T_MAP4;
+  let Predicates = [HasCMOV, HasCF, HasNDD, In64BitMode], mayLoad = 1 in
+    def rm_ND : ITy<0x40, MRMSrcMemCC, t, (outs t.RegClass:$dst),
+                    (ins t.RegClass:$src1, t.MemOperand:$src2, ccode:$cond),
+                    "cfcmov${cond}", binop_ndd_args, []>, UseEFLAGS, NDD<1>, NF;
+}
+let SchedRW = [WriteCMOV, ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault],
+    Predicates = [HasCMOV, HasCF, In64BitMode], mayStore = 1 in
+  def mr : ITy<0x40, MRMDestMemCC, t, (outs t.MemOperand:$dst),
+                (ins t.RegClass:$src1, ccode:$cond),
+                "cfcmov${cond}", unaryop_ndd_args, []>, UseEFLAGS, NF;
+}
+
+let isCodeGenOnly = 1, ForceDisassemble = 1 in {
+  let Predicates = [HasCMOV, NoNDD], Constraints = "$dst = $src1" in {
+    defm CMOV16 : Cmov<Xi16, binop_args>, OpSize16, TB;
+    defm CMOV32 : Cmov<Xi32, binop_args>, OpSize32, TB;
+    defm CMOV64 : Cmov<Xi64, binop_args>, TB;
+  }
 
-let Uses = [EFLAGS], Predicates = [HasCMOV], Constraints = "$src1 = $dst",
-    SchedRW = [WriteCMOV.Folded, WriteCMOV.ReadAfterFold] in {
-  def CMOV16rm
-    : I<0x40, MRMSrcMemCC, (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2, ccode:$cond),
-        "cmov${cond}{w}\t{$src2, $dst|$dst, $src2}",
-        [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
-                                  timm:$cond, EFLAGS))]>, TB, OpSize16;
-  def CMOV32rm
-    : I<0x40, MRMSrcMemCC, (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2, ccode:$cond),
-        "cmov${cond}{l}\t{$src2, $dst|$dst, $src2}",
-        [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
-                                  timm:$cond, EFLAGS))]>, TB, OpSize32;
-  def CMOV64rm
-    :RI<0x40, MRMSrcMemCC, (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2, ccode:$cond),
-        "cmov${cond}{q}\t{$src2, $dst|$dst, $src2}",
-        [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
-                                  timm:$cond, EFLAGS))]>, TB;
-} // Uses = [EFLAGS], Predicates = [HasCMOV], Constraints = "$src1 = $dst"
+  let Predicates = [HasCMOV, HasNDD, In64BitMode] in {
+    defm CMOV16 : Cmov<Xi16, binop_ndd_args, 1, "_ND">, PD;
+    defm CMOV32 : Cmov<Xi32, binop_ndd_args, 1, "_ND">;
+    defm CMOV64 : Cmov<Xi64, binop_ndd_args, 1, "_ND">;
+  }
+
+  defm CFCMOV16 : Cfcmov<Xi16>, PD;
+  defm CFCMOV32 : Cfcmov<Xi32>;
+  defm CFCMOV64 : Cfcmov<Xi64>;
 } // isCodeGenOnly = 1, ForceDisassemble = 1
 
 def inv_cond_XFORM : SDNodeXForm<imm, [{
@@ -63,7 +89,7 @@ def inv_cond_XFORM : SDNodeXForm<imm, [{
 
 // Conditional moves with folded loads with operands swapped and conditions
 // inverted.
-let Predicates = [HasCMOV] in {
+let Predicates = [HasCMOV, NoNDD] in {
   def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, timm:$cond, EFLAGS),
             (CMOV16rm GR16:$src2, addr:$src1, (inv_cond_XFORM timm:$cond))>;
   def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, timm:$cond, EFLAGS),
@@ -72,6 +98,23 @@ let Predicates = [HasCMOV] in {
             (CMOV64rm GR64:$src2, addr:$src1, (inv_cond_XFORM timm:$cond))>;
 }
 
+let Predicates = [HasCMOV, HasNDD] in {
+  def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, timm:$cond, EFLAGS),
+            (CMOV16rm_ND GR16:$src2, addr:$src1, (inv_cond_XFORM timm:$cond))>;
+  def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, timm:$cond, EFLAGS),
+            (CMOV32rm_ND GR32:$src2, addr:$src1, (inv_cond_XFORM timm:$cond))>;
+  def : Pat<(X86cmov (loadi64 addr:$src1), GR64:$src2, timm:$cond, EFLAGS),
+            (CMOV64rm_ND GR64:$src2, addr:$src1, (inv_cond_XFORM timm:$cond))>;
+}
+let Predicates = [HasCMOV, HasCF] in {
+  def : Pat<(X86cmov GR16:$src1, 0, timm:$cond, EFLAGS),
+            (CFCMOV16rr GR16:$src1, (inv_cond_XFORM timm:$cond))>;
+  def : Pat<(X86cmov GR32:$src1, 0, timm:$cond, EFLAGS),
+            (CFCMOV32rr GR32:$src1, (inv_cond_XFORM timm:$cond))>;
+  def : Pat<(X86cmov GR64:$src1, 0, timm:$cond, EFLAGS),
+            (CFCMOV64rr GR64:$src1, (inv_cond_XFORM timm:$cond))>;
+}
+
 // SetCC instructions.
 let Uses = [EFLAGS], isCodeGenOnly = 1, ForceDisassemble = 1 in {
   def SETCCr : I<0x90, MRMXrCC, (outs GR8:$dst), (ins ccode:$cond),
diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td b/llvm/lib/Target/X86/X86InstrCompiler.td
index f393f86e64aadd..ce3b6af4cab47b 100644
--- a/llvm/lib/Target/X86/X86InstrCompiler.td
+++ b/llvm/lib/Target/X86/X86InstrCompiler.td
@@ -507,6 +507,16 @@ def TLS_base_addrX32 : I<0, Pseudo, (outs), (ins i32mem:$sym),
                   Requires<[In64BitMode, NotLP64]>;
 }
 
+// TLSDESC only clobbers EAX and EFLAGS. ESP is marked as a use to prevent
+// stack-pointer assignments that appear immediately before calls from
+// potentially appearing dead.
+let Defs = [EAX, EFLAGS], usesCustomInserter = 1, Uses = [RSP, SSP] in {
+  def TLS_desc32 : I<0, Pseudo, (outs), (ins i32mem:$sym),
+                     "# TLS_desc32", [(X86tlsdesc tls32addr:$sym)]>;
+  def TLS_desc64 : I<0, Pseudo, (outs), (ins i64mem:$sym),
+                     "# TLS_desc64", [(X86tlsdesc tls64addr:$sym)]>;
+}
+
 // Darwin TLS Support
 // For i386, the address of the thunk is passed on the stack, on return the
 // address of the variable is in %eax.  %ecx is trashed during the function
diff --git a/llvm/lib/Target/X86/X86InstrFormats.td b/llvm/lib/Target/X86/X86InstrFormats.td
index 13014293267b32..31ee288c6f8bb2 100644
--- a/llvm/lib/Target/X86/X86InstrFormats.td
+++ b/llvm/lib/Target/X86/X86InstrFormats.td
@@ -28,6 +28,8 @@ def RawFrmImm8    : Format<7>;
 def RawFrmImm16   : Format<8>;
 def AddCCFrm      : Format<9>;
 def PrefixByte    : Format<10>;
+def MRMDestRegCC  : Format<18>;
+def MRMDestMemCC  : Format<19>;
 def MRMDestMem4VOp3CC : Format<20>;
 def MRMr0          : Format<21>;
 def MRMSrcMemFSIB  : Format<22>;
diff --git a/llvm/lib/Target/X86/X86InstrFragments.td b/llvm/lib/Target/X86/X86InstrFragments.td
index adf527d72f5b43..f14c7200af968a 100644
--- a/llvm/lib/Target/X86/X86InstrFragments.td
+++ b/llvm/lib/Target/X86/X86InstrFragments.td
@@ -223,6 +223,9 @@ def X86tlsaddr : SDNode<"X86ISD::TLSADDR", SDT_X86TLSADDR,
 def X86tlsbaseaddr : SDNode<"X86ISD::TLSBASEADDR", SDT_X86TLSBASEADDR,
                         [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
 
+def X86tlsdesc : SDNode<"X86ISD::TLSDESC", SDT_X86TLSADDR,
+                        [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+
 def X86ehret : SDNode<"X86ISD::EH_RETURN", SDT_X86EHRET,
                         [SDNPHasChain]>;
 
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index b65f49527ae5dd..eb42a4b2119d5e 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -2637,9 +2637,9 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
     WorkingMI = CloneIfNew(MI);
     WorkingMI->setDesc(get(Opc));
     break;
-  case X86::CMOV16rr:
-  case X86::CMOV32rr:
-  case X86::CMOV64rr: {
+  CASE_ND(CMOV16rr)
+  CASE_ND(CMOV32rr)
+  CASE_ND(CMOV64rr) {
     WorkingMI = CloneIfNew(MI);
     unsigned OpNo = MI.getDesc().getNumOperands() - 1;
     X86::CondCode CC = static_cast<X86::CondCode>(MI.getOperand(OpNo).getImm());
@@ -3120,7 +3120,8 @@ bool X86InstrInfo::hasCommutePreference(MachineInstr &MI, bool &Commute) const {
 
 int X86::getCondSrcNoFromDesc(const MCInstrDesc &MCID) {
   unsigned Opcode = MCID.getOpcode();
-  if (!(X86::isJCC(Opcode) || X86::isSETCC(Opcode) || X86::isCMOVCC(Opcode)))
+  if (!(X86::isJCC(Opcode) || X86::isSETCC(Opcode) || X86::isCMOVCC(Opcode) ||
+        X86::isCFCMOVCC(Opcode)))
     return -1;
   // Assume that condition code is always the last use operand.
   unsigned NumUses = MCID.getNumOperands() - MCID.getNumDefs();
@@ -3151,6 +3152,11 @@ X86::CondCode X86::getCondFromCMov(const MachineInstr &MI) {
                                        : X86::COND_INVALID;
 }
 
+X86::CondCode X86::getCondFromCFCMov(const MachineInstr &MI) {
+  return X86::isCFCMOVCC(MI.getOpcode()) ? X86::getCondFromMI(MI)
+                                         : X86::COND_INVALID;
+}
+
 /// Return the inverse of the specified condition,
 /// e.g. turning COND_E to COND_NE.
 X86::CondCode X86::GetOppositeBranchCondition(X86::CondCode CC) {
@@ -3312,16 +3318,21 @@ X86::getX86ConditionCode(CmpInst::Predicate Predicate) {
 }
 
 /// Return a cmov opcode for the given register size in bytes, and operand type.
-unsigned X86::getCMovOpcode(unsigned RegBytes, bool HasMemoryOperand) {
+unsigned X86::getCMovOpcode(unsigned RegBytes, bool HasMemoryOperand,
+                            bool HasNDD) {
   switch (RegBytes) {
   default:
     llvm_unreachable("Illegal register size!");
+#define GET_ND_IF_ENABLED(OPC) (HasNDD ? OPC##_ND : OPC)
   case 2:
-    return HasMemoryOperand ? X86::CMOV16rm : X86::CMOV16rr;
+    return HasMemoryOperand ? GET_ND_IF_ENABLED(X86::CMOV16rm)
+                            : GET_ND_IF_ENABLED(X86::CMOV16rr);
   case 4:
-    return HasMemoryOperand ? X86::CMOV32rm : X86::CMOV32rr;
+    return HasMemoryOperand ? GET_ND_IF_ENABLED(X86::CMOV32rm)
+                            : GET_ND_IF_ENABLED(X86::CMOV32rr);
   case 8:
-    return HasMemoryOperand ? X86::CMOV64rm : X86::CMOV64rr;
+    return HasMemoryOperand ? GET_ND_IF_ENABLED(X86::CMOV64rm)
+                            : GET_ND_IF_ENABLED(X86::CMOV64rr);
   }
 }
 
@@ -4047,8 +4058,9 @@ void X86InstrInfo::insertSelect(MachineBasicBlock &MBB,
   const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo();
   const TargetRegisterClass &RC = *MRI.getRegClass(DstReg);
   assert(Cond.size() == 1 && "Invalid Cond array");
-  unsigned Opc = X86::getCMovOpcode(TRI.getRegSizeInBits(RC) / 8,
-                                    false /*HasMemoryOperand*/);
+  unsigned Opc =
+      X86::getCMovOpcode(TRI.getRegSizeInBits(RC) / 8,
+                         false /*HasMemoryOperand*/, Subtarget.hasNDD());
   BuildMI(MBB, I, DL, get(Opc), DstReg)
       .addReg(FalseReg)
       .addReg(TrueReg)
@@ -10565,15 +10577,15 @@ bool X86InstrInfo::getMachineCombinerPatterns(
     bool DoRegPressureReduce) const {
   unsigned Opc = Root.getOpcode();
   switch (Opc) {
-  default:
-    return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns,
-                                                       DoRegPressureReduce);
   case X86::VPDPWSSDrr:
   case X86::VPDPWSSDrm:
   case X86::VPDPWSSDYrr:
   case X86::VPDPWSSDYrm: {
-    Patterns.push_back(MachineCombinerPattern::DPWSSD);
-    return true;
+    if (!Subtarget.hasFastDPWSSD()) {
+      Patterns.push_back(MachineCombinerPattern::DPWSSD);
+      return true;
+    }
+    break;
   }
   case X86::VPDPWSSDZ128r:
   case X86::VPDPWSSDZ128m:
@@ -10581,11 +10593,15 @@ bool X86InstrInfo::getMachineCombinerPatterns(
   case X86::VPDPWSSDZ256m:
   case X86::VPDPWSSDZr:
   case X86::VPDPWSSDZm: {
-    if (Subtarget.hasBWI())
+   if (Subtarget.hasBWI() && !Subtarget.hasFastDPWSSD()) {
       Patterns.push_back(MachineCombinerPattern::DPWSSD);
-    return true;
+      return true;
+    }
+    break;
   }
   }
+  return TargetInstrInfo::getMachineCombinerPatterns(Root,
+                                                     Patterns, DoRegPressureReduce);
 }
 
 static void
diff --git a/llvm/lib/Target/X86/X86InstrInfo.h b/llvm/lib/Target/X86/X86InstrInfo.h
index 0e5fcbeda08f79..e719be0caf3ee1 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.h
+++ b/llvm/lib/Target/X86/X86InstrInfo.h
@@ -42,7 +42,8 @@ enum AsmComments {
 std::pair<CondCode, bool> getX86ConditionCode(CmpInst::Predicate Predicate);
 
 /// Return a cmov opcode for the given register size in bytes, and operand type.
-unsigned getCMovOpcode(unsigned RegBytes, bool HasMemoryOperand = false);
+unsigned getCMovOpcode(unsigned RegBytes, bool HasMemoryOperand = false,
+                       bool HasNDD = false);
 
 /// Return the source operand # for condition code by \p MCID. If the
 /// instruction doesn't have a condition code, return -1.
@@ -61,6 +62,9 @@ CondCode getCondFromSETCC(const MachineInstr &MI);
 // Turn CMOV instruction into condition code.
 CondCode getCondFromCMov(const MachineInstr &MI);
 
+// Turn CFCMOV instruction into condition code.
+CondCode getCondFromCFCMov(const MachineInstr &MI);
+
 /// GetOppositeBranchCondition - Return the inverse of the specified cond,
 /// e.g. turning COND_E to COND_NE.
 CondCode GetOppositeBranchCondition(CondCode CC);
diff --git a/llvm/lib/Target/X86/X86InstrPredicates.td b/llvm/lib/Target/X86/X86InstrPredicates.td
index 7dd51ba6c027ae..9f2709d6b1a206 100644
--- a/llvm/lib/Target/X86/X86InstrPredicates.td
+++ b/llvm/lib/Target/X86/X86InstrPredicates.td
@@ -45,6 +45,7 @@ def NoEGPR       : Predicate<"!Subtarget->hasEGPR()">;
 // entries, so that the NDD variant can be selected first to benefit RA.
 def HasNDD       : Predicate<"Subtarget->hasNDD()">;
 def NoNDD        : Predicate<"!Subtarget->hasNDD()">;
+def HasCF        : Predicate<"Subtarget->hasCF()">;
 def HasCMOV      : Predicate<"Subtarget->canUseCMOV()">;
 def NoCMOV       : Predicate<"!Subtarget->canUseCMOV()">;
 def HasNOPL      : Predicate<"Subtarget->hasNOPL()">;
@@ -238,5 +239,6 @@ def HasFastSHLDRotate : Predicate<"Subtarget->hasFastSHLDRotate()">;
 def HasERMSB : Predicate<"Subtarget->hasERMSB()">;
 def HasFSRM : Predicate<"Subtarget->hasFSRM()">;
 def HasMFence    : Predicate<"Subtarget->hasMFence()">;
+def HasFastDPWSSD: Predicate<"Subtarget->hasFastDPWSSD()">;
 def UseIndirectThunkCalls : Predicate<"Subtarget->useIndirectThunkCalls()">;
 def NotUseIndirectThunkCalls : Predicate<"!Subtarget->useIndirectThunkCalls()">;
diff --git a/llvm/lib/Target/X86/X86LowerAMXType.cpp b/llvm/lib/Target/X86/X86LowerAMXType.cpp
index a57c0fe1578840..b69058787a4e25 100644
--- a/llvm/lib/Target/X86/X86LowerAMXType.cpp
+++ b/llvm/lib/Target/X86/X86LowerAMXType.cpp
@@ -102,7 +102,7 @@ static AllocaInst *createAllocaInstAtEntry(IRBuilder<> &Builder, BasicBlock *BB,
   auto AllocaAlignment = DL.getPrefTypeAlign(Type::getX86_AMXTy(Ctx));
   unsigned AllocaAS = DL.getAllocaAddrSpace();
   AllocaInst *AllocaRes =
-      new AllocaInst(Ty, AllocaAS, "", &F.getEntryBlock().front());
+      new AllocaInst(Ty, AllocaAS, "", F.getEntryBlock().begin());
   AllocaRes->setAlignment(AllocaAlignment);
   return AllocaRes;
 }
@@ -453,7 +453,7 @@ static Value *getAllocaPos(BasicBlock *BB) {
   unsigned AllocaAS = DL.getAllocaAddrSpace();
   Type *V256I32Ty = VectorType::get(Builder.getInt32Ty(), 256, false);
   AllocaInst *AllocaRes =
-      new AllocaInst(V256I32Ty, AllocaAS, "", &F->getEntryBlock().front());
+      new AllocaInst(V256I32Ty, AllocaAS, "", F->getEntryBlock().begin());
   BasicBlock::iterator Iter = AllocaRes->getIterator();
   ++Iter;
   Builder.SetInsertPoint(&*Iter);
diff --git a/llvm/lib/Target/X86/X86MCInstLower.cpp b/llvm/lib/Target/X86/X86MCInstLower.cpp
index 64d4d411e7b43b..e2330ff34c1753 100644
--- a/llvm/lib/Target/X86/X86MCInstLower.cpp
+++ b/llvm/lib/Target/X86/X86MCInstLower.cpp
@@ -519,10 +519,8 @@ void X86MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
 void X86AsmPrinter::LowerTlsAddr(X86MCInstLower &MCInstLowering,
                                  const MachineInstr &MI) {
   NoAutoPaddingScope NoPadScope(*OutStreamer);
-  bool Is64Bits = MI.getOpcode() != X86::TLS_addr32 &&
-                  MI.getOpcode() != X86::TLS_base_addr32;
-  bool Is64BitsLP64 = MI.getOpcode() == X86::TLS_addr64 ||
-                      MI.getOpcode() == X86::TLS_base_addr64;
+  bool Is64Bits = getSubtarget().is64Bit();
+  bool Is64BitsLP64 = getSubtarget().isTarget64BitLP64();
   MCContext &Ctx = OutStreamer->getContext();
 
   MCSymbolRefExpr::VariantKind SRVK;
@@ -539,6 +537,10 @@ void X86AsmPrinter::LowerTlsAddr(X86MCInstLower &MCInstLowering,
   case X86::TLS_base_addrX32:
     SRVK = MCSymbolRefExpr::VK_TLSLD;
     break;
+  case X86::TLS_desc32:
+  case X86::TLS_desc64:
+    SRVK = MCSymbolRefExpr::VK_TLSDESC;
+    break;
   default:
     llvm_unreachable("unexpected opcode");
   }
@@ -554,7 +556,26 @@ void X86AsmPrinter::LowerTlsAddr(X86MCInstLower &MCInstLowering,
   bool UseGot = MMI->getModule()->getRtLibUseGOT() &&
                 Ctx.getTargetOptions()->X86RelaxRelocations;
 
-  if (Is64Bits) {
+  if (SRVK == MCSymbolRefExpr::VK_TLSDESC) {
+    const MCSymbolRefExpr *Expr = MCSymbolRefExpr::create(
+        MCInstLowering.GetSymbolFromOperand(MI.getOperand(3)),
+        MCSymbolRefExpr::VK_TLSCALL, Ctx);
+    EmitAndCountInstruction(
+        MCInstBuilder(Is64BitsLP64 ? X86::LEA64r : X86::LEA32r)
+            .addReg(Is64BitsLP64 ? X86::RAX : X86::EAX)
+            .addReg(Is64Bits ? X86::RIP : X86::EBX)
+            .addImm(1)
+            .addReg(0)
+            .addExpr(Sym)
+            .addReg(0));
+    EmitAndCountInstruction(
+        MCInstBuilder(Is64Bits ? X86::CALL64m : X86::CALL32m)
+            .addReg(Is64BitsLP64 ? X86::RAX : X86::EAX)
+            .addImm(1)
+            .addReg(0)
+            .addExpr(Expr)
+            .addReg(0));
+  } else if (Is64Bits) {
     bool NeedsPadding = SRVK == MCSymbolRefExpr::VK_TLSGD;
     if (NeedsPadding && Is64BitsLP64)
       EmitAndCountInstruction(MCInstBuilder(X86::DATA16_PREFIX));
@@ -2164,6 +2185,8 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) {
   case X86::TLS_base_addr32:
   case X86::TLS_base_addr64:
   case X86::TLS_base_addrX32:
+  case X86::TLS_desc32:
+  case X86::TLS_desc64:
     return LowerTlsAddr(MCInstLowering, *MI);
 
   case X86::MOVPC32r: {
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h
index 1a5e6bc886aa67..23035f655098a7 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -94,6 +94,7 @@ class X86TTIImpl : public BasicTTIImplBase<X86TTIImpl> {
       X86::TuningNoDomainDelayBlend,
       X86::TuningPreferShiftShuffle,
       X86::TuningFastImmVectorShift,
+      X86::TuningFastDPWSSD,
 
       // Perf-tuning flags.
       X86::TuningFastGather,
diff --git a/llvm/lib/Target/XCore/XCoreLowerThreadLocal.cpp b/llvm/lib/Target/XCore/XCoreLowerThreadLocal.cpp
index b5a683de33ab13..5e91cce1068b46 100644
--- a/llvm/lib/Target/XCore/XCoreLowerThreadLocal.cpp
+++ b/llvm/lib/Target/XCore/XCoreLowerThreadLocal.cpp
@@ -88,12 +88,15 @@ static bool replaceConstantExprOp(ConstantExpr *CE, Pass *P) {
               BasicBlock *PredBB = PN->getIncomingBlock(I);
               if (PredBB->getTerminator()->getNumSuccessors() > 1)
                 PredBB = SplitEdge(PredBB, PN->getParent());
-              Instruction *InsertPos = PredBB->getTerminator();
-              Instruction *NewInst = CE->getAsInstruction(InsertPos);
+              BasicBlock::iterator InsertPos =
+                  PredBB->getTerminator()->getIterator();
+              Instruction *NewInst = CE->getAsInstruction();
+              NewInst->insertBefore(*PredBB, InsertPos);
               PN->setOperand(I, NewInst);
             }
         } else if (Instruction *Instr = dyn_cast<Instruction>(WU)) {
-          Instruction *NewInst = CE->getAsInstruction(Instr);
+          Instruction *NewInst = CE->getAsInstruction();
+          NewInst->insertBefore(*Instr->getParent(), Instr->getIterator());
           Instr->replaceUsesOfWith(CE, NewInst);
         } else {
           ConstantExpr *CExpr = dyn_cast<ConstantExpr>(WU);
diff --git a/llvm/lib/TargetParser/Host.cpp b/llvm/lib/TargetParser/Host.cpp
index f65ed259eade4c..d2c9bae97364e9 100644
--- a/llvm/lib/TargetParser/Host.cpp
+++ b/llvm/lib/TargetParser/Host.cpp
@@ -220,6 +220,7 @@ StringRef sys::detail::getHostCPUNameForARM(StringRef ProcCpuinfoContent) {
         .Case("0xd05", "cortex-a55")
         .Case("0xd46", "cortex-a510")
         .Case("0xd80", "cortex-a520")
+        .Case("0xd88", "cortex-a520ae")
         .Case("0xd07", "cortex-a57")
         .Case("0xd06", "cortex-a65")
         .Case("0xd43", "cortex-a65ae")
@@ -235,6 +236,7 @@ StringRef sys::detail::getHostCPUNameForARM(StringRef ProcCpuinfoContent) {
         .Case("0xd47", "cortex-a710")
         .Case("0xd4d", "cortex-a715")
         .Case("0xd81", "cortex-a720")
+        .Case("0xd89", "cortex-a720ae")
         .Case("0xd44", "cortex-x1")
         .Case("0xd4c", "cortex-x1c")
         .Case("0xd48", "cortex-x2")
diff --git a/llvm/lib/TextAPI/TextStub.cpp b/llvm/lib/TextAPI/TextStub.cpp
index 24a52607a981ca..0f742523f8207c 100644
--- a/llvm/lib/TextAPI/TextStub.cpp
+++ b/llvm/lib/TextAPI/TextStub.cpp
@@ -1079,16 +1079,16 @@ Expected<FileType> TextAPIReader::canRead(MemoryBufferRef InputBuffer) {
   if (!TAPIFile.ends_with("..."))
     return createStringError(std::errc::not_supported, "unsupported file type");
 
-  if (TAPIFile.starts_with("--- !tapi-tbd\n"))
+  if (TAPIFile.starts_with("--- !tapi-tbd"))
     return FileType::TBD_V4;
 
-  if (TAPIFile.starts_with("--- !tapi-tbd-v3\n"))
+  if (TAPIFile.starts_with("--- !tapi-tbd-v3"))
     return FileType::TBD_V3;
 
-  if (TAPIFile.starts_with("--- !tapi-tbd-v2\n"))
+  if (TAPIFile.starts_with("--- !tapi-tbd-v2"))
     return FileType::TBD_V2;
 
-  if (TAPIFile.starts_with("--- !tapi-tbd-v1\n") ||
+  if (TAPIFile.starts_with("--- !tapi-tbd-v1") ||
       TAPIFile.starts_with("---\narchs:"))
     return FileType::TBD_V1;
 
diff --git a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp
index ae2a06a1d4748a..08a4522e3fac6d 100644
--- a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp
+++ b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp
@@ -971,7 +971,7 @@ static void cacheDIVar(FrameDataInfo &FrameData,
         DIVarCache.insert({V, (*I)->getVariable()});
     };
     CacheIt(findDbgDeclares(V));
-    CacheIt(findDPVDeclares(V));
+    CacheIt(findDVRDeclares(V));
   }
 }
 
@@ -1123,7 +1123,7 @@ static void buildFrameDebugInfo(Function &F, coro::Shape &Shape,
          "Coroutine with switch ABI should own Promise alloca");
 
   TinyPtrVector<DbgDeclareInst *> DIs = findDbgDeclares(PromiseAlloca);
-  TinyPtrVector<DPValue *> DPVs = findDPVDeclares(PromiseAlloca);
+  TinyPtrVector<DbgVariableRecord *> DVRs = findDVRDeclares(PromiseAlloca);
 
   DILocalVariable *PromiseDIVariable = nullptr;
   DILocation *DILoc = nullptr;
@@ -1131,10 +1131,10 @@ static void buildFrameDebugInfo(Function &F, coro::Shape &Shape,
     DbgDeclareInst *PromiseDDI = DIs.front();
     PromiseDIVariable = PromiseDDI->getVariable();
     DILoc = PromiseDDI->getDebugLoc().get();
-  } else if (!DPVs.empty()) {
-    DPValue *PromiseDPV = DPVs.front();
-    PromiseDIVariable = PromiseDPV->getVariable();
-    DILoc = PromiseDPV->getDebugLoc().get();
+  } else if (!DVRs.empty()) {
+    DbgVariableRecord *PromiseDVR = DVRs.front();
+    PromiseDIVariable = PromiseDVR->getVariable();
+    DILoc = PromiseDVR->getDebugLoc().get();
   } else {
     return;
   }
@@ -1273,11 +1273,12 @@ static void buildFrameDebugInfo(Function &F, coro::Shape &Shape,
   }
 
   if (UseNewDbgInfoFormat) {
-    DPValue *NewDPV = new DPValue(ValueAsMetadata::get(Shape.FramePtr),
-                                  FrameDIVar, DBuilder.createExpression(),
-                                  DILoc, DPValue::LocationType::Declare);
+    DbgVariableRecord *NewDVR =
+        new DbgVariableRecord(ValueAsMetadata::get(Shape.FramePtr), FrameDIVar,
+                              DBuilder.createExpression(), DILoc,
+                              DbgVariableRecord::LocationType::Declare);
     BasicBlock::iterator It = Shape.getInsertPtAfterFramePtr();
-    It->getParent()->insertDbgRecordBefore(NewDPV, It);
+    It->getParent()->insertDbgRecordBefore(NewDVR, It);
   } else {
     DBuilder.insertDeclare(Shape.FramePtr, FrameDIVar,
                            DBuilder.createExpression(), DILoc,
@@ -1862,13 +1863,13 @@ static void insertSpills(const FrameDataInfo &FrameData, coro::Shape &Shape) {
               SpillAlignment, E.first->getName() + Twine(".reload"));
 
         TinyPtrVector<DbgDeclareInst *> DIs = findDbgDeclares(Def);
-        TinyPtrVector<DPValue *> DPVs = findDPVDeclares(Def);
+        TinyPtrVector<DbgVariableRecord *> DVRs = findDVRDeclares(Def);
         // Try best to find dbg.declare. If the spill is a temp, there may not
         // be a direct dbg.declare. Walk up the load chain to find one from an
         // alias.
         if (F->getSubprogram()) {
           auto *CurDef = Def;
-          while (DIs.empty() && DPVs.empty() && isa<LoadInst>(CurDef)) {
+          while (DIs.empty() && DVRs.empty() && isa<LoadInst>(CurDef)) {
             auto *LdInst = cast<LoadInst>(CurDef);
             // Only consider ptr to ptr same type load.
             if (LdInst->getPointerOperandType() != LdInst->getType())
@@ -1877,7 +1878,7 @@ static void insertSpills(const FrameDataInfo &FrameData, coro::Shape &Shape) {
             if (!isa<AllocaInst, LoadInst>(CurDef))
               break;
             DIs = findDbgDeclares(CurDef);
-            DPVs = findDPVDeclares(CurDef);
+            DVRs = findDVRDeclares(CurDef);
           }
         }
 
@@ -1887,12 +1888,12 @@ static void insertSpills(const FrameDataInfo &FrameData, coro::Shape &Shape) {
           // fragments. It will be unreachable in the main function, and
           // processed by coro::salvageDebugInfo() by CoroCloner.
           if (UseNewDbgInfoFormat) {
-            DPValue *NewDPV =
-                new DPValue(ValueAsMetadata::get(CurrentReload),
-                            DDI->getVariable(), DDI->getExpression(),
-                            DDI->getDebugLoc(), DPValue::LocationType::Declare);
+            DbgVariableRecord *NewDVR = new DbgVariableRecord(
+                ValueAsMetadata::get(CurrentReload), DDI->getVariable(),
+                DDI->getExpression(), DDI->getDebugLoc(),
+                DbgVariableRecord::LocationType::Declare);
             Builder.GetInsertPoint()->getParent()->insertDbgRecordBefore(
-                NewDPV, Builder.GetInsertPoint());
+                NewDVR, Builder.GetInsertPoint());
           } else {
             DIBuilder(*CurrentBlock->getParent()->getParent(), AllowUnresolved)
                 .insertDeclare(CurrentReload, DDI->getVariable(),
@@ -1905,7 +1906,7 @@ static void insertSpills(const FrameDataInfo &FrameData, coro::Shape &Shape) {
                                  false /*UseEntryValue*/);
         };
         for_each(DIs, SalvageOne);
-        for_each(DPVs, SalvageOne);
+        for_each(DVRs, SalvageOne);
       }
 
       // If we have a single edge PHINode, remove it and replace it with a
@@ -1925,8 +1926,8 @@ static void insertSpills(const FrameDataInfo &FrameData, coro::Shape &Shape) {
       U->replaceUsesOfWith(Def, CurrentReload);
       // Instructions are added to Def's user list if the attached
       // debug records use Def. Update those now.
-      for (DPValue &DPV : filterDbgVars(U->getDbgRecordRange()))
-        DPV.replaceVariableLocationOp(Def, CurrentReload, true);
+      for (DbgVariableRecord &DVR : filterDbgVars(U->getDbgRecordRange()))
+        DVR.replaceVariableLocationOp(Def, CurrentReload, true);
     }
   }
 
@@ -1977,12 +1978,12 @@ static void insertSpills(const FrameDataInfo &FrameData, coro::Shape &Shape) {
     G->setName(Alloca->getName() + Twine(".reload.addr"));
 
     SmallVector<DbgVariableIntrinsic *, 4> DIs;
-    SmallVector<DPValue *> DPValues;
-    findDbgUsers(DIs, Alloca, &DPValues);
+    SmallVector<DbgVariableRecord *> DbgVariableRecords;
+    findDbgUsers(DIs, Alloca, &DbgVariableRecords);
     for (auto *DVI : DIs)
       DVI->replaceUsesOfWith(Alloca, G);
-    for (auto *DPV : DPValues)
-      DPV->replaceVariableLocationOp(Alloca, G);
+    for (auto *DVR : DbgVariableRecords)
+      DVR->replaceVariableLocationOp(Alloca, G);
 
     for (Instruction *I : UsersToUpdate) {
       // It is meaningless to retain the lifetime intrinsics refer for the
@@ -2960,30 +2961,30 @@ void coro::salvageDebugInfo(
 }
 
 void coro::salvageDebugInfo(
-    SmallDenseMap<Argument *, AllocaInst *, 4> &ArgToAllocaMap, DPValue &DPV,
-    bool OptimizeFrame, bool UseEntryValue) {
+    SmallDenseMap<Argument *, AllocaInst *, 4> &ArgToAllocaMap,
+    DbgVariableRecord &DVR, bool OptimizeFrame, bool UseEntryValue) {
 
-  Function *F = DPV.getFunction();
+  Function *F = DVR.getFunction();
   // Follow the pointer arithmetic all the way to the incoming
   // function argument and convert into a DIExpression.
-  bool SkipOutermostLoad = DPV.isDbgDeclare();
-  Value *OriginalStorage = DPV.getVariableLocationOp(0);
+  bool SkipOutermostLoad = DVR.isDbgDeclare();
+  Value *OriginalStorage = DVR.getVariableLocationOp(0);
 
   auto SalvagedInfo = ::salvageDebugInfoImpl(
       ArgToAllocaMap, OptimizeFrame, UseEntryValue, F, OriginalStorage,
-      DPV.getExpression(), SkipOutermostLoad);
+      DVR.getExpression(), SkipOutermostLoad);
   if (!SalvagedInfo)
     return;
 
   Value *Storage = &SalvagedInfo->first;
   DIExpression *Expr = &SalvagedInfo->second;
 
-  DPV.replaceVariableLocationOp(OriginalStorage, Storage);
-  DPV.setExpression(Expr);
+  DVR.replaceVariableLocationOp(OriginalStorage, Storage);
+  DVR.setExpression(Expr);
   // We only hoist dbg.declare today since it doesn't make sense to hoist
   // dbg.value since it does not have the same function wide guarantees that
   // dbg.declare does.
-  if (DPV.getType() == DPValue::LocationType::Declare) {
+  if (DVR.getType() == DbgVariableRecord::LocationType::Declare) {
     std::optional<BasicBlock::iterator> InsertPt;
     if (auto *I = dyn_cast<Instruction>(Storage)) {
       InsertPt = I->getInsertionPointAfterDef();
@@ -2991,12 +2992,12 @@ void coro::salvageDebugInfo(
       // optimizations. See https://github.com/llvm/llvm-project/pull/75104 for
       // an example.
       if (!OptimizeFrame && I->getDebugLoc())
-        DPV.setDebugLoc(I->getDebugLoc());
+        DVR.setDebugLoc(I->getDebugLoc());
     } else if (isa<Argument>(Storage))
       InsertPt = F->getEntryBlock().begin();
     if (InsertPt) {
-      DPV.removeFromParent();
-      (*InsertPt)->getParent()->insertDbgRecordBefore(&DPV, *InsertPt);
+      DVR.removeFromParent();
+      (*InsertPt)->getParent()->insertDbgRecordBefore(&DVR, *InsertPt);
     }
   }
 }
@@ -3188,15 +3189,15 @@ void coro::buildCoroutineFrame(
   for (auto &Iter : FrameData.Spills) {
     auto *V = Iter.first;
     SmallVector<DbgValueInst *, 16> DVIs;
-    SmallVector<DPValue *, 16> DPVs;
-    findDbgValues(DVIs, V, &DPVs);
+    SmallVector<DbgVariableRecord *, 16> DVRs;
+    findDbgValues(DVIs, V, &DVRs);
     for (DbgValueInst *DVI : DVIs)
       if (Checker.isDefinitionAcrossSuspend(*V, DVI))
         FrameData.Spills[V].push_back(DVI);
     // Add the instructions which carry debug info that is in the frame.
-    for (DPValue *DPV : DPVs)
-      if (Checker.isDefinitionAcrossSuspend(*V, DPV->Marker->MarkedInstr))
-        FrameData.Spills[V].push_back(DPV->Marker->MarkedInstr);
+    for (DbgVariableRecord *DVR : DVRs)
+      if (Checker.isDefinitionAcrossSuspend(*V, DVR->Marker->MarkedInstr))
+        FrameData.Spills[V].push_back(DVR->Marker->MarkedInstr);
   }
 
   LLVM_DEBUG(dumpSpills("Spills", FrameData.Spills));
diff --git a/llvm/lib/Transforms/Coroutines/CoroInternal.h b/llvm/lib/Transforms/Coroutines/CoroInternal.h
index 09d1430b4c559e..84fd88806154e3 100644
--- a/llvm/lib/Transforms/Coroutines/CoroInternal.h
+++ b/llvm/lib/Transforms/Coroutines/CoroInternal.h
@@ -35,8 +35,8 @@ void salvageDebugInfo(
     SmallDenseMap<Argument *, AllocaInst *, 4> &ArgToAllocaMap,
     DbgVariableIntrinsic &DVI, bool OptimizeFrame, bool IsEntryPoint);
 void salvageDebugInfo(
-    SmallDenseMap<Argument *, AllocaInst *, 4> &ArgToAllocaMap, DPValue &DPV,
-    bool OptimizeFrame, bool UseEntryValue);
+    SmallDenseMap<Argument *, AllocaInst *, 4> &ArgToAllocaMap,
+    DbgVariableRecord &DVR, bool OptimizeFrame, bool UseEntryValue);
 
 // Keeps data and helper functions for lowering coroutine intrinsics.
 struct LowererBase {
diff --git a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp
index 0fce596d3e2ca0..3f3d81474fafe4 100644
--- a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp
+++ b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp
@@ -679,17 +679,18 @@ static void replaceSwiftErrorOps(Function &F, coro::Shape &Shape,
 }
 
 /// Returns all DbgVariableIntrinsic in F.
-static std::pair<SmallVector<DbgVariableIntrinsic *, 8>, SmallVector<DPValue *>>
+static std::pair<SmallVector<DbgVariableIntrinsic *, 8>,
+                 SmallVector<DbgVariableRecord *>>
 collectDbgVariableIntrinsics(Function &F) {
   SmallVector<DbgVariableIntrinsic *, 8> Intrinsics;
-  SmallVector<DPValue *> DPValues;
+  SmallVector<DbgVariableRecord *> DbgVariableRecords;
   for (auto &I : instructions(F)) {
-    for (DPValue &DPV : filterDbgVars(I.getDbgRecordRange()))
-      DPValues.push_back(&DPV);
+    for (DbgVariableRecord &DVR : filterDbgVars(I.getDbgRecordRange()))
+      DbgVariableRecords.push_back(&DVR);
     if (auto *DVI = dyn_cast<DbgVariableIntrinsic>(&I))
       Intrinsics.push_back(DVI);
   }
-  return {Intrinsics, DPValues};
+  return {Intrinsics, DbgVariableRecords};
 }
 
 void CoroCloner::replaceSwiftErrorOps() {
@@ -697,7 +698,7 @@ void CoroCloner::replaceSwiftErrorOps() {
 }
 
 void CoroCloner::salvageDebugInfo() {
-  auto [Worklist, DPValues] = collectDbgVariableIntrinsics(*NewF);
+  auto [Worklist, DbgVariableRecords] = collectDbgVariableIntrinsics(*NewF);
   SmallDenseMap<Argument *, AllocaInst *, 4> ArgToAllocaMap;
 
   // Only 64-bit ABIs have a register we can refer to with the entry value.
@@ -706,8 +707,8 @@ void CoroCloner::salvageDebugInfo() {
   for (DbgVariableIntrinsic *DVI : Worklist)
     coro::salvageDebugInfo(ArgToAllocaMap, *DVI, Shape.OptimizeFrame,
                            UseEntryValue);
-  for (DPValue *DPV : DPValues)
-    coro::salvageDebugInfo(ArgToAllocaMap, *DPV, Shape.OptimizeFrame,
+  for (DbgVariableRecord *DVR : DbgVariableRecords)
+    coro::salvageDebugInfo(ArgToAllocaMap, *DVR, Shape.OptimizeFrame,
                            UseEntryValue);
 
   // Remove all salvaged dbg.declare intrinsics that became
@@ -732,7 +733,7 @@ void CoroCloner::salvageDebugInfo() {
     }
   };
   for_each(Worklist, RemoveOne);
-  for_each(DPValues, RemoveOne);
+  for_each(DbgVariableRecords, RemoveOne);
 }
 
 void CoroCloner::replaceEntryBlock() {
@@ -2107,12 +2108,12 @@ splitCoroutine(Function &F, SmallVectorImpl<Function *> &Clones,
   // original function. The Cloner has already salvaged debug info in the new
   // coroutine funclets.
   SmallDenseMap<Argument *, AllocaInst *, 4> ArgToAllocaMap;
-  auto [DbgInsts, DPValues] = collectDbgVariableIntrinsics(F);
+  auto [DbgInsts, DbgVariableRecords] = collectDbgVariableIntrinsics(F);
   for (auto *DDI : DbgInsts)
     coro::salvageDebugInfo(ArgToAllocaMap, *DDI, Shape.OptimizeFrame,
                            false /*UseEntryValue*/);
-  for (DPValue *DPV : DPValues)
-    coro::salvageDebugInfo(ArgToAllocaMap, *DPV, Shape.OptimizeFrame,
+  for (DbgVariableRecord *DVR : DbgVariableRecords)
+    coro::salvageDebugInfo(ArgToAllocaMap, *DVR, Shape.OptimizeFrame,
                            false /*UseEntryValue*/);
   return Shape;
 }
diff --git a/llvm/lib/Transforms/IPO/GlobalSplit.cpp b/llvm/lib/Transforms/IPO/GlobalSplit.cpp
index 84e9c219f935cd..dced3a5fdc7875 100644
--- a/llvm/lib/Transforms/IPO/GlobalSplit.cpp
+++ b/llvm/lib/Transforms/IPO/GlobalSplit.cpp
@@ -47,28 +47,66 @@ static bool splitGlobal(GlobalVariable &GV) {
   if (!Init)
     return false;
 
-  // Verify that each user of the global is an inrange getelementptr constant.
-  // From this it follows that any loads from or stores to that global must use
-  // a pointer derived from an inrange getelementptr constant, which is
-  // sufficient to allow us to apply the splitting transform.
+  const DataLayout &DL = GV.getParent()->getDataLayout();
+  const StructLayout *SL = DL.getStructLayout(Init->getType());
+  ArrayRef<TypeSize> MemberOffsets = SL->getMemberOffsets();
+  unsigned IndexWidth = DL.getIndexTypeSizeInBits(GV.getType());
+
+  // Verify that each user of the global is an inrange getelementptr constant,
+  // and collect information on how it relates to the global.
+  struct GEPInfo {
+    GEPOperator *GEP;
+    unsigned MemberIndex;
+    APInt MemberRelativeOffset;
+
+    GEPInfo(GEPOperator *GEP, unsigned MemberIndex, APInt MemberRelativeOffset)
+        : GEP(GEP), MemberIndex(MemberIndex),
+          MemberRelativeOffset(std::move(MemberRelativeOffset)) {}
+  };
+  SmallVector<GEPInfo> Infos;
   for (User *U : GV.users()) {
-    if (!isa<Constant>(U))
+    auto *GEP = dyn_cast<GEPOperator>(U);
+    if (!GEP)
       return false;
 
-    auto *GEP = dyn_cast<GEPOperator>(U);
-    if (!GEP || !GEP->getInRangeIndex() || *GEP->getInRangeIndex() != 1 ||
-        !isa<ConstantInt>(GEP->getOperand(1)) ||
-        !cast<ConstantInt>(GEP->getOperand(1))->isZero() ||
-        !isa<ConstantInt>(GEP->getOperand(2)))
+    std::optional<ConstantRange> InRange = GEP->getInRange();
+    if (!InRange)
+      return false;
+
+    APInt Offset(IndexWidth, 0);
+    if (!GEP->accumulateConstantOffset(DL, Offset))
+      return false;
+
+    // Determine source-relative inrange.
+    ConstantRange SrcInRange = InRange->sextOrTrunc(IndexWidth).add(Offset);
+
+    // Check that the GEP offset is in the range (treating upper bound as
+    // inclusive here).
+    if (!SrcInRange.contains(Offset) && SrcInRange.getUpper() != Offset)
+      return false;
+
+    // Find which struct member the range corresponds to.
+    if (SrcInRange.getLower().uge(SL->getSizeInBytes()))
       return false;
+
+    unsigned MemberIndex =
+        SL->getElementContainingOffset(SrcInRange.getLower().getZExtValue());
+    TypeSize MemberStart = MemberOffsets[MemberIndex];
+    TypeSize MemberEnd = MemberIndex == MemberOffsets.size() - 1
+                             ? SL->getSizeInBytes()
+                             : MemberOffsets[MemberIndex + 1];
+
+    // Verify that the range matches that struct member.
+    if (SrcInRange.getLower() != MemberStart ||
+        SrcInRange.getUpper() != MemberEnd)
+      return false;
+
+    Infos.emplace_back(GEP, MemberIndex, Offset - MemberStart);
   }
 
   SmallVector<MDNode *, 2> Types;
   GV.getMetadata(LLVMContext::MD_type, Types);
 
-  const DataLayout &DL = GV.getParent()->getDataLayout();
-  const StructLayout *SL = DL.getStructLayout(Init->getType());
-
   IntegerType *Int32Ty = Type::getInt32Ty(GV.getContext());
 
   std::vector<GlobalVariable *> SplitGlobals(Init->getNumOperands());
@@ -114,21 +152,13 @@ static bool splitGlobal(GlobalVariable &GV) {
       SplitGV->setVCallVisibilityMetadata(GV.getVCallVisibility());
   }
 
-  for (User *U : GV.users()) {
-    auto *GEP = cast<GEPOperator>(U);
-    unsigned I = cast<ConstantInt>(GEP->getOperand(2))->getZExtValue();
-    if (I >= SplitGlobals.size())
-      continue;
-
-    SmallVector<Value *, 4> Ops;
-    Ops.push_back(ConstantInt::get(Int32Ty, 0));
-    for (unsigned I = 3; I != GEP->getNumOperands(); ++I)
-      Ops.push_back(GEP->getOperand(I));
-
+  for (const GEPInfo &Info : Infos) {
+    assert(Info.MemberIndex < SplitGlobals.size() && "Invalid member");
     auto *NewGEP = ConstantExpr::getGetElementPtr(
-        SplitGlobals[I]->getInitializer()->getType(), SplitGlobals[I], Ops,
-        GEP->isInBounds());
-    GEP->replaceAllUsesWith(NewGEP);
+        Type::getInt8Ty(GV.getContext()), SplitGlobals[Info.MemberIndex],
+        ConstantInt::get(GV.getContext(), Info.MemberRelativeOffset),
+        Info.GEP->isInBounds());
+    Info.GEP->replaceAllUsesWith(NewGEP);
   }
 
   // Finally, remove the original global. Any remaining uses refer to invalid
diff --git a/llvm/lib/Transforms/IPO/IROutliner.cpp b/llvm/lib/Transforms/IPO/IROutliner.cpp
index 37f4a8749fd8e8..29f9264f2fc249 100644
--- a/llvm/lib/Transforms/IPO/IROutliner.cpp
+++ b/llvm/lib/Transforms/IPO/IROutliner.cpp
@@ -723,8 +723,8 @@ static void moveFunctionData(Function &Old, Function &New,
     for (Instruction &Val : CurrBB) {
       // Since debug-info originates from many different locations in the
       // program, it will cause incorrect reporting from a debugger if we keep
-      // the same debug instructions. Drop non-intrinsic DPValues here,
-      // collect intrinsics for removal later.
+      // the same debug instructions. Drop non-intrinsic DbgVariableRecords
+      // here, collect intrinsics for removal later.
       Val.dropDbgRecords();
 
       // We must handle the scoping of called functions differently than
diff --git a/llvm/lib/Transforms/IPO/MergeFunctions.cpp b/llvm/lib/Transforms/IPO/MergeFunctions.cpp
index 4ff14b671af61a..05a3b169aaaf4c 100644
--- a/llvm/lib/Transforms/IPO/MergeFunctions.cpp
+++ b/llvm/lib/Transforms/IPO/MergeFunctions.cpp
@@ -256,20 +256,22 @@ class MergeFunctions {
 
   /// Fill PDIUnrelatedWL with instructions from the entry block that are
   /// unrelated to parameter related debug info.
-  /// \param PDPVUnrelatedWL The equivalent non-intrinsic debug records.
-  void filterInstsUnrelatedToPDI(BasicBlock *GEntryBlock,
-                                 std::vector<Instruction *> &PDIUnrelatedWL,
-                                 std::vector<DPValue *> &PDPVUnrelatedWL);
+  /// \param PDVRUnrelatedWL The equivalent non-intrinsic debug records.
+  void
+  filterInstsUnrelatedToPDI(BasicBlock *GEntryBlock,
+                            std::vector<Instruction *> &PDIUnrelatedWL,
+                            std::vector<DbgVariableRecord *> &PDVRUnrelatedWL);
 
   /// Erase the rest of the CFG (i.e. barring the entry block).
   void eraseTail(Function *G);
 
   /// Erase the instructions in PDIUnrelatedWL as they are unrelated to the
   /// parameter debug info, from the entry block.
-  /// \param PDPVUnrelatedWL contains the equivalent set of non-instruction
+  /// \param PDVRUnrelatedWL contains the equivalent set of non-instruction
   /// debug-info records.
-  void eraseInstsUnrelatedToPDI(std::vector<Instruction *> &PDIUnrelatedWL,
-                                std::vector<DPValue *> &PDPVUnrelatedWL);
+  void
+  eraseInstsUnrelatedToPDI(std::vector<Instruction *> &PDIUnrelatedWL,
+                           std::vector<DbgVariableRecord *> &PDVRUnrelatedWL);
 
   /// Replace G with a simple tail call to bitcast(F). Also (unless
   /// MergeFunctionsPDI holds) replace direct uses of G with bitcast(F),
@@ -512,7 +514,7 @@ static Value *createCast(IRBuilder<> &Builder, Value *V, Type *DestTy) {
 // parameter debug info, from the entry block.
 void MergeFunctions::eraseInstsUnrelatedToPDI(
     std::vector<Instruction *> &PDIUnrelatedWL,
-    std::vector<DPValue *> &PDPVUnrelatedWL) {
+    std::vector<DbgVariableRecord *> &PDVRUnrelatedWL) {
   LLVM_DEBUG(
       dbgs() << " Erasing instructions (in reverse order of appearance in "
                 "entry block) unrelated to parameter debug info from entry "
@@ -526,13 +528,13 @@ void MergeFunctions::eraseInstsUnrelatedToPDI(
     PDIUnrelatedWL.pop_back();
   }
 
-  while (!PDPVUnrelatedWL.empty()) {
-    DPValue *DPV = PDPVUnrelatedWL.back();
-    LLVM_DEBUG(dbgs() << "  Deleting DPValue ");
-    LLVM_DEBUG(DPV->print(dbgs()));
+  while (!PDVRUnrelatedWL.empty()) {
+    DbgVariableRecord *DVR = PDVRUnrelatedWL.back();
+    LLVM_DEBUG(dbgs() << "  Deleting DbgVariableRecord ");
+    LLVM_DEBUG(DVR->print(dbgs()));
     LLVM_DEBUG(dbgs() << "\n");
-    DPV->eraseFromParent();
-    PDPVUnrelatedWL.pop_back();
+    DVR->eraseFromParent();
+    PDVRUnrelatedWL.pop_back();
   }
 
   LLVM_DEBUG(dbgs() << " } // Done erasing instructions unrelated to parameter "
@@ -564,12 +566,12 @@ void MergeFunctions::eraseTail(Function *G) {
 // PDIUnrelatedWL with such instructions.
 void MergeFunctions::filterInstsUnrelatedToPDI(
     BasicBlock *GEntryBlock, std::vector<Instruction *> &PDIUnrelatedWL,
-    std::vector<DPValue *> &PDPVUnrelatedWL) {
+    std::vector<DbgVariableRecord *> &PDVRUnrelatedWL) {
   std::set<Instruction *> PDIRelated;
-  std::set<DPValue *> PDPVRelated;
+  std::set<DbgVariableRecord *> PDVRRelated;
 
-  // Work out whether a dbg.value intrinsic or an equivalent DPValue is a
-  // parameter to be preserved.
+  // Work out whether a dbg.value intrinsic or an equivalent DbgVariableRecord
+  // is a parameter to be preserved.
   auto ExamineDbgValue = [](auto *DbgVal, auto &Container) {
     LLVM_DEBUG(dbgs() << " Deciding: ");
     LLVM_DEBUG(DbgVal->print(dbgs()));
@@ -641,14 +643,14 @@ void MergeFunctions::filterInstsUnrelatedToPDI(
 
   for (BasicBlock::iterator BI = GEntryBlock->begin(), BIE = GEntryBlock->end();
        BI != BIE; ++BI) {
-    // Examine DPValues as they happen "before" the instruction. Are they
-    // connected to parameters?
-    for (DPValue &DPV : filterDbgVars(BI->getDbgRecordRange())) {
-      if (DPV.isDbgValue() || DPV.isDbgAssign()) {
-        ExamineDbgValue(&DPV, PDPVRelated);
+    // Examine DbgVariableRecords as they happen "before" the instruction. Are
+    // they connected to parameters?
+    for (DbgVariableRecord &DVR : filterDbgVars(BI->getDbgRecordRange())) {
+      if (DVR.isDbgValue() || DVR.isDbgAssign()) {
+        ExamineDbgValue(&DVR, PDVRRelated);
       } else {
-        assert(DPV.isDbgDeclare());
-        ExamineDbgDeclare(&DPV, PDPVRelated);
+        assert(DVR.isDbgDeclare());
+        ExamineDbgDeclare(&DVR, PDVRRelated);
       }
     }
 
@@ -686,8 +688,8 @@ void MergeFunctions::filterInstsUnrelatedToPDI(
 
   // Collect the set of unrelated instructions and debug records.
   for (Instruction &I : *GEntryBlock) {
-    for (DPValue &DPV : filterDbgVars(I.getDbgRecordRange()))
-      IsPDIRelated(&DPV, PDPVRelated, PDPVUnrelatedWL);
+    for (DbgVariableRecord &DVR : filterDbgVars(I.getDbgRecordRange()))
+      IsPDIRelated(&DVR, PDVRRelated, PDVRUnrelatedWL);
     IsPDIRelated(&I, PDIRelated, PDIUnrelatedWL);
   }
   LLVM_DEBUG(dbgs() << " }\n");
@@ -728,7 +730,7 @@ static void copyMetadataIfPresent(Function *From, Function *To, StringRef Key) {
 void MergeFunctions::writeThunk(Function *F, Function *G) {
   BasicBlock *GEntryBlock = nullptr;
   std::vector<Instruction *> PDIUnrelatedWL;
-  std::vector<DPValue *> PDPVUnrelatedWL;
+  std::vector<DbgVariableRecord *> PDVRUnrelatedWL;
   BasicBlock *BB = nullptr;
   Function *NewG = nullptr;
   if (MergeFunctionsPDI) {
@@ -740,7 +742,7 @@ void MergeFunctions::writeThunk(Function *F, Function *G) {
         dbgs() << "writeThunk: (MergeFunctionsPDI) filter parameter related "
                   "debug info for "
                << G->getName() << "() {\n");
-    filterInstsUnrelatedToPDI(GEntryBlock, PDIUnrelatedWL, PDPVUnrelatedWL);
+    filterInstsUnrelatedToPDI(GEntryBlock, PDIUnrelatedWL, PDVRUnrelatedWL);
     GEntryBlock->getTerminator()->eraseFromParent();
     BB = GEntryBlock;
   } else {
@@ -790,7 +792,7 @@ void MergeFunctions::writeThunk(Function *F, Function *G) {
                  << G->getName() << "()\n");
     }
     eraseTail(G);
-    eraseInstsUnrelatedToPDI(PDIUnrelatedWL, PDPVUnrelatedWL);
+    eraseInstsUnrelatedToPDI(PDIUnrelatedWL, PDVRUnrelatedWL);
     LLVM_DEBUG(
         dbgs() << "} // End of parameter related debug info filtering for: "
                << G->getName() << "()\n");
diff --git a/llvm/lib/Transforms/IPO/SampleProfile.cpp b/llvm/lib/Transforms/IPO/SampleProfile.cpp
index ffc26f1a0a972d..9a8040bc4b064e 100644
--- a/llvm/lib/Transforms/IPO/SampleProfile.cpp
+++ b/llvm/lib/Transforms/IPO/SampleProfile.cpp
@@ -1167,6 +1167,9 @@ void SampleProfileLoader::findExternalInlineCandidate(
   // For AutoFDO profile, retrieve candidate profiles by walking over
   // the nested inlinee profiles.
   if (!FunctionSamples::ProfileIsCS) {
+    // Set threshold to zero to honor pre-inliner decision.
+    if (UsePreInlinerDecision)
+      Threshold = 0;
     Samples->findInlinedFunctions(InlinedGUIDs, SymbolMap, Threshold);
     return;
   }
diff --git a/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp b/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp
index dd6062d303d42e..3986359b6a5a35 100644
--- a/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp
+++ b/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp
@@ -575,16 +575,17 @@ bool writeThinLTOBitcode(raw_ostream &OS, raw_ostream *ThinLinkOS,
 }
 
 } // anonymous namespace
-
+extern bool WriteNewDbgInfoFormatToBitcode;
 PreservedAnalyses
 llvm::ThinLTOBitcodeWriterPass::run(Module &M, ModuleAnalysisManager &AM) {
   FunctionAnalysisManager &FAM =
       AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
 
-  // RemoveDIs: there's no bitcode representation of the DPValue debug-info,
-  // convert to dbg.values before writing out.
-  bool IsNewDbgInfoFormat = M.IsNewDbgInfoFormat;
-  if (IsNewDbgInfoFormat)
+  // RemoveDIs: there's no bitcode representation of the DbgVariableRecord
+  // debug-info, convert to dbg.values before writing out.
+  bool ConvertToOldDbgFormatForWrite =
+      M.IsNewDbgInfoFormat && !WriteNewDbgInfoFormatToBitcode;
+  if (ConvertToOldDbgFormatForWrite)
     M.convertFromNewDbgValues();
 
   bool Changed = writeThinLTOBitcode(
@@ -594,7 +595,7 @@ llvm::ThinLTOBitcodeWriterPass::run(Module &M, ModuleAnalysisManager &AM) {
       },
       M, &AM.getResult<ModuleSummaryIndexAnalysis>(M));
 
-  if (IsNewDbgInfoFormat)
+  if (ConvertToOldDbgFormatForWrite)
     M.convertToNewDbgValues();
 
   return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 694b18017babc5..426b548c074adf 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -274,7 +274,7 @@ Instruction *InstCombinerImpl::SimplifyAnyMemSet(AnyMemSetInst *MI) {
         DbgAssign->replaceVariableLocationOp(FillC, FillVal);
     };
     for_each(at::getAssignmentMarkers(S), replaceOpForAssignmentMarkers);
-    for_each(at::getDPVAssignmentMarkers(S), replaceOpForAssignmentMarkers);
+    for_each(at::getDVRAssignmentMarkers(S), replaceOpForAssignmentMarkers);
 
     S->setAlignment(Alignment);
     if (isa<AtomicMemSetInst>(MI))
@@ -2289,13 +2289,14 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
         default:
           llvm_unreachable("unexpected intrinsic ID");
         }
-        Instruction *NewCall = Builder.CreateBinaryIntrinsic(
+        Value *V = Builder.CreateBinaryIntrinsic(
             IID, X, ConstantFP::get(Arg0->getType(), Res), II);
         // TODO: Conservatively intersecting FMF. If Res == C2, the transform
         //       was a simplification (so Arg0 and its original flags could
         //       propagate?)
-        NewCall->andIRFlags(M);
-        return replaceInstUsesWith(*II, NewCall);
+        if (auto *CI = dyn_cast<CallInst>(V))
+          CI->andIRFlags(M);
+        return replaceInstUsesWith(*II, V);
       }
     }
 
@@ -2472,6 +2473,16 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
     if (match(Sign, m_Intrinsic<Intrinsic::copysign>(m_Value(), m_Value(X))))
       return replaceOperand(*II, 1, X);
 
+    // Clear sign-bit of constant magnitude:
+    // copysign -MagC, X --> copysign MagC, X
+    // TODO: Support constant folding for fabs
+    const APFloat *MagC;
+    if (match(Mag, m_APFloat(MagC)) && MagC->isNegative()) {
+      APFloat PosMagC = *MagC;
+      PosMagC.clearSign();
+      return replaceOperand(*II, 0, ConstantFP::get(Mag->getType(), PosMagC));
+    }
+
     // Peek through changes of magnitude's sign-bit. This call rewrites those:
     // copysign (fabs X), Sign --> copysign X, Sign
     // copysign (fneg X), Sign --> copysign X, Sign
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
index 381cd858d26293..089a70c6e6cca1 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -1923,10 +1923,26 @@ Instruction *InstCombinerImpl::foldItoFPtoI(CastInst &FI) {
   return replaceInstUsesWith(FI, X);
 }
 
+static Instruction *foldFPtoI(Instruction &FI, InstCombiner &IC) {
+  // fpto{u/s}i non-norm --> 0
+  FPClassTest Mask =
+      FI.getOpcode() == Instruction::FPToUI ? fcPosNormal : fcNormal;
+  KnownFPClass FPClass =
+      computeKnownFPClass(FI.getOperand(0), Mask, /*Depth=*/0,
+                          IC.getSimplifyQuery().getWithInstruction(&FI));
+  if (FPClass.isKnownNever(Mask))
+    return IC.replaceInstUsesWith(FI, ConstantInt::getNullValue(FI.getType()));
+
+  return nullptr;
+}
+
 Instruction *InstCombinerImpl::visitFPToUI(FPToUIInst &FI) {
   if (Instruction *I = foldItoFPtoI(FI))
     return I;
 
+  if (Instruction *I = foldFPtoI(FI, *this))
+    return I;
+
   return commonCastTransforms(FI);
 }
 
@@ -1934,6 +1950,9 @@ Instruction *InstCombinerImpl::visitFPToSI(FPToSIInst &FI) {
   if (Instruction *I = foldItoFPtoI(FI))
     return I;
 
+  if (Instruction *I = foldFPtoI(FI, *this))
+    return I;
+
   return commonCastTransforms(FI);
 }
 
@@ -1942,11 +1961,7 @@ Instruction *InstCombinerImpl::visitUIToFP(CastInst &CI) {
 }
 
 Instruction *InstCombinerImpl::visitSIToFP(CastInst &CI) {
-  if (Instruction *R = commonCastTransforms(CI))
-    return R;
-  if (isKnownNonNegative(CI.getOperand(0), SQ))
-    return new UIToFPInst(CI.getOperand(0), CI.getType());
-  return nullptr;
+  return commonCastTransforms(CI);
 }
 
 Instruction *InstCombinerImpl::visitIntToPtr(IntToPtrInst &CI) {
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
index 0dce0077bf1588..db302d7e526844 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -4177,7 +4177,9 @@ static bool isMaskOrZero(const Value *V, bool Not, const SimplifyQuery &Q,
 /// a check for a lossy truncation.
 /// Folds:
 ///   icmp SrcPred (x & Mask), x    to    icmp DstPred x, Mask
+///   icmp SrcPred (x & ~Mask), ~Mask    to    icmp DstPred x, ~Mask
 ///   icmp eq/ne (x & ~Mask), 0     to    icmp DstPred x, Mask
+///   icmp eq/ne (~x | Mask), -1     to    icmp DstPred x, Mask
 /// Where Mask is some pattern that produces all-ones in low bits:
 ///    (-1 >> y)
 ///    ((-1 << y) >> y)     <- non-canonical, has extra uses
@@ -4189,82 +4191,126 @@ static bool isMaskOrZero(const Value *V, bool Not, const SimplifyQuery &Q,
 static Value *foldICmpWithLowBitMaskedVal(ICmpInst::Predicate Pred, Value *Op0,
                                           Value *Op1, const SimplifyQuery &Q,
                                           InstCombiner &IC) {
-  Value *X, *M;
-  bool NeedsNot = false;
-
-  auto CheckMask = [&](Value *V, bool Not) {
-    if (ICmpInst::isSigned(Pred) && !match(V, m_ImmConstant()))
-      return false;
-    return isMaskOrZero(V, Not, Q);
-  };
-
-  if (match(Op0, m_c_And(m_Specific(Op1), m_Value(M))) &&
-      CheckMask(M, /*Not*/ false)) {
-    X = Op1;
-  } else if (match(Op1, m_Zero()) && ICmpInst::isEquality(Pred) &&
-             match(Op0, m_OneUse(m_And(m_Value(X), m_Value(M))))) {
-    NeedsNot = true;
-    if (IC.isFreeToInvert(X, X->hasOneUse()) && CheckMask(X, /*Not*/ true))
-      std::swap(X, M);
-    else if (!IC.isFreeToInvert(M, M->hasOneUse()) ||
-             !CheckMask(M, /*Not*/ true))
-      return nullptr;
-  } else {
-    return nullptr;
-  }
 
   ICmpInst::Predicate DstPred;
   switch (Pred) {
   case ICmpInst::Predicate::ICMP_EQ:
-    //  x & (-1 >> y) == x    ->    x u<= (-1 >> y)
+    //  x & Mask == x
+    //  x & ~Mask == 0
+    //  ~x | Mask == -1
+    //    ->    x u<= Mask
+    //  x & ~Mask == ~Mask
+    //    ->    ~Mask u<= x
     DstPred = ICmpInst::Predicate::ICMP_ULE;
     break;
   case ICmpInst::Predicate::ICMP_NE:
-    //  x & (-1 >> y) != x    ->    x u> (-1 >> y)
+    //  x & Mask != x
+    //  x & ~Mask != 0
+    //  ~x | Mask != -1
+    //    ->    x u> Mask
+    //  x & ~Mask != ~Mask
+    //    ->    ~Mask u> x
     DstPred = ICmpInst::Predicate::ICMP_UGT;
     break;
   case ICmpInst::Predicate::ICMP_ULT:
-    //  x & (-1 >> y) u< x    ->    x u> (-1 >> y)
-    //  x u> x & (-1 >> y)    ->    x u> (-1 >> y)
+    //  x & Mask u< x
+    //    -> x u> Mask
+    //  x & ~Mask u< ~Mask
+    //    -> ~Mask u> x
     DstPred = ICmpInst::Predicate::ICMP_UGT;
     break;
   case ICmpInst::Predicate::ICMP_UGE:
-    //  x & (-1 >> y) u>= x    ->    x u<= (-1 >> y)
-    //  x u<= x & (-1 >> y)    ->    x u<= (-1 >> y)
+    //  x & Mask u>= x
+    //    -> x u<= Mask
+    //  x & ~Mask u>= ~Mask
+    //    -> ~Mask u<= x
     DstPred = ICmpInst::Predicate::ICMP_ULE;
     break;
   case ICmpInst::Predicate::ICMP_SLT:
-    //  x & (-1 >> y) s< x    ->    x s> (-1 >> y)
-    //  x s> x & (-1 >> y)    ->    x s> (-1 >> y)
-    if (!match(M, m_Constant())) // Can not do this fold with non-constant.
-      return nullptr;
-    if (!match(M, m_NonNegative())) // Must not have any -1 vector elements.
-      return nullptr;
+    //  x & Mask s< x [iff Mask s>= 0]
+    //    -> x s> Mask
+    //  x & ~Mask s< ~Mask [iff ~Mask != 0]
+    //    -> ~Mask s> x
     DstPred = ICmpInst::Predicate::ICMP_SGT;
     break;
   case ICmpInst::Predicate::ICMP_SGE:
-    //  x & (-1 >> y) s>= x    ->    x s<= (-1 >> y)
-    //  x s<= x & (-1 >> y)    ->    x s<= (-1 >> y)
-    if (!match(M, m_Constant())) // Can not do this fold with non-constant.
-      return nullptr;
-    if (!match(M, m_NonNegative())) // Must not have any -1 vector elements.
-      return nullptr;
+    //  x & Mask s>= x [iff Mask s>= 0]
+    //    -> x s<= Mask
+    //  x & ~Mask s>= ~Mask [iff ~Mask != 0]
+    //    -> ~Mask s<= x
     DstPred = ICmpInst::Predicate::ICMP_SLE;
     break;
-  case ICmpInst::Predicate::ICMP_SGT:
-  case ICmpInst::Predicate::ICMP_SLE:
-    return nullptr;
-  case ICmpInst::Predicate::ICMP_UGT:
-  case ICmpInst::Predicate::ICMP_ULE:
-    llvm_unreachable("Instsimplify took care of commut. variant");
-    break;
   default:
-    llvm_unreachable("All possible folds are handled.");
+    // We don't support sgt,sle
+    // ult/ugt are simplified to true/false respectively.
+    return nullptr;
   }
 
-  // The mask value may be a vector constant that has undefined elements. But it
-  // may not be safe to propagate those undefs into the new compare, so replace
-  // those elements by copying an existing, defined, and safe scalar constant.
+  Value *X, *M;
+  // Put search code in lambda for early positive returns.
+  auto IsLowBitMask = [&]() {
+    if (match(Op0, m_c_And(m_Specific(Op1), m_Value(M)))) {
+      X = Op1;
+      // Look for: x & Mask pred x
+      if (isMaskOrZero(M, /*Not=*/false, Q)) {
+        return !ICmpInst::isSigned(Pred) ||
+               (match(M, m_NonNegative()) || isKnownNonNegative(M, Q));
+      }
+
+      // Look for: x & ~Mask pred ~Mask
+      if (isMaskOrZero(X, /*Not=*/true, Q)) {
+        return !ICmpInst::isSigned(Pred) ||
+               isKnownNonZero(X, Q.DL, /*Depth=*/0, Q.AC, Q.CxtI, Q.DT);
+      }
+      return false;
+    }
+    if (ICmpInst::isEquality(Pred) && match(Op1, m_AllOnes()) &&
+        match(Op0, m_OneUse(m_Or(m_Value(X), m_Value(M))))) {
+
+      auto Check = [&]() {
+        // Look for: ~x | Mask == -1
+        if (isMaskOrZero(M, /*Not=*/false, Q)) {
+          if (Value *NotX =
+                  IC.getFreelyInverted(X, X->hasOneUse(), &IC.Builder)) {
+            X = NotX;
+            return true;
+          }
+        }
+        return false;
+      };
+      if (Check())
+        return true;
+      std::swap(X, M);
+      return Check();
+    }
+    if (ICmpInst::isEquality(Pred) && match(Op1, m_Zero()) &&
+        match(Op0, m_OneUse(m_And(m_Value(X), m_Value(M))))) {
+      auto Check = [&]() {
+        // Look for: x & ~Mask == 0
+        if (isMaskOrZero(M, /*Not=*/true, Q)) {
+          if (Value *NotM =
+                  IC.getFreelyInverted(M, M->hasOneUse(), &IC.Builder)) {
+            M = NotM;
+            return true;
+          }
+        }
+        return false;
+      };
+      if (Check())
+        return true;
+      std::swap(X, M);
+      return Check();
+    }
+    return false;
+  };
+
+  if (!IsLowBitMask())
+    return nullptr;
+
+  // The mask value may be a vector constant that has undefined elements. But
+  // it may not be safe to propagate those undefs into the new compare, so
+  // replace those elements by copying an existing, defined, and safe scalar
+  // constant.
   Type *OpTy = M->getType();
   auto *VecC = dyn_cast<Constant>(M);
   auto *OpVTy = dyn_cast<FixedVectorType>(OpTy);
@@ -4280,8 +4326,6 @@ static Value *foldICmpWithLowBitMaskedVal(ICmpInst::Predicate Pred, Value *Op0,
     M = Constant::replaceUndefsWith(VecC, SafeReplacementConstant);
   }
 
-  if (NeedsNot)
-    M = IC.Builder.CreateNot(M);
   return IC.Builder.CreateICmp(DstPred, X, M);
 }
 
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
index 6a1ef6edeb4077..b9ad3a74007929 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -98,6 +98,7 @@ class LLVM_LIBRARY_VISIBILITY InstCombinerImpl final
   Instruction *visitSub(BinaryOperator &I);
   Instruction *visitFSub(BinaryOperator &I);
   Instruction *visitMul(BinaryOperator &I);
+  Instruction *foldPowiReassoc(BinaryOperator &I);
   Instruction *foldFMulReassoc(BinaryOperator &I);
   Instruction *visitFMul(BinaryOperator &I);
   Instruction *visitURem(BinaryOperator &I);
@@ -757,10 +758,9 @@ class LLVM_LIBRARY_VISIBILITY InstCombinerImpl final
   void tryToSinkInstructionDbgValues(
       Instruction *I, BasicBlock::iterator InsertPos, BasicBlock *SrcBlock,
       BasicBlock *DestBlock, SmallVectorImpl<DbgVariableIntrinsic *> &DbgUsers);
-  void tryToSinkInstructionDPValues(Instruction *I,
-                                    BasicBlock::iterator InsertPos,
-                                    BasicBlock *SrcBlock, BasicBlock *DestBlock,
-                                    SmallVectorImpl<DPValue *> &DPUsers);
+  void tryToSinkInstructionDbgVariableRecords(
+      Instruction *I, BasicBlock::iterator InsertPos, BasicBlock *SrcBlock,
+      BasicBlock *DestBlock, SmallVectorImpl<DbgVariableRecord *> &DPUsers);
 
   bool removeInstructionsBeforeUnreachable(Instruction &I);
   void addDeadEdge(BasicBlock *From, BasicBlock *To,
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
index 278be6233f4b8a..9d4c271f990d19 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
@@ -448,6 +448,14 @@ Instruction *InstCombinerImpl::visitMul(BinaryOperator &I) {
   if (match(Op1, m_ZExt(m_Value(X))) && X->getType()->isIntOrIntVectorTy(1))
     return SelectInst::Create(X, Op0, ConstantInt::getNullValue(Ty));
 
+  // mul (sext X), Y -> select X, -Y, 0
+  // mul Y, (sext X) -> select X, -Y, 0
+  if (match(&I, m_c_Mul(m_OneUse(m_SExt(m_Value(X))), m_Value(Y))) &&
+      X->getType()->isIntOrIntVectorTy(1))
+    return SelectInst::Create(
+        X, Builder.CreateNeg(Y, "", /*HasNUW=*/false, I.hasNoSignedWrap()),
+        ConstantInt::getNullValue(Op0->getType()));
+
   Constant *ImmC;
   if (match(Op1, m_ImmConstant(ImmC))) {
     // (sext bool X) * C --> X ? -C : 0
@@ -571,6 +579,42 @@ Instruction *InstCombinerImpl::foldFPSignBitOps(BinaryOperator &I) {
   return nullptr;
 }
 
+Instruction *InstCombinerImpl::foldPowiReassoc(BinaryOperator &I) {
+  auto createPowiExpr = [](BinaryOperator &I, InstCombinerImpl &IC, Value *X,
+                           Value *Y, Value *Z) {
+    InstCombiner::BuilderTy &Builder = IC.Builder;
+    Value *YZ = Builder.CreateAdd(Y, Z);
+    auto *NewPow = Builder.CreateIntrinsic(
+        Intrinsic::powi, {X->getType(), YZ->getType()}, {X, YZ}, &I);
+    return IC.replaceInstUsesWith(I, NewPow);
+  };
+
+  Value *X, *Y, *Z;
+
+  // powi(X, Y) * X --> powi(X, Y+1)
+  // X * powi(X, Y) --> powi(X, Y+1)
+  if (match(&I, m_c_FMul(m_OneUse(m_AllowReassoc(m_Intrinsic<Intrinsic::powi>(
+                             m_Value(X), m_Value(Y)))),
+                         m_Deferred(X)))) {
+    Constant *One = ConstantInt::get(Y->getType(), 1);
+    if (willNotOverflowSignedAdd(Y, One, I))
+      return createPowiExpr(I, *this, X, Y, One);
+  }
+
+  // powi(x, y) * powi(x, z) -> powi(x, y + z)
+  Value *Op0 = I.getOperand(0);
+  Value *Op1 = I.getOperand(1);
+  if (I.isOnlyUserOfAnyOperand() &&
+      match(Op0, m_AllowReassoc(
+                     m_Intrinsic<Intrinsic::powi>(m_Value(X), m_Value(Y)))) &&
+      match(Op1, m_AllowReassoc(m_Intrinsic<Intrinsic::powi>(m_Specific(X),
+                                                             m_Value(Z)))) &&
+      Y->getType() == Z->getType())
+    return createPowiExpr(I, *this, X, Y, Z);
+
+  return nullptr;
+}
+
 Instruction *InstCombinerImpl::foldFMulReassoc(BinaryOperator &I) {
   Value *Op0 = I.getOperand(0);
   Value *Op1 = I.getOperand(1);
@@ -683,6 +727,9 @@ Instruction *InstCombinerImpl::foldFMulReassoc(BinaryOperator &I) {
     return replaceInstUsesWith(I, Pow);
   }
 
+  if (Instruction *FoldedPowi = foldPowiReassoc(I))
+    return FoldedPowi;
+
   if (I.isOnlyUserOfAnyOperand()) {
     // pow(X, Y) * pow(X, Z) -> pow(X, Y + Z)
     if (match(Op0, m_Intrinsic<Intrinsic::pow>(m_Value(X), m_Value(Y))) &&
@@ -699,16 +746,6 @@ Instruction *InstCombinerImpl::foldFMulReassoc(BinaryOperator &I) {
       return replaceInstUsesWith(I, NewPow);
     }
 
-    // powi(x, y) * powi(x, z) -> powi(x, y + z)
-    if (match(Op0, m_Intrinsic<Intrinsic::powi>(m_Value(X), m_Value(Y))) &&
-        match(Op1, m_Intrinsic<Intrinsic::powi>(m_Specific(X), m_Value(Z))) &&
-        Y->getType() == Z->getType()) {
-      auto *YZ = Builder.CreateAdd(Y, Z);
-      auto *NewPow = Builder.CreateIntrinsic(
-          Intrinsic::powi, {X->getType(), YZ->getType()}, {X, YZ}, &I);
-      return replaceInstUsesWith(I, NewPow);
-    }
-
     // exp(X) * exp(Y) -> exp(X + Y)
     if (match(Op0, m_Intrinsic<Intrinsic::exp>(m_Value(X))) &&
         match(Op1, m_Intrinsic<Intrinsic::exp>(m_Value(Y)))) {
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
index aee18d770f729d..9ab2bd8f70aa15 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
@@ -536,19 +536,29 @@ Instruction *InstCombinerImpl::foldSelectIntoOp(SelectInst &SI, Value *TrueVal,
     // between 0, 1 and -1.
     const APInt *OOpC;
     bool OOpIsAPInt = match(OOp, m_APInt(OOpC));
-    if (!isa<Constant>(OOp) ||
-        (OOpIsAPInt && isSelect01(C->getUniqueInteger(), *OOpC))) {
-      Value *NewSel = Builder.CreateSelect(SI.getCondition(), Swapped ? C : OOp,
-                                           Swapped ? OOp : C, "", &SI);
-      if (isa<FPMathOperator>(&SI))
-        cast<Instruction>(NewSel)->setFastMathFlags(FMF);
-      NewSel->takeName(TVI);
-      BinaryOperator *BO =
-          BinaryOperator::Create(TVI->getOpcode(), FalseVal, NewSel);
-      BO->copyIRFlags(TVI);
-      return BO;
-    }
-    return nullptr;
+    if (isa<Constant>(OOp) &&
+        (!OOpIsAPInt || !isSelect01(C->getUniqueInteger(), *OOpC)))
+      return nullptr;
+
+    // If the false value is a NaN then we have that the floating point math
+    // operation in the transformed code may not preserve the exact NaN
+    // bit-pattern -- e.g. `fadd sNaN, 0.0 -> qNaN`.
+    // This makes the transformation incorrect since the original program would
+    // have preserved the exact NaN bit-pattern.
+    // Avoid the folding if the false value might be a NaN.
+    if (isa<FPMathOperator>(&SI) &&
+        !computeKnownFPClass(FalseVal, FMF, fcNan, &SI).isKnownNeverNaN())
+      return nullptr;
+
+    Value *NewSel = Builder.CreateSelect(SI.getCondition(), Swapped ? C : OOp,
+                                         Swapped ? OOp : C, "", &SI);
+    if (isa<FPMathOperator>(&SI))
+      cast<Instruction>(NewSel)->setFastMathFlags(FMF);
+    NewSel->takeName(TVI);
+    BinaryOperator *BO =
+        BinaryOperator::Create(TVI->getOpcode(), FalseVal, NewSel);
+    BO->copyIRFlags(TVI);
+    return BO;
   };
 
   if (Instruction *R = TryFoldSelectIntoOp(SI, TrueVal, FalseVal, false))
@@ -1191,7 +1201,7 @@ static Value *canonicalizeSPF(ICmpInst &Cmp, Value *TrueVal, Value *FalseVal,
                           match(RHS, m_NSWNeg(m_Specific(LHS)));
     Constant *IntMinIsPoisonC =
         ConstantInt::get(Type::getInt1Ty(Cmp.getContext()), IntMinIsPoison);
-    Instruction *Abs =
+    Value *Abs =
         IC.Builder.CreateBinaryIntrinsic(Intrinsic::abs, LHS, IntMinIsPoisonC);
 
     if (SPF == SelectPatternFlavor::SPF_NABS)
diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index 1d73f8d96300f0..7c40fb4fc86082 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -1481,6 +1481,11 @@ Instruction *InstCombinerImpl::foldFBinOpOfIntCastsFromSign(
 
   // If we have a constant rhs, see if we can losslessly convert it to an int.
   if (Op1FpC != nullptr) {
+    // Signed + Mul req non-zero
+    if (OpsFromSigned && BO.getOpcode() == Instruction::FMul &&
+        !match(Op1FpC, m_NonZeroFP()))
+      return nullptr;
+
     Constant *Op1IntC = ConstantFoldCastOperand(
         OpsFromSigned ? Instruction::FPToSI : Instruction::FPToUI, Op1FpC,
         IntTy, DL);
@@ -1645,6 +1650,7 @@ static Value *foldOperationIntoSelectOperand(Instruction &I, SelectInst *SI,
                                              Value *NewOp, InstCombiner &IC) {
   Instruction *Clone = I.clone();
   Clone->replaceUsesOfWith(SI, NewOp);
+  Clone->dropUBImplyingAttrsAndMetadata();
   IC.InsertNewInstBefore(Clone, SI->getIterator());
   return Clone;
 }
@@ -2596,6 +2602,45 @@ Value *InstCombiner::getFreelyInvertedImpl(Value *V, bool WillInvertAllUses,
     return nullptr;
   }
 
+  // De Morgan's Laws:
+  // (~(A | B)) -> (~A & ~B)
+  // (~(A & B)) -> (~A | ~B)
+  auto TryInvertAndOrUsingDeMorgan = [&](Instruction::BinaryOps Opcode,
+                                         bool IsLogical, Value *A,
+                                         Value *B) -> Value * {
+    bool LocalDoesConsume = DoesConsume;
+    if (!getFreelyInvertedImpl(B, B->hasOneUse(), /*Builder=*/nullptr,
+                               LocalDoesConsume, Depth))
+      return nullptr;
+    if (auto *NotA = getFreelyInvertedImpl(A, A->hasOneUse(), Builder,
+                                           LocalDoesConsume, Depth)) {
+      auto *NotB = getFreelyInvertedImpl(B, B->hasOneUse(), Builder,
+                                         LocalDoesConsume, Depth);
+      DoesConsume = LocalDoesConsume;
+      if (IsLogical)
+        return Builder ? Builder->CreateLogicalOp(Opcode, NotA, NotB) : NonNull;
+      return Builder ? Builder->CreateBinOp(Opcode, NotA, NotB) : NonNull;
+    }
+
+    return nullptr;
+  };
+
+  if (match(V, m_Or(m_Value(A), m_Value(B))))
+    return TryInvertAndOrUsingDeMorgan(Instruction::And, /*IsLogical=*/false, A,
+                                       B);
+
+  if (match(V, m_And(m_Value(A), m_Value(B))))
+    return TryInvertAndOrUsingDeMorgan(Instruction::Or, /*IsLogical=*/false, A,
+                                       B);
+
+  if (match(V, m_LogicalOr(m_Value(A), m_Value(B))))
+    return TryInvertAndOrUsingDeMorgan(Instruction::And, /*IsLogical=*/true, A,
+                                       B);
+
+  if (match(V, m_LogicalAnd(m_Value(A), m_Value(B))))
+    return TryInvertAndOrUsingDeMorgan(Instruction::Or, /*IsLogical=*/true, A,
+                                       B);
+
   return nullptr;
 }
 
@@ -3075,10 +3120,10 @@ Instruction *InstCombinerImpl::visitAllocSite(Instruction &MI) {
   // If we are removing an alloca with a dbg.declare, insert dbg.value calls
   // before each store.
   SmallVector<DbgVariableIntrinsic *, 8> DVIs;
-  SmallVector<DPValue *, 8> DPVs;
+  SmallVector<DbgVariableRecord *, 8> DVRs;
   std::unique_ptr<DIBuilder> DIB;
   if (isa<AllocaInst>(MI)) {
-    findDbgUsers(DVIs, &MI, &DPVs);
+    findDbgUsers(DVIs, &MI, &DVRs);
     DIB.reset(new DIBuilder(*MI.getModule(), /*AllowUnresolved=*/false));
   }
 
@@ -3118,9 +3163,9 @@ Instruction *InstCombinerImpl::visitAllocSite(Instruction &MI) {
         for (auto *DVI : DVIs)
           if (DVI->isAddressOfVariable())
             ConvertDebugDeclareToDebugValue(DVI, SI, *DIB);
-        for (auto *DPV : DPVs)
-          if (DPV->isAddressOfVariable())
-            ConvertDebugDeclareToDebugValue(DPV, SI, *DIB);
+        for (auto *DVR : DVRs)
+          if (DVR->isAddressOfVariable())
+            ConvertDebugDeclareToDebugValue(DVR, SI, *DIB);
       } else {
         // Casts, GEP, or anything else: we're about to delete this instruction,
         // so it can not have any valid uses.
@@ -3165,9 +3210,9 @@ Instruction *InstCombinerImpl::visitAllocSite(Instruction &MI) {
     for (auto *DVI : DVIs)
       if (DVI->isAddressOfVariable() || DVI->getExpression()->startsWithDeref())
         DVI->eraseFromParent();
-    for (auto *DPV : DPVs)
-      if (DPV->isAddressOfVariable() || DPV->getExpression()->startsWithDeref())
-        DPV->eraseFromParent();
+    for (auto *DVR : DVRs)
+      if (DVR->isAddressOfVariable() || DVR->getExpression()->startsWithDeref())
+        DVR->eraseFromParent();
 
     return eraseInstFromFunction(MI);
   }
@@ -4568,12 +4613,13 @@ bool InstCombinerImpl::tryToSinkInstruction(Instruction *I,
   // mark the location undef: we know it was supposed to receive a new location
   // here, but that computation has been sunk.
   SmallVector<DbgVariableIntrinsic *, 2> DbgUsers;
-  SmallVector<DPValue *, 2> DPValues;
-  findDbgUsers(DbgUsers, I, &DPValues);
+  SmallVector<DbgVariableRecord *, 2> DbgVariableRecords;
+  findDbgUsers(DbgUsers, I, &DbgVariableRecords);
   if (!DbgUsers.empty())
     tryToSinkInstructionDbgValues(I, InsertPos, SrcBlock, DestBlock, DbgUsers);
-  if (!DPValues.empty())
-    tryToSinkInstructionDPValues(I, InsertPos, SrcBlock, DestBlock, DPValues);
+  if (!DbgVariableRecords.empty())
+    tryToSinkInstructionDbgVariableRecords(I, InsertPos, SrcBlock, DestBlock,
+                                           DbgVariableRecords);
 
   // PS: there are numerous flaws with this behaviour, not least that right now
   // assignments can be re-ordered past other assignments to the same variable
@@ -4646,47 +4692,48 @@ void InstCombinerImpl::tryToSinkInstructionDbgValues(
   }
 }
 
-void InstCombinerImpl::tryToSinkInstructionDPValues(
+void InstCombinerImpl::tryToSinkInstructionDbgVariableRecords(
     Instruction *I, BasicBlock::iterator InsertPos, BasicBlock *SrcBlock,
-    BasicBlock *DestBlock, SmallVectorImpl<DPValue *> &DPValues) {
-  // Implementation of tryToSinkInstructionDbgValues, but for the DPValue of
-  // variable assignments rather than dbg.values.
-
-  // Fetch all DPValues not already in the destination.
-  SmallVector<DPValue *, 2> DPValuesToSalvage;
-  for (auto &DPV : DPValues)
-    if (DPV->getParent() != DestBlock)
-      DPValuesToSalvage.push_back(DPV);
-
-  // Fetch a second collection, of DPValues in the source block that we're going
-  // to sink.
-  SmallVector<DPValue *> DPValuesToSink;
-  for (DPValue *DPV : DPValuesToSalvage)
-    if (DPV->getParent() == SrcBlock)
-      DPValuesToSink.push_back(DPV);
-
-  // Sort DPValues according to their position in the block. This is a partial
-  // order: DPValues attached to different instructions will be ordered by the
-  // instruction order, but DPValues attached to the same instruction won't
-  // have an order.
-  auto Order = [](DPValue *A, DPValue *B) -> bool {
+    BasicBlock *DestBlock,
+    SmallVectorImpl<DbgVariableRecord *> &DbgVariableRecords) {
+  // Implementation of tryToSinkInstructionDbgValues, but for the
+  // DbgVariableRecord of variable assignments rather than dbg.values.
+
+  // Fetch all DbgVariableRecords not already in the destination.
+  SmallVector<DbgVariableRecord *, 2> DbgVariableRecordsToSalvage;
+  for (auto &DVR : DbgVariableRecords)
+    if (DVR->getParent() != DestBlock)
+      DbgVariableRecordsToSalvage.push_back(DVR);
+
+  // Fetch a second collection, of DbgVariableRecords in the source block that
+  // we're going to sink.
+  SmallVector<DbgVariableRecord *> DbgVariableRecordsToSink;
+  for (DbgVariableRecord *DVR : DbgVariableRecordsToSalvage)
+    if (DVR->getParent() == SrcBlock)
+      DbgVariableRecordsToSink.push_back(DVR);
+
+  // Sort DbgVariableRecords according to their position in the block. This is a
+  // partial order: DbgVariableRecords attached to different instructions will
+  // be ordered by the instruction order, but DbgVariableRecords attached to the
+  // same instruction won't have an order.
+  auto Order = [](DbgVariableRecord *A, DbgVariableRecord *B) -> bool {
     return B->getInstruction()->comesBefore(A->getInstruction());
   };
-  llvm::stable_sort(DPValuesToSink, Order);
+  llvm::stable_sort(DbgVariableRecordsToSink, Order);
 
   // If there are two assignments to the same variable attached to the same
   // instruction, the ordering between the two assignments is important. Scan
   // for this (rare) case and establish which is the last assignment.
   using InstVarPair = std::pair<const Instruction *, DebugVariable>;
-  SmallDenseMap<InstVarPair, DPValue *> FilterOutMap;
-  if (DPValuesToSink.size() > 1) {
+  SmallDenseMap<InstVarPair, DbgVariableRecord *> FilterOutMap;
+  if (DbgVariableRecordsToSink.size() > 1) {
     SmallDenseMap<InstVarPair, unsigned> CountMap;
     // Count how many assignments to each variable there is per instruction.
-    for (DPValue *DPV : DPValuesToSink) {
+    for (DbgVariableRecord *DVR : DbgVariableRecordsToSink) {
       DebugVariable DbgUserVariable =
-          DebugVariable(DPV->getVariable(), DPV->getExpression(),
-                        DPV->getDebugLoc()->getInlinedAt());
-      CountMap[std::make_pair(DPV->getInstruction(), DbgUserVariable)] += 1;
+          DebugVariable(DVR->getVariable(), DVR->getExpression(),
+                        DVR->getDebugLoc()->getInlinedAt());
+      CountMap[std::make_pair(DVR->getInstruction(), DbgUserVariable)] += 1;
     }
 
     // If there are any instructions with two assignments, add them to the
@@ -4702,74 +4749,74 @@ void InstCombinerImpl::tryToSinkInstructionDPValues(
     // For all instruction/variable pairs needing extra filtering, find the
     // latest assignment.
     for (const Instruction *Inst : DupSet) {
-      for (DPValue &DPV :
+      for (DbgVariableRecord &DVR :
            llvm::reverse(filterDbgVars(Inst->getDbgRecordRange()))) {
         DebugVariable DbgUserVariable =
-            DebugVariable(DPV.getVariable(), DPV.getExpression(),
-                          DPV.getDebugLoc()->getInlinedAt());
+            DebugVariable(DVR.getVariable(), DVR.getExpression(),
+                          DVR.getDebugLoc()->getInlinedAt());
         auto FilterIt =
             FilterOutMap.find(std::make_pair(Inst, DbgUserVariable));
         if (FilterIt == FilterOutMap.end())
           continue;
         if (FilterIt->second != nullptr)
           continue;
-        FilterIt->second = &DPV;
+        FilterIt->second = &DVR;
       }
     }
   }
 
-  // Perform cloning of the DPValues that we plan on sinking, filter out any
-  // duplicate assignments identified above.
-  SmallVector<DPValue *, 2> DPVClones;
+  // Perform cloning of the DbgVariableRecords that we plan on sinking, filter
+  // out any duplicate assignments identified above.
+  SmallVector<DbgVariableRecord *, 2> DVRClones;
   SmallSet<DebugVariable, 4> SunkVariables;
-  for (DPValue *DPV : DPValuesToSink) {
-    if (DPV->Type == DPValue::LocationType::Declare)
+  for (DbgVariableRecord *DVR : DbgVariableRecordsToSink) {
+    if (DVR->Type == DbgVariableRecord::LocationType::Declare)
       continue;
 
     DebugVariable DbgUserVariable =
-        DebugVariable(DPV->getVariable(), DPV->getExpression(),
-                      DPV->getDebugLoc()->getInlinedAt());
+        DebugVariable(DVR->getVariable(), DVR->getExpression(),
+                      DVR->getDebugLoc()->getInlinedAt());
 
     // For any variable where there were multiple assignments in the same place,
     // ignore all but the last assignment.
     if (!FilterOutMap.empty()) {
-      InstVarPair IVP = std::make_pair(DPV->getInstruction(), DbgUserVariable);
+      InstVarPair IVP = std::make_pair(DVR->getInstruction(), DbgUserVariable);
       auto It = FilterOutMap.find(IVP);
 
       // Filter out.
-      if (It != FilterOutMap.end() && It->second != DPV)
+      if (It != FilterOutMap.end() && It->second != DVR)
         continue;
     }
 
     if (!SunkVariables.insert(DbgUserVariable).second)
       continue;
 
-    if (DPV->isDbgAssign())
+    if (DVR->isDbgAssign())
       continue;
 
-    DPVClones.emplace_back(DPV->clone());
-    LLVM_DEBUG(dbgs() << "CLONE: " << *DPVClones.back() << '\n');
+    DVRClones.emplace_back(DVR->clone());
+    LLVM_DEBUG(dbgs() << "CLONE: " << *DVRClones.back() << '\n');
   }
 
   // Perform salvaging without the clones, then sink the clones.
-  if (DPVClones.empty())
+  if (DVRClones.empty())
     return;
 
-  salvageDebugInfoForDbgValues(*I, {}, DPValuesToSalvage);
+  salvageDebugInfoForDbgValues(*I, {}, DbgVariableRecordsToSalvage);
 
   // The clones are in reverse order of original appearance. Assert that the
   // head bit is set on the iterator as we _should_ have received it via
   // getFirstInsertionPt. Inserting like this will reverse the clone order as
   // we'll repeatedly insert at the head, such as:
-  //   DPV-3 (third insertion goes here)
-  //   DPV-2 (second insertion goes here)
-  //   DPV-1 (first insertion goes here)
-  //   Any-Prior-DPVs
+  //   DVR-3 (third insertion goes here)
+  //   DVR-2 (second insertion goes here)
+  //   DVR-1 (first insertion goes here)
+  //   Any-Prior-DVRs
   //   InsertPtInst
   assert(InsertPos.getHeadBit());
-  for (DPValue *DPVClone : DPVClones) {
-    InsertPos->getParent()->insertDbgRecordBefore(DPVClone, InsertPos);
-    LLVM_DEBUG(dbgs() << "SINK: " << *DPVClone << '\n');
+  for (DbgVariableRecord *DVRClone : DVRClones) {
+    InsertPos->getParent()->insertDbgRecordBefore(DVRClone, InsertPos);
+    LLVM_DEBUG(dbgs() << "SINK: " << *DVRClone << '\n');
   }
 }
 
diff --git a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
index 87584dace32dd4..4bdeb6bbab85a9 100644
--- a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
@@ -357,7 +357,6 @@ class HWAddressSanitizer {
   bool instrumentStack(memtag::StackInfo &Info, Value *StackTag, Value *UARTag,
                        const DominatorTree &DT, const PostDominatorTree &PDT,
                        const LoopInfo &LI);
-  Value *readRegister(IRBuilder<> &IRB, StringRef Name);
   bool instrumentLandingPads(SmallVectorImpl<Instruction *> &RetVec);
   Value *getNextTagWithCall(IRBuilder<> &IRB);
   Value *getStackBaseTag(IRBuilder<> &IRB);
@@ -373,8 +372,7 @@ class HWAddressSanitizer {
   void instrumentGlobal(GlobalVariable *GV, uint8_t Tag);
   void instrumentGlobals();
 
-  Value *getPC(IRBuilder<> &IRB);
-  Value *getFP(IRBuilder<> &IRB);
+  Value *getCachedFP(IRBuilder<> &IRB);
   Value *getFrameRecordInfo(IRBuilder<> &IRB);
 
   void instrumentPersonalityFunctions();
@@ -410,8 +408,8 @@ class HWAddressSanitizer {
   ShadowMapping Mapping;
 
   Type *VoidTy = Type::getVoidTy(M.getContext());
-  Type *IntptrTy;
-  PointerType *PtrTy;
+  Type *IntptrTy = M.getDataLayout().getIntPtrType(M.getContext());
+  PointerType *PtrTy = PointerType::getUnqual(M.getContext());
   Type *Int8Ty = Type::getInt8Ty(M.getContext());
   Type *Int32Ty = Type::getInt32Ty(M.getContext());
   Type *Int64Ty = Type::getInt64Ty(M.getContext());
@@ -449,7 +447,7 @@ class HWAddressSanitizer {
 
   Value *ShadowBase = nullptr;
   Value *StackBaseTag = nullptr;
-  Value *CachedSP = nullptr;
+  Value *CachedFP = nullptr;
   GlobalValue *ThreadPtrGlobal = nullptr;
 };
 
@@ -594,8 +592,6 @@ void HWAddressSanitizer::createHwasanCtorComdat() {
 /// inserts a call to __hwasan_init to the module's constructor list.
 void HWAddressSanitizer::initializeModule() {
   LLVM_DEBUG(dbgs() << "Init " << M.getName() << "\n");
-  auto &DL = M.getDataLayout();
-
   TargetTriple = Triple(M.getTargetTriple());
 
   // x86_64 currently has two modes:
@@ -613,8 +609,6 @@ void HWAddressSanitizer::initializeModule() {
 
   C = &(M.getContext());
   IRBuilder<> IRB(*C);
-  IntptrTy = IRB.getIntPtrTy(DL);
-  PtrTy = IRB.getPtrTy();
 
   HwasanCtorFunction = nullptr;
 
@@ -1163,10 +1157,10 @@ Value *HWAddressSanitizer::getStackBaseTag(IRBuilder<> &IRB) {
   // Extract some entropy from the stack pointer for the tags.
   // Take bits 20..28 (ASLR entropy) and xor with bits 0..8 (these differ
   // between functions).
-  Value *StackPointerLong = getFP(IRB);
+  Value *FramePointerLong = getCachedFP(IRB);
   Value *StackTag =
-      applyTagMask(IRB, IRB.CreateXor(StackPointerLong,
-                                      IRB.CreateLShr(StackPointerLong, 20)));
+      applyTagMask(IRB, IRB.CreateXor(FramePointerLong,
+                                      IRB.CreateLShr(FramePointerLong, 20)));
   StackTag->setName("hwasan.stack.base.tag");
   return StackTag;
 }
@@ -1180,9 +1174,9 @@ Value *HWAddressSanitizer::getAllocaTag(IRBuilder<> &IRB, Value *StackTag,
 }
 
 Value *HWAddressSanitizer::getUARTag(IRBuilder<> &IRB) {
-  Value *StackPointerLong = getFP(IRB);
+  Value *FramePointerLong = getCachedFP(IRB);
   Value *UARTag =
-      applyTagMask(IRB, IRB.CreateLShr(StackPointerLong, PointerTagShift));
+      applyTagMask(IRB, IRB.CreateLShr(FramePointerLong, PointerTagShift));
 
   UARTag->setName("hwasan.uar.tag");
   return UARTag;
@@ -1241,41 +1235,25 @@ Value *HWAddressSanitizer::getHwasanThreadSlotPtr(IRBuilder<> &IRB, Type *Ty) {
   return nullptr;
 }
 
-Value *HWAddressSanitizer::getPC(IRBuilder<> &IRB) {
-  if (TargetTriple.getArch() == Triple::aarch64)
-    return readRegister(IRB, "pc");
-  return IRB.CreatePtrToInt(IRB.GetInsertBlock()->getParent(), IntptrTy);
-}
-
-Value *HWAddressSanitizer::getFP(IRBuilder<> &IRB) {
-  if (!CachedSP) {
-    // FIXME: use addressofreturnaddress (but implement it in aarch64 backend
-    // first).
-    Function *F = IRB.GetInsertBlock()->getParent();
-    Module *M = F->getParent();
-    auto *GetStackPointerFn = Intrinsic::getDeclaration(
-        M, Intrinsic::frameaddress,
-        IRB.getPtrTy(M->getDataLayout().getAllocaAddrSpace()));
-    CachedSP = IRB.CreatePtrToInt(
-        IRB.CreateCall(GetStackPointerFn, {Constant::getNullValue(Int32Ty)}),
-        IntptrTy);
-  }
-  return CachedSP;
+Value *HWAddressSanitizer::getCachedFP(IRBuilder<> &IRB) {
+  if (!CachedFP)
+    CachedFP = memtag::getFP(IRB);
+  return CachedFP;
 }
 
 Value *HWAddressSanitizer::getFrameRecordInfo(IRBuilder<> &IRB) {
   // Prepare ring buffer data.
-  Value *PC = getPC(IRB);
-  Value *SP = getFP(IRB);
+  Value *PC = memtag::getPC(TargetTriple, IRB);
+  Value *FP = getCachedFP(IRB);
 
-  // Mix SP and PC.
+  // Mix FP and PC.
   // Assumptions:
   // PC is 0x0000PPPPPPPPPPPP  (48 bits are meaningful, others are zero)
-  // SP is 0xsssssssssssSSSS0  (4 lower bits are zero)
-  // We only really need ~20 lower non-zero bits (SSSS), so we mix like this:
-  //       0xSSSSPPPPPPPPPPPP
-  SP = IRB.CreateShl(SP, 44);
-  return IRB.CreateOr(PC, SP);
+  // FP is 0xfffffffffffFFFF0  (4 lower bits are zero)
+  // We only really need ~20 lower non-zero bits (FFFF), so we mix like this:
+  //       0xFFFFPPPPPPPPPPPP
+  FP = IRB.CreateShl(FP, 44);
+  return IRB.CreateOr(PC, FP);
 }
 
 void HWAddressSanitizer::emitPrologue(IRBuilder<> &IRB, bool WithFrameRecord) {
@@ -1360,38 +1338,24 @@ void HWAddressSanitizer::emitPrologue(IRBuilder<> &IRB, bool WithFrameRecord) {
   }
 }
 
-Value *HWAddressSanitizer::readRegister(IRBuilder<> &IRB, StringRef Name) {
-  Module *M = IRB.GetInsertBlock()->getParent()->getParent();
-  Function *ReadRegister =
-      Intrinsic::getDeclaration(M, Intrinsic::read_register, IntptrTy);
-  MDNode *MD = MDNode::get(*C, {MDString::get(*C, Name)});
-  Value *Args[] = {MetadataAsValue::get(*C, MD)};
-  return IRB.CreateCall(ReadRegister, Args);
-}
-
 bool HWAddressSanitizer::instrumentLandingPads(
     SmallVectorImpl<Instruction *> &LandingPadVec) {
   for (auto *LP : LandingPadVec) {
     IRBuilder<> IRB(LP->getNextNonDebugInstruction());
     IRB.CreateCall(
         HwasanHandleVfork,
-        {readRegister(IRB, (TargetTriple.getArch() == Triple::x86_64) ? "rsp"
-                                                                      : "sp")});
+        {memtag::readRegister(
+            IRB, (TargetTriple.getArch() == Triple::x86_64) ? "rsp" : "sp")});
   }
   return true;
 }
 
-static bool isLifetimeIntrinsic(Value *V) {
-  auto *II = dyn_cast<IntrinsicInst>(V);
-  return II && II->isLifetimeStartOrEnd();
-}
-
 static DbgAssignIntrinsic *DynCastToDbgAssign(DbgVariableIntrinsic *DVI) {
   return dyn_cast<DbgAssignIntrinsic>(DVI);
 }
 
-static DPValue *DynCastToDbgAssign(DPValue *DPV) {
-  return DPV->isDbgAssign() ? DPV : nullptr;
+static DbgVariableRecord *DynCastToDbgAssign(DbgVariableRecord *DVR) {
+  return DVR->isDbgAssign() ? DVR : nullptr;
 }
 
 bool HWAddressSanitizer::instrumentStack(memtag::StackInfo &SInfo,
@@ -1445,11 +1409,13 @@ bool HWAddressSanitizer::instrumentStack(memtag::StackInfo &SInfo,
 
     AI->replaceUsesWithIf(Replacement, [AICast, AILong](const Use &U) {
       auto *User = U.getUser();
-      return User != AILong && User != AICast && !isLifetimeIntrinsic(User);
+      return User != AILong && User != AICast &&
+             !memtag::isLifetimeIntrinsic(User);
     });
 
     // Helper utility for adding DW_OP_LLVM_tag_offset to debug-info records,
-    // abstracted over whether they're intrinsic-stored or DPValue stored.
+    // abstracted over whether they're intrinsic-stored or DbgVariableRecord
+    // stored.
     auto AnnotateDbgRecord = [&](auto *DPtr) {
       // Prepend "tag_offset, N" to the dwarf expression.
       // Tag offset logically applies to the alloca pointer, and it makes sense
@@ -1644,7 +1610,7 @@ void HWAddressSanitizer::sanitizeFunction(Function &F,
 
   ShadowBase = nullptr;
   StackBaseTag = nullptr;
-  CachedSP = nullptr;
+  CachedFP = nullptr;
 }
 
 void HWAddressSanitizer::instrumentGlobal(GlobalVariable *GV, uint8_t Tag) {
diff --git a/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp
index 8ee0bca7e354f0..956ebe8fc8b9a3 100644
--- a/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp
@@ -43,7 +43,6 @@
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Instrumentation.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/EscapeEnumerator.h"
 #include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Transforms/Utils/ModuleUtils.h"
@@ -738,8 +737,8 @@ bool ThreadSanitizer::instrumentAtomic(Instruction *I, const DataLayout &DL) {
     Value *Args[] = {Addr,
                      IRB.CreateBitOrPointerCast(SI->getValueOperand(), Ty),
                      createOrdering(&IRB, SI->getOrdering())};
-    CallInst *C = CallInst::Create(TsanAtomicStore[Idx], Args);
-    ReplaceInstWithInst(I, C);
+    IRB.CreateCall(TsanAtomicStore[Idx], Args);
+    SI->eraseFromParent();
   } else if (AtomicRMWInst *RMWI = dyn_cast<AtomicRMWInst>(I)) {
     Value *Addr = RMWI->getPointerOperand();
     int Idx =
@@ -752,11 +751,12 @@ bool ThreadSanitizer::instrumentAtomic(Instruction *I, const DataLayout &DL) {
     const unsigned ByteSize = 1U << Idx;
     const unsigned BitSize = ByteSize * 8;
     Type *Ty = Type::getIntNTy(IRB.getContext(), BitSize);
-    Value *Args[] = {Addr,
-                     IRB.CreateIntCast(RMWI->getValOperand(), Ty, false),
+    Value *Val = RMWI->getValOperand();
+    Value *Args[] = {Addr, IRB.CreateBitOrPointerCast(Val, Ty),
                      createOrdering(&IRB, RMWI->getOrdering())};
-    CallInst *C = CallInst::Create(F, Args);
-    ReplaceInstWithInst(I, C);
+    Value *C = IRB.CreateCall(F, Args);
+    I->replaceAllUsesWith(IRB.CreateBitOrPointerCast(C, Val->getType()));
+    I->eraseFromParent();
   } else if (AtomicCmpXchgInst *CASI = dyn_cast<AtomicCmpXchgInst>(I)) {
     Value *Addr = CASI->getPointerOperand();
     Type *OrigOldValTy = CASI->getNewValOperand()->getType();
@@ -794,8 +794,8 @@ bool ThreadSanitizer::instrumentAtomic(Instruction *I, const DataLayout &DL) {
     FunctionCallee F = FI->getSyncScopeID() == SyncScope::SingleThread
                            ? TsanAtomicSignalFence
                            : TsanAtomicThreadFence;
-    CallInst *C = CallInst::Create(F, Args);
-    ReplaceInstWithInst(I, C);
+    IRB.CreateCall(F, Args);
+    FI->eraseFromParent();
   }
   return true;
 }
diff --git a/llvm/lib/Transforms/Scalar/ADCE.cpp b/llvm/lib/Transforms/Scalar/ADCE.cpp
index 4d901310efe520..96ecd7f368a000 100644
--- a/llvm/lib/Transforms/Scalar/ADCE.cpp
+++ b/llvm/lib/Transforms/Scalar/ADCE.cpp
@@ -544,15 +544,16 @@ ADCEChanged AggressiveDeadCodeElimination::removeDeadInstructions() {
   // value of the function, and may therefore be deleted safely.
   // NOTE: We reuse the Worklist vector here for memory efficiency.
   for (Instruction &I : llvm::reverse(instructions(F))) {
-    // With "RemoveDIs" debug-info stored in DPValue objects, debug-info
-    // attached to this instruction, and drop any for scopes that aren't alive,
-    // like the rest of this loop does. Extending support to assignment tracking
-    // is future work.
+    // With "RemoveDIs" debug-info stored in DbgVariableRecord objects,
+    // debug-info attached to this instruction, and drop any for scopes that
+    // aren't alive, like the rest of this loop does. Extending support to
+    // assignment tracking is future work.
     for (DbgRecord &DR : make_early_inc_range(I.getDbgRecordRange())) {
-      // Avoid removing a DPV that is linked to instructions because it holds
+      // Avoid removing a DVR that is linked to instructions because it holds
       // information about an existing store.
-      if (DPValue *DPV = dyn_cast<DPValue>(&DR); DPV && DPV->isDbgAssign())
-        if (!at::getAssignmentInsts(DPV).empty())
+      if (DbgVariableRecord *DVR = dyn_cast<DbgVariableRecord>(&DR);
+          DVR && DVR->isDbgAssign())
+        if (!at::getAssignmentInsts(DVR).empty())
           continue;
       if (AliveScopes.count(DR.getDebugLoc()->getScope()))
         continue;
diff --git a/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp b/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp
index 49f8761a139232..68774cf31ce9b3 100644
--- a/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp
+++ b/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp
@@ -162,27 +162,27 @@ bool ConstantHoistingLegacyPass::runOnFunction(Function &Fn) {
 
 void ConstantHoistingPass::collectMatInsertPts(
     const RebasedConstantListType &RebasedConstants,
-    SmallVectorImpl<Instruction *> &MatInsertPts) const {
+    SmallVectorImpl<BasicBlock::iterator> &MatInsertPts) const {
   for (const RebasedConstantInfo &RCI : RebasedConstants)
     for (const ConstantUser &U : RCI.Uses)
       MatInsertPts.emplace_back(findMatInsertPt(U.Inst, U.OpndIdx));
 }
 
 /// Find the constant materialization insertion point.
-Instruction *ConstantHoistingPass::findMatInsertPt(Instruction *Inst,
-                                                   unsigned Idx) const {
+BasicBlock::iterator ConstantHoistingPass::findMatInsertPt(Instruction *Inst,
+                                                           unsigned Idx) const {
   // If the operand is a cast instruction, then we have to materialize the
   // constant before the cast instruction.
   if (Idx != ~0U) {
     Value *Opnd = Inst->getOperand(Idx);
     if (auto CastInst = dyn_cast<Instruction>(Opnd))
       if (CastInst->isCast())
-        return CastInst;
+        return CastInst->getIterator();
   }
 
   // The simple and common case. This also includes constant expressions.
   if (!isa<PHINode>(Inst) && !Inst->isEHPad())
-    return Inst;
+    return Inst->getIterator();
 
   // We can't insert directly before a phi node or an eh pad. Insert before
   // the terminator of the incoming or dominating block.
@@ -191,7 +191,7 @@ Instruction *ConstantHoistingPass::findMatInsertPt(Instruction *Inst,
   if (Idx != ~0U && isa<PHINode>(Inst)) {
     InsertionBlock = cast<PHINode>(Inst)->getIncomingBlock(Idx);
     if (!InsertionBlock->isEHPad()) {
-      return InsertionBlock->getTerminator();
+      return InsertionBlock->getTerminator()->getIterator();
     }
   } else {
     InsertionBlock = Inst->getParent();
@@ -206,7 +206,7 @@ Instruction *ConstantHoistingPass::findMatInsertPt(Instruction *Inst,
     IDom = IDom->getIDom();
   }
 
-  return IDom->getBlock()->getTerminator();
+  return IDom->getBlock()->getTerminator()->getIterator();
 }
 
 /// Given \p BBs as input, find another set of BBs which collectively
@@ -314,26 +314,27 @@ static void findBestInsertionSet(DominatorTree &DT, BlockFrequencyInfo &BFI,
 }
 
 /// Find an insertion point that dominates all uses.
-SetVector<Instruction *> ConstantHoistingPass::findConstantInsertionPoint(
+SetVector<BasicBlock::iterator>
+ConstantHoistingPass::findConstantInsertionPoint(
     const ConstantInfo &ConstInfo,
-    const ArrayRef<Instruction *> MatInsertPts) const {
+    const ArrayRef<BasicBlock::iterator> MatInsertPts) const {
   assert(!ConstInfo.RebasedConstants.empty() && "Invalid constant info entry.");
   // Collect all basic blocks.
   SetVector<BasicBlock *> BBs;
-  SetVector<Instruction *> InsertPts;
+  SetVector<BasicBlock::iterator> InsertPts;
 
-  for (Instruction *MatInsertPt : MatInsertPts)
+  for (BasicBlock::iterator MatInsertPt : MatInsertPts)
     BBs.insert(MatInsertPt->getParent());
 
   if (BBs.count(Entry)) {
-    InsertPts.insert(&Entry->front());
+    InsertPts.insert(Entry->begin());
     return InsertPts;
   }
 
   if (BFI) {
     findBestInsertionSet(*DT, *BFI, Entry, BBs);
     for (BasicBlock *BB : BBs)
-      InsertPts.insert(&*BB->getFirstInsertionPt());
+      InsertPts.insert(BB->getFirstInsertionPt());
     return InsertPts;
   }
 
@@ -343,7 +344,7 @@ SetVector<Instruction *> ConstantHoistingPass::findConstantInsertionPoint(
     BB2 = BBs.pop_back_val();
     BB = DT->findNearestCommonDominator(BB1, BB2);
     if (BB == Entry) {
-      InsertPts.insert(&Entry->front());
+      InsertPts.insert(Entry->begin());
       return InsertPts;
     }
     BBs.insert(BB);
@@ -363,6 +364,9 @@ SetVector<Instruction *> ConstantHoistingPass::findConstantInsertionPoint(
 void ConstantHoistingPass::collectConstantCandidates(
     ConstCandMapType &ConstCandMap, Instruction *Inst, unsigned Idx,
     ConstantInt *ConstInt) {
+  if (ConstInt->getType()->isVectorTy())
+    return;
+
   InstructionCost Cost;
   // Ask the target about the cost of materializing the constant for the given
   // instruction and operand index.
@@ -761,11 +765,13 @@ void ConstantHoistingPass::emitBaseConstants(Instruction *Base,
       Mat = GetElementPtrInst::Create(Type::getInt8Ty(*Ctx), Base, Adj->Offset,
                                       "mat_gep", Adj->MatInsertPt);
       // Hide it behind a bitcast.
-      Mat = new BitCastInst(Mat, Adj->Ty, "mat_bitcast", Adj->MatInsertPt);
+      Mat = new BitCastInst(Mat, Adj->Ty, "mat_bitcast",
+                            Adj->MatInsertPt->getIterator());
     } else
       // Constant being rebased is a ConstantInt.
-      Mat = BinaryOperator::Create(Instruction::Add, Base, Adj->Offset,
-                                   "const_mat", Adj->MatInsertPt);
+      Mat =
+          BinaryOperator::Create(Instruction::Add, Base, Adj->Offset,
+                                 "const_mat", Adj->MatInsertPt->getIterator());
 
     LLVM_DEBUG(dbgs() << "Materialize constant (" << *Base->getOperand(0)
                       << " + " << *Adj->Offset << ") in BB "
@@ -816,7 +822,8 @@ void ConstantHoistingPass::emitBaseConstants(Instruction *Base,
 
     // Aside from constant GEPs, only constant cast expressions are collected.
     assert(ConstExpr->isCast() && "ConstExpr should be a cast");
-    Instruction *ConstExprInst = ConstExpr->getAsInstruction(Adj->MatInsertPt);
+    Instruction *ConstExprInst = ConstExpr->getAsInstruction();
+    ConstExprInst->insertBefore(Adj->MatInsertPt);
     ConstExprInst->setOperand(0, Mat);
 
     // Use the same debug location as the instruction we are about to update.
@@ -842,9 +849,9 @@ bool ConstantHoistingPass::emitBaseConstants(GlobalVariable *BaseGV) {
   SmallVectorImpl<consthoist::ConstantInfo> &ConstInfoVec =
       BaseGV ? ConstGEPInfoMap[BaseGV] : ConstIntInfoVec;
   for (const consthoist::ConstantInfo &ConstInfo : ConstInfoVec) {
-    SmallVector<Instruction *, 4> MatInsertPts;
+    SmallVector<BasicBlock::iterator, 4> MatInsertPts;
     collectMatInsertPts(ConstInfo.RebasedConstants, MatInsertPts);
-    SetVector<Instruction *> IPSet =
+    SetVector<BasicBlock::iterator> IPSet =
         findConstantInsertionPoint(ConstInfo, MatInsertPts);
     // We can have an empty set if the function contains unreachable blocks.
     if (IPSet.empty())
@@ -853,7 +860,7 @@ bool ConstantHoistingPass::emitBaseConstants(GlobalVariable *BaseGV) {
     unsigned UsesNum = 0;
     unsigned ReBasesNum = 0;
     unsigned NotRebasedNum = 0;
-    for (Instruction *IP : IPSet) {
+    for (const BasicBlock::iterator &IP : IPSet) {
       // First, collect constants depending on this IP of the base.
       UsesNum = 0;
       SmallVector<UserAdjustment, 4> ToBeRebased;
@@ -861,7 +868,7 @@ bool ConstantHoistingPass::emitBaseConstants(GlobalVariable *BaseGV) {
       for (auto const &RCI : ConstInfo.RebasedConstants) {
         UsesNum += RCI.Uses.size();
         for (auto const &U : RCI.Uses) {
-          Instruction *MatInsertPt = MatInsertPts[MatCtr++];
+          const BasicBlock::iterator &MatInsertPt = MatInsertPts[MatCtr++];
           BasicBlock *OrigMatInsertBB = MatInsertPt->getParent();
           // If Base constant is to be inserted in multiple places,
           // generate rebase for U using the Base dominating U.
diff --git a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp
index 85d4065286e41f..1caed93b1b668d 100644
--- a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp
+++ b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp
@@ -65,6 +65,7 @@
 #include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/CodeMetrics.h"
 #include "llvm/Analysis/DomTreeUpdater.h"
+#include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/IR/CFG.h"
@@ -95,6 +96,11 @@ static cl::opt<bool>
                     cl::desc("View the CFG before DFA Jump Threading"),
                     cl::Hidden, cl::init(false));
 
+static cl::opt<bool> EarlyExitHeuristic(
+    "dfa-early-exit-heuristic",
+    cl::desc("Exit early if an unpredictable value come from the same loop"),
+    cl::Hidden, cl::init(true));
+
 static cl::opt<unsigned> MaxPathLength(
     "dfa-max-path-length",
     cl::desc("Max number of blocks searched to find a threading path"),
@@ -131,9 +137,9 @@ void unfold(DomTreeUpdater *DTU, SelectInstToUnfold SIToUnfold,
 
 class DFAJumpThreading {
 public:
-  DFAJumpThreading(AssumptionCache *AC, DominatorTree *DT,
+  DFAJumpThreading(AssumptionCache *AC, DominatorTree *DT, LoopInfo *LI,
                    TargetTransformInfo *TTI, OptimizationRemarkEmitter *ORE)
-      : AC(AC), DT(DT), TTI(TTI), ORE(ORE) {}
+      : AC(AC), DT(DT), LI(LI), TTI(TTI), ORE(ORE) {}
 
   bool run(Function &F);
 
@@ -161,6 +167,7 @@ class DFAJumpThreading {
 
   AssumptionCache *AC;
   DominatorTree *DT;
+  LoopInfo *LI;
   TargetTransformInfo *TTI;
   OptimizationRemarkEmitter *ORE;
 };
@@ -378,7 +385,8 @@ inline raw_ostream &operator<<(raw_ostream &OS, const ThreadingPath &TPath) {
 #endif
 
 struct MainSwitch {
-  MainSwitch(SwitchInst *SI, OptimizationRemarkEmitter *ORE) {
+  MainSwitch(SwitchInst *SI, LoopInfo *LI, OptimizationRemarkEmitter *ORE)
+      : LI(LI) {
     if (isCandidate(SI)) {
       Instr = SI;
     } else {
@@ -402,7 +410,7 @@ struct MainSwitch {
   ///
   /// Also, collect select instructions to unfold.
   bool isCandidate(const SwitchInst *SI) {
-    std::deque<Value *> Q;
+    std::deque<std::pair<Value *, BasicBlock *>> Q;
     SmallSet<Value *, 16> SeenValues;
     SelectInsts.clear();
 
@@ -411,22 +419,29 @@ struct MainSwitch {
     if (!isa<PHINode>(SICond))
       return false;
 
-    addToQueue(SICond, Q, SeenValues);
+    // The switch must be in a loop.
+    const Loop *L = LI->getLoopFor(SI->getParent());
+    if (!L)
+      return false;
+
+    addToQueue(SICond, nullptr, Q, SeenValues);
 
     while (!Q.empty()) {
-      Value *Current = Q.front();
+      Value *Current = Q.front().first;
+      BasicBlock *CurrentIncomingBB = Q.front().second;
       Q.pop_front();
 
       if (auto *Phi = dyn_cast<PHINode>(Current)) {
-        for (Value *Incoming : Phi->incoming_values()) {
-          addToQueue(Incoming, Q, SeenValues);
+        for (BasicBlock *IncomingBB : Phi->blocks()) {
+          Value *Incoming = Phi->getIncomingValueForBlock(IncomingBB);
+          addToQueue(Incoming, IncomingBB, Q, SeenValues);
         }
         LLVM_DEBUG(dbgs() << "\tphi: " << *Phi << "\n");
       } else if (SelectInst *SelI = dyn_cast<SelectInst>(Current)) {
         if (!isValidSelectInst(SelI))
           return false;
-        addToQueue(SelI->getTrueValue(), Q, SeenValues);
-        addToQueue(SelI->getFalseValue(), Q, SeenValues);
+        addToQueue(SelI->getTrueValue(), CurrentIncomingBB, Q, SeenValues);
+        addToQueue(SelI->getFalseValue(), CurrentIncomingBB, Q, SeenValues);
         LLVM_DEBUG(dbgs() << "\tselect: " << *SelI << "\n");
         if (auto *SelIUse = dyn_cast<PHINode>(SelI->user_back()))
           SelectInsts.push_back(SelectInstToUnfold(SelI, SelIUse));
@@ -439,6 +454,18 @@ struct MainSwitch {
         // initial switch values that can be ignored (they will hit the
         // unthreaded switch) but this assumption will get checked later after
         // paths have been enumerated (in function getStateDefMap).
+
+        // If the unpredictable value comes from the same inner loop it is
+        // likely that it will also be on the enumerated paths, causing us to
+        // exit after we have enumerated all the paths. This heuristic save
+        // compile time because a search for all the paths can become expensive.
+        if (EarlyExitHeuristic &&
+            L->contains(LI->getLoopFor(CurrentIncomingBB))) {
+          LLVM_DEBUG(dbgs()
+                     << "\tExiting early due to unpredictability heuristic.\n");
+          return false;
+        }
+
         continue;
       }
     }
@@ -446,11 +473,12 @@ struct MainSwitch {
     return true;
   }
 
-  void addToQueue(Value *Val, std::deque<Value *> &Q,
+  void addToQueue(Value *Val, BasicBlock *BB,
+                  std::deque<std::pair<Value *, BasicBlock *>> &Q,
                   SmallSet<Value *, 16> &SeenValues) {
     if (SeenValues.contains(Val))
       return;
-    Q.push_back(Val);
+    Q.push_back({Val, BB});
     SeenValues.insert(Val);
   }
 
@@ -488,6 +516,7 @@ struct MainSwitch {
     return true;
   }
 
+  LoopInfo *LI;
   SwitchInst *Instr = nullptr;
   SmallVector<SelectInstToUnfold, 4> SelectInsts;
 };
@@ -1262,7 +1291,7 @@ bool DFAJumpThreading::run(Function &F) {
 
     LLVM_DEBUG(dbgs() << "\nCheck if SwitchInst in BB " << BB.getName()
                       << " is a candidate\n");
-    MainSwitch Switch(SI, ORE);
+    MainSwitch Switch(SI, LI, ORE);
 
     if (!Switch.getInstr())
       continue;
@@ -1315,10 +1344,11 @@ PreservedAnalyses DFAJumpThreadingPass::run(Function &F,
                                             FunctionAnalysisManager &AM) {
   AssumptionCache &AC = AM.getResult<AssumptionAnalysis>(F);
   DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
+  LoopInfo &LI = AM.getResult<LoopAnalysis>(F);
   TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F);
   OptimizationRemarkEmitter ORE(&F);
 
-  if (!DFAJumpThreading(&AC, &DT, &TTI, &ORE).run(F))
+  if (!DFAJumpThreading(&AC, &DT, &LI, &TTI, &ORE).run(F))
     return PreservedAnalyses::all();
 
   PreservedAnalyses PA;
diff --git a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
index 392e6ad5a66bb9..bfc8bd5970bf27 100644
--- a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
@@ -526,7 +526,8 @@ static void shortenAssignment(Instruction *Inst, Value *OriginalDest,
   // returned by getAssignmentMarkers so save a copy of the markers to iterate
   // over.
   auto LinkedRange = at::getAssignmentMarkers(Inst);
-  SmallVector<DPValue *> LinkedDPVAssigns = at::getDPVAssignmentMarkers(Inst);
+  SmallVector<DbgVariableRecord *> LinkedDVRAssigns =
+      at::getDVRAssignmentMarkers(Inst);
   SmallVector<DbgAssignIntrinsic *> Linked(LinkedRange.begin(),
                                            LinkedRange.end());
   auto InsertAssignForOverlap = [&](auto *Assign) {
@@ -554,7 +555,7 @@ static void shortenAssignment(Instruction *Inst, Value *OriginalDest,
     NewAssign->setKillAddress();
   };
   for_each(Linked, InsertAssignForOverlap);
-  for_each(LinkedDPVAssigns, InsertAssignForOverlap);
+  for_each(LinkedDVRAssigns, InsertAssignForOverlap);
 }
 
 static bool tryToShorten(Instruction *DeadI, int64_t &DeadStart,
diff --git a/llvm/lib/Transforms/Scalar/Float2Int.cpp b/llvm/lib/Transforms/Scalar/Float2Int.cpp
index 6ad4be169b589a..ccca8bcc1a56ac 100644
--- a/llvm/lib/Transforms/Scalar/Float2Int.cpp
+++ b/llvm/lib/Transforms/Scalar/Float2Int.cpp
@@ -311,7 +311,7 @@ void Float2IntPass::walkForwards() {
 }
 
 // If there is a valid transform to be done, do it.
-bool Float2IntPass::validateAndTransform(const DataLayout &DL) {
+bool Float2IntPass::validateAndTransform() {
   bool MadeChange = false;
 
   // Iterate over every disjoint partition of the def-use graph.
@@ -376,24 +376,16 @@ bool Float2IntPass::validateAndTransform(const DataLayout &DL) {
       LLVM_DEBUG(dbgs() << "F2I: Value not guaranteed to be representable!\n");
       continue;
     }
-
-    // OK, R is known to be representable.
-    // Pick the smallest legal type that will fit.
-    Type *Ty = DL.getSmallestLegalIntType(*Ctx, MinBW);
-    if (!Ty) {
-      // Every supported target supports 64-bit and 32-bit integers,
-      // so fallback to a 32 or 64-bit integer if the value fits.
-      if (MinBW <= 32) {
-        Ty = Type::getInt32Ty(*Ctx);
-      } else if (MinBW <= 64) {
-        Ty = Type::getInt64Ty(*Ctx);
-      } else {
-        LLVM_DEBUG(dbgs() << "F2I: Value requires more than bits to represent "
-                             "than the target supports!\n");
-        continue;
-      }
+    if (MinBW > 64) {
+      LLVM_DEBUG(
+          dbgs() << "F2I: Value requires more than 64 bits to represent!\n");
+      continue;
     }
 
+    // OK, R is known to be representable. Now pick a type for it.
+    // FIXME: Pick the smallest legal type that will fit.
+    Type *Ty = (MinBW > 32) ? Type::getInt64Ty(*Ctx) : Type::getInt32Ty(*Ctx);
+
     for (auto MI = ECs.member_begin(It), ME = ECs.member_end();
          MI != ME; ++MI)
       convert(*MI, Ty);
@@ -499,8 +491,7 @@ bool Float2IntPass::runImpl(Function &F, const DominatorTree &DT) {
   walkBackwards();
   walkForwards();
 
-  const DataLayout &DL = F.getParent()->getDataLayout();
-  bool Modified = validateAndTransform(DL);
+  bool Modified = validateAndTransform();
   if (Modified)
     cleanup();
   return Modified;
diff --git a/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/llvm/lib/Transforms/Scalar/JumpThreading.cpp
index 5d7b050ed5b2e3..fd68359525707f 100644
--- a/llvm/lib/Transforms/Scalar/JumpThreading.cpp
+++ b/llvm/lib/Transforms/Scalar/JumpThreading.cpp
@@ -401,8 +401,8 @@ static bool replaceFoldableUses(Instruction *Cond, Value *ToVal,
     Changed |= replaceNonLocalUsesWith(Cond, ToVal);
   for (Instruction &I : reverse(*KnownAtEndOfBB)) {
     // Replace any debug-info record users of Cond with ToVal.
-    for (DPValue &DPV : filterDbgVars(I.getDbgRecordRange()))
-      DPV.replaceVariableLocationOp(Cond, ToVal, true);
+    for (DbgVariableRecord &DVR : filterDbgVars(I.getDbgRecordRange()))
+      DVR.replaceVariableLocationOp(Cond, ToVal, true);
 
     // Reached the Cond whose uses we are trying to replace, so there are no
     // more uses.
@@ -1955,7 +1955,7 @@ void JumpThreadingPass::updateSSA(
   SSAUpdater SSAUpdate;
   SmallVector<Use *, 16> UsesToRename;
   SmallVector<DbgValueInst *, 4> DbgValues;
-  SmallVector<DPValue *, 4> DPValues;
+  SmallVector<DbgVariableRecord *, 4> DbgVariableRecords;
 
   for (Instruction &I : *BB) {
     // Scan all uses of this instruction to see if it is used outside of its
@@ -1972,16 +1972,16 @@ void JumpThreadingPass::updateSSA(
     }
 
     // Find debug values outside of the block
-    findDbgValues(DbgValues, &I, &DPValues);
+    findDbgValues(DbgValues, &I, &DbgVariableRecords);
     llvm::erase_if(DbgValues, [&](const DbgValueInst *DbgVal) {
       return DbgVal->getParent() == BB;
     });
-    llvm::erase_if(DPValues, [&](const DPValue *DPVal) {
-      return DPVal->getParent() == BB;
+    llvm::erase_if(DbgVariableRecords, [&](const DbgVariableRecord *DbgVarRec) {
+      return DbgVarRec->getParent() == BB;
     });
 
     // If there are no uses outside the block, we're done with this instruction.
-    if (UsesToRename.empty() && DbgValues.empty() && DPValues.empty())
+    if (UsesToRename.empty() && DbgValues.empty() && DbgVariableRecords.empty())
       continue;
     LLVM_DEBUG(dbgs() << "JT: Renaming non-local uses of: " << I << "\n");
 
@@ -1994,11 +1994,11 @@ void JumpThreadingPass::updateSSA(
 
     while (!UsesToRename.empty())
       SSAUpdate.RewriteUse(*UsesToRename.pop_back_val());
-    if (!DbgValues.empty() || !DPValues.empty()) {
+    if (!DbgValues.empty() || !DbgVariableRecords.empty()) {
       SSAUpdate.UpdateDebugValues(&I, DbgValues);
-      SSAUpdate.UpdateDebugValues(&I, DPValues);
+      SSAUpdate.UpdateDebugValues(&I, DbgVariableRecords);
       DbgValues.clear();
-      DPValues.clear();
+      DbgVariableRecords.clear();
     }
 
     LLVM_DEBUG(dbgs() << "\n");
@@ -2041,11 +2041,11 @@ JumpThreadingPass::cloneInstructions(BasicBlock::iterator BI,
     return true;
   };
 
-  // Duplicate implementation of the above dbg.value code, using DPValues
-  // instead.
-  auto RetargetDPValueIfPossible = [&](DPValue *DPV) {
+  // Duplicate implementation of the above dbg.value code, using
+  // DbgVariableRecords instead.
+  auto RetargetDbgVariableRecordIfPossible = [&](DbgVariableRecord *DVR) {
     SmallSet<std::pair<Value *, Value *>, 16> OperandsToRemap;
-    for (auto *Op : DPV->location_ops()) {
+    for (auto *Op : DVR->location_ops()) {
       Instruction *OpInst = dyn_cast<Instruction>(Op);
       if (!OpInst)
         continue;
@@ -2056,7 +2056,7 @@ JumpThreadingPass::cloneInstructions(BasicBlock::iterator BI,
     }
 
     for (auto &[OldOp, MappedOp] : OperandsToRemap)
-      DPV->replaceVariableLocationOp(OldOp, MappedOp);
+      DVR->replaceVariableLocationOp(OldOp, MappedOp);
   };
 
   BasicBlock *RangeBB = BI->getParent();
@@ -2080,9 +2080,9 @@ JumpThreadingPass::cloneInstructions(BasicBlock::iterator BI,
   cloneNoAliasScopes(NoAliasScopes, ClonedScopes, "thread", Context);
 
   auto CloneAndRemapDbgInfo = [&](Instruction *NewInst, Instruction *From) {
-    auto DPVRange = NewInst->cloneDebugInfoFrom(From);
-    for (DPValue &DPV : filterDbgVars(DPVRange))
-      RetargetDPValueIfPossible(&DPV);
+    auto DVRRange = NewInst->cloneDebugInfoFrom(From);
+    for (DbgVariableRecord &DVR : filterDbgVars(DVRRange))
+      RetargetDbgVariableRecordIfPossible(&DVR);
   };
 
   // Clone the non-phi instructions of the source basic block into NewBB,
@@ -2109,15 +2109,15 @@ JumpThreadingPass::cloneInstructions(BasicBlock::iterator BI,
       }
   }
 
-  // There may be DPValues on the terminator, clone directly from marker
-  // to marker as there isn't an instruction there.
+  // There may be DbgVariableRecords on the terminator, clone directly from
+  // marker to marker as there isn't an instruction there.
   if (BE != RangeBB->end() && BE->hasDbgRecords()) {
     // Dump them at the end.
     DPMarker *Marker = RangeBB->getMarker(BE);
     DPMarker *EndMarker = NewBB->createMarker(NewBB->end());
-    auto DPVRange = EndMarker->cloneDebugInfoFrom(Marker, std::nullopt);
-    for (DPValue &DPV : filterDbgVars(DPVRange))
-      RetargetDPValueIfPossible(&DPV);
+    auto DVRRange = EndMarker->cloneDebugInfoFrom(Marker, std::nullopt);
+    for (DbgVariableRecord &DVR : filterDbgVars(DVRRange))
+      RetargetDbgVariableRecordIfPossible(&DVR);
   }
 
   return ValueMapping;
diff --git a/llvm/lib/Transforms/Scalar/LICM.cpp b/llvm/lib/Transforms/Scalar/LICM.cpp
index 40bc16f9b57516..e50413de46b1b4 100644
--- a/llvm/lib/Transforms/Scalar/LICM.cpp
+++ b/llvm/lib/Transforms/Scalar/LICM.cpp
@@ -2701,6 +2701,7 @@ static bool hoistMulAddAssociation(Instruction &I, Loop &L,
 
   // First, we need to make sure we should do the transformation.
   SmallVector<Use *> Changes;
+  SmallVector<BinaryOperator *> Adds;
   SmallVector<BinaryOperator *> Worklist;
   if (BinaryOperator *VariantBinOp = dyn_cast<BinaryOperator>(VariantOp))
     Worklist.push_back(VariantBinOp);
@@ -2713,6 +2714,7 @@ static bool hoistMulAddAssociation(Instruction &I, Loop &L,
         isa<BinaryOperator>(BO->getOperand(1))) {
       Worklist.push_back(cast<BinaryOperator>(BO->getOperand(0)));
       Worklist.push_back(cast<BinaryOperator>(BO->getOperand(1)));
+      Adds.push_back(BO);
       continue;
     }
     if (!isReassociableOp(BO, Instruction::Mul, Instruction::FMul) ||
@@ -2735,6 +2737,12 @@ static bool hoistMulAddAssociation(Instruction &I, Loop &L,
   if (Changes.empty())
     return false;
 
+  // Drop the poison flags for any adds we looked through.
+  if (I.getType()->isIntOrIntVectorTy()) {
+    for (auto *Add : Adds)
+      Add->dropPoisonGeneratingFlags();
+  }
+
   // We know we should do it so let's do the transformation.
   auto *Preheader = L.getLoopPreheader();
   assert(Preheader && "Loop is not in simplify form?");
@@ -2743,9 +2751,11 @@ static bool hoistMulAddAssociation(Instruction &I, Loop &L,
     assert(L.isLoopInvariant(U->get()));
     Instruction *Ins = cast<Instruction>(U->getUser());
     Value *Mul;
-    if (I.getType()->isIntOrIntVectorTy())
+    if (I.getType()->isIntOrIntVectorTy()) {
       Mul = Builder.CreateMul(U->get(), Factor, "factor.op.mul");
-    else
+      // Drop the poison flags on the original multiply.
+      Ins->dropPoisonGeneratingFlags();
+    } else
       Mul = Builder.CreateFMulFMF(U->get(), Factor, Ins, "factor.op.fmul");
     U->set(Mul);
   }
diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index 7641ba2a219cc3..ec42e2d6e193a6 100644
--- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -6367,10 +6367,10 @@ struct DVIRecoveryRec {
   DVIRecoveryRec(DbgValueInst *DbgValue)
       : DbgRef(DbgValue), Expr(DbgValue->getExpression()),
         HadLocationArgList(false) {}
-  DVIRecoveryRec(DPValue *DPV)
-      : DbgRef(DPV), Expr(DPV->getExpression()), HadLocationArgList(false) {}
+  DVIRecoveryRec(DbgVariableRecord *DVR)
+      : DbgRef(DVR), Expr(DVR->getExpression()), HadLocationArgList(false) {}
 
-  PointerUnion<DbgValueInst *, DPValue *> DbgRef;
+  PointerUnion<DbgValueInst *, DbgVariableRecord *> DbgRef;
   DIExpression *Expr;
   bool HadLocationArgList;
   SmallVector<WeakVH, 2> LocationOps;
@@ -6466,7 +6466,7 @@ static void UpdateDbgValueInst(DVIRecoveryRec &DVIRec,
   if (isa<DbgValueInst *>(DVIRec.DbgRef))
     UpdateDbgValueInstImpl(cast<DbgValueInst *>(DVIRec.DbgRef));
   else
-    UpdateDbgValueInstImpl(cast<DPValue *>(DVIRec.DbgRef));
+    UpdateDbgValueInstImpl(cast<DbgVariableRecord *>(DVIRec.DbgRef));
 }
 
 /// Cached location ops may be erased during LSR, in which case a poison is
@@ -6512,7 +6512,7 @@ static void restorePreTransformState(DVIRecoveryRec &DVIRec) {
   if (isa<DbgValueInst *>(DVIRec.DbgRef))
     RestorePreTransformStateImpl(cast<DbgValueInst *>(DVIRec.DbgRef));
   else
-    RestorePreTransformStateImpl(cast<DPValue *>(DVIRec.DbgRef));
+    RestorePreTransformStateImpl(cast<DbgVariableRecord *>(DVIRec.DbgRef));
 }
 
 static bool SalvageDVI(llvm::Loop *L, ScalarEvolution &SE,
@@ -6522,7 +6522,7 @@ static bool SalvageDVI(llvm::Loop *L, ScalarEvolution &SE,
 
   if (isa<DbgValueInst *>(DVIRec.DbgRef)
           ? !cast<DbgValueInst *>(DVIRec.DbgRef)->isKillLocation()
-          : !cast<DPValue *>(DVIRec.DbgRef)->isKillLocation())
+          : !cast<DbgVariableRecord *>(DVIRec.DbgRef)->isKillLocation())
     return false;
 
   // LSR may have caused several changes to the dbg.value in the failed salvage
@@ -6620,7 +6620,7 @@ static bool SalvageDVI(llvm::Loop *L, ScalarEvolution &SE,
                       << *cast<DbgValueInst *>(DVIRec.DbgRef) << "\n");
   else
     LLVM_DEBUG(dbgs() << "scev-salvage: Updated DVI: "
-                      << *cast<DPValue *>(DVIRec.DbgRef) << "\n");
+                      << *cast<DbgVariableRecord *>(DVIRec.DbgRef) << "\n");
   return true;
 }
 
@@ -6711,9 +6711,9 @@ static void DbgGatherSalvagableDVI(
         SalvageableDVISCEVs.push_back(std::move(NewRec));
         return true;
       };
-      for (DPValue &DPV : filterDbgVars(I.getDbgRecordRange())) {
-        if (DPV.isDbgValue() || DPV.isDbgAssign())
-          ProcessDbgValue(&DPV);
+      for (DbgVariableRecord &DVR : filterDbgVars(I.getDbgRecordRange())) {
+        if (DVR.isDbgValue() || DVR.isDbgAssign())
+          ProcessDbgValue(&DVR);
       }
       auto DVI = dyn_cast<DbgValueInst>(&I);
       if (!DVI)
diff --git a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
index 67c011b747acfd..e991296bd2fb07 100644
--- a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
+++ b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
@@ -19,6 +19,7 @@
 
 #include "llvm/Transforms/Scalar/LowerMatrixIntrinsics.h"
 #include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/ScopeExit.h"
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/Analysis/AliasAnalysis.h"
@@ -990,12 +991,15 @@ class LowerMatrixIntrinsics {
     bool Changed = false;
     SmallVector<CallInst *, 16> MaybeFusableInsts;
     SmallVector<Instruction *, 16> MatrixInsts;
+    SmallVector<IntrinsicInst *, 16> LifetimeEnds;
 
     // First, collect all instructions with shape information and candidates for
     // fusion (currently only matrix multiplies).
     ReversePostOrderTraversal<Function *> RPOT(&Func);
     for (auto *BB : RPOT)
       for (Instruction &I : *BB) {
+        if (match(&I, m_Intrinsic<Intrinsic::lifetime_end>()))
+          LifetimeEnds.push_back(cast<IntrinsicInst>(&I));
         if (ShapeMap.find(&I) == ShapeMap.end())
           continue;
         if (match(&I, m_Intrinsic<Intrinsic::matrix_multiply>()))
@@ -1010,7 +1014,7 @@ class LowerMatrixIntrinsics {
 
     // Third, try to fuse candidates.
     for (CallInst *CI : MaybeFusableInsts)
-      LowerMatrixMultiplyFused(CI, FusedInsts);
+      LowerMatrixMultiplyFused(CI, FusedInsts, LifetimeEnds);
 
     Changed = !FusedInsts.empty();
 
@@ -1856,8 +1860,10 @@ class LowerMatrixIntrinsics {
   ///
   /// Call finalizeLowering on lowered instructions.  Instructions that are
   /// completely eliminated by fusion are added to \p FusedInsts.
-  void LowerMatrixMultiplyFused(CallInst *MatMul,
-                                SmallPtrSetImpl<Instruction *> &FusedInsts) {
+  void
+  LowerMatrixMultiplyFused(CallInst *MatMul,
+                           SmallPtrSetImpl<Instruction *> &FusedInsts,
+                           SmallVector<IntrinsicInst *, 16> &LifetimeEnds) {
     if (!FuseMatrix || !DT)
       return;
 
@@ -1946,6 +1952,55 @@ class LowerMatrixIntrinsics {
       for (Instruction *I : ToHoist)
         I->moveBefore(MatMul);
 
+      // Deal with lifetime.end calls that might be between Load0/Load1 and the
+      // store. To avoid introducing loads to dead objects (i.e. after the
+      // lifetime has been termined by @llvm.lifetime.end), either sink them
+      // after the store if in the same block, or remove the lifetime.end marker
+      // otherwise. This might pessimize further optimizations, by extending the
+      // lifetime of the object until the function returns, but should be
+      // conservatively correct.
+      MemoryLocation Load0Loc = MemoryLocation::get(LoadOp0);
+      MemoryLocation Load1Loc = MemoryLocation::get(LoadOp1);
+      BasicBlock *StoreParent = Store->getParent();
+      bool FusableOpsInSameBlock = LoadOp0->getParent() == StoreParent &&
+                                   LoadOp1->getParent() == StoreParent;
+      for (unsigned Idx = 0; Idx != LifetimeEnds.size();) {
+        IntrinsicInst *End = LifetimeEnds[Idx];
+        auto Inc = make_scope_exit([&Idx]() { Idx++; });
+        // If the lifetime.end is guaranteed to be before the loads or after the
+        // store, it won't interfere with fusion.
+        if (DT->dominates(End, LoadOp0) && DT->dominates(End, LoadOp1))
+          continue;
+        if (DT->dominates(Store, End))
+          continue;
+        // If all fusable ops are in the same block and the lifetime.end is in a
+        // different block, it won't interfere with fusion.
+        if (FusableOpsInSameBlock && End->getParent() != StoreParent)
+          continue;
+
+        // If the loads don't alias the lifetime.end, it won't interfere with
+        // fusion.
+        MemoryLocation EndLoc = MemoryLocation::getForArgument(End, 1, nullptr);
+        if (!EndLoc.Ptr)
+          continue;
+        if (AA->isNoAlias(Load0Loc, EndLoc) && AA->isNoAlias(Load1Loc, EndLoc))
+          continue;
+
+        // If both lifetime.end and the store are in the same block, extend the
+        // lifetime until after the store, so the new lifetime covers the loads
+        // we introduce later.
+        if (End->getParent() == StoreParent) {
+          End->moveAfter(Store);
+          continue;
+        }
+
+        // Otherwise remove the conflicting lifetime.end marker.
+        ToRemove.push_back(End);
+        std::swap(LifetimeEnds[Idx], LifetimeEnds.back());
+        LifetimeEnds.pop_back();
+        Inc.release();
+      }
+
       emitSIMDTiling(MatMul, LoadOp0, LoadOp1, Store, FusedInsts);
       return;
     }
diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp
index e238f311a15bc2..096c6d1b1fad27 100644
--- a/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -319,17 +319,17 @@ static DebugVariable getAggregateVariable(DbgVariableIntrinsic *DVI) {
   return DebugVariable(DVI->getVariable(), std::nullopt,
                        DVI->getDebugLoc().getInlinedAt());
 }
-static DebugVariable getAggregateVariable(DPValue *DPV) {
-  return DebugVariable(DPV->getVariable(), std::nullopt,
-                       DPV->getDebugLoc().getInlinedAt());
+static DebugVariable getAggregateVariable(DbgVariableRecord *DVR) {
+  return DebugVariable(DVR->getVariable(), std::nullopt,
+                       DVR->getDebugLoc().getInlinedAt());
 }
 
 /// Helpers for handling new and old debug info modes in migrateDebugInfo.
 /// These overloads unwrap a DbgInstPtr {Instruction* | DbgRecord*} union based
 /// on the \p Unused parameter type.
-DPValue *UnwrapDbgInstPtr(DbgInstPtr P, DPValue *Unused) {
+DbgVariableRecord *UnwrapDbgInstPtr(DbgInstPtr P, DbgVariableRecord *Unused) {
   (void)Unused;
-  return static_cast<DPValue *>(cast<DbgRecord *>(P));
+  return static_cast<DbgVariableRecord *>(cast<DbgRecord *>(P));
 }
 DbgAssignIntrinsic *UnwrapDbgInstPtr(DbgInstPtr P, DbgAssignIntrinsic *Unused) {
   (void)Unused;
@@ -356,9 +356,9 @@ static void migrateDebugInfo(AllocaInst *OldAlloca, bool IsSplit,
                              Instruction *Inst, Value *Dest, Value *Value,
                              const DataLayout &DL) {
   auto MarkerRange = at::getAssignmentMarkers(OldInst);
-  auto DPVAssignMarkerRange = at::getDPVAssignmentMarkers(OldInst);
+  auto DVRAssignMarkerRange = at::getDVRAssignmentMarkers(OldInst);
   // Nothing to do if OldInst has no linked dbg.assign intrinsics.
-  if (MarkerRange.empty() && DPVAssignMarkerRange.empty())
+  if (MarkerRange.empty() && DVRAssignMarkerRange.empty())
     return;
 
   LLVM_DEBUG(dbgs() << "  migrateDebugInfo\n");
@@ -379,9 +379,9 @@ static void migrateDebugInfo(AllocaInst *OldAlloca, bool IsSplit,
   for (auto *DAI : at::getAssignmentMarkers(OldAlloca))
     BaseFragments[getAggregateVariable(DAI)] =
         DAI->getExpression()->getFragmentInfo();
-  for (auto *DPV : at::getDPVAssignmentMarkers(OldAlloca))
-    BaseFragments[getAggregateVariable(DPV)] =
-        DPV->getExpression()->getFragmentInfo();
+  for (auto *DVR : at::getDVRAssignmentMarkers(OldAlloca))
+    BaseFragments[getAggregateVariable(DVR)] =
+        DVR->getExpression()->getFragmentInfo();
 
   // The new inst needs a DIAssignID unique metadata tag (if OldInst has
   // one). It shouldn't already have one: assert this assumption.
@@ -488,7 +488,7 @@ static void migrateDebugInfo(AllocaInst *OldAlloca, bool IsSplit,
   };
 
   for_each(MarkerRange, MigrateDbgAssign);
-  for_each(DPVAssignMarkerRange, MigrateDbgAssign);
+  for_each(DVRAssignMarkerRange, MigrateDbgAssign);
 }
 
 namespace {
@@ -3195,7 +3195,7 @@ class AllocaSliceRewriter : public InstVisitor<AllocaSliceRewriter, bool> {
       // emit dbg.assign intrinsics for mem intrinsics storing through non-
       // constant geps, or storing a variable number of bytes.
       assert(at::getAssignmentMarkers(&II).empty() &&
-             at::getDPVAssignmentMarkers(&II).empty() &&
+             at::getDVRAssignmentMarkers(&II).empty() &&
              "AT: Unexpected link to non-const GEP");
       deleteIfTriviallyDead(OldPtr);
       return false;
@@ -3350,7 +3350,7 @@ class AllocaSliceRewriter : public InstVisitor<AllocaSliceRewriter, bool> {
             DbgAssign->replaceVariableLocationOp(II.getDest(), AdjustedPtr);
         };
         for_each(at::getAssignmentMarkers(&II), UpdateAssignAddress);
-        for_each(at::getDPVAssignmentMarkers(&II), UpdateAssignAddress);
+        for_each(at::getDVRAssignmentMarkers(&II), UpdateAssignAddress);
         II.setDest(AdjustedPtr);
         II.setDestAlignment(SliceAlign);
       } else {
@@ -3939,7 +3939,7 @@ class AggLoadStoreRewriter : public InstVisitor<AggLoadStoreRewriter, bool> {
                          DL);
       } else {
         assert(at::getAssignmentMarkers(Store).empty() &&
-               at::getDPVAssignmentMarkers(Store).empty() &&
+               at::getDVRAssignmentMarkers(Store).empty() &&
                "AT: unexpected debug.assign linked to store through "
                "unbounded GEP");
       }
@@ -5034,14 +5034,14 @@ static void insertNewDbgInst(DIBuilder &DIB, DbgAssignIntrinsic *Orig,
   LLVM_DEBUG(dbgs() << "Created new assign intrinsic: " << *NewAssign << "\n");
   (void)NewAssign;
 }
-static void insertNewDbgInst(DIBuilder &DIB, DPValue *Orig, AllocaInst *NewAddr,
-                             DIExpression *NewFragmentExpr,
+static void insertNewDbgInst(DIBuilder &DIB, DbgVariableRecord *Orig,
+                             AllocaInst *NewAddr, DIExpression *NewFragmentExpr,
                              Instruction *BeforeInst) {
   (void)DIB;
   if (Orig->isDbgDeclare()) {
-    DPValue *DPV = DPValue::createDPVDeclare(
+    DbgVariableRecord *DVR = DbgVariableRecord::createDVRDeclare(
         NewAddr, Orig->getVariable(), NewFragmentExpr, Orig->getDebugLoc());
-    BeforeInst->getParent()->insertDbgRecordBefore(DPV,
+    BeforeInst->getParent()->insertDbgRecordBefore(DVR,
                                                    BeforeInst->getIterator());
     return;
   }
@@ -5049,10 +5049,10 @@ static void insertNewDbgInst(DIBuilder &DIB, DPValue *Orig, AllocaInst *NewAddr,
     NewAddr->setMetadata(LLVMContext::MD_DIAssignID,
                          DIAssignID::getDistinct(NewAddr->getContext()));
   }
-  DPValue *NewAssign = DPValue::createLinkedDPVAssign(
+  DbgVariableRecord *NewAssign = DbgVariableRecord::createLinkedDVRAssign(
       NewAddr, Orig->getValue(), Orig->getVariable(), NewFragmentExpr, NewAddr,
       Orig->getAddressExpression(), Orig->getDebugLoc());
-  LLVM_DEBUG(dbgs() << "Created new DPVAssign: " << *NewAssign << "\n");
+  LLVM_DEBUG(dbgs() << "Created new DVRAssign: " << *NewAssign << "\n");
   (void)NewAssign;
 }
 
@@ -5220,7 +5220,7 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) {
           OldDII->eraseFromParent();
       };
       for_each(findDbgDeclares(Fragment.Alloca), RemoveOne);
-      for_each(findDPVDeclares(Fragment.Alloca), RemoveOne);
+      for_each(findDVRDeclares(Fragment.Alloca), RemoveOne);
 
       insertNewDbgInst(DIB, DbgVariable, Fragment.Alloca, FragmentExpr, &AI);
     }
@@ -5229,9 +5229,9 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) {
   // Migrate debug information from the old alloca to the new alloca(s)
   // and the individual partitions.
   for_each(findDbgDeclares(&AI), MigrateOne);
-  for_each(findDPVDeclares(&AI), MigrateOne);
+  for_each(findDVRDeclares(&AI), MigrateOne);
   for_each(at::getAssignmentMarkers(&AI), MigrateOne);
-  for_each(at::getDPVAssignmentMarkers(&AI), MigrateOne);
+  for_each(at::getDVRAssignmentMarkers(&AI), MigrateOne);
 
   return Changed;
 }
@@ -5355,7 +5355,7 @@ bool SROA::deleteDeadInstructions(
       DeletedAllocas.insert(AI);
       for (DbgDeclareInst *OldDII : findDbgDeclares(AI))
         OldDII->eraseFromParent();
-      for (DPValue *OldDII : findDPVDeclares(AI))
+      for (DbgVariableRecord *OldDII : findDVRDeclares(AI))
         OldDII->eraseFromParent();
     }
 
diff --git a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
index 9fcaf2a89e5639..64b850a6a5c2ff 100644
--- a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
+++ b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
@@ -1260,8 +1260,9 @@ static BasicBlock *buildClonedLoopBlocks(
   Module *M = ClonedPH->getParent()->getParent();
   for (auto *ClonedBB : NewBlocks)
     for (Instruction &I : *ClonedBB) {
-      RemapDPValueRange(M, I.getDbgRecordRange(), VMap,
-                        RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
+      RemapDbgVariableRecordRange(M, I.getDbgRecordRange(), VMap,
+                                  RF_NoModuleLevelChanges |
+                                      RF_IgnoreMissingLocals);
       RemapInstruction(&I, VMap,
                        RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
       if (auto *II = dyn_cast<AssumeInst>(&I))
diff --git a/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp b/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp
index 44ea6e9ffefa2b..400b56894174db 100644
--- a/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp
+++ b/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp
@@ -263,7 +263,8 @@ static InstructionCost ComputeSpeculationCost(const Instruction *I,
 bool SpeculativeExecutionPass::considerHoistingFromTo(
     BasicBlock &FromBlock, BasicBlock &ToBlock) {
   SmallPtrSet<const Instruction *, 8> NotHoisted;
-  SmallDenseMap<const Instruction *, SmallVector<DPValue *>> DPValuesToHoist;
+  SmallDenseMap<const Instruction *, SmallVector<DbgVariableRecord *>>
+      DbgVariableRecordsToHoist;
   auto HasNoUnhoistedInstr = [&NotHoisted](auto Values) {
     for (const Value *V : Values) {
       if (const auto *I = dyn_cast_or_null<Instruction>(V))
@@ -291,11 +292,11 @@ bool SpeculativeExecutionPass::considerHoistingFromTo(
   InstructionCost TotalSpeculationCost = 0;
   unsigned NotHoistedInstCount = 0;
   for (const auto &I : FromBlock) {
-    // Make note of any DPValues that need hoisting. DPLabels
+    // Make note of any DbgVariableRecords that need hoisting. DPLabels
     // get left behind just like llvm.dbg.labels.
-    for (DPValue &DPV : filterDbgVars(I.getDbgRecordRange())) {
-      if (HasNoUnhoistedInstr(DPV.location_ops()))
-        DPValuesToHoist[DPV.getInstruction()].push_back(&DPV);
+    for (DbgVariableRecord &DVR : filterDbgVars(I.getDbgRecordRange())) {
+      if (HasNoUnhoistedInstr(DVR.location_ops()))
+        DbgVariableRecordsToHoist[DVR.getInstruction()].push_back(&DVR);
     }
     const InstructionCost Cost = ComputeSpeculationCost(&I, *TTI);
     if (Cost.isValid() && isSafeToSpeculativelyExecute(&I) &&
@@ -314,13 +315,13 @@ bool SpeculativeExecutionPass::considerHoistingFromTo(
   }
 
   for (auto I = FromBlock.begin(); I != FromBlock.end();) {
-    // If any DPValues attached to this instruction should be hoisted, hoist
-    // them now - they will end up attached to either the next hoisted
+    // If any DbgVariableRecords attached to this instruction should be hoisted,
+    // hoist them now - they will end up attached to either the next hoisted
     // instruction or the ToBlock terminator.
-    if (DPValuesToHoist.contains(&*I)) {
-      for (auto *DPV : DPValuesToHoist[&*I]) {
-        DPV->removeFromParent();
-        ToBlock.insertDbgRecordBefore(DPV,
+    if (DbgVariableRecordsToHoist.contains(&*I)) {
+      for (auto *DVR : DbgVariableRecordsToHoist[&*I]) {
+        DVR->removeFromParent();
+        ToBlock.insertDbgRecordBefore(DVR,
                                       ToBlock.getTerminator()->getIterator());
       }
     }
diff --git a/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp b/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp
index e7ad18df106d8d..bf1de05a647dd2 100644
--- a/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp
+++ b/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp
@@ -382,8 +382,9 @@ bool llvm::MergeBlockSuccessorsIntoGivenBlocks(
 /// - Check fully overlapping fragments and not only identical fragments.
 /// - Support dbg.declare. dbg.label, and possibly other meta instructions being
 ///   part of the sequence of consecutive instructions.
-static bool DPValuesRemoveRedundantDbgInstrsUsingBackwardScan(BasicBlock *BB) {
-  SmallVector<DPValue *, 8> ToBeRemoved;
+static bool
+DbgVariableRecordsRemoveRedundantDbgInstrsUsingBackwardScan(BasicBlock *BB) {
+  SmallVector<DbgVariableRecord *, 8> ToBeRemoved;
   SmallDenseSet<DebugVariable> VariableSet;
   for (auto &I : reverse(*BB)) {
     for (DbgRecord &DR : reverse(I.getDbgRecordRange())) {
@@ -394,10 +395,10 @@ static bool DPValuesRemoveRedundantDbgInstrsUsingBackwardScan(BasicBlock *BB) {
         continue;
       }
 
-      DPValue &DPV = cast<DPValue>(DR);
+      DbgVariableRecord &DVR = cast<DbgVariableRecord>(DR);
       // Skip declare-type records, as the debug intrinsic method only works
       // on dbg.value intrinsics.
-      if (DPV.getType() == DPValue::LocationType::Declare) {
+      if (DVR.getType() == DbgVariableRecord::LocationType::Declare) {
         // The debug intrinsic method treats dbg.declares are "non-debug"
         // instructions (i.e., a break in a consecutive range of debug
         // intrinsics). Emulate that to create identical outputs. See
@@ -407,8 +408,8 @@ static bool DPValuesRemoveRedundantDbgInstrsUsingBackwardScan(BasicBlock *BB) {
         continue;
       }
 
-      DebugVariable Key(DPV.getVariable(), DPV.getExpression(),
-                        DPV.getDebugLoc()->getInlinedAt());
+      DebugVariable Key(DVR.getVariable(), DVR.getExpression(),
+                        DVR.getDebugLoc()->getInlinedAt());
       auto R = VariableSet.insert(Key);
       // If the same variable fragment is described more than once it is enough
       // to keep the last one (i.e. the first found since we for reverse
@@ -416,14 +417,14 @@ static bool DPValuesRemoveRedundantDbgInstrsUsingBackwardScan(BasicBlock *BB) {
       if (R.second)
         continue;
 
-      if (DPV.isDbgAssign()) {
+      if (DVR.isDbgAssign()) {
         // Don't delete dbg.assign intrinsics that are linked to instructions.
-        if (!at::getAssignmentInsts(&DPV).empty())
+        if (!at::getAssignmentInsts(&DVR).empty())
           continue;
         // Unlinked dbg.assign intrinsics can be treated like dbg.values.
       }
 
-      ToBeRemoved.push_back(&DPV);
+      ToBeRemoved.push_back(&DVR);
       continue;
     }
     // Sequence with consecutive dbg.value instrs ended. Clear the map to
@@ -432,15 +433,15 @@ static bool DPValuesRemoveRedundantDbgInstrsUsingBackwardScan(BasicBlock *BB) {
     VariableSet.clear();
   }
 
-  for (auto &DPV : ToBeRemoved)
-    DPV->eraseFromParent();
+  for (auto &DVR : ToBeRemoved)
+    DVR->eraseFromParent();
 
   return !ToBeRemoved.empty();
 }
 
 static bool removeRedundantDbgInstrsUsingBackwardScan(BasicBlock *BB) {
   if (BB->IsNewDbgInfoFormat)
-    return DPValuesRemoveRedundantDbgInstrsUsingBackwardScan(BB);
+    return DbgVariableRecordsRemoveRedundantDbgInstrsUsingBackwardScan(BB);
 
   SmallVector<DbgValueInst *, 8> ToBeRemoved;
   SmallDenseSet<DebugVariable> VariableSet;
@@ -499,29 +500,30 @@ static bool removeRedundantDbgInstrsUsingBackwardScan(BasicBlock *BB) {
 ///
 /// Possible improvements:
 /// - Keep track of non-overlapping fragments.
-static bool DPValuesRemoveRedundantDbgInstrsUsingForwardScan(BasicBlock *BB) {
-  SmallVector<DPValue *, 8> ToBeRemoved;
+static bool
+DbgVariableRecordsRemoveRedundantDbgInstrsUsingForwardScan(BasicBlock *BB) {
+  SmallVector<DbgVariableRecord *, 8> ToBeRemoved;
   DenseMap<DebugVariable, std::pair<SmallVector<Value *, 4>, DIExpression *>>
       VariableMap;
   for (auto &I : *BB) {
-    for (DPValue &DPV : filterDbgVars(I.getDbgRecordRange())) {
-      if (DPV.getType() == DPValue::LocationType::Declare)
+    for (DbgVariableRecord &DVR : filterDbgVars(I.getDbgRecordRange())) {
+      if (DVR.getType() == DbgVariableRecord::LocationType::Declare)
         continue;
-      DebugVariable Key(DPV.getVariable(), std::nullopt,
-                        DPV.getDebugLoc()->getInlinedAt());
+      DebugVariable Key(DVR.getVariable(), std::nullopt,
+                        DVR.getDebugLoc()->getInlinedAt());
       auto VMI = VariableMap.find(Key);
       // A dbg.assign with no linked instructions can be treated like a
       // dbg.value (i.e. can be deleted).
       bool IsDbgValueKind =
-          (!DPV.isDbgAssign() || at::getAssignmentInsts(&DPV).empty());
+          (!DVR.isDbgAssign() || at::getAssignmentInsts(&DVR).empty());
 
       // Update the map if we found a new value/expression describing the
       // variable, or if the variable wasn't mapped already.
-      SmallVector<Value *, 4> Values(DPV.location_ops());
+      SmallVector<Value *, 4> Values(DVR.location_ops());
       if (VMI == VariableMap.end() || VMI->second.first != Values ||
-          VMI->second.second != DPV.getExpression()) {
+          VMI->second.second != DVR.getExpression()) {
         if (IsDbgValueKind)
-          VariableMap[Key] = {Values, DPV.getExpression()};
+          VariableMap[Key] = {Values, DVR.getExpression()};
         else
           VariableMap[Key] = {Values, nullptr};
         continue;
@@ -530,55 +532,56 @@ static bool DPValuesRemoveRedundantDbgInstrsUsingForwardScan(BasicBlock *BB) {
       if (!IsDbgValueKind)
         continue;
       // Found an identical mapping. Remember the instruction for later removal.
-      ToBeRemoved.push_back(&DPV);
+      ToBeRemoved.push_back(&DVR);
     }
   }
 
-  for (auto *DPV : ToBeRemoved)
-    DPV->eraseFromParent();
+  for (auto *DVR : ToBeRemoved)
+    DVR->eraseFromParent();
 
   return !ToBeRemoved.empty();
 }
 
-static bool DPValuesRemoveUndefDbgAssignsFromEntryBlock(BasicBlock *BB) {
+static bool
+DbgVariableRecordsRemoveUndefDbgAssignsFromEntryBlock(BasicBlock *BB) {
   assert(BB->isEntryBlock() && "expected entry block");
-  SmallVector<DPValue *, 8> ToBeRemoved;
+  SmallVector<DbgVariableRecord *, 8> ToBeRemoved;
   DenseSet<DebugVariable> SeenDefForAggregate;
   // Returns the DebugVariable for DVI with no fragment info.
-  auto GetAggregateVariable = [](const DPValue &DPV) {
-    return DebugVariable(DPV.getVariable(), std::nullopt,
-                         DPV.getDebugLoc().getInlinedAt());
+  auto GetAggregateVariable = [](const DbgVariableRecord &DVR) {
+    return DebugVariable(DVR.getVariable(), std::nullopt,
+                         DVR.getDebugLoc().getInlinedAt());
   };
 
   // Remove undef dbg.assign intrinsics that are encountered before
   // any non-undef intrinsics from the entry block.
   for (auto &I : *BB) {
-    for (DPValue &DPV : filterDbgVars(I.getDbgRecordRange())) {
-      if (!DPV.isDbgValue() && !DPV.isDbgAssign())
+    for (DbgVariableRecord &DVR : filterDbgVars(I.getDbgRecordRange())) {
+      if (!DVR.isDbgValue() && !DVR.isDbgAssign())
         continue;
       bool IsDbgValueKind =
-          (DPV.isDbgValue() || at::getAssignmentInsts(&DPV).empty());
-      DebugVariable Aggregate = GetAggregateVariable(DPV);
+          (DVR.isDbgValue() || at::getAssignmentInsts(&DVR).empty());
+      DebugVariable Aggregate = GetAggregateVariable(DVR);
       if (!SeenDefForAggregate.contains(Aggregate)) {
-        bool IsKill = DPV.isKillLocation() && IsDbgValueKind;
+        bool IsKill = DVR.isKillLocation() && IsDbgValueKind;
         if (!IsKill) {
           SeenDefForAggregate.insert(Aggregate);
-        } else if (DPV.isDbgAssign()) {
-          ToBeRemoved.push_back(&DPV);
+        } else if (DVR.isDbgAssign()) {
+          ToBeRemoved.push_back(&DVR);
         }
       }
     }
   }
 
-  for (DPValue *DPV : ToBeRemoved)
-    DPV->eraseFromParent();
+  for (DbgVariableRecord *DVR : ToBeRemoved)
+    DVR->eraseFromParent();
 
   return !ToBeRemoved.empty();
 }
 
 static bool removeRedundantDbgInstrsUsingForwardScan(BasicBlock *BB) {
   if (BB->IsNewDbgInfoFormat)
-    return DPValuesRemoveRedundantDbgInstrsUsingForwardScan(BB);
+    return DbgVariableRecordsRemoveRedundantDbgInstrsUsingForwardScan(BB);
 
   SmallVector<DbgValueInst *, 8> ToBeRemoved;
   DenseMap<DebugVariable, std::pair<SmallVector<Value *, 4>, DIExpression *>>
@@ -642,7 +645,7 @@ static bool removeRedundantDbgInstrsUsingForwardScan(BasicBlock *BB) {
 /// - Keep track of non-overlapping fragments.
 static bool removeUndefDbgAssignsFromEntryBlock(BasicBlock *BB) {
   if (BB->IsNewDbgInfoFormat)
-    return DPValuesRemoveUndefDbgAssignsFromEntryBlock(BB);
+    return DbgVariableRecordsRemoveUndefDbgAssignsFromEntryBlock(BB);
 
   assert(BB->isEntryBlock() && "expected entry block");
   SmallVector<DbgAssignIntrinsic *, 8> ToBeRemoved;
diff --git a/llvm/lib/Transforms/Utils/CloneFunction.cpp b/llvm/lib/Transforms/Utils/CloneFunction.cpp
index 6931d1997aa6b2..3eac726994ae13 100644
--- a/llvm/lib/Transforms/Utils/CloneFunction.cpp
+++ b/llvm/lib/Transforms/Utils/CloneFunction.cpp
@@ -276,8 +276,8 @@ void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc,
     // attached debug-info records.
     for (Instruction &II : *BB) {
       RemapInstruction(&II, VMap, RemapFlag, TypeMapper, Materializer);
-      RemapDPValueRange(II.getModule(), II.getDbgRecordRange(), VMap, RemapFlag,
-                        TypeMapper, Materializer);
+      RemapDbgVariableRecordRange(II.getModule(), II.getDbgRecordRange(), VMap,
+                                  RemapFlag, TypeMapper, Materializer);
     }
 
   // Only update !llvm.dbg.cu for DifferentModule (not CloneModule). In the
@@ -641,8 +641,8 @@ void PruningFunctionCloner::CloneBlock(
     // Recursively clone any reachable successor blocks.
     append_range(ToClone, successors(BB->getTerminator()));
   } else {
-    // If we didn't create a new terminator, clone DPValues from the old
-    // terminator onto the new terminator.
+    // If we didn't create a new terminator, clone DbgVariableRecords from the
+    // old terminator onto the new terminator.
     Instruction *NewInst = NewBB->getTerminator();
     assert(NewInst);
 
@@ -884,14 +884,15 @@ void llvm::CloneAndPruneIntoFromInst(Function *NewFunc, const Function *OldFunc,
                        TypeMapper, Materializer);
   }
 
-  // Do the same for DPValues, touching all the instructions in the cloned
-  // range of blocks.
+  // Do the same for DbgVariableRecords, touching all the instructions in the
+  // cloned range of blocks.
   Function::iterator Begin = cast<BasicBlock>(VMap[StartingBB])->getIterator();
   for (BasicBlock &BB : make_range(Begin, NewFunc->end())) {
     for (Instruction &I : BB) {
-      RemapDPValueRange(I.getModule(), I.getDbgRecordRange(), VMap,
-                        ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges,
-                        TypeMapper, Materializer);
+      RemapDbgVariableRecordRange(I.getModule(), I.getDbgRecordRange(), VMap,
+                                  ModuleLevelChanges ? RF_None
+                                                     : RF_NoModuleLevelChanges,
+                                  TypeMapper, Materializer);
     }
   }
 
@@ -990,8 +991,9 @@ void llvm::remapInstructionsInBlocks(ArrayRef<BasicBlock *> Blocks,
   // Rewrite the code to refer to itself.
   for (auto *BB : Blocks) {
     for (auto &Inst : *BB) {
-      RemapDPValueRange(Inst.getModule(), Inst.getDbgRecordRange(), VMap,
-                        RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
+      RemapDbgVariableRecordRange(
+          Inst.getModule(), Inst.getDbgRecordRange(), VMap,
+          RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
       RemapInstruction(&Inst, VMap,
                        RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
     }
diff --git a/llvm/lib/Transforms/Utils/CodeExtractor.cpp b/llvm/lib/Transforms/Utils/CodeExtractor.cpp
index 0fac5fc02e3f02..122b7a9747b675 100644
--- a/llvm/lib/Transforms/Utils/CodeExtractor.cpp
+++ b/llvm/lib/Transforms/Utils/CodeExtractor.cpp
@@ -1010,6 +1010,18 @@ Function *CodeExtractor::constructFunction(const ValueSet &inputs,
 
     newFunction->addFnAttr(Attr);
   }
+
+  if (NumExitBlocks == 0) {
+    // Mark the new function `noreturn` if applicable. Terminators which resume
+    // exception propagation are treated as returning instructions. This is to
+    // avoid inserting traps after calls to outlined functions which unwind.
+    if (none_of(Blocks, [](const BasicBlock *BB) {
+          const Instruction *Term = BB->getTerminator();
+          return isa<ReturnInst>(Term) || isa<ResumeInst>(Term);
+        }))
+      newFunction->setDoesNotReturn();
+  }
+
   newFunction->insert(newFunction->end(), newRootNode);
 
   // Create scalar and aggregate iterators to name all of the arguments we
@@ -1392,19 +1404,23 @@ CallInst *CodeExtractor::emitCallAndSwitchStatement(Function *newFunction,
   case 0:
     // There are no successors (the block containing the switch itself), which
     // means that previously this was the last part of the function, and hence
-    // this should be rewritten as a `ret'
-
-    // Check if the function should return a value
-    if (OldFnRetTy->isVoidTy()) {
-      ReturnInst::Create(Context, nullptr, TheSwitch->getIterator());  // Return void
+    // this should be rewritten as a `ret` or `unreachable`.
+    if (newFunction->doesNotReturn()) {
+      // If fn is no return, end with an unreachable terminator.
+      (void)new UnreachableInst(Context, TheSwitch->getIterator());
+    } else if (OldFnRetTy->isVoidTy()) {
+      // We have no return value.
+      ReturnInst::Create(Context, nullptr,
+                         TheSwitch->getIterator()); // Return void
     } else if (OldFnRetTy == TheSwitch->getCondition()->getType()) {
       // return what we have
-      ReturnInst::Create(Context, TheSwitch->getCondition(), TheSwitch->getIterator());
+      ReturnInst::Create(Context, TheSwitch->getCondition(),
+                         TheSwitch->getIterator());
     } else {
       // Otherwise we must have code extracted an unwind or something, just
       // return whatever we want.
-      ReturnInst::Create(Context,
-                         Constant::getNullValue(OldFnRetTy), TheSwitch->getIterator());
+      ReturnInst::Create(Context, Constant::getNullValue(OldFnRetTy),
+                         TheSwitch->getIterator());
     }
 
     TheSwitch->eraseFromParent();
@@ -1508,14 +1524,14 @@ void CodeExtractor::calculateNewCallTerminatorWeights(
 static void eraseDebugIntrinsicsWithNonLocalRefs(Function &F) {
   for (Instruction &I : instructions(F)) {
     SmallVector<DbgVariableIntrinsic *, 4> DbgUsers;
-    SmallVector<DPValue *, 4> DPValues;
-    findDbgUsers(DbgUsers, &I, &DPValues);
+    SmallVector<DbgVariableRecord *, 4> DbgVariableRecords;
+    findDbgUsers(DbgUsers, &I, &DbgVariableRecords);
     for (DbgVariableIntrinsic *DVI : DbgUsers)
       if (DVI->getFunction() != &F)
         DVI->eraseFromParent();
-    for (DPValue *DPV : DPValues)
-      if (DPV->getFunction() != &F)
-        DPV->eraseFromParent();
+    for (DbgVariableRecord *DVR : DbgVariableRecords)
+      if (DVR->getFunction() != &F)
+        DVR->eraseFromParent();
   }
 }
 
@@ -1569,7 +1585,7 @@ static void fixupDebugInfoPostExtraction(Function &OldFunc, Function &NewFunc,
   //     point to a variable in the wrong scope.
   SmallDenseMap<DINode *, DINode *> RemappedMetadata;
   SmallVector<Instruction *, 4> DebugIntrinsicsToDelete;
-  SmallVector<DPValue *, 4> DPVsToDelete;
+  SmallVector<DbgVariableRecord *, 4> DVRsToDelete;
   DenseMap<const MDNode *, MDNode *> Cache;
 
   auto GetUpdatedDIVariable = [&](DILocalVariable *OldVar) {
@@ -1608,19 +1624,19 @@ static void fixupDebugInfoPostExtraction(Function &OldFunc, Function &NewFunc,
         continue;
       }
 
-      DPValue &DPV = cast<DPValue>(DR);
+      DbgVariableRecord &DVR = cast<DbgVariableRecord>(DR);
       // Apply the two updates that dbg.values get: invalid operands, and
       // variable metadata fixup.
-      if (any_of(DPV.location_ops(), IsInvalidLocation)) {
-        DPVsToDelete.push_back(&DPV);
+      if (any_of(DVR.location_ops(), IsInvalidLocation)) {
+        DVRsToDelete.push_back(&DVR);
         continue;
       }
-      if (DPV.isDbgAssign() && IsInvalidLocation(DPV.getAddress())) {
-        DPVsToDelete.push_back(&DPV);
+      if (DVR.isDbgAssign() && IsInvalidLocation(DVR.getAddress())) {
+        DVRsToDelete.push_back(&DVR);
         continue;
       }
-      if (!DPV.getDebugLoc().getInlinedAt())
-        DPV.setVariable(GetUpdatedDIVariable(DPV.getVariable()));
+      if (!DVR.getDebugLoc().getInlinedAt())
+        DVR.setVariable(GetUpdatedDIVariable(DVR.getVariable()));
     }
   };
 
@@ -1658,8 +1674,8 @@ static void fixupDebugInfoPostExtraction(Function &OldFunc, Function &NewFunc,
 
   for (auto *DII : DebugIntrinsicsToDelete)
     DII->eraseFromParent();
-  for (auto *DPV : DPVsToDelete)
-    DPV->getMarker()->MarkedInstr->dropOneDbgRecord(DPV);
+  for (auto *DVR : DVRsToDelete)
+    DVR->getMarker()->MarkedInstr->dropOneDbgRecord(DVR);
   DIB.finalizeSubprogram(NewSP);
 
   // Fix up the scope information attached to the line locations in the new
@@ -1895,16 +1911,6 @@ CodeExtractor::extractCodeRegion(const CodeExtractorAnalysisCache &CEAC,
 
   fixupDebugInfoPostExtraction(*oldFunction, *newFunction, *TheCall);
 
-  // Mark the new function `noreturn` if applicable. Terminators which resume
-  // exception propagation are treated as returning instructions. This is to
-  // avoid inserting traps after calls to outlined functions which unwind.
-  bool doesNotReturn = none_of(*newFunction, [](const BasicBlock &BB) {
-    const Instruction *Term = BB.getTerminator();
-    return isa<ReturnInst>(Term) || isa<ResumeInst>(Term);
-  });
-  if (doesNotReturn)
-    newFunction->setDoesNotReturn();
-
   LLVM_DEBUG(if (verifyFunction(*newFunction, &errs())) {
     newFunction->dump();
     report_fatal_error("verification of newFunction failed!");
diff --git a/llvm/lib/Transforms/Utils/FunctionComparator.cpp b/llvm/lib/Transforms/Utils/FunctionComparator.cpp
index 09e19be0d293ba..8cfb8ed07d240e 100644
--- a/llvm/lib/Transforms/Utils/FunctionComparator.cpp
+++ b/llvm/lib/Transforms/Utils/FunctionComparator.cpp
@@ -426,9 +426,19 @@ int FunctionComparator::cmpConstants(const Constant *L,
         return Res;
       if (int Res = cmpNumbers(GEPL->isInBounds(), GEPR->isInBounds()))
         return Res;
-      if (int Res = cmpNumbers(GEPL->getInRangeIndex().value_or(unsigned(-1)),
-                               GEPR->getInRangeIndex().value_or(unsigned(-1))))
-        return Res;
+
+      std::optional<ConstantRange> InRangeL = GEPL->getInRange();
+      std::optional<ConstantRange> InRangeR = GEPR->getInRange();
+      if (InRangeL) {
+        if (!InRangeR)
+          return 1;
+        if (int Res = cmpAPInts(InRangeL->getLower(), InRangeR->getLower()))
+          return Res;
+        if (int Res = cmpAPInts(InRangeL->getUpper(), InRangeR->getUpper()))
+          return Res;
+      } else if (InRangeR) {
+        return -1;
+      }
     }
     if (auto *OBOL = dyn_cast<OverflowingBinaryOperator>(LE)) {
       auto *OBOR = cast<OverflowingBinaryOperator>(RE);
diff --git a/llvm/lib/Transforms/Utils/InlineFunction.cpp b/llvm/lib/Transforms/Utils/InlineFunction.cpp
index 33d32720a33317..833dcbec228b88 100644
--- a/llvm/lib/Transforms/Utils/InlineFunction.cpp
+++ b/llvm/lib/Transforms/Utils/InlineFunction.cpp
@@ -1710,17 +1710,17 @@ static void fixupLineNumbers(Function *Fn, Function::iterator FI,
   };
 
   // Helper-util for updating debug-info records attached to instructions.
-  auto UpdateDPV = [&](DbgRecord *DPV) {
-    assert(DPV->getDebugLoc() && "Debug Value must have debug loc");
+  auto UpdateDVR = [&](DbgRecord *DVR) {
+    assert(DVR->getDebugLoc() && "Debug Value must have debug loc");
     if (NoInlineLineTables) {
-      DPV->setDebugLoc(TheCallDL);
+      DVR->setDebugLoc(TheCallDL);
       return;
     }
-    DebugLoc DL = DPV->getDebugLoc();
+    DebugLoc DL = DVR->getDebugLoc();
     DebugLoc IDL =
         inlineDebugLoc(DL, InlinedAtNode,
-                       DPV->getMarker()->getParent()->getContext(), IANodes);
-    DPV->setDebugLoc(IDL);
+                       DVR->getMarker()->getParent()->getContext(), IANodes);
+    DVR->setDebugLoc(IDL);
   };
 
   // Iterate over all instructions, updating metadata and debug-info records.
@@ -1728,8 +1728,8 @@ static void fixupLineNumbers(Function *Fn, Function::iterator FI,
     for (BasicBlock::iterator BI = FI->begin(), BE = FI->end(); BI != BE;
          ++BI) {
       UpdateInst(*BI);
-      for (DbgRecord &DPV : BI->getDbgRecordRange()) {
-        UpdateDPV(&DPV);
+      for (DbgRecord &DVR : BI->getDbgRecordRange()) {
+        UpdateDVR(&DVR);
       }
     }
 
@@ -1797,7 +1797,7 @@ static at::StorageToVarsMap collectEscapedLocals(const DataLayout &DL,
       EscapedLocals[Base].insert(at::VarRecord(DbgAssign));
     };
     for_each(at::getAssignmentMarkers(Base), CollectAssignsForStorage);
-    for_each(at::getDPVAssignmentMarkers(Base), CollectAssignsForStorage);
+    for_each(at::getDVRAssignmentMarkers(Base), CollectAssignsForStorage);
   }
   return EscapedLocals;
 }
@@ -1829,9 +1829,9 @@ static void fixupAssignments(Function::iterator Start, Function::iterator End) {
   // attachment or use, replace it with a new version.
   for (auto BBI = Start; BBI != End; ++BBI) {
     for (Instruction &I : *BBI) {
-      for (DPValue &DPV : filterDbgVars(I.getDbgRecordRange())) {
-        if (DPV.isDbgAssign())
-          DPV.setAssignId(GetNewID(DPV.getAssignID()));
+      for (DbgVariableRecord &DVR : filterDbgVars(I.getDbgRecordRange())) {
+        if (DVR.isDbgAssign())
+          DVR.setAssignId(GetNewID(DVR.getAssignID()));
       }
       if (auto *ID = I.getMetadata(LLVMContext::MD_DIAssignID))
         I.setMetadata(LLVMContext::MD_DIAssignID, GetNewID(ID));
diff --git a/llvm/lib/Transforms/Utils/LCSSA.cpp b/llvm/lib/Transforms/Utils/LCSSA.cpp
index 5e0c312fe149e7..ab1edf47d8db0a 100644
--- a/llvm/lib/Transforms/Utils/LCSSA.cpp
+++ b/llvm/lib/Transforms/Utils/LCSSA.cpp
@@ -242,8 +242,8 @@ bool llvm::formLCSSAForInstructions(SmallVectorImpl<Instruction *> &Worklist,
     }
 
     SmallVector<DbgValueInst *, 4> DbgValues;
-    SmallVector<DPValue *, 4> DPValues;
-    llvm::findDbgValues(DbgValues, I, &DPValues);
+    SmallVector<DbgVariableRecord *, 4> DbgVariableRecords;
+    llvm::findDbgValues(DbgValues, I, &DbgVariableRecords);
 
     // Update pre-existing debug value uses that reside outside the loop.
     for (auto *DVI : DbgValues) {
@@ -261,8 +261,8 @@ bool llvm::formLCSSAForInstructions(SmallVectorImpl<Instruction *> &Worklist,
 
     // RemoveDIs: copy-paste of block above, using non-instruction debug-info
     // records.
-    for (DPValue *DPV : DPValues) {
-      BasicBlock *UserBB = DPV->getMarker()->getParent();
+    for (DbgVariableRecord *DVR : DbgVariableRecords) {
+      BasicBlock *UserBB = DVR->getMarker()->getParent();
       if (InstBB == UserBB || L->contains(UserBB))
         continue;
       // We currently only handle debug values residing in blocks that were
@@ -271,7 +271,7 @@ bool llvm::formLCSSAForInstructions(SmallVectorImpl<Instruction *> &Worklist,
       Value *V = AddedPHIs.size() == 1 ? AddedPHIs[0]
                                        : SSAUpdate.FindValueForBlock(UserBB);
       if (V)
-        DPV->replaceVariableLocationOp(I, V);
+        DVR->replaceVariableLocationOp(I, V);
     }
 
     // SSAUpdater might have inserted phi-nodes inside other loops. We'll need
diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp
index 422319eb185134..3d3b97eca92b3d 100644
--- a/llvm/lib/Transforms/Utils/Local.cpp
+++ b/llvm/lib/Transforms/Utils/Local.cpp
@@ -609,12 +609,12 @@ void llvm::RecursivelyDeleteTriviallyDeadInstructions(
 
 bool llvm::replaceDbgUsesWithUndef(Instruction *I) {
   SmallVector<DbgVariableIntrinsic *, 1> DbgUsers;
-  SmallVector<DPValue *, 1> DPUsers;
+  SmallVector<DbgVariableRecord *, 1> DPUsers;
   findDbgUsers(DbgUsers, I, &DPUsers);
   for (auto *DII : DbgUsers)
     DII->setKillLocation();
-  for (auto *DPV : DPUsers)
-    DPV->setKillLocation();
+  for (auto *DVR : DPUsers)
+    DVR->setKillLocation();
   return !DbgUsers.empty() || !DPUsers.empty();
 }
 
@@ -1570,16 +1570,16 @@ static bool PhiHasDebugValue(DILocalVariable *DIVar,
   // is removed by LowerDbgDeclare(), we need to make sure that we are
   // not inserting the same dbg.value intrinsic over and over.
   SmallVector<DbgValueInst *, 1> DbgValues;
-  SmallVector<DPValue *, 1> DPValues;
-  findDbgValues(DbgValues, APN, &DPValues);
+  SmallVector<DbgVariableRecord *, 1> DbgVariableRecords;
+  findDbgValues(DbgValues, APN, &DbgVariableRecords);
   for (auto *DVI : DbgValues) {
     assert(is_contained(DVI->getValues(), APN));
     if ((DVI->getVariable() == DIVar) && (DVI->getExpression() == DIExpr))
       return true;
   }
-  for (auto *DPV : DPValues) {
-    assert(is_contained(DPV->location_ops(), APN));
-    if ((DPV->getVariable() == DIVar) && (DPV->getExpression() == DIExpr))
+  for (auto *DVR : DbgVariableRecords) {
+    assert(is_contained(DVR->location_ops(), APN));
+    if ((DVR->getVariable() == DIVar) && (DVR->getExpression() == DIExpr))
       return true;
   }
   return false;
@@ -1617,23 +1617,23 @@ static bool valueCoversEntireFragment(Type *ValTy, DbgVariableIntrinsic *DII) {
   // Could not determine size of variable. Conservatively return false.
   return false;
 }
-// RemoveDIs: duplicate implementation of the above, using DPValues, the
-// replacement for dbg.values.
-static bool valueCoversEntireFragment(Type *ValTy, DPValue *DPV) {
-  const DataLayout &DL = DPV->getModule()->getDataLayout();
+// RemoveDIs: duplicate implementation of the above, using DbgVariableRecords,
+// the replacement for dbg.values.
+static bool valueCoversEntireFragment(Type *ValTy, DbgVariableRecord *DVR) {
+  const DataLayout &DL = DVR->getModule()->getDataLayout();
   TypeSize ValueSize = DL.getTypeAllocSizeInBits(ValTy);
-  if (std::optional<uint64_t> FragmentSize = DPV->getFragmentSizeInBits())
+  if (std::optional<uint64_t> FragmentSize = DVR->getFragmentSizeInBits())
     return TypeSize::isKnownGE(ValueSize, TypeSize::getFixed(*FragmentSize));
 
   // We can't always calculate the size of the DI variable (e.g. if it is a
   // VLA). Try to use the size of the alloca that the dbg intrinsic describes
   // intead.
-  if (DPV->isAddressOfVariable()) {
-    // DPV should have exactly 1 location when it is an address.
-    assert(DPV->getNumVariableLocationOps() == 1 &&
+  if (DVR->isAddressOfVariable()) {
+    // DVR should have exactly 1 location when it is an address.
+    assert(DVR->getNumVariableLocationOps() == 1 &&
            "address of variable must have exactly 1 location operand.");
     if (auto *AI =
-            dyn_cast_or_null<AllocaInst>(DPV->getVariableLocationOp(0))) {
+            dyn_cast_or_null<AllocaInst>(DVR->getVariableLocationOp(0))) {
       if (std::optional<TypeSize> FragmentSize = AI->getAllocationSizeInBits(DL)) {
         return TypeSize::isKnownGE(ValueSize, *FragmentSize);
       }
@@ -1643,38 +1643,38 @@ static bool valueCoversEntireFragment(Type *ValTy, DPValue *DPV) {
   return false;
 }
 
-static void insertDbgValueOrDPValue(DIBuilder &Builder, Value *DV,
-                                    DILocalVariable *DIVar,
-                                    DIExpression *DIExpr,
-                                    const DebugLoc &NewLoc,
-                                    BasicBlock::iterator Instr) {
+static void insertDbgValueOrDbgVariableRecord(DIBuilder &Builder, Value *DV,
+                                              DILocalVariable *DIVar,
+                                              DIExpression *DIExpr,
+                                              const DebugLoc &NewLoc,
+                                              BasicBlock::iterator Instr) {
   if (!UseNewDbgInfoFormat) {
     auto DbgVal = Builder.insertDbgValueIntrinsic(DV, DIVar, DIExpr, NewLoc,
                                                   (Instruction *)nullptr);
     DbgVal.get<Instruction *>()->insertBefore(Instr);
   } else {
     // RemoveDIs: if we're using the new debug-info format, allocate a
-    // DPValue directly instead of a dbg.value intrinsic.
+    // DbgVariableRecord directly instead of a dbg.value intrinsic.
     ValueAsMetadata *DVAM = ValueAsMetadata::get(DV);
-    DPValue *DV = new DPValue(DVAM, DIVar, DIExpr, NewLoc.get());
+    DbgVariableRecord *DV =
+        new DbgVariableRecord(DVAM, DIVar, DIExpr, NewLoc.get());
     Instr->getParent()->insertDbgRecordBefore(DV, Instr);
   }
 }
 
-static void insertDbgValueOrDPValueAfter(DIBuilder &Builder, Value *DV,
-                                         DILocalVariable *DIVar,
-                                         DIExpression *DIExpr,
-                                         const DebugLoc &NewLoc,
-                                         BasicBlock::iterator Instr) {
+static void insertDbgValueOrDbgVariableRecordAfter(
+    DIBuilder &Builder, Value *DV, DILocalVariable *DIVar, DIExpression *DIExpr,
+    const DebugLoc &NewLoc, BasicBlock::iterator Instr) {
   if (!UseNewDbgInfoFormat) {
     auto DbgVal = Builder.insertDbgValueIntrinsic(DV, DIVar, DIExpr, NewLoc,
                                                   (Instruction *)nullptr);
     DbgVal.get<Instruction *>()->insertAfter(&*Instr);
   } else {
     // RemoveDIs: if we're using the new debug-info format, allocate a
-    // DPValue directly instead of a dbg.value intrinsic.
+    // DbgVariableRecord directly instead of a dbg.value intrinsic.
     ValueAsMetadata *DVAM = ValueAsMetadata::get(DV);
-    DPValue *DV = new DPValue(DVAM, DIVar, DIExpr, NewLoc.get());
+    DbgVariableRecord *DV =
+        new DbgVariableRecord(DVAM, DIVar, DIExpr, NewLoc.get());
     Instr->getParent()->insertDbgRecordAfter(DV, &*Instr);
   }
 }
@@ -1707,8 +1707,8 @@ void llvm::ConvertDebugDeclareToDebugValue(DbgVariableIntrinsic *DII,
       DIExpr->isDeref() || (!DIExpr->startsWithDeref() &&
                             valueCoversEntireFragment(DV->getType(), DII));
   if (CanConvert) {
-    insertDbgValueOrDPValue(Builder, DV, DIVar, DIExpr, NewLoc,
-                            SI->getIterator());
+    insertDbgValueOrDbgVariableRecord(Builder, DV, DIVar, DIExpr, NewLoc,
+                                      SI->getIterator());
     return;
   }
 
@@ -1720,8 +1720,8 @@ void llvm::ConvertDebugDeclareToDebugValue(DbgVariableIntrinsic *DII,
   // know which part) we insert an dbg.value intrinsic to indicate that we
   // know nothing about the variable's content.
   DV = UndefValue::get(DV->getType());
-  insertDbgValueOrDPValue(Builder, DV, DIVar, DIExpr, NewLoc,
-                          SI->getIterator());
+  insertDbgValueOrDbgVariableRecord(Builder, DV, DIVar, DIExpr, NewLoc,
+                                    SI->getIterator());
 }
 
 /// Inserts a llvm.dbg.value intrinsic before a load of an alloca'd value
@@ -1747,19 +1747,19 @@ void llvm::ConvertDebugDeclareToDebugValue(DbgVariableIntrinsic *DII,
   // future if multi-location support is added to the IR, it might be
   // preferable to keep tracking both the loaded value and the original
   // address in case the alloca can not be elided.
-  insertDbgValueOrDPValueAfter(Builder, LI, DIVar, DIExpr, NewLoc,
-                               LI->getIterator());
+  insertDbgValueOrDbgVariableRecordAfter(Builder, LI, DIVar, DIExpr, NewLoc,
+                                         LI->getIterator());
 }
 
-void llvm::ConvertDebugDeclareToDebugValue(DPValue *DPV, StoreInst *SI,
-                                           DIBuilder &Builder) {
-  assert(DPV->isAddressOfVariable() || DPV->isDbgAssign());
-  auto *DIVar = DPV->getVariable();
+void llvm::ConvertDebugDeclareToDebugValue(DbgVariableRecord *DVR,
+                                           StoreInst *SI, DIBuilder &Builder) {
+  assert(DVR->isAddressOfVariable() || DVR->isDbgAssign());
+  auto *DIVar = DVR->getVariable();
   assert(DIVar && "Missing variable");
-  auto *DIExpr = DPV->getExpression();
+  auto *DIExpr = DVR->getExpression();
   Value *DV = SI->getValueOperand();
 
-  DebugLoc NewLoc = getDebugValueLoc(DPV);
+  DebugLoc NewLoc = getDebugValueLoc(DVR);
 
   // If the alloca describes the variable itself, i.e. the expression in the
   // dbg.declare doesn't start with a dereference, we can perform the
@@ -1775,16 +1775,16 @@ void llvm::ConvertDebugDeclareToDebugValue(DPValue *DPV, StoreInst *SI,
   // deref expression.
   bool CanConvert =
       DIExpr->isDeref() || (!DIExpr->startsWithDeref() &&
-                            valueCoversEntireFragment(DV->getType(), DPV));
+                            valueCoversEntireFragment(DV->getType(), DVR));
   if (CanConvert) {
-    insertDbgValueOrDPValue(Builder, DV, DIVar, DIExpr, NewLoc,
-                            SI->getIterator());
+    insertDbgValueOrDbgVariableRecord(Builder, DV, DIVar, DIExpr, NewLoc,
+                                      SI->getIterator());
     return;
   }
 
   // FIXME: If storing to a part of the variable described by the dbg.declare,
   // then we want to insert a dbg.value for the corresponding fragment.
-  LLVM_DEBUG(dbgs() << "Failed to convert dbg.declare to dbg.value: " << *DPV
+  LLVM_DEBUG(dbgs() << "Failed to convert dbg.declare to dbg.value: " << *DVR
                     << '\n');
   assert(UseNewDbgInfoFormat);
 
@@ -1793,8 +1793,9 @@ void llvm::ConvertDebugDeclareToDebugValue(DPValue *DPV, StoreInst *SI,
   // know nothing about the variable's content.
   DV = UndefValue::get(DV->getType());
   ValueAsMetadata *DVAM = ValueAsMetadata::get(DV);
-  DPValue *NewDPV = new DPValue(DVAM, DIVar, DIExpr, NewLoc.get());
-  SI->getParent()->insertDbgRecordBefore(NewDPV, SI->getIterator());
+  DbgVariableRecord *NewDVR =
+      new DbgVariableRecord(DVAM, DIVar, DIExpr, NewLoc.get());
+  SI->getParent()->insertDbgRecordBefore(NewDVR, SI->getIterator());
 }
 
 /// Inserts a llvm.dbg.value intrinsic after a phi that has an associated
@@ -1826,26 +1827,27 @@ void llvm::ConvertDebugDeclareToDebugValue(DbgVariableIntrinsic *DII,
   // insertion point.
   // FIXME: Insert dbg.value markers in the successors when appropriate.
   if (InsertionPt != BB->end()) {
-    insertDbgValueOrDPValue(Builder, APN, DIVar, DIExpr, NewLoc, InsertionPt);
+    insertDbgValueOrDbgVariableRecord(Builder, APN, DIVar, DIExpr, NewLoc,
+                                      InsertionPt);
   }
 }
 
-void llvm::ConvertDebugDeclareToDebugValue(DPValue *DPV, LoadInst *LI,
+void llvm::ConvertDebugDeclareToDebugValue(DbgVariableRecord *DVR, LoadInst *LI,
                                            DIBuilder &Builder) {
-  auto *DIVar = DPV->getVariable();
-  auto *DIExpr = DPV->getExpression();
+  auto *DIVar = DVR->getVariable();
+  auto *DIExpr = DVR->getExpression();
   assert(DIVar && "Missing variable");
 
-  if (!valueCoversEntireFragment(LI->getType(), DPV)) {
+  if (!valueCoversEntireFragment(LI->getType(), DVR)) {
     // FIXME: If only referring to a part of the variable described by the
-    // dbg.declare, then we want to insert a DPValue for the corresponding
-    // fragment.
-    LLVM_DEBUG(dbgs() << "Failed to convert dbg.declare to DPValue: " << *DPV
-                      << '\n');
+    // dbg.declare, then we want to insert a DbgVariableRecord for the
+    // corresponding fragment.
+    LLVM_DEBUG(dbgs() << "Failed to convert dbg.declare to DbgVariableRecord: "
+                      << *DVR << '\n');
     return;
   }
 
-  DebugLoc NewLoc = getDebugValueLoc(DPV);
+  DebugLoc NewLoc = getDebugValueLoc(DVR);
 
   // We are now tracking the loaded value instead of the address. In the
   // future if multi-location support is added to the IR, it might be
@@ -1853,9 +1855,10 @@ void llvm::ConvertDebugDeclareToDebugValue(DPValue *DPV, LoadInst *LI,
   // address in case the alloca can not be elided.
   assert(UseNewDbgInfoFormat);
 
-  // Create a DPValue directly and insert.
+  // Create a DbgVariableRecord directly and insert.
   ValueAsMetadata *LIVAM = ValueAsMetadata::get(LI);
-  DPValue *DV = new DPValue(LIVAM, DIVar, DIExpr, NewLoc.get());
+  DbgVariableRecord *DV =
+      new DbgVariableRecord(LIVAM, DIVar, DIExpr, NewLoc.get());
   LI->getParent()->insertDbgRecordAfter(DV, LI);
 }
 
@@ -1869,34 +1872,35 @@ static bool isArray(AllocaInst *AI) {
 static bool isStructure(AllocaInst *AI) {
   return AI->getAllocatedType() && AI->getAllocatedType()->isStructTy();
 }
-void llvm::ConvertDebugDeclareToDebugValue(DPValue *DPV, PHINode *APN,
+void llvm::ConvertDebugDeclareToDebugValue(DbgVariableRecord *DVR, PHINode *APN,
                                            DIBuilder &Builder) {
-  auto *DIVar = DPV->getVariable();
-  auto *DIExpr = DPV->getExpression();
+  auto *DIVar = DVR->getVariable();
+  auto *DIExpr = DVR->getExpression();
   assert(DIVar && "Missing variable");
 
   if (PhiHasDebugValue(DIVar, DIExpr, APN))
     return;
 
-  if (!valueCoversEntireFragment(APN->getType(), DPV)) {
+  if (!valueCoversEntireFragment(APN->getType(), DVR)) {
     // FIXME: If only referring to a part of the variable described by the
-    // dbg.declare, then we want to insert a DPValue for the corresponding
-    // fragment.
-    LLVM_DEBUG(dbgs() << "Failed to convert dbg.declare to DPValue: " << *DPV
-                      << '\n');
+    // dbg.declare, then we want to insert a DbgVariableRecord for the
+    // corresponding fragment.
+    LLVM_DEBUG(dbgs() << "Failed to convert dbg.declare to DbgVariableRecord: "
+                      << *DVR << '\n');
     return;
   }
 
   BasicBlock *BB = APN->getParent();
   auto InsertionPt = BB->getFirstInsertionPt();
 
-  DebugLoc NewLoc = getDebugValueLoc(DPV);
+  DebugLoc NewLoc = getDebugValueLoc(DVR);
 
   // The block may be a catchswitch block, which does not have a valid
   // insertion point.
-  // FIXME: Insert DPValue markers in the successors when appropriate.
+  // FIXME: Insert DbgVariableRecord markers in the successors when appropriate.
   if (InsertionPt != BB->end()) {
-    insertDbgValueOrDPValue(Builder, APN, DIVar, DIExpr, NewLoc, InsertionPt);
+    insertDbgValueOrDbgVariableRecord(Builder, APN, DIVar, DIExpr, NewLoc,
+                                      InsertionPt);
   }
 }
 
@@ -1906,19 +1910,19 @@ bool llvm::LowerDbgDeclare(Function &F) {
   bool Changed = false;
   DIBuilder DIB(*F.getParent(), /*AllowUnresolved*/ false);
   SmallVector<DbgDeclareInst *, 4> Dbgs;
-  SmallVector<DPValue *> DPVs;
+  SmallVector<DbgVariableRecord *> DVRs;
   for (auto &FI : F) {
     for (Instruction &BI : FI) {
       if (auto *DDI = dyn_cast<DbgDeclareInst>(&BI))
         Dbgs.push_back(DDI);
-      for (DPValue &DPV : filterDbgVars(BI.getDbgRecordRange())) {
-        if (DPV.getType() == DPValue::LocationType::Declare)
-          DPVs.push_back(&DPV);
+      for (DbgVariableRecord &DVR : filterDbgVars(BI.getDbgRecordRange())) {
+        if (DVR.getType() == DbgVariableRecord::LocationType::Declare)
+          DVRs.push_back(&DVR);
       }
     }
   }
 
-  if (Dbgs.empty() && DPVs.empty())
+  if (Dbgs.empty() && DVRs.empty())
     return Changed;
 
   auto LowerOne = [&](auto *DDI) {
@@ -1962,8 +1966,9 @@ bool llvm::LowerDbgDeclare(Function &F) {
             DebugLoc NewLoc = getDebugValueLoc(DDI);
             auto *DerefExpr =
                 DIExpression::append(DDI->getExpression(), dwarf::DW_OP_deref);
-            insertDbgValueOrDPValue(DIB, AI, DDI->getVariable(), DerefExpr,
-                                    NewLoc, CI->getIterator());
+            insertDbgValueOrDbgVariableRecord(DIB, AI, DDI->getVariable(),
+                                              DerefExpr, NewLoc,
+                                              CI->getIterator());
           }
         } else if (BitCastInst *BI = dyn_cast<BitCastInst>(U)) {
           if (BI->getType()->isPointerTy())
@@ -1976,7 +1981,7 @@ bool llvm::LowerDbgDeclare(Function &F) {
   };
 
   for_each(Dbgs, LowerOne);
-  for_each(DPVs, LowerOne);
+  for_each(DVRs, LowerOne);
 
   if (Changed)
     for (BasicBlock &BB : F)
@@ -1986,35 +1991,39 @@ bool llvm::LowerDbgDeclare(Function &F) {
 }
 
 // RemoveDIs: re-implementation of insertDebugValuesForPHIs, but which pulls the
-// debug-info out of the block's DPValues rather than dbg.value intrinsics.
-static void insertDPValuesForPHIs(BasicBlock *BB,
-                                  SmallVectorImpl<PHINode *> &InsertedPHIs) {
-  assert(BB && "No BasicBlock to clone DPValue(s) from.");
+// debug-info out of the block's DbgVariableRecords rather than dbg.value
+// intrinsics.
+static void
+insertDbgVariableRecordsForPHIs(BasicBlock *BB,
+                                SmallVectorImpl<PHINode *> &InsertedPHIs) {
+  assert(BB && "No BasicBlock to clone DbgVariableRecord(s) from.");
   if (InsertedPHIs.size() == 0)
     return;
 
-  // Map existing PHI nodes to their DPValues.
-  DenseMap<Value *, DPValue *> DbgValueMap;
+  // Map existing PHI nodes to their DbgVariableRecords.
+  DenseMap<Value *, DbgVariableRecord *> DbgValueMap;
   for (auto &I : *BB) {
-    for (DPValue &DPV : filterDbgVars(I.getDbgRecordRange())) {
-      for (Value *V : DPV.location_ops())
+    for (DbgVariableRecord &DVR : filterDbgVars(I.getDbgRecordRange())) {
+      for (Value *V : DVR.location_ops())
         if (auto *Loc = dyn_cast_or_null<PHINode>(V))
-          DbgValueMap.insert({Loc, &DPV});
+          DbgValueMap.insert({Loc, &DVR});
     }
   }
   if (DbgValueMap.size() == 0)
     return;
 
-  // Map a pair of the destination BB and old DPValue to the new DPValue,
-  // so that if a DPValue is being rewritten to use more than one of the
-  // inserted PHIs in the same destination BB, we can update the same DPValue
-  // with all the new PHIs instead of creating one copy for each.
-  MapVector<std::pair<BasicBlock *, DPValue *>, DPValue *> NewDbgValueMap;
+  // Map a pair of the destination BB and old DbgVariableRecord to the new
+  // DbgVariableRecord, so that if a DbgVariableRecord is being rewritten to use
+  // more than one of the inserted PHIs in the same destination BB, we can
+  // update the same DbgVariableRecord with all the new PHIs instead of creating
+  // one copy for each.
+  MapVector<std::pair<BasicBlock *, DbgVariableRecord *>, DbgVariableRecord *>
+      NewDbgValueMap;
   // Then iterate through the new PHIs and look to see if they use one of the
-  // previously mapped PHIs. If so, create a new DPValue that will propagate
-  // the info through the new PHI. If we use more than one new PHI in a single
-  // destination BB with the same old dbg.value, merge the updates so that we
-  // get a single new DPValue with all the new PHIs.
+  // previously mapped PHIs. If so, create a new DbgVariableRecord that will
+  // propagate the info through the new PHI. If we use more than one new PHI in
+  // a single destination BB with the same old dbg.value, merge the updates so
+  // that we get a single new DbgVariableRecord with all the new PHIs.
   for (auto PHI : InsertedPHIs) {
     BasicBlock *Parent = PHI->getParent();
     // Avoid inserting a debug-info record into an EH block.
@@ -2023,13 +2032,13 @@ static void insertDPValuesForPHIs(BasicBlock *BB,
     for (auto VI : PHI->operand_values()) {
       auto V = DbgValueMap.find(VI);
       if (V != DbgValueMap.end()) {
-        DPValue *DbgII = cast<DPValue>(V->second);
+        DbgVariableRecord *DbgII = cast<DbgVariableRecord>(V->second);
         auto NewDI = NewDbgValueMap.find({Parent, DbgII});
         if (NewDI == NewDbgValueMap.end()) {
-          DPValue *NewDbgII = DbgII->clone();
+          DbgVariableRecord *NewDbgII = DbgII->clone();
           NewDI = NewDbgValueMap.insert({{Parent, DbgII}, NewDbgII}).first;
         }
-        DPValue *NewDbgII = NewDI->second;
+        DbgVariableRecord *NewDbgII = NewDI->second;
         // If PHI contains VI as an operand more than once, we may
         // replaced it in NewDbgII; confirm that it is present.
         if (is_contained(NewDbgII->location_ops(), VI))
@@ -2037,10 +2046,10 @@ static void insertDPValuesForPHIs(BasicBlock *BB,
       }
     }
   }
-  // Insert the new DPValues into their destination blocks.
+  // Insert the new DbgVariableRecords into their destination blocks.
   for (auto DI : NewDbgValueMap) {
     BasicBlock *Parent = DI.first.first;
-    DPValue *NewDbgII = DI.second;
+    DbgVariableRecord *NewDbgII = DI.second;
     auto InsertionPt = Parent->getFirstInsertionPt();
     assert(InsertionPt != Parent->end() && "Ill-formed basic block");
 
@@ -2055,7 +2064,7 @@ void llvm::insertDebugValuesForPHIs(BasicBlock *BB,
   if (InsertedPHIs.size() == 0)
     return;
 
-  insertDPValuesForPHIs(BB, InsertedPHIs);
+  insertDbgVariableRecordsForPHIs(BB, InsertedPHIs);
 
   // Map existing PHI nodes to their dbg.values.
   ValueToValueMapTy DbgValueMap;
@@ -2117,7 +2126,7 @@ bool llvm::replaceDbgDeclare(Value *Address, Value *NewAddress,
                              DIBuilder &Builder, uint8_t DIExprFlags,
                              int Offset) {
   TinyPtrVector<DbgDeclareInst *> DbgDeclares = findDbgDeclares(Address);
-  TinyPtrVector<DPValue *> DPVDeclares = findDPVDeclares(Address);
+  TinyPtrVector<DbgVariableRecord *> DVRDeclares = findDVRDeclares(Address);
 
   auto ReplaceOne = [&](auto *DII) {
     assert(DII->getVariable() && "Missing variable");
@@ -2128,21 +2137,22 @@ bool llvm::replaceDbgDeclare(Value *Address, Value *NewAddress,
   };
 
   for_each(DbgDeclares, ReplaceOne);
-  for_each(DPVDeclares, ReplaceOne);
+  for_each(DVRDeclares, ReplaceOne);
 
-  return !DbgDeclares.empty() || !DPVDeclares.empty();
+  return !DbgDeclares.empty() || !DVRDeclares.empty();
 }
 
 static void updateOneDbgValueForAlloca(const DebugLoc &Loc,
                                        DILocalVariable *DIVar,
                                        DIExpression *DIExpr, Value *NewAddress,
-                                       DbgValueInst *DVI, DPValue *DPV,
+                                       DbgValueInst *DVI,
+                                       DbgVariableRecord *DVR,
                                        DIBuilder &Builder, int Offset) {
   assert(DIVar && "Missing variable");
 
-  // This is an alloca-based dbg.value/DPValue. The first thing it should do
-  // with the alloca pointer is dereference it. Otherwise we don't know how to
-  // handle it and give up.
+  // This is an alloca-based dbg.value/DbgVariableRecord. The first thing it
+  // should do with the alloca pointer is dereference it. Otherwise we don't
+  // know how to handle it and give up.
   if (!DIExpr || DIExpr->getNumElements() < 1 ||
       DIExpr->getElement(0) != dwarf::DW_OP_deref)
     return;
@@ -2155,16 +2165,16 @@ static void updateOneDbgValueForAlloca(const DebugLoc &Loc,
     DVI->setExpression(DIExpr);
     DVI->replaceVariableLocationOp(0u, NewAddress);
   } else {
-    assert(DPV);
-    DPV->setExpression(DIExpr);
-    DPV->replaceVariableLocationOp(0u, NewAddress);
+    assert(DVR);
+    DVR->setExpression(DIExpr);
+    DVR->replaceVariableLocationOp(0u, NewAddress);
   }
 }
 
 void llvm::replaceDbgValueForAlloca(AllocaInst *AI, Value *NewAllocaAddress,
                                     DIBuilder &Builder, int Offset) {
   SmallVector<DbgValueInst *, 1> DbgUsers;
-  SmallVector<DPValue *, 1> DPUsers;
+  SmallVector<DbgVariableRecord *, 1> DPUsers;
   findDbgValues(DbgUsers, AI, &DPUsers);
 
   // Attempt to replace dbg.values that use this alloca.
@@ -2173,18 +2183,18 @@ void llvm::replaceDbgValueForAlloca(AllocaInst *AI, Value *NewAllocaAddress,
                                DVI->getExpression(), NewAllocaAddress, DVI,
                                nullptr, Builder, Offset);
 
-  // Replace any DPValues that use this alloca.
-  for (DPValue *DPV : DPUsers)
-    updateOneDbgValueForAlloca(DPV->getDebugLoc(), DPV->getVariable(),
-                               DPV->getExpression(), NewAllocaAddress, nullptr,
-                               DPV, Builder, Offset);
+  // Replace any DbgVariableRecords that use this alloca.
+  for (DbgVariableRecord *DVR : DPUsers)
+    updateOneDbgValueForAlloca(DVR->getDebugLoc(), DVR->getVariable(),
+                               DVR->getExpression(), NewAllocaAddress, nullptr,
+                               DVR, Builder, Offset);
 }
 
 /// Where possible to salvage debug information for \p I do so.
 /// If not possible mark undef.
 void llvm::salvageDebugInfo(Instruction &I) {
   SmallVector<DbgVariableIntrinsic *, 1> DbgUsers;
-  SmallVector<DPValue *, 1> DPUsers;
+  SmallVector<DbgVariableRecord *, 1> DPUsers;
   findDbgUsers(DbgUsers, &I, &DPUsers);
   salvageDebugInfoForDbgValues(I, DbgUsers, DPUsers);
 }
@@ -2224,7 +2234,7 @@ template <typename T> static void salvageDbgAssignAddress(T *Assign) {
 
 void llvm::salvageDebugInfoForDbgValues(
     Instruction &I, ArrayRef<DbgVariableIntrinsic *> DbgUsers,
-    ArrayRef<DPValue *> DPUsers) {
+    ArrayRef<DbgVariableRecord *> DPUsers) {
   // These are arbitrary chosen limits on the maximum number of values and the
   // maximum size of a debug expression we can salvage up to, used for
   // performance reasons.
@@ -2290,67 +2300,68 @@ void llvm::salvageDebugInfoForDbgValues(
     LLVM_DEBUG(dbgs() << "SALVAGE: " << *DII << '\n');
     Salvaged = true;
   }
-  // Duplicate of above block for DPValues.
-  for (auto *DPV : DPUsers) {
-    if (DPV->isDbgAssign()) {
-      if (DPV->getAddress() == &I) {
-        salvageDbgAssignAddress(DPV);
+  // Duplicate of above block for DbgVariableRecords.
+  for (auto *DVR : DPUsers) {
+    if (DVR->isDbgAssign()) {
+      if (DVR->getAddress() == &I) {
+        salvageDbgAssignAddress(DVR);
         Salvaged = true;
       }
-      if (DPV->getValue() != &I)
+      if (DVR->getValue() != &I)
         continue;
     }
 
     // Do not add DW_OP_stack_value for DbgDeclare and DbgAddr, because they
     // are implicitly pointing out the value as a DWARF memory location
     // description.
-    bool StackValue = DPV->getType() != DPValue::LocationType::Declare;
-    auto DPVLocation = DPV->location_ops();
+    bool StackValue =
+        DVR->getType() != DbgVariableRecord::LocationType::Declare;
+    auto DVRLocation = DVR->location_ops();
     assert(
-        is_contained(DPVLocation, &I) &&
+        is_contained(DVRLocation, &I) &&
         "DbgVariableIntrinsic must use salvaged instruction as its location");
     SmallVector<Value *, 4> AdditionalValues;
-    // 'I' may appear more than once in DPV's location ops, and each use of 'I'
+    // 'I' may appear more than once in DVR's location ops, and each use of 'I'
     // must be updated in the DIExpression and potentially have additional
     // values added; thus we call salvageDebugInfoImpl for each 'I' instance in
-    // DPVLocation.
+    // DVRLocation.
     Value *Op0 = nullptr;
-    DIExpression *SalvagedExpr = DPV->getExpression();
-    auto LocItr = find(DPVLocation, &I);
-    while (SalvagedExpr && LocItr != DPVLocation.end()) {
+    DIExpression *SalvagedExpr = DVR->getExpression();
+    auto LocItr = find(DVRLocation, &I);
+    while (SalvagedExpr && LocItr != DVRLocation.end()) {
       SmallVector<uint64_t, 16> Ops;
-      unsigned LocNo = std::distance(DPVLocation.begin(), LocItr);
+      unsigned LocNo = std::distance(DVRLocation.begin(), LocItr);
       uint64_t CurrentLocOps = SalvagedExpr->getNumLocationOperands();
       Op0 = salvageDebugInfoImpl(I, CurrentLocOps, Ops, AdditionalValues);
       if (!Op0)
         break;
       SalvagedExpr =
           DIExpression::appendOpsToArg(SalvagedExpr, Ops, LocNo, StackValue);
-      LocItr = std::find(++LocItr, DPVLocation.end(), &I);
+      LocItr = std::find(++LocItr, DVRLocation.end(), &I);
     }
     // salvageDebugInfoImpl should fail on examining the first element of
     // DbgUsers, or none of them.
     if (!Op0)
       break;
 
-    DPV->replaceVariableLocationOp(&I, Op0);
+    DVR->replaceVariableLocationOp(&I, Op0);
     bool IsValidSalvageExpr =
         SalvagedExpr->getNumElements() <= MaxExpressionSize;
     if (AdditionalValues.empty() && IsValidSalvageExpr) {
-      DPV->setExpression(SalvagedExpr);
-    } else if (DPV->getType() != DPValue::LocationType::Declare &&
+      DVR->setExpression(SalvagedExpr);
+    } else if (DVR->getType() != DbgVariableRecord::LocationType::Declare &&
                IsValidSalvageExpr &&
-               DPV->getNumVariableLocationOps() + AdditionalValues.size() <=
+               DVR->getNumVariableLocationOps() + AdditionalValues.size() <=
                    MaxDebugArgs) {
-      DPV->addVariableLocationOps(AdditionalValues, SalvagedExpr);
+      DVR->addVariableLocationOps(AdditionalValues, SalvagedExpr);
     } else {
       // Do not salvage using DIArgList for dbg.addr/dbg.declare, as it is
       // currently only valid for stack value expressions.
       // Also do not salvage if the resulting DIArgList would contain an
       // unreasonably large number of values.
-      DPV->setKillLocation();
+      DVR->setKillLocation();
     }
-    LLVM_DEBUG(dbgs() << "SALVAGE: " << DPV << '\n');
+    LLVM_DEBUG(dbgs() << "SALVAGE: " << DVR << '\n');
     Salvaged = true;
   }
 
@@ -2360,8 +2371,8 @@ void llvm::salvageDebugInfoForDbgValues(
   for (auto *DII : DbgUsers)
     DII->setKillLocation();
 
-  for (auto *DPV : DPUsers)
-    DPV->setKillLocation();
+  for (auto *DVR : DPUsers)
+    DVR->setKillLocation();
 }
 
 Value *getSalvageOpsForGEP(GetElementPtrInst *GEP, const DataLayout &DL,
@@ -2577,10 +2588,10 @@ using DbgValReplacement = std::optional<DIExpression *>;
 static bool rewriteDebugUsers(
     Instruction &From, Value &To, Instruction &DomPoint, DominatorTree &DT,
     function_ref<DbgValReplacement(DbgVariableIntrinsic &DII)> RewriteExpr,
-    function_ref<DbgValReplacement(DPValue &DPV)> RewriteDPVExpr) {
+    function_ref<DbgValReplacement(DbgVariableRecord &DVR)> RewriteDVRExpr) {
   // Find debug users of From.
   SmallVector<DbgVariableIntrinsic *, 1> Users;
-  SmallVector<DPValue *, 1> DPUsers;
+  SmallVector<DbgVariableRecord *, 1> DPUsers;
   findDbgUsers(Users, &From, &DPUsers);
   if (Users.empty() && DPUsers.empty())
     return false;
@@ -2589,7 +2600,7 @@ static bool rewriteDebugUsers(
   bool Changed = false;
 
   SmallPtrSet<DbgVariableIntrinsic *, 1> UndefOrSalvage;
-  SmallPtrSet<DPValue *, 1> UndefOrSalvageDPV;
+  SmallPtrSet<DbgVariableRecord *, 1> UndefOrSalvageDVR;
   if (isa<Instruction>(&To)) {
     bool DomPointAfterFrom = From.getNextNonDebugInstruction() == &DomPoint;
 
@@ -2608,22 +2619,22 @@ static bool rewriteDebugUsers(
       }
     }
 
-    // DPValue implementation of the above.
-    for (auto *DPV : DPUsers) {
-      Instruction *MarkedInstr = DPV->getMarker()->MarkedInstr;
+    // DbgVariableRecord implementation of the above.
+    for (auto *DVR : DPUsers) {
+      Instruction *MarkedInstr = DVR->getMarker()->MarkedInstr;
       Instruction *NextNonDebug = MarkedInstr;
       // The next instruction might still be a dbg.declare, skip over it.
       if (isa<DbgVariableIntrinsic>(NextNonDebug))
         NextNonDebug = NextNonDebug->getNextNonDebugInstruction();
 
       if (DomPointAfterFrom && NextNonDebug == &DomPoint) {
-        LLVM_DEBUG(dbgs() << "MOVE:  " << *DPV << '\n');
-        DPV->removeFromParent();
+        LLVM_DEBUG(dbgs() << "MOVE:  " << *DVR << '\n');
+        DVR->removeFromParent();
         // Ensure there's a marker.
-        DomPoint.getParent()->insertDbgRecordAfter(DPV, &DomPoint);
+        DomPoint.getParent()->insertDbgRecordAfter(DVR, &DomPoint);
         Changed = true;
       } else if (!DT.dominates(&DomPoint, MarkedInstr)) {
-        UndefOrSalvageDPV.insert(DPV);
+        UndefOrSalvageDVR.insert(DVR);
       }
     }
   }
@@ -2633,30 +2644,30 @@ static bool rewriteDebugUsers(
     if (UndefOrSalvage.count(DII))
       continue;
 
-    DbgValReplacement DVR = RewriteExpr(*DII);
-    if (!DVR)
+    DbgValReplacement DVRepl = RewriteExpr(*DII);
+    if (!DVRepl)
       continue;
 
     DII->replaceVariableLocationOp(&From, &To);
-    DII->setExpression(*DVR);
+    DII->setExpression(*DVRepl);
     LLVM_DEBUG(dbgs() << "REWRITE:  " << *DII << '\n');
     Changed = true;
   }
-  for (auto *DPV : DPUsers) {
-    if (UndefOrSalvageDPV.count(DPV))
+  for (auto *DVR : DPUsers) {
+    if (UndefOrSalvageDVR.count(DVR))
       continue;
 
-    DbgValReplacement DVR = RewriteDPVExpr(*DPV);
-    if (!DVR)
+    DbgValReplacement DVRepl = RewriteDVRExpr(*DVR);
+    if (!DVRepl)
       continue;
 
-    DPV->replaceVariableLocationOp(&From, &To);
-    DPV->setExpression(*DVR);
-    LLVM_DEBUG(dbgs() << "REWRITE:  " << DPV << '\n');
+    DVR->replaceVariableLocationOp(&From, &To);
+    DVR->setExpression(*DVRepl);
+    LLVM_DEBUG(dbgs() << "REWRITE:  " << DVR << '\n');
     Changed = true;
   }
 
-  if (!UndefOrSalvage.empty() || !UndefOrSalvageDPV.empty()) {
+  if (!UndefOrSalvage.empty() || !UndefOrSalvageDVR.empty()) {
     // Try to salvage the remaining debug users.
     salvageDebugInfo(From);
     Changed = true;
@@ -2704,15 +2715,15 @@ bool llvm::replaceAllDbgUsesWith(Instruction &From, Value &To,
   auto Identity = [&](DbgVariableIntrinsic &DII) -> DbgValReplacement {
     return DII.getExpression();
   };
-  auto IdentityDPV = [&](DPValue &DPV) -> DbgValReplacement {
-    return DPV.getExpression();
+  auto IdentityDVR = [&](DbgVariableRecord &DVR) -> DbgValReplacement {
+    return DVR.getExpression();
   };
 
   // Handle no-op conversions.
   Module &M = *From.getModule();
   const DataLayout &DL = M.getDataLayout();
   if (isBitCastSemanticsPreserving(DL, FromTy, ToTy))
-    return rewriteDebugUsers(From, To, DomPoint, DT, Identity, IdentityDPV);
+    return rewriteDebugUsers(From, To, DomPoint, DT, Identity, IdentityDVR);
 
   // Handle integer-to-integer widening and narrowing.
   // FIXME: Use DW_OP_convert when it's available everywhere.
@@ -2724,7 +2735,7 @@ bool llvm::replaceAllDbgUsesWith(Instruction &From, Value &To,
     // When the width of the result grows, assume that a debugger will only
     // access the low `FromBits` bits when inspecting the source variable.
     if (FromBits < ToBits)
-      return rewriteDebugUsers(From, To, DomPoint, DT, Identity, IdentityDPV);
+      return rewriteDebugUsers(From, To, DomPoint, DT, Identity, IdentityDVR);
 
     // The width of the result has shrunk. Use sign/zero extension to describe
     // the source variable's high bits.
@@ -2740,10 +2751,10 @@ bool llvm::replaceAllDbgUsesWith(Instruction &From, Value &To,
       return DIExpression::appendExt(DII.getExpression(), ToBits, FromBits,
                                      Signed);
     };
-    // RemoveDIs: duplicate implementation working on DPValues rather than on
-    // dbg.value intrinsics.
-    auto SignOrZeroExtDPV = [&](DPValue &DPV) -> DbgValReplacement {
-      DILocalVariable *Var = DPV.getVariable();
+    // RemoveDIs: duplicate implementation working on DbgVariableRecords rather
+    // than on dbg.value intrinsics.
+    auto SignOrZeroExtDVR = [&](DbgVariableRecord &DVR) -> DbgValReplacement {
+      DILocalVariable *Var = DVR.getVariable();
 
       // Without knowing signedness, sign/zero extension isn't possible.
       auto Signedness = Var->getSignedness();
@@ -2751,11 +2762,11 @@ bool llvm::replaceAllDbgUsesWith(Instruction &From, Value &To,
         return std::nullopt;
 
       bool Signed = *Signedness == DIBasicType::Signedness::Signed;
-      return DIExpression::appendExt(DPV.getExpression(), ToBits, FromBits,
+      return DIExpression::appendExt(DVR.getExpression(), ToBits, FromBits,
                                      Signed);
     };
     return rewriteDebugUsers(From, To, DomPoint, DT, SignOrZeroExt,
-                             SignOrZeroExtDPV);
+                             SignOrZeroExtDVR);
   }
 
   // TODO: Floating-point conversions, vectors.
@@ -2795,8 +2806,8 @@ llvm::removeAllNonTerminatorAndEHPadInstructions(BasicBlock *BB) {
     if (!Inst->use_empty() && !Inst->getType()->isTokenTy())
       Inst->replaceAllUsesWith(PoisonValue::get(Inst->getType()));
     if (Inst->isEHPad() || Inst->getType()->isTokenTy()) {
-      // EHPads can't have DPValues attached to them, but it might be possible
-      // for things with token type.
+      // EHPads can't have DbgVariableRecords attached to them, but it might be
+      // possible for things with token type.
       Inst->dropDbgRecords();
       EndInst = Inst;
       continue;
@@ -3544,12 +3555,12 @@ void llvm::copyRangeMetadata(const DataLayout &DL, const LoadInst &OldLI,
 
 void llvm::dropDebugUsers(Instruction &I) {
   SmallVector<DbgVariableIntrinsic *, 1> DbgUsers;
-  SmallVector<DPValue *, 1> DPUsers;
+  SmallVector<DbgVariableRecord *, 1> DPUsers;
   findDbgUsers(DbgUsers, &I, &DPUsers);
   for (auto *DII : DbgUsers)
     DII->eraseFromParent();
-  for (auto *DPV : DPUsers)
-    DPV->eraseFromParent();
+  for (auto *DVR : DPUsers)
+    DVR->eraseFromParent();
 }
 
 void llvm::hoistAllInstructionsInto(BasicBlock *DomBlock, Instruction *InsertPt,
diff --git a/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp b/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp
index 980ecf06c055c4..4470c5af870ae2 100644
--- a/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp
+++ b/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp
@@ -159,8 +159,8 @@ static void RewriteUsesOfClonedInstructions(BasicBlock *OrigHeader,
     // Replace MetadataAsValue(ValueAsMetadata(OrigHeaderVal)) uses in debug
     // intrinsics.
     SmallVector<DbgValueInst *, 1> DbgValues;
-    SmallVector<DPValue *, 1> DPValues;
-    llvm::findDbgValues(DbgValues, OrigHeaderVal, &DPValues);
+    SmallVector<DbgVariableRecord *, 1> DbgVariableRecords;
+    llvm::findDbgValues(DbgValues, OrigHeaderVal, &DbgVariableRecords);
     for (auto &DbgValue : DbgValues) {
       // The original users in the OrigHeader are already using the original
       // definitions.
@@ -183,11 +183,11 @@ static void RewriteUsesOfClonedInstructions(BasicBlock *OrigHeader,
     }
 
     // RemoveDIs: duplicate implementation for non-instruction debug-info
-    // storage in DPValues.
-    for (DPValue *DPV : DPValues) {
+    // storage in DbgVariableRecords.
+    for (DbgVariableRecord *DVR : DbgVariableRecords) {
       // The original users in the OrigHeader are already using the original
       // definitions.
-      BasicBlock *UserBB = DPV->getMarker()->getParent();
+      BasicBlock *UserBB = DVR->getMarker()->getParent();
       if (UserBB == OrigHeader)
         continue;
 
@@ -202,7 +202,7 @@ static void RewriteUsesOfClonedInstructions(BasicBlock *OrigHeader,
         NewVal = SSA.GetValueInMiddleOfBlock(UserBB);
       else
         NewVal = UndefValue::get(OrigHeaderVal->getType());
-      DPV->replaceVariableLocationOp(OrigHeaderVal, NewVal);
+      DVR->replaceVariableLocationOp(OrigHeaderVal, NewVal);
     }
   }
 }
@@ -552,20 +552,22 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
     for (Instruction &I : llvm::drop_begin(llvm::reverse(*OrigPreheader))) {
       if (auto *DII = dyn_cast<DbgVariableIntrinsic>(&I)) {
         DbgIntrinsics.insert(makeHash(DII));
-        // Until RemoveDIs supports dbg.declares in DPValue format, we'll need
-        // to collect DPValues attached to any other debug intrinsics.
-        for (const DPValue &DPV : filterDbgVars(DII->getDbgRecordRange()))
-          DbgIntrinsics.insert(makeHash(&DPV));
+        // Until RemoveDIs supports dbg.declares in DbgVariableRecord format,
+        // we'll need to collect DbgVariableRecords attached to any other debug
+        // intrinsics.
+        for (const DbgVariableRecord &DVR :
+             filterDbgVars(DII->getDbgRecordRange()))
+          DbgIntrinsics.insert(makeHash(&DVR));
       } else {
         break;
       }
     }
 
-    // Build DPValue hashes for DPValues attached to the terminator, which isn't
-    // considered in the loop above.
-    for (const DPValue &DPV :
+    // Build DbgVariableRecord hashes for DbgVariableRecords attached to the
+    // terminator, which isn't considered in the loop above.
+    for (const DbgVariableRecord &DVR :
          filterDbgVars(OrigPreheader->getTerminator()->getDbgRecordRange()))
-      DbgIntrinsics.insert(makeHash(&DPV));
+      DbgIntrinsics.insert(makeHash(&DVR));
 
     // Remember the local noalias scope declarations in the header. After the
     // rotation, they must be duplicated and the scope must be cloned. This
@@ -627,13 +629,14 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
             !NextDbgInsts.empty()) {
           auto DbgValueRange =
               LoopEntryBranch->cloneDebugInfoFrom(Inst, NextDbgInsts.begin());
-          RemapDPValueRange(M, DbgValueRange, ValueMap,
-                            RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
+          RemapDbgVariableRecordRange(M, DbgValueRange, ValueMap,
+                                      RF_NoModuleLevelChanges |
+                                          RF_IgnoreMissingLocals);
           // Erase anything we've seen before.
-          for (DPValue &DPV :
+          for (DbgVariableRecord &DVR :
                make_early_inc_range(filterDbgVars(DbgValueRange)))
-            if (DbgIntrinsics.count(makeHash(&DPV)))
-              DPV.eraseFromParent();
+            if (DbgIntrinsics.count(makeHash(&DVR)))
+              DVR.eraseFromParent();
         }
 
         NextDbgInsts = I->getDbgRecordRange();
@@ -653,13 +656,15 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
       if (LoopEntryBranch->getParent()->IsNewDbgInfoFormat &&
           !NextDbgInsts.empty()) {
         auto Range = C->cloneDebugInfoFrom(Inst, NextDbgInsts.begin());
-        RemapDPValueRange(M, Range, ValueMap,
-                          RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
+        RemapDbgVariableRecordRange(M, Range, ValueMap,
+                                    RF_NoModuleLevelChanges |
+                                        RF_IgnoreMissingLocals);
         NextDbgInsts = DPMarker::getEmptyDbgRecordRange();
         // Erase anything we've seen before.
-        for (DPValue &DPV : make_early_inc_range(filterDbgVars(Range)))
-          if (DbgIntrinsics.count(makeHash(&DPV)))
-            DPV.eraseFromParent();
+        for (DbgVariableRecord &DVR :
+             make_early_inc_range(filterDbgVars(Range)))
+          if (DbgIntrinsics.count(makeHash(&DVR)))
+            DVR.eraseFromParent();
       }
 
       // Eagerly remap the operands of the instruction.
diff --git a/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp b/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp
index ecd76b7c1fbfb2..2d5b5f967ffbc0 100644
--- a/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp
@@ -917,8 +917,9 @@ bool llvm::UnrollRuntimeLoopRemainder(
     for (Instruction &I : *BB) {
       RemapInstruction(&I, VMap,
                        RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
-      RemapDPValueRange(M, I.getDbgRecordRange(), VMap,
-                        RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
+      RemapDbgVariableRecordRange(M, I.getDbgRecordRange(), VMap,
+                                  RF_NoModuleLevelChanges |
+                                      RF_IgnoreMissingLocals);
     }
   }
 
diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp
index c815a7b0b2932f..73c5d636782294 100644
--- a/llvm/lib/Transforms/Utils/LoopUtils.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp
@@ -605,7 +605,7 @@ void llvm::deleteDeadLoop(Loop *L, DominatorTree *DT, ScalarEvolution *SE,
   // Use a map to unique and a vector to guarantee deterministic ordering.
   llvm::SmallDenseSet<DebugVariable, 4> DeadDebugSet;
   llvm::SmallVector<DbgVariableIntrinsic *, 4> DeadDebugInst;
-  llvm::SmallVector<DPValue *, 4> DeadDPValues;
+  llvm::SmallVector<DbgVariableRecord *, 4> DeadDbgVariableRecords;
 
   if (ExitBlock) {
     // Given LCSSA form is satisfied, we should not have users of instructions
@@ -631,17 +631,17 @@ void llvm::deleteDeadLoop(Loop *L, DominatorTree *DT, ScalarEvolution *SE,
           U.set(Poison);
         }
 
-        // RemoveDIs: do the same as below for DPValues.
+        // RemoveDIs: do the same as below for DbgVariableRecords.
         if (Block->IsNewDbgInfoFormat) {
-          for (DPValue &DPV : llvm::make_early_inc_range(
+          for (DbgVariableRecord &DVR : llvm::make_early_inc_range(
                    filterDbgVars(I.getDbgRecordRange()))) {
-            DebugVariable Key(DPV.getVariable(), DPV.getExpression(),
-                              DPV.getDebugLoc().get());
+            DebugVariable Key(DVR.getVariable(), DVR.getExpression(),
+                              DVR.getDebugLoc().get());
             if (!DeadDebugSet.insert(Key).second)
               continue;
-            // Unlinks the DPV from it's container, for later insertion.
-            DPV.removeFromParent();
-            DeadDPValues.push_back(&DPV);
+            // Unlinks the DVR from it's container, for later insertion.
+            DVR.removeFromParent();
+            DeadDbgVariableRecords.push_back(&DVR);
           }
         }
 
@@ -673,11 +673,11 @@ void llvm::deleteDeadLoop(Loop *L, DominatorTree *DT, ScalarEvolution *SE,
       DVI->moveBefore(*ExitBlock, InsertDbgValueBefore);
 
     // Due to the "head" bit in BasicBlock::iterator, we're going to insert
-    // each DPValue right at the start of the block, wheras dbg.values would be
-    // repeatedly inserted before the first instruction. To replicate this
-    // behaviour, do it backwards.
-    for (DPValue *DPV : llvm::reverse(DeadDPValues))
-      ExitBlock->insertDbgRecordBefore(DPV, InsertDbgValueBefore);
+    // each DbgVariableRecord right at the start of the block, wheras dbg.values
+    // would be repeatedly inserted before the first instruction. To replicate
+    // this behaviour, do it backwards.
+    for (DbgVariableRecord *DVR : llvm::reverse(DeadDbgVariableRecords))
+      ExitBlock->insertDbgRecordBefore(DVR, InsertDbgValueBefore);
   }
 
   // Remove the block from the reference counting scheme, so that we can
@@ -1034,6 +1034,15 @@ CmpInst::Predicate llvm::getMinMaxReductionPredicate(RecurKind RK) {
   }
 }
 
+Value *llvm::createAnyOfOp(IRBuilderBase &Builder, Value *StartVal,
+                           RecurKind RK, Value *Left, Value *Right) {
+  if (auto VTy = dyn_cast<VectorType>(Left->getType()))
+    StartVal = Builder.CreateVectorSplat(VTy->getElementCount(), StartVal);
+  Value *Cmp =
+      Builder.CreateCmp(CmpInst::ICMP_NE, Left, StartVal, "rdx.select.cmp");
+  return Builder.CreateSelect(Cmp, Left, Right, "rdx.select");
+}
+
 Value *llvm::createMinMaxOp(IRBuilderBase &Builder, RecurKind RK, Value *Left,
                             Value *Right) {
   Type *Ty = Left->getType();
@@ -1142,13 +1151,16 @@ Value *llvm::createAnyOfTargetReduction(IRBuilderBase &Builder, Value *Src,
     NewVal = SI->getTrueValue();
   }
 
+  // Create a splat vector with the new value and compare this to the vector
+  // we want to reduce.
+  ElementCount EC = cast<VectorType>(Src->getType())->getElementCount();
+  Value *Right = Builder.CreateVectorSplat(EC, InitVal);
+  Value *Cmp =
+      Builder.CreateCmp(CmpInst::ICMP_NE, Src, Right, "rdx.select.cmp");
+
   // If any predicate is true it means that we want to select the new value.
-  Value *AnyOf =
-      Src->getType()->isVectorTy() ? Builder.CreateOrReduce(Src) : Src;
-  // The compares in the loop may yield poison, which propagates through the
-  // bitwise ORs. Freeze it here before the condition is used.
-  AnyOf = Builder.CreateFreeze(AnyOf);
-  return Builder.CreateSelect(AnyOf, NewVal, InitVal, "rdx.select");
+  Cmp = Builder.CreateOrReduce(Cmp);
+  return Builder.CreateSelect(Cmp, NewVal, InitVal, "rdx.select");
 }
 
 Value *llvm::createSimpleTargetReduction(IRBuilderBase &Builder, Value *Src,
diff --git a/llvm/lib/Transforms/Utils/MemoryOpRemark.cpp b/llvm/lib/Transforms/Utils/MemoryOpRemark.cpp
index d671a9373bf026..8f55d7bbd3182a 100644
--- a/llvm/lib/Transforms/Utils/MemoryOpRemark.cpp
+++ b/llvm/lib/Transforms/Utils/MemoryOpRemark.cpp
@@ -332,7 +332,7 @@ void MemoryOpRemark::visitVariable(const Value *V,
     }
   };
   for_each(findDbgDeclares(const_cast<Value *>(V)), FindDI);
-  for_each(findDPVDeclares(const_cast<Value *>(V)), FindDI);
+  for_each(findDVRDeclares(const_cast<Value *>(V)), FindDI);
 
   if (FoundDI) {
     assert(!Result.empty());
diff --git a/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp b/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp
index e1f14b903fb8e5..8dd1002a6e4ac8 100644
--- a/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp
+++ b/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp
@@ -18,7 +18,9 @@
 #include "llvm/Analysis/StackSafetyAnalysis.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/IntrinsicInst.h"
+#include "llvm/TargetParser/Triple.h"
 #include "llvm/Transforms/Utils/PromoteMemToReg.h"
 
 namespace llvm {
@@ -110,21 +112,21 @@ Instruction *getUntagLocationIfFunctionExit(Instruction &Inst) {
 
 void StackInfoBuilder::visit(Instruction &Inst) {
   // Visit non-intrinsic debug-info records attached to Inst.
-  for (DPValue &DPV : filterDbgVars(Inst.getDbgRecordRange())) {
+  for (DbgVariableRecord &DVR : filterDbgVars(Inst.getDbgRecordRange())) {
     auto AddIfInteresting = [&](Value *V) {
       if (auto *AI = dyn_cast_or_null<AllocaInst>(V)) {
         if (!isInterestingAlloca(*AI))
           return;
         AllocaInfo &AInfo = Info.AllocasToInstrument[AI];
-        auto &DPVVec = AInfo.DbgVariableRecords;
-        if (DPVVec.empty() || DPVVec.back() != &DPV)
-          DPVVec.push_back(&DPV);
+        auto &DVRVec = AInfo.DbgVariableRecords;
+        if (DVRVec.empty() || DVRVec.back() != &DVR)
+          DVRVec.push_back(&DVR);
       }
     };
 
-    for_each(DPV.location_ops(), AddIfInteresting);
-    if (DPV.isDbgAssign())
-      AddIfInteresting(DPV.getAddress());
+    for_each(DVR.location_ops(), AddIfInteresting);
+    if (DVR.isDbgAssign())
+      AddIfInteresting(DVR.getAddress());
   }
 
   if (CallInst *CI = dyn_cast<CallInst>(&Inst)) {
@@ -236,5 +238,40 @@ void alignAndPadAlloca(memtag::AllocaInfo &Info, llvm::Align Alignment) {
   Info.AI = NewAI;
 }
 
+bool isLifetimeIntrinsic(Value *V) {
+  auto *II = dyn_cast<IntrinsicInst>(V);
+  return II && II->isLifetimeStartOrEnd();
+}
+
+Value *readRegister(IRBuilder<> &IRB, StringRef Name) {
+  Module *M = IRB.GetInsertBlock()->getParent()->getParent();
+  Function *ReadRegister = Intrinsic::getDeclaration(
+      M, Intrinsic::read_register, IRB.getIntPtrTy(M->getDataLayout()));
+  MDNode *MD =
+      MDNode::get(M->getContext(), {MDString::get(M->getContext(), Name)});
+  Value *Args[] = {MetadataAsValue::get(M->getContext(), MD)};
+  return IRB.CreateCall(ReadRegister, Args);
+}
+
+Value *getPC(const Triple &TargetTriple, IRBuilder<> &IRB) {
+  Module *M = IRB.GetInsertBlock()->getParent()->getParent();
+  if (TargetTriple.getArch() == Triple::aarch64)
+    return memtag::readRegister(IRB, "pc");
+  return IRB.CreatePtrToInt(IRB.GetInsertBlock()->getParent(),
+                            IRB.getIntPtrTy(M->getDataLayout()));
+}
+
+Value *getFP(IRBuilder<> &IRB) {
+  Function *F = IRB.GetInsertBlock()->getParent();
+  Module *M = F->getParent();
+  auto *GetStackPointerFn = Intrinsic::getDeclaration(
+      M, Intrinsic::frameaddress,
+      IRB.getPtrTy(M->getDataLayout().getAllocaAddrSpace()));
+  return IRB.CreatePtrToInt(
+      IRB.CreateCall(GetStackPointerFn,
+                     {Constant::getNullValue(IRB.getInt32Ty())}),
+      IRB.getIntPtrTy(M->getDataLayout()));
+}
+
 } // namespace memtag
 } // namespace llvm
diff --git a/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp b/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
index b462803bad38c3..adcf161b313b2b 100644
--- a/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
+++ b/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
@@ -104,11 +104,13 @@ namespace {
 static void createDebugValue(DIBuilder &DIB, Value *NewValue,
                              DILocalVariable *Variable,
                              DIExpression *Expression, const DILocation *DI,
-                             DPValue *InsertBefore) {
-  // FIXME: Merge these two functions now that DIBuilder supports DPValues.
-  // We neeed the API to accept DPValues as an insert point for that to work.
+                             DbgVariableRecord *InsertBefore) {
+  // FIXME: Merge these two functions now that DIBuilder supports
+  // DbgVariableRecords. We neeed the API to accept DbgVariableRecords as an
+  // insert point for that to work.
   (void)DIB;
-  DPValue::createDPValue(NewValue, Variable, Expression, DI, *InsertBefore);
+  DbgVariableRecord::createDbgVariableRecord(NewValue, Variable, Expression, DI,
+                                             *InsertBefore);
 }
 static void createDebugValue(DIBuilder &DIB, Value *NewValue,
                              DILocalVariable *Variable,
@@ -123,7 +125,7 @@ class AssignmentTrackingInfo {
   /// fragment. (i.e. not be a comprehensive set if there are multiple
   /// dbg.assigns for one variable fragment).
   SmallVector<DbgVariableIntrinsic *> DbgAssigns;
-  SmallVector<DPValue *> DPVAssigns;
+  SmallVector<DbgVariableRecord *> DVRAssigns;
 
 public:
   void init(AllocaInst *AI) {
@@ -132,21 +134,21 @@ class AssignmentTrackingInfo {
       if (Vars.insert(DebugVariable(DAI)).second)
         DbgAssigns.push_back(DAI);
     }
-    for (DPValue *DPV : at::getDPVAssignmentMarkers(AI)) {
-      if (Vars.insert(DebugVariable(DPV)).second)
-        DPVAssigns.push_back(DPV);
+    for (DbgVariableRecord *DVR : at::getDVRAssignmentMarkers(AI)) {
+      if (Vars.insert(DebugVariable(DVR)).second)
+        DVRAssigns.push_back(DVR);
     }
   }
 
   /// Update assignment tracking debug info given for the to-be-deleted store
   /// \p ToDelete that stores to this alloca.
-  void
-  updateForDeletedStore(StoreInst *ToDelete, DIBuilder &DIB,
-                        SmallSet<DbgAssignIntrinsic *, 8> *DbgAssignsToDelete,
-                        SmallSet<DPValue *, 8> *DPVAssignsToDelete) const {
+  void updateForDeletedStore(
+      StoreInst *ToDelete, DIBuilder &DIB,
+      SmallSet<DbgAssignIntrinsic *, 8> *DbgAssignsToDelete,
+      SmallSet<DbgVariableRecord *, 8> *DVRAssignsToDelete) const {
     // There's nothing to do if the alloca doesn't have any variables using
     // assignment tracking.
-    if (DbgAssigns.empty() && DPVAssigns.empty())
+    if (DbgAssigns.empty() && DVRAssigns.empty())
       return;
 
     // Insert a dbg.value where the linked dbg.assign is and remember to delete
@@ -165,8 +167,8 @@ class AssignmentTrackingInfo {
     };
     for (auto *Assign : at::getAssignmentMarkers(ToDelete))
       InsertValueForAssign(Assign, DbgAssignsToDelete);
-    for (auto *Assign : at::getDPVAssignmentMarkers(ToDelete))
-      InsertValueForAssign(Assign, DPVAssignsToDelete);
+    for (auto *Assign : at::getDVRAssignmentMarkers(ToDelete))
+      InsertValueForAssign(Assign, DVRAssignsToDelete);
 
     // It's possible for variables using assignment tracking to have no
     // dbg.assign linked to this store. These are variables in DbgAssigns that
@@ -182,7 +184,7 @@ class AssignmentTrackingInfo {
       ConvertDebugDeclareToDebugValue(Assign, ToDelete, DIB);
     };
     for_each(DbgAssigns, ConvertUnlinkedAssignToValue);
-    for_each(DPVAssigns, ConvertUnlinkedAssignToValue);
+    for_each(DVRAssigns, ConvertUnlinkedAssignToValue);
   }
 
   /// Update assignment tracking debug info given for the newly inserted PHI \p
@@ -193,20 +195,20 @@ class AssignmentTrackingInfo {
     // debug-phi.
     for (auto *DAI : DbgAssigns)
       ConvertDebugDeclareToDebugValue(DAI, NewPhi, DIB);
-    for (auto *DPV : DPVAssigns)
-      ConvertDebugDeclareToDebugValue(DPV, NewPhi, DIB);
+    for (auto *DVR : DVRAssigns)
+      ConvertDebugDeclareToDebugValue(DVR, NewPhi, DIB);
   }
 
   void clear() {
     DbgAssigns.clear();
-    DPVAssigns.clear();
+    DVRAssigns.clear();
   }
-  bool empty() { return DbgAssigns.empty() && DPVAssigns.empty(); }
+  bool empty() { return DbgAssigns.empty() && DVRAssigns.empty(); }
 };
 
 struct AllocaInfo {
   using DbgUserVec = SmallVector<DbgVariableIntrinsic *, 1>;
-  using DPUserVec = SmallVector<DPValue *, 1>;
+  using DPUserVec = SmallVector<DbgVariableRecord *, 1>;
 
   SmallVector<BasicBlock *, 32> DefiningBlocks;
   SmallVector<BasicBlock *, 32> UsingBlocks;
@@ -262,7 +264,7 @@ struct AllocaInfo {
       }
     }
     DbgUserVec AllDbgUsers;
-    SmallVector<DPValue *> AllDPUsers;
+    SmallVector<DbgVariableRecord *> AllDPUsers;
     findDbgUsers(AllDbgUsers, AI, &AllDPUsers);
     std::copy_if(AllDbgUsers.begin(), AllDbgUsers.end(),
                  std::back_inserter(DbgUsers), [](DbgVariableIntrinsic *DII) {
@@ -270,7 +272,7 @@ struct AllocaInfo {
                  });
     std::copy_if(AllDPUsers.begin(), AllDPUsers.end(),
                  std::back_inserter(DPUsers),
-                 [](DPValue *DPV) { return !DPV->isDbgAssign(); });
+                 [](DbgVariableRecord *DVR) { return !DVR->isDbgAssign(); });
     AssignmentTracking.init(AI);
   }
 };
@@ -378,7 +380,7 @@ struct PromoteMem2Reg {
   /// A set of dbg.assigns to delete because they've been demoted to
   /// dbg.values. Call cleanUpDbgAssigns to delete them.
   SmallSet<DbgAssignIntrinsic *, 8> DbgAssignsToDelete;
-  SmallSet<DPValue *, 8> DPVAssignsToDelete;
+  SmallSet<DbgVariableRecord *, 8> DVRAssignsToDelete;
 
   /// The set of basic blocks the renamer has already visited.
   SmallPtrSet<BasicBlock *, 16> Visited;
@@ -428,9 +430,9 @@ struct PromoteMem2Reg {
     for (auto *DAI : DbgAssignsToDelete)
       DAI->eraseFromParent();
     DbgAssignsToDelete.clear();
-    for (auto *DPV : DPVAssignsToDelete)
-      DPV->eraseFromParent();
-    DPVAssignsToDelete.clear();
+    for (auto *DVR : DVRAssignsToDelete)
+      DVR->eraseFromParent();
+    DVRAssignsToDelete.clear();
   }
 };
 
@@ -508,7 +510,7 @@ rewriteSingleStoreAlloca(AllocaInst *AI, AllocaInfo &Info, LargeBlockInfo &LBI,
                          const DataLayout &DL, DominatorTree &DT,
                          AssumptionCache *AC,
                          SmallSet<DbgAssignIntrinsic *, 8> *DbgAssignsToDelete,
-                         SmallSet<DPValue *, 8> *DPVAssignsToDelete) {
+                         SmallSet<DbgVariableRecord *, 8> *DVRAssignsToDelete) {
   StoreInst *OnlyStore = Info.OnlyStore;
   bool StoringGlobalVal = !isa<Instruction>(OnlyStore->getOperand(0));
   BasicBlock *StoreBB = OnlyStore->getParent();
@@ -569,7 +571,7 @@ rewriteSingleStoreAlloca(AllocaInst *AI, AllocaInfo &Info, LargeBlockInfo &LBI,
   DIBuilder DIB(*AI->getModule(), /*AllowUnresolved*/ false);
   // Update assignment tracking info for the store we're going to delete.
   Info.AssignmentTracking.updateForDeletedStore(
-      Info.OnlyStore, DIB, DbgAssignsToDelete, DPVAssignsToDelete);
+      Info.OnlyStore, DIB, DbgAssignsToDelete, DVRAssignsToDelete);
 
   // Record debuginfo for the store and remove the declaration's
   // debuginfo.
@@ -618,7 +620,7 @@ promoteSingleBlockAlloca(AllocaInst *AI, const AllocaInfo &Info,
                          LargeBlockInfo &LBI, const DataLayout &DL,
                          DominatorTree &DT, AssumptionCache *AC,
                          SmallSet<DbgAssignIntrinsic *, 8> *DbgAssignsToDelete,
-                         SmallSet<DPValue *, 8> *DPVAssignsToDelete) {
+                         SmallSet<DbgVariableRecord *, 8> *DVRAssignsToDelete) {
   // The trickiest case to handle is when we have large blocks. Because of this,
   // this code is optimized assuming that large blocks happen.  This does not
   // significantly pessimize the small block case.  This uses LargeBlockInfo to
@@ -683,7 +685,7 @@ promoteSingleBlockAlloca(AllocaInst *AI, const AllocaInfo &Info,
     StoreInst *SI = cast<StoreInst>(AI->user_back());
     // Update assignment tracking info for the store we're going to delete.
     Info.AssignmentTracking.updateForDeletedStore(SI, DIB, DbgAssignsToDelete,
-                                                  DPVAssignsToDelete);
+                                                  DVRAssignsToDelete);
     // Record debuginfo for the store before removing it.
     auto DbgUpdateForStore = [&](auto &Container) {
       for (auto *DbgItem : Container) {
@@ -755,7 +757,7 @@ void PromoteMem2Reg::run() {
     // it that are directly dominated by the definition with the value stored.
     if (Info.DefiningBlocks.size() == 1) {
       if (rewriteSingleStoreAlloca(AI, Info, LBI, SQ.DL, DT, AC,
-                                   &DbgAssignsToDelete, &DPVAssignsToDelete)) {
+                                   &DbgAssignsToDelete, &DVRAssignsToDelete)) {
         // The alloca has been processed, move on.
         RemoveFromAllocasList(AllocaNum);
         ++NumSingleStore;
@@ -767,7 +769,7 @@ void PromoteMem2Reg::run() {
     // linear sweep over the block to eliminate it.
     if (Info.OnlyUsedInOneBlock &&
         promoteSingleBlockAlloca(AI, Info, LBI, SQ.DL, DT, AC,
-                                 &DbgAssignsToDelete, &DPVAssignsToDelete)) {
+                                 &DbgAssignsToDelete, &DVRAssignsToDelete)) {
       // The alloca has been processed, move on.
       RemoveFromAllocasList(AllocaNum);
       continue;
@@ -1174,7 +1176,7 @@ void PromoteMem2Reg::RenamePass(BasicBlock *BB, BasicBlock *Pred,
       // Record debuginfo for the store before removing it.
       IncomingLocs[AllocaNo] = SI->getDebugLoc();
       AllocaATInfo[AllocaNo].updateForDeletedStore(SI, DIB, &DbgAssignsToDelete,
-                                                   &DPVAssignsToDelete);
+                                                   &DVRAssignsToDelete);
       auto ConvertDbgDeclares = [&](auto &Container) {
         for (auto *DbgItem : Container)
           if (DbgItem->isAddressOfVariable())
diff --git a/llvm/lib/Transforms/Utils/SCCPSolver.cpp b/llvm/lib/Transforms/Utils/SCCPSolver.cpp
index a185e8cd371c60..38fc7763c5b204 100644
--- a/llvm/lib/Transforms/Utils/SCCPSolver.cpp
+++ b/llvm/lib/Transforms/Utils/SCCPSolver.cpp
@@ -1486,7 +1486,13 @@ void SCCPInstVisitor::visitBinaryOperator(Instruction &I) {
   // Try to simplify to a constant range.
   ConstantRange A = getConstantRange(V1State, I.getType());
   ConstantRange B = getConstantRange(V2State, I.getType());
-  ConstantRange R = A.binaryOp(cast<BinaryOperator>(&I)->getOpcode(), B);
+
+  auto *BO = cast<BinaryOperator>(&I);
+  ConstantRange R = ConstantRange::getEmpty(I.getType()->getScalarSizeInBits());
+  if (auto *OBO = dyn_cast<OverflowingBinaryOperator>(BO))
+    R = A.overflowingBinaryOp(BO->getOpcode(), B, OBO->getNoWrapKind());
+  else
+    R = A.binaryOp(BO->getOpcode(), B);
   mergeInValue(&I, ValueLatticeElement::getRange(R));
 
   // TODO: Currently we do not exploit special values that produce something
diff --git a/llvm/lib/Transforms/Utils/SSAUpdater.cpp b/llvm/lib/Transforms/Utils/SSAUpdater.cpp
index fc21fb552137d7..38df20c949c2ec 100644
--- a/llvm/lib/Transforms/Utils/SSAUpdater.cpp
+++ b/llvm/lib/Transforms/Utils/SSAUpdater.cpp
@@ -199,17 +199,17 @@ void SSAUpdater::RewriteUse(Use &U) {
 
 void SSAUpdater::UpdateDebugValues(Instruction *I) {
   SmallVector<DbgValueInst *, 4> DbgValues;
-  SmallVector<DPValue *, 4> DPValues;
-  llvm::findDbgValues(DbgValues, I, &DPValues);
+  SmallVector<DbgVariableRecord *, 4> DbgVariableRecords;
+  llvm::findDbgValues(DbgValues, I, &DbgVariableRecords);
   for (auto &DbgValue : DbgValues) {
     if (DbgValue->getParent() == I->getParent())
       continue;
     UpdateDebugValue(I, DbgValue);
   }
-  for (auto &DPV : DPValues) {
-    if (DPV->getParent() == I->getParent())
+  for (auto &DVR : DbgVariableRecords) {
+    if (DVR->getParent() == I->getParent())
       continue;
-    UpdateDebugValue(I, DPV);
+    UpdateDebugValue(I, DVR);
   }
 }
 
@@ -220,10 +220,10 @@ void SSAUpdater::UpdateDebugValues(Instruction *I,
   }
 }
 
-void SSAUpdater::UpdateDebugValues(Instruction *I,
-                                   SmallVectorImpl<DPValue *> &DPValues) {
-  for (auto &DPV : DPValues) {
-    UpdateDebugValue(I, DPV);
+void SSAUpdater::UpdateDebugValues(
+    Instruction *I, SmallVectorImpl<DbgVariableRecord *> &DbgVariableRecords) {
+  for (auto &DVR : DbgVariableRecords) {
+    UpdateDebugValue(I, DVR);
   }
 }
 
@@ -236,13 +236,13 @@ void SSAUpdater::UpdateDebugValue(Instruction *I, DbgValueInst *DbgValue) {
     DbgValue->setKillLocation();
 }
 
-void SSAUpdater::UpdateDebugValue(Instruction *I, DPValue *DPV) {
-  BasicBlock *UserBB = DPV->getParent();
+void SSAUpdater::UpdateDebugValue(Instruction *I, DbgVariableRecord *DVR) {
+  BasicBlock *UserBB = DVR->getParent();
   if (HasValueForBlock(UserBB)) {
     Value *NewVal = GetValueAtEndOfBlock(UserBB);
-    DPV->replaceVariableLocationOp(I, NewVal);
+    DVR->replaceVariableLocationOp(I, NewVal);
   } else
-    DPV->setKillLocation();
+    DVR->setKillLocation();
 }
 
 void SSAUpdater::RewriteUseAfterInsertions(Use &U) {
diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index 292ad78a1852bc..55bbffb18879fb 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -1126,8 +1126,9 @@ static void CloneInstructionsIntoPredecessorBlockAndUpdateSSAUses(
 
     NewBonusInst->insertInto(PredBlock, PTI->getIterator());
     auto Range = NewBonusInst->cloneDebugInfoFrom(&BonusInst);
-    RemapDPValueRange(NewBonusInst->getModule(), Range, VMap,
-                      RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
+    RemapDbgVariableRecordRange(NewBonusInst->getModule(), Range, VMap,
+                                RF_NoModuleLevelChanges |
+                                    RF_IgnoreMissingLocals);
 
     if (isa<DbgInfoIntrinsic>(BonusInst))
       continue;
@@ -1526,15 +1527,15 @@ static bool shouldHoistCommonInstructions(Instruction *I1, Instruction *I2,
   return true;
 }
 
-/// Hoists DPValues from \p I1 and \p OtherInstrs that are identical in
-/// lock-step to \p TI. This matches how dbg.* intrinsics are hoisting in
+/// Hoists DbgVariableRecords from \p I1 and \p OtherInstrs that are identical
+/// in lock-step to \p TI. This matches how dbg.* intrinsics are hoisting in
 /// hoistCommonCodeFromSuccessors. e.g. The input:
-///    I1                DPVs: { x, z },
-///    OtherInsts: { I2  DPVs: { x, y, z } }
-/// would result in hoisting only DPValue x.
-static void
-hoistLockstepIdenticalDPValues(Instruction *TI, Instruction *I1,
-                               SmallVectorImpl<Instruction *> &OtherInsts) {
+///    I1                DVRs: { x, z },
+///    OtherInsts: { I2  DVRs: { x, y, z } }
+/// would result in hoisting only DbgVariableRecord x.
+static void hoistLockstepIdenticalDbgVariableRecords(
+    Instruction *TI, Instruction *I1,
+    SmallVectorImpl<Instruction *> &OtherInsts) {
   if (!I1->hasDbgRecords())
     return;
   using CurrentAndEndIt =
@@ -1570,12 +1571,12 @@ hoistLockstepIdenticalDPValues(Instruction *TI, Instruction *I1,
   // This replicates the dbg.* intrinsic behaviour in
   // hoistCommonCodeFromSuccessors.
   while (none_of(Itrs, atEnd)) {
-    bool HoistDPVs = allIdentical(Itrs);
+    bool HoistDVRs = allIdentical(Itrs);
     for (CurrentAndEndIt &Pair : Itrs) {
       // Increment Current iterator now as we may be about to move the
       // DbgRecord.
       DbgRecord &DR = *Pair.first++;
-      if (HoistDPVs) {
+      if (HoistDVRs) {
         DR.removeFromParent();
         TI->getParent()->insertDbgRecordBefore(&DR, TI->getIterator());
       }
@@ -1691,7 +1692,7 @@ bool SimplifyCFGOpt::hoistCommonCodeFromSuccessors(BasicBlock *BB,
       // at the beginning of the loop, we can hoist the terminator instruction.
       // If any instructions remain in the block, we cannot hoist terminators.
       if (NumSkipped || !AllInstsAreIdentical) {
-        hoistLockstepIdenticalDPValues(TI, I1, OtherInsts);
+        hoistLockstepIdenticalDbgVariableRecords(TI, I1, OtherInsts);
         return Changed;
       }
 
@@ -1721,9 +1722,9 @@ bool SimplifyCFGOpt::hoistCommonCodeFromSuccessors(BasicBlock *BB,
         // The debug location is an integral part of a debug info intrinsic
         // and can't be separated from it or replaced.  Instead of attempting
         // to merge locations, simply hoist both copies of the intrinsic.
-        hoistLockstepIdenticalDPValues(TI, I1, OtherInsts);
-        // We've just hoisted DPValues; move I1 after them (before TI) and
-        // leave any that were not hoisted behind (by calling moveBefore
+        hoistLockstepIdenticalDbgVariableRecords(TI, I1, OtherInsts);
+        // We've just hoisted DbgVariableRecords; move I1 after them (before TI)
+        // and leave any that were not hoisted behind (by calling moveBefore
         // rather than moveBeforePreserving).
         I1->moveBefore(TI);
         for (auto &SuccIter : OtherSuccIterRange) {
@@ -1735,9 +1736,9 @@ bool SimplifyCFGOpt::hoistCommonCodeFromSuccessors(BasicBlock *BB,
         // For a normal instruction, we just move one to right before the
         // branch, then replace all uses of the other with the first.  Finally,
         // we remove the now redundant second instruction.
-        hoistLockstepIdenticalDPValues(TI, I1, OtherInsts);
-        // We've just hoisted DPValues; move I1 after them (before TI) and
-        // leave any that were not hoisted behind (by calling moveBefore
+        hoistLockstepIdenticalDbgVariableRecords(TI, I1, OtherInsts);
+        // We've just hoisted DbgVariableRecords; move I1 after them (before TI)
+        // and leave any that were not hoisted behind (by calling moveBefore
         // rather than moveBeforePreserving).
         I1->moveBefore(TI);
         for (auto &SuccIter : OtherSuccIterRange) {
@@ -1759,7 +1760,7 @@ bool SimplifyCFGOpt::hoistCommonCodeFromSuccessors(BasicBlock *BB,
       NumHoistCommonInstrs += SuccIterPairs.size();
     } else {
       if (NumSkipped >= HoistCommonSkipLimit) {
-        hoistLockstepIdenticalDPValues(TI, I1, OtherInsts);
+        hoistLockstepIdenticalDbgVariableRecords(TI, I1, OtherInsts);
         return Changed;
       }
       // We are about to skip over a pair of non-identical instructions. Record
@@ -1822,9 +1823,9 @@ bool SimplifyCFGOpt::hoistSuccIdenticalTerminatorToSwitchOrIf(
     }
   }
 
-  // Hoist DPValues attached to the terminator to match dbg.* intrinsic hoisting
-  // behaviour in hoistCommonCodeFromSuccessors.
-  hoistLockstepIdenticalDPValues(TI, I1, OtherSuccTIs);
+  // Hoist DbgVariableRecords attached to the terminator to match dbg.*
+  // intrinsic hoisting behaviour in hoistCommonCodeFromSuccessors.
+  hoistLockstepIdenticalDbgVariableRecords(TI, I1, OtherSuccTIs);
   // Clone the terminator and hoist it into the pred, without any debug info.
   Instruction *NT = I1->clone();
   NT->insertInto(TIParent, TI->getIterator());
@@ -3179,7 +3180,7 @@ bool SimplifyCFGOpt::SpeculativelyExecuteBB(BranchInst *BI,
         DbgAssign->replaceVariableLocationOp(OrigV, S);
     };
     for_each(at::getAssignmentMarkers(SpeculatedStore), replaceVariable);
-    for_each(at::getDPVAssignmentMarkers(SpeculatedStore), replaceVariable);
+    for_each(at::getDVRAssignmentMarkers(SpeculatedStore), replaceVariable);
   }
 
   // Metadata can be dependent on the condition we are hoisting above.
@@ -3204,13 +3205,15 @@ bool SimplifyCFGOpt::SpeculativelyExecuteBB(BranchInst *BI,
   }
 
   // Hoist the instructions.
-  // In "RemoveDIs" non-instr debug-info mode, drop DPValues attached to these
-  // instructions, in the same way that dbg.value intrinsics are dropped at the
-  // end of this block.
+  // In "RemoveDIs" non-instr debug-info mode, drop DbgVariableRecords attached
+  // to these instructions, in the same way that dbg.value intrinsics are
+  // dropped at the end of this block.
   for (auto &It : make_range(ThenBB->begin(), ThenBB->end()))
     for (DbgRecord &DR : make_early_inc_range(It.getDbgRecordRange()))
-      // Drop all records except assign-kind DPValues (dbg.assign equivalent).
-      if (DPValue *DPV = dyn_cast<DPValue>(&DR); !DPV || !DPV->isDbgAssign())
+      // Drop all records except assign-kind DbgVariableRecords (dbg.assign
+      // equivalent).
+      if (DbgVariableRecord *DVR = dyn_cast<DbgVariableRecord>(&DR);
+          !DVR || !DVR->isDbgAssign())
         It.dropOneDbgRecord(&DR);
   BB->splice(BI->getIterator(), ThenBB, ThenBB->begin(),
              std::prev(ThenBB->end()));
@@ -3385,7 +3388,7 @@ FoldCondBranchOnValueKnownInPredecessorImpl(BranchInst *BI, DomTreeUpdater *DTU,
     TranslateMap[Cond] = CB;
 
     // RemoveDIs: track instructions that we optimise away while folding, so
-    // that we can copy DPValues from them later.
+    // that we can copy DbgVariableRecords from them later.
     BasicBlock::iterator SrcDbgCursor = BB->begin();
     for (BasicBlock::iterator BBI = BB->begin(); &*BBI != BI; ++BBI) {
       if (PHINode *PN = dyn_cast<PHINode>(BBI)) {
@@ -3849,10 +3852,10 @@ static bool performBranchToCommonDestFolding(BranchInst *BI, BranchInst *PBI,
 
   if (PredBlock->IsNewDbgInfoFormat) {
     PredBlock->getTerminator()->cloneDebugInfoFrom(BB->getTerminator());
-    for (DPValue &DPV :
+    for (DbgVariableRecord &DVR :
          filterDbgVars(PredBlock->getTerminator()->getDbgRecordRange())) {
-      RemapDPValue(M, &DPV, VMap,
-                   RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
+      RemapDbgVariableRecord(M, &DVR, VMap,
+                             RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
     }
   }
 
diff --git a/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp b/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp
index b8fa985fa3462e..f6d72265305d7d 100644
--- a/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp
@@ -1381,6 +1381,77 @@ const SCEV *WidenIV::getSCEVByOpCode(const SCEV *LHS, const SCEV *RHS,
   };
 }
 
+namespace {
+
+// Represents a interesting integer binary operation for
+// getExtendedOperandRecurrence. This may be a shl that is being treated as a
+// multiply or a 'or disjoint' that is being treated as 'add nsw nuw'.
+struct BinaryOp {
+  unsigned Opcode;
+  std::array<Value *, 2> Operands;
+  bool IsNSW = false;
+  bool IsNUW = false;
+
+  explicit BinaryOp(Instruction *Op)
+      : Opcode(Op->getOpcode()),
+        Operands({Op->getOperand(0), Op->getOperand(1)}) {
+    if (auto *OBO = dyn_cast<OverflowingBinaryOperator>(Op)) {
+      IsNSW = OBO->hasNoSignedWrap();
+      IsNUW = OBO->hasNoUnsignedWrap();
+    }
+  }
+
+  explicit BinaryOp(Instruction::BinaryOps Opcode, Value *LHS, Value *RHS,
+                    bool IsNSW = false, bool IsNUW = false)
+      : Opcode(Opcode), Operands({LHS, RHS}), IsNSW(IsNSW), IsNUW(IsNUW) {}
+};
+
+} // end anonymous namespace
+
+static std::optional<BinaryOp> matchBinaryOp(Instruction *Op) {
+  switch (Op->getOpcode()) {
+  case Instruction::Add:
+  case Instruction::Sub:
+  case Instruction::Mul:
+    return BinaryOp(Op);
+  case Instruction::Or: {
+    // Convert or disjoint into add nuw nsw.
+    if (cast<PossiblyDisjointInst>(Op)->isDisjoint())
+      return BinaryOp(Instruction::Add, Op->getOperand(0), Op->getOperand(1),
+                      /*IsNSW=*/true, /*IsNUW=*/true);
+    break;
+  }
+  case Instruction::Shl: {
+    if (ConstantInt *SA = dyn_cast<ConstantInt>(Op->getOperand(1))) {
+      unsigned BitWidth = cast<IntegerType>(SA->getType())->getBitWidth();
+
+      // If the shift count is not less than the bitwidth, the result of
+      // the shift is undefined. Don't try to analyze it, because the
+      // resolution chosen here may differ from the resolution chosen in
+      // other parts of the compiler.
+      if (SA->getValue().ult(BitWidth)) {
+        // We can safely preserve the nuw flag in all cases. It's also safe to
+        // turn a nuw nsw shl into a nuw nsw mul. However, nsw in isolation
+        // requires special handling. It can be preserved as long as we're not
+        // left shifting by bitwidth - 1.
+        bool IsNUW = Op->hasNoUnsignedWrap();
+        bool IsNSW = Op->hasNoSignedWrap() &&
+                     (IsNUW || SA->getValue().ult(BitWidth - 1));
+
+        ConstantInt *X =
+            ConstantInt::get(Op->getContext(),
+                             APInt::getOneBitSet(BitWidth, SA->getZExtValue()));
+        return BinaryOp(Instruction::Mul, Op->getOperand(0), X, IsNSW, IsNUW);
+      }
+    }
+
+    break;
+  }
+  }
+
+  return std::nullopt;
+}
+
 /// No-wrap operations can transfer sign extension of their result to their
 /// operands. Generate the SCEV value for the widened operation without
 /// actually modifying the IR yet. If the expression after extending the
@@ -1388,24 +1459,22 @@ const SCEV *WidenIV::getSCEVByOpCode(const SCEV *LHS, const SCEV *RHS,
 /// extension used.
 WidenIV::WidenedRecTy
 WidenIV::getExtendedOperandRecurrence(WidenIV::NarrowIVDefUse DU) {
-  // Handle the common case of add<nsw/nuw>
-  const unsigned OpCode = DU.NarrowUse->getOpcode();
-  // Only Add/Sub/Mul instructions supported yet.
-  if (OpCode != Instruction::Add && OpCode != Instruction::Sub &&
-      OpCode != Instruction::Mul)
+  auto Op = matchBinaryOp(DU.NarrowUse);
+  if (!Op)
     return {nullptr, ExtendKind::Unknown};
 
+  assert((Op->Opcode == Instruction::Add || Op->Opcode == Instruction::Sub ||
+          Op->Opcode == Instruction::Mul) &&
+         "Unexpected opcode");
+
   // One operand (NarrowDef) has already been extended to WideDef. Now determine
   // if extending the other will lead to a recurrence.
-  const unsigned ExtendOperIdx =
-      DU.NarrowUse->getOperand(0) == DU.NarrowDef ? 1 : 0;
-  assert(DU.NarrowUse->getOperand(1-ExtendOperIdx) == DU.NarrowDef && "bad DU");
+  const unsigned ExtendOperIdx = Op->Operands[0] == DU.NarrowDef ? 1 : 0;
+  assert(Op->Operands[1 - ExtendOperIdx] == DU.NarrowDef && "bad DU");
 
-  const OverflowingBinaryOperator *OBO =
-    cast<OverflowingBinaryOperator>(DU.NarrowUse);
   ExtendKind ExtKind = getExtendKind(DU.NarrowDef);
-  if (!(ExtKind == ExtendKind::Sign && OBO->hasNoSignedWrap()) &&
-      !(ExtKind == ExtendKind::Zero && OBO->hasNoUnsignedWrap())) {
+  if (!(ExtKind == ExtendKind::Sign && Op->IsNSW) &&
+      !(ExtKind == ExtendKind::Zero && Op->IsNUW)) {
     ExtKind = ExtendKind::Unknown;
 
     // For a non-negative NarrowDef, we can choose either type of
@@ -1413,16 +1482,15 @@ WidenIV::getExtendedOperandRecurrence(WidenIV::NarrowIVDefUse DU) {
     // (see above), and we only hit this code if we need to check
     // the opposite case.
     if (DU.NeverNegative) {
-      if (OBO->hasNoSignedWrap()) {
+      if (Op->IsNSW) {
         ExtKind = ExtendKind::Sign;
-      } else if (OBO->hasNoUnsignedWrap()) {
+      } else if (Op->IsNUW) {
         ExtKind = ExtendKind::Zero;
       }
     }
   }
 
-  const SCEV *ExtendOperExpr =
-      SE->getSCEV(DU.NarrowUse->getOperand(ExtendOperIdx));
+  const SCEV *ExtendOperExpr = SE->getSCEV(Op->Operands[ExtendOperIdx]);
   if (ExtKind == ExtendKind::Sign)
     ExtendOperExpr = SE->getSignExtendExpr(ExtendOperExpr, WideType);
   else if (ExtKind == ExtendKind::Zero)
@@ -1443,7 +1511,7 @@ WidenIV::getExtendedOperandRecurrence(WidenIV::NarrowIVDefUse DU) {
   if (ExtendOperIdx == 0)
     std::swap(lhs, rhs);
   const SCEVAddRecExpr *AddRec =
-      dyn_cast<SCEVAddRecExpr>(getSCEVByOpCode(lhs, rhs, OpCode));
+      dyn_cast<SCEVAddRecExpr>(getSCEVByOpCode(lhs, rhs, Op->Opcode));
 
   if (!AddRec || AddRec->getLoop() != L)
     return {nullptr, ExtendKind::Unknown};
diff --git a/llvm/lib/Transforms/Utils/ValueMapper.cpp b/llvm/lib/Transforms/Utils/ValueMapper.cpp
index d0377421145cbc..8c24599c8ce37d 100644
--- a/llvm/lib/Transforms/Utils/ValueMapper.cpp
+++ b/llvm/lib/Transforms/Utils/ValueMapper.cpp
@@ -146,7 +146,7 @@ class Mapper {
   Value *mapValue(const Value *V);
   void remapInstruction(Instruction *I);
   void remapFunction(Function &F);
-  void remapDbgRecord(DbgRecord &DPV);
+  void remapDbgRecord(DbgRecord &DVR);
 
   Constant *mapConstant(const Constant *C) {
     return cast_or_null<Constant>(mapValue(C));
@@ -543,7 +543,7 @@ void Mapper::remapDbgRecord(DbgRecord &DR) {
     return;
   }
 
-  DPValue &V = cast<DPValue>(DR);
+  DbgVariableRecord &V = cast<DbgVariableRecord>(DR);
   // Remap variables and DILocations.
   auto *MappedVar = mapMetadata(V.getVariable());
   auto *MappedDILoc = mapMetadata(V.getDebugLoc());
@@ -1233,14 +1233,14 @@ void ValueMapper::remapInstruction(Instruction &I) {
   FlushingMapper(pImpl)->remapInstruction(&I);
 }
 
-void ValueMapper::remapDPValue(Module *M, DPValue &V) {
+void ValueMapper::remapDbgVariableRecord(Module *M, DbgVariableRecord &V) {
   FlushingMapper(pImpl)->remapDbgRecord(V);
 }
 
-void ValueMapper::remapDPValueRange(
+void ValueMapper::remapDbgVariableRecordRange(
     Module *M, iterator_range<DbgRecord::self_iterator> Range) {
-  for (DPValue &DPV : filterDbgVars(Range)) {
-    remapDPValue(M, DPV);
+  for (DbgVariableRecord &DVR : filterDbgVars(Range)) {
+    remapDbgVariableRecord(M, DVR);
   }
 }
 
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index 9a4b0baf9c15d8..5d03b66b0ce335 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -68,7 +68,9 @@ class VPBuilder {
 public:
   VPBuilder() = default;
   VPBuilder(VPBasicBlock *InsertBB) { setInsertPoint(InsertBB); }
-  VPBuilder(VPRecipeBase *InsertPt) { setInsertPoint(InsertPt); }
+  VPBuilder(VPRecipeBase *InsertPt) {
+    setInsertPoint(InsertPt->getParent(), InsertPt->getIterator());
+  }
 
   /// Clear the insertion point: created instructions will not be inserted into
   /// a block.
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 96933479edc654..2163930b02c1cc 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -6911,25 +6911,10 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
       Op2Info.Kind = TargetTransformInfo::OK_UniformValue;
 
     SmallVector<const Value *, 4> Operands(I->operand_values());
-    auto InstrCost = TTI.getArithmeticInstrCost(
+    return TTI.getArithmeticInstrCost(
         I->getOpcode(), VectorTy, CostKind,
         {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
-        Op2Info, Operands, I);
-
-    // Some targets can replace frem with vector library calls.
-    InstructionCost VecCallCost = InstructionCost::getInvalid();
-    if (I->getOpcode() == Instruction::FRem) {
-      LibFunc Func;
-      if (TLI->getLibFunc(I->getOpcode(), I->getType(), Func) &&
-          TLI->isFunctionVectorizable(TLI->getName(Func), VF)) {
-        SmallVector<Type *, 4> OpTypes;
-        for (auto &Op : I->operands())
-          OpTypes.push_back(Op->getType());
-        VecCallCost =
-            TTI.getCallInstrCost(nullptr, VectorTy, OpTypes, CostKind);
-      }
-    }
-    return std::min(InstrCost, VecCallCost);
+        Op2Info, Operands, I, TLI);
   }
   case Instruction::FNeg: {
     return TTI.getArithmeticInstrCost(
@@ -7405,6 +7390,7 @@ static void createAndCollectMergePhiForReduction(
   auto *PhiR = cast<VPReductionPHIRecipe>(RedResult->getOperand(0));
   const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
 
+  TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
   Value *FinalValue =
       State.get(RedResult, VPIteration(State.UF - 1, VPLane::getFirstLane()));
   auto *ResumePhi =
@@ -7429,7 +7415,7 @@ static void createAndCollectMergePhiForReduction(
       BCBlockPhi->addIncoming(ResumePhi->getIncomingValueForBlock(Incoming),
                               Incoming);
     else
-      BCBlockPhi->addIncoming(RdxDesc.getRecurrenceStartValue(), Incoming);
+      BCBlockPhi->addIncoming(ReductionStartValue, Incoming);
   }
 
   auto *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
@@ -7912,8 +7898,7 @@ void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF,
   }
 }
 
-VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
-                                         VPlan &Plan) {
+VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) {
   assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
 
   // Look for cached value.
@@ -7968,7 +7953,7 @@ VPValue *VPRecipeBuilder::getEdgeMask(BasicBlock *Src, BasicBlock *Dst) const {
   return ECEntryIt->second;
 }
 
-void VPRecipeBuilder::createHeaderMask(VPlan &Plan) {
+void VPRecipeBuilder::createHeaderMask() {
   BasicBlock *Header = OrigLoop->getHeader();
 
   // When not folding the tail, use nullptr to model all-true mask.
@@ -8003,7 +7988,7 @@ VPValue *VPRecipeBuilder::getBlockInMask(BasicBlock *BB) const {
   return BCEntryIt->second;
 }
 
-void VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlan &Plan) {
+void VPRecipeBuilder::createBlockInMask(BasicBlock *BB) {
   assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
   assert(BlockMaskCache.count(BB) == 0 && "Mask for block already computed");
   assert(OrigLoop->getHeader() != BB &&
@@ -8014,7 +7999,7 @@ void VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlan &Plan) {
   VPValue *BlockMask = nullptr;
   // This is the block mask. We OR all incoming edges.
   for (auto *Predecessor : predecessors(BB)) {
-    VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
+    VPValue *EdgeMask = createEdgeMask(Predecessor, BB);
     if (!EdgeMask) { // Mask of predecessor is all-one so mask of block is too.
       BlockMaskCache[BB] = EdgeMask;
       return;
@@ -8033,7 +8018,7 @@ void VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlan &Plan) {
 
 VPWidenMemoryInstructionRecipe *
 VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
-                                  VFRange &Range, VPlanPtr &Plan) {
+                                  VFRange &Range) {
   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
          "Must be called with either a load or store");
 
@@ -8106,7 +8091,7 @@ createWidenInductionRecipes(PHINode *Phi, Instruction *PhiOrTrunc,
 }
 
 VPHeaderPHIRecipe *VPRecipeBuilder::tryToOptimizeInductionPHI(
-    PHINode *Phi, ArrayRef<VPValue *> Operands, VPlan &Plan, VFRange &Range) {
+    PHINode *Phi, ArrayRef<VPValue *> Operands, VFRange &Range) {
 
   // Check if this is an integer or fp induction. If so, build the recipe that
   // produces its scalar and vector values.
@@ -8130,7 +8115,7 @@ VPHeaderPHIRecipe *VPRecipeBuilder::tryToOptimizeInductionPHI(
 }
 
 VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate(
-    TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range, VPlan &Plan) {
+    TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range) {
   // Optimize the special case where the source is a constant integer
   // induction variable. Notice that we can only optimize the 'trunc' case
   // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
@@ -8158,8 +8143,7 @@ VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate(
 }
 
 VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi,
-                                           ArrayRef<VPValue *> Operands,
-                                           VPlanPtr &Plan) {
+                                           ArrayRef<VPValue *> Operands) {
   unsigned NumIncoming = Phi->getNumIncomingValues();
 
   // We know that all PHIs in non-header blocks are converted into selects, so
@@ -8172,7 +8156,7 @@ VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi,
   for (unsigned In = 0; In < NumIncoming; In++) {
     OperandsWithMask.push_back(Operands[In]);
     VPValue *EdgeMask =
-        createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), *Plan);
+        createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent());
     if (!EdgeMask) {
       assert(In == 0 && "Both null and non-null edge masks found");
       assert(all_equal(Operands) &&
@@ -8186,8 +8170,7 @@ VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi,
 
 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
                                                    ArrayRef<VPValue *> Operands,
-                                                   VFRange &Range,
-                                                   VPlanPtr &Plan) {
+                                                   VFRange &Range) {
   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
       [this, CI](ElementCount VF) {
         return CM.isScalarWithPredication(CI, VF);
@@ -8262,7 +8245,7 @@ VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
       if (Legal->isMaskRequired(CI))
         Mask = getBlockInMask(CI->getParent());
       else
-        Mask = Plan->getVPValueOrAddLiveIn(ConstantInt::getTrue(
+        Mask = Plan.getVPValueOrAddLiveIn(ConstantInt::getTrue(
             IntegerType::getInt1Ty(Variant->getFunctionType()->getContext())));
 
       Ops.insert(Ops.begin() + *MaskPos, Mask);
@@ -8292,7 +8275,7 @@ bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
 
 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
                                            ArrayRef<VPValue *> Operands,
-                                           VPBasicBlock *VPBB, VPlanPtr &Plan) {
+                                           VPBasicBlock *VPBB) {
   switch (I->getOpcode()) {
   default:
     return nullptr;
@@ -8305,8 +8288,8 @@ VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
     if (CM.isPredicatedInst(I)) {
       SmallVector<VPValue *> Ops(Operands.begin(), Operands.end());
       VPValue *Mask = getBlockInMask(I->getParent());
-      VPValue *One = Plan->getVPValueOrAddLiveIn(
-          ConstantInt::get(I->getType(), 1u, false));
+      VPValue *One =
+          Plan.getVPValueOrAddLiveIn(ConstantInt::get(I->getType(), 1u, false));
       auto *SafeRHS =
          new VPInstruction(Instruction::Select, {Mask, Ops[1], One},
                            I->getDebugLoc());
@@ -8350,8 +8333,7 @@ void VPRecipeBuilder::fixHeaderPhis() {
 }
 
 VPReplicateRecipe *VPRecipeBuilder::handleReplication(Instruction *I,
-                                                      VFRange &Range,
-                                                      VPlan &Plan) {
+                                                      VFRange &Range) {
   bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
       [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
       Range);
@@ -8406,21 +8388,22 @@ VPReplicateRecipe *VPRecipeBuilder::handleReplication(Instruction *I,
   return Recipe;
 }
 
-VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(
-    Instruction *Instr, ArrayRef<VPValue *> Operands, VFRange &Range,
-    VPBasicBlock *VPBB, VPlanPtr &Plan) {
+VPRecipeBase *
+VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
+                                        ArrayRef<VPValue *> Operands,
+                                        VFRange &Range, VPBasicBlock *VPBB) {
   // First, check for specific widening recipes that deal with inductions, Phi
   // nodes, calls and memory operations.
   VPRecipeBase *Recipe;
   if (auto Phi = dyn_cast<PHINode>(Instr)) {
     if (Phi->getParent() != OrigLoop->getHeader())
-      return tryToBlend(Phi, Operands, Plan);
+      return tryToBlend(Phi, Operands);
 
     // Always record recipes for header phis. Later first-order recurrence phis
     // can have earlier phis as incoming values.
     recordRecipeOf(Phi);
 
-    if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, *Plan, Range)))
+    if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, Range)))
       return Recipe;
 
     VPHeaderPHIRecipe *PhiRecipe = nullptr;
@@ -8456,9 +8439,8 @@ VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(
     return PhiRecipe;
   }
 
-  if (isa<TruncInst>(Instr) &&
-      (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Operands,
-                                               Range, *Plan)))
+  if (isa<TruncInst>(Instr) && (Recipe = tryToOptimizeInductionTruncate(
+                                    cast<TruncInst>(Instr), Operands, Range)))
     return Recipe;
 
   // All widen recipes below deal only with VF > 1.
@@ -8467,10 +8449,10 @@ VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(
     return nullptr;
 
   if (auto *CI = dyn_cast<CallInst>(Instr))
-    return tryToWidenCall(CI, Operands, Range, Plan);
+    return tryToWidenCall(CI, Operands, Range);
 
   if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
-    return tryToWidenMemory(Instr, Operands, Range, Plan);
+    return tryToWidenMemory(Instr, Operands, Range);
 
   if (!shouldWiden(Instr, Range))
     return nullptr;
@@ -8489,7 +8471,7 @@ VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(
                                  *CI);
   }
 
-  return tryToWiden(Instr, Operands, VPBB, Plan);
+  return tryToWiden(Instr, Operands, VPBB);
 }
 
 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
@@ -8561,37 +8543,6 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
 
   SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
 
-  VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);
-
-  // ---------------------------------------------------------------------------
-  // Pre-construction: record ingredients whose recipes we'll need to further
-  // process after constructing the initial VPlan.
-  // ---------------------------------------------------------------------------
-
-  // For each interleave group which is relevant for this (possibly trimmed)
-  // Range, add it to the set of groups to be later applied to the VPlan and add
-  // placeholders for its members' Recipes which we'll be replacing with a
-  // single VPInterleaveRecipe.
-  for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
-    auto applyIG = [IG, this](ElementCount VF) -> bool {
-      bool Result = (VF.isVector() && // Query is illegal for VF == 1
-                     CM.getWideningDecision(IG->getInsertPos(), VF) ==
-                         LoopVectorizationCostModel::CM_Interleave);
-      // For scalable vectors, the only interleave factor currently supported
-      // is 2 since we require the (de)interleave2 intrinsics instead of
-      // shufflevectors.
-      assert((!Result || !VF.isScalable() || IG->getFactor() == 2) &&
-             "Unsupported interleave factor for scalable vectors");
-      return Result;
-    };
-    if (!getDecisionAndClampRange(applyIG, Range))
-      continue;
-    InterleaveGroups.insert(IG);
-    for (unsigned i = 0; i < IG->getFactor(); i++)
-      if (Instruction *Member = IG->getMember(i))
-        RecipeBuilder.recordRecipeOf(Member);
-  };
-
   // ---------------------------------------------------------------------------
   // Build initial VPlan: Scan the body of the loop in a topological order to
   // visit each basic block after having visited its predecessor basic blocks.
@@ -8626,6 +8577,41 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
   bool HasNUW = Style == TailFoldingStyle::None;
   addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW, DL);
 
+  VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, Legal, CM, PSE, Builder);
+
+  // ---------------------------------------------------------------------------
+  // Pre-construction: record ingredients whose recipes we'll need to further
+  // process after constructing the initial VPlan.
+  // ---------------------------------------------------------------------------
+
+  // For each interleave group which is relevant for this (possibly trimmed)
+  // Range, add it to the set of groups to be later applied to the VPlan and add
+  // placeholders for its members' Recipes which we'll be replacing with a
+  // single VPInterleaveRecipe.
+  for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
+    auto applyIG = [IG, this](ElementCount VF) -> bool {
+      bool Result = (VF.isVector() && // Query is illegal for VF == 1
+                     CM.getWideningDecision(IG->getInsertPos(), VF) ==
+                         LoopVectorizationCostModel::CM_Interleave);
+      // For scalable vectors, the only interleave factor currently supported
+      // is 2 since we require the (de)interleave2 intrinsics instead of
+      // shufflevectors.
+      assert((!Result || !VF.isScalable() || IG->getFactor() == 2) &&
+             "Unsupported interleave factor for scalable vectors");
+      return Result;
+    };
+    if (!getDecisionAndClampRange(applyIG, Range))
+      continue;
+    InterleaveGroups.insert(IG);
+    for (unsigned i = 0; i < IG->getFactor(); i++)
+      if (Instruction *Member = IG->getMember(i))
+        RecipeBuilder.recordRecipeOf(Member);
+  };
+
+  // ---------------------------------------------------------------------------
+  // Construct recipes for the instructions in the loop
+  // ---------------------------------------------------------------------------
+
   // Scan the body of the loop in a topological order to visit each basic block
   // after having visited its predecessor basic blocks.
   LoopBlocksDFS DFS(OrigLoop);
@@ -8647,9 +8633,9 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
     Builder.setInsertPoint(VPBB);
 
     if (VPBB == HeaderVPBB)
-      RecipeBuilder.createHeaderMask(*Plan);
+      RecipeBuilder.createHeaderMask();
     else if (NeedsMasks)
-      RecipeBuilder.createBlockInMask(BB, *Plan);
+      RecipeBuilder.createBlockInMask(BB);
 
     // Introduce each ingredient into VPlan.
     // TODO: Model and preserve debug intrinsics in VPlan.
@@ -8672,10 +8658,10 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
           Legal->isInvariantAddressOfReduction(SI->getPointerOperand()))
         continue;
 
-      VPRecipeBase *Recipe = RecipeBuilder.tryToCreateWidenRecipe(
-          Instr, Operands, Range, VPBB, Plan);
+      VPRecipeBase *Recipe =
+          RecipeBuilder.tryToCreateWidenRecipe(Instr, Operands, Range, VPBB);
       if (!Recipe)
-        Recipe = RecipeBuilder.handleReplication(Instr, Range, *Plan);
+        Recipe = RecipeBuilder.handleReplication(Instr, Range);
       for (auto *Def : Recipe->definedValues()) {
         auto *UV = Def->getUnderlyingValue();
         Plan->addVPValue(UV, Def);
@@ -8853,10 +8839,6 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
 // A ComputeReductionResult recipe is added to the middle block, also for
 // in-loop reductions which compute their result in-loop, because generating
 // the subsequent bc.merge.rdx phi is driven by ComputeReductionResult recipes.
-//
-// Adjust AnyOf reductions; replace the reduction phi for the selected value
-// with a boolean reduction phi node to check if the condition is true in any
-// iteration. The final value is selected by the final ComputeReductionResult.
 void LoopVectorizationPlanner::adjustRecipesForReductions(
     VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder,
     ElementCount MinVF) {
@@ -9030,41 +9012,6 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
       continue;
 
     const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
-    // Adjust AnyOf reductions; replace the reduction phi for the selected value
-    // with a boolean reduction phi node to check if the condition is true in
-    // any iteration. The final value is selected by the final
-    // ComputeReductionResult.
-    if (RecurrenceDescriptor::isAnyOfRecurrenceKind(
-            RdxDesc.getRecurrenceKind())) {
-      auto *Select = cast<VPRecipeBase>(*find_if(PhiR->users(), [](VPUser *U) {
-        return isa<VPWidenSelectRecipe>(U) ||
-               (isa<VPReplicateRecipe>(U) &&
-                cast<VPReplicateRecipe>(U)->getUnderlyingInstr()->getOpcode() ==
-                    Instruction::Select);
-      }));
-      VPValue *Cmp = Select->getOperand(0);
-      // If the compare is checking the reduction PHI node, adjust it to check
-      // the start value.
-      if (VPRecipeBase *CmpR = Cmp->getDefiningRecipe()) {
-        for (unsigned I = 0; I != CmpR->getNumOperands(); ++I)
-          if (CmpR->getOperand(I) == PhiR)
-            CmpR->setOperand(I, PhiR->getStartValue());
-      }
-      VPBuilder::InsertPointGuard Guard(Builder);
-      Builder.setInsertPoint(Select);
-
-      // If the true value of the select is the reduction phi, the new value is
-      // selected if the negated condition is true in any iteration.
-      if (Select->getOperand(1) == PhiR)
-        Cmp = Builder.createNot(Cmp);
-      VPValue *Or = Builder.createOr(PhiR, Cmp);
-      Select->getVPSingleValue()->replaceAllUsesWith(Or);
-
-      // Convert the reduction phi to operate on bools.
-      PhiR->setOperand(0, Plan->getVPValueOrAddLiveIn(ConstantInt::getFalse(
-                              OrigLoop->getHeader()->getContext())));
-    }
-
     // If tail is folded by masking, introduce selects between the phi
     // and the live-out instruction of each reduction, at the beginning of the
     // dedicated latch block.
@@ -9097,9 +9044,7 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
     // then extend the loop exit value to enable InstCombine to evaluate the
     // entire expression in the smaller type.
     Type *PhiTy = PhiR->getStartValue()->getLiveInIRValue()->getType();
-    if (MinVF.isVector() && PhiTy != RdxDesc.getRecurrenceType() &&
-        !RecurrenceDescriptor::isAnyOfRecurrenceKind(
-            RdxDesc.getRecurrenceKind())) {
+    if (MinVF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
       assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!");
       Type *RdxTy = RdxDesc.getRecurrenceType();
       auto *Trunc =
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 739dae3bdd0cff..ac16fcf767459a 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1001,8 +1001,9 @@ class BoUpSLP {
           TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li,
           DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB,
           const DataLayout *DL, OptimizationRemarkEmitter *ORE)
-      : BatchAA(*Aa), F(Func), SE(Se), TTI(Tti), TLI(TLi), LI(Li),
-        DT(Dt), AC(AC), DB(DB), DL(DL), ORE(ORE), Builder(Se->getContext()) {
+      : BatchAA(*Aa), F(Func), SE(Se), TTI(Tti), TLI(TLi), LI(Li), DT(Dt),
+        AC(AC), DB(DB), DL(DL), ORE(ORE),
+        Builder(Se->getContext(), TargetFolder(*DL)) {
     CodeMetrics::collectEphemeralValues(F, AC, EphValues);
     // Use the vector register size specified by the target unless overridden
     // by a command-line option.
@@ -1085,6 +1086,9 @@ class BoUpSLP {
       BS->clear();
     }
     MinBWs.clear();
+    ReductionBitWidth = 0;
+    CastMaxMinBWSizes.reset();
+    TruncNodes.clear();
     InstrElementSize.clear();
     UserIgnoreList = nullptr;
     PostponedGathers.clear();
@@ -2287,6 +2291,7 @@ class BoUpSLP {
   void clearReductionData() {
     AnalyzedReductionsRoots.clear();
     AnalyzedReductionVals.clear();
+    AnalyzedMinBWVals.clear();
   }
   /// Checks if the given value is gathered in one of the nodes.
   bool isAnyGathered(const SmallDenseSet<Value *> &Vals) const {
@@ -2307,9 +2312,11 @@ class BoUpSLP {
   /// constant and to be demoted. Required to correctly identify constant nodes
   /// to be demoted.
   bool collectValuesToDemote(
-      Value *V, SmallVectorImpl<Value *> &ToDemote,
+      Value *V, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
+      SmallVectorImpl<Value *> &ToDemote,
       DenseMap<Instruction *, SmallVector<unsigned>> &DemotedConsts,
-      SmallVectorImpl<Value *> &Roots, DenseSet<Value *> &Visited) const;
+      DenseSet<Value *> &Visited, unsigned &MaxDepthLevel,
+      bool &IsProfitableToDemote, bool IsTruncRoot) const;
 
   /// Check if the operands on the edges \p Edges of the \p UserTE allows
   /// reordering (i.e. the operands can be reordered because they have only one
@@ -2375,6 +2382,10 @@ class BoUpSLP {
   /// \ returns the graph entry for the \p Idx operand of the \p E entry.
   const TreeEntry *getOperandEntry(const TreeEntry *E, unsigned Idx) const;
 
+  /// \returns Cast context for the given graph node.
+  TargetTransformInfo::CastContextHint
+  getCastContextHint(const TreeEntry &TE) const;
+
   /// \returns the cost of the vectorizable entry.
   InstructionCost getEntryCost(const TreeEntry *E,
                                ArrayRef<Value *> VectorizedVals,
@@ -2925,11 +2936,18 @@ class BoUpSLP {
       }
       assert(!BundleMember && "Bundle and VL out of sync");
     } else {
-      MustGather.insert(VL.begin(), VL.end());
       // Build a map for gathered scalars to the nodes where they are used.
+      bool AllConstsOrCasts = true;
       for (Value *V : VL)
-        if (!isConstant(V))
+        if (!isConstant(V)) {
+          auto *I = dyn_cast<CastInst>(V);
+          AllConstsOrCasts &= I && I->getType()->isIntegerTy();
           ValueToGatherNodes.try_emplace(V).first->getSecond().insert(Last);
+        }
+      if (AllConstsOrCasts)
+        CastMaxMinBWSizes =
+            std::make_pair(std::numeric_limits<unsigned>::max(), 1);
+      MustGather.insert(VL.begin(), VL.end());
     }
 
     if (UserTreeIdx.UserTE)
@@ -3054,6 +3072,10 @@ class BoUpSLP {
   /// Set of hashes for the list of reduction values already being analyzed.
   DenseSet<size_t> AnalyzedReductionVals;
 
+  /// Values, already been analyzed for mininmal bitwidth and found to be
+  /// non-profitable.
+  DenseSet<Value *> AnalyzedMinBWVals;
+
   /// A list of values that need to extracted out of the tree.
   /// This list holds pairs of (Internal Scalar : External User). External User
   /// can be nullptr, it means that this Internal Scalar will be used later,
@@ -3621,7 +3643,7 @@ class BoUpSLP {
   unsigned MinVecRegSize; // Set by cl::opt (default: 128).
 
   /// Instruction builder to construct the vectorized tree.
-  IRBuilder<> Builder;
+  IRBuilder<TargetFolder> Builder;
 
   /// A map of scalar integer values to the smallest bit width with which they
   /// can legally be represented. The values map to (width, signed) pairs,
@@ -3629,6 +3651,18 @@ class BoUpSLP {
   /// value must be signed-extended, rather than zero-extended, back to its
   /// original width.
   DenseMap<const TreeEntry *, std::pair<uint64_t, bool>> MinBWs;
+
+  /// Final size of the reduced vector, if the current graph represents the
+  /// input for the reduction and it was possible to narrow the size of the
+  /// reduction.
+  unsigned ReductionBitWidth = 0;
+
+  /// If the tree contains any zext/sext/trunc nodes, contains max-min pair of
+  /// type sizes, used in the tree.
+  std::optional<std::pair<unsigned, unsigned>> CastMaxMinBWSizes;
+
+  /// Indices of the vectorized trunc nodes.
+  DenseSet<unsigned> TruncNodes;
 };
 
 } // end namespace slpvectorizer
@@ -4295,9 +4329,12 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
               llvm_unreachable(
                   "Expected only consecutive, strided or masked gather loads.");
             }
+            SmallVector<int> ShuffleMask(VL.size());
+            for (int Idx : seq<int>(0, VL.size()))
+              ShuffleMask[Idx] = Idx / VF == I ? VL.size() + Idx % VF : Idx;
             VecLdCost +=
                 TTI.getShuffleCost(TTI ::SK_InsertSubvector, VecTy,
-                                   std::nullopt, CostKind, I * VF, SubVecTy);
+                                   ShuffleMask, CostKind, I * VF, SubVecTy);
           }
           // If masked gather cost is higher - better to vectorize, so
           // consider it as a gather node. It will be better estimated
@@ -4313,9 +4350,10 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
     // increases the cost.
     Loop *L = LI->getLoopFor(cast<LoadInst>(VL0)->getParent());
     bool ProfitableGatherPointers =
-        L && Sz > 2 && count_if(PointerOps, [L](Value *V) {
-                         return L->isLoopInvariant(V);
-                       }) <= Sz / 2;
+        L && Sz > 2 &&
+        static_cast<unsigned>(count_if(PointerOps, [L](Value *V) {
+          return L->isLoopInvariant(V);
+        })) <= Sz / 2;
     if (ProfitableGatherPointers || all_of(PointerOps, [IsSorted](Value *P) {
           auto *GEP = dyn_cast<GetElementPtrInst>(P);
           return (IsSorted && !GEP && doesNotNeedToBeScheduled(P)) ||
@@ -6539,6 +6577,26 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
     case Instruction::Trunc:
     case Instruction::FPTrunc:
     case Instruction::BitCast: {
+      auto [PrevMaxBW, PrevMinBW] = CastMaxMinBWSizes.value_or(
+          std::make_pair(std::numeric_limits<unsigned>::min(),
+                         std::numeric_limits<unsigned>::max()));
+      if (ShuffleOrOp == Instruction::ZExt ||
+          ShuffleOrOp == Instruction::SExt) {
+        CastMaxMinBWSizes = std::make_pair(
+            std::max<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
+                               PrevMaxBW),
+            std::min<unsigned>(
+                DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
+                PrevMinBW));
+      } else if (ShuffleOrOp == Instruction::Trunc) {
+        CastMaxMinBWSizes = std::make_pair(
+            std::max<unsigned>(
+                DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
+                PrevMaxBW),
+            std::min<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
+                               PrevMinBW));
+        TruncNodes.insert(VectorizableTree.size());
+      }
       TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                    ReuseShuffleIndicies);
       LLVM_DEBUG(dbgs() << "SLP: added a vector of casts.\n");
@@ -7400,7 +7458,7 @@ getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind,
         Index + NumSrcElts <= static_cast<int>(Mask.size()))
       return TTI.getShuffleCost(
           TTI::SK_InsertSubvector,
-          FixedVectorType::get(Tp->getElementType(), Mask.size()), std::nullopt,
+          FixedVectorType::get(Tp->getElementType(), Mask.size()), Mask,
           TTI::TCK_RecipThroughput, Index, Tp);
   }
   return TTI.getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args);
@@ -7673,9 +7731,13 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
         }
         if (NeedInsertSubvectorAnalysis) {
           // Add the cost for the subvectors insert.
-          for (int I = VF, E = VL.size(); I < E; I += VF)
+          SmallVector<int> ShuffleMask(VL.size());
+          for (unsigned I = VF, E = VL.size(); I < E; I += VF) {
+            for (unsigned Idx : seq<unsigned>(0, E))
+              ShuffleMask[Idx] = Idx / VF == I ? E + Idx % VF : Idx;
             GatherCost += TTI.getShuffleCost(TTI::SK_InsertSubvector, VecTy,
-                                             std::nullopt, CostKind, I, LoadTy);
+                                             ShuffleMask, CostKind, I, LoadTy);
+          }
         }
         GatherCost -= ScalarsCost;
       }
@@ -7690,16 +7752,22 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
       bool NeedShuffle =
           count(VL, *It) > 1 &&
           (VL.front() != *It || !all_of(VL.drop_front(), UndefValue::classof));
+      if (!NeedShuffle)
+        return TTI.getVectorInstrCost(Instruction::InsertElement, VecTy,
+                                      CostKind, std::distance(VL.begin(), It),
+                                      PoisonValue::get(VecTy), *It);
+
+      SmallVector<int> ShuffleMask(VL.size(), PoisonMaskElem);
+      transform(VL, ShuffleMask.begin(), [](Value *V) {
+        return isa<PoisonValue>(V) ? PoisonMaskElem : 0;   
+      });
       InstructionCost InsertCost = TTI.getVectorInstrCost(
-          Instruction::InsertElement, VecTy, CostKind,
-          NeedShuffle ? 0 : std::distance(VL.begin(), It),
+          Instruction::InsertElement, VecTy, CostKind, 0,
           PoisonValue::get(VecTy), *It);
       return InsertCost +
-             (NeedShuffle ? TTI.getShuffleCost(
-                                TargetTransformInfo::SK_Broadcast, VecTy,
-                                /*Mask=*/std::nullopt, CostKind, /*Index=*/0,
-                                /*SubTp=*/nullptr, /*Args=*/*It)
-                          : TTI::TCC_Free);
+             TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy,
+                                ShuffleMask, CostKind, /*Index=*/0,
+                                /*SubTp=*/nullptr, /*Args=*/*It);
     }
     return GatherCost +
            (all_of(Gathers, UndefValue::classof)
@@ -8362,6 +8430,22 @@ const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(const TreeEntry *E,
   return It->get();
 }
 
+TTI::CastContextHint BoUpSLP::getCastContextHint(const TreeEntry &TE) const {
+  if (TE.State == TreeEntry::ScatterVectorize ||
+      TE.State == TreeEntry::StridedVectorize)
+    return TTI::CastContextHint::GatherScatter;
+  if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::Load &&
+      !TE.isAltShuffle()) {
+    if (TE.ReorderIndices.empty())
+      return TTI::CastContextHint::Normal;
+    SmallVector<int> Mask;
+    inversePermutation(TE.ReorderIndices, Mask);
+    if (ShuffleVectorInst::isReverseMask(Mask, Mask.size()))
+      return TTI::CastContextHint::Reversed;
+  }
+  return TTI::CastContextHint::None;
+}
+
 InstructionCost
 BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
                       SmallPtrSetImpl<Value *> &CheckedExtracts) {
@@ -8384,6 +8468,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
   // If we have computed a smaller type for the expression, update VecTy so
   // that the costs will be accurate.
   auto It = MinBWs.find(E);
+  Type *OrigScalarTy = ScalarTy;
   if (It != MinBWs.end()) {
     ScalarTy = IntegerType::get(F->getContext(), It->second.first);
     VecTy = FixedVectorType::get(ScalarTy, VL.size());
@@ -8441,24 +8526,11 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
     UsedScalars.set(I);
   }
   auto GetCastContextHint = [&](Value *V) {
-    if (const TreeEntry *OpTE = getTreeEntry(V)) {
-      if (OpTE->State == TreeEntry::ScatterVectorize ||
-          OpTE->State == TreeEntry::StridedVectorize)
-        return TTI::CastContextHint::GatherScatter;
-      if (OpTE->State == TreeEntry::Vectorize &&
-          OpTE->getOpcode() == Instruction::Load && !OpTE->isAltShuffle()) {
-        if (OpTE->ReorderIndices.empty())
-          return TTI::CastContextHint::Normal;
-        SmallVector<int> Mask;
-        inversePermutation(OpTE->ReorderIndices, Mask);
-        if (ShuffleVectorInst::isReverseMask(Mask, Mask.size()))
-          return TTI::CastContextHint::Reversed;
-      }
-    } else {
-      InstructionsState SrcState = getSameOpcode(E->getOperand(0), *TLI);
-      if (SrcState.getOpcode() == Instruction::Load && !SrcState.isAltShuffle())
-        return TTI::CastContextHint::GatherScatter;
-    }
+    if (const TreeEntry *OpTE = getTreeEntry(V))
+      return getCastContextHint(*OpTE);
+    InstructionsState SrcState = getSameOpcode(E->getOperand(0), *TLI);
+    if (SrcState.getOpcode() == Instruction::Load && !SrcState.isAltShuffle())
+      return TTI::CastContextHint::GatherScatter;
     return TTI::CastContextHint::None;
   };
   auto GetCostDiff =
@@ -8507,8 +8579,6 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
               TTI::CastContextHint CCH = GetCastContextHint(VL0);
               VecCost += TTI->getCastInstrCost(VecOpcode, UserVecTy, VecTy, CCH,
                                                CostKind);
-              ScalarCost += Sz * TTI->getCastInstrCost(VecOpcode, UserScalarTy,
-                                                       ScalarTy, CCH, CostKind);
             }
           }
         }
@@ -8525,7 +8595,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
     InstructionCost ScalarCost = 0;
     InstructionCost VecCost = 0;
     std::tie(ScalarCost, VecCost) = getGEPCosts(
-        *TTI, Ptrs, BasePtr, E->getOpcode(), CostKind, ScalarTy, VecTy);
+        *TTI, Ptrs, BasePtr, E->getOpcode(), CostKind, OrigScalarTy, VecTy);
     LLVM_DEBUG(dumpTreeCosts(E, 0, VecCost, ScalarCost,
                              "Calculated GEPs cost for Tree"));
 
@@ -8572,7 +8642,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
           NumElts = ATy->getNumElements();
         else
           NumElts = AggregateTy->getStructNumElements();
-        SrcVecTy = FixedVectorType::get(ScalarTy, NumElts);
+        SrcVecTy = FixedVectorType::get(OrigScalarTy, NumElts);
       }
       if (I->hasOneUse()) {
         Instruction *Ext = I->user_back();
@@ -8740,13 +8810,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
       }
     }
     auto GetScalarCost = [&](unsigned Idx) -> InstructionCost {
-      // Do not count cost here if minimum bitwidth is in effect and it is just
-      // a bitcast (here it is just a noop).
-      if (VecOpcode != Opcode && VecOpcode == Instruction::BitCast)
-        return TTI::TCC_Free;
-      auto *VI = VL0->getOpcode() == Opcode
-                     ? cast<Instruction>(UniqueValues[Idx])
-                     : nullptr;
+      auto *VI = cast<Instruction>(UniqueValues[Idx]);
       return TTI->getCastInstrCost(Opcode, VL0->getType(),
                                    VL0->getOperand(0)->getType(),
                                    TTI::getCastContextHint(VI), CostKind, VI);
@@ -8789,7 +8853,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
                                        ? CmpInst::BAD_FCMP_PREDICATE
                                        : CmpInst::BAD_ICMP_PREDICATE;
 
-      return TTI->getCmpSelInstrCost(E->getOpcode(), ScalarTy,
+      return TTI->getCmpSelInstrCost(E->getOpcode(), OrigScalarTy,
                                      Builder.getInt1Ty(), CurrentPred, CostKind,
                                      VI);
     };
@@ -8844,7 +8908,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
       TTI::OperandValueInfo Op2Info =
           TTI::getOperandInfo(VI->getOperand(OpIdx));
       SmallVector<const Value *> Operands(VI->operand_values());
-      return TTI->getArithmeticInstrCost(ShuffleOrOp, ScalarTy, CostKind,
+      return TTI->getArithmeticInstrCost(ShuffleOrOp, OrigScalarTy, CostKind,
                                          Op1Info, Op2Info, Operands, VI);
     };
     auto GetVectorCost = [=](InstructionCost CommonCost) {
@@ -8852,7 +8916,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
       TTI::OperandValueInfo Op1Info = getOperandInfo(E->getOperand(0));
       TTI::OperandValueInfo Op2Info = getOperandInfo(E->getOperand(OpIdx));
       return TTI->getArithmeticInstrCost(ShuffleOrOp, VecTy, CostKind, Op1Info,
-                                         Op2Info) +
+                                         Op2Info, std::nullopt, nullptr, TLI) +
              CommonCost;
     };
     return GetCostDiff(GetScalarCost, GetVectorCost);
@@ -8863,9 +8927,9 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
   case Instruction::Load: {
     auto GetScalarCost = [&](unsigned Idx) {
       auto *VI = cast<LoadInst>(UniqueValues[Idx]);
-      return TTI->getMemoryOpCost(Instruction::Load, ScalarTy, VI->getAlign(),
-                                  VI->getPointerAddressSpace(), CostKind,
-                                  TTI::OperandValueInfo(), VI);
+      return TTI->getMemoryOpCost(Instruction::Load, OrigScalarTy,
+                                  VI->getAlign(), VI->getPointerAddressSpace(),
+                                  CostKind, TTI::OperandValueInfo(), VI);
     };
     auto *LI0 = cast<LoadInst>(VL0);
     auto GetVectorCost = [&](InstructionCost CommonCost) {
@@ -8908,9 +8972,9 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
     auto GetScalarCost = [=](unsigned Idx) {
       auto *VI = cast<StoreInst>(VL[Idx]);
       TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(VI->getValueOperand());
-      return TTI->getMemoryOpCost(Instruction::Store, ScalarTy, VI->getAlign(),
-                                  VI->getPointerAddressSpace(), CostKind,
-                                  OpInfo, VI);
+      return TTI->getMemoryOpCost(Instruction::Store, OrigScalarTy,
+                                  VI->getAlign(), VI->getPointerAddressSpace(),
+                                  CostKind, OpInfo, VI);
     };
     auto *BaseSI =
         cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0);
@@ -9772,6 +9836,44 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
     Cost -= InsertCost;
   }
 
+  // Add the cost for reduced value resize (if required).
+  if (ReductionBitWidth != 0) {
+    assert(UserIgnoreList && "Expected reduction tree.");
+    const TreeEntry &E = *VectorizableTree.front().get();
+    auto It = MinBWs.find(&E);
+    if (It != MinBWs.end() && It->second.first != ReductionBitWidth) {
+      unsigned SrcSize = It->second.first;
+      unsigned DstSize = ReductionBitWidth;
+      unsigned Opcode = Instruction::Trunc;
+      if (SrcSize < DstSize)
+        Opcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
+      auto *SrcVecTy =
+          FixedVectorType::get(Builder.getIntNTy(SrcSize), E.getVectorFactor());
+      auto *DstVecTy =
+          FixedVectorType::get(Builder.getIntNTy(DstSize), E.getVectorFactor());
+      TTI::CastContextHint CCH = getCastContextHint(E);
+      InstructionCost CastCost;
+      switch (E.getOpcode()) {
+      case Instruction::SExt:
+      case Instruction::ZExt:
+      case Instruction::Trunc: {
+        const TreeEntry *OpTE = getOperandEntry(&E, 0);
+        CCH = getCastContextHint(*OpTE);
+        break;
+      }
+      default:
+        break;
+      }
+      CastCost += TTI->getCastInstrCost(Opcode, DstVecTy, SrcVecTy, CCH,
+                                        TTI::TCK_RecipThroughput);
+      Cost += CastCost;
+      LLVM_DEBUG(dbgs() << "SLP: Adding cost " << CastCost
+                        << " for final resize for reduction from " << SrcVecTy
+                        << " to " << DstVecTy << "\n";
+                 dbgs() << "SLP: Current total cost = " << Cost << "\n");
+    }
+  }
+
 #ifndef NDEBUG
   SmallString<256> Str;
   {
@@ -10053,11 +10155,6 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
       Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
       if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
         continue;
-      auto It = MinBWs.find(VTE);
-      // If vectorize node is demoted - do not match.
-      if (It != MinBWs.end() &&
-          It->second.first != DL->getTypeSizeInBits(V->getType()))
-        continue;
       VToTEs.insert(VTE);
     }
     if (VToTEs.empty())
@@ -10374,7 +10471,7 @@ InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL,
   // Check if the same elements are inserted several times and count them as
   // shuffle candidates.
   APInt ShuffledElements = APInt::getZero(VL.size());
-  DenseSet<Value *> UniqueElements;
+  DenseMap<Value *, unsigned> UniqueElements;
   constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
   InstructionCost Cost;
   auto EstimateInsertCost = [&](unsigned I, Value *V) {
@@ -10383,27 +10480,34 @@ InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL,
           TTI->getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind,
                                   I, Constant::getNullValue(VecTy), V);
   };
+  SmallVector<int> ShuffleMask(VL.size(), PoisonMaskElem);
   for (unsigned I = 0, E = VL.size(); I < E; ++I) {
     Value *V = VL[I];
     // No need to shuffle duplicates for constants.
     if ((ForPoisonSrc && isConstant(V)) || isa<UndefValue>(V)) {
       ShuffledElements.setBit(I);
+      ShuffleMask[I] = isa<PoisonValue>(V) ? PoisonMaskElem : I;
       continue;
     }
-    if (!UniqueElements.insert(V).second) {
-      DuplicateNonConst = true;
-      ShuffledElements.setBit(I);
+
+    auto Res = UniqueElements.try_emplace(V, I);
+    if (Res.second) {
+      EstimateInsertCost(I, V);
+      ShuffleMask[I] = I;
       continue;
     }
-    EstimateInsertCost(I, V);
+
+    DuplicateNonConst = true;
+    ShuffledElements.setBit(I);
+    ShuffleMask[I] = Res.first->second;
   }
   if (ForPoisonSrc)
     Cost =
         TTI->getScalarizationOverhead(VecTy, ~ShuffledElements, /*Insert*/ true,
                                       /*Extract*/ false, CostKind);
   if (DuplicateNonConst)
-    Cost +=
-        TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
+    Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
+                                VecTy, ShuffleMask);
   return Cost;
 }
 
@@ -10614,8 +10718,20 @@ Value *BoUpSLP::gather(ArrayRef<Value *> VL, Value *Root) {
         PostponedInsts.emplace_back(Inst, I);
   }
 
-  auto &&CreateInsertElement = [this](Value *Vec, Value *V, unsigned Pos) {
-    Vec = Builder.CreateInsertElement(Vec, V, Builder.getInt32(Pos));
+  auto &&CreateInsertElement = [this](Value *Vec, Value *V, unsigned Pos,
+                                      Type *Ty) {
+    Value *Scalar = V;
+    if (cast<VectorType>(Vec->getType())->getElementType() != Ty) {
+      assert(V->getType()->isIntegerTy() && Ty->isIntegerTy() &&
+             "Expected integer types only.");
+      Vec = Builder.CreateIntCast(
+          Vec,
+          VectorType::get(Ty,
+                          cast<VectorType>(Vec->getType())->getElementCount()),
+          !isKnownNonNegative(Vec, SimplifyQuery(*DL)));
+    }
+
+    Vec = Builder.CreateInsertElement(Vec, Scalar, Builder.getInt32(Pos));
     auto *InsElt = dyn_cast<InsertElementInst>(Vec);
     if (!InsElt)
       return Vec;
@@ -10625,15 +10741,25 @@ Value *BoUpSLP::gather(ArrayRef<Value *> VL, Value *Root) {
     if (isa<Instruction>(V)) {
       if (TreeEntry *Entry = getTreeEntry(V)) {
         // Find which lane we need to extract.
-        unsigned FoundLane = Entry->findLaneForValue(V);
-        ExternalUses.emplace_back(V, InsElt, FoundLane);
+        User *UserOp = nullptr;
+        if (Scalar != V) {
+          if (auto *SI = dyn_cast<Instruction>(Scalar))
+            UserOp = SI;
+        } else {
+          UserOp = InsElt;
+        }
+        if (UserOp) {
+          unsigned FoundLane = Entry->findLaneForValue(V);
+          ExternalUses.emplace_back(V, UserOp, FoundLane);
+        }
       }
     }
     return Vec;
   };
   Value *Val0 =
       isa<StoreInst>(VL[0]) ? cast<StoreInst>(VL[0])->getValueOperand() : VL[0];
-  FixedVectorType *VecTy = FixedVectorType::get(Val0->getType(), VL.size());
+  Type *ScalarTy = Val0->getType();
+  FixedVectorType *VecTy = FixedVectorType::get(ScalarTy, VL.size());
   Value *Vec = Root ? Root : PoisonValue::get(VecTy);
   SmallVector<int> NonConsts;
   // Insert constant values at first.
@@ -10656,15 +10782,15 @@ Value *BoUpSLP::gather(ArrayRef<Value *> VL, Value *Root) {
           continue;
       }
     }
-    Vec = CreateInsertElement(Vec, VL[I], I);
+    Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
   }
   // Insert non-constant values.
   for (int I : NonConsts)
-    Vec = CreateInsertElement(Vec, VL[I], I);
+    Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
   // Append instructions, which are/may be part of the loop, in the end to make
   // it possible to hoist non-loop-based instructions.
   for (const std::pair<Value *, unsigned> &Pair : PostponedInsts)
-    Vec = CreateInsertElement(Vec, Pair.first, Pair.second);
+    Vec = CreateInsertElement(Vec, Pair.first, Pair.second, ScalarTy);
 
   return Vec;
 }
@@ -10721,16 +10847,35 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
     SetVector<Instruction *> &GatherShuffleExtractSeq;
     /// A list of blocks that we are going to CSE.
     DenseSet<BasicBlock *> &CSEBlocks;
+    /// Data layout.
+    const DataLayout &DL;
 
   public:
     ShuffleIRBuilder(IRBuilderBase &Builder,
                      SetVector<Instruction *> &GatherShuffleExtractSeq,
-                     DenseSet<BasicBlock *> &CSEBlocks)
+                     DenseSet<BasicBlock *> &CSEBlocks, const DataLayout &DL)
         : Builder(Builder), GatherShuffleExtractSeq(GatherShuffleExtractSeq),
-          CSEBlocks(CSEBlocks) {}
+          CSEBlocks(CSEBlocks), DL(DL) {}
     ~ShuffleIRBuilder() = default;
     /// Creates shufflevector for the 2 operands with the given mask.
     Value *createShuffleVector(Value *V1, Value *V2, ArrayRef<int> Mask) {
+      if (V1->getType() != V2->getType()) {
+        assert(V1->getType()->isIntOrIntVectorTy() &&
+               V1->getType()->isIntOrIntVectorTy() &&
+               "Expected integer vector types only.");
+        if (V1->getType() != V2->getType()) {
+          if (cast<VectorType>(V2->getType())
+                  ->getElementType()
+                  ->getIntegerBitWidth() < cast<VectorType>(V1->getType())
+                                               ->getElementType()
+                                               ->getIntegerBitWidth())
+            V2 = Builder.CreateIntCast(
+                V2, V1->getType(), !isKnownNonNegative(V2, SimplifyQuery(DL)));
+          else
+            V1 = Builder.CreateIntCast(
+                V1, V2->getType(), !isKnownNonNegative(V1, SimplifyQuery(DL)));
+        }
+      }
       Value *Vec = Builder.CreateShuffleVector(V1, V2, Mask);
       if (auto *I = dyn_cast<Instruction>(Vec)) {
         GatherShuffleExtractSeq.insert(I);
@@ -10789,7 +10934,7 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
   Value *createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask) {
     assert(V1 && "Expected at least one vector value.");
     ShuffleIRBuilder ShuffleBuilder(Builder, R.GatherShuffleExtractSeq,
-                                    R.CSEBlocks);
+                                    R.CSEBlocks, *R.DL);
     return BaseShuffleAnalysis::createShuffle<Value *>(V1, V2, Mask,
                                                        ShuffleBuilder);
   }
@@ -11656,7 +11801,7 @@ Value *BoUpSLP::createBuildVector(const TreeEntry *E) {
 }
 
 Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
-  IRBuilder<>::InsertPointGuard Guard(Builder);
+  IRBuilderBase::InsertPointGuard Guard(Builder);
 
   if (E->VectorizedValue &&
       (E->State != TreeEntry::Vectorize || E->getOpcode() != Instruction::PHI ||
@@ -11675,10 +11820,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
   }
 
   bool IsReverseOrder = isReverseOrder(E->ReorderIndices);
-  auto FinalShuffle = [&](Value *V, const TreeEntry *E, VectorType *VecTy,
-                          bool IsSigned) {
-    if (V->getType() != VecTy)
-      V = Builder.CreateIntCast(V, VecTy, IsSigned);
+  auto FinalShuffle = [&](Value *V, const TreeEntry *E, VectorType *VecTy) {
     ShuffleInstructionBuilder ShuffleBuilder(Builder, *this);
     if (E->getOpcode() == Instruction::Store) {
       ArrayRef<int> Mask =
@@ -11705,12 +11847,21 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
     ScalarTy = Store->getValueOperand()->getType();
   else if (auto *IE = dyn_cast<InsertElementInst>(VL0))
     ScalarTy = IE->getOperand(1)->getType();
-  bool IsSigned = false;
   auto It = MinBWs.find(E);
-  if (It != MinBWs.end()) {
+  if (It != MinBWs.end())
     ScalarTy = IntegerType::get(F->getContext(), It->second.first);
-    IsSigned = It->second.second;
-  }
+  auto GetOperandSignedness = [&](unsigned Idx) {
+    const TreeEntry *OpE = getOperandEntry(E, Idx);
+    bool IsSigned = false;
+    auto It = MinBWs.find(OpE);
+    if (It != MinBWs.end())
+      IsSigned = It->second.second;
+    else
+      IsSigned = any_of(OpE->Scalars, [&](Value *R) {
+        return !isKnownNonNegative(R, SimplifyQuery(*DL));
+      });
+    return IsSigned;
+  };
   auto *VecTy = FixedVectorType::get(ScalarTy, E->Scalars.size());
   switch (ShuffleOrOp) {
     case Instruction::PHI: {
@@ -11734,7 +11885,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
                                PH->getParent()->getFirstInsertionPt());
         Builder.SetCurrentDebugLocation(PH->getDebugLoc());
 
-        V = FinalShuffle(V, E, VecTy, IsSigned);
+        V = FinalShuffle(V, E, VecTy);
 
         E->VectorizedValue = V;
         if (PostponedPHIs)
@@ -11768,9 +11919,10 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
         Builder.SetCurrentDebugLocation(PH->getDebugLoc());
         Value *Vec = vectorizeOperand(E, I, /*PostponedPHIs=*/true);
         if (VecTy != Vec->getType()) {
-          assert(MinBWs.contains(getOperandEntry(E, I)) &&
+          assert((getOperandEntry(E, I)->State == TreeEntry::NeedToGather ||
+                  MinBWs.contains(getOperandEntry(E, I))) &&
                  "Expected item in MinBWs.");
-          Vec = Builder.CreateIntCast(Vec, VecTy, It->second.second);
+          Vec = Builder.CreateIntCast(Vec, VecTy, GetOperandSignedness(I));
         }
         NewPhi->addIncoming(Vec, IBB);
       }
@@ -11785,7 +11937,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
       if (const TreeEntry *TE = getTreeEntry(V))
         V = TE->VectorizedValue;
       setInsertPointAfterBundle(E);
-      V = FinalShuffle(V, E, VecTy, IsSigned);
+      V = FinalShuffle(V, E, VecTy);
       E->VectorizedValue = V;
       return V;
     }
@@ -11795,7 +11947,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
       Value *Ptr = LI->getPointerOperand();
       LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlign());
       Value *NewV = propagateMetadata(V, E->Scalars);
-      NewV = FinalShuffle(NewV, E, VecTy, IsSigned);
+      NewV = FinalShuffle(NewV, E, VecTy);
       E->VectorizedValue = NewV;
       return NewV;
     }
@@ -11981,10 +12133,11 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
 
       auto *CI = cast<CastInst>(VL0);
       Instruction::CastOps VecOpcode = CI->getOpcode();
-      Type *SrcScalarTy = VL0->getOperand(0)->getType();
+      Type *SrcScalarTy = cast<VectorType>(InVec->getType())->getElementType();
       auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
       if (!ScalarTy->isFloatingPointTy() && !SrcScalarTy->isFloatingPointTy() &&
-          (SrcIt != MinBWs.end() || It != MinBWs.end())) {
+          (SrcIt != MinBWs.end() || It != MinBWs.end() ||
+           SrcScalarTy != CI->getOperand(0)->getType())) {
         // Check if the values are candidates to demote.
         unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy);
         if (SrcIt != MinBWs.end())
@@ -12003,7 +12156,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
       Value *V = (VecOpcode != ShuffleOrOp && VecOpcode == Instruction::BitCast)
                      ? InVec
                      : Builder.CreateCast(VecOpcode, InVec, VecTy);
-      V = FinalShuffle(V, E, VecTy, IsSigned);
+      V = FinalShuffle(V, E, VecTy);
 
       E->VectorizedValue = V;
       ++NumVectorInstructions;
@@ -12024,11 +12177,22 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
         return E->VectorizedValue;
       }
       if (L->getType() != R->getType()) {
-        assert((MinBWs.contains(getOperandEntry(E, 0)) ||
+        assert((getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
+                getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
+                MinBWs.contains(getOperandEntry(E, 0)) ||
                 MinBWs.contains(getOperandEntry(E, 1))) &&
                "Expected item in MinBWs.");
-        L = Builder.CreateIntCast(L, VecTy, IsSigned);
-        R = Builder.CreateIntCast(R, VecTy, IsSigned);
+        if (cast<VectorType>(L->getType())
+                ->getElementType()
+                ->getIntegerBitWidth() < cast<VectorType>(R->getType())
+                                             ->getElementType()
+                                             ->getIntegerBitWidth()) {
+          Type *CastTy = R->getType();
+          L = Builder.CreateIntCast(L, CastTy, GetOperandSignedness(0));
+        } else {
+          Type *CastTy = L->getType();
+          R = Builder.CreateIntCast(R, CastTy, GetOperandSignedness(1));
+        }
       }
 
       CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
@@ -12036,7 +12200,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
       propagateIRFlags(V, E->Scalars, VL0);
       // Do not cast for cmps.
       VecTy = cast<FixedVectorType>(V->getType());
-      V = FinalShuffle(V, E, VecTy, IsSigned);
+      V = FinalShuffle(V, E, VecTy);
 
       E->VectorizedValue = V;
       ++NumVectorInstructions;
@@ -12060,16 +12224,20 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
         LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
         return E->VectorizedValue;
       }
-      if (True->getType() != False->getType()) {
-        assert((MinBWs.contains(getOperandEntry(E, 1)) ||
+      if (True->getType() != VecTy || False->getType() != VecTy) {
+        assert((getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
+                getOperandEntry(E, 2)->State == TreeEntry::NeedToGather ||
+                MinBWs.contains(getOperandEntry(E, 1)) ||
                 MinBWs.contains(getOperandEntry(E, 2))) &&
                "Expected item in MinBWs.");
-        True = Builder.CreateIntCast(True, VecTy, IsSigned);
-        False = Builder.CreateIntCast(False, VecTy, IsSigned);
+        if (True->getType() != VecTy)
+          True = Builder.CreateIntCast(True, VecTy, GetOperandSignedness(1));
+        if (False->getType() != VecTy)
+          False = Builder.CreateIntCast(False, VecTy, GetOperandSignedness(2));
       }
 
       Value *V = Builder.CreateSelect(Cond, True, False);
-      V = FinalShuffle(V, E, VecTy, IsSigned);
+      V = FinalShuffle(V, E, VecTy);
 
       E->VectorizedValue = V;
       ++NumVectorInstructions;
@@ -12091,7 +12259,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
       if (auto *I = dyn_cast<Instruction>(V))
         V = propagateMetadata(I, E->Scalars);
 
-      V = FinalShuffle(V, E, VecTy, IsSigned);
+      V = FinalShuffle(V, E, VecTy);
 
       E->VectorizedValue = V;
       ++NumVectorInstructions;
@@ -12128,12 +12296,16 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
         LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
         return E->VectorizedValue;
       }
-      if (LHS->getType() != RHS->getType()) {
-        assert((MinBWs.contains(getOperandEntry(E, 0)) ||
+      if (LHS->getType() != VecTy || RHS->getType() != VecTy) {
+        assert((getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
+                getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
+                MinBWs.contains(getOperandEntry(E, 0)) ||
                 MinBWs.contains(getOperandEntry(E, 1))) &&
                "Expected item in MinBWs.");
-        LHS = Builder.CreateIntCast(LHS, VecTy, IsSigned);
-        RHS = Builder.CreateIntCast(RHS, VecTy, IsSigned);
+        if (LHS->getType() != VecTy)
+          LHS = Builder.CreateIntCast(LHS, VecTy, GetOperandSignedness(0));
+        if (RHS->getType() != VecTy)
+          RHS = Builder.CreateIntCast(RHS, VecTy, GetOperandSignedness(1));
       }
 
       Value *V = Builder.CreateBinOp(
@@ -12143,7 +12315,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
       if (auto *I = dyn_cast<Instruction>(V))
         V = propagateMetadata(I, E->Scalars);
 
-      V = FinalShuffle(V, E, VecTy, IsSigned);
+      V = FinalShuffle(V, E, VecTy);
 
       E->VectorizedValue = V;
       ++NumVectorInstructions;
@@ -12214,7 +12386,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
       }
       Value *V = propagateMetadata(NewLI, E->Scalars);
 
-      V = FinalShuffle(V, E, VecTy, IsSigned);
+      V = FinalShuffle(V, E, VecTy);
       E->VectorizedValue = V;
       ++NumVectorInstructions;
       return V;
@@ -12225,7 +12397,10 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
       setInsertPointAfterBundle(E);
 
       Value *VecValue = vectorizeOperand(E, 0, PostponedPHIs);
-      VecValue = FinalShuffle(VecValue, E, VecTy, IsSigned);
+      if (VecValue->getType() != VecTy)
+        VecValue =
+            Builder.CreateIntCast(VecValue, VecTy, GetOperandSignedness(0));
+      VecValue = FinalShuffle(VecValue, E, VecTy);
 
       Value *Ptr = SI->getPointerOperand();
       StoreInst *ST =
@@ -12267,7 +12442,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
         V = propagateMetadata(I, GEPs);
       }
 
-      V = FinalShuffle(V, E, VecTy, IsSigned);
+      V = FinalShuffle(V, E, VecTy);
 
       E->VectorizedValue = V;
       ++NumVectorInstructions;
@@ -12332,7 +12507,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
       Value *V = Builder.CreateCall(CF, OpVecs, OpBundles);
 
       propagateIRFlags(V, E->Scalars, VL0);
-      V = FinalShuffle(V, E, VecTy, IsSigned);
+      V = FinalShuffle(V, E, VecTy);
 
       E->VectorizedValue = V;
       ++NumVectorInstructions;
@@ -12364,12 +12539,30 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
         LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
         return E->VectorizedValue;
       }
-      if (LHS && RHS && LHS->getType() != RHS->getType()) {
-        assert((MinBWs.contains(getOperandEntry(E, 0)) ||
+      if (LHS && RHS &&
+          ((Instruction::isBinaryOp(E->getOpcode()) &&
+            (LHS->getType() != VecTy || RHS->getType() != VecTy)) ||
+           (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()))) {
+        assert((getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
+                getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
+                MinBWs.contains(getOperandEntry(E, 0)) ||
                 MinBWs.contains(getOperandEntry(E, 1))) &&
                "Expected item in MinBWs.");
-        LHS = Builder.CreateIntCast(LHS, VecTy, IsSigned);
-        RHS = Builder.CreateIntCast(RHS, VecTy, IsSigned);
+        Type *CastTy = VecTy;
+        if (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()) {
+          if (cast<VectorType>(LHS->getType())
+                  ->getElementType()
+                  ->getIntegerBitWidth() < cast<VectorType>(RHS->getType())
+                                               ->getElementType()
+                                               ->getIntegerBitWidth())
+            CastTy = RHS->getType();
+          else
+            CastTy = LHS->getType();
+        }
+        if (LHS->getType() != VecTy)
+          LHS = Builder.CreateIntCast(LHS, CastTy, GetOperandSignedness(0));
+        if (RHS->getType() != VecTy)
+          RHS = Builder.CreateIntCast(RHS, CastTy, GetOperandSignedness(1));
       }
 
       Value *V0, *V1;
@@ -12421,9 +12614,6 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
         CSEBlocks.insert(I->getParent());
       }
 
-      if (V->getType() != VecTy && !isa<CmpInst>(VL0))
-        V = Builder.CreateIntCast(
-            V, FixedVectorType::get(ScalarTy, E->getVectorFactor()), IsSigned);
       E->VectorizedValue = V;
       ++NumVectorInstructions;
 
@@ -12519,6 +12709,15 @@ Value *BoUpSLP::vectorizeTree(
     }
     Builder.SetCurrentDebugLocation(UserI->getDebugLoc());
     Value *Vec = vectorizeTree(TE, /*PostponedPHIs=*/false);
+    if (Vec->getType() != PrevVec->getType()) {
+      assert(Vec->getType()->isIntOrIntVectorTy() &&
+             PrevVec->getType()->isIntOrIntVectorTy() &&
+             "Expected integer vector types only.");
+      assert(MinBWs.contains(TE->UserTreeIndices.front().UserTE) &&
+             "Expected user in MinBWs.");
+      bool IsSigned = MinBWs.lookup(TE->UserTreeIndices.front().UserTE).second;
+      Vec = Builder.CreateIntCast(Vec, PrevVec->getType(), IsSigned);
+    }
     PrevVec->replaceAllUsesWith(Vec);
     PostponedValues.try_emplace(Vec).first->second.push_back(TE);
     // Replace the stub vector node, if it was used before for one of the
@@ -12679,7 +12878,7 @@ Value *BoUpSLP::vectorizeTree(
             auto Key = std::make_pair(Vec, ScalarTy);
             auto VecIt = VectorCasts.find(Key);
             if (VecIt == VectorCasts.end()) {
-              IRBuilder<>::InsertPointGuard Guard(Builder);
+              IRBuilderBase::InsertPointGuard Guard(Builder);
               if (auto *IVec = dyn_cast<Instruction>(Vec))
                 Builder.SetInsertPoint(IVec->getNextNonDebugInstruction());
               Vec = Builder.CreateIntCast(
@@ -12938,7 +13137,21 @@ Value *BoUpSLP::vectorizeTree(
   Builder.ClearInsertionPoint();
   InstrElementSize.clear();
 
-  return VectorizableTree[0]->VectorizedValue;
+  const TreeEntry &RootTE = *VectorizableTree.front().get();
+  Value *Vec = RootTE.VectorizedValue;
+  if (auto It = MinBWs.find(&RootTE); ReductionBitWidth != 0 &&
+                                      It != MinBWs.end() &&
+                                      ReductionBitWidth != It->second.first) {
+    IRBuilder<>::InsertPointGuard Guard(Builder);
+    Builder.SetInsertPoint(ReductionRoot->getParent(),
+                           ReductionRoot->getIterator());
+    Vec = Builder.CreateIntCast(
+        Vec,
+        VectorType::get(Builder.getIntNTy(ReductionBitWidth),
+                        cast<VectorType>(Vec->getType())->getElementCount()),
+        It->second.second);
+  }
+  return Vec;
 }
 
 void BoUpSLP::optimizeGatherSequence() {
@@ -13758,36 +13971,90 @@ unsigned BoUpSLP::getVectorElementSize(Value *V) {
 // smaller type with a truncation. We collect the values that will be demoted
 // in ToDemote and additional roots that require investigating in Roots.
 bool BoUpSLP::collectValuesToDemote(
-    Value *V, SmallVectorImpl<Value *> &ToDemote,
+    Value *V, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
+    SmallVectorImpl<Value *> &ToDemote,
     DenseMap<Instruction *, SmallVector<unsigned>> &DemotedConsts,
-    SmallVectorImpl<Value *> &Roots, DenseSet<Value *> &Visited) const {
+    DenseSet<Value *> &Visited, unsigned &MaxDepthLevel,
+    bool &IsProfitableToDemote, bool IsTruncRoot) const {
   // We can always demote constants.
   if (isa<Constant>(V))
     return true;
 
+  if (DL->getTypeSizeInBits(V->getType()) == BitWidth) {
+    MaxDepthLevel = 1;
+    return true;
+  }
+
   // If the value is not a vectorized instruction in the expression and not used
   // by the insertelement instruction and not used in multiple vector nodes, it
   // cannot be demoted.
+  auto IsPotentiallyTruncated = [&](Value *V, unsigned &BitWidth) -> bool {
+    if (MultiNodeScalars.contains(V))
+      return false;
+    uint32_t OrigBitWidth = DL->getTypeSizeInBits(V->getType());
+    APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
+    if (MaskedValueIsZero(V, Mask, SimplifyQuery(*DL)))
+      return true;
+    auto NumSignBits = ComputeNumSignBits(V, *DL, 0, AC, nullptr, DT);
+    unsigned BitWidth1 = OrigBitWidth - NumSignBits;
+    if (!isKnownNonNegative(V, SimplifyQuery(*DL)))
+      ++BitWidth1;
+    BitWidth = std::max(BitWidth, BitWidth1);
+    return BitWidth > 0 && OrigBitWidth >= (BitWidth * 2);
+  };
+  auto FinalAnalysis = [&](const TreeEntry *ITE = nullptr) {
+    if (!IsProfitableToDemote)
+      return false;
+    return (ITE && ITE->UserTreeIndices.size() > 1) ||
+           IsPotentiallyTruncated(V, BitWidth);
+  };
+  // TODO: improve handling of gathered values and others.
   auto *I = dyn_cast<Instruction>(V);
-  if (!I || !getTreeEntry(I) || MultiNodeScalars.contains(I) ||
-      !Visited.insert(I).second || all_of(I->users(), [&](User *U) {
+  const TreeEntry *ITE = I ? getTreeEntry(I) : nullptr;
+  if (!ITE || !Visited.insert(I).second || MultiNodeScalars.contains(I) ||
+      all_of(I->users(), [&](User *U) {
         return isa<InsertElementInst>(U) && !getTreeEntry(U);
       }))
-    return false;
+    return FinalAnalysis();
 
   unsigned Start = 0;
   unsigned End = I->getNumOperands();
+
+  auto ProcessOperands = [&](ArrayRef<Value *> Operands, bool &NeedToExit) {
+    NeedToExit = false;
+    unsigned InitLevel = MaxDepthLevel;
+    for (Value *IncValue : Operands) {
+      unsigned Level = InitLevel;
+      if (!collectValuesToDemote(IncValue, IsProfitableToDemoteRoot, BitWidth,
+                                 ToDemote, DemotedConsts, Visited, Level,
+                                 IsProfitableToDemote, IsTruncRoot)) {
+        if (!IsProfitableToDemote)
+          return false;
+        NeedToExit = true;
+        if (!FinalAnalysis(ITE))
+          return false;
+        continue;
+      }
+      MaxDepthLevel = std::max(MaxDepthLevel, Level);
+    }
+    return true;
+  };
+  bool NeedToExit = false;
   switch (I->getOpcode()) {
 
   // We can always demote truncations and extensions. Since truncations can
   // seed additional demotion, we save the truncated value.
   case Instruction::Trunc:
-    Roots.push_back(I->getOperand(0));
+    if (!IsTruncRoot)
+      MaxDepthLevel = 1;
+    if (IsProfitableToDemoteRoot)
+      IsProfitableToDemote = true;
     break;
   case Instruction::ZExt:
   case Instruction::SExt:
-    if (isa<ExtractElementInst, InsertElementInst>(I->getOperand(0)))
-      return false;
+    if (!IsTruncRoot)
+      MaxDepthLevel = 1;
+    IsProfitableToDemote = true;
     break;
 
   // We can demote certain binary operations if we can demote both of their
@@ -13797,22 +14064,21 @@ bool BoUpSLP::collectValuesToDemote(
   case Instruction::Mul:
   case Instruction::And:
   case Instruction::Or:
-  case Instruction::Xor:
-    if (!collectValuesToDemote(I->getOperand(0), ToDemote, DemotedConsts, Roots,
-                               Visited) ||
-        !collectValuesToDemote(I->getOperand(1), ToDemote, DemotedConsts, Roots,
-                               Visited))
+  case Instruction::Xor: {
+    if (ITE->UserTreeIndices.size() > 1 && !IsPotentiallyTruncated(I, BitWidth))
+      return false;
+    if (!ProcessOperands({I->getOperand(0), I->getOperand(1)}, NeedToExit))
       return false;
     break;
+  }
 
   // We can demote selects if we can demote their true and false values.
   case Instruction::Select: {
+    if (ITE->UserTreeIndices.size() > 1 && !IsPotentiallyTruncated(I, BitWidth))
+      return false;
     Start = 1;
-    SelectInst *SI = cast<SelectInst>(I);
-    if (!collectValuesToDemote(SI->getTrueValue(), ToDemote, DemotedConsts,
-                               Roots, Visited) ||
-        !collectValuesToDemote(SI->getFalseValue(), ToDemote, DemotedConsts,
-                               Roots, Visited))
+    auto *SI = cast<SelectInst>(I);
+    if (!ProcessOperands({SI->getTrueValue(), SI->getFalseValue()}, NeedToExit))
       return false;
     break;
   }
@@ -13821,172 +14087,267 @@ bool BoUpSLP::collectValuesToDemote(
   // we don't need to worry about cycles since we ensure single use above.
   case Instruction::PHI: {
     PHINode *PN = cast<PHINode>(I);
-    for (Value *IncValue : PN->incoming_values())
-      if (!collectValuesToDemote(IncValue, ToDemote, DemotedConsts, Roots,
-                                 Visited))
-        return false;
+    if (ITE->UserTreeIndices.size() > 1 && !IsPotentiallyTruncated(I, BitWidth))
+      return false;
+    SmallVector<Value *> Ops(PN->incoming_values().begin(),
+                             PN->incoming_values().end());
+    if (!ProcessOperands(Ops, NeedToExit))
+      return false;
     break;
   }
 
   // Otherwise, conservatively give up.
   default:
-    return false;
+    MaxDepthLevel = 1;
+    return FinalAnalysis();
   }
+  if (NeedToExit)
+    return true;
 
+  ++MaxDepthLevel;
   // Gather demoted constant operands.
   for (unsigned Idx : seq<unsigned>(Start, End))
     if (isa<Constant>(I->getOperand(Idx)))
       DemotedConsts.try_emplace(I).first->getSecond().push_back(Idx);
   // Record the value that we can demote.
   ToDemote.push_back(V);
-  return true;
+  return IsProfitableToDemote;
 }
 
 void BoUpSLP::computeMinimumValueSizes() {
   // We only attempt to truncate integer expressions.
-  auto &TreeRoot = VectorizableTree[0]->Scalars;
-  auto *TreeRootIT = dyn_cast<IntegerType>(TreeRoot[0]->getType());
-  if (!TreeRootIT || VectorizableTree.front()->State == TreeEntry::NeedToGather)
+  bool IsStoreOrInsertElt =
+      VectorizableTree.front()->getOpcode() == Instruction::Store ||
+      VectorizableTree.front()->getOpcode() == Instruction::InsertElement;
+  if ((IsStoreOrInsertElt || UserIgnoreList) && TruncNodes.size() <= 1 &&
+      (!CastMaxMinBWSizes || CastMaxMinBWSizes->second == 0 ||
+       CastMaxMinBWSizes->first / CastMaxMinBWSizes->second <= 2))
     return;
 
+  unsigned NodeIdx = 0;
+  if (IsStoreOrInsertElt &&
+      VectorizableTree.front()->State != TreeEntry::NeedToGather)
+    NodeIdx = 1;
+
   // Ensure the roots of the vectorizable tree don't form a cycle.
-  if (!VectorizableTree.front()->UserTreeIndices.empty())
+  if (VectorizableTree[NodeIdx]->State == TreeEntry::NeedToGather ||
+      (NodeIdx == 0 && !VectorizableTree[NodeIdx]->UserTreeIndices.empty()) ||
+      (NodeIdx != 0 && any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
+                              [NodeIdx](const EdgeInfo &EI) {
+                                return EI.UserTE->Idx >
+                                       static_cast<int>(NodeIdx);
+                              })))
     return;
 
-  // Conservatively determine if we can actually truncate the roots of the
-  // expression. Collect the values that can be demoted in ToDemote and
-  // additional roots that require investigating in Roots.
-  SmallVector<Value *, 32> ToDemote;
-  DenseMap<Instruction *, SmallVector<unsigned>> DemotedConsts;
-  SmallVector<Value *, 4> Roots;
-  for (auto *Root : TreeRoot) {
-    DenseSet<Value *> Visited;
-    if (!collectValuesToDemote(Root, ToDemote, DemotedConsts, Roots, Visited))
-      return;
-  }
-
-  // The maximum bit width required to represent all the values that can be
-  // demoted without loss of precision. It would be safe to truncate the roots
-  // of the expression to this width.
-  auto MaxBitWidth = 1u;
-
-  // We first check if all the bits of the roots are demanded. If they're not,
-  // we can truncate the roots to this narrower type.
-  for (auto *Root : TreeRoot) {
-    auto Mask = DB->getDemandedBits(cast<Instruction>(Root));
-    MaxBitWidth = std::max<unsigned>(Mask.getBitWidth() - Mask.countl_zero(),
-                                     MaxBitWidth);
-  }
-
-  // True if the roots can be zero-extended back to their original type, rather
-  // than sign-extended. We know that if the leading bits are not demanded, we
-  // can safely zero-extend. So we initialize IsKnownPositive to True.
-  bool IsKnownPositive = true;
-
-  // If all the bits of the roots are demanded, we can try a little harder to
-  // compute a narrower type. This can happen, for example, if the roots are
-  // getelementptr indices. InstCombine promotes these indices to the pointer
-  // width. Thus, all their bits are technically demanded even though the
-  // address computation might be vectorized in a smaller type.
-  //
-  // We start by looking at each entry that can be demoted. We compute the
-  // maximum bit width required to store the scalar by using ValueTracking to
-  // compute the number of high-order bits we can truncate.
-  if (MaxBitWidth == DL->getTypeSizeInBits(TreeRoot[0]->getType()) &&
-      all_of(TreeRoot, [](Value *V) {
-        return all_of(V->users(),
-                      [](User *U) { return isa<GetElementPtrInst>(U); });
-      })) {
-    MaxBitWidth = 8u;
+  // The first value node for store/insertelement is sext/zext/trunc? Skip it,
+  // resize to the final type.
+  bool IsTruncRoot = false;
+  bool IsProfitableToDemoteRoot = !IsStoreOrInsertElt;
+  if (NodeIdx != 0 &&
+      VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
+      (VectorizableTree[NodeIdx]->getOpcode() == Instruction::ZExt ||
+       VectorizableTree[NodeIdx]->getOpcode() == Instruction::SExt ||
+       VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc)) {
+    assert(IsStoreOrInsertElt && "Expected store/insertelement seeded graph.");
+    IsTruncRoot = VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc;
+    IsProfitableToDemoteRoot = true;
+    ++NodeIdx;
+  }
+
+  // Analyzed in reduction already and not profitable - exit.
+  if (AnalyzedMinBWVals.contains(VectorizableTree[NodeIdx]->Scalars.front()))
+    return;
 
+  SmallVector<Value *> ToDemote;
+  DenseMap<Instruction *, SmallVector<unsigned>> DemotedConsts;
+  auto ComputeMaxBitWidth = [&](ArrayRef<Value *> TreeRoot, unsigned VF,
+                                bool IsTopRoot, bool IsProfitableToDemoteRoot,
+                                unsigned Opcode, unsigned Limit, bool IsTruncRoot) {
+    ToDemote.clear();
+    auto *TreeRootIT = dyn_cast<IntegerType>(TreeRoot[0]->getType());
+    if (!TreeRootIT || !Opcode)
+      return 0u;
+
+    if (AnalyzedMinBWVals.contains(TreeRoot.front()))
+      return 0u;
+
+    unsigned NumParts = TTI->getNumberOfParts(
+        FixedVectorType::get(TreeRoot.front()->getType(), VF));
+
+    // The maximum bit width required to represent all the values that can be
+    // demoted without loss of precision. It would be safe to truncate the roots
+    // of the expression to this width.
+    unsigned MaxBitWidth = 1u;
+
+    // True if the roots can be zero-extended back to their original type,
+    // rather than sign-extended. We know that if the leading bits are not
+    // demanded, we can safely zero-extend. So we initialize IsKnownPositive to
+    // True.
     // Determine if the sign bit of all the roots is known to be zero. If not,
     // IsKnownPositive is set to False.
-    IsKnownPositive = llvm::all_of(TreeRoot, [&](Value *R) {
+    bool IsKnownPositive = all_of(TreeRoot, [&](Value *R) {
       KnownBits Known = computeKnownBits(R, *DL);
       return Known.isNonNegative();
     });
 
-    // Determine the maximum number of bits required to store the scalar
-    // values.
-    for (auto *Scalar : ToDemote) {
-      auto NumSignBits = ComputeNumSignBits(Scalar, *DL, 0, AC, nullptr, DT);
-      auto NumTypeBits = DL->getTypeSizeInBits(Scalar->getType());
-      MaxBitWidth = std::max<unsigned>(NumTypeBits - NumSignBits, MaxBitWidth);
-    }
-
-    // If we can't prove that the sign bit is zero, we must add one to the
-    // maximum bit width to account for the unknown sign bit. This preserves
-    // the existing sign bit so we can safely sign-extend the root back to the
-    // original type. Otherwise, if we know the sign bit is zero, we will
-    // zero-extend the root instead.
-    //
-    // FIXME: This is somewhat suboptimal, as there will be cases where adding
-    //        one to the maximum bit width will yield a larger-than-necessary
-    //        type. In general, we need to add an extra bit only if we can't
-    //        prove that the upper bit of the original type is equal to the
-    //        upper bit of the proposed smaller type. If these two bits are the
-    //        same (either zero or one) we know that sign-extending from the
-    //        smaller type will result in the same value. Here, since we can't
-    //        yet prove this, we are just making the proposed smaller type
-    //        larger to ensure correctness.
-    if (!IsKnownPositive)
-      ++MaxBitWidth;
-  }
-
-  // Round MaxBitWidth up to the next power-of-two.
-  MaxBitWidth = llvm::bit_ceil(MaxBitWidth);
-
-  // If the maximum bit width we compute is less than the with of the roots'
-  // type, we can proceed with the narrowing. Otherwise, do nothing.
-  if (MaxBitWidth >= TreeRootIT->getBitWidth())
-    return;
+    // We first check if all the bits of the roots are demanded. If they're not,
+    // we can truncate the roots to this narrower type.
+    for (auto *Root : TreeRoot) {
+      unsigned NumSignBits = ComputeNumSignBits(Root, *DL, 0, AC, nullptr, DT);
+      TypeSize NumTypeBits = DL->getTypeSizeInBits(Root->getType());
+      unsigned BitWidth1 = NumTypeBits - NumSignBits;
+      // If we can't prove that the sign bit is zero, we must add one to the
+      // maximum bit width to account for the unknown sign bit. This preserves
+      // the existing sign bit so we can safely sign-extend the root back to the
+      // original type. Otherwise, if we know the sign bit is zero, we will
+      // zero-extend the root instead.
+      //
+      // FIXME: This is somewhat suboptimal, as there will be cases where adding
+      //        one to the maximum bit width will yield a larger-than-necessary
+      //        type. In general, we need to add an extra bit only if we can't
+      //        prove that the upper bit of the original type is equal to the
+      //        upper bit of the proposed smaller type. If these two bits are
+      //        the same (either zero or one) we know that sign-extending from
+      //        the smaller type will result in the same value. Here, since we
+      //        can't yet prove this, we are just making the proposed smaller
+      //        type larger to ensure correctness.
+      if (!IsKnownPositive)
+        ++BitWidth1;
+
+      APInt Mask = DB->getDemandedBits(cast<Instruction>(Root));
+      unsigned BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
+      MaxBitWidth =
+          std::max<unsigned>(std::min(BitWidth1, BitWidth2), MaxBitWidth);
+    }
+
+    if (MaxBitWidth < 8 && MaxBitWidth > 1)
+      MaxBitWidth = 8;
+
+    // If the original type is large, but reduced type does not improve the reg
+    // use - ignore it.
+    if (NumParts > 1 &&
+        NumParts ==
+            TTI->getNumberOfParts(FixedVectorType::get(
+                IntegerType::get(F->getContext(), bit_ceil(MaxBitWidth)), VF)))
+      return 0u;
+
+    bool IsProfitableToDemote = Opcode == Instruction::Trunc ||
+                                Opcode == Instruction::SExt ||
+                                Opcode == Instruction::ZExt || NumParts > 1;
+    // Conservatively determine if we can actually truncate the roots of the
+    // expression. Collect the values that can be demoted in ToDemote and
+    // additional roots that require investigating in Roots.
+    for (auto *Root : TreeRoot) {
+      DenseSet<Value *> Visited;
+      unsigned MaxDepthLevel = IsTruncRoot ? Limit : 1;
+      bool NeedToDemote = IsProfitableToDemote;
+
+      if (!collectValuesToDemote(Root, IsProfitableToDemoteRoot, MaxBitWidth,
+                                 ToDemote, DemotedConsts, Visited,
+                                 MaxDepthLevel, NeedToDemote, IsTruncRoot) ||
+          (MaxDepthLevel <= Limit &&
+           !(((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
+              (!IsTopRoot || !(IsStoreOrInsertElt || UserIgnoreList) ||
+               DL->getTypeSizeInBits(Root->getType()) /
+                       DL->getTypeSizeInBits(
+                           cast<Instruction>(Root)->getOperand(0)->getType()) >
+                   2)))))
+        return 0u;
+    }
+    // Round MaxBitWidth up to the next power-of-two.
+    MaxBitWidth = bit_ceil(MaxBitWidth);
+
+    return MaxBitWidth;
+  };
 
   // If we can truncate the root, we must collect additional values that might
   // be demoted as a result. That is, those seeded by truncations we will
   // modify.
-  while (!Roots.empty()) {
-    DenseSet<Value *> Visited;
-    collectValuesToDemote(Roots.pop_back_val(), ToDemote, DemotedConsts, Roots,
-                          Visited);
-  }
-
-  // Check that all users are marked for demotion.
-  DenseSet<Value *> Demoted(ToDemote.begin(), ToDemote.end());
-  DenseSet<const TreeEntry *> Visited;
-  for (Value *V: ToDemote) {
-    const TreeEntry *TE = getTreeEntry(V);
-    assert(TE && "Expected vectorized scalar.");
-    if (!Visited.insert(TE).second)
-      continue;
-    if (!all_of(TE->UserTreeIndices, [&](const EdgeInfo &EI) {
-          return all_of(EI.UserTE->Scalars,
-                        [&](Value *V) { return Demoted.contains(V); });
-        }))
-      return;
-  }
-  // Finally, map the values we can demote to the maximum bit with we computed.
-  for (auto *Scalar : ToDemote) {
-    auto *TE = getTreeEntry(Scalar);
-    assert(TE && "Expected vectorized scalar.");
-    if (MinBWs.contains(TE))
+  // Add reduction ops sizes, if any.
+  if (UserIgnoreList &&
+      isa<IntegerType>(VectorizableTree.front()->Scalars.front()->getType())) {
+    for (Value *V : *UserIgnoreList) {
+      auto NumSignBits = ComputeNumSignBits(V, *DL, 0, AC, nullptr, DT);
+      auto NumTypeBits = DL->getTypeSizeInBits(V->getType());
+      unsigned BitWidth1 = NumTypeBits - NumSignBits;
+      if (!isKnownNonNegative(V, SimplifyQuery(*DL)))
+        ++BitWidth1;
+      auto Mask = DB->getDemandedBits(cast<Instruction>(V));
+      unsigned BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
+      ReductionBitWidth =
+          std::max(std::min(BitWidth1, BitWidth2), ReductionBitWidth);
+    }
+    if (ReductionBitWidth < 8 && ReductionBitWidth > 1)
+      ReductionBitWidth = 8;
+
+    ReductionBitWidth = bit_ceil(ReductionBitWidth);
+  }
+  bool IsTopRoot = NodeIdx == 0;
+  while (NodeIdx < VectorizableTree.size() &&
+         VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
+         VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
+    ++NodeIdx;
+    IsTruncRoot = true;
+  }
+  while (NodeIdx < VectorizableTree.size()) {
+    ArrayRef<Value *> TreeRoot = VectorizableTree[NodeIdx]->Scalars;
+    unsigned Limit = 2;
+    unsigned Opcode = VectorizableTree[NodeIdx]->getOpcode();
+    if (IsTopRoot &&
+        ReductionBitWidth ==
+            DL->getTypeSizeInBits(
+                VectorizableTree.front()->Scalars.front()->getType()))
+      Limit = 3;
+    unsigned MaxBitWidth = ComputeMaxBitWidth(
+        TreeRoot, VectorizableTree[NodeIdx]->getVectorFactor(), IsTopRoot,
+        IsProfitableToDemoteRoot, Opcode, Limit, IsTruncRoot);
+    IsTopRoot = false;
+    IsProfitableToDemoteRoot = true;
+
+    if (TruncNodes.empty()) {
+      NodeIdx = VectorizableTree.size();
+    } else {
+      NodeIdx = *TruncNodes.begin() + 1;
+      TruncNodes.erase(TruncNodes.begin());
+      IsTruncRoot = true;
+    }
+
+    // If the maximum bit width we compute is less than the with of the roots'
+    // type, we can proceed with the narrowing. Otherwise, do nothing.
+    if (MaxBitWidth == 0 ||
+        MaxBitWidth >=
+            cast<IntegerType>(TreeRoot.front()->getType())->getBitWidth()) {
+      if (UserIgnoreList)
+        AnalyzedMinBWVals.insert(TreeRoot.begin(), TreeRoot.end());
       continue;
-    bool IsSigned = any_of(TE->Scalars, [&](Value *R) {
-      KnownBits Known = computeKnownBits(R, *DL);
-      return !Known.isNonNegative();
-    });
-    MinBWs.try_emplace(TE, MaxBitWidth, IsSigned);
-    const auto *I = cast<Instruction>(Scalar);
-    auto DCIt = DemotedConsts.find(I);
-    if (DCIt != DemotedConsts.end()) {
-      for (unsigned Idx : DCIt->getSecond()) {
-        // Check that all instructions operands are demoted.
-        if (all_of(TE->Scalars, [&](Value *V) {
-              auto SIt = DemotedConsts.find(cast<Instruction>(V));
-              return SIt != DemotedConsts.end() &&
-                     is_contained(SIt->getSecond(), Idx);
-            })) {
+    }
+
+    // Finally, map the values we can demote to the maximum bit with we
+    // computed.
+    for (Value *Scalar : ToDemote) {
+      TreeEntry *TE = getTreeEntry(Scalar);
+      assert(TE && "Expected vectorized scalar.");
+      if (MinBWs.contains(TE))
+        continue;
+      bool IsSigned = TE->getOpcode() == Instruction::SExt ||
+                      any_of(TE->Scalars, [&](Value *R) {
+                        return !isKnownNonNegative(R, SimplifyQuery(*DL));
+                      });
+      MinBWs.try_emplace(TE, MaxBitWidth, IsSigned);
+      const auto *I = cast<Instruction>(Scalar);
+      auto DCIt = DemotedConsts.find(I);
+      if (DCIt != DemotedConsts.end()) {
+        for (unsigned Idx : DCIt->getSecond()) {
+          // Check that all instructions operands are demoted.
           const TreeEntry *CTE = getOperandEntry(TE, Idx);
-          MinBWs.try_emplace(CTE, MaxBitWidth, IsSigned);
+          if (all_of(TE->Scalars,
+                     [&](Value *V) {
+                       auto SIt = DemotedConsts.find(cast<Instruction>(V));
+                       return SIt != DemotedConsts.end() &&
+                              is_contained(SIt->getSecond(), Idx);
+                     }) ||
+              all_of(CTE->Scalars, Constant::classof))
+            MinBWs.try_emplace(CTE, MaxBitWidth, IsSigned);
         }
       }
     }
@@ -14668,10 +15029,9 @@ class HorizontalReduction {
   }
 
   /// Creates reduction operation with the current opcode.
-  static Value *createOp(IRBuilder<> &Builder, RecurKind Kind, Value *LHS,
+  static Value *createOp(IRBuilderBase &Builder, RecurKind Kind, Value *LHS,
                          Value *RHS, const Twine &Name, bool UseSelect) {
     unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
-    bool IsConstant = isConstant(LHS) && isConstant(RHS);
     switch (Kind) {
     case RecurKind::Or:
       if (UseSelect &&
@@ -14693,49 +15053,33 @@ class HorizontalReduction {
       return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
                                  Name);
     case RecurKind::FMax:
-      if (IsConstant)
-        return ConstantFP::get(LHS->getType(),
-                               maxnum(cast<ConstantFP>(LHS)->getValueAPF(),
-                                      cast<ConstantFP>(RHS)->getValueAPF()));
       return Builder.CreateBinaryIntrinsic(Intrinsic::maxnum, LHS, RHS);
     case RecurKind::FMin:
-      if (IsConstant)
-        return ConstantFP::get(LHS->getType(),
-                               minnum(cast<ConstantFP>(LHS)->getValueAPF(),
-                                      cast<ConstantFP>(RHS)->getValueAPF()));
       return Builder.CreateBinaryIntrinsic(Intrinsic::minnum, LHS, RHS);
     case RecurKind::FMaximum:
-      if (IsConstant)
-        return ConstantFP::get(LHS->getType(),
-                               maximum(cast<ConstantFP>(LHS)->getValueAPF(),
-                                      cast<ConstantFP>(RHS)->getValueAPF()));
       return Builder.CreateBinaryIntrinsic(Intrinsic::maximum, LHS, RHS);
     case RecurKind::FMinimum:
-      if (IsConstant)
-        return ConstantFP::get(LHS->getType(),
-                               minimum(cast<ConstantFP>(LHS)->getValueAPF(),
-                                      cast<ConstantFP>(RHS)->getValueAPF()));
       return Builder.CreateBinaryIntrinsic(Intrinsic::minimum, LHS, RHS);
     case RecurKind::SMax:
-      if (IsConstant || UseSelect) {
+      if (UseSelect) {
         Value *Cmp = Builder.CreateICmpSGT(LHS, RHS, Name);
         return Builder.CreateSelect(Cmp, LHS, RHS, Name);
       }
       return Builder.CreateBinaryIntrinsic(Intrinsic::smax, LHS, RHS);
     case RecurKind::SMin:
-      if (IsConstant || UseSelect) {
+      if (UseSelect) {
         Value *Cmp = Builder.CreateICmpSLT(LHS, RHS, Name);
         return Builder.CreateSelect(Cmp, LHS, RHS, Name);
       }
       return Builder.CreateBinaryIntrinsic(Intrinsic::smin, LHS, RHS);
     case RecurKind::UMax:
-      if (IsConstant || UseSelect) {
+      if (UseSelect) {
         Value *Cmp = Builder.CreateICmpUGT(LHS, RHS, Name);
         return Builder.CreateSelect(Cmp, LHS, RHS, Name);
       }
       return Builder.CreateBinaryIntrinsic(Intrinsic::umax, LHS, RHS);
     case RecurKind::UMin:
-      if (IsConstant || UseSelect) {
+      if (UseSelect) {
         Value *Cmp = Builder.CreateICmpULT(LHS, RHS, Name);
         return Builder.CreateSelect(Cmp, LHS, RHS, Name);
       }
@@ -14747,7 +15091,7 @@ class HorizontalReduction {
 
   /// Creates reduction operation with the current opcode with the IR flags
   /// from \p ReductionOps, dropping nuw/nsw flags.
-  static Value *createOp(IRBuilder<> &Builder, RecurKind RdxKind, Value *LHS,
+  static Value *createOp(IRBuilderBase &Builder, RecurKind RdxKind, Value *LHS,
                          Value *RHS, const Twine &Name,
                          const ReductionOpsListType &ReductionOps) {
     bool UseSelect =
@@ -15129,7 +15473,7 @@ class HorizontalReduction {
   }
 
   /// Attempt to vectorize the tree found by matchAssociativeReduction.
-  Value *tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI,
+  Value *tryToReduce(BoUpSLP &V, const DataLayout &DL, TargetTransformInfo *TTI,
                      const TargetLibraryInfo &TLI) {
     constexpr int ReductionLimit = 4;
     constexpr unsigned RegMaxNumber = 4;
@@ -15155,7 +15499,9 @@ class HorizontalReduction {
       return nullptr;
     }
 
-    IRBuilder<> Builder(cast<Instruction>(ReductionRoot));
+    IRBuilder<TargetFolder> Builder(ReductionRoot->getContext(),
+                                    TargetFolder(DL));
+    Builder.SetInsertPoint(cast<Instruction>(ReductionRoot));
 
     // Track the reduced values in case if they are replaced by extractelement
     // because of the vectorization.
@@ -15836,7 +16182,7 @@ class HorizontalReduction {
   }
 
   /// Emit a horizontal reduction of the vectorized value.
-  Value *emitReduction(Value *VectorizedValue, IRBuilder<> &Builder,
+  Value *emitReduction(Value *VectorizedValue, IRBuilderBase &Builder,
                        unsigned ReduxWidth, const TargetTransformInfo *TTI) {
     assert(VectorizedValue && "Need to have a vectorized tree node");
     assert(isPowerOf2_32(ReduxWidth) &&
@@ -16251,7 +16597,7 @@ bool SLPVectorizerPass::vectorizeHorReduction(
     HorizontalReduction HorRdx;
     if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI))
       return nullptr;
-    return HorRdx.tryToReduce(R, TTI, *TLI);
+    return HorRdx.tryToReduce(R, *DL, TTI, *TLI);
   };
   auto TryAppendToPostponedInsts = [&](Instruction *FutureSeed) {
     if (TryOperandsAsNewSeeds && FutureSeed == Root) {
diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
index b1498026adadfe..29a395c3573118 100644
--- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
+++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
@@ -23,6 +23,9 @@ class TargetLibraryInfo;
 
 /// Helper class to create VPRecipies from IR instructions.
 class VPRecipeBuilder {
+  /// The VPlan new recipes are added to.
+  VPlan &Plan;
+
   /// The loop that we evaluate.
   Loop *OrigLoop;
 
@@ -69,53 +72,50 @@ class VPRecipeBuilder {
   /// recipe that takes an additional VPInstruction for the mask.
   VPWidenMemoryInstructionRecipe *tryToWidenMemory(Instruction *I,
                                                    ArrayRef<VPValue *> Operands,
-                                                   VFRange &Range,
-                                                   VPlanPtr &Plan);
+                                                   VFRange &Range);
 
   /// Check if an induction recipe should be constructed for \p Phi. If so build
   /// and return it. If not, return null.
   VPHeaderPHIRecipe *tryToOptimizeInductionPHI(PHINode *Phi,
                                                ArrayRef<VPValue *> Operands,
-                                               VPlan &Plan, VFRange &Range);
+                                               VFRange &Range);
 
   /// Optimize the special case where the operand of \p I is a constant integer
   /// induction variable.
   VPWidenIntOrFpInductionRecipe *
   tryToOptimizeInductionTruncate(TruncInst *I, ArrayRef<VPValue *> Operands,
-                                 VFRange &Range, VPlan &Plan);
+                                 VFRange &Range);
 
   /// Handle non-loop phi nodes. Return a new VPBlendRecipe otherwise. Currently
   /// all such phi nodes are turned into a sequence of select instructions as
   /// the vectorizer currently performs full if-conversion.
-  VPBlendRecipe *tryToBlend(PHINode *Phi, ArrayRef<VPValue *> Operands,
-                            VPlanPtr &Plan);
+  VPBlendRecipe *tryToBlend(PHINode *Phi, ArrayRef<VPValue *> Operands);
 
   /// Handle call instructions. If \p CI can be widened for \p Range.Start,
   /// return a new VPWidenCallRecipe. Range.End may be decreased to ensure same
   /// decision from \p Range.Start to \p Range.End.
   VPWidenCallRecipe *tryToWidenCall(CallInst *CI, ArrayRef<VPValue *> Operands,
-                                    VFRange &Range, VPlanPtr &Plan);
+                                    VFRange &Range);
 
   /// Check if \p I has an opcode that can be widened and return a VPWidenRecipe
   /// if it can. The function should only be called if the cost-model indicates
   /// that widening should be performed.
   VPWidenRecipe *tryToWiden(Instruction *I, ArrayRef<VPValue *> Operands,
-                            VPBasicBlock *VPBB, VPlanPtr &Plan);
+                            VPBasicBlock *VPBB);
 
 public:
-  VPRecipeBuilder(Loop *OrigLoop, const TargetLibraryInfo *TLI,
+  VPRecipeBuilder(VPlan &Plan, Loop *OrigLoop, const TargetLibraryInfo *TLI,
                   LoopVectorizationLegality *Legal,
                   LoopVectorizationCostModel &CM,
                   PredicatedScalarEvolution &PSE, VPBuilder &Builder)
-      : OrigLoop(OrigLoop), TLI(TLI), Legal(Legal), CM(CM), PSE(PSE),
-        Builder(Builder) {}
+      : Plan(Plan), OrigLoop(OrigLoop), TLI(TLI), Legal(Legal), CM(CM),
+        PSE(PSE), Builder(Builder) {}
 
   /// Create and return a widened recipe for \p I if one can be created within
   /// the given VF \p Range.
   VPRecipeBase *tryToCreateWidenRecipe(Instruction *Instr,
                                        ArrayRef<VPValue *> Operands,
-                                       VFRange &Range, VPBasicBlock *VPBB,
-                                       VPlanPtr &Plan);
+                                       VFRange &Range, VPBasicBlock *VPBB);
 
   /// Set the recipe created for given ingredient. This operation is a no-op for
   /// ingredients that were not marked using a nullptr entry in the map.
@@ -128,19 +128,19 @@ class VPRecipeBuilder {
   }
 
   /// Create the mask for the vector loop header block.
-  void createHeaderMask(VPlan &Plan);
+  void createHeaderMask();
 
   /// A helper function that computes the predicate of the block BB, assuming
   /// that the header block of the loop is set to True or the loop mask when
   /// tail folding.
-  void createBlockInMask(BasicBlock *BB, VPlan &Plan);
+  void createBlockInMask(BasicBlock *BB);
 
   /// Returns the *entry* mask for the block \p BB.
   VPValue *getBlockInMask(BasicBlock *BB) const;
 
   /// A helper function that computes the predicate of the edge between SRC
   /// and DST.
-  VPValue *createEdgeMask(BasicBlock *Src, BasicBlock *Dst, VPlan &Plan);
+  VPValue *createEdgeMask(BasicBlock *Src, BasicBlock *Dst);
 
   /// A helper that returns the previously computed predicate of the edge
   /// between SRC and DST.
@@ -166,8 +166,7 @@ class VPRecipeBuilder {
   /// Build a VPReplicationRecipe for \p I. If it is predicated, add the mask as
   /// last operand. Range.End may be decreased to ensure same recipe behavior
   /// from \p Range.Start to \p Range.End.
-  VPReplicateRecipe *handleReplication(Instruction *I, VFRange &Range,
-                                       VPlan &Plan);
+  VPReplicateRecipe *handleReplication(Instruction *I, VFRange &Range);
 
   /// Add the incoming values from the backedge to reduction & first-order
   /// recurrence cross-iteration phis.
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index af6d0081bffebc..d720f17a3a8810 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1127,6 +1127,12 @@ class VPRecipeWithIRFlags : public VPSingleDefRecipe {
     return WrapFlags.HasNSW;
   }
 
+  bool isDisjoint() const {
+    assert(OpType == OperationType::DisjointOp &&
+           "recipe cannot have a disjoing flag");
+    return DisjointFlags.IsDisjoint;
+  }
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   void printFlags(raw_ostream &O) const;
 #endif
@@ -2136,6 +2142,8 @@ class VPReplicateRecipe : public VPRecipeWithIRFlags {
     assert(isPredicated() && "Trying to get the mask of a unpredicated recipe");
     return getOperand(getNumOperands() - 1);
   }
+
+  unsigned getOpcode() const { return getUnderlyingInstr()->getOpcode(); }
 };
 
 /// A recipe for generating conditional branches on the bits of a mask.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
index b90c588b607564..a03a408686ef13 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
@@ -50,13 +50,82 @@ template <typename Class> struct bind_ty {
   }
 };
 
+/// Match a specified integer value or vector of all elements of that
+/// value.
+struct specific_intval {
+  APInt Val;
+
+  specific_intval(APInt V) : Val(std::move(V)) {}
+
+  bool match(VPValue *VPV) {
+    if (!VPV->isLiveIn())
+      return false;
+    Value *V = VPV->getLiveInIRValue();
+    const auto *CI = dyn_cast<ConstantInt>(V);
+    if (!CI && V->getType()->isVectorTy())
+      if (const auto *C = dyn_cast<Constant>(V))
+        CI = dyn_cast_or_null<ConstantInt>(
+            C->getSplatValue(/*UndefsAllowed=*/false));
+
+    return CI && APInt::isSameValue(CI->getValue(), Val);
+  }
+};
+
+inline specific_intval m_SpecificInt(uint64_t V) {
+  return specific_intval(APInt(64, V));
+}
+
+/// Matching combinators
+template <typename LTy, typename RTy> struct match_combine_or {
+  LTy L;
+  RTy R;
+
+  match_combine_or(const LTy &Left, const RTy &Right) : L(Left), R(Right) {}
+
+  template <typename ITy> bool match(ITy *V) {
+    if (L.match(V))
+      return true;
+    if (R.match(V))
+      return true;
+    return false;
+  }
+};
+
+template <typename LTy, typename RTy>
+inline match_combine_or<LTy, RTy> m_CombineOr(const LTy &L, const RTy &R) {
+  return match_combine_or<LTy, RTy>(L, R);
+}
+
 /// Match a VPValue, capturing it if we match.
 inline bind_ty<VPValue> m_VPValue(VPValue *&V) { return V; }
 
-template <typename Op0_t, unsigned Opcode> struct UnaryVPInstruction_match {
+namespace detail {
+
+/// A helper to match an opcode against multiple recipe types.
+template <unsigned Opcode, typename...> struct MatchRecipeAndOpcode {};
+
+template <unsigned Opcode, typename RecipeTy>
+struct MatchRecipeAndOpcode<Opcode, RecipeTy> {
+  static bool match(const VPRecipeBase *R) {
+    auto *DefR = dyn_cast<RecipeTy>(R);
+    return DefR && DefR->getOpcode() == Opcode;
+  }
+};
+
+template <unsigned Opcode, typename RecipeTy, typename... RecipeTys>
+struct MatchRecipeAndOpcode<Opcode, RecipeTy, RecipeTys...> {
+  static bool match(const VPRecipeBase *R) {
+    return MatchRecipeAndOpcode<Opcode, RecipeTy>::match(R) ||
+           MatchRecipeAndOpcode<Opcode, RecipeTys...>::match(R);
+  }
+};
+} // namespace detail
+
+template <typename Op0_t, unsigned Opcode, typename... RecipeTys>
+struct UnaryRecipe_match {
   Op0_t Op0;
 
-  UnaryVPInstruction_match(Op0_t Op0) : Op0(Op0) {}
+  UnaryRecipe_match(Op0_t Op0) : Op0(Op0) {}
 
   bool match(const VPValue *V) {
     auto *DefR = V->getDefiningRecipe();
@@ -64,37 +133,58 @@ template <typename Op0_t, unsigned Opcode> struct UnaryVPInstruction_match {
   }
 
   bool match(const VPRecipeBase *R) {
-    auto *DefR = dyn_cast<VPInstruction>(R);
-    if (!DefR || DefR->getOpcode() != Opcode)
+    if (!detail::MatchRecipeAndOpcode<Opcode, RecipeTys...>::match(R))
       return false;
-    assert(DefR->getNumOperands() == 1 &&
+    assert(R->getNumOperands() == 1 &&
            "recipe with matched opcode does not have 1 operands");
-    return Op0.match(DefR->getOperand(0));
+    return Op0.match(R->getOperand(0));
   }
 };
 
-template <typename Op0_t, typename Op1_t, unsigned Opcode>
-struct BinaryVPInstruction_match {
+template <typename Op0_t, unsigned Opcode>
+using UnaryVPInstruction_match =
+    UnaryRecipe_match<Op0_t, Opcode, VPInstruction>;
+
+template <typename Op0_t, unsigned Opcode>
+using AllUnaryRecipe_match =
+    UnaryRecipe_match<Op0_t, Opcode, VPWidenRecipe, VPReplicateRecipe,
+                      VPWidenCastRecipe, VPInstruction>;
+
+template <typename Op0_t, typename Op1_t, unsigned Opcode,
+          typename... RecipeTys>
+struct BinaryRecipe_match {
   Op0_t Op0;
   Op1_t Op1;
 
-  BinaryVPInstruction_match(Op0_t Op0, Op1_t Op1) : Op0(Op0), Op1(Op1) {}
+  BinaryRecipe_match(Op0_t Op0, Op1_t Op1) : Op0(Op0), Op1(Op1) {}
 
   bool match(const VPValue *V) {
     auto *DefR = V->getDefiningRecipe();
     return DefR && match(DefR);
   }
 
+  bool match(const VPSingleDefRecipe *R) {
+    return match(static_cast<const VPRecipeBase *>(R));
+  }
+
   bool match(const VPRecipeBase *R) {
-    auto *DefR = dyn_cast<VPInstruction>(R);
-    if (!DefR || DefR->getOpcode() != Opcode)
+    if (!detail::MatchRecipeAndOpcode<Opcode, RecipeTys...>::match(R))
       return false;
-    assert(DefR->getNumOperands() == 2 &&
+    assert(R->getNumOperands() == 2 &&
            "recipe with matched opcode does not have 2 operands");
-    return Op0.match(DefR->getOperand(0)) && Op1.match(DefR->getOperand(1));
+    return Op0.match(R->getOperand(0)) && Op1.match(R->getOperand(1));
   }
 };
 
+template <typename Op0_t, typename Op1_t, unsigned Opcode>
+using BinaryVPInstruction_match =
+    BinaryRecipe_match<Op0_t, Op1_t, Opcode, VPInstruction>;
+
+template <typename Op0_t, typename Op1_t, unsigned Opcode>
+using AllBinaryRecipe_match =
+    BinaryRecipe_match<Op0_t, Op1_t, Opcode, VPWidenRecipe, VPReplicateRecipe,
+                       VPWidenCastRecipe, VPInstruction>;
+
 template <unsigned Opcode, typename Op0_t>
 inline UnaryVPInstruction_match<Op0_t, Opcode>
 m_VPInstruction(const Op0_t &Op0) {
@@ -130,6 +220,52 @@ inline BinaryVPInstruction_match<Op0_t, Op1_t, VPInstruction::BranchOnCount>
 m_BranchOnCount(const Op0_t &Op0, const Op1_t &Op1) {
   return m_VPInstruction<VPInstruction::BranchOnCount>(Op0, Op1);
 }
+
+template <unsigned Opcode, typename Op0_t>
+inline AllUnaryRecipe_match<Op0_t, Opcode> m_Unary(const Op0_t &Op0) {
+  return AllUnaryRecipe_match<Op0_t, Opcode>(Op0);
+}
+
+template <typename Op0_t>
+inline AllUnaryRecipe_match<Op0_t, Instruction::Trunc>
+m_Trunc(const Op0_t &Op0) {
+  return m_Unary<Instruction::Trunc, Op0_t>(Op0);
+}
+
+template <typename Op0_t>
+inline AllUnaryRecipe_match<Op0_t, Instruction::ZExt> m_ZExt(const Op0_t &Op0) {
+  return m_Unary<Instruction::ZExt, Op0_t>(Op0);
+}
+
+template <typename Op0_t>
+inline AllUnaryRecipe_match<Op0_t, Instruction::SExt> m_SExt(const Op0_t &Op0) {
+  return m_Unary<Instruction::SExt, Op0_t>(Op0);
+}
+
+template <typename Op0_t>
+inline match_combine_or<AllUnaryRecipe_match<Op0_t, Instruction::ZExt>,
+                        AllUnaryRecipe_match<Op0_t, Instruction::SExt>>
+m_ZExtOrSExt(const Op0_t &Op0) {
+  return m_CombineOr(m_ZExt(Op0), m_SExt(Op0));
+}
+
+template <unsigned Opcode, typename Op0_t, typename Op1_t>
+inline AllBinaryRecipe_match<Op0_t, Op1_t, Opcode> m_Binary(const Op0_t &Op0,
+                                                            const Op1_t &Op1) {
+  return AllBinaryRecipe_match<Op0_t, Op1_t, Opcode>(Op0, Op1);
+}
+
+template <typename Op0_t, typename Op1_t>
+inline AllBinaryRecipe_match<Op0_t, Op1_t, Instruction::Mul>
+m_Mul(const Op0_t &Op0, const Op1_t &Op1) {
+  return m_Binary<Instruction::Mul, Op0_t, Op1_t>(Op0, Op1);
+}
+
+template <typename Op0_t, typename Op1_t>
+inline AllBinaryRecipe_match<Op0_t, Op1_t, Instruction::Or>
+m_Or(const Op0_t &Op0, const Op1_t &Op1) {
+  return m_Binary<Instruction::Or, Op0_t, Op1_t>(Op0, Op1);
+}
 } // namespace VPlanPatternMatch
 } // namespace llvm
 
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 3af581e3526157..d75e322a74cfa6 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -442,8 +442,6 @@ Value *VPInstruction::generateInstruction(VPTransformState &State,
     // Reduce all of the unrolled parts into a single vector.
     Value *ReducedPartRdx = RdxParts[0];
     unsigned Op = RecurrenceDescriptor::getOpcode(RK);
-    if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK))
-      Op = Instruction::Or;
 
     if (PhiR->isOrdered()) {
       ReducedPartRdx = RdxParts[State.UF - 1];
@@ -456,16 +454,19 @@ Value *VPInstruction::generateInstruction(VPTransformState &State,
         if (Op != Instruction::ICmp && Op != Instruction::FCmp)
           ReducedPartRdx = Builder.CreateBinOp(
               (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx");
-        else
+        else if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) {
+          TrackingVH<Value> ReductionStartValue =
+              RdxDesc.getRecurrenceStartValue();
+          ReducedPartRdx = createAnyOfOp(Builder, ReductionStartValue, RK,
+                                         ReducedPartRdx, RdxPart);
+        } else
           ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
       }
     }
 
     // Create the reduction after the loop. Note that inloop reductions create
     // the target reduction in the loop using a Reduction recipe.
-    if ((State.VF.isVector() ||
-         RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) &&
-        !PhiR->isInLoop()) {
+    if (State.VF.isVector() && !PhiR->isInLoop()) {
       ReducedPartRdx =
           createTargetReduction(Builder, RdxDesc, ReducedPartRdx, OrigPhi);
       // If the reduction can be performed in a smaller type, we need to extend
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 3b19db9f0d30df..c6ec99fbbf0a0b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -156,8 +156,7 @@ static bool sinkScalarOperands(VPlan &Plan) {
     if (NeedsDuplicating) {
       if (ScalarVFOnly)
         continue;
-      Instruction *I = cast<Instruction>(
-          cast<VPReplicateRecipe>(SinkCandidate)->getUnderlyingValue());
+      Instruction *I = SinkCandidate->getUnderlyingInstr();
       auto *Clone = new VPReplicateRecipe(I, SinkCandidate->operands(), true);
       // TODO: add ".cloned" suffix to name of Clone's VPValue.
 
@@ -815,27 +814,6 @@ void VPlanTransforms::clearReductionWrapFlags(VPlan &Plan) {
   }
 }
 
-/// Returns true is \p V is constant one.
-static bool isConstantOne(VPValue *V) {
-  if (!V->isLiveIn())
-    return false;
-  auto *C = dyn_cast<ConstantInt>(V->getLiveInIRValue());
-  return C && C->isOne();
-}
-
-/// Returns the llvm::Instruction opcode for \p R.
-static unsigned getOpcodeForRecipe(VPRecipeBase &R) {
-  if (auto *WidenR = dyn_cast<VPWidenRecipe>(&R))
-    return WidenR->getUnderlyingInstr()->getOpcode();
-  if (auto *WidenC = dyn_cast<VPWidenCastRecipe>(&R))
-    return WidenC->getOpcode();
-  if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R))
-    return RepR->getUnderlyingInstr()->getOpcode();
-  if (auto *VPI = dyn_cast<VPInstruction>(&R))
-    return VPI->getOpcode();
-  return 0;
-}
-
 /// Try to simplify recipe \p R.
 static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
   // Try to remove redundant blend recipes.
@@ -849,24 +827,9 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
     return;
   }
 
-  switch (getOpcodeForRecipe(R)) {
-  case Instruction::Mul: {
-    VPValue *A = R.getOperand(0);
-    VPValue *B = R.getOperand(1);
-    if (isConstantOne(A))
-      return R.getVPSingleValue()->replaceAllUsesWith(B);
-    if (isConstantOne(B))
-      return R.getVPSingleValue()->replaceAllUsesWith(A);
-    break;
-  }
-  case Instruction::Trunc: {
-    VPRecipeBase *Ext = R.getOperand(0)->getDefiningRecipe();
-    if (!Ext)
-      break;
-    unsigned ExtOpcode = getOpcodeForRecipe(*Ext);
-    if (ExtOpcode != Instruction::ZExt && ExtOpcode != Instruction::SExt)
-      break;
-    VPValue *A = Ext->getOperand(0);
+  using namespace llvm::VPlanPatternMatch;
+  VPValue *A;
+  if (match(&R, m_Trunc(m_ZExtOrSExt(m_VPValue(A))))) {
     VPValue *Trunc = R.getVPSingleValue();
     Type *TruncTy = TypeInfo.inferScalarType(Trunc);
     Type *ATy = TypeInfo.inferScalarType(A);
@@ -875,8 +838,12 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
     } else {
       // Don't replace a scalarizing recipe with a widened cast.
       if (isa<VPReplicateRecipe>(&R))
-        break;
+        return;
       if (ATy->getScalarSizeInBits() < TruncTy->getScalarSizeInBits()) {
+
+        unsigned ExtOpcode = match(R.getOperand(0), m_SExt(m_VPValue()))
+                                 ? Instruction::SExt
+                                 : Instruction::ZExt;
         auto *VPC =
             new VPWidenCastRecipe(Instruction::CastOps(ExtOpcode), A, TruncTy);
         VPC->insertBefore(&R);
@@ -902,11 +869,11 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
         assert(TypeInfo.inferScalarType(VPV) == TypeInfo2.inferScalarType(VPV));
     }
 #endif
-    break;
-  }
-  default:
-    break;
   }
+
+  if (match(&R, m_CombineOr(m_Mul(m_VPValue(A), m_SpecificInt(1)),
+                            m_Mul(m_SpecificInt(1), m_VPValue(A)))))
+    return R.getVPSingleValue()->replaceAllUsesWith(A);
 }
 
 /// Try to simplify the recipes in \p Plan.
@@ -1249,6 +1216,23 @@ void VPlanTransforms::dropPoisonGeneratingRecipes(
       // load/store. If the underlying instruction has poison-generating flags,
       // drop them directly.
       if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(CurRec)) {
+        VPValue *A, *B;
+        using namespace llvm::VPlanPatternMatch;
+        // Dropping disjoint from an OR may yield incorrect results, as some
+        // analysis may have converted it to an Add implicitly (e.g. SCEV used
+        // for dependence analysis). Instead, replace it with an equivalent Add.
+        // This is possible as all users of the disjoint OR only access lanes
+        // where the operands are disjoint or poison otherwise.
+        if (match(RecWithFlags, m_Or(m_VPValue(A), m_VPValue(B))) &&
+            RecWithFlags->isDisjoint()) {
+          VPBuilder Builder(RecWithFlags);
+          VPInstruction *New = Builder.createOverflowingOp(
+              Instruction::Add, {A, B}, {false, false},
+              RecWithFlags->getDebugLoc());
+          RecWithFlags->replaceAllUsesWith(New);
+          RecWithFlags->eraseFromParent();
+          CurRec = New;
+        }
         RecWithFlags->dropPoisonGeneratingFlags();
       } else {
         Instruction *Instr = dyn_cast_or_null<Instruction>(
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index dc669e314a0d9a..0b16a8b7676923 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -684,10 +684,10 @@ bool VectorCombine::foldInsExtFNeg(Instruction &I) {
 /// destination type followed by shuffle. This can enable further transforms by
 /// moving bitcasts or shuffles together.
 bool VectorCombine::foldBitcastShuffle(Instruction &I) {
-  Value *V;
+  Value *V0;
   ArrayRef<int> Mask;
-  if (!match(&I, m_BitCast(
-                     m_OneUse(m_Shuffle(m_Value(V), m_Undef(), m_Mask(Mask))))))
+  if (!match(&I, m_BitCast(m_OneUse(
+                     m_Shuffle(m_Value(V0), m_Undef(), m_Mask(Mask))))))
     return false;
 
   // 1) Do not fold bitcast shuffle for scalable type. First, shuffle cost for
@@ -696,7 +696,7 @@ bool VectorCombine::foldBitcastShuffle(Instruction &I) {
   // 2) Disallow non-vector casts.
   // TODO: We could allow any shuffle.
   auto *DestTy = dyn_cast<FixedVectorType>(I.getType());
-  auto *SrcTy = dyn_cast<FixedVectorType>(V->getType());
+  auto *SrcTy = dyn_cast<FixedVectorType>(V0->getType());
   if (!DestTy || !SrcTy)
     return false;
 
@@ -724,20 +724,31 @@ bool VectorCombine::foldBitcastShuffle(Instruction &I) {
   // Bitcast the shuffle src - keep its original width but using the destination
   // scalar type.
   unsigned NumSrcElts = SrcTy->getPrimitiveSizeInBits() / DestEltSize;
-  auto *ShuffleTy = FixedVectorType::get(DestTy->getScalarType(), NumSrcElts);
-
-  // The new shuffle must not cost more than the old shuffle. The bitcast is
-  // moved ahead of the shuffle, so assume that it has the same cost as before.
-  InstructionCost DestCost = TTI.getShuffleCost(
-      TargetTransformInfo::SK_PermuteSingleSrc, ShuffleTy, NewMask);
+  auto *NewShuffleTy =
+      FixedVectorType::get(DestTy->getScalarType(), NumSrcElts);
+  auto *OldShuffleTy =
+      FixedVectorType::get(SrcTy->getScalarType(), Mask.size());
+
+  // The new shuffle must not cost more than the old shuffle.
+  TargetTransformInfo::TargetCostKind CK =
+      TargetTransformInfo::TCK_RecipThroughput;
+  TargetTransformInfo::ShuffleKind SK =
+      TargetTransformInfo::SK_PermuteSingleSrc;
+
+  InstructionCost DestCost =
+      TTI.getShuffleCost(SK, NewShuffleTy, NewMask, CK) +
+      TTI.getCastInstrCost(Instruction::BitCast, NewShuffleTy, SrcTy,
+                           TargetTransformInfo::CastContextHint::None, CK);
   InstructionCost SrcCost =
-      TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, SrcTy, Mask);
+      TTI.getShuffleCost(SK, SrcTy, Mask, CK) +
+      TTI.getCastInstrCost(Instruction::BitCast, DestTy, OldShuffleTy,
+                           TargetTransformInfo::CastContextHint::None, CK);
   if (DestCost > SrcCost || !DestCost.isValid())
     return false;
 
-  // bitcast (shuf V, MaskC) --> shuf (bitcast V), MaskC'
+  // bitcast (shuf V0, MaskC) --> shuf (bitcast V0), MaskC'
   ++NumShufOfBitcast;
-  Value *CastV = Builder.CreateBitCast(V, ShuffleTy);
+  Value *CastV = Builder.CreateBitCast(V0, NewShuffleTy);
   Value *Shuf = Builder.CreateShuffleVector(CastV, NewMask);
   replaceValue(I, *Shuf);
   return true;
@@ -786,9 +797,12 @@ bool VectorCombine::scalarizeVPIntrinsic(Instruction &I) {
   // intrinsic
   VectorType *VecTy = cast<VectorType>(VPI.getType());
   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+  SmallVector<int> Mask;
+  if (auto *FVTy = dyn_cast<FixedVectorType>(VecTy))
+    Mask.resize(FVTy->getNumElements(), 0);
   InstructionCost SplatCost =
       TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind, 0) +
-      TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy);
+      TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, Mask);
 
   // Calculate the cost of the VP Intrinsic
   SmallVector<Type *, 4> Args;
diff --git a/llvm/test/Analysis/ValueTracking/known-non-zero.ll b/llvm/test/Analysis/ValueTracking/known-non-zero.ll
index d804fe96649039..30d340dc725a3e 100644
--- a/llvm/test/Analysis/ValueTracking/known-non-zero.ll
+++ b/llvm/test/Analysis/ValueTracking/known-non-zero.ll
@@ -1292,4 +1292,15 @@ true:
 false:
   ret i1 %ne
 }
+
+define <2 x i1> @range_metadata_vec(ptr %p, <2 x i32> %x) {
+; CHECK-LABEL: @range_metadata_vec(
+; CHECK-NEXT:    ret <2 x i1> <i1 true, i1 true>
+;
+  %v = load <2 x i32>, ptr %p, !range !{i32 1, i32 100}
+  %or = or <2 x i32> %v, %x
+  %cmp = icmp ne <2 x i32> %or, zeroinitializer
+  ret <2 x i1> %cmp
+}
+
 declare i32 @llvm.experimental.get.vector.length.i32(i32, i32, i1)
diff --git a/llvm/test/Assembler/debug-info.ll b/llvm/test/Assembler/debug-info.ll
index 419623a2cb7d14..06144b261373f4 100644
--- a/llvm/test/Assembler/debug-info.ll
+++ b/llvm/test/Assembler/debug-info.ll
@@ -1,8 +1,8 @@
 ; RUN: llvm-as < %s | llvm-dis | llvm-as | llvm-dis | FileCheck %s
 ; RUN: verify-uselistorder %s
 
-; CHECK: !named = !{!0, !0, !1, !2, !3, !4, !5, !6, !7, !8, !8, !9, !10, !11, !12, !13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27, !27, !28, !29, !30, !31, !32, !33, !34, !35, !36, !37, !38, !39}
-!named = !{!0, !1, !2, !3, !4, !5, !6, !7, !8, !9, !10, !11, !12, !13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27, !28, !29, !30, !31, !32, !33, !34, !35, !36, !37, !38, !39, !40, !41, !42}
+; CHECK: !named = !{!0, !0, !1, !2, !3, !4, !5, !6, !7, !8, !8, !9, !10, !11, !12, !13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27, !27, !28, !29, !30, !31, !32, !33, !34, !35, !36, !37, !38, !39, !40, !41, !42, !43}
+!named = !{!0, !1, !2, !3, !4, !5, !6, !7, !8, !9, !10, !11, !12, !13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27, !28, !29, !30, !31, !32, !33, !34, !35, !36, !37, !38, !39, !40, !41, !42, !43, !44, !45, !46}
 
 ; CHECK:      !0 = !DISubrange(count: 3, lowerBound: 0)
 ; CHECK-NEXT: !1 = !DISubrange(count: 3, lowerBound: 4)
@@ -99,3 +99,15 @@
 ; CHECK-NEXT: !39 = !DIBasicType(name: "u64.le", size: 64, align: 1, encoding: DW_ATE_unsigned, flags: DIFlagLittleEndian)
 !41 = !DIBasicType(name: "u64.be", size: 64, align: 1, encoding: DW_ATE_unsigned, flags: DIFlagBigEndian)
 !42 = !DIBasicType(name: "u64.le", size: 64, align: 1, encoding: DW_ATE_unsigned, flags: DIFlagLittleEndian)
+
+; CHECK: !DIDerivedType(tag: DW_TAG_LLVM_ptrauth_type, baseType: !13, ptrAuthKey: 2, ptrAuthIsAddressDiscriminated: true, ptrAuthExtraDiscriminator: 1234, ptrAuthIsaPointer: false, ptrAuthAuthenticatesNullValues: false)
+!43 = !DIDerivedType(tag: DW_TAG_LLVM_ptrauth_type, baseType: !15, ptrAuthKey: 2, ptrAuthIsAddressDiscriminated: true, ptrAuthExtraDiscriminator: 1234)
+
+; CHECK: !DIDerivedType(tag: DW_TAG_LLVM_ptrauth_type, baseType: !13, ptrAuthKey: 2, ptrAuthIsAddressDiscriminated: true, ptrAuthExtraDiscriminator: 1234, ptrAuthIsaPointer: true, ptrAuthAuthenticatesNullValues: false)
+!44 = !DIDerivedType(tag: DW_TAG_LLVM_ptrauth_type, baseType: !15, ptrAuthKey: 2, ptrAuthIsAddressDiscriminated: true, ptrAuthExtraDiscriminator: 1234, ptrAuthIsaPointer: true)
+
+; CHECK: !DIDerivedType(tag: DW_TAG_LLVM_ptrauth_type, baseType: !13, ptrAuthKey: 2, ptrAuthIsAddressDiscriminated: true, ptrAuthExtraDiscriminator: 1234, ptrAuthIsaPointer: false, ptrAuthAuthenticatesNullValues: true)
+!45 = !DIDerivedType(tag: DW_TAG_LLVM_ptrauth_type, baseType: !15, ptrAuthKey: 2, ptrAuthIsAddressDiscriminated: true, ptrAuthExtraDiscriminator: 1234, ptrAuthAuthenticatesNullValues: true)
+
+; CHECK: !DIDerivedType(tag: DW_TAG_LLVM_ptrauth_type, baseType: !13, ptrAuthKey: 2, ptrAuthIsAddressDiscriminated: true, ptrAuthExtraDiscriminator: 1234, ptrAuthIsaPointer: true, ptrAuthAuthenticatesNullValues: true)
+!46 = !DIDerivedType(tag: DW_TAG_LLVM_ptrauth_type, baseType: !15, ptrAuthKey: 2, ptrAuthIsAddressDiscriminated: true, ptrAuthExtraDiscriminator: 1234, ptrAuthIsaPointer: true, ptrAuthAuthenticatesNullValues: true)
diff --git a/llvm/test/Assembler/getelementptr.ll b/llvm/test/Assembler/getelementptr.ll
index 50695a65abb333..45c6a2d00cc346 100644
--- a/llvm/test/Assembler/getelementptr.ll
+++ b/llvm/test/Assembler/getelementptr.ll
@@ -23,24 +23,26 @@
 @PR23753_b = global ptr getelementptr (i8, ptr @PR23753_a, i64 ptrtoint (ptr @PR23753_a to i64))
 ; CHECK: @PR23753_b = global ptr getelementptr (i8, ptr @PR23753_a, i64 ptrtoint (ptr @PR23753_a to i64))
 
-; Verify that inrange on an index inhibits over-indexed getelementptr folding.
+; Verify that inrange doesn't inhibit over-indexed getelementptr folding,
+; but does inhibit combining two GEPs where the inner one has inrange (this
+; will be done when DataLayout is available instead).
 
 @nestedarray = global [2 x [4 x ptr]] zeroinitializer
 
-; CHECK: @nestedarray.1 = alias ptr, getelementptr inbounds ([2 x [4 x ptr]], ptr @nestedarray, inrange i32 0, i64 1, i32 0)
-@nestedarray.1 = alias ptr, getelementptr inbounds ([2 x [4 x ptr]], ptr @nestedarray, inrange i32 0, i32 0, i32 4)
+; CHECK: @nestedarray.1 = alias ptr, getelementptr inbounds inrange(-32, 32) ([2 x [4 x ptr]], ptr @nestedarray, i32 0, i64 1, i32 0)
+@nestedarray.1 = alias ptr, getelementptr inbounds inrange(-32, 32) ([2 x [4 x ptr]], ptr @nestedarray, i32 0, i32 0, i32 4)
 
-; CHECK: @nestedarray.2 = alias ptr, getelementptr inbounds ([2 x [4 x ptr]], ptr @nestedarray, i32 0, inrange i32 0, i32 4)
-@nestedarray.2 = alias ptr, getelementptr inbounds ([2 x [4 x ptr]], ptr @nestedarray, i32 0, inrange i32 0, i32 4)
+; CHECK: @nestedarray.2 = alias ptr, getelementptr inbounds inrange(0, 1) ([2 x [4 x ptr]], ptr @nestedarray, i32 0, i64 1, i32 0)
+@nestedarray.2 = alias ptr, getelementptr inbounds inrange(0, 1) ([2 x [4 x ptr]], ptr @nestedarray, i32 0, i32 0, i32 4)
 
-; CHECK: @nestedarray.3 = alias ptr, getelementptr inbounds ([2 x [4 x ptr]], ptr @nestedarray, i32 0, inrange i32 0)
-@nestedarray.3 = alias ptr, getelementptr inbounds ([4 x ptr], ptr getelementptr inbounds ([2 x [4 x ptr]], ptr @nestedarray, i32 0, inrange i32 0), i32 0, i32 0)
+; CHECK: @nestedarray.3 = alias ptr, getelementptr inbounds inrange(0, 4) ([4 x ptr], ptr @nestedarray, i32 0, i32 0)
+@nestedarray.3 = alias ptr, getelementptr inbounds inrange(0, 4) ([4 x ptr], ptr getelementptr inbounds ([2 x [4 x ptr]], ptr @nestedarray, i32 0, i32 0), i32 0, i32 0)
 
-; CHECK: @nestedarray.4 = alias ptr, getelementptr inbounds ([2 x [4 x ptr]], ptr @nestedarray, i32 0, i32 1, i32 0)
-@nestedarray.4 = alias ptr, getelementptr inbounds ([4 x ptr], ptr getelementptr inbounds ([2 x [4 x ptr]], ptr @nestedarray, i32 0, inrange i32 0), i32 1, i32 0)
+; CHECK: @nestedarray.4 = alias ptr, getelementptr inbounds ([4 x ptr], ptr getelementptr inbounds inrange(0, 4) ([2 x [4 x ptr]], ptr @nestedarray, i32 0, i32 0), i32 1, i32 0)
+@nestedarray.4 = alias ptr, getelementptr inbounds ([4 x ptr], ptr getelementptr inbounds inrange(0, 4) ([2 x [4 x ptr]], ptr @nestedarray, i32 0, i32 0), i32 1, i32 0)
 
-; CHECK: @nestedarray.5 = alias ptr, getelementptr inbounds ([2 x [4 x ptr]], ptr @nestedarray, inrange i32 0, i32 1, i32 0)
-@nestedarray.5 = alias ptr, getelementptr inbounds ([4 x ptr], ptr getelementptr inbounds ([2 x [4 x ptr]], ptr @nestedarray, inrange i32 0, i32 0), i32 1, i32 0)
+; CHECK: @nestedarray.5 = alias ptr, getelementptr inbounds ([4 x ptr], ptr getelementptr inbounds inrange(0, 32) ([2 x [4 x ptr]], ptr @nestedarray, i32 0, i32 0), i32 1, i32 0)
+@nestedarray.5 = alias ptr, getelementptr inbounds ([4 x ptr], ptr getelementptr inbounds inrange(0, 32) ([2 x [4 x ptr]], ptr @nestedarray, i32 0, i32 0), i32 1, i32 0)
 
 ; See if i92 indices work too.
 define ptr @test(ptr %t, i92 %n) {
diff --git a/llvm/test/Assembler/inrange-errors.ll b/llvm/test/Assembler/inrange-errors.ll
new file mode 100644
index 00000000000000..128c21983ca617
--- /dev/null
+++ b/llvm/test/Assembler/inrange-errors.ll
@@ -0,0 +1,46 @@
+; RUN: split-file %s %t
+; RUN: not llvm-as < %t/parse-error-1.ll -o /dev/null 2>&1 | FileCheck --check-prefix=PARSE-ERROR-1 %s
+; RUN: not llvm-as < %t/parse-error-2.ll -o /dev/null 2>&1 | FileCheck --check-prefix=PARSE-ERROR-2 %s
+; RUN: not llvm-as < %t/parse-error-3.ll -o /dev/null 2>&1 | FileCheck --check-prefix=PARSE-ERROR-3 %s
+; RUN: not llvm-as < %t/parse-error-4.ll -o /dev/null 2>&1 | FileCheck --check-prefix=PARSE-ERROR-4 %s
+; RUN: not llvm-as < %t/end-not-larger-start.ll -o /dev/null 2>&1 | FileCheck --check-prefix=END-NOT-LARGER-START %s
+
+;--- parse-error-1.ll
+
+; PARSE-ERROR-1: error: expected integer
+@g = external global i8
+define ptr @test() {
+  ret ptr getelementptr inrange (i8, ptr @g, i64 8)
+}
+
+;--- parse-error-2.ll
+
+; PARSE-ERROR-2: error: expected ','
+@g = external global i8
+define ptr @test() {
+  ret ptr getelementptr inrange(42 (i8, ptr @g, i64 8)
+}
+
+;--- parse-error-3.ll
+
+; PARSE-ERROR-3: error: expected integer
+@g = external global i8
+define ptr @test() {
+  ret ptr getelementptr inrange(42, (i8, ptr @g, i64 8)
+}
+
+;--- parse-error-4.ll
+
+; PARSE-ERROR-4: error: expected ')'
+@g = external global i8
+define ptr @test() {
+  ret ptr getelementptr inrange(42, 123 (i8, ptr @g, i64 8)
+}
+
+;--- end-not-larger-start.ll
+
+; END-NOT-LARGER-START: error: expected end to be larger than start
+@g = external global i8
+define ptr @test() {
+  ret ptr getelementptr inrange(42, 42) (i8, ptr @g, i64 8)
+}
diff --git a/llvm/test/Bindings/llvm-c/echo.ll b/llvm/test/Bindings/llvm-c/echo.ll
index be0207599478b8..953a16b7e624e1 100644
--- a/llvm/test/Bindings/llvm-c/echo.ll
+++ b/llvm/test/Bindings/llvm-c/echo.ll
@@ -334,6 +334,20 @@ define void @test_fast_math_flags_call_outer(float %a) {
   ret void
 }
 
+define void @test_func_prefix_data_01() prefix i32 123 {
+  ret void
+}
+
+define void @test_func_prefix_data_02() prefix i64 2000 {
+  ret void
+}
+
+%func_prolog_struct = type <{ i8, i8, ptr }>
+
+define void @test_func_prologue_data_01() prologue %func_prolog_struct <{ i8 235, i8 8, ptr zeroinitializer}> {
+  ret void
+}
+
 !llvm.dbg.cu = !{!0, !2}
 !llvm.module.flags = !{!3}
 
diff --git a/llvm/test/Bitcode/DIExpression-aggresult.ll b/llvm/test/Bitcode/DIExpression-aggresult.ll
index 0b89454aa2f942..017218277d02bf 100644
--- a/llvm/test/Bitcode/DIExpression-aggresult.ll
+++ b/llvm/test/Bitcode/DIExpression-aggresult.ll
@@ -1,4 +1,5 @@
 ; RUN: llvm-dis -o - %s.bc | FileCheck %s
+; RUN: llvm-dis -o - %s.bc --load-bitcode-into-experimental-debuginfo-iterators=true | FileCheck %s
 %class.A = type { i32, i32, i32, i32 }
 
 define void @_Z3fooi(%class.A* sret(%class.A) %agg.result) #0 !dbg !3 {
diff --git a/llvm/test/Bitcode/compatibility-4.0.ll b/llvm/test/Bitcode/compatibility-4.0.ll
index da5ea0e19639c1..06cb842059a4fa 100644
--- a/llvm/test/Bitcode/compatibility-4.0.ll
+++ b/llvm/test/Bitcode/compatibility-4.0.ll
@@ -1609,7 +1609,7 @@ declare void @f.writeonly() writeonly
 ;; Constant Expressions
 
 define i8** @constexpr() {
-  ; CHECK: ret ptr getelementptr inbounds ({ [4 x ptr], [4 x ptr] }, ptr null, i32 0, inrange i32 1, i32 2)
+  ; CHECK: ret ptr getelementptr inbounds ({ [4 x ptr], [4 x ptr] }, ptr null, i32 0, i32 1, i32 2)
   ret i8** getelementptr inbounds ({ [4 x i8*], [4 x i8*] }, { [4 x i8*], [4 x i8*] }* null, i32 0, inrange i32 1, i32 2)
 }
 
diff --git a/llvm/test/Bitcode/compatibility-5.0.ll b/llvm/test/Bitcode/compatibility-5.0.ll
index 7a39ae6256b805..f9ae558917cddc 100644
--- a/llvm/test/Bitcode/compatibility-5.0.ll
+++ b/llvm/test/Bitcode/compatibility-5.0.ll
@@ -1624,7 +1624,7 @@ declare void @f.speculatable() speculatable
 ;; Constant Expressions
 
 define i8** @constexpr() {
-  ; CHECK: ret ptr getelementptr inbounds ({ [4 x ptr], [4 x ptr] }, ptr null, i32 0, inrange i32 1, i32 2)
+  ; CHECK: ret ptr getelementptr inbounds ({ [4 x ptr], [4 x ptr] }, ptr null, i32 0, i32 1, i32 2)
   ret i8** getelementptr inbounds ({ [4 x i8*], [4 x i8*] }, { [4 x i8*], [4 x i8*] }* null, i32 0, inrange i32 1, i32 2)
 }
 
diff --git a/llvm/test/Bitcode/compatibility-6.0.ll b/llvm/test/Bitcode/compatibility-6.0.ll
index 4cb1f3bd123cb1..1458e1b639ad44 100644
--- a/llvm/test/Bitcode/compatibility-6.0.ll
+++ b/llvm/test/Bitcode/compatibility-6.0.ll
@@ -1634,7 +1634,7 @@ declare void @f.speculatable() speculatable
 ;; Constant Expressions
 
 define i8** @constexpr() {
-  ; CHECK: ret ptr getelementptr inbounds ({ [4 x ptr], [4 x ptr] }, ptr null, i32 0, inrange i32 1, i32 2)
+  ; CHECK: ret ptr getelementptr inbounds ({ [4 x ptr], [4 x ptr] }, ptr null, i32 0, i32 1, i32 2)
   ret i8** getelementptr inbounds ({ [4 x i8*], [4 x i8*] }, { [4 x i8*], [4 x i8*] }* null, i32 0, inrange i32 1, i32 2)
 }
 
diff --git a/llvm/test/Bitcode/compatibility.ll b/llvm/test/Bitcode/compatibility.ll
index ce6a6571ec144c..fa8b0520567a6f 100644
--- a/llvm/test/Bitcode/compatibility.ll
+++ b/llvm/test/Bitcode/compatibility.ll
@@ -1941,8 +1941,8 @@ declare void @f.speculatable() speculatable
 ;; Constant Expressions
 
 define ptr @constexpr() {
-  ; CHECK: ret ptr getelementptr inbounds ({ [4 x ptr], [4 x ptr] }, ptr null, i32 0, inrange i32 1, i32 2)
-  ret ptr getelementptr inbounds ({ [4 x ptr], [4 x ptr] }, ptr null, i32 0, inrange i32 1, i32 2)
+  ; CHECK: ret ptr getelementptr inbounds inrange(-16, 16) ({ [4 x ptr], [4 x ptr] }, ptr null, i32 0, i32 1, i32 2)
+  ret ptr getelementptr inbounds inrange(-16, 16) ({ [4 x ptr], [4 x ptr] }, ptr null, i32 0, i32 1, i32 2)
 }
 
 define void @instructions.strictfp() strictfp {
diff --git a/llvm/test/Bitcode/dbg-record-roundtrip.ll b/llvm/test/Bitcode/dbg-record-roundtrip.ll
new file mode 100644
index 00000000000000..bd347cac720677
--- /dev/null
+++ b/llvm/test/Bitcode/dbg-record-roundtrip.ll
@@ -0,0 +1,172 @@
+;; Roundtrip tests.
+
+;; Load RemoveDIs mode in llvm-dis but write out debug intrinsics.
+; RUN: llvm-as --write-experimental-debuginfo-iterators-to-bitcode=true %s -o - \
+; RUN: | llvm-dis --load-bitcode-into-experimental-debuginfo-iterators=true --write-experimental-debuginfo=false \
+; RUN: | FileCheck %s
+
+;; Load and write RemoveDIs mode in llvm-dis.
+; RUN: llvm-as --write-experimental-debuginfo-iterators-to-bitcode=true %s -o - \
+; RUN: | llvm-dis --load-bitcode-into-experimental-debuginfo-iterators=true --write-experimental-debuginfo=true \
+; RUN: | FileCheck %s --check-prefixes=RECORDS
+
+;; Load intrinsics directly into the new format (auto-upgrade).
+; RUN: llvm-as --write-experimental-debuginfo-iterators-to-bitcode=false %s -o - \
+; RUN: | llvm-dis --load-bitcode-into-experimental-debuginfo-iterators=true --write-experimental-debuginfo=true \
+; RUN: | FileCheck %s --check-prefixes=RECORDS
+
+;; Check that verify-uselistorder passes regardless of input format.
+; RUN: llvm-as %s --write-experimental-debuginfo-iterators-to-bitcode=true -o - | verify-uselistorder
+; RUN: verify-uselistorder %s
+
+;; Confirm we're producing RemoveDI records from various tools.
+; RUN: opt %s -o - --write-experimental-debuginfo-iterators-to-bitcode=true | llvm-bcanalyzer - | FileCheck %s --check-prefix=BITCODE
+; RUN: llvm-as %s -o - --write-experimental-debuginfo-iterators-to-bitcode=true | llvm-bcanalyzer - | FileCheck %s --check-prefix=BITCODE
+; BITCODE-DAG: DEBUG_RECORD_LABEL
+; BITCODE-DAG: DEBUG_RECORD_VALUE
+; BITCODE-DAG: DEBUG_RECORD_ASSIGN
+; BITCODE-DAG: DEBUG_RECORD_DECLARE
+
+;; Check that llvm-link doesn't explode if we give it different formats to
+;; link.
+;; NOTE: This test fails intermittently on linux if the llvm-as output is piped
+;; into llvm-link in the RUN lines below, unless the verify-uselistorder RUN
+;; lines above are removed. Write to a temporary file to avoid that weirdness.
+;; NOTE2: Unfortunately, the above only stopped it occuring on my machine.
+;; It failed again intermittently here:
+;; https://lab.llvm.org/buildbot/#/builders/245/builds/21930
+;; Allow this test to fail-over twice, until this strangeness is understood.
+; ALLOW_RETRIES: 2
+; RUN: llvm-as %s --experimental-debuginfo-iterators=true --write-experimental-debuginfo-iterators-to-bitcode=true -o %t
+; RUN: llvm-link %t %s --experimental-debuginfo-iterators=false -o /dev/null
+; RUN: llvm-as %s --experimental-debuginfo-iterators=false -o %t
+; RUN: llvm-link %t %s --experimental-debuginfo-iterators=true
+
+;; Checks inline.
+
+@g = internal dso_local global i32 0, align 4, !dbg !0
+
+define internal dso_local noundef i32 @_Z3funv(i32 %p, ptr %storage) !dbg !13 {
+entry:
+;; Dbg record at top of block, check dbg.value configurations.
+; CHECK: entry:
+; CHECK-NEXT: dbg.value(metadata i32 %p, metadata ![[e:[0-9]+]], metadata !DIExpression()), !dbg ![[dbg:[0-9]+]]
+; CHECK-NEXT: dbg.value(metadata ![[empty:[0-9]+]], metadata ![[e]], metadata !DIExpression()), !dbg ![[dbg]]
+; CHECK-NEXT: dbg.value(metadata i32 poison, metadata ![[e]], metadata !DIExpression()), !dbg ![[dbg]]
+; CHECK-NEXT: dbg.value(metadata i32 1, metadata ![[f:[0-9]+]], metadata !DIExpression()), !dbg ![[dbg]]
+; RECORDS: entry:
+; RECORDS-NEXT: dbg_value(i32 %p, ![[e:[0-9]+]], !DIExpression(), ![[dbg:[0-9]+]])
+; RECORDS-NEXT: dbg_value(![[empty:[0-9]+]], ![[e]], !DIExpression(), ![[dbg]])
+; RECORDS-NEXT: dbg_value(i32 poison, ![[e]], !DIExpression(), ![[dbg]])
+; RECORDS-NEXT: dbg_value(i32 1, ![[f:[0-9]+]], !DIExpression(), ![[dbg]])
+  tail call void @llvm.dbg.value(metadata i32 %p, metadata !32, metadata !DIExpression()), !dbg !19
+  tail call void @llvm.dbg.value(metadata !29, metadata !32, metadata !DIExpression()), !dbg !19
+  tail call void @llvm.dbg.value(metadata i32 poison, metadata !32, metadata !DIExpression()), !dbg !19
+  tail call void @llvm.dbg.value(metadata i32 1, metadata !33, metadata !DIExpression()), !dbg !19
+;; Arglist with an argument, constant, local use before def, poison.
+; CHECK-NEXT: dbg.value(metadata !DIArgList(i32 %p, i32 0, i32 %0, i32 poison), metadata ![[f]], metadata !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_arg, 1, DW_OP_plus, DW_OP_LLVM_arg, 2, DW_OP_LLVM_arg, 3, DW_OP_plus, DW_OP_minus)), !dbg ![[dbg]]
+; RECORDS-NEXT: dbg_value(!DIArgList(i32 %p, i32 0, i32 %0, i32 poison), ![[f]], !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_arg, 1, DW_OP_plus, DW_OP_LLVM_arg, 2, DW_OP_LLVM_arg, 3, DW_OP_plus, DW_OP_minus), ![[dbg]])
+  tail call void @llvm.dbg.value(metadata !DIArgList(i32 %p, i32 0, i32 %0, i32 poison), metadata !33, metadata !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_arg, 1, DW_OP_plus, DW_OP_LLVM_arg, 2, DW_OP_LLVM_arg, 3, DW_OP_plus, DW_OP_minus)), !dbg !19
+;; Check dbg.assign use before def (value, addr and ID). Check expression order too.
+; CHECK: dbg.assign(metadata i32 %0, metadata ![[i:[0-9]+]], metadata !DIExpression(DW_OP_plus_uconst, 0),
+; CHECK-SAME:       metadata ![[ID:[0-9]+]], metadata ptr %a, metadata !DIExpression(DW_OP_plus_uconst, 1)), !dbg ![[dbg]]
+; RECORDS: dbg_assign(i32 %0, ![[i:[0-9]+]], !DIExpression(DW_OP_plus_uconst, 0),
+; RECORDS-SAME:       ![[ID:[0-9]+]], ptr %a, !DIExpression(DW_OP_plus_uconst, 1), ![[dbg]])
+  tail call void @llvm.dbg.assign(metadata i32 %0, metadata !36, metadata !DIExpression(DW_OP_plus_uconst, 0), metadata !37, metadata ptr %a, metadata !DIExpression(DW_OP_plus_uconst, 1)), !dbg !19
+  %a = alloca i32, align 4, !DIAssignID !37
+; CHECK: %a = alloca i32, align 4, !DIAssignID ![[ID]]
+;; Check dbg.declare configurations.
+; CHECK-NEXT: dbg.declare(metadata ptr %a, metadata ![[a:[0-9]+]], metadata !DIExpression()), !dbg ![[dbg]]
+; CHECK-NEXT: dbg.declare(metadata ![[empty:[0-9]+]], metadata ![[b:[0-9]+]], metadata !DIExpression()), !dbg ![[dbg]]
+; CHECK-NEXT: dbg.declare(metadata ptr poison, metadata ![[c:[0-9]+]], metadata !DIExpression()), !dbg ![[dbg]]
+; CHECK-NEXT: dbg.declare(metadata ptr null, metadata ![[d:[0-9]+]], metadata !DIExpression()), !dbg ![[dbg]]
+; CHECK-NEXT: dbg.declare(metadata ptr @g, metadata ![[h:[0-9]+]], metadata !DIExpression()), !dbg ![[dbg]]
+; RECORDS: %a = alloca i32, align 4, !DIAssignID ![[ID]]
+;; Check dbg.declare configurations.
+; RECORDS-NEXT: dbg_declare(ptr %a, ![[a:[0-9]+]], !DIExpression(), ![[dbg]])
+; RECORDS-NEXT: dbg_declare(![[empty:[0-9]+]], ![[b:[0-9]+]], !DIExpression(), ![[dbg]])
+; RECORDS-NEXT: dbg_declare(ptr poison, ![[c:[0-9]+]], !DIExpression(), ![[dbg]])
+; RECORDS-NEXT: dbg_declare(ptr null, ![[d:[0-9]+]], !DIExpression(), ![[dbg]])
+; RECORDS-NEXT: dbg_declare(ptr @g, ![[h:[0-9]+]], !DIExpression(), ![[dbg]])
+  tail call void @llvm.dbg.declare(metadata ptr %a, metadata !17, metadata !DIExpression()), !dbg !19
+  tail call void @llvm.dbg.declare(metadata !29, metadata !28, metadata !DIExpression()), !dbg !19
+  tail call void @llvm.dbg.declare(metadata ptr poison, metadata !30, metadata !DIExpression()), !dbg !19
+  tail call void @llvm.dbg.declare(metadata ptr null, metadata !31, metadata !DIExpression()), !dbg !19
+  tail call void @llvm.dbg.declare(metadata ptr @g, metadata !35, metadata !DIExpression()), !dbg !19
+;; Argument value dbg.declare.
+; CHECK: dbg.declare(metadata ptr %storage, metadata ![[g:[0-9]+]], metadata !DIExpression()), !dbg ![[dbg]]
+; RECORDS: dbg_declare(ptr %storage, ![[g:[0-9]+]], !DIExpression(), ![[dbg]])
+  tail call void @llvm.dbg.declare(metadata ptr %storage, metadata !34, metadata !DIExpression()), !dbg !19
+;; Use before def dbg.value.
+; CHECK: dbg.value(metadata i32 %0, metadata ![[e]], metadata !DIExpression()), !dbg ![[dbg]]
+; RECORDS: dbg_value(i32 %0, ![[e]], !DIExpression(), ![[dbg]])
+  tail call void @llvm.dbg.value(metadata i32 %0, metadata !32, metadata !DIExpression()), !dbg !19
+  %0 = load i32, ptr @g, align 4, !dbg !20
+;; Non-argument local value dbg.value.
+; CHECK: dbg.value(metadata i32 %0, metadata ![[e]], metadata !DIExpression()), !dbg ![[dbg]]
+; RECORDS: dbg_value(i32 %0, ![[e]], !DIExpression(), ![[dbg]])
+  tail call void @llvm.dbg.value(metadata i32 %0, metadata !32, metadata !DIExpression()), !dbg !19
+  store i32 %0, ptr %a, align 4, !dbg !19
+  %1 = load i32, ptr %a, align 4, !dbg !25
+; CHECK: dbg.label(metadata ![[label:[0-9]+]]), !dbg ![[dbg]]
+; RECORDS: dbg_label(![[label:[0-9]+]], ![[dbg]])
+  tail call void @llvm.dbg.label(metadata !38), !dbg !19
+  ret i32 %1, !dbg !27
+}
+
+; CHECK-DAG: ![[a]] = !DILocalVariable(name: "a",
+; CHECK-DAG: ![[b]] = !DILocalVariable(name: "b",
+; CHECK-DAG: ![[c]] = !DILocalVariable(name: "c",
+; CHECK-DAG: ![[d]] = !DILocalVariable(name: "d",
+; CHECK-DAG: ![[e]] = !DILocalVariable(name: "e",
+; CHECK-DAG: ![[f]] = !DILocalVariable(name: "f",
+; CHECK-DAG: ![[g]] = !DILocalVariable(name: "g",
+; CHECK-DAG: ![[h]] = !DILocalVariable(name: "h",
+; CHECK-DAG: ![[i]] = !DILocalVariable(name: "i",
+; CHECK-DAG: ![[empty]] = !{}
+; CHECK-DAG: ![[label]] = !DILabel
+
+declare void @llvm.dbg.declare(metadata, metadata, metadata)
+declare void @llvm.dbg.value(metadata, metadata, metadata)
+declare void @llvm.dbg.assign(metadata, metadata, metadata, metadata, metadata, metadata)
+declare void @llvm.dbg.label(metadata)
+
+!llvm.dbg.cu = !{!2}
+!llvm.module.flags = !{!6, !7, !8, !9, !10, !11}
+!llvm.ident = !{!12}
+
+!0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression())
+!1 = distinct !DIGlobalVariable(name: "g", scope: !2, file: !3, line: 1, type: !5, isLocal: false, isDefinition: true)
+!2 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !3, producer: "clang version 19.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, globals: !4, splitDebugInlining: false, nameTableKind: None)
+!3 = !DIFile(filename: "test.cpp", directory: "/")
+!4 = !{!0}
+!5 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!6 = !{i32 7, !"Dwarf Version", i32 5}
+!7 = !{i32 2, !"Debug Info Version", i32 3}
+!8 = !{i32 1, !"wchar_size", i32 4}
+!9 = !{i32 8, !"PIC Level", i32 2}
+!10 = !{i32 7, !"PIE Level", i32 2}
+!11 = !{i32 7, !"uwtable", i32 2}
+!12 = !{!"clang version 19.0.0"}
+!13 = distinct !DISubprogram(name: "fun", linkageName: "_Z3funv", scope: !3, file: !3, line: 2, type: !14, scopeLine: 2, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2, retainedNodes: !16)
+!14 = !DISubroutineType(types: !15)
+!15 = !{!5}
+!16 = !{!17}
+!17 = !DILocalVariable(name: "a", scope: !13, file: !3, line: 3, type: !5)
+!18 = !DILocation(line: 3, column: 3, scope: !13)
+!19 = !DILocation(line: 3, column: 7, scope: !13)
+!20 = !DILocation(line: 3, column: 11, scope: !13)
+!25 = !DILocation(line: 4, column: 12, scope: !13)
+!26 = !DILocation(line: 5, column: 1, scope: !13)
+!27 = !DILocation(line: 4, column: 5, scope: !13)
+!28 = !DILocalVariable(name: "b", scope: !13, file: !3, line: 3, type: !5)
+!29 = !{}
+!30 = !DILocalVariable(name: "c", scope: !13, file: !3, line: 3, type: !5)
+!31 = !DILocalVariable(name: "d", scope: !13, file: !3, line: 3, type: !5)
+!32 = !DILocalVariable(name: "e", scope: !13, file: !3, line: 3, type: !5)
+!33 = !DILocalVariable(name: "f", scope: !13, file: !3, line: 3, type: !5)
+!34 = !DILocalVariable(name: "g", scope: !13, file: !3, line: 3, type: !5)
+!35 = !DILocalVariable(name: "h", scope: !13, file: !3, line: 3, type: !5)
+!36 = !DILocalVariable(name: "i", scope: !13, file: !3, line: 3, type: !5)
+!37 = distinct !DIAssignID()
+!38 = !DILabel(scope: !13, name: "label", file: !3, line: 1)
diff --git a/llvm/test/Bitcode/upgrade-dbg-addr.ll b/llvm/test/Bitcode/upgrade-dbg-addr.ll
index 40fd7db18948b5..06a411c2c83486 100644
--- a/llvm/test/Bitcode/upgrade-dbg-addr.ll
+++ b/llvm/test/Bitcode/upgrade-dbg-addr.ll
@@ -1,6 +1,7 @@
 ; Test upgrade of dbg.addr intrinsics into dbg.value with DW_OP_deref appended
 ;
 ; RUN: llvm-dis < %s.bc | FileCheck %s
+; RUN: llvm-dis < %s.bc --load-bitcode-into-experimental-debuginfo-iterators --write-experimental-debuginfo=false | FileCheck %s
 ; RUN: verify-uselistorder < %s.bc
 
 define i32 @example(i32 %num) {
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-vector-deinterleave2.ll b/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-vector-deinterleave2.ll
new file mode 100644
index 00000000000000..10882a06af1b40
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-vector-deinterleave2.ll
@@ -0,0 +1,34 @@
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -O0 -mtriple=aarch64-- --global-isel --global-isel-abort=2 --verify-machineinstrs --stop-after=irtranslator %s -o - | FileCheck %s
+
+define void @vector_deinterleave2_v4i32(<4 x i32> %a) {
+  ; CHECK-LABEL: name: vector_deinterleave2_v4i32
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK-NEXT:   liveins: $q0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0
+  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(<4 x s32>) = G_IMPLICIT_DEF
+  ; CHECK-NEXT:   [[SHUF:%[0-9]+]]:_(<2 x s32>) = G_SHUFFLE_VECTOR [[COPY]](<4 x s32>), [[DEF]], shufflemask(0, 2)
+  ; CHECK-NEXT:   [[SHUF1:%[0-9]+]]:_(<2 x s32>) = G_SHUFFLE_VECTOR [[COPY]](<4 x s32>), [[DEF]], shufflemask(1, 3)
+  ; CHECK-NEXT:   RET_ReallyLR
+    %res = call {<2 x i32>, <2 x i32>} @llvm.experimental.vector.deinterleave2.v4i32(<4 x i32> %a)
+    ret void
+}
+
+define void @vector_deinterleave2_v8f32(<8 x float> %a) {
+  ; CHECK-LABEL: name: vector_deinterleave2_v8f32
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK-NEXT:   liveins: $q0, $q1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(<2 x s64>) = COPY $q1
+  ; CHECK-NEXT:   [[BITCAST:%[0-9]+]]:_(<4 x s32>) = G_BITCAST [[COPY]](<2 x s64>)
+  ; CHECK-NEXT:   [[BITCAST1:%[0-9]+]]:_(<4 x s32>) = G_BITCAST [[COPY1]](<2 x s64>)
+  ; CHECK-NEXT:   [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s32>) = G_CONCAT_VECTORS [[BITCAST]](<4 x s32>), [[BITCAST1]](<4 x s32>)
+  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(<8 x s32>) = G_IMPLICIT_DEF
+  ; CHECK-NEXT:   [[SHUF:%[0-9]+]]:_(<4 x s32>) = G_SHUFFLE_VECTOR [[CONCAT_VECTORS]](<8 x s32>), [[DEF]], shufflemask(0, 2, 4, 6)
+  ; CHECK-NEXT:   [[SHUF1:%[0-9]+]]:_(<4 x s32>) = G_SHUFFLE_VECTOR [[CONCAT_VECTORS]](<8 x s32>), [[DEF]], shufflemask(1, 3, 5, 7)
+  ; CHECK-NEXT:   RET_ReallyLR
+    %res = call {<4 x float>, <4 x float>} @llvm.experimental.vector.deinterleave2.v8f32(<8 x float> %a)
+    ret void
+}
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-vector-interleave2.ll b/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-vector-interleave2.ll
new file mode 100644
index 00000000000000..f51e47a428d10e
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-vector-interleave2.ll
@@ -0,0 +1,30 @@
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -O0 -mtriple=aarch64-- --global-isel --global-isel-abort=2 --verify-machineinstrs --stop-after=irtranslator %s -o - | FileCheck %s
+
+define void @vector_interleave2_v4i32(<2 x i32> %a, <2 x i32> %b) {
+  ; CHECK-LABEL: name: vector_interleave2_v4i32
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK-NEXT:   liveins: $d0, $d1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $d0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(<2 x s32>) = COPY $d1
+  ; CHECK-NEXT:   [[SHUF:%[0-9]+]]:_(<4 x s32>) = G_SHUFFLE_VECTOR [[COPY]](<2 x s32>), [[COPY1]], shufflemask(0, 2, 1, 3)
+  ; CHECK-NEXT:   RET_ReallyLR
+    %res = call <4 x i32> @llvm.experimental.vector.interleave2.v4i32(<2 x i32> %a, <2 x i32> %b)
+    ret void
+}
+
+define void @vector_interleave2_v8f32(<4 x float> %a, <4 x float> %b) {
+  ; CHECK-LABEL: name: vector_interleave2_v8f32
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK-NEXT:   liveins: $q0, $q1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0
+  ; CHECK-NEXT:   [[BITCAST:%[0-9]+]]:_(<4 x s32>) = G_BITCAST [[COPY]](<2 x s64>)
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(<2 x s64>) = COPY $q1
+  ; CHECK-NEXT:   [[BITCAST1:%[0-9]+]]:_(<4 x s32>) = G_BITCAST [[COPY1]](<2 x s64>)
+  ; CHECK-NEXT:   [[SHUF:%[0-9]+]]:_(<8 x s32>) = G_SHUFFLE_VECTOR [[BITCAST]](<4 x s32>), [[BITCAST1]], shufflemask(0, 4, 1, 5, 2, 6, 3, 7)
+  ; CHECK-NEXT:   RET_ReallyLR
+    %res = call <8 x float> @llvm.experimental.vector.interleave2.v8f32(<4 x float> %a, <4 x float> %b)
+    ret void
+}
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-load-store.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-load-store.mir
index 5cbb8649d158b0..b8328eda9a6618 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-load-store.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-load-store.mir
@@ -607,9 +607,11 @@ body:             |
     ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:_(s16) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2)
     ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD]](s16)
     ; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD1]](s16)
-    ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[ANYEXT]](s32), [[ANYEXT1]](s32)
-    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(<2 x s16>) = G_TRUNC [[BUILD_VECTOR]](<2 x s32>)
-    ; CHECK-NEXT: $s0 = COPY [[TRUNC]](<2 x s16>)
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[ANYEXT]](s32), [[ANYEXT1]](s32), [[DEF]](s32), [[DEF]](s32)
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(<4 x s16>) = G_TRUNC [[BUILD_VECTOR]](<4 x s32>)
+    ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[TRUNC]](<4 x s16>)
+    ; CHECK-NEXT: $s0 = COPY [[UV]](<2 x s16>)
     ; CHECK-NEXT: RET_ReallyLR
     %0:_(p0) = COPY $x0
     %1(<2 x s16>) = G_LOAD %0(p0) :: (load (<2 x s16>))
@@ -711,33 +713,24 @@ body:             |
     ; CHECK: liveins: $x0
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: %ptr:_(p0) = COPY $x0
-    ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(p0) = G_LOAD %ptr(p0) :: (load (p0), align 64)
-    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
+    ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD %ptr(p0) :: (load (<2 x s64>), align 64)
+    ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p0>) = G_BITCAST [[LOAD]](<2 x s64>)
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
     ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD %ptr, [[C]](s64)
-    ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:_(p0) = G_LOAD [[PTR_ADD]](p0) :: (load (p0) from unknown-address + 8)
-    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
+    ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[PTR_ADD]](p0) :: (load (<2 x s64>) from unknown-address + 16)
+    ; CHECK-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x p0>) = G_BITCAST [[LOAD1]](<2 x s64>)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 32
     ; CHECK-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD %ptr, [[C1]](s64)
-    ; CHECK-NEXT: [[LOAD2:%[0-9]+]]:_(p0) = G_LOAD [[PTR_ADD1]](p0) :: (load (p0) from unknown-address + 16, align 16)
-    ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 24
-    ; CHECK-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD %ptr, [[C2]](s64)
-    ; CHECK-NEXT: [[LOAD3:%[0-9]+]]:_(p0) = G_LOAD [[PTR_ADD2]](p0) :: (load (p0) from unknown-address + 24)
-    ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 32
-    ; CHECK-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD %ptr, [[C3]](s64)
-    ; CHECK-NEXT: [[LOAD4:%[0-9]+]]:_(p0) = G_LOAD [[PTR_ADD3]](p0) :: (load (p0) from unknown-address + 32, align 32)
-    ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 40
-    ; CHECK-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD %ptr, [[C4]](s64)
-    ; CHECK-NEXT: [[LOAD5:%[0-9]+]]:_(p0) = G_LOAD [[PTR_ADD4]](p0) :: (load (p0) from unknown-address + 40)
-    ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p0>) = G_BUILD_VECTOR [[LOAD]](p0), [[LOAD1]](p0)
-    ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x p0>) = G_BUILD_VECTOR [[LOAD2]](p0), [[LOAD3]](p0)
-    ; CHECK-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x p0>) = G_BUILD_VECTOR [[LOAD4]](p0), [[LOAD5]](p0)
-    ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s64>) = G_BITCAST [[BUILD_VECTOR]](<2 x p0>)
-    ; CHECK-NEXT: G_STORE [[BITCAST]](<2 x s64>), %ptr(p0) :: (store (<2 x s64>), align 64)
-    ; CHECK-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD %ptr, [[C1]](s64)
-    ; CHECK-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s64>) = G_BITCAST [[BUILD_VECTOR1]](<2 x p0>)
-    ; CHECK-NEXT: G_STORE [[BITCAST1]](<2 x s64>), [[PTR_ADD5]](p0) :: (store (<2 x s64>) into unknown-address + 16)
-    ; CHECK-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD %ptr, [[C3]](s64)
-    ; CHECK-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s64>) = G_BITCAST [[BUILD_VECTOR2]](<2 x p0>)
-    ; CHECK-NEXT: G_STORE [[BITCAST2]](<2 x s64>), [[PTR_ADD6]](p0) :: (store (<2 x s64>) into unknown-address + 32, align 32)
+    ; CHECK-NEXT: [[LOAD2:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[PTR_ADD1]](p0) :: (load (<2 x s64>) from unknown-address + 32, align 32)
+    ; CHECK-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x p0>) = G_BITCAST [[LOAD2]](<2 x s64>)
+    ; CHECK-NEXT: [[BITCAST3:%[0-9]+]]:_(<2 x s64>) = G_BITCAST [[BITCAST]](<2 x p0>)
+    ; CHECK-NEXT: G_STORE [[BITCAST3]](<2 x s64>), %ptr(p0) :: (store (<2 x s64>), align 64)
+    ; CHECK-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD %ptr, [[C]](s64)
+    ; CHECK-NEXT: [[BITCAST4:%[0-9]+]]:_(<2 x s64>) = G_BITCAST [[BITCAST1]](<2 x p0>)
+    ; CHECK-NEXT: G_STORE [[BITCAST4]](<2 x s64>), [[PTR_ADD2]](p0) :: (store (<2 x s64>) into unknown-address + 16)
+    ; CHECK-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD %ptr, [[C1]](s64)
+    ; CHECK-NEXT: [[BITCAST5:%[0-9]+]]:_(<2 x s64>) = G_BITCAST [[BITCAST2]](<2 x p0>)
+    ; CHECK-NEXT: G_STORE [[BITCAST5]](<2 x s64>), [[PTR_ADD3]](p0) :: (store (<2 x s64>) into unknown-address + 32, align 32)
     ; CHECK-NEXT: RET_ReallyLR
     %ptr:_(p0) = COPY $x0
     %val:_(<6 x p0>) = G_LOAD %ptr(p0) :: (load (<6 x p0>))
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-xtn.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-xtn.mir
index 661265173ae82b..ed40a2ff7ea70f 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-xtn.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-xtn.mir
@@ -540,9 +540,17 @@ body:             |
     ; CHECK: liveins: $d0
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $d0
-    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(<2 x s8>) = G_TRUNC [[COPY]](<2 x s32>)
-    ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s8>) = G_CONCAT_VECTORS [[TRUNC]](<2 x s8>), [[TRUNC]](<2 x s8>)
-    ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(<4 x s16>) = G_ANYEXT [[CONCAT_VECTORS]](<4 x s8>)
+    ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>)
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[UV]](s32), [[UV1]](s32), [[DEF]](s32), [[DEF]](s32)
+    ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[DEF]](s32), [[DEF]](s32), [[DEF]](s32), [[DEF]](s32)
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(<4 x s16>) = G_TRUNC [[BUILD_VECTOR]](<4 x s32>)
+    ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(<4 x s16>) = G_TRUNC [[BUILD_VECTOR1]](<4 x s32>)
+    ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s16>) = G_CONCAT_VECTORS [[TRUNC]](<4 x s16>), [[TRUNC1]](<4 x s16>)
+    ; CHECK-NEXT: [[TRUNC2:%[0-9]+]]:_(<8 x s8>) = G_TRUNC [[CONCAT_VECTORS]](<8 x s16>)
+    ; CHECK-NEXT: [[UV2:%[0-9]+]]:_(<2 x s8>), [[UV3:%[0-9]+]]:_(<2 x s8>), [[UV4:%[0-9]+]]:_(<2 x s8>), [[UV5:%[0-9]+]]:_(<2 x s8>) = G_UNMERGE_VALUES [[TRUNC2]](<8 x s8>)
+    ; CHECK-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<4 x s8>) = G_CONCAT_VECTORS [[UV2]](<2 x s8>), [[UV2]](<2 x s8>)
+    ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(<4 x s16>) = G_ANYEXT [[CONCAT_VECTORS1]](<4 x s8>)
     ; CHECK-NEXT: $d0 = COPY [[ANYEXT]](<4 x s16>)
     ; CHECK-NEXT: RET_ReallyLR implicit $d0
     %0:_(<2 x s32>) = COPY $d0
diff --git a/llvm/test/CodeGen/AArch64/avoid-zero-copy.mir b/llvm/test/CodeGen/AArch64/avoid-zero-copy.mir
index 859be2d3374762..b940734c6988cb 100644
--- a/llvm/test/CodeGen/AArch64/avoid-zero-copy.mir
+++ b/llvm/test/CodeGen/AArch64/avoid-zero-copy.mir
@@ -19,6 +19,8 @@
 ...
 ---
 name: foo
+frameInfo:
+  adjustsStack:    true
 body: |
   bb.0 (%ir-block.0):
     ; CHECK-LABEL: name: foo
diff --git a/llvm/test/CodeGen/AArch64/bitcast.ll b/llvm/test/CodeGen/AArch64/bitcast.ll
index 9ebd570e687a01..b7c02d6855a9e9 100644
--- a/llvm/test/CodeGen/AArch64/bitcast.ll
+++ b/llvm/test/CodeGen/AArch64/bitcast.ll
@@ -4,12 +4,10 @@
 
 ; PR23065: SCALAR_TO_VECTOR implies the top elements 1 to N-1 of the N-element vector are undefined.
 
-; CHECK-GI:         warning: Instruction selection used fallback path for bitcast_v4i8_i32
-; CHECK-GI-NEXT:    warning: Instruction selection used fallback path for bitcast_i32_v4i8
-; CHECK-GI-NEXT:    warning: Instruction selection used fallback path for bitcast_v2i16_i32
-; CHECK-GI-NEXT:    warning: Instruction selection used fallback path for bitcast_i32_v2i16
-; CHECK-GI-NEXT:    warning: Instruction selection used fallback path for bitcast_v2i16_v4i8
-; CHECK-GI-NEXT:    warning: Instruction selection used fallback path for bitcast_v4i8_v2i16
+; CHECK-GI:       warning: Instruction selection used fallback path for bitcast_i32_v4i8
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for bitcast_i32_v2i16
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for bitcast_v2i16_v4i8
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for bitcast_v4i8_v2i16
 
 define <4 x i16> @foo1(<2 x i32> %a) {
 ; CHECK-SD-LABEL: foo1:
@@ -54,15 +52,28 @@ define <4 x i16> @foo2(<2 x i32> %a) {
 ; ===== To and From Scalar Types =====
 
 define i32 @bitcast_v4i8_i32(<4 x i8> %a, <4 x i8> %b){
-; CHECK-LABEL: bitcast_v4i8_i32:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    add v0.4h, v0.4h, v1.4h
-; CHECK-NEXT:    uzp1 v0.8b, v0.8b, v0.8b
-; CHECK-NEXT:    fmov w0, s0
-; CHECK-NEXT:    add sp, sp, #16
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: bitcast_v4i8_i32:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    sub sp, sp, #16
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-SD-NEXT:    add v0.4h, v0.4h, v1.4h
+; CHECK-SD-NEXT:    uzp1 v0.8b, v0.8b, v0.8b
+; CHECK-SD-NEXT:    fmov w0, s0
+; CHECK-SD-NEXT:    add sp, sp, #16
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: bitcast_v4i8_i32:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    add v0.4h, v0.4h, v1.4h
+; CHECK-GI-NEXT:    mov h1, v0.h[1]
+; CHECK-GI-NEXT:    mov h2, v0.h[2]
+; CHECK-GI-NEXT:    mov h3, v0.h[3]
+; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
+; CHECK-GI-NEXT:    mov v0.h[2], v2.h[0]
+; CHECK-GI-NEXT:    mov v0.h[3], v3.h[0]
+; CHECK-GI-NEXT:    xtn v0.8b, v0.8h
+; CHECK-GI-NEXT:    fmov w0, s0
+; CHECK-GI-NEXT:    ret
   %c = add <4 x i8> %a, %b
   %d = bitcast <4 x i8> %c to i32
   ret i32 %d
@@ -81,18 +92,27 @@ define <4 x i8> @bitcast_i32_v4i8(i32 %a, i32 %b){
 }
 
 define i32 @bitcast_v2i16_i32(<2 x i16> %a, <2 x i16> %b){
-; CHECK-LABEL: bitcast_v2i16_i32:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    add v0.2s, v0.2s, v1.2s
-; CHECK-NEXT:    mov w8, v0.s[1]
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    strh w9, [sp, #12]
-; CHECK-NEXT:    strh w8, [sp, #14]
-; CHECK-NEXT:    ldr w0, [sp, #12]
-; CHECK-NEXT:    add sp, sp, #16
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: bitcast_v2i16_i32:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    sub sp, sp, #16
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-SD-NEXT:    add v0.2s, v0.2s, v1.2s
+; CHECK-SD-NEXT:    mov w8, v0.s[1]
+; CHECK-SD-NEXT:    fmov w9, s0
+; CHECK-SD-NEXT:    strh w9, [sp, #12]
+; CHECK-SD-NEXT:    strh w8, [sp, #14]
+; CHECK-SD-NEXT:    ldr w0, [sp, #12]
+; CHECK-SD-NEXT:    add sp, sp, #16
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: bitcast_v2i16_i32:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    add v0.2s, v0.2s, v1.2s
+; CHECK-GI-NEXT:    mov s1, v0.s[1]
+; CHECK-GI-NEXT:    mov v0.s[1], v1.s[0]
+; CHECK-GI-NEXT:    xtn v0.4h, v0.4s
+; CHECK-GI-NEXT:    fmov w0, s0
+; CHECK-GI-NEXT:    ret
   %c = add <2 x i16> %a, %b
   %d = bitcast <2 x i16> %c to i32
   ret i32 %d
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-add.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-add.ll
index 7b8448de2331b4..7cdb10e7159f03 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-add.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-add.ll
@@ -1,23 +1,42 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s --mattr=+complxnum,+neon,+fullfp16 -o - | FileCheck %s
-; RUN: llc < %s --mattr=+complxnum,+neon,+fullfp16,+sve -o - | FileCheck %s
-; RUN: llc < %s --mattr=+complxnum,+neon,+fullfp16,+sve2 -o - | FileCheck %s
+; RUN: llc < %s --mattr=+complxnum,+neon,+fullfp16 -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD
+; RUN: llc < %s --mattr=+complxnum,+neon,+fullfp16,+sve -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD
+; RUN: llc < %s --mattr=+complxnum,+neon,+fullfp16,+sve2 -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD
+; RUN: llc < %s --global-isel --global-isel-abort=2 --mattr=+complxnum,+neon,+fullfp16 -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
+; RUN: llc < %s --global-isel --global-isel-abort=2 --mattr=+complxnum,+neon,+fullfp16,+sve -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
+; RUN: llc < %s --global-isel --global-isel-abort=2 --mattr=+complxnum,+neon,+fullfp16,+sve2 -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
 
 target triple = "aarch64"
 
+; CHECK-GI:      warning: Instruction selection used fallback path for complex_add_v16f16
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for complex_add_v32f16
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for complex_add_v16f16_with_intrinsic
+
 ; Expected to not transform
 define <2 x half> @complex_add_v2f16(<2 x half> %a, <2 x half> %b) {
-; CHECK-LABEL: complex_add_v2f16:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    mov h2, v0.h[1]
-; CHECK-NEXT:    mov h3, v1.h[1]
-; CHECK-NEXT:    fsub h1, h1, h2
-; CHECK-NEXT:    fadd h0, h3, h0
-; CHECK-NEXT:    mov v1.h[1], v0.h[0]
-; CHECK-NEXT:    fmov d0, d1
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: complex_add_v2f16:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT:    mov h2, v0.h[1]
+; CHECK-SD-NEXT:    mov h3, v1.h[1]
+; CHECK-SD-NEXT:    fsub h1, h1, h2
+; CHECK-SD-NEXT:    fadd h0, h3, h0
+; CHECK-SD-NEXT:    mov v1.h[1], v0.h[0]
+; CHECK-SD-NEXT:    fmov d0, d1
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: complex_add_v2f16:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-GI-NEXT:    mov h2, v0.h[1]
+; CHECK-GI-NEXT:    mov h3, v1.h[1]
+; CHECK-GI-NEXT:    fsub h1, h1, h2
+; CHECK-GI-NEXT:    fadd h0, h3, h0
+; CHECK-GI-NEXT:    mov v1.h[1], v0.h[0]
+; CHECK-GI-NEXT:    fmov d0, d1
+; CHECK-GI-NEXT:    ret
 entry:
   %a.real = shufflevector <2 x half> %a, <2 x half> zeroinitializer, <1 x i32> <i32 0>
   %a.imag = shufflevector <2 x half> %a, <2 x half> zeroinitializer, <1 x i32> <i32 1>
@@ -162,17 +181,29 @@ entry:
 
 ; Expected not to transform as it is integer
 define <16 x i16> @complex_add_v16i16(<16 x i16> %a, <16 x i16> %b) {
-; CHECK-LABEL: complex_add_v16i16:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    uzp1 v4.8h, v2.8h, v3.8h
-; CHECK-NEXT:    uzp1 v5.8h, v0.8h, v1.8h
-; CHECK-NEXT:    uzp2 v0.8h, v0.8h, v1.8h
-; CHECK-NEXT:    uzp2 v1.8h, v2.8h, v3.8h
-; CHECK-NEXT:    sub v2.8h, v4.8h, v0.8h
-; CHECK-NEXT:    add v1.8h, v1.8h, v5.8h
-; CHECK-NEXT:    zip1 v0.8h, v2.8h, v1.8h
-; CHECK-NEXT:    zip2 v1.8h, v2.8h, v1.8h
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: complex_add_v16i16:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    uzp1 v4.8h, v2.8h, v3.8h
+; CHECK-SD-NEXT:    uzp1 v5.8h, v0.8h, v1.8h
+; CHECK-SD-NEXT:    uzp2 v0.8h, v0.8h, v1.8h
+; CHECK-SD-NEXT:    uzp2 v1.8h, v2.8h, v3.8h
+; CHECK-SD-NEXT:    sub v2.8h, v4.8h, v0.8h
+; CHECK-SD-NEXT:    add v1.8h, v1.8h, v5.8h
+; CHECK-SD-NEXT:    zip1 v0.8h, v2.8h, v1.8h
+; CHECK-SD-NEXT:    zip2 v1.8h, v2.8h, v1.8h
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: complex_add_v16i16:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    uzp1 v4.8h, v0.8h, v1.8h
+; CHECK-GI-NEXT:    uzp2 v0.8h, v0.8h, v1.8h
+; CHECK-GI-NEXT:    uzp1 v1.8h, v2.8h, v3.8h
+; CHECK-GI-NEXT:    uzp2 v2.8h, v2.8h, v3.8h
+; CHECK-GI-NEXT:    sub v1.8h, v1.8h, v0.8h
+; CHECK-GI-NEXT:    add v2.8h, v2.8h, v4.8h
+; CHECK-GI-NEXT:    zip1 v0.8h, v1.8h, v2.8h
+; CHECK-GI-NEXT:    zip2 v1.8h, v1.8h, v2.8h
+; CHECK-GI-NEXT:    ret
 entry:
   %a.real = shufflevector <16 x i16> %a, <16 x i16> zeroinitializer, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
   %a.imag = shufflevector <16 x i16> %a, <16 x i16> zeroinitializer, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
diff --git a/llvm/test/CodeGen/AArch64/extract-vector-elt.ll b/llvm/test/CodeGen/AArch64/extract-vector-elt.ll
new file mode 100644
index 00000000000000..c5c525a15ad9be
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/extract-vector-elt.ll
@@ -0,0 +1,1126 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=aarch64 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD
+; RUN: llc -mtriple=aarch64 -global-isel -global-isel-abort=2 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
+
+; CHECK-GI:       warning: Instruction selection used fallback path for extract_v4i32_vector_insert
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for extract_v4i32_vector_insert_const
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for extract_v4i32_vector_extract
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for extract_v4i32_vector_extract_const
+
+define i64 @extract_v2i64_undef_index(<2 x i64> %a, i32 %c) {
+; CHECK-SD-LABEL: extract_v2i64_undef_index:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    fmov x0, d0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extract_v2i64_undef_index:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    str q0, [sp, #-16]!
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-GI-NEXT:    ldr x0, [sp], #16
+; CHECK-GI-NEXT:    ret
+entry:
+  %d = extractelement <2 x i64> %a, i32 undef
+  ret i64 %d
+}
+
+define i64 @extract_v2i64_undef_vector(<2 x i64> %a, i32 %c) {
+; CHECK-SD-LABEL: extract_v2i64_undef_vector:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extract_v2i64_undef_vector:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sub sp, sp, #16
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-GI-NEXT:    mov w9, w0
+; CHECK-GI-NEXT:    mov x8, sp
+; CHECK-GI-NEXT:    and x9, x9, #0x1
+; CHECK-GI-NEXT:    ldr x0, [x8, x9, lsl #3]
+; CHECK-GI-NEXT:    add sp, sp, #16
+; CHECK-GI-NEXT:    ret
+entry:
+  %d = extractelement <2 x i64> undef, i32 %c
+  ret i64 %d
+}
+
+define i64 @extract_v2i64_opaque(<2 x i64> %a, i32 %c) {
+; CHECK-SD-LABEL: extract_v2i64_opaque:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    sub sp, sp, #16
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-SD-NEXT:    mov x8, sp
+; CHECK-SD-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-SD-NEXT:    str q0, [sp]
+; CHECK-SD-NEXT:    bfi x8, x0, #3, #1
+; CHECK-SD-NEXT:    ldr x0, [x8]
+; CHECK-SD-NEXT:    add sp, sp, #16
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extract_v2i64_opaque:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sub sp, sp, #16
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-GI-NEXT:    mov w9, w0
+; CHECK-GI-NEXT:    mov x8, sp
+; CHECK-GI-NEXT:    str q0, [sp]
+; CHECK-GI-NEXT:    and x9, x9, #0x1
+; CHECK-GI-NEXT:    ldr x0, [x8, x9, lsl #3]
+; CHECK-GI-NEXT:    add sp, sp, #16
+; CHECK-GI-NEXT:    ret
+entry:
+  %d = extractelement <2 x i64> %a, i32 %c
+  ret i64 %d
+}
+
+define i64 @extract_v2i64_oob(<2 x i64> %a, i32 %c) {
+; CHECK-LABEL: extract_v2i64_oob:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ret
+entry:
+  %d = extractelement <2 x i64> %a, i32 5
+  ret i64 %d
+}
+
+define i64 @extract_v2i64_freeze(<2 x i64> %a, i32 %c) {
+; CHECK-SD-LABEL: extract_v2i64_freeze:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    sub sp, sp, #16
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-SD-NEXT:    mov x8, sp
+; CHECK-SD-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-SD-NEXT:    str q0, [sp]
+; CHECK-SD-NEXT:    bfi x8, x0, #3, #1
+; CHECK-SD-NEXT:    ldr x0, [x8]
+; CHECK-SD-NEXT:    add sp, sp, #16
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extract_v2i64_freeze:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sub sp, sp, #16
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-GI-NEXT:    mov w9, w0
+; CHECK-GI-NEXT:    mov x8, sp
+; CHECK-GI-NEXT:    str q0, [sp]
+; CHECK-GI-NEXT:    and x9, x9, #0x1
+; CHECK-GI-NEXT:    ldr x0, [x8, x9, lsl #3]
+; CHECK-GI-NEXT:    add sp, sp, #16
+; CHECK-GI-NEXT:    ret
+entry:
+  %fvector = freeze <2 x i64> %a
+  %d = extractelement <2 x i64> %fvector, i32 %c
+  ret i64 %d
+}
+
+define i64 @extract_v2i64_extract_of_insert(<2 x i64> %a, i64 %element, i64 %c) {
+; CHECK-LABEL: extract_v2i64_extract_of_insert:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ret
+entry:
+  %vector = insertelement <2 x i64> %a, i64 %element, i64 %c
+  %d = extractelement <2 x i64> %vector, i64 %c
+  ret i64 %d
+}
+
+define i64 @extract_v2i64_extract_of_insert_different_const(<2 x i64> %a, i64 %element) {
+; CHECK-SD-LABEL: extract_v2i64_extract_of_insert_different_const:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov x0, v0.d[1]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extract_v2i64_extract_of_insert_different_const:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov v0.d[0], x0
+; CHECK-GI-NEXT:    mov d0, v0.d[1]
+; CHECK-GI-NEXT:    fmov x0, d0
+; CHECK-GI-NEXT:    ret
+entry:
+  %vector = insertelement <2 x i64> %a, i64 %element, i64 0
+  %d = extractelement <2 x i64> %vector, i64 1
+  ret i64 %d
+}
+
+define i64 @extract_v2i64_extract_build_vector_const(<2 x i64> %a, i32 %c) {
+; CHECK-LABEL: extract_v2i64_extract_build_vector_const:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov w0, #11 // =0xb
+; CHECK-NEXT:    ret
+entry:
+  %d = extractelement <2 x i64> <i64 42, i64 11>, i32 1
+  ret i64 %d
+}
+
+define i64 @extract_v2i64_extract_build_vector_opaque(<2 x i64> %a, i32 %c) {
+; CHECK-SD-LABEL: extract_v2i64_extract_build_vector_opaque:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    sub sp, sp, #16
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-SD-NEXT:    adrp x8, .LCPI8_0
+; CHECK-SD-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-SD-NEXT:    ldr q0, [x8, :lo12:.LCPI8_0]
+; CHECK-SD-NEXT:    mov x8, sp
+; CHECK-SD-NEXT:    bfi x8, x0, #3, #1
+; CHECK-SD-NEXT:    str q0, [sp]
+; CHECK-SD-NEXT:    ldr x0, [x8]
+; CHECK-SD-NEXT:    add sp, sp, #16
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extract_v2i64_extract_build_vector_opaque:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sub sp, sp, #16
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-GI-NEXT:    adrp x8, .LCPI8_0
+; CHECK-GI-NEXT:    mov x9, sp
+; CHECK-GI-NEXT:    ldr q0, [x8, :lo12:.LCPI8_0]
+; CHECK-GI-NEXT:    mov w8, w0
+; CHECK-GI-NEXT:    and x8, x8, #0x1
+; CHECK-GI-NEXT:    str q0, [sp]
+; CHECK-GI-NEXT:    ldr x0, [x9, x8, lsl #3]
+; CHECK-GI-NEXT:    add sp, sp, #16
+; CHECK-GI-NEXT:    ret
+entry:
+  %d = extractelement <2 x i64> <i64 42, i64 11>, i32 %c
+  ret i64 %d
+}
+
+
+define i64 @extract_v2i32_zext(<2 x i32> %a, i32 %c) {
+; CHECK-SD-LABEL: extract_v2i32_zext:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    sub sp, sp, #16
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-SD-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-SD-NEXT:    mov x8, sp
+; CHECK-SD-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-SD-NEXT:    bfi x8, x0, #3, #1
+; CHECK-SD-NEXT:    str q0, [sp]
+; CHECK-SD-NEXT:    ldr x0, [x8]
+; CHECK-SD-NEXT:    add sp, sp, #16
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extract_v2i32_zext:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sub sp, sp, #16
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-GI-NEXT:    mov w9, w0
+; CHECK-GI-NEXT:    mov x8, sp
+; CHECK-GI-NEXT:    and x9, x9, #0x1
+; CHECK-GI-NEXT:    str q0, [sp]
+; CHECK-GI-NEXT:    ldr x0, [x8, x9, lsl #3]
+; CHECK-GI-NEXT:    add sp, sp, #16
+; CHECK-GI-NEXT:    ret
+entry:
+  %zvector = zext <2 x i32> %a to <2 x i64>
+  %d = extractelement <2 x i64> %zvector, i32 %c
+  ret i64 %d
+}
+
+define i64 @extract_v2double_fptosi(<2 x double> %a, i32 %c) {
+; CHECK-SD-LABEL: extract_v2double_fptosi:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    sub sp, sp, #16
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-SD-NEXT:    fcvtzs v0.2d, v0.2d
+; CHECK-SD-NEXT:    mov x8, sp
+; CHECK-SD-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-SD-NEXT:    bfi x8, x0, #3, #1
+; CHECK-SD-NEXT:    str q0, [sp]
+; CHECK-SD-NEXT:    ldr x0, [x8]
+; CHECK-SD-NEXT:    add sp, sp, #16
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extract_v2double_fptosi:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sub sp, sp, #16
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-GI-NEXT:    fcvtzs v0.2d, v0.2d
+; CHECK-GI-NEXT:    mov w9, w0
+; CHECK-GI-NEXT:    mov x8, sp
+; CHECK-GI-NEXT:    and x9, x9, #0x1
+; CHECK-GI-NEXT:    str q0, [sp]
+; CHECK-GI-NEXT:    ldr x0, [x8, x9, lsl #3]
+; CHECK-GI-NEXT:    add sp, sp, #16
+; CHECK-GI-NEXT:    ret
+entry:
+  %vector = fptosi <2 x double> %a to <2 x i64>
+  %d = extractelement <2 x i64> %vector, i32 %c
+  ret i64 %d
+}
+
+define double @extract_v2double_fneg(<2 x double> %a, i32 %c) {
+; CHECK-SD-LABEL: extract_v2double_fneg:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    sub sp, sp, #16
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-SD-NEXT:    fneg v0.2d, v0.2d
+; CHECK-SD-NEXT:    mov x8, sp
+; CHECK-SD-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-SD-NEXT:    bfi x8, x0, #3, #1
+; CHECK-SD-NEXT:    str q0, [sp]
+; CHECK-SD-NEXT:    ldr d0, [x8]
+; CHECK-SD-NEXT:    add sp, sp, #16
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extract_v2double_fneg:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sub sp, sp, #16
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-GI-NEXT:    fneg v0.2d, v0.2d
+; CHECK-GI-NEXT:    mov w9, w0
+; CHECK-GI-NEXT:    mov x8, sp
+; CHECK-GI-NEXT:    and x9, x9, #0x1
+; CHECK-GI-NEXT:    str q0, [sp]
+; CHECK-GI-NEXT:    ldr d0, [x8, x9, lsl #3]
+; CHECK-GI-NEXT:    add sp, sp, #16
+; CHECK-GI-NEXT:    ret
+entry:
+  %vector = fneg <2 x double> %a
+  %d = extractelement <2 x double> %vector, i32 %c
+  ret double %d
+}
+
+define i32 @extract_v4i32_add(<4 x i32> %a, <4 x i32> %b, i32 %c) {
+; CHECK-SD-LABEL: extract_v4i32_add:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    sub sp, sp, #16
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-SD-NEXT:    adrp x8, .LCPI12_0
+; CHECK-SD-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-SD-NEXT:    ldr q1, [x8, :lo12:.LCPI12_0]
+; CHECK-SD-NEXT:    mov x8, sp
+; CHECK-SD-NEXT:    bfi x8, x0, #2, #2
+; CHECK-SD-NEXT:    add v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT:    str q0, [sp]
+; CHECK-SD-NEXT:    ldr w0, [x8]
+; CHECK-SD-NEXT:    add sp, sp, #16
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extract_v4i32_add:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sub sp, sp, #16
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-GI-NEXT:    adrp x8, .LCPI12_0
+; CHECK-GI-NEXT:    mov x9, sp
+; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI12_0]
+; CHECK-GI-NEXT:    mov w8, w0
+; CHECK-GI-NEXT:    and x8, x8, #0x3
+; CHECK-GI-NEXT:    add v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT:    str q0, [sp]
+; CHECK-GI-NEXT:    ldr w0, [x9, x8, lsl #2]
+; CHECK-GI-NEXT:    add sp, sp, #16
+; CHECK-GI-NEXT:    ret
+entry:
+  %vector = add <4 x i32> %a, <i32 42, i32 11, i32 17, i32 6>
+  %d = extractelement <4 x i32> %vector, i32 %c
+  ret i32 %d
+}
+
+define float @extract_v4i32_minimum(<4 x float> %a, <4 x float> %b, i32 %c) {
+; CHECK-SD-LABEL: extract_v4i32_minimum:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    sub sp, sp, #16
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-SD-NEXT:    fmin v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT:    mov x8, sp
+; CHECK-SD-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-SD-NEXT:    bfi x8, x0, #2, #2
+; CHECK-SD-NEXT:    str q0, [sp]
+; CHECK-SD-NEXT:    ldr s0, [x8]
+; CHECK-SD-NEXT:    add sp, sp, #16
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extract_v4i32_minimum:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sub sp, sp, #16
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-GI-NEXT:    fmin v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT:    mov w8, w0
+; CHECK-GI-NEXT:    mov x9, sp
+; CHECK-GI-NEXT:    and x8, x8, #0x3
+; CHECK-GI-NEXT:    str q0, [sp]
+; CHECK-GI-NEXT:    ldr s0, [x9, x8, lsl #2]
+; CHECK-GI-NEXT:    add sp, sp, #16
+; CHECK-GI-NEXT:    ret
+entry:
+  %vector = call <4 x float> @llvm.minimum.v4float(<4 x float> %a, <4 x float> %b)
+  %d = extractelement <4 x float> %vector, i32 %c
+  ret float %d
+}
+
+define float @extract_v4i32_minimum_build_vector(<4 x float> %a, <4 x float> %b, i32 %c) {
+; CHECK-SD-LABEL: extract_v4i32_minimum_build_vector:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    sub sp, sp, #16
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-SD-NEXT:    adrp x8, .LCPI14_0
+; CHECK-SD-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-SD-NEXT:    ldr q1, [x8, :lo12:.LCPI14_0]
+; CHECK-SD-NEXT:    mov x8, sp
+; CHECK-SD-NEXT:    bfi x8, x0, #2, #2
+; CHECK-SD-NEXT:    fmin v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT:    str q0, [sp]
+; CHECK-SD-NEXT:    ldr s0, [x8]
+; CHECK-SD-NEXT:    add sp, sp, #16
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extract_v4i32_minimum_build_vector:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sub sp, sp, #16
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-GI-NEXT:    adrp x8, .LCPI14_0
+; CHECK-GI-NEXT:    mov x9, sp
+; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI14_0]
+; CHECK-GI-NEXT:    mov w8, w0
+; CHECK-GI-NEXT:    and x8, x8, #0x3
+; CHECK-GI-NEXT:    fmin v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT:    str q0, [sp]
+; CHECK-GI-NEXT:    ldr s0, [x9, x8, lsl #2]
+; CHECK-GI-NEXT:    add sp, sp, #16
+; CHECK-GI-NEXT:    ret
+entry:
+  %vector = call <4 x float> @llvm.minimum.v4float(<4 x float> %a, <4 x float> <float 42.0, float 11.0, float 17.0, float 6.0>)
+  %d = extractelement <4 x float> %vector, i32 %c
+  ret float %d
+}
+
+define float @extract_v4i32_minimum_build_vector_const(<4 x float> %a, <4 x float> %b, i32 %c) {
+; CHECK-LABEL: extract_v4i32_minimum_build_vector_const:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    adrp x8, .LCPI15_0
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI15_0]
+; CHECK-NEXT:    fmin v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    mov s0, v0.s[1]
+; CHECK-NEXT:    ret
+entry:
+  %vector = call <4 x float> @llvm.minimum.v4float(<4 x float> %a, <4 x float> <float 42.0, float 11.0, float 17.0, float 6.0>)
+  %d = extractelement <4 x float> %vector, i32 1
+  ret float %d
+}
+
+define float @extract_v4i32_copysign_build_vector(<4 x float> %a, <4 x float> %b, i32 %c) {
+; CHECK-SD-LABEL: extract_v4i32_copysign_build_vector:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    sub sp, sp, #16
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-SD-NEXT:    adrp x8, .LCPI16_0
+; CHECK-SD-NEXT:    mvni v1.4s, #128, lsl #24
+; CHECK-SD-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-SD-NEXT:    ldr q2, [x8, :lo12:.LCPI16_0]
+; CHECK-SD-NEXT:    mov x8, sp
+; CHECK-SD-NEXT:    bfi x8, x0, #2, #2
+; CHECK-SD-NEXT:    bif v0.16b, v2.16b, v1.16b
+; CHECK-SD-NEXT:    str q0, [sp]
+; CHECK-SD-NEXT:    ldr s0, [x8]
+; CHECK-SD-NEXT:    add sp, sp, #16
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extract_v4i32_copysign_build_vector:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sub sp, sp, #16
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-GI-NEXT:    mvni v1.4s, #128, lsl #24
+; CHECK-GI-NEXT:    mov w8, w0
+; CHECK-GI-NEXT:    mov x9, sp
+; CHECK-GI-NEXT:    and x8, x8, #0x3
+; CHECK-GI-NEXT:    and v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT:    str q0, [sp]
+; CHECK-GI-NEXT:    ldr s0, [x9, x8, lsl #2]
+; CHECK-GI-NEXT:    add sp, sp, #16
+; CHECK-GI-NEXT:    ret
+entry:
+  %vector = call <4 x float> @llvm.copysign.v4float(<4 x float> %a, <4 x float> <float 42.0, float 11.0, float 17.0, float 6.0>)
+  %d = extractelement <4 x float> %vector, i32 %c
+  ret float %d
+}
+
+define float @extract_v4i32_copysign_build_vector_const(<4 x float> %a, <4 x float> %b, i32 %c) {
+; CHECK-SD-LABEL: extract_v4i32_copysign_build_vector_const:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    adrp x8, .LCPI17_0
+; CHECK-SD-NEXT:    mvni v1.4s, #128, lsl #24
+; CHECK-SD-NEXT:    ldr q2, [x8, :lo12:.LCPI17_0]
+; CHECK-SD-NEXT:    bif v0.16b, v2.16b, v1.16b
+; CHECK-SD-NEXT:    mov s0, v0.s[2]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extract_v4i32_copysign_build_vector_const:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mvni v1.4s, #128, lsl #24
+; CHECK-GI-NEXT:    and v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT:    mov s0, v0.s[2]
+; CHECK-GI-NEXT:    ret
+entry:
+  %vector = call <4 x float> @llvm.copysign.v4float(<4 x float> %a, <4 x float> <float 42.0, float 11.0, float 17.0, float 6.0>)
+  %d = extractelement <4 x float> %vector, i32 2
+  ret float %d
+}
+
+
+define i32 @extract_v4i32_icmp(<4 x i32> %a, <4 x i32> %b, i32 %c) {
+; CHECK-SD-LABEL: extract_v4i32_icmp:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    sub sp, sp, #16
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-SD-NEXT:    adrp x8, .LCPI18_0
+; CHECK-SD-NEXT:    movi v2.4s, #1
+; CHECK-SD-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-SD-NEXT:    ldr q1, [x8, :lo12:.LCPI18_0]
+; CHECK-SD-NEXT:    mov x8, sp
+; CHECK-SD-NEXT:    bfi x8, x0, #2, #2
+; CHECK-SD-NEXT:    cmge v0.4s, v1.4s, v0.4s
+; CHECK-SD-NEXT:    and v0.16b, v0.16b, v2.16b
+; CHECK-SD-NEXT:    str q0, [sp]
+; CHECK-SD-NEXT:    ldr w0, [x8]
+; CHECK-SD-NEXT:    add sp, sp, #16
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extract_v4i32_icmp:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sub sp, sp, #16
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-GI-NEXT:    adrp x8, .LCPI18_0
+; CHECK-GI-NEXT:    movi v2.4s, #1
+; CHECK-GI-NEXT:    mov x9, sp
+; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI18_0]
+; CHECK-GI-NEXT:    mov w8, w0
+; CHECK-GI-NEXT:    and x8, x8, #0x3
+; CHECK-GI-NEXT:    cmge v0.4s, v1.4s, v0.4s
+; CHECK-GI-NEXT:    and v0.16b, v0.16b, v2.16b
+; CHECK-GI-NEXT:    str q0, [sp]
+; CHECK-GI-NEXT:    ldr w0, [x9, x8, lsl #2]
+; CHECK-GI-NEXT:    add sp, sp, #16
+; CHECK-GI-NEXT:    ret
+entry:
+  %vector = icmp sle <4 x i32> %a, <i32 42, i32 11, i32 17, i32 6>
+  %zvector = zext <4 x i1> %vector to <4 x i32>
+  %d = extractelement <4 x i32> %zvector, i32 %c
+  ret i32 %d
+}
+
+define i32 @extract_v4i32_icmp_const(<4 x i32> %a, <4 x i32> %b, i32 %c) {
+; CHECK-SD-LABEL: extract_v4i32_icmp_const:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    adrp x8, .LCPI19_0
+; CHECK-SD-NEXT:    movi v2.4s, #1
+; CHECK-SD-NEXT:    ldr q1, [x8, :lo12:.LCPI19_0]
+; CHECK-SD-NEXT:    cmge v0.4s, v1.4s, v0.4s
+; CHECK-SD-NEXT:    and v0.16b, v0.16b, v2.16b
+; CHECK-SD-NEXT:    mov w0, v0.s[2]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extract_v4i32_icmp_const:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    adrp x8, .LCPI19_0
+; CHECK-GI-NEXT:    movi v2.4s, #1
+; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI19_0]
+; CHECK-GI-NEXT:    cmge v0.4s, v1.4s, v0.4s
+; CHECK-GI-NEXT:    and v0.16b, v0.16b, v2.16b
+; CHECK-GI-NEXT:    mov s0, v0.s[2]
+; CHECK-GI-NEXT:    fmov w0, s0
+; CHECK-GI-NEXT:    ret
+entry:
+  %vector = icmp sle <4 x i32> %a, <i32 42, i32 11, i32 17, i32 6>
+  %zvector = zext <4 x i1> %vector to <4 x i32>
+  %d = extractelement <4 x i32> %zvector, i32 2
+  ret i32 %d
+}
+
+define i32 @extract_v4float_fcmp(<4 x float> %a, <4 x float> %b, i32 %c) {
+; CHECK-SD-LABEL: extract_v4float_fcmp:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    sub sp, sp, #16
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-SD-NEXT:    movi v1.4s, #1
+; CHECK-SD-NEXT:    fcmeq v0.4s, v0.4s, v0.4s
+; CHECK-SD-NEXT:    mov x8, sp
+; CHECK-SD-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-SD-NEXT:    bfi x8, x0, #2, #2
+; CHECK-SD-NEXT:    bic v0.16b, v1.16b, v0.16b
+; CHECK-SD-NEXT:    str q0, [sp]
+; CHECK-SD-NEXT:    ldr w0, [x8]
+; CHECK-SD-NEXT:    add sp, sp, #16
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extract_v4float_fcmp:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sub sp, sp, #16
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-GI-NEXT:    fmov v1.4s, #1.00000000
+; CHECK-GI-NEXT:    mov w8, w0
+; CHECK-GI-NEXT:    mov x9, sp
+; CHECK-GI-NEXT:    and x8, x8, #0x3
+; CHECK-GI-NEXT:    fcmge v2.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT:    fcmgt v0.4s, v1.4s, v0.4s
+; CHECK-GI-NEXT:    movi v1.4s, #1
+; CHECK-GI-NEXT:    orr v0.16b, v0.16b, v2.16b
+; CHECK-GI-NEXT:    bic v0.16b, v1.16b, v0.16b
+; CHECK-GI-NEXT:    str q0, [sp]
+; CHECK-GI-NEXT:    ldr w0, [x9, x8, lsl #2]
+; CHECK-GI-NEXT:    add sp, sp, #16
+; CHECK-GI-NEXT:    ret
+entry:
+  %vector = fcmp uno <4 x float> %a, <float 1.0, float 1.0, float 1.0, float 1.0>
+  %zvector = zext <4 x i1> %vector to <4 x i32>
+  %d = extractelement <4 x i32> %zvector, i32 %c
+  ret i32 %d
+}
+
+define i32 @extract_v4float_fcmp_const(<4 x float> %a, <4 x float> %b, i32 %c) {
+; CHECK-SD-LABEL: extract_v4float_fcmp_const:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    movi v1.4s, #1
+; CHECK-SD-NEXT:    fcmeq v0.4s, v0.4s, v0.4s
+; CHECK-SD-NEXT:    bic v0.16b, v1.16b, v0.16b
+; CHECK-SD-NEXT:    mov w0, v0.s[1]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extract_v4float_fcmp_const:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    fmov v1.4s, #1.00000000
+; CHECK-GI-NEXT:    fcmge v2.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT:    fcmgt v0.4s, v1.4s, v0.4s
+; CHECK-GI-NEXT:    movi v1.4s, #1
+; CHECK-GI-NEXT:    orr v0.16b, v0.16b, v2.16b
+; CHECK-GI-NEXT:    bic v0.16b, v1.16b, v0.16b
+; CHECK-GI-NEXT:    mov s0, v0.s[1]
+; CHECK-GI-NEXT:    fmov w0, s0
+; CHECK-GI-NEXT:    ret
+entry:
+  %vector = fcmp uno <4 x float> %a, <float 1.0, float 1.0, float 1.0, float 1.0>
+  %zvector = zext <4 x i1> %vector to <4 x i32>
+  %d = extractelement <4 x i32> %zvector, i32 1
+  ret i32 %d
+}
+
+define i32 @extract_v4i32_select(<4 x i32> %a, <4 x i32> %b, i32 %c, <4 x i1> %cond) {
+; CHECK-SD-LABEL: extract_v4i32_select:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    sub sp, sp, #16
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-SD-NEXT:    ushll v1.4s, v2.4h, #0
+; CHECK-SD-NEXT:    adrp x8, .LCPI22_0
+; CHECK-SD-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-SD-NEXT:    ldr q2, [x8, :lo12:.LCPI22_0]
+; CHECK-SD-NEXT:    mov x8, sp
+; CHECK-SD-NEXT:    bfi x8, x0, #2, #2
+; CHECK-SD-NEXT:    shl v1.4s, v1.4s, #31
+; CHECK-SD-NEXT:    cmlt v1.4s, v1.4s, #0
+; CHECK-SD-NEXT:    bif v0.16b, v2.16b, v1.16b
+; CHECK-SD-NEXT:    str q0, [sp]
+; CHECK-SD-NEXT:    ldr w0, [x8]
+; CHECK-SD-NEXT:    add sp, sp, #16
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extract_v4i32_select:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sub sp, sp, #16
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-GI-NEXT:    ushll v1.4s, v2.4h, #0
+; CHECK-GI-NEXT:    adrp x8, .LCPI22_0
+; CHECK-GI-NEXT:    mov x9, sp
+; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI22_0]
+; CHECK-GI-NEXT:    mov w8, w0
+; CHECK-GI-NEXT:    and x8, x8, #0x3
+; CHECK-GI-NEXT:    shl v1.4s, v1.4s, #31
+; CHECK-GI-NEXT:    sshr v1.4s, v1.4s, #31
+; CHECK-GI-NEXT:    bif v0.16b, v2.16b, v1.16b
+; CHECK-GI-NEXT:    str q0, [sp]
+; CHECK-GI-NEXT:    ldr w0, [x9, x8, lsl #2]
+; CHECK-GI-NEXT:    add sp, sp, #16
+; CHECK-GI-NEXT:    ret
+entry:
+  %vector = select <4 x i1> %cond, <4 x i32> %a, <4 x i32> <i32 42, i32 11, i32 17, i32 6>
+  %d = extractelement <4 x i32> %vector, i32 %c
+  ret i32 %d
+}
+
+define i32 @extract_v4i32_select_const(<4 x i32> %a, <4 x i32> %b, i32 %c, <4 x i1> %cond) {
+; CHECK-SD-LABEL: extract_v4i32_select_const:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    ushll v1.4s, v2.4h, #0
+; CHECK-SD-NEXT:    movi v2.4s, #17
+; CHECK-SD-NEXT:    shl v1.4s, v1.4s, #31
+; CHECK-SD-NEXT:    cmlt v1.4s, v1.4s, #0
+; CHECK-SD-NEXT:    bif v0.16b, v2.16b, v1.16b
+; CHECK-SD-NEXT:    mov w0, v0.s[2]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extract_v4i32_select_const:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v1.4s, v2.4h, #0
+; CHECK-GI-NEXT:    adrp x8, .LCPI23_0
+; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI23_0]
+; CHECK-GI-NEXT:    shl v1.4s, v1.4s, #31
+; CHECK-GI-NEXT:    sshr v1.4s, v1.4s, #31
+; CHECK-GI-NEXT:    bif v0.16b, v2.16b, v1.16b
+; CHECK-GI-NEXT:    mov s0, v0.s[2]
+; CHECK-GI-NEXT:    fmov w0, s0
+; CHECK-GI-NEXT:    ret
+entry:
+  %vector = select <4 x i1> %cond, <4 x i32> %a, <4 x i32> <i32 42, i32 11, i32 17, i32 6>
+  %d = extractelement <4 x i32> %vector, i32 2
+  ret i32 %d
+}
+
+define i32 @extract_v4i32_abs(<4 x float> %a, i32 %c) {
+; CHECK-SD-LABEL: extract_v4i32_abs:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    sub sp, sp, #16
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-SD-NEXT:    frintp v0.4s, v0.4s
+; CHECK-SD-NEXT:    mov x8, sp
+; CHECK-SD-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-SD-NEXT:    bfi x8, x0, #2, #2
+; CHECK-SD-NEXT:    frintm v0.4s, v0.4s
+; CHECK-SD-NEXT:    fabs v0.4s, v0.4s
+; CHECK-SD-NEXT:    fcvtzs v0.4s, v0.4s
+; CHECK-SD-NEXT:    abs v0.4s, v0.4s
+; CHECK-SD-NEXT:    str q0, [sp]
+; CHECK-SD-NEXT:    ldr w0, [x8]
+; CHECK-SD-NEXT:    add sp, sp, #16
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extract_v4i32_abs:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sub sp, sp, #16
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-GI-NEXT:    frintp v0.4s, v0.4s
+; CHECK-GI-NEXT:    mov w9, w0
+; CHECK-GI-NEXT:    mov x8, sp
+; CHECK-GI-NEXT:    and x9, x9, #0x3
+; CHECK-GI-NEXT:    frintm v0.4s, v0.4s
+; CHECK-GI-NEXT:    fabs v0.4s, v0.4s
+; CHECK-GI-NEXT:    fcvtzs v0.4s, v0.4s
+; CHECK-GI-NEXT:    abs v0.4s, v0.4s
+; CHECK-GI-NEXT:    str q0, [sp]
+; CHECK-GI-NEXT:    ldr w0, [x8, x9, lsl #2]
+; CHECK-GI-NEXT:    add sp, sp, #16
+; CHECK-GI-NEXT:    ret
+entry:
+  %ceil = call <4 x float> @llvm.ceil.v4float(<4 x float> %a)
+  %floor = call <4 x float> @llvm.floor.v4float(<4 x float> %ceil)
+  %fabs = call <4 x float> @llvm.fabs.v4float(<4 x float> %floor)
+  %abs = fptosi <4 x float> %fabs to <4 x i32>
+  %vector = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %abs, i1 0)
+  %d = extractelement <4 x i32> %vector, i32 %c
+  ret i32 %d
+}
+
+define i32 @extract_v4i32_abs_const(<4 x float> %a, i32 %c) {
+; CHECK-SD-LABEL: extract_v4i32_abs_const:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov w0, #4 // =0x4
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extract_v4i32_abs_const:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    adrp x8, .LCPI25_0
+; CHECK-GI-NEXT:    ldr q0, [x8, :lo12:.LCPI25_0]
+; CHECK-GI-NEXT:    frintp v0.4s, v0.4s
+; CHECK-GI-NEXT:    frintm v0.4s, v0.4s
+; CHECK-GI-NEXT:    fabs v0.4s, v0.4s
+; CHECK-GI-NEXT:    fcvtzs v0.4s, v0.4s
+; CHECK-GI-NEXT:    abs v0.4s, v0.4s
+; CHECK-GI-NEXT:    mov s0, v0.s[1]
+; CHECK-GI-NEXT:    fmov w0, s0
+; CHECK-GI-NEXT:    ret
+entry:
+  %ceil = call <4 x float> @llvm.ceil.v4float(<4 x float> <float 1.0, float 4.0, float 3.0, float 2.0>)
+  %floor = call <4 x float> @llvm.floor.v4float(<4 x float> %ceil)
+  %fabs = call <4 x float> @llvm.fabs.v4float(<4 x float> %floor)
+  %abs = fptosi <4 x float> %fabs to <4 x i32>
+  %vector = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %abs, i1 0)
+  %d = extractelement <4 x i32> %vector, i32 1
+  ret i32 %d
+}
+
+define i32 @extract_v4i32_abs_half_const(<4 x float> %a, i32 %c) {
+; CHECK-SD-LABEL: extract_v4i32_abs_half_const:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    sub sp, sp, #16
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-SD-NEXT:    adrp x8, .LCPI26_0
+; CHECK-SD-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-SD-NEXT:    ldr q0, [x8, :lo12:.LCPI26_0]
+; CHECK-SD-NEXT:    mov x8, sp
+; CHECK-SD-NEXT:    bfi x8, x0, #2, #2
+; CHECK-SD-NEXT:    str q0, [sp]
+; CHECK-SD-NEXT:    ldr w0, [x8]
+; CHECK-SD-NEXT:    add sp, sp, #16
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extract_v4i32_abs_half_const:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sub sp, sp, #16
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-GI-NEXT:    adrp x8, .LCPI26_0
+; CHECK-GI-NEXT:    mov x9, sp
+; CHECK-GI-NEXT:    ldr q0, [x8, :lo12:.LCPI26_0]
+; CHECK-GI-NEXT:    mov w8, w0
+; CHECK-GI-NEXT:    and x8, x8, #0x3
+; CHECK-GI-NEXT:    frintp v0.4s, v0.4s
+; CHECK-GI-NEXT:    frintm v0.4s, v0.4s
+; CHECK-GI-NEXT:    fabs v0.4s, v0.4s
+; CHECK-GI-NEXT:    fcvtzs v0.4s, v0.4s
+; CHECK-GI-NEXT:    abs v0.4s, v0.4s
+; CHECK-GI-NEXT:    str q0, [sp]
+; CHECK-GI-NEXT:    ldr w0, [x9, x8, lsl #2]
+; CHECK-GI-NEXT:    add sp, sp, #16
+; CHECK-GI-NEXT:    ret
+entry:
+  %ceil = call <4 x float> @llvm.ceil.v4float(<4 x float> <float 1.0, float 4.0, float 3.0, float 2.0>)
+  %floor = call <4 x float> @llvm.floor.v4float(<4 x float> %ceil)
+  %fabs = call <4 x float> @llvm.fabs.v4float(<4 x float> %floor)
+  %abs = fptosi <4 x float> %fabs to <4 x i32>
+  %vector = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %abs, i1 0)
+  %d = extractelement <4 x i32> %vector, i32 %c
+  ret i32 %d
+}
+
+define i32 @extract_v4i32_vector_insert(<4 x i32> %a, <2 x i32> %b, i32 %c) {
+; CHECK-LABEL: extract_v4i32_vector_insert:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sub sp, sp, #16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT:    bfi x8, x0, #2, #2
+; CHECK-NEXT:    mov v1.d[1], v0.d[0]
+; CHECK-NEXT:    str q1, [sp]
+; CHECK-NEXT:    ldr w0, [x8]
+; CHECK-NEXT:    add sp, sp, #16
+; CHECK-NEXT:    ret
+entry:
+  %vector = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> %a, <2 x i32> %b, i64 0)
+  %d = extractelement <4 x i32> %vector, i32 %c
+  ret i32 %d
+}
+
+define i32 @extract_v4i32_vector_insert_const(<4 x i32> %a, <2 x i32> %b, i32 %c) {
+; CHECK-LABEL: extract_v4i32_vector_insert_const:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    mov w0, v1.s[1]
+; CHECK-NEXT:    ret
+entry:
+  %vector = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> %a, <2 x i32> %b, i64 0)
+  %d = extractelement <4 x i32> %vector, i32 1
+  ret i32 %d
+}
+
+define i32 @extract_v4i32_vector_extract(<4 x i32> %a, <2 x i32> %b, i32 %c) {
+; CHECK-LABEL: extract_v4i32_vector_extract:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sub sp, sp, #16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT:    str q0, [sp]
+; CHECK-NEXT:    bfi x8, x0, #2, #2
+; CHECK-NEXT:    ldr w0, [x8]
+; CHECK-NEXT:    add sp, sp, #16
+; CHECK-NEXT:    ret
+entry:
+  %vector = call <4 x i32> @llvm.vector.extract.v2i32.v4i32(<4 x i32> %a, i64 0)
+  %d = extractelement <4 x i32> %vector, i32 %c
+  ret i32 %d
+}
+
+define i32 @extract_v4i32_vector_extract_const(<4 x i32> %a, <2 x i32> %b, i32 %c) {
+; CHECK-LABEL: extract_v4i32_vector_extract_const:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+entry:
+  %vector = call <4 x i32> @llvm.vector.extract.v2i32.v4i32(<4 x i32> %a, i64 0)
+  %d = extractelement <4 x i32> %vector, i32 0
+  ret i32 %d
+}
+
+define i32 @extract_v4i32_load(<4 x i32> %a, <2 x i32> %b, i32 %c, ptr %arg) {
+; CHECK-SD-LABEL: extract_v4i32_load:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-SD-NEXT:    and x8, x0, #0x3
+; CHECK-SD-NEXT:    ldr w0, [x1, x8, lsl #2]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extract_v4i32_load:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov w8, w0
+; CHECK-GI-NEXT:    and x8, x8, #0x3
+; CHECK-GI-NEXT:    ldr w0, [x1, x8, lsl #2]
+; CHECK-GI-NEXT:    ret
+entry:
+  %vector = load  <4 x i32>, ptr %arg
+  %d = extractelement <4 x i32> %vector, i32 %c
+  ret i32 %d
+}
+
+define i32 @extract_v4i32_load_const(<4 x i32> %a, <2 x i32> %b, i32 %c, ptr %arg) {
+; CHECK-LABEL: extract_v4i32_load_const:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ldr w0, [x1]
+; CHECK-NEXT:    ret
+entry:
+  %vector = load  <4 x i32>, ptr %arg
+  %d = extractelement <4 x i32> %vector, i32 0
+  ret i32 %d
+}
+
+define double @extract_v4i32_bitcast(<4 x i32> %a, i32 %c) {
+; CHECK-SD-LABEL: extract_v4i32_bitcast:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    sub sp, sp, #16
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-SD-NEXT:    mov x8, sp
+; CHECK-SD-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-SD-NEXT:    str q0, [sp]
+; CHECK-SD-NEXT:    bfi x8, x0, #3, #1
+; CHECK-SD-NEXT:    ldr d0, [x8]
+; CHECK-SD-NEXT:    add sp, sp, #16
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extract_v4i32_bitcast:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sub sp, sp, #16
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-GI-NEXT:    mov w9, w0
+; CHECK-GI-NEXT:    mov x8, sp
+; CHECK-GI-NEXT:    str q0, [sp]
+; CHECK-GI-NEXT:    and x9, x9, #0x1
+; CHECK-GI-NEXT:    ldr d0, [x8, x9, lsl #3]
+; CHECK-GI-NEXT:    add sp, sp, #16
+; CHECK-GI-NEXT:    ret
+entry:
+  %vector = bitcast <4 x i32> %a to <2 x double>
+  %d = extractelement <2 x double> %vector, i32 %c
+  ret double %d
+}
+
+define double @extract_v4i32_bitcast_const(<4 x i32> %a, i32 %c) {
+; CHECK-LABEL: extract_v4i32_bitcast_const:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    ret
+entry:
+  %vector = bitcast <4 x i32> %a to <2 x double>
+  %d = extractelement <2 x double> %vector, i32 0
+  ret double %d
+}
+
+define i32 @extract_v4i32_shuffle(<4 x i32> %a, <4 x i32> %b, i32 %c) {
+; CHECK-SD-LABEL: extract_v4i32_shuffle:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    sub sp, sp, #16
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-SD-NEXT:    uzp1 v1.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT:    mov x8, sp
+; CHECK-SD-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-SD-NEXT:    bfi x8, x0, #2, #2
+; CHECK-SD-NEXT:    mov v1.s[3], v0.s[3]
+; CHECK-SD-NEXT:    str q1, [sp]
+; CHECK-SD-NEXT:    ldr w0, [x8]
+; CHECK-SD-NEXT:    add sp, sp, #16
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extract_v4i32_shuffle:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sub sp, sp, #16
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-GI-NEXT:    adrp x8, .LCPI35_0
+; CHECK-GI-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
+; CHECK-GI-NEXT:    mov x9, sp
+; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI35_0]
+; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
+; CHECK-GI-NEXT:    mov w8, w0
+; CHECK-GI-NEXT:    and x8, x8, #0x3
+; CHECK-GI-NEXT:    tbl v0.16b, { v0.16b, v1.16b }, v2.16b
+; CHECK-GI-NEXT:    str q0, [sp]
+; CHECK-GI-NEXT:    ldr w0, [x9, x8, lsl #2]
+; CHECK-GI-NEXT:    add sp, sp, #16
+; CHECK-GI-NEXT:    ret
+entry:
+  %vector = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 3>
+  %d = extractelement <4 x i32> %vector, i32 %c
+  ret i32 %d
+}
+
+define i32 @extract_v4i32_shuffle_const(<4 x i32> %a, <4 x i32> %b, i32 %c) {
+; CHECK-SD-LABEL: extract_v4i32_shuffle_const:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    fmov w0, s1
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extract_v4i32_shuffle_const:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    adrp x8, .LCPI36_0
+; CHECK-GI-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
+; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI36_0]
+; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
+; CHECK-GI-NEXT:    tbl v0.16b, { v0.16b, v1.16b }, v2.16b
+; CHECK-GI-NEXT:    mov s0, v0.s[2]
+; CHECK-GI-NEXT:    fmov w0, s0
+; CHECK-GI-NEXT:    ret
+entry:
+  %vector = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 3>
+  %d = extractelement <4 x i32> %vector, i32 2
+  ret i32 %d
+}
+
+define i32 @extract_v4i32_splat(<4 x i32> %a, <2 x i32> %b, i32 %c) {
+; CHECK-SD-LABEL: extract_v4i32_splat:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    sub sp, sp, #16
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-SD-NEXT:    movi v0.4s, #11
+; CHECK-SD-NEXT:    mov x8, sp
+; CHECK-SD-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-SD-NEXT:    bfi x8, x0, #2, #2
+; CHECK-SD-NEXT:    str q0, [sp]
+; CHECK-SD-NEXT:    ldr w0, [x8]
+; CHECK-SD-NEXT:    add sp, sp, #16
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extract_v4i32_splat:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sub sp, sp, #16
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-GI-NEXT:    movi v0.4s, #11
+; CHECK-GI-NEXT:    mov w8, w0
+; CHECK-GI-NEXT:    mov x9, sp
+; CHECK-GI-NEXT:    and x8, x8, #0x3
+; CHECK-GI-NEXT:    str q0, [sp]
+; CHECK-GI-NEXT:    ldr w0, [x9, x8, lsl #2]
+; CHECK-GI-NEXT:    add sp, sp, #16
+; CHECK-GI-NEXT:    ret
+entry:
+  %d = extractelement <4 x i32> splat (i32 11), i32 %c
+  ret i32 %d
+}
+
+define i32 @extract_v4i32_splat_const(<4 x i32> %a, <2 x i32> %b, i32 %c) {
+; CHECK-LABEL: extract_v4i32_splat_const:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov w0, #11 // =0xb
+; CHECK-NEXT:    ret
+entry:
+  %d = extractelement <4 x i32> splat (i32 11), i32 0
+  ret i32 %d
+}
+
+define i32 @extract_v4i32_vp_add(<4 x i32> %a, <4 x i32> %b, i32 %c, <4 x i1> %mask, i32 %evl) {
+; CHECK-SD-LABEL: extract_v4i32_vp_add:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    sub sp, sp, #16
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-SD-NEXT:    add v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT:    mov x8, sp
+; CHECK-SD-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-SD-NEXT:    bfi x8, x0, #2, #2
+; CHECK-SD-NEXT:    str q0, [sp]
+; CHECK-SD-NEXT:    ldr w0, [x8]
+; CHECK-SD-NEXT:    add sp, sp, #16
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extract_v4i32_vp_add:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sub sp, sp, #16
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-GI-NEXT:    add v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT:    mov w8, w0
+; CHECK-GI-NEXT:    mov x9, sp
+; CHECK-GI-NEXT:    and x8, x8, #0x3
+; CHECK-GI-NEXT:    str q0, [sp]
+; CHECK-GI-NEXT:    ldr w0, [x9, x8, lsl #2]
+; CHECK-GI-NEXT:    add sp, sp, #16
+; CHECK-GI-NEXT:    ret
+entry:
+  %vector = call <4 x i32> @llvm.vp.add.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i1> %mask, i32 %evl)
+  %d = extractelement <4 x i32> %vector, i32 %c
+  ret i32 %d
+}
+
+define i32 @extract_v4i32_vp_add_const(<4 x i32> %a, <4 x i32> %b, i32 %c, <4 x i1> %mask, i32 %evl) {
+; CHECK-SD-LABEL: extract_v4i32_vp_add_const:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    add v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT:    mov w0, v0.s[3]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extract_v4i32_vp_add_const:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    add v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT:    mov s0, v0.s[3]
+; CHECK-GI-NEXT:    fmov w0, s0
+; CHECK-GI-NEXT:    ret
+entry:
+  %vector = call <4 x i32> @llvm.vp.add.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i1> %mask, i32 %evl)
+  %d = extractelement <4 x i32> %vector, i32 3
+  ret i32 %d
+}
+
+define i32 @extract_v4i32_phi(i64 %val, i32  %limit, ptr %ptr) {
+; CHECK-SD-LABEL: extract_v4i32_phi:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    dup v1.2s, w0
+; CHECK-SD-NEXT:    adrp x8, .LCPI41_0
+; CHECK-SD-NEXT:    movi v0.2s, #16
+; CHECK-SD-NEXT:    ldr d2, [x8, :lo12:.LCPI41_0]
+; CHECK-SD-NEXT:    add v1.2s, v1.2s, v2.2s
+; CHECK-SD-NEXT:  .LBB41_1: // %loop
+; CHECK-SD-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-SD-NEXT:    fmov w8, s1
+; CHECK-SD-NEXT:    add v1.2s, v1.2s, v0.2s
+; CHECK-SD-NEXT:    cmp w8, w1
+; CHECK-SD-NEXT:    add w0, w8, #10
+; CHECK-SD-NEXT:    str w0, [x2, w8, sxtw #2]
+; CHECK-SD-NEXT:    b.lo .LBB41_1
+; CHECK-SD-NEXT:  // %bb.2: // %ret
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extract_v4i32_phi:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    adrp x8, .LCPI41_0
+; CHECK-GI-NEXT:    dup v0.2d, x0
+; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI41_0]
+; CHECK-GI-NEXT:    add v1.2d, v0.2d, v1.2d
+; CHECK-GI-NEXT:    movi v0.2s, #16
+; CHECK-GI-NEXT:    xtn v1.2s, v1.2d
+; CHECK-GI-NEXT:  .LBB41_1: // %loop
+; CHECK-GI-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-GI-NEXT:    fmov w8, s1
+; CHECK-GI-NEXT:    fmov w9, s1
+; CHECK-GI-NEXT:    add v1.2s, v1.2s, v0.2s
+; CHECK-GI-NEXT:    cmp w8, w1
+; CHECK-GI-NEXT:    add w0, w9, #10
+; CHECK-GI-NEXT:    str w0, [x2, w8, sxtw #2]
+; CHECK-GI-NEXT:    b.lo .LBB41_1
+; CHECK-GI-NEXT:  // %bb.2: // %ret
+; CHECK-GI-NEXT:    ret
+entry:
+  %tempvector = insertelement <2 x i64> undef, i64 %val, i32 0
+  %vector = shufflevector <2 x i64> %tempvector, <2 x i64> undef, <2 x i32> zeroinitializer
+  %0 = add <2 x i64> %vector, <i64 1, i64 2>
+  %1 = trunc <2 x i64> %0 to <2 x i32>
+  br label %loop
+
+loop:
+  %2 = phi <2 x i32> [ %1, %entry ], [ %inc, %loop ]
+  %elt = extractelement <2 x i32> %2, i32 0
+  %end = icmp ult i32 %elt, %limit
+  %3 = add i32 10, %elt
+  %4 = sext i32 %elt to i64
+  %5 = getelementptr i32, ptr %ptr, i64 %4
+  store i32 %3, ptr %5
+  %inc = add <2 x i32> %2, <i32 16, i32 16>
+  br i1 %end, label %loop, label %ret
+
+ret:
+  ret i32 %3
+}
+
+
diff --git a/llvm/test/CodeGen/AArch64/fcmp.ll b/llvm/test/CodeGen/AArch64/fcmp.ll
index 2d0b5574cdd7ba..9916aeeab1cad1 100644
--- a/llvm/test/CodeGen/AArch64/fcmp.ll
+++ b/llvm/test/CodeGen/AArch64/fcmp.ll
@@ -1108,61 +1108,54 @@ define <7 x i32> @v7f16_i32(<7 x half> %a, <7 x half> %b, <7 x i32> %d, <7 x i32
 ;
 ; CHECK-GI-FP16-LABEL: v7f16_i32:
 ; CHECK-GI-FP16:       // %bb.0: // %entry
-; CHECK-GI-FP16-NEXT:    fcmgt v1.8h, v1.8h, v0.8h
-; CHECK-GI-FP16-NEXT:    mov w12, #31 // =0x1f
-; CHECK-GI-FP16-NEXT:    ldr s4, [sp]
-; CHECK-GI-FP16-NEXT:    fmov s2, w12
+; CHECK-GI-FP16-NEXT:    fcmgt v0.8h, v1.8h, v0.8h
+; CHECK-GI-FP16-NEXT:    mov w10, #31 // =0x1f
+; CHECK-GI-FP16-NEXT:    ldr s3, [sp]
+; CHECK-GI-FP16-NEXT:    fmov s1, w10
 ; CHECK-GI-FP16-NEXT:    fmov s6, w0
-; CHECK-GI-FP16-NEXT:    ldr s5, [sp, #8]
+; CHECK-GI-FP16-NEXT:    ldr s4, [sp, #8]
 ; CHECK-GI-FP16-NEXT:    ldr s7, [sp, #24]
 ; CHECK-GI-FP16-NEXT:    ldr s16, [sp, #32]
-; CHECK-GI-FP16-NEXT:    umov w9, v1.h[4]
-; CHECK-GI-FP16-NEXT:    umov w8, v1.h[0]
-; CHECK-GI-FP16-NEXT:    umov w11, v1.h[5]
-; CHECK-GI-FP16-NEXT:    umov w10, v1.h[1]
-; CHECK-GI-FP16-NEXT:    mov v2.s[1], w12
-; CHECK-GI-FP16-NEXT:    umov w13, v1.h[2]
+; CHECK-GI-FP16-NEXT:    umov w8, v0.h[4]
+; CHECK-GI-FP16-NEXT:    umov w9, v0.h[5]
+; CHECK-GI-FP16-NEXT:    mov v1.s[1], w10
 ; CHECK-GI-FP16-NEXT:    mov v6.s[1], w1
 ; CHECK-GI-FP16-NEXT:    mov v7.s[1], v16.s[0]
 ; CHECK-GI-FP16-NEXT:    ldr s16, [sp, #40]
-; CHECK-GI-FP16-NEXT:    fmov s3, w9
-; CHECK-GI-FP16-NEXT:    fmov s0, w8
-; CHECK-GI-FP16-NEXT:    umov w8, v1.h[6]
-; CHECK-GI-FP16-NEXT:    mov v2.s[2], w12
-; CHECK-GI-FP16-NEXT:    umov w9, v1.h[3]
+; CHECK-GI-FP16-NEXT:    fmov s2, w8
+; CHECK-GI-FP16-NEXT:    umov w8, v0.h[6]
+; CHECK-GI-FP16-NEXT:    mov v1.s[2], w10
+; CHECK-GI-FP16-NEXT:    ushll v0.4s, v0.4h, #0
 ; CHECK-GI-FP16-NEXT:    mov v6.s[2], w2
 ; CHECK-GI-FP16-NEXT:    mov v7.s[2], v16.s[0]
-; CHECK-GI-FP16-NEXT:    mov v3.s[1], w11
-; CHECK-GI-FP16-NEXT:    mov v0.s[1], w10
-; CHECK-GI-FP16-NEXT:    mov w10, #-1 // =0xffffffff
-; CHECK-GI-FP16-NEXT:    fmov s1, w10
-; CHECK-GI-FP16-NEXT:    neg v17.4s, v2.4s
+; CHECK-GI-FP16-NEXT:    mov v2.s[1], w9
+; CHECK-GI-FP16-NEXT:    mov w9, #-1 // =0xffffffff
+; CHECK-GI-FP16-NEXT:    fmov s5, w9
+; CHECK-GI-FP16-NEXT:    neg v17.4s, v1.4s
+; CHECK-GI-FP16-NEXT:    shl v0.4s, v0.4s, #31
 ; CHECK-GI-FP16-NEXT:    mov v6.s[3], w3
+; CHECK-GI-FP16-NEXT:    mov v2.s[2], w8
+; CHECK-GI-FP16-NEXT:    fmov w8, s3
+; CHECK-GI-FP16-NEXT:    fmov s3, w7
+; CHECK-GI-FP16-NEXT:    mov v5.s[1], w9
+; CHECK-GI-FP16-NEXT:    sshr v0.4s, v0.4s, #31
+; CHECK-GI-FP16-NEXT:    mov v3.s[1], w8
+; CHECK-GI-FP16-NEXT:    fmov w8, s4
+; CHECK-GI-FP16-NEXT:    ldr s4, [sp, #16]
+; CHECK-GI-FP16-NEXT:    ushl v1.4s, v2.4s, v1.4s
+; CHECK-GI-FP16-NEXT:    fmov s2, w4
+; CHECK-GI-FP16-NEXT:    mov v5.s[2], w9
+; CHECK-GI-FP16-NEXT:    mov v2.s[1], w5
 ; CHECK-GI-FP16-NEXT:    mov v3.s[2], w8
+; CHECK-GI-FP16-NEXT:    sshl v1.4s, v1.4s, v17.4s
 ; CHECK-GI-FP16-NEXT:    fmov w8, s4
-; CHECK-GI-FP16-NEXT:    fmov s4, w7
-; CHECK-GI-FP16-NEXT:    mov v0.s[2], w13
-; CHECK-GI-FP16-NEXT:    mov v1.s[1], w10
-; CHECK-GI-FP16-NEXT:    mov v4.s[1], w8
-; CHECK-GI-FP16-NEXT:    fmov w8, s5
-; CHECK-GI-FP16-NEXT:    ldr s5, [sp, #16]
-; CHECK-GI-FP16-NEXT:    ushl v2.4s, v3.4s, v2.4s
-; CHECK-GI-FP16-NEXT:    fmov s3, w4
-; CHECK-GI-FP16-NEXT:    mov v0.s[3], w9
-; CHECK-GI-FP16-NEXT:    mov v1.s[2], w10
-; CHECK-GI-FP16-NEXT:    mov v3.s[1], w5
-; CHECK-GI-FP16-NEXT:    mov v4.s[2], w8
-; CHECK-GI-FP16-NEXT:    sshl v2.4s, v2.4s, v17.4s
-; CHECK-GI-FP16-NEXT:    fmov w8, s5
-; CHECK-GI-FP16-NEXT:    shl v0.4s, v0.4s, #31
-; CHECK-GI-FP16-NEXT:    eor v1.16b, v2.16b, v1.16b
-; CHECK-GI-FP16-NEXT:    mov v3.s[2], w6
-; CHECK-GI-FP16-NEXT:    mov v4.s[3], w8
-; CHECK-GI-FP16-NEXT:    sshr v0.4s, v0.4s, #31
-; CHECK-GI-FP16-NEXT:    and v1.16b, v7.16b, v1.16b
-; CHECK-GI-FP16-NEXT:    and v2.16b, v3.16b, v2.16b
-; CHECK-GI-FP16-NEXT:    bsl v0.16b, v6.16b, v4.16b
-; CHECK-GI-FP16-NEXT:    orr v1.16b, v2.16b, v1.16b
+; CHECK-GI-FP16-NEXT:    eor v4.16b, v1.16b, v5.16b
+; CHECK-GI-FP16-NEXT:    mov v2.s[2], w6
+; CHECK-GI-FP16-NEXT:    mov v3.s[3], w8
+; CHECK-GI-FP16-NEXT:    and v1.16b, v2.16b, v1.16b
+; CHECK-GI-FP16-NEXT:    and v2.16b, v7.16b, v4.16b
+; CHECK-GI-FP16-NEXT:    bsl v0.16b, v6.16b, v3.16b
+; CHECK-GI-FP16-NEXT:    orr v1.16b, v1.16b, v2.16b
 ; CHECK-GI-FP16-NEXT:    mov s2, v0.s[1]
 ; CHECK-GI-FP16-NEXT:    mov s3, v0.s[2]
 ; CHECK-GI-FP16-NEXT:    mov s4, v0.s[3]
diff --git a/llvm/test/CodeGen/AArch64/fexplog.ll b/llvm/test/CodeGen/AArch64/fexplog.ll
index 519a2978d8604b..93d3d96d67b650 100644
--- a/llvm/test/CodeGen/AArch64/fexplog.ll
+++ b/llvm/test/CodeGen/AArch64/fexplog.ll
@@ -36,6 +36,19 @@ entry:
   ret half %c
 }
 
+define <1 x double> @exp_v1f64(<1 x double> %x) {
+; CHECK-LABEL: exp_v1f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w30, -16
+; CHECK-NEXT:    bl exp
+; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %c = call <1 x double> @llvm.exp.v1f64(<1 x double> %x)
+  ret <1 x double> %c
+}
+
 define <2 x double> @exp_v2f64(<2 x double> %a) {
 ; CHECK-SD-LABEL: exp_v2f64:
 ; CHECK-SD:       // %bb.0: // %entry
@@ -1293,6 +1306,19 @@ entry:
   ret half %c
 }
 
+define <1 x double> @exp2_v1f64(<1 x double> %x) {
+; CHECK-LABEL: exp2_v1f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w30, -16
+; CHECK-NEXT:    bl exp2
+; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %c = call <1 x double> @llvm.exp2.v1f64(<1 x double> %x)
+  ret <1 x double> %c
+}
+
 define <2 x double> @exp2_v2f64(<2 x double> %a) {
 ; CHECK-SD-LABEL: exp2_v2f64:
 ; CHECK-SD:       // %bb.0: // %entry
@@ -2550,6 +2576,19 @@ entry:
   ret half %c
 }
 
+define <1 x double> @log_v1f64(<1 x double> %x) {
+; CHECK-LABEL: log_v1f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w30, -16
+; CHECK-NEXT:    bl log
+; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %c = call <1 x double> @llvm.log.v1f64(<1 x double> %x)
+  ret <1 x double> %c
+}
+
 define <2 x double> @log_v2f64(<2 x double> %a) {
 ; CHECK-SD-LABEL: log_v2f64:
 ; CHECK-SD:       // %bb.0: // %entry
@@ -3807,6 +3846,19 @@ entry:
   ret half %c
 }
 
+define <1 x double> @log2_v1f64(<1 x double> %x) {
+; CHECK-LABEL: log2_v1f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w30, -16
+; CHECK-NEXT:    bl log2
+; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %c = call <1 x double> @llvm.log2.v1f64(<1 x double> %x)
+  ret <1 x double> %c
+}
+
 define <2 x double> @log2_v2f64(<2 x double> %a) {
 ; CHECK-SD-LABEL: log2_v2f64:
 ; CHECK-SD:       // %bb.0: // %entry
@@ -5064,6 +5116,19 @@ entry:
   ret half %c
 }
 
+define <1 x double> @log10_v1f64(<1 x double> %x) {
+; CHECK-LABEL: log10_v1f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w30, -16
+; CHECK-NEXT:    bl log10
+; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %c = call <1 x double> @llvm.log10.v1f64(<1 x double> %x)
+  ret <1 x double> %c
+}
+
 define <2 x double> @log10_v2f64(<2 x double> %a) {
 ; CHECK-SD-LABEL: log10_v2f64:
 ; CHECK-SD:       // %bb.0: // %entry
diff --git a/llvm/test/CodeGen/AArch64/fixed-vector-deinterleave.ll b/llvm/test/CodeGen/AArch64/fixed-vector-deinterleave.ll
index 1b1cfead0f97ac..2ad5623b655176 100644
--- a/llvm/test/CodeGen/AArch64/fixed-vector-deinterleave.ll
+++ b/llvm/test/CodeGen/AArch64/fixed-vector-deinterleave.ll
@@ -1,29 +1,50 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=aarch64-linux-gnu | FileCheck %s
+; RUN: llc -mtriple=aarch64-none-linux-gnu %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD
+; RUN: llc -mtriple=aarch64-none-linux-gnu -global-isel -global-isel-abort=2 %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
 
 define {<2 x half>, <2 x half>} @vector_deinterleave_v2f16_v4f16(<4 x half> %vec) {
-; CHECK-LABEL: vector_deinterleave_v2f16_v4f16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    dup v2.2s, v0.s[1]
-; CHECK-NEXT:    mov v1.16b, v2.16b
-; CHECK-NEXT:    mov v1.h[0], v0.h[1]
-; CHECK-NEXT:    mov v0.h[1], v2.h[0]
-; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; CHECK-NEXT:    // kill: def $d1 killed $d1 killed $q1
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: vector_deinterleave_v2f16_v4f16:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT:    dup v2.2s, v0.s[1]
+; CHECK-SD-NEXT:    mov v1.16b, v2.16b
+; CHECK-SD-NEXT:    mov v1.h[0], v0.h[1]
+; CHECK-SD-NEXT:    mov v0.h[1], v2.h[0]
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 killed $q1
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: vector_deinterleave_v2f16_v4f16:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    uzp1 v2.4h, v0.4h, v0.4h
+; CHECK-GI-NEXT:    uzp2 v1.4h, v0.4h, v0.4h
+; CHECK-GI-NEXT:    mov h0, v2.h[1]
+; CHECK-GI-NEXT:    mov h3, v1.h[1]
+; CHECK-GI-NEXT:    mov v2.h[1], v0.h[0]
+; CHECK-GI-NEXT:    mov v1.h[1], v3.h[0]
+; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 killed $q1
+; CHECK-GI-NEXT:    fmov d0, d2
+; CHECK-GI-NEXT:    ret
   %retval = call {<2 x half>, <2 x half>} @llvm.experimental.vector.deinterleave2.v4f16(<4 x half> %vec)
   ret {<2 x half>, <2 x half>}   %retval
 }
 
 define {<4 x half>, <4 x half>} @vector_deinterleave_v4f16_v8f16(<8 x half> %vec) {
-; CHECK-LABEL: vector_deinterleave_v4f16_v8f16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
-; CHECK-NEXT:    uzp1 v2.4h, v0.4h, v1.4h
-; CHECK-NEXT:    uzp2 v1.4h, v0.4h, v1.4h
-; CHECK-NEXT:    fmov d0, d2
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: vector_deinterleave_v4f16_v8f16:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
+; CHECK-SD-NEXT:    uzp1 v2.4h, v0.4h, v1.4h
+; CHECK-SD-NEXT:    uzp2 v1.4h, v0.4h, v1.4h
+; CHECK-SD-NEXT:    fmov d0, d2
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: vector_deinterleave_v4f16_v8f16:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    uzp1 v2.8h, v0.8h, v0.8h
+; CHECK-GI-NEXT:    uzp2 v1.8h, v0.8h, v0.8h
+; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 killed $q1
+; CHECK-GI-NEXT:    fmov d0, d2
+; CHECK-GI-NEXT:    ret
   %retval = call {<4 x half>, <4 x half>} @llvm.experimental.vector.deinterleave2.v8f16(<8 x half> %vec)
   ret {<4 x half>, <4 x half>}   %retval
 }
@@ -40,13 +61,21 @@ define {<8 x half>, <8 x half>} @vector_deinterleave_v8f16_v16f16(<16 x half> %v
 }
 
 define {<2 x float>, <2 x float>} @vector_deinterleave_v2f32_v4f32(<4 x float> %vec) {
-; CHECK-LABEL: vector_deinterleave_v2f32_v4f32:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
-; CHECK-NEXT:    zip1 v2.2s, v0.2s, v1.2s
-; CHECK-NEXT:    zip2 v1.2s, v0.2s, v1.2s
-; CHECK-NEXT:    fmov d0, d2
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: vector_deinterleave_v2f32_v4f32:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
+; CHECK-SD-NEXT:    zip1 v2.2s, v0.2s, v1.2s
+; CHECK-SD-NEXT:    zip2 v1.2s, v0.2s, v1.2s
+; CHECK-SD-NEXT:    fmov d0, d2
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: vector_deinterleave_v2f32_v4f32:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    uzp1 v2.4s, v0.4s, v0.4s
+; CHECK-GI-NEXT:    uzp2 v1.4s, v0.4s, v0.4s
+; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 killed $q1
+; CHECK-GI-NEXT:    fmov d0, d2
+; CHECK-GI-NEXT:    ret
   %retval = call {<2 x float>, <2 x float>} @llvm.experimental.vector.deinterleave2.v4f32(<4 x float> %vec)
   ret {<2 x float>, <2 x float>}   %retval
 }
diff --git a/llvm/test/CodeGen/AArch64/fixed-vector-interleave.ll b/llvm/test/CodeGen/AArch64/fixed-vector-interleave.ll
index 071c1ffdbb45dc..eb81aff33e4963 100644
--- a/llvm/test/CodeGen/AArch64/fixed-vector-interleave.ll
+++ b/llvm/test/CodeGen/AArch64/fixed-vector-interleave.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=aarch64-linux-gnu | FileCheck %s
+; RUN: llc -mtriple=aarch64-none-linux-gnu %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD
+; RUN: llc -mtriple=aarch64-none-linux-gnu -global-isel -global-isel-abort=2 %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
 
 define <4 x half> @interleave2_v4f16(<2 x half> %vec0, <2 x half> %vec1) {
 ; CHECK-LABEL: interleave2_v4f16:
@@ -11,15 +12,22 @@ define <4 x half> @interleave2_v4f16(<2 x half> %vec0, <2 x half> %vec1) {
 }
 
 define <8 x half> @interleave2_v8f16(<4 x half> %vec0, <4 x half> %vec1) {
-; CHECK-LABEL: interleave2_v8f16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
-; CHECK-NEXT:    adrp x8, .LCPI1_0
-; CHECK-NEXT:    mov v0.d[1], v1.d[0]
-; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI1_0]
-; CHECK-NEXT:    tbl v0.16b, { v0.16b }, v1.16b
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: interleave2_v8f16:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-SD-NEXT:    adrp x8, .LCPI1_0
+; CHECK-SD-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-SD-NEXT:    ldr q1, [x8, :lo12:.LCPI1_0]
+; CHECK-SD-NEXT:    tbl v0.16b, { v0.16b }, v1.16b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: interleave2_v8f16:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-GI-NEXT:    zip1 v0.8h, v0.8h, v1.8h
+; CHECK-GI-NEXT:    ret
   %retval = call <8 x half> @llvm.experimental.vector.interleave2.v8f16(<4 x half> %vec0, <4 x half> %vec1)
   ret <8 x half> %retval
 }
@@ -36,14 +44,21 @@ define <16 x half> @interleave2_v16f16(<8 x half> %vec0, <8 x half> %vec1) {
 }
 
 define <4 x float> @interleave2_v4f32(<2 x float> %vec0, <2 x float> %vec1) {
-; CHECK-LABEL: interleave2_v4f32:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
-; CHECK-NEXT:    mov v0.d[1], v1.d[0]
-; CHECK-NEXT:    rev64 v1.4s, v0.4s
-; CHECK-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: interleave2_v4f32:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-SD-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-SD-NEXT:    rev64 v1.4s, v0.4s
+; CHECK-SD-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: interleave2_v4f32:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-GI-NEXT:    zip1 v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT:    ret
   %retval = call <4 x float> @llvm.experimental.vector.interleave2.v4f32(<2 x float> %vec0, <2 x float> %vec1)
   ret <4 x float> %retval
 }
diff --git a/llvm/test/CodeGen/AArch64/fp-intrinsics.ll b/llvm/test/CodeGen/AArch64/fp-intrinsics.ll
index f80a8df18a03a2..79502121f3689b 100644
--- a/llvm/test/CodeGen/AArch64/fp-intrinsics.ll
+++ b/llvm/test/CodeGen/AArch64/fp-intrinsics.ll
@@ -1477,6 +1477,61 @@ define fp128 @fpext_f128_f64(double %x) #0 {
   ret fp128 %val
 }
 
+; CHECK-LABEL: sin_v1f64:
+; CHECK: bl sin
+define <1 x double> @sin_v1f64(<1 x double> %x, <1 x double> %y) {
+  %val = call <1 x double> @llvm.experimental.constrained.sin.v1f64(<1 x double> %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret <1 x double> %val
+}
+
+; CHECK-LABEL: cos_v1f64:
+; CHECK: bl cos
+define <1 x double> @cos_v1f64(<1 x double> %x, <1 x double> %y) {
+  %val = call <1 x double> @llvm.experimental.constrained.cos.v1f64(<1 x double> %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret <1 x double> %val
+}
+
+; CHECK-LABEL: pow_v1f64:
+; CHECK: bl pow
+define <1 x double> @pow_v1f64(<1 x double> %x, <1 x double> %y) {
+  %val = call <1 x double> @llvm.experimental.constrained.pow.v1f64(<1 x double> %x, <1 x double> %y, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret <1 x double> %val
+}
+
+; CHECK-LABEL: log_v1f64:
+; CHECK: bl log
+define <1 x double> @log_v1f64(<1 x double> %x, <1 x double> %y) {
+  %val = call <1 x double> @llvm.experimental.constrained.log.v1f64(<1 x double> %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret <1 x double> %val
+}
+
+; CHECK-LABEL: log2_v1f64:
+; CHECK: bl log2
+define <1 x double> @log2_v1f64(<1 x double> %x, <1 x double> %y) {
+  %val = call <1 x double> @llvm.experimental.constrained.log2.v1f64(<1 x double> %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret <1 x double> %val
+}
+
+; CHECK-LABEL: log10_v1f64:
+; CHECK: bl log10
+define <1 x double> @log10_v1f64(<1 x double> %x, <1 x double> %y) {
+  %val = call <1 x double> @llvm.experimental.constrained.log10.v1f64(<1 x double> %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret <1 x double> %val
+}
+
+; CHECK-LABEL: exp_v1f64:
+; CHECK: bl exp
+define <1 x double> @exp_v1f64(<1 x double> %x, <1 x double> %y) {
+  %val = call <1 x double> @llvm.experimental.constrained.exp.v1f64(<1 x double> %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret <1 x double> %val
+}
+
+; CHECK-LABEL: exp2_v1f64:
+; CHECK: bl exp2
+define <1 x double> @exp2_v1f64(<1 x double> %x, <1 x double> %y) {
+  %val = call <1 x double> @llvm.experimental.constrained.exp2.v1f64(<1 x double> %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret <1 x double> %val
+}
 
 attributes #0 = { strictfp }
 
diff --git a/llvm/test/CodeGen/AArch64/fpow.ll b/llvm/test/CodeGen/AArch64/fpow.ll
index c2ad1aafd65fc4..8d40121ad4543f 100644
--- a/llvm/test/CodeGen/AArch64/fpow.ll
+++ b/llvm/test/CodeGen/AArch64/fpow.ll
@@ -37,6 +37,21 @@ entry:
   ret half %c
 }
 
+define <1 x double> @pow_v1f64(<1 x double> %x) {
+; CHECK-LABEL: pow_v1f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w30, -16
+; CHECK-NEXT:    adrp x8, .LCPI3_0
+; CHECK-NEXT:    ldr d1, [x8, :lo12:.LCPI3_0]
+; CHECK-NEXT:    bl pow
+; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %c = call <1 x double> @llvm.pow.v1f64(<1 x double> %x, <1 x double> <double 3.140000e+00>)
+  ret <1 x double> %c
+}
+
 define <2 x double> @pow_v2f64(<2 x double> %a, <2 x double> %b) {
 ; CHECK-SD-LABEL: pow_v2f64:
 ; CHECK-SD:       // %bb.0: // %entry
diff --git a/llvm/test/CodeGen/AArch64/fsincos.ll b/llvm/test/CodeGen/AArch64/fsincos.ll
index 2ab1610edcc7f1..0b34f9570fa77b 100644
--- a/llvm/test/CodeGen/AArch64/fsincos.ll
+++ b/llvm/test/CodeGen/AArch64/fsincos.ll
@@ -36,6 +36,19 @@ entry:
   ret half %c
 }
 
+define <1 x double> @sin_v1f64(<1 x double> %x) {
+; CHECK-LABEL: sin_v1f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w30, -16
+; CHECK-NEXT:    bl sin
+; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %c = call <1 x double> @llvm.sin.v1f64(<1 x double> %x)
+  ret <1 x double> %c
+}
+
 define <2 x double> @sin_v2f64(<2 x double> %a) {
 ; CHECK-SD-LABEL: sin_v2f64:
 ; CHECK-SD:       // %bb.0: // %entry
@@ -1293,6 +1306,19 @@ entry:
   ret half %c
 }
 
+define <1 x double> @cos_v1f64(<1 x double> %x) {
+; CHECK-LABEL: cos_v1f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w30, -16
+; CHECK-NEXT:    bl cos
+; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %c = call <1 x double> @llvm.cos.v1f64(<1 x double> %x)
+  ret <1 x double> %c
+}
+
 define <2 x double> @cos_v2f64(<2 x double> %a) {
 ; CHECK-SD-LABEL: cos_v2f64:
 ; CHECK-SD:       // %bb.0: // %entry
diff --git a/llvm/test/CodeGen/AArch64/hadd-combine.ll b/llvm/test/CodeGen/AArch64/hadd-combine.ll
index 2269d75cdbb9ed..e12502980790da 100644
--- a/llvm/test/CodeGen/AArch64/hadd-combine.ll
+++ b/llvm/test/CodeGen/AArch64/hadd-combine.ll
@@ -329,9 +329,17 @@ define <8 x i16> @hadds_i_undef(<8 x i16> %t, <8 x i16> %src1) {
   ret <8 x i16> %result
 }
 
-
-
-
+define <8 x i16> @sub_fixedwidth_v4i32(<8 x i16> %a0, <8 x i16> %a1)  {
+; CHECK-LABEL: sub_fixedwidth_v4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    urhadd v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    ret
+  %or = or <8 x i16> %a0, %a1
+  %xor = xor <8 x i16> %a0, %a1
+  %srl = lshr <8 x i16> %xor, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %res = sub <8 x i16> %or, %srl
+  ret <8 x i16> %res
+}
 
 define <8 x i16> @rhaddu_base(<8 x i16> %src1, <8 x i16> %src2) {
 ; CHECK-LABEL: rhaddu_base:
@@ -859,6 +867,18 @@ define <4 x i32> @urhadd_v4i32(<4 x i32> %x) {
   ret <4 x i32> %r
 }
 
+define <8 x i16> @uhadd_fixedwidth_v4i32(<8 x i16> %a0, <8 x i16> %a1)  {
+; CHECK-LABEL: uhadd_fixedwidth_v4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uhadd v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    ret
+  %and = and <8 x i16> %a0, %a1
+  %xor = xor <8 x i16> %a0, %a1
+  %srl = lshr <8 x i16> %xor, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %res = add <8 x i16> %and, %srl
+  ret <8 x i16> %res
+}
+
 declare <8 x i8> @llvm.aarch64.neon.shadd.v8i8(<8 x i8>, <8 x i8>)
 declare <4 x i16> @llvm.aarch64.neon.shadd.v4i16(<4 x i16>, <4 x i16>)
 declare <2 x i32> @llvm.aarch64.neon.shadd.v2i32(<2 x i32>, <2 x i32>)
diff --git a/llvm/test/CodeGen/AArch64/isinf.ll b/llvm/test/CodeGen/AArch64/isinf.ll
index 458bd7eeba16cf..834417b98743a8 100644
--- a/llvm/test/CodeGen/AArch64/isinf.ll
+++ b/llvm/test/CodeGen/AArch64/isinf.ll
@@ -58,22 +58,14 @@ define i32 @replace_isinf_call_f64(double %x) {
 define i32 @replace_isinf_call_f128(fp128 %x) {
 ; CHECK-LABEL: replace_isinf_call_f128:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #32
-; CHECK-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 32
-; CHECK-NEXT:    .cfi_offset w30, -16
-; CHECK-NEXT:    str q0, [sp]
-; CHECK-NEXT:    ldrb w8, [sp, #15]
-; CHECK-NEXT:    and w8, w8, #0x7f
-; CHECK-NEXT:    strb w8, [sp, #15]
-; CHECK-NEXT:    adrp x8, .LCPI3_0
-; CHECK-NEXT:    ldr q0, [sp]
-; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI3_0]
-; CHECK-NEXT:    bl __eqtf2
-; CHECK-NEXT:    cmp w0, #0
-; CHECK-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT:    str q0, [sp, #-16]!
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    ldp x9, x8, [sp], #16
+; CHECK-NEXT:    and x8, x8, #0x7fffffffffffffff
+; CHECK-NEXT:    eor x8, x8, #0x7fff000000000000
+; CHECK-NEXT:    orr x8, x9, x8
+; CHECK-NEXT:    cmp x8, #0
 ; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    add sp, sp, #32
 ; CHECK-NEXT:    ret
   %abs = tail call fp128 @llvm.fabs.f128(fp128 %x)
   %cmpinf = fcmp oeq fp128 %abs, 0xL00000000000000007FFF000000000000
diff --git a/llvm/test/CodeGen/AArch64/itofp.ll b/llvm/test/CodeGen/AArch64/itofp.ll
index 2164c2aad20111..ceac37f91238a8 100644
--- a/llvm/test/CodeGen/AArch64/itofp.ll
+++ b/llvm/test/CodeGen/AArch64/itofp.ll
@@ -5521,7 +5521,8 @@ define <2 x half> @stofp_v2i8_v2f16(<2 x i8> %a) {
 ; CHECK-GI-FP16:       // %bb.0: // %entry
 ; CHECK-GI-FP16-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-GI-FP16-NEXT:    mov s1, v0.s[1]
-; CHECK-GI-FP16-NEXT:    mov v0.h[1], v1.h[0]
+; CHECK-GI-FP16-NEXT:    mov v0.s[1], v1.s[0]
+; CHECK-GI-FP16-NEXT:    xtn v0.4h, v0.4s
 ; CHECK-GI-FP16-NEXT:    shl v0.4h, v0.4h, #8
 ; CHECK-GI-FP16-NEXT:    sshr v0.4h, v0.4h, #8
 ; CHECK-GI-FP16-NEXT:    mov h1, v0.h[1]
@@ -5580,12 +5581,13 @@ define <2 x half> @utofp_v2i8_v2f16(<2 x i8> %a) {
 ;
 ; CHECK-GI-FP16-LABEL: utofp_v2i8_v2f16:
 ; CHECK-GI-FP16:       // %bb.0: // %entry
-; CHECK-GI-FP16-NEXT:    movi d1, #0x0000ff000000ff
-; CHECK-GI-FP16-NEXT:    and v0.8b, v0.8b, v1.8b
-; CHECK-GI-FP16-NEXT:    mov s1, v0.s[1]
-; CHECK-GI-FP16-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-FP16-NEXT:    ucvtf v0.4h, v0.4h
-; CHECK-GI-FP16-NEXT:    mov h1, v0.h[1]
+; CHECK-GI-FP16-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-FP16-NEXT:    mov w8, v0.s[1]
+; CHECK-GI-FP16-NEXT:    fmov w9, s0
+; CHECK-GI-FP16-NEXT:    and w9, w9, #0xff
+; CHECK-GI-FP16-NEXT:    and w8, w8, #0xff
+; CHECK-GI-FP16-NEXT:    ucvtf h0, w9
+; CHECK-GI-FP16-NEXT:    ucvtf h1, w8
 ; CHECK-GI-FP16-NEXT:    mov v0.h[1], v1.h[0]
 ; CHECK-GI-FP16-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-FP16-NEXT:    ret
diff --git a/llvm/test/CodeGen/AArch64/llvm.exp10.ll b/llvm/test/CodeGen/AArch64/llvm.exp10.ll
index 56f4272c4363c8..51d17ad0644f15 100644
--- a/llvm/test/CodeGen/AArch64/llvm.exp10.ll
+++ b/llvm/test/CodeGen/AArch64/llvm.exp10.ll
@@ -532,11 +532,18 @@ define double @exp10_f64(double %x) {
   ret double %r
 }
 
-; FIXME: Broken
-; define <1 x double> @exp10_v1f64(<1 x double> %x) {
-;   %r = call <1 x double> @llvm.exp10.v1f64(<1 x double> %x)
-;   ret <1 x double> %r
-; }
+define <1 x double> @exp10_v1f64(<1 x double> %x) {
+; CHECK-LABEL: exp10_v1f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w30, -16
+; CHECK-NEXT:    bl exp10
+; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %r = call <1 x double> @llvm.exp10.v1f64(<1 x double> %x)
+  ret <1 x double> %r
+}
 
 define <2 x double> @exp10_v2f64(<2 x double> %x) {
 ; SDAG-LABEL: exp10_v2f64:
diff --git a/llvm/test/CodeGen/AArch64/sext.ll b/llvm/test/CodeGen/AArch64/sext.ll
index 61f04fbf0484f7..3e0d5dd875097f 100644
--- a/llvm/test/CodeGen/AArch64/sext.ll
+++ b/llvm/test/CodeGen/AArch64/sext.ll
@@ -280,13 +280,12 @@ define <3 x i64> @sext_v3i8_v3i64(<3 x i8> %a) {
 ;
 ; CHECK-GI-LABEL: sext_v3i8_v3i64:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    // kill: def $w0 killed $w0 def $x0
-; CHECK-GI-NEXT:    fmov d0, x0
-; CHECK-GI-NEXT:    // kill: def $w1 killed $w1 def $x1
+; CHECK-GI-NEXT:    fmov s0, w0
 ; CHECK-GI-NEXT:    // kill: def $w2 killed $w2 def $x2
 ; CHECK-GI-NEXT:    sxtb x8, w2
 ; CHECK-GI-NEXT:    fmov d2, x8
-; CHECK-GI-NEXT:    mov v0.d[1], x1
+; CHECK-GI-NEXT:    mov v0.s[1], w1
+; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
 ; CHECK-GI-NEXT:    shl v0.2d, v0.2d, #56
 ; CHECK-GI-NEXT:    sshr v0.2d, v0.2d, #56
 ; CHECK-GI-NEXT:    mov d1, v0.d[1]
@@ -444,13 +443,12 @@ define <3 x i64> @sext_v3i10_v3i64(<3 x i10> %a) {
 ;
 ; CHECK-GI-LABEL: sext_v3i10_v3i64:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    // kill: def $w0 killed $w0 def $x0
-; CHECK-GI-NEXT:    fmov d0, x0
-; CHECK-GI-NEXT:    // kill: def $w1 killed $w1 def $x1
+; CHECK-GI-NEXT:    fmov s0, w0
 ; CHECK-GI-NEXT:    // kill: def $w2 killed $w2 def $x2
 ; CHECK-GI-NEXT:    sbfx x8, x2, #0, #10
 ; CHECK-GI-NEXT:    fmov d2, x8
-; CHECK-GI-NEXT:    mov v0.d[1], x1
+; CHECK-GI-NEXT:    mov v0.s[1], w1
+; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
 ; CHECK-GI-NEXT:    shl v0.2d, v0.2d, #54
 ; CHECK-GI-NEXT:    sshr v0.2d, v0.2d, #54
 ; CHECK-GI-NEXT:    mov d1, v0.d[1]
diff --git a/llvm/test/CodeGen/AArch64/sme-call-streaming-compatible-to-normal-fn-wihout-sme-attr.ll b/llvm/test/CodeGen/AArch64/sme-call-streaming-compatible-to-normal-fn-wihout-sme-attr.ll
index 3fa1ee5b9b0114..dba3227459b906 100644
--- a/llvm/test/CodeGen/AArch64/sme-call-streaming-compatible-to-normal-fn-wihout-sme-attr.ll
+++ b/llvm/test/CodeGen/AArch64/sme-call-streaming-compatible-to-normal-fn-wihout-sme-attr.ll
@@ -38,4 +38,43 @@ define void @streaming_compatible() #0 {
 
 declare void @non_streaming()
 
+
+; Verify that COALESCER_BARRIER is also supported without +sme.
+
+define void @streaming_compatible_arg(float %f) #0 {
+; CHECK-LABEL: streaming_compatible_arg:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub sp, sp, #96
+; CHECK-NEXT:    stp d15, d14, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d13, d12, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d11, d10, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d9, d8, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT:    str s0, [sp, #12] // 4-byte Folded Spill
+; CHECK-NEXT:    bl __arm_sme_state
+; CHECK-NEXT:    ldr s0, [sp, #12] // 4-byte Folded Reload
+; CHECK-NEXT:    and x19, x0, #0x1
+; CHECK-NEXT:    str s0, [sp, #12] // 4-byte Folded Spill
+; CHECK-NEXT:    tbz w19, #0, .LBB1_2
+; CHECK-NEXT:  // %bb.1:
+; CHECK-NEXT:    smstop sm
+; CHECK-NEXT:  .LBB1_2:
+; CHECK-NEXT:    ldr s0, [sp, #12] // 4-byte Folded Reload
+; CHECK-NEXT:    bl non_streaming
+; CHECK-NEXT:    tbz w19, #0, .LBB1_4
+; CHECK-NEXT:  // %bb.3:
+; CHECK-NEXT:    smstart sm
+; CHECK-NEXT:  .LBB1_4:
+; CHECK-NEXT:    ldp x30, x19, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d9, d8, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d13, d12, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d15, d14, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    add sp, sp, #96
+; CHECK-NEXT:    ret
+  call void @non_streaming(float %f)
+  ret void
+}
+
+
 attributes #0 = { nounwind "aarch64_pstate_sm_compatible" }
diff --git a/llvm/test/CodeGen/AArch64/sme-machine-licm-vg.mir b/llvm/test/CodeGen/AArch64/sme-machine-licm-vg.mir
new file mode 100644
index 00000000000000..e6cce9af0daf0e
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sme-machine-licm-vg.mir
@@ -0,0 +1,64 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
+# RUN: llc -mtriple=aarch64--linux-gnu -run-pass=early-machinelicm %s -verify-machineinstrs -o - | FileCheck %s
+---
+name:            test_should_hoist_pfalse
+tracksRegLiveness: true
+body:             |
+  ; CHECK-LABEL: name: test_should_hoist_pfalse
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT:   liveins: $x0, $x1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:gpr64 = COPY $x1
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:gpr64 = COPY $x0
+  ; CHECK-NEXT:   MSRpstatesvcrImm1 1, 1, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit $vg, implicit-def $vg
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:gpr64all = COPY [[COPY1]]
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:gpr64all = COPY [[COPY]]
+  ; CHECK-NEXT:   [[PFALSE:%[0-9]+]]:ppr = PFALSE implicit $vg
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[PHI:%[0-9]+]]:gpr64common = PHI [[COPY2]], %bb.0, %5, %bb.1
+  ; CHECK-NEXT:   [[PHI1:%[0-9]+]]:gpr64sp = PHI [[COPY3]], %bb.0, %7, %bb.1
+  ; CHECK-NEXT:   STR_PXI [[PFALSE]], [[PHI]], 0
+  ; CHECK-NEXT:   [[SUBSXri:%[0-9]+]]:gpr64 = SUBSXri [[PHI1]], 1, 0, implicit-def $nzcv
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:gpr64all = COPY [[SUBSXri]]
+  ; CHECK-NEXT:   [[INCD_XPiI:%[0-9]+]]:gpr64 = INCD_XPiI [[PHI]], 31, 1
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:gpr64all = COPY [[INCD_XPiI]]
+  ; CHECK-NEXT:   Bcc 1, %bb.1, implicit $nzcv
+  ; CHECK-NEXT:   B %bb.2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   MSRpstatesvcrImm1 1, 0, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit $vg, implicit-def $vg
+  ; CHECK-NEXT:   RET_ReallyLR
+  bb.0:
+    successors: %bb.1
+    liveins: $x0, $x1
+
+    %5:gpr64 = COPY $x1
+    %4:gpr64 = COPY $x0
+    MSRpstatesvcrImm1 1, 1, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit $vg, implicit-def $vg
+    %6:gpr64all = COPY %4
+    %7:gpr64all = COPY %5
+
+  bb.1:
+    successors: %bb.2, %bb.1
+
+    %0:gpr64common = PHI %6, %bb.0, %3, %bb.1
+    %1:gpr64sp = PHI %7, %bb.0, %2, %bb.1
+    %8:ppr = PFALSE implicit $vg
+    STR_PXI killed %8, %0, 0
+    %9:gpr64 = SUBSXri %1, 1, 0, implicit-def $nzcv
+    %2:gpr64all = COPY %9
+    %10:gpr64 = INCD_XPiI %0, 31, 1
+    %3:gpr64all = COPY %10
+
+
+    Bcc 1, %bb.1, implicit $nzcv
+    B %bb.2
+
+  bb.2:
+    MSRpstatesvcrImm1 1, 0, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit $vg, implicit-def $vg
+    RET_ReallyLR
+...
diff --git a/llvm/test/CodeGen/AArch64/soft-float-abi.ll b/llvm/test/CodeGen/AArch64/soft-float-abi.ll
new file mode 100644
index 00000000000000..291c3875c2488d
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/soft-float-abi.ll
@@ -0,0 +1,161 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc --mtriple aarch64-none-eabi < %s -mattr=-fp-armv8 | FileCheck %s
+
+; See also clang/test/CodeGen/aarch64-soft-float-abi.c, which tests the clang
+; parts of the soft-float ABI.
+
+; FP types up to 64-bit are passed in a general purpose register.
+define half @test0(half %a, half %b)  {
+; CHECK-LABEL: test0:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov w0, w1
+; CHECK-NEXT:    ret
+entry:
+  ret half %b
+}
+
+define bfloat @test1(i32 %a, bfloat %b) {
+; CHECK-LABEL: test1:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov w0, w1
+; CHECK-NEXT:    ret
+entry:
+  ret bfloat %b
+}
+
+define float @test2(i64 %a, float %b) {
+; CHECK-LABEL: test2:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov w0, w1
+; CHECK-NEXT:    ret
+entry:
+  ret float %b
+}
+
+define double @test3(half %a, double %b) {
+; CHECK-LABEL: test3:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov x0, x1
+; CHECK-NEXT:    ret
+entry:
+  ret double %b
+}
+
+; fp128 is passed in a pair of GPRs.
+define fp128 @test4(fp128 %a, fp128 %b) {
+; CHECK-LABEL: test4:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov x1, x3
+; CHECK-NEXT:    mov x0, x2
+; CHECK-NEXT:    ret
+entry:
+  ret fp128 %b
+}
+
+; fp128 is passed in an aligned pair of GPRs, leaving one register unused is
+; necessary.
+define fp128 @test5(float %a, fp128 %b) {
+; CHECK-LABEL: test5:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov x1, x3
+; CHECK-NEXT:    mov x0, x2
+; CHECK-NEXT:    ret
+entry:
+  ret fp128 %b
+}
+
+; If the alignment of an fp128 leaves a register unused, it remains unused even
+; if a later argument could fit in it.
+define i64 @test6(i64 %a, fp128 %b, i64 %c) {
+; CHECK-LABEL: test6:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov x0, x4
+; CHECK-NEXT:    ret
+entry:
+  ret i64 %c
+}
+
+; HFAs are all bit-casted to integer types in the frontend when using the
+; soft-float ABI, so they get passed in the same way as non-homeogeneous
+; aggregates. The IR is identical to the equivalent integer types, so nothing
+; to test here.
+
+; The PCS for vector and HVA types is not defined by the soft-float ABI because
+; these types are only defined by the ACLE when vector hardware is available,
+; so nothing to test here.
+
+; The front-end generates IR for va_arg which always reads from the integer
+; register save area, and never the floating-point register save area. The
+; layout of the va_list type remains the same, the floating-point related
+; fields are unused. The only change needed in the backend is  in va_start, to
+; not attempt to save the floating-point registers or set the FP fields in the
+; va_list.
+%struct.__va_list = type { ptr, ptr, ptr, i32, i32 }
+declare void @llvm.va_start(ptr)
+define double @test20(i32 %a, ...) {
+; CHECK-LABEL: test20:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sub sp, sp, #96
+; CHECK-NEXT:    .cfi_def_cfa_offset 96
+; CHECK-NEXT:    mov w8, #-56 // =0xffffffc8
+; CHECK-NEXT:    add x10, sp, #8
+; CHECK-NEXT:    add x9, sp, #96
+; CHECK-NEXT:    str x8, [sp, #88]
+; CHECK-NEXT:    add x10, x10, #56
+; CHECK-NEXT:    ldrsw x8, [sp, #88]
+; CHECK-NEXT:    stp x1, x2, [sp, #8]
+; CHECK-NEXT:    stp x3, x4, [sp, #24]
+; CHECK-NEXT:    stp x5, x6, [sp, #40]
+; CHECK-NEXT:    stp x7, x9, [sp, #56]
+; CHECK-NEXT:    str x10, [sp, #72]
+; CHECK-NEXT:    tbz w8, #31, .LBB7_3
+; CHECK-NEXT:  // %bb.1: // %vaarg.maybe_reg
+; CHECK-NEXT:    add w9, w8, #8
+; CHECK-NEXT:    cmn w8, #8
+; CHECK-NEXT:    str w9, [sp, #88]
+; CHECK-NEXT:    b.gt .LBB7_3
+; CHECK-NEXT:  // %bb.2: // %vaarg.in_reg
+; CHECK-NEXT:    ldr x9, [sp, #72]
+; CHECK-NEXT:    add x8, x9, x8
+; CHECK-NEXT:    b .LBB7_4
+; CHECK-NEXT:  .LBB7_3: // %vaarg.on_stack
+; CHECK-NEXT:    ldr x8, [sp, #64]
+; CHECK-NEXT:    add x9, x8, #8
+; CHECK-NEXT:    str x9, [sp, #64]
+; CHECK-NEXT:  .LBB7_4: // %vaarg.end
+; CHECK-NEXT:    ldr x0, [x8]
+; CHECK-NEXT:    add sp, sp, #96
+; CHECK-NEXT:    ret
+entry:
+  %vl = alloca %struct.__va_list, align 8
+  call void @llvm.va_start(ptr nonnull %vl)
+  %gr_offs_p = getelementptr inbounds %struct.__va_list, ptr %vl, i64 0, i32 3
+  %gr_offs = load i32, ptr %gr_offs_p, align 8
+  %0 = icmp sgt i32 %gr_offs, -1
+  br i1 %0, label %vaarg.on_stack, label %vaarg.maybe_reg
+
+vaarg.maybe_reg:                                  ; preds = %entry
+  %new_reg_offs = add nsw i32 %gr_offs, 8
+  store i32 %new_reg_offs, ptr %gr_offs_p, align 8
+  %inreg = icmp slt i32 %gr_offs, -7
+  br i1 %inreg, label %vaarg.in_reg, label %vaarg.on_stack
+
+vaarg.in_reg:                                     ; preds = %vaarg.maybe_reg
+  %reg_top_p = getelementptr inbounds %struct.__va_list, ptr %vl, i64 0, i32 1
+  %reg_top = load ptr, ptr %reg_top_p, align 8
+  %1 = sext i32 %gr_offs to i64
+  %2 = getelementptr inbounds i8, ptr %reg_top, i64 %1
+  br label %vaarg.end
+
+vaarg.on_stack:                                   ; preds = %vaarg.maybe_reg, %entry
+  %stack = load ptr, ptr %vl, align 8
+  %new_stack = getelementptr inbounds i8, ptr %stack, i64 8
+  store ptr %new_stack, ptr %vl, align 8
+  br label %vaarg.end
+
+vaarg.end:                                        ; preds = %vaarg.on_stack, %vaarg.in_reg
+  %vaargs.addr = phi ptr [ %2, %vaarg.in_reg ], [ %stack, %vaarg.on_stack ]
+  %3 = load double, ptr %vaargs.addr, align 8
+  ret double %3
+}
+
diff --git a/llvm/test/CodeGen/AArch64/stack-probing-no-scratch-reg.mir b/llvm/test/CodeGen/AArch64/stack-probing-no-scratch-reg.mir
index f2d79bd7206908..a9c9b5ff60e451 100644
--- a/llvm/test/CodeGen/AArch64/stack-probing-no-scratch-reg.mir
+++ b/llvm/test/CodeGen/AArch64/stack-probing-no-scratch-reg.mir
@@ -29,6 +29,7 @@ tracksRegLiveness: true
 liveins:
   - { reg: '$w0', virtual-reg: '' }
 frameInfo:
+  adjustsStack:    true
   localFrameSize:  150000
 stack:
   - { id: 0, name: a, type: default, offset: 0, size: 150000, alignment: 8,
diff --git a/llvm/test/CodeGen/AArch64/stack-probing-shrink-wrap.mir b/llvm/test/CodeGen/AArch64/stack-probing-shrink-wrap.mir
index 83aa90d389a4a2..985ec352139708 100644
--- a/llvm/test/CodeGen/AArch64/stack-probing-shrink-wrap.mir
+++ b/llvm/test/CodeGen/AArch64/stack-probing-shrink-wrap.mir
@@ -31,6 +31,7 @@ tracksRegLiveness: true
 liveins:
   - { reg: '$w0', virtual-reg: '' }
 frameInfo:
+  adjustsStack:    true
   localFrameSize:  150000
 stack:
   - { id: 0, name: a, type: default, offset: 0, size: 150000, alignment: 8,
diff --git a/llvm/test/CodeGen/AArch64/stack-tagging-initializer-merge.ll b/llvm/test/CodeGen/AArch64/stack-tagging-initializer-merge.ll
index d8969fc9bebdbf..22d177ca3267e9 100644
--- a/llvm/test/CodeGen/AArch64/stack-tagging-initializer-merge.ll
+++ b/llvm/test/CodeGen/AArch64/stack-tagging-initializer-merge.ll
@@ -20,10 +20,10 @@ entry:
 ; CHECK-LABEL: define void @OneVarNoInit(
 ; CHECK-DAG:  [[X:%.*]] = alloca { i32, [12 x i8] }, align 16
 ; CHECK-DAG:  [[TX:%.*]] = call ptr @llvm.aarch64.tagp.{{.*}}(ptr [[X]], {{.*}}, i64 0)
-; CHECK-DAG:  call void @llvm.lifetime.start.p0(i64 4, ptr nonnull [[TX]])
+; CHECK-DAG:  call void @llvm.lifetime.start.p0(i64 4, ptr nonnull [[X]])
 ; CHECK-DAG:  call void @llvm.aarch64.settag(ptr [[TX]], i64 16)
 ; CHECK-DAG:  call void @use(ptr nonnull [[TX]])
-; CHECK-DAG:  call void @llvm.lifetime.end.p0(i64 4, ptr nonnull [[TX]])
+; CHECK-DAG:  call void @llvm.lifetime.end.p0(i64 4, ptr nonnull [[X]])
 
 define void @OneVarInitConst() sanitize_memtag {
 entry:
diff --git a/llvm/test/CodeGen/AArch64/stack-tagging-stack-coloring.ll b/llvm/test/CodeGen/AArch64/stack-tagging-stack-coloring.ll
index 6eb72013fb0ed0..81349620fb7725 100644
--- a/llvm/test/CodeGen/AArch64/stack-tagging-stack-coloring.ll
+++ b/llvm/test/CodeGen/AArch64/stack-tagging-stack-coloring.ll
@@ -1,20 +1,20 @@
 ; Test that storage for allocas with disjoint lifetimes is reused with stack
 ; tagging.
 
-; RUN: opt -S -aarch64-stack-tagging %s -o - | \
-; RUN:   llc -no-stack-coloring=false -o - | \
+; RUN: opt -S -aarch64-stack-tagging -stack-tagging-use-stack-safety=0 %s -o - | \
+; RUN:   llc --mattr=+mte -no-stack-coloring=false -o - | \
 ; RUN:   FileCheck %s --check-prefix=COLOR
-; RUN: opt -S -aarch64-stack-tagging %s -o - | \
-; RUN:   llc -no-stack-coloring=true -o - | \
+; RUN: opt -S -aarch64-stack-tagging %s -stack-tagging-use-stack-safety=0 -o - | \
+; RUN:   llc --mattr=+mte -no-stack-coloring=true -o - | \
 ; RUN:   FileCheck %s --check-prefix=NOCOLOR
 
 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
-target triple = "aarch64-unknown-linux-android29"
+target triple = "aarch64"
 
-; COLOR: sub	sp, sp, #192
-; NOCOLOR: sub	sp, sp, #320
+; COLOR: sub	sp, sp, #208
+; NOCOLOR: sub	sp, sp, #336
 
-define i32 @myCall_w2(i32 %in) sanitize_hwaddress {
+define i32 @myCall_w2(i32 %in) sanitize_memtag {
 entry:
   %a = alloca [17 x ptr], align 8
   %a2 = alloca [16 x ptr], align 8
diff --git a/llvm/test/CodeGen/AArch64/stack-tagging-untag-placement.ll b/llvm/test/CodeGen/AArch64/stack-tagging-untag-placement.ll
index 06f8cd5241ebfb..aa9cccc58712da 100644
--- a/llvm/test/CodeGen/AArch64/stack-tagging-untag-placement.ll
+++ b/llvm/test/CodeGen/AArch64/stack-tagging-untag-placement.ll
@@ -27,7 +27,7 @@ S1:
 ; CHECK: call void @llvm.aarch64.settag(ptr %w, i64 48)
 ; CHECK-NOT: settag{{.*}}%v
   call void @llvm.lifetime.end.p0(i64 48, ptr nonnull %w) #1
-; CHECK: call void @llvm.lifetime.end.p0(i64 48, ptr nonnull %w.tag)
+; CHECK: call void @llvm.lifetime.end.p0(i64 48, ptr nonnull %w)
   %b1 = icmp eq i32 %t1, 0
   br i1 %b1, label %S2, label %S3
 ; CHECK-NOT: settag
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-add.ll b/llvm/test/CodeGen/AArch64/vecreduce-add.ll
index 66b49466cc7361..66ef436f48c637 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-add.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-add.ll
@@ -4,11 +4,6 @@
 ; RUN: llc -mtriple=aarch64-none-linux-gnu -global-isel -global-isel-abort=2 %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-BASE
 ; RUN: llc -mtriple=aarch64-none-linux-gnu -global-isel -global-isel-abort=2 %s -o - -mattr=+dotprod 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-DOT
 
-; CHECK-GI-BASE:        warning: Instruction selection used fallback path for test_udot_v24i8
-; CHECK-GI-BASE-NEXT:   warning: Instruction selection used fallback path for test_udot_v48i8
-; CHECK-GI-BASE-NEXT:   warning: Instruction selection used fallback path for test_sdot_v24i8
-; CHECK-GI-BASE-NEXT:   warning: Instruction selection used fallback path for test_sdot_v48i8
-
 define i32 @addv_v2i32(<2 x i32> %a) {
 ; CHECK-LABEL: addv_v2i32:
 ; CHECK:       // %bb.0: // %entry
@@ -2070,126 +2065,50 @@ define i32 @test_udot_v24i8(ptr %p1, ptr %p2) {
 ; CHECK-GI-BASE:       // %bb.0: // %entry
 ; CHECK-GI-BASE-NEXT:    ldr q0, [x0]
 ; CHECK-GI-BASE-NEXT:    ldr q1, [x1]
-; CHECK-GI-BASE-NEXT:    ldr d4, [x0, #16]
-; CHECK-GI-BASE-NEXT:    ldr d5, [x1, #16]
-; CHECK-GI-BASE-NEXT:    ushll v2.8h, v0.8b, #0
-; CHECK-GI-BASE-NEXT:    ushll v3.8h, v1.8b, #0
+; CHECK-GI-BASE-NEXT:    ldr d2, [x0, #16]
+; CHECK-GI-BASE-NEXT:    ldr d3, [x1, #16]
+; CHECK-GI-BASE-NEXT:    ushll v4.8h, v0.8b, #0
 ; CHECK-GI-BASE-NEXT:    ushll2 v0.8h, v0.16b, #0
+; CHECK-GI-BASE-NEXT:    ushll v5.8h, v1.8b, #0
+; CHECK-GI-BASE-NEXT:    ushll v2.8h, v2.8b, #0
 ; CHECK-GI-BASE-NEXT:    ushll2 v1.8h, v1.16b, #0
-; CHECK-GI-BASE-NEXT:    umull v6.4s, v3.4h, v2.4h
-; CHECK-GI-BASE-NEXT:    umull2 v2.4s, v3.8h, v2.8h
-; CHECK-GI-BASE-NEXT:    ushll v3.8h, v4.8b, #0
-; CHECK-GI-BASE-NEXT:    ushll v4.8h, v5.8b, #0
-; CHECK-GI-BASE-NEXT:    umlal2 v2.4s, v4.8h, v3.8h
-; CHECK-GI-BASE-NEXT:    umlal v6.4s, v4.4h, v3.4h
-; CHECK-GI-BASE-NEXT:    umlal2 v2.4s, v1.8h, v0.8h
-; CHECK-GI-BASE-NEXT:    umlal v6.4s, v1.4h, v0.4h
-; CHECK-GI-BASE-NEXT:    add v0.4s, v6.4s, v2.4s
+; CHECK-GI-BASE-NEXT:    ushll v3.8h, v3.8b, #0
+; CHECK-GI-BASE-NEXT:    umull v6.4s, v5.4h, v4.4h
+; CHECK-GI-BASE-NEXT:    umull2 v4.4s, v5.8h, v4.8h
+; CHECK-GI-BASE-NEXT:    umull2 v5.4s, v1.8h, v0.8h
+; CHECK-GI-BASE-NEXT:    umull v7.4s, v3.4h, v2.4h
+; CHECK-GI-BASE-NEXT:    umull v0.4s, v1.4h, v0.4h
+; CHECK-GI-BASE-NEXT:    umull2 v1.4s, v3.8h, v2.8h
+; CHECK-GI-BASE-NEXT:    addv s2, v6.4s
+; CHECK-GI-BASE-NEXT:    addv s3, v4.4s
+; CHECK-GI-BASE-NEXT:    addv s4, v5.4s
+; CHECK-GI-BASE-NEXT:    addv s5, v7.4s
 ; CHECK-GI-BASE-NEXT:    addv s0, v0.4s
-; CHECK-GI-BASE-NEXT:    fmov w0, s0
+; CHECK-GI-BASE-NEXT:    addv s1, v1.4s
+; CHECK-GI-BASE-NEXT:    fmov w8, s2
+; CHECK-GI-BASE-NEXT:    fmov w9, s3
+; CHECK-GI-BASE-NEXT:    fmov w10, s4
+; CHECK-GI-BASE-NEXT:    fmov w11, s5
+; CHECK-GI-BASE-NEXT:    add w8, w8, w9
+; CHECK-GI-BASE-NEXT:    fmov w9, s0
+; CHECK-GI-BASE-NEXT:    add w10, w10, w11
+; CHECK-GI-BASE-NEXT:    fmov w11, s1
+; CHECK-GI-BASE-NEXT:    add w8, w8, w9
+; CHECK-GI-BASE-NEXT:    add w9, w10, w11
+; CHECK-GI-BASE-NEXT:    add w0, w8, w9
 ; CHECK-GI-BASE-NEXT:    ret
 ;
 ; CHECK-GI-DOT-LABEL: test_udot_v24i8:
 ; CHECK-GI-DOT:       // %bb.0: // %entry
-; CHECK-GI-DOT-NEXT:    ldr b1, [x0]
-; CHECK-GI-DOT-NEXT:    ldr b3, [x0, #1]
 ; CHECK-GI-DOT-NEXT:    movi v0.2d, #0000000000000000
-; CHECK-GI-DOT-NEXT:    ldr b2, [x1]
-; CHECK-GI-DOT-NEXT:    ldr b4, [x1, #1]
-; CHECK-GI-DOT-NEXT:    ldr b5, [x0, #8]
-; CHECK-GI-DOT-NEXT:    mov v1.b[1], v3.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b3, [x0, #2]
-; CHECK-GI-DOT-NEXT:    ldr b6, [x1, #8]
-; CHECK-GI-DOT-NEXT:    mov v2.b[1], v4.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b4, [x1, #2]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #17]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #17]
-; CHECK-GI-DOT-NEXT:    mov v1.b[2], v3.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b3, [x0, #3]
-; CHECK-GI-DOT-NEXT:    mov v2.b[2], v4.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b4, [x1, #3]
-; CHECK-GI-DOT-NEXT:    mov v1.b[3], v3.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b3, [x0, #4]
-; CHECK-GI-DOT-NEXT:    mov v2.b[3], v4.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b4, [x1, #4]
-; CHECK-GI-DOT-NEXT:    mov v1.b[4], v3.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b3, [x0, #5]
-; CHECK-GI-DOT-NEXT:    mov v2.b[4], v4.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b4, [x1, #5]
-; CHECK-GI-DOT-NEXT:    mov v1.b[5], v3.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b3, [x0, #6]
-; CHECK-GI-DOT-NEXT:    mov v2.b[5], v4.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b4, [x1, #6]
-; CHECK-GI-DOT-NEXT:    mov v1.b[6], v3.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b3, [x0, #7]
-; CHECK-GI-DOT-NEXT:    mov v2.b[6], v4.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b4, [x1, #7]
-; CHECK-GI-DOT-NEXT:    mov v1.b[7], v3.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b3, [x0, #16]
-; CHECK-GI-DOT-NEXT:    mov v2.b[7], v4.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b4, [x1, #16]
-; CHECK-GI-DOT-NEXT:    mov v3.b[1], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #18]
-; CHECK-GI-DOT-NEXT:    mov v4.b[1], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #18]
-; CHECK-GI-DOT-NEXT:    mov v1.b[8], v5.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b5, [x0, #9]
-; CHECK-GI-DOT-NEXT:    mov v2.b[8], v6.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b6, [x1, #9]
-; CHECK-GI-DOT-NEXT:    mov v3.b[2], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #19]
-; CHECK-GI-DOT-NEXT:    mov v4.b[2], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #19]
-; CHECK-GI-DOT-NEXT:    mov v1.b[9], v5.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b5, [x0, #10]
-; CHECK-GI-DOT-NEXT:    mov v2.b[9], v6.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b6, [x1, #10]
-; CHECK-GI-DOT-NEXT:    mov v3.b[3], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #20]
-; CHECK-GI-DOT-NEXT:    mov v4.b[3], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #20]
-; CHECK-GI-DOT-NEXT:    mov v1.b[10], v5.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b5, [x0, #11]
-; CHECK-GI-DOT-NEXT:    mov v2.b[10], v6.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b6, [x1, #11]
-; CHECK-GI-DOT-NEXT:    mov v3.b[4], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #21]
-; CHECK-GI-DOT-NEXT:    mov v4.b[4], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #21]
-; CHECK-GI-DOT-NEXT:    mov v1.b[11], v5.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b5, [x0, #12]
-; CHECK-GI-DOT-NEXT:    mov v2.b[11], v6.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b6, [x1, #12]
-; CHECK-GI-DOT-NEXT:    mov v3.b[5], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #22]
-; CHECK-GI-DOT-NEXT:    mov v4.b[5], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #22]
-; CHECK-GI-DOT-NEXT:    mov v1.b[12], v5.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b5, [x0, #13]
-; CHECK-GI-DOT-NEXT:    mov v2.b[12], v6.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b6, [x1, #13]
-; CHECK-GI-DOT-NEXT:    mov v3.b[6], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #23]
-; CHECK-GI-DOT-NEXT:    mov v4.b[6], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #23]
-; CHECK-GI-DOT-NEXT:    mov v1.b[13], v5.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b5, [x0, #14]
-; CHECK-GI-DOT-NEXT:    mov v2.b[13], v6.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b6, [x1, #14]
-; CHECK-GI-DOT-NEXT:    mov v3.b[7], v7.b[0]
-; CHECK-GI-DOT-NEXT:    mov v4.b[7], v16.b[0]
-; CHECK-GI-DOT-NEXT:    mov v1.b[14], v5.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b5, [x0, #15]
-; CHECK-GI-DOT-NEXT:    mov v2.b[14], v6.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b6, [x1, #15]
-; CHECK-GI-DOT-NEXT:    fmov d3, d3
-; CHECK-GI-DOT-NEXT:    fmov d4, d4
-; CHECK-GI-DOT-NEXT:    mov v1.b[15], v5.b[0]
-; CHECK-GI-DOT-NEXT:    movi v5.2d, #0000000000000000
-; CHECK-GI-DOT-NEXT:    mov v2.b[15], v6.b[0]
-; CHECK-GI-DOT-NEXT:    udot v0.4s, v4.16b, v3.16b
-; CHECK-GI-DOT-NEXT:    udot v5.4s, v2.16b, v1.16b
-; CHECK-GI-DOT-NEXT:    add v0.4s, v5.4s, v0.4s
+; CHECK-GI-DOT-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-GI-DOT-NEXT:    ldr q2, [x0]
+; CHECK-GI-DOT-NEXT:    ldr d3, [x0, #16]
+; CHECK-GI-DOT-NEXT:    ldr q4, [x1]
+; CHECK-GI-DOT-NEXT:    ldr d5, [x1, #16]
+; CHECK-GI-DOT-NEXT:    udot v1.4s, v4.16b, v2.16b
+; CHECK-GI-DOT-NEXT:    udot v0.4s, v5.16b, v3.16b
+; CHECK-GI-DOT-NEXT:    add v0.4s, v1.4s, v0.4s
 ; CHECK-GI-DOT-NEXT:    addv s0, v0.4s
 ; CHECK-GI-DOT-NEXT:    fmov w0, s0
 ; CHECK-GI-DOT-NEXT:    ret
@@ -2257,243 +2176,91 @@ define i32 @test_udot_v48i8(ptr %p1, ptr %p2) {
 ;
 ; CHECK-GI-BASE-LABEL: test_udot_v48i8:
 ; CHECK-GI-BASE:       // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT:    ldp q0, q4, [x1]
-; CHECK-GI-BASE-NEXT:    ldr q2, [x0, #32]
-; CHECK-GI-BASE-NEXT:    ldp q1, q3, [x0]
-; CHECK-GI-BASE-NEXT:    ldr q7, [x1, #32]
-; CHECK-GI-BASE-NEXT:    ushll2 v16.8h, v2.16b, #0
-; CHECK-GI-BASE-NEXT:    ushll2 v6.8h, v0.16b, #0
-; CHECK-GI-BASE-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-GI-BASE-NEXT:    ushll2 v17.8h, v7.16b, #0
-; CHECK-GI-BASE-NEXT:    ushll2 v5.8h, v1.16b, #0
-; CHECK-GI-BASE-NEXT:    ushll v1.8h, v1.8b, #0
-; CHECK-GI-BASE-NEXT:    umull2 v18.4s, v6.8h, v5.8h
-; CHECK-GI-BASE-NEXT:    umull v19.4s, v0.4h, v1.4h
-; CHECK-GI-BASE-NEXT:    umull v5.4s, v6.4h, v5.4h
-; CHECK-GI-BASE-NEXT:    umull2 v0.4s, v0.8h, v1.8h
-; CHECK-GI-BASE-NEXT:    ushll v1.8h, v2.8b, #0
-; CHECK-GI-BASE-NEXT:    ushll v2.8h, v7.8b, #0
-; CHECK-GI-BASE-NEXT:    ushll2 v6.8h, v3.16b, #0
-; CHECK-GI-BASE-NEXT:    ushll2 v7.8h, v4.16b, #0
-; CHECK-GI-BASE-NEXT:    umlal2 v18.4s, v17.8h, v16.8h
-; CHECK-GI-BASE-NEXT:    umlal v5.4s, v17.4h, v16.4h
-; CHECK-GI-BASE-NEXT:    umlal v19.4s, v2.4h, v1.4h
-; CHECK-GI-BASE-NEXT:    umlal2 v0.4s, v2.8h, v1.8h
-; CHECK-GI-BASE-NEXT:    ushll v1.8h, v3.8b, #0
-; CHECK-GI-BASE-NEXT:    ushll v2.8h, v4.8b, #0
-; CHECK-GI-BASE-NEXT:    umlal2 v18.4s, v7.8h, v6.8h
-; CHECK-GI-BASE-NEXT:    umlal v5.4s, v7.4h, v6.4h
-; CHECK-GI-BASE-NEXT:    umlal v19.4s, v2.4h, v1.4h
-; CHECK-GI-BASE-NEXT:    umlal2 v0.4s, v2.8h, v1.8h
-; CHECK-GI-BASE-NEXT:    add v1.4s, v19.4s, v5.4s
-; CHECK-GI-BASE-NEXT:    add v0.4s, v0.4s, v18.4s
-; CHECK-GI-BASE-NEXT:    add v0.4s, v1.4s, v0.4s
+; CHECK-GI-BASE-NEXT:    ldp q0, q3, [x1]
+; CHECK-GI-BASE-NEXT:    ldr q6, [x1, #32]
+; CHECK-GI-BASE-NEXT:    ldp q1, q2, [x0]
+; CHECK-GI-BASE-NEXT:    ldr q17, [x0, #32]
+; CHECK-GI-BASE-NEXT:    ushll v4.8h, v0.8b, #0
+; CHECK-GI-BASE-NEXT:    ushll2 v0.8h, v0.16b, #0
+; CHECK-GI-BASE-NEXT:    ushll v7.8h, v3.8b, #0
+; CHECK-GI-BASE-NEXT:    ushll v5.8h, v1.8b, #0
+; CHECK-GI-BASE-NEXT:    ushll2 v1.8h, v1.16b, #0
+; CHECK-GI-BASE-NEXT:    ushll v16.8h, v2.8b, #0
+; CHECK-GI-BASE-NEXT:    ushll2 v3.8h, v3.16b, #0
+; CHECK-GI-BASE-NEXT:    ushll2 v2.8h, v2.16b, #0
+; CHECK-GI-BASE-NEXT:    umull v18.4s, v4.4h, v5.4h
+; CHECK-GI-BASE-NEXT:    umull2 v4.4s, v4.8h, v5.8h
+; CHECK-GI-BASE-NEXT:    umull2 v19.4s, v0.8h, v1.8h
+; CHECK-GI-BASE-NEXT:    umull v20.4s, v7.4h, v16.4h
+; CHECK-GI-BASE-NEXT:    umull v0.4s, v0.4h, v1.4h
+; CHECK-GI-BASE-NEXT:    ushll v5.8h, v6.8b, #0
+; CHECK-GI-BASE-NEXT:    ushll v1.8h, v17.8b, #0
+; CHECK-GI-BASE-NEXT:    umull2 v7.4s, v7.8h, v16.8h
+; CHECK-GI-BASE-NEXT:    ushll2 v6.8h, v6.16b, #0
+; CHECK-GI-BASE-NEXT:    ushll2 v17.8h, v17.16b, #0
+; CHECK-GI-BASE-NEXT:    addv s16, v18.4s
+; CHECK-GI-BASE-NEXT:    addv s4, v4.4s
+; CHECK-GI-BASE-NEXT:    umull v18.4s, v3.4h, v2.4h
+; CHECK-GI-BASE-NEXT:    umull2 v2.4s, v3.8h, v2.8h
+; CHECK-GI-BASE-NEXT:    addv s3, v19.4s
+; CHECK-GI-BASE-NEXT:    umull v19.4s, v5.4h, v1.4h
+; CHECK-GI-BASE-NEXT:    umull2 v1.4s, v5.8h, v1.8h
+; CHECK-GI-BASE-NEXT:    addv s5, v20.4s
 ; CHECK-GI-BASE-NEXT:    addv s0, v0.4s
-; CHECK-GI-BASE-NEXT:    fmov w0, s0
+; CHECK-GI-BASE-NEXT:    addv s7, v7.4s
+; CHECK-GI-BASE-NEXT:    umull v20.4s, v6.4h, v17.4h
+; CHECK-GI-BASE-NEXT:    umull2 v6.4s, v6.8h, v17.8h
+; CHECK-GI-BASE-NEXT:    fmov w8, s16
+; CHECK-GI-BASE-NEXT:    fmov w9, s4
+; CHECK-GI-BASE-NEXT:    fmov w10, s3
+; CHECK-GI-BASE-NEXT:    addv s3, v18.4s
+; CHECK-GI-BASE-NEXT:    addv s2, v2.4s
+; CHECK-GI-BASE-NEXT:    fmov w11, s5
+; CHECK-GI-BASE-NEXT:    addv s4, v19.4s
+; CHECK-GI-BASE-NEXT:    add w8, w8, w9
+; CHECK-GI-BASE-NEXT:    fmov w9, s0
+; CHECK-GI-BASE-NEXT:    addv s0, v1.4s
+; CHECK-GI-BASE-NEXT:    addv s1, v20.4s
+; CHECK-GI-BASE-NEXT:    addv s5, v6.4s
+; CHECK-GI-BASE-NEXT:    add w10, w10, w11
+; CHECK-GI-BASE-NEXT:    fmov w11, s3
+; CHECK-GI-BASE-NEXT:    fmov w12, s2
+; CHECK-GI-BASE-NEXT:    add w8, w8, w9
+; CHECK-GI-BASE-NEXT:    fmov w9, s7
+; CHECK-GI-BASE-NEXT:    add w9, w10, w9
+; CHECK-GI-BASE-NEXT:    add w10, w11, w12
+; CHECK-GI-BASE-NEXT:    fmov w11, s4
+; CHECK-GI-BASE-NEXT:    add w8, w8, w9
+; CHECK-GI-BASE-NEXT:    add w9, w10, w11
+; CHECK-GI-BASE-NEXT:    fmov w10, s0
+; CHECK-GI-BASE-NEXT:    fmov w11, s5
+; CHECK-GI-BASE-NEXT:    add w9, w9, w10
+; CHECK-GI-BASE-NEXT:    fmov w10, s1
+; CHECK-GI-BASE-NEXT:    add w8, w8, w9
+; CHECK-GI-BASE-NEXT:    add w9, w10, w11
+; CHECK-GI-BASE-NEXT:    add w0, w8, w9
 ; CHECK-GI-BASE-NEXT:    ret
 ;
 ; CHECK-GI-DOT-LABEL: test_udot_v48i8:
 ; CHECK-GI-DOT:       // %bb.0: // %entry
-; CHECK-GI-DOT-NEXT:    ldr b1, [x0]
-; CHECK-GI-DOT-NEXT:    ldr b5, [x0, #1]
 ; CHECK-GI-DOT-NEXT:    movi v0.2d, #0000000000000000
-; CHECK-GI-DOT-NEXT:    ldr b2, [x0, #16]
-; CHECK-GI-DOT-NEXT:    ldr b6, [x0, #17]
-; CHECK-GI-DOT-NEXT:    ldr b4, [x1]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #1]
-; CHECK-GI-DOT-NEXT:    mov v1.b[1], v5.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b5, [x1, #16]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x1, #17]
-; CHECK-GI-DOT-NEXT:    mov v2.b[1], v6.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b3, [x0, #32]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #33]
-; CHECK-GI-DOT-NEXT:    mov v4.b[1], v17.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b6, [x1, #32]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #33]
-; CHECK-GI-DOT-NEXT:    mov v5.b[1], v18.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x0, #2]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #18]
-; CHECK-GI-DOT-NEXT:    mov v3.b[1], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #2]
-; CHECK-GI-DOT-NEXT:    mov v6.b[1], v16.b[0]
-; CHECK-GI-DOT-NEXT:    mov v1.b[2], v17.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #18]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x0, #34]
-; CHECK-GI-DOT-NEXT:    mov v2.b[2], v18.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x1, #34]
-; CHECK-GI-DOT-NEXT:    mov v4.b[2], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #3]
-; CHECK-GI-DOT-NEXT:    mov v5.b[2], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x0, #19]
-; CHECK-GI-DOT-NEXT:    mov v3.b[2], v17.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #19]
-; CHECK-GI-DOT-NEXT:    mov v6.b[2], v18.b[0]
-; CHECK-GI-DOT-NEXT:    mov v1.b[3], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #3]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #35]
-; CHECK-GI-DOT-NEXT:    mov v2.b[3], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #35]
-; CHECK-GI-DOT-NEXT:    mov v4.b[3], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #4]
-; CHECK-GI-DOT-NEXT:    mov v5.b[3], v17.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x0, #20]
-; CHECK-GI-DOT-NEXT:    mov v3.b[3], v18.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #36]
-; CHECK-GI-DOT-NEXT:    mov v6.b[3], v16.b[0]
-; CHECK-GI-DOT-NEXT:    mov v1.b[4], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #4]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #20]
-; CHECK-GI-DOT-NEXT:    mov v2.b[4], v17.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #36]
-; CHECK-GI-DOT-NEXT:    mov v4.b[4], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #5]
-; CHECK-GI-DOT-NEXT:    mov v5.b[4], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x0, #21]
-; CHECK-GI-DOT-NEXT:    mov v3.b[4], v18.b[0]
-; CHECK-GI-DOT-NEXT:    mov v6.b[4], v17.b[0]
-; CHECK-GI-DOT-NEXT:    mov v1.b[5], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #5]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #21]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #37]
-; CHECK-GI-DOT-NEXT:    mov v2.b[5], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #37]
-; CHECK-GI-DOT-NEXT:    mov v4.b[5], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #6]
-; CHECK-GI-DOT-NEXT:    mov v5.b[5], v17.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x0, #22]
-; CHECK-GI-DOT-NEXT:    mov v3.b[5], v18.b[0]
-; CHECK-GI-DOT-NEXT:    mov v6.b[5], v16.b[0]
-; CHECK-GI-DOT-NEXT:    mov v1.b[6], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #6]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #22]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #38]
-; CHECK-GI-DOT-NEXT:    mov v2.b[6], v17.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #38]
-; CHECK-GI-DOT-NEXT:    mov v4.b[6], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #7]
-; CHECK-GI-DOT-NEXT:    mov v5.b[6], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x0, #23]
-; CHECK-GI-DOT-NEXT:    mov v3.b[6], v18.b[0]
-; CHECK-GI-DOT-NEXT:    mov v6.b[6], v17.b[0]
-; CHECK-GI-DOT-NEXT:    mov v1.b[7], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #7]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #23]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #39]
-; CHECK-GI-DOT-NEXT:    mov v2.b[7], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #39]
-; CHECK-GI-DOT-NEXT:    mov v4.b[7], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #8]
-; CHECK-GI-DOT-NEXT:    mov v5.b[7], v17.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x0, #24]
-; CHECK-GI-DOT-NEXT:    mov v3.b[7], v18.b[0]
-; CHECK-GI-DOT-NEXT:    mov v6.b[7], v16.b[0]
-; CHECK-GI-DOT-NEXT:    mov v1.b[8], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #8]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #24]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #40]
-; CHECK-GI-DOT-NEXT:    mov v2.b[8], v17.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #40]
-; CHECK-GI-DOT-NEXT:    mov v4.b[8], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #9]
-; CHECK-GI-DOT-NEXT:    mov v5.b[8], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x0, #25]
-; CHECK-GI-DOT-NEXT:    mov v3.b[8], v18.b[0]
-; CHECK-GI-DOT-NEXT:    mov v6.b[8], v17.b[0]
-; CHECK-GI-DOT-NEXT:    mov v1.b[9], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #9]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #25]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #41]
-; CHECK-GI-DOT-NEXT:    mov v2.b[9], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #41]
-; CHECK-GI-DOT-NEXT:    mov v4.b[9], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #10]
-; CHECK-GI-DOT-NEXT:    mov v5.b[9], v17.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x0, #26]
-; CHECK-GI-DOT-NEXT:    mov v3.b[9], v18.b[0]
-; CHECK-GI-DOT-NEXT:    mov v6.b[9], v16.b[0]
-; CHECK-GI-DOT-NEXT:    mov v1.b[10], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #10]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #26]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #42]
-; CHECK-GI-DOT-NEXT:    mov v2.b[10], v17.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #42]
-; CHECK-GI-DOT-NEXT:    mov v4.b[10], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #11]
-; CHECK-GI-DOT-NEXT:    mov v5.b[10], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x0, #27]
-; CHECK-GI-DOT-NEXT:    mov v3.b[10], v18.b[0]
-; CHECK-GI-DOT-NEXT:    mov v6.b[10], v17.b[0]
-; CHECK-GI-DOT-NEXT:    mov v1.b[11], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #11]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #27]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #43]
-; CHECK-GI-DOT-NEXT:    mov v2.b[11], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #43]
-; CHECK-GI-DOT-NEXT:    mov v4.b[11], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #12]
-; CHECK-GI-DOT-NEXT:    mov v5.b[11], v17.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x0, #28]
-; CHECK-GI-DOT-NEXT:    mov v3.b[11], v18.b[0]
-; CHECK-GI-DOT-NEXT:    mov v6.b[11], v16.b[0]
-; CHECK-GI-DOT-NEXT:    mov v1.b[12], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #12]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #28]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #44]
-; CHECK-GI-DOT-NEXT:    mov v2.b[12], v17.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #44]
-; CHECK-GI-DOT-NEXT:    mov v4.b[12], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #13]
-; CHECK-GI-DOT-NEXT:    mov v5.b[12], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x0, #29]
-; CHECK-GI-DOT-NEXT:    mov v3.b[12], v18.b[0]
-; CHECK-GI-DOT-NEXT:    mov v6.b[12], v17.b[0]
-; CHECK-GI-DOT-NEXT:    mov v1.b[13], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #13]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #29]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #45]
-; CHECK-GI-DOT-NEXT:    mov v2.b[13], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #45]
-; CHECK-GI-DOT-NEXT:    mov v4.b[13], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #14]
-; CHECK-GI-DOT-NEXT:    mov v5.b[13], v17.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x0, #30]
-; CHECK-GI-DOT-NEXT:    mov v3.b[13], v18.b[0]
-; CHECK-GI-DOT-NEXT:    mov v6.b[13], v16.b[0]
-; CHECK-GI-DOT-NEXT:    mov v1.b[14], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #14]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #30]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #46]
-; CHECK-GI-DOT-NEXT:    mov v2.b[14], v17.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #46]
-; CHECK-GI-DOT-NEXT:    mov v4.b[14], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #15]
-; CHECK-GI-DOT-NEXT:    mov v5.b[14], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x0, #31]
-; CHECK-GI-DOT-NEXT:    mov v3.b[14], v18.b[0]
-; CHECK-GI-DOT-NEXT:    mov v6.b[14], v17.b[0]
-; CHECK-GI-DOT-NEXT:    mov v1.b[15], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #15]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #31]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #47]
-; CHECK-GI-DOT-NEXT:    mov v2.b[15], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #47]
-; CHECK-GI-DOT-NEXT:    mov v4.b[15], v7.b[0]
-; CHECK-GI-DOT-NEXT:    movi v7.2d, #0000000000000000
-; CHECK-GI-DOT-NEXT:    mov v5.b[15], v17.b[0]
-; CHECK-GI-DOT-NEXT:    mov v3.b[15], v18.b[0]
-; CHECK-GI-DOT-NEXT:    mov v6.b[15], v16.b[0]
-; CHECK-GI-DOT-NEXT:    movi v16.2d, #0000000000000000
-; CHECK-GI-DOT-NEXT:    udot v0.4s, v4.16b, v1.16b
-; CHECK-GI-DOT-NEXT:    udot v7.4s, v5.16b, v2.16b
-; CHECK-GI-DOT-NEXT:    udot v16.4s, v6.16b, v3.16b
+; CHECK-GI-DOT-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-GI-DOT-NEXT:    ldr q7, [x0, #32]
+; CHECK-GI-DOT-NEXT:    ldp q3, q4, [x0]
+; CHECK-GI-DOT-NEXT:    movi v2.2d, #0000000000000000
+; CHECK-GI-DOT-NEXT:    ldp q5, q6, [x1]
+; CHECK-GI-DOT-NEXT:    ldr q16, [x1, #32]
+; CHECK-GI-DOT-NEXT:    udot v0.4s, v5.16b, v3.16b
+; CHECK-GI-DOT-NEXT:    udot v1.4s, v6.16b, v4.16b
+; CHECK-GI-DOT-NEXT:    udot v2.4s, v16.16b, v7.16b
 ; CHECK-GI-DOT-NEXT:    addv s0, v0.4s
-; CHECK-GI-DOT-NEXT:    addv s1, v7.4s
-; CHECK-GI-DOT-NEXT:    addv s2, v16.4s
+; CHECK-GI-DOT-NEXT:    addv s1, v1.4s
+; CHECK-GI-DOT-NEXT:    addv s2, v2.4s
 ; CHECK-GI-DOT-NEXT:    fmov w8, s0
 ; CHECK-GI-DOT-NEXT:    fmov w9, s1
-; CHECK-GI-DOT-NEXT:    fmov w10, s2
 ; CHECK-GI-DOT-NEXT:    add w8, w8, w9
-; CHECK-GI-DOT-NEXT:    add w0, w8, w10
+; CHECK-GI-DOT-NEXT:    fmov w9, s2
+; CHECK-GI-DOT-NEXT:    add w0, w8, w9
 ; CHECK-GI-DOT-NEXT:    ret
 entry:
   %a = load <48 x i8>, ptr %p1
@@ -2648,126 +2415,50 @@ define i32 @test_sdot_v24i8(ptr %p1, ptr %p2) {
 ; CHECK-GI-BASE:       // %bb.0: // %entry
 ; CHECK-GI-BASE-NEXT:    ldr q0, [x0]
 ; CHECK-GI-BASE-NEXT:    ldr q1, [x1]
-; CHECK-GI-BASE-NEXT:    ldr d4, [x0, #16]
-; CHECK-GI-BASE-NEXT:    ldr d5, [x1, #16]
-; CHECK-GI-BASE-NEXT:    sshll v2.8h, v0.8b, #0
-; CHECK-GI-BASE-NEXT:    sshll v3.8h, v1.8b, #0
+; CHECK-GI-BASE-NEXT:    ldr d2, [x0, #16]
+; CHECK-GI-BASE-NEXT:    ldr d3, [x1, #16]
+; CHECK-GI-BASE-NEXT:    sshll v4.8h, v0.8b, #0
 ; CHECK-GI-BASE-NEXT:    sshll2 v0.8h, v0.16b, #0
+; CHECK-GI-BASE-NEXT:    sshll v5.8h, v1.8b, #0
+; CHECK-GI-BASE-NEXT:    sshll v2.8h, v2.8b, #0
 ; CHECK-GI-BASE-NEXT:    sshll2 v1.8h, v1.16b, #0
-; CHECK-GI-BASE-NEXT:    smull v6.4s, v3.4h, v2.4h
-; CHECK-GI-BASE-NEXT:    smull2 v2.4s, v3.8h, v2.8h
-; CHECK-GI-BASE-NEXT:    sshll v3.8h, v4.8b, #0
-; CHECK-GI-BASE-NEXT:    sshll v4.8h, v5.8b, #0
-; CHECK-GI-BASE-NEXT:    smlal2 v2.4s, v4.8h, v3.8h
-; CHECK-GI-BASE-NEXT:    smlal v6.4s, v4.4h, v3.4h
-; CHECK-GI-BASE-NEXT:    smlal2 v2.4s, v1.8h, v0.8h
-; CHECK-GI-BASE-NEXT:    smlal v6.4s, v1.4h, v0.4h
-; CHECK-GI-BASE-NEXT:    add v0.4s, v6.4s, v2.4s
+; CHECK-GI-BASE-NEXT:    sshll v3.8h, v3.8b, #0
+; CHECK-GI-BASE-NEXT:    smull v6.4s, v5.4h, v4.4h
+; CHECK-GI-BASE-NEXT:    smull2 v4.4s, v5.8h, v4.8h
+; CHECK-GI-BASE-NEXT:    smull2 v5.4s, v1.8h, v0.8h
+; CHECK-GI-BASE-NEXT:    smull v7.4s, v3.4h, v2.4h
+; CHECK-GI-BASE-NEXT:    smull v0.4s, v1.4h, v0.4h
+; CHECK-GI-BASE-NEXT:    smull2 v1.4s, v3.8h, v2.8h
+; CHECK-GI-BASE-NEXT:    addv s2, v6.4s
+; CHECK-GI-BASE-NEXT:    addv s3, v4.4s
+; CHECK-GI-BASE-NEXT:    addv s4, v5.4s
+; CHECK-GI-BASE-NEXT:    addv s5, v7.4s
 ; CHECK-GI-BASE-NEXT:    addv s0, v0.4s
-; CHECK-GI-BASE-NEXT:    fmov w0, s0
+; CHECK-GI-BASE-NEXT:    addv s1, v1.4s
+; CHECK-GI-BASE-NEXT:    fmov w8, s2
+; CHECK-GI-BASE-NEXT:    fmov w9, s3
+; CHECK-GI-BASE-NEXT:    fmov w10, s4
+; CHECK-GI-BASE-NEXT:    fmov w11, s5
+; CHECK-GI-BASE-NEXT:    add w8, w8, w9
+; CHECK-GI-BASE-NEXT:    fmov w9, s0
+; CHECK-GI-BASE-NEXT:    add w10, w10, w11
+; CHECK-GI-BASE-NEXT:    fmov w11, s1
+; CHECK-GI-BASE-NEXT:    add w8, w8, w9
+; CHECK-GI-BASE-NEXT:    add w9, w10, w11
+; CHECK-GI-BASE-NEXT:    add w0, w8, w9
 ; CHECK-GI-BASE-NEXT:    ret
 ;
 ; CHECK-GI-DOT-LABEL: test_sdot_v24i8:
 ; CHECK-GI-DOT:       // %bb.0: // %entry
-; CHECK-GI-DOT-NEXT:    ldr b1, [x0]
-; CHECK-GI-DOT-NEXT:    ldr b3, [x0, #1]
 ; CHECK-GI-DOT-NEXT:    movi v0.2d, #0000000000000000
-; CHECK-GI-DOT-NEXT:    ldr b2, [x1]
-; CHECK-GI-DOT-NEXT:    ldr b4, [x1, #1]
-; CHECK-GI-DOT-NEXT:    ldr b5, [x0, #8]
-; CHECK-GI-DOT-NEXT:    mov v1.b[1], v3.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b3, [x0, #2]
-; CHECK-GI-DOT-NEXT:    ldr b6, [x1, #8]
-; CHECK-GI-DOT-NEXT:    mov v2.b[1], v4.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b4, [x1, #2]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #17]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #17]
-; CHECK-GI-DOT-NEXT:    mov v1.b[2], v3.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b3, [x0, #3]
-; CHECK-GI-DOT-NEXT:    mov v2.b[2], v4.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b4, [x1, #3]
-; CHECK-GI-DOT-NEXT:    mov v1.b[3], v3.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b3, [x0, #4]
-; CHECK-GI-DOT-NEXT:    mov v2.b[3], v4.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b4, [x1, #4]
-; CHECK-GI-DOT-NEXT:    mov v1.b[4], v3.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b3, [x0, #5]
-; CHECK-GI-DOT-NEXT:    mov v2.b[4], v4.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b4, [x1, #5]
-; CHECK-GI-DOT-NEXT:    mov v1.b[5], v3.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b3, [x0, #6]
-; CHECK-GI-DOT-NEXT:    mov v2.b[5], v4.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b4, [x1, #6]
-; CHECK-GI-DOT-NEXT:    mov v1.b[6], v3.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b3, [x0, #7]
-; CHECK-GI-DOT-NEXT:    mov v2.b[6], v4.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b4, [x1, #7]
-; CHECK-GI-DOT-NEXT:    mov v1.b[7], v3.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b3, [x0, #16]
-; CHECK-GI-DOT-NEXT:    mov v2.b[7], v4.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b4, [x1, #16]
-; CHECK-GI-DOT-NEXT:    mov v3.b[1], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #18]
-; CHECK-GI-DOT-NEXT:    mov v4.b[1], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #18]
-; CHECK-GI-DOT-NEXT:    mov v1.b[8], v5.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b5, [x0, #9]
-; CHECK-GI-DOT-NEXT:    mov v2.b[8], v6.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b6, [x1, #9]
-; CHECK-GI-DOT-NEXT:    mov v3.b[2], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #19]
-; CHECK-GI-DOT-NEXT:    mov v4.b[2], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #19]
-; CHECK-GI-DOT-NEXT:    mov v1.b[9], v5.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b5, [x0, #10]
-; CHECK-GI-DOT-NEXT:    mov v2.b[9], v6.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b6, [x1, #10]
-; CHECK-GI-DOT-NEXT:    mov v3.b[3], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #20]
-; CHECK-GI-DOT-NEXT:    mov v4.b[3], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #20]
-; CHECK-GI-DOT-NEXT:    mov v1.b[10], v5.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b5, [x0, #11]
-; CHECK-GI-DOT-NEXT:    mov v2.b[10], v6.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b6, [x1, #11]
-; CHECK-GI-DOT-NEXT:    mov v3.b[4], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #21]
-; CHECK-GI-DOT-NEXT:    mov v4.b[4], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #21]
-; CHECK-GI-DOT-NEXT:    mov v1.b[11], v5.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b5, [x0, #12]
-; CHECK-GI-DOT-NEXT:    mov v2.b[11], v6.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b6, [x1, #12]
-; CHECK-GI-DOT-NEXT:    mov v3.b[5], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #22]
-; CHECK-GI-DOT-NEXT:    mov v4.b[5], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #22]
-; CHECK-GI-DOT-NEXT:    mov v1.b[12], v5.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b5, [x0, #13]
-; CHECK-GI-DOT-NEXT:    mov v2.b[12], v6.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b6, [x1, #13]
-; CHECK-GI-DOT-NEXT:    mov v3.b[6], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #23]
-; CHECK-GI-DOT-NEXT:    mov v4.b[6], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #23]
-; CHECK-GI-DOT-NEXT:    mov v1.b[13], v5.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b5, [x0, #14]
-; CHECK-GI-DOT-NEXT:    mov v2.b[13], v6.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b6, [x1, #14]
-; CHECK-GI-DOT-NEXT:    mov v3.b[7], v7.b[0]
-; CHECK-GI-DOT-NEXT:    mov v4.b[7], v16.b[0]
-; CHECK-GI-DOT-NEXT:    mov v1.b[14], v5.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b5, [x0, #15]
-; CHECK-GI-DOT-NEXT:    mov v2.b[14], v6.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b6, [x1, #15]
-; CHECK-GI-DOT-NEXT:    fmov d3, d3
-; CHECK-GI-DOT-NEXT:    fmov d4, d4
-; CHECK-GI-DOT-NEXT:    mov v1.b[15], v5.b[0]
-; CHECK-GI-DOT-NEXT:    movi v5.2d, #0000000000000000
-; CHECK-GI-DOT-NEXT:    mov v2.b[15], v6.b[0]
-; CHECK-GI-DOT-NEXT:    sdot v0.4s, v4.16b, v3.16b
-; CHECK-GI-DOT-NEXT:    sdot v5.4s, v2.16b, v1.16b
-; CHECK-GI-DOT-NEXT:    add v0.4s, v5.4s, v0.4s
+; CHECK-GI-DOT-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-GI-DOT-NEXT:    ldr q2, [x0]
+; CHECK-GI-DOT-NEXT:    ldr d3, [x0, #16]
+; CHECK-GI-DOT-NEXT:    ldr q4, [x1]
+; CHECK-GI-DOT-NEXT:    ldr d5, [x1, #16]
+; CHECK-GI-DOT-NEXT:    sdot v1.4s, v4.16b, v2.16b
+; CHECK-GI-DOT-NEXT:    sdot v0.4s, v5.16b, v3.16b
+; CHECK-GI-DOT-NEXT:    add v0.4s, v1.4s, v0.4s
 ; CHECK-GI-DOT-NEXT:    addv s0, v0.4s
 ; CHECK-GI-DOT-NEXT:    fmov w0, s0
 ; CHECK-GI-DOT-NEXT:    ret
@@ -2835,243 +2526,91 @@ define i32 @test_sdot_v48i8(ptr %p1, ptr %p2) {
 ;
 ; CHECK-GI-BASE-LABEL: test_sdot_v48i8:
 ; CHECK-GI-BASE:       // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT:    ldp q0, q4, [x1]
-; CHECK-GI-BASE-NEXT:    ldr q2, [x0, #32]
-; CHECK-GI-BASE-NEXT:    ldp q1, q3, [x0]
-; CHECK-GI-BASE-NEXT:    ldr q7, [x1, #32]
-; CHECK-GI-BASE-NEXT:    sshll2 v16.8h, v2.16b, #0
-; CHECK-GI-BASE-NEXT:    sshll2 v6.8h, v0.16b, #0
-; CHECK-GI-BASE-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-GI-BASE-NEXT:    sshll2 v17.8h, v7.16b, #0
-; CHECK-GI-BASE-NEXT:    sshll2 v5.8h, v1.16b, #0
-; CHECK-GI-BASE-NEXT:    sshll v1.8h, v1.8b, #0
-; CHECK-GI-BASE-NEXT:    smull2 v18.4s, v6.8h, v5.8h
-; CHECK-GI-BASE-NEXT:    smull v19.4s, v0.4h, v1.4h
-; CHECK-GI-BASE-NEXT:    smull v5.4s, v6.4h, v5.4h
-; CHECK-GI-BASE-NEXT:    smull2 v0.4s, v0.8h, v1.8h
-; CHECK-GI-BASE-NEXT:    sshll v1.8h, v2.8b, #0
-; CHECK-GI-BASE-NEXT:    sshll v2.8h, v7.8b, #0
-; CHECK-GI-BASE-NEXT:    sshll2 v6.8h, v3.16b, #0
-; CHECK-GI-BASE-NEXT:    sshll2 v7.8h, v4.16b, #0
-; CHECK-GI-BASE-NEXT:    smlal2 v18.4s, v17.8h, v16.8h
-; CHECK-GI-BASE-NEXT:    smlal v5.4s, v17.4h, v16.4h
-; CHECK-GI-BASE-NEXT:    smlal v19.4s, v2.4h, v1.4h
-; CHECK-GI-BASE-NEXT:    smlal2 v0.4s, v2.8h, v1.8h
-; CHECK-GI-BASE-NEXT:    sshll v1.8h, v3.8b, #0
-; CHECK-GI-BASE-NEXT:    sshll v2.8h, v4.8b, #0
-; CHECK-GI-BASE-NEXT:    smlal2 v18.4s, v7.8h, v6.8h
-; CHECK-GI-BASE-NEXT:    smlal v5.4s, v7.4h, v6.4h
-; CHECK-GI-BASE-NEXT:    smlal v19.4s, v2.4h, v1.4h
-; CHECK-GI-BASE-NEXT:    smlal2 v0.4s, v2.8h, v1.8h
-; CHECK-GI-BASE-NEXT:    add v1.4s, v19.4s, v5.4s
-; CHECK-GI-BASE-NEXT:    add v0.4s, v0.4s, v18.4s
-; CHECK-GI-BASE-NEXT:    add v0.4s, v1.4s, v0.4s
+; CHECK-GI-BASE-NEXT:    ldp q0, q3, [x1]
+; CHECK-GI-BASE-NEXT:    ldr q6, [x1, #32]
+; CHECK-GI-BASE-NEXT:    ldp q1, q2, [x0]
+; CHECK-GI-BASE-NEXT:    ldr q17, [x0, #32]
+; CHECK-GI-BASE-NEXT:    sshll v4.8h, v0.8b, #0
+; CHECK-GI-BASE-NEXT:    sshll2 v0.8h, v0.16b, #0
+; CHECK-GI-BASE-NEXT:    sshll v7.8h, v3.8b, #0
+; CHECK-GI-BASE-NEXT:    sshll v5.8h, v1.8b, #0
+; CHECK-GI-BASE-NEXT:    sshll2 v1.8h, v1.16b, #0
+; CHECK-GI-BASE-NEXT:    sshll v16.8h, v2.8b, #0
+; CHECK-GI-BASE-NEXT:    sshll2 v3.8h, v3.16b, #0
+; CHECK-GI-BASE-NEXT:    sshll2 v2.8h, v2.16b, #0
+; CHECK-GI-BASE-NEXT:    smull v18.4s, v4.4h, v5.4h
+; CHECK-GI-BASE-NEXT:    smull2 v4.4s, v4.8h, v5.8h
+; CHECK-GI-BASE-NEXT:    smull2 v19.4s, v0.8h, v1.8h
+; CHECK-GI-BASE-NEXT:    smull v20.4s, v7.4h, v16.4h
+; CHECK-GI-BASE-NEXT:    smull v0.4s, v0.4h, v1.4h
+; CHECK-GI-BASE-NEXT:    sshll v5.8h, v6.8b, #0
+; CHECK-GI-BASE-NEXT:    sshll v1.8h, v17.8b, #0
+; CHECK-GI-BASE-NEXT:    smull2 v7.4s, v7.8h, v16.8h
+; CHECK-GI-BASE-NEXT:    sshll2 v6.8h, v6.16b, #0
+; CHECK-GI-BASE-NEXT:    sshll2 v17.8h, v17.16b, #0
+; CHECK-GI-BASE-NEXT:    addv s16, v18.4s
+; CHECK-GI-BASE-NEXT:    addv s4, v4.4s
+; CHECK-GI-BASE-NEXT:    smull v18.4s, v3.4h, v2.4h
+; CHECK-GI-BASE-NEXT:    smull2 v2.4s, v3.8h, v2.8h
+; CHECK-GI-BASE-NEXT:    addv s3, v19.4s
+; CHECK-GI-BASE-NEXT:    smull v19.4s, v5.4h, v1.4h
+; CHECK-GI-BASE-NEXT:    smull2 v1.4s, v5.8h, v1.8h
+; CHECK-GI-BASE-NEXT:    addv s5, v20.4s
 ; CHECK-GI-BASE-NEXT:    addv s0, v0.4s
-; CHECK-GI-BASE-NEXT:    fmov w0, s0
+; CHECK-GI-BASE-NEXT:    addv s7, v7.4s
+; CHECK-GI-BASE-NEXT:    smull v20.4s, v6.4h, v17.4h
+; CHECK-GI-BASE-NEXT:    smull2 v6.4s, v6.8h, v17.8h
+; CHECK-GI-BASE-NEXT:    fmov w8, s16
+; CHECK-GI-BASE-NEXT:    fmov w9, s4
+; CHECK-GI-BASE-NEXT:    fmov w10, s3
+; CHECK-GI-BASE-NEXT:    addv s3, v18.4s
+; CHECK-GI-BASE-NEXT:    addv s2, v2.4s
+; CHECK-GI-BASE-NEXT:    fmov w11, s5
+; CHECK-GI-BASE-NEXT:    addv s4, v19.4s
+; CHECK-GI-BASE-NEXT:    add w8, w8, w9
+; CHECK-GI-BASE-NEXT:    fmov w9, s0
+; CHECK-GI-BASE-NEXT:    addv s0, v1.4s
+; CHECK-GI-BASE-NEXT:    addv s1, v20.4s
+; CHECK-GI-BASE-NEXT:    addv s5, v6.4s
+; CHECK-GI-BASE-NEXT:    add w10, w10, w11
+; CHECK-GI-BASE-NEXT:    fmov w11, s3
+; CHECK-GI-BASE-NEXT:    fmov w12, s2
+; CHECK-GI-BASE-NEXT:    add w8, w8, w9
+; CHECK-GI-BASE-NEXT:    fmov w9, s7
+; CHECK-GI-BASE-NEXT:    add w9, w10, w9
+; CHECK-GI-BASE-NEXT:    add w10, w11, w12
+; CHECK-GI-BASE-NEXT:    fmov w11, s4
+; CHECK-GI-BASE-NEXT:    add w8, w8, w9
+; CHECK-GI-BASE-NEXT:    add w9, w10, w11
+; CHECK-GI-BASE-NEXT:    fmov w10, s0
+; CHECK-GI-BASE-NEXT:    fmov w11, s5
+; CHECK-GI-BASE-NEXT:    add w9, w9, w10
+; CHECK-GI-BASE-NEXT:    fmov w10, s1
+; CHECK-GI-BASE-NEXT:    add w8, w8, w9
+; CHECK-GI-BASE-NEXT:    add w9, w10, w11
+; CHECK-GI-BASE-NEXT:    add w0, w8, w9
 ; CHECK-GI-BASE-NEXT:    ret
 ;
 ; CHECK-GI-DOT-LABEL: test_sdot_v48i8:
 ; CHECK-GI-DOT:       // %bb.0: // %entry
-; CHECK-GI-DOT-NEXT:    ldr b1, [x0]
-; CHECK-GI-DOT-NEXT:    ldr b5, [x0, #1]
 ; CHECK-GI-DOT-NEXT:    movi v0.2d, #0000000000000000
-; CHECK-GI-DOT-NEXT:    ldr b2, [x0, #16]
-; CHECK-GI-DOT-NEXT:    ldr b6, [x0, #17]
-; CHECK-GI-DOT-NEXT:    ldr b4, [x1]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #1]
-; CHECK-GI-DOT-NEXT:    mov v1.b[1], v5.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b5, [x1, #16]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x1, #17]
-; CHECK-GI-DOT-NEXT:    mov v2.b[1], v6.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b3, [x0, #32]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #33]
-; CHECK-GI-DOT-NEXT:    mov v4.b[1], v17.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b6, [x1, #32]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #33]
-; CHECK-GI-DOT-NEXT:    mov v5.b[1], v18.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x0, #2]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #18]
-; CHECK-GI-DOT-NEXT:    mov v3.b[1], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #2]
-; CHECK-GI-DOT-NEXT:    mov v6.b[1], v16.b[0]
-; CHECK-GI-DOT-NEXT:    mov v1.b[2], v17.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #18]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x0, #34]
-; CHECK-GI-DOT-NEXT:    mov v2.b[2], v18.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x1, #34]
-; CHECK-GI-DOT-NEXT:    mov v4.b[2], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #3]
-; CHECK-GI-DOT-NEXT:    mov v5.b[2], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x0, #19]
-; CHECK-GI-DOT-NEXT:    mov v3.b[2], v17.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #19]
-; CHECK-GI-DOT-NEXT:    mov v6.b[2], v18.b[0]
-; CHECK-GI-DOT-NEXT:    mov v1.b[3], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #3]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #35]
-; CHECK-GI-DOT-NEXT:    mov v2.b[3], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #35]
-; CHECK-GI-DOT-NEXT:    mov v4.b[3], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #4]
-; CHECK-GI-DOT-NEXT:    mov v5.b[3], v17.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x0, #20]
-; CHECK-GI-DOT-NEXT:    mov v3.b[3], v18.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #36]
-; CHECK-GI-DOT-NEXT:    mov v6.b[3], v16.b[0]
-; CHECK-GI-DOT-NEXT:    mov v1.b[4], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #4]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #20]
-; CHECK-GI-DOT-NEXT:    mov v2.b[4], v17.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #36]
-; CHECK-GI-DOT-NEXT:    mov v4.b[4], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #5]
-; CHECK-GI-DOT-NEXT:    mov v5.b[4], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x0, #21]
-; CHECK-GI-DOT-NEXT:    mov v3.b[4], v18.b[0]
-; CHECK-GI-DOT-NEXT:    mov v6.b[4], v17.b[0]
-; CHECK-GI-DOT-NEXT:    mov v1.b[5], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #5]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #21]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #37]
-; CHECK-GI-DOT-NEXT:    mov v2.b[5], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #37]
-; CHECK-GI-DOT-NEXT:    mov v4.b[5], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #6]
-; CHECK-GI-DOT-NEXT:    mov v5.b[5], v17.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x0, #22]
-; CHECK-GI-DOT-NEXT:    mov v3.b[5], v18.b[0]
-; CHECK-GI-DOT-NEXT:    mov v6.b[5], v16.b[0]
-; CHECK-GI-DOT-NEXT:    mov v1.b[6], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #6]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #22]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #38]
-; CHECK-GI-DOT-NEXT:    mov v2.b[6], v17.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #38]
-; CHECK-GI-DOT-NEXT:    mov v4.b[6], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #7]
-; CHECK-GI-DOT-NEXT:    mov v5.b[6], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x0, #23]
-; CHECK-GI-DOT-NEXT:    mov v3.b[6], v18.b[0]
-; CHECK-GI-DOT-NEXT:    mov v6.b[6], v17.b[0]
-; CHECK-GI-DOT-NEXT:    mov v1.b[7], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #7]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #23]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #39]
-; CHECK-GI-DOT-NEXT:    mov v2.b[7], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #39]
-; CHECK-GI-DOT-NEXT:    mov v4.b[7], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #8]
-; CHECK-GI-DOT-NEXT:    mov v5.b[7], v17.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x0, #24]
-; CHECK-GI-DOT-NEXT:    mov v3.b[7], v18.b[0]
-; CHECK-GI-DOT-NEXT:    mov v6.b[7], v16.b[0]
-; CHECK-GI-DOT-NEXT:    mov v1.b[8], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #8]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #24]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #40]
-; CHECK-GI-DOT-NEXT:    mov v2.b[8], v17.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #40]
-; CHECK-GI-DOT-NEXT:    mov v4.b[8], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #9]
-; CHECK-GI-DOT-NEXT:    mov v5.b[8], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x0, #25]
-; CHECK-GI-DOT-NEXT:    mov v3.b[8], v18.b[0]
-; CHECK-GI-DOT-NEXT:    mov v6.b[8], v17.b[0]
-; CHECK-GI-DOT-NEXT:    mov v1.b[9], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #9]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #25]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #41]
-; CHECK-GI-DOT-NEXT:    mov v2.b[9], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #41]
-; CHECK-GI-DOT-NEXT:    mov v4.b[9], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #10]
-; CHECK-GI-DOT-NEXT:    mov v5.b[9], v17.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x0, #26]
-; CHECK-GI-DOT-NEXT:    mov v3.b[9], v18.b[0]
-; CHECK-GI-DOT-NEXT:    mov v6.b[9], v16.b[0]
-; CHECK-GI-DOT-NEXT:    mov v1.b[10], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #10]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #26]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #42]
-; CHECK-GI-DOT-NEXT:    mov v2.b[10], v17.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #42]
-; CHECK-GI-DOT-NEXT:    mov v4.b[10], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #11]
-; CHECK-GI-DOT-NEXT:    mov v5.b[10], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x0, #27]
-; CHECK-GI-DOT-NEXT:    mov v3.b[10], v18.b[0]
-; CHECK-GI-DOT-NEXT:    mov v6.b[10], v17.b[0]
-; CHECK-GI-DOT-NEXT:    mov v1.b[11], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #11]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #27]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #43]
-; CHECK-GI-DOT-NEXT:    mov v2.b[11], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #43]
-; CHECK-GI-DOT-NEXT:    mov v4.b[11], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #12]
-; CHECK-GI-DOT-NEXT:    mov v5.b[11], v17.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x0, #28]
-; CHECK-GI-DOT-NEXT:    mov v3.b[11], v18.b[0]
-; CHECK-GI-DOT-NEXT:    mov v6.b[11], v16.b[0]
-; CHECK-GI-DOT-NEXT:    mov v1.b[12], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #12]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #28]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #44]
-; CHECK-GI-DOT-NEXT:    mov v2.b[12], v17.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #44]
-; CHECK-GI-DOT-NEXT:    mov v4.b[12], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #13]
-; CHECK-GI-DOT-NEXT:    mov v5.b[12], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x0, #29]
-; CHECK-GI-DOT-NEXT:    mov v3.b[12], v18.b[0]
-; CHECK-GI-DOT-NEXT:    mov v6.b[12], v17.b[0]
-; CHECK-GI-DOT-NEXT:    mov v1.b[13], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #13]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #29]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #45]
-; CHECK-GI-DOT-NEXT:    mov v2.b[13], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #45]
-; CHECK-GI-DOT-NEXT:    mov v4.b[13], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #14]
-; CHECK-GI-DOT-NEXT:    mov v5.b[13], v17.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x0, #30]
-; CHECK-GI-DOT-NEXT:    mov v3.b[13], v18.b[0]
-; CHECK-GI-DOT-NEXT:    mov v6.b[13], v16.b[0]
-; CHECK-GI-DOT-NEXT:    mov v1.b[14], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #14]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #30]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #46]
-; CHECK-GI-DOT-NEXT:    mov v2.b[14], v17.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #46]
-; CHECK-GI-DOT-NEXT:    mov v4.b[14], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #15]
-; CHECK-GI-DOT-NEXT:    mov v5.b[14], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x0, #31]
-; CHECK-GI-DOT-NEXT:    mov v3.b[14], v18.b[0]
-; CHECK-GI-DOT-NEXT:    mov v6.b[14], v17.b[0]
-; CHECK-GI-DOT-NEXT:    mov v1.b[15], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #15]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #31]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #47]
-; CHECK-GI-DOT-NEXT:    mov v2.b[15], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #47]
-; CHECK-GI-DOT-NEXT:    mov v4.b[15], v7.b[0]
-; CHECK-GI-DOT-NEXT:    movi v7.2d, #0000000000000000
-; CHECK-GI-DOT-NEXT:    mov v5.b[15], v17.b[0]
-; CHECK-GI-DOT-NEXT:    mov v3.b[15], v18.b[0]
-; CHECK-GI-DOT-NEXT:    mov v6.b[15], v16.b[0]
-; CHECK-GI-DOT-NEXT:    movi v16.2d, #0000000000000000
-; CHECK-GI-DOT-NEXT:    sdot v0.4s, v4.16b, v1.16b
-; CHECK-GI-DOT-NEXT:    sdot v7.4s, v5.16b, v2.16b
-; CHECK-GI-DOT-NEXT:    sdot v16.4s, v6.16b, v3.16b
+; CHECK-GI-DOT-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-GI-DOT-NEXT:    ldr q7, [x0, #32]
+; CHECK-GI-DOT-NEXT:    ldp q3, q4, [x0]
+; CHECK-GI-DOT-NEXT:    movi v2.2d, #0000000000000000
+; CHECK-GI-DOT-NEXT:    ldp q5, q6, [x1]
+; CHECK-GI-DOT-NEXT:    ldr q16, [x1, #32]
+; CHECK-GI-DOT-NEXT:    sdot v0.4s, v5.16b, v3.16b
+; CHECK-GI-DOT-NEXT:    sdot v1.4s, v6.16b, v4.16b
+; CHECK-GI-DOT-NEXT:    sdot v2.4s, v16.16b, v7.16b
 ; CHECK-GI-DOT-NEXT:    addv s0, v0.4s
-; CHECK-GI-DOT-NEXT:    addv s1, v7.4s
-; CHECK-GI-DOT-NEXT:    addv s2, v16.4s
+; CHECK-GI-DOT-NEXT:    addv s1, v1.4s
+; CHECK-GI-DOT-NEXT:    addv s2, v2.4s
 ; CHECK-GI-DOT-NEXT:    fmov w8, s0
 ; CHECK-GI-DOT-NEXT:    fmov w9, s1
-; CHECK-GI-DOT-NEXT:    fmov w10, s2
 ; CHECK-GI-DOT-NEXT:    add w8, w8, w9
-; CHECK-GI-DOT-NEXT:    add w0, w8, w10
+; CHECK-GI-DOT-NEXT:    fmov w9, s2
+; CHECK-GI-DOT-NEXT:    add w0, w8, w9
 ; CHECK-GI-DOT-NEXT:    ret
 entry:
   %a = load <48 x i8>, ptr %p1
diff --git a/llvm/test/CodeGen/AArch64/wrong-callee-save-size-after-livedebugvariables.mir b/llvm/test/CodeGen/AArch64/wrong-callee-save-size-after-livedebugvariables.mir
index 53a8612a7fae76..8e1142474447ed 100644
--- a/llvm/test/CodeGen/AArch64/wrong-callee-save-size-after-livedebugvariables.mir
+++ b/llvm/test/CodeGen/AArch64/wrong-callee-save-size-after-livedebugvariables.mir
@@ -64,6 +64,7 @@
 name:            foo
 tracksRegLiveness: true
 frameInfo:
+  adjustsStack:    true
   hasCalls:        true
 fixedStack:      []
 stack:
diff --git a/llvm/test/CodeGen/AArch64/zext.ll b/llvm/test/CodeGen/AArch64/zext.ll
index 54b29be2132cdd..716d2398996be2 100644
--- a/llvm/test/CodeGen/AArch64/zext.ll
+++ b/llvm/test/CodeGen/AArch64/zext.ll
@@ -305,15 +305,14 @@ define <3 x i64> @zext_v3i8_v3i64(<3 x i8> %a) {
 ;
 ; CHECK-GI-LABEL: zext_v3i8_v3i64:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    // kill: def $w0 killed $w0 def $x0
-; CHECK-GI-NEXT:    fmov d1, x0
-; CHECK-GI-NEXT:    // kill: def $w1 killed $w1 def $x1
-; CHECK-GI-NEXT:    movi v0.2d, #0x000000000000ff
+; CHECK-GI-NEXT:    fmov s0, w0
+; CHECK-GI-NEXT:    movi v1.2d, #0x000000000000ff
 ; CHECK-GI-NEXT:    // kill: def $w2 killed $w2 def $x2
 ; CHECK-GI-NEXT:    and x8, x2, #0xff
 ; CHECK-GI-NEXT:    fmov d2, x8
-; CHECK-GI-NEXT:    mov v1.d[1], x1
-; CHECK-GI-NEXT:    and v0.16b, v1.16b, v0.16b
+; CHECK-GI-NEXT:    mov v0.s[1], w1
+; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-GI-NEXT:    and v0.16b, v0.16b, v1.16b
 ; CHECK-GI-NEXT:    mov d1, v0.d[1]
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
@@ -470,15 +469,14 @@ define <3 x i64> @zext_v3i10_v3i64(<3 x i10> %a) {
 ;
 ; CHECK-GI-LABEL: zext_v3i10_v3i64:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    // kill: def $w0 killed $w0 def $x0
-; CHECK-GI-NEXT:    fmov d0, x0
-; CHECK-GI-NEXT:    // kill: def $w1 killed $w1 def $x1
+; CHECK-GI-NEXT:    fmov s0, w0
 ; CHECK-GI-NEXT:    adrp x8, .LCPI27_0
 ; CHECK-GI-NEXT:    // kill: def $w2 killed $w2 def $x2
 ; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI27_0]
 ; CHECK-GI-NEXT:    and x8, x2, #0x3ff
 ; CHECK-GI-NEXT:    fmov d2, x8
-; CHECK-GI-NEXT:    mov v0.d[1], x1
+; CHECK-GI-NEXT:    mov v0.s[1], w1
+; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
 ; CHECK-GI-NEXT:    and v0.16b, v0.16b, v1.16b
 ; CHECK-GI-NEXT:    mov d1, v0.d[1]
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-break-large-phis.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-break-large-phis.ll
index 192bf7c249817b..93b9aeac3cd3f3 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-break-large-phis.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-break-large-phis.ll
@@ -1197,3 +1197,54 @@ reallyfinally:
   store <5 x double> %val, ptr %out, align 1
   ret void
 }
+
+define amdgpu_kernel void @pr85718(i1 %Bool, ptr %Ptr, <4 x float> %Vec1, <4 x float> %Vec2) {
+; OPT-LABEL: @pr85718(
+; OPT-NEXT:  BB0:
+; OPT-NEXT:    [[I:%.*]] = insertelement <4 x float> [[VEC1:%.*]], float 4.200000e+01, i1 true
+; OPT-NEXT:    br label [[BB1:%.*]]
+; OPT:       BB1:
+; OPT-NEXT:    [[TMP0:%.*]] = phi float [ [[LARGEPHI_EXTRACTSLICE0:%.*]], [[BB2:%.*]] ], [ [[LARGEPHI_EXTRACTSLICE1:%.*]], [[BB1]] ], [ 0.000000e+00, [[BB0:%.*]] ]
+; OPT-NEXT:    [[TMP1:%.*]] = phi float [ [[LARGEPHI_EXTRACTSLICE3:%.*]], [[BB2]] ], [ [[LARGEPHI_EXTRACTSLICE4:%.*]], [[BB1]] ], [ 0.000000e+00, [[BB0]] ]
+; OPT-NEXT:    [[TMP2:%.*]] = phi float [ [[LARGEPHI_EXTRACTSLICE6:%.*]], [[BB2]] ], [ [[LARGEPHI_EXTRACTSLICE7:%.*]], [[BB1]] ], [ 0.000000e+00, [[BB0]] ]
+; OPT-NEXT:    [[TMP3:%.*]] = phi float [ [[LARGEPHI_EXTRACTSLICE9:%.*]], [[BB2]] ], [ [[LARGEPHI_EXTRACTSLICE10:%.*]], [[BB1]] ], [ 0.000000e+00, [[BB0]] ]
+; OPT-NEXT:    [[LARGEPHI_INSERTSLICE0:%.*]] = insertelement <4 x float> poison, float [[TMP0]], i64 0
+; OPT-NEXT:    [[LARGEPHI_INSERTSLICE1:%.*]] = insertelement <4 x float> [[LARGEPHI_INSERTSLICE0]], float [[TMP1]], i64 1
+; OPT-NEXT:    [[LARGEPHI_INSERTSLICE2:%.*]] = insertelement <4 x float> [[LARGEPHI_INSERTSLICE1]], float [[TMP2]], i64 2
+; OPT-NEXT:    [[LARGEPHI_INSERTSLICE3:%.*]] = insertelement <4 x float> [[LARGEPHI_INSERTSLICE2]], float [[TMP3]], i64 3
+; OPT-NEXT:    store <4 x float> [[LARGEPHI_INSERTSLICE3]], ptr [[PTR:%.*]], align 128
+; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE1]] = extractelement <4 x float> [[VEC2:%.*]], i64 0
+; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE4]] = extractelement <4 x float> [[VEC2]], i64 1
+; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE7]] = extractelement <4 x float> [[VEC2]], i64 2
+; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE10]] = extractelement <4 x float> [[VEC2]], i64 3
+; OPT-NEXT:    br i1 [[BOOL:%.*]], label [[BB1]], label [[BB2]]
+; OPT:       BB2:
+; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE0]] = extractelement <4 x float> [[I]], i64 0
+; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE3]] = extractelement <4 x float> [[I]], i64 1
+; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE6]] = extractelement <4 x float> [[I]], i64 2
+; OPT-NEXT:    [[LARGEPHI_EXTRACTSLICE9]] = extractelement <4 x float> [[I]], i64 3
+; OPT-NEXT:    br label [[BB1]]
+;
+; NOOPT-LABEL: @pr85718(
+; NOOPT-NEXT:  BB0:
+; NOOPT-NEXT:    [[I:%.*]] = insertelement <4 x float> [[VEC1:%.*]], float 4.200000e+01, i1 true
+; NOOPT-NEXT:    br label [[BB1:%.*]]
+; NOOPT:       BB1:
+; NOOPT-NEXT:    [[PHI:%.*]] = phi <4 x float> [ [[I]], [[BB2:%.*]] ], [ [[VEC2:%.*]], [[BB1]] ], [ zeroinitializer, [[BB0:%.*]] ]
+; NOOPT-NEXT:    store <4 x float> [[PHI]], ptr [[PTR:%.*]], align 128
+; NOOPT-NEXT:    br i1 [[BOOL:%.*]], label [[BB1]], label [[BB2]]
+; NOOPT:       BB2:
+; NOOPT-NEXT:    br label [[BB1]]
+;
+BB0:
+  %I = insertelement <4 x float> %Vec1, float 4.200000e+01, i1 true
+  br label %BB1
+
+BB1:                                              ; preds = %BB0, %BB1, %BB2
+  %PHI = phi <4 x float> [ %I, %BB2 ], [ %Vec2, %BB1 ], [ zeroinitializer, %BB0 ]
+  store <4 x float> %PHI, ptr %Ptr, align 128
+  br i1 %Bool, label %BB1, label %BB2
+
+BB2:                                               ; preds = %BB1
+  br label %BB1
+}
diff --git a/llvm/test/CodeGen/AMDGPU/av_spill_cross_bb_usage.mir b/llvm/test/CodeGen/AMDGPU/av_spill_cross_bb_usage.mir
index c1da29ecc2c2f5..3228962ed01f78 100644
--- a/llvm/test/CodeGen/AMDGPU/av_spill_cross_bb_usage.mir
+++ b/llvm/test/CodeGen/AMDGPU/av_spill_cross_bb_usage.mir
@@ -14,6 +14,8 @@
 ---
 name:            test_av_spill_cross_bb_usage
 tracksRegLiveness: true
+frameInfo:
+  adjustsStack:    true
 stack:
   - { id: 0, name: '', type: spill-slot, offset: 0, size: 4, alignment: 4 }
 machineFunctionInfo:
diff --git a/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll b/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll
new file mode 100644
index 00000000000000..1eb2771618dce0
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll
@@ -0,0 +1,344 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -march=amdgcn -mcpu=gfx940 < %s | FileCheck --check-prefixes=GCN %s
+
+; TODO: Add global-isel when it can support bf16
+define amdgpu_ps float @v_test_cvt_bf16_f32_v(bfloat %v) {
+; GCN-LABEL: v_test_cvt_bf16_f32_v:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT:    ; return to shader part epilog
+  %cvt = fpext bfloat %v to float
+  ret float %cvt
+}
+define amdgpu_ps float @v_test_cvt_bf16_f32_s(bfloat inreg %v) {
+; GCN-LABEL: v_test_cvt_bf16_f32_s:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_lshl_b32 s0, s0, 16
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    ; return to shader part epilog
+  %cvt = fpext bfloat %v to float
+  ret float %cvt
+}
+define amdgpu_ps float @v_test_cvt_v2f32_v2bf16_v(<2 x float> %src) {
+; GCN-LABEL: v_test_cvt_v2f32_v2bf16_v:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    v_bfe_u32 v2, v0, 16, 1
+; GCN-NEXT:    s_movk_i32 s0, 0x7fff
+; GCN-NEXT:    v_add3_u32 v2, v2, v0, s0
+; GCN-NEXT:    v_or_b32_e32 v3, 0x400000, v0
+; GCN-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
+; GCN-NEXT:    s_nop 1
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
+; GCN-NEXT:    v_bfe_u32 v2, v1, 16, 1
+; GCN-NEXT:    v_add3_u32 v2, v2, v1, s0
+; GCN-NEXT:    v_or_b32_e32 v3, 0x400000, v1
+; GCN-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
+; GCN-NEXT:    s_mov_b32 s0, 0x7060302
+; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc
+; GCN-NEXT:    v_perm_b32 v0, v1, v0, s0
+; GCN-NEXT:    ; return to shader part epilog
+  %res = fptrunc <2 x float> %src to <2 x bfloat>
+  %cast = bitcast <2 x bfloat> %res to float
+  ret float %cast
+}
+define amdgpu_ps float @v_test_cvt_v2f32_v2bf16_s(<2 x float> inreg %src) {
+; GCN-LABEL: v_test_cvt_v2f32_v2bf16_s:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_bfe_u32 s2, s1, 0x10010
+; GCN-NEXT:    s_add_i32 s2, s2, s1
+; GCN-NEXT:    s_or_b32 s4, s1, 0x400000
+; GCN-NEXT:    s_add_i32 s5, s2, 0x7fff
+; GCN-NEXT:    v_cmp_u_f32_e64 s[2:3], s1, s1
+; GCN-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN-NEXT:    s_cselect_b32 s2, s4, s5
+; GCN-NEXT:    s_bfe_u32 s1, s0, 0x10010
+; GCN-NEXT:    s_add_i32 s1, s1, s0
+; GCN-NEXT:    s_or_b32 s3, s0, 0x400000
+; GCN-NEXT:    s_add_i32 s4, s1, 0x7fff
+; GCN-NEXT:    v_cmp_u_f32_e64 s[0:1], s0, s0
+; GCN-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; GCN-NEXT:    s_cselect_b32 s0, s3, s4
+; GCN-NEXT:    s_pack_hh_b32_b16 s0, s0, s2
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    ; return to shader part epilog
+  %res = fptrunc <2 x float> %src to <2 x bfloat>
+  %cast = bitcast <2 x bfloat> %res to float
+  ret float %cast
+}
+define amdgpu_ps float @v_test_cvt_f32_bf16_v(float %src) {
+; GCN-LABEL: v_test_cvt_f32_bf16_v:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GCN-NEXT:    s_movk_i32 s0, 0x7fff
+; GCN-NEXT:    v_add3_u32 v1, v1, v0, s0
+; GCN-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GCN-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
+; GCN-NEXT:    s_nop 1
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    ; return to shader part epilog
+  %trunc = fptrunc float %src to bfloat
+  %ext = fpext bfloat %trunc to float
+  ret float %ext
+}
+define amdgpu_ps float @v_test_cvt_v2f64_v2bf16_v(<2 x double> %src) {
+; GCN-LABEL: v_test_cvt_v2f64_v2bf16_v:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    v_cvt_f32_f64_e64 v6, |v[0:1]|
+; GCN-NEXT:    v_cvt_f64_f32_e32 v[4:5], v6
+; GCN-NEXT:    v_and_b32_e32 v7, 1, v6
+; GCN-NEXT:    v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, v[4:5]
+; GCN-NEXT:    v_cmp_nlg_f64_e64 s[0:1], |v[0:1]|, v[4:5]
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v7
+; GCN-NEXT:    v_cndmask_b32_e64 v4, -1, 1, s[2:3]
+; GCN-NEXT:    v_add_u32_e32 v4, v6, v4
+; GCN-NEXT:    s_or_b64 vcc, s[0:1], vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
+; GCN-NEXT:    s_brev_b32 s4, 1
+; GCN-NEXT:    v_and_or_b32 v5, v1, s4, v4
+; GCN-NEXT:    v_bfe_u32 v4, v4, 16, 1
+; GCN-NEXT:    s_movk_i32 s5, 0x7fff
+; GCN-NEXT:    v_add3_u32 v4, v4, v5, s5
+; GCN-NEXT:    v_or_b32_e32 v5, 0x400000, v5
+; GCN-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
+; GCN-NEXT:    s_nop 1
+; GCN-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
+; GCN-NEXT:    v_cvt_f32_f64_e64 v5, |v[2:3]|
+; GCN-NEXT:    v_cvt_f64_f32_e32 v[0:1], v5
+; GCN-NEXT:    v_and_b32_e32 v6, 1, v5
+; GCN-NEXT:    v_cmp_gt_f64_e64 s[2:3], |v[2:3]|, v[0:1]
+; GCN-NEXT:    v_cmp_nlg_f64_e64 s[0:1], |v[2:3]|, v[0:1]
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v6
+; GCN-NEXT:    v_cndmask_b32_e64 v0, -1, 1, s[2:3]
+; GCN-NEXT:    v_add_u32_e32 v0, v5, v0
+; GCN-NEXT:    s_or_b64 vcc, s[0:1], vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
+; GCN-NEXT:    v_and_or_b32 v1, v3, s4, v0
+; GCN-NEXT:    v_bfe_u32 v0, v0, 16, 1
+; GCN-NEXT:    v_add3_u32 v0, v0, v1, s5
+; GCN-NEXT:    v_or_b32_e32 v1, 0x400000, v1
+; GCN-NEXT:    v_cmp_u_f64_e32 vcc, v[2:3], v[2:3]
+; GCN-NEXT:    s_mov_b32 s0, 0x7060302
+; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GCN-NEXT:    v_perm_b32 v0, v0, v4, s0
+; GCN-NEXT:    ; return to shader part epilog
+  %res = fptrunc <2 x double> %src to <2 x bfloat>
+  %cast = bitcast <2 x bfloat> %res to float
+  ret float %cast
+}
+define amdgpu_ps float @fptrunc_f32_f32_to_v2bf16(float %a, float %b) {
+; GCN-LABEL: fptrunc_f32_f32_to_v2bf16:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    v_bfe_u32 v2, v0, 16, 1
+; GCN-NEXT:    s_movk_i32 s0, 0x7fff
+; GCN-NEXT:    v_add3_u32 v2, v2, v0, s0
+; GCN-NEXT:    v_or_b32_e32 v3, 0x400000, v0
+; GCN-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
+; GCN-NEXT:    s_nop 1
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
+; GCN-NEXT:    v_bfe_u32 v2, v1, 16, 1
+; GCN-NEXT:    v_add3_u32 v2, v2, v1, s0
+; GCN-NEXT:    v_or_b32_e32 v3, 0x400000, v1
+; GCN-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
+; GCN-NEXT:    s_mov_b32 s0, 0x7060302
+; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc
+; GCN-NEXT:    v_perm_b32 v0, v1, v0, s0
+; GCN-NEXT:    ; return to shader part epilog
+entry:
+  %a.cvt = fptrunc float %a to bfloat
+  %b.cvt = fptrunc float %b to bfloat
+  %v2.1 = insertelement <2 x bfloat> undef, bfloat %a.cvt, i32 0
+  %v2.2 = insertelement <2 x bfloat> %v2.1, bfloat %b.cvt, i32 1
+  %ret = bitcast <2 x bfloat> %v2.2 to float
+  ret float %ret
+}
+define amdgpu_ps float @fptrunc_f32_f32_to_v2bf16_mods(float %a, float %b) {
+; GCN-LABEL: fptrunc_f32_f32_to_v2bf16_mods:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    v_xor_b32_e32 v2, 0x80000000, v0
+; GCN-NEXT:    v_bfe_u32 v3, v2, 16, 1
+; GCN-NEXT:    s_movk_i32 s0, 0x7fff
+; GCN-NEXT:    v_add3_u32 v3, v3, v2, s0
+; GCN-NEXT:    v_or_b32_e32 v2, 0x400000, v2
+; GCN-NEXT:    v_cmp_u_f32_e64 vcc, -v0, -v0
+; GCN-NEXT:    s_nop 1
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GCN-NEXT:    v_and_b32_e32 v2, 0x7fffffff, v1
+; GCN-NEXT:    v_bfe_u32 v3, v2, 16, 1
+; GCN-NEXT:    v_add3_u32 v3, v3, v2, s0
+; GCN-NEXT:    v_or_b32_e32 v2, 0x400000, v2
+; GCN-NEXT:    v_cmp_u_f32_e64 vcc, |v1|, |v1|
+; GCN-NEXT:    s_mov_b32 s0, 0x7060302
+; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc
+; GCN-NEXT:    v_perm_b32 v0, v1, v0, s0
+; GCN-NEXT:    ; return to shader part epilog
+entry:
+  %a.neg = fneg float %a
+  %a.cvt = fptrunc float %a.neg to bfloat
+  %b.abs = call float @llvm.fabs.f32(float %b)
+  %b.cvt = fptrunc float %b.abs to bfloat
+  %v2.1 = insertelement <2 x bfloat> undef, bfloat %a.cvt, i32 0
+  %v2.2 = insertelement <2 x bfloat> %v2.1, bfloat %b.cvt, i32 1
+  %ret = bitcast <2 x bfloat> %v2.2 to float
+  ret float %ret
+}
+define amdgpu_ps void @fptrunc_f32_to_bf16(float %a, ptr %out) {
+; GCN-LABEL: fptrunc_f32_to_bf16:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    v_mov_b32_e32 v3, v2
+; GCN-NEXT:    v_mov_b32_e32 v2, v1
+; GCN-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GCN-NEXT:    s_movk_i32 s0, 0x7fff
+; GCN-NEXT:    v_add3_u32 v1, v1, v0, s0
+; GCN-NEXT:    v_or_b32_e32 v4, 0x400000, v0
+; GCN-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
+; GCN-NEXT:    s_nop 1
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v1, v4, vcc
+; GCN-NEXT:    flat_store_short_d16_hi v[2:3], v0 sc0 sc1
+; GCN-NEXT:    s_endpgm
+entry:
+  %a.cvt = fptrunc float %a to bfloat
+  store bfloat %a.cvt, ptr %out
+  ret void
+}
+define amdgpu_ps void @fptrunc_f32_to_bf16_abs(float %a, ptr %out) {
+; GCN-LABEL: fptrunc_f32_to_bf16_abs:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    v_mov_b32_e32 v3, v2
+; GCN-NEXT:    v_mov_b32_e32 v2, v1
+; GCN-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v0
+; GCN-NEXT:    v_bfe_u32 v4, v1, 16, 1
+; GCN-NEXT:    s_movk_i32 s0, 0x7fff
+; GCN-NEXT:    v_add3_u32 v4, v4, v1, s0
+; GCN-NEXT:    v_or_b32_e32 v1, 0x400000, v1
+; GCN-NEXT:    v_cmp_u_f32_e64 vcc, |v0|, |v0|
+; GCN-NEXT:    s_nop 1
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GCN-NEXT:    flat_store_short_d16_hi v[2:3], v0 sc0 sc1
+; GCN-NEXT:    s_endpgm
+entry:
+  %a.abs = call float @llvm.fabs.f32(float %a)
+  %a.cvt = fptrunc float %a.abs to bfloat
+  store bfloat %a.cvt, ptr %out
+  ret void
+}
+define amdgpu_ps void @fptrunc_f32_to_bf16_neg(float %a, ptr %out) {
+; GCN-LABEL: fptrunc_f32_to_bf16_neg:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    v_mov_b32_e32 v3, v2
+; GCN-NEXT:    v_mov_b32_e32 v2, v1
+; GCN-NEXT:    v_xor_b32_e32 v1, 0x80000000, v0
+; GCN-NEXT:    v_bfe_u32 v4, v1, 16, 1
+; GCN-NEXT:    s_movk_i32 s0, 0x7fff
+; GCN-NEXT:    v_add3_u32 v4, v4, v1, s0
+; GCN-NEXT:    v_or_b32_e32 v1, 0x400000, v1
+; GCN-NEXT:    v_cmp_u_f32_e64 vcc, -v0, -v0
+; GCN-NEXT:    s_nop 1
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GCN-NEXT:    flat_store_short_d16_hi v[2:3], v0 sc0 sc1
+; GCN-NEXT:    s_endpgm
+entry:
+  %a.neg = fneg float %a
+  %a.cvt = fptrunc float %a.neg to bfloat
+  store bfloat %a.cvt, ptr %out
+  ret void
+}
+define amdgpu_ps void @fptrunc_f64_to_bf16(double %a, ptr %out) {
+; GCN-LABEL: fptrunc_f64_to_bf16:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    v_cvt_f32_f64_e64 v6, |v[0:1]|
+; GCN-NEXT:    v_cvt_f64_f32_e32 v[4:5], v6
+; GCN-NEXT:    v_and_b32_e32 v7, 1, v6
+; GCN-NEXT:    v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, v[4:5]
+; GCN-NEXT:    v_cmp_nlg_f64_e64 s[0:1], |v[0:1]|, v[4:5]
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v7
+; GCN-NEXT:    v_cndmask_b32_e64 v4, -1, 1, s[2:3]
+; GCN-NEXT:    v_add_u32_e32 v4, v6, v4
+; GCN-NEXT:    s_or_b64 vcc, s[0:1], vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
+; GCN-NEXT:    s_brev_b32 s0, 1
+; GCN-NEXT:    v_and_or_b32 v5, v1, s0, v4
+; GCN-NEXT:    v_bfe_u32 v4, v4, 16, 1
+; GCN-NEXT:    s_movk_i32 s0, 0x7fff
+; GCN-NEXT:    v_add3_u32 v4, v4, v5, s0
+; GCN-NEXT:    v_or_b32_e32 v5, 0x400000, v5
+; GCN-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
+; GCN-NEXT:    s_nop 1
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc
+; GCN-NEXT:    flat_store_short_d16_hi v[2:3], v0 sc0 sc1
+; GCN-NEXT:    s_endpgm
+entry:
+  %a.cvt = fptrunc double %a to bfloat
+  store bfloat %a.cvt, ptr %out
+  ret void
+}
+define amdgpu_ps void @fptrunc_f64_to_bf16_neg(double %a, ptr %out) {
+; GCN-LABEL: fptrunc_f64_to_bf16_neg:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    v_cvt_f32_f64_e64 v7, |v[0:1]|
+; GCN-NEXT:    v_cvt_f64_f32_e32 v[4:5], v7
+; GCN-NEXT:    v_and_b32_e32 v8, 1, v7
+; GCN-NEXT:    v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, v[4:5]
+; GCN-NEXT:    v_cmp_nlg_f64_e64 s[0:1], |v[0:1]|, v[4:5]
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v8
+; GCN-NEXT:    v_cndmask_b32_e64 v4, -1, 1, s[2:3]
+; GCN-NEXT:    v_add_u32_e32 v4, v7, v4
+; GCN-NEXT:    s_or_b64 vcc, s[0:1], vcc
+; GCN-NEXT:    s_brev_b32 s4, 1
+; GCN-NEXT:    v_xor_b32_e32 v6, 0x80000000, v1
+; GCN-NEXT:    v_cndmask_b32_e32 v4, v4, v7, vcc
+; GCN-NEXT:    v_and_or_b32 v5, v6, s4, v4
+; GCN-NEXT:    v_bfe_u32 v4, v4, 16, 1
+; GCN-NEXT:    s_movk_i32 s0, 0x7fff
+; GCN-NEXT:    v_add3_u32 v4, v4, v5, s0
+; GCN-NEXT:    v_or_b32_e32 v5, 0x400000, v5
+; GCN-NEXT:    v_cmp_u_f64_e64 vcc, -v[0:1], -v[0:1]
+; GCN-NEXT:    s_nop 1
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc
+; GCN-NEXT:    flat_store_short_d16_hi v[2:3], v0 sc0 sc1
+; GCN-NEXT:    s_endpgm
+entry:
+  %a.neg = fneg double %a
+  %a.cvt = fptrunc double %a.neg to bfloat
+  store bfloat %a.cvt, ptr %out
+  ret void
+}
+define amdgpu_ps void @fptrunc_f64_to_bf16_abs(double %a, ptr %out) {
+; GCN-LABEL: fptrunc_f64_to_bf16_abs:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    v_cvt_f32_f64_e64 v7, |v[0:1]|
+; GCN-NEXT:    v_cvt_f64_f32_e32 v[4:5], v7
+; GCN-NEXT:    v_and_b32_e32 v8, 1, v7
+; GCN-NEXT:    v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, v[4:5]
+; GCN-NEXT:    v_cmp_nlg_f64_e64 s[0:1], |v[0:1]|, v[4:5]
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v8
+; GCN-NEXT:    v_cndmask_b32_e64 v4, -1, 1, s[2:3]
+; GCN-NEXT:    v_add_u32_e32 v4, v7, v4
+; GCN-NEXT:    s_or_b64 vcc, s[0:1], vcc
+; GCN-NEXT:    v_and_b32_e32 v6, 0x7fffffff, v1
+; GCN-NEXT:    v_cndmask_b32_e32 v4, v4, v7, vcc
+; GCN-NEXT:    s_brev_b32 s0, 1
+; GCN-NEXT:    v_and_or_b32 v5, v6, s0, v4
+; GCN-NEXT:    v_bfe_u32 v4, v4, 16, 1
+; GCN-NEXT:    s_movk_i32 s0, 0x7fff
+; GCN-NEXT:    v_add3_u32 v4, v4, v5, s0
+; GCN-NEXT:    v_or_b32_e32 v5, 0x400000, v5
+; GCN-NEXT:    v_cmp_u_f64_e64 vcc, |v[0:1]|, |v[0:1]|
+; GCN-NEXT:    s_nop 1
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc
+; GCN-NEXT:    flat_store_short_d16_hi v[2:3], v0 sc0 sc1
+; GCN-NEXT:    s_endpgm
+entry:
+  %a.abs = call double @llvm.fabs.f64(double %a)
+  %a.cvt = fptrunc double %a.abs to bfloat
+  store bfloat %a.cvt, ptr %out
+  ret void
+}
+
+declare float @llvm.fabs.f32(float)
+declare double @llvm.fabs.f64(double)
+
diff --git a/llvm/test/CodeGen/AMDGPU/convergence-tokens.ll b/llvm/test/CodeGen/AMDGPU/convergence-tokens.ll
index 2ed6d7fd0f598d..6beccce9400e58 100644
--- a/llvm/test/CodeGen/AMDGPU/convergence-tokens.ll
+++ b/llvm/test/CodeGen/AMDGPU/convergence-tokens.ll
@@ -1,10 +1,12 @@
 ; RUN: llc --amdgpu-disable-structurizer -stop-after=amdgpu-isel -mtriple=amdgcn-- -mcpu=gfx900 -verify-machineinstrs -o - %s | FileCheck --check-prefixes=CHECK,ISEL %s
 ; RUN: llc --amdgpu-disable-structurizer -stop-after=dead-mi-elimination -mtriple=amdgcn-- -mcpu=gfx900 -verify-machineinstrs -o - %s | FileCheck --check-prefixes=CHECK,DEADMI %s
+; RUN: llc --amdgpu-disable-structurizer -global-isel -stop-after=irtranslator -mtriple=amdgcn-- -mcpu=gfx900 -verify-machineinstrs -o - %s | FileCheck %s --check-prefixes=CHECK,GISEL
 
 ; CHECK-LABEL: name:            basic_call
-;       CHECK:    [[TOKEN:%[0-9]+]]:sreg_64 = CONVERGENCECTRL_ENTRY
+;       CHECK:    [[TOKEN:%[0-9]+]]{{[^ ]*}} = CONVERGENCECTRL_ENTRY
 ;        ISEL:    {{.*}} SI_CALL_ISEL {{.*}}, @foo, [[TOKEN]], csr_amdgpu, {{.*}}
 ;      DEADMI:    {{.*}} SI_CALL {{.*}}, @foo, csr_amdgpu, {{.*}}, implicit [[TOKEN]]
+;       GISEL:    {{.*}} G_SI_CALL {{.*}}, @foo, csr_amdgpu, {{.*}}, implicit [[TOKEN]]
 define i32 @basic_call(i32 %src) #0 {
   %t = call token @llvm.experimental.convergence.entry()
   %r = call i32 @foo(i32 %src) [ "convergencectrl"(token %t) ]
@@ -12,10 +14,11 @@ define i32 @basic_call(i32 %src) #0 {
 }
 
 ; CHECK-LABEL: name:            basic_intrinsic
-;       CHECK:    [[TOKEN:%[0-9]+]]:sreg_64 = CONVERGENCECTRL_ANCHOR
+;       CHECK:    [[TOKEN:%[0-9]+]]{{[^ ]*}} = CONVERGENCECTRL_ANCHOR
 ;        ISEL:    CONVERGENCECTRL_GLUE [[TOKEN]]
 ;  DEADMI-NOT:    CONVERGENCECTRL_GLUE
-;       CHECK:    {{.*}} = V_READFIRSTLANE_B32 {{.*}}, implicit [[TOKEN]]
+;        ISEL:    {{.*}} = V_READFIRSTLANE_B32 {{.*}}, implicit [[TOKEN]]
+;       GISEL:    {{.*}} = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane){{.*}}, implicit [[TOKEN]]
 define i32 @basic_intrinsic(i32 %src) #0 {
   %t = call token @llvm.experimental.convergence.anchor()
   %r = call i32 @llvm.amdgcn.readfirstlane(i32 %src) [ "convergencectrl"(token %t) ]
@@ -30,12 +33,13 @@ define i32 @uncontrolled_call(i32 %src) #0 {
 }
 
 ; CHECK-LABEL: name:            basic_branch
-;       CHECK:  bb.0.entry:
-;       CHECK:    [[TOKEN:%[0-9]+]]:sreg_64 = CONVERGENCECTRL_ANCHOR
-;       CHECK:  bb.1.then:
+;       CHECK:  bb.[[#]].entry:
+;       CHECK:    [[TOKEN:%[0-9]+]]{{[^ ]*}} = CONVERGENCECTRL_ANCHOR
+;       CHECK:  bb.[[#]].then:
 ;        ISEL:    CONVERGENCECTRL_GLUE [[TOKEN]]
 ;  DEADMI-NOT:    CONVERGENCECTRL_GLUE
-;       CHECK:    {{.*}} = V_READFIRSTLANE_B32 {{.*}}, implicit [[TOKEN]]
+;        ISEL:    {{.*}} = V_READFIRSTLANE_B32 {{.*}}, implicit [[TOKEN]]
+;       GISEL:    {{.*}} = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane){{.*}}, implicit [[TOKEN]]
 define i32 @basic_branch(i32 %src, i1 %cond) #0 {
 entry:
   %t = call token @llvm.experimental.convergence.anchor()
@@ -52,12 +56,13 @@ else:
 }
 
 ; CHECK-LABEL: name:            basic_loop
-;       CHECK:    [[TOKEN:%[0-9]+]]:sreg_64 = CONVERGENCECTRL_ANCHOR
-;       CHECK:  bb.1.loop:
-;       CHECK:    [[LOOP:%[0-9]+]]:sreg_64 = CONVERGENCECTRL_LOOP [[TOKEN]]
+;       CHECK:    [[TOKEN:%[0-9]+]]{{[^ ]*}} = CONVERGENCECTRL_ANCHOR
+;       CHECK:  bb.[[#]].loop:
+;       CHECK:    [[LOOP:%[0-9]+]]{{[^ ]*}} = CONVERGENCECTRL_LOOP [[TOKEN]]
 ;        ISEL:    CONVERGENCECTRL_GLUE [[LOOP]]
 ;  DEADMI-NOT:    CONVERGENCECTRL_GLUE
-;       CHECK:    {{.*}} = V_READFIRSTLANE_B32 {{.*}}, implicit [[LOOP]]
+;        ISEL:    {{.*}} = V_READFIRSTLANE_B32 {{.*}}, implicit [[LOOP]]
+;       GISEL:    {{.*}} = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane){{.*}}, implicit [[LOOP]]
 define i32 @basic_loop(i32 %src, i1 %cond) #0 {
   %t1 = call token @llvm.experimental.convergence.anchor()
   br label %loop
@@ -71,6 +76,38 @@ end:
   ret i32 %r
 }
 
+; CHECK-LABEL: name: nested
+;       CHECK:    [[ENTRY:%[0-9]+]]{{[^ ]*}} = CONVERGENCECTRL_ENTRY
+;       CHECK:    [[ANCHOR:%[0-9]+]]{{[^ ]*}} = CONVERGENCECTRL_ANCHOR
+;        ISEL:    {{.*}} = V_READFIRSTLANE_B32 {{.*}}, implicit [[ANCHOR]]
+;       GISEL:    {{.*}} = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane){{.*}}, implicit [[ANCHOR]]
+;        ISEL:    {{.*}} = V_READFIRSTLANE_B32 {{.*}}, implicit [[ENTRY]]
+;       GISEL:    {{.*}} = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane){{.*}}, implicit [[ENTRY]]
+define i32 @nested(i32 %src) #0 {
+  %t1 = call token @llvm.experimental.convergence.entry()
+  %t2 = call token @llvm.experimental.convergence.anchor()
+  %r2 = call i32 @llvm.amdgcn.readfirstlane(i32 %src) [ "convergencectrl"(token %t2) ]
+  %r1 = call i32 @llvm.amdgcn.readfirstlane(i32 %src) [ "convergencectrl"(token %t1) ]
+  %sum = add i32 %r1, %r2
+  ret i32 %sum
+}
+
+; COM: FIXME: Tokens on tail-call have not been implemented for SelectionDAG
+; COM:        yet; the corresponding checks have been commented out.
+;
+; CHECK-LABEL: name:            tail_call_void_func_void
+;       GISEL:    [[TOKEN:%[0-9]+]]{{[^ ]*}} = CONVERGENCECTRL_ENTRY
+; COM:  CHECK:    [[TOKEN:%[0-9]+]]{{[^ ]*}} = CONVERGENCECTRL_ENTRY
+; COM:   ISEL:    {{.*}} SI_CALL_ISEL {{.*}}, @external_void_func_void, [[TOKEN]], csr_amdgpu, {{.*}}
+; COM: DEADMI:    {{.*}} SI_CALL {{.*}}, @external_void_func_void, csr_amdgpu, {{.*}}, implicit [[TOKEN]]
+;       GISEL:    {{.*}} SI_TCRETURN {{.*}}, @external_void_func_void, 0, csr_amdgpu, implicit [[TOKEN]]
+define void @tail_call_void_func_void() #0 {
+  %t1 = call token @llvm.experimental.convergence.entry()
+  tail call void @external_void_func_void() [ "convergencectrl"(token %t1) ]
+  ret void
+}
+
+declare hidden void @external_void_func_void() #0
 declare i32 @foo(i32 %x) #0
 
 declare i32 @llvm.amdgcn.readfirstlane(i32) #0
diff --git a/llvm/test/CodeGen/AMDGPU/fp-classify.ll b/llvm/test/CodeGen/AMDGPU/fp-classify.ll
index 6fa7df913812a3..18d2e52e8f9002 100644
--- a/llvm/test/CodeGen/AMDGPU/fp-classify.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp-classify.ll
@@ -618,16 +618,16 @@ define amdgpu_kernel void @test_not_isfinite_pattern_4_wrong_ord_test(ptr addrsp
 define amdgpu_kernel void @test_isinf_pattern_f16(ptr addrspace(1) nocapture %out, half %x) #0 {
 ; SI-LABEL: test_isinf_pattern_f16:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
-; SI-NEXT:    s_load_dword s0, s[0:1], 0xb
-; SI-NEXT:    s_mov_b32 s7, 0xf000
-; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    s_mov_b32 s1, 0x7f800000
+; SI-NEXT:    s_load_dword s4, s[0:1], 0xb
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e64 v0, |s0|
-; SI-NEXT:    v_cmp_eq_f32_e32 vcc, s1, v0
-; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT:    s_and_b32 s4, s4, 0x7fff
+; SI-NEXT:    s_cmpk_eq_i32 s4, 0x7c00
+; SI-NEXT:    s_cselect_b64 s[4:5], -1, 0
+; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: test_isinf_pattern_f16:
@@ -667,16 +667,19 @@ define amdgpu_kernel void @test_isinf_pattern_f16(ptr addrspace(1) nocapture %ou
 define amdgpu_kernel void @test_isfinite_pattern_0_f16(ptr addrspace(1) nocapture %out, half %x) #0 {
 ; SI-LABEL: test_isfinite_pattern_0_f16:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
-; SI-NEXT:    s_load_dword s0, s[0:1], 0xb
-; SI-NEXT:    s_mov_b32 s7, 0xf000
-; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    s_movk_i32 s1, 0x1f8
+; SI-NEXT:    s_load_dword s4, s[0:1], 0xb
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v0, s0
-; SI-NEXT:    v_cmp_class_f32_e64 s[0:1], v0, s1
-; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
-; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, s4
+; SI-NEXT:    s_and_b32 s4, s4, 0x7fff
+; SI-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; SI-NEXT:    s_cmpk_lg_i32 s4, 0x7c00
+; SI-NEXT:    s_cselect_b64 s[4:5], -1, 0
+; SI-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
+; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: test_isfinite_pattern_0_f16:
@@ -718,16 +721,19 @@ define amdgpu_kernel void @test_isfinite_pattern_0_f16(ptr addrspace(1) nocaptur
 define amdgpu_kernel void @test_isfinite_pattern_4_f16(ptr addrspace(1) nocapture %out, half %x) #0 {
 ; SI-LABEL: test_isfinite_pattern_4_f16:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
-; SI-NEXT:    s_load_dword s0, s[0:1], 0xb
-; SI-NEXT:    s_mov_b32 s7, 0xf000
-; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    s_movk_i32 s1, 0x1f8
+; SI-NEXT:    s_load_dword s4, s[0:1], 0xb
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v0, s0
-; SI-NEXT:    v_cmp_class_f32_e64 s[0:1], v0, s1
-; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
-; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, s4
+; SI-NEXT:    s_and_b32 s4, s4, 0x7fff
+; SI-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; SI-NEXT:    s_cmpk_lt_i32 s4, 0x7c00
+; SI-NEXT:    s_cselect_b64 s[4:5], -1, 0
+; SI-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
+; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: test_isfinite_pattern_4_f16:
diff --git a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll
new file mode 100644
index 00000000000000..b2311a87059c31
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll
@@ -0,0 +1,1510 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GISEL %s
+
+define i128 @fptosi_f64_to_i128(double %x) {
+; SDAG-LABEL: fptosi_f64_to_i128:
+; SDAG:       ; %bb.0: ; %fp-to-i-entry
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_mov_b32_e32 v5, v1
+; SDAG-NEXT:    v_bfe_u32 v6, v5, 20, 11
+; SDAG-NEXT:    v_mov_b32_e32 v7, 0
+; SDAG-NEXT:    s_mov_b64 s[4:5], 0x3fe
+; SDAG-NEXT:    v_mov_b32_e32 v4, v0
+; SDAG-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[6:7]
+; SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; SDAG-NEXT:    v_mov_b32_e32 v2, 0
+; SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SDAG-NEXT:    v_mov_b32_e32 v3, 0
+; SDAG-NEXT:    s_and_saveexec_b64 s[8:9], vcc
+; SDAG-NEXT:    s_cbranch_execz .LBB0_10
+; SDAG-NEXT:  ; %bb.1: ; %fp-to-i-if-end
+; SDAG-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffffb81, v6
+; SDAG-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v7, vcc
+; SDAG-NEXT:    v_addc_co_u32_e32 v2, vcc, -1, v7, vcc
+; SDAG-NEXT:    v_addc_co_u32_e32 v3, vcc, -1, v7, vcc
+; SDAG-NEXT:    s_mov_b64 s[6:7], 0xffffff7f
+; SDAG-NEXT:    v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1]
+; SDAG-NEXT:    v_cmp_lt_i64_e64 s[4:5], -1, v[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[2:3]
+; SDAG-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; SDAG-NEXT:    v_and_b32_e32 v0, 1, v0
+; SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
+; SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; SDAG-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; SDAG-NEXT:    s_xor_b64 s[10:11], exec, s[6:7]
+; SDAG-NEXT:    s_cbranch_execz .LBB0_7
+; SDAG-NEXT:  ; %bb.2: ; %fp-to-i-if-end9
+; SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; SDAG-NEXT:    v_add_co_u32_e32 v9, vcc, -1, v0
+; SDAG-NEXT:    v_addc_co_u32_e64 v10, s[6:7], 0, -1, vcc
+; SDAG-NEXT:    s_mov_b64 s[6:7], 0x432
+; SDAG-NEXT:    v_and_b32_e32 v0, 0xfffff, v5
+; SDAG-NEXT:    v_cmp_lt_u64_e32 vcc, s[6:7], v[6:7]
+; SDAG-NEXT:    v_cndmask_b32_e64 v8, -1, 0, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v11, -1, 1, s[4:5]
+; SDAG-NEXT:    v_or_b32_e32 v5, 0x100000, v0
+; SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; SDAG-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; SDAG-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; SDAG-NEXT:    s_xor_b64 s[12:13], exec, s[6:7]
+; SDAG-NEXT:    s_cbranch_execz .LBB0_4
+; SDAG-NEXT:  ; %bb.3: ; %fp-to-i-if-else
+; SDAG-NEXT:    v_sub_u32_e32 v0, 0x473, v6
+; SDAG-NEXT:    v_add_u32_e32 v2, 0xfffffb8d, v6
+; SDAG-NEXT:    v_add_u32_e32 v7, 0xfffffbcd, v6
+; SDAG-NEXT:    v_lshrrev_b64 v[0:1], v0, v[4:5]
+; SDAG-NEXT:    v_lshlrev_b64 v[2:3], v2, v[4:5]
+; SDAG-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v7
+; SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT:    v_cmp_ne_u32_e64 s[6:7], 0, v7
+; SDAG-NEXT:    v_cndmask_b32_e64 v6, 0, v1, s[6:7]
+; SDAG-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
+; SDAG-NEXT:    v_lshlrev_b64 v[0:1], v7, v[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, v2, s[6:7]
+; SDAG-NEXT:    v_cndmask_b32_e32 v12, 0, v0, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v7, 0, v1, vcc
+; SDAG-NEXT:    v_mad_u64_u32 v[0:1], s[6:7], v12, v11, 0
+; SDAG-NEXT:    v_mov_b32_e32 v3, 0
+; SDAG-NEXT:    v_mul_lo_u32 v13, v8, v2
+; SDAG-NEXT:    v_mad_u64_u32 v[4:5], s[6:7], v11, v2, 0
+; SDAG-NEXT:    v_mov_b32_e32 v2, v1
+; SDAG-NEXT:    v_mul_lo_u32 v6, v11, v6
+; SDAG-NEXT:    v_mad_u64_u32 v[1:2], s[6:7], v7, v11, v[2:3]
+; SDAG-NEXT:    v_mul_lo_u32 v10, v10, v12
+; SDAG-NEXT:    v_add3_u32 v5, v5, v6, v13
+; SDAG-NEXT:    v_mov_b32_e32 v6, v2
+; SDAG-NEXT:    v_mov_b32_e32 v2, v3
+; SDAG-NEXT:    v_mad_u64_u32 v[1:2], s[6:7], v12, v8, v[1:2]
+; SDAG-NEXT:    v_mad_u64_u32 v[3:4], s[6:7], v9, v12, v[4:5]
+; SDAG-NEXT:    v_add_co_u32_e32 v5, vcc, v6, v2
+; SDAG-NEXT:    v_addc_co_u32_e64 v6, s[6:7], 0, 0, vcc
+; SDAG-NEXT:    v_mul_lo_u32 v9, v9, v7
+; SDAG-NEXT:    v_mad_u64_u32 v[5:6], s[6:7], v7, v8, v[5:6]
+; SDAG-NEXT:    ; implicit-def: $vgpr11
+; SDAG-NEXT:    ; implicit-def: $vgpr8
+; SDAG-NEXT:    v_add3_u32 v4, v10, v4, v9
+; SDAG-NEXT:    v_add_co_u32_e32 v2, vcc, v5, v3
+; SDAG-NEXT:    v_addc_co_u32_e32 v3, vcc, v6, v4, vcc
+; SDAG-NEXT:    ; implicit-def: $vgpr6_vgpr7
+; SDAG-NEXT:    ; implicit-def: $vgpr4_vgpr5
+; SDAG-NEXT:    ; implicit-def: $vgpr9
+; SDAG-NEXT:    ; implicit-def: $vgpr10
+; SDAG-NEXT:  .LBB0_4: ; %Flow
+; SDAG-NEXT:    s_andn2_saveexec_b64 s[12:13], s[12:13]
+; SDAG-NEXT:    s_cbranch_execz .LBB0_6
+; SDAG-NEXT:  ; %bb.5: ; %fp-to-i-if-then12
+; SDAG-NEXT:    v_sub_u32_e32 v2, 0x433, v6
+; SDAG-NEXT:    v_lshrrev_b64 v[0:1], v2, v[4:5]
+; SDAG-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v2
+; SDAG-NEXT:    v_cmp_eq_u32_e64 s[6:7], 0, v2
+; SDAG-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; SDAG-NEXT:    v_cndmask_b32_e64 v6, v0, v4, s[6:7]
+; SDAG-NEXT:    v_cndmask_b32_e64 v5, v1, v5, s[6:7]
+; SDAG-NEXT:    v_mad_u64_u32 v[0:1], s[6:7], v6, v11, 0
+; SDAG-NEXT:    v_mov_b32_e32 v2, 0
+; SDAG-NEXT:    v_mad_u64_u32 v[3:4], s[6:7], v5, v11, v[1:2]
+; SDAG-NEXT:    v_mov_b32_e32 v7, v4
+; SDAG-NEXT:    v_mov_b32_e32 v4, v2
+; SDAG-NEXT:    v_mad_u64_u32 v[1:2], s[6:7], v6, v8, v[3:4]
+; SDAG-NEXT:    v_add_co_u32_e32 v2, vcc, v7, v2
+; SDAG-NEXT:    v_addc_co_u32_e64 v3, s[6:7], 0, 0, vcc
+; SDAG-NEXT:    v_mad_u64_u32 v[2:3], s[6:7], v5, v8, v[2:3]
+; SDAG-NEXT:    v_mad_u64_u32 v[2:3], s[6:7], v9, v6, v[2:3]
+; SDAG-NEXT:    v_mad_u64_u32 v[3:4], s[6:7], v10, v6, v[3:4]
+; SDAG-NEXT:    v_mad_i32_i24 v3, v9, v5, v3
+; SDAG-NEXT:  .LBB0_6: ; %Flow1
+; SDAG-NEXT:    s_or_b64 exec, exec, s[12:13]
+; SDAG-NEXT:  .LBB0_7: ; %Flow2
+; SDAG-NEXT:    s_andn2_saveexec_b64 s[6:7], s[10:11]
+; SDAG-NEXT:  ; %bb.8: ; %fp-to-i-if-then5
+; SDAG-NEXT:    v_bfrev_b32_e32 v0, 1
+; SDAG-NEXT:    v_bfrev_b32_e32 v1, -2
+; SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, -1, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v3, v0, v1, s[4:5]
+; SDAG-NEXT:    v_mov_b32_e32 v0, v2
+; SDAG-NEXT:    v_mov_b32_e32 v1, v2
+; SDAG-NEXT:  ; %bb.9: ; %Flow3
+; SDAG-NEXT:    s_or_b64 exec, exec, s[6:7]
+; SDAG-NEXT:  .LBB0_10: ; %fp-to-i-cleanup
+; SDAG-NEXT:    s_or_b64 exec, exec, s[8:9]
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: fptosi_f64_to_i128:
+; GISEL:       ; %bb.0: ; %fp-to-i-entry
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_mov_b32_e32 v5, v1
+; GISEL-NEXT:    v_mov_b32_e32 v4, v0
+; GISEL-NEXT:    v_lshrrev_b32_e32 v0, 20, v5
+; GISEL-NEXT:    v_and_b32_e32 v6, 0x7ff, v0
+; GISEL-NEXT:    v_mov_b32_e32 v0, 0x3ff
+; GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GISEL-NEXT:    v_mov_b32_e32 v7, 0
+; GISEL-NEXT:    v_cmp_ge_u64_e32 vcc, v[6:7], v[0:1]
+; GISEL-NEXT:    s_mov_b64 s[6:7], s[4:5]
+; GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GISEL-NEXT:    v_mov_b32_e32 v2, s6
+; GISEL-NEXT:    v_mov_b32_e32 v3, s7
+; GISEL-NEXT:    s_and_saveexec_b64 s[12:13], vcc
+; GISEL-NEXT:    s_cbranch_execz .LBB0_10
+; GISEL-NEXT:  ; %bb.1: ; %fp-to-i-if-end
+; GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffffb81, v6
+; GISEL-NEXT:    v_mov_b32_e32 v2, 0xffffff80
+; GISEL-NEXT:    v_addc_co_u32_e64 v1, s[6:7], 0, -1, vcc
+; GISEL-NEXT:    v_mov_b32_e32 v3, 0
+; GISEL-NEXT:    v_addc_co_u32_e64 v8, s[6:7], 0, -1, s[6:7]
+; GISEL-NEXT:    v_cmp_ge_u64_e32 vcc, v[0:1], v[2:3]
+; GISEL-NEXT:    v_addc_co_u32_e64 v9, s[6:7], 0, -1, s[6:7]
+; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[8:9]
+; GISEL-NEXT:    v_cmp_lt_i64_e64 s[4:5], -1, v[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
+; GISEL-NEXT:    v_and_b32_e32 v0, 1, v0
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; GISEL-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GISEL-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; GISEL-NEXT:    s_xor_b64 s[14:15], exec, s[6:7]
+; GISEL-NEXT:    s_cbranch_execz .LBB0_7
+; GISEL-NEXT:  ; %bb.2: ; %fp-to-i-if-end9
+; GISEL-NEXT:    s_xor_b64 s[6:7], s[4:5], -1
+; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[6:7]
+; GISEL-NEXT:    v_and_b32_e32 v0, 1, v0
+; GISEL-NEXT:    v_lshlrev_b16_e32 v2, 1, v0
+; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[6:7]
+; GISEL-NEXT:    v_lshlrev_b16_e32 v3, 2, v0
+; GISEL-NEXT:    v_lshlrev_b16_e32 v8, 3, v0
+; GISEL-NEXT:    v_lshlrev_b16_e32 v9, 4, v0
+; GISEL-NEXT:    v_lshlrev_b16_e32 v10, 5, v0
+; GISEL-NEXT:    v_lshlrev_b16_e32 v11, 6, v0
+; GISEL-NEXT:    v_lshlrev_b16_e32 v12, 7, v0
+; GISEL-NEXT:    v_lshlrev_b16_e32 v13, 8, v0
+; GISEL-NEXT:    v_lshlrev_b16_e32 v14, 9, v0
+; GISEL-NEXT:    v_lshlrev_b16_e32 v15, 10, v0
+; GISEL-NEXT:    v_lshlrev_b16_e32 v16, 11, v0
+; GISEL-NEXT:    v_lshlrev_b16_e32 v17, 12, v0
+; GISEL-NEXT:    v_lshlrev_b16_e32 v18, 13, v0
+; GISEL-NEXT:    v_lshlrev_b16_e32 v19, 14, v0
+; GISEL-NEXT:    v_lshlrev_b16_e32 v20, 15, v0
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v2
+; GISEL-NEXT:    v_or_b32_e32 v1, v1, v2
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v3
+; GISEL-NEXT:    v_or_b32_e32 v1, v1, v3
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v8
+; GISEL-NEXT:    v_or_b32_e32 v1, v1, v8
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v9
+; GISEL-NEXT:    v_or_b32_e32 v1, v1, v9
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v10
+; GISEL-NEXT:    v_or_b32_e32 v1, v1, v10
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v11
+; GISEL-NEXT:    v_or_b32_e32 v1, v1, v11
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v12
+; GISEL-NEXT:    v_or_b32_e32 v1, v1, v12
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v13
+; GISEL-NEXT:    v_or_b32_e32 v1, v1, v13
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v14
+; GISEL-NEXT:    v_or_b32_e32 v1, v1, v14
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v15
+; GISEL-NEXT:    v_or_b32_e32 v1, v1, v15
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v16
+; GISEL-NEXT:    v_or_b32_e32 v1, v1, v16
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v17
+; GISEL-NEXT:    v_or_b32_e32 v1, v1, v17
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v18
+; GISEL-NEXT:    v_or_b32_e32 v1, v1, v18
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v19
+; GISEL-NEXT:    v_or_b32_e32 v1, v1, v19
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v20
+; GISEL-NEXT:    v_or_b32_e32 v1, v1, v20
+; GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
+; GISEL-NEXT:    v_lshl_or_b32 v10, v0, 16, v0
+; GISEL-NEXT:    v_or3_b32 v8, v1, v2, 1
+; GISEL-NEXT:    v_or3_b32 v9, v0, v2, 0
+; GISEL-NEXT:    v_mov_b32_e32 v0, 0x433
+; GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GISEL-NEXT:    v_and_b32_e32 v2, 0xfffff, v5
+; GISEL-NEXT:    v_cmp_ge_u64_e32 vcc, v[6:7], v[0:1]
+; GISEL-NEXT:    v_or_b32_e32 v5, 0x100000, v2
+; GISEL-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GISEL-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; GISEL-NEXT:    s_xor_b64 s[16:17], exec, s[6:7]
+; GISEL-NEXT:    s_cbranch_execz .LBB0_4
+; GISEL-NEXT:  ; %bb.3: ; %fp-to-i-if-else
+; GISEL-NEXT:    v_add_co_u32_e32 v6, vcc, 0xfffffbcd, v6
+; GISEL-NEXT:    v_lshlrev_b64 v[0:1], v6, v[4:5]
+; GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v6
+; GISEL-NEXT:    v_cndmask_b32_e32 v11, 0, v0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v12, 0, v1, vcc
+; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[6:7], v11, v10, 0
+; GISEL-NEXT:    v_subrev_u32_e32 v7, 64, v6
+; GISEL-NEXT:    v_sub_u32_e32 v2, 64, v6
+; GISEL-NEXT:    v_lshrrev_b64 v[2:3], v2, v[4:5]
+; GISEL-NEXT:    v_lshlrev_b64 v[4:5], v7, v[4:5]
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[6:7], 0, v6
+; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[8:9], v12, v9, v[0:1]
+; GISEL-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v13, v2, 0, s[6:7]
+; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[8:9], v13, v8, v[6:7]
+; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[8:9], v11, v8, 0
+; GISEL-NEXT:    v_mov_b32_e32 v2, v6
+; GISEL-NEXT:    v_mul_lo_u32 v6, v11, v10
+; GISEL-NEXT:    v_mad_u64_u32 v[1:2], s[8:9], v11, v9, v[1:2]
+; GISEL-NEXT:    v_mul_lo_u32 v4, v12, v10
+; GISEL-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; GISEL-NEXT:    v_mad_u64_u32 v[1:2], s[10:11], v12, v8, v[1:2]
+; GISEL-NEXT:    v_addc_co_u32_e64 v6, s[10:11], v7, v6, s[10:11]
+; GISEL-NEXT:    v_addc_co_u32_e64 v4, s[8:9], v6, v4, s[8:9]
+; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[8:9], v13, v9, v[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e64 v3, v3, 0, s[6:7]
+; GISEL-NEXT:    ; implicit-def: $vgpr10
+; GISEL-NEXT:    ; implicit-def: $vgpr9
+; GISEL-NEXT:    v_mad_u64_u32 v[3:4], s[6:7], v3, v8, v[6:7]
+; GISEL-NEXT:    ; implicit-def: $vgpr6
+; GISEL-NEXT:    ; implicit-def: $vgpr4_vgpr5
+; GISEL-NEXT:    ; implicit-def: $vgpr8
+; GISEL-NEXT:  .LBB0_4: ; %Flow
+; GISEL-NEXT:    s_andn2_saveexec_b64 s[8:9], s[16:17]
+; GISEL-NEXT:    s_cbranch_execz .LBB0_6
+; GISEL-NEXT:  ; %bb.5: ; %fp-to-i-if-then12
+; GISEL-NEXT:    v_sub_co_u32_e32 v6, vcc, 0x433, v6
+; GISEL-NEXT:    v_subrev_u32_e32 v2, 64, v6
+; GISEL-NEXT:    v_lshrrev_b64 v[0:1], v6, v[4:5]
+; GISEL-NEXT:    v_lshrrev_b64 v[2:3], v2, 0
+; GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v6
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v6
+; GISEL-NEXT:    v_cndmask_b32_e32 v4, v0, v4, vcc
+; GISEL-NEXT:    v_mad_u64_u32 v[2:3], s[6:7], v4, v10, 0
+; GISEL-NEXT:    v_cndmask_b32_e32 v5, v1, v5, vcc
+; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[6:7], v4, v8, 0
+; GISEL-NEXT:    v_mad_u64_u32 v[2:3], s[6:7], v5, v9, v[2:3]
+; GISEL-NEXT:    v_mul_lo_u32 v6, v5, v10
+; GISEL-NEXT:    v_mad_u64_u32 v[1:2], vcc, v4, v9, v[1:2]
+; GISEL-NEXT:    v_mul_lo_u32 v4, v4, v10
+; GISEL-NEXT:    v_mad_u64_u32 v[1:2], s[6:7], v5, v8, v[1:2]
+; GISEL-NEXT:    v_addc_co_u32_e64 v3, s[6:7], v3, v4, s[6:7]
+; GISEL-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v6, vcc
+; GISEL-NEXT:  .LBB0_6: ; %Flow1
+; GISEL-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GISEL-NEXT:  .LBB0_7: ; %Flow2
+; GISEL-NEXT:    s_andn2_saveexec_b64 s[6:7], s[14:15]
+; GISEL-NEXT:    s_cbranch_execz .LBB0_9
+; GISEL-NEXT:  ; %bb.8: ; %fp-to-i-if-then5
+; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s[4:5]
+; GISEL-NEXT:    v_and_b32_e32 v1, 1, v1
+; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GISEL-NEXT:    v_lshlrev_b32_e32 v2, 1, v1
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v2
+; GISEL-NEXT:    v_lshlrev_b32_e32 v3, 2, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v4, 3, v1
+; GISEL-NEXT:    v_or_b32_e32 v2, v1, v2
+; GISEL-NEXT:    v_or3_b32 v0, v0, v3, v4
+; GISEL-NEXT:    v_lshlrev_b32_e32 v5, 4, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v6, 5, v1
+; GISEL-NEXT:    v_or3_b32 v2, v2, v3, v4
+; GISEL-NEXT:    v_or3_b32 v0, v0, v5, v6
+; GISEL-NEXT:    v_lshlrev_b32_e32 v7, 6, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v8, 7, v1
+; GISEL-NEXT:    v_or3_b32 v2, v2, v5, v6
+; GISEL-NEXT:    v_or3_b32 v0, v0, v7, v8
+; GISEL-NEXT:    v_lshlrev_b32_e32 v9, 8, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v10, 9, v1
+; GISEL-NEXT:    v_or3_b32 v2, v2, v7, v8
+; GISEL-NEXT:    v_or3_b32 v0, v0, v9, v10
+; GISEL-NEXT:    v_lshlrev_b32_e32 v11, 10, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v12, 11, v1
+; GISEL-NEXT:    v_or3_b32 v2, v2, v9, v10
+; GISEL-NEXT:    v_or3_b32 v0, v0, v11, v12
+; GISEL-NEXT:    v_lshlrev_b32_e32 v13, 12, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v14, 13, v1
+; GISEL-NEXT:    v_or3_b32 v2, v2, v11, v12
+; GISEL-NEXT:    v_or3_b32 v0, v0, v13, v14
+; GISEL-NEXT:    v_lshlrev_b32_e32 v15, 14, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v16, 15, v1
+; GISEL-NEXT:    v_or3_b32 v2, v2, v13, v14
+; GISEL-NEXT:    v_or3_b32 v0, v0, v15, v16
+; GISEL-NEXT:    v_lshlrev_b32_e32 v17, 16, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v18, 17, v1
+; GISEL-NEXT:    v_or3_b32 v2, v2, v15, v16
+; GISEL-NEXT:    v_or3_b32 v0, v0, v17, v18
+; GISEL-NEXT:    v_lshlrev_b32_e32 v19, 18, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v20, 19, v1
+; GISEL-NEXT:    v_or3_b32 v2, v2, v17, v18
+; GISEL-NEXT:    v_or3_b32 v0, v0, v19, v20
+; GISEL-NEXT:    v_lshlrev_b32_e32 v3, 20, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v4, 21, v1
+; GISEL-NEXT:    v_or3_b32 v2, v2, v19, v20
+; GISEL-NEXT:    v_or3_b32 v0, v0, v3, v4
+; GISEL-NEXT:    v_lshlrev_b32_e32 v5, 22, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v6, 23, v1
+; GISEL-NEXT:    v_or3_b32 v2, v2, v3, v4
+; GISEL-NEXT:    v_or3_b32 v0, v0, v5, v6
+; GISEL-NEXT:    v_lshlrev_b32_e32 v7, 24, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v8, 25, v1
+; GISEL-NEXT:    v_or3_b32 v2, v2, v5, v6
+; GISEL-NEXT:    v_or3_b32 v0, v0, v7, v8
+; GISEL-NEXT:    v_lshlrev_b32_e32 v9, 26, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v10, 27, v1
+; GISEL-NEXT:    v_or3_b32 v2, v2, v7, v8
+; GISEL-NEXT:    v_or3_b32 v0, v0, v9, v10
+; GISEL-NEXT:    v_lshlrev_b32_e32 v11, 28, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v12, 29, v1
+; GISEL-NEXT:    v_or3_b32 v2, v2, v9, v10
+; GISEL-NEXT:    v_or3_b32 v0, v0, v11, v12
+; GISEL-NEXT:    v_lshlrev_b32_e32 v13, 30, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v1, 31, v1
+; GISEL-NEXT:    v_or3_b32 v2, v2, v11, v12
+; GISEL-NEXT:    v_or3_b32 v0, v0, v13, v1
+; GISEL-NEXT:    v_or3_b32 v1, v2, v13, v1
+; GISEL-NEXT:    v_add_u32_e32 v3, 0x80000000, v1
+; GISEL-NEXT:    v_mov_b32_e32 v2, v1
+; GISEL-NEXT:  .LBB0_9: ; %Flow3
+; GISEL-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GISEL-NEXT:  .LBB0_10: ; %fp-to-i-cleanup
+; GISEL-NEXT:    s_or_b64 exec, exec, s[12:13]
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %cvt = fptosi double %x to i128
+  ret i128 %cvt
+}
+
+define i128 @fptoui_f64_to_i128(double %x) {
+; SDAG-LABEL: fptoui_f64_to_i128:
+; SDAG:       ; %bb.0: ; %fp-to-i-entry
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_mov_b32_e32 v5, v1
+; SDAG-NEXT:    v_bfe_u32 v6, v5, 20, 11
+; SDAG-NEXT:    v_mov_b32_e32 v7, 0
+; SDAG-NEXT:    s_mov_b64 s[4:5], 0x3fe
+; SDAG-NEXT:    v_mov_b32_e32 v4, v0
+; SDAG-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[6:7]
+; SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; SDAG-NEXT:    v_mov_b32_e32 v2, 0
+; SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SDAG-NEXT:    v_mov_b32_e32 v3, 0
+; SDAG-NEXT:    s_and_saveexec_b64 s[8:9], vcc
+; SDAG-NEXT:    s_cbranch_execz .LBB1_10
+; SDAG-NEXT:  ; %bb.1: ; %fp-to-i-if-end
+; SDAG-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffffb81, v6
+; SDAG-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v7, vcc
+; SDAG-NEXT:    v_addc_co_u32_e32 v2, vcc, -1, v7, vcc
+; SDAG-NEXT:    v_addc_co_u32_e32 v3, vcc, -1, v7, vcc
+; SDAG-NEXT:    s_mov_b64 s[6:7], 0xffffff7f
+; SDAG-NEXT:    v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1]
+; SDAG-NEXT:    v_cmp_lt_i64_e64 s[4:5], -1, v[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[2:3]
+; SDAG-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; SDAG-NEXT:    v_and_b32_e32 v0, 1, v0
+; SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
+; SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; SDAG-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; SDAG-NEXT:    s_xor_b64 s[10:11], exec, s[6:7]
+; SDAG-NEXT:    s_cbranch_execz .LBB1_7
+; SDAG-NEXT:  ; %bb.2: ; %fp-to-i-if-end9
+; SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; SDAG-NEXT:    v_add_co_u32_e32 v9, vcc, -1, v0
+; SDAG-NEXT:    v_addc_co_u32_e64 v10, s[6:7], 0, -1, vcc
+; SDAG-NEXT:    s_mov_b64 s[6:7], 0x432
+; SDAG-NEXT:    v_and_b32_e32 v0, 0xfffff, v5
+; SDAG-NEXT:    v_cmp_lt_u64_e32 vcc, s[6:7], v[6:7]
+; SDAG-NEXT:    v_cndmask_b32_e64 v8, -1, 0, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v11, -1, 1, s[4:5]
+; SDAG-NEXT:    v_or_b32_e32 v5, 0x100000, v0
+; SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; SDAG-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; SDAG-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; SDAG-NEXT:    s_xor_b64 s[12:13], exec, s[6:7]
+; SDAG-NEXT:    s_cbranch_execz .LBB1_4
+; SDAG-NEXT:  ; %bb.3: ; %fp-to-i-if-else
+; SDAG-NEXT:    v_sub_u32_e32 v0, 0x473, v6
+; SDAG-NEXT:    v_add_u32_e32 v2, 0xfffffb8d, v6
+; SDAG-NEXT:    v_add_u32_e32 v7, 0xfffffbcd, v6
+; SDAG-NEXT:    v_lshrrev_b64 v[0:1], v0, v[4:5]
+; SDAG-NEXT:    v_lshlrev_b64 v[2:3], v2, v[4:5]
+; SDAG-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v7
+; SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT:    v_cmp_ne_u32_e64 s[6:7], 0, v7
+; SDAG-NEXT:    v_cndmask_b32_e64 v6, 0, v1, s[6:7]
+; SDAG-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
+; SDAG-NEXT:    v_lshlrev_b64 v[0:1], v7, v[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, v2, s[6:7]
+; SDAG-NEXT:    v_cndmask_b32_e32 v12, 0, v0, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v7, 0, v1, vcc
+; SDAG-NEXT:    v_mad_u64_u32 v[0:1], s[6:7], v12, v11, 0
+; SDAG-NEXT:    v_mov_b32_e32 v3, 0
+; SDAG-NEXT:    v_mul_lo_u32 v13, v8, v2
+; SDAG-NEXT:    v_mad_u64_u32 v[4:5], s[6:7], v11, v2, 0
+; SDAG-NEXT:    v_mov_b32_e32 v2, v1
+; SDAG-NEXT:    v_mul_lo_u32 v6, v11, v6
+; SDAG-NEXT:    v_mad_u64_u32 v[1:2], s[6:7], v7, v11, v[2:3]
+; SDAG-NEXT:    v_mul_lo_u32 v10, v10, v12
+; SDAG-NEXT:    v_add3_u32 v5, v5, v6, v13
+; SDAG-NEXT:    v_mov_b32_e32 v6, v2
+; SDAG-NEXT:    v_mov_b32_e32 v2, v3
+; SDAG-NEXT:    v_mad_u64_u32 v[1:2], s[6:7], v12, v8, v[1:2]
+; SDAG-NEXT:    v_mad_u64_u32 v[3:4], s[6:7], v9, v12, v[4:5]
+; SDAG-NEXT:    v_add_co_u32_e32 v5, vcc, v6, v2
+; SDAG-NEXT:    v_addc_co_u32_e64 v6, s[6:7], 0, 0, vcc
+; SDAG-NEXT:    v_mul_lo_u32 v9, v9, v7
+; SDAG-NEXT:    v_mad_u64_u32 v[5:6], s[6:7], v7, v8, v[5:6]
+; SDAG-NEXT:    ; implicit-def: $vgpr11
+; SDAG-NEXT:    ; implicit-def: $vgpr8
+; SDAG-NEXT:    v_add3_u32 v4, v10, v4, v9
+; SDAG-NEXT:    v_add_co_u32_e32 v2, vcc, v5, v3
+; SDAG-NEXT:    v_addc_co_u32_e32 v3, vcc, v6, v4, vcc
+; SDAG-NEXT:    ; implicit-def: $vgpr6_vgpr7
+; SDAG-NEXT:    ; implicit-def: $vgpr4_vgpr5
+; SDAG-NEXT:    ; implicit-def: $vgpr9
+; SDAG-NEXT:    ; implicit-def: $vgpr10
+; SDAG-NEXT:  .LBB1_4: ; %Flow
+; SDAG-NEXT:    s_andn2_saveexec_b64 s[12:13], s[12:13]
+; SDAG-NEXT:    s_cbranch_execz .LBB1_6
+; SDAG-NEXT:  ; %bb.5: ; %fp-to-i-if-then12
+; SDAG-NEXT:    v_sub_u32_e32 v2, 0x433, v6
+; SDAG-NEXT:    v_lshrrev_b64 v[0:1], v2, v[4:5]
+; SDAG-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v2
+; SDAG-NEXT:    v_cmp_eq_u32_e64 s[6:7], 0, v2
+; SDAG-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; SDAG-NEXT:    v_cndmask_b32_e64 v6, v0, v4, s[6:7]
+; SDAG-NEXT:    v_cndmask_b32_e64 v5, v1, v5, s[6:7]
+; SDAG-NEXT:    v_mad_u64_u32 v[0:1], s[6:7], v6, v11, 0
+; SDAG-NEXT:    v_mov_b32_e32 v2, 0
+; SDAG-NEXT:    v_mad_u64_u32 v[3:4], s[6:7], v5, v11, v[1:2]
+; SDAG-NEXT:    v_mov_b32_e32 v7, v4
+; SDAG-NEXT:    v_mov_b32_e32 v4, v2
+; SDAG-NEXT:    v_mad_u64_u32 v[1:2], s[6:7], v6, v8, v[3:4]
+; SDAG-NEXT:    v_add_co_u32_e32 v2, vcc, v7, v2
+; SDAG-NEXT:    v_addc_co_u32_e64 v3, s[6:7], 0, 0, vcc
+; SDAG-NEXT:    v_mad_u64_u32 v[2:3], s[6:7], v5, v8, v[2:3]
+; SDAG-NEXT:    v_mad_u64_u32 v[2:3], s[6:7], v9, v6, v[2:3]
+; SDAG-NEXT:    v_mad_u64_u32 v[3:4], s[6:7], v10, v6, v[3:4]
+; SDAG-NEXT:    v_mad_i32_i24 v3, v9, v5, v3
+; SDAG-NEXT:  .LBB1_6: ; %Flow1
+; SDAG-NEXT:    s_or_b64 exec, exec, s[12:13]
+; SDAG-NEXT:  .LBB1_7: ; %Flow2
+; SDAG-NEXT:    s_andn2_saveexec_b64 s[6:7], s[10:11]
+; SDAG-NEXT:  ; %bb.8: ; %fp-to-i-if-then5
+; SDAG-NEXT:    v_bfrev_b32_e32 v0, 1
+; SDAG-NEXT:    v_bfrev_b32_e32 v1, -2
+; SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, -1, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v3, v0, v1, s[4:5]
+; SDAG-NEXT:    v_mov_b32_e32 v0, v2
+; SDAG-NEXT:    v_mov_b32_e32 v1, v2
+; SDAG-NEXT:  ; %bb.9: ; %Flow3
+; SDAG-NEXT:    s_or_b64 exec, exec, s[6:7]
+; SDAG-NEXT:  .LBB1_10: ; %fp-to-i-cleanup
+; SDAG-NEXT:    s_or_b64 exec, exec, s[8:9]
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: fptoui_f64_to_i128:
+; GISEL:       ; %bb.0: ; %fp-to-i-entry
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_mov_b32_e32 v5, v1
+; GISEL-NEXT:    v_mov_b32_e32 v4, v0
+; GISEL-NEXT:    v_lshrrev_b32_e32 v0, 20, v5
+; GISEL-NEXT:    v_and_b32_e32 v6, 0x7ff, v0
+; GISEL-NEXT:    v_mov_b32_e32 v0, 0x3ff
+; GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GISEL-NEXT:    v_mov_b32_e32 v7, 0
+; GISEL-NEXT:    v_cmp_ge_u64_e32 vcc, v[6:7], v[0:1]
+; GISEL-NEXT:    s_mov_b64 s[6:7], s[4:5]
+; GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GISEL-NEXT:    v_mov_b32_e32 v2, s6
+; GISEL-NEXT:    v_mov_b32_e32 v3, s7
+; GISEL-NEXT:    s_and_saveexec_b64 s[12:13], vcc
+; GISEL-NEXT:    s_cbranch_execz .LBB1_10
+; GISEL-NEXT:  ; %bb.1: ; %fp-to-i-if-end
+; GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffffb81, v6
+; GISEL-NEXT:    v_mov_b32_e32 v2, 0xffffff80
+; GISEL-NEXT:    v_addc_co_u32_e64 v1, s[6:7], 0, -1, vcc
+; GISEL-NEXT:    v_mov_b32_e32 v3, 0
+; GISEL-NEXT:    v_addc_co_u32_e64 v8, s[6:7], 0, -1, s[6:7]
+; GISEL-NEXT:    v_cmp_ge_u64_e32 vcc, v[0:1], v[2:3]
+; GISEL-NEXT:    v_addc_co_u32_e64 v9, s[6:7], 0, -1, s[6:7]
+; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[8:9]
+; GISEL-NEXT:    v_cmp_lt_i64_e64 s[4:5], -1, v[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
+; GISEL-NEXT:    v_and_b32_e32 v0, 1, v0
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; GISEL-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GISEL-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; GISEL-NEXT:    s_xor_b64 s[14:15], exec, s[6:7]
+; GISEL-NEXT:    s_cbranch_execz .LBB1_7
+; GISEL-NEXT:  ; %bb.2: ; %fp-to-i-if-end9
+; GISEL-NEXT:    s_xor_b64 s[6:7], s[4:5], -1
+; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[6:7]
+; GISEL-NEXT:    v_and_b32_e32 v0, 1, v0
+; GISEL-NEXT:    v_lshlrev_b16_e32 v2, 1, v0
+; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[6:7]
+; GISEL-NEXT:    v_lshlrev_b16_e32 v3, 2, v0
+; GISEL-NEXT:    v_lshlrev_b16_e32 v8, 3, v0
+; GISEL-NEXT:    v_lshlrev_b16_e32 v9, 4, v0
+; GISEL-NEXT:    v_lshlrev_b16_e32 v10, 5, v0
+; GISEL-NEXT:    v_lshlrev_b16_e32 v11, 6, v0
+; GISEL-NEXT:    v_lshlrev_b16_e32 v12, 7, v0
+; GISEL-NEXT:    v_lshlrev_b16_e32 v13, 8, v0
+; GISEL-NEXT:    v_lshlrev_b16_e32 v14, 9, v0
+; GISEL-NEXT:    v_lshlrev_b16_e32 v15, 10, v0
+; GISEL-NEXT:    v_lshlrev_b16_e32 v16, 11, v0
+; GISEL-NEXT:    v_lshlrev_b16_e32 v17, 12, v0
+; GISEL-NEXT:    v_lshlrev_b16_e32 v18, 13, v0
+; GISEL-NEXT:    v_lshlrev_b16_e32 v19, 14, v0
+; GISEL-NEXT:    v_lshlrev_b16_e32 v20, 15, v0
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v2
+; GISEL-NEXT:    v_or_b32_e32 v1, v1, v2
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v3
+; GISEL-NEXT:    v_or_b32_e32 v1, v1, v3
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v8
+; GISEL-NEXT:    v_or_b32_e32 v1, v1, v8
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v9
+; GISEL-NEXT:    v_or_b32_e32 v1, v1, v9
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v10
+; GISEL-NEXT:    v_or_b32_e32 v1, v1, v10
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v11
+; GISEL-NEXT:    v_or_b32_e32 v1, v1, v11
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v12
+; GISEL-NEXT:    v_or_b32_e32 v1, v1, v12
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v13
+; GISEL-NEXT:    v_or_b32_e32 v1, v1, v13
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v14
+; GISEL-NEXT:    v_or_b32_e32 v1, v1, v14
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v15
+; GISEL-NEXT:    v_or_b32_e32 v1, v1, v15
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v16
+; GISEL-NEXT:    v_or_b32_e32 v1, v1, v16
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v17
+; GISEL-NEXT:    v_or_b32_e32 v1, v1, v17
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v18
+; GISEL-NEXT:    v_or_b32_e32 v1, v1, v18
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v19
+; GISEL-NEXT:    v_or_b32_e32 v1, v1, v19
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v20
+; GISEL-NEXT:    v_or_b32_e32 v1, v1, v20
+; GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
+; GISEL-NEXT:    v_lshl_or_b32 v10, v0, 16, v0
+; GISEL-NEXT:    v_or3_b32 v8, v1, v2, 1
+; GISEL-NEXT:    v_or3_b32 v9, v0, v2, 0
+; GISEL-NEXT:    v_mov_b32_e32 v0, 0x433
+; GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GISEL-NEXT:    v_and_b32_e32 v2, 0xfffff, v5
+; GISEL-NEXT:    v_cmp_ge_u64_e32 vcc, v[6:7], v[0:1]
+; GISEL-NEXT:    v_or_b32_e32 v5, 0x100000, v2
+; GISEL-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GISEL-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; GISEL-NEXT:    s_xor_b64 s[16:17], exec, s[6:7]
+; GISEL-NEXT:    s_cbranch_execz .LBB1_4
+; GISEL-NEXT:  ; %bb.3: ; %fp-to-i-if-else
+; GISEL-NEXT:    v_add_co_u32_e32 v6, vcc, 0xfffffbcd, v6
+; GISEL-NEXT:    v_lshlrev_b64 v[0:1], v6, v[4:5]
+; GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v6
+; GISEL-NEXT:    v_cndmask_b32_e32 v11, 0, v0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v12, 0, v1, vcc
+; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[6:7], v11, v10, 0
+; GISEL-NEXT:    v_subrev_u32_e32 v7, 64, v6
+; GISEL-NEXT:    v_sub_u32_e32 v2, 64, v6
+; GISEL-NEXT:    v_lshrrev_b64 v[2:3], v2, v[4:5]
+; GISEL-NEXT:    v_lshlrev_b64 v[4:5], v7, v[4:5]
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[6:7], 0, v6
+; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[8:9], v12, v9, v[0:1]
+; GISEL-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v13, v2, 0, s[6:7]
+; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[8:9], v13, v8, v[6:7]
+; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[8:9], v11, v8, 0
+; GISEL-NEXT:    v_mov_b32_e32 v2, v6
+; GISEL-NEXT:    v_mul_lo_u32 v6, v11, v10
+; GISEL-NEXT:    v_mad_u64_u32 v[1:2], s[8:9], v11, v9, v[1:2]
+; GISEL-NEXT:    v_mul_lo_u32 v4, v12, v10
+; GISEL-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; GISEL-NEXT:    v_mad_u64_u32 v[1:2], s[10:11], v12, v8, v[1:2]
+; GISEL-NEXT:    v_addc_co_u32_e64 v6, s[10:11], v7, v6, s[10:11]
+; GISEL-NEXT:    v_addc_co_u32_e64 v4, s[8:9], v6, v4, s[8:9]
+; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[8:9], v13, v9, v[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e64 v3, v3, 0, s[6:7]
+; GISEL-NEXT:    ; implicit-def: $vgpr10
+; GISEL-NEXT:    ; implicit-def: $vgpr9
+; GISEL-NEXT:    v_mad_u64_u32 v[3:4], s[6:7], v3, v8, v[6:7]
+; GISEL-NEXT:    ; implicit-def: $vgpr6
+; GISEL-NEXT:    ; implicit-def: $vgpr4_vgpr5
+; GISEL-NEXT:    ; implicit-def: $vgpr8
+; GISEL-NEXT:  .LBB1_4: ; %Flow
+; GISEL-NEXT:    s_andn2_saveexec_b64 s[8:9], s[16:17]
+; GISEL-NEXT:    s_cbranch_execz .LBB1_6
+; GISEL-NEXT:  ; %bb.5: ; %fp-to-i-if-then12
+; GISEL-NEXT:    v_sub_co_u32_e32 v6, vcc, 0x433, v6
+; GISEL-NEXT:    v_subrev_u32_e32 v2, 64, v6
+; GISEL-NEXT:    v_lshrrev_b64 v[0:1], v6, v[4:5]
+; GISEL-NEXT:    v_lshrrev_b64 v[2:3], v2, 0
+; GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v6
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v6
+; GISEL-NEXT:    v_cndmask_b32_e32 v4, v0, v4, vcc
+; GISEL-NEXT:    v_mad_u64_u32 v[2:3], s[6:7], v4, v10, 0
+; GISEL-NEXT:    v_cndmask_b32_e32 v5, v1, v5, vcc
+; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[6:7], v4, v8, 0
+; GISEL-NEXT:    v_mad_u64_u32 v[2:3], s[6:7], v5, v9, v[2:3]
+; GISEL-NEXT:    v_mul_lo_u32 v6, v5, v10
+; GISEL-NEXT:    v_mad_u64_u32 v[1:2], vcc, v4, v9, v[1:2]
+; GISEL-NEXT:    v_mul_lo_u32 v4, v4, v10
+; GISEL-NEXT:    v_mad_u64_u32 v[1:2], s[6:7], v5, v8, v[1:2]
+; GISEL-NEXT:    v_addc_co_u32_e64 v3, s[6:7], v3, v4, s[6:7]
+; GISEL-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v6, vcc
+; GISEL-NEXT:  .LBB1_6: ; %Flow1
+; GISEL-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GISEL-NEXT:  .LBB1_7: ; %Flow2
+; GISEL-NEXT:    s_andn2_saveexec_b64 s[6:7], s[14:15]
+; GISEL-NEXT:    s_cbranch_execz .LBB1_9
+; GISEL-NEXT:  ; %bb.8: ; %fp-to-i-if-then5
+; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s[4:5]
+; GISEL-NEXT:    v_and_b32_e32 v1, 1, v1
+; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GISEL-NEXT:    v_lshlrev_b32_e32 v2, 1, v1
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v2
+; GISEL-NEXT:    v_lshlrev_b32_e32 v3, 2, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v4, 3, v1
+; GISEL-NEXT:    v_or_b32_e32 v2, v1, v2
+; GISEL-NEXT:    v_or3_b32 v0, v0, v3, v4
+; GISEL-NEXT:    v_lshlrev_b32_e32 v5, 4, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v6, 5, v1
+; GISEL-NEXT:    v_or3_b32 v2, v2, v3, v4
+; GISEL-NEXT:    v_or3_b32 v0, v0, v5, v6
+; GISEL-NEXT:    v_lshlrev_b32_e32 v7, 6, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v8, 7, v1
+; GISEL-NEXT:    v_or3_b32 v2, v2, v5, v6
+; GISEL-NEXT:    v_or3_b32 v0, v0, v7, v8
+; GISEL-NEXT:    v_lshlrev_b32_e32 v9, 8, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v10, 9, v1
+; GISEL-NEXT:    v_or3_b32 v2, v2, v7, v8
+; GISEL-NEXT:    v_or3_b32 v0, v0, v9, v10
+; GISEL-NEXT:    v_lshlrev_b32_e32 v11, 10, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v12, 11, v1
+; GISEL-NEXT:    v_or3_b32 v2, v2, v9, v10
+; GISEL-NEXT:    v_or3_b32 v0, v0, v11, v12
+; GISEL-NEXT:    v_lshlrev_b32_e32 v13, 12, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v14, 13, v1
+; GISEL-NEXT:    v_or3_b32 v2, v2, v11, v12
+; GISEL-NEXT:    v_or3_b32 v0, v0, v13, v14
+; GISEL-NEXT:    v_lshlrev_b32_e32 v15, 14, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v16, 15, v1
+; GISEL-NEXT:    v_or3_b32 v2, v2, v13, v14
+; GISEL-NEXT:    v_or3_b32 v0, v0, v15, v16
+; GISEL-NEXT:    v_lshlrev_b32_e32 v17, 16, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v18, 17, v1
+; GISEL-NEXT:    v_or3_b32 v2, v2, v15, v16
+; GISEL-NEXT:    v_or3_b32 v0, v0, v17, v18
+; GISEL-NEXT:    v_lshlrev_b32_e32 v19, 18, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v20, 19, v1
+; GISEL-NEXT:    v_or3_b32 v2, v2, v17, v18
+; GISEL-NEXT:    v_or3_b32 v0, v0, v19, v20
+; GISEL-NEXT:    v_lshlrev_b32_e32 v3, 20, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v4, 21, v1
+; GISEL-NEXT:    v_or3_b32 v2, v2, v19, v20
+; GISEL-NEXT:    v_or3_b32 v0, v0, v3, v4
+; GISEL-NEXT:    v_lshlrev_b32_e32 v5, 22, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v6, 23, v1
+; GISEL-NEXT:    v_or3_b32 v2, v2, v3, v4
+; GISEL-NEXT:    v_or3_b32 v0, v0, v5, v6
+; GISEL-NEXT:    v_lshlrev_b32_e32 v7, 24, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v8, 25, v1
+; GISEL-NEXT:    v_or3_b32 v2, v2, v5, v6
+; GISEL-NEXT:    v_or3_b32 v0, v0, v7, v8
+; GISEL-NEXT:    v_lshlrev_b32_e32 v9, 26, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v10, 27, v1
+; GISEL-NEXT:    v_or3_b32 v2, v2, v7, v8
+; GISEL-NEXT:    v_or3_b32 v0, v0, v9, v10
+; GISEL-NEXT:    v_lshlrev_b32_e32 v11, 28, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v12, 29, v1
+; GISEL-NEXT:    v_or3_b32 v2, v2, v9, v10
+; GISEL-NEXT:    v_or3_b32 v0, v0, v11, v12
+; GISEL-NEXT:    v_lshlrev_b32_e32 v13, 30, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v1, 31, v1
+; GISEL-NEXT:    v_or3_b32 v2, v2, v11, v12
+; GISEL-NEXT:    v_or3_b32 v0, v0, v13, v1
+; GISEL-NEXT:    v_or3_b32 v1, v2, v13, v1
+; GISEL-NEXT:    v_add_u32_e32 v3, 0x80000000, v1
+; GISEL-NEXT:    v_mov_b32_e32 v2, v1
+; GISEL-NEXT:  .LBB1_9: ; %Flow3
+; GISEL-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GISEL-NEXT:  .LBB1_10: ; %fp-to-i-cleanup
+; GISEL-NEXT:    s_or_b64 exec, exec, s[12:13]
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %cvt = fptoui double %x to i128
+  ret i128 %cvt
+}
+
+define i128 @fptosi_f32_to_i128(float %x) {
+; SDAG-LABEL: fptosi_f32_to_i128:
+; SDAG:       ; %bb.0: ; %fp-to-i-entry
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_mov_b32_e32 v4, v0
+; SDAG-NEXT:    v_bfe_u32 v5, v4, 23, 8
+; SDAG-NEXT:    s_movk_i32 s4, 0x7e
+; SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; SDAG-NEXT:    v_mov_b32_e32 v2, 0
+; SDAG-NEXT:    v_mov_b32_e32 v6, 0
+; SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SDAG-NEXT:    v_mov_b32_e32 v3, 0
+; SDAG-NEXT:    v_cmp_lt_u32_e32 vcc, s4, v5
+; SDAG-NEXT:    s_and_saveexec_b64 s[8:9], vcc
+; SDAG-NEXT:    s_cbranch_execz .LBB2_10
+; SDAG-NEXT:  ; %bb.1: ; %fp-to-i-if-end
+; SDAG-NEXT:    v_add_co_u32_e32 v0, vcc, 0xffffff01, v5
+; SDAG-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v6, vcc
+; SDAG-NEXT:    v_addc_co_u32_e32 v2, vcc, -1, v6, vcc
+; SDAG-NEXT:    v_addc_co_u32_e32 v3, vcc, -1, v6, vcc
+; SDAG-NEXT:    s_mov_b64 s[6:7], 0xffffff7f
+; SDAG-NEXT:    v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1]
+; SDAG-NEXT:    v_cmp_lt_i32_e64 s[4:5], -1, v4
+; SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[2:3]
+; SDAG-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; SDAG-NEXT:    v_and_b32_e32 v0, 1, v0
+; SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
+; SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; SDAG-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; SDAG-NEXT:    s_xor_b64 s[10:11], exec, s[6:7]
+; SDAG-NEXT:    s_cbranch_execz .LBB2_7
+; SDAG-NEXT:  ; %bb.2: ; %fp-to-i-if-end9
+; SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; SDAG-NEXT:    v_add_co_u32_e32 v9, vcc, -1, v0
+; SDAG-NEXT:    v_addc_co_u32_e64 v11, s[6:7], 0, -1, vcc
+; SDAG-NEXT:    s_mov_b64 s[6:7], 0x95
+; SDAG-NEXT:    v_and_b32_e32 v0, 0x7fffff, v4
+; SDAG-NEXT:    v_cmp_lt_u64_e32 vcc, s[6:7], v[5:6]
+; SDAG-NEXT:    v_mov_b32_e32 v7, 0
+; SDAG-NEXT:    v_cndmask_b32_e64 v8, -1, 0, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v10, -1, 1, s[4:5]
+; SDAG-NEXT:    v_or_b32_e32 v6, 0x800000, v0
+; SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; SDAG-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; SDAG-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; SDAG-NEXT:    s_xor_b64 s[12:13], exec, s[6:7]
+; SDAG-NEXT:    s_cbranch_execz .LBB2_4
+; SDAG-NEXT:  ; %bb.3: ; %fp-to-i-if-else
+; SDAG-NEXT:    v_sub_u32_e32 v0, 0xd6, v5
+; SDAG-NEXT:    v_add_u32_e32 v2, 0xffffff2a, v5
+; SDAG-NEXT:    v_add_u32_e32 v4, 0xffffff6a, v5
+; SDAG-NEXT:    v_lshrrev_b64 v[0:1], v0, v[6:7]
+; SDAG-NEXT:    v_lshlrev_b64 v[2:3], v2, v[6:7]
+; SDAG-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v4
+; SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT:    v_cmp_ne_u32_e64 s[6:7], 0, v4
+; SDAG-NEXT:    v_cndmask_b32_e64 v3, 0, v1, s[6:7]
+; SDAG-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
+; SDAG-NEXT:    v_lshlrev_b64 v[0:1], v4, v[6:7]
+; SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, v2, s[6:7]
+; SDAG-NEXT:    v_cndmask_b32_e32 v13, 0, v0, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v12, 0, v1, vcc
+; SDAG-NEXT:    v_mad_u64_u32 v[0:1], s[6:7], v13, v10, 0
+; SDAG-NEXT:    v_mul_lo_u32 v14, v8, v2
+; SDAG-NEXT:    v_mul_lo_u32 v15, v10, v3
+; SDAG-NEXT:    v_mov_b32_e32 v6, v1
+; SDAG-NEXT:    v_mad_u64_u32 v[4:5], s[6:7], v12, v10, v[6:7]
+; SDAG-NEXT:    v_mad_u64_u32 v[2:3], s[6:7], v10, v2, 0
+; SDAG-NEXT:    v_mov_b32_e32 v6, v5
+; SDAG-NEXT:    v_mov_b32_e32 v5, v7
+; SDAG-NEXT:    v_mad_u64_u32 v[4:5], s[6:7], v13, v8, v[4:5]
+; SDAG-NEXT:    v_add3_u32 v3, v3, v15, v14
+; SDAG-NEXT:    v_mad_u64_u32 v[1:2], s[6:7], v9, v13, v[2:3]
+; SDAG-NEXT:    v_add_co_u32_e32 v5, vcc, v6, v5
+; SDAG-NEXT:    v_addc_co_u32_e64 v6, s[6:7], 0, 0, vcc
+; SDAG-NEXT:    v_mul_lo_u32 v3, v9, v12
+; SDAG-NEXT:    v_mul_lo_u32 v7, v11, v13
+; SDAG-NEXT:    v_mad_u64_u32 v[5:6], s[6:7], v12, v8, v[5:6]
+; SDAG-NEXT:    ; implicit-def: $vgpr10
+; SDAG-NEXT:    ; implicit-def: $vgpr8
+; SDAG-NEXT:    ; implicit-def: $vgpr9
+; SDAG-NEXT:    v_add3_u32 v3, v7, v2, v3
+; SDAG-NEXT:    v_add_co_u32_e32 v2, vcc, v5, v1
+; SDAG-NEXT:    v_addc_co_u32_e32 v3, vcc, v6, v3, vcc
+; SDAG-NEXT:    ; implicit-def: $vgpr5_vgpr6
+; SDAG-NEXT:    v_mov_b32_e32 v1, v4
+; SDAG-NEXT:    ; implicit-def: $vgpr6_vgpr7
+; SDAG-NEXT:  .LBB2_4: ; %Flow
+; SDAG-NEXT:    s_andn2_saveexec_b64 s[6:7], s[12:13]
+; SDAG-NEXT:    s_cbranch_execz .LBB2_6
+; SDAG-NEXT:  ; %bb.5: ; %fp-to-i-if-then12
+; SDAG-NEXT:    v_sub_u32_e32 v2, 0x96, v5
+; SDAG-NEXT:    v_lshrrev_b64 v[0:1], v2, v[6:7]
+; SDAG-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v2
+; SDAG-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; SDAG-NEXT:    v_cndmask_b32_e32 v3, v0, v6, vcc
+; SDAG-NEXT:    v_mad_u64_u32 v[0:1], s[12:13], v3, v10, 0
+; SDAG-NEXT:    v_mov_b32_e32 v2, 0
+; SDAG-NEXT:    v_mad_u64_u32 v[4:5], s[12:13], v3, v8, v[1:2]
+; SDAG-NEXT:    v_mov_b32_e32 v1, v5
+; SDAG-NEXT:    v_mad_i64_i32 v[2:3], s[12:13], v9, v3, v[1:2]
+; SDAG-NEXT:    v_mov_b32_e32 v1, v4
+; SDAG-NEXT:  .LBB2_6: ; %Flow1
+; SDAG-NEXT:    s_or_b64 exec, exec, s[6:7]
+; SDAG-NEXT:  .LBB2_7: ; %Flow2
+; SDAG-NEXT:    s_andn2_saveexec_b64 s[6:7], s[10:11]
+; SDAG-NEXT:  ; %bb.8: ; %fp-to-i-if-then5
+; SDAG-NEXT:    v_bfrev_b32_e32 v0, 1
+; SDAG-NEXT:    v_bfrev_b32_e32 v1, -2
+; SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, -1, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v3, v0, v1, s[4:5]
+; SDAG-NEXT:    v_mov_b32_e32 v0, v2
+; SDAG-NEXT:    v_mov_b32_e32 v1, v2
+; SDAG-NEXT:  ; %bb.9: ; %Flow3
+; SDAG-NEXT:    s_or_b64 exec, exec, s[6:7]
+; SDAG-NEXT:  .LBB2_10: ; %fp-to-i-cleanup
+; SDAG-NEXT:    s_or_b64 exec, exec, s[8:9]
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: fptosi_f32_to_i128:
+; GISEL:       ; %bb.0: ; %fp-to-i-entry
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_mov_b32_e32 v4, v0
+; GISEL-NEXT:    v_mov_b32_e32 v5, 0
+; GISEL-NEXT:    v_lshrrev_b64 v[0:1], 23, v[4:5]
+; GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GISEL-NEXT:    v_bfe_u32 v6, v0, 0, 8
+; GISEL-NEXT:    v_mov_b32_e32 v0, 0x7f
+; GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GISEL-NEXT:    v_cmp_ge_u64_e32 vcc, v[6:7], v[0:1]
+; GISEL-NEXT:    s_mov_b64 s[6:7], s[4:5]
+; GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GISEL-NEXT:    v_mov_b32_e32 v2, s6
+; GISEL-NEXT:    v_mov_b32_e32 v3, s7
+; GISEL-NEXT:    s_and_saveexec_b64 s[12:13], vcc
+; GISEL-NEXT:    s_cbranch_execz .LBB2_10
+; GISEL-NEXT:  ; %bb.1: ; %fp-to-i-if-end
+; GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, 0xffffff01, v6
+; GISEL-NEXT:    v_mov_b32_e32 v2, 0xffffff80
+; GISEL-NEXT:    v_addc_co_u32_e64 v1, s[6:7], 0, -1, vcc
+; GISEL-NEXT:    v_mov_b32_e32 v3, 0
+; GISEL-NEXT:    v_addc_co_u32_e64 v8, s[6:7], 0, -1, s[6:7]
+; GISEL-NEXT:    v_cmp_ge_u64_e32 vcc, v[0:1], v[2:3]
+; GISEL-NEXT:    v_addc_co_u32_e64 v9, s[6:7], 0, -1, s[6:7]
+; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[8:9]
+; GISEL-NEXT:    v_cmp_lt_i32_e64 s[4:5], -1, v4
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
+; GISEL-NEXT:    v_and_b32_e32 v0, 1, v0
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; GISEL-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GISEL-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; GISEL-NEXT:    s_xor_b64 s[14:15], exec, s[6:7]
+; GISEL-NEXT:    s_cbranch_execz .LBB2_7
+; GISEL-NEXT:  ; %bb.2: ; %fp-to-i-if-end9
+; GISEL-NEXT:    s_xor_b64 s[6:7], s[4:5], -1
+; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[6:7]
+; GISEL-NEXT:    v_and_b32_e32 v0, 1, v0
+; GISEL-NEXT:    v_lshlrev_b16_e32 v2, 1, v0
+; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[6:7]
+; GISEL-NEXT:    v_lshlrev_b16_e32 v3, 2, v0
+; GISEL-NEXT:    v_lshlrev_b16_e32 v5, 3, v0
+; GISEL-NEXT:    v_lshlrev_b16_e32 v8, 4, v0
+; GISEL-NEXT:    v_lshlrev_b16_e32 v9, 5, v0
+; GISEL-NEXT:    v_lshlrev_b16_e32 v10, 6, v0
+; GISEL-NEXT:    v_lshlrev_b16_e32 v11, 7, v0
+; GISEL-NEXT:    v_lshlrev_b16_e32 v12, 8, v0
+; GISEL-NEXT:    v_lshlrev_b16_e32 v13, 9, v0
+; GISEL-NEXT:    v_lshlrev_b16_e32 v14, 10, v0
+; GISEL-NEXT:    v_lshlrev_b16_e32 v15, 11, v0
+; GISEL-NEXT:    v_lshlrev_b16_e32 v16, 12, v0
+; GISEL-NEXT:    v_lshlrev_b16_e32 v17, 13, v0
+; GISEL-NEXT:    v_lshlrev_b16_e32 v18, 14, v0
+; GISEL-NEXT:    v_lshlrev_b16_e32 v19, 15, v0
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v2
+; GISEL-NEXT:    v_or_b32_e32 v1, v1, v2
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v3
+; GISEL-NEXT:    v_or_b32_e32 v1, v1, v3
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v5
+; GISEL-NEXT:    v_or_b32_e32 v1, v1, v5
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v8
+; GISEL-NEXT:    v_or_b32_e32 v1, v1, v8
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v9
+; GISEL-NEXT:    v_or_b32_e32 v1, v1, v9
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v10
+; GISEL-NEXT:    v_or_b32_e32 v1, v1, v10
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v11
+; GISEL-NEXT:    v_or_b32_e32 v1, v1, v11
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v12
+; GISEL-NEXT:    v_or_b32_e32 v1, v1, v12
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v13
+; GISEL-NEXT:    v_or_b32_e32 v1, v1, v13
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v14
+; GISEL-NEXT:    v_or_b32_e32 v1, v1, v14
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v15
+; GISEL-NEXT:    v_or_b32_e32 v1, v1, v15
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v16
+; GISEL-NEXT:    v_or_b32_e32 v1, v1, v16
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v17
+; GISEL-NEXT:    v_or_b32_e32 v1, v1, v17
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v18
+; GISEL-NEXT:    v_or_b32_e32 v1, v1, v18
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v19
+; GISEL-NEXT:    v_or_b32_e32 v1, v1, v19
+; GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
+; GISEL-NEXT:    v_lshl_or_b32 v10, v0, 16, v0
+; GISEL-NEXT:    v_or3_b32 v9, v1, v2, 1
+; GISEL-NEXT:    v_or3_b32 v8, v0, v2, 0
+; GISEL-NEXT:    v_mov_b32_e32 v0, 0x96
+; GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GISEL-NEXT:    v_and_b32_e32 v2, 0x7fffff, v4
+; GISEL-NEXT:    v_cmp_ge_u64_e32 vcc, v[6:7], v[0:1]
+; GISEL-NEXT:    v_or_b32_e32 v4, 0x800000, v2
+; GISEL-NEXT:    v_mov_b32_e32 v5, 0
+; GISEL-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GISEL-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; GISEL-NEXT:    s_xor_b64 s[16:17], exec, s[6:7]
+; GISEL-NEXT:    s_cbranch_execz .LBB2_4
+; GISEL-NEXT:  ; %bb.3: ; %fp-to-i-if-else
+; GISEL-NEXT:    v_add_co_u32_e32 v6, vcc, 0xffffff6a, v6
+; GISEL-NEXT:    v_lshlrev_b64 v[0:1], v6, v[4:5]
+; GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v6
+; GISEL-NEXT:    v_cndmask_b32_e32 v11, 0, v0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v12, 0, v1, vcc
+; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[6:7], v11, v10, 0
+; GISEL-NEXT:    v_subrev_u32_e32 v7, 64, v6
+; GISEL-NEXT:    v_sub_u32_e32 v2, 64, v6
+; GISEL-NEXT:    v_lshrrev_b64 v[2:3], v2, v[4:5]
+; GISEL-NEXT:    v_lshlrev_b64 v[4:5], v7, v[4:5]
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[6:7], 0, v6
+; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[8:9], v12, v8, v[0:1]
+; GISEL-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v13, v2, 0, s[6:7]
+; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[8:9], v13, v9, v[6:7]
+; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[8:9], v11, v9, 0
+; GISEL-NEXT:    v_mov_b32_e32 v2, v6
+; GISEL-NEXT:    v_mul_lo_u32 v6, v11, v10
+; GISEL-NEXT:    v_mad_u64_u32 v[1:2], s[8:9], v11, v8, v[1:2]
+; GISEL-NEXT:    v_mul_lo_u32 v4, v12, v10
+; GISEL-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; GISEL-NEXT:    v_mad_u64_u32 v[1:2], s[10:11], v12, v9, v[1:2]
+; GISEL-NEXT:    v_addc_co_u32_e64 v6, s[10:11], v7, v6, s[10:11]
+; GISEL-NEXT:    v_addc_co_u32_e64 v4, s[8:9], v6, v4, s[8:9]
+; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[8:9], v13, v8, v[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e64 v3, v3, 0, s[6:7]
+; GISEL-NEXT:    ; implicit-def: $vgpr10
+; GISEL-NEXT:    ; implicit-def: $vgpr8
+; GISEL-NEXT:    v_mad_u64_u32 v[3:4], s[6:7], v3, v9, v[6:7]
+; GISEL-NEXT:    ; implicit-def: $vgpr6
+; GISEL-NEXT:    ; implicit-def: $vgpr4_vgpr5
+; GISEL-NEXT:    ; implicit-def: $vgpr9
+; GISEL-NEXT:  .LBB2_4: ; %Flow
+; GISEL-NEXT:    s_andn2_saveexec_b64 s[6:7], s[16:17]
+; GISEL-NEXT:    s_cbranch_execz .LBB2_6
+; GISEL-NEXT:  ; %bb.5: ; %fp-to-i-if-then12
+; GISEL-NEXT:    v_sub_co_u32_e32 v3, vcc, 0x96, v6
+; GISEL-NEXT:    v_subrev_u32_e32 v2, 64, v3
+; GISEL-NEXT:    v_lshrrev_b64 v[0:1], v3, v[4:5]
+; GISEL-NEXT:    v_lshrrev_b64 v[1:2], v2, 0
+; GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v3
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
+; GISEL-NEXT:    v_cndmask_b32_e32 v4, v0, v4, vcc
+; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[8:9], v4, v9, 0
+; GISEL-NEXT:    v_mad_u64_u32 v[2:3], s[8:9], v4, v10, 0
+; GISEL-NEXT:    v_mul_lo_u32 v5, v4, v10
+; GISEL-NEXT:    v_mad_u64_u32 v[1:2], vcc, v4, v8, v[1:2]
+; GISEL-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v5, vcc
+; GISEL-NEXT:  .LBB2_6: ; %Flow1
+; GISEL-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GISEL-NEXT:  .LBB2_7: ; %Flow2
+; GISEL-NEXT:    s_andn2_saveexec_b64 s[6:7], s[14:15]
+; GISEL-NEXT:    s_cbranch_execz .LBB2_9
+; GISEL-NEXT:  ; %bb.8: ; %fp-to-i-if-then5
+; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s[4:5]
+; GISEL-NEXT:    v_and_b32_e32 v1, 1, v1
+; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GISEL-NEXT:    v_lshlrev_b32_e32 v2, 1, v1
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v2
+; GISEL-NEXT:    v_lshlrev_b32_e32 v3, 2, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v4, 3, v1
+; GISEL-NEXT:    v_or_b32_e32 v2, v1, v2
+; GISEL-NEXT:    v_or3_b32 v0, v0, v3, v4
+; GISEL-NEXT:    v_lshlrev_b32_e32 v5, 4, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v6, 5, v1
+; GISEL-NEXT:    v_or3_b32 v2, v2, v3, v4
+; GISEL-NEXT:    v_or3_b32 v0, v0, v5, v6
+; GISEL-NEXT:    v_lshlrev_b32_e32 v7, 6, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v8, 7, v1
+; GISEL-NEXT:    v_or3_b32 v2, v2, v5, v6
+; GISEL-NEXT:    v_or3_b32 v0, v0, v7, v8
+; GISEL-NEXT:    v_lshlrev_b32_e32 v9, 8, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v10, 9, v1
+; GISEL-NEXT:    v_or3_b32 v2, v2, v7, v8
+; GISEL-NEXT:    v_or3_b32 v0, v0, v9, v10
+; GISEL-NEXT:    v_lshlrev_b32_e32 v11, 10, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v12, 11, v1
+; GISEL-NEXT:    v_or3_b32 v2, v2, v9, v10
+; GISEL-NEXT:    v_or3_b32 v0, v0, v11, v12
+; GISEL-NEXT:    v_lshlrev_b32_e32 v13, 12, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v14, 13, v1
+; GISEL-NEXT:    v_or3_b32 v2, v2, v11, v12
+; GISEL-NEXT:    v_or3_b32 v0, v0, v13, v14
+; GISEL-NEXT:    v_lshlrev_b32_e32 v15, 14, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v16, 15, v1
+; GISEL-NEXT:    v_or3_b32 v2, v2, v13, v14
+; GISEL-NEXT:    v_or3_b32 v0, v0, v15, v16
+; GISEL-NEXT:    v_lshlrev_b32_e32 v17, 16, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v18, 17, v1
+; GISEL-NEXT:    v_or3_b32 v2, v2, v15, v16
+; GISEL-NEXT:    v_or3_b32 v0, v0, v17, v18
+; GISEL-NEXT:    v_lshlrev_b32_e32 v19, 18, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v20, 19, v1
+; GISEL-NEXT:    v_or3_b32 v2, v2, v17, v18
+; GISEL-NEXT:    v_or3_b32 v0, v0, v19, v20
+; GISEL-NEXT:    v_lshlrev_b32_e32 v3, 20, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v4, 21, v1
+; GISEL-NEXT:    v_or3_b32 v2, v2, v19, v20
+; GISEL-NEXT:    v_or3_b32 v0, v0, v3, v4
+; GISEL-NEXT:    v_lshlrev_b32_e32 v5, 22, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v6, 23, v1
+; GISEL-NEXT:    v_or3_b32 v2, v2, v3, v4
+; GISEL-NEXT:    v_or3_b32 v0, v0, v5, v6
+; GISEL-NEXT:    v_lshlrev_b32_e32 v7, 24, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v8, 25, v1
+; GISEL-NEXT:    v_or3_b32 v2, v2, v5, v6
+; GISEL-NEXT:    v_or3_b32 v0, v0, v7, v8
+; GISEL-NEXT:    v_lshlrev_b32_e32 v9, 26, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v10, 27, v1
+; GISEL-NEXT:    v_or3_b32 v2, v2, v7, v8
+; GISEL-NEXT:    v_or3_b32 v0, v0, v9, v10
+; GISEL-NEXT:    v_lshlrev_b32_e32 v11, 28, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v12, 29, v1
+; GISEL-NEXT:    v_or3_b32 v2, v2, v9, v10
+; GISEL-NEXT:    v_or3_b32 v0, v0, v11, v12
+; GISEL-NEXT:    v_lshlrev_b32_e32 v13, 30, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v1, 31, v1
+; GISEL-NEXT:    v_or3_b32 v2, v2, v11, v12
+; GISEL-NEXT:    v_or3_b32 v0, v0, v13, v1
+; GISEL-NEXT:    v_or3_b32 v1, v2, v13, v1
+; GISEL-NEXT:    v_add_u32_e32 v3, 0x80000000, v1
+; GISEL-NEXT:    v_mov_b32_e32 v2, v1
+; GISEL-NEXT:  .LBB2_9: ; %Flow3
+; GISEL-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GISEL-NEXT:  .LBB2_10: ; %fp-to-i-cleanup
+; GISEL-NEXT:    s_or_b64 exec, exec, s[12:13]
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %cvt = fptosi float %x to i128
+  ret i128 %cvt
+}
+
+define i128 @fptoui_f32_to_i128(float %x) {
+; SDAG-LABEL: fptoui_f32_to_i128:
+; SDAG:       ; %bb.0: ; %fp-to-i-entry
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_mov_b32_e32 v4, v0
+; SDAG-NEXT:    v_bfe_u32 v5, v4, 23, 8
+; SDAG-NEXT:    s_movk_i32 s4, 0x7e
+; SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; SDAG-NEXT:    v_mov_b32_e32 v2, 0
+; SDAG-NEXT:    v_mov_b32_e32 v6, 0
+; SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SDAG-NEXT:    v_mov_b32_e32 v3, 0
+; SDAG-NEXT:    v_cmp_lt_u32_e32 vcc, s4, v5
+; SDAG-NEXT:    s_and_saveexec_b64 s[8:9], vcc
+; SDAG-NEXT:    s_cbranch_execz .LBB3_10
+; SDAG-NEXT:  ; %bb.1: ; %fp-to-i-if-end
+; SDAG-NEXT:    v_add_co_u32_e32 v0, vcc, 0xffffff01, v5
+; SDAG-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v6, vcc
+; SDAG-NEXT:    v_addc_co_u32_e32 v2, vcc, -1, v6, vcc
+; SDAG-NEXT:    v_addc_co_u32_e32 v3, vcc, -1, v6, vcc
+; SDAG-NEXT:    s_mov_b64 s[6:7], 0xffffff7f
+; SDAG-NEXT:    v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1]
+; SDAG-NEXT:    v_cmp_lt_i32_e64 s[4:5], -1, v4
+; SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[2:3]
+; SDAG-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; SDAG-NEXT:    v_and_b32_e32 v0, 1, v0
+; SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
+; SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; SDAG-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; SDAG-NEXT:    s_xor_b64 s[10:11], exec, s[6:7]
+; SDAG-NEXT:    s_cbranch_execz .LBB3_7
+; SDAG-NEXT:  ; %bb.2: ; %fp-to-i-if-end9
+; SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; SDAG-NEXT:    v_add_co_u32_e32 v9, vcc, -1, v0
+; SDAG-NEXT:    v_addc_co_u32_e64 v11, s[6:7], 0, -1, vcc
+; SDAG-NEXT:    s_mov_b64 s[6:7], 0x95
+; SDAG-NEXT:    v_and_b32_e32 v0, 0x7fffff, v4
+; SDAG-NEXT:    v_cmp_lt_u64_e32 vcc, s[6:7], v[5:6]
+; SDAG-NEXT:    v_mov_b32_e32 v7, 0
+; SDAG-NEXT:    v_cndmask_b32_e64 v8, -1, 0, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v10, -1, 1, s[4:5]
+; SDAG-NEXT:    v_or_b32_e32 v6, 0x800000, v0
+; SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; SDAG-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; SDAG-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; SDAG-NEXT:    s_xor_b64 s[12:13], exec, s[6:7]
+; SDAG-NEXT:    s_cbranch_execz .LBB3_4
+; SDAG-NEXT:  ; %bb.3: ; %fp-to-i-if-else
+; SDAG-NEXT:    v_sub_u32_e32 v0, 0xd6, v5
+; SDAG-NEXT:    v_add_u32_e32 v2, 0xffffff2a, v5
+; SDAG-NEXT:    v_add_u32_e32 v4, 0xffffff6a, v5
+; SDAG-NEXT:    v_lshrrev_b64 v[0:1], v0, v[6:7]
+; SDAG-NEXT:    v_lshlrev_b64 v[2:3], v2, v[6:7]
+; SDAG-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v4
+; SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT:    v_cmp_ne_u32_e64 s[6:7], 0, v4
+; SDAG-NEXT:    v_cndmask_b32_e64 v3, 0, v1, s[6:7]
+; SDAG-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
+; SDAG-NEXT:    v_lshlrev_b64 v[0:1], v4, v[6:7]
+; SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, v2, s[6:7]
+; SDAG-NEXT:    v_cndmask_b32_e32 v13, 0, v0, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v12, 0, v1, vcc
+; SDAG-NEXT:    v_mad_u64_u32 v[0:1], s[6:7], v13, v10, 0
+; SDAG-NEXT:    v_mul_lo_u32 v14, v8, v2
+; SDAG-NEXT:    v_mul_lo_u32 v15, v10, v3
+; SDAG-NEXT:    v_mov_b32_e32 v6, v1
+; SDAG-NEXT:    v_mad_u64_u32 v[4:5], s[6:7], v12, v10, v[6:7]
+; SDAG-NEXT:    v_mad_u64_u32 v[2:3], s[6:7], v10, v2, 0
+; SDAG-NEXT:    v_mov_b32_e32 v6, v5
+; SDAG-NEXT:    v_mov_b32_e32 v5, v7
+; SDAG-NEXT:    v_mad_u64_u32 v[4:5], s[6:7], v13, v8, v[4:5]
+; SDAG-NEXT:    v_add3_u32 v3, v3, v15, v14
+; SDAG-NEXT:    v_mad_u64_u32 v[1:2], s[6:7], v9, v13, v[2:3]
+; SDAG-NEXT:    v_add_co_u32_e32 v5, vcc, v6, v5
+; SDAG-NEXT:    v_addc_co_u32_e64 v6, s[6:7], 0, 0, vcc
+; SDAG-NEXT:    v_mul_lo_u32 v3, v9, v12
+; SDAG-NEXT:    v_mul_lo_u32 v7, v11, v13
+; SDAG-NEXT:    v_mad_u64_u32 v[5:6], s[6:7], v12, v8, v[5:6]
+; SDAG-NEXT:    ; implicit-def: $vgpr10
+; SDAG-NEXT:    ; implicit-def: $vgpr8
+; SDAG-NEXT:    ; implicit-def: $vgpr9
+; SDAG-NEXT:    v_add3_u32 v3, v7, v2, v3
+; SDAG-NEXT:    v_add_co_u32_e32 v2, vcc, v5, v1
+; SDAG-NEXT:    v_addc_co_u32_e32 v3, vcc, v6, v3, vcc
+; SDAG-NEXT:    ; implicit-def: $vgpr5_vgpr6
+; SDAG-NEXT:    v_mov_b32_e32 v1, v4
+; SDAG-NEXT:    ; implicit-def: $vgpr6_vgpr7
+; SDAG-NEXT:  .LBB3_4: ; %Flow
+; SDAG-NEXT:    s_andn2_saveexec_b64 s[6:7], s[12:13]
+; SDAG-NEXT:    s_cbranch_execz .LBB3_6
+; SDAG-NEXT:  ; %bb.5: ; %fp-to-i-if-then12
+; SDAG-NEXT:    v_sub_u32_e32 v2, 0x96, v5
+; SDAG-NEXT:    v_lshrrev_b64 v[0:1], v2, v[6:7]
+; SDAG-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v2
+; SDAG-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; SDAG-NEXT:    v_cndmask_b32_e32 v3, v0, v6, vcc
+; SDAG-NEXT:    v_mad_u64_u32 v[0:1], s[12:13], v3, v10, 0
+; SDAG-NEXT:    v_mov_b32_e32 v2, 0
+; SDAG-NEXT:    v_mad_u64_u32 v[4:5], s[12:13], v3, v8, v[1:2]
+; SDAG-NEXT:    v_mov_b32_e32 v1, v5
+; SDAG-NEXT:    v_mad_i64_i32 v[2:3], s[12:13], v9, v3, v[1:2]
+; SDAG-NEXT:    v_mov_b32_e32 v1, v4
+; SDAG-NEXT:  .LBB3_6: ; %Flow1
+; SDAG-NEXT:    s_or_b64 exec, exec, s[6:7]
+; SDAG-NEXT:  .LBB3_7: ; %Flow2
+; SDAG-NEXT:    s_andn2_saveexec_b64 s[6:7], s[10:11]
+; SDAG-NEXT:  ; %bb.8: ; %fp-to-i-if-then5
+; SDAG-NEXT:    v_bfrev_b32_e32 v0, 1
+; SDAG-NEXT:    v_bfrev_b32_e32 v1, -2
+; SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, -1, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v3, v0, v1, s[4:5]
+; SDAG-NEXT:    v_mov_b32_e32 v0, v2
+; SDAG-NEXT:    v_mov_b32_e32 v1, v2
+; SDAG-NEXT:  ; %bb.9: ; %Flow3
+; SDAG-NEXT:    s_or_b64 exec, exec, s[6:7]
+; SDAG-NEXT:  .LBB3_10: ; %fp-to-i-cleanup
+; SDAG-NEXT:    s_or_b64 exec, exec, s[8:9]
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: fptoui_f32_to_i128:
+; GISEL:       ; %bb.0: ; %fp-to-i-entry
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_mov_b32_e32 v4, v0
+; GISEL-NEXT:    v_mov_b32_e32 v5, 0
+; GISEL-NEXT:    v_lshrrev_b64 v[0:1], 23, v[4:5]
+; GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GISEL-NEXT:    v_bfe_u32 v6, v0, 0, 8
+; GISEL-NEXT:    v_mov_b32_e32 v0, 0x7f
+; GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GISEL-NEXT:    v_cmp_ge_u64_e32 vcc, v[6:7], v[0:1]
+; GISEL-NEXT:    s_mov_b64 s[6:7], s[4:5]
+; GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GISEL-NEXT:    v_mov_b32_e32 v2, s6
+; GISEL-NEXT:    v_mov_b32_e32 v3, s7
+; GISEL-NEXT:    s_and_saveexec_b64 s[12:13], vcc
+; GISEL-NEXT:    s_cbranch_execz .LBB3_10
+; GISEL-NEXT:  ; %bb.1: ; %fp-to-i-if-end
+; GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, 0xffffff01, v6
+; GISEL-NEXT:    v_mov_b32_e32 v2, 0xffffff80
+; GISEL-NEXT:    v_addc_co_u32_e64 v1, s[6:7], 0, -1, vcc
+; GISEL-NEXT:    v_mov_b32_e32 v3, 0
+; GISEL-NEXT:    v_addc_co_u32_e64 v8, s[6:7], 0, -1, s[6:7]
+; GISEL-NEXT:    v_cmp_ge_u64_e32 vcc, v[0:1], v[2:3]
+; GISEL-NEXT:    v_addc_co_u32_e64 v9, s[6:7], 0, -1, s[6:7]
+; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[8:9]
+; GISEL-NEXT:    v_cmp_lt_i32_e64 s[4:5], -1, v4
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
+; GISEL-NEXT:    v_and_b32_e32 v0, 1, v0
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; GISEL-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GISEL-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; GISEL-NEXT:    s_xor_b64 s[14:15], exec, s[6:7]
+; GISEL-NEXT:    s_cbranch_execz .LBB3_7
+; GISEL-NEXT:  ; %bb.2: ; %fp-to-i-if-end9
+; GISEL-NEXT:    s_xor_b64 s[6:7], s[4:5], -1
+; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[6:7]
+; GISEL-NEXT:    v_and_b32_e32 v0, 1, v0
+; GISEL-NEXT:    v_lshlrev_b16_e32 v2, 1, v0
+; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[6:7]
+; GISEL-NEXT:    v_lshlrev_b16_e32 v3, 2, v0
+; GISEL-NEXT:    v_lshlrev_b16_e32 v5, 3, v0
+; GISEL-NEXT:    v_lshlrev_b16_e32 v8, 4, v0
+; GISEL-NEXT:    v_lshlrev_b16_e32 v9, 5, v0
+; GISEL-NEXT:    v_lshlrev_b16_e32 v10, 6, v0
+; GISEL-NEXT:    v_lshlrev_b16_e32 v11, 7, v0
+; GISEL-NEXT:    v_lshlrev_b16_e32 v12, 8, v0
+; GISEL-NEXT:    v_lshlrev_b16_e32 v13, 9, v0
+; GISEL-NEXT:    v_lshlrev_b16_e32 v14, 10, v0
+; GISEL-NEXT:    v_lshlrev_b16_e32 v15, 11, v0
+; GISEL-NEXT:    v_lshlrev_b16_e32 v16, 12, v0
+; GISEL-NEXT:    v_lshlrev_b16_e32 v17, 13, v0
+; GISEL-NEXT:    v_lshlrev_b16_e32 v18, 14, v0
+; GISEL-NEXT:    v_lshlrev_b16_e32 v19, 15, v0
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v2
+; GISEL-NEXT:    v_or_b32_e32 v1, v1, v2
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v3
+; GISEL-NEXT:    v_or_b32_e32 v1, v1, v3
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v5
+; GISEL-NEXT:    v_or_b32_e32 v1, v1, v5
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v8
+; GISEL-NEXT:    v_or_b32_e32 v1, v1, v8
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v9
+; GISEL-NEXT:    v_or_b32_e32 v1, v1, v9
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v10
+; GISEL-NEXT:    v_or_b32_e32 v1, v1, v10
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v11
+; GISEL-NEXT:    v_or_b32_e32 v1, v1, v11
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v12
+; GISEL-NEXT:    v_or_b32_e32 v1, v1, v12
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v13
+; GISEL-NEXT:    v_or_b32_e32 v1, v1, v13
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v14
+; GISEL-NEXT:    v_or_b32_e32 v1, v1, v14
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v15
+; GISEL-NEXT:    v_or_b32_e32 v1, v1, v15
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v16
+; GISEL-NEXT:    v_or_b32_e32 v1, v1, v16
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v17
+; GISEL-NEXT:    v_or_b32_e32 v1, v1, v17
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v18
+; GISEL-NEXT:    v_or_b32_e32 v1, v1, v18
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v19
+; GISEL-NEXT:    v_or_b32_e32 v1, v1, v19
+; GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
+; GISEL-NEXT:    v_lshl_or_b32 v10, v0, 16, v0
+; GISEL-NEXT:    v_or3_b32 v9, v1, v2, 1
+; GISEL-NEXT:    v_or3_b32 v8, v0, v2, 0
+; GISEL-NEXT:    v_mov_b32_e32 v0, 0x96
+; GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GISEL-NEXT:    v_and_b32_e32 v2, 0x7fffff, v4
+; GISEL-NEXT:    v_cmp_ge_u64_e32 vcc, v[6:7], v[0:1]
+; GISEL-NEXT:    v_or_b32_e32 v4, 0x800000, v2
+; GISEL-NEXT:    v_mov_b32_e32 v5, 0
+; GISEL-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GISEL-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; GISEL-NEXT:    s_xor_b64 s[16:17], exec, s[6:7]
+; GISEL-NEXT:    s_cbranch_execz .LBB3_4
+; GISEL-NEXT:  ; %bb.3: ; %fp-to-i-if-else
+; GISEL-NEXT:    v_add_co_u32_e32 v6, vcc, 0xffffff6a, v6
+; GISEL-NEXT:    v_lshlrev_b64 v[0:1], v6, v[4:5]
+; GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v6
+; GISEL-NEXT:    v_cndmask_b32_e32 v11, 0, v0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v12, 0, v1, vcc
+; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[6:7], v11, v10, 0
+; GISEL-NEXT:    v_subrev_u32_e32 v7, 64, v6
+; GISEL-NEXT:    v_sub_u32_e32 v2, 64, v6
+; GISEL-NEXT:    v_lshrrev_b64 v[2:3], v2, v[4:5]
+; GISEL-NEXT:    v_lshlrev_b64 v[4:5], v7, v[4:5]
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[6:7], 0, v6
+; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[8:9], v12, v8, v[0:1]
+; GISEL-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v13, v2, 0, s[6:7]
+; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[8:9], v13, v9, v[6:7]
+; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[8:9], v11, v9, 0
+; GISEL-NEXT:    v_mov_b32_e32 v2, v6
+; GISEL-NEXT:    v_mul_lo_u32 v6, v11, v10
+; GISEL-NEXT:    v_mad_u64_u32 v[1:2], s[8:9], v11, v8, v[1:2]
+; GISEL-NEXT:    v_mul_lo_u32 v4, v12, v10
+; GISEL-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; GISEL-NEXT:    v_mad_u64_u32 v[1:2], s[10:11], v12, v9, v[1:2]
+; GISEL-NEXT:    v_addc_co_u32_e64 v6, s[10:11], v7, v6, s[10:11]
+; GISEL-NEXT:    v_addc_co_u32_e64 v4, s[8:9], v6, v4, s[8:9]
+; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[8:9], v13, v8, v[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e64 v3, v3, 0, s[6:7]
+; GISEL-NEXT:    ; implicit-def: $vgpr10
+; GISEL-NEXT:    ; implicit-def: $vgpr8
+; GISEL-NEXT:    v_mad_u64_u32 v[3:4], s[6:7], v3, v9, v[6:7]
+; GISEL-NEXT:    ; implicit-def: $vgpr6
+; GISEL-NEXT:    ; implicit-def: $vgpr4_vgpr5
+; GISEL-NEXT:    ; implicit-def: $vgpr9
+; GISEL-NEXT:  .LBB3_4: ; %Flow
+; GISEL-NEXT:    s_andn2_saveexec_b64 s[6:7], s[16:17]
+; GISEL-NEXT:    s_cbranch_execz .LBB3_6
+; GISEL-NEXT:  ; %bb.5: ; %fp-to-i-if-then12
+; GISEL-NEXT:    v_sub_co_u32_e32 v3, vcc, 0x96, v6
+; GISEL-NEXT:    v_subrev_u32_e32 v2, 64, v3
+; GISEL-NEXT:    v_lshrrev_b64 v[0:1], v3, v[4:5]
+; GISEL-NEXT:    v_lshrrev_b64 v[1:2], v2, 0
+; GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v3
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
+; GISEL-NEXT:    v_cndmask_b32_e32 v4, v0, v4, vcc
+; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[8:9], v4, v9, 0
+; GISEL-NEXT:    v_mad_u64_u32 v[2:3], s[8:9], v4, v10, 0
+; GISEL-NEXT:    v_mul_lo_u32 v5, v4, v10
+; GISEL-NEXT:    v_mad_u64_u32 v[1:2], vcc, v4, v8, v[1:2]
+; GISEL-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v5, vcc
+; GISEL-NEXT:  .LBB3_6: ; %Flow1
+; GISEL-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GISEL-NEXT:  .LBB3_7: ; %Flow2
+; GISEL-NEXT:    s_andn2_saveexec_b64 s[6:7], s[14:15]
+; GISEL-NEXT:    s_cbranch_execz .LBB3_9
+; GISEL-NEXT:  ; %bb.8: ; %fp-to-i-if-then5
+; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s[4:5]
+; GISEL-NEXT:    v_and_b32_e32 v1, 1, v1
+; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GISEL-NEXT:    v_lshlrev_b32_e32 v2, 1, v1
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v2
+; GISEL-NEXT:    v_lshlrev_b32_e32 v3, 2, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v4, 3, v1
+; GISEL-NEXT:    v_or_b32_e32 v2, v1, v2
+; GISEL-NEXT:    v_or3_b32 v0, v0, v3, v4
+; GISEL-NEXT:    v_lshlrev_b32_e32 v5, 4, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v6, 5, v1
+; GISEL-NEXT:    v_or3_b32 v2, v2, v3, v4
+; GISEL-NEXT:    v_or3_b32 v0, v0, v5, v6
+; GISEL-NEXT:    v_lshlrev_b32_e32 v7, 6, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v8, 7, v1
+; GISEL-NEXT:    v_or3_b32 v2, v2, v5, v6
+; GISEL-NEXT:    v_or3_b32 v0, v0, v7, v8
+; GISEL-NEXT:    v_lshlrev_b32_e32 v9, 8, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v10, 9, v1
+; GISEL-NEXT:    v_or3_b32 v2, v2, v7, v8
+; GISEL-NEXT:    v_or3_b32 v0, v0, v9, v10
+; GISEL-NEXT:    v_lshlrev_b32_e32 v11, 10, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v12, 11, v1
+; GISEL-NEXT:    v_or3_b32 v2, v2, v9, v10
+; GISEL-NEXT:    v_or3_b32 v0, v0, v11, v12
+; GISEL-NEXT:    v_lshlrev_b32_e32 v13, 12, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v14, 13, v1
+; GISEL-NEXT:    v_or3_b32 v2, v2, v11, v12
+; GISEL-NEXT:    v_or3_b32 v0, v0, v13, v14
+; GISEL-NEXT:    v_lshlrev_b32_e32 v15, 14, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v16, 15, v1
+; GISEL-NEXT:    v_or3_b32 v2, v2, v13, v14
+; GISEL-NEXT:    v_or3_b32 v0, v0, v15, v16
+; GISEL-NEXT:    v_lshlrev_b32_e32 v17, 16, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v18, 17, v1
+; GISEL-NEXT:    v_or3_b32 v2, v2, v15, v16
+; GISEL-NEXT:    v_or3_b32 v0, v0, v17, v18
+; GISEL-NEXT:    v_lshlrev_b32_e32 v19, 18, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v20, 19, v1
+; GISEL-NEXT:    v_or3_b32 v2, v2, v17, v18
+; GISEL-NEXT:    v_or3_b32 v0, v0, v19, v20
+; GISEL-NEXT:    v_lshlrev_b32_e32 v3, 20, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v4, 21, v1
+; GISEL-NEXT:    v_or3_b32 v2, v2, v19, v20
+; GISEL-NEXT:    v_or3_b32 v0, v0, v3, v4
+; GISEL-NEXT:    v_lshlrev_b32_e32 v5, 22, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v6, 23, v1
+; GISEL-NEXT:    v_or3_b32 v2, v2, v3, v4
+; GISEL-NEXT:    v_or3_b32 v0, v0, v5, v6
+; GISEL-NEXT:    v_lshlrev_b32_e32 v7, 24, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v8, 25, v1
+; GISEL-NEXT:    v_or3_b32 v2, v2, v5, v6
+; GISEL-NEXT:    v_or3_b32 v0, v0, v7, v8
+; GISEL-NEXT:    v_lshlrev_b32_e32 v9, 26, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v10, 27, v1
+; GISEL-NEXT:    v_or3_b32 v2, v2, v7, v8
+; GISEL-NEXT:    v_or3_b32 v0, v0, v9, v10
+; GISEL-NEXT:    v_lshlrev_b32_e32 v11, 28, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v12, 29, v1
+; GISEL-NEXT:    v_or3_b32 v2, v2, v9, v10
+; GISEL-NEXT:    v_or3_b32 v0, v0, v11, v12
+; GISEL-NEXT:    v_lshlrev_b32_e32 v13, 30, v1
+; GISEL-NEXT:    v_lshlrev_b32_e32 v1, 31, v1
+; GISEL-NEXT:    v_or3_b32 v2, v2, v11, v12
+; GISEL-NEXT:    v_or3_b32 v0, v0, v13, v1
+; GISEL-NEXT:    v_or3_b32 v1, v2, v13, v1
+; GISEL-NEXT:    v_add_u32_e32 v3, 0x80000000, v1
+; GISEL-NEXT:    v_mov_b32_e32 v2, v1
+; GISEL-NEXT:  .LBB3_9: ; %Flow3
+; GISEL-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GISEL-NEXT:  .LBB3_10: ; %fp-to-i-cleanup
+; GISEL-NEXT:    s_or_b64 exec, exec, s[12:13]
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %cvt = fptoui float %x to i128
+  ret i128 %cvt
+}
+
+define i128 @fptosi_f16_to_i128(half %x) {
+; GCN-LABEL: fptosi_f16_to_i128:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GCN-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GCN-NEXT:    v_mov_b32_e32 v2, v1
+; GCN-NEXT:    v_mov_b32_e32 v3, v1
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %cvt = fptosi half %x to i128
+  ret i128 %cvt
+}
+
+define i128 @fptoui_f16_to_i128(half %x) {
+; GCN-LABEL: fptoui_f16_to_i128:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT:    v_mov_b32_e32 v1, 0
+; GCN-NEXT:    v_mov_b32_e32 v2, 0
+; GCN-NEXT:    v_mov_b32_e32 v3, 0
+; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %cvt = fptoui half %x to i128
+  ret i128 %cvt
+}
+
+; FIXME: ExpandLargeFpConvert asserts on bfloat
+; define i128 @fptosi_bf16_to_i128(bfloat %x) {
+;   %cvt = fptosi bfloat %x to i128
+;   ret i128 %cvt
+; }
+
+; define i128 @fptoui_bf16_to_i128(bfloat %x) {
+;   %cvt = fptoui bfloat %x to i128
+;   ret i128 %cvt
+; }
diff --git a/llvm/test/CodeGen/AMDGPU/fract-match.ll b/llvm/test/CodeGen/AMDGPU/fract-match.ll
index 3a0b8259d08496..e361aa4db2aa94 100644
--- a/llvm/test/CodeGen/AMDGPU/fract-match.ll
+++ b/llvm/test/CodeGen/AMDGPU/fract-match.ll
@@ -1705,16 +1705,16 @@ define <2 x float> @safe_math_fract_v2f32(<2 x float> %x, ptr addrspace(1) nocap
 ; GFX6-NEXT:    v_min_f32_e32 v7, 0x3f7fffff, v7
 ; GFX6-NEXT:    v_cndmask_b32_e32 v6, v6, v1, vcc
 ; GFX6-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
-; GFX6-NEXT:    s_movk_i32 s10, 0x204
+; GFX6-NEXT:    v_mov_b32_e32 v8, 0x204
 ; GFX6-NEXT:    v_cndmask_b32_e32 v7, v7, v0, vcc
-; GFX6-NEXT:    v_cmp_class_f32_e64 s[8:9], v0, s10
+; GFX6-NEXT:    v_cmp_class_f32_e32 vcc, v0, v8
 ; GFX6-NEXT:    s_mov_b32 s6, 0
-; GFX6-NEXT:    v_cndmask_b32_e64 v0, v7, 0, s[8:9]
-; GFX6-NEXT:    v_cmp_class_f32_e64 s[8:9], v1, s10
+; GFX6-NEXT:    v_cndmask_b32_e64 v0, v7, 0, vcc
+; GFX6-NEXT:    v_cmp_class_f32_e32 vcc, v1, v8
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s4, s6
 ; GFX6-NEXT:    s_mov_b32 s5, s6
-; GFX6-NEXT:    v_cndmask_b32_e64 v1, v6, 0, s[8:9]
+; GFX6-NEXT:    v_cndmask_b32_e64 v1, v6, 0, vcc
 ; GFX6-NEXT:    buffer_store_dwordx2 v[4:5], v[2:3], s[4:7], 0 addr64
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
@@ -1722,19 +1722,19 @@ define <2 x float> @safe_math_fract_v2f32(<2 x float> %x, ptr addrspace(1) nocap
 ; GFX7-LABEL: safe_math_fract_v2f32:
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_mov_b32 s8, 0x7f800000
+; GFX7-NEXT:    v_mov_b32_e32 v8, 0x204
 ; GFX7-NEXT:    v_fract_f32_e32 v6, v0
-; GFX7-NEXT:    v_cmp_neq_f32_e64 vcc, |v0|, s8
+; GFX7-NEXT:    v_cmp_class_f32_e32 vcc, v0, v8
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    v_floor_f32_e32 v4, v0
 ; GFX7-NEXT:    v_fract_f32_e32 v7, v1
-; GFX7-NEXT:    v_cndmask_b32_e32 v0, 0, v6, vcc
-; GFX7-NEXT:    v_cmp_neq_f32_e64 vcc, |v1|, s8
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v6, 0, vcc
+; GFX7-NEXT:    v_cmp_class_f32_e32 vcc, v1, v8
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s4, s6
 ; GFX7-NEXT:    s_mov_b32 s5, s6
 ; GFX7-NEXT:    v_floor_f32_e32 v5, v1
-; GFX7-NEXT:    v_cndmask_b32_e32 v1, 0, v7, vcc
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, v7, 0, vcc
 ; GFX7-NEXT:    buffer_store_dwordx2 v[4:5], v[2:3], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
@@ -1742,15 +1742,15 @@ define <2 x float> @safe_math_fract_v2f32(<2 x float> %x, ptr addrspace(1) nocap
 ; GFX8-LABEL: safe_math_fract_v2f32:
 ; GFX8:       ; %bb.0: ; %entry
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    s_mov_b32 s4, 0x7f800000
+; GFX8-NEXT:    v_mov_b32_e32 v8, 0x204
 ; GFX8-NEXT:    v_fract_f32_e32 v6, v0
-; GFX8-NEXT:    v_cmp_neq_f32_e64 vcc, |v0|, s4
+; GFX8-NEXT:    v_cmp_class_f32_e32 vcc, v0, v8
 ; GFX8-NEXT:    v_floor_f32_e32 v4, v0
 ; GFX8-NEXT:    v_fract_f32_e32 v7, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, 0, v6, vcc
-; GFX8-NEXT:    v_cmp_neq_f32_e64 vcc, |v1|, s4
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v6, 0, vcc
+; GFX8-NEXT:    v_cmp_class_f32_e32 vcc, v1, v8
 ; GFX8-NEXT:    v_floor_f32_e32 v5, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v7, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, v7, 0, vcc
 ; GFX8-NEXT:    global_store_dwordx2 v[2:3], v[4:5], off
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
@@ -1759,14 +1759,15 @@ define <2 x float> @safe_math_fract_v2f32(<2 x float> %x, ptr addrspace(1) nocap
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_fract_f32_e32 v6, v0
-; GFX11-NEXT:    v_cmp_neq_f32_e64 vcc_lo, 0x7f800000, |v0|
+; GFX11-NEXT:    v_cmp_class_f32_e64 s0, v0, 0x204
 ; GFX11-NEXT:    v_fract_f32_e32 v7, v1
 ; GFX11-NEXT:    v_floor_f32_e32 v4, v0
 ; GFX11-NEXT:    v_floor_f32_e32 v5, v1
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0, v6, vcc_lo
-; GFX11-NEXT:    v_cmp_neq_f32_e64 vcc_lo, 0x7f800000, |v1|
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v6, 0, s0
+; GFX11-NEXT:    v_cmp_class_f32_e64 s0, v1, 0x204
 ; GFX11-NEXT:    global_store_b64 v[2:3], v[4:5], off
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0, v7, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, v7, 0, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %floor = tail call <2 x float> @llvm.floor.v2f32(<2 x float> %x)
@@ -1937,21 +1938,22 @@ define half @safe_math_fract_f16(half %x, ptr addrspace(1) nocapture writeonly %
 ; GFX6:       ; %bb.0: ; %entry
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX6-NEXT:    s_mov_b32 s8, 0x7f800000
+; GFX6-NEXT:    s_movk_i32 s8, 0x7c00
 ; GFX6-NEXT:    s_mov_b32 s6, 0
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v0
+; GFX6-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX6-NEXT:    s_mov_b32 s4, s6
 ; GFX6-NEXT:    s_mov_b32 s5, s6
-; GFX6-NEXT:    v_floor_f32_e32 v3, v0
-; GFX6-NEXT:    v_sub_f32_e32 v4, v0, v3
-; GFX6-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX6-NEXT:    v_min_f32_e32 v4, 0x3f7fe000, v4
-; GFX6-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
-; GFX6-NEXT:    v_cndmask_b32_e32 v4, v4, v0, vcc
-; GFX6-NEXT:    v_cmp_neq_f32_e64 vcc, |v0|, s8
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, 0, v4, vcc
-; GFX6-NEXT:    buffer_store_short v3, v[1:2], s[4:7], 0 addr64
+; GFX6-NEXT:    v_floor_f32_e32 v4, v3
+; GFX6-NEXT:    v_sub_f32_e32 v5, v3, v4
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; GFX6-NEXT:    v_min_f32_e32 v5, 0x3f7fe000, v5
+; GFX6-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
+; GFX6-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, s8, v0
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
+; GFX6-NEXT:    buffer_store_short v4, v[1:2], s[4:7], 0 addr64
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1959,21 +1961,22 @@ define half @safe_math_fract_f16(half %x, ptr addrspace(1) nocapture writeonly %
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT:    s_mov_b32 s8, 0x7f800000
+; GFX7-NEXT:    s_movk_i32 s8, 0x7c00
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v0
+; GFX7-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX7-NEXT:    s_mov_b32 s4, s6
 ; GFX7-NEXT:    s_mov_b32 s5, s6
-; GFX7-NEXT:    v_floor_f32_e32 v3, v0
-; GFX7-NEXT:    v_sub_f32_e32 v4, v0, v3
-; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX7-NEXT:    v_min_f32_e32 v4, 0x3f7fe000, v4
-; GFX7-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
-; GFX7-NEXT:    v_cndmask_b32_e32 v4, v4, v0, vcc
-; GFX7-NEXT:    v_cmp_neq_f32_e64 vcc, |v0|, s8
-; GFX7-NEXT:    v_cndmask_b32_e32 v0, 0, v4, vcc
-; GFX7-NEXT:    buffer_store_short v3, v[1:2], s[4:7], 0 addr64
+; GFX7-NEXT:    v_floor_f32_e32 v4, v3
+; GFX7-NEXT:    v_sub_f32_e32 v5, v3, v4
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; GFX7-NEXT:    v_min_f32_e32 v5, 0x3f7fe000, v5
+; GFX7-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
+; GFX7-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX7-NEXT:    v_cmp_ne_u32_e32 vcc, s8, v0
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
+; GFX7-NEXT:    buffer_store_short v4, v[1:2], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2062,12 +2065,12 @@ define <2 x half> @safe_math_fract_v2f16(<2 x half> %x, ptr addrspace(1) nocaptu
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX6-NEXT:    s_mov_b32 s8, 0x7f800000
+; GFX6-NEXT:    s_movk_i32 s8, 0x7c00
 ; GFX6-NEXT:    s_mov_b32 s6, 0
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v4, v1
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v5, v0
-; GFX6-NEXT:    v_cvt_f32_f16_e64 v0, |v0|
-; GFX6-NEXT:    v_cvt_f32_f16_e64 v1, |v1|
+; GFX6-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; GFX6-NEXT:    v_and_b32_e32 v1, 0x7fff, v1
 ; GFX6-NEXT:    v_floor_f32_e32 v6, v4
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v7, v6
 ; GFX6-NEXT:    v_floor_f32_e32 v8, v5
@@ -2080,10 +2083,10 @@ define <2 x half> @safe_math_fract_v2f16(<2 x half> %x, ptr addrspace(1) nocaptu
 ; GFX6-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
 ; GFX6-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
 ; GFX6-NEXT:    v_cndmask_b32_e32 v5, v8, v5, vcc
-; GFX6-NEXT:    v_cmp_neq_f32_e32 vcc, s8, v0
+; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, s8, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, 0, v5, vcc
-; GFX6-NEXT:    v_cmp_neq_f32_e32 vcc, s8, v1
+; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, s8, v1
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s4, s6
 ; GFX6-NEXT:    s_mov_b32 s5, s6
@@ -2098,12 +2101,12 @@ define <2 x half> @safe_math_fract_v2f16(<2 x half> %x, ptr addrspace(1) nocaptu
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT:    s_mov_b32 s8, 0x7f800000
+; GFX7-NEXT:    s_movk_i32 s8, 0x7c00
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v1
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v0
-; GFX7-NEXT:    v_cvt_f32_f16_e64 v0, |v0|
-; GFX7-NEXT:    v_cvt_f32_f16_e64 v1, |v1|
+; GFX7-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, 0x7fff, v1
 ; GFX7-NEXT:    v_floor_f32_e32 v6, v4
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v7, v6
 ; GFX7-NEXT:    v_floor_f32_e32 v8, v5
@@ -2116,10 +2119,10 @@ define <2 x half> @safe_math_fract_v2f16(<2 x half> %x, ptr addrspace(1) nocaptu
 ; GFX7-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
 ; GFX7-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
 ; GFX7-NEXT:    v_cndmask_b32_e32 v5, v8, v5, vcc
-; GFX7-NEXT:    v_cmp_neq_f32_e32 vcc, s8, v0
+; GFX7-NEXT:    v_cmp_ne_u32_e32 vcc, s8, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
 ; GFX7-NEXT:    v_cndmask_b32_e32 v0, 0, v5, vcc
-; GFX7-NEXT:    v_cmp_neq_f32_e32 vcc, s8, v1
+; GFX7-NEXT:    v_cmp_ne_u32_e32 vcc, s8, v1
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s4, s6
 ; GFX7-NEXT:    s_mov_b32 s5, s6
@@ -2133,16 +2136,16 @@ define <2 x half> @safe_math_fract_v2f16(<2 x half> %x, ptr addrspace(1) nocaptu
 ; GFX8:       ; %bb.0: ; %entry
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
-; GFX8-NEXT:    s_movk_i32 s6, 0x204
+; GFX8-NEXT:    v_mov_b32_e32 v7, 0x204
 ; GFX8-NEXT:    v_floor_f16_e32 v4, v3
 ; GFX8-NEXT:    v_floor_f16_e32 v5, v0
 ; GFX8-NEXT:    v_fract_f16_e32 v6, v3
-; GFX8-NEXT:    v_cmp_class_f16_e64 s[4:5], v3, s6
+; GFX8-NEXT:    v_cmp_class_f16_e32 vcc, v3, v7
 ; GFX8-NEXT:    v_pack_b32_f16 v4, v5, v4
 ; GFX8-NEXT:    v_fract_f16_e32 v5, v0
-; GFX8-NEXT:    v_cndmask_b32_e64 v3, v6, 0, s[4:5]
-; GFX8-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, s6
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, v5, 0, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, v6, 0, vcc
+; GFX8-NEXT:    v_cmp_class_f16_e32 vcc, v0, v7
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v5, 0, vcc
 ; GFX8-NEXT:    v_pack_b32_f16 v0, v0, v3
 ; GFX8-NEXT:    global_store_dword v[1:2], v4, off
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
@@ -2237,19 +2240,19 @@ define <2 x double> @safe_math_fract_v2f64(<2 x double> %x, ptr addrspace(1) noc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v11, v11, v3, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v10, v10, v2, vcc
 ; GFX6-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
-; GFX6-NEXT:    s_movk_i32 s10, 0x204
-; GFX6-NEXT:    v_cmp_class_f64_e64 s[8:9], v[0:1], s10
+; GFX6-NEXT:    v_mov_b32_e32 v14, 0x204
 ; GFX6-NEXT:    v_cndmask_b32_e32 v13, v13, v1, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v12, v12, v0, vcc
-; GFX6-NEXT:    v_cndmask_b32_e64 v0, v12, 0, s[8:9]
-; GFX6-NEXT:    v_cndmask_b32_e64 v1, v13, 0, s[8:9]
-; GFX6-NEXT:    v_cmp_class_f64_e64 s[8:9], v[2:3], s10
+; GFX6-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v14
 ; GFX6-NEXT:    s_mov_b32 s6, 0
+; GFX6-NEXT:    v_cndmask_b32_e64 v0, v12, 0, vcc
+; GFX6-NEXT:    v_cndmask_b32_e64 v1, v13, 0, vcc
+; GFX6-NEXT:    v_cmp_class_f64_e32 vcc, v[2:3], v14
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s4, s6
 ; GFX6-NEXT:    s_mov_b32 s5, s6
-; GFX6-NEXT:    v_cndmask_b32_e64 v2, v10, 0, s[8:9]
-; GFX6-NEXT:    v_cndmask_b32_e64 v3, v11, 0, s[8:9]
+; GFX6-NEXT:    v_cndmask_b32_e64 v2, v10, 0, vcc
+; GFX6-NEXT:    v_cndmask_b32_e64 v3, v11, 0, vcc
 ; GFX6-NEXT:    buffer_store_dwordx4 v[6:9], v[4:5], s[4:7], 0 addr64
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
@@ -2257,39 +2260,39 @@ define <2 x double> @safe_math_fract_v2f64(<2 x double> %x, ptr addrspace(1) noc
 ; GFX7-LABEL: safe_math_fract_v2f64:
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_movk_i32 s4, 0x204
+; GFX7-NEXT:    v_mov_b32_e32 v6, 0x204
 ; GFX7-NEXT:    v_fract_f64_e32 v[10:11], v[0:1]
-; GFX7-NEXT:    v_cmp_class_f64_e64 s[8:9], v[0:1], s4
+; GFX7-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v6
 ; GFX7-NEXT:    v_fract_f64_e32 v[12:13], v[2:3]
-; GFX7-NEXT:    v_cmp_class_f64_e64 s[10:11], v[2:3], s4
+; GFX7-NEXT:    v_cmp_class_f64_e64 s[4:5], v[2:3], v6
 ; GFX7-NEXT:    v_floor_f64_e32 v[8:9], v[2:3]
 ; GFX7-NEXT:    v_floor_f64_e32 v[6:7], v[0:1]
-; GFX7-NEXT:    s_mov_b32 s6, 0
-; GFX7-NEXT:    s_mov_b32 s7, 0xf000
-; GFX7-NEXT:    s_mov_b32 s4, s6
-; GFX7-NEXT:    s_mov_b32 s5, s6
-; GFX7-NEXT:    v_cndmask_b32_e64 v0, v10, 0, s[8:9]
-; GFX7-NEXT:    v_cndmask_b32_e64 v1, v11, 0, s[8:9]
-; GFX7-NEXT:    v_cndmask_b32_e64 v2, v12, 0, s[10:11]
-; GFX7-NEXT:    v_cndmask_b32_e64 v3, v13, 0, s[10:11]
-; GFX7-NEXT:    buffer_store_dwordx4 v[6:9], v[4:5], s[4:7], 0 addr64
+; GFX7-NEXT:    s_mov_b32 s10, 0
+; GFX7-NEXT:    s_mov_b32 s11, 0xf000
+; GFX7-NEXT:    s_mov_b32 s8, s10
+; GFX7-NEXT:    s_mov_b32 s9, s10
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v10, 0, vcc
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, v11, 0, vcc
+; GFX7-NEXT:    v_cndmask_b32_e64 v2, v12, 0, s[4:5]
+; GFX7-NEXT:    v_cndmask_b32_e64 v3, v13, 0, s[4:5]
+; GFX7-NEXT:    buffer_store_dwordx4 v[6:9], v[4:5], s[8:11], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: safe_math_fract_v2f64:
 ; GFX8:       ; %bb.0: ; %entry
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    s_movk_i32 s6, 0x204
+; GFX8-NEXT:    v_mov_b32_e32 v6, 0x204
 ; GFX8-NEXT:    v_fract_f64_e32 v[10:11], v[0:1]
-; GFX8-NEXT:    v_cmp_class_f64_e64 s[4:5], v[0:1], s6
+; GFX8-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v6
 ; GFX8-NEXT:    v_fract_f64_e32 v[12:13], v[2:3]
-; GFX8-NEXT:    v_cmp_class_f64_e64 s[6:7], v[2:3], s6
+; GFX8-NEXT:    v_cmp_class_f64_e64 s[4:5], v[2:3], v6
 ; GFX8-NEXT:    v_floor_f64_e32 v[8:9], v[2:3]
 ; GFX8-NEXT:    v_floor_f64_e32 v[6:7], v[0:1]
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, v10, 0, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e64 v1, v11, 0, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e64 v2, v12, 0, s[6:7]
-; GFX8-NEXT:    v_cndmask_b32_e64 v3, v13, 0, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v10, 0, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, v11, 0, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v12, 0, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, v13, 0, s[4:5]
 ; GFX8-NEXT:    global_store_dwordx4 v[4:5], v[6:9], off
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomic_optimizer_fp_rtn.ll b/llvm/test/CodeGen/AMDGPU/global_atomic_optimizer_fp_rtn.ll
index e3fada3459a07f..538ef42121b83b 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomic_optimizer_fp_rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomic_optimizer_fp_rtn.ll
@@ -1,71 +1,43 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN:  opt -S -mtriple=amdgcn-- -mcpu=gfx906 -amdgpu-atomic-optimizer-strategy=Iterative -passes='amdgpu-atomic-optimizer,verify<domtree>' %s | FileCheck -check-prefix=IR-ITERATIVE %s
-; RUN:  opt -S -mtriple=amdgcn-- -mcpu=gfx906 -amdgpu-atomic-optimizer-strategy=DPP -passes='amdgpu-atomic-optimizer,verify<domtree>' %s | FileCheck -check-prefix=IR-DPP %s
+; RUN:  opt -S -mtriple=amdgcn-- -mcpu=gfx906 -amdgpu-atomic-optimizer-strategy=Iterative -passes='amdgpu-atomic-optimizer,verify<domtree>' %s | FileCheck --check-prefixes=IR,IR-ITERATIVE %s
+; RUN:  opt -S -mtriple=amdgcn-- -mcpu=gfx906 -amdgpu-atomic-optimizer-strategy=DPP -passes='amdgpu-atomic-optimizer,verify<domtree>' %s | FileCheck --check-prefixes=IR,IR-DPP %s
+
+; Tests various combinations of uniform/divergent address and uniform/divergent value inputs of various types for atomic operations.
+; Optimization remains same for Iterative and DPP strategies when value in uniform. These different scan/reduction
+; strategies are valid for only divergent values. This optimization is valid for divergent addresses. Test also covers different scopes.
 
 define amdgpu_ps float @global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe(ptr addrspace(1) inreg %ptr, float inreg %val) #0 {
-; IR-ITERATIVE-LABEL: @global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe(
-; IR-ITERATIVE-NEXT:    [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live()
-; IR-ITERATIVE-NEXT:    br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP24:%.*]]
-; IR-ITERATIVE:       2:
-; IR-ITERATIVE-NEXT:    [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
-; IR-ITERATIVE-NEXT:    [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
-; IR-ITERATIVE-NEXT:    [[TMP5:%.*]] = lshr i64 [[TMP3]], 32
-; IR-ITERATIVE-NEXT:    [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32
-; IR-ITERATIVE-NEXT:    [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0)
-; IR-ITERATIVE-NEXT:    [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]])
-; IR-ITERATIVE-NEXT:    [[TMP9:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP3]])
-; IR-ITERATIVE-NEXT:    [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32
-; IR-ITERATIVE-NEXT:    [[TMP11:%.*]] = uitofp i32 [[TMP10]] to float
-; IR-ITERATIVE-NEXT:    [[TMP12:%.*]] = fmul float [[VAL:%.*]], [[TMP11]]
-; IR-ITERATIVE-NEXT:    [[TMP13:%.*]] = icmp eq i32 [[TMP8]], 0
-; IR-ITERATIVE-NEXT:    br i1 [[TMP13]], label [[TMP14:%.*]], label [[TMP16:%.*]]
-; IR-ITERATIVE:       14:
-; IR-ITERATIVE-NEXT:    [[TMP15:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP12]] syncscope("agent") monotonic, align 4
-; IR-ITERATIVE-NEXT:    br label [[TMP16]]
-; IR-ITERATIVE:       16:
-; IR-ITERATIVE-NEXT:    [[TMP17:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP15]], [[TMP14]] ]
-; IR-ITERATIVE-NEXT:    [[TMP18:%.*]] = bitcast float [[TMP17]] to i32
-; IR-ITERATIVE-NEXT:    [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP18]])
-; IR-ITERATIVE-NEXT:    [[TMP20:%.*]] = bitcast i32 [[TMP19]] to float
-; IR-ITERATIVE-NEXT:    [[TMP21:%.*]] = uitofp i32 [[TMP8]] to float
-; IR-ITERATIVE-NEXT:    [[TMP22:%.*]] = fmul float [[VAL]], [[TMP21]]
-; IR-ITERATIVE-NEXT:    [[TMP23:%.*]] = fadd float [[TMP20]], [[TMP22]]
-; IR-ITERATIVE-NEXT:    br label [[TMP24]]
-; IR-ITERATIVE:       24:
-; IR-ITERATIVE-NEXT:    [[TMP25:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP23]], [[TMP16]] ]
-; IR-ITERATIVE-NEXT:    ret float [[TMP25]]
-;
-; IR-DPP-LABEL: @global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe(
-; IR-DPP-NEXT:    [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live()
-; IR-DPP-NEXT:    br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP24:%.*]]
-; IR-DPP:       2:
-; IR-DPP-NEXT:    [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
-; IR-DPP-NEXT:    [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
-; IR-DPP-NEXT:    [[TMP5:%.*]] = lshr i64 [[TMP3]], 32
-; IR-DPP-NEXT:    [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32
-; IR-DPP-NEXT:    [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0)
-; IR-DPP-NEXT:    [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]])
-; IR-DPP-NEXT:    [[TMP9:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP3]])
-; IR-DPP-NEXT:    [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32
-; IR-DPP-NEXT:    [[TMP11:%.*]] = uitofp i32 [[TMP10]] to float
-; IR-DPP-NEXT:    [[TMP12:%.*]] = fmul float [[VAL:%.*]], [[TMP11]]
-; IR-DPP-NEXT:    [[TMP13:%.*]] = icmp eq i32 [[TMP8]], 0
-; IR-DPP-NEXT:    br i1 [[TMP13]], label [[TMP14:%.*]], label [[TMP16:%.*]]
-; IR-DPP:       14:
-; IR-DPP-NEXT:    [[TMP15:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP12]] syncscope("agent") monotonic, align 4
-; IR-DPP-NEXT:    br label [[TMP16]]
-; IR-DPP:       16:
-; IR-DPP-NEXT:    [[TMP17:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP15]], [[TMP14]] ]
-; IR-DPP-NEXT:    [[TMP18:%.*]] = bitcast float [[TMP17]] to i32
-; IR-DPP-NEXT:    [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP18]])
-; IR-DPP-NEXT:    [[TMP20:%.*]] = bitcast i32 [[TMP19]] to float
-; IR-DPP-NEXT:    [[TMP21:%.*]] = uitofp i32 [[TMP8]] to float
-; IR-DPP-NEXT:    [[TMP22:%.*]] = fmul float [[VAL]], [[TMP21]]
-; IR-DPP-NEXT:    [[TMP23:%.*]] = fadd float [[TMP20]], [[TMP22]]
-; IR-DPP-NEXT:    br label [[TMP24]]
-; IR-DPP:       24:
-; IR-DPP-NEXT:    [[TMP25:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP23]], [[TMP16]] ]
-; IR-DPP-NEXT:    ret float [[TMP25]]
+; IR-LABEL: @global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe(
+; IR-NEXT:    [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live()
+; IR-NEXT:    br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP24:%.*]]
+; IR:       2:
+; IR-NEXT:    [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+; IR-NEXT:    [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
+; IR-NEXT:    [[TMP5:%.*]] = lshr i64 [[TMP3]], 32
+; IR-NEXT:    [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32
+; IR-NEXT:    [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0)
+; IR-NEXT:    [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]])
+; IR-NEXT:    [[TMP9:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP3]])
+; IR-NEXT:    [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32
+; IR-NEXT:    [[TMP11:%.*]] = uitofp i32 [[TMP10]] to float
+; IR-NEXT:    [[TMP12:%.*]] = fmul float [[VAL:%.*]], [[TMP11]]
+; IR-NEXT:    [[TMP13:%.*]] = icmp eq i32 [[TMP8]], 0
+; IR-NEXT:    br i1 [[TMP13]], label [[TMP14:%.*]], label [[TMP16:%.*]]
+; IR:       14:
+; IR-NEXT:    [[TMP15:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP12]] syncscope("agent") monotonic, align 4
+; IR-NEXT:    br label [[TMP16]]
+; IR:       16:
+; IR-NEXT:    [[TMP17:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP15]], [[TMP14]] ]
+; IR-NEXT:    [[TMP18:%.*]] = bitcast float [[TMP17]] to i32
+; IR-NEXT:    [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP18]])
+; IR-NEXT:    [[TMP20:%.*]] = bitcast i32 [[TMP19]] to float
+; IR-NEXT:    [[TMP21:%.*]] = uitofp i32 [[TMP8]] to float
+; IR-NEXT:    [[TMP22:%.*]] = fmul float [[VAL]], [[TMP21]]
+; IR-NEXT:    [[TMP23:%.*]] = fadd float [[TMP20]], [[TMP22]]
+; IR-NEXT:    br label [[TMP24]]
+; IR:       24:
+; IR-NEXT:    [[TMP25:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP23]], [[TMP16]] ]
+; IR-NEXT:    ret float [[TMP25]]
 ;
   %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic, align 4
   ret float %result
@@ -411,7 +383,6 @@ define amdgpu_ps float @global_atomic_fsub_uni_address_uni_value_agent_scope_str
   ret float %result
 }
 
-
 define amdgpu_ps float @global_atomic_fsub_uni_address_div_value_agent_scope_strictfp(ptr addrspace(1) inreg %ptr, float %val) #2 {
 ; IR-ITERATIVE-LABEL: @global_atomic_fsub_uni_address_div_value_agent_scope_strictfp(
 ; IR-ITERATIVE-NEXT:    [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR7]]
@@ -514,61 +485,33 @@ define amdgpu_ps float @global_atomic_fsub_uni_address_div_value_agent_scope_str
 }
 
 define amdgpu_ps float @global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe(ptr addrspace(1) inreg %ptr, float inreg %val) #0 {
-; IR-ITERATIVE-LABEL: @global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe(
-; IR-ITERATIVE-NEXT:    [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live()
-; IR-ITERATIVE-NEXT:    br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP20:%.*]]
-; IR-ITERATIVE:       2:
-; IR-ITERATIVE-NEXT:    [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
-; IR-ITERATIVE-NEXT:    [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
-; IR-ITERATIVE-NEXT:    [[TMP5:%.*]] = lshr i64 [[TMP3]], 32
-; IR-ITERATIVE-NEXT:    [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32
-; IR-ITERATIVE-NEXT:    [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0)
-; IR-ITERATIVE-NEXT:    [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]])
-; IR-ITERATIVE-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 0
-; IR-ITERATIVE-NEXT:    br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]]
-; IR-ITERATIVE:       10:
-; IR-ITERATIVE-NEXT:    [[TMP11:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
-; IR-ITERATIVE-NEXT:    br label [[TMP12]]
-; IR-ITERATIVE:       12:
-; IR-ITERATIVE-NEXT:    [[TMP13:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP11]], [[TMP10]] ]
-; IR-ITERATIVE-NEXT:    [[TMP14:%.*]] = bitcast float [[TMP13]] to i32
-; IR-ITERATIVE-NEXT:    [[TMP15:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP14]])
-; IR-ITERATIVE-NEXT:    [[TMP16:%.*]] = bitcast i32 [[TMP15]] to float
-; IR-ITERATIVE-NEXT:    [[TMP17:%.*]] = uitofp i32 [[TMP8]] to float
-; IR-ITERATIVE-NEXT:    [[TMP18:%.*]] = select i1 [[TMP9]], float 0x7FF0000000000000, float [[VAL]]
-; IR-ITERATIVE-NEXT:    [[TMP19:%.*]] = call float @llvm.minnum.f32(float [[TMP16]], float [[TMP18]])
-; IR-ITERATIVE-NEXT:    br label [[TMP20]]
-; IR-ITERATIVE:       20:
-; IR-ITERATIVE-NEXT:    [[TMP21:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP19]], [[TMP12]] ]
-; IR-ITERATIVE-NEXT:    ret float [[TMP21]]
-;
-; IR-DPP-LABEL: @global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe(
-; IR-DPP-NEXT:    [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live()
-; IR-DPP-NEXT:    br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP20:%.*]]
-; IR-DPP:       2:
-; IR-DPP-NEXT:    [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
-; IR-DPP-NEXT:    [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
-; IR-DPP-NEXT:    [[TMP5:%.*]] = lshr i64 [[TMP3]], 32
-; IR-DPP-NEXT:    [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32
-; IR-DPP-NEXT:    [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0)
-; IR-DPP-NEXT:    [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]])
-; IR-DPP-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 0
-; IR-DPP-NEXT:    br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]]
-; IR-DPP:       10:
-; IR-DPP-NEXT:    [[TMP11:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
-; IR-DPP-NEXT:    br label [[TMP12]]
-; IR-DPP:       12:
-; IR-DPP-NEXT:    [[TMP13:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP11]], [[TMP10]] ]
-; IR-DPP-NEXT:    [[TMP14:%.*]] = bitcast float [[TMP13]] to i32
-; IR-DPP-NEXT:    [[TMP15:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP14]])
-; IR-DPP-NEXT:    [[TMP16:%.*]] = bitcast i32 [[TMP15]] to float
-; IR-DPP-NEXT:    [[TMP17:%.*]] = uitofp i32 [[TMP8]] to float
-; IR-DPP-NEXT:    [[TMP18:%.*]] = select i1 [[TMP9]], float 0x7FF0000000000000, float [[VAL]]
-; IR-DPP-NEXT:    [[TMP19:%.*]] = call float @llvm.minnum.f32(float [[TMP16]], float [[TMP18]])
-; IR-DPP-NEXT:    br label [[TMP20]]
-; IR-DPP:       20:
-; IR-DPP-NEXT:    [[TMP21:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP19]], [[TMP12]] ]
-; IR-DPP-NEXT:    ret float [[TMP21]]
+; IR-LABEL: @global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe(
+; IR-NEXT:    [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live()
+; IR-NEXT:    br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP20:%.*]]
+; IR:       2:
+; IR-NEXT:    [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+; IR-NEXT:    [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
+; IR-NEXT:    [[TMP5:%.*]] = lshr i64 [[TMP3]], 32
+; IR-NEXT:    [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32
+; IR-NEXT:    [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0)
+; IR-NEXT:    [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]])
+; IR-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 0
+; IR-NEXT:    br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]]
+; IR:       10:
+; IR-NEXT:    [[TMP11:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-NEXT:    br label [[TMP12]]
+; IR:       12:
+; IR-NEXT:    [[TMP13:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP11]], [[TMP10]] ]
+; IR-NEXT:    [[TMP14:%.*]] = bitcast float [[TMP13]] to i32
+; IR-NEXT:    [[TMP15:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP14]])
+; IR-NEXT:    [[TMP16:%.*]] = bitcast i32 [[TMP15]] to float
+; IR-NEXT:    [[TMP17:%.*]] = uitofp i32 [[TMP8]] to float
+; IR-NEXT:    [[TMP18:%.*]] = select i1 [[TMP9]], float 0x7FF0000000000000, float [[VAL]]
+; IR-NEXT:    [[TMP19:%.*]] = call float @llvm.minnum.f32(float [[TMP16]], float [[TMP18]])
+; IR-NEXT:    br label [[TMP20]]
+; IR:       20:
+; IR-NEXT:    [[TMP21:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP19]], [[TMP12]] ]
+; IR-NEXT:    ret float [[TMP21]]
 ;
   %result = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic
   ret float %result
@@ -1007,159 +950,109 @@ define amdgpu_ps float @global_atomic_fadd_uni_address_div_value_system_scope_st
   ret float %result
 }
 
-
 define amdgpu_ps float @global_atomic_fadd_div_address_uni_value_agent_scope_unsafe(ptr addrspace(1) %ptr, float inreg %val) #0 {
-; IR-ITERATIVE-LABEL: @global_atomic_fadd_div_address_uni_value_agent_scope_unsafe(
-; IR-ITERATIVE-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
-; IR-ITERATIVE-NEXT:    ret float [[RESULT]]
-;
-; IR-DPP-LABEL: @global_atomic_fadd_div_address_uni_value_agent_scope_unsafe(
-; IR-DPP-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
-; IR-DPP-NEXT:    ret float [[RESULT]]
+; IR-LABEL: @global_atomic_fadd_div_address_uni_value_agent_scope_unsafe(
+; IR-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-NEXT:    ret float [[RESULT]]
 ;
   %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic, align 4
   ret float %result
 }
 
 define amdgpu_ps float @global_atomic_fadd_div_address_div_value_agent_scope_unsafe(ptr addrspace(1) %ptr, float %val) #0 {
-; IR-ITERATIVE-LABEL: @global_atomic_fadd_div_address_div_value_agent_scope_unsafe(
-; IR-ITERATIVE-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
-; IR-ITERATIVE-NEXT:    ret float [[RESULT]]
-;
-; IR-DPP-LABEL: @global_atomic_fadd_div_address_div_value_agent_scope_unsafe(
-; IR-DPP-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
-; IR-DPP-NEXT:    ret float [[RESULT]]
+; IR-LABEL: @global_atomic_fadd_div_address_div_value_agent_scope_unsafe(
+; IR-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-NEXT:    ret float [[RESULT]]
 ;
   %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic, align 4
   ret float %result
 }
 
 define amdgpu_ps float @global_atomic_fadd_div_address_uni_value_one_as_scope_unsafe_structfp(ptr addrspace(1) %ptr, float inreg %val) #1 {
-; IR-ITERATIVE-LABEL: @global_atomic_fadd_div_address_uni_value_one_as_scope_unsafe_structfp(
-; IR-ITERATIVE-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("one-as") monotonic, align 4
-; IR-ITERATIVE-NEXT:    ret float [[RESULT]]
-;
-; IR-DPP-LABEL: @global_atomic_fadd_div_address_uni_value_one_as_scope_unsafe_structfp(
-; IR-DPP-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("one-as") monotonic, align 4
-; IR-DPP-NEXT:    ret float [[RESULT]]
+; IR-LABEL: @global_atomic_fadd_div_address_uni_value_one_as_scope_unsafe_structfp(
+; IR-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("one-as") monotonic, align 4
+; IR-NEXT:    ret float [[RESULT]]
 ;
   %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("one-as") monotonic
   ret float %result
 }
 
 define amdgpu_ps float @global_atomic_fadd_div_address_div_value_one_as_scope_unsafe_structfp(ptr addrspace(1) %ptr, float %val) #1 {
-; IR-ITERATIVE-LABEL: @global_atomic_fadd_div_address_div_value_one_as_scope_unsafe_structfp(
-; IR-ITERATIVE-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("one-as") monotonic, align 4
-; IR-ITERATIVE-NEXT:    ret float [[RESULT]]
-;
-; IR-DPP-LABEL: @global_atomic_fadd_div_address_div_value_one_as_scope_unsafe_structfp(
-; IR-DPP-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("one-as") monotonic, align 4
-; IR-DPP-NEXT:    ret float [[RESULT]]
+; IR-LABEL: @global_atomic_fadd_div_address_div_value_one_as_scope_unsafe_structfp(
+; IR-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("one-as") monotonic, align 4
+; IR-NEXT:    ret float [[RESULT]]
 ;
   %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("one-as") monotonic
   ret float %result
 }
 
 define amdgpu_ps float @global_atomic_fsub_div_address_uni_value_agent_scope_strictfp(ptr addrspace(1) %ptr, float inreg %val) #2 {
-; IR-ITERATIVE-LABEL: @global_atomic_fsub_div_address_uni_value_agent_scope_strictfp(
-; IR-ITERATIVE-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
-; IR-ITERATIVE-NEXT:    ret float [[RESULT]]
-;
-; IR-DPP-LABEL: @global_atomic_fsub_div_address_uni_value_agent_scope_strictfp(
-; IR-DPP-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
-; IR-DPP-NEXT:    ret float [[RESULT]]
+; IR-LABEL: @global_atomic_fsub_div_address_uni_value_agent_scope_strictfp(
+; IR-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-NEXT:    ret float [[RESULT]]
 ;
   %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic
   ret float %result
 }
 
-
 define amdgpu_ps float @global_atomic_fsub_div_address_div_value_agent_scope_strictfp(ptr addrspace(1) %ptr, float %val) #2 {
-; IR-ITERATIVE-LABEL: @global_atomic_fsub_div_address_div_value_agent_scope_strictfp(
-; IR-ITERATIVE-NEXT:    [[RESULT:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
-; IR-ITERATIVE-NEXT:    ret float [[RESULT]]
-;
-; IR-DPP-LABEL: @global_atomic_fsub_div_address_div_value_agent_scope_strictfp(
-; IR-DPP-NEXT:    [[RESULT:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
-; IR-DPP-NEXT:    ret float [[RESULT]]
+; IR-LABEL: @global_atomic_fsub_div_address_div_value_agent_scope_strictfp(
+; IR-NEXT:    [[RESULT:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-NEXT:    ret float [[RESULT]]
 ;
   %result = atomicrmw fsub ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic
   ret float %result
 }
 
 define amdgpu_ps float @global_atomic_fmin_div_address_uni_value_agent_scope(ptr addrspace(1) %ptr, float inreg %val) #0 {
-; IR-ITERATIVE-LABEL: @global_atomic_fmin_div_address_uni_value_agent_scope(
-; IR-ITERATIVE-NEXT:    [[RESULT:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
-; IR-ITERATIVE-NEXT:    ret float [[RESULT]]
-;
-; IR-DPP-LABEL: @global_atomic_fmin_div_address_uni_value_agent_scope(
-; IR-DPP-NEXT:    [[RESULT:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
-; IR-DPP-NEXT:    ret float [[RESULT]]
+; IR-LABEL: @global_atomic_fmin_div_address_uni_value_agent_scope(
+; IR-NEXT:    [[RESULT:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-NEXT:    ret float [[RESULT]]
 ;
   %result = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic
   ret float %result
 }
 
 define amdgpu_ps float @global_atomic_fmin_div_address_div_value_agent_scope(ptr addrspace(1) %ptr, float %val) #0 {
-; IR-ITERATIVE-LABEL: @global_atomic_fmin_div_address_div_value_agent_scope(
-; IR-ITERATIVE-NEXT:    [[RESULT:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
-; IR-ITERATIVE-NEXT:    ret float [[RESULT]]
-;
-; IR-DPP-LABEL: @global_atomic_fmin_div_address_div_value_agent_scope(
-; IR-DPP-NEXT:    [[RESULT:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
-; IR-DPP-NEXT:    ret float [[RESULT]]
+; IR-LABEL: @global_atomic_fmin_div_address_div_value_agent_scope(
+; IR-NEXT:    [[RESULT:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-NEXT:    ret float [[RESULT]]
 ;
   %result = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic
   ret float %result
 }
 
 define amdgpu_ps float @global_atomic_fmax_div_address_uni_value_agent_scope_unsafe_structfp(ptr addrspace(1) %ptr, float inreg %val) #1{
-; IR-ITERATIVE-LABEL: @global_atomic_fmax_div_address_uni_value_agent_scope_unsafe_structfp(
-; IR-ITERATIVE-NEXT:    [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
-; IR-ITERATIVE-NEXT:    ret float [[RESULT]]
-;
-; IR-DPP-LABEL: @global_atomic_fmax_div_address_uni_value_agent_scope_unsafe_structfp(
-; IR-DPP-NEXT:    [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
-; IR-DPP-NEXT:    ret float [[RESULT]]
+; IR-LABEL: @global_atomic_fmax_div_address_uni_value_agent_scope_unsafe_structfp(
+; IR-NEXT:    [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-NEXT:    ret float [[RESULT]]
 ;
   %result = atomicrmw fmax ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic
   ret float %result
 }
 
 define amdgpu_ps float @global_atomic_fmax_div_address_div_value_agent_scope_unsafe_structfp(ptr addrspace(1) %ptr, float %val) #1{
-; IR-ITERATIVE-LABEL: @global_atomic_fmax_div_address_div_value_agent_scope_unsafe_structfp(
-; IR-ITERATIVE-NEXT:    [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
-; IR-ITERATIVE-NEXT:    ret float [[RESULT]]
-;
-; IR-DPP-LABEL: @global_atomic_fmax_div_address_div_value_agent_scope_unsafe_structfp(
-; IR-DPP-NEXT:    [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
-; IR-DPP-NEXT:    ret float [[RESULT]]
+; IR-LABEL: @global_atomic_fmax_div_address_div_value_agent_scope_unsafe_structfp(
+; IR-NEXT:    [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-NEXT:    ret float [[RESULT]]
 ;
   %result = atomicrmw fmax ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic
   ret float %result
 }
 
 define amdgpu_ps float @global_atomic_fadd_div_address_uni_value_system_scope_strictfp(ptr addrspace(1) %ptr, float inreg %val) #2 {
-; IR-ITERATIVE-LABEL: @global_atomic_fadd_div_address_uni_value_system_scope_strictfp(
-; IR-ITERATIVE-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] monotonic, align 4
-; IR-ITERATIVE-NEXT:    ret float [[RESULT]]
-;
-; IR-DPP-LABEL: @global_atomic_fadd_div_address_uni_value_system_scope_strictfp(
-; IR-DPP-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] monotonic, align 4
-; IR-DPP-NEXT:    ret float [[RESULT]]
+; IR-LABEL: @global_atomic_fadd_div_address_uni_value_system_scope_strictfp(
+; IR-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] monotonic, align 4
+; IR-NEXT:    ret float [[RESULT]]
 ;
   %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val monotonic, align 4
   ret float %result
 }
 
 define amdgpu_ps float @global_atomic_fadd_div_address_div_value_system_scope_strictfp(ptr addrspace(1) %ptr, float %val) #2 {
-; IR-ITERATIVE-LABEL: @global_atomic_fadd_div_address_div_value_system_scope_strictfp(
-; IR-ITERATIVE-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] monotonic, align 4
-; IR-ITERATIVE-NEXT:    ret float [[RESULT]]
-;
-; IR-DPP-LABEL: @global_atomic_fadd_div_address_div_value_system_scope_strictfp(
-; IR-DPP-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] monotonic, align 4
-; IR-DPP-NEXT:    ret float [[RESULT]]
+; IR-LABEL: @global_atomic_fadd_div_address_div_value_system_scope_strictfp(
+; IR-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] monotonic, align 4
+; IR-NEXT:    ret float [[RESULT]]
 ;
   %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val monotonic, align 4
   ret float %result
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_optimizer_fp_no_rtn.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_optimizer_fp_no_rtn.ll
index f87932b9361ed9..cc7a45cbb6e374 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_optimizer_fp_no_rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_optimizer_fp_no_rtn.ll
@@ -1,55 +1,35 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN:  opt -S -mtriple=amdgcn-- -mcpu=gfx906 -amdgpu-atomic-optimizer-strategy=Iterative -passes='amdgpu-atomic-optimizer,verify<domtree>' %s | FileCheck -check-prefix=IR-ITERATIVE %s
-; RUN:  opt -S -mtriple=amdgcn-- -mcpu=gfx906 -amdgpu-atomic-optimizer-strategy=DPP -passes='amdgpu-atomic-optimizer,verify<domtree>' %s | FileCheck -check-prefix=IR-DPP %s
+; RUN:  opt -S -mtriple=amdgcn-- -mcpu=gfx906 -amdgpu-atomic-optimizer-strategy=Iterative -passes='amdgpu-atomic-optimizer,verify<domtree>' %s | FileCheck --check-prefixes=IR,IR-ITERATIVE %s
+; RUN:  opt -S -mtriple=amdgcn-- -mcpu=gfx906 -amdgpu-atomic-optimizer-strategy=DPP -passes='amdgpu-atomic-optimizer,verify<domtree>' %s | FileCheck --check-prefixes=IR,IR-DPP %s
+
+; Tests various combinations of uniform/divergent address and uniform/divergent value inputs of various types for atomic operations.
+; Optimization remains same for Iterative and DPP strategies when value in uniform. These different scan/reduction
+; strategies are valid for only divergent values. This optimization is valid for divergent addresses. Test also covers different scopes.
 
 define amdgpu_ps void @global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe(ptr addrspace(1) inreg %ptr, float inreg %val) #0 {
-; IR-ITERATIVE-LABEL: @global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe(
-; IR-ITERATIVE-NEXT:    [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live()
-; IR-ITERATIVE-NEXT:    br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP17:%.*]]
-; IR-ITERATIVE:       2:
-; IR-ITERATIVE-NEXT:    [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
-; IR-ITERATIVE-NEXT:    [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
-; IR-ITERATIVE-NEXT:    [[TMP5:%.*]] = lshr i64 [[TMP3]], 32
-; IR-ITERATIVE-NEXT:    [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32
-; IR-ITERATIVE-NEXT:    [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0)
-; IR-ITERATIVE-NEXT:    [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]])
-; IR-ITERATIVE-NEXT:    [[TMP9:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP3]])
-; IR-ITERATIVE-NEXT:    [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32
-; IR-ITERATIVE-NEXT:    [[TMP11:%.*]] = uitofp i32 [[TMP10]] to float
-; IR-ITERATIVE-NEXT:    [[TMP12:%.*]] = fmul float [[VAL:%.*]], [[TMP11]]
-; IR-ITERATIVE-NEXT:    [[TMP13:%.*]] = icmp eq i32 [[TMP8]], 0
-; IR-ITERATIVE-NEXT:    br i1 [[TMP13]], label [[TMP14:%.*]], label [[TMP16:%.*]]
-; IR-ITERATIVE:       14:
-; IR-ITERATIVE-NEXT:    [[TMP15:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP12]] syncscope("agent") monotonic, align 4
-; IR-ITERATIVE-NEXT:    br label [[TMP16]]
-; IR-ITERATIVE:       16:
-; IR-ITERATIVE-NEXT:    br label [[TMP17]]
-; IR-ITERATIVE:       17:
-; IR-ITERATIVE-NEXT:    ret void
-;
-; IR-DPP-LABEL: @global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe(
-; IR-DPP-NEXT:    [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live()
-; IR-DPP-NEXT:    br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP17:%.*]]
-; IR-DPP:       2:
-; IR-DPP-NEXT:    [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
-; IR-DPP-NEXT:    [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
-; IR-DPP-NEXT:    [[TMP5:%.*]] = lshr i64 [[TMP3]], 32
-; IR-DPP-NEXT:    [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32
-; IR-DPP-NEXT:    [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0)
-; IR-DPP-NEXT:    [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]])
-; IR-DPP-NEXT:    [[TMP9:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP3]])
-; IR-DPP-NEXT:    [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32
-; IR-DPP-NEXT:    [[TMP11:%.*]] = uitofp i32 [[TMP10]] to float
-; IR-DPP-NEXT:    [[TMP12:%.*]] = fmul float [[VAL:%.*]], [[TMP11]]
-; IR-DPP-NEXT:    [[TMP13:%.*]] = icmp eq i32 [[TMP8]], 0
-; IR-DPP-NEXT:    br i1 [[TMP13]], label [[TMP14:%.*]], label [[TMP16:%.*]]
-; IR-DPP:       14:
-; IR-DPP-NEXT:    [[TMP15:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP12]] syncscope("agent") monotonic, align 4
-; IR-DPP-NEXT:    br label [[TMP16]]
-; IR-DPP:       16:
-; IR-DPP-NEXT:    br label [[TMP17]]
-; IR-DPP:       17:
-; IR-DPP-NEXT:    ret void
+; IR-LABEL: @global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe(
+; IR-NEXT:    [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live()
+; IR-NEXT:    br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP17:%.*]]
+; IR:       2:
+; IR-NEXT:    [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+; IR-NEXT:    [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
+; IR-NEXT:    [[TMP5:%.*]] = lshr i64 [[TMP3]], 32
+; IR-NEXT:    [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32
+; IR-NEXT:    [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0)
+; IR-NEXT:    [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]])
+; IR-NEXT:    [[TMP9:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP3]])
+; IR-NEXT:    [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32
+; IR-NEXT:    [[TMP11:%.*]] = uitofp i32 [[TMP10]] to float
+; IR-NEXT:    [[TMP12:%.*]] = fmul float [[VAL:%.*]], [[TMP11]]
+; IR-NEXT:    [[TMP13:%.*]] = icmp eq i32 [[TMP8]], 0
+; IR-NEXT:    br i1 [[TMP13]], label [[TMP14:%.*]], label [[TMP16:%.*]]
+; IR:       14:
+; IR-NEXT:    [[TMP15:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP12]] syncscope("agent") monotonic, align 4
+; IR-NEXT:    br label [[TMP16]]
+; IR:       16:
+; IR-NEXT:    br label [[TMP17]]
+; IR:       17:
+; IR-NEXT:    ret void
 ;
   %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic, align 4
   ret void
@@ -325,7 +305,6 @@ define amdgpu_ps void @global_atomic_fsub_uni_address_uni_value_agent_scope_stri
   ret void
 }
 
-
 define amdgpu_ps void @global_atomic_fsub_uni_address_div_value_agent_scope_strictfp(ptr addrspace(1) inreg %ptr, float %val) #2 {
 ; IR-ITERATIVE-LABEL: @global_atomic_fsub_uni_address_div_value_agent_scope_strictfp(
 ; IR-ITERATIVE-NEXT:    [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR7]]
@@ -409,45 +388,25 @@ define amdgpu_ps void @global_atomic_fsub_uni_address_div_value_agent_scope_stri
 }
 
 define amdgpu_ps void @global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe(ptr addrspace(1) inreg %ptr, float inreg %val) #0 {
-; IR-ITERATIVE-LABEL: @global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe(
-; IR-ITERATIVE-NEXT:    [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live()
-; IR-ITERATIVE-NEXT:    br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP13:%.*]]
-; IR-ITERATIVE:       2:
-; IR-ITERATIVE-NEXT:    [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
-; IR-ITERATIVE-NEXT:    [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
-; IR-ITERATIVE-NEXT:    [[TMP5:%.*]] = lshr i64 [[TMP3]], 32
-; IR-ITERATIVE-NEXT:    [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32
-; IR-ITERATIVE-NEXT:    [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0)
-; IR-ITERATIVE-NEXT:    [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]])
-; IR-ITERATIVE-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 0
-; IR-ITERATIVE-NEXT:    br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]]
-; IR-ITERATIVE:       10:
-; IR-ITERATIVE-NEXT:    [[TMP11:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
-; IR-ITERATIVE-NEXT:    br label [[TMP12]]
-; IR-ITERATIVE:       12:
-; IR-ITERATIVE-NEXT:    br label [[TMP13]]
-; IR-ITERATIVE:       13:
-; IR-ITERATIVE-NEXT:    ret void
-;
-; IR-DPP-LABEL: @global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe(
-; IR-DPP-NEXT:    [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live()
-; IR-DPP-NEXT:    br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP13:%.*]]
-; IR-DPP:       2:
-; IR-DPP-NEXT:    [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
-; IR-DPP-NEXT:    [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
-; IR-DPP-NEXT:    [[TMP5:%.*]] = lshr i64 [[TMP3]], 32
-; IR-DPP-NEXT:    [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32
-; IR-DPP-NEXT:    [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0)
-; IR-DPP-NEXT:    [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]])
-; IR-DPP-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 0
-; IR-DPP-NEXT:    br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]]
-; IR-DPP:       10:
-; IR-DPP-NEXT:    [[TMP11:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
-; IR-DPP-NEXT:    br label [[TMP12]]
-; IR-DPP:       12:
-; IR-DPP-NEXT:    br label [[TMP13]]
-; IR-DPP:       13:
-; IR-DPP-NEXT:    ret void
+; IR-LABEL: @global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe(
+; IR-NEXT:    [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live()
+; IR-NEXT:    br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP13:%.*]]
+; IR:       2:
+; IR-NEXT:    [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+; IR-NEXT:    [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
+; IR-NEXT:    [[TMP5:%.*]] = lshr i64 [[TMP3]], 32
+; IR-NEXT:    [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32
+; IR-NEXT:    [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0)
+; IR-NEXT:    [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]])
+; IR-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 0
+; IR-NEXT:    br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]]
+; IR:       10:
+; IR-NEXT:    [[TMP11:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-NEXT:    br label [[TMP12]]
+; IR:       12:
+; IR-NEXT:    br label [[TMP13]]
+; IR:       13:
+; IR-NEXT:    ret void
 ;
   %result = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic
   ret void
@@ -797,159 +756,109 @@ define amdgpu_ps void @global_atomic_fadd_uni_address_div_value_system_scope_str
   ret void
 }
 
-
 define amdgpu_ps void @global_atomic_fadd_div_address_uni_value_agent_scope_unsafe(ptr addrspace(1) %ptr, float inreg %val) #0 {
-; IR-ITERATIVE-LABEL: @global_atomic_fadd_div_address_uni_value_agent_scope_unsafe(
-; IR-ITERATIVE-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
-; IR-ITERATIVE-NEXT:    ret void
-;
-; IR-DPP-LABEL: @global_atomic_fadd_div_address_uni_value_agent_scope_unsafe(
-; IR-DPP-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
-; IR-DPP-NEXT:    ret void
+; IR-LABEL: @global_atomic_fadd_div_address_uni_value_agent_scope_unsafe(
+; IR-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-NEXT:    ret void
 ;
   %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic, align 4
   ret void
 }
 
 define amdgpu_ps void @global_atomic_fadd_div_address_div_value_agent_scope_unsafe(ptr addrspace(1) %ptr, float %val) #0 {
-; IR-ITERATIVE-LABEL: @global_atomic_fadd_div_address_div_value_agent_scope_unsafe(
-; IR-ITERATIVE-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
-; IR-ITERATIVE-NEXT:    ret void
-;
-; IR-DPP-LABEL: @global_atomic_fadd_div_address_div_value_agent_scope_unsafe(
-; IR-DPP-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
-; IR-DPP-NEXT:    ret void
+; IR-LABEL: @global_atomic_fadd_div_address_div_value_agent_scope_unsafe(
+; IR-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-NEXT:    ret void
 ;
   %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic, align 4
   ret void
 }
 
 define amdgpu_ps void @global_atomic_fadd_div_address_uni_value_one_as_scope_unsafe_structfp(ptr addrspace(1) %ptr, float inreg %val) #1 {
-; IR-ITERATIVE-LABEL: @global_atomic_fadd_div_address_uni_value_one_as_scope_unsafe_structfp(
-; IR-ITERATIVE-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("one-as") monotonic, align 4
-; IR-ITERATIVE-NEXT:    ret void
-;
-; IR-DPP-LABEL: @global_atomic_fadd_div_address_uni_value_one_as_scope_unsafe_structfp(
-; IR-DPP-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("one-as") monotonic, align 4
-; IR-DPP-NEXT:    ret void
+; IR-LABEL: @global_atomic_fadd_div_address_uni_value_one_as_scope_unsafe_structfp(
+; IR-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("one-as") monotonic, align 4
+; IR-NEXT:    ret void
 ;
   %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("one-as") monotonic
   ret void
 }
 
 define amdgpu_ps void @global_atomic_fadd_div_address_div_value_one_as_scope_unsafe_structfp(ptr addrspace(1) %ptr, float %val) #1 {
-; IR-ITERATIVE-LABEL: @global_atomic_fadd_div_address_div_value_one_as_scope_unsafe_structfp(
-; IR-ITERATIVE-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("one-as") monotonic, align 4
-; IR-ITERATIVE-NEXT:    ret void
-;
-; IR-DPP-LABEL: @global_atomic_fadd_div_address_div_value_one_as_scope_unsafe_structfp(
-; IR-DPP-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("one-as") monotonic, align 4
-; IR-DPP-NEXT:    ret void
+; IR-LABEL: @global_atomic_fadd_div_address_div_value_one_as_scope_unsafe_structfp(
+; IR-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("one-as") monotonic, align 4
+; IR-NEXT:    ret void
 ;
   %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("one-as") monotonic
   ret void
 }
 
 define amdgpu_ps void @global_atomic_fsub_div_address_uni_value_agent_scope_strictfp(ptr addrspace(1) %ptr, float inreg %val) #2 {
-; IR-ITERATIVE-LABEL: @global_atomic_fsub_div_address_uni_value_agent_scope_strictfp(
-; IR-ITERATIVE-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
-; IR-ITERATIVE-NEXT:    ret void
-;
-; IR-DPP-LABEL: @global_atomic_fsub_div_address_uni_value_agent_scope_strictfp(
-; IR-DPP-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
-; IR-DPP-NEXT:    ret void
+; IR-LABEL: @global_atomic_fsub_div_address_uni_value_agent_scope_strictfp(
+; IR-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-NEXT:    ret void
 ;
   %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic
   ret void
 }
 
-
 define amdgpu_ps void @global_atomic_fsub_div_address_div_value_agent_scope_strictfp(ptr addrspace(1) %ptr, float %val) #2 {
-; IR-ITERATIVE-LABEL: @global_atomic_fsub_div_address_div_value_agent_scope_strictfp(
-; IR-ITERATIVE-NEXT:    [[RESULT:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
-; IR-ITERATIVE-NEXT:    ret void
-;
-; IR-DPP-LABEL: @global_atomic_fsub_div_address_div_value_agent_scope_strictfp(
-; IR-DPP-NEXT:    [[RESULT:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
-; IR-DPP-NEXT:    ret void
+; IR-LABEL: @global_atomic_fsub_div_address_div_value_agent_scope_strictfp(
+; IR-NEXT:    [[RESULT:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-NEXT:    ret void
 ;
   %result = atomicrmw fsub ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic
   ret void
 }
 
 define amdgpu_ps void @global_atomic_fmin_div_address_uni_value_agent_scope(ptr addrspace(1) %ptr, float inreg %val) #0 {
-; IR-ITERATIVE-LABEL: @global_atomic_fmin_div_address_uni_value_agent_scope(
-; IR-ITERATIVE-NEXT:    [[RESULT:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
-; IR-ITERATIVE-NEXT:    ret void
-;
-; IR-DPP-LABEL: @global_atomic_fmin_div_address_uni_value_agent_scope(
-; IR-DPP-NEXT:    [[RESULT:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
-; IR-DPP-NEXT:    ret void
+; IR-LABEL: @global_atomic_fmin_div_address_uni_value_agent_scope(
+; IR-NEXT:    [[RESULT:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-NEXT:    ret void
 ;
   %result = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic
   ret void
 }
 
 define amdgpu_ps void @global_atomic_fmin_div_address_div_value_agent_scope(ptr addrspace(1) %ptr, float %val) #0 {
-; IR-ITERATIVE-LABEL: @global_atomic_fmin_div_address_div_value_agent_scope(
-; IR-ITERATIVE-NEXT:    [[RESULT:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
-; IR-ITERATIVE-NEXT:    ret void
-;
-; IR-DPP-LABEL: @global_atomic_fmin_div_address_div_value_agent_scope(
-; IR-DPP-NEXT:    [[RESULT:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
-; IR-DPP-NEXT:    ret void
+; IR-LABEL: @global_atomic_fmin_div_address_div_value_agent_scope(
+; IR-NEXT:    [[RESULT:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-NEXT:    ret void
 ;
   %result = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic
   ret void
 }
 
 define amdgpu_ps void @global_atomic_fmax_div_address_uni_value_agent_scope_unsafe_structfp(ptr addrspace(1) %ptr, float inreg %val) #1{
-; IR-ITERATIVE-LABEL: @global_atomic_fmax_div_address_uni_value_agent_scope_unsafe_structfp(
-; IR-ITERATIVE-NEXT:    [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
-; IR-ITERATIVE-NEXT:    ret void
-;
-; IR-DPP-LABEL: @global_atomic_fmax_div_address_uni_value_agent_scope_unsafe_structfp(
-; IR-DPP-NEXT:    [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
-; IR-DPP-NEXT:    ret void
+; IR-LABEL: @global_atomic_fmax_div_address_uni_value_agent_scope_unsafe_structfp(
+; IR-NEXT:    [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-NEXT:    ret void
 ;
   %result = atomicrmw fmax ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic
   ret void
 }
 
 define amdgpu_ps void @global_atomic_fmax_div_address_div_value_agent_scope_unsafe_structfp(ptr addrspace(1) %ptr, float %val) #1{
-; IR-ITERATIVE-LABEL: @global_atomic_fmax_div_address_div_value_agent_scope_unsafe_structfp(
-; IR-ITERATIVE-NEXT:    [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
-; IR-ITERATIVE-NEXT:    ret void
-;
-; IR-DPP-LABEL: @global_atomic_fmax_div_address_div_value_agent_scope_unsafe_structfp(
-; IR-DPP-NEXT:    [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
-; IR-DPP-NEXT:    ret void
+; IR-LABEL: @global_atomic_fmax_div_address_div_value_agent_scope_unsafe_structfp(
+; IR-NEXT:    [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-NEXT:    ret void
 ;
   %result = atomicrmw fmax ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic
   ret void
 }
 
 define amdgpu_ps void @global_atomic_fadd_div_address_uni_value_system_scope_strictfp(ptr addrspace(1) %ptr, float inreg %val) #2 {
-; IR-ITERATIVE-LABEL: @global_atomic_fadd_div_address_uni_value_system_scope_strictfp(
-; IR-ITERATIVE-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] monotonic, align 4
-; IR-ITERATIVE-NEXT:    ret void
-;
-; IR-DPP-LABEL: @global_atomic_fadd_div_address_uni_value_system_scope_strictfp(
-; IR-DPP-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] monotonic, align 4
-; IR-DPP-NEXT:    ret void
+; IR-LABEL: @global_atomic_fadd_div_address_uni_value_system_scope_strictfp(
+; IR-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] monotonic, align 4
+; IR-NEXT:    ret void
 ;
   %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val monotonic, align 4
   ret void
 }
 
 define amdgpu_ps void @global_atomic_fadd_div_address_div_value_system_scope_strictfp(ptr addrspace(1) %ptr, float %val) #2 {
-; IR-ITERATIVE-LABEL: @global_atomic_fadd_div_address_div_value_system_scope_strictfp(
-; IR-ITERATIVE-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] monotonic, align 4
-; IR-ITERATIVE-NEXT:    ret void
-;
-; IR-DPP-LABEL: @global_atomic_fadd_div_address_div_value_system_scope_strictfp(
-; IR-DPP-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] monotonic, align 4
-; IR-DPP-NEXT:    ret void
+; IR-LABEL: @global_atomic_fadd_div_address_div_value_system_scope_strictfp(
+; IR-NEXT:    [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] monotonic, align 4
+; IR-NEXT:    ret void
 ;
   %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val monotonic, align 4
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/itofp.i128.ll b/llvm/test/CodeGen/AMDGPU/itofp.i128.ll
new file mode 100644
index 00000000000000..bfeb214c5af8f4
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/itofp.i128.ll
@@ -0,0 +1,1618 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GISEL %s
+
+define float @sitofp_i128_to_f32(i128 %x) {
+; SDAG-LABEL: sitofp_i128_to_f32:
+; SDAG:       ; %bb.0: ; %itofp-entry
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_or_b32_e32 v5, v1, v3
+; SDAG-NEXT:    v_or_b32_e32 v4, v0, v2
+; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; SDAG-NEXT:    v_mov_b32_e32 v4, 0
+; SDAG-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; SDAG-NEXT:    s_cbranch_execz .LBB0_14
+; SDAG-NEXT:  ; %bb.1: ; %itofp-if-end
+; SDAG-NEXT:    v_ashrrev_i32_e32 v5, 31, v3
+; SDAG-NEXT:    v_xor_b32_e32 v0, v5, v0
+; SDAG-NEXT:    v_xor_b32_e32 v1, v5, v1
+; SDAG-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v5
+; SDAG-NEXT:    v_xor_b32_e32 v2, v5, v2
+; SDAG-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v5, vcc
+; SDAG-NEXT:    v_xor_b32_e32 v6, v5, v3
+; SDAG-NEXT:    v_subb_co_u32_e32 v4, vcc, v2, v5, vcc
+; SDAG-NEXT:    v_subb_co_u32_e32 v5, vcc, v6, v5, vcc
+; SDAG-NEXT:    v_ffbh_u32_e32 v2, v4
+; SDAG-NEXT:    v_add_u32_e32 v2, 32, v2
+; SDAG-NEXT:    v_ffbh_u32_e32 v6, v5
+; SDAG-NEXT:    v_min_u32_e32 v2, v2, v6
+; SDAG-NEXT:    v_ffbh_u32_e32 v6, v0
+; SDAG-NEXT:    v_add_u32_e32 v6, 32, v6
+; SDAG-NEXT:    v_ffbh_u32_e32 v7, v1
+; SDAG-NEXT:    v_min_u32_e32 v6, v6, v7
+; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; SDAG-NEXT:    v_add_u32_e32 v6, 64, v6
+; SDAG-NEXT:    v_cndmask_b32_e32 v7, v6, v2, vcc
+; SDAG-NEXT:    v_sub_u32_e32 v6, 0x80, v7
+; SDAG-NEXT:    v_sub_u32_e32 v2, 0x7f, v7
+; SDAG-NEXT:    v_cmp_gt_i32_e32 vcc, 25, v6
+; SDAG-NEXT:    ; implicit-def: $vgpr8
+; SDAG-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; SDAG-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; SDAG-NEXT:  ; %bb.2: ; %itofp-if-else
+; SDAG-NEXT:    v_add_u32_e32 v4, 0xffffff98, v7
+; SDAG-NEXT:    v_lshlrev_b64 v[0:1], v4, v[0:1]
+; SDAG-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v4
+; SDAG-NEXT:    v_cndmask_b32_e32 v8, 0, v0, vcc
+; SDAG-NEXT:    ; implicit-def: $vgpr6
+; SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; SDAG-NEXT:    ; implicit-def: $vgpr7
+; SDAG-NEXT:    ; implicit-def: $vgpr4_vgpr5
+; SDAG-NEXT:  ; %bb.3: ; %Flow3
+; SDAG-NEXT:    s_andn2_saveexec_b64 s[8:9], s[4:5]
+; SDAG-NEXT:    s_cbranch_execz .LBB0_13
+; SDAG-NEXT:  ; %bb.4: ; %NodeBlock
+; SDAG-NEXT:    v_cmp_lt_i32_e32 vcc, 25, v6
+; SDAG-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; SDAG-NEXT:    s_xor_b64 s[10:11], exec, s[4:5]
+; SDAG-NEXT:    s_cbranch_execz .LBB0_8
+; SDAG-NEXT:  ; %bb.5: ; %LeafBlock
+; SDAG-NEXT:    v_cmp_ne_u32_e32 vcc, 26, v6
+; SDAG-NEXT:    s_and_saveexec_b64 s[12:13], vcc
+; SDAG-NEXT:    s_cbranch_execz .LBB0_7
+; SDAG-NEXT:  ; %bb.6: ; %itofp-sw-default
+; SDAG-NEXT:    v_sub_u32_e32 v12, 0x66, v7
+; SDAG-NEXT:    v_sub_u32_e32 v10, 64, v12
+; SDAG-NEXT:    v_lshrrev_b64 v[8:9], v12, v[0:1]
+; SDAG-NEXT:    v_lshlrev_b64 v[10:11], v10, v[4:5]
+; SDAG-NEXT:    v_sub_u32_e32 v13, 38, v7
+; SDAG-NEXT:    v_or_b32_e32 v11, v9, v11
+; SDAG-NEXT:    v_or_b32_e32 v10, v8, v10
+; SDAG-NEXT:    v_lshrrev_b64 v[8:9], v13, v[4:5]
+; SDAG-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v12
+; SDAG-NEXT:    v_add_u32_e32 v14, 26, v7
+; SDAG-NEXT:    v_cndmask_b32_e32 v9, v9, v11, vcc
+; SDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v12
+; SDAG-NEXT:    v_cndmask_b32_e32 v8, v8, v10, vcc
+; SDAG-NEXT:    v_lshrrev_b64 v[10:11], v13, v[0:1]
+; SDAG-NEXT:    v_lshlrev_b64 v[12:13], v14, v[4:5]
+; SDAG-NEXT:    v_subrev_u32_e32 v7, 38, v7
+; SDAG-NEXT:    v_cndmask_b32_e64 v15, v8, v0, s[4:5]
+; SDAG-NEXT:    v_lshlrev_b64 v[7:8], v7, v[0:1]
+; SDAG-NEXT:    v_cndmask_b32_e64 v9, v9, v1, s[4:5]
+; SDAG-NEXT:    v_or_b32_e32 v11, v13, v11
+; SDAG-NEXT:    v_or_b32_e32 v10, v12, v10
+; SDAG-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v14
+; SDAG-NEXT:    v_lshlrev_b64 v[0:1], v14, v[0:1]
+; SDAG-NEXT:    v_cndmask_b32_e32 v8, v8, v11, vcc
+; SDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v14
+; SDAG-NEXT:    v_cndmask_b32_e32 v7, v7, v10, vcc
+; SDAG-NEXT:    v_cndmask_b32_e64 v5, v8, v5, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v4, v7, v4, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; SDAG-NEXT:    v_or_b32_e32 v1, v1, v5
+; SDAG-NEXT:    v_or_b32_e32 v0, v0, v4
+; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; SDAG-NEXT:    v_or_b32_e32 v8, v15, v0
+; SDAG-NEXT:    v_mov_b32_e32 v0, v8
+; SDAG-NEXT:    v_mov_b32_e32 v1, v9
+; SDAG-NEXT:  .LBB0_7: ; %Flow1
+; SDAG-NEXT:    s_or_b64 exec, exec, s[12:13]
+; SDAG-NEXT:  .LBB0_8: ; %Flow2
+; SDAG-NEXT:    s_andn2_saveexec_b64 s[4:5], s[10:11]
+; SDAG-NEXT:  ; %bb.9: ; %itofp-sw-bb
+; SDAG-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
+; SDAG-NEXT:  ; %bb.10: ; %itofp-sw-epilog
+; SDAG-NEXT:    s_or_b64 exec, exec, s[4:5]
+; SDAG-NEXT:    v_lshrrev_b32_e32 v4, 2, v0
+; SDAG-NEXT:    v_and_or_b32 v0, v4, 1, v0
+; SDAG-NEXT:    v_add_co_u32_e32 v0, vcc, 1, v0
+; SDAG-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; SDAG-NEXT:    v_and_b32_e32 v4, 0x4000000, v0
+; SDAG-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
+; SDAG-NEXT:    v_alignbit_b32 v8, v1, v0, 2
+; SDAG-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; SDAG-NEXT:  ; %bb.11: ; %itofp-if-then20
+; SDAG-NEXT:    v_alignbit_b32 v8, v1, v0, 3
+; SDAG-NEXT:    v_mov_b32_e32 v2, v6
+; SDAG-NEXT:  ; %bb.12: ; %Flow
+; SDAG-NEXT:    s_or_b64 exec, exec, s[4:5]
+; SDAG-NEXT:  .LBB0_13: ; %Flow4
+; SDAG-NEXT:    s_or_b64 exec, exec, s[8:9]
+; SDAG-NEXT:    v_and_b32_e32 v0, 0x80000000, v3
+; SDAG-NEXT:    v_lshl_add_u32 v1, v2, 23, 1.0
+; SDAG-NEXT:    v_and_b32_e32 v2, 0x7fffff, v8
+; SDAG-NEXT:    v_or3_b32 v4, v2, v0, v1
+; SDAG-NEXT:  .LBB0_14: ; %Flow5
+; SDAG-NEXT:    s_or_b64 exec, exec, s[6:7]
+; SDAG-NEXT:    v_mov_b32_e32 v0, v4
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: sitofp_i128_to_f32:
+; GISEL:       ; %bb.0: ; %itofp-entry
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_or_b32_e32 v4, v0, v2
+; GISEL-NEXT:    v_or_b32_e32 v5, v1, v3
+; GISEL-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GISEL-NEXT:    s_mov_b32 s4, 0
+; GISEL-NEXT:    v_mov_b32_e32 v4, s4
+; GISEL-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; GISEL-NEXT:    s_cbranch_execz .LBB0_14
+; GISEL-NEXT:  ; %bb.1: ; %itofp-if-end
+; GISEL-NEXT:    v_ashrrev_i32_e32 v6, 31, v3
+; GISEL-NEXT:    v_xor_b32_e32 v0, v6, v0
+; GISEL-NEXT:    v_xor_b32_e32 v1, v6, v1
+; GISEL-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v6
+; GISEL-NEXT:    v_xor_b32_e32 v2, v6, v2
+; GISEL-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v6, vcc
+; GISEL-NEXT:    v_xor_b32_e32 v3, v6, v3
+; GISEL-NEXT:    v_subb_co_u32_e32 v2, vcc, v2, v6, vcc
+; GISEL-NEXT:    v_ffbh_u32_e32 v5, v0
+; GISEL-NEXT:    v_subb_co_u32_e32 v3, vcc, v3, v6, vcc
+; GISEL-NEXT:    v_ffbh_u32_e32 v4, v1
+; GISEL-NEXT:    v_add_u32_e32 v5, 32, v5
+; GISEL-NEXT:    v_ffbh_u32_e32 v7, v2
+; GISEL-NEXT:    v_min_u32_e32 v4, v4, v5
+; GISEL-NEXT:    v_ffbh_u32_e32 v5, v3
+; GISEL-NEXT:    v_add_u32_e32 v7, 32, v7
+; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[2:3]
+; GISEL-NEXT:    v_add_u32_e32 v4, 64, v4
+; GISEL-NEXT:    v_min_u32_e32 v5, v5, v7
+; GISEL-NEXT:    v_cndmask_b32_e32 v5, v5, v4, vcc
+; GISEL-NEXT:    v_sub_u32_e32 v8, 0x80, v5
+; GISEL-NEXT:    v_sub_u32_e32 v7, 0x7f, v5
+; GISEL-NEXT:    v_cmp_ge_i32_e32 vcc, 24, v8
+; GISEL-NEXT:    ; implicit-def: $vgpr4
+; GISEL-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GISEL-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; GISEL-NEXT:  ; %bb.2: ; %itofp-if-else
+; GISEL-NEXT:    v_add_u32_e32 v2, 0xffffff98, v5
+; GISEL-NEXT:    v_lshlrev_b64 v[0:1], v2, v[0:1]
+; GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v2
+; GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v0, vcc
+; GISEL-NEXT:    ; implicit-def: $vgpr8
+; GISEL-NEXT:    ; implicit-def: $vgpr0
+; GISEL-NEXT:    ; implicit-def: $vgpr5
+; GISEL-NEXT:    ; implicit-def: $vgpr2
+; GISEL-NEXT:  ; %bb.3: ; %Flow3
+; GISEL-NEXT:    s_andn2_saveexec_b64 s[8:9], s[4:5]
+; GISEL-NEXT:    s_cbranch_execz .LBB0_13
+; GISEL-NEXT:  ; %bb.4: ; %NodeBlock
+; GISEL-NEXT:    v_cmp_le_i32_e32 vcc, 26, v8
+; GISEL-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GISEL-NEXT:    s_xor_b64 s[10:11], exec, s[4:5]
+; GISEL-NEXT:    s_cbranch_execz .LBB0_8
+; GISEL-NEXT:  ; %bb.5: ; %LeafBlock
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 26, v8
+; GISEL-NEXT:    s_and_saveexec_b64 s[12:13], vcc
+; GISEL-NEXT:    s_cbranch_execz .LBB0_7
+; GISEL-NEXT:  ; %bb.6: ; %itofp-sw-default
+; GISEL-NEXT:    v_sub_u32_e32 v4, 0x66, v5
+; GISEL-NEXT:    v_sub_u32_e32 v11, 64, v4
+; GISEL-NEXT:    v_lshrrev_b64 v[9:10], v4, v[0:1]
+; GISEL-NEXT:    v_lshlrev_b64 v[11:12], v11, v[2:3]
+; GISEL-NEXT:    v_subrev_u32_e32 v13, 64, v4
+; GISEL-NEXT:    v_or_b32_e32 v11, v9, v11
+; GISEL-NEXT:    v_or_b32_e32 v12, v10, v12
+; GISEL-NEXT:    v_lshrrev_b64 v[9:10], v13, v[2:3]
+; GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v4
+; GISEL-NEXT:    v_add_u32_e32 v5, 26, v5
+; GISEL-NEXT:    v_cndmask_b32_e32 v9, v9, v11, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v10, v10, v12, vcc
+; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
+; GISEL-NEXT:    v_sub_u32_e32 v11, 64, v5
+; GISEL-NEXT:    v_cndmask_b32_e32 v13, v9, v0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v4, v10, v1, vcc
+; GISEL-NEXT:    v_lshrrev_b64 v[9:10], v5, -1
+; GISEL-NEXT:    v_lshlrev_b64 v[11:12], v11, -1
+; GISEL-NEXT:    v_subrev_u32_e32 v14, 64, v5
+; GISEL-NEXT:    v_or_b32_e32 v15, v9, v11
+; GISEL-NEXT:    v_or_b32_e32 v16, v10, v12
+; GISEL-NEXT:    v_lshrrev_b64 v[11:12], v14, -1
+; GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v5
+; GISEL-NEXT:    v_cndmask_b32_e32 v11, v11, v15, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v12, v12, v16, vcc
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v5
+; GISEL-NEXT:    v_cndmask_b32_e32 v9, 0, v9, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v10, 0, v10, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v5, v11, -1, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e64 v11, v12, -1, s[4:5]
+; GISEL-NEXT:    v_and_b32_e32 v2, v9, v2
+; GISEL-NEXT:    v_and_b32_e32 v3, v10, v3
+; GISEL-NEXT:    v_and_or_b32 v0, v5, v0, v2
+; GISEL-NEXT:    v_and_or_b32 v1, v11, v1, v3
+; GISEL-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GISEL-NEXT:    v_or_b32_e32 v3, v13, v0
+; GISEL-NEXT:    v_mov_b32_e32 v0, v3
+; GISEL-NEXT:    v_mov_b32_e32 v1, v4
+; GISEL-NEXT:    v_mov_b32_e32 v2, v5
+; GISEL-NEXT:    v_mov_b32_e32 v3, v6
+; GISEL-NEXT:  .LBB0_7: ; %Flow1
+; GISEL-NEXT:    s_or_b64 exec, exec, s[12:13]
+; GISEL-NEXT:  .LBB0_8: ; %Flow2
+; GISEL-NEXT:    s_andn2_saveexec_b64 s[4:5], s[10:11]
+; GISEL-NEXT:  ; %bb.9: ; %itofp-sw-bb
+; GISEL-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
+; GISEL-NEXT:  ; %bb.10: ; %itofp-sw-epilog
+; GISEL-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GISEL-NEXT:    v_bfe_u32 v2, v0, 2, 1
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v2
+; GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, 1, v0
+; GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GISEL-NEXT:    v_and_b32_e32 v2, 0x4000000, v0
+; GISEL-NEXT:    v_mov_b32_e32 v3, 0
+; GISEL-NEXT:    v_lshrrev_b64 v[4:5], 2, v[0:1]
+; GISEL-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; GISEL-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GISEL-NEXT:  ; %bb.11: ; %itofp-if-then20
+; GISEL-NEXT:    v_lshrrev_b64 v[4:5], 3, v[0:1]
+; GISEL-NEXT:    v_mov_b32_e32 v7, v8
+; GISEL-NEXT:  ; %bb.12: ; %Flow
+; GISEL-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GISEL-NEXT:  .LBB0_13: ; %Flow4
+; GISEL-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GISEL-NEXT:    v_and_b32_e32 v0, 0x80000000, v6
+; GISEL-NEXT:    v_lshl_add_u32 v1, v7, 23, 1.0
+; GISEL-NEXT:    v_and_b32_e32 v2, 0x7fffff, v4
+; GISEL-NEXT:    v_or3_b32 v4, v2, v0, v1
+; GISEL-NEXT:  .LBB0_14: ; %Flow5
+; GISEL-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GISEL-NEXT:    v_mov_b32_e32 v0, v4
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %cvt = sitofp i128 %x to float
+  ret float %cvt
+}
+
+define float @uitofp_i128_to_f32(i128 %x) {
+; SDAG-LABEL: uitofp_i128_to_f32:
+; SDAG:       ; %bb.0: ; %itofp-entry
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_or_b32_e32 v5, v1, v3
+; SDAG-NEXT:    v_or_b32_e32 v4, v0, v2
+; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; SDAG-NEXT:    v_mov_b32_e32 v4, 0
+; SDAG-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; SDAG-NEXT:    s_cbranch_execz .LBB1_14
+; SDAG-NEXT:  ; %bb.1: ; %itofp-if-end
+; SDAG-NEXT:    v_ffbh_u32_e32 v4, v2
+; SDAG-NEXT:    v_add_u32_e32 v4, 32, v4
+; SDAG-NEXT:    v_ffbh_u32_e32 v5, v3
+; SDAG-NEXT:    v_min_u32_e32 v4, v4, v5
+; SDAG-NEXT:    v_ffbh_u32_e32 v5, v0
+; SDAG-NEXT:    v_add_u32_e32 v5, 32, v5
+; SDAG-NEXT:    v_ffbh_u32_e32 v6, v1
+; SDAG-NEXT:    v_min_u32_e32 v5, v5, v6
+; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; SDAG-NEXT:    v_add_u32_e32 v5, 64, v5
+; SDAG-NEXT:    v_cndmask_b32_e32 v6, v5, v4, vcc
+; SDAG-NEXT:    v_sub_u32_e32 v5, 0x80, v6
+; SDAG-NEXT:    v_sub_u32_e32 v4, 0x7f, v6
+; SDAG-NEXT:    v_cmp_gt_i32_e32 vcc, 25, v5
+; SDAG-NEXT:    ; implicit-def: $vgpr7
+; SDAG-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; SDAG-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; SDAG-NEXT:  ; %bb.2: ; %itofp-if-else
+; SDAG-NEXT:    v_add_u32_e32 v2, 0xffffff98, v6
+; SDAG-NEXT:    v_lshlrev_b64 v[0:1], v2, v[0:1]
+; SDAG-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v2
+; SDAG-NEXT:    v_cndmask_b32_e32 v7, 0, v0, vcc
+; SDAG-NEXT:    ; implicit-def: $vgpr5
+; SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; SDAG-NEXT:    ; implicit-def: $vgpr6
+; SDAG-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; SDAG-NEXT:  ; %bb.3: ; %Flow3
+; SDAG-NEXT:    s_andn2_saveexec_b64 s[8:9], s[4:5]
+; SDAG-NEXT:    s_cbranch_execz .LBB1_13
+; SDAG-NEXT:  ; %bb.4: ; %NodeBlock
+; SDAG-NEXT:    v_cmp_lt_i32_e32 vcc, 25, v5
+; SDAG-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; SDAG-NEXT:    s_xor_b64 s[10:11], exec, s[4:5]
+; SDAG-NEXT:    s_cbranch_execz .LBB1_8
+; SDAG-NEXT:  ; %bb.5: ; %LeafBlock
+; SDAG-NEXT:    v_cmp_ne_u32_e32 vcc, 26, v5
+; SDAG-NEXT:    s_and_saveexec_b64 s[12:13], vcc
+; SDAG-NEXT:    s_cbranch_execz .LBB1_7
+; SDAG-NEXT:  ; %bb.6: ; %itofp-sw-default
+; SDAG-NEXT:    v_sub_u32_e32 v11, 0x66, v6
+; SDAG-NEXT:    v_sub_u32_e32 v9, 64, v11
+; SDAG-NEXT:    v_lshrrev_b64 v[7:8], v11, v[0:1]
+; SDAG-NEXT:    v_lshlrev_b64 v[9:10], v9, v[2:3]
+; SDAG-NEXT:    v_sub_u32_e32 v12, 38, v6
+; SDAG-NEXT:    v_or_b32_e32 v10, v8, v10
+; SDAG-NEXT:    v_or_b32_e32 v9, v7, v9
+; SDAG-NEXT:    v_lshrrev_b64 v[7:8], v12, v[2:3]
+; SDAG-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v11
+; SDAG-NEXT:    v_add_u32_e32 v13, 26, v6
+; SDAG-NEXT:    v_cndmask_b32_e32 v8, v8, v10, vcc
+; SDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v11
+; SDAG-NEXT:    v_cndmask_b32_e32 v7, v7, v9, vcc
+; SDAG-NEXT:    v_lshrrev_b64 v[9:10], v12, v[0:1]
+; SDAG-NEXT:    v_lshlrev_b64 v[11:12], v13, v[2:3]
+; SDAG-NEXT:    v_subrev_u32_e32 v6, 38, v6
+; SDAG-NEXT:    v_cndmask_b32_e64 v14, v7, v0, s[4:5]
+; SDAG-NEXT:    v_lshlrev_b64 v[6:7], v6, v[0:1]
+; SDAG-NEXT:    v_cndmask_b32_e64 v8, v8, v1, s[4:5]
+; SDAG-NEXT:    v_or_b32_e32 v10, v12, v10
+; SDAG-NEXT:    v_or_b32_e32 v9, v11, v9
+; SDAG-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v13
+; SDAG-NEXT:    v_lshlrev_b64 v[0:1], v13, v[0:1]
+; SDAG-NEXT:    v_cndmask_b32_e32 v7, v7, v10, vcc
+; SDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v13
+; SDAG-NEXT:    v_cndmask_b32_e32 v6, v6, v9, vcc
+; SDAG-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; SDAG-NEXT:    v_or_b32_e32 v1, v1, v3
+; SDAG-NEXT:    v_or_b32_e32 v0, v0, v2
+; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; SDAG-NEXT:    v_or_b32_e32 v7, v14, v0
+; SDAG-NEXT:    v_mov_b32_e32 v0, v7
+; SDAG-NEXT:    v_mov_b32_e32 v1, v8
+; SDAG-NEXT:  .LBB1_7: ; %Flow1
+; SDAG-NEXT:    s_or_b64 exec, exec, s[12:13]
+; SDAG-NEXT:  .LBB1_8: ; %Flow2
+; SDAG-NEXT:    s_andn2_saveexec_b64 s[4:5], s[10:11]
+; SDAG-NEXT:  ; %bb.9: ; %itofp-sw-bb
+; SDAG-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
+; SDAG-NEXT:  ; %bb.10: ; %itofp-sw-epilog
+; SDAG-NEXT:    s_or_b64 exec, exec, s[4:5]
+; SDAG-NEXT:    v_lshrrev_b32_e32 v2, 2, v0
+; SDAG-NEXT:    v_and_or_b32 v0, v2, 1, v0
+; SDAG-NEXT:    v_add_co_u32_e32 v0, vcc, 1, v0
+; SDAG-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; SDAG-NEXT:    v_and_b32_e32 v2, 0x4000000, v0
+; SDAG-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
+; SDAG-NEXT:    v_alignbit_b32 v7, v1, v0, 2
+; SDAG-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; SDAG-NEXT:  ; %bb.11: ; %itofp-if-then20
+; SDAG-NEXT:    v_alignbit_b32 v7, v1, v0, 3
+; SDAG-NEXT:    v_mov_b32_e32 v4, v5
+; SDAG-NEXT:  ; %bb.12: ; %Flow
+; SDAG-NEXT:    s_or_b64 exec, exec, s[4:5]
+; SDAG-NEXT:  .LBB1_13: ; %Flow4
+; SDAG-NEXT:    s_or_b64 exec, exec, s[8:9]
+; SDAG-NEXT:    v_and_b32_e32 v0, 0x7fffff, v7
+; SDAG-NEXT:    v_lshl_or_b32 v0, v4, 23, v0
+; SDAG-NEXT:    v_add_u32_e32 v4, 1.0, v0
+; SDAG-NEXT:  .LBB1_14: ; %Flow5
+; SDAG-NEXT:    s_or_b64 exec, exec, s[6:7]
+; SDAG-NEXT:    v_mov_b32_e32 v0, v4
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: uitofp_i128_to_f32:
+; GISEL:       ; %bb.0: ; %itofp-entry
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_or_b32_e32 v4, v0, v2
+; GISEL-NEXT:    v_or_b32_e32 v5, v1, v3
+; GISEL-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GISEL-NEXT:    s_mov_b32 s4, 0
+; GISEL-NEXT:    v_mov_b32_e32 v4, s4
+; GISEL-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; GISEL-NEXT:    s_cbranch_execz .LBB1_14
+; GISEL-NEXT:  ; %bb.1: ; %itofp-if-end
+; GISEL-NEXT:    v_ffbh_u32_e32 v5, v0
+; GISEL-NEXT:    v_ffbh_u32_e32 v4, v1
+; GISEL-NEXT:    v_add_u32_e32 v5, 32, v5
+; GISEL-NEXT:    v_ffbh_u32_e32 v6, v2
+; GISEL-NEXT:    v_min_u32_e32 v4, v4, v5
+; GISEL-NEXT:    v_ffbh_u32_e32 v5, v3
+; GISEL-NEXT:    v_add_u32_e32 v6, 32, v6
+; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[2:3]
+; GISEL-NEXT:    v_add_u32_e32 v4, 64, v4
+; GISEL-NEXT:    v_min_u32_e32 v5, v5, v6
+; GISEL-NEXT:    v_cndmask_b32_e32 v5, v5, v4, vcc
+; GISEL-NEXT:    v_sub_u32_e32 v7, 0x80, v5
+; GISEL-NEXT:    v_sub_u32_e32 v6, 0x7f, v5
+; GISEL-NEXT:    v_cmp_ge_i32_e32 vcc, 24, v7
+; GISEL-NEXT:    ; implicit-def: $vgpr4
+; GISEL-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GISEL-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; GISEL-NEXT:  ; %bb.2: ; %itofp-if-else
+; GISEL-NEXT:    v_add_u32_e32 v2, 0xffffff98, v5
+; GISEL-NEXT:    v_lshlrev_b64 v[0:1], v2, v[0:1]
+; GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v2
+; GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v0, vcc
+; GISEL-NEXT:    ; implicit-def: $vgpr7
+; GISEL-NEXT:    ; implicit-def: $vgpr0
+; GISEL-NEXT:    ; implicit-def: $vgpr5
+; GISEL-NEXT:    ; implicit-def: $vgpr2
+; GISEL-NEXT:  ; %bb.3: ; %Flow3
+; GISEL-NEXT:    s_andn2_saveexec_b64 s[8:9], s[4:5]
+; GISEL-NEXT:    s_cbranch_execz .LBB1_13
+; GISEL-NEXT:  ; %bb.4: ; %NodeBlock
+; GISEL-NEXT:    v_cmp_le_i32_e32 vcc, 26, v7
+; GISEL-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GISEL-NEXT:    s_xor_b64 s[10:11], exec, s[4:5]
+; GISEL-NEXT:    s_cbranch_execz .LBB1_8
+; GISEL-NEXT:  ; %bb.5: ; %LeafBlock
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 26, v7
+; GISEL-NEXT:    s_and_saveexec_b64 s[12:13], vcc
+; GISEL-NEXT:    s_cbranch_execz .LBB1_7
+; GISEL-NEXT:  ; %bb.6: ; %itofp-sw-default
+; GISEL-NEXT:    v_sub_u32_e32 v4, 0x66, v5
+; GISEL-NEXT:    v_sub_u32_e32 v10, 64, v4
+; GISEL-NEXT:    v_lshrrev_b64 v[8:9], v4, v[0:1]
+; GISEL-NEXT:    v_lshlrev_b64 v[10:11], v10, v[2:3]
+; GISEL-NEXT:    v_subrev_u32_e32 v12, 64, v4
+; GISEL-NEXT:    v_or_b32_e32 v10, v8, v10
+; GISEL-NEXT:    v_or_b32_e32 v11, v9, v11
+; GISEL-NEXT:    v_lshrrev_b64 v[8:9], v12, v[2:3]
+; GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v4
+; GISEL-NEXT:    v_add_u32_e32 v5, 26, v5
+; GISEL-NEXT:    v_cndmask_b32_e32 v8, v8, v10, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v9, v9, v11, vcc
+; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
+; GISEL-NEXT:    v_sub_u32_e32 v10, 64, v5
+; GISEL-NEXT:    v_cndmask_b32_e32 v12, v8, v0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v4, v9, v1, vcc
+; GISEL-NEXT:    v_lshrrev_b64 v[8:9], v5, -1
+; GISEL-NEXT:    v_lshlrev_b64 v[10:11], v10, -1
+; GISEL-NEXT:    v_subrev_u32_e32 v13, 64, v5
+; GISEL-NEXT:    v_or_b32_e32 v14, v8, v10
+; GISEL-NEXT:    v_or_b32_e32 v15, v9, v11
+; GISEL-NEXT:    v_lshrrev_b64 v[10:11], v13, -1
+; GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v5
+; GISEL-NEXT:    v_cndmask_b32_e32 v10, v10, v14, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v11, v11, v15, vcc
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v5
+; GISEL-NEXT:    v_cndmask_b32_e32 v8, 0, v8, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v9, 0, v9, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v5, v10, -1, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e64 v10, v11, -1, s[4:5]
+; GISEL-NEXT:    v_and_b32_e32 v2, v8, v2
+; GISEL-NEXT:    v_and_b32_e32 v3, v9, v3
+; GISEL-NEXT:    v_and_or_b32 v0, v5, v0, v2
+; GISEL-NEXT:    v_and_or_b32 v1, v10, v1, v3
+; GISEL-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GISEL-NEXT:    v_or_b32_e32 v3, v12, v0
+; GISEL-NEXT:    v_mov_b32_e32 v0, v3
+; GISEL-NEXT:    v_mov_b32_e32 v1, v4
+; GISEL-NEXT:    v_mov_b32_e32 v2, v5
+; GISEL-NEXT:    v_mov_b32_e32 v3, v6
+; GISEL-NEXT:  .LBB1_7: ; %Flow1
+; GISEL-NEXT:    s_or_b64 exec, exec, s[12:13]
+; GISEL-NEXT:  .LBB1_8: ; %Flow2
+; GISEL-NEXT:    s_andn2_saveexec_b64 s[4:5], s[10:11]
+; GISEL-NEXT:  ; %bb.9: ; %itofp-sw-bb
+; GISEL-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
+; GISEL-NEXT:  ; %bb.10: ; %itofp-sw-epilog
+; GISEL-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GISEL-NEXT:    v_bfe_u32 v2, v0, 2, 1
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v2
+; GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, 1, v0
+; GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GISEL-NEXT:    v_and_b32_e32 v2, 0x4000000, v0
+; GISEL-NEXT:    v_mov_b32_e32 v3, 0
+; GISEL-NEXT:    v_lshrrev_b64 v[4:5], 2, v[0:1]
+; GISEL-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; GISEL-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GISEL-NEXT:  ; %bb.11: ; %itofp-if-then20
+; GISEL-NEXT:    v_lshrrev_b64 v[4:5], 3, v[0:1]
+; GISEL-NEXT:    v_mov_b32_e32 v6, v7
+; GISEL-NEXT:  ; %bb.12: ; %Flow
+; GISEL-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GISEL-NEXT:  .LBB1_13: ; %Flow4
+; GISEL-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GISEL-NEXT:    v_lshl_add_u32 v0, v6, 23, 1.0
+; GISEL-NEXT:    v_mov_b32_e32 v1, 0x7fffff
+; GISEL-NEXT:    v_and_or_b32 v4, v4, v1, v0
+; GISEL-NEXT:  .LBB1_14: ; %Flow5
+; GISEL-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GISEL-NEXT:    v_mov_b32_e32 v0, v4
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %cvt = uitofp i128 %x to float
+  ret float %cvt
+}
+
+define double @sitofp_i128_to_f64(i128 %x) {
+; SDAG-LABEL: sitofp_i128_to_f64:
+; SDAG:       ; %bb.0: ; %itofp-entry
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_mov_b32_e32 v5, v1
+; SDAG-NEXT:    v_mov_b32_e32 v4, v0
+; SDAG-NEXT:    v_or_b32_e32 v1, v5, v3
+; SDAG-NEXT:    v_or_b32_e32 v0, v4, v2
+; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SDAG-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; SDAG-NEXT:    s_cbranch_execz .LBB2_14
+; SDAG-NEXT:  ; %bb.1: ; %itofp-if-end
+; SDAG-NEXT:    v_ashrrev_i32_e32 v0, 31, v3
+; SDAG-NEXT:    v_xor_b32_e32 v4, v0, v4
+; SDAG-NEXT:    v_xor_b32_e32 v5, v0, v5
+; SDAG-NEXT:    v_sub_co_u32_e32 v4, vcc, v4, v0
+; SDAG-NEXT:    v_xor_b32_e32 v2, v0, v2
+; SDAG-NEXT:    v_subb_co_u32_e32 v5, vcc, v5, v0, vcc
+; SDAG-NEXT:    v_xor_b32_e32 v1, v0, v3
+; SDAG-NEXT:    v_subb_co_u32_e32 v6, vcc, v2, v0, vcc
+; SDAG-NEXT:    v_subb_co_u32_e32 v7, vcc, v1, v0, vcc
+; SDAG-NEXT:    v_ffbh_u32_e32 v0, v6
+; SDAG-NEXT:    v_add_u32_e32 v0, 32, v0
+; SDAG-NEXT:    v_ffbh_u32_e32 v1, v7
+; SDAG-NEXT:    v_min_u32_e32 v0, v0, v1
+; SDAG-NEXT:    v_ffbh_u32_e32 v1, v4
+; SDAG-NEXT:    v_add_u32_e32 v1, 32, v1
+; SDAG-NEXT:    v_ffbh_u32_e32 v2, v5
+; SDAG-NEXT:    v_min_u32_e32 v1, v1, v2
+; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[6:7]
+; SDAG-NEXT:    v_add_u32_e32 v1, 64, v1
+; SDAG-NEXT:    v_cndmask_b32_e32 v9, v1, v0, vcc
+; SDAG-NEXT:    v_sub_u32_e32 v8, 0x80, v9
+; SDAG-NEXT:    v_sub_u32_e32 v2, 0x7f, v9
+; SDAG-NEXT:    v_cmp_gt_i32_e32 vcc, 54, v8
+; SDAG-NEXT:    ; implicit-def: $vgpr10
+; SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; SDAG-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; SDAG-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; SDAG-NEXT:  ; %bb.2: ; %itofp-if-else
+; SDAG-NEXT:    v_add_u32_e32 v6, 0xffffffb5, v9
+; SDAG-NEXT:    v_lshlrev_b64 v[0:1], v6, v[4:5]
+; SDAG-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v6
+; SDAG-NEXT:    v_cndmask_b32_e32 v10, 0, v1, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; SDAG-NEXT:    ; implicit-def: $vgpr8
+; SDAG-NEXT:    ; implicit-def: $vgpr6_vgpr7
+; SDAG-NEXT:    ; implicit-def: $vgpr4_vgpr5
+; SDAG-NEXT:    ; implicit-def: $vgpr9
+; SDAG-NEXT:  ; %bb.3: ; %Flow3
+; SDAG-NEXT:    s_andn2_saveexec_b64 s[8:9], s[4:5]
+; SDAG-NEXT:    s_cbranch_execz .LBB2_13
+; SDAG-NEXT:  ; %bb.4: ; %NodeBlock
+; SDAG-NEXT:    v_cmp_lt_i32_e32 vcc, 54, v8
+; SDAG-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; SDAG-NEXT:    s_xor_b64 s[10:11], exec, s[4:5]
+; SDAG-NEXT:    s_cbranch_execz .LBB2_8
+; SDAG-NEXT:  ; %bb.5: ; %LeafBlock
+; SDAG-NEXT:    v_cmp_ne_u32_e32 vcc, 55, v8
+; SDAG-NEXT:    s_and_saveexec_b64 s[12:13], vcc
+; SDAG-NEXT:    s_cbranch_execz .LBB2_7
+; SDAG-NEXT:  ; %bb.6: ; %itofp-sw-default
+; SDAG-NEXT:    v_sub_u32_e32 v12, 0x49, v9
+; SDAG-NEXT:    v_sub_u32_e32 v10, 64, v12
+; SDAG-NEXT:    v_lshrrev_b64 v[0:1], v12, v[4:5]
+; SDAG-NEXT:    v_lshlrev_b64 v[10:11], v10, v[6:7]
+; SDAG-NEXT:    v_sub_u32_e32 v13, 9, v9
+; SDAG-NEXT:    v_or_b32_e32 v11, v1, v11
+; SDAG-NEXT:    v_or_b32_e32 v10, v0, v10
+; SDAG-NEXT:    v_lshrrev_b64 v[0:1], v13, v[6:7]
+; SDAG-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v12
+; SDAG-NEXT:    v_add_u32_e32 v16, 55, v9
+; SDAG-NEXT:    v_cndmask_b32_e32 v1, v1, v11, vcc
+; SDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v12
+; SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v10, vcc
+; SDAG-NEXT:    v_lshrrev_b64 v[10:11], v12, v[6:7]
+; SDAG-NEXT:    v_lshrrev_b64 v[12:13], v13, v[4:5]
+; SDAG-NEXT:    v_lshlrev_b64 v[14:15], v16, v[6:7]
+; SDAG-NEXT:    v_add_u32_e32 v9, -9, v9
+; SDAG-NEXT:    v_or_b32_e32 v15, v15, v13
+; SDAG-NEXT:    v_or_b32_e32 v14, v14, v12
+; SDAG-NEXT:    v_lshlrev_b64 v[12:13], v9, v[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e32 v11, 0, v11, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v10, 0, v10, vcc
+; SDAG-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v16
+; SDAG-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e32 v9, v13, v15, vcc
+; SDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v16
+; SDAG-NEXT:    v_lshlrev_b64 v[4:5], v16, v[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v7, v9, v7, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e32 v9, v12, v14, vcc
+; SDAG-NEXT:    v_cndmask_b32_e64 v6, v9, v6, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e32 v5, 0, v5, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; SDAG-NEXT:    v_or_b32_e32 v5, v5, v7
+; SDAG-NEXT:    v_or_b32_e32 v4, v4, v6
+; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; SDAG-NEXT:    v_mov_b32_e32 v6, v10
+; SDAG-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; SDAG-NEXT:    v_or_b32_e32 v0, v0, v4
+; SDAG-NEXT:    v_mov_b32_e32 v5, v1
+; SDAG-NEXT:    v_mov_b32_e32 v4, v0
+; SDAG-NEXT:    v_mov_b32_e32 v7, v11
+; SDAG-NEXT:  .LBB2_7: ; %Flow1
+; SDAG-NEXT:    s_or_b64 exec, exec, s[12:13]
+; SDAG-NEXT:  .LBB2_8: ; %Flow2
+; SDAG-NEXT:    s_andn2_saveexec_b64 s[4:5], s[10:11]
+; SDAG-NEXT:  ; %bb.9: ; %itofp-sw-bb
+; SDAG-NEXT:    v_lshlrev_b64 v[6:7], 1, v[6:7]
+; SDAG-NEXT:    v_lshrrev_b32_e32 v0, 31, v5
+; SDAG-NEXT:    v_lshlrev_b64 v[4:5], 1, v[4:5]
+; SDAG-NEXT:    v_or_b32_e32 v6, v6, v0
+; SDAG-NEXT:  ; %bb.10: ; %itofp-sw-epilog
+; SDAG-NEXT:    s_or_b64 exec, exec, s[4:5]
+; SDAG-NEXT:    v_lshrrev_b32_e32 v0, 2, v4
+; SDAG-NEXT:    v_and_or_b32 v0, v0, 1, v4
+; SDAG-NEXT:    v_add_co_u32_e32 v4, vcc, 1, v0
+; SDAG-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
+; SDAG-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
+; SDAG-NEXT:    v_lshrrev_b64 v[0:1], 2, v[4:5]
+; SDAG-NEXT:    v_lshlrev_b32_e32 v7, 30, v6
+; SDAG-NEXT:    v_or_b32_e32 v10, v1, v7
+; SDAG-NEXT:    v_and_b32_e32 v1, 0x800000, v5
+; SDAG-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
+; SDAG-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; SDAG-NEXT:  ; %bb.11: ; %itofp-if-then20
+; SDAG-NEXT:    v_lshrrev_b64 v[0:1], 3, v[4:5]
+; SDAG-NEXT:    v_lshlrev_b32_e32 v2, 29, v6
+; SDAG-NEXT:    v_or_b32_e32 v10, v1, v2
+; SDAG-NEXT:    v_mov_b32_e32 v2, v8
+; SDAG-NEXT:  ; %bb.12: ; %Flow
+; SDAG-NEXT:    s_or_b64 exec, exec, s[4:5]
+; SDAG-NEXT:  .LBB2_13: ; %Flow4
+; SDAG-NEXT:    s_or_b64 exec, exec, s[8:9]
+; SDAG-NEXT:    v_and_b32_e32 v1, 0x80000000, v3
+; SDAG-NEXT:    v_mov_b32_e32 v3, 0x3ff00000
+; SDAG-NEXT:    v_lshl_add_u32 v2, v2, 20, v3
+; SDAG-NEXT:    v_and_b32_e32 v3, 0xfffff, v10
+; SDAG-NEXT:    v_or3_b32 v1, v3, v1, v2
+; SDAG-NEXT:  .LBB2_14: ; %Flow5
+; SDAG-NEXT:    s_or_b64 exec, exec, s[6:7]
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: sitofp_i128_to_f64:
+; GISEL:       ; %bb.0: ; %itofp-entry
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_mov_b32_e32 v4, v0
+; GISEL-NEXT:    v_mov_b32_e32 v5, v1
+; GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GISEL-NEXT:    v_or_b32_e32 v0, v4, v2
+; GISEL-NEXT:    v_or_b32_e32 v1, v5, v3
+; GISEL-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GISEL-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; GISEL-NEXT:    s_cbranch_execz .LBB2_14
+; GISEL-NEXT:  ; %bb.1: ; %itofp-if-end
+; GISEL-NEXT:    v_ashrrev_i32_e32 v6, 31, v3
+; GISEL-NEXT:    v_xor_b32_e32 v0, v6, v4
+; GISEL-NEXT:    v_xor_b32_e32 v1, v6, v5
+; GISEL-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v6
+; GISEL-NEXT:    v_xor_b32_e32 v2, v6, v2
+; GISEL-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v6, vcc
+; GISEL-NEXT:    v_xor_b32_e32 v3, v6, v3
+; GISEL-NEXT:    v_subb_co_u32_e32 v2, vcc, v2, v6, vcc
+; GISEL-NEXT:    v_ffbh_u32_e32 v5, v0
+; GISEL-NEXT:    v_subb_co_u32_e32 v3, vcc, v3, v6, vcc
+; GISEL-NEXT:    v_ffbh_u32_e32 v4, v1
+; GISEL-NEXT:    v_add_u32_e32 v5, 32, v5
+; GISEL-NEXT:    v_ffbh_u32_e32 v7, v2
+; GISEL-NEXT:    v_min_u32_e32 v4, v4, v5
+; GISEL-NEXT:    v_ffbh_u32_e32 v5, v3
+; GISEL-NEXT:    v_add_u32_e32 v7, 32, v7
+; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[2:3]
+; GISEL-NEXT:    v_add_u32_e32 v4, 64, v4
+; GISEL-NEXT:    v_min_u32_e32 v5, v5, v7
+; GISEL-NEXT:    v_cndmask_b32_e32 v9, v5, v4, vcc
+; GISEL-NEXT:    v_sub_u32_e32 v8, 0x80, v9
+; GISEL-NEXT:    v_sub_u32_e32 v7, 0x7f, v9
+; GISEL-NEXT:    v_cmp_ge_i32_e32 vcc, 53, v8
+; GISEL-NEXT:    ; implicit-def: $vgpr10
+; GISEL-NEXT:    ; implicit-def: $vgpr4_vgpr5
+; GISEL-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GISEL-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; GISEL-NEXT:  ; %bb.2: ; %itofp-if-else
+; GISEL-NEXT:    v_add_u32_e32 v2, 0xffffffb5, v9
+; GISEL-NEXT:    v_lshlrev_b64 v[0:1], v2, v[0:1]
+; GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v2
+; GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v10, 0, v1, vcc
+; GISEL-NEXT:    ; implicit-def: $vgpr8
+; GISEL-NEXT:    ; implicit-def: $vgpr0
+; GISEL-NEXT:    ; implicit-def: $vgpr9
+; GISEL-NEXT:  ; %bb.3: ; %Flow3
+; GISEL-NEXT:    s_andn2_saveexec_b64 s[8:9], s[4:5]
+; GISEL-NEXT:    s_cbranch_execz .LBB2_13
+; GISEL-NEXT:  ; %bb.4: ; %NodeBlock
+; GISEL-NEXT:    v_cmp_le_i32_e32 vcc, 55, v8
+; GISEL-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GISEL-NEXT:    s_xor_b64 s[10:11], exec, s[4:5]
+; GISEL-NEXT:    s_cbranch_execz .LBB2_8
+; GISEL-NEXT:  ; %bb.5: ; %LeafBlock
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 55, v8
+; GISEL-NEXT:    s_and_saveexec_b64 s[12:13], vcc
+; GISEL-NEXT:    s_cbranch_execz .LBB2_7
+; GISEL-NEXT:  ; %bb.6: ; %itofp-sw-default
+; GISEL-NEXT:    v_sub_u32_e32 v14, 0x49, v9
+; GISEL-NEXT:    v_sub_u32_e32 v10, 64, v14
+; GISEL-NEXT:    v_lshrrev_b64 v[4:5], v14, v[0:1]
+; GISEL-NEXT:    v_lshlrev_b64 v[10:11], v10, v[2:3]
+; GISEL-NEXT:    v_subrev_u32_e32 v15, 64, v14
+; GISEL-NEXT:    v_or_b32_e32 v10, v4, v10
+; GISEL-NEXT:    v_or_b32_e32 v11, v5, v11
+; GISEL-NEXT:    v_lshrrev_b64 v[4:5], v15, v[2:3]
+; GISEL-NEXT:    v_lshrrev_b64 v[12:13], v14, v[2:3]
+; GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v14
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v14
+; GISEL-NEXT:    v_add_u32_e32 v14, 55, v9
+; GISEL-NEXT:    v_cndmask_b32_e32 v4, v4, v10, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v5, v5, v11, vcc
+; GISEL-NEXT:    v_sub_u32_e32 v11, 64, v14
+; GISEL-NEXT:    v_cndmask_b32_e64 v13, v4, v0, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e64 v4, v5, v1, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e32 v5, 0, v12, vcc
+; GISEL-NEXT:    v_lshrrev_b64 v[9:10], v14, -1
+; GISEL-NEXT:    v_lshlrev_b64 v[11:12], v11, -1
+; GISEL-NEXT:    v_subrev_u32_e32 v15, 64, v14
+; GISEL-NEXT:    v_or_b32_e32 v16, v9, v11
+; GISEL-NEXT:    v_or_b32_e32 v17, v10, v12
+; GISEL-NEXT:    v_lshrrev_b64 v[11:12], v15, -1
+; GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v14
+; GISEL-NEXT:    v_cndmask_b32_e32 v11, v11, v16, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v12, v12, v17, vcc
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v14
+; GISEL-NEXT:    v_cndmask_b32_e32 v9, 0, v9, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v10, 0, v10, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v11, v11, -1, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e64 v12, v12, -1, s[4:5]
+; GISEL-NEXT:    v_and_b32_e32 v2, v9, v2
+; GISEL-NEXT:    v_and_b32_e32 v3, v10, v3
+; GISEL-NEXT:    v_and_or_b32 v0, v11, v0, v2
+; GISEL-NEXT:    v_and_or_b32 v1, v12, v1, v3
+; GISEL-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GISEL-NEXT:    v_or_b32_e32 v3, v13, v0
+; GISEL-NEXT:    v_mov_b32_e32 v0, v3
+; GISEL-NEXT:    v_mov_b32_e32 v1, v4
+; GISEL-NEXT:    v_mov_b32_e32 v2, v5
+; GISEL-NEXT:    v_mov_b32_e32 v3, v6
+; GISEL-NEXT:  .LBB2_7: ; %Flow1
+; GISEL-NEXT:    s_or_b64 exec, exec, s[12:13]
+; GISEL-NEXT:  .LBB2_8: ; %Flow2
+; GISEL-NEXT:    s_andn2_saveexec_b64 s[4:5], s[10:11]
+; GISEL-NEXT:  ; %bb.9: ; %itofp-sw-bb
+; GISEL-NEXT:    v_lshlrev_b64 v[9:10], 1, v[0:1]
+; GISEL-NEXT:    v_lshlrev_b64 v[2:3], 1, v[2:3]
+; GISEL-NEXT:    v_lshrrev_b32_e32 v0, 31, v1
+; GISEL-NEXT:    v_or_b32_e32 v11, v2, v0
+; GISEL-NEXT:    v_mov_b32_e32 v0, v9
+; GISEL-NEXT:    v_mov_b32_e32 v1, v10
+; GISEL-NEXT:    v_mov_b32_e32 v2, v11
+; GISEL-NEXT:    v_mov_b32_e32 v3, v12
+; GISEL-NEXT:  ; %bb.10: ; %itofp-sw-epilog
+; GISEL-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GISEL-NEXT:    v_bfe_u32 v3, v0, 2, 1
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v3
+; GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, 1, v0
+; GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GISEL-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v2, vcc
+; GISEL-NEXT:    v_lshrrev_b64 v[4:5], 2, v[0:1]
+; GISEL-NEXT:    v_mov_b32_e32 v9, 0
+; GISEL-NEXT:    v_and_b32_e32 v10, 0x800000, v1
+; GISEL-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[9:10]
+; GISEL-NEXT:    v_lshl_or_b32 v10, v2, 30, v5
+; GISEL-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GISEL-NEXT:  ; %bb.11: ; %itofp-if-then20
+; GISEL-NEXT:    v_lshrrev_b64 v[4:5], 3, v[0:1]
+; GISEL-NEXT:    v_mov_b32_e32 v7, v8
+; GISEL-NEXT:    v_lshl_or_b32 v10, v2, 29, v5
+; GISEL-NEXT:  ; %bb.12: ; %Flow
+; GISEL-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GISEL-NEXT:  .LBB2_13: ; %Flow4
+; GISEL-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GISEL-NEXT:    v_and_b32_e32 v0, 0x80000000, v6
+; GISEL-NEXT:    v_mov_b32_e32 v1, 0x3ff00000
+; GISEL-NEXT:    v_mov_b32_e32 v2, 0xfffff
+; GISEL-NEXT:    v_lshl_add_u32 v1, v7, 20, v1
+; GISEL-NEXT:    v_and_or_b32 v2, v10, v2, v0
+; GISEL-NEXT:    v_and_or_b32 v0, v4, -1, 0
+; GISEL-NEXT:    v_or3_b32 v1, v2, v1, 0
+; GISEL-NEXT:  .LBB2_14: ; %Flow5
+; GISEL-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %cvt = sitofp i128 %x to double
+  ret double %cvt
+}
+
+define double @uitofp_i128_to_f64(i128 %x) {
+; SDAG-LABEL: uitofp_i128_to_f64:
+; SDAG:       ; %bb.0: ; %itofp-entry
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_or_b32_e32 v5, v1, v3
+; SDAG-NEXT:    v_or_b32_e32 v4, v0, v2
+; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; SDAG-NEXT:    v_mov_b32_e32 v4, 0
+; SDAG-NEXT:    v_mov_b32_e32 v5, 0
+; SDAG-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; SDAG-NEXT:    s_cbranch_execz .LBB3_14
+; SDAG-NEXT:  ; %bb.1: ; %itofp-if-end
+; SDAG-NEXT:    v_ffbh_u32_e32 v4, v2
+; SDAG-NEXT:    v_add_u32_e32 v4, 32, v4
+; SDAG-NEXT:    v_ffbh_u32_e32 v5, v3
+; SDAG-NEXT:    v_min_u32_e32 v4, v4, v5
+; SDAG-NEXT:    v_ffbh_u32_e32 v5, v0
+; SDAG-NEXT:    v_add_u32_e32 v5, 32, v5
+; SDAG-NEXT:    v_ffbh_u32_e32 v6, v1
+; SDAG-NEXT:    v_min_u32_e32 v5, v5, v6
+; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; SDAG-NEXT:    v_add_u32_e32 v5, 64, v5
+; SDAG-NEXT:    v_cndmask_b32_e32 v8, v5, v4, vcc
+; SDAG-NEXT:    v_sub_u32_e32 v7, 0x80, v8
+; SDAG-NEXT:    v_sub_u32_e32 v6, 0x7f, v8
+; SDAG-NEXT:    v_cmp_gt_i32_e32 vcc, 54, v7
+; SDAG-NEXT:    ; implicit-def: $vgpr9
+; SDAG-NEXT:    ; implicit-def: $vgpr4_vgpr5
+; SDAG-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; SDAG-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; SDAG-NEXT:  ; %bb.2: ; %itofp-if-else
+; SDAG-NEXT:    v_add_u32_e32 v2, 0xffffffb5, v8
+; SDAG-NEXT:    v_lshlrev_b64 v[0:1], v2, v[0:1]
+; SDAG-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v2
+; SDAG-NEXT:    v_cndmask_b32_e32 v9, 0, v1, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v0, vcc
+; SDAG-NEXT:    ; implicit-def: $vgpr7
+; SDAG-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; SDAG-NEXT:    ; implicit-def: $vgpr8
+; SDAG-NEXT:  ; %bb.3: ; %Flow3
+; SDAG-NEXT:    s_andn2_saveexec_b64 s[8:9], s[4:5]
+; SDAG-NEXT:    s_cbranch_execz .LBB3_13
+; SDAG-NEXT:  ; %bb.4: ; %NodeBlock
+; SDAG-NEXT:    v_cmp_lt_i32_e32 vcc, 54, v7
+; SDAG-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; SDAG-NEXT:    s_xor_b64 s[10:11], exec, s[4:5]
+; SDAG-NEXT:    s_cbranch_execz .LBB3_8
+; SDAG-NEXT:  ; %bb.5: ; %LeafBlock
+; SDAG-NEXT:    v_cmp_ne_u32_e32 vcc, 55, v7
+; SDAG-NEXT:    s_and_saveexec_b64 s[12:13], vcc
+; SDAG-NEXT:    s_cbranch_execz .LBB3_7
+; SDAG-NEXT:  ; %bb.6: ; %itofp-sw-default
+; SDAG-NEXT:    v_sub_u32_e32 v11, 0x49, v8
+; SDAG-NEXT:    v_sub_u32_e32 v9, 64, v11
+; SDAG-NEXT:    v_lshrrev_b64 v[4:5], v11, v[0:1]
+; SDAG-NEXT:    v_lshlrev_b64 v[9:10], v9, v[2:3]
+; SDAG-NEXT:    v_sub_u32_e32 v12, 9, v8
+; SDAG-NEXT:    v_or_b32_e32 v10, v5, v10
+; SDAG-NEXT:    v_or_b32_e32 v9, v4, v9
+; SDAG-NEXT:    v_lshrrev_b64 v[4:5], v12, v[2:3]
+; SDAG-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v11
+; SDAG-NEXT:    v_add_u32_e32 v15, 55, v8
+; SDAG-NEXT:    v_cndmask_b32_e32 v5, v5, v10, vcc
+; SDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v11
+; SDAG-NEXT:    v_cndmask_b32_e32 v4, v4, v9, vcc
+; SDAG-NEXT:    v_lshrrev_b64 v[9:10], v11, v[2:3]
+; SDAG-NEXT:    v_lshrrev_b64 v[11:12], v12, v[0:1]
+; SDAG-NEXT:    v_lshlrev_b64 v[13:14], v15, v[2:3]
+; SDAG-NEXT:    v_add_u32_e32 v8, -9, v8
+; SDAG-NEXT:    v_or_b32_e32 v14, v14, v12
+; SDAG-NEXT:    v_or_b32_e32 v13, v13, v11
+; SDAG-NEXT:    v_lshlrev_b64 v[11:12], v8, v[0:1]
+; SDAG-NEXT:    v_cndmask_b32_e32 v10, 0, v10, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v9, 0, v9, vcc
+; SDAG-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v15
+; SDAG-NEXT:    v_cndmask_b32_e64 v5, v5, v1, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v4, v4, v0, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e32 v8, v12, v14, vcc
+; SDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v15
+; SDAG-NEXT:    v_lshlrev_b64 v[0:1], v15, v[0:1]
+; SDAG-NEXT:    v_cndmask_b32_e64 v3, v8, v3, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e32 v8, v11, v13, vcc
+; SDAG-NEXT:    v_cndmask_b32_e64 v2, v8, v2, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; SDAG-NEXT:    v_or_b32_e32 v1, v1, v3
+; SDAG-NEXT:    v_or_b32_e32 v0, v0, v2
+; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; SDAG-NEXT:    v_mov_b32_e32 v2, v9
+; SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; SDAG-NEXT:    v_or_b32_e32 v4, v4, v0
+; SDAG-NEXT:    v_mov_b32_e32 v0, v4
+; SDAG-NEXT:    v_mov_b32_e32 v1, v5
+; SDAG-NEXT:    v_mov_b32_e32 v3, v10
+; SDAG-NEXT:  .LBB3_7: ; %Flow1
+; SDAG-NEXT:    s_or_b64 exec, exec, s[12:13]
+; SDAG-NEXT:  .LBB3_8: ; %Flow2
+; SDAG-NEXT:    s_andn2_saveexec_b64 s[4:5], s[10:11]
+; SDAG-NEXT:  ; %bb.9: ; %itofp-sw-bb
+; SDAG-NEXT:    v_lshlrev_b64 v[2:3], 1, v[2:3]
+; SDAG-NEXT:    v_lshrrev_b32_e32 v3, 31, v1
+; SDAG-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
+; SDAG-NEXT:    v_or_b32_e32 v2, v2, v3
+; SDAG-NEXT:  ; %bb.10: ; %itofp-sw-epilog
+; SDAG-NEXT:    s_or_b64 exec, exec, s[4:5]
+; SDAG-NEXT:    v_lshrrev_b32_e32 v3, 2, v0
+; SDAG-NEXT:    v_and_or_b32 v0, v3, 1, v0
+; SDAG-NEXT:    v_add_co_u32_e32 v0, vcc, 1, v0
+; SDAG-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; SDAG-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v2, vcc
+; SDAG-NEXT:    v_lshrrev_b64 v[4:5], 2, v[0:1]
+; SDAG-NEXT:    v_and_b32_e32 v3, 0x800000, v1
+; SDAG-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
+; SDAG-NEXT:    v_alignbit_b32 v9, v2, v1, 2
+; SDAG-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; SDAG-NEXT:  ; %bb.11: ; %itofp-if-then20
+; SDAG-NEXT:    v_lshrrev_b64 v[4:5], 3, v[0:1]
+; SDAG-NEXT:    v_alignbit_b32 v9, v2, v1, 3
+; SDAG-NEXT:    v_mov_b32_e32 v6, v7
+; SDAG-NEXT:  ; %bb.12: ; %Flow
+; SDAG-NEXT:    s_or_b64 exec, exec, s[4:5]
+; SDAG-NEXT:  .LBB3_13: ; %Flow4
+; SDAG-NEXT:    s_or_b64 exec, exec, s[8:9]
+; SDAG-NEXT:    v_and_b32_e32 v0, 0xfffff, v9
+; SDAG-NEXT:    v_lshl_or_b32 v0, v6, 20, v0
+; SDAG-NEXT:    v_add_u32_e32 v5, 0x3ff00000, v0
+; SDAG-NEXT:  .LBB3_14: ; %Flow5
+; SDAG-NEXT:    s_or_b64 exec, exec, s[6:7]
+; SDAG-NEXT:    v_mov_b32_e32 v0, v4
+; SDAG-NEXT:    v_mov_b32_e32 v1, v5
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: uitofp_i128_to_f64:
+; GISEL:       ; %bb.0: ; %itofp-entry
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GISEL-NEXT:    v_or_b32_e32 v4, v0, v2
+; GISEL-NEXT:    v_or_b32_e32 v5, v1, v3
+; GISEL-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GISEL-NEXT:    v_mov_b32_e32 v4, s4
+; GISEL-NEXT:    v_mov_b32_e32 v5, s5
+; GISEL-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; GISEL-NEXT:    s_cbranch_execz .LBB3_14
+; GISEL-NEXT:  ; %bb.1: ; %itofp-if-end
+; GISEL-NEXT:    v_ffbh_u32_e32 v5, v0
+; GISEL-NEXT:    v_ffbh_u32_e32 v4, v1
+; GISEL-NEXT:    v_add_u32_e32 v5, 32, v5
+; GISEL-NEXT:    v_ffbh_u32_e32 v6, v2
+; GISEL-NEXT:    v_min_u32_e32 v4, v4, v5
+; GISEL-NEXT:    v_ffbh_u32_e32 v5, v3
+; GISEL-NEXT:    v_add_u32_e32 v6, 32, v6
+; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[2:3]
+; GISEL-NEXT:    v_add_u32_e32 v4, 64, v4
+; GISEL-NEXT:    v_min_u32_e32 v5, v5, v6
+; GISEL-NEXT:    v_cndmask_b32_e32 v8, v5, v4, vcc
+; GISEL-NEXT:    v_sub_u32_e32 v7, 0x80, v8
+; GISEL-NEXT:    v_sub_u32_e32 v6, 0x7f, v8
+; GISEL-NEXT:    v_cmp_ge_i32_e32 vcc, 53, v7
+; GISEL-NEXT:    ; implicit-def: $vgpr9
+; GISEL-NEXT:    ; implicit-def: $vgpr4_vgpr5
+; GISEL-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GISEL-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; GISEL-NEXT:  ; %bb.2: ; %itofp-if-else
+; GISEL-NEXT:    v_add_u32_e32 v2, 0xffffffb5, v8
+; GISEL-NEXT:    v_lshlrev_b64 v[0:1], v2, v[0:1]
+; GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v2
+; GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v9, 0, v1, vcc
+; GISEL-NEXT:    ; implicit-def: $vgpr7
+; GISEL-NEXT:    ; implicit-def: $vgpr0
+; GISEL-NEXT:    ; implicit-def: $vgpr8
+; GISEL-NEXT:  ; %bb.3: ; %Flow3
+; GISEL-NEXT:    s_andn2_saveexec_b64 s[8:9], s[4:5]
+; GISEL-NEXT:    s_cbranch_execz .LBB3_13
+; GISEL-NEXT:  ; %bb.4: ; %NodeBlock
+; GISEL-NEXT:    v_cmp_le_i32_e32 vcc, 55, v7
+; GISEL-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GISEL-NEXT:    s_xor_b64 s[10:11], exec, s[4:5]
+; GISEL-NEXT:    s_cbranch_execz .LBB3_8
+; GISEL-NEXT:  ; %bb.5: ; %LeafBlock
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 55, v7
+; GISEL-NEXT:    s_and_saveexec_b64 s[12:13], vcc
+; GISEL-NEXT:    s_cbranch_execz .LBB3_7
+; GISEL-NEXT:  ; %bb.6: ; %itofp-sw-default
+; GISEL-NEXT:    v_sub_u32_e32 v13, 0x49, v8
+; GISEL-NEXT:    v_sub_u32_e32 v9, 64, v13
+; GISEL-NEXT:    v_lshrrev_b64 v[4:5], v13, v[0:1]
+; GISEL-NEXT:    v_lshlrev_b64 v[9:10], v9, v[2:3]
+; GISEL-NEXT:    v_subrev_u32_e32 v14, 64, v13
+; GISEL-NEXT:    v_lshrrev_b64 v[11:12], v13, v[2:3]
+; GISEL-NEXT:    v_or_b32_e32 v9, v4, v9
+; GISEL-NEXT:    v_or_b32_e32 v10, v5, v10
+; GISEL-NEXT:    v_lshrrev_b64 v[4:5], v14, v[2:3]
+; GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v13
+; GISEL-NEXT:    v_add_u32_e32 v8, 55, v8
+; GISEL-NEXT:    v_cndmask_b32_e32 v4, v4, v9, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v5, v5, v10, vcc
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v13
+; GISEL-NEXT:    v_cndmask_b32_e32 v10, 0, v11, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v11, 0, v12, vcc
+; GISEL-NEXT:    v_sub_u32_e32 v12, 64, v8
+; GISEL-NEXT:    v_cndmask_b32_e64 v14, v4, v0, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e64 v9, v5, v1, s[4:5]
+; GISEL-NEXT:    v_lshrrev_b64 v[4:5], v8, -1
+; GISEL-NEXT:    v_lshlrev_b64 v[12:13], v12, -1
+; GISEL-NEXT:    v_subrev_u32_e32 v15, 64, v8
+; GISEL-NEXT:    v_or_b32_e32 v16, v4, v12
+; GISEL-NEXT:    v_or_b32_e32 v17, v5, v13
+; GISEL-NEXT:    v_lshrrev_b64 v[12:13], v15, -1
+; GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v8
+; GISEL-NEXT:    v_cndmask_b32_e32 v12, v12, v16, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v13, v13, v17, vcc
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v8
+; GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v5, 0, v5, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, v12, -1, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e64 v12, v13, -1, s[4:5]
+; GISEL-NEXT:    v_and_b32_e32 v2, v4, v2
+; GISEL-NEXT:    v_and_b32_e32 v3, v5, v3
+; GISEL-NEXT:    v_and_or_b32 v0, v8, v0, v2
+; GISEL-NEXT:    v_and_or_b32 v1, v12, v1, v3
+; GISEL-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GISEL-NEXT:    v_or_b32_e32 v8, v14, v0
+; GISEL-NEXT:    v_mov_b32_e32 v0, v8
+; GISEL-NEXT:    v_mov_b32_e32 v1, v9
+; GISEL-NEXT:    v_mov_b32_e32 v2, v10
+; GISEL-NEXT:    v_mov_b32_e32 v3, v11
+; GISEL-NEXT:  .LBB3_7: ; %Flow1
+; GISEL-NEXT:    s_or_b64 exec, exec, s[12:13]
+; GISEL-NEXT:  .LBB3_8: ; %Flow2
+; GISEL-NEXT:    s_andn2_saveexec_b64 s[4:5], s[10:11]
+; GISEL-NEXT:  ; %bb.9: ; %itofp-sw-bb
+; GISEL-NEXT:    v_lshlrev_b64 v[8:9], 1, v[0:1]
+; GISEL-NEXT:    v_lshlrev_b64 v[10:11], 1, v[2:3]
+; GISEL-NEXT:    v_lshrrev_b32_e32 v0, 31, v1
+; GISEL-NEXT:    v_or_b32_e32 v10, v10, v0
+; GISEL-NEXT:    v_mov_b32_e32 v0, v8
+; GISEL-NEXT:    v_mov_b32_e32 v1, v9
+; GISEL-NEXT:    v_mov_b32_e32 v2, v10
+; GISEL-NEXT:    v_mov_b32_e32 v3, v11
+; GISEL-NEXT:  ; %bb.10: ; %itofp-sw-epilog
+; GISEL-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GISEL-NEXT:    v_bfe_u32 v4, v0, 2, 1
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v4
+; GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, 1, v0
+; GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GISEL-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v2, vcc
+; GISEL-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GISEL-NEXT:    v_mov_b32_e32 v8, 0
+; GISEL-NEXT:    v_and_b32_e32 v9, 0x800000, v1
+; GISEL-NEXT:    v_lshrrev_b64 v[4:5], 2, v[0:1]
+; GISEL-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[8:9]
+; GISEL-NEXT:    v_lshlrev_b64 v[8:9], 30, v[2:3]
+; GISEL-NEXT:    v_lshrrev_b32_e32 v5, 2, v1
+; GISEL-NEXT:    v_or_b32_e32 v9, v5, v8
+; GISEL-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GISEL-NEXT:  ; %bb.11: ; %itofp-if-then20
+; GISEL-NEXT:    v_lshlrev_b64 v[2:3], 29, v[2:3]
+; GISEL-NEXT:    v_lshrrev_b64 v[4:5], 3, v[0:1]
+; GISEL-NEXT:    v_lshrrev_b32_e32 v0, 3, v1
+; GISEL-NEXT:    v_or_b32_e32 v9, v0, v2
+; GISEL-NEXT:    v_mov_b32_e32 v6, v7
+; GISEL-NEXT:  ; %bb.12: ; %Flow
+; GISEL-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GISEL-NEXT:  .LBB3_13: ; %Flow4
+; GISEL-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GISEL-NEXT:    v_mov_b32_e32 v0, 0x3ff00000
+; GISEL-NEXT:    v_lshl_add_u32 v0, v6, 20, v0
+; GISEL-NEXT:    v_and_b32_e32 v1, 0xfffff, v9
+; GISEL-NEXT:    v_and_or_b32 v4, v4, -1, 0
+; GISEL-NEXT:    v_or3_b32 v5, v1, v0, 0
+; GISEL-NEXT:  .LBB3_14: ; %Flow5
+; GISEL-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GISEL-NEXT:    v_mov_b32_e32 v0, v4
+; GISEL-NEXT:    v_mov_b32_e32 v1, v5
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %cvt = uitofp i128 %x to double
+  ret double %cvt
+}
+
+define half @sitofp_i128_to_f16(i128 %x) {
+; SDAG-LABEL: sitofp_i128_to_f16:
+; SDAG:       ; %bb.0: ; %itofp-entry
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_or_b32_e32 v5, v1, v3
+; SDAG-NEXT:    v_or_b32_e32 v4, v0, v2
+; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; SDAG-NEXT:    v_mov_b32_e32 v4, 0
+; SDAG-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; SDAG-NEXT:    s_cbranch_execz .LBB4_14
+; SDAG-NEXT:  ; %bb.1: ; %itofp-if-end
+; SDAG-NEXT:    v_ashrrev_i32_e32 v5, 31, v3
+; SDAG-NEXT:    v_xor_b32_e32 v0, v5, v0
+; SDAG-NEXT:    v_xor_b32_e32 v1, v5, v1
+; SDAG-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v5
+; SDAG-NEXT:    v_xor_b32_e32 v2, v5, v2
+; SDAG-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v5, vcc
+; SDAG-NEXT:    v_xor_b32_e32 v6, v5, v3
+; SDAG-NEXT:    v_subb_co_u32_e32 v4, vcc, v2, v5, vcc
+; SDAG-NEXT:    v_subb_co_u32_e32 v5, vcc, v6, v5, vcc
+; SDAG-NEXT:    v_ffbh_u32_e32 v2, v4
+; SDAG-NEXT:    v_add_u32_e32 v2, 32, v2
+; SDAG-NEXT:    v_ffbh_u32_e32 v6, v5
+; SDAG-NEXT:    v_min_u32_e32 v2, v2, v6
+; SDAG-NEXT:    v_ffbh_u32_e32 v6, v0
+; SDAG-NEXT:    v_add_u32_e32 v6, 32, v6
+; SDAG-NEXT:    v_ffbh_u32_e32 v7, v1
+; SDAG-NEXT:    v_min_u32_e32 v6, v6, v7
+; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; SDAG-NEXT:    v_add_u32_e32 v6, 64, v6
+; SDAG-NEXT:    v_cndmask_b32_e32 v7, v6, v2, vcc
+; SDAG-NEXT:    v_sub_u32_e32 v6, 0x80, v7
+; SDAG-NEXT:    v_sub_u32_e32 v2, 0x7f, v7
+; SDAG-NEXT:    v_cmp_gt_i32_e32 vcc, 25, v6
+; SDAG-NEXT:    ; implicit-def: $vgpr8
+; SDAG-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; SDAG-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; SDAG-NEXT:  ; %bb.2: ; %itofp-if-else
+; SDAG-NEXT:    v_add_u32_e32 v4, 0xffffff98, v7
+; SDAG-NEXT:    v_lshlrev_b64 v[0:1], v4, v[0:1]
+; SDAG-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v4
+; SDAG-NEXT:    v_cndmask_b32_e32 v8, 0, v0, vcc
+; SDAG-NEXT:    ; implicit-def: $vgpr6
+; SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; SDAG-NEXT:    ; implicit-def: $vgpr7
+; SDAG-NEXT:    ; implicit-def: $vgpr4_vgpr5
+; SDAG-NEXT:  ; %bb.3: ; %Flow3
+; SDAG-NEXT:    s_andn2_saveexec_b64 s[8:9], s[4:5]
+; SDAG-NEXT:    s_cbranch_execz .LBB4_13
+; SDAG-NEXT:  ; %bb.4: ; %NodeBlock
+; SDAG-NEXT:    v_cmp_lt_i32_e32 vcc, 25, v6
+; SDAG-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; SDAG-NEXT:    s_xor_b64 s[10:11], exec, s[4:5]
+; SDAG-NEXT:    s_cbranch_execz .LBB4_8
+; SDAG-NEXT:  ; %bb.5: ; %LeafBlock
+; SDAG-NEXT:    v_cmp_ne_u32_e32 vcc, 26, v6
+; SDAG-NEXT:    s_and_saveexec_b64 s[12:13], vcc
+; SDAG-NEXT:    s_cbranch_execz .LBB4_7
+; SDAG-NEXT:  ; %bb.6: ; %itofp-sw-default
+; SDAG-NEXT:    v_sub_u32_e32 v12, 0x66, v7
+; SDAG-NEXT:    v_sub_u32_e32 v10, 64, v12
+; SDAG-NEXT:    v_lshrrev_b64 v[8:9], v12, v[0:1]
+; SDAG-NEXT:    v_lshlrev_b64 v[10:11], v10, v[4:5]
+; SDAG-NEXT:    v_sub_u32_e32 v13, 38, v7
+; SDAG-NEXT:    v_or_b32_e32 v11, v9, v11
+; SDAG-NEXT:    v_or_b32_e32 v10, v8, v10
+; SDAG-NEXT:    v_lshrrev_b64 v[8:9], v13, v[4:5]
+; SDAG-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v12
+; SDAG-NEXT:    v_add_u32_e32 v14, 26, v7
+; SDAG-NEXT:    v_cndmask_b32_e32 v9, v9, v11, vcc
+; SDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v12
+; SDAG-NEXT:    v_cndmask_b32_e32 v8, v8, v10, vcc
+; SDAG-NEXT:    v_lshrrev_b64 v[10:11], v13, v[0:1]
+; SDAG-NEXT:    v_lshlrev_b64 v[12:13], v14, v[4:5]
+; SDAG-NEXT:    v_subrev_u32_e32 v7, 38, v7
+; SDAG-NEXT:    v_cndmask_b32_e64 v15, v8, v0, s[4:5]
+; SDAG-NEXT:    v_lshlrev_b64 v[7:8], v7, v[0:1]
+; SDAG-NEXT:    v_cndmask_b32_e64 v9, v9, v1, s[4:5]
+; SDAG-NEXT:    v_or_b32_e32 v11, v13, v11
+; SDAG-NEXT:    v_or_b32_e32 v10, v12, v10
+; SDAG-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v14
+; SDAG-NEXT:    v_lshlrev_b64 v[0:1], v14, v[0:1]
+; SDAG-NEXT:    v_cndmask_b32_e32 v8, v8, v11, vcc
+; SDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v14
+; SDAG-NEXT:    v_cndmask_b32_e32 v7, v7, v10, vcc
+; SDAG-NEXT:    v_cndmask_b32_e64 v5, v8, v5, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v4, v7, v4, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; SDAG-NEXT:    v_or_b32_e32 v1, v1, v5
+; SDAG-NEXT:    v_or_b32_e32 v0, v0, v4
+; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; SDAG-NEXT:    v_or_b32_e32 v8, v15, v0
+; SDAG-NEXT:    v_mov_b32_e32 v0, v8
+; SDAG-NEXT:    v_mov_b32_e32 v1, v9
+; SDAG-NEXT:  .LBB4_7: ; %Flow1
+; SDAG-NEXT:    s_or_b64 exec, exec, s[12:13]
+; SDAG-NEXT:  .LBB4_8: ; %Flow2
+; SDAG-NEXT:    s_andn2_saveexec_b64 s[4:5], s[10:11]
+; SDAG-NEXT:  ; %bb.9: ; %itofp-sw-bb
+; SDAG-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
+; SDAG-NEXT:  ; %bb.10: ; %itofp-sw-epilog
+; SDAG-NEXT:    s_or_b64 exec, exec, s[4:5]
+; SDAG-NEXT:    v_lshrrev_b32_e32 v4, 2, v0
+; SDAG-NEXT:    v_and_or_b32 v0, v4, 1, v0
+; SDAG-NEXT:    v_add_co_u32_e32 v0, vcc, 1, v0
+; SDAG-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; SDAG-NEXT:    v_and_b32_e32 v4, 0x4000000, v0
+; SDAG-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
+; SDAG-NEXT:    v_alignbit_b32 v8, v1, v0, 2
+; SDAG-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; SDAG-NEXT:  ; %bb.11: ; %itofp-if-then20
+; SDAG-NEXT:    v_alignbit_b32 v8, v1, v0, 3
+; SDAG-NEXT:    v_mov_b32_e32 v2, v6
+; SDAG-NEXT:  ; %bb.12: ; %Flow
+; SDAG-NEXT:    s_or_b64 exec, exec, s[4:5]
+; SDAG-NEXT:  .LBB4_13: ; %Flow4
+; SDAG-NEXT:    s_or_b64 exec, exec, s[8:9]
+; SDAG-NEXT:    v_and_b32_e32 v0, 0x80000000, v3
+; SDAG-NEXT:    v_lshl_add_u32 v1, v2, 23, 1.0
+; SDAG-NEXT:    v_and_b32_e32 v2, 0x7fffff, v8
+; SDAG-NEXT:    v_or3_b32 v0, v2, v0, v1
+; SDAG-NEXT:    v_cvt_f16_f32_e32 v4, v0
+; SDAG-NEXT:  .LBB4_14: ; %Flow5
+; SDAG-NEXT:    s_or_b64 exec, exec, s[6:7]
+; SDAG-NEXT:    v_mov_b32_e32 v0, v4
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: sitofp_i128_to_f16:
+; GISEL:       ; %bb.0: ; %itofp-entry
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_or_b32_e32 v4, v0, v2
+; GISEL-NEXT:    v_or_b32_e32 v5, v1, v3
+; GISEL-NEXT:    s_mov_b32 s4, 0
+; GISEL-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GISEL-NEXT:    v_mov_b32_e32 v4, s4
+; GISEL-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; GISEL-NEXT:    s_cbranch_execz .LBB4_14
+; GISEL-NEXT:  ; %bb.1: ; %itofp-if-end
+; GISEL-NEXT:    v_ashrrev_i32_e32 v6, 31, v3
+; GISEL-NEXT:    v_xor_b32_e32 v0, v6, v0
+; GISEL-NEXT:    v_xor_b32_e32 v1, v6, v1
+; GISEL-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v6
+; GISEL-NEXT:    v_xor_b32_e32 v2, v6, v2
+; GISEL-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v6, vcc
+; GISEL-NEXT:    v_xor_b32_e32 v3, v6, v3
+; GISEL-NEXT:    v_subb_co_u32_e32 v2, vcc, v2, v6, vcc
+; GISEL-NEXT:    v_ffbh_u32_e32 v5, v0
+; GISEL-NEXT:    v_subb_co_u32_e32 v3, vcc, v3, v6, vcc
+; GISEL-NEXT:    v_ffbh_u32_e32 v4, v1
+; GISEL-NEXT:    v_add_u32_e32 v5, 32, v5
+; GISEL-NEXT:    v_ffbh_u32_e32 v7, v2
+; GISEL-NEXT:    v_min_u32_e32 v4, v4, v5
+; GISEL-NEXT:    v_ffbh_u32_e32 v5, v3
+; GISEL-NEXT:    v_add_u32_e32 v7, 32, v7
+; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[2:3]
+; GISEL-NEXT:    v_add_u32_e32 v4, 64, v4
+; GISEL-NEXT:    v_min_u32_e32 v5, v5, v7
+; GISEL-NEXT:    v_cndmask_b32_e32 v5, v5, v4, vcc
+; GISEL-NEXT:    v_sub_u32_e32 v8, 0x80, v5
+; GISEL-NEXT:    v_sub_u32_e32 v7, 0x7f, v5
+; GISEL-NEXT:    v_cmp_ge_i32_e32 vcc, 24, v8
+; GISEL-NEXT:    ; implicit-def: $vgpr4
+; GISEL-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GISEL-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; GISEL-NEXT:  ; %bb.2: ; %itofp-if-else
+; GISEL-NEXT:    v_add_u32_e32 v2, 0xffffff98, v5
+; GISEL-NEXT:    v_lshlrev_b64 v[0:1], v2, v[0:1]
+; GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v2
+; GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v0, vcc
+; GISEL-NEXT:    ; implicit-def: $vgpr8
+; GISEL-NEXT:    ; implicit-def: $vgpr0
+; GISEL-NEXT:    ; implicit-def: $vgpr5
+; GISEL-NEXT:    ; implicit-def: $vgpr2
+; GISEL-NEXT:  ; %bb.3: ; %Flow3
+; GISEL-NEXT:    s_andn2_saveexec_b64 s[8:9], s[4:5]
+; GISEL-NEXT:    s_cbranch_execz .LBB4_13
+; GISEL-NEXT:  ; %bb.4: ; %NodeBlock
+; GISEL-NEXT:    v_cmp_le_i32_e32 vcc, 26, v8
+; GISEL-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GISEL-NEXT:    s_xor_b64 s[10:11], exec, s[4:5]
+; GISEL-NEXT:    s_cbranch_execz .LBB4_8
+; GISEL-NEXT:  ; %bb.5: ; %LeafBlock
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 26, v8
+; GISEL-NEXT:    s_and_saveexec_b64 s[12:13], vcc
+; GISEL-NEXT:    s_cbranch_execz .LBB4_7
+; GISEL-NEXT:  ; %bb.6: ; %itofp-sw-default
+; GISEL-NEXT:    v_sub_u32_e32 v4, 0x66, v5
+; GISEL-NEXT:    v_sub_u32_e32 v11, 64, v4
+; GISEL-NEXT:    v_lshrrev_b64 v[9:10], v4, v[0:1]
+; GISEL-NEXT:    v_lshlrev_b64 v[11:12], v11, v[2:3]
+; GISEL-NEXT:    v_subrev_u32_e32 v13, 64, v4
+; GISEL-NEXT:    v_or_b32_e32 v11, v9, v11
+; GISEL-NEXT:    v_or_b32_e32 v12, v10, v12
+; GISEL-NEXT:    v_lshrrev_b64 v[9:10], v13, v[2:3]
+; GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v4
+; GISEL-NEXT:    v_add_u32_e32 v5, 26, v5
+; GISEL-NEXT:    v_cndmask_b32_e32 v9, v9, v11, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v10, v10, v12, vcc
+; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
+; GISEL-NEXT:    v_sub_u32_e32 v11, 64, v5
+; GISEL-NEXT:    v_cndmask_b32_e32 v13, v9, v0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v4, v10, v1, vcc
+; GISEL-NEXT:    v_lshrrev_b64 v[9:10], v5, -1
+; GISEL-NEXT:    v_lshlrev_b64 v[11:12], v11, -1
+; GISEL-NEXT:    v_subrev_u32_e32 v14, 64, v5
+; GISEL-NEXT:    v_or_b32_e32 v15, v9, v11
+; GISEL-NEXT:    v_or_b32_e32 v16, v10, v12
+; GISEL-NEXT:    v_lshrrev_b64 v[11:12], v14, -1
+; GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v5
+; GISEL-NEXT:    v_cndmask_b32_e32 v11, v11, v15, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v12, v12, v16, vcc
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v5
+; GISEL-NEXT:    v_cndmask_b32_e32 v9, 0, v9, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v10, 0, v10, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v5, v11, -1, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e64 v11, v12, -1, s[4:5]
+; GISEL-NEXT:    v_and_b32_e32 v2, v9, v2
+; GISEL-NEXT:    v_and_b32_e32 v3, v10, v3
+; GISEL-NEXT:    v_and_or_b32 v0, v5, v0, v2
+; GISEL-NEXT:    v_and_or_b32 v1, v11, v1, v3
+; GISEL-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GISEL-NEXT:    v_or_b32_e32 v3, v13, v0
+; GISEL-NEXT:    v_mov_b32_e32 v0, v3
+; GISEL-NEXT:    v_mov_b32_e32 v1, v4
+; GISEL-NEXT:    v_mov_b32_e32 v2, v5
+; GISEL-NEXT:    v_mov_b32_e32 v3, v6
+; GISEL-NEXT:  .LBB4_7: ; %Flow1
+; GISEL-NEXT:    s_or_b64 exec, exec, s[12:13]
+; GISEL-NEXT:  .LBB4_8: ; %Flow2
+; GISEL-NEXT:    s_andn2_saveexec_b64 s[4:5], s[10:11]
+; GISEL-NEXT:  ; %bb.9: ; %itofp-sw-bb
+; GISEL-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
+; GISEL-NEXT:  ; %bb.10: ; %itofp-sw-epilog
+; GISEL-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GISEL-NEXT:    v_bfe_u32 v2, v0, 2, 1
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v2
+; GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, 1, v0
+; GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GISEL-NEXT:    v_and_b32_e32 v2, 0x4000000, v0
+; GISEL-NEXT:    v_mov_b32_e32 v3, 0
+; GISEL-NEXT:    v_lshrrev_b64 v[4:5], 2, v[0:1]
+; GISEL-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; GISEL-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GISEL-NEXT:  ; %bb.11: ; %itofp-if-then20
+; GISEL-NEXT:    v_lshrrev_b64 v[4:5], 3, v[0:1]
+; GISEL-NEXT:    v_mov_b32_e32 v7, v8
+; GISEL-NEXT:  ; %bb.12: ; %Flow
+; GISEL-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GISEL-NEXT:  .LBB4_13: ; %Flow4
+; GISEL-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GISEL-NEXT:    v_and_b32_e32 v0, 0x80000000, v6
+; GISEL-NEXT:    v_lshl_add_u32 v1, v7, 23, 1.0
+; GISEL-NEXT:    v_and_b32_e32 v2, 0x7fffff, v4
+; GISEL-NEXT:    v_or3_b32 v0, v2, v0, v1
+; GISEL-NEXT:    v_cvt_f16_f32_e32 v4, v0
+; GISEL-NEXT:  .LBB4_14: ; %Flow5
+; GISEL-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GISEL-NEXT:    v_mov_b32_e32 v0, v4
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %cvt = sitofp i128 %x to half
+  ret half %cvt
+}
+
+define half @uitofp_i128_to_f16(i128 %x) {
+; SDAG-LABEL: uitofp_i128_to_f16:
+; SDAG:       ; %bb.0: ; %itofp-entry
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_or_b32_e32 v5, v1, v3
+; SDAG-NEXT:    v_or_b32_e32 v4, v0, v2
+; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; SDAG-NEXT:    v_mov_b32_e32 v4, 0
+; SDAG-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; SDAG-NEXT:    s_cbranch_execz .LBB5_14
+; SDAG-NEXT:  ; %bb.1: ; %itofp-if-end
+; SDAG-NEXT:    v_ffbh_u32_e32 v4, v2
+; SDAG-NEXT:    v_add_u32_e32 v4, 32, v4
+; SDAG-NEXT:    v_ffbh_u32_e32 v5, v3
+; SDAG-NEXT:    v_min_u32_e32 v4, v4, v5
+; SDAG-NEXT:    v_ffbh_u32_e32 v5, v0
+; SDAG-NEXT:    v_add_u32_e32 v5, 32, v5
+; SDAG-NEXT:    v_ffbh_u32_e32 v6, v1
+; SDAG-NEXT:    v_min_u32_e32 v5, v5, v6
+; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; SDAG-NEXT:    v_add_u32_e32 v5, 64, v5
+; SDAG-NEXT:    v_cndmask_b32_e32 v6, v5, v4, vcc
+; SDAG-NEXT:    v_sub_u32_e32 v5, 0x80, v6
+; SDAG-NEXT:    v_sub_u32_e32 v4, 0x7f, v6
+; SDAG-NEXT:    v_cmp_gt_i32_e32 vcc, 25, v5
+; SDAG-NEXT:    ; implicit-def: $vgpr7
+; SDAG-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; SDAG-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; SDAG-NEXT:  ; %bb.2: ; %itofp-if-else
+; SDAG-NEXT:    v_add_u32_e32 v2, 0xffffff98, v6
+; SDAG-NEXT:    v_lshlrev_b64 v[0:1], v2, v[0:1]
+; SDAG-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v2
+; SDAG-NEXT:    v_cndmask_b32_e32 v7, 0, v0, vcc
+; SDAG-NEXT:    ; implicit-def: $vgpr5
+; SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; SDAG-NEXT:    ; implicit-def: $vgpr6
+; SDAG-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; SDAG-NEXT:  ; %bb.3: ; %Flow3
+; SDAG-NEXT:    s_andn2_saveexec_b64 s[8:9], s[4:5]
+; SDAG-NEXT:    s_cbranch_execz .LBB5_13
+; SDAG-NEXT:  ; %bb.4: ; %NodeBlock
+; SDAG-NEXT:    v_cmp_lt_i32_e32 vcc, 25, v5
+; SDAG-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; SDAG-NEXT:    s_xor_b64 s[10:11], exec, s[4:5]
+; SDAG-NEXT:    s_cbranch_execz .LBB5_8
+; SDAG-NEXT:  ; %bb.5: ; %LeafBlock
+; SDAG-NEXT:    v_cmp_ne_u32_e32 vcc, 26, v5
+; SDAG-NEXT:    s_and_saveexec_b64 s[12:13], vcc
+; SDAG-NEXT:    s_cbranch_execz .LBB5_7
+; SDAG-NEXT:  ; %bb.6: ; %itofp-sw-default
+; SDAG-NEXT:    v_sub_u32_e32 v11, 0x66, v6
+; SDAG-NEXT:    v_sub_u32_e32 v9, 64, v11
+; SDAG-NEXT:    v_lshrrev_b64 v[7:8], v11, v[0:1]
+; SDAG-NEXT:    v_lshlrev_b64 v[9:10], v9, v[2:3]
+; SDAG-NEXT:    v_sub_u32_e32 v12, 38, v6
+; SDAG-NEXT:    v_or_b32_e32 v10, v8, v10
+; SDAG-NEXT:    v_or_b32_e32 v9, v7, v9
+; SDAG-NEXT:    v_lshrrev_b64 v[7:8], v12, v[2:3]
+; SDAG-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v11
+; SDAG-NEXT:    v_add_u32_e32 v13, 26, v6
+; SDAG-NEXT:    v_cndmask_b32_e32 v8, v8, v10, vcc
+; SDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v11
+; SDAG-NEXT:    v_cndmask_b32_e32 v7, v7, v9, vcc
+; SDAG-NEXT:    v_lshrrev_b64 v[9:10], v12, v[0:1]
+; SDAG-NEXT:    v_lshlrev_b64 v[11:12], v13, v[2:3]
+; SDAG-NEXT:    v_subrev_u32_e32 v6, 38, v6
+; SDAG-NEXT:    v_cndmask_b32_e64 v14, v7, v0, s[4:5]
+; SDAG-NEXT:    v_lshlrev_b64 v[6:7], v6, v[0:1]
+; SDAG-NEXT:    v_cndmask_b32_e64 v8, v8, v1, s[4:5]
+; SDAG-NEXT:    v_or_b32_e32 v10, v12, v10
+; SDAG-NEXT:    v_or_b32_e32 v9, v11, v9
+; SDAG-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v13
+; SDAG-NEXT:    v_lshlrev_b64 v[0:1], v13, v[0:1]
+; SDAG-NEXT:    v_cndmask_b32_e32 v7, v7, v10, vcc
+; SDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v13
+; SDAG-NEXT:    v_cndmask_b32_e32 v6, v6, v9, vcc
+; SDAG-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; SDAG-NEXT:    v_or_b32_e32 v1, v1, v3
+; SDAG-NEXT:    v_or_b32_e32 v0, v0, v2
+; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; SDAG-NEXT:    v_or_b32_e32 v7, v14, v0
+; SDAG-NEXT:    v_mov_b32_e32 v0, v7
+; SDAG-NEXT:    v_mov_b32_e32 v1, v8
+; SDAG-NEXT:  .LBB5_7: ; %Flow1
+; SDAG-NEXT:    s_or_b64 exec, exec, s[12:13]
+; SDAG-NEXT:  .LBB5_8: ; %Flow2
+; SDAG-NEXT:    s_andn2_saveexec_b64 s[4:5], s[10:11]
+; SDAG-NEXT:  ; %bb.9: ; %itofp-sw-bb
+; SDAG-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
+; SDAG-NEXT:  ; %bb.10: ; %itofp-sw-epilog
+; SDAG-NEXT:    s_or_b64 exec, exec, s[4:5]
+; SDAG-NEXT:    v_lshrrev_b32_e32 v2, 2, v0
+; SDAG-NEXT:    v_and_or_b32 v0, v2, 1, v0
+; SDAG-NEXT:    v_add_co_u32_e32 v0, vcc, 1, v0
+; SDAG-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; SDAG-NEXT:    v_and_b32_e32 v2, 0x4000000, v0
+; SDAG-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
+; SDAG-NEXT:    v_alignbit_b32 v7, v1, v0, 2
+; SDAG-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; SDAG-NEXT:  ; %bb.11: ; %itofp-if-then20
+; SDAG-NEXT:    v_alignbit_b32 v7, v1, v0, 3
+; SDAG-NEXT:    v_mov_b32_e32 v4, v5
+; SDAG-NEXT:  ; %bb.12: ; %Flow
+; SDAG-NEXT:    s_or_b64 exec, exec, s[4:5]
+; SDAG-NEXT:  .LBB5_13: ; %Flow4
+; SDAG-NEXT:    s_or_b64 exec, exec, s[8:9]
+; SDAG-NEXT:    v_and_b32_e32 v0, 0x7fffff, v7
+; SDAG-NEXT:    v_lshl_or_b32 v0, v4, 23, v0
+; SDAG-NEXT:    v_add_u32_e32 v0, 1.0, v0
+; SDAG-NEXT:    v_cvt_f16_f32_e32 v4, v0
+; SDAG-NEXT:  .LBB5_14: ; %Flow5
+; SDAG-NEXT:    s_or_b64 exec, exec, s[6:7]
+; SDAG-NEXT:    v_mov_b32_e32 v0, v4
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: uitofp_i128_to_f16:
+; GISEL:       ; %bb.0: ; %itofp-entry
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_or_b32_e32 v4, v0, v2
+; GISEL-NEXT:    v_or_b32_e32 v5, v1, v3
+; GISEL-NEXT:    s_mov_b32 s4, 0
+; GISEL-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GISEL-NEXT:    v_mov_b32_e32 v4, s4
+; GISEL-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; GISEL-NEXT:    s_cbranch_execz .LBB5_14
+; GISEL-NEXT:  ; %bb.1: ; %itofp-if-end
+; GISEL-NEXT:    v_ffbh_u32_e32 v5, v0
+; GISEL-NEXT:    v_ffbh_u32_e32 v4, v1
+; GISEL-NEXT:    v_add_u32_e32 v5, 32, v5
+; GISEL-NEXT:    v_ffbh_u32_e32 v6, v2
+; GISEL-NEXT:    v_min_u32_e32 v4, v4, v5
+; GISEL-NEXT:    v_ffbh_u32_e32 v5, v3
+; GISEL-NEXT:    v_add_u32_e32 v6, 32, v6
+; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[2:3]
+; GISEL-NEXT:    v_add_u32_e32 v4, 64, v4
+; GISEL-NEXT:    v_min_u32_e32 v5, v5, v6
+; GISEL-NEXT:    v_cndmask_b32_e32 v5, v5, v4, vcc
+; GISEL-NEXT:    v_sub_u32_e32 v7, 0x80, v5
+; GISEL-NEXT:    v_sub_u32_e32 v6, 0x7f, v5
+; GISEL-NEXT:    v_cmp_ge_i32_e32 vcc, 24, v7
+; GISEL-NEXT:    ; implicit-def: $vgpr4
+; GISEL-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GISEL-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; GISEL-NEXT:  ; %bb.2: ; %itofp-if-else
+; GISEL-NEXT:    v_add_u32_e32 v2, 0xffffff98, v5
+; GISEL-NEXT:    v_lshlrev_b64 v[0:1], v2, v[0:1]
+; GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v2
+; GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v0, vcc
+; GISEL-NEXT:    ; implicit-def: $vgpr7
+; GISEL-NEXT:    ; implicit-def: $vgpr0
+; GISEL-NEXT:    ; implicit-def: $vgpr5
+; GISEL-NEXT:    ; implicit-def: $vgpr2
+; GISEL-NEXT:  ; %bb.3: ; %Flow3
+; GISEL-NEXT:    s_andn2_saveexec_b64 s[8:9], s[4:5]
+; GISEL-NEXT:    s_cbranch_execz .LBB5_13
+; GISEL-NEXT:  ; %bb.4: ; %NodeBlock
+; GISEL-NEXT:    v_cmp_le_i32_e32 vcc, 26, v7
+; GISEL-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GISEL-NEXT:    s_xor_b64 s[10:11], exec, s[4:5]
+; GISEL-NEXT:    s_cbranch_execz .LBB5_8
+; GISEL-NEXT:  ; %bb.5: ; %LeafBlock
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 26, v7
+; GISEL-NEXT:    s_and_saveexec_b64 s[12:13], vcc
+; GISEL-NEXT:    s_cbranch_execz .LBB5_7
+; GISEL-NEXT:  ; %bb.6: ; %itofp-sw-default
+; GISEL-NEXT:    v_sub_u32_e32 v4, 0x66, v5
+; GISEL-NEXT:    v_sub_u32_e32 v10, 64, v4
+; GISEL-NEXT:    v_lshrrev_b64 v[8:9], v4, v[0:1]
+; GISEL-NEXT:    v_lshlrev_b64 v[10:11], v10, v[2:3]
+; GISEL-NEXT:    v_subrev_u32_e32 v12, 64, v4
+; GISEL-NEXT:    v_or_b32_e32 v10, v8, v10
+; GISEL-NEXT:    v_or_b32_e32 v11, v9, v11
+; GISEL-NEXT:    v_lshrrev_b64 v[8:9], v12, v[2:3]
+; GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v4
+; GISEL-NEXT:    v_add_u32_e32 v5, 26, v5
+; GISEL-NEXT:    v_cndmask_b32_e32 v8, v8, v10, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v9, v9, v11, vcc
+; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
+; GISEL-NEXT:    v_sub_u32_e32 v10, 64, v5
+; GISEL-NEXT:    v_cndmask_b32_e32 v12, v8, v0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v4, v9, v1, vcc
+; GISEL-NEXT:    v_lshrrev_b64 v[8:9], v5, -1
+; GISEL-NEXT:    v_lshlrev_b64 v[10:11], v10, -1
+; GISEL-NEXT:    v_subrev_u32_e32 v13, 64, v5
+; GISEL-NEXT:    v_or_b32_e32 v14, v8, v10
+; GISEL-NEXT:    v_or_b32_e32 v15, v9, v11
+; GISEL-NEXT:    v_lshrrev_b64 v[10:11], v13, -1
+; GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v5
+; GISEL-NEXT:    v_cndmask_b32_e32 v10, v10, v14, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v11, v11, v15, vcc
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v5
+; GISEL-NEXT:    v_cndmask_b32_e32 v8, 0, v8, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v9, 0, v9, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v5, v10, -1, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e64 v10, v11, -1, s[4:5]
+; GISEL-NEXT:    v_and_b32_e32 v2, v8, v2
+; GISEL-NEXT:    v_and_b32_e32 v3, v9, v3
+; GISEL-NEXT:    v_and_or_b32 v0, v5, v0, v2
+; GISEL-NEXT:    v_and_or_b32 v1, v10, v1, v3
+; GISEL-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GISEL-NEXT:    v_or_b32_e32 v3, v12, v0
+; GISEL-NEXT:    v_mov_b32_e32 v0, v3
+; GISEL-NEXT:    v_mov_b32_e32 v1, v4
+; GISEL-NEXT:    v_mov_b32_e32 v2, v5
+; GISEL-NEXT:    v_mov_b32_e32 v3, v6
+; GISEL-NEXT:  .LBB5_7: ; %Flow1
+; GISEL-NEXT:    s_or_b64 exec, exec, s[12:13]
+; GISEL-NEXT:  .LBB5_8: ; %Flow2
+; GISEL-NEXT:    s_andn2_saveexec_b64 s[4:5], s[10:11]
+; GISEL-NEXT:  ; %bb.9: ; %itofp-sw-bb
+; GISEL-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
+; GISEL-NEXT:  ; %bb.10: ; %itofp-sw-epilog
+; GISEL-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GISEL-NEXT:    v_bfe_u32 v2, v0, 2, 1
+; GISEL-NEXT:    v_or_b32_e32 v0, v0, v2
+; GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, 1, v0
+; GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GISEL-NEXT:    v_and_b32_e32 v2, 0x4000000, v0
+; GISEL-NEXT:    v_mov_b32_e32 v3, 0
+; GISEL-NEXT:    v_lshrrev_b64 v[4:5], 2, v[0:1]
+; GISEL-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; GISEL-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GISEL-NEXT:  ; %bb.11: ; %itofp-if-then20
+; GISEL-NEXT:    v_lshrrev_b64 v[4:5], 3, v[0:1]
+; GISEL-NEXT:    v_mov_b32_e32 v6, v7
+; GISEL-NEXT:  ; %bb.12: ; %Flow
+; GISEL-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GISEL-NEXT:  .LBB5_13: ; %Flow4
+; GISEL-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GISEL-NEXT:    v_lshl_add_u32 v0, v6, 23, 1.0
+; GISEL-NEXT:    v_mov_b32_e32 v1, 0x7fffff
+; GISEL-NEXT:    v_and_or_b32 v0, v4, v1, v0
+; GISEL-NEXT:    v_cvt_f16_f32_e32 v4, v0
+; GISEL-NEXT:  .LBB5_14: ; %Flow5
+; GISEL-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GISEL-NEXT:    v_mov_b32_e32 v0, v4
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %cvt = uitofp i128 %x to half
+  ret half %cvt
+}
+
+; FIXME: ExpandLargeFpConvert asserts on bfloat
+; define bfloat @sitofp_i128_to_bf16(i128 %x) {
+;   %cvt = sitofp i128 %x to bfloat
+;   ret bfloat %cvt
+; }
+
+; define bfloat @uitofp_i128_to_bf16(i128 %x) {
+;   %cvt = uitofp i128 %x to bfloat
+;   ret bfloat %cvt
+; }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GCN: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
index 5007f77316f5b4..0ff5dd3680dfab 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -195,13 +195,13 @@
 ; GCN-O1-NEXT:      Uniformity Analysis
 ; GCN-O1-NEXT:      AMDGPU atomic optimizations
 ; GCN-O1-NEXT:      Expand Atomic instructions
-; GCN-O1-NEXT:      AMDGPU Promote Alloca
 ; GCN-O1-NEXT:      Dominator Tree Construction
+; GCN-O1-NEXT:      Natural Loop Information
+; GCN-O1-NEXT:      AMDGPU Promote Alloca
 ; GCN-O1-NEXT:      Cycle Info Analysis
 ; GCN-O1-NEXT:      Uniformity Analysis
 ; GCN-O1-NEXT:      AMDGPU IR optimizations
 ; GCN-O1-NEXT:      Basic Alias Analysis (stateless AA impl)
-; GCN-O1-NEXT:      Natural Loop Information
 ; GCN-O1-NEXT:      Canonicalize natural loops
 ; GCN-O1-NEXT:      Scalar Evolution Analysis
 ; GCN-O1-NEXT:      Loop Pass Manager
@@ -470,9 +470,9 @@
 ; GCN-O1-OPTS-NEXT:      Uniformity Analysis
 ; GCN-O1-OPTS-NEXT:      AMDGPU atomic optimizations
 ; GCN-O1-OPTS-NEXT:      Expand Atomic instructions
-; GCN-O1-OPTS-NEXT:      AMDGPU Promote Alloca
 ; GCN-O1-OPTS-NEXT:      Dominator Tree Construction
 ; GCN-O1-OPTS-NEXT:      Natural Loop Information
+; GCN-O1-OPTS-NEXT:      AMDGPU Promote Alloca
 ; GCN-O1-OPTS-NEXT:      Canonicalize natural loops
 ; GCN-O1-OPTS-NEXT:      Lazy Branch Probability Analysis
 ; GCN-O1-OPTS-NEXT:      Lazy Block Frequency Analysis
@@ -775,9 +775,9 @@
 ; GCN-O2-NEXT:      Uniformity Analysis
 ; GCN-O2-NEXT:      AMDGPU atomic optimizations
 ; GCN-O2-NEXT:      Expand Atomic instructions
-; GCN-O2-NEXT:      AMDGPU Promote Alloca
 ; GCN-O2-NEXT:      Dominator Tree Construction
 ; GCN-O2-NEXT:      Natural Loop Information
+; GCN-O2-NEXT:      AMDGPU Promote Alloca
 ; GCN-O2-NEXT:      Split GEPs to a variadic base and a constant offset for better CSE
 ; GCN-O2-NEXT:      Scalar Evolution Analysis
 ; GCN-O2-NEXT:      Straight line strength reduction
@@ -1084,9 +1084,9 @@
 ; GCN-O3-NEXT:      Uniformity Analysis
 ; GCN-O3-NEXT:      AMDGPU atomic optimizations
 ; GCN-O3-NEXT:      Expand Atomic instructions
-; GCN-O3-NEXT:      AMDGPU Promote Alloca
 ; GCN-O3-NEXT:      Dominator Tree Construction
 ; GCN-O3-NEXT:      Natural Loop Information
+; GCN-O3-NEXT:      AMDGPU Promote Alloca
 ; GCN-O3-NEXT:      Split GEPs to a variadic base and a constant offset for better CSE
 ; GCN-O3-NEXT:      Scalar Evolution Analysis
 ; GCN-O3-NEXT:      Straight line strength reduction
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx12.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx12.ll
new file mode 100644
index 00000000000000..fdcb1773d0a3f4
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx12.ll
@@ -0,0 +1,333 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -misched-cluster=0 < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -misched-cluster=0 -amdgpu-igrouplp-exact-solver-max-branches=250000 < %s | FileCheck -check-prefix=EXACTCUTOFF %s
+
+declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16..i16(<8 x half>, <16 x half>, <8 x half>, i16)
+
+define amdgpu_kernel void @test_sched_group_barrier_pipeline_SWMMAC_cluster(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 {
+; GCN-LABEL: test_sched_group_barrier_pipeline_SWMMAC_cluster:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GCN-NEXT:    v_lshlrev_b32_e32 v28, 4, v0
+; GCN-NEXT:    v_mov_b32_e32 v48, 0
+; GCN-NEXT:    s_wait_kmcnt 0x0
+; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GCN-NEXT:    v_add_nc_u32_e32 v0, s0, v28
+; GCN-NEXT:    v_dual_mov_b32 v50, s1 :: v_dual_add_nc_u32 v49, s1, v28
+; GCN-NEXT:    ds_load_b128 v[8:11], v0
+; GCN-NEXT:    ds_load_b128 v[12:15], v0 offset:512
+; GCN-NEXT:    ds_load_b128 v[16:19], v0 offset:1536
+; GCN-NEXT:    ds_load_b128 v[20:23], v0 offset:3072
+; GCN-NEXT:    ds_load_b128 v[24:27], v0 offset:5120
+; GCN-NEXT:    ds_load_b128 v[4:7], v0 offset:11280
+; GCN-NEXT:    ds_load_b128 v[0:3], v0 offset:11264
+; GCN-NEXT:    ; sched_group_barrier mask(0x00000100) size(7) SyncID(0)
+; GCN-NEXT:    s_wait_dscnt 0x6
+; GCN-NEXT:    v_mov_b32_e32 v31, v11
+; GCN-NEXT:    s_wait_dscnt 0x5
+; GCN-NEXT:    v_mov_b32_e32 v35, v15
+; GCN-NEXT:    s_wait_dscnt 0x4
+; GCN-NEXT:    v_mov_b32_e32 v39, v19
+; GCN-NEXT:    s_wait_dscnt 0x3
+; GCN-NEXT:    v_mov_b32_e32 v43, v23
+; GCN-NEXT:    s_wait_dscnt 0x2
+; GCN-NEXT:    v_dual_mov_b32 v47, v27 :: v_dual_mov_b32 v30, v10
+; GCN-NEXT:    v_dual_mov_b32 v29, v9 :: v_dual_mov_b32 v28, v8
+; GCN-NEXT:    v_dual_mov_b32 v34, v14 :: v_dual_mov_b32 v33, v13
+; GCN-NEXT:    v_mov_b32_e32 v32, v12
+; GCN-NEXT:    v_dual_mov_b32 v38, v18 :: v_dual_mov_b32 v37, v17
+; GCN-NEXT:    v_mov_b32_e32 v36, v16
+; GCN-NEXT:    v_dual_mov_b32 v42, v22 :: v_dual_mov_b32 v41, v21
+; GCN-NEXT:    v_mov_b32_e32 v40, v20
+; GCN-NEXT:    v_dual_mov_b32 v46, v26 :: v_dual_mov_b32 v45, v25
+; GCN-NEXT:    v_mov_b32_e32 v44, v24
+; GCN-NEXT:    s_wait_dscnt 0x0
+; GCN-NEXT:    v_swmmac_f16_16x16x32_f16 v[28:31], v[8:11], v[0:7], v48
+; GCN-NEXT:    v_swmmac_f16_16x16x32_f16 v[32:35], v[12:15], v[0:7], v48
+; GCN-NEXT:    v_swmmac_f16_16x16x32_f16 v[36:39], v[16:19], v[0:7], v48
+; GCN-NEXT:    v_swmmac_f16_16x16x32_f16 v[40:43], v[20:23], v[0:7], v48
+; GCN-NEXT:    v_swmmac_f16_16x16x32_f16 v[44:47], v[24:27], v[0:7], v48
+; GCN-NEXT:    ds_store_b128 v49, v[28:31]
+; GCN-NEXT:    ds_store_b128 v50, v[32:35] offset:512
+; GCN-NEXT:    ds_store_b128 v50, v[36:39] offset:1024
+; GCN-NEXT:    ds_store_b128 v50, v[40:43] offset:1536
+; GCN-NEXT:    ds_store_b128 v50, v[44:47] offset:2048
+; GCN-NEXT:    ; sched_group_barrier mask(0x00000008) size(5) SyncID(0)
+; GCN-NEXT:    ; sched_group_barrier mask(0x00000200) size(5) SyncID(0)
+; GCN-NEXT:    s_endpgm
+;
+; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_SWMMAC_cluster:
+; EXACTCUTOFF:       ; %bb.0: ; %entry
+; EXACTCUTOFF-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; EXACTCUTOFF-NEXT:    v_lshlrev_b32_e32 v28, 4, v0
+; EXACTCUTOFF-NEXT:    v_mov_b32_e32 v48, 0
+; EXACTCUTOFF-NEXT:    s_wait_kmcnt 0x0
+; EXACTCUTOFF-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; EXACTCUTOFF-NEXT:    v_add_nc_u32_e32 v0, s0, v28
+; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v50, s1 :: v_dual_add_nc_u32 v49, s1, v28
+; EXACTCUTOFF-NEXT:    ds_load_b128 v[8:11], v0
+; EXACTCUTOFF-NEXT:    ds_load_b128 v[12:15], v0 offset:512
+; EXACTCUTOFF-NEXT:    ds_load_b128 v[16:19], v0 offset:1536
+; EXACTCUTOFF-NEXT:    ds_load_b128 v[20:23], v0 offset:3072
+; EXACTCUTOFF-NEXT:    ds_load_b128 v[24:27], v0 offset:5120
+; EXACTCUTOFF-NEXT:    ds_load_b128 v[4:7], v0 offset:11280
+; EXACTCUTOFF-NEXT:    ds_load_b128 v[0:3], v0 offset:11264
+; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000100) size(7) SyncID(0)
+; EXACTCUTOFF-NEXT:    s_wait_dscnt 0x6
+; EXACTCUTOFF-NEXT:    v_mov_b32_e32 v31, v11
+; EXACTCUTOFF-NEXT:    s_wait_dscnt 0x5
+; EXACTCUTOFF-NEXT:    v_mov_b32_e32 v35, v15
+; EXACTCUTOFF-NEXT:    s_wait_dscnt 0x4
+; EXACTCUTOFF-NEXT:    v_mov_b32_e32 v39, v19
+; EXACTCUTOFF-NEXT:    s_wait_dscnt 0x3
+; EXACTCUTOFF-NEXT:    v_mov_b32_e32 v43, v23
+; EXACTCUTOFF-NEXT:    s_wait_dscnt 0x2
+; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v47, v27 :: v_dual_mov_b32 v30, v10
+; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v29, v9 :: v_dual_mov_b32 v28, v8
+; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v34, v14 :: v_dual_mov_b32 v33, v13
+; EXACTCUTOFF-NEXT:    v_mov_b32_e32 v32, v12
+; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v38, v18 :: v_dual_mov_b32 v37, v17
+; EXACTCUTOFF-NEXT:    v_mov_b32_e32 v36, v16
+; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v42, v22 :: v_dual_mov_b32 v41, v21
+; EXACTCUTOFF-NEXT:    v_mov_b32_e32 v40, v20
+; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v46, v26 :: v_dual_mov_b32 v45, v25
+; EXACTCUTOFF-NEXT:    v_mov_b32_e32 v44, v24
+; EXACTCUTOFF-NEXT:    s_wait_dscnt 0x0
+; EXACTCUTOFF-NEXT:    v_swmmac_f16_16x16x32_f16 v[28:31], v[8:11], v[0:7], v48
+; EXACTCUTOFF-NEXT:    v_swmmac_f16_16x16x32_f16 v[32:35], v[12:15], v[0:7], v48
+; EXACTCUTOFF-NEXT:    v_swmmac_f16_16x16x32_f16 v[36:39], v[16:19], v[0:7], v48
+; EXACTCUTOFF-NEXT:    v_swmmac_f16_16x16x32_f16 v[40:43], v[20:23], v[0:7], v48
+; EXACTCUTOFF-NEXT:    v_swmmac_f16_16x16x32_f16 v[44:47], v[24:27], v[0:7], v48
+; EXACTCUTOFF-NEXT:    ds_store_b128 v49, v[28:31]
+; EXACTCUTOFF-NEXT:    ds_store_b128 v50, v[32:35] offset:512
+; EXACTCUTOFF-NEXT:    ds_store_b128 v50, v[36:39] offset:1024
+; EXACTCUTOFF-NEXT:    ds_store_b128 v50, v[40:43] offset:1536
+; EXACTCUTOFF-NEXT:    ds_store_b128 v50, v[44:47] offset:2048
+; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000008) size(5) SyncID(0)
+; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000200) size(5) SyncID(0)
+; EXACTCUTOFF-NEXT:    s_endpgm
+entry:
+  %idx = call i32 @llvm.amdgcn.workitem.id.x()
+  %load.0.addr = getelementptr <8 x half>, ptr addrspace(3) %in, i32 %idx
+  %load.0 = load <8 x half>, ptr addrspace(3) %load.0.addr
+  %load.1.addr = getelementptr <8 x half>, ptr addrspace(3) %load.0.addr, i32 32
+  %load.1 = load <8 x half>, ptr addrspace(3) %load.1.addr
+  %load.2.addr = getelementptr <8 x half>, ptr addrspace(3) %load.1.addr, i32 64
+  %load.2 = load <8 x half>, ptr addrspace(3) %load.2.addr
+  %load.3.addr = getelementptr <8 x half>, ptr addrspace(3) %load.2.addr, i32 96
+  %load.3 = load <8 x half>, ptr addrspace(3) %load.3.addr
+  %load.4.addr = getelementptr <8 x half>, ptr addrspace(3) %load.3.addr, i32 128
+  %load.4 = load <8 x half>, ptr addrspace(3) %load.4.addr
+  %load.b.addr = getelementptr <16 x half>, ptr addrspace(3) %load.4.addr, i32 192
+  %load.b = load <16 x half>, ptr addrspace(3) %load.b.addr
+  %mai.0 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %load.0, <16 x half> %load.b, <8 x half> %load.0, i1 0)
+  %mai.1 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %load.1, <16 x half> %load.b, <8 x half> %load.1, i1 0)
+  %mai.2 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %load.2, <16 x half> %load.b, <8 x half> %load.2, i1 0)
+  %mai.3 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %load.3, <16 x half> %load.b, <8 x half> %load.3, i1 0)
+  %mai.4 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %load.4, <16 x half> %load.b, <8 x half> %load.4, i1 0)
+  %store.0.addr = getelementptr <8 x half>, ptr addrspace(3) %out, i32 %idx
+  store <8 x half> %mai.0, ptr addrspace(3) %store.0.addr
+  %store.1.addr = getelementptr <8 x half>, ptr addrspace(3) %out, i32 32
+  store <8 x half> %mai.1, ptr addrspace(3) %store.1.addr
+  %store.2.addr = getelementptr <8 x half>, ptr addrspace(3) %out, i32 64
+  store <8 x half> %mai.2, ptr addrspace(3) %store.2.addr
+  %store.3.addr = getelementptr <8 x half>, ptr addrspace(3) %out, i32 96
+  store <8 x half> %mai.3, ptr addrspace(3) %store.3.addr
+  %store.4.addr = getelementptr <8 x half>, ptr addrspace(3) %out, i32 128
+  store <8 x half> %mai.4, ptr addrspace(3) %store.4.addr
+  ; 7 DS read
+  call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 7, i32 0)
+  ; 5 SWMMAC
+  call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 5, i32 0)
+  ; 5 DS write
+  call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 5, i32 0)
+  ret void
+}
+
+define amdgpu_kernel void @test_sched_group_barrier_pipeline_SWMMAC_interleaved(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 {
+; GCN-LABEL: test_sched_group_barrier_pipeline_SWMMAC_interleaved:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GCN-NEXT:    v_mov_b32_e32 v18, 0
+; GCN-NEXT:    s_wait_kmcnt 0x0
+; GCN-NEXT:    v_lshl_add_u32 v17, v0, 5, s0
+; GCN-NEXT:    v_lshl_add_u32 v0, v0, 4, s1
+; GCN-NEXT:    ds_load_b128 v[9:12], v17 offset:1024
+; GCN-NEXT:    ds_load_b128 v[1:4], v17
+; GCN-NEXT:    ds_load_b128 v[5:8], v17 offset:16
+; GCN-NEXT:    ; sched_group_barrier mask(0x00000100) size(3) SyncID(0)
+; GCN-NEXT:    s_wait_dscnt 0x2
+; GCN-NEXT:    v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v15, v11
+; GCN-NEXT:    v_dual_mov_b32 v14, v10 :: v_dual_mov_b32 v13, v9
+; GCN-NEXT:    s_wait_dscnt 0x0
+; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT:    v_swmmac_f16_16x16x32_f16 v[13:16], v[9:12], v[1:8], v18
+; GCN-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
+; GCN-NEXT:    ds_store_b128 v0, v[13:16]
+; GCN-NEXT:    ds_load_b128 v[9:12], v17 offset:2560
+; GCN-NEXT:    v_mov_b32_e32 v0, s1
+; GCN-NEXT:    ; sched_group_barrier mask(0x00000200) size(1) SyncID(0)
+; GCN-NEXT:    ; sched_group_barrier mask(0x00000100) size(1) SyncID(0)
+; GCN-NEXT:    s_wait_dscnt 0x0
+; GCN-NEXT:    v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v15, v11
+; GCN-NEXT:    v_dual_mov_b32 v14, v10 :: v_dual_mov_b32 v13, v9
+; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT:    v_swmmac_f16_16x16x32_f16 v[13:16], v[9:12], v[1:8], v18
+; GCN-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
+; GCN-NEXT:    ds_store_b128 v0, v[13:16] offset:512
+; GCN-NEXT:    ds_load_b128 v[9:12], v17 offset:4608
+; GCN-NEXT:    ; sched_group_barrier mask(0x00000200) size(1) SyncID(0)
+; GCN-NEXT:    ; sched_group_barrier mask(0x00000100) size(1) SyncID(0)
+; GCN-NEXT:    s_wait_dscnt 0x0
+; GCN-NEXT:    v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v15, v11
+; GCN-NEXT:    v_dual_mov_b32 v14, v10 :: v_dual_mov_b32 v13, v9
+; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT:    v_swmmac_f16_16x16x32_f16 v[13:16], v[9:12], v[1:8], v18
+; GCN-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
+; GCN-NEXT:    ds_store_b128 v0, v[13:16] offset:1024
+; GCN-NEXT:    ds_load_b128 v[9:12], v17 offset:7168
+; GCN-NEXT:    ; sched_group_barrier mask(0x00000200) size(1) SyncID(0)
+; GCN-NEXT:    ; sched_group_barrier mask(0x00000100) size(1) SyncID(0)
+; GCN-NEXT:    s_wait_dscnt 0x0
+; GCN-NEXT:    v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v15, v11
+; GCN-NEXT:    v_dual_mov_b32 v14, v10 :: v_dual_mov_b32 v13, v9
+; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT:    v_swmmac_f16_16x16x32_f16 v[13:16], v[9:12], v[1:8], v18
+; GCN-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
+; GCN-NEXT:    ds_store_b128 v0, v[13:16] offset:1536
+; GCN-NEXT:    ds_load_b128 v[9:12], v17 offset:10240
+; GCN-NEXT:    ; sched_group_barrier mask(0x00000200) size(1) SyncID(0)
+; GCN-NEXT:    ; sched_group_barrier mask(0x00000100) size(1) SyncID(0)
+; GCN-NEXT:    s_wait_dscnt 0x0
+; GCN-NEXT:    v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v15, v11
+; GCN-NEXT:    v_dual_mov_b32 v14, v10 :: v_dual_mov_b32 v13, v9
+; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT:    v_swmmac_f16_16x16x32_f16 v[13:16], v[9:12], v[1:8], v18
+; GCN-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
+; GCN-NEXT:    ds_store_b128 v0, v[13:16] offset:2048
+; GCN-NEXT:    ; sched_group_barrier mask(0x00000200) size(1) SyncID(0)
+; GCN-NEXT:    s_endpgm
+;
+; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_SWMMAC_interleaved:
+; EXACTCUTOFF:       ; %bb.0: ; %entry
+; EXACTCUTOFF-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; EXACTCUTOFF-NEXT:    v_mov_b32_e32 v18, 0
+; EXACTCUTOFF-NEXT:    s_wait_kmcnt 0x0
+; EXACTCUTOFF-NEXT:    v_lshl_add_u32 v17, v0, 5, s0
+; EXACTCUTOFF-NEXT:    v_lshl_add_u32 v0, v0, 4, s1
+; EXACTCUTOFF-NEXT:    ds_load_b128 v[9:12], v17 offset:1024
+; EXACTCUTOFF-NEXT:    ds_load_b128 v[1:4], v17
+; EXACTCUTOFF-NEXT:    ds_load_b128 v[5:8], v17 offset:16
+; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000100) size(3) SyncID(0)
+; EXACTCUTOFF-NEXT:    s_wait_dscnt 0x2
+; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v15, v11
+; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v14, v10 :: v_dual_mov_b32 v13, v9
+; EXACTCUTOFF-NEXT:    s_wait_dscnt 0x0
+; EXACTCUTOFF-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; EXACTCUTOFF-NEXT:    v_swmmac_f16_16x16x32_f16 v[13:16], v[9:12], v[1:8], v18
+; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
+; EXACTCUTOFF-NEXT:    ds_store_b128 v0, v[13:16]
+; EXACTCUTOFF-NEXT:    ds_load_b128 v[9:12], v17 offset:2560
+; EXACTCUTOFF-NEXT:    v_mov_b32_e32 v0, s1
+; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000200) size(1) SyncID(0)
+; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000100) size(1) SyncID(0)
+; EXACTCUTOFF-NEXT:    s_wait_dscnt 0x0
+; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v15, v11
+; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v14, v10 :: v_dual_mov_b32 v13, v9
+; EXACTCUTOFF-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; EXACTCUTOFF-NEXT:    v_swmmac_f16_16x16x32_f16 v[13:16], v[9:12], v[1:8], v18
+; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
+; EXACTCUTOFF-NEXT:    ds_store_b128 v0, v[13:16] offset:512
+; EXACTCUTOFF-NEXT:    ds_load_b128 v[9:12], v17 offset:4608
+; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000200) size(1) SyncID(0)
+; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000100) size(1) SyncID(0)
+; EXACTCUTOFF-NEXT:    s_wait_dscnt 0x0
+; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v15, v11
+; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v14, v10 :: v_dual_mov_b32 v13, v9
+; EXACTCUTOFF-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; EXACTCUTOFF-NEXT:    v_swmmac_f16_16x16x32_f16 v[13:16], v[9:12], v[1:8], v18
+; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
+; EXACTCUTOFF-NEXT:    ds_store_b128 v0, v[13:16] offset:1024
+; EXACTCUTOFF-NEXT:    ds_load_b128 v[9:12], v17 offset:7168
+; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000200) size(1) SyncID(0)
+; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000100) size(1) SyncID(0)
+; EXACTCUTOFF-NEXT:    s_wait_dscnt 0x0
+; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v15, v11
+; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v14, v10 :: v_dual_mov_b32 v13, v9
+; EXACTCUTOFF-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; EXACTCUTOFF-NEXT:    v_swmmac_f16_16x16x32_f16 v[13:16], v[9:12], v[1:8], v18
+; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
+; EXACTCUTOFF-NEXT:    ds_store_b128 v0, v[13:16] offset:1536
+; EXACTCUTOFF-NEXT:    ds_load_b128 v[9:12], v17 offset:10240
+; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000200) size(1) SyncID(0)
+; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000100) size(1) SyncID(0)
+; EXACTCUTOFF-NEXT:    s_wait_dscnt 0x0
+; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v15, v11
+; EXACTCUTOFF-NEXT:    v_dual_mov_b32 v14, v10 :: v_dual_mov_b32 v13, v9
+; EXACTCUTOFF-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; EXACTCUTOFF-NEXT:    v_swmmac_f16_16x16x32_f16 v[13:16], v[9:12], v[1:8], v18
+; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
+; EXACTCUTOFF-NEXT:    ds_store_b128 v0, v[13:16] offset:2048
+; EXACTCUTOFF-NEXT:    ; sched_group_barrier mask(0x00000200) size(1) SyncID(0)
+; EXACTCUTOFF-NEXT:    s_endpgm
+entry:
+  %idx = call i32 @llvm.amdgcn.workitem.id.x()
+  %load.b.addr = getelementptr <16 x half>, ptr addrspace(3) %in, i32 %idx
+  %load.b = load <16 x half>, ptr addrspace(3) %load.b.addr
+  %load.0.addr = getelementptr <8 x half>, ptr addrspace(3) %load.b.addr, i32 64
+  %load.0 = load <8 x half>, ptr addrspace(3) %load.0.addr
+  %load.1.addr = getelementptr <8 x half>, ptr addrspace(3) %load.0.addr, i32 96
+  %load.1 = load <8 x half>, ptr addrspace(3) %load.1.addr
+  %load.2.addr = getelementptr <8 x half>, ptr addrspace(3) %load.1.addr, i32 128
+  %load.2 = load <8 x half>, ptr addrspace(3) %load.2.addr
+  %load.3.addr = getelementptr <8 x half>, ptr addrspace(3) %load.2.addr, i32 160
+  %load.3 = load <8 x half>, ptr addrspace(3) %load.3.addr
+  %load.4.addr = getelementptr <8 x half>, ptr addrspace(3) %load.3.addr, i32 192
+  %load.4 = load <8 x half>, ptr addrspace(3) %load.4.addr
+  %mai.0 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %load.0, <16 x half> %load.b, <8 x half> %load.0, i1 0)
+  %mai.1 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %load.1, <16 x half> %load.b, <8 x half> %load.1, i1 0)
+  %mai.2 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %load.2, <16 x half> %load.b, <8 x half> %load.2, i1 0)
+  %mai.3 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %load.3, <16 x half> %load.b, <8 x half> %load.3, i1 0)
+  %mai.4 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %load.4, <16 x half> %load.b, <8 x half> %load.4, i1 0)
+  %store.0.addr = getelementptr <8 x half>, ptr addrspace(3) %out, i32 %idx
+  store <8 x half> %mai.0, ptr addrspace(3) %store.0.addr
+  %store.1.addr = getelementptr <8 x half>, ptr addrspace(3) %out, i32 32
+  store <8 x half> %mai.1, ptr addrspace(3) %store.1.addr
+  %store.2.addr = getelementptr <8 x half>, ptr addrspace(3) %out, i32 64
+  store <8 x half> %mai.2, ptr addrspace(3) %store.2.addr
+  %store.3.addr = getelementptr <8 x half>, ptr addrspace(3) %out, i32 96
+  store <8 x half> %mai.3, ptr addrspace(3) %store.3.addr
+  %store.4.addr = getelementptr <8 x half>, ptr addrspace(3) %out, i32 128
+  store <8 x half> %mai.4, ptr addrspace(3) %store.4.addr
+  ; 3 DS read
+  call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 3, i32 0)
+  ; 1 SWMMAC
+  call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0)
+  ; 1 DS write
+  call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 1, i32 0)
+  ; 1 DS read
+  call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0)
+  ; 1 SWMMAC
+  call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0)
+  ; 1 DS write
+  call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 1, i32 0)
+  ; 1 DS read
+  call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0)
+  ; 1 SWMMAC
+  call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0)
+  ; 1 DS write
+  call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 1, i32 0)
+  ; 1 DS read
+  call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0)
+  ; 1 SWMMAC
+  call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0)
+  ; 1 DS write
+  call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 1, i32 0)
+  ; 1 DS read
+  call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0)
+  ; 1 SWMMAC
+  call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0)
+  ; 1 DS write
+  call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 1, i32 0)
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-scoring.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-scoring.ll
new file mode 100644
index 00000000000000..ab03177d1edc51
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-scoring.ll
@@ -0,0 +1,69 @@
+; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -debug-only=amdgpu-promote-alloca -amdgpu-promote-alloca-to-vector-limit=512 -passes=amdgpu-promote-alloca %s -o - 2>&1 | FileCheck %s
+; REQUIRES: asserts
+
+; CHECK:      Scoring:   %simpleuser = alloca [4 x i64], align 4, addrspace(5)
+; CHECK-NEXT:   [+1]:   store i32 42, ptr addrspace(5) %simpleuser, align 4
+; CHECK-NEXT:   => Final Score:1
+; CHECK-NEXT: Scoring:   %manyusers = alloca [4 x i64], align 4, addrspace(5)
+; CHECK-NEXT:   [+1]:   store i32 %v0.ext, ptr addrspace(5) %manyusers.1, align 4
+; CHECK-NEXT:   [+1]:   %v0 = load i8, ptr addrspace(5) %manyusers.1, align 1
+; CHECK-NEXT:   [+1]:   store i32 %v1.ext, ptr addrspace(5) %manyusers.2, align 4
+; CHECK-NEXT:   [+1]:   %v1 = load i8, ptr addrspace(5) %manyusers.2, align 1
+; CHECK-NEXT:   => Final Score:4
+; CHECK-NEXT: Sorted Worklist:
+; CHECK-NEXT:     %manyusers = alloca [4 x i64], align 4, addrspace(5)
+; CHECK-NEXT:     %simpleuser = alloca [4 x i64], align 4, addrspace(5)
+define amdgpu_kernel void @simple_users_scores() #0 {
+entry:
+  ; should get a score of 1
+  %simpleuser = alloca [4 x i64], align 4, addrspace(5)
+  ; should get a score of 4
+  %manyusers = alloca [4 x i64], align 4, addrspace(5)
+
+  store i32 42, ptr addrspace(5) %simpleuser
+
+  %manyusers.1 = getelementptr i8, ptr addrspace(5) %manyusers, i64 2
+  %v0 = load i8, ptr addrspace(5)  %manyusers.1
+  %v0.ext = zext i8 %v0 to i32
+  store i32 %v0.ext, ptr addrspace(5) %manyusers.1
+
+  %manyusers.2 = getelementptr i8, ptr addrspace(5) %manyusers, i64 1
+  %v1 = load i8, ptr addrspace(5)  %manyusers.2
+  %v1.ext = zext i8 %v0 to i32
+  store i32 %v1.ext, ptr addrspace(5) %manyusers.2
+
+  ret void
+}
+
+; CHECK:      Scoring:   %stack = alloca [4 x i64], align 4, addrspace(5)
+; CHECK-NEXT:   [+5]:   store i32 32, ptr addrspace(5) %stack, align 4
+; CHECK-NEXT:   [+1]:   store i32 42, ptr addrspace(5) %stack, align 4
+; CHECK-NEXT:   [+9]:   store i32 32, ptr addrspace(5) %stack.1, align 4
+; CHECK-NEXT:   [+5]:   %outer.cmp = load i1, ptr addrspace(5) %stack.1, align 1
+; CHECK-NEXT:   [+1]:   store i32 64, ptr addrspace(5) %stack.2, align 4
+; CHECK-NEXT:   [+9]:   %inner.cmp = load i1, ptr addrspace(5) %stack.2, align 1
+; CHECK-NEXT:   => Final Score:30
+define amdgpu_kernel void @loop_users_alloca(i1 %x, i2) #0 {
+entry:
+  ; should get a score of 1
+  %stack = alloca [4 x i64], align 4, addrspace(5)
+  %stack.1 = getelementptr i8, ptr addrspace(5) %stack, i64 4
+  %stack.2 = getelementptr i8, ptr addrspace(5) %stack, i64 8
+
+  store i32 42, ptr addrspace(5) %stack
+  br label %loop.outer
+
+loop.outer:
+  store i32 32, ptr addrspace(5) %stack
+  %outer.cmp = load i1, ptr addrspace(5) %stack.1
+  br label %loop.inner
+
+loop.inner:
+  store i32 32, ptr addrspace(5) %stack.1
+  %inner.cmp = load i1, ptr addrspace(5) %stack.2
+  br i1 %inner.cmp, label %loop.inner, label %loop.outer
+
+exit:
+  store i32 64, ptr addrspace(5) %stack.2
+  ret void
+}
diff --git a/llvm/test/CodeGen/AVR/bug-81911.ll b/llvm/test/CodeGen/AVR/bug-81911.ll
new file mode 100644
index 00000000000000..2a22666a1ff927
--- /dev/null
+++ b/llvm/test/CodeGen/AVR/bug-81911.ll
@@ -0,0 +1,163 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc < %s -mtriple=avr -mcpu=atmega328 -O1 -verify-machineinstrs | FileCheck %s
+
+define internal i8 @main() {
+; CHECK-LABEL: main:
+; CHECK:       ; %bb.0: ; %bb0
+; CHECK-NEXT:    push r2
+; CHECK-NEXT:    push r3
+; CHECK-NEXT:    push r4
+; CHECK-NEXT:    push r5
+; CHECK-NEXT:    push r6
+; CHECK-NEXT:    push r7
+; CHECK-NEXT:    push r8
+; CHECK-NEXT:    push r9
+; CHECK-NEXT:    push r10
+; CHECK-NEXT:    push r11
+; CHECK-NEXT:    push r12
+; CHECK-NEXT:    push r13
+; CHECK-NEXT:    push r14
+; CHECK-NEXT:    push r15
+; CHECK-NEXT:    push r16
+; CHECK-NEXT:    push r17
+; CHECK-NEXT:    push r28
+; CHECK-NEXT:    push r29
+; CHECK-NEXT:    in r28, 61
+; CHECK-NEXT:    in r29, 62
+; CHECK-NEXT:    sbiw r28, 13
+; CHECK-NEXT:    in r0, 63
+; CHECK-NEXT:    cli
+; CHECK-NEXT:    out 62, r29
+; CHECK-NEXT:    out 63, r0
+; CHECK-NEXT:    out 61, r28
+; CHECK-NEXT:    ldi r16, 0
+; CHECK-NEXT:    ldi r17, 0
+; CHECK-NEXT:    ldi r18, -1
+; CHECK-NEXT:    ;APP
+; CHECK-NEXT:    ldi r24, 123
+; CHECK-NEXT:    ;NO_APP
+; CHECK-NEXT:    std Y+1, r24 ; 1-byte Folded Spill
+; CHECK-NEXT:    movw r24, r28
+; CHECK-NEXT:    adiw r24, 6
+; CHECK-NEXT:    std Y+3, r25 ; 2-byte Folded Spill
+; CHECK-NEXT:    std Y+2, r24 ; 2-byte Folded Spill
+; CHECK-NEXT:    movw r8, r16
+; CHECK-NEXT:    movw r6, r16
+; CHECK-NEXT:    movw r4, r16
+; CHECK-NEXT:    movw r2, r16
+; CHECK-NEXT:    rjmp .LBB0_2
+; CHECK-NEXT:  .LBB0_1: ; %bb1
+; CHECK-NEXT:    ; in Loop: Header=BB0_2 Depth=1
+; CHECK-NEXT:    andi r30, 1
+; CHECK-NEXT:    ldd r31, Y+4 ; 1-byte Folded Reload
+; CHECK-NEXT:    dec r31
+; CHECK-NEXT:    cpi r30, 0
+; CHECK-NEXT:    movw r8, r18
+; CHECK-NEXT:    movw r6, r20
+; CHECK-NEXT:    movw r4, r22
+; CHECK-NEXT:    movw r2, r24
+; CHECK-NEXT:    mov r18, r31
+; CHECK-NEXT:    brne .LBB0_2
+; CHECK-NEXT:    rjmp .LBB0_4
+; CHECK-NEXT:  .LBB0_2: ; %bb1
+; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    std Y+4, r18 ; 1-byte Folded Spill
+; CHECK-NEXT:    movw r18, r8
+; CHECK-NEXT:    movw r20, r6
+; CHECK-NEXT:    movw r22, r4
+; CHECK-NEXT:    movw r24, r2
+; CHECK-NEXT:    ldi r26, 10
+; CHECK-NEXT:    ldi r27, 0
+; CHECK-NEXT:    movw r10, r26
+; CHECK-NEXT:    movw r12, r16
+; CHECK-NEXT:    movw r14, r16
+; CHECK-NEXT:    call __udivdi3
+; CHECK-NEXT:    std Y+13, r25
+; CHECK-NEXT:    std Y+12, r24
+; CHECK-NEXT:    std Y+11, r23
+; CHECK-NEXT:    std Y+10, r22
+; CHECK-NEXT:    std Y+9, r21
+; CHECK-NEXT:    std Y+8, r20
+; CHECK-NEXT:    std Y+7, r19
+; CHECK-NEXT:    std Y+6, r18
+; CHECK-NEXT:    ldd r30, Y+2 ; 2-byte Folded Reload
+; CHECK-NEXT:    ldd r31, Y+3 ; 2-byte Folded Reload
+; CHECK-NEXT:    ;APP
+; CHECK-NEXT:    ;NO_APP
+; CHECK-NEXT:    ldi r30, 1
+; CHECK-NEXT:    cp r8, r1
+; CHECK-NEXT:    cpc r9, r1
+; CHECK-NEXT:    cpc r6, r16
+; CHECK-NEXT:    cpc r7, r17
+; CHECK-NEXT:    cpc r4, r16
+; CHECK-NEXT:    cpc r5, r17
+; CHECK-NEXT:    cpc r2, r16
+; CHECK-NEXT:    cpc r3, r17
+; CHECK-NEXT:    breq .LBB0_3
+; CHECK-NEXT:    rjmp .LBB0_1
+; CHECK-NEXT:  .LBB0_3: ; %bb1
+; CHECK-NEXT:    ; in Loop: Header=BB0_2 Depth=1
+; CHECK-NEXT:    mov r30, r1
+; CHECK-NEXT:    rjmp .LBB0_1
+; CHECK-NEXT:  .LBB0_4: ; %bb3
+; CHECK-NEXT:    ldd r24, Y+1 ; 1-byte Folded Reload
+; CHECK-NEXT:    std Y+5, r24
+; CHECK-NEXT:    movw r24, r28
+; CHECK-NEXT:    adiw r24, 5
+; CHECK-NEXT:    ;APP
+; CHECK-NEXT:    ;NO_APP
+; CHECK-NEXT:    ldd r24, Y+5
+; CHECK-NEXT:    adiw r28, 13
+; CHECK-NEXT:    in r0, 63
+; CHECK-NEXT:    cli
+; CHECK-NEXT:    out 62, r29
+; CHECK-NEXT:    out 63, r0
+; CHECK-NEXT:    out 61, r28
+; CHECK-NEXT:    pop r29
+; CHECK-NEXT:    pop r28
+; CHECK-NEXT:    pop r17
+; CHECK-NEXT:    pop r16
+; CHECK-NEXT:    pop r15
+; CHECK-NEXT:    pop r14
+; CHECK-NEXT:    pop r13
+; CHECK-NEXT:    pop r12
+; CHECK-NEXT:    pop r11
+; CHECK-NEXT:    pop r10
+; CHECK-NEXT:    pop r9
+; CHECK-NEXT:    pop r8
+; CHECK-NEXT:    pop r7
+; CHECK-NEXT:    pop r6
+; CHECK-NEXT:    pop r5
+; CHECK-NEXT:    pop r4
+; CHECK-NEXT:    pop r3
+; CHECK-NEXT:    pop r2
+; CHECK-NEXT:    ret
+bb0:
+  %0 = alloca i64
+  %1 = alloca i8
+  %2 = tail call i8 asm sideeffect "ldi ${0}, 123", "=&r,~{sreg},~{memory}"()
+
+  br label %bb1
+
+bb1:
+  %3 = phi i64 [ %5, %bb1 ], [ 0, %bb0 ]
+  %4 = phi i8 [ %6, %bb1 ], [ 0, %bb0 ]
+  %5 = udiv i64 %3, 10
+  %6 = add i8 %4, 1
+
+  store i64 %5, ptr %0
+  call void asm sideeffect "", "r,~{memory}"(ptr %0)
+
+  %7 = icmp eq i64 %3, 0
+  %8 = icmp eq i8 %6, 0
+
+  br i1 %7, label %bb3, label %bb1
+
+bb3:
+  store i8 %2, ptr %1
+  call void asm sideeffect "", "r,~{memory}"(ptr %1)
+
+  %9 = load i8, ptr %1
+
+  ret i8 %9
+}
diff --git a/llvm/test/CodeGen/BPF/addr-space-globals.ll b/llvm/test/CodeGen/BPF/addr-space-globals.ll
index 878ba0dfce6cd1..73e80b7a0400b3 100644
--- a/llvm/test/CodeGen/BPF/addr-space-globals.ll
+++ b/llvm/test/CodeGen/BPF/addr-space-globals.ll
@@ -18,7 +18,7 @@
 
 ; Verify that a,b,c reside in the same section
 
-; CHECK:     .section .arena.272,"aw",@progbits
+; CHECK:     .section .addr_space.272,"aw",@progbits
 ; CHECK-NOT: .section
 ; CHECK:     .globl  a
 ; CHECK:     .ascii  "\001\002"
diff --git a/llvm/test/CodeGen/BPF/addr-space-globals2.ll b/llvm/test/CodeGen/BPF/addr-space-globals2.ll
index d1e2318948751e..5944cb27ee18a8 100644
--- a/llvm/test/CodeGen/BPF/addr-space-globals2.ll
+++ b/llvm/test/CodeGen/BPF/addr-space-globals2.ll
@@ -14,12 +14,12 @@
 
 ; Verify that a,b reside in separate sections
 
-; CHECK:     .section .arena.1,"aw",@progbits
+; CHECK:     .section .addr_space.1,"aw",@progbits
 ; CHECK-NOT: .section
 ; CHECK:     .globl  a
 ; CHECK:     .ascii  "\001\002"
 
-; CHECK:     .section .arena.2,"aw",@progbits
+; CHECK:     .section .addr_space.2,"aw",@progbits
 ; CHECK-NOT: .section
 ; CHECK:     .globl  b
 ; CHECK:     .ascii  "\003\004"
diff --git a/llvm/test/CodeGen/DirectX/ShaderFlags/double-extensions.ll b/llvm/test/CodeGen/DirectX/ShaderFlags/double-extensions.ll
index 865fefeac335dd..d027216e4213d5 100644
--- a/llvm/test/CodeGen/DirectX/ShaderFlags/double-extensions.ll
+++ b/llvm/test/CodeGen/DirectX/ShaderFlags/double-extensions.ll
@@ -3,10 +3,11 @@
 
 target triple = "dxil-pc-shadermodel6.7-library"
 
-; CHECK: ; Shader Flags Value: 0x00000021
+; CHECK: ; Shader Flags Value: 0x00000044
 ; CHECK: ; Note: shader requires additional functionality:
 ; CHECK-NEXT: ;       Double-precision floating point
 ; CHECK-NEXT: ;       Double-precision extensions for 11.1
+; CHECK-NEXT: ; Note: extra DXIL module flags:
 ; CHECK-NEXT: {{^;$}}
 define double @div(double %a, double %b) {
   %res = fdiv double %a, %b
diff --git a/llvm/test/CodeGen/DirectX/ShaderFlags/doubles.ll b/llvm/test/CodeGen/DirectX/ShaderFlags/doubles.ll
index f90db61661f09f..c1a4c219a16951 100644
--- a/llvm/test/CodeGen/DirectX/ShaderFlags/doubles.ll
+++ b/llvm/test/CodeGen/DirectX/ShaderFlags/doubles.ll
@@ -3,10 +3,12 @@
 
 target triple = "dxil-pc-shadermodel6.7-library"
 
-; CHECK: ; Shader Flags Value: 0x00000001
+; CHECK: ; Shader Flags Value: 0x00000004
 ; CHECK: ; Note: shader requires additional functionality:
 ; CHECK-NEXT: ;       Double-precision floating point
+; CHECK-NEXT: ; Note: extra DXIL module flags:
 ; CHECK-NEXT: {{^;$}}
+
 define double @add(double %a, double %b) {
   %sum = fadd double %a, %b
   ret double %sum
diff --git a/llvm/test/CodeGen/DirectX/any.ll b/llvm/test/CodeGen/DirectX/any.ll
new file mode 100644
index 00000000000000..e8d87075d65cfe
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/any.ll
@@ -0,0 +1,113 @@
+; RUN: opt -S -dxil-op-lower < %s | FileCheck %s
+
+; Make sure dxil operation function calls for any are generated for float and half.
+
+; CHECK-LABEL: any_bool
+; CHECK: icmp ne i1 %{{.*}}, false
+define noundef i1 @any_bool(i1 noundef %p0) {
+entry:
+  %p0.addr = alloca i8, align 1
+  %frombool = zext i1 %p0 to i8
+  store i8 %frombool, ptr %p0.addr, align 1
+  %0 = load i8, ptr %p0.addr, align 1
+  %tobool = trunc i8 %0 to i1
+  %dx.any = call i1 @llvm.dx.any.i1(i1 %tobool)
+  ret i1 %dx.any
+}
+
+; CHECK-LABEL: any_int64_t
+; CHECK: icmp ne i64 %{{.*}}, 0
+define noundef i1 @any_int64_t(i64 noundef %p0) {
+entry:
+  %p0.addr = alloca i64, align 8
+  store i64 %p0, ptr %p0.addr, align 8
+  %0 = load i64, ptr %p0.addr, align 8
+  %dx.any = call i1 @llvm.dx.any.i64(i64 %0)
+  ret i1 %dx.any
+}
+
+; CHECK-LABEL: any_int
+; CHECK: icmp ne i32 %{{.*}}, 0
+define noundef i1 @any_int(i32 noundef %p0) {
+entry:
+  %p0.addr = alloca i32, align 4
+  store i32 %p0, ptr %p0.addr, align 4
+  %0 = load i32, ptr %p0.addr, align 4
+  %dx.any = call i1 @llvm.dx.any.i32(i32 %0)
+  ret i1 %dx.any
+}
+
+; CHECK-LABEL: any_int16_t
+; CHECK: icmp ne i16 %{{.*}}, 0
+define noundef i1 @any_int16_t(i16 noundef %p0) {
+entry:
+  %p0.addr = alloca i16, align 2
+  store i16 %p0, ptr %p0.addr, align 2
+  %0 = load i16, ptr %p0.addr, align 2
+  %dx.any = call i1 @llvm.dx.any.i16(i16 %0)
+  ret i1 %dx.any
+}
+
+; CHECK-LABEL: any_double
+; CHECK: fcmp une double %{{.*}}, 0.000000e+00
+define noundef i1 @any_double(double noundef %p0) {
+entry:
+  %p0.addr = alloca double, align 8
+  store double %p0, ptr %p0.addr, align 8
+  %0 = load double, ptr %p0.addr, align 8
+  %dx.any = call i1 @llvm.dx.any.f64(double %0)
+  ret i1 %dx.any
+}
+
+; CHECK-LABEL: any_float
+; CHECK: fcmp une float %{{.*}}, 0.000000e+00
+define noundef i1 @any_float(float noundef %p0) {
+entry:
+  %p0.addr = alloca float, align 4
+  store float %p0, ptr %p0.addr, align 4
+  %0 = load float, ptr %p0.addr, align 4
+  %dx.any = call i1 @llvm.dx.any.f32(float %0)
+  ret i1 %dx.any
+}
+
+; CHECK-LABEL: any_half
+; CHECK: fcmp une half %{{.*}}, 0xH0000
+define noundef i1 @any_half(half noundef %p0) {
+entry:
+  %p0.addr = alloca half, align 2
+  store half %p0, ptr %p0.addr, align 2
+  %0 = load half, ptr %p0.addr, align 2
+  %dx.any = call i1 @llvm.dx.any.f16(half %0)
+  ret i1 %dx.any
+}
+
+; CHECK-LABEL: any_bool4
+; CHECK: icmp ne <4 x i1> %extractvec, zeroinitialize
+; CHECK: extractelement <4 x i1> %{{.*}}, i64 0
+; CHECK: extractelement <4 x i1> %{{.*}}, i64 1
+; CHECK: or i1  %{{.*}}, %{{.*}}
+; CHECK: extractelement <4 x i1> %{{.*}}, i64 2
+; CHECK: or i1  %{{.*}}, %{{.*}}
+; CHECK: extractelement <4 x i1> %{{.*}}, i64 3
+; CHECK: or i1  %{{.*}}, %{{.*}}
+define noundef i1 @any_bool4(<4 x i1> noundef %p0) {
+entry:
+  %p0.addr = alloca i8, align 1
+  %insertvec = shufflevector <4 x i1> %p0, <4 x i1> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+  %0 = bitcast <8 x i1> %insertvec to i8
+  store i8 %0, ptr %p0.addr, align 1
+  %load_bits = load i8, ptr %p0.addr, align 1
+  %1 = bitcast i8 %load_bits to <8 x i1>
+  %extractvec = shufflevector <8 x i1> %1, <8 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %dx.any = call i1 @llvm.dx.any.v4i1(<4 x i1> %extractvec)
+  ret i1 %dx.any
+}
+
+declare i1 @llvm.dx.any.v4i1(<4 x i1>)
+declare i1 @llvm.dx.any.i1(i1)
+declare i1 @llvm.dx.any.i16(i16)
+declare i1 @llvm.dx.any.i32(i32)
+declare i1 @llvm.dx.any.i64(i64)
+declare i1 @llvm.dx.any.f16(half)
+declare i1 @llvm.dx.any.f32(float)
+declare i1 @llvm.dx.any.f64(double)
diff --git a/llvm/test/CodeGen/DirectX/clamp-vec.ll b/llvm/test/CodeGen/DirectX/clamp-vec.ll
new file mode 100644
index 00000000000000..d4f33a18b71573
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/clamp-vec.ll
@@ -0,0 +1,74 @@
+; RUN: opt -S  -dxil-intrinsic-expansion  < %s | FileCheck %s
+
+; Make sure dxil operation function calls for clamp are generated for float/int/uint vectors.
+
+; CHECK-LABEL: clamp_half3
+define noundef <3 x half> @clamp_half3(<3 x half> noundef %a, <3 x half> noundef %b, <3 x half> noundef %c) {
+entry:
+  ; CHECK: call <3 x half> @llvm.maxnum.v3f16(<3 x half>  %a, <3 x half>  %b)
+  ; CHECK: call <3 x half> @llvm.minnum.v3f16(<3 x half>  %{{.*}}, <3 x half>  %c)
+  %dx.clamp = call <3 x half> @llvm.dx.clamp.v3f16(<3 x half> %a, <3 x half> %b, <3 x half> %c)
+  ret <3 x half> %dx.clamp
+}
+
+; CHECK-LABEL: clamp_float4
+define noundef <4 x float> @clamp_float4(<4 x float> noundef %a, <4 x float> noundef %b, <4 x float> noundef %c) {
+entry:
+  ; CHECK: call <4 x float> @llvm.maxnum.v4f32(<4 x float>  %a, <4 x float>  %b)
+  ; CHECK: call <4 x float> @llvm.minnum.v4f32(<4 x float>  %{{.*}}, <4 x float>  %c)
+  %dx.clamp = call <4 x float> @llvm.dx.clamp.v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %c)
+  ret <4 x float> %dx.clamp
+}
+
+; CHECK-LABEL: clamp_double2
+define noundef <2 x double> @clamp_double2(<2 x double> noundef %a, <2 x double> noundef %b, <2 x double> noundef %c) {
+entry:
+  ; CHECK: call <2 x double> @llvm.maxnum.v2f64(<2 x double>  %a, <2 x double>  %b)
+  ; CHECK: call <2 x double> @llvm.minnum.v2f64(<2 x double>  %{{.*}}, <2 x double>  %c)
+  %dx.clamp = call <2 x double> @llvm.dx.clamp.v2f64(<2 x double> %a, <2 x double> %b, <2 x double> %c)
+  ret <2 x double> %dx.clamp
+}
+
+; CHECK-LABEL: clamp_int4
+define noundef <4 x i32> @clamp_int4(<4 x i32> noundef %a, <4 x i32> noundef %b, <4 x i32> noundef %c) {
+entry:
+  ; CHECK: call <4 x i32> @llvm.smax.v4i32(<4 x i32> %a, <4 x i32> %b)
+  ; CHECK: call <4 x i32> @llvm.smin.v4i32(<4 x i32> %{{.*}}, <4 x i32> %c)
+  %dx.clamp = call <4 x i32> @llvm.dx.clamp.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c)
+  ret <4 x i32> %dx.clamp
+}
+
+; CHECK-LABEL: clamp_uint16_t3
+define noundef <3 x i16> @clamp_uint16_t3(<3 x i16> noundef %a, <3 x i16> noundef %b, <3 x i16> noundef %c) {
+entry:
+  ; CHECK: call <3 x i16> @llvm.umax.v3i16(<3 x i16>  %a, <3 x i16>  %b)
+  ; CHECK: call <3 x i16> @llvm.umin.v3i16(<3 x i16>  %{{.*}}, <3 x i16>  %c)
+  %dx.clamp = call <3 x i16> @llvm.dx.uclamp.v3i16(<3 x i16> %a, <3 x i16> %b, <3 x i16> %c)
+  ret <3 x i16> %dx.clamp
+}
+
+; CHECK-LABEL: clamp_uint4
+define noundef <4 x i32> @clamp_uint4(<4 x i32> noundef %a, <4 x i32> noundef %b, <4 x i32> noundef %c) {
+entry:
+  ; CHECK: call <4 x i32> @llvm.umax.v4i32(<4 x i32>  %a, <4 x i32>  %b)
+  ; CHECK: call <4 x i32> @llvm.umin.v4i32(<4 x i32>  %{{.*}}, <4 x i32>  %c)
+  %dx.clamp = call <4 x i32> @llvm.dx.uclamp.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c)
+  ret <4 x i32> %dx.clamp
+}
+
+; CHECK-LABEL: clamp_uint64_t4
+define noundef <2 x i64> @clamp_uint64_t4(<2 x i64> noundef %a, <2 x i64> noundef %b, <2 x i64> noundef %c) {
+entry:
+  ; CHECK: call <2 x i64> @llvm.umax.v2i64(<2 x i64>  %a, <2 x i64>  %b)
+  ; CHECK: call <2 x i64> @llvm.umin.v2i64(<2 x i64>  %{{.*}}, <2 x i64>  %c)
+  %dx.clamp = call <2 x i64> @llvm.dx.uclamp.v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c)
+  ret <2 x i64> %dx.clamp
+}
+
+declare <3 x half> @llvm.dx.clamp.v3f16(<3 x half>, <3 x half>, <3 x half>)
+declare <4 x float> @llvm.dx.clamp.v4f32(<4 x float>, <4 x float>, <4 x float>)
+declare <2 x double> @llvm.dx.clamp.v2f64(<2 x double>, <2 x double>, <2 x double>)
+declare <4 x i32> @llvm.dx.clamp.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
+declare <3 x i16> @llvm.dx.uclamp.v3i32(<3 x i16>, <3 x i32>, <3 x i16>)
+declare <4 x i32> @llvm.dx.uclamp.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
+declare <2 x i64> @llvm.dx.uclamp.v2i64(<2 x i64>, <2 x i64>, <2 x i64>)
diff --git a/llvm/test/CodeGen/DirectX/clamp.ll b/llvm/test/CodeGen/DirectX/clamp.ll
new file mode 100644
index 00000000000000..f122313b8d7dcc
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/clamp.ll
@@ -0,0 +1,94 @@
+; RUN: opt -S -dxil-op-lower < %s | FileCheck %s
+
+; Make sure dxil operation function calls for clamp/uclamp are generated for half/float/double/i16/i32/i64.
+
+; CHECK-LABEL:test_clamp_i16
+define noundef i16 @test_clamp_i16(i16 noundef %a, i16 noundef %b, i16 noundef %c) {
+entry:
+; CHECK: call i16 @dx.op.binary.i16(i32 37, i16 %{{.*}}, i16 %{{.*}})
+; CHECK: call i16 @dx.op.binary.i16(i32 38, i16 %{{.*}}, i16 %{{.*}})
+  %0 = call i16 @llvm.dx.clamp.i16(i16 %a, i16 %b, i16 %c)
+  ret i16 %0
+}
+
+; CHECK-LABEL:test_clamp_i32
+define noundef i32 @test_clamp_i32(i32 noundef %a, i32 noundef %b, i32 noundef %c) {
+entry:
+; CHECK: call i32 @dx.op.binary.i32(i32 37, i32 %{{.*}}, i32 %{{.*}})
+; CHECK: call i32 @dx.op.binary.i32(i32 38, i32 %{{.*}}, i32 %{{.*}})
+  %0 = call i32 @llvm.dx.clamp.i32(i32 %a, i32 %b, i32 %c)
+  ret i32 %0
+}
+
+; CHECK-LABEL:test_clamp_i64
+define noundef i64 @test_clamp_i64(i64 noundef %a, i64 noundef %b, i64 noundef %c) {
+entry:
+; CHECK: call i64 @dx.op.binary.i64(i32 37, i64 %a, i64 %b)
+; CHECK: call i64 @dx.op.binary.i64(i32 38, i64 %{{.*}}, i64 %c)
+  %0 = call i64 @llvm.dx.clamp.i64(i64 %a, i64 %b, i64 %c)
+  ret i64 %0
+}
+
+; CHECK-LABEL:test_clamp_half
+define noundef half @test_clamp_half(half noundef %a, half noundef %b, half noundef %c) {
+entry:
+; CHECK: call half @dx.op.binary.f16(i32 35, half %{{.*}}, half %{{.*}})
+; CHECK: call half @dx.op.binary.f16(i32 36, half %{{.*}}, half %{{.*}})
+  %0 = call half @llvm.dx.clamp.f16(half %a, half %b, half %c)
+  ret half %0
+}
+
+; CHECK-LABEL:test_clamp_float
+define noundef float @test_clamp_float(float noundef %a, float noundef %b, float noundef %c) {
+entry:
+; CHECK: call float @dx.op.binary.f32(i32 35, float %{{.*}}, float %{{.*}})
+; CHECK: call float @dx.op.binary.f32(i32 36, float %{{.*}}, float %{{.*}})
+  %0 = call float @llvm.dx.clamp.f32(float %a, float %b, float %c)
+  ret float %0
+}
+
+; CHECK-LABEL:test_clamp_double
+define noundef double @test_clamp_double(double noundef %a, double noundef %b, double noundef %c) {
+entry:
+; CHECK: call double @dx.op.binary.f64(i32 35, double %{{.*}}, double %{{.*}})
+; CHECK: call double @dx.op.binary.f64(i32 36, double %{{.*}}, double %{{.*}})
+  %0 = call double @llvm.dx.clamp.f64(double %a, double %b, double %c)
+  ret double %0
+}
+
+; CHECK-LABEL:test_uclamp_i16
+define noundef i16 @test_uclamp_i16(i16 noundef %a, i16 noundef %b, i16 noundef %c) {
+entry:
+; CHECK: call i16 @dx.op.binary.i16(i32 39, i16 %{{.*}}, i16 %{{.*}})
+; CHECK: call i16 @dx.op.binary.i16(i32 40, i16 %{{.*}}, i16 %{{.*}})
+  %0 = call i16 @llvm.dx.uclamp.i16(i16 %a, i16 %b, i16 %c)
+  ret i16 %0
+}
+
+; CHECK-LABEL:test_uclamp_i32
+define noundef i32 @test_uclamp_i32(i32 noundef %a, i32 noundef %b, i32 noundef %c) {
+entry:
+; CHECK: call i32 @dx.op.binary.i32(i32 39, i32 %{{.*}}, i32 %{{.*}})
+; CHECK: call i32 @dx.op.binary.i32(i32 40, i32 %{{.*}}, i32 %{{.*}})
+  %0 = call i32 @llvm.dx.uclamp.i32(i32 %a, i32 %b, i32 %c)
+  ret i32 %0
+}
+
+; CHECK-LABEL:test_uclamp_i64
+define noundef i64 @test_uclamp_i64(i64 noundef %a, i64 noundef %b, i64 noundef %c) {
+entry:
+; CHECK: call i64 @dx.op.binary.i64(i32 39, i64 %a, i64 %b)
+; CHECK: call i64 @dx.op.binary.i64(i32 40, i64 %{{.*}}, i64 %c)
+  %0 = call i64 @llvm.dx.uclamp.i64(i64 %a, i64 %b, i64 %c)
+  ret i64 %0
+}
+
+declare half @llvm.dx.clamp.f16(half, half, half)
+declare float @llvm.dx.clamp.f32(float, float, float)
+declare double @llvm.dx.clamp.f64(double, double, double)
+declare i16 @llvm.dx.clamp.i16(i16, i16, i16)
+declare i32 @llvm.dx.clamp.i32(i32, i32, i32)
+declare i64 @llvm.dx.clamp.i64(i64, i64, i64)
+declare i16 @llvm.dx.uclamp.i16(i16, i16, i16)
+declare i32 @llvm.dx.uclamp.i32(i32, i32, i32)
+declare i64 @llvm.dx.uclamp.i64(i64, i64, i64)
diff --git a/llvm/test/CodeGen/DirectX/exp-vec.ll b/llvm/test/CodeGen/DirectX/exp-vec.ll
new file mode 100644
index 00000000000000..c9371557190549
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/exp-vec.ll
@@ -0,0 +1,17 @@
+; RUN: opt -S  -dxil-intrinsic-expansion  < %s | FileCheck %s
+
+; Make sure dxil operation function calls for exp are generated for float and half.
+
+; CHECK-LABEL: exp_float4
+; CHECK: fmul <4 x float> <float 0x3FF7154760000000, float 0x3FF7154760000000, float 0x3FF7154760000000, float 0x3FF7154760000000>,  %{{.*}}
+; CHECK: call <4 x float> @llvm.exp2.v4f32(<4 x float>  %{{.*}})
+define noundef <4 x float> @exp_float4(<4 x float> noundef %p0) {
+entry:
+  %p0.addr = alloca <4 x float>, align 16
+  store <4 x float> %p0, ptr %p0.addr, align 16
+  %0 = load <4 x float>, ptr %p0.addr, align 16
+  %elt.exp = call <4 x float> @llvm.exp.v4f32(<4 x float> %0)
+  ret <4 x float> %elt.exp
+}
+
+declare <4 x float> @llvm.exp.v4f32(<4 x float>)
diff --git a/llvm/test/CodeGen/DirectX/exp.ll b/llvm/test/CodeGen/DirectX/exp.ll
new file mode 100644
index 00000000000000..fdafc1438cf0e8
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/exp.ll
@@ -0,0 +1,31 @@
+; RUN: opt -S -dxil-op-lower < %s | FileCheck %s
+
+; Make sure dxil operation function calls for exp are generated for float and half.
+
+; CHECK-LABEL: exp_float
+; CHECK: fmul float 0x3FF7154760000000, %{{.*}}
+; CHECK: call float @dx.op.unary.f32(i32 21, float %{{.*}})
+define noundef float @exp_float(float noundef %a) {
+entry:
+  %a.addr = alloca float, align 4
+  store float %a, ptr %a.addr, align 4
+  %0 = load float, ptr %a.addr, align 4
+  %elt.exp = call float @llvm.exp.f32(float %0)
+  ret float %elt.exp
+}
+
+; CHECK-LABEL: exp_half
+; CHECK: fmul half 0xH3DC5, %{{.*}}
+; CHECK: call half @dx.op.unary.f16(i32 21, half %{{.*}})
+; Function Attrs: noinline nounwind optnone
+define noundef half @exp_half(half noundef %a) {
+entry:
+  %a.addr = alloca half, align 2
+  store half %a, ptr %a.addr, align 2
+  %0 = load half, ptr %a.addr, align 2
+  %elt.exp = call half @llvm.exp.f16(half %0)
+  ret half %elt.exp
+}
+
+declare half @llvm.exp.f16(half)
+declare float @llvm.exp.f32(float)
diff --git a/llvm/test/CodeGen/DirectX/fmax.ll b/llvm/test/CodeGen/DirectX/fmax.ll
new file mode 100644
index 00000000000000..aff722c29309c0
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/fmax.ll
@@ -0,0 +1,31 @@
+; RUN: opt -S -dxil-op-lower < %s | FileCheck %s
+
+; Make sure dxil operation function calls for fmax are generated for half/float/double.
+
+; CHECK-LABEL:test_fmax_half
+define noundef half @test_fmax_half(half noundef %a, half noundef %b) {
+entry:
+; CHECK: call half @dx.op.binary.f16(i32 35, half %{{.*}}, half %{{.*}})
+  %0 = call half @llvm.maxnum.f16(half %a, half %b)
+  ret half %0
+}
+
+; CHECK-LABEL:test_fmax_float
+define noundef float @test_fmax_float(float noundef %a, float noundef %b) {
+entry:
+; CHECK: call float @dx.op.binary.f32(i32 35, float %{{.*}}, float %{{.*}})
+  %0 = call float @llvm.maxnum.f32(float %a, float %b)
+  ret float %0
+}
+
+; CHECK-LABEL:test_fmax_double
+define noundef double @test_fmax_double(double noundef %a, double noundef %b) {
+entry:
+; CHECK: call double @dx.op.binary.f64(i32 35, double %{{.*}}, double %{{.*}})
+  %0 = call double @llvm.maxnum.f64(double %a, double %b)
+  ret double %0
+}
+
+declare half @llvm.maxnum.f16(half, half)
+declare float @llvm.maxnum.f32(float, float)
+declare double @llvm.maxnum.f64(double, double)
diff --git a/llvm/test/CodeGen/DirectX/fmin.ll b/llvm/test/CodeGen/DirectX/fmin.ll
new file mode 100644
index 00000000000000..2f7c209f0278ae
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/fmin.ll
@@ -0,0 +1,31 @@
+; RUN: opt -S -dxil-op-lower < %s | FileCheck %s
+
+; Make sure dxil operation function calls for fmin are generated for half/float/double.
+
+; CHECK-LABEL:test_fmin_half
+define noundef half @test_fmin_half(half noundef %a, half noundef %b) {
+entry:
+; CHECK: call half @dx.op.binary.f16(i32 36, half %{{.*}}, half %{{.*}})
+  %0 = call half @llvm.minnum.f16(half %a, half %b)
+  ret half %0
+}
+
+; CHECK-LABEL:test_fmin_float
+define noundef float @test_fmin_float(float noundef %a, float noundef %b) {
+entry:
+; CHECK: call float @dx.op.binary.f32(i32 36, float %{{.*}}, float %{{.*}})
+  %0 = call float @llvm.minnum.f32(float %a, float %b)
+  ret float %0
+}
+
+; CHECK-LABEL:test_fmin_double
+define noundef double @test_fmin_double(double noundef %a, double noundef %b) {
+entry:
+; CHECK: call double @dx.op.binary.f64(i32 36, double %{{.*}}, double %{{.*}})
+  %0 = call double @llvm.minnum.f64(double %a, double %b)
+  ret double %0
+}
+
+declare half @llvm.minnum.f16(half, half)
+declare float @llvm.minnum.f32(float, float)
+declare double @llvm.minnum.f64(double, double)
diff --git a/llvm/test/CodeGen/DirectX/idot.ll b/llvm/test/CodeGen/DirectX/idot.ll
new file mode 100644
index 00000000000000..9f89a8d6d340d5
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/idot.ll
@@ -0,0 +1,100 @@
+; RUN: opt -S  -dxil-intrinsic-expansion  < %s | FileCheck %s --check-prefixes=CHECK,EXPCHECK
+; RUN: opt -S  -dxil-op-lower  < %s | FileCheck %s --check-prefixes=CHECK,DOPCHECK
+
+; Make sure dxil operation function calls for dot are generated for int/uint vectors.
+
+; CHECK-LABEL: dot_int16_t2
+define noundef i16 @dot_int16_t2(<2 x i16> noundef %a, <2 x i16> noundef %b) {
+entry:
+; CHECK: extractelement <2 x i16> %a, i64 0
+; CHECK: extractelement <2 x i16> %b, i64 0
+; CHECK: mul i16 %{{.*}}, %{{.*}}
+; CHECK: extractelement <2 x i16> %a, i64 1
+; CHECK: extractelement <2 x i16> %b, i64 1
+; EXPCHECK: call i16 @llvm.dx.imad.i16(i16 %{{.*}}, i16 %{{.*}}, i16 %{{.*}})
+; DOPCHECK: call i16 @dx.op.tertiary.i16(i32 48, i16 %{{.*}}, i16 %{{.*}}, i16 %{{.*}})
+  %dx.dot = call i16 @llvm.dx.sdot.v3i16(<2 x i16> %a, <2 x i16> %b)
+  ret i16 %dx.dot
+}
+
+; CHECK-LABEL: sdot_int4
+define noundef i32 @sdot_int4(<4 x i32> noundef %a, <4 x i32> noundef %b) {
+entry:
+; CHECK: extractelement <4 x i32> %a, i64 0
+; CHECK: extractelement <4 x i32> %b, i64 0
+; CHECK: mul i32 %{{.*}}, %{{.*}}
+; CHECK: extractelement <4 x i32> %a, i64 1
+; CHECK: extractelement <4 x i32> %b, i64 1
+; EXPCHECK: call i32 @llvm.dx.imad.i32(i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}})
+; DOPCHECK: call i32 @dx.op.tertiary.i32(i32 48, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}})
+; CHECK: extractelement <4 x i32> %a, i64 2
+; CHECK: extractelement <4 x i32> %b, i64 2
+; EXPCHECK: call i32 @llvm.dx.imad.i32(i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}})
+; DOPCHECK: call i32 @dx.op.tertiary.i32(i32 48, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}})
+; CHECK: extractelement <4 x i32> %a, i64 3
+; CHECK: extractelement <4 x i32> %b, i64 3
+; EXPCHECK: call i32 @llvm.dx.imad.i32(i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}})
+; DOPCHECK: call i32 @dx.op.tertiary.i32(i32 48, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}})
+  %dx.dot = call i32 @llvm.dx.sdot.v4i32(<4 x i32> %a, <4 x i32> %b)
+  ret i32 %dx.dot
+}
+
+; CHECK-LABEL: dot_uint16_t3
+define noundef i16 @dot_uint16_t3(<3 x i16> noundef %a, <3 x i16> noundef %b) {
+entry:
+; CHECK: extractelement <3 x i16> %a, i64 0
+; CHECK: extractelement <3 x i16> %b, i64 0
+; CHECK: mul i16 %{{.*}}, %{{.*}}
+; CHECK: extractelement <3 x i16> %a, i64 1
+; CHECK: extractelement <3 x i16> %b, i64 1
+; EXPCHECK: call i16 @llvm.dx.umad.i16(i16 %{{.*}}, i16 %{{.*}}, i16 %{{.*}})
+; DOPCHECK: call i16 @dx.op.tertiary.i16(i32 49, i16 %{{.*}}, i16 %{{.*}}, i16 %{{.*}})
+; CHECK: extractelement <3 x i16> %a, i64 2
+; CHECK: extractelement <3 x i16> %b, i64 2
+; EXPCHECK: call i16 @llvm.dx.umad.i16(i16 %{{.*}}, i16 %{{.*}}, i16 %{{.*}})
+; DOPCHECK: call i16 @dx.op.tertiary.i16(i32 49, i16 %{{.*}}, i16 %{{.*}}, i16 %{{.*}})
+  %dx.dot = call i16 @llvm.dx.udot.v3i16(<3 x i16> %a, <3 x i16> %b)
+  ret i16 %dx.dot
+}
+
+; CHECK-LABEL: dot_uint4
+define noundef i32 @dot_uint4(<4 x i32> noundef %a, <4 x i32> noundef %b) {
+entry:
+; CHECK: extractelement <4 x i32> %a, i64 0
+; CHECK: extractelement <4 x i32> %b, i64 0
+; CHECK: mul i32 %{{.*}}, %{{.*}}
+; CHECK: extractelement <4 x i32> %a, i64 1
+; CHECK: extractelement <4 x i32> %b, i64 1
+; EXPCHECK: call i32 @llvm.dx.umad.i32(i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}})
+; DOPCHECK: call i32 @dx.op.tertiary.i32(i32 49, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}})
+; CHECK: extractelement <4 x i32> %a, i64 2
+; CHECK: extractelement <4 x i32> %b, i64 2
+; EXPCHECK: call i32 @llvm.dx.umad.i32(i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}})
+; DOPCHECK: call i32 @dx.op.tertiary.i32(i32 49, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}})
+; CHECK: extractelement <4 x i32> %a, i64 3
+; CHECK: extractelement <4 x i32> %b, i64 3
+; EXPCHECK: call i32 @llvm.dx.umad.i32(i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}})
+; DOPCHECK: call i32 @dx.op.tertiary.i32(i32 49, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}})
+  %dx.dot = call i32 @llvm.dx.udot.v4i32(<4 x i32> %a, <4 x i32> %b)
+  ret i32 %dx.dot
+}
+
+; CHECK-LABEL: dot_uint64_t4
+define noundef i64 @dot_uint64_t4(<2 x i64> noundef %a, <2 x i64> noundef %b) {
+entry:
+; CHECK: extractelement <2 x i64> %a, i64 0
+; CHECK: extractelement <2 x i64> %b, i64 0
+; CHECK: mul i64 %{{.*}}, %{{.*}}
+; CHECK: extractelement <2 x i64> %a, i64 1
+; CHECK: extractelement <2 x i64> %b, i64 1
+; EXPCHECK: call i64 @llvm.dx.umad.i64(i64 %{{.*}}, i64 %{{.*}}, i64 %{{.*}})
+; DOPCHECK: call i64 @dx.op.tertiary.i64(i32 49, i64 %{{.*}}, i64 %{{.*}}, i64 %{{.*}})
+  %dx.dot = call i64 @llvm.dx.udot.v2i64(<2 x i64> %a, <2 x i64> %b)
+  ret i64 %dx.dot
+}
+
+declare i16 @llvm.dx.sdot.v2i16(<2 x i16>, <2 x i16>)
+declare i32 @llvm.dx.sdot.v4i32(<4 x i32>, <4 x i32>)
+declare i16 @llvm.dx.udot.v3i32(<3 x i16>, <3 x i16>)
+declare i32 @llvm.dx.udot.v4i32(<4 x i32>, <4 x i32>)
+declare i64 @llvm.dx.udot.v2i64(<2 x i64>, <2 x i64>)
diff --git a/llvm/test/CodeGen/DirectX/lerp.ll b/llvm/test/CodeGen/DirectX/lerp.ll
new file mode 100644
index 00000000000000..ebd7e133b5163c
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/lerp.ll
@@ -0,0 +1,56 @@
+; RUN: opt -S -dxil-op-lower < %s | FileCheck %s
+
+; Make sure dxil operation function calls for lerp are generated for float and half.
+
+; CHECK-LABEL: lerp_half
+; CHECK: fsub half %{{.*}}, %{{.*}}
+; CHECK: fmul half %{{.*}}, %{{.*}}
+; CHECK: fadd half %{{.*}}, %{{.*}}
+define noundef half @lerp_half(half noundef %p0) {
+entry:
+  %p0.addr = alloca half, align 2
+  store half %p0, ptr %p0.addr, align 2
+  %0 = load half, ptr %p0.addr, align 2
+  %1 = load half, ptr %p0.addr, align 2
+  %2 = load half, ptr %p0.addr, align 2
+  %dx.lerp = call half @llvm.dx.lerp.f16(half %0, half %1, half %2)
+  ret half %dx.lerp
+}
+
+; CHECK-LABEL: lerp_float
+; CHECK: fsub float %{{.*}}, %{{.*}}
+; CHECK: fmul float %{{.*}}, %{{.*}}
+; CHECK: fadd float %{{.*}}, %{{.*}}
+define noundef float @lerp_float(float noundef %p0, float noundef %p1) {
+entry:
+  %p1.addr = alloca float, align 4
+  %p0.addr = alloca float, align 4
+  store float %p1, ptr %p1.addr, align 4
+  store float %p0, ptr %p0.addr, align 4
+  %0 = load float, ptr %p0.addr, align 4
+  %1 = load float, ptr %p0.addr, align 4
+  %2 = load float, ptr %p0.addr, align 4
+  %dx.lerp = call float @llvm.dx.lerp.f32(float %0, float %1, float %2)
+  ret float %dx.lerp
+}
+
+; CHECK-LABEL: lerp_float4
+; CHECK: fsub <4 x float> %{{.*}}, %{{.*}}
+; CHECK: fmul <4 x float> %{{.*}}, %{{.*}}
+; CHECK: fadd <4 x float> %{{.*}}, %{{.*}}
+define noundef <4 x float> @lerp_float4(<4 x float> noundef %p0, <4 x float> noundef %p1) {
+entry:
+  %p1.addr = alloca <4 x float>, align 16
+  %p0.addr = alloca <4 x float>, align 16
+  store <4 x float> %p1, ptr %p1.addr, align 16
+  store <4 x float> %p0, ptr %p0.addr, align 16
+  %0 = load <4 x float>, ptr %p0.addr, align 16
+  %1 = load <4 x float>, ptr %p0.addr, align 16
+  %2 = load <4 x float>, ptr %p0.addr, align 16
+  %dx.lerp = call <4 x float> @llvm.dx.lerp.v4f32(<4 x float> %0, <4 x float> %1, <4 x float> %2)
+  ret <4 x float> %dx.lerp
+}
+
+declare half @llvm.dx.lerp.f16(half, half, half)
+declare float @llvm.dx.lerp.f32(float, float, float)
+declare <4 x float> @llvm.dx.lerp.v4f32(<4 x float>, <4 x float>, <4 x float>)
diff --git a/llvm/test/CodeGen/DirectX/lib_entry.ll b/llvm/test/CodeGen/DirectX/lib_entry.ll
index 9208d6d3f32246..5254a088055888 100644
--- a/llvm/test/CodeGen/DirectX/lib_entry.ll
+++ b/llvm/test/CodeGen/DirectX/lib_entry.ll
@@ -7,7 +7,7 @@ target triple = "dxil-unknown-shadermodel6.7-library"
 ; Make sure generate empty entry for lib profile.
 ;CHECK:![[empty_entry]] = !{null, !"", null, null, ![[shader_flags:[0-9]+]]}
 ; Make sure double is marked for shader flags.
-;CHECK:![[shader_flags]] = !{i32 0, i64 1}
+;CHECK:![[shader_flags]] = !{i32 0, i64 4}
 ;CHECK:![[entry]] = !{ptr @entry, !"entry", null, null, ![[extra:[0-9]+]]}
 ;CHECK:![[extra]] = !{i32 8, i32 5, i32 4, ![[numthreads:[0-9]+]]}
 ;CHECK:![[numthreads]] = !{i32 1, i32 2, i32 1}
diff --git a/llvm/test/CodeGen/DirectX/rcp.ll b/llvm/test/CodeGen/DirectX/rcp.ll
new file mode 100644
index 00000000000000..65abe832db53fe
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/rcp.ll
@@ -0,0 +1,52 @@
+; RUN: opt -S -dxil-op-lower < %s | FileCheck %s
+
+; Make sure dxil operation function calls for rcp are generated for float, double, and half.
+
+; CHECK-LABEL: rcp_float4
+; CHECK: fdiv <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, %{{.*}}
+define noundef <4 x float> @rcp_float4(<4 x float> noundef %p0) {
+entry:
+  %p0.addr = alloca <4 x float>, align 16
+  store <4 x float> %p0, ptr %p0.addr, align 16
+  %0 = load <4 x float>, ptr %p0.addr, align 16
+  %dx.rcp = call <4 x float> @llvm.dx.rcp.v4f32(<4 x float> %0)
+  ret <4 x float> %dx.rcp
+}
+
+; CHECK-LABEL: rcp_double4
+; CHECK: fdiv <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>, %{{.*}}
+define noundef <4 x double> @rcp_double4(<4 x double> noundef %p0) {
+entry:
+  %p0.addr = alloca <4 x double>, align 16
+  store <4 x double> %p0, ptr %p0.addr, align 16
+  %0 = load <4 x double>, ptr %p0.addr, align 16
+  %dx.rcp = call <4 x double> @llvm.dx.rcp.v4f64(<4 x double> %0)
+  ret <4 x double> %dx.rcp
+}
+
+; CHECK-LABEL: rcp_half4
+; CHECK: fdiv <4 x half> <half  0xH3C00, half  0xH3C00, half  0xH3C00, half  0xH3C00>, %{{.*}} 
+define noundef <4 x half> @rcp_half4(<4 x half> noundef %p0) {
+entry:
+  %p0.addr = alloca <4 x half>, align 16
+  store <4 x half> %p0, ptr %p0.addr, align 16
+  %0 = load <4 x half>, ptr %p0.addr, align 16
+  %dx.rcp = call <4 x half> @llvm.dx.rcp.v4f16(<4 x half> %0)
+  ret <4 x half> %dx.rcp
+}
+
+; CHECK-LABEL: rcp_half
+; CHECK: fdiv half 0xH3C00, %{{.*}} 
+define noundef half @rcp_half(half noundef %p0) {
+entry:
+  %p0.addr = alloca half, align 2
+  store half %p0, ptr %p0.addr, align 2
+  %0 = load half, ptr %p0.addr, align 2
+  %dx.rcp = call half @llvm.dx.rcp.f16(half %0)
+  ret half %dx.rcp
+}
+
+declare half @llvm.dx.rcp.f16(half)
+declare <4 x half> @llvm.dx.rcp.v4f16(<4 x half>)
+declare <4 x float> @llvm.dx.rcp.v4f32(<4 x float>)
+declare <4 x double> @llvm.dx.rcp.v4f64(<4 x double>)
diff --git a/llvm/test/CodeGen/DirectX/rsqrt.ll b/llvm/test/CodeGen/DirectX/rsqrt.ll
new file mode 100644
index 00000000000000..52af0e62220b3e
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/rsqrt.ll
@@ -0,0 +1,28 @@
+; RUN: opt -S -dxil-op-lower < %s | FileCheck %s
+
+; Make sure dxil operation function calls for rsqrt are generated for float and half.
+
+; CHECK-LABEL: rsqrt_float
+; CHECK: call float @dx.op.unary.f32(i32 25, float %{{.*}})
+define noundef float @rsqrt_float(float noundef %a) {
+entry:
+  %a.addr = alloca float, align 4
+  store float %a, ptr %a.addr, align 4
+  %0 = load float, ptr %a.addr, align 4
+  %dx.rsqrt = call float @llvm.dx.rsqrt.f32(float %0)
+  ret float %dx.rsqrt
+}
+
+; CHECK-LABEL: rsqrt_half
+; CHECK: call half @dx.op.unary.f16(i32 25, half %{{.*}})
+define noundef half @rsqrt_half(half noundef %a) {
+entry:
+  %a.addr = alloca half, align 2
+  store half %a, ptr %a.addr, align 2
+  %0 = load half, ptr %a.addr, align 2
+  %dx.rsqrt = call half @llvm.dx.rsqrt.f16(half %0)
+  ret half %dx.rsqrt
+}
+
+declare half @llvm.dx.rsqrt.f16(half)
+declare float @llvm.dx.rsqrt.f32(float)
diff --git a/llvm/test/CodeGen/DirectX/rsqrt_error.ll b/llvm/test/CodeGen/DirectX/rsqrt_error.ll
new file mode 100644
index 00000000000000..9cd5002c20f7ec
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/rsqrt_error.ll
@@ -0,0 +1,14 @@
+; RUN: not opt -S -dxil-op-lower %s 2>&1 | FileCheck %s
+
+; DXIL operation rsqrt does not support double overload type
+; CHECK: LLVM ERROR: Invalid Overload Type
+
+; Function Attrs: noinline nounwind optnone
+define noundef double @rsqrt_double(double noundef %a) #0 {
+entry:
+  %a.addr = alloca double, align 8
+  store double %a, ptr %a.addr, align 8
+  %0 = load double, ptr %a.addr, align 8
+  %dx.rsqrt = call double @llvm.dx.rsqrt.f64(double %0)
+  ret double %dx.rsqrt
+}
diff --git a/llvm/test/CodeGen/DirectX/smax.ll b/llvm/test/CodeGen/DirectX/smax.ll
new file mode 100644
index 00000000000000..8b2406782c0938
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/smax.ll
@@ -0,0 +1,31 @@
+; RUN: opt -S -dxil-op-lower < %s | FileCheck %s
+
+; Make sure dxil operation function calls for smax are generated for i16/i32/i64.
+
+; CHECK-LABEL:test_smax_i16
+define noundef i16 @test_smax_i16(i16 noundef %a, i16 noundef %b) {
+entry:
+; CHECK: call i16 @dx.op.binary.i16(i32 37, i16 %{{.*}}, i16 %{{.*}})
+  %0 = call i16 @llvm.smax.i16(i16 %a, i16 %b)
+  ret i16 %0
+}
+
+; CHECK-LABEL:test_smax_i32
+define noundef i32 @test_smax_i32(i32 noundef %a, i32 noundef %b) {
+entry:
+; CHECK: call i32 @dx.op.binary.i32(i32 37, i32 %{{.*}}, i32 %{{.*}})
+  %0 = call i32 @llvm.smax.i32(i32 %a, i32 %b)
+  ret i32 %0
+}
+
+; CHECK-LABEL:test_smax_i64
+define noundef i64 @test_smax_i64(i64 noundef %a, i64 noundef %b) {
+entry:
+; CHECK: call i64 @dx.op.binary.i64(i32 37, i64 %{{.*}}, i64 %{{.*}})
+  %0 = call i64 @llvm.smax.i64(i64 %a, i64 %b)
+  ret i64 %0
+}
+
+declare i16 @llvm.smax.i16(i16, i16)
+declare i32 @llvm.smax.i32(i32, i32)
+declare i64 @llvm.smax.i64(i64, i64)
diff --git a/llvm/test/CodeGen/DirectX/smin.ll b/llvm/test/CodeGen/DirectX/smin.ll
new file mode 100644
index 00000000000000..b2b40a1b624335
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/smin.ll
@@ -0,0 +1,31 @@
+; RUN: opt -S -dxil-op-lower < %s | FileCheck %s
+
+; Make sure dxil operation function calls for smin are generated for i16/i32/i64.
+
+; CHECK-LABEL:test_smin_i16
+define noundef i16 @test_smin_i16(i16 noundef %a, i16 noundef %b) {
+entry:
+; CHECK: call i16 @dx.op.binary.i16(i32 38, i16 %{{.*}}, i16 %{{.*}})
+  %0 = call i16 @llvm.smin.i16(i16 %a, i16 %b)
+  ret i16 %0
+}
+
+; CHECK-LABEL:test_smin_i32
+define noundef i32 @test_smin_i32(i32 noundef %a, i32 noundef %b) {
+entry:
+; CHECK: call i32 @dx.op.binary.i32(i32 38, i32 %{{.*}}, i32 %{{.*}})
+  %0 = call i32 @llvm.smin.i32(i32 %a, i32 %b)
+  ret i32 %0
+}
+
+; CHECK-LABEL:test_smin_i64
+define noundef i64 @test_smin_i64(i64 noundef %a, i64 noundef %b) {
+entry:
+; CHECK: call i64 @dx.op.binary.i64(i32 38, i64 %{{.*}}, i64 %{{.*}})
+  %0 = call i64 @llvm.smin.i64(i64 %a, i64 %b)
+  ret i64 %0
+}
+
+declare i16 @llvm.smin.i16(i16, i16)
+declare i32 @llvm.smin.i32(i32, i32)
+declare i64 @llvm.smin.i64(i64, i64)
diff --git a/llvm/test/CodeGen/DirectX/umax.ll b/llvm/test/CodeGen/DirectX/umax.ll
index c7b6a87599279a..be0f557fc8da69 100644
--- a/llvm/test/CodeGen/DirectX/umax.ll
+++ b/llvm/test/CodeGen/DirectX/umax.ll
@@ -1,30 +1,31 @@
 ; RUN: opt -S -dxil-op-lower < %s | FileCheck %s
 
-; Make sure dxil operation function calls for umax are generated for i32/i64.
+; Make sure dxil operation function calls for umax are generated for i16/i32/i64.
 
-target datalayout = "e-m:e-p:32:32-i1:32-i8:8-i16:16-i32:32-i64:64-f16:16-f32:32-f64:64-n8:16:32:64"
-target triple = "dxil-pc-shadermodel6.7-library"
+; CHECK-LABEL:test_umax_i16
+define noundef i16 @test_umax_i16(i16 noundef %a, i16 noundef %b) {
+entry:
+; CHECK: call i16 @dx.op.binary.i16(i32 39, i16 %{{.*}}, i16 %{{.*}})
+  %0 = call i16 @llvm.umax.i16(i16 %a, i16 %b)
+  ret i16 %0
+}
 
 ; CHECK-LABEL:test_umax_i32
-; Function Attrs: noinline nounwind optnone
-define noundef i32 @test_umax_i32(i32 noundef %a, i32 noundef %b) #0 {
+define noundef i32 @test_umax_i32(i32 noundef %a, i32 noundef %b) {
 entry:
-; CHECK:call i32 @dx.op.binary.i32(i32 39, i32 %{{.*}}, i32 %{{.*}})
+; CHECK: call i32 @dx.op.binary.i32(i32 39, i32 %{{.*}}, i32 %{{.*}})
   %0 = call i32 @llvm.umax.i32(i32 %a, i32 %b)
   ret i32 %0
 }
 
 ; CHECK-LABEL:test_umax_i64
-define noundef i64 @test_umax_i64(i64 noundef %a, i64 noundef %b) #0 {
+define noundef i64 @test_umax_i64(i64 noundef %a, i64 noundef %b) {
 entry:
-; CHECK:call i64 @dx.op.binary.i64(i32 39, i64 %{{.*}}, i64 %{{.*}})
+; CHECK: call i64 @dx.op.binary.i64(i32 39, i64 %{{.*}}, i64 %{{.*}})
   %0 = call i64 @llvm.umax.i64(i64 %a, i64 %b)
   ret i64 %0
 }
 
-; Function Attrs: nocallback nofree nosync nounwind readnone speculatable willreturn
-declare i32 @llvm.umax.i32(i32, i32) #1
-declare i64 @llvm.umax.i64(i64, i64) #1
-
-attributes #0 = { noinline nounwind }
-attributes #1 = { nocallback nofree nosync nounwind readnone speculatable willreturn }
+declare i16 @llvm.umax.i16(i16, i16)
+declare i32 @llvm.umax.i32(i32, i32)
+declare i64 @llvm.umax.i64(i64, i64)
diff --git a/llvm/test/CodeGen/DirectX/umin.ll b/llvm/test/CodeGen/DirectX/umin.ll
new file mode 100644
index 00000000000000..5051c711744892
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/umin.ll
@@ -0,0 +1,31 @@
+; RUN: opt -S -dxil-op-lower < %s | FileCheck %s
+
+; Make sure dxil operation function calls for umin are generated for i16/i32/i64.
+
+; CHECK-LABEL:test_umin_i16
+define noundef i16 @test_umin_i16(i16 noundef %a, i16 noundef %b) {
+entry:
+; CHECK: call i16 @dx.op.binary.i16(i32 40, i16 %{{.*}}, i16 %{{.*}})
+  %0 = call i16 @llvm.umin.i16(i16 %a, i16 %b)
+  ret i16 %0
+}
+
+; CHECK-LABEL:test_umin_i32
+define noundef i32 @test_umin_i32(i32 noundef %a, i32 noundef %b) {
+entry:
+; CHECK: call i32 @dx.op.binary.i32(i32 40, i32 %{{.*}}, i32 %{{.*}})
+  %0 = call i32 @llvm.umin.i32(i32 %a, i32 %b)
+  ret i32 %0
+}
+
+; CHECK-LABEL:test_umin_i64
+define noundef i64 @test_umin_i64(i64 noundef %a, i64 noundef %b) {
+entry:
+; CHECK: call i64 @dx.op.binary.i64(i32 40, i64 %{{.*}}, i64 %{{.*}})
+  %0 = call i64 @llvm.umin.i64(i64 %a, i64 %b)
+  ret i64 %0
+}
+
+declare i16 @llvm.umin.i16(i16, i16)
+declare i32 @llvm.umin.i32(i32, i32)
+declare i64 @llvm.umin.i64(i64, i64)
diff --git a/llvm/test/CodeGen/Generic/ForceStackAlign.ll b/llvm/test/CodeGen/Generic/ForceStackAlign.ll
index 2c35ad350f0473..7993b3eff65b68 100644
--- a/llvm/test/CodeGen/Generic/ForceStackAlign.ll
+++ b/llvm/test/CodeGen/Generic/ForceStackAlign.ll
@@ -8,7 +8,7 @@
 ; Stack realignment not supported.
 ; XFAIL: target=sparc{{.*}}
 
-; NVPTX cannot select dynamic_stackalloc
+; NVPTX can only select dynamic_stackalloc on sm_52+ and with ptx73+
 ; XFAIL: target=nvptx{{.*}}
 
 define i32 @f(ptr %p) nounwind {
diff --git a/llvm/test/CodeGen/Hexagon/build-attributes.ll b/llvm/test/CodeGen/Hexagon/build-attributes.ll
new file mode 100644
index 00000000000000..48ee31a4d19bd6
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/build-attributes.ll
@@ -0,0 +1,16 @@
+;; Generate build attributes from llc.
+
+; RUN: llc -mtriple=hexagon-unknown-elf \
+; RUN:  -mattr=+hvxv73,+cabac,+v71,+hvx-ieee-fp,+hvx-length128b %s -o - | FileCheck  %s
+
+;      CHECK: .attribute      4, 71  // Tag_arch
+; CHECK-NEXT: .attribute      5, 73  // Tag_hvx_arch
+; CHECK-NEXT: .attribute      6, 1   // Tag_hvx_ieeefp
+; CHECK-NEXT: .attribute      7, 1   // Tag_hvx_qfloat
+; CHECK-NEXT: .attribute      8, 1   // Tag_zreg
+; CHECK-NEXT: .attribute      10, 1   // Tag_cabac
+
+define i32 @addi(i32 %a) {
+  %1 = add i32 %a, 1
+  ret i32 %1
+}
\ No newline at end of file
diff --git a/llvm/test/CodeGen/Hexagon/livephysregs-regmask-clobber.mir b/llvm/test/CodeGen/Hexagon/livephysregs-regmask-clobber.mir
index 8f1cb42b96a6fa..52213070f53567 100644
--- a/llvm/test/CodeGen/Hexagon/livephysregs-regmask-clobber.mir
+++ b/llvm/test/CodeGen/Hexagon/livephysregs-regmask-clobber.mir
@@ -17,6 +17,8 @@
 
 name: f0
 tracksRegLiveness: true
+frameInfo:
+  adjustsStack:    true
 stack:
   - { id: 0, offset: 0, size: 128, alignment: 128 }
   - { id: 1, offset: 128, size: 128, alignment: 128 }
diff --git a/llvm/test/CodeGen/MIR/AArch64/parse-low-level-type-invalid4.mir b/llvm/test/CodeGen/MIR/AArch64/parse-low-level-type-invalid4.mir
deleted file mode 100644
index d66dd104486922..00000000000000
--- a/llvm/test/CodeGen/MIR/AArch64/parse-low-level-type-invalid4.mir
+++ /dev/null
@@ -1,10 +0,0 @@
-# RUN: not llc -mtriple=aarch64-- -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
-# When a low-level type is 0 bits
----
-name: test_scalar_size_0
-body: |
-  bb.0:
-    liveins: $x0
-    ; CHECK: [[@LINE+1]]:10: invalid size for scalar type
-    %0:_(s0) = G_IMPLICIT_DEF
-...
diff --git a/llvm/test/CodeGen/MIR/AArch64/parse-low-level-type-invalid6.mir b/llvm/test/CodeGen/MIR/AArch64/parse-low-level-type-invalid6.mir
index 698568701fb1a4..632e5fa81db111 100644
--- a/llvm/test/CodeGen/MIR/AArch64/parse-low-level-type-invalid6.mir
+++ b/llvm/test/CodeGen/MIR/AArch64/parse-low-level-type-invalid6.mir
@@ -5,6 +5,6 @@ name: test_vector_element_size_0
 body: |
   bb.0:
     liveins: $x0
-    ; CHECK: [[@LINE+1]]:15: invalid size for scalar type
+    ; CHECK: [[@LINE+1]]:15: invalid size for scalar element in vector
     %0:_(<2 x s0>) = G_IMPLICIT_DEF
 ...
diff --git a/llvm/test/CodeGen/MIR/AMDGPU/stack-id-assert.mir b/llvm/test/CodeGen/MIR/AMDGPU/stack-id-assert.mir
index e40d1879399cef..9831f786b847d3 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/stack-id-assert.mir
+++ b/llvm/test/CodeGen/MIR/AMDGPU/stack-id-assert.mir
@@ -29,6 +29,8 @@ liveins:
   - { reg: '$vgpr0', virtual-reg: '' }
   - { reg: '$vgpr1', virtual-reg: '' }
   - { reg: '$vgpr2', virtual-reg: '' }
+frameInfo:
+  adjustsStack:    true
 stack:
   - { id: 0, name: '', type: spill-slot, offset: 0, size: 8, alignment: 4,
       stack-id: sgpr-spill, callee-saved-register: '', callee-saved-restored: true,
diff --git a/llvm/test/CodeGen/Mips/avoid-zero-copy.mir b/llvm/test/CodeGen/Mips/avoid-zero-copy.mir
index 5c7cffd109ea62..e3990bdf9bc3fb 100644
--- a/llvm/test/CodeGen/Mips/avoid-zero-copy.mir
+++ b/llvm/test/CodeGen/Mips/avoid-zero-copy.mir
@@ -19,6 +19,8 @@
 ...
 ---
 name:            a
+frameInfo:
+  adjustsStack:    true
 body:             |
   bb.0 (%ir-block.0):
     liveins: $a0_64, $t9_64, $ra_64, $fp_64, $gp_64
diff --git a/llvm/test/CodeGen/Mips/msa/emergency-spill.mir b/llvm/test/CodeGen/Mips/msa/emergency-spill.mir
index e1c7b2158d6174..20894645286618 100644
--- a/llvm/test/CodeGen/Mips/msa/emergency-spill.mir
+++ b/llvm/test/CodeGen/Mips/msa/emergency-spill.mir
@@ -90,7 +90,7 @@ frameInfo:
   stackSize:       0
   offsetAdjustment: 0
   maxAlignment:    16
-  adjustsStack:    false
+  adjustsStack:    true
   hasCalls:        true
   stackProtector:  ''
   maxCallFrameSize: 4294967295
diff --git a/llvm/test/CodeGen/Mips/msa/f16-llvm-ir.ll b/llvm/test/CodeGen/Mips/msa/f16-llvm-ir.ll
index 6a27c9f5dac9b2..45c7ab980edd64 100644
--- a/llvm/test/CodeGen/Mips/msa/f16-llvm-ir.ll
+++ b/llvm/test/CodeGen/Mips/msa/f16-llvm-ir.ll
@@ -405,8 +405,9 @@ define void @uitofp(i32 %a) {
 ; MIPS64-N32-NEXT:    addiu $1, $1, %lo(%neg(%gp_rel(uitofp)))
 ; MIPS64-N32-NEXT:    lui $2, 17200
 ; MIPS64-N32-NEXT:    sw $2, 12($sp)
-; MIPS64-N32-NEXT:    sll $2, $4, 0
-; MIPS64-N32-NEXT:    sw $2, 8($sp)
+; MIPS64R5-N32-NEXT:    sll $2, $4, 0
+; MIPS64R5-N32-NEXT:    sw $2, 8($sp)
+; MIPSR6-N32-NEXT:    sw $4, 8($sp)
 ; MIPS64-N32-NEXT:    lw $2, %got_page(.LCPI5_0)($1)
 ; MIPS64-N32-NEXT:    ldc1 $f0, %got_ofst(.LCPI5_0)($2)
 ; MIPS64-N32-NEXT:    ldc1 $f1, 8($sp)
@@ -430,8 +431,9 @@ define void @uitofp(i32 %a) {
 ; MIPS64-N64-NEXT:    daddiu $1, $1, %lo(%neg(%gp_rel(uitofp)))
 ; MIPS64-N64-NEXT:    lui $2, 17200
 ; MIPS64-N64-NEXT:    sw $2, 12($sp)
-; MIPS64-N64-NEXT:    sll $2, $4, 0
-; MIPS64-N64-NEXT:    sw $2, 8($sp)
+; MIPS64R5-N64-NEXT:    sll $2, $4, 0
+; MIPS64R5-N64-NEXT:    sw $2, 8($sp)
+; MIPSR6-N64-NEXT:    sw $4, 8($sp)
 ; MIPS64-N64-NEXT:    ld $2, %got_page(.LCPI5_0)($1)
 ; MIPS64-N64-NEXT:    ldc1 $f0, %got_ofst(.LCPI5_0)($2)
 ; MIPS64-N64-NEXT:    ldc1 $f1, 8($sp)
diff --git a/llvm/test/CodeGen/Mips/no-unaligned-access-r6.ll b/llvm/test/CodeGen/Mips/no-unaligned-access-r6.ll
new file mode 100644
index 00000000000000..0695868f9c8038
--- /dev/null
+++ b/llvm/test/CodeGen/Mips/no-unaligned-access-r6.ll
@@ -0,0 +1,69 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+;; Test the strict-align feature which is similar to AArch64/arm64-strict-align.ll.
+
+; RUN: llc --mtriple=mipsisa32r6 < %s | FileCheck %s --check-prefix=MIPS32R6-UNALIGNED
+; RUN: llc --mtriple=mipsisa32r6 --mattr=-strict-align < %s | FileCheck %s --check-prefix=MIPS32R6-UNALIGNED
+; RUN: llc --mtriple=mipsisa32r6 --mattr=+strict-align < %s | FileCheck %s --check-prefix=MIPS32R6-ALIGNED
+
+; RUN: llc --mtriple=mipsisa64r6 < %s | FileCheck %s --check-prefix=MIPS64R6-UNALIGNED
+; RUN: llc --mtriple=mipsisa64r6 --mattr=-strict-align < %s | FileCheck %s --check-prefix=MIPS64R6-UNALIGNED
+; RUN: llc --mtriple=mipsisa64r6 --mattr=+strict-align < %s | FileCheck %s --check-prefix=MIPS64R6-ALIGNED
+
+define i32 @f0(ptr %p) nounwind {
+; MIPS32R6-UNALIGNED-LABEL: f0:
+; MIPS32R6-UNALIGNED:       # %bb.0:
+; MIPS32R6-UNALIGNED-NEXT:    lw $2, 0($4)
+; MIPS32R6-UNALIGNED-NEXT:    jrc $ra
+;
+; MIPS32R6-ALIGNED-LABEL: f0:
+; MIPS32R6-ALIGNED:       # %bb.0:
+; MIPS32R6-ALIGNED-NEXT:    lhu $1, 2($4)
+; MIPS32R6-ALIGNED-NEXT:    lhu $2, 0($4)
+; MIPS32R6-ALIGNED-NEXT:    sll $2, $2, 16
+; MIPS32R6-ALIGNED-NEXT:    jr $ra
+; MIPS32R6-ALIGNED-NEXT:    or $2, $2, $1
+;
+; MIPS64R6-UNALIGNED-LABEL: f0:
+; MIPS64R6-UNALIGNED:       # %bb.0:
+; MIPS64R6-UNALIGNED-NEXT:    lw $2, 0($4)
+; MIPS64R6-UNALIGNED-NEXT:    jrc $ra
+;
+; MIPS64R6-ALIGNED-LABEL: f0:
+; MIPS64R6-ALIGNED:       # %bb.0:
+; MIPS64R6-ALIGNED-NEXT:    lhu $1, 2($4)
+; MIPS64R6-ALIGNED-NEXT:    lhu $2, 0($4)
+; MIPS64R6-ALIGNED-NEXT:    sll $2, $2, 16
+; MIPS64R6-ALIGNED-NEXT:    jr $ra
+; MIPS64R6-ALIGNED-NEXT:    or $2, $2, $1
+  %tmp = load i32, ptr %p, align 2
+  ret i32 %tmp
+}
+
+define i64 @f1(ptr %p) nounwind {
+; MIPS32R6-UNALIGNED-LABEL: f1:
+; MIPS32R6-UNALIGNED:       # %bb.0:
+; MIPS32R6-UNALIGNED-NEXT:    lw $2, 0($4)
+; MIPS32R6-UNALIGNED-NEXT:    lw $3, 4($4)
+; MIPS32R6-UNALIGNED-NEXT:    jrc $ra
+;
+; MIPS32R6-ALIGNED-LABEL: f1:
+; MIPS32R6-ALIGNED:       # %bb.0:
+; MIPS32R6-ALIGNED-NEXT:    lw $2, 0($4)
+; MIPS32R6-ALIGNED-NEXT:    lw $3, 4($4)
+; MIPS32R6-ALIGNED-NEXT:    jrc $ra
+;
+; MIPS64R6-UNALIGNED-LABEL: f1:
+; MIPS64R6-UNALIGNED:       # %bb.0:
+; MIPS64R6-UNALIGNED-NEXT:    ld $2, 0($4)
+; MIPS64R6-UNALIGNED-NEXT:    jrc $ra
+;
+; MIPS64R6-ALIGNED-LABEL: f1:
+; MIPS64R6-ALIGNED:       # %bb.0:
+; MIPS64R6-ALIGNED-NEXT:    lwu $1, 4($4)
+; MIPS64R6-ALIGNED-NEXT:    lwu $2, 0($4)
+; MIPS64R6-ALIGNED-NEXT:    dsll $2, $2, 32
+; MIPS64R6-ALIGNED-NEXT:    jr $ra
+; MIPS64R6-ALIGNED-NEXT:    or $2, $2, $1
+  %tmp = load i64, ptr %p, align 4
+  ret i64 %tmp
+}
diff --git a/llvm/test/CodeGen/NVPTX/atomics-sm70.ll b/llvm/test/CodeGen/NVPTX/atomics-sm70.ll
new file mode 100644
index 00000000000000..9cc45fbe313b7e
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/atomics-sm70.ll
@@ -0,0 +1,142 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc < %s -march=nvptx -mcpu=sm_70 -mattr=+ptx63 | FileCheck %s --check-prefixes=CHECK
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx63 | FileCheck %s --check-prefixes=CHECK64
+; RUN: llc < %s -march=nvptx -mcpu=sm_70 -mattr=+ptx62 | FileCheck %s --check-prefixes=CHECKPTX62
+; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -march=nvptx -mcpu=sm_70 -mattr=+ptx63 | %ptxas-verify -arch=sm_70 %}
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx63 | %ptxas-verify -arch=sm_70 %}
+; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -march=nvptx -mcpu=sm_70 -mattr=+ptx62 | %ptxas-verify -arch=sm_70 %}
+
+target triple = "nvptx64-nvidia-cuda"
+
+define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, half %val) {
+; CHECK-LABEL: test(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<7>;
+; CHECK-NEXT:    .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u32 %r1, [test_param_0];
+; CHECK-NEXT:    ld.param.b16 %rs1, [test_param_3];
+; CHECK-NEXT:    atom.add.noftz.f16 %rs2, [%r1], %rs1;
+; CHECK-NEXT:    ld.param.u32 %r2, [test_param_1];
+; CHECK-NEXT:    mov.b16 %rs3, 0x3C00;
+; CHECK-NEXT:    atom.add.noftz.f16 %rs4, [%r1], %rs3;
+; CHECK-NEXT:    ld.param.u32 %r3, [test_param_2];
+; CHECK-NEXT:    atom.global.add.noftz.f16 %rs5, [%r2], %rs1;
+; CHECK-NEXT:    atom.shared.add.noftz.f16 %rs6, [%r3], %rs1;
+; CHECK-NEXT:    ret;
+;
+; CHECK64-LABEL: test(
+; CHECK64:       {
+; CHECK64-NEXT:    .reg .b16 %rs<7>;
+; CHECK64-NEXT:    .reg .b64 %rd<4>;
+; CHECK64-EMPTY:
+; CHECK64-NEXT:  // %bb.0:
+; CHECK64-NEXT:    ld.param.u64 %rd1, [test_param_0];
+; CHECK64-NEXT:    ld.param.b16 %rs1, [test_param_3];
+; CHECK64-NEXT:    atom.add.noftz.f16 %rs2, [%rd1], %rs1;
+; CHECK64-NEXT:    ld.param.u64 %rd2, [test_param_1];
+; CHECK64-NEXT:    mov.b16 %rs3, 0x3C00;
+; CHECK64-NEXT:    atom.add.noftz.f16 %rs4, [%rd1], %rs3;
+; CHECK64-NEXT:    ld.param.u64 %rd3, [test_param_2];
+; CHECK64-NEXT:    atom.global.add.noftz.f16 %rs5, [%rd2], %rs1;
+; CHECK64-NEXT:    atom.shared.add.noftz.f16 %rs6, [%rd3], %rs1;
+; CHECK64-NEXT:    ret;
+;
+; CHECKPTX62-LABEL: test(
+; CHECKPTX62:       {
+; CHECKPTX62-NEXT:    .reg .pred %p<5>;
+; CHECKPTX62-NEXT:    .reg .b16 %rs<19>;
+; CHECKPTX62-NEXT:    .reg .b32 %r<58>;
+; CHECKPTX62-EMPTY:
+; CHECKPTX62-NEXT:  // %bb.0:
+; CHECKPTX62-NEXT:    ld.param.b16 %rs1, [test_param_3];
+; CHECKPTX62-NEXT:    ld.param.u32 %r23, [test_param_2];
+; CHECKPTX62-NEXT:    ld.param.u32 %r22, [test_param_1];
+; CHECKPTX62-NEXT:    ld.param.u32 %r24, [test_param_0];
+; CHECKPTX62-NEXT:    and.b32 %r1, %r24, -4;
+; CHECKPTX62-NEXT:    and.b32 %r25, %r24, 3;
+; CHECKPTX62-NEXT:    shl.b32 %r2, %r25, 3;
+; CHECKPTX62-NEXT:    mov.b32 %r26, 65535;
+; CHECKPTX62-NEXT:    shl.b32 %r27, %r26, %r2;
+; CHECKPTX62-NEXT:    not.b32 %r3, %r27;
+; CHECKPTX62-NEXT:    ld.u32 %r54, [%r1];
+; CHECKPTX62-NEXT:  $L__BB0_1: // %atomicrmw.start
+; CHECKPTX62-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECKPTX62-NEXT:    shr.u32 %r28, %r54, %r2;
+; CHECKPTX62-NEXT:    cvt.u16.u32 %rs2, %r28;
+; CHECKPTX62-NEXT:    add.rn.f16 %rs4, %rs2, %rs1;
+; CHECKPTX62-NEXT:    cvt.u32.u16 %r29, %rs4;
+; CHECKPTX62-NEXT:    shl.b32 %r30, %r29, %r2;
+; CHECKPTX62-NEXT:    and.b32 %r31, %r54, %r3;
+; CHECKPTX62-NEXT:    or.b32 %r32, %r31, %r30;
+; CHECKPTX62-NEXT:    atom.cas.b32 %r6, [%r1], %r54, %r32;
+; CHECKPTX62-NEXT:    setp.ne.s32 %p1, %r6, %r54;
+; CHECKPTX62-NEXT:    mov.u32 %r54, %r6;
+; CHECKPTX62-NEXT:    @%p1 bra $L__BB0_1;
+; CHECKPTX62-NEXT:  // %bb.2: // %atomicrmw.end
+; CHECKPTX62-NEXT:    ld.u32 %r55, [%r1];
+; CHECKPTX62-NEXT:  $L__BB0_3: // %atomicrmw.start9
+; CHECKPTX62-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECKPTX62-NEXT:    shr.u32 %r33, %r55, %r2;
+; CHECKPTX62-NEXT:    cvt.u16.u32 %rs6, %r33;
+; CHECKPTX62-NEXT:    mov.b16 %rs8, 0x3C00;
+; CHECKPTX62-NEXT:    add.rn.f16 %rs9, %rs6, %rs8;
+; CHECKPTX62-NEXT:    cvt.u32.u16 %r34, %rs9;
+; CHECKPTX62-NEXT:    shl.b32 %r35, %r34, %r2;
+; CHECKPTX62-NEXT:    and.b32 %r36, %r55, %r3;
+; CHECKPTX62-NEXT:    or.b32 %r37, %r36, %r35;
+; CHECKPTX62-NEXT:    atom.cas.b32 %r9, [%r1], %r55, %r37;
+; CHECKPTX62-NEXT:    setp.ne.s32 %p2, %r9, %r55;
+; CHECKPTX62-NEXT:    mov.u32 %r55, %r9;
+; CHECKPTX62-NEXT:    @%p2 bra $L__BB0_3;
+; CHECKPTX62-NEXT:  // %bb.4: // %atomicrmw.end8
+; CHECKPTX62-NEXT:    and.b32 %r10, %r22, -4;
+; CHECKPTX62-NEXT:    shl.b32 %r38, %r22, 3;
+; CHECKPTX62-NEXT:    and.b32 %r11, %r38, 24;
+; CHECKPTX62-NEXT:    shl.b32 %r40, %r26, %r11;
+; CHECKPTX62-NEXT:    not.b32 %r12, %r40;
+; CHECKPTX62-NEXT:    ld.global.u32 %r56, [%r10];
+; CHECKPTX62-NEXT:  $L__BB0_5: // %atomicrmw.start27
+; CHECKPTX62-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECKPTX62-NEXT:    shr.u32 %r41, %r56, %r11;
+; CHECKPTX62-NEXT:    cvt.u16.u32 %rs11, %r41;
+; CHECKPTX62-NEXT:    add.rn.f16 %rs13, %rs11, %rs1;
+; CHECKPTX62-NEXT:    cvt.u32.u16 %r42, %rs13;
+; CHECKPTX62-NEXT:    shl.b32 %r43, %r42, %r11;
+; CHECKPTX62-NEXT:    and.b32 %r44, %r56, %r12;
+; CHECKPTX62-NEXT:    or.b32 %r45, %r44, %r43;
+; CHECKPTX62-NEXT:    atom.global.cas.b32 %r15, [%r10], %r56, %r45;
+; CHECKPTX62-NEXT:    setp.ne.s32 %p3, %r15, %r56;
+; CHECKPTX62-NEXT:    mov.u32 %r56, %r15;
+; CHECKPTX62-NEXT:    @%p3 bra $L__BB0_5;
+; CHECKPTX62-NEXT:  // %bb.6: // %atomicrmw.end26
+; CHECKPTX62-NEXT:    and.b32 %r16, %r23, -4;
+; CHECKPTX62-NEXT:    shl.b32 %r46, %r23, 3;
+; CHECKPTX62-NEXT:    and.b32 %r17, %r46, 24;
+; CHECKPTX62-NEXT:    shl.b32 %r48, %r26, %r17;
+; CHECKPTX62-NEXT:    not.b32 %r18, %r48;
+; CHECKPTX62-NEXT:    ld.shared.u32 %r57, [%r16];
+; CHECKPTX62-NEXT:  $L__BB0_7: // %atomicrmw.start45
+; CHECKPTX62-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECKPTX62-NEXT:    shr.u32 %r49, %r57, %r17;
+; CHECKPTX62-NEXT:    cvt.u16.u32 %rs15, %r49;
+; CHECKPTX62-NEXT:    add.rn.f16 %rs17, %rs15, %rs1;
+; CHECKPTX62-NEXT:    cvt.u32.u16 %r50, %rs17;
+; CHECKPTX62-NEXT:    shl.b32 %r51, %r50, %r17;
+; CHECKPTX62-NEXT:    and.b32 %r52, %r57, %r18;
+; CHECKPTX62-NEXT:    or.b32 %r53, %r52, %r51;
+; CHECKPTX62-NEXT:    atom.shared.cas.b32 %r21, [%r16], %r57, %r53;
+; CHECKPTX62-NEXT:    setp.ne.s32 %p4, %r21, %r57;
+; CHECKPTX62-NEXT:    mov.u32 %r57, %r21;
+; CHECKPTX62-NEXT:    @%p4 bra $L__BB0_7;
+; CHECKPTX62-NEXT:  // %bb.8: // %atomicrmw.end44
+; CHECKPTX62-NEXT:    ret;
+  %r1 = atomicrmw fadd ptr %dp0, half %val seq_cst
+  %r2 = atomicrmw fadd ptr %dp0, half 1.0 seq_cst
+  %r3 = atomicrmw fadd ptr addrspace(1) %dp1, half %val seq_cst
+  %r4 = atomicrmw fadd ptr addrspace(3) %dp3, half %val seq_cst
+  ret void
+}
+
+attributes #1 = { argmemonly nounwind }
diff --git a/llvm/test/CodeGen/NVPTX/atomics.ll b/llvm/test/CodeGen/NVPTX/atomics.ll
index e99d0fd05e346b..6f2b5dcf47f13b 100644
--- a/llvm/test/CodeGen/NVPTX/atomics.ll
+++ b/llvm/test/CodeGen/NVPTX/atomics.ll
@@ -175,6 +175,13 @@ define float @atomicrmw_add_f32_generic(ptr %addr, float %val) {
   ret float %ret
 }
 
+; CHECK-LABEL: atomicrmw_add_f16_generic
+define half @atomicrmw_add_f16_generic(ptr %addr, half %val) {
+; CHECK: atom.cas
+  %ret = atomicrmw fadd ptr %addr, half %val seq_cst
+  ret half %ret
+}
+
 ; CHECK-LABEL: atomicrmw_add_f32_addrspace1
 define float @atomicrmw_add_f32_addrspace1(ptr addrspace(1) %addr, float %val) {
 ; CHECK: atom.global.add.f32
diff --git a/llvm/test/CodeGen/NVPTX/bswap.ll b/llvm/test/CodeGen/NVPTX/bswap.ll
new file mode 100644
index 00000000000000..3f929ec6a75d0a
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/bswap.ll
@@ -0,0 +1,77 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
+
+target triple = "nvptx64-nvidia-cuda"
+
+define i16 @bswap16(i16 %a) {
+; CHECK-LABEL: bswap16(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u16 %rs1, [bswap16_param_0];
+; CHECK-NEXT:    shr.u16 %rs2, %rs1, 8;
+; CHECK-NEXT:    shl.b16 %rs3, %rs1, 8;
+; CHECK-NEXT:    or.b16 %rs4, %rs3, %rs2;
+; CHECK-NEXT:    cvt.u32.u16 %r1, %rs4;
+; CHECK-NEXT:    st.param.b32 [func_retval0+0], %r1;
+; CHECK-NEXT:    ret;
+  %b = tail call i16 @llvm.bswap.i16(i16 %a)
+  ret i16 %b
+}
+
+
+define i32 @bswap32(i32 %a) {
+; CHECK-LABEL: bswap32(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u32 %r1, [bswap32_param_0];
+; CHECK-NEXT:    prmt.b32 %r2, %r1, 0, 291;
+; CHECK-NEXT:    st.param.b32 [func_retval0+0], %r2;
+; CHECK-NEXT:    ret;
+  %b = tail call i32 @llvm.bswap.i32(i32 %a)
+  ret i32 %b
+}
+
+
+define <2 x i16> @bswapv2i16(<2 x i16> %a) #0 {
+; CHECK-LABEL: bswapv2i16(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u32 %r1, [bswapv2i16_param_0];
+; CHECK-NEXT:    prmt.b32 %r2, %r1, 0, 8961;
+; CHECK-NEXT:    st.param.b32 [func_retval0+0], %r2;
+; CHECK-NEXT:    ret;
+  %b = tail call <2 x i16> @llvm.bswap.v2i16(<2 x i16> %a)
+  ret <2 x i16> %b
+}
+
+define i64 @bswap64(i64 %a) {
+; CHECK-LABEL: bswap64(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u64 %rd1, [bswap64_param_0];
+; CHECK-NEXT:    { .reg .b32 tmp; mov.b64 {%r1, tmp}, %rd1; }
+; CHECK-NEXT:    prmt.b32 %r2, %r1, 0, 291;
+; CHECK-NEXT:    { .reg .b32 tmp; mov.b64 {tmp, %r3}, %rd1; }
+; CHECK-NEXT:    prmt.b32 %r4, %r3, 0, 291;
+; CHECK-NEXT:    mov.b64 %rd2, {%r4, %r2};
+; CHECK-NEXT:    st.param.b64 [func_retval0+0], %rd2;
+; CHECK-NEXT:    ret;
+  %b = tail call i64 @llvm.bswap.i64(i64 %a)
+  ret i64 %b
+}
+
+declare i16 @llvm.bswap.i16(i16)
+declare i32 @llvm.bswap.i32(i32)
+declare <2 x i16> @llvm.bswap.v2i16(<2 x i16>)
+declare i64 @llvm.bswap.i64(i64)
diff --git a/llvm/test/CodeGen/NVPTX/dynamic_stackalloc.ll b/llvm/test/CodeGen/NVPTX/dynamic_stackalloc.ll
index 3ef55ca5309f88..09297fb819ce55 100644
--- a/llvm/test/CodeGen/NVPTX/dynamic_stackalloc.ll
+++ b/llvm/test/CodeGen/NVPTX/dynamic_stackalloc.ll
@@ -1,10 +1,44 @@
-; RUN: not llc -march=nvptx < %s 2>&1 | FileCheck %s
-; RUN: not llc -march=nvptx64 < %s 2>&1 | FileCheck %s
+; RUN: not llc < %s -march=nvptx -mattr=+ptx72 -mcpu=sm_52 2>&1 | FileCheck %s --check-prefixes=CHECK-FAILS
+; RUN: not llc < %s -march=nvptx -mattr=+ptx73 -mcpu=sm_50 2>&1 | FileCheck %s --check-prefixes=CHECK-FAILS
 
-; CHECK: in function test_dynamic_stackalloc{{.*}}: dynamic alloca unsupported by NVPTX backend
+; RUN: llc < %s -march=nvptx -mattr=+ptx73 -mcpu=sm_52 | FileCheck %s --check-prefixes=CHECK,CHECK-32
+; RUN: llc < %s -march=nvptx64 -mattr=+ptx73 -mcpu=sm_52 | FileCheck %s --check-prefixes=CHECK,CHECK-64
+; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -march=nvptx -mattr=+ptx73 -mcpu=sm_52 | %ptxas-verify %}
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mattr=+ptx73 -mcpu=sm_52 | %ptxas-verify %}
 
-define void @test_dynamic_stackalloc(i64 %n) {
-  %alloca = alloca i32, i64 %n
-  store volatile i32 0, ptr %alloca
-  ret void
+; CHECK-FAILS: in function test_dynamic_stackalloc{{.*}}: Support for dynamic alloca introduced in PTX ISA version 7.3 and requires target sm_52.
+
+; CHECK-LABEL: .visible .func  (.param .b32 func_retval0) test_dynamic_stackalloc(
+; CHECK-NOT: __local_depot
+
+; CHECK-32:       ld.param.u32  %r[[SIZE:[0-9]]], [test_dynamic_stackalloc_param_0];
+; CHECK-32-NEXT:  mad.lo.s32 %r[[SIZE2:[0-9]]], %r[[SIZE]], 1, 7;
+; CHECK-32-NEXT:  and.b32         %r[[SIZE3:[0-9]]], %r[[SIZE2]], -8;
+; CHECK-32-NEXT:  alloca.u32  %r[[ALLOCA:[0-9]]], %r[[SIZE3]], 16;
+; CHECK-32-NEXT:  cvta.local.u32  %r[[ALLOCA]], %r[[ALLOCA]];
+; CHECK-32-NEXT:  { // callseq 0, 0
+; CHECK-32-NEXT:  .reg .b32 temp_param_reg;
+; CHECK-32-NEXT:  .param .b32 param0;
+; CHECK-32-NEXT:  st.param.b32  [param0+0], %r[[ALLOCA]];
+
+; CHECK-64:       ld.param.u64  %rd[[SIZE:[0-9]]], [test_dynamic_stackalloc_param_0];
+; CHECK-64-NEXT:  add.s64 %rd[[SIZE2:[0-9]]], %rd[[SIZE]], 7;
+; CHECK-64-NEXT:  and.b64 %rd[[SIZE3:[0-9]]], %rd[[SIZE2]], -8;
+; CHECK-64-NEXT:  alloca.u64  %rd[[ALLOCA:[0-9]]], %rd[[SIZE3]], 16;
+; CHECK-64-NEXT:  cvta.local.u64  %rd[[ALLOCA]], %rd[[ALLOCA]];
+; CHECK-64-NEXT:  { // callseq 0, 0
+; CHECK-64-NEXT:  .reg .b32 temp_param_reg;
+; CHECK-64-NEXT:  .param .b64 param0;
+; CHECK-64-NEXT:  st.param.b64  [param0+0], %rd[[ALLOCA]];
+
+; CHECK-NEXT:     .param .b32 retval0;
+; CHECK-NEXT:     call.uni (retval0),
+; CHECK-NEXT:     bar,
+
+define i32 @test_dynamic_stackalloc(i64 %n) {
+  %alloca = alloca i8, i64 %n, align 16
+  %call = call i32 @bar(ptr %alloca)
+  ret i32 %call
 }
+
+declare i32 @bar(ptr)
diff --git a/llvm/test/CodeGen/PowerPC/aix-codemodel-attr.ll b/llvm/test/CodeGen/PowerPC/aix-codemodel-attr.ll
new file mode 100644
index 00000000000000..ef1156e338a89a
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/aix-codemodel-attr.ll
@@ -0,0 +1,166 @@
+; RUN: llc --verify-machineinstrs -mtriple powerpc-ibm-aix --code-model=small < \
+; RUN: %s | FileCheck --check-prefixes=CHECK,CHECK32,CHECK-SMALL,CHECK-SMALL32 %s
+
+; RUN: llc --verify-machineinstrs -mtriple powerpc-ibm-aix --code-model=large < \
+; RUN: %s | FileCheck --check-prefixes=CHECK,CHECK32,CHECK-LARGE,CHECK-LARGE32 %s
+
+; RUN: llc --verify-machineinstrs -mtriple powerpc64-ibm-aix --code-model=small < \
+; RUN: %s | FileCheck --check-prefixes=CHECK,CHECK64,CHECK-SMALL,CHECK-SMALL64 %s
+
+; RUN: llc --verify-machineinstrs -mtriple powerpc64-ibm-aix --code-model=large < \
+; RUN: %s | FileCheck --check-prefixes=CHECK,CHECK64,CHECK-LARGE,CHECK-LARGE64 %s
+
+@a = external dso_local global i32, code_model "small", align 4
+@b = external dso_local global i32, code_model "large", align 4
+@c = dso_local global i32 55, code_model "small", align 4
+@d = dso_local global i32 41, code_model "large", align 4
+@e = external dso_local global i32, align 4
+@f = dso_local global i32 2748, align 4
+
+@large_aliasee = global i32 10, code_model "large", align 4
+@small_aliasee = global i32 171, code_model "small", align 4
+@normal_aliasee = global i32 2748, align 4
+
+@al = alias i32, ptr @large_aliasee
+@as = alias i32, ptr @small_aliasee
+@an = alias i32, ptr @normal_aliasee
+
+define i32 @A() local_unnamed_addr {
+entry:
+  %0 = load i32, ptr @a, align 4
+  ret i32 %0
+}
+; CHECK32:  lwz [[SCRATCH:[0-9]+]], L..C[[TL_A:[0-9]+]](2)                         # @a
+; CHECK64:  ld [[SCRATCH:[0-9]+]], L..C[[TL_A:[0-9]+]](2)                         # @a
+; CHECK:    lwz 3, 0([[SCRATCH]])
+; CHECK:    blr
+
+define i32 @B() local_unnamed_addr {
+entry:
+  %0 = load i32, ptr @b, align 4
+  ret i32 %0
+}
+; CHECK:   addis [[HI:[0-9]+]], L..C[[TL_B:[0-9]+]]@u(2)
+; CHECK32: lwz [[ADDR:[0-9]+]], L..C[[TL_B]]@l([[HI]])
+; CHECK64: ld [[ADDR:[0-9]+]], L..C[[TL_B]]@l([[HI]])
+; CHECK:   lwz 3, 0([[ADDR]])
+; CHECK:   blr
+
+define i32 @C() local_unnamed_addr {
+entry:
+  %0 = load i32, ptr @c, align 4
+  ret i32 %0
+}
+; CHECK32:  lwz [[SCRATCH:[0-9]+]], L..C[[TL_C:[0-9]+]](2)                         # @c
+; CHECK64:  ld [[SCRATCH:[0-9]+]], L..C[[TL_C:[0-9]+]](2)                         # @c
+; CHECK:    lwz 3, 0([[SCRATCH]])
+; CHECK:    blr
+
+define i32 @D() local_unnamed_addr {
+entry:
+  %0 = load i32, ptr @d, align 4
+  ret i32 %0
+}
+; CHECK: addis [[HI:[0-9]+]], L..C[[TL_D:[0-9]+]]@u(2)
+; CHECK32: lwz [[ADDR:[0-9]+]], L..C[[TL_D]]@l([[HI]])
+; CHECK64: ld [[ADDR:[0-9]+]], L..C[[TL_D]]@l([[HI]])
+; CHECK: lwz 3, 0([[ADDR]])
+; CHECK: blr
+
+define i32 @E() {
+entry:
+  %0 = load i32, ptr @e, align 4
+  ret i32 %0
+}
+; CHECK-LARGE: addis [[HI:[0-9]+]], L..C[[TL_E:[0-9]+]]@u(2)
+; CHECK-LARGE32: lwz [[SCRATCH:[0-9]+]], L..C[[TL_E]]@l([[HI]])
+; CHECK-SMALL32: lwz [[SCRATCH:[0-9]+]], L..C[[TL_E:[0-9]+]](2)
+; CHECK-LARGE64: ld  [[SCRATCH:[0-9]+]], L..C[[TL_E]]@l([[HI]])
+; CHECK-SMALL64: ld  [[SCRATCH:[0-9]+]], L..C[[TL_E:[0-9]+]](2)
+; CHECK: lwz 3, 0([[SCRATCH]])
+; CHECK: blr
+
+define i32 @F() {
+entry:
+  %0 = load i32, ptr @f, align 4
+  ret i32 %0
+}
+; CHECK-LARGE: addis [[HI:[0-9]+]], L..C[[TL_F:[0-9]+]]@u(2)
+; CHECK-LARGE32: lwz [[SCRATCH:[0-9]+]], L..C[[TL_F]]@l([[HI]])
+; CHECK-SMALL32: lwz [[SCRATCH:[0-9]+]], L..C[[TL_F:[0-9]+]](2)
+; CHECK-LARGE64: ld [[SCRATCH:[0-9]+]], L..C[[TL_F]]@l([[HI]])
+; CHECK-SMALL64: ld  [[SCRATCH:[0-9]+]], L..C[[TL_F:[0-9]+]](2)
+; CHECK: lwz 3, 0([[SCRATCH]])
+; CHECK: blr
+
+define noundef nonnull ptr @addr_a() local_unnamed_addr {
+entry:
+  ret ptr @a
+}
+; CHECK32:  lwz 3, L..C[[TL_A]](2)                         # @a
+; CHECK64:  ld 3, L..C[[TL_A]](2)                         # @a
+; CHECK:    blr
+
+define noundef nonnull ptr @addr_b() local_unnamed_addr {
+entry:
+  ret ptr @b
+}
+; CHECK:    addis [[HI:[0-9]+]], L..C[[TL_B]]@u(2)
+; CHECK32:  lwz 3, L..C[[TL_B]]@l([[HI]])
+; CHECK64:  ld 3, L..C[[TL_B]]@l([[HI]])
+; CHECK:    blr
+
+
+define noundef nonnull ptr @addr_c() local_unnamed_addr {
+entry:
+  ret ptr @c
+}
+; CHECK32:  lwz 3, L..C[[TL_C]](2)                         # @c
+; CHECK64:  ld 3, L..C[[TL_C]](2)                         # @c
+; CHECK:    blr
+
+define noundef nonnull ptr @addr_d() local_unnamed_addr {
+entry:
+  ret ptr @d
+}
+; CHECK:   addis [[HI:[0-9]+]], L..C[[TL_D]]@u(2)
+; CHECK32: lwz 3, L..C[[TL_D]]@l([[HI]])
+; CHECK64: ld 3, L..C[[TL_D]]@l([[HI]])
+; CHECK:   blr
+
+define i32 @G() {
+   %tmp = load i32, ptr @al
+   ret i32 %tmp
+}
+; CHECK:   addis [[HI:[0-9]+]], L..C[[TL_AL:[0-9]+]]@u(2)
+; CHECK32: lwz [[ADDR:[0-9]+]], L..C[[TL_AL]]@l([[HI]])
+; CHECK64: ld  [[ADDR:[0-9]+]], L..C[[TL_AL]]@l([[HI]])
+; CHECK:   lwz 3, 0([[ADDR]])
+
+define i32 @H() {
+   %tmp = load i32, ptr @as
+   ret i32 %tmp
+}
+; CHECK32: lwz [[ADDR:[0-9]+]], L..C[[TL_AS:[0-9]+]](2)
+; CHECK64: ld [[ADDR:[0-9]+]], L..C[[TL_AS:[0-9]+]](2)
+; CHECK:   lwz 3, 0([[ADDR]])
+
+;; Check TOC entires have correct storage mapping class
+; CHECK:         L..C[[TL_A]]:
+; CHECK:           .tc a[TC],a[UA]
+; CHECK:         L..C[[TL_B]]:
+; CHECK:           .tc b[TE],b[UA]
+; CHECK:         L..C[[TL_C]]:
+; CHECK:           .tc c[TC],c[RW]
+; CHECK:         L..C[[TL_D]]:
+; CHECK:           .tc d[TE],d[RW]
+; CHECK:         L..C[[TL_E]]:
+; CHECK-SMALL:     .tc e[TC],e[UA]
+; CHECK-LARGE:     .tc e[TE],e[UA]
+; CHECK:         L..C[[TL_F]]:
+; CHECK-SMALL:     .tc f[TC],f[RW]
+; CHECK-LARGE:     .tc f[TE],f[RW]
+; CHECK:         L..C[[TL_AL]]:
+; CHECK:           .tc al[TE],al
+; CHECK:         L..C[[TL_AS]]:
+; CHECK:           .tc as[TC],as
diff --git a/llvm/test/CodeGen/PowerPC/ctrloop-constrained-fp.ll b/llvm/test/CodeGen/PowerPC/ctrloop-constrained-fp.ll
index c1d1461664418c..50ebe0471dceac 100644
--- a/llvm/test/CodeGen/PowerPC/ctrloop-constrained-fp.ll
+++ b/llvm/test/CodeGen/PowerPC/ctrloop-constrained-fp.ll
@@ -2,7 +2,7 @@
 ; RUN: llc -mtriple powerpc64le < %s | FileCheck %s
 
 ; Check constrained ops converted to call
-define void @test(ptr %cast) {
+define void @test(ptr %cast) strictfp {
 ; CHECK-LABEL: test:
 ; CHECK:       # %bb.0: # %root
 ; CHECK-NEXT:    mflr 0
@@ -51,7 +51,7 @@ for.body:
 }
 
 ; Check constrained ops converted to native instruction
-define void @test2(ptr %cast) {
+define void @test2(ptr %cast) strictfp {
 ; CHECK-LABEL: test2:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    li 4, 255
diff --git a/llvm/test/CodeGen/PowerPC/fp-classify.ll b/llvm/test/CodeGen/PowerPC/fp-classify.ll
index 7de35b880a5d9c..f527b3c48040e7 100644
--- a/llvm/test/CodeGen/PowerPC/fp-classify.ll
+++ b/llvm/test/CodeGen/PowerPC/fp-classify.ll
@@ -57,30 +57,18 @@ entry:
 define zeroext i1 @abs_isinfq(fp128 %x) {
 ; P8-LABEL: abs_isinfq:
 ; P8:       # %bb.0: # %entry
-; P8-NEXT:    mflr 0
-; P8-NEXT:    stdu 1, -48(1)
-; P8-NEXT:    std 0, 64(1)
-; P8-NEXT:    .cfi_def_cfa_offset 48
-; P8-NEXT:    .cfi_offset lr, 16
 ; P8-NEXT:    xxswapd 0, 34
-; P8-NEXT:    addi 3, 1, 32
+; P8-NEXT:    addi 3, 1, -16
+; P8-NEXT:    li 5, 32767
 ; P8-NEXT:    stxvd2x 0, 0, 3
-; P8-NEXT:    lbz 4, 47(1)
-; P8-NEXT:    clrlwi 4, 4, 25
-; P8-NEXT:    stb 4, 47(1)
-; P8-NEXT:    lxvd2x 0, 0, 3
-; P8-NEXT:    addis 3, 2, .LCPI2_0@toc@ha
-; P8-NEXT:    addi 3, 3, .LCPI2_0@toc@l
-; P8-NEXT:    xxswapd 34, 0
-; P8-NEXT:    lxvd2x 0, 0, 3
-; P8-NEXT:    xxswapd 35, 0
-; P8-NEXT:    bl __eqkf2
-; P8-NEXT:    nop
-; P8-NEXT:    cntlzw 3, 3
-; P8-NEXT:    srwi 3, 3, 5
-; P8-NEXT:    addi 1, 1, 48
-; P8-NEXT:    ld 0, 16(1)
-; P8-NEXT:    mtlr 0
+; P8-NEXT:    rldic 5, 5, 48, 1
+; P8-NEXT:    ld 4, -8(1)
+; P8-NEXT:    ld 3, -16(1)
+; P8-NEXT:    clrldi 4, 4, 1
+; P8-NEXT:    xor 4, 4, 5
+; P8-NEXT:    or 3, 3, 4
+; P8-NEXT:    cntlzd 3, 3
+; P8-NEXT:    rldicl 3, 3, 58, 63
 ; P8-NEXT:    blr
 ;
 ; P9-LABEL: abs_isinfq:
@@ -99,12 +87,13 @@ entry:
 define zeroext i1 @abs_isinfornanf(float %x) {
 ; P8-LABEL: abs_isinfornanf:
 ; P8:       # %bb.0: # %entry
-; P8-NEXT:    addis 3, 2, .LCPI3_0@toc@ha
-; P8-NEXT:    xsabsdp 0, 1
-; P8-NEXT:    lfs 1, .LCPI3_0@toc@l(3)
-; P8-NEXT:    li 3, 1
-; P8-NEXT:    fcmpu 0, 0, 1
-; P8-NEXT:    isellt 3, 0, 3
+; P8-NEXT:    xscvdpspn 0, 1
+; P8-NEXT:    lis 4, 32639
+; P8-NEXT:    ori 4, 4, 65535
+; P8-NEXT:    mffprwz 3, 0
+; P8-NEXT:    clrlwi 3, 3, 1
+; P8-NEXT:    sub 3, 4, 3
+; P8-NEXT:    rldicl 3, 3, 1, 63
 ; P8-NEXT:    blr
 ;
 ; P9-LABEL: abs_isinfornanf:
@@ -123,12 +112,15 @@ entry:
 define zeroext i1 @abs_isinfornan(double %x) {
 ; P8-LABEL: abs_isinfornan:
 ; P8:       # %bb.0: # %entry
-; P8-NEXT:    addis 3, 2, .LCPI4_0@toc@ha
-; P8-NEXT:    xsabsdp 0, 1
-; P8-NEXT:    lfs 1, .LCPI4_0@toc@l(3)
-; P8-NEXT:    li 3, 1
-; P8-NEXT:    fcmpu 0, 0, 1
-; P8-NEXT:    isellt 3, 0, 3
+; P8-NEXT:    mffprd 3, 1
+; P8-NEXT:    li 4, -33
+; P8-NEXT:    rldicl 4, 4, 47, 1
+; P8-NEXT:    sradi 5, 4, 63
+; P8-NEXT:    clrldi 3, 3, 1
+; P8-NEXT:    rldicl 6, 3, 1, 63
+; P8-NEXT:    subc 3, 4, 3
+; P8-NEXT:    adde 3, 6, 5
+; P8-NEXT:    xori 3, 3, 1
 ; P8-NEXT:    blr
 ;
 ; P9-LABEL: abs_isinfornan:
@@ -147,53 +139,18 @@ entry:
 define zeroext i1 @abs_isinfornanq(fp128 %x) {
 ; P8-LABEL: abs_isinfornanq:
 ; P8:       # %bb.0: # %entry
-; P8-NEXT:    mflr 0
-; P8-NEXT:    stdu 1, -112(1)
-; P8-NEXT:    std 0, 128(1)
-; P8-NEXT:    .cfi_def_cfa_offset 112
-; P8-NEXT:    .cfi_offset lr, 16
-; P8-NEXT:    .cfi_offset r30, -16
-; P8-NEXT:    .cfi_offset v30, -48
-; P8-NEXT:    .cfi_offset v31, -32
-; P8-NEXT:    li 3, 64
 ; P8-NEXT:    xxswapd 0, 34
-; P8-NEXT:    std 30, 96(1) # 8-byte Folded Spill
-; P8-NEXT:    stvx 30, 1, 3 # 16-byte Folded Spill
-; P8-NEXT:    li 3, 80
-; P8-NEXT:    stvx 31, 1, 3 # 16-byte Folded Spill
-; P8-NEXT:    addi 3, 1, 48
+; P8-NEXT:    addi 3, 1, -16
+; P8-NEXT:    li 4, -3
 ; P8-NEXT:    stxvd2x 0, 0, 3
-; P8-NEXT:    lbz 4, 63(1)
-; P8-NEXT:    clrlwi 4, 4, 25
-; P8-NEXT:    stb 4, 63(1)
-; P8-NEXT:    lxvd2x 0, 0, 3
-; P8-NEXT:    addis 3, 2, .LCPI5_0@toc@ha
-; P8-NEXT:    addi 3, 3, .LCPI5_0@toc@l
-; P8-NEXT:    xxswapd 63, 0
-; P8-NEXT:    lxvd2x 0, 0, 3
-; P8-NEXT:    vmr 2, 31
-; P8-NEXT:    xxswapd 62, 0
-; P8-NEXT:    vmr 3, 30
-; P8-NEXT:    bl __eqkf2
-; P8-NEXT:    nop
-; P8-NEXT:    cntlzw 3, 3
-; P8-NEXT:    vmr 2, 31
-; P8-NEXT:    vmr 3, 30
-; P8-NEXT:    srwi 30, 3, 5
-; P8-NEXT:    bl __unordkf2
-; P8-NEXT:    nop
-; P8-NEXT:    cntlzw 3, 3
-; P8-NEXT:    li 4, 80
-; P8-NEXT:    lvx 31, 1, 4 # 16-byte Folded Reload
-; P8-NEXT:    li 4, 64
-; P8-NEXT:    srwi 3, 3, 5
-; P8-NEXT:    lvx 30, 1, 4 # 16-byte Folded Reload
+; P8-NEXT:    rldicl 4, 4, 47, 1
+; P8-NEXT:    ld 3, -8(1)
+; P8-NEXT:    sradi 5, 4, 63
+; P8-NEXT:    clrldi 3, 3, 1
+; P8-NEXT:    rldicl 6, 3, 1, 63
+; P8-NEXT:    subc 3, 4, 3
+; P8-NEXT:    adde 3, 6, 5
 ; P8-NEXT:    xori 3, 3, 1
-; P8-NEXT:    or 3, 3, 30
-; P8-NEXT:    ld 30, 96(1) # 8-byte Folded Reload
-; P8-NEXT:    addi 1, 1, 112
-; P8-NEXT:    ld 0, 16(1)
-; P8-NEXT:    mtlr 0
 ; P8-NEXT:    blr
 ;
 ; P9-LABEL: abs_isinfornanq:
diff --git a/llvm/test/CodeGen/PowerPC/rldimi.ll b/llvm/test/CodeGen/PowerPC/rldimi.ll
index 322975f547c996..78ea9aa862f2c2 100644
--- a/llvm/test/CodeGen/PowerPC/rldimi.ll
+++ b/llvm/test/CodeGen/PowerPC/rldimi.ll
@@ -59,8 +59,8 @@ entry:
   ret i64 %8
 }
 
-define i64 @rldimi_intrinsic(i64 %a) {
-; CHECK-LABEL: rldimi_intrinsic:
+define i64 @rldimi4(i64 %a) {
+; CHECK-LABEL: rldimi4:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    rldimi 3, 3, 8, 0
 ; CHECK-NEXT:    rldimi 3, 3, 16, 0
@@ -72,4 +72,71 @@ define i64 @rldimi_intrinsic(i64 %a) {
   ret i64 %r3
 }
 
+define i64 @rldimi5(i64 %a, i64 %b) {
+; CHECK-LABEL: rldimi5:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    rldimi 4, 3, 8, 40
+; CHECK-NEXT:    mr 3, 4
+; CHECK-NEXT:    blr
+  %r = call i64 @llvm.ppc.rldimi(i64 %a, i64 %b, i32 8, i64 16776960) ; 0xffff << 8
+  ret i64 %r
+}
+
+define i64 @rldimi6(i64 %a, i64 %b) {
+; CHECK-LABEL: rldimi6:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    rotldi 3, 3, 1
+; CHECK-NEXT:    rldimi 4, 3, 7, 41
+; CHECK-NEXT:    mr 3, 4
+; CHECK-NEXT:    blr
+  %r = call i64 @llvm.ppc.rldimi(i64 %a, i64 %b, i32 8, i64 8388480) ; 0xffff << 7
+  ret i64 %r
+}
+
+define i64 @rldimi7(i64 %a, i64 %b) {
+; CHECK-LABEL: rldimi7:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    rotldi 3, 3, 63
+; CHECK-NEXT:    rldimi 4, 3, 9, 39
+; CHECK-NEXT:    mr 3, 4
+; CHECK-NEXT:    blr
+  %r = call i64 @llvm.ppc.rldimi(i64 %a, i64 %b, i32 8, i64 33553920) ; 0xffff << 9
+  ret i64 %r
+}
+
+define i64 @rldimi8(i64 %a, i64 %b) {
+; CHECK-LABEL: rldimi8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    mr 3, 4
+; CHECK-NEXT:    blr
+  %r = call i64 @llvm.ppc.rldimi(i64 %a, i64 %b, i32 0, i64 0)
+  ret i64 %r
+}
+
+define i64 @rldimi9(i64 %a, i64 %b) {
+; CHECK-LABEL: rldimi9:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    mr 3, 4
+; CHECK-NEXT:    blr
+  %r = call i64 @llvm.ppc.rldimi(i64 %a, i64 %b, i32 63, i64 0)
+  ret i64 %r
+}
+
+define i64 @rldimi10(i64 %a, i64 %b) {
+; CHECK-LABEL: rldimi10:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    blr
+  %r = call i64 @llvm.ppc.rldimi(i64 %a, i64 %b, i32 0, i64 -1)
+  ret i64 %r
+}
+
+define i64 @rldimi11(i64 %a, i64 %b) {
+; CHECK-LABEL: rldimi11:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    rotldi 3, 3, 8
+; CHECK-NEXT:    blr
+  %r = call i64 @llvm.ppc.rldimi(i64 %a, i64 %b, i32 8, i64 -1)
+  ret i64 %r
+}
+
 declare i64 @llvm.ppc.rldimi(i64, i64, i32 immarg, i64 immarg)
diff --git a/llvm/test/CodeGen/PowerPC/rlwimi.ll b/llvm/test/CodeGen/PowerPC/rlwimi.ll
index 8b126cd3393c10..8da769508c9ae8 100644
--- a/llvm/test/CodeGen/PowerPC/rlwimi.ll
+++ b/llvm/test/CodeGen/PowerPC/rlwimi.ll
@@ -107,11 +107,51 @@ entry:
 define i32 @test9(i32 %a, i32 %b) {
 ; CHECK-LABEL: test9:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    rlwimi 3, 4, 8, 20, 26
+; CHECK-NEXT:    rlwimi 4, 3, 8, 20, 26
+; CHECK-NEXT:    mr 3, 4
 ; CHECK-NEXT:    blr
 entry:
   %r = call i32 @llvm.ppc.rlwimi(i32 %a, i32 %b, i32 8, i32 4064)
   ret i32 %r
 }
 
+define i32 @test10(i32 %a, i32 %b) {
+; CHECK-LABEL: test10:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    blr
+entry:
+  %r = call i32 @llvm.ppc.rlwimi(i32 %a, i32 %b, i32 0, i32 -1)
+  ret i32 %r
+}
+
+define i32 @test11(i32 %a, i32 %b) {
+; CHECK-LABEL: test11:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    rotlwi 3, 3, 8
+; CHECK-NEXT:    blr
+entry:
+  %r = call i32 @llvm.ppc.rlwimi(i32 %a, i32 %b, i32 8, i32 -1)
+  ret i32 %r
+}
+
+define i32 @test12(i32 %a, i32 %b) {
+; CHECK-LABEL: test12:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    mr 3, 4
+; CHECK-NEXT:    blr
+entry:
+  %r = call i32 @llvm.ppc.rlwimi(i32 %a, i32 %b, i32 0, i32 0)
+  ret i32 %r
+}
+
+define i32 @test13(i32 %a, i32 %b) {
+; CHECK-LABEL: test13:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    rlwimi 3, 4, 0, 27, 19
+; CHECK-NEXT:    blr
+entry:
+  %r = call i32 @llvm.ppc.rlwimi(i32 %a, i32 %b, i32 0, i32 4064)
+  ret i32 %r
+}
+
 declare i32 @llvm.ppc.rlwimi(i32, i32, i32 immarg, i32 immarg)
diff --git a/llvm/test/CodeGen/PowerPC/rlwinm.ll b/llvm/test/CodeGen/PowerPC/rlwinm.ll
index c6d4e5bb000040..363eb171276566 100644
--- a/llvm/test/CodeGen/PowerPC/rlwinm.ll
+++ b/llvm/test/CodeGen/PowerPC/rlwinm.ll
@@ -97,4 +97,24 @@ entry:
   ret i32 %r
 }
 
+define i32 @test10(i32 %a, i32 %s) {
+; CHECK-LABEL: test10:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    li 3, 0
+; CHECK-NEXT:    blr
+entry:
+  %r = call i32 @llvm.ppc.rlwnm(i32 %a, i32 %s, i32 0)
+  ret i32 %r
+}
+
+define i32 @test11(i32 %a, i32 %s) {
+; CHECK-LABEL: test11:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    rotlw 3, 3, 4
+; CHECK-NEXT:    blr
+entry:
+  %r = call i32 @llvm.ppc.rlwnm(i32 %a, i32 %s, i32 -1)
+  ret i32 %r
+}
+
 declare i32 @llvm.ppc.rlwnm(i32, i32, i32 immarg)
diff --git a/llvm/test/CodeGen/PowerPC/scalar-double-ldst.ll b/llvm/test/CodeGen/PowerPC/scalar-double-ldst.ll
index 6f68679325c579..798637b6840f1e 100644
--- a/llvm/test/CodeGen/PowerPC/scalar-double-ldst.ll
+++ b/llvm/test/CodeGen/PowerPC/scalar-double-ldst.ll
@@ -7281,3 +7281,61 @@ entry:
   store double %str, ptr inttoptr (i64 1000000000000 to ptr), align 4096
   ret void
 }
+
+define dso_local void @st_reversed_double_from_i8(ptr %ptr) {
+; CHECK-P10-LABEL: st_reversed_double_from_i8:
+; CHECK-P10:       # %bb.0: # %entry
+; CHECK-P10-NEXT:    li r4, 8
+; CHECK-P10-NEXT:    lxsibzx f0, 0, r3
+; CHECK-P10-NEXT:    xxspltidp vs2, -1023410176
+; CHECK-P10-NEXT:    lxsibzx f1, r3, r4
+; CHECK-P10-NEXT:    xscvuxddp f0, f0
+; CHECK-P10-NEXT:    xscvuxddp f1, f1
+; CHECK-P10-NEXT:    xsadddp f0, f0, f2
+; CHECK-P10-NEXT:    xsadddp f1, f1, f2
+; CHECK-P10-NEXT:    stfd f1, 0(r3)
+; CHECK-P10-NEXT:    stfd f0, 8(r3)
+; CHECK-P10-NEXT:    blr
+;
+; CHECK-P9-LABEL: st_reversed_double_from_i8:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    li r4, 8
+; CHECK-P9-NEXT:    lxsibzx f0, 0, r3
+; CHECK-P9-NEXT:    lxsibzx f1, r3, r4
+; CHECK-P9-NEXT:    addis r4, r2, .LCPI300_0@toc@ha
+; CHECK-P9-NEXT:    lfs f2, .LCPI300_0@toc@l(r4)
+; CHECK-P9-NEXT:    xscvuxddp f0, f0
+; CHECK-P9-NEXT:    xscvuxddp f1, f1
+; CHECK-P9-NEXT:    xsadddp f0, f0, f2
+; CHECK-P9-NEXT:    xsadddp f1, f1, f2
+; CHECK-P9-NEXT:    stfd f0, 8(r3)
+; CHECK-P9-NEXT:    stfd f1, 0(r3)
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-P8-LABEL: st_reversed_double_from_i8:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    lbz r4, 0(r3)
+; CHECK-P8-NEXT:    lbz r5, 8(r3)
+; CHECK-P8-NEXT:    mtfprwz f0, r4
+; CHECK-P8-NEXT:    mtfprwz f1, r5
+; CHECK-P8-NEXT:    addis r4, r2, .LCPI300_0@toc@ha
+; CHECK-P8-NEXT:    lfs f2, .LCPI300_0@toc@l(r4)
+; CHECK-P8-NEXT:    xscvuxddp f0, f0
+; CHECK-P8-NEXT:    xscvuxddp f1, f1
+; CHECK-P8-NEXT:    xsadddp f0, f0, f2
+; CHECK-P8-NEXT:    xsadddp f1, f1, f2
+; CHECK-P8-NEXT:    stfd f1, 0(r3)
+; CHECK-P8-NEXT:    stfd f0, 8(r3)
+; CHECK-P8-NEXT:    blr
+entry:
+  %idx = getelementptr inbounds i8, ptr %ptr, i64 8
+  %i0 = load i8, ptr %ptr, align 1
+  %i1 = load i8, ptr %idx, align 1
+  %f0 = uitofp i8 %i0 to double
+  %f1 = uitofp i8 %i1 to double
+  %a0 = fadd double %f0, -1.280000e+02
+  %a1 = fadd double %f1, -1.280000e+02
+  store double %a1, ptr %ptr, align 8
+  store double %a0, ptr %idx, align 8
+  ret void
+}
diff --git a/llvm/test/CodeGen/PowerPC/scalar-float-ldst.ll b/llvm/test/CodeGen/PowerPC/scalar-float-ldst.ll
index 824dd4c4db6cb7..f3960573421298 100644
--- a/llvm/test/CodeGen/PowerPC/scalar-float-ldst.ll
+++ b/llvm/test/CodeGen/PowerPC/scalar-float-ldst.ll
@@ -7271,3 +7271,61 @@ entry:
   store double %conv, ptr inttoptr (i64 1000000000000 to ptr), align 4096
   ret void
 }
+
+define dso_local void @st_reversed_float_from_i8(ptr %ptr) {
+; CHECK-P10-LABEL: st_reversed_float_from_i8:
+; CHECK-P10:       # %bb.0: # %entry
+; CHECK-P10-NEXT:    li r4, 8
+; CHECK-P10-NEXT:    lxsibzx f0, 0, r3
+; CHECK-P10-NEXT:    xxspltidp vs2, -1023410176
+; CHECK-P10-NEXT:    lxsibzx f1, r3, r4
+; CHECK-P10-NEXT:    xscvuxdsp f0, f0
+; CHECK-P10-NEXT:    xscvuxdsp f1, f1
+; CHECK-P10-NEXT:    xsaddsp f0, f0, f2
+; CHECK-P10-NEXT:    xsaddsp f1, f1, f2
+; CHECK-P10-NEXT:    stfs f0, 8(r3)
+; CHECK-P10-NEXT:    stfs f1, 0(r3)
+; CHECK-P10-NEXT:    blr
+;
+; CHECK-P9-LABEL: st_reversed_float_from_i8:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    li r4, 8
+; CHECK-P9-NEXT:    lxsibzx f0, 0, r3
+; CHECK-P9-NEXT:    lxsibzx f1, r3, r4
+; CHECK-P9-NEXT:    addis r4, r2, .LCPI300_0@toc@ha
+; CHECK-P9-NEXT:    lfs f2, .LCPI300_0@toc@l(r4)
+; CHECK-P9-NEXT:    xscvuxdsp f0, f0
+; CHECK-P9-NEXT:    xscvuxdsp f1, f1
+; CHECK-P9-NEXT:    xsaddsp f0, f0, f2
+; CHECK-P9-NEXT:    xsaddsp f1, f1, f2
+; CHECK-P9-NEXT:    stfs f0, 8(r3)
+; CHECK-P9-NEXT:    stfs f1, 0(r3)
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-P8-LABEL: st_reversed_float_from_i8:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    lbz r4, 0(r3)
+; CHECK-P8-NEXT:    lbz r5, 8(r3)
+; CHECK-P8-NEXT:    mtfprwz f0, r4
+; CHECK-P8-NEXT:    mtfprwz f1, r5
+; CHECK-P8-NEXT:    addis r4, r2, .LCPI300_0@toc@ha
+; CHECK-P8-NEXT:    lfs f2, .LCPI300_0@toc@l(r4)
+; CHECK-P8-NEXT:    xscvuxdsp f0, f0
+; CHECK-P8-NEXT:    xscvuxdsp f1, f1
+; CHECK-P8-NEXT:    xsaddsp f0, f0, f2
+; CHECK-P8-NEXT:    xsaddsp f1, f1, f2
+; CHECK-P8-NEXT:    stfs f1, 0(r3)
+; CHECK-P8-NEXT:    stfs f0, 8(r3)
+; CHECK-P8-NEXT:    blr
+entry:
+  %idx = getelementptr inbounds i8, ptr %ptr, i64 8
+  %i0 = load i8, ptr %ptr, align 1
+  %i1 = load i8, ptr %idx, align 1
+  %f0 = uitofp i8 %i0 to float
+  %f1 = uitofp i8 %i1 to float
+  %a0 = fadd float %f0, -1.280000e+02
+  %a1 = fadd float %f1, -1.280000e+02
+  store float %a1, ptr %ptr, align 8
+  store float %a0, ptr %idx, align 8
+  ret void
+}
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/irtranslator/vec-ld.ll b/llvm/test/CodeGen/RISCV/GlobalISel/irtranslator/vec-ld.ll
new file mode 100644
index 00000000000000..31b3c3fe3c5be8
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/irtranslator/vec-ld.ll
@@ -0,0 +1,948 @@
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+; RUN: llc -mtriple=riscv32 -mattr=+v -global-isel -stop-after=irtranslator -verify-machineinstrs < %s | FileCheck  -check-prefixes=RV32 %s
+; RUN: llc -mtriple=riscv64 -mattr=+v -global-isel -stop-after=irtranslator -verify-machineinstrs < %s | FileCheck  -check-prefixes=RV64 %s
+
+define <vscale x 1 x i8>  @vload_nx1i8(ptr %pa) {
+  ; RV32-LABEL: name: vload_nx1i8
+  ; RV32: bb.1 (%ir-block.0):
+  ; RV32-NEXT:   liveins: $x10
+  ; RV32-NEXT: {{  $}}
+  ; RV32-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV32-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 1 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 1 x s8>) from %ir.pa)
+  ; RV32-NEXT:   $v8 = COPY [[LOAD]](<vscale x 1 x s8>)
+  ; RV32-NEXT:   PseudoRET implicit $v8
+  ;
+  ; RV64-LABEL: name: vload_nx1i8
+  ; RV64: bb.1 (%ir-block.0):
+  ; RV64-NEXT:   liveins: $x10
+  ; RV64-NEXT: {{  $}}
+  ; RV64-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV64-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 1 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 1 x s8>) from %ir.pa)
+  ; RV64-NEXT:   $v8 = COPY [[LOAD]](<vscale x 1 x s8>)
+  ; RV64-NEXT:   PseudoRET implicit $v8
+  %va = load <vscale x 1 x i8>, ptr %pa
+  ret <vscale x 1 x i8> %va
+}
+
+define <vscale x 2 x i8>  @vload_nx2i8(ptr %pa) {
+  ; RV32-LABEL: name: vload_nx2i8
+  ; RV32: bb.1 (%ir-block.0):
+  ; RV32-NEXT:   liveins: $x10
+  ; RV32-NEXT: {{  $}}
+  ; RV32-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV32-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 2 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x s8>) from %ir.pa)
+  ; RV32-NEXT:   $v8 = COPY [[LOAD]](<vscale x 2 x s8>)
+  ; RV32-NEXT:   PseudoRET implicit $v8
+  ;
+  ; RV64-LABEL: name: vload_nx2i8
+  ; RV64: bb.1 (%ir-block.0):
+  ; RV64-NEXT:   liveins: $x10
+  ; RV64-NEXT: {{  $}}
+  ; RV64-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV64-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 2 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x s8>) from %ir.pa)
+  ; RV64-NEXT:   $v8 = COPY [[LOAD]](<vscale x 2 x s8>)
+  ; RV64-NEXT:   PseudoRET implicit $v8
+  %va = load <vscale x 2 x i8>, ptr %pa
+  ret <vscale x 2 x i8> %va
+}
+
+define <vscale x 4 x i8>  @vload_nx4i8(ptr %pa) {
+  ; RV32-LABEL: name: vload_nx4i8
+  ; RV32: bb.1 (%ir-block.0):
+  ; RV32-NEXT:   liveins: $x10
+  ; RV32-NEXT: {{  $}}
+  ; RV32-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV32-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 4 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 4 x s8>) from %ir.pa)
+  ; RV32-NEXT:   $v8 = COPY [[LOAD]](<vscale x 4 x s8>)
+  ; RV32-NEXT:   PseudoRET implicit $v8
+  ;
+  ; RV64-LABEL: name: vload_nx4i8
+  ; RV64: bb.1 (%ir-block.0):
+  ; RV64-NEXT:   liveins: $x10
+  ; RV64-NEXT: {{  $}}
+  ; RV64-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV64-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 4 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 4 x s8>) from %ir.pa)
+  ; RV64-NEXT:   $v8 = COPY [[LOAD]](<vscale x 4 x s8>)
+  ; RV64-NEXT:   PseudoRET implicit $v8
+  %va = load <vscale x 4 x i8>, ptr %pa
+  ret <vscale x 4 x i8> %va
+}
+
+define <vscale x 8 x i8>  @vload_nx8i8(ptr %pa) {
+  ; RV32-LABEL: name: vload_nx8i8
+  ; RV32: bb.1 (%ir-block.0):
+  ; RV32-NEXT:   liveins: $x10
+  ; RV32-NEXT: {{  $}}
+  ; RV32-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV32-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 8 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 8 x s8>) from %ir.pa)
+  ; RV32-NEXT:   $v8 = COPY [[LOAD]](<vscale x 8 x s8>)
+  ; RV32-NEXT:   PseudoRET implicit $v8
+  ;
+  ; RV64-LABEL: name: vload_nx8i8
+  ; RV64: bb.1 (%ir-block.0):
+  ; RV64-NEXT:   liveins: $x10
+  ; RV64-NEXT: {{  $}}
+  ; RV64-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV64-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 8 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 8 x s8>) from %ir.pa)
+  ; RV64-NEXT:   $v8 = COPY [[LOAD]](<vscale x 8 x s8>)
+  ; RV64-NEXT:   PseudoRET implicit $v8
+  %va = load <vscale x 8 x i8>, ptr %pa
+  ret <vscale x 8 x i8> %va
+}
+
+define <vscale x 16 x i8>  @vload_nx16i8(ptr %pa) {
+  ; RV32-LABEL: name: vload_nx16i8
+  ; RV32: bb.1 (%ir-block.0):
+  ; RV32-NEXT:   liveins: $x10
+  ; RV32-NEXT: {{  $}}
+  ; RV32-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV32-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 16 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 16 x s8>) from %ir.pa)
+  ; RV32-NEXT:   $v8m2 = COPY [[LOAD]](<vscale x 16 x s8>)
+  ; RV32-NEXT:   PseudoRET implicit $v8m2
+  ;
+  ; RV64-LABEL: name: vload_nx16i8
+  ; RV64: bb.1 (%ir-block.0):
+  ; RV64-NEXT:   liveins: $x10
+  ; RV64-NEXT: {{  $}}
+  ; RV64-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV64-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 16 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 16 x s8>) from %ir.pa)
+  ; RV64-NEXT:   $v8m2 = COPY [[LOAD]](<vscale x 16 x s8>)
+  ; RV64-NEXT:   PseudoRET implicit $v8m2
+  %va = load <vscale x 16 x i8>, ptr %pa
+  ret <vscale x 16 x i8> %va
+}
+
+define <vscale x 32 x i8>  @vload_nx32i8(ptr %pa) {
+  ; RV32-LABEL: name: vload_nx32i8
+  ; RV32: bb.1 (%ir-block.0):
+  ; RV32-NEXT:   liveins: $x10
+  ; RV32-NEXT: {{  $}}
+  ; RV32-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV32-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 32 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 32 x s8>) from %ir.pa)
+  ; RV32-NEXT:   $v8m4 = COPY [[LOAD]](<vscale x 32 x s8>)
+  ; RV32-NEXT:   PseudoRET implicit $v8m4
+  ;
+  ; RV64-LABEL: name: vload_nx32i8
+  ; RV64: bb.1 (%ir-block.0):
+  ; RV64-NEXT:   liveins: $x10
+  ; RV64-NEXT: {{  $}}
+  ; RV64-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV64-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 32 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 32 x s8>) from %ir.pa)
+  ; RV64-NEXT:   $v8m4 = COPY [[LOAD]](<vscale x 32 x s8>)
+  ; RV64-NEXT:   PseudoRET implicit $v8m4
+  %va = load <vscale x 32 x i8>, ptr %pa
+  ret <vscale x 32 x i8> %va
+}
+
+define <vscale x 64 x i8>  @vload_nx64i8(ptr %pa) {
+  ; RV32-LABEL: name: vload_nx64i8
+  ; RV32: bb.1 (%ir-block.0):
+  ; RV32-NEXT:   liveins: $x10
+  ; RV32-NEXT: {{  $}}
+  ; RV32-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV32-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 64 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 64 x s8>) from %ir.pa)
+  ; RV32-NEXT:   $v8m8 = COPY [[LOAD]](<vscale x 64 x s8>)
+  ; RV32-NEXT:   PseudoRET implicit $v8m8
+  ;
+  ; RV64-LABEL: name: vload_nx64i8
+  ; RV64: bb.1 (%ir-block.0):
+  ; RV64-NEXT:   liveins: $x10
+  ; RV64-NEXT: {{  $}}
+  ; RV64-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV64-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 64 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 64 x s8>) from %ir.pa)
+  ; RV64-NEXT:   $v8m8 = COPY [[LOAD]](<vscale x 64 x s8>)
+  ; RV64-NEXT:   PseudoRET implicit $v8m8
+  %va = load <vscale x 64 x i8>, ptr %pa
+  ret <vscale x 64 x i8> %va
+}
+
+define <vscale x 1 x i16>  @vload_nx1i16(ptr %pa) {
+  ; RV32-LABEL: name: vload_nx1i16
+  ; RV32: bb.1 (%ir-block.0):
+  ; RV32-NEXT:   liveins: $x10
+  ; RV32-NEXT: {{  $}}
+  ; RV32-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV32-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 1 x s16>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 1 x s16>) from %ir.pa)
+  ; RV32-NEXT:   $v8 = COPY [[LOAD]](<vscale x 1 x s16>)
+  ; RV32-NEXT:   PseudoRET implicit $v8
+  ;
+  ; RV64-LABEL: name: vload_nx1i16
+  ; RV64: bb.1 (%ir-block.0):
+  ; RV64-NEXT:   liveins: $x10
+  ; RV64-NEXT: {{  $}}
+  ; RV64-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV64-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 1 x s16>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 1 x s16>) from %ir.pa)
+  ; RV64-NEXT:   $v8 = COPY [[LOAD]](<vscale x 1 x s16>)
+  ; RV64-NEXT:   PseudoRET implicit $v8
+  %va = load <vscale x 1 x i16>, ptr %pa
+  ret <vscale x 1 x i16> %va
+}
+
+define <vscale x 2 x i16>  @vload_nx2i16(ptr %pa) {
+  ; RV32-LABEL: name: vload_nx2i16
+  ; RV32: bb.1 (%ir-block.0):
+  ; RV32-NEXT:   liveins: $x10
+  ; RV32-NEXT: {{  $}}
+  ; RV32-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV32-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 2 x s16>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x s16>) from %ir.pa)
+  ; RV32-NEXT:   $v8 = COPY [[LOAD]](<vscale x 2 x s16>)
+  ; RV32-NEXT:   PseudoRET implicit $v8
+  ;
+  ; RV64-LABEL: name: vload_nx2i16
+  ; RV64: bb.1 (%ir-block.0):
+  ; RV64-NEXT:   liveins: $x10
+  ; RV64-NEXT: {{  $}}
+  ; RV64-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV64-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 2 x s16>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x s16>) from %ir.pa)
+  ; RV64-NEXT:   $v8 = COPY [[LOAD]](<vscale x 2 x s16>)
+  ; RV64-NEXT:   PseudoRET implicit $v8
+  %va = load <vscale x 2 x i16>, ptr %pa
+  ret <vscale x 2 x i16> %va
+}
+
+define <vscale x 4 x i16>  @vload_nx4i16(ptr %pa) {
+  ; RV32-LABEL: name: vload_nx4i16
+  ; RV32: bb.1 (%ir-block.0):
+  ; RV32-NEXT:   liveins: $x10
+  ; RV32-NEXT: {{  $}}
+  ; RV32-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV32-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 4 x s16>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 4 x s16>) from %ir.pa)
+  ; RV32-NEXT:   $v8 = COPY [[LOAD]](<vscale x 4 x s16>)
+  ; RV32-NEXT:   PseudoRET implicit $v8
+  ;
+  ; RV64-LABEL: name: vload_nx4i16
+  ; RV64: bb.1 (%ir-block.0):
+  ; RV64-NEXT:   liveins: $x10
+  ; RV64-NEXT: {{  $}}
+  ; RV64-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV64-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 4 x s16>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 4 x s16>) from %ir.pa)
+  ; RV64-NEXT:   $v8 = COPY [[LOAD]](<vscale x 4 x s16>)
+  ; RV64-NEXT:   PseudoRET implicit $v8
+  %va = load <vscale x 4 x i16>, ptr %pa
+  ret <vscale x 4 x i16> %va
+}
+
+define <vscale x 8 x i16>  @vload_nx8i16(ptr %pa) {
+  ; RV32-LABEL: name: vload_nx8i16
+  ; RV32: bb.1 (%ir-block.0):
+  ; RV32-NEXT:   liveins: $x10
+  ; RV32-NEXT: {{  $}}
+  ; RV32-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV32-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 8 x s16>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 8 x s16>) from %ir.pa)
+  ; RV32-NEXT:   $v8m2 = COPY [[LOAD]](<vscale x 8 x s16>)
+  ; RV32-NEXT:   PseudoRET implicit $v8m2
+  ;
+  ; RV64-LABEL: name: vload_nx8i16
+  ; RV64: bb.1 (%ir-block.0):
+  ; RV64-NEXT:   liveins: $x10
+  ; RV64-NEXT: {{  $}}
+  ; RV64-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV64-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 8 x s16>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 8 x s16>) from %ir.pa)
+  ; RV64-NEXT:   $v8m2 = COPY [[LOAD]](<vscale x 8 x s16>)
+  ; RV64-NEXT:   PseudoRET implicit $v8m2
+  %va = load <vscale x 8 x i16>, ptr %pa
+  ret <vscale x 8 x i16> %va
+}
+
+define <vscale x 16 x i16>  @vload_nx16i16(ptr %pa) {
+  ; RV32-LABEL: name: vload_nx16i16
+  ; RV32: bb.1 (%ir-block.0):
+  ; RV32-NEXT:   liveins: $x10
+  ; RV32-NEXT: {{  $}}
+  ; RV32-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV32-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 16 x s16>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 16 x s16>) from %ir.pa)
+  ; RV32-NEXT:   $v8m4 = COPY [[LOAD]](<vscale x 16 x s16>)
+  ; RV32-NEXT:   PseudoRET implicit $v8m4
+  ;
+  ; RV64-LABEL: name: vload_nx16i16
+  ; RV64: bb.1 (%ir-block.0):
+  ; RV64-NEXT:   liveins: $x10
+  ; RV64-NEXT: {{  $}}
+  ; RV64-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV64-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 16 x s16>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 16 x s16>) from %ir.pa)
+  ; RV64-NEXT:   $v8m4 = COPY [[LOAD]](<vscale x 16 x s16>)
+  ; RV64-NEXT:   PseudoRET implicit $v8m4
+  %va = load <vscale x 16 x i16>, ptr %pa
+  ret <vscale x 16 x i16> %va
+}
+
+define <vscale x 32 x i16>  @vload_nx32i16(ptr %pa) {
+  ; RV32-LABEL: name: vload_nx32i16
+  ; RV32: bb.1 (%ir-block.0):
+  ; RV32-NEXT:   liveins: $x10
+  ; RV32-NEXT: {{  $}}
+  ; RV32-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV32-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 32 x s16>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 32 x s16>) from %ir.pa)
+  ; RV32-NEXT:   $v8m8 = COPY [[LOAD]](<vscale x 32 x s16>)
+  ; RV32-NEXT:   PseudoRET implicit $v8m8
+  ;
+  ; RV64-LABEL: name: vload_nx32i16
+  ; RV64: bb.1 (%ir-block.0):
+  ; RV64-NEXT:   liveins: $x10
+  ; RV64-NEXT: {{  $}}
+  ; RV64-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV64-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 32 x s16>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 32 x s16>) from %ir.pa)
+  ; RV64-NEXT:   $v8m8 = COPY [[LOAD]](<vscale x 32 x s16>)
+  ; RV64-NEXT:   PseudoRET implicit $v8m8
+  %va = load <vscale x 32 x i16>, ptr %pa
+  ret <vscale x 32 x i16> %va
+}
+
+define <vscale x 1 x i32>  @vload_nx1i32(ptr %pa) {
+  ; RV32-LABEL: name: vload_nx1i32
+  ; RV32: bb.1 (%ir-block.0):
+  ; RV32-NEXT:   liveins: $x10
+  ; RV32-NEXT: {{  $}}
+  ; RV32-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV32-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 1 x s32>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 1 x s32>) from %ir.pa)
+  ; RV32-NEXT:   $v8 = COPY [[LOAD]](<vscale x 1 x s32>)
+  ; RV32-NEXT:   PseudoRET implicit $v8
+  ;
+  ; RV64-LABEL: name: vload_nx1i32
+  ; RV64: bb.1 (%ir-block.0):
+  ; RV64-NEXT:   liveins: $x10
+  ; RV64-NEXT: {{  $}}
+  ; RV64-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV64-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 1 x s32>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 1 x s32>) from %ir.pa)
+  ; RV64-NEXT:   $v8 = COPY [[LOAD]](<vscale x 1 x s32>)
+  ; RV64-NEXT:   PseudoRET implicit $v8
+  %va = load <vscale x 1 x i32>, ptr %pa
+  ret <vscale x 1 x i32> %va
+}
+
+define <vscale x 2 x i32>  @vload_nx2i32(ptr %pa) {
+  ; RV32-LABEL: name: vload_nx2i32
+  ; RV32: bb.1 (%ir-block.0):
+  ; RV32-NEXT:   liveins: $x10
+  ; RV32-NEXT: {{  $}}
+  ; RV32-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV32-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 2 x s32>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x s32>) from %ir.pa)
+  ; RV32-NEXT:   $v8 = COPY [[LOAD]](<vscale x 2 x s32>)
+  ; RV32-NEXT:   PseudoRET implicit $v8
+  ;
+  ; RV64-LABEL: name: vload_nx2i32
+  ; RV64: bb.1 (%ir-block.0):
+  ; RV64-NEXT:   liveins: $x10
+  ; RV64-NEXT: {{  $}}
+  ; RV64-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV64-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 2 x s32>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x s32>) from %ir.pa)
+  ; RV64-NEXT:   $v8 = COPY [[LOAD]](<vscale x 2 x s32>)
+  ; RV64-NEXT:   PseudoRET implicit $v8
+  %va = load <vscale x 2 x i32>, ptr %pa
+  ret <vscale x 2 x i32> %va
+}
+
+define <vscale x 4 x i32>  @vload_nx4i32(ptr %pa) {
+  ; RV32-LABEL: name: vload_nx4i32
+  ; RV32: bb.1 (%ir-block.0):
+  ; RV32-NEXT:   liveins: $x10
+  ; RV32-NEXT: {{  $}}
+  ; RV32-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV32-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 4 x s32>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 4 x s32>) from %ir.pa)
+  ; RV32-NEXT:   $v8m2 = COPY [[LOAD]](<vscale x 4 x s32>)
+  ; RV32-NEXT:   PseudoRET implicit $v8m2
+  ;
+  ; RV64-LABEL: name: vload_nx4i32
+  ; RV64: bb.1 (%ir-block.0):
+  ; RV64-NEXT:   liveins: $x10
+  ; RV64-NEXT: {{  $}}
+  ; RV64-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV64-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 4 x s32>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 4 x s32>) from %ir.pa)
+  ; RV64-NEXT:   $v8m2 = COPY [[LOAD]](<vscale x 4 x s32>)
+  ; RV64-NEXT:   PseudoRET implicit $v8m2
+  %va = load <vscale x 4 x i32>, ptr %pa
+  ret <vscale x 4 x i32> %va
+}
+
+define <vscale x 8 x i32>  @vload_nx8i32(ptr %pa) {
+  ; RV32-LABEL: name: vload_nx8i32
+  ; RV32: bb.1 (%ir-block.0):
+  ; RV32-NEXT:   liveins: $x10
+  ; RV32-NEXT: {{  $}}
+  ; RV32-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV32-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 8 x s32>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 8 x s32>) from %ir.pa)
+  ; RV32-NEXT:   $v8m4 = COPY [[LOAD]](<vscale x 8 x s32>)
+  ; RV32-NEXT:   PseudoRET implicit $v8m4
+  ;
+  ; RV64-LABEL: name: vload_nx8i32
+  ; RV64: bb.1 (%ir-block.0):
+  ; RV64-NEXT:   liveins: $x10
+  ; RV64-NEXT: {{  $}}
+  ; RV64-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV64-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 8 x s32>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 8 x s32>) from %ir.pa)
+  ; RV64-NEXT:   $v8m4 = COPY [[LOAD]](<vscale x 8 x s32>)
+  ; RV64-NEXT:   PseudoRET implicit $v8m4
+  %va = load <vscale x 8 x i32>, ptr %pa
+  ret <vscale x 8 x i32> %va
+}
+
+define <vscale x 16 x i32>  @vload_nx16i32(ptr %pa) {
+  ; RV32-LABEL: name: vload_nx16i32
+  ; RV32: bb.1 (%ir-block.0):
+  ; RV32-NEXT:   liveins: $x10
+  ; RV32-NEXT: {{  $}}
+  ; RV32-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV32-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 16 x s32>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 16 x s32>) from %ir.pa)
+  ; RV32-NEXT:   $v8m8 = COPY [[LOAD]](<vscale x 16 x s32>)
+  ; RV32-NEXT:   PseudoRET implicit $v8m8
+  ;
+  ; RV64-LABEL: name: vload_nx16i32
+  ; RV64: bb.1 (%ir-block.0):
+  ; RV64-NEXT:   liveins: $x10
+  ; RV64-NEXT: {{  $}}
+  ; RV64-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV64-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 16 x s32>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 16 x s32>) from %ir.pa)
+  ; RV64-NEXT:   $v8m8 = COPY [[LOAD]](<vscale x 16 x s32>)
+  ; RV64-NEXT:   PseudoRET implicit $v8m8
+  %va = load <vscale x 16 x i32>, ptr %pa
+  ret <vscale x 16 x i32> %va
+}
+
+define <vscale x 1 x i64>  @vload_nx1i64(ptr %pa) {
+  ; RV32-LABEL: name: vload_nx1i64
+  ; RV32: bb.1 (%ir-block.0):
+  ; RV32-NEXT:   liveins: $x10
+  ; RV32-NEXT: {{  $}}
+  ; RV32-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV32-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 1 x s64>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 1 x s64>) from %ir.pa)
+  ; RV32-NEXT:   $v8 = COPY [[LOAD]](<vscale x 1 x s64>)
+  ; RV32-NEXT:   PseudoRET implicit $v8
+  ;
+  ; RV64-LABEL: name: vload_nx1i64
+  ; RV64: bb.1 (%ir-block.0):
+  ; RV64-NEXT:   liveins: $x10
+  ; RV64-NEXT: {{  $}}
+  ; RV64-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV64-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 1 x s64>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 1 x s64>) from %ir.pa)
+  ; RV64-NEXT:   $v8 = COPY [[LOAD]](<vscale x 1 x s64>)
+  ; RV64-NEXT:   PseudoRET implicit $v8
+  %va = load <vscale x 1 x i64>, ptr %pa
+  ret <vscale x 1 x i64> %va
+}
+
+define <vscale x 2 x i64>  @vload_nx2i64(ptr %pa) {
+  ; RV32-LABEL: name: vload_nx2i64
+  ; RV32: bb.1 (%ir-block.0):
+  ; RV32-NEXT:   liveins: $x10
+  ; RV32-NEXT: {{  $}}
+  ; RV32-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV32-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x s64>) from %ir.pa)
+  ; RV32-NEXT:   $v8m2 = COPY [[LOAD]](<vscale x 2 x s64>)
+  ; RV32-NEXT:   PseudoRET implicit $v8m2
+  ;
+  ; RV64-LABEL: name: vload_nx2i64
+  ; RV64: bb.1 (%ir-block.0):
+  ; RV64-NEXT:   liveins: $x10
+  ; RV64-NEXT: {{  $}}
+  ; RV64-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV64-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x s64>) from %ir.pa)
+  ; RV64-NEXT:   $v8m2 = COPY [[LOAD]](<vscale x 2 x s64>)
+  ; RV64-NEXT:   PseudoRET implicit $v8m2
+  %va = load <vscale x 2 x i64>, ptr %pa
+  ret <vscale x 2 x i64> %va
+}
+
+define <vscale x 4 x i64>  @vload_nx4i64(ptr %pa) {
+  ; RV32-LABEL: name: vload_nx4i64
+  ; RV32: bb.1 (%ir-block.0):
+  ; RV32-NEXT:   liveins: $x10
+  ; RV32-NEXT: {{  $}}
+  ; RV32-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV32-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 4 x s64>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 4 x s64>) from %ir.pa)
+  ; RV32-NEXT:   $v8m4 = COPY [[LOAD]](<vscale x 4 x s64>)
+  ; RV32-NEXT:   PseudoRET implicit $v8m4
+  ;
+  ; RV64-LABEL: name: vload_nx4i64
+  ; RV64: bb.1 (%ir-block.0):
+  ; RV64-NEXT:   liveins: $x10
+  ; RV64-NEXT: {{  $}}
+  ; RV64-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV64-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 4 x s64>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 4 x s64>) from %ir.pa)
+  ; RV64-NEXT:   $v8m4 = COPY [[LOAD]](<vscale x 4 x s64>)
+  ; RV64-NEXT:   PseudoRET implicit $v8m4
+  %va = load <vscale x 4 x i64>, ptr %pa
+  ret <vscale x 4 x i64> %va
+}
+
+define <vscale x 8 x i64>  @vload_nx8i64(ptr %pa) {
+  ; RV32-LABEL: name: vload_nx8i64
+  ; RV32: bb.1 (%ir-block.0):
+  ; RV32-NEXT:   liveins: $x10
+  ; RV32-NEXT: {{  $}}
+  ; RV32-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV32-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 8 x s64>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 8 x s64>) from %ir.pa)
+  ; RV32-NEXT:   $v8m8 = COPY [[LOAD]](<vscale x 8 x s64>)
+  ; RV32-NEXT:   PseudoRET implicit $v8m8
+  ;
+  ; RV64-LABEL: name: vload_nx8i64
+  ; RV64: bb.1 (%ir-block.0):
+  ; RV64-NEXT:   liveins: $x10
+  ; RV64-NEXT: {{  $}}
+  ; RV64-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV64-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 8 x s64>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 8 x s64>) from %ir.pa)
+  ; RV64-NEXT:   $v8m8 = COPY [[LOAD]](<vscale x 8 x s64>)
+  ; RV64-NEXT:   PseudoRET implicit $v8m8
+  %va = load <vscale x 8 x i64>, ptr %pa
+  ret <vscale x 8 x i64> %va
+}
+
+define <vscale x 16 x i8>  @vload_nx16i8_align1(ptr %pa) {
+  ; RV32-LABEL: name: vload_nx16i8_align1
+  ; RV32: bb.1 (%ir-block.0):
+  ; RV32-NEXT:   liveins: $x10
+  ; RV32-NEXT: {{  $}}
+  ; RV32-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV32-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 16 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 16 x s8>) from %ir.pa, align 1)
+  ; RV32-NEXT:   $v8m2 = COPY [[LOAD]](<vscale x 16 x s8>)
+  ; RV32-NEXT:   PseudoRET implicit $v8m2
+  ;
+  ; RV64-LABEL: name: vload_nx16i8_align1
+  ; RV64: bb.1 (%ir-block.0):
+  ; RV64-NEXT:   liveins: $x10
+  ; RV64-NEXT: {{  $}}
+  ; RV64-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV64-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 16 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 16 x s8>) from %ir.pa, align 1)
+  ; RV64-NEXT:   $v8m2 = COPY [[LOAD]](<vscale x 16 x s8>)
+  ; RV64-NEXT:   PseudoRET implicit $v8m2
+  %va = load <vscale x 16 x i8>, ptr %pa, align 1
+  ret <vscale x 16 x i8> %va
+}
+
+define <vscale x 16 x i8>  @vload_nx16i8_align2(ptr %pa) {
+  ; RV32-LABEL: name: vload_nx16i8_align2
+  ; RV32: bb.1 (%ir-block.0):
+  ; RV32-NEXT:   liveins: $x10
+  ; RV32-NEXT: {{  $}}
+  ; RV32-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV32-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 16 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 16 x s8>) from %ir.pa, align 2)
+  ; RV32-NEXT:   $v8m2 = COPY [[LOAD]](<vscale x 16 x s8>)
+  ; RV32-NEXT:   PseudoRET implicit $v8m2
+  ;
+  ; RV64-LABEL: name: vload_nx16i8_align2
+  ; RV64: bb.1 (%ir-block.0):
+  ; RV64-NEXT:   liveins: $x10
+  ; RV64-NEXT: {{  $}}
+  ; RV64-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV64-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 16 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 16 x s8>) from %ir.pa, align 2)
+  ; RV64-NEXT:   $v8m2 = COPY [[LOAD]](<vscale x 16 x s8>)
+  ; RV64-NEXT:   PseudoRET implicit $v8m2
+  %va = load <vscale x 16 x i8>, ptr %pa, align 2
+  ret <vscale x 16 x i8> %va
+}
+
+define <vscale x 16 x i8>  @vload_nx16i8_align16(ptr %pa) {
+  ; RV32-LABEL: name: vload_nx16i8_align16
+  ; RV32: bb.1 (%ir-block.0):
+  ; RV32-NEXT:   liveins: $x10
+  ; RV32-NEXT: {{  $}}
+  ; RV32-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV32-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 16 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 16 x s8>) from %ir.pa)
+  ; RV32-NEXT:   $v8m2 = COPY [[LOAD]](<vscale x 16 x s8>)
+  ; RV32-NEXT:   PseudoRET implicit $v8m2
+  ;
+  ; RV64-LABEL: name: vload_nx16i8_align16
+  ; RV64: bb.1 (%ir-block.0):
+  ; RV64-NEXT:   liveins: $x10
+  ; RV64-NEXT: {{  $}}
+  ; RV64-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV64-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 16 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 16 x s8>) from %ir.pa)
+  ; RV64-NEXT:   $v8m2 = COPY [[LOAD]](<vscale x 16 x s8>)
+  ; RV64-NEXT:   PseudoRET implicit $v8m2
+  %va = load <vscale x 16 x i8>, ptr %pa, align 16
+  ret <vscale x 16 x i8> %va
+}
+
+define <vscale x 16 x i8>  @vload_nx16i8_align64(ptr %pa) {
+  ; RV32-LABEL: name: vload_nx16i8_align64
+  ; RV32: bb.1 (%ir-block.0):
+  ; RV32-NEXT:   liveins: $x10
+  ; RV32-NEXT: {{  $}}
+  ; RV32-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV32-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 16 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 16 x s8>) from %ir.pa, align 64)
+  ; RV32-NEXT:   $v8m2 = COPY [[LOAD]](<vscale x 16 x s8>)
+  ; RV32-NEXT:   PseudoRET implicit $v8m2
+  ;
+  ; RV64-LABEL: name: vload_nx16i8_align64
+  ; RV64: bb.1 (%ir-block.0):
+  ; RV64-NEXT:   liveins: $x10
+  ; RV64-NEXT: {{  $}}
+  ; RV64-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV64-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 16 x s8>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 16 x s8>) from %ir.pa, align 64)
+  ; RV64-NEXT:   $v8m2 = COPY [[LOAD]](<vscale x 16 x s8>)
+  ; RV64-NEXT:   PseudoRET implicit $v8m2
+  %va = load <vscale x 16 x i8>, ptr %pa, align 64
+  ret <vscale x 16 x i8> %va
+}
+
+define <vscale x 4 x i16>  @vload_nx4i16_align1(ptr %pa) {
+  ; RV32-LABEL: name: vload_nx4i16_align1
+  ; RV32: bb.1 (%ir-block.0):
+  ; RV32-NEXT:   liveins: $x10
+  ; RV32-NEXT: {{  $}}
+  ; RV32-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV32-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 4 x s16>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 4 x s16>) from %ir.pa, align 1)
+  ; RV32-NEXT:   $v8 = COPY [[LOAD]](<vscale x 4 x s16>)
+  ; RV32-NEXT:   PseudoRET implicit $v8
+  ;
+  ; RV64-LABEL: name: vload_nx4i16_align1
+  ; RV64: bb.1 (%ir-block.0):
+  ; RV64-NEXT:   liveins: $x10
+  ; RV64-NEXT: {{  $}}
+  ; RV64-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV64-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 4 x s16>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 4 x s16>) from %ir.pa, align 1)
+  ; RV64-NEXT:   $v8 = COPY [[LOAD]](<vscale x 4 x s16>)
+  ; RV64-NEXT:   PseudoRET implicit $v8
+  %va = load <vscale x 4 x i16>, ptr %pa, align 1
+  ret <vscale x 4 x i16> %va
+}
+
+define <vscale x 4 x i16>  @vload_nx4i16_align2(ptr %pa) {
+  ; RV32-LABEL: name: vload_nx4i16_align2
+  ; RV32: bb.1 (%ir-block.0):
+  ; RV32-NEXT:   liveins: $x10
+  ; RV32-NEXT: {{  $}}
+  ; RV32-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV32-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 4 x s16>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 4 x s16>) from %ir.pa, align 2)
+  ; RV32-NEXT:   $v8 = COPY [[LOAD]](<vscale x 4 x s16>)
+  ; RV32-NEXT:   PseudoRET implicit $v8
+  ;
+  ; RV64-LABEL: name: vload_nx4i16_align2
+  ; RV64: bb.1 (%ir-block.0):
+  ; RV64-NEXT:   liveins: $x10
+  ; RV64-NEXT: {{  $}}
+  ; RV64-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV64-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 4 x s16>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 4 x s16>) from %ir.pa, align 2)
+  ; RV64-NEXT:   $v8 = COPY [[LOAD]](<vscale x 4 x s16>)
+  ; RV64-NEXT:   PseudoRET implicit $v8
+  %va = load <vscale x 4 x i16>, ptr %pa, align 2
+  ret <vscale x 4 x i16> %va
+}
+
+define <vscale x 4 x i16>  @vload_nx4i16_align4(ptr %pa) {
+  ; RV32-LABEL: name: vload_nx4i16_align4
+  ; RV32: bb.1 (%ir-block.0):
+  ; RV32-NEXT:   liveins: $x10
+  ; RV32-NEXT: {{  $}}
+  ; RV32-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV32-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 4 x s16>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 4 x s16>) from %ir.pa, align 4)
+  ; RV32-NEXT:   $v8 = COPY [[LOAD]](<vscale x 4 x s16>)
+  ; RV32-NEXT:   PseudoRET implicit $v8
+  ;
+  ; RV64-LABEL: name: vload_nx4i16_align4
+  ; RV64: bb.1 (%ir-block.0):
+  ; RV64-NEXT:   liveins: $x10
+  ; RV64-NEXT: {{  $}}
+  ; RV64-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV64-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 4 x s16>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 4 x s16>) from %ir.pa, align 4)
+  ; RV64-NEXT:   $v8 = COPY [[LOAD]](<vscale x 4 x s16>)
+  ; RV64-NEXT:   PseudoRET implicit $v8
+  %va = load <vscale x 4 x i16>, ptr %pa, align 4
+  ret <vscale x 4 x i16> %va
+}
+define <vscale x 4 x i16>  @vload_nx4i16_align8(ptr %pa) {
+  ; RV32-LABEL: name: vload_nx4i16_align8
+  ; RV32: bb.1 (%ir-block.0):
+  ; RV32-NEXT:   liveins: $x10
+  ; RV32-NEXT: {{  $}}
+  ; RV32-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV32-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 4 x s16>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 4 x s16>) from %ir.pa)
+  ; RV32-NEXT:   $v8 = COPY [[LOAD]](<vscale x 4 x s16>)
+  ; RV32-NEXT:   PseudoRET implicit $v8
+  ;
+  ; RV64-LABEL: name: vload_nx4i16_align8
+  ; RV64: bb.1 (%ir-block.0):
+  ; RV64-NEXT:   liveins: $x10
+  ; RV64-NEXT: {{  $}}
+  ; RV64-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV64-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 4 x s16>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 4 x s16>) from %ir.pa)
+  ; RV64-NEXT:   $v8 = COPY [[LOAD]](<vscale x 4 x s16>)
+  ; RV64-NEXT:   PseudoRET implicit $v8
+  %va = load <vscale x 4 x i16>, ptr %pa, align 8
+  ret <vscale x 4 x i16> %va
+}
+
+define <vscale x 4 x i16>  @vload_nx4i16_align16(ptr %pa) {
+  ; RV32-LABEL: name: vload_nx4i16_align16
+  ; RV32: bb.1 (%ir-block.0):
+  ; RV32-NEXT:   liveins: $x10
+  ; RV32-NEXT: {{  $}}
+  ; RV32-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV32-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 4 x s16>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 4 x s16>) from %ir.pa, align 16)
+  ; RV32-NEXT:   $v8 = COPY [[LOAD]](<vscale x 4 x s16>)
+  ; RV32-NEXT:   PseudoRET implicit $v8
+  ;
+  ; RV64-LABEL: name: vload_nx4i16_align16
+  ; RV64: bb.1 (%ir-block.0):
+  ; RV64-NEXT:   liveins: $x10
+  ; RV64-NEXT: {{  $}}
+  ; RV64-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV64-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 4 x s16>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 4 x s16>) from %ir.pa, align 16)
+  ; RV64-NEXT:   $v8 = COPY [[LOAD]](<vscale x 4 x s16>)
+  ; RV64-NEXT:   PseudoRET implicit $v8
+  %va = load <vscale x 4 x i16>, ptr %pa, align 16
+  ret <vscale x 4 x i16> %va
+}
+
+define <vscale x 2 x i32>  @vload_nx2i32_align2(ptr %pa) {
+  ; RV32-LABEL: name: vload_nx2i32_align2
+  ; RV32: bb.1 (%ir-block.0):
+  ; RV32-NEXT:   liveins: $x10
+  ; RV32-NEXT: {{  $}}
+  ; RV32-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV32-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 2 x s32>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x s32>) from %ir.pa, align 2)
+  ; RV32-NEXT:   $v8 = COPY [[LOAD]](<vscale x 2 x s32>)
+  ; RV32-NEXT:   PseudoRET implicit $v8
+  ;
+  ; RV64-LABEL: name: vload_nx2i32_align2
+  ; RV64: bb.1 (%ir-block.0):
+  ; RV64-NEXT:   liveins: $x10
+  ; RV64-NEXT: {{  $}}
+  ; RV64-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV64-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 2 x s32>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x s32>) from %ir.pa, align 2)
+  ; RV64-NEXT:   $v8 = COPY [[LOAD]](<vscale x 2 x s32>)
+  ; RV64-NEXT:   PseudoRET implicit $v8
+  %va = load <vscale x 2 x i32>, ptr %pa, align 2
+  ret <vscale x 2 x i32> %va
+}
+
+define <vscale x 2 x i32>  @vload_nx2i32_align4(ptr %pa) {
+  ; RV32-LABEL: name: vload_nx2i32_align4
+  ; RV32: bb.1 (%ir-block.0):
+  ; RV32-NEXT:   liveins: $x10
+  ; RV32-NEXT: {{  $}}
+  ; RV32-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV32-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 2 x s32>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x s32>) from %ir.pa, align 4)
+  ; RV32-NEXT:   $v8 = COPY [[LOAD]](<vscale x 2 x s32>)
+  ; RV32-NEXT:   PseudoRET implicit $v8
+  ;
+  ; RV64-LABEL: name: vload_nx2i32_align4
+  ; RV64: bb.1 (%ir-block.0):
+  ; RV64-NEXT:   liveins: $x10
+  ; RV64-NEXT: {{  $}}
+  ; RV64-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV64-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 2 x s32>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x s32>) from %ir.pa, align 4)
+  ; RV64-NEXT:   $v8 = COPY [[LOAD]](<vscale x 2 x s32>)
+  ; RV64-NEXT:   PseudoRET implicit $v8
+  %va = load <vscale x 2 x i32>, ptr %pa, align 4
+  ret <vscale x 2 x i32> %va
+}
+
+define <vscale x 2 x i32>  @vload_nx2i32_align8(ptr %pa) {
+  ; RV32-LABEL: name: vload_nx2i32_align8
+  ; RV32: bb.1 (%ir-block.0):
+  ; RV32-NEXT:   liveins: $x10
+  ; RV32-NEXT: {{  $}}
+  ; RV32-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV32-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 2 x s32>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x s32>) from %ir.pa)
+  ; RV32-NEXT:   $v8 = COPY [[LOAD]](<vscale x 2 x s32>)
+  ; RV32-NEXT:   PseudoRET implicit $v8
+  ;
+  ; RV64-LABEL: name: vload_nx2i32_align8
+  ; RV64: bb.1 (%ir-block.0):
+  ; RV64-NEXT:   liveins: $x10
+  ; RV64-NEXT: {{  $}}
+  ; RV64-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV64-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 2 x s32>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x s32>) from %ir.pa)
+  ; RV64-NEXT:   $v8 = COPY [[LOAD]](<vscale x 2 x s32>)
+  ; RV64-NEXT:   PseudoRET implicit $v8
+  %va = load <vscale x 2 x i32>, ptr %pa, align 8
+  ret <vscale x 2 x i32> %va
+}
+
+define <vscale x 2 x i32>  @vload_nx2i32_align16(ptr %pa) {
+  ; RV32-LABEL: name: vload_nx2i32_align16
+  ; RV32: bb.1 (%ir-block.0):
+  ; RV32-NEXT:   liveins: $x10
+  ; RV32-NEXT: {{  $}}
+  ; RV32-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV32-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 2 x s32>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x s32>) from %ir.pa, align 16)
+  ; RV32-NEXT:   $v8 = COPY [[LOAD]](<vscale x 2 x s32>)
+  ; RV32-NEXT:   PseudoRET implicit $v8
+  ;
+  ; RV64-LABEL: name: vload_nx2i32_align16
+  ; RV64: bb.1 (%ir-block.0):
+  ; RV64-NEXT:   liveins: $x10
+  ; RV64-NEXT: {{  $}}
+  ; RV64-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV64-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 2 x s32>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x s32>) from %ir.pa, align 16)
+  ; RV64-NEXT:   $v8 = COPY [[LOAD]](<vscale x 2 x s32>)
+  ; RV64-NEXT:   PseudoRET implicit $v8
+  %va = load <vscale x 2 x i32>, ptr %pa, align 16
+  ret <vscale x 2 x i32> %va
+}
+
+define <vscale x 2 x i32>  @vload_nx2i32_align256(ptr %pa) {
+  ; RV32-LABEL: name: vload_nx2i32_align256
+  ; RV32: bb.1 (%ir-block.0):
+  ; RV32-NEXT:   liveins: $x10
+  ; RV32-NEXT: {{  $}}
+  ; RV32-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV32-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 2 x s32>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x s32>) from %ir.pa, align 256)
+  ; RV32-NEXT:   $v8 = COPY [[LOAD]](<vscale x 2 x s32>)
+  ; RV32-NEXT:   PseudoRET implicit $v8
+  ;
+  ; RV64-LABEL: name: vload_nx2i32_align256
+  ; RV64: bb.1 (%ir-block.0):
+  ; RV64-NEXT:   liveins: $x10
+  ; RV64-NEXT: {{  $}}
+  ; RV64-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV64-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 2 x s32>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x s32>) from %ir.pa, align 256)
+  ; RV64-NEXT:   $v8 = COPY [[LOAD]](<vscale x 2 x s32>)
+  ; RV64-NEXT:   PseudoRET implicit $v8
+  %va = load <vscale x 2 x i32>, ptr %pa, align 256
+  ret <vscale x 2 x i32> %va
+}
+define <vscale x 2 x i64>  @vload_nx2i64_align4(ptr %pa) {
+  ; RV32-LABEL: name: vload_nx2i64_align4
+  ; RV32: bb.1 (%ir-block.0):
+  ; RV32-NEXT:   liveins: $x10
+  ; RV32-NEXT: {{  $}}
+  ; RV32-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV32-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x s64>) from %ir.pa, align 4)
+  ; RV32-NEXT:   $v8m2 = COPY [[LOAD]](<vscale x 2 x s64>)
+  ; RV32-NEXT:   PseudoRET implicit $v8m2
+  ;
+  ; RV64-LABEL: name: vload_nx2i64_align4
+  ; RV64: bb.1 (%ir-block.0):
+  ; RV64-NEXT:   liveins: $x10
+  ; RV64-NEXT: {{  $}}
+  ; RV64-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV64-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x s64>) from %ir.pa, align 4)
+  ; RV64-NEXT:   $v8m2 = COPY [[LOAD]](<vscale x 2 x s64>)
+  ; RV64-NEXT:   PseudoRET implicit $v8m2
+  %va = load <vscale x 2 x i64>, ptr %pa, align 4
+  ret <vscale x 2 x i64> %va
+}
+
+define <vscale x 2 x i64>  @vload_nx2i64_align8(ptr %pa) {
+  ; RV32-LABEL: name: vload_nx2i64_align8
+  ; RV32: bb.1 (%ir-block.0):
+  ; RV32-NEXT:   liveins: $x10
+  ; RV32-NEXT: {{  $}}
+  ; RV32-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV32-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x s64>) from %ir.pa, align 8)
+  ; RV32-NEXT:   $v8m2 = COPY [[LOAD]](<vscale x 2 x s64>)
+  ; RV32-NEXT:   PseudoRET implicit $v8m2
+  ;
+  ; RV64-LABEL: name: vload_nx2i64_align8
+  ; RV64: bb.1 (%ir-block.0):
+  ; RV64-NEXT:   liveins: $x10
+  ; RV64-NEXT: {{  $}}
+  ; RV64-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV64-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x s64>) from %ir.pa, align 8)
+  ; RV64-NEXT:   $v8m2 = COPY [[LOAD]](<vscale x 2 x s64>)
+  ; RV64-NEXT:   PseudoRET implicit $v8m2
+  %va = load <vscale x 2 x i64>, ptr %pa, align 8
+  ret <vscale x 2 x i64> %va
+}
+
+define <vscale x 2 x i64>  @vload_nx2i64_align16(ptr %pa) {
+  ; RV32-LABEL: name: vload_nx2i64_align16
+  ; RV32: bb.1 (%ir-block.0):
+  ; RV32-NEXT:   liveins: $x10
+  ; RV32-NEXT: {{  $}}
+  ; RV32-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV32-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x s64>) from %ir.pa)
+  ; RV32-NEXT:   $v8m2 = COPY [[LOAD]](<vscale x 2 x s64>)
+  ; RV32-NEXT:   PseudoRET implicit $v8m2
+  ;
+  ; RV64-LABEL: name: vload_nx2i64_align16
+  ; RV64: bb.1 (%ir-block.0):
+  ; RV64-NEXT:   liveins: $x10
+  ; RV64-NEXT: {{  $}}
+  ; RV64-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV64-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x s64>) from %ir.pa)
+  ; RV64-NEXT:   $v8m2 = COPY [[LOAD]](<vscale x 2 x s64>)
+  ; RV64-NEXT:   PseudoRET implicit $v8m2
+  %va = load <vscale x 2 x i64>, ptr %pa, align 16
+  ret <vscale x 2 x i64> %va
+}
+
+define <vscale x 2 x i64>  @vload_nx2i64_align32(ptr %pa) {
+  ; RV32-LABEL: name: vload_nx2i64_align32
+  ; RV32: bb.1 (%ir-block.0):
+  ; RV32-NEXT:   liveins: $x10
+  ; RV32-NEXT: {{  $}}
+  ; RV32-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV32-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x s64>) from %ir.pa, align 32)
+  ; RV32-NEXT:   $v8m2 = COPY [[LOAD]](<vscale x 2 x s64>)
+  ; RV32-NEXT:   PseudoRET implicit $v8m2
+  ;
+  ; RV64-LABEL: name: vload_nx2i64_align32
+  ; RV64: bb.1 (%ir-block.0):
+  ; RV64-NEXT:   liveins: $x10
+  ; RV64-NEXT: {{  $}}
+  ; RV64-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV64-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x s64>) from %ir.pa, align 32)
+  ; RV64-NEXT:   $v8m2 = COPY [[LOAD]](<vscale x 2 x s64>)
+  ; RV64-NEXT:   PseudoRET implicit $v8m2
+  %va = load <vscale x 2 x i64>, ptr %pa, align 32
+  ret <vscale x 2 x i64> %va
+}
+
+define <vscale x 1 x ptr>  @vload_nx1ptr(ptr %pa) {
+  ; RV32-LABEL: name: vload_nx1ptr
+  ; RV32: bb.1 (%ir-block.0):
+  ; RV32-NEXT:   liveins: $x10
+  ; RV32-NEXT: {{  $}}
+  ; RV32-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV32-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 1 x p0>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 1 x p0>) from %ir.pa)
+  ; RV32-NEXT:   $v8 = COPY [[LOAD]](<vscale x 1 x p0>)
+  ; RV32-NEXT:   PseudoRET implicit $v8
+  ;
+  ; RV64-LABEL: name: vload_nx1ptr
+  ; RV64: bb.1 (%ir-block.0):
+  ; RV64-NEXT:   liveins: $x10
+  ; RV64-NEXT: {{  $}}
+  ; RV64-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV64-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 1 x p0>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 1 x p0>) from %ir.pa)
+  ; RV64-NEXT:   $v8 = COPY [[LOAD]](<vscale x 1 x p0>)
+  ; RV64-NEXT:   PseudoRET implicit $v8
+  %va = load <vscale x 1 x ptr>, ptr %pa
+  ret <vscale x 1 x ptr> %va
+}
+
+define <vscale x 2 x ptr>  @vload_nx2ptr(ptr %pa) {
+  ; RV32-LABEL: name: vload_nx2ptr
+  ; RV32: bb.1 (%ir-block.0):
+  ; RV32-NEXT:   liveins: $x10
+  ; RV32-NEXT: {{  $}}
+  ; RV32-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV32-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 2 x p0>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x p0>) from %ir.pa)
+  ; RV32-NEXT:   $v8 = COPY [[LOAD]](<vscale x 2 x p0>)
+  ; RV32-NEXT:   PseudoRET implicit $v8
+  ;
+  ; RV64-LABEL: name: vload_nx2ptr
+  ; RV64: bb.1 (%ir-block.0):
+  ; RV64-NEXT:   liveins: $x10
+  ; RV64-NEXT: {{  $}}
+  ; RV64-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV64-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 2 x p0>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 2 x p0>) from %ir.pa)
+  ; RV64-NEXT:   $v8m2 = COPY [[LOAD]](<vscale x 2 x p0>)
+  ; RV64-NEXT:   PseudoRET implicit $v8m2
+  %va = load <vscale x 2x ptr>, ptr %pa
+  ret <vscale x 2 x ptr> %va
+}
+
+define <vscale x 8 x ptr>  @vload_nx8ptr(ptr %pa) {
+  ; RV32-LABEL: name: vload_nx8ptr
+  ; RV32: bb.1 (%ir-block.0):
+  ; RV32-NEXT:   liveins: $x10
+  ; RV32-NEXT: {{  $}}
+  ; RV32-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV32-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 8 x p0>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 8 x p0>) from %ir.pa)
+  ; RV32-NEXT:   $v8m4 = COPY [[LOAD]](<vscale x 8 x p0>)
+  ; RV32-NEXT:   PseudoRET implicit $v8m4
+  ;
+  ; RV64-LABEL: name: vload_nx8ptr
+  ; RV64: bb.1 (%ir-block.0):
+  ; RV64-NEXT:   liveins: $x10
+  ; RV64-NEXT: {{  $}}
+  ; RV64-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x10
+  ; RV64-NEXT:   [[LOAD:%[0-9]+]]:_(<vscale x 8 x p0>) = G_LOAD [[COPY]](p0) :: (load (<vscale x 8 x p0>) from %ir.pa)
+  ; RV64-NEXT:   $v8m8 = COPY [[LOAD]](<vscale x 8 x p0>)
+  ; RV64-NEXT:   PseudoRET implicit $v8m8
+  %va = load <vscale x 8 x ptr>, ptr %pa
+  ret <vscale x 8 x ptr> %va
+}
+
diff --git a/llvm/test/CodeGen/RISCV/live-sp.mir b/llvm/test/CodeGen/RISCV/live-sp.mir
index 8dd307f521f5bd..fa6297a3913a9b 100644
--- a/llvm/test/CodeGen/RISCV/live-sp.mir
+++ b/llvm/test/CodeGen/RISCV/live-sp.mir
@@ -44,7 +44,7 @@ frameInfo:
   stackSize:       0
   offsetAdjustment: 0
   maxAlignment:    4
-  adjustsStack:    false
+  adjustsStack:    true
   hasCalls:        true
   stackProtector:  ''
   maxCallFrameSize: 4294967295
diff --git a/llvm/test/CodeGen/RISCV/machine-combiner.ll b/llvm/test/CodeGen/RISCV/machine-combiner.ll
index cfdefec04600c8..ebf232cc458ba0 100644
--- a/llvm/test/CodeGen/RISCV/machine-combiner.ll
+++ b/llvm/test/CodeGen/RISCV/machine-combiner.ll
@@ -740,9 +740,9 @@ define i8 @test_reassoc_minu_i8(i8 %a0, i8 %a1, i8 %a2, i8 %a3) {
 ; CHECK-LABEL: test_reassoc_minu_i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    andi a3, a3, 255
-; CHECK-NEXT:    andi a2, a2, 255
 ; CHECK-NEXT:    andi a1, a1, 255
 ; CHECK-NEXT:    andi a0, a0, 255
+; CHECK-NEXT:    andi a2, a2, 255
 ; CHECK-NEXT:    minu a0, a0, a1
 ; CHECK-NEXT:    minu a1, a2, a3
 ; CHECK-NEXT:    minu a0, a0, a1
@@ -757,9 +757,9 @@ define i16 @test_reassoc_minu_i16(i16 %a0, i16 %a1, i16 %a2, i16 %a3) {
 ; CHECK-LABEL: test_reassoc_minu_i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    zext.h a3, a3
-; CHECK-NEXT:    zext.h a2, a2
 ; CHECK-NEXT:    zext.h a1, a1
 ; CHECK-NEXT:    zext.h a0, a0
+; CHECK-NEXT:    zext.h a2, a2
 ; CHECK-NEXT:    minu a0, a0, a1
 ; CHECK-NEXT:    minu a1, a2, a3
 ; CHECK-NEXT:    minu a0, a0, a1
@@ -774,9 +774,9 @@ define i32 @test_reassoc_minu_i32(i32 %a0, i32 %a1, i32 %a2, i32 %a3) {
 ; CHECK-LABEL: test_reassoc_minu_i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    sext.w a3, a3
-; CHECK-NEXT:    sext.w a2, a2
 ; CHECK-NEXT:    sext.w a1, a1
 ; CHECK-NEXT:    sext.w a0, a0
+; CHECK-NEXT:    sext.w a2, a2
 ; CHECK-NEXT:    minu a0, a0, a1
 ; CHECK-NEXT:    minu a1, a2, a3
 ; CHECK-NEXT:    minu a0, a0, a1
@@ -804,9 +804,9 @@ define i8 @test_reassoc_min_i8(i8 %a0, i8 %a1, i8 %a2, i8 %a3) {
 ; CHECK-LABEL: test_reassoc_min_i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    sext.b a3, a3
-; CHECK-NEXT:    sext.b a2, a2
 ; CHECK-NEXT:    sext.b a1, a1
 ; CHECK-NEXT:    sext.b a0, a0
+; CHECK-NEXT:    sext.b a2, a2
 ; CHECK-NEXT:    min a0, a0, a1
 ; CHECK-NEXT:    min a1, a2, a3
 ; CHECK-NEXT:    min a0, a0, a1
@@ -821,9 +821,9 @@ define i16 @test_reassoc_min_i16(i16 %a0, i16 %a1, i16 %a2, i16 %a3) {
 ; CHECK-LABEL: test_reassoc_min_i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    sext.h a3, a3
-; CHECK-NEXT:    sext.h a2, a2
 ; CHECK-NEXT:    sext.h a1, a1
 ; CHECK-NEXT:    sext.h a0, a0
+; CHECK-NEXT:    sext.h a2, a2
 ; CHECK-NEXT:    min a0, a0, a1
 ; CHECK-NEXT:    min a1, a2, a3
 ; CHECK-NEXT:    min a0, a0, a1
@@ -838,9 +838,9 @@ define i32 @test_reassoc_min_i32(i32 %a0, i32 %a1, i32 %a2, i32 %a3) {
 ; CHECK-LABEL: test_reassoc_min_i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    sext.w a3, a3
-; CHECK-NEXT:    sext.w a2, a2
 ; CHECK-NEXT:    sext.w a1, a1
 ; CHECK-NEXT:    sext.w a0, a0
+; CHECK-NEXT:    sext.w a2, a2
 ; CHECK-NEXT:    min a0, a0, a1
 ; CHECK-NEXT:    min a1, a2, a3
 ; CHECK-NEXT:    min a0, a0, a1
@@ -868,9 +868,9 @@ define i8 @test_reassoc_maxu_i8(i8 %a0, i8 %a1, i8 %a2, i8 %a3) {
 ; CHECK-LABEL: test_reassoc_maxu_i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    andi a3, a3, 255
-; CHECK-NEXT:    andi a2, a2, 255
 ; CHECK-NEXT:    andi a1, a1, 255
 ; CHECK-NEXT:    andi a0, a0, 255
+; CHECK-NEXT:    andi a2, a2, 255
 ; CHECK-NEXT:    maxu a0, a0, a1
 ; CHECK-NEXT:    maxu a1, a2, a3
 ; CHECK-NEXT:    maxu a0, a0, a1
@@ -885,9 +885,9 @@ define i16 @test_reassoc_maxu_i16(i16 %a0, i16 %a1, i16 %a2, i16 %a3) {
 ; CHECK-LABEL: test_reassoc_maxu_i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    zext.h a3, a3
-; CHECK-NEXT:    zext.h a2, a2
 ; CHECK-NEXT:    zext.h a1, a1
 ; CHECK-NEXT:    zext.h a0, a0
+; CHECK-NEXT:    zext.h a2, a2
 ; CHECK-NEXT:    maxu a0, a0, a1
 ; CHECK-NEXT:    maxu a1, a2, a3
 ; CHECK-NEXT:    maxu a0, a0, a1
@@ -902,9 +902,9 @@ define i32 @test_reassoc_maxu_i32(i32 %a0, i32 %a1, i32 %a2, i32 %a3) {
 ; CHECK-LABEL: test_reassoc_maxu_i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    sext.w a3, a3
-; CHECK-NEXT:    sext.w a2, a2
 ; CHECK-NEXT:    sext.w a1, a1
 ; CHECK-NEXT:    sext.w a0, a0
+; CHECK-NEXT:    sext.w a2, a2
 ; CHECK-NEXT:    maxu a0, a0, a1
 ; CHECK-NEXT:    maxu a1, a2, a3
 ; CHECK-NEXT:    maxu a0, a0, a1
@@ -932,9 +932,9 @@ define i8 @test_reassoc_max_i8(i8 %a0, i8 %a1, i8 %a2, i8 %a3) {
 ; CHECK-LABEL: test_reassoc_max_i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    sext.b a3, a3
-; CHECK-NEXT:    sext.b a2, a2
 ; CHECK-NEXT:    sext.b a1, a1
 ; CHECK-NEXT:    sext.b a0, a0
+; CHECK-NEXT:    sext.b a2, a2
 ; CHECK-NEXT:    max a0, a0, a1
 ; CHECK-NEXT:    max a1, a2, a3
 ; CHECK-NEXT:    max a0, a0, a1
@@ -949,9 +949,9 @@ define i16 @test_reassoc_max_i16(i16 %a0, i16 %a1, i16 %a2, i16 %a3) {
 ; CHECK-LABEL: test_reassoc_max_i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    sext.h a3, a3
-; CHECK-NEXT:    sext.h a2, a2
 ; CHECK-NEXT:    sext.h a1, a1
 ; CHECK-NEXT:    sext.h a0, a0
+; CHECK-NEXT:    sext.h a2, a2
 ; CHECK-NEXT:    max a0, a0, a1
 ; CHECK-NEXT:    max a1, a2, a3
 ; CHECK-NEXT:    max a0, a0, a1
@@ -966,9 +966,9 @@ define i32 @test_reassoc_max_i32(i32 %a0, i32 %a1, i32 %a2, i32 %a3) {
 ; CHECK-LABEL: test_reassoc_max_i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    sext.w a3, a3
-; CHECK-NEXT:    sext.w a2, a2
 ; CHECK-NEXT:    sext.w a1, a1
 ; CHECK-NEXT:    sext.w a0, a0
+; CHECK-NEXT:    sext.w a2, a2
 ; CHECK-NEXT:    max a0, a0, a1
 ; CHECK-NEXT:    max a1, a2, a3
 ; CHECK-NEXT:    max a0, a0, a1
diff --git a/llvm/test/CodeGen/RISCV/rvv/active_lane_mask.ll b/llvm/test/CodeGen/RISCV/rvv/active_lane_mask.ll
index 139579b3d2a361..87d95d7596d4fa 100644
--- a/llvm/test/CodeGen/RISCV/rvv/active_lane_mask.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/active_lane_mask.ll
@@ -161,71 +161,72 @@ define <64 x i1> @fv64(ptr %p, i64 %index, i64 %tc) {
 define <128 x i1> @fv128(ptr %p, i64 %index, i64 %tc) {
 ; CHECK-LABEL: fv128:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; CHECK-NEXT:    lui a0, %hi(.LCPI10_0)
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI10_0)
-; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; CHECK-NEXT:    vle8.v v8, (a0)
+; CHECK-NEXT:    vid.v v16
+; CHECK-NEXT:    vsaddu.vx v16, v16, a1
+; CHECK-NEXT:    vmsltu.vx v0, v16, a2
+; CHECK-NEXT:    vsext.vf8 v16, v8
+; CHECK-NEXT:    vsaddu.vx v8, v16, a1
+; CHECK-NEXT:    vmsltu.vx v16, v8, a2
+; CHECK-NEXT:    vsetivli zero, 4, e8, m1, tu, ma
+; CHECK-NEXT:    vslideup.vi v0, v16, 2
 ; CHECK-NEXT:    lui a0, %hi(.LCPI10_1)
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI10_1)
-; CHECK-NEXT:    vle8.v v9, (a0)
+; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; CHECK-NEXT:    vle8.v v8, (a0)
 ; CHECK-NEXT:    vsext.vf8 v16, v8
-; CHECK-NEXT:    vsaddu.vx v16, v16, a1
-; CHECK-NEXT:    vmsltu.vx v10, v16, a2
-; CHECK-NEXT:    vsext.vf8 v16, v9
-; CHECK-NEXT:    vsaddu.vx v16, v16, a1
-; CHECK-NEXT:    vmsltu.vx v8, v16, a2
-; CHECK-NEXT:    vsetivli zero, 4, e8, mf2, tu, ma
-; CHECK-NEXT:    vslideup.vi v8, v10, 2
+; CHECK-NEXT:    vsaddu.vx v8, v16, a1
+; CHECK-NEXT:    vmsltu.vx v16, v8, a2
+; CHECK-NEXT:    vsetivli zero, 6, e8, m1, tu, ma
+; CHECK-NEXT:    vslideup.vi v0, v16, 4
 ; CHECK-NEXT:    lui a0, %hi(.LCPI10_2)
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI10_2)
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT:    vle8.v v9, (a0)
-; CHECK-NEXT:    vsext.vf8 v16, v9
-; CHECK-NEXT:    vsaddu.vx v16, v16, a1
-; CHECK-NEXT:    vmsltu.vx v9, v16, a2
-; CHECK-NEXT:    vsetivli zero, 6, e8, mf2, tu, ma
-; CHECK-NEXT:    vslideup.vi v8, v9, 4
+; CHECK-NEXT:    vle8.v v8, (a0)
+; CHECK-NEXT:    vsext.vf8 v16, v8
+; CHECK-NEXT:    vsaddu.vx v8, v16, a1
+; CHECK-NEXT:    vmsltu.vx v16, v8, a2
+; CHECK-NEXT:    vsetivli zero, 8, e8, m1, tu, ma
+; CHECK-NEXT:    vslideup.vi v0, v16, 6
 ; CHECK-NEXT:    lui a0, %hi(.LCPI10_3)
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI10_3)
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT:    vle8.v v9, (a0)
-; CHECK-NEXT:    vsext.vf8 v16, v9
-; CHECK-NEXT:    vsaddu.vx v16, v16, a1
-; CHECK-NEXT:    vmsltu.vx v9, v16, a2
-; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT:    vslideup.vi v8, v9, 6
-; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; CHECK-NEXT:    vle8.v v8, (a0)
+; CHECK-NEXT:    vsext.vf8 v16, v8
+; CHECK-NEXT:    vsaddu.vx v8, v16, a1
+; CHECK-NEXT:    vmsltu.vx v16, v8, a2
+; CHECK-NEXT:    vsetivli zero, 10, e8, m1, tu, ma
+; CHECK-NEXT:    vslideup.vi v0, v16, 8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI10_4)
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI10_4)
-; CHECK-NEXT:    vle8.v v9, (a0)
-; CHECK-NEXT:    vid.v v16
-; CHECK-NEXT:    vsaddu.vx v16, v16, a1
-; CHECK-NEXT:    vmsltu.vx v0, v16, a2
-; CHECK-NEXT:    vsext.vf8 v16, v9
-; CHECK-NEXT:    vsaddu.vx v16, v16, a1
-; CHECK-NEXT:    vmsltu.vx v9, v16, a2
-; CHECK-NEXT:    vsetivli zero, 4, e8, mf2, tu, ma
-; CHECK-NEXT:    vslideup.vi v0, v9, 2
+; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; CHECK-NEXT:    vle8.v v8, (a0)
+; CHECK-NEXT:    vsext.vf8 v16, v8
+; CHECK-NEXT:    vsaddu.vx v8, v16, a1
+; CHECK-NEXT:    vmsltu.vx v16, v8, a2
+; CHECK-NEXT:    vsetivli zero, 12, e8, m1, tu, ma
+; CHECK-NEXT:    vslideup.vi v0, v16, 10
 ; CHECK-NEXT:    lui a0, %hi(.LCPI10_5)
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI10_5)
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT:    vle8.v v9, (a0)
-; CHECK-NEXT:    vsext.vf8 v16, v9
-; CHECK-NEXT:    vsaddu.vx v16, v16, a1
-; CHECK-NEXT:    vmsltu.vx v9, v16, a2
-; CHECK-NEXT:    vsetivli zero, 6, e8, mf2, tu, ma
-; CHECK-NEXT:    vslideup.vi v0, v9, 4
+; CHECK-NEXT:    vle8.v v8, (a0)
+; CHECK-NEXT:    vsext.vf8 v16, v8
+; CHECK-NEXT:    vsaddu.vx v8, v16, a1
+; CHECK-NEXT:    vmsltu.vx v16, v8, a2
+; CHECK-NEXT:    vsetivli zero, 14, e8, m1, tu, ma
+; CHECK-NEXT:    vslideup.vi v0, v16, 12
 ; CHECK-NEXT:    lui a0, %hi(.LCPI10_6)
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI10_6)
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT:    vle8.v v9, (a0)
-; CHECK-NEXT:    vsext.vf8 v16, v9
-; CHECK-NEXT:    vsaddu.vx v16, v16, a1
-; CHECK-NEXT:    vmsltu.vx v9, v16, a2
-; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT:    vslideup.vi v0, v9, 6
-; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
-; CHECK-NEXT:    vslideup.vi v0, v8, 8
+; CHECK-NEXT:    vle8.v v8, (a0)
+; CHECK-NEXT:    vsext.vf8 v16, v8
+; CHECK-NEXT:    vsaddu.vx v8, v16, a1
+; CHECK-NEXT:    vmsltu.vx v16, v8, a2
+; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
+; CHECK-NEXT:    vslideup.vi v0, v16, 14
 ; CHECK-NEXT:    ret
   %mask = call <128 x i1> @llvm.get.active.lane.mask.v128i1.i64(i64 %index, i64 %tc)
   ret <128 x i1> %mask
diff --git a/llvm/test/CodeGen/RISCV/rvv/addi-rvv-stack-object.mir b/llvm/test/CodeGen/RISCV/rvv/addi-rvv-stack-object.mir
index 52557288210394..080a89e41f0d54 100644
--- a/llvm/test/CodeGen/RISCV/rvv/addi-rvv-stack-object.mir
+++ b/llvm/test/CodeGen/RISCV/rvv/addi-rvv-stack-object.mir
@@ -22,7 +22,7 @@ frameInfo:
   stackSize:       0
   offsetAdjustment: 0
   maxAlignment:    16
-  adjustsStack:    false
+  adjustsStack:    true
   hasCalls:        true
   stackProtector:  ''
   functionContext: ''
diff --git a/llvm/test/CodeGen/RISCV/rvv/combine-store-extract-crash.ll b/llvm/test/CodeGen/RISCV/rvv/combine-store-extract-crash.ll
index ed434deea1a837..c64216180c2af7 100644
--- a/llvm/test/CodeGen/RISCV/rvv/combine-store-extract-crash.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/combine-store-extract-crash.ll
@@ -19,7 +19,7 @@ define void @test(ptr %ref_array, ptr %sad_array) {
 ; RV32-NEXT:    th.swia a0, (a1), 4, 0
 ; RV32-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
 ; RV32-NEXT:    vle8.v v10, (a3)
-; RV32-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; RV32-NEXT:    vsetivli zero, 8, e8, m1, tu, ma
 ; RV32-NEXT:    vslideup.vi v10, v9, 4
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; RV32-NEXT:    vzext.vf4 v12, v10
@@ -42,7 +42,7 @@ define void @test(ptr %ref_array, ptr %sad_array) {
 ; RV64-NEXT:    th.swia a0, (a1), 4, 0
 ; RV64-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
 ; RV64-NEXT:    vle8.v v10, (a3)
-; RV64-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; RV64-NEXT:    vsetivli zero, 8, e8, m1, tu, ma
 ; RV64-NEXT:    vslideup.vi v10, v9, 4
 ; RV64-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; RV64-NEXT:    vzext.vf4 v12, v10
diff --git a/llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll b/llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll
index e15e6452163b1c..76aa2b913c6525 100644
--- a/llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll
@@ -469,8 +469,9 @@ define <vscale x 6 x half> @extract_nxv6f16_nxv12f16_6(<vscale x 12 x half> %in)
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    srli a0, a0, 2
-; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v13, v10, a0
+; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v12, v9, a0
 ; CHECK-NEXT:    add a1, a0, a0
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll
index 68740eec56e4c4..7dcfb247d37cbe 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll
@@ -1599,15 +1599,16 @@ define float @vreduce_fminimum_v2f32(ptr %x) {
 ; CHECK-LABEL: vreduce_fminimum_v2f32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vle32.v v9, (a0)
-; CHECK-NEXT:    vslidedown.vi v10, v9, 1
-; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmin.vv v8, v11, v8
+; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    vmfne.vv v9, v8, v8
+; CHECK-NEXT:    vcpop.m a0, v9
+; CHECK-NEXT:    beqz a0, .LBB99_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    lui a0, 523264
+; CHECK-NEXT:    fmv.w.x fa0, a0
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB99_2:
+; CHECK-NEXT:    vfredmin.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
   %v = load <2 x float>, ptr %x
@@ -1619,15 +1620,8 @@ define float @vreduce_fminimum_v2f32_nonans(ptr %x) {
 ; CHECK-LABEL: vreduce_fminimum_v2f32_nonans:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vle32.v v9, (a0)
-; CHECK-NEXT:    vslidedown.vi v10, v9, 1
-; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmin.vv v8, v11, v8
+; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    vfredmin.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
   %v = load <2 x float>, ptr %x
@@ -1641,24 +1635,16 @@ define float @vreduce_fminimum_v4f32(ptr %x) {
 ; CHECK-LABEL: vreduce_fminimum_v4f32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vle32.v v9, (a0)
-; CHECK-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v10, v9, 2
-; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmin.vv v9, v11, v8
-; CHECK-NEXT:    vslidedown.vi v10, v9, 1
-; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmin.vv v8, v11, v8
+; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    vmfne.vv v9, v8, v8
+; CHECK-NEXT:    vcpop.m a0, v9
+; CHECK-NEXT:    beqz a0, .LBB101_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    lui a0, 523264
+; CHECK-NEXT:    fmv.w.x fa0, a0
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB101_2:
+; CHECK-NEXT:    vfredmin.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
   %v = load <4 x float>, ptr %x
@@ -1670,24 +1656,8 @@ define float @vreduce_fminimum_v4f32_nonans(ptr %x) {
 ; CHECK-LABEL: vreduce_fminimum_v4f32_nonans:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vle32.v v9, (a0)
-; CHECK-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v10, v9, 2
-; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmin.vv v9, v11, v8
-; CHECK-NEXT:    vslidedown.vi v10, v9, 1
-; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmin.vv v8, v11, v8
+; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    vfredmin.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
   %v = load <4 x float>, ptr %x
@@ -1701,33 +1671,16 @@ define float @vreduce_fminimum_v8f32(ptr %x) {
 ; CHECK-LABEL: vreduce_fminimum_v8f32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vle32.v v10, (a0)
-; CHECK-NEXT:    vsetivli zero, 4, e32, m2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v12, v10, 4
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v12, v12
-; CHECK-NEXT:    vmfeq.vv v8, v10, v10
-; CHECK-NEXT:    vmerge.vvm v9, v12, v10, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v10, v12, v0
-; CHECK-NEXT:    vfmin.vv v9, v9, v8
-; CHECK-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v10, v9, 2
-; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmin.vv v9, v11, v8
-; CHECK-NEXT:    vslidedown.vi v10, v9, 1
-; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmin.vv v8, v11, v8
+; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    vmfne.vv v10, v8, v8
+; CHECK-NEXT:    vcpop.m a0, v10
+; CHECK-NEXT:    beqz a0, .LBB103_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    lui a0, 523264
+; CHECK-NEXT:    fmv.w.x fa0, a0
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB103_2:
+; CHECK-NEXT:    vfredmin.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
   %v = load <8 x float>, ptr %x
@@ -1739,33 +1692,8 @@ define float @vreduce_fminimum_v8f32_nonans(ptr %x) {
 ; CHECK-LABEL: vreduce_fminimum_v8f32_nonans:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vle32.v v10, (a0)
-; CHECK-NEXT:    vsetivli zero, 4, e32, m2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v12, v10, 4
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v12, v12
-; CHECK-NEXT:    vmfeq.vv v8, v10, v10
-; CHECK-NEXT:    vmerge.vvm v9, v12, v10, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v10, v12, v0
-; CHECK-NEXT:    vfmin.vv v9, v9, v8
-; CHECK-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v10, v9, 2
-; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmin.vv v9, v11, v8
-; CHECK-NEXT:    vslidedown.vi v10, v9, 1
-; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmin.vv v8, v11, v8
+; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    vfredmin.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
   %v = load <8 x float>, ptr %x
@@ -1779,42 +1707,16 @@ define float @vreduce_fminimum_v16f32(ptr %x) {
 ; CHECK-LABEL: vreduce_fminimum_v16f32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; CHECK-NEXT:    vle32.v v12, (a0)
-; CHECK-NEXT:    vsetivli zero, 8, e32, m4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v16, v12, 8
-; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v16, v16
-; CHECK-NEXT:    vmfeq.vv v8, v12, v12
-; CHECK-NEXT:    vmerge.vvm v10, v16, v12, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v12, v16, v0
-; CHECK-NEXT:    vfmin.vv v10, v10, v8
-; CHECK-NEXT:    vsetivli zero, 4, e32, m2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v12, v10, 4
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v12, v12
-; CHECK-NEXT:    vmfeq.vv v8, v10, v10
-; CHECK-NEXT:    vmerge.vvm v9, v12, v10, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v10, v12, v0
-; CHECK-NEXT:    vfmin.vv v9, v9, v8
-; CHECK-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v10, v9, 2
-; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmin.vv v9, v11, v8
-; CHECK-NEXT:    vslidedown.vi v10, v9, 1
-; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmin.vv v8, v11, v8
+; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    vmfne.vv v12, v8, v8
+; CHECK-NEXT:    vcpop.m a0, v12
+; CHECK-NEXT:    beqz a0, .LBB105_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    lui a0, 523264
+; CHECK-NEXT:    fmv.w.x fa0, a0
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB105_2:
+; CHECK-NEXT:    vfredmin.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
   %v = load <16 x float>, ptr %x
@@ -1826,42 +1728,8 @@ define float @vreduce_fminimum_v16f32_nonans(ptr %x) {
 ; CHECK-LABEL: vreduce_fminimum_v16f32_nonans:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; CHECK-NEXT:    vle32.v v12, (a0)
-; CHECK-NEXT:    vsetivli zero, 8, e32, m4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v16, v12, 8
-; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v16, v16
-; CHECK-NEXT:    vmfeq.vv v8, v12, v12
-; CHECK-NEXT:    vmerge.vvm v10, v16, v12, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v12, v16, v0
-; CHECK-NEXT:    vfmin.vv v10, v10, v8
-; CHECK-NEXT:    vsetivli zero, 4, e32, m2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v12, v10, 4
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v12, v12
-; CHECK-NEXT:    vmfeq.vv v8, v10, v10
-; CHECK-NEXT:    vmerge.vvm v9, v12, v10, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v10, v12, v0
-; CHECK-NEXT:    vfmin.vv v9, v9, v8
-; CHECK-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v10, v9, 2
-; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmin.vv v9, v11, v8
-; CHECK-NEXT:    vslidedown.vi v10, v9, 1
-; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmin.vv v8, v11, v8
+; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    vfredmin.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
   %v = load <16 x float>, ptr %x
@@ -1876,51 +1744,16 @@ define float @vreduce_fminimum_v32f32(ptr %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 32
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; CHECK-NEXT:    vle32.v v16, (a0)
-; CHECK-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
-; CHECK-NEXT:    vslidedown.vi v24, v16, 16
-; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v24, v24
-; CHECK-NEXT:    vmfeq.vv v8, v16, v16
-; CHECK-NEXT:    vmerge.vvm v12, v24, v16, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v16, v24, v0
-; CHECK-NEXT:    vfmin.vv v12, v12, v8
-; CHECK-NEXT:    vsetivli zero, 8, e32, m4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v16, v12, 8
-; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v16, v16
-; CHECK-NEXT:    vmfeq.vv v8, v12, v12
-; CHECK-NEXT:    vmerge.vvm v10, v16, v12, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v12, v16, v0
-; CHECK-NEXT:    vfmin.vv v10, v10, v8
-; CHECK-NEXT:    vsetivli zero, 4, e32, m2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v12, v10, 4
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v12, v12
-; CHECK-NEXT:    vmfeq.vv v8, v10, v10
-; CHECK-NEXT:    vmerge.vvm v9, v12, v10, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v10, v12, v0
-; CHECK-NEXT:    vfmin.vv v9, v9, v8
-; CHECK-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v10, v9, 2
-; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmin.vv v9, v11, v8
-; CHECK-NEXT:    vslidedown.vi v10, v9, 1
-; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmin.vv v8, v11, v8
+; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    vmfne.vv v16, v8, v8
+; CHECK-NEXT:    vcpop.m a0, v16
+; CHECK-NEXT:    beqz a0, .LBB107_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    lui a0, 523264
+; CHECK-NEXT:    fmv.w.x fa0, a0
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB107_2:
+; CHECK-NEXT:    vfredmin.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
   %v = load <32 x float>, ptr %x
@@ -1933,51 +1766,8 @@ define float @vreduce_fminimum_v32f32_nonans(ptr %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 32
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; CHECK-NEXT:    vle32.v v16, (a0)
-; CHECK-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
-; CHECK-NEXT:    vslidedown.vi v24, v16, 16
-; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v24, v24
-; CHECK-NEXT:    vmfeq.vv v8, v16, v16
-; CHECK-NEXT:    vmerge.vvm v12, v24, v16, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v16, v24, v0
-; CHECK-NEXT:    vfmin.vv v12, v12, v8
-; CHECK-NEXT:    vsetivli zero, 8, e32, m4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v16, v12, 8
-; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v16, v16
-; CHECK-NEXT:    vmfeq.vv v8, v12, v12
-; CHECK-NEXT:    vmerge.vvm v10, v16, v12, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v12, v16, v0
-; CHECK-NEXT:    vfmin.vv v10, v10, v8
-; CHECK-NEXT:    vsetivli zero, 4, e32, m2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v12, v10, 4
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v12, v12
-; CHECK-NEXT:    vmfeq.vv v8, v10, v10
-; CHECK-NEXT:    vmerge.vvm v9, v12, v10, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v10, v12, v0
-; CHECK-NEXT:    vfmin.vv v9, v9, v8
-; CHECK-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v10, v9, 2
-; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmin.vv v9, v11, v8
-; CHECK-NEXT:    vslidedown.vi v10, v9, 1
-; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmin.vv v8, v11, v8
+; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    vfredmin.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
   %v = load <32 x float>, ptr %x
@@ -2009,52 +1799,18 @@ define float @vreduce_fminimum_v64f32(ptr %x) {
 ; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vmerge.vvm v8, v24, v16, v0
 ; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfmin.vv v16, v8, v16
-; CHECK-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
-; CHECK-NEXT:    vslidedown.vi v24, v16, 16
-; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v24, v24
-; CHECK-NEXT:    vmfeq.vv v8, v16, v16
-; CHECK-NEXT:    vmerge.vvm v12, v24, v16, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v16, v24, v0
-; CHECK-NEXT:    vfmin.vv v12, v12, v8
-; CHECK-NEXT:    vsetivli zero, 8, e32, m4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v16, v12, 8
-; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v16, v16
-; CHECK-NEXT:    vmfeq.vv v8, v12, v12
-; CHECK-NEXT:    vmerge.vvm v10, v16, v12, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v12, v16, v0
-; CHECK-NEXT:    vfmin.vv v10, v10, v8
-; CHECK-NEXT:    vsetivli zero, 4, e32, m2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v12, v10, 4
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v12, v12
-; CHECK-NEXT:    vmfeq.vv v8, v10, v10
-; CHECK-NEXT:    vmerge.vvm v9, v12, v10, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v10, v12, v0
-; CHECK-NEXT:    vfmin.vv v9, v9, v8
-; CHECK-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v10, v9, 2
-; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmin.vv v9, v11, v8
-; CHECK-NEXT:    vslidedown.vi v10, v9, 1
-; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmin.vv v8, v11, v8
+; CHECK-NEXT:    vfmin.vv v8, v8, v16
+; CHECK-NEXT:    vmfne.vv v16, v8, v8
+; CHECK-NEXT:    vcpop.m a0, v16
+; CHECK-NEXT:    beqz a0, .LBB109_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    lui a0, 523264
+; CHECK-NEXT:    fmv.w.x fa0, a0
+; CHECK-NEXT:    j .LBB109_3
+; CHECK-NEXT:  .LBB109_2:
+; CHECK-NEXT:    vfredmin.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
+; CHECK-NEXT:  .LBB109_3:
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add sp, sp, a0
@@ -2073,51 +1829,8 @@ define float @vreduce_fminimum_v64f32_nonans(ptr %x) {
 ; CHECK-NEXT:    vle32.v v8, (a0)
 ; CHECK-NEXT:    addi a0, a0, 128
 ; CHECK-NEXT:    vle32.v v16, (a0)
-; CHECK-NEXT:    vfmin.vv v16, v8, v16
-; CHECK-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
-; CHECK-NEXT:    vslidedown.vi v24, v16, 16
-; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v24, v24
-; CHECK-NEXT:    vmfeq.vv v8, v16, v16
-; CHECK-NEXT:    vmerge.vvm v12, v24, v16, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v16, v24, v0
-; CHECK-NEXT:    vfmin.vv v12, v12, v8
-; CHECK-NEXT:    vsetivli zero, 8, e32, m4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v16, v12, 8
-; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v16, v16
-; CHECK-NEXT:    vmfeq.vv v8, v12, v12
-; CHECK-NEXT:    vmerge.vvm v10, v16, v12, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v12, v16, v0
-; CHECK-NEXT:    vfmin.vv v10, v10, v8
-; CHECK-NEXT:    vsetivli zero, 4, e32, m2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v12, v10, 4
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v12, v12
-; CHECK-NEXT:    vmfeq.vv v8, v10, v10
-; CHECK-NEXT:    vmerge.vvm v9, v12, v10, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v10, v12, v0
-; CHECK-NEXT:    vfmin.vv v9, v9, v8
-; CHECK-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v10, v9, 2
-; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmin.vv v9, v11, v8
-; CHECK-NEXT:    vslidedown.vi v10, v9, 1
-; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmin.vv v8, v11, v8
+; CHECK-NEXT:    vfmin.vv v8, v8, v16
+; CHECK-NEXT:    vfredmin.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
   %v = load <64 x float>, ptr %x
@@ -2208,52 +1921,18 @@ define float @vreduce_fminimum_v128f32(ptr %x) {
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfmin.vv v16, v8, v16
-; CHECK-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
-; CHECK-NEXT:    vslidedown.vi v24, v16, 16
-; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v24, v24
-; CHECK-NEXT:    vmfeq.vv v8, v16, v16
-; CHECK-NEXT:    vmerge.vvm v12, v24, v16, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v16, v24, v0
-; CHECK-NEXT:    vfmin.vv v12, v12, v8
-; CHECK-NEXT:    vsetivli zero, 8, e32, m4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v16, v12, 8
-; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v16, v16
-; CHECK-NEXT:    vmfeq.vv v8, v12, v12
-; CHECK-NEXT:    vmerge.vvm v10, v16, v12, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v12, v16, v0
-; CHECK-NEXT:    vfmin.vv v10, v10, v8
-; CHECK-NEXT:    vsetivli zero, 4, e32, m2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v12, v10, 4
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v12, v12
-; CHECK-NEXT:    vmfeq.vv v8, v10, v10
-; CHECK-NEXT:    vmerge.vvm v9, v12, v10, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v10, v12, v0
-; CHECK-NEXT:    vfmin.vv v9, v9, v8
-; CHECK-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v10, v9, 2
-; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmin.vv v9, v11, v8
-; CHECK-NEXT:    vslidedown.vi v10, v9, 1
-; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmin.vv v8, v11, v8
+; CHECK-NEXT:    vfmin.vv v8, v8, v16
+; CHECK-NEXT:    vmfne.vv v16, v8, v8
+; CHECK-NEXT:    vcpop.m a0, v16
+; CHECK-NEXT:    beqz a0, .LBB111_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    lui a0, 523264
+; CHECK-NEXT:    fmv.w.x fa0, a0
+; CHECK-NEXT:    j .LBB111_3
+; CHECK-NEXT:  .LBB111_2:
+; CHECK-NEXT:    vfredmin.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
+; CHECK-NEXT:  .LBB111_3:
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    mv a1, a0
@@ -2281,51 +1960,8 @@ define float @vreduce_fminimum_v128f32_nonans(ptr %x) {
 ; CHECK-NEXT:    vle32.v v0, (a1)
 ; CHECK-NEXT:    vfmin.vv v16, v24, v16
 ; CHECK-NEXT:    vfmin.vv v8, v8, v0
-; CHECK-NEXT:    vfmin.vv v16, v8, v16
-; CHECK-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
-; CHECK-NEXT:    vslidedown.vi v24, v16, 16
-; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v24, v24
-; CHECK-NEXT:    vmfeq.vv v8, v16, v16
-; CHECK-NEXT:    vmerge.vvm v12, v24, v16, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v16, v24, v0
-; CHECK-NEXT:    vfmin.vv v12, v12, v8
-; CHECK-NEXT:    vsetivli zero, 8, e32, m4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v16, v12, 8
-; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v16, v16
-; CHECK-NEXT:    vmfeq.vv v8, v12, v12
-; CHECK-NEXT:    vmerge.vvm v10, v16, v12, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v12, v16, v0
-; CHECK-NEXT:    vfmin.vv v10, v10, v8
-; CHECK-NEXT:    vsetivli zero, 4, e32, m2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v12, v10, 4
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v12, v12
-; CHECK-NEXT:    vmfeq.vv v8, v10, v10
-; CHECK-NEXT:    vmerge.vvm v9, v12, v10, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v10, v12, v0
-; CHECK-NEXT:    vfmin.vv v9, v9, v8
-; CHECK-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v10, v9, 2
-; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmin.vv v9, v11, v8
-; CHECK-NEXT:    vslidedown.vi v10, v9, 1
-; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmin.vv v8, v11, v8
+; CHECK-NEXT:    vfmin.vv v8, v8, v16
+; CHECK-NEXT:    vfredmin.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
   %v = load <128 x float>, ptr %x
@@ -2339,15 +1975,16 @@ define double @vreduce_fminimum_v2f64(ptr %x) {
 ; CHECK-LABEL: vreduce_fminimum_v2f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-NEXT:    vle64.v v9, (a0)
-; CHECK-NEXT:    vslidedown.vi v10, v9, 1
-; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmin.vv v8, v11, v8
+; CHECK-NEXT:    vle64.v v8, (a0)
+; CHECK-NEXT:    vmfne.vv v9, v8, v8
+; CHECK-NEXT:    vcpop.m a0, v9
+; CHECK-NEXT:    beqz a0, .LBB113_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    lui a0, %hi(.LCPI113_0)
+; CHECK-NEXT:    fld fa0, %lo(.LCPI113_0)(a0)
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB113_2:
+; CHECK-NEXT:    vfredmin.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
   %v = load <2 x double>, ptr %x
@@ -2359,15 +1996,8 @@ define double @vreduce_fminimum_v2f64_nonans(ptr %x) {
 ; CHECK-LABEL: vreduce_fminimum_v2f64_nonans:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-NEXT:    vle64.v v9, (a0)
-; CHECK-NEXT:    vslidedown.vi v10, v9, 1
-; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmin.vv v8, v11, v8
+; CHECK-NEXT:    vle64.v v8, (a0)
+; CHECK-NEXT:    vfredmin.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
   %v = load <2 x double>, ptr %x
@@ -2381,24 +2011,16 @@ define double @vreduce_fminimum_v4f64(ptr %x) {
 ; CHECK-LABEL: vreduce_fminimum_v4f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; CHECK-NEXT:    vle64.v v10, (a0)
-; CHECK-NEXT:    vsetivli zero, 2, e64, m2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v12, v10, 2
-; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v12, v12
-; CHECK-NEXT:    vmfeq.vv v8, v10, v10
-; CHECK-NEXT:    vmerge.vvm v9, v12, v10, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v10, v12, v0
-; CHECK-NEXT:    vfmin.vv v9, v9, v8
-; CHECK-NEXT:    vslidedown.vi v10, v9, 1
-; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmin.vv v8, v11, v8
+; CHECK-NEXT:    vle64.v v8, (a0)
+; CHECK-NEXT:    vmfne.vv v10, v8, v8
+; CHECK-NEXT:    vcpop.m a0, v10
+; CHECK-NEXT:    beqz a0, .LBB115_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    lui a0, %hi(.LCPI115_0)
+; CHECK-NEXT:    fld fa0, %lo(.LCPI115_0)(a0)
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB115_2:
+; CHECK-NEXT:    vfredmin.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
   %v = load <4 x double>, ptr %x
@@ -2410,24 +2032,8 @@ define double @vreduce_fminimum_v4f64_nonans(ptr %x) {
 ; CHECK-LABEL: vreduce_fminimum_v4f64_nonans:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; CHECK-NEXT:    vle64.v v10, (a0)
-; CHECK-NEXT:    vsetivli zero, 2, e64, m2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v12, v10, 2
-; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v12, v12
-; CHECK-NEXT:    vmfeq.vv v8, v10, v10
-; CHECK-NEXT:    vmerge.vvm v9, v12, v10, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v10, v12, v0
-; CHECK-NEXT:    vfmin.vv v9, v9, v8
-; CHECK-NEXT:    vslidedown.vi v10, v9, 1
-; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmin.vv v8, v11, v8
+; CHECK-NEXT:    vle64.v v8, (a0)
+; CHECK-NEXT:    vfredmin.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
   %v = load <4 x double>, ptr %x
@@ -2441,33 +2047,16 @@ define double @vreduce_fminimum_v8f64(ptr %x) {
 ; CHECK-LABEL: vreduce_fminimum_v8f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; CHECK-NEXT:    vle64.v v12, (a0)
-; CHECK-NEXT:    vsetivli zero, 4, e64, m4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v16, v12, 4
-; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v16, v16
-; CHECK-NEXT:    vmfeq.vv v8, v12, v12
-; CHECK-NEXT:    vmerge.vvm v10, v16, v12, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v12, v16, v0
-; CHECK-NEXT:    vfmin.vv v10, v10, v8
-; CHECK-NEXT:    vsetivli zero, 2, e64, m2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v12, v10, 2
-; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v12, v12
-; CHECK-NEXT:    vmfeq.vv v8, v10, v10
-; CHECK-NEXT:    vmerge.vvm v9, v12, v10, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v10, v12, v0
-; CHECK-NEXT:    vfmin.vv v9, v9, v8
-; CHECK-NEXT:    vslidedown.vi v10, v9, 1
-; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmin.vv v8, v11, v8
+; CHECK-NEXT:    vle64.v v8, (a0)
+; CHECK-NEXT:    vmfne.vv v12, v8, v8
+; CHECK-NEXT:    vcpop.m a0, v12
+; CHECK-NEXT:    beqz a0, .LBB117_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    lui a0, %hi(.LCPI117_0)
+; CHECK-NEXT:    fld fa0, %lo(.LCPI117_0)(a0)
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB117_2:
+; CHECK-NEXT:    vfredmin.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
   %v = load <8 x double>, ptr %x
@@ -2479,33 +2068,8 @@ define double @vreduce_fminimum_v8f64_nonans(ptr %x) {
 ; CHECK-LABEL: vreduce_fminimum_v8f64_nonans:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; CHECK-NEXT:    vle64.v v12, (a0)
-; CHECK-NEXT:    vsetivli zero, 4, e64, m4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v16, v12, 4
-; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v16, v16
-; CHECK-NEXT:    vmfeq.vv v8, v12, v12
-; CHECK-NEXT:    vmerge.vvm v10, v16, v12, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v12, v16, v0
-; CHECK-NEXT:    vfmin.vv v10, v10, v8
-; CHECK-NEXT:    vsetivli zero, 2, e64, m2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v12, v10, 2
-; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v12, v12
-; CHECK-NEXT:    vmfeq.vv v8, v10, v10
-; CHECK-NEXT:    vmerge.vvm v9, v12, v10, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v10, v12, v0
-; CHECK-NEXT:    vfmin.vv v9, v9, v8
-; CHECK-NEXT:    vslidedown.vi v10, v9, 1
-; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmin.vv v8, v11, v8
+; CHECK-NEXT:    vle64.v v8, (a0)
+; CHECK-NEXT:    vfredmin.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
   %v = load <8 x double>, ptr %x
@@ -2519,42 +2083,16 @@ define double @vreduce_fminimum_v16f64(ptr %x) {
 ; CHECK-LABEL: vreduce_fminimum_v16f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT:    vle64.v v16, (a0)
-; CHECK-NEXT:    vsetivli zero, 8, e64, m8, ta, ma
-; CHECK-NEXT:    vslidedown.vi v24, v16, 8
-; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v24, v24
-; CHECK-NEXT:    vmfeq.vv v8, v16, v16
-; CHECK-NEXT:    vmerge.vvm v12, v24, v16, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v16, v24, v0
-; CHECK-NEXT:    vfmin.vv v12, v12, v8
-; CHECK-NEXT:    vsetivli zero, 4, e64, m4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v16, v12, 4
-; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v16, v16
-; CHECK-NEXT:    vmfeq.vv v8, v12, v12
-; CHECK-NEXT:    vmerge.vvm v10, v16, v12, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v12, v16, v0
-; CHECK-NEXT:    vfmin.vv v10, v10, v8
-; CHECK-NEXT:    vsetivli zero, 2, e64, m2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v12, v10, 2
-; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v12, v12
-; CHECK-NEXT:    vmfeq.vv v8, v10, v10
-; CHECK-NEXT:    vmerge.vvm v9, v12, v10, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v10, v12, v0
-; CHECK-NEXT:    vfmin.vv v9, v9, v8
-; CHECK-NEXT:    vslidedown.vi v10, v9, 1
-; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmin.vv v8, v11, v8
+; CHECK-NEXT:    vle64.v v8, (a0)
+; CHECK-NEXT:    vmfne.vv v16, v8, v8
+; CHECK-NEXT:    vcpop.m a0, v16
+; CHECK-NEXT:    beqz a0, .LBB119_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    lui a0, %hi(.LCPI119_0)
+; CHECK-NEXT:    fld fa0, %lo(.LCPI119_0)(a0)
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB119_2:
+; CHECK-NEXT:    vfredmin.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
   %v = load <16 x double>, ptr %x
@@ -2566,42 +2104,8 @@ define double @vreduce_fminimum_v16f64_nonans(ptr %x) {
 ; CHECK-LABEL: vreduce_fminimum_v16f64_nonans:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT:    vle64.v v16, (a0)
-; CHECK-NEXT:    vsetivli zero, 8, e64, m8, ta, ma
-; CHECK-NEXT:    vslidedown.vi v24, v16, 8
-; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v24, v24
-; CHECK-NEXT:    vmfeq.vv v8, v16, v16
-; CHECK-NEXT:    vmerge.vvm v12, v24, v16, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v16, v24, v0
-; CHECK-NEXT:    vfmin.vv v12, v12, v8
-; CHECK-NEXT:    vsetivli zero, 4, e64, m4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v16, v12, 4
-; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v16, v16
-; CHECK-NEXT:    vmfeq.vv v8, v12, v12
-; CHECK-NEXT:    vmerge.vvm v10, v16, v12, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v12, v16, v0
-; CHECK-NEXT:    vfmin.vv v10, v10, v8
-; CHECK-NEXT:    vsetivli zero, 2, e64, m2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v12, v10, 2
-; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v12, v12
-; CHECK-NEXT:    vmfeq.vv v8, v10, v10
-; CHECK-NEXT:    vmerge.vvm v9, v12, v10, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v10, v12, v0
-; CHECK-NEXT:    vfmin.vv v9, v9, v8
-; CHECK-NEXT:    vslidedown.vi v10, v9, 1
-; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmin.vv v8, v11, v8
+; CHECK-NEXT:    vle64.v v8, (a0)
+; CHECK-NEXT:    vfredmin.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
   %v = load <16 x double>, ptr %x
@@ -2632,43 +2136,18 @@ define double @vreduce_fminimum_v32f64(ptr %x) {
 ; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vmerge.vvm v8, v24, v16, v0
 ; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfmin.vv v16, v8, v16
-; CHECK-NEXT:    vsetivli zero, 8, e64, m8, ta, ma
-; CHECK-NEXT:    vslidedown.vi v24, v16, 8
-; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v24, v24
-; CHECK-NEXT:    vmfeq.vv v8, v16, v16
-; CHECK-NEXT:    vmerge.vvm v12, v24, v16, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v16, v24, v0
-; CHECK-NEXT:    vfmin.vv v12, v12, v8
-; CHECK-NEXT:    vsetivli zero, 4, e64, m4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v16, v12, 4
-; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v16, v16
-; CHECK-NEXT:    vmfeq.vv v8, v12, v12
-; CHECK-NEXT:    vmerge.vvm v10, v16, v12, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v12, v16, v0
-; CHECK-NEXT:    vfmin.vv v10, v10, v8
-; CHECK-NEXT:    vsetivli zero, 2, e64, m2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v12, v10, 2
-; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v12, v12
-; CHECK-NEXT:    vmfeq.vv v8, v10, v10
-; CHECK-NEXT:    vmerge.vvm v9, v12, v10, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v10, v12, v0
-; CHECK-NEXT:    vfmin.vv v9, v9, v8
-; CHECK-NEXT:    vslidedown.vi v10, v9, 1
-; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmin.vv v8, v11, v8
+; CHECK-NEXT:    vfmin.vv v8, v8, v16
+; CHECK-NEXT:    vmfne.vv v16, v8, v8
+; CHECK-NEXT:    vcpop.m a0, v16
+; CHECK-NEXT:    beqz a0, .LBB121_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    lui a0, %hi(.LCPI121_0)
+; CHECK-NEXT:    fld fa0, %lo(.LCPI121_0)(a0)
+; CHECK-NEXT:    j .LBB121_3
+; CHECK-NEXT:  .LBB121_2:
+; CHECK-NEXT:    vfredmin.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
+; CHECK-NEXT:  .LBB121_3:
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add sp, sp, a0
@@ -2686,42 +2165,8 @@ define double @vreduce_fminimum_v32f64_nonans(ptr %x) {
 ; CHECK-NEXT:    vle64.v v8, (a0)
 ; CHECK-NEXT:    addi a0, a0, 128
 ; CHECK-NEXT:    vle64.v v16, (a0)
-; CHECK-NEXT:    vfmin.vv v16, v8, v16
-; CHECK-NEXT:    vsetivli zero, 8, e64, m8, ta, ma
-; CHECK-NEXT:    vslidedown.vi v24, v16, 8
-; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v24, v24
-; CHECK-NEXT:    vmfeq.vv v8, v16, v16
-; CHECK-NEXT:    vmerge.vvm v12, v24, v16, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v16, v24, v0
-; CHECK-NEXT:    vfmin.vv v12, v12, v8
-; CHECK-NEXT:    vsetivli zero, 4, e64, m4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v16, v12, 4
-; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v16, v16
-; CHECK-NEXT:    vmfeq.vv v8, v12, v12
-; CHECK-NEXT:    vmerge.vvm v10, v16, v12, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v12, v16, v0
-; CHECK-NEXT:    vfmin.vv v10, v10, v8
-; CHECK-NEXT:    vsetivli zero, 2, e64, m2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v12, v10, 2
-; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v12, v12
-; CHECK-NEXT:    vmfeq.vv v8, v10, v10
-; CHECK-NEXT:    vmerge.vvm v9, v12, v10, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v10, v12, v0
-; CHECK-NEXT:    vfmin.vv v9, v9, v8
-; CHECK-NEXT:    vslidedown.vi v10, v9, 1
-; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmin.vv v8, v11, v8
+; CHECK-NEXT:    vfmin.vv v8, v8, v16
+; CHECK-NEXT:    vfredmin.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
   %v = load <32 x double>, ptr %x
@@ -2811,43 +2256,18 @@ define double @vreduce_fminimum_v64f64(ptr %x) {
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfmin.vv v16, v8, v16
-; CHECK-NEXT:    vsetivli zero, 8, e64, m8, ta, ma
-; CHECK-NEXT:    vslidedown.vi v24, v16, 8
-; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v24, v24
-; CHECK-NEXT:    vmfeq.vv v8, v16, v16
-; CHECK-NEXT:    vmerge.vvm v12, v24, v16, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v16, v24, v0
-; CHECK-NEXT:    vfmin.vv v12, v12, v8
-; CHECK-NEXT:    vsetivli zero, 4, e64, m4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v16, v12, 4
-; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v16, v16
-; CHECK-NEXT:    vmfeq.vv v8, v12, v12
-; CHECK-NEXT:    vmerge.vvm v10, v16, v12, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v12, v16, v0
-; CHECK-NEXT:    vfmin.vv v10, v10, v8
-; CHECK-NEXT:    vsetivli zero, 2, e64, m2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v12, v10, 2
-; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v12, v12
-; CHECK-NEXT:    vmfeq.vv v8, v10, v10
-; CHECK-NEXT:    vmerge.vvm v9, v12, v10, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v10, v12, v0
-; CHECK-NEXT:    vfmin.vv v9, v9, v8
-; CHECK-NEXT:    vslidedown.vi v10, v9, 1
-; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmin.vv v8, v11, v8
+; CHECK-NEXT:    vfmin.vv v8, v8, v16
+; CHECK-NEXT:    vmfne.vv v16, v8, v8
+; CHECK-NEXT:    vcpop.m a0, v16
+; CHECK-NEXT:    beqz a0, .LBB123_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    lui a0, %hi(.LCPI123_0)
+; CHECK-NEXT:    fld fa0, %lo(.LCPI123_0)(a0)
+; CHECK-NEXT:    j .LBB123_3
+; CHECK-NEXT:  .LBB123_2:
+; CHECK-NEXT:    vfredmin.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
+; CHECK-NEXT:  .LBB123_3:
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    mv a1, a0
@@ -2874,42 +2294,8 @@ define double @vreduce_fminimum_v64f64_nonans(ptr %x) {
 ; CHECK-NEXT:    vle64.v v0, (a1)
 ; CHECK-NEXT:    vfmin.vv v16, v24, v16
 ; CHECK-NEXT:    vfmin.vv v8, v8, v0
-; CHECK-NEXT:    vfmin.vv v16, v8, v16
-; CHECK-NEXT:    vsetivli zero, 8, e64, m8, ta, ma
-; CHECK-NEXT:    vslidedown.vi v24, v16, 8
-; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v24, v24
-; CHECK-NEXT:    vmfeq.vv v8, v16, v16
-; CHECK-NEXT:    vmerge.vvm v12, v24, v16, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v16, v24, v0
-; CHECK-NEXT:    vfmin.vv v12, v12, v8
-; CHECK-NEXT:    vsetivli zero, 4, e64, m4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v16, v12, 4
-; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v16, v16
-; CHECK-NEXT:    vmfeq.vv v8, v12, v12
-; CHECK-NEXT:    vmerge.vvm v10, v16, v12, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v12, v16, v0
-; CHECK-NEXT:    vfmin.vv v10, v10, v8
-; CHECK-NEXT:    vsetivli zero, 2, e64, m2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v12, v10, 2
-; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v12, v12
-; CHECK-NEXT:    vmfeq.vv v8, v10, v10
-; CHECK-NEXT:    vmerge.vvm v9, v12, v10, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v10, v12, v0
-; CHECK-NEXT:    vfmin.vv v9, v9, v8
-; CHECK-NEXT:    vslidedown.vi v10, v9, 1
-; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmin.vv v8, v11, v8
+; CHECK-NEXT:    vfmin.vv v8, v8, v16
+; CHECK-NEXT:    vfredmin.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
   %v = load <64 x double>, ptr %x
@@ -2923,15 +2309,16 @@ define float @vreduce_fmaximum_v2f32(ptr %x) {
 ; CHECK-LABEL: vreduce_fmaximum_v2f32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vle32.v v9, (a0)
-; CHECK-NEXT:    vslidedown.vi v10, v9, 1
-; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmax.vv v8, v11, v8
+; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    vmfne.vv v9, v8, v8
+; CHECK-NEXT:    vcpop.m a0, v9
+; CHECK-NEXT:    beqz a0, .LBB125_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    lui a0, 523264
+; CHECK-NEXT:    fmv.w.x fa0, a0
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB125_2:
+; CHECK-NEXT:    vfredmax.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
   %v = load <2 x float>, ptr %x
@@ -2943,15 +2330,8 @@ define float @vreduce_fmaximum_v2f32_nonans(ptr %x) {
 ; CHECK-LABEL: vreduce_fmaximum_v2f32_nonans:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vle32.v v9, (a0)
-; CHECK-NEXT:    vslidedown.vi v10, v9, 1
-; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmax.vv v8, v11, v8
+; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    vfredmax.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
   %v = load <2 x float>, ptr %x
@@ -2965,24 +2345,16 @@ define float @vreduce_fmaximum_v4f32(ptr %x) {
 ; CHECK-LABEL: vreduce_fmaximum_v4f32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vle32.v v9, (a0)
-; CHECK-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v10, v9, 2
-; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmax.vv v9, v11, v8
-; CHECK-NEXT:    vslidedown.vi v10, v9, 1
-; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmax.vv v8, v11, v8
+; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    vmfne.vv v9, v8, v8
+; CHECK-NEXT:    vcpop.m a0, v9
+; CHECK-NEXT:    beqz a0, .LBB127_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    lui a0, 523264
+; CHECK-NEXT:    fmv.w.x fa0, a0
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB127_2:
+; CHECK-NEXT:    vfredmax.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
   %v = load <4 x float>, ptr %x
@@ -2994,24 +2366,8 @@ define float @vreduce_fmaximum_v4f32_nonans(ptr %x) {
 ; CHECK-LABEL: vreduce_fmaximum_v4f32_nonans:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vle32.v v9, (a0)
-; CHECK-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v10, v9, 2
-; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmax.vv v9, v11, v8
-; CHECK-NEXT:    vslidedown.vi v10, v9, 1
-; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmax.vv v8, v11, v8
+; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    vfredmax.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
   %v = load <4 x float>, ptr %x
@@ -3025,33 +2381,16 @@ define float @vreduce_fmaximum_v8f32(ptr %x) {
 ; CHECK-LABEL: vreduce_fmaximum_v8f32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vle32.v v10, (a0)
-; CHECK-NEXT:    vsetivli zero, 4, e32, m2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v12, v10, 4
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v12, v12
-; CHECK-NEXT:    vmfeq.vv v8, v10, v10
-; CHECK-NEXT:    vmerge.vvm v9, v12, v10, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v10, v12, v0
-; CHECK-NEXT:    vfmax.vv v9, v9, v8
-; CHECK-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v10, v9, 2
-; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmax.vv v9, v11, v8
-; CHECK-NEXT:    vslidedown.vi v10, v9, 1
-; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmax.vv v8, v11, v8
+; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    vmfne.vv v10, v8, v8
+; CHECK-NEXT:    vcpop.m a0, v10
+; CHECK-NEXT:    beqz a0, .LBB129_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    lui a0, 523264
+; CHECK-NEXT:    fmv.w.x fa0, a0
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB129_2:
+; CHECK-NEXT:    vfredmax.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
   %v = load <8 x float>, ptr %x
@@ -3063,33 +2402,8 @@ define float @vreduce_fmaximum_v8f32_nonans(ptr %x) {
 ; CHECK-LABEL: vreduce_fmaximum_v8f32_nonans:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vle32.v v10, (a0)
-; CHECK-NEXT:    vsetivli zero, 4, e32, m2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v12, v10, 4
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v12, v12
-; CHECK-NEXT:    vmfeq.vv v8, v10, v10
-; CHECK-NEXT:    vmerge.vvm v9, v12, v10, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v10, v12, v0
-; CHECK-NEXT:    vfmax.vv v9, v9, v8
-; CHECK-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v10, v9, 2
-; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmax.vv v9, v11, v8
-; CHECK-NEXT:    vslidedown.vi v10, v9, 1
-; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmax.vv v8, v11, v8
+; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    vfredmax.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
   %v = load <8 x float>, ptr %x
@@ -3103,42 +2417,16 @@ define float @vreduce_fmaximum_v16f32(ptr %x) {
 ; CHECK-LABEL: vreduce_fmaximum_v16f32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; CHECK-NEXT:    vle32.v v12, (a0)
-; CHECK-NEXT:    vsetivli zero, 8, e32, m4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v16, v12, 8
-; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v16, v16
-; CHECK-NEXT:    vmfeq.vv v8, v12, v12
-; CHECK-NEXT:    vmerge.vvm v10, v16, v12, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v12, v16, v0
-; CHECK-NEXT:    vfmax.vv v10, v10, v8
-; CHECK-NEXT:    vsetivli zero, 4, e32, m2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v12, v10, 4
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v12, v12
-; CHECK-NEXT:    vmfeq.vv v8, v10, v10
-; CHECK-NEXT:    vmerge.vvm v9, v12, v10, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v10, v12, v0
-; CHECK-NEXT:    vfmax.vv v9, v9, v8
-; CHECK-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v10, v9, 2
-; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmax.vv v9, v11, v8
-; CHECK-NEXT:    vslidedown.vi v10, v9, 1
-; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmax.vv v8, v11, v8
+; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    vmfne.vv v12, v8, v8
+; CHECK-NEXT:    vcpop.m a0, v12
+; CHECK-NEXT:    beqz a0, .LBB131_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    lui a0, 523264
+; CHECK-NEXT:    fmv.w.x fa0, a0
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB131_2:
+; CHECK-NEXT:    vfredmax.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
   %v = load <16 x float>, ptr %x
@@ -3150,42 +2438,8 @@ define float @vreduce_fmaximum_v16f32_nonans(ptr %x) {
 ; CHECK-LABEL: vreduce_fmaximum_v16f32_nonans:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; CHECK-NEXT:    vle32.v v12, (a0)
-; CHECK-NEXT:    vsetivli zero, 8, e32, m4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v16, v12, 8
-; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v16, v16
-; CHECK-NEXT:    vmfeq.vv v8, v12, v12
-; CHECK-NEXT:    vmerge.vvm v10, v16, v12, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v12, v16, v0
-; CHECK-NEXT:    vfmax.vv v10, v10, v8
-; CHECK-NEXT:    vsetivli zero, 4, e32, m2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v12, v10, 4
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v12, v12
-; CHECK-NEXT:    vmfeq.vv v8, v10, v10
-; CHECK-NEXT:    vmerge.vvm v9, v12, v10, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v10, v12, v0
-; CHECK-NEXT:    vfmax.vv v9, v9, v8
-; CHECK-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v10, v9, 2
-; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmax.vv v9, v11, v8
-; CHECK-NEXT:    vslidedown.vi v10, v9, 1
-; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmax.vv v8, v11, v8
+; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    vfredmax.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
   %v = load <16 x float>, ptr %x
@@ -3200,51 +2454,16 @@ define float @vreduce_fmaximum_v32f32(ptr %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 32
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; CHECK-NEXT:    vle32.v v16, (a0)
-; CHECK-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
-; CHECK-NEXT:    vslidedown.vi v24, v16, 16
-; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v24, v24
-; CHECK-NEXT:    vmfeq.vv v8, v16, v16
-; CHECK-NEXT:    vmerge.vvm v12, v24, v16, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v16, v24, v0
-; CHECK-NEXT:    vfmax.vv v12, v12, v8
-; CHECK-NEXT:    vsetivli zero, 8, e32, m4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v16, v12, 8
-; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v16, v16
-; CHECK-NEXT:    vmfeq.vv v8, v12, v12
-; CHECK-NEXT:    vmerge.vvm v10, v16, v12, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v12, v16, v0
-; CHECK-NEXT:    vfmax.vv v10, v10, v8
-; CHECK-NEXT:    vsetivli zero, 4, e32, m2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v12, v10, 4
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v12, v12
-; CHECK-NEXT:    vmfeq.vv v8, v10, v10
-; CHECK-NEXT:    vmerge.vvm v9, v12, v10, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v10, v12, v0
-; CHECK-NEXT:    vfmax.vv v9, v9, v8
-; CHECK-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v10, v9, 2
-; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmax.vv v9, v11, v8
-; CHECK-NEXT:    vslidedown.vi v10, v9, 1
-; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmax.vv v8, v11, v8
+; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    vmfne.vv v16, v8, v8
+; CHECK-NEXT:    vcpop.m a0, v16
+; CHECK-NEXT:    beqz a0, .LBB133_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    lui a0, 523264
+; CHECK-NEXT:    fmv.w.x fa0, a0
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB133_2:
+; CHECK-NEXT:    vfredmax.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
   %v = load <32 x float>, ptr %x
@@ -3257,51 +2476,8 @@ define float @vreduce_fmaximum_v32f32_nonans(ptr %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 32
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; CHECK-NEXT:    vle32.v v16, (a0)
-; CHECK-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
-; CHECK-NEXT:    vslidedown.vi v24, v16, 16
-; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v24, v24
-; CHECK-NEXT:    vmfeq.vv v8, v16, v16
-; CHECK-NEXT:    vmerge.vvm v12, v24, v16, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v16, v24, v0
-; CHECK-NEXT:    vfmax.vv v12, v12, v8
-; CHECK-NEXT:    vsetivli zero, 8, e32, m4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v16, v12, 8
-; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v16, v16
-; CHECK-NEXT:    vmfeq.vv v8, v12, v12
-; CHECK-NEXT:    vmerge.vvm v10, v16, v12, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v12, v16, v0
-; CHECK-NEXT:    vfmax.vv v10, v10, v8
-; CHECK-NEXT:    vsetivli zero, 4, e32, m2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v12, v10, 4
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v12, v12
-; CHECK-NEXT:    vmfeq.vv v8, v10, v10
-; CHECK-NEXT:    vmerge.vvm v9, v12, v10, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v10, v12, v0
-; CHECK-NEXT:    vfmax.vv v9, v9, v8
-; CHECK-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v10, v9, 2
-; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmax.vv v9, v11, v8
-; CHECK-NEXT:    vslidedown.vi v10, v9, 1
-; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmax.vv v8, v11, v8
+; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    vfredmax.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
   %v = load <32 x float>, ptr %x
@@ -3333,52 +2509,18 @@ define float @vreduce_fmaximum_v64f32(ptr %x) {
 ; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vmerge.vvm v8, v24, v16, v0
 ; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfmax.vv v16, v8, v16
-; CHECK-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
-; CHECK-NEXT:    vslidedown.vi v24, v16, 16
-; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v24, v24
-; CHECK-NEXT:    vmfeq.vv v8, v16, v16
-; CHECK-NEXT:    vmerge.vvm v12, v24, v16, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v16, v24, v0
-; CHECK-NEXT:    vfmax.vv v12, v12, v8
-; CHECK-NEXT:    vsetivli zero, 8, e32, m4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v16, v12, 8
-; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v16, v16
-; CHECK-NEXT:    vmfeq.vv v8, v12, v12
-; CHECK-NEXT:    vmerge.vvm v10, v16, v12, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v12, v16, v0
-; CHECK-NEXT:    vfmax.vv v10, v10, v8
-; CHECK-NEXT:    vsetivli zero, 4, e32, m2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v12, v10, 4
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v12, v12
-; CHECK-NEXT:    vmfeq.vv v8, v10, v10
-; CHECK-NEXT:    vmerge.vvm v9, v12, v10, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v10, v12, v0
-; CHECK-NEXT:    vfmax.vv v9, v9, v8
-; CHECK-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v10, v9, 2
-; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmax.vv v9, v11, v8
-; CHECK-NEXT:    vslidedown.vi v10, v9, 1
-; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmax.vv v8, v11, v8
+; CHECK-NEXT:    vfmax.vv v8, v8, v16
+; CHECK-NEXT:    vmfne.vv v16, v8, v8
+; CHECK-NEXT:    vcpop.m a0, v16
+; CHECK-NEXT:    beqz a0, .LBB135_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    lui a0, 523264
+; CHECK-NEXT:    fmv.w.x fa0, a0
+; CHECK-NEXT:    j .LBB135_3
+; CHECK-NEXT:  .LBB135_2:
+; CHECK-NEXT:    vfredmax.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
+; CHECK-NEXT:  .LBB135_3:
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add sp, sp, a0
@@ -3397,51 +2539,8 @@ define float @vreduce_fmaximum_v64f32_nonans(ptr %x) {
 ; CHECK-NEXT:    vle32.v v8, (a0)
 ; CHECK-NEXT:    addi a0, a0, 128
 ; CHECK-NEXT:    vle32.v v16, (a0)
-; CHECK-NEXT:    vfmax.vv v16, v8, v16
-; CHECK-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
-; CHECK-NEXT:    vslidedown.vi v24, v16, 16
-; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v24, v24
-; CHECK-NEXT:    vmfeq.vv v8, v16, v16
-; CHECK-NEXT:    vmerge.vvm v12, v24, v16, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v16, v24, v0
-; CHECK-NEXT:    vfmax.vv v12, v12, v8
-; CHECK-NEXT:    vsetivli zero, 8, e32, m4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v16, v12, 8
-; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v16, v16
-; CHECK-NEXT:    vmfeq.vv v8, v12, v12
-; CHECK-NEXT:    vmerge.vvm v10, v16, v12, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v12, v16, v0
-; CHECK-NEXT:    vfmax.vv v10, v10, v8
-; CHECK-NEXT:    vsetivli zero, 4, e32, m2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v12, v10, 4
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v12, v12
-; CHECK-NEXT:    vmfeq.vv v8, v10, v10
-; CHECK-NEXT:    vmerge.vvm v9, v12, v10, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v10, v12, v0
-; CHECK-NEXT:    vfmax.vv v9, v9, v8
-; CHECK-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v10, v9, 2
-; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmax.vv v9, v11, v8
-; CHECK-NEXT:    vslidedown.vi v10, v9, 1
-; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmax.vv v8, v11, v8
+; CHECK-NEXT:    vfmax.vv v8, v8, v16
+; CHECK-NEXT:    vfredmax.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
   %v = load <64 x float>, ptr %x
@@ -3532,52 +2631,18 @@ define float @vreduce_fmaximum_v128f32(ptr %x) {
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfmax.vv v16, v8, v16
-; CHECK-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
-; CHECK-NEXT:    vslidedown.vi v24, v16, 16
-; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v24, v24
-; CHECK-NEXT:    vmfeq.vv v8, v16, v16
-; CHECK-NEXT:    vmerge.vvm v12, v24, v16, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v16, v24, v0
-; CHECK-NEXT:    vfmax.vv v12, v12, v8
-; CHECK-NEXT:    vsetivli zero, 8, e32, m4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v16, v12, 8
-; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v16, v16
-; CHECK-NEXT:    vmfeq.vv v8, v12, v12
-; CHECK-NEXT:    vmerge.vvm v10, v16, v12, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v12, v16, v0
-; CHECK-NEXT:    vfmax.vv v10, v10, v8
-; CHECK-NEXT:    vsetivli zero, 4, e32, m2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v12, v10, 4
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v12, v12
-; CHECK-NEXT:    vmfeq.vv v8, v10, v10
-; CHECK-NEXT:    vmerge.vvm v9, v12, v10, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v10, v12, v0
-; CHECK-NEXT:    vfmax.vv v9, v9, v8
-; CHECK-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v10, v9, 2
-; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmax.vv v9, v11, v8
-; CHECK-NEXT:    vslidedown.vi v10, v9, 1
-; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmax.vv v8, v11, v8
+; CHECK-NEXT:    vfmax.vv v8, v8, v16
+; CHECK-NEXT:    vmfne.vv v16, v8, v8
+; CHECK-NEXT:    vcpop.m a0, v16
+; CHECK-NEXT:    beqz a0, .LBB137_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    lui a0, 523264
+; CHECK-NEXT:    fmv.w.x fa0, a0
+; CHECK-NEXT:    j .LBB137_3
+; CHECK-NEXT:  .LBB137_2:
+; CHECK-NEXT:    vfredmax.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
+; CHECK-NEXT:  .LBB137_3:
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    mv a1, a0
@@ -3605,51 +2670,8 @@ define float @vreduce_fmaximum_v128f32_nonans(ptr %x) {
 ; CHECK-NEXT:    vle32.v v0, (a1)
 ; CHECK-NEXT:    vfmax.vv v16, v24, v16
 ; CHECK-NEXT:    vfmax.vv v8, v8, v0
-; CHECK-NEXT:    vfmax.vv v16, v8, v16
-; CHECK-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
-; CHECK-NEXT:    vslidedown.vi v24, v16, 16
-; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v24, v24
-; CHECK-NEXT:    vmfeq.vv v8, v16, v16
-; CHECK-NEXT:    vmerge.vvm v12, v24, v16, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v16, v24, v0
-; CHECK-NEXT:    vfmax.vv v12, v12, v8
-; CHECK-NEXT:    vsetivli zero, 8, e32, m4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v16, v12, 8
-; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v16, v16
-; CHECK-NEXT:    vmfeq.vv v8, v12, v12
-; CHECK-NEXT:    vmerge.vvm v10, v16, v12, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v12, v16, v0
-; CHECK-NEXT:    vfmax.vv v10, v10, v8
-; CHECK-NEXT:    vsetivli zero, 4, e32, m2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v12, v10, 4
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v12, v12
-; CHECK-NEXT:    vmfeq.vv v8, v10, v10
-; CHECK-NEXT:    vmerge.vvm v9, v12, v10, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v10, v12, v0
-; CHECK-NEXT:    vfmax.vv v9, v9, v8
-; CHECK-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v10, v9, 2
-; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmax.vv v9, v11, v8
-; CHECK-NEXT:    vslidedown.vi v10, v9, 1
-; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmax.vv v8, v11, v8
+; CHECK-NEXT:    vfmax.vv v8, v8, v16
+; CHECK-NEXT:    vfredmax.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
   %v = load <128 x float>, ptr %x
@@ -3663,15 +2685,16 @@ define double @vreduce_fmaximum_v2f64(ptr %x) {
 ; CHECK-LABEL: vreduce_fmaximum_v2f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-NEXT:    vle64.v v9, (a0)
-; CHECK-NEXT:    vslidedown.vi v10, v9, 1
-; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmax.vv v8, v11, v8
+; CHECK-NEXT:    vle64.v v8, (a0)
+; CHECK-NEXT:    vmfne.vv v9, v8, v8
+; CHECK-NEXT:    vcpop.m a0, v9
+; CHECK-NEXT:    beqz a0, .LBB139_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    lui a0, %hi(.LCPI139_0)
+; CHECK-NEXT:    fld fa0, %lo(.LCPI139_0)(a0)
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB139_2:
+; CHECK-NEXT:    vfredmax.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
   %v = load <2 x double>, ptr %x
@@ -3683,15 +2706,8 @@ define double @vreduce_fmaximum_v2f64_nonans(ptr %x) {
 ; CHECK-LABEL: vreduce_fmaximum_v2f64_nonans:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-NEXT:    vle64.v v9, (a0)
-; CHECK-NEXT:    vslidedown.vi v10, v9, 1
-; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmax.vv v8, v11, v8
+; CHECK-NEXT:    vle64.v v8, (a0)
+; CHECK-NEXT:    vfredmax.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
   %v = load <2 x double>, ptr %x
@@ -3705,24 +2721,16 @@ define double @vreduce_fmaximum_v4f64(ptr %x) {
 ; CHECK-LABEL: vreduce_fmaximum_v4f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; CHECK-NEXT:    vle64.v v10, (a0)
-; CHECK-NEXT:    vsetivli zero, 2, e64, m2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v12, v10, 2
-; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v12, v12
-; CHECK-NEXT:    vmfeq.vv v8, v10, v10
-; CHECK-NEXT:    vmerge.vvm v9, v12, v10, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v10, v12, v0
-; CHECK-NEXT:    vfmax.vv v9, v9, v8
-; CHECK-NEXT:    vslidedown.vi v10, v9, 1
-; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmax.vv v8, v11, v8
+; CHECK-NEXT:    vle64.v v8, (a0)
+; CHECK-NEXT:    vmfne.vv v10, v8, v8
+; CHECK-NEXT:    vcpop.m a0, v10
+; CHECK-NEXT:    beqz a0, .LBB141_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    lui a0, %hi(.LCPI141_0)
+; CHECK-NEXT:    fld fa0, %lo(.LCPI141_0)(a0)
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB141_2:
+; CHECK-NEXT:    vfredmax.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
   %v = load <4 x double>, ptr %x
@@ -3734,24 +2742,8 @@ define double @vreduce_fmaximum_v4f64_nonans(ptr %x) {
 ; CHECK-LABEL: vreduce_fmaximum_v4f64_nonans:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; CHECK-NEXT:    vle64.v v10, (a0)
-; CHECK-NEXT:    vsetivli zero, 2, e64, m2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v12, v10, 2
-; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v12, v12
-; CHECK-NEXT:    vmfeq.vv v8, v10, v10
-; CHECK-NEXT:    vmerge.vvm v9, v12, v10, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v10, v12, v0
-; CHECK-NEXT:    vfmax.vv v9, v9, v8
-; CHECK-NEXT:    vslidedown.vi v10, v9, 1
-; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmax.vv v8, v11, v8
+; CHECK-NEXT:    vle64.v v8, (a0)
+; CHECK-NEXT:    vfredmax.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
   %v = load <4 x double>, ptr %x
@@ -3765,33 +2757,16 @@ define double @vreduce_fmaximum_v8f64(ptr %x) {
 ; CHECK-LABEL: vreduce_fmaximum_v8f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; CHECK-NEXT:    vle64.v v12, (a0)
-; CHECK-NEXT:    vsetivli zero, 4, e64, m4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v16, v12, 4
-; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v16, v16
-; CHECK-NEXT:    vmfeq.vv v8, v12, v12
-; CHECK-NEXT:    vmerge.vvm v10, v16, v12, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v12, v16, v0
-; CHECK-NEXT:    vfmax.vv v10, v10, v8
-; CHECK-NEXT:    vsetivli zero, 2, e64, m2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v12, v10, 2
-; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v12, v12
-; CHECK-NEXT:    vmfeq.vv v8, v10, v10
-; CHECK-NEXT:    vmerge.vvm v9, v12, v10, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v10, v12, v0
-; CHECK-NEXT:    vfmax.vv v9, v9, v8
-; CHECK-NEXT:    vslidedown.vi v10, v9, 1
-; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmax.vv v8, v11, v8
+; CHECK-NEXT:    vle64.v v8, (a0)
+; CHECK-NEXT:    vmfne.vv v12, v8, v8
+; CHECK-NEXT:    vcpop.m a0, v12
+; CHECK-NEXT:    beqz a0, .LBB143_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    lui a0, %hi(.LCPI143_0)
+; CHECK-NEXT:    fld fa0, %lo(.LCPI143_0)(a0)
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB143_2:
+; CHECK-NEXT:    vfredmax.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
   %v = load <8 x double>, ptr %x
@@ -3803,33 +2778,8 @@ define double @vreduce_fmaximum_v8f64_nonans(ptr %x) {
 ; CHECK-LABEL: vreduce_fmaximum_v8f64_nonans:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; CHECK-NEXT:    vle64.v v12, (a0)
-; CHECK-NEXT:    vsetivli zero, 4, e64, m4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v16, v12, 4
-; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v16, v16
-; CHECK-NEXT:    vmfeq.vv v8, v12, v12
-; CHECK-NEXT:    vmerge.vvm v10, v16, v12, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v12, v16, v0
-; CHECK-NEXT:    vfmax.vv v10, v10, v8
-; CHECK-NEXT:    vsetivli zero, 2, e64, m2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v12, v10, 2
-; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v12, v12
-; CHECK-NEXT:    vmfeq.vv v8, v10, v10
-; CHECK-NEXT:    vmerge.vvm v9, v12, v10, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v10, v12, v0
-; CHECK-NEXT:    vfmax.vv v9, v9, v8
-; CHECK-NEXT:    vslidedown.vi v10, v9, 1
-; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmax.vv v8, v11, v8
+; CHECK-NEXT:    vle64.v v8, (a0)
+; CHECK-NEXT:    vfredmax.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
   %v = load <8 x double>, ptr %x
@@ -3843,42 +2793,16 @@ define double @vreduce_fmaximum_v16f64(ptr %x) {
 ; CHECK-LABEL: vreduce_fmaximum_v16f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT:    vle64.v v16, (a0)
-; CHECK-NEXT:    vsetivli zero, 8, e64, m8, ta, ma
-; CHECK-NEXT:    vslidedown.vi v24, v16, 8
-; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v24, v24
-; CHECK-NEXT:    vmfeq.vv v8, v16, v16
-; CHECK-NEXT:    vmerge.vvm v12, v24, v16, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v16, v24, v0
-; CHECK-NEXT:    vfmax.vv v12, v12, v8
-; CHECK-NEXT:    vsetivli zero, 4, e64, m4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v16, v12, 4
-; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v16, v16
-; CHECK-NEXT:    vmfeq.vv v8, v12, v12
-; CHECK-NEXT:    vmerge.vvm v10, v16, v12, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v12, v16, v0
-; CHECK-NEXT:    vfmax.vv v10, v10, v8
-; CHECK-NEXT:    vsetivli zero, 2, e64, m2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v12, v10, 2
-; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v12, v12
-; CHECK-NEXT:    vmfeq.vv v8, v10, v10
-; CHECK-NEXT:    vmerge.vvm v9, v12, v10, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v10, v12, v0
-; CHECK-NEXT:    vfmax.vv v9, v9, v8
-; CHECK-NEXT:    vslidedown.vi v10, v9, 1
-; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmax.vv v8, v11, v8
+; CHECK-NEXT:    vle64.v v8, (a0)
+; CHECK-NEXT:    vmfne.vv v16, v8, v8
+; CHECK-NEXT:    vcpop.m a0, v16
+; CHECK-NEXT:    beqz a0, .LBB145_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    lui a0, %hi(.LCPI145_0)
+; CHECK-NEXT:    fld fa0, %lo(.LCPI145_0)(a0)
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB145_2:
+; CHECK-NEXT:    vfredmax.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
   %v = load <16 x double>, ptr %x
@@ -3890,42 +2814,8 @@ define double @vreduce_fmaximum_v16f64_nonans(ptr %x) {
 ; CHECK-LABEL: vreduce_fmaximum_v16f64_nonans:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT:    vle64.v v16, (a0)
-; CHECK-NEXT:    vsetivli zero, 8, e64, m8, ta, ma
-; CHECK-NEXT:    vslidedown.vi v24, v16, 8
-; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v24, v24
-; CHECK-NEXT:    vmfeq.vv v8, v16, v16
-; CHECK-NEXT:    vmerge.vvm v12, v24, v16, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v16, v24, v0
-; CHECK-NEXT:    vfmax.vv v12, v12, v8
-; CHECK-NEXT:    vsetivli zero, 4, e64, m4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v16, v12, 4
-; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v16, v16
-; CHECK-NEXT:    vmfeq.vv v8, v12, v12
-; CHECK-NEXT:    vmerge.vvm v10, v16, v12, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v12, v16, v0
-; CHECK-NEXT:    vfmax.vv v10, v10, v8
-; CHECK-NEXT:    vsetivli zero, 2, e64, m2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v12, v10, 2
-; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v12, v12
-; CHECK-NEXT:    vmfeq.vv v8, v10, v10
-; CHECK-NEXT:    vmerge.vvm v9, v12, v10, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v10, v12, v0
-; CHECK-NEXT:    vfmax.vv v9, v9, v8
-; CHECK-NEXT:    vslidedown.vi v10, v9, 1
-; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmax.vv v8, v11, v8
+; CHECK-NEXT:    vle64.v v8, (a0)
+; CHECK-NEXT:    vfredmax.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
   %v = load <16 x double>, ptr %x
@@ -3956,43 +2846,18 @@ define double @vreduce_fmaximum_v32f64(ptr %x) {
 ; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vmerge.vvm v8, v24, v16, v0
 ; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfmax.vv v16, v8, v16
-; CHECK-NEXT:    vsetivli zero, 8, e64, m8, ta, ma
-; CHECK-NEXT:    vslidedown.vi v24, v16, 8
-; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v24, v24
-; CHECK-NEXT:    vmfeq.vv v8, v16, v16
-; CHECK-NEXT:    vmerge.vvm v12, v24, v16, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v16, v24, v0
-; CHECK-NEXT:    vfmax.vv v12, v12, v8
-; CHECK-NEXT:    vsetivli zero, 4, e64, m4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v16, v12, 4
-; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v16, v16
-; CHECK-NEXT:    vmfeq.vv v8, v12, v12
-; CHECK-NEXT:    vmerge.vvm v10, v16, v12, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v12, v16, v0
-; CHECK-NEXT:    vfmax.vv v10, v10, v8
-; CHECK-NEXT:    vsetivli zero, 2, e64, m2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v12, v10, 2
-; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v12, v12
-; CHECK-NEXT:    vmfeq.vv v8, v10, v10
-; CHECK-NEXT:    vmerge.vvm v9, v12, v10, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v10, v12, v0
-; CHECK-NEXT:    vfmax.vv v9, v9, v8
-; CHECK-NEXT:    vslidedown.vi v10, v9, 1
-; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmax.vv v8, v11, v8
+; CHECK-NEXT:    vfmax.vv v8, v8, v16
+; CHECK-NEXT:    vmfne.vv v16, v8, v8
+; CHECK-NEXT:    vcpop.m a0, v16
+; CHECK-NEXT:    beqz a0, .LBB147_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    lui a0, %hi(.LCPI147_0)
+; CHECK-NEXT:    fld fa0, %lo(.LCPI147_0)(a0)
+; CHECK-NEXT:    j .LBB147_3
+; CHECK-NEXT:  .LBB147_2:
+; CHECK-NEXT:    vfredmax.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
+; CHECK-NEXT:  .LBB147_3:
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add sp, sp, a0
@@ -4010,42 +2875,8 @@ define double @vreduce_fmaximum_v32f64_nonans(ptr %x) {
 ; CHECK-NEXT:    vle64.v v8, (a0)
 ; CHECK-NEXT:    addi a0, a0, 128
 ; CHECK-NEXT:    vle64.v v16, (a0)
-; CHECK-NEXT:    vfmax.vv v16, v8, v16
-; CHECK-NEXT:    vsetivli zero, 8, e64, m8, ta, ma
-; CHECK-NEXT:    vslidedown.vi v24, v16, 8
-; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v24, v24
-; CHECK-NEXT:    vmfeq.vv v8, v16, v16
-; CHECK-NEXT:    vmerge.vvm v12, v24, v16, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v16, v24, v0
-; CHECK-NEXT:    vfmax.vv v12, v12, v8
-; CHECK-NEXT:    vsetivli zero, 4, e64, m4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v16, v12, 4
-; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v16, v16
-; CHECK-NEXT:    vmfeq.vv v8, v12, v12
-; CHECK-NEXT:    vmerge.vvm v10, v16, v12, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v12, v16, v0
-; CHECK-NEXT:    vfmax.vv v10, v10, v8
-; CHECK-NEXT:    vsetivli zero, 2, e64, m2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v12, v10, 2
-; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v12, v12
-; CHECK-NEXT:    vmfeq.vv v8, v10, v10
-; CHECK-NEXT:    vmerge.vvm v9, v12, v10, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v10, v12, v0
-; CHECK-NEXT:    vfmax.vv v9, v9, v8
-; CHECK-NEXT:    vslidedown.vi v10, v9, 1
-; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmax.vv v8, v11, v8
+; CHECK-NEXT:    vfmax.vv v8, v8, v16
+; CHECK-NEXT:    vfredmax.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
   %v = load <32 x double>, ptr %x
@@ -4135,43 +2966,18 @@ define double @vreduce_fmaximum_v64f64(ptr %x) {
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfmax.vv v16, v8, v16
-; CHECK-NEXT:    vsetivli zero, 8, e64, m8, ta, ma
-; CHECK-NEXT:    vslidedown.vi v24, v16, 8
-; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v24, v24
-; CHECK-NEXT:    vmfeq.vv v8, v16, v16
-; CHECK-NEXT:    vmerge.vvm v12, v24, v16, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v16, v24, v0
-; CHECK-NEXT:    vfmax.vv v12, v12, v8
-; CHECK-NEXT:    vsetivli zero, 4, e64, m4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v16, v12, 4
-; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v16, v16
-; CHECK-NEXT:    vmfeq.vv v8, v12, v12
-; CHECK-NEXT:    vmerge.vvm v10, v16, v12, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v12, v16, v0
-; CHECK-NEXT:    vfmax.vv v10, v10, v8
-; CHECK-NEXT:    vsetivli zero, 2, e64, m2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v12, v10, 2
-; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v12, v12
-; CHECK-NEXT:    vmfeq.vv v8, v10, v10
-; CHECK-NEXT:    vmerge.vvm v9, v12, v10, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v10, v12, v0
-; CHECK-NEXT:    vfmax.vv v9, v9, v8
-; CHECK-NEXT:    vslidedown.vi v10, v9, 1
-; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmax.vv v8, v11, v8
+; CHECK-NEXT:    vfmax.vv v8, v8, v16
+; CHECK-NEXT:    vmfne.vv v16, v8, v8
+; CHECK-NEXT:    vcpop.m a0, v16
+; CHECK-NEXT:    beqz a0, .LBB149_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    lui a0, %hi(.LCPI149_0)
+; CHECK-NEXT:    fld fa0, %lo(.LCPI149_0)(a0)
+; CHECK-NEXT:    j .LBB149_3
+; CHECK-NEXT:  .LBB149_2:
+; CHECK-NEXT:    vfredmax.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
+; CHECK-NEXT:  .LBB149_3:
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    mv a1, a0
@@ -4198,42 +3004,8 @@ define double @vreduce_fmaximum_v64f64_nonans(ptr %x) {
 ; CHECK-NEXT:    vle64.v v0, (a1)
 ; CHECK-NEXT:    vfmax.vv v16, v24, v16
 ; CHECK-NEXT:    vfmax.vv v8, v8, v0
-; CHECK-NEXT:    vfmax.vv v16, v8, v16
-; CHECK-NEXT:    vsetivli zero, 8, e64, m8, ta, ma
-; CHECK-NEXT:    vslidedown.vi v24, v16, 8
-; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v24, v24
-; CHECK-NEXT:    vmfeq.vv v8, v16, v16
-; CHECK-NEXT:    vmerge.vvm v12, v24, v16, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v16, v24, v0
-; CHECK-NEXT:    vfmax.vv v12, v12, v8
-; CHECK-NEXT:    vsetivli zero, 4, e64, m4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v16, v12, 4
-; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v16, v16
-; CHECK-NEXT:    vmfeq.vv v8, v12, v12
-; CHECK-NEXT:    vmerge.vvm v10, v16, v12, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v12, v16, v0
-; CHECK-NEXT:    vfmax.vv v10, v10, v8
-; CHECK-NEXT:    vsetivli zero, 2, e64, m2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v12, v10, 2
-; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v12, v12
-; CHECK-NEXT:    vmfeq.vv v8, v10, v10
-; CHECK-NEXT:    vmerge.vvm v9, v12, v10, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v10, v12, v0
-; CHECK-NEXT:    vfmax.vv v9, v9, v8
-; CHECK-NEXT:    vslidedown.vi v10, v9, 1
-; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vmfeq.vv v8, v9, v9
-; CHECK-NEXT:    vmerge.vvm v11, v10, v9, v0
-; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
-; CHECK-NEXT:    vfmax.vv v8, v11, v8
+; CHECK-NEXT:    vfmax.vv v8, v8, v16
+; CHECK-NEXT:    vfredmax.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
   %v = load <64 x double>, ptr %x
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-concat.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-concat.ll
index 8474f95edd813f..e5bef20fd9e24d 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-concat.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-concat.ll
@@ -5,59 +5,6 @@
 ; RUN: llc < %s -mtriple=riscv32 -mattr=+v -riscv-v-vector-bits-max=128 -verify-machineinstrs | FileCheck -check-prefixes=CHECK,VLS %s
 ; RUN: llc < %s -mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-max=128 -verify-machineinstrs | FileCheck -check-prefixes=CHECK,VLS %s
 
-define <8 x i16> @concat_2xv4i16(<4 x i16> %a, <4 x i16> %b) {
-; CHECK-LABEL: concat_2xv4i16:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT:    vslideup.vi v8, v9, 4
-; CHECK-NEXT:    ret
-  %ab = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  ret <8 x i16> %ab
-}
-
-define <8 x i16> @concat_4xv2i16(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, <2 x i16> %d) {
-; CHECK-LABEL: concat_4xv2i16:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
-; CHECK-NEXT:    vslideup.vi v10, v11, 2
-; CHECK-NEXT:    vslideup.vi v8, v9, 2
-; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT:    vslideup.vi v8, v10, 4
-; CHECK-NEXT:    ret
-  %ab = shufflevector <2 x i16> %a, <2 x i16> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  %cd = shufflevector <2 x i16> %c, <2 x i16> %d, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  %abcd = shufflevector <4 x i16> %ab, <4 x i16> %cd, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  ret <8 x i16> %abcd
-}
-
-define <8 x i16> @concat_8xv1i16(<1 x i16> %a, <1 x i16> %b, <1 x i16> %c, <1 x i16> %d, <1 x i16> %e, <1 x i16> %f, <1 x i16> %g, <1 x i16> %h) {
-; CHECK-LABEL: concat_8xv1i16:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 2, e16, mf2, tu, ma
-; CHECK-NEXT:    vslideup.vi v12, v13, 1
-; CHECK-NEXT:    vsetivli zero, 3, e16, mf2, tu, ma
-; CHECK-NEXT:    vslideup.vi v12, v14, 2
-; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
-; CHECK-NEXT:    vslideup.vi v12, v15, 3
-; CHECK-NEXT:    vsetivli zero, 2, e16, mf2, tu, ma
-; CHECK-NEXT:    vslideup.vi v8, v9, 1
-; CHECK-NEXT:    vsetivli zero, 3, e16, mf2, tu, ma
-; CHECK-NEXT:    vslideup.vi v8, v10, 2
-; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
-; CHECK-NEXT:    vslideup.vi v8, v11, 3
-; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT:    vslideup.vi v8, v12, 4
-; CHECK-NEXT:    ret
-  %ab = shufflevector <1 x i16> %a, <1 x i16> %b, <2 x i32> <i32 0, i32 1>
-  %cd = shufflevector <1 x i16> %c, <1 x i16> %d, <2 x i32> <i32 0, i32 1>
-  %abcd = shufflevector <2 x i16> %ab, <2 x i16> %cd, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  %ef = shufflevector <1 x i16> %e, <1 x i16> %f, <2 x i32> <i32 0, i32 1>
-  %gh = shufflevector <1 x i16> %g, <1 x i16> %h, <2 x i32> <i32 0, i32 1>
-  %efgh = shufflevector <2 x i16> %ef, <2 x i16> %gh, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  %abcdefgh = shufflevector <4 x i16> %abcd, <4 x i16> %efgh, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  ret <8 x i16> %abcdefgh
-}
-
 define <8 x i32> @concat_2xv4i32(<4 x i32> %a, <4 x i32> %b) {
 ; CHECK-LABEL: concat_2xv4i32:
 ; CHECK:       # %bb.0:
@@ -72,11 +19,14 @@ define <8 x i32> @concat_2xv4i32(<4 x i32> %a, <4 x i32> %b) {
 define <8 x i32> @concat_4xv2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x i32> %d) {
 ; CHECK-LABEL: concat_4xv2i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vslideup.vi v10, v11, 2
-; CHECK-NEXT:    vslideup.vi v8, v9, 2
-; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT:    vmv1r.v v12, v11
+; CHECK-NEXT:    vmv1r.v v14, v9
+; CHECK-NEXT:    vsetivli zero, 4, e32, m2, tu, ma
+; CHECK-NEXT:    vslideup.vi v8, v14, 2
+; CHECK-NEXT:    vsetivli zero, 6, e32, m2, tu, ma
 ; CHECK-NEXT:    vslideup.vi v8, v10, 4
+; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT:    vslideup.vi v8, v12, 6
 ; CHECK-NEXT:    ret
   %ab = shufflevector <2 x i32> %a, <2 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %cd = shufflevector <2 x i32> %c, <2 x i32> %d, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -87,18 +37,24 @@ define <8 x i32> @concat_4xv2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x
 define <8 x i32> @concat_8xv1i32(<1 x i32> %a, <1 x i32> %b, <1 x i32> %c, <1 x i32> %d, <1 x i32> %e, <1 x i32> %f, <1 x i32> %g, <1 x i32> %h) {
 ; CHECK-LABEL: concat_8xv1i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vslideup.vi v14, v15, 1
-; CHECK-NEXT:    vslideup.vi v12, v13, 1
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vslideup.vi v12, v14, 2
-; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vslideup.vi v10, v11, 1
-; CHECK-NEXT:    vslideup.vi v8, v9, 1
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vmv1r.v v16, v15
+; CHECK-NEXT:    vmv1r.v v18, v13
+; CHECK-NEXT:    vmv1r.v v20, v11
+; CHECK-NEXT:    vmv1r.v v22, v9
+; CHECK-NEXT:    vsetivli zero, 2, e32, m2, tu, ma
+; CHECK-NEXT:    vslideup.vi v8, v22, 1
+; CHECK-NEXT:    vsetivli zero, 3, e32, m2, tu, ma
 ; CHECK-NEXT:    vslideup.vi v8, v10, 2
-; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT:    vsetivli zero, 4, e32, m2, tu, ma
+; CHECK-NEXT:    vslideup.vi v8, v20, 3
+; CHECK-NEXT:    vsetivli zero, 5, e32, m2, tu, ma
 ; CHECK-NEXT:    vslideup.vi v8, v12, 4
+; CHECK-NEXT:    vsetivli zero, 6, e32, m2, tu, ma
+; CHECK-NEXT:    vslideup.vi v8, v18, 5
+; CHECK-NEXT:    vsetivli zero, 7, e32, m2, tu, ma
+; CHECK-NEXT:    vslideup.vi v8, v14, 6
+; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT:    vslideup.vi v8, v16, 7
 ; CHECK-NEXT:    ret
   %ab = shufflevector <1 x i32> %a, <1 x i32> %b, <2 x i32> <i32 0, i32 1>
   %cd = shufflevector <1 x i32> %c, <1 x i32> %d, <2 x i32> <i32 0, i32 1>
@@ -124,14 +80,15 @@ define <16 x i32> @concat_2xv8i32(<8 x i32> %a, <8 x i32> %b) {
 define <16 x i32> @concat_4xv4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d) {
 ; CHECK-LABEL: concat_4xv4i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v14, v11
-; CHECK-NEXT:    vmv1r.v v12, v10
-; CHECK-NEXT:    vmv1r.v v10, v9
-; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vslideup.vi v12, v14, 4
-; CHECK-NEXT:    vslideup.vi v8, v10, 4
+; CHECK-NEXT:    vmv1r.v v12, v11
+; CHECK-NEXT:    vmv1r.v v16, v10
+; CHECK-NEXT:    vmv1r.v v20, v9
+; CHECK-NEXT:    vsetivli zero, 8, e32, m4, tu, ma
+; CHECK-NEXT:    vslideup.vi v8, v20, 4
+; CHECK-NEXT:    vsetivli zero, 12, e32, m4, tu, ma
+; CHECK-NEXT:    vslideup.vi v8, v16, 8
 ; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; CHECK-NEXT:    vslideup.vi v8, v12, 8
+; CHECK-NEXT:    vslideup.vi v8, v12, 12
 ; CHECK-NEXT:    ret
   %ab = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %cd = shufflevector <4 x i32> %c, <4 x i32> %d, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -142,18 +99,26 @@ define <16 x i32> @concat_4xv4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x
 define <16 x i32> @concat_8xv2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x i32> %d, <2 x i32> %e, <2 x i32> %f, <2 x i32> %g, <2 x i32> %h) {
 ; CHECK-LABEL: concat_8xv2i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vslideup.vi v14, v15, 2
-; CHECK-NEXT:    vslideup.vi v12, v13, 2
-; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vslideup.vi v12, v14, 4
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vslideup.vi v10, v11, 2
-; CHECK-NEXT:    vslideup.vi v8, v9, 2
-; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vslideup.vi v8, v10, 4
-; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; CHECK-NEXT:    vmv1r.v v16, v15
+; CHECK-NEXT:    vmv1r.v v20, v14
+; CHECK-NEXT:    vmv1r.v v24, v13
+; CHECK-NEXT:    vmv1r.v v28, v11
+; CHECK-NEXT:    vmv1r.v v4, v10
+; CHECK-NEXT:    vmv1r.v v0, v9
+; CHECK-NEXT:    vsetivli zero, 4, e32, m4, tu, ma
+; CHECK-NEXT:    vslideup.vi v8, v0, 2
+; CHECK-NEXT:    vsetivli zero, 6, e32, m4, tu, ma
+; CHECK-NEXT:    vslideup.vi v8, v4, 4
+; CHECK-NEXT:    vsetivli zero, 8, e32, m4, tu, ma
+; CHECK-NEXT:    vslideup.vi v8, v28, 6
+; CHECK-NEXT:    vsetivli zero, 10, e32, m4, tu, ma
 ; CHECK-NEXT:    vslideup.vi v8, v12, 8
+; CHECK-NEXT:    vsetivli zero, 12, e32, m4, tu, ma
+; CHECK-NEXT:    vslideup.vi v8, v24, 10
+; CHECK-NEXT:    vsetivli zero, 14, e32, m4, tu, ma
+; CHECK-NEXT:    vslideup.vi v8, v20, 12
+; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; CHECK-NEXT:    vslideup.vi v8, v16, 14
 ; CHECK-NEXT:    ret
   %ab = shufflevector <2 x i32> %a, <2 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %cd = shufflevector <2 x i32> %c, <2 x i32> %d, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -187,27 +152,29 @@ define <32 x i32> @concat_2xv16i32(<16 x i32> %a, <16 x i32> %b) {
 define <32 x i32> @concat_4xv8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 x i32> %d) {
 ; VLA-LABEL: concat_4xv8i32:
 ; VLA:       # %bb.0:
-; VLA-NEXT:    vmv2r.v v20, v14
-; VLA-NEXT:    vmv2r.v v16, v12
-; VLA-NEXT:    vmv2r.v v12, v10
-; VLA-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; VLA-NEXT:    vslideup.vi v16, v20, 8
-; VLA-NEXT:    vslideup.vi v8, v12, 8
+; VLA-NEXT:    vmv2r.v v16, v14
+; VLA-NEXT:    vmv2r.v v24, v12
+; VLA-NEXT:    vmv2r.v v0, v10
+; VLA-NEXT:    vsetivli zero, 16, e32, m8, tu, ma
+; VLA-NEXT:    vslideup.vi v8, v0, 8
+; VLA-NEXT:    vsetivli zero, 24, e32, m8, tu, ma
+; VLA-NEXT:    vslideup.vi v8, v24, 16
 ; VLA-NEXT:    li a0, 32
 ; VLA-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; VLA-NEXT:    vslideup.vi v8, v16, 16
+; VLA-NEXT:    vslideup.vi v8, v16, 24
 ; VLA-NEXT:    ret
 ;
 ; VLS-LABEL: concat_4xv8i32:
 ; VLS:       # %bb.0:
-; VLS-NEXT:    vmv2r.v v20, v14
-; VLS-NEXT:    vmv2r.v v16, v12
-; VLS-NEXT:    vmv2r.v v12, v10
-; VLS-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; VLS-NEXT:    vslideup.vi v16, v20, 8
-; VLS-NEXT:    vslideup.vi v8, v12, 8
+; VLS-NEXT:    vmv2r.v v16, v14
+; VLS-NEXT:    vmv2r.v v24, v12
+; VLS-NEXT:    vmv2r.v v0, v10
+; VLS-NEXT:    vsetivli zero, 16, e32, m8, tu, ma
+; VLS-NEXT:    vslideup.vi v8, v0, 8
+; VLS-NEXT:    vsetivli zero, 24, e32, m8, tu, ma
+; VLS-NEXT:    vslideup.vi v8, v24, 16
 ; VLS-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
-; VLS-NEXT:    vslideup.vi v8, v16, 16
+; VLS-NEXT:    vslideup.vi v8, v16, 24
 ; VLS-NEXT:    ret
   %ab = shufflevector <8 x i32> %a, <8 x i32> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   %cd = shufflevector <8 x i32> %c, <8 x i32> %d, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -218,49 +185,123 @@ define <32 x i32> @concat_4xv8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 x
 define <32 x i32> @concat_8xv4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d, <4 x i32> %e, <4 x i32> %f, <4 x i32> %g, <4 x i32> %h) {
 ; VLA-LABEL: concat_8xv4i32:
 ; VLA:       # %bb.0:
-; VLA-NEXT:    vmv1r.v v18, v15
-; VLA-NEXT:    vmv1r.v v20, v14
-; VLA-NEXT:    vmv1r.v v22, v13
+; VLA-NEXT:    addi sp, sp, -16
+; VLA-NEXT:    .cfi_def_cfa_offset 16
+; VLA-NEXT:    csrr a0, vlenb
+; VLA-NEXT:    slli a0, a0, 5
+; VLA-NEXT:    sub sp, sp, a0
+; VLA-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
+; VLA-NEXT:    vmv1r.v v16, v15
+; VLA-NEXT:    csrr a0, vlenb
+; VLA-NEXT:    slli a0, a0, 3
+; VLA-NEXT:    mv a1, a0
+; VLA-NEXT:    slli a0, a0, 1
+; VLA-NEXT:    add a0, a0, a1
+; VLA-NEXT:    add a0, sp, a0
+; VLA-NEXT:    addi a0, a0, 16
+; VLA-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; VLA-NEXT:    vmv1r.v v16, v14
+; VLA-NEXT:    csrr a0, vlenb
+; VLA-NEXT:    slli a0, a0, 4
+; VLA-NEXT:    add a0, sp, a0
+; VLA-NEXT:    addi a0, a0, 16
+; VLA-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; VLA-NEXT:    vmv1r.v v16, v13
+; VLA-NEXT:    csrr a0, vlenb
+; VLA-NEXT:    slli a0, a0, 3
+; VLA-NEXT:    add a0, sp, a0
+; VLA-NEXT:    addi a0, a0, 16
+; VLA-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; VLA-NEXT:    vmv1r.v v16, v12
-; VLA-NEXT:    vmv1r.v v14, v11
-; VLA-NEXT:    vmv1r.v v12, v10
-; VLA-NEXT:    vmv1r.v v10, v9
-; VLA-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; VLA-NEXT:    vslideup.vi v20, v18, 4
-; VLA-NEXT:    vslideup.vi v16, v22, 4
-; VLA-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; VLA-NEXT:    vslideup.vi v16, v20, 8
-; VLA-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; VLA-NEXT:    vslideup.vi v12, v14, 4
-; VLA-NEXT:    vslideup.vi v8, v10, 4
-; VLA-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; VLA-NEXT:    vslideup.vi v8, v12, 8
+; VLA-NEXT:    addi a0, sp, 16
+; VLA-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; VLA-NEXT:    vmv1r.v v0, v11
+; VLA-NEXT:    vmv1r.v v24, v10
+; VLA-NEXT:    vmv1r.v v16, v9
+; VLA-NEXT:    vsetivli zero, 8, e32, m8, tu, ma
+; VLA-NEXT:    vslideup.vi v8, v16, 4
+; VLA-NEXT:    vsetivli zero, 12, e32, m8, tu, ma
+; VLA-NEXT:    vslideup.vi v8, v24, 8
+; VLA-NEXT:    vsetivli zero, 16, e32, m8, tu, ma
+; VLA-NEXT:    vslideup.vi v8, v0, 12
+; VLA-NEXT:    vsetivli zero, 20, e32, m8, tu, ma
+; VLA-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; VLA-NEXT:    vslideup.vi v8, v16, 16
+; VLA-NEXT:    vsetivli zero, 24, e32, m8, tu, ma
+; VLA-NEXT:    csrr a0, vlenb
+; VLA-NEXT:    slli a0, a0, 3
+; VLA-NEXT:    add a0, sp, a0
+; VLA-NEXT:    addi a0, a0, 16
+; VLA-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; VLA-NEXT:    vslideup.vi v8, v16, 20
+; VLA-NEXT:    vsetivli zero, 28, e32, m8, tu, ma
+; VLA-NEXT:    csrr a0, vlenb
+; VLA-NEXT:    slli a0, a0, 4
+; VLA-NEXT:    add a0, sp, a0
+; VLA-NEXT:    addi a0, a0, 16
+; VLA-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; VLA-NEXT:    vslideup.vi v8, v16, 24
 ; VLA-NEXT:    li a0, 32
 ; VLA-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; VLA-NEXT:    vslideup.vi v8, v16, 16
+; VLA-NEXT:    csrr a0, vlenb
+; VLA-NEXT:    slli a0, a0, 3
+; VLA-NEXT:    mv a1, a0
+; VLA-NEXT:    slli a0, a0, 1
+; VLA-NEXT:    add a0, a0, a1
+; VLA-NEXT:    add a0, sp, a0
+; VLA-NEXT:    addi a0, a0, 16
+; VLA-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; VLA-NEXT:    vslideup.vi v8, v16, 28
+; VLA-NEXT:    csrr a0, vlenb
+; VLA-NEXT:    slli a0, a0, 5
+; VLA-NEXT:    add sp, sp, a0
+; VLA-NEXT:    addi sp, sp, 16
 ; VLA-NEXT:    ret
 ;
 ; VLS-LABEL: concat_8xv4i32:
 ; VLS:       # %bb.0:
-; VLS-NEXT:    vmv1r.v v18, v15
-; VLS-NEXT:    vmv1r.v v20, v14
-; VLS-NEXT:    vmv1r.v v22, v13
+; VLS-NEXT:    addi sp, sp, -16
+; VLS-NEXT:    .cfi_def_cfa_offset 16
+; VLS-NEXT:    addi sp, sp, -512
+; VLS-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
+; VLS-NEXT:    vmv1r.v v16, v15
+; VLS-NEXT:    addi a0, sp, 400
+; VLS-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; VLS-NEXT:    vmv1r.v v16, v14
+; VLS-NEXT:    addi a0, sp, 272
+; VLS-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; VLS-NEXT:    vmv1r.v v16, v13
+; VLS-NEXT:    addi a0, sp, 144
+; VLS-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; VLS-NEXT:    vmv1r.v v16, v12
-; VLS-NEXT:    vmv1r.v v14, v11
-; VLS-NEXT:    vmv1r.v v12, v10
-; VLS-NEXT:    vmv1r.v v10, v9
-; VLS-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; VLS-NEXT:    vslideup.vi v20, v18, 4
-; VLS-NEXT:    vslideup.vi v16, v22, 4
-; VLS-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; VLS-NEXT:    vslideup.vi v16, v20, 8
-; VLS-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; VLS-NEXT:    vslideup.vi v12, v14, 4
-; VLS-NEXT:    vslideup.vi v8, v10, 4
-; VLS-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; VLS-NEXT:    vslideup.vi v8, v12, 8
-; VLS-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
+; VLS-NEXT:    addi a0, sp, 16
+; VLS-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; VLS-NEXT:    vmv1r.v v0, v11
+; VLS-NEXT:    vmv1r.v v24, v10
+; VLS-NEXT:    vmv1r.v v16, v9
+; VLS-NEXT:    vsetivli zero, 8, e32, m8, tu, ma
+; VLS-NEXT:    vslideup.vi v8, v16, 4
+; VLS-NEXT:    vsetivli zero, 12, e32, m8, tu, ma
+; VLS-NEXT:    vslideup.vi v8, v24, 8
+; VLS-NEXT:    vsetivli zero, 16, e32, m8, tu, ma
+; VLS-NEXT:    vslideup.vi v8, v0, 12
+; VLS-NEXT:    vsetivli zero, 20, e32, m8, tu, ma
+; VLS-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; VLS-NEXT:    vslideup.vi v8, v16, 16
+; VLS-NEXT:    vsetivli zero, 24, e32, m8, tu, ma
+; VLS-NEXT:    addi a0, sp, 144
+; VLS-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; VLS-NEXT:    vslideup.vi v8, v16, 20
+; VLS-NEXT:    vsetivli zero, 28, e32, m8, tu, ma
+; VLS-NEXT:    addi a0, sp, 272
+; VLS-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; VLS-NEXT:    vslideup.vi v8, v16, 24
+; VLS-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
+; VLS-NEXT:    addi a0, sp, 400
+; VLS-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; VLS-NEXT:    vslideup.vi v8, v16, 28
+; VLS-NEXT:    addi sp, sp, 512
+; VLS-NEXT:    addi sp, sp, 16
 ; VLS-NEXT:    ret
   %ab = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %cd = shufflevector <4 x i32> %c, <4 x i32> %d, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-combine.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-combine.ll
index 37902aa1873215..4ec2e59672ad6d 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-combine.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-combine.ll
@@ -24,15 +24,17 @@ define void @widen_2xv4i16(ptr %x, ptr %z) {
 define void @widen_3xv4i16(ptr %x, ptr %z) {
 ; CHECK-LABEL: widen_3xv4i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi a2, a0, 16
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
-; CHECK-NEXT:    vle16.v v8, (a2)
-; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT:    vle16.v v10, (a0)
-; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; CHECK-NEXT:    vslideup.vi v10, v8, 8
-; CHECK-NEXT:    vsetivli zero, 12, e16, m2, ta, ma
-; CHECK-NEXT:    vse16.v v10, (a1)
+; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    addi a2, a0, 8
+; CHECK-NEXT:    vle16.v v10, (a2)
+; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    vle16.v v12, (a0)
+; CHECK-NEXT:    vsetivli zero, 8, e16, m2, tu, ma
+; CHECK-NEXT:    vslideup.vi v8, v10, 4
+; CHECK-NEXT:    vsetivli zero, 12, e16, m2, tu, ma
+; CHECK-NEXT:    vslideup.vi v8, v12, 8
+; CHECK-NEXT:    vse16.v v8, (a1)
 ; CHECK-NEXT:    ret
   %a = load <4 x i16>, ptr %x
   %b.gep = getelementptr i8, ptr %x, i64 8
@@ -70,18 +72,20 @@ define void @widen_4xv4i16(ptr %x, ptr %z) {
 define void @widen_4xv4i16_unaligned(ptr %x, ptr %z) {
 ; CHECK-NO-MISALIGN-LABEL: widen_4xv4i16_unaligned:
 ; CHECK-NO-MISALIGN:       # %bb.0:
-; CHECK-NO-MISALIGN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-NO-MISALIGN-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; CHECK-NO-MISALIGN-NEXT:    vle8.v v8, (a0)
-; CHECK-NO-MISALIGN-NEXT:    addi a2, a0, 16
-; CHECK-NO-MISALIGN-NEXT:    vle8.v v10, (a2)
 ; CHECK-NO-MISALIGN-NEXT:    addi a2, a0, 8
+; CHECK-NO-MISALIGN-NEXT:    vle8.v v10, (a2)
+; CHECK-NO-MISALIGN-NEXT:    addi a2, a0, 16
+; CHECK-NO-MISALIGN-NEXT:    vle8.v v12, (a2)
 ; CHECK-NO-MISALIGN-NEXT:    addi a0, a0, 24
-; CHECK-NO-MISALIGN-NEXT:    vle8.v v9, (a0)
-; CHECK-NO-MISALIGN-NEXT:    vle8.v v11, (a2)
-; CHECK-NO-MISALIGN-NEXT:    vslideup.vi v10, v9, 4
-; CHECK-NO-MISALIGN-NEXT:    vslideup.vi v8, v11, 4
+; CHECK-NO-MISALIGN-NEXT:    vle8.v v14, (a0)
+; CHECK-NO-MISALIGN-NEXT:    vsetivli zero, 8, e16, m2, tu, ma
+; CHECK-NO-MISALIGN-NEXT:    vslideup.vi v8, v10, 4
+; CHECK-NO-MISALIGN-NEXT:    vsetivli zero, 12, e16, m2, tu, ma
+; CHECK-NO-MISALIGN-NEXT:    vslideup.vi v8, v12, 8
 ; CHECK-NO-MISALIGN-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; CHECK-NO-MISALIGN-NEXT:    vslideup.vi v8, v10, 8
+; CHECK-NO-MISALIGN-NEXT:    vslideup.vi v8, v14, 12
 ; CHECK-NO-MISALIGN-NEXT:    vse16.v v8, (a1)
 ; CHECK-NO-MISALIGN-NEXT:    ret
 ;
@@ -181,14 +185,21 @@ define void @strided_constant_0(ptr %x, ptr %z) {
 define void @strided_constant_mismatch_4xv4i16(ptr %x, ptr %z) {
 ; CHECK-LABEL: strided_constant_mismatch_4xv4i16:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    addi a2, a0, 2
+; CHECK-NEXT:    vle16.v v10, (a2)
 ; CHECK-NEXT:    addi a2, a0, 6
-; CHECK-NEXT:    li a3, 2
-; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-NEXT:    vlse64.v v8, (a0), a3
-; CHECK-NEXT:    vlse64.v v10, (a2), a3
-; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; CHECK-NEXT:    vslideup.vi v8, v10, 2
-; CHECK-NEXT:    vse64.v v8, (a1)
+; CHECK-NEXT:    vle16.v v12, (a2)
+; CHECK-NEXT:    addi a0, a0, 8
+; CHECK-NEXT:    vle16.v v14, (a0)
+; CHECK-NEXT:    vsetivli zero, 8, e16, m2, tu, ma
+; CHECK-NEXT:    vslideup.vi v8, v10, 4
+; CHECK-NEXT:    vsetivli zero, 12, e16, m2, tu, ma
+; CHECK-NEXT:    vslideup.vi v8, v12, 8
+; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
+; CHECK-NEXT:    vslideup.vi v8, v14, 12
+; CHECK-NEXT:    vse16.v v8, (a1)
 ; CHECK-NEXT:    ret
   %a = load <4 x i16>, ptr %x
   %b.gep = getelementptr i8, ptr %x, i64 2
@@ -244,38 +255,59 @@ define void @strided_runtime_4xv4i16(ptr %x, ptr %z, i64 %s) {
 define void @strided_runtime_mismatch_4xv4i16(ptr %x, ptr %z, i64 %s, i64 %t) {
 ; RV32-LABEL: strided_runtime_mismatch_4xv4i16:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    add a3, a0, a2
-; RV32-NEXT:    add a3, a3, a4
-; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v8, (a0), a2
-; RV32-NEXT:    vlse64.v v10, (a3), a2
-; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV32-NEXT:    vslideup.vi v8, v10, 2
-; RV32-NEXT:    vse64.v v8, (a1)
+; RV32-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; RV32-NEXT:    vle16.v v8, (a0)
+; RV32-NEXT:    add a0, a0, a2
+; RV32-NEXT:    vle16.v v10, (a0)
+; RV32-NEXT:    add a0, a0, a4
+; RV32-NEXT:    vle16.v v12, (a0)
+; RV32-NEXT:    add a0, a0, a2
+; RV32-NEXT:    vle16.v v14, (a0)
+; RV32-NEXT:    vsetivli zero, 8, e16, m2, tu, ma
+; RV32-NEXT:    vslideup.vi v8, v10, 4
+; RV32-NEXT:    vsetivli zero, 12, e16, m2, tu, ma
+; RV32-NEXT:    vslideup.vi v8, v12, 8
+; RV32-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
+; RV32-NEXT:    vslideup.vi v8, v14, 12
+; RV32-NEXT:    vse16.v v8, (a1)
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: strided_runtime_mismatch_4xv4i16:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    add a4, a0, a2
-; RV64-NEXT:    add a3, a4, a3
-; RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV64-NEXT:    vlse64.v v8, (a0), a2
-; RV64-NEXT:    vlse64.v v10, (a3), a2
-; RV64-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV64-NEXT:    vslideup.vi v8, v10, 2
-; RV64-NEXT:    vse64.v v8, (a1)
+; RV64-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; RV64-NEXT:    vle16.v v8, (a0)
+; RV64-NEXT:    add a0, a0, a2
+; RV64-NEXT:    vle16.v v10, (a0)
+; RV64-NEXT:    add a0, a0, a3
+; RV64-NEXT:    vle16.v v12, (a0)
+; RV64-NEXT:    add a0, a0, a2
+; RV64-NEXT:    vle16.v v14, (a0)
+; RV64-NEXT:    vsetivli zero, 8, e16, m2, tu, ma
+; RV64-NEXT:    vslideup.vi v8, v10, 4
+; RV64-NEXT:    vsetivli zero, 12, e16, m2, tu, ma
+; RV64-NEXT:    vslideup.vi v8, v12, 8
+; RV64-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
+; RV64-NEXT:    vslideup.vi v8, v14, 12
+; RV64-NEXT:    vse16.v v8, (a1)
 ; RV64-NEXT:    ret
 ;
 ; ZVE64F-LABEL: strided_runtime_mismatch_4xv4i16:
 ; ZVE64F:       # %bb.0:
-; ZVE64F-NEXT:    add a4, a0, a2
-; ZVE64F-NEXT:    add a3, a4, a3
-; ZVE64F-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; ZVE64F-NEXT:    vlse64.v v8, (a0), a2
-; ZVE64F-NEXT:    vlse64.v v10, (a3), a2
-; ZVE64F-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; ZVE64F-NEXT:    vslideup.vi v8, v10, 2
-; ZVE64F-NEXT:    vse64.v v8, (a1)
+; ZVE64F-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; ZVE64F-NEXT:    vle16.v v8, (a0)
+; ZVE64F-NEXT:    add a0, a0, a2
+; ZVE64F-NEXT:    vle16.v v10, (a0)
+; ZVE64F-NEXT:    add a0, a0, a3
+; ZVE64F-NEXT:    vle16.v v12, (a0)
+; ZVE64F-NEXT:    add a0, a0, a2
+; ZVE64F-NEXT:    vle16.v v14, (a0)
+; ZVE64F-NEXT:    vsetivli zero, 8, e16, m2, tu, ma
+; ZVE64F-NEXT:    vslideup.vi v8, v10, 4
+; ZVE64F-NEXT:    vsetivli zero, 12, e16, m2, tu, ma
+; ZVE64F-NEXT:    vslideup.vi v8, v12, 8
+; ZVE64F-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
+; ZVE64F-NEXT:    vslideup.vi v8, v14, 12
+; ZVE64F-NEXT:    vse16.v v8, (a1)
 ; ZVE64F-NEXT:    ret
   %a = load <4 x i16>, ptr %x
   %b.gep = getelementptr i8, ptr %x, i64 %s
@@ -534,3 +566,28 @@ define void @reverse_strided_runtime_4xv2f32(ptr %x, ptr %z, i64 %s) {
   store <8 x float> %e.2, ptr %z
   ret void
 }
+
+; The middle end sometimes produces this pattern of shuffles, where the
+; intermediate shuffles are the full result vector size padded with poison
+; elements.
+define <16 x i8> @widen_4xv4i8_immediate_expand(ptr %p, i64 %s) {
+; CHECK-LABEL: widen_4xv4i8_immediate_expand:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vlse32.v v8, (a0), a1
+; CHECK-NEXT:    ret
+  %a = load <4 x i8>, ptr %p
+  %b.ptr = getelementptr i8, ptr %p, i64 %s
+  %b = load <4 x i8>, ptr %b.ptr
+  %c.ptr = getelementptr i8, ptr %b.ptr, i64 %s
+  %c = load <4 x i8>, ptr %c.ptr
+  %d.ptr = getelementptr i8, ptr %c.ptr, i64 %s
+  %d = load <4 x i8>, ptr %d.ptr
+
+  %ab = shufflevector <4 x i8> %a, <4 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %cx = shufflevector <4 x i8> %c, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %dx = shufflevector <4 x i8> %d, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %abcx = shufflevector <16 x i8> %ab, <16 x i8> %cx, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
+  %abcd = shufflevector <16 x i8> %abcx, <16 x i8> %dx, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
+  ret <16 x i8> %abcd
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll b/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll
index b3bda5973eb8c4..eb7894ede04647 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll
@@ -441,50 +441,57 @@ define <4 x i32> @stest_f16i32(<4 x half> %x) {
 ; CHECK-V-NEXT:    slli a1, a1, 2
 ; CHECK-V-NEXT:    sub sp, sp, a1
 ; CHECK-V-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 4 * vlenb
-; CHECK-V-NEXT:    lhu s0, 0(a0)
-; CHECK-V-NEXT:    lhu s1, 8(a0)
-; CHECK-V-NEXT:    lhu s2, 16(a0)
-; CHECK-V-NEXT:    lhu a0, 24(a0)
+; CHECK-V-NEXT:    lhu s0, 24(a0)
+; CHECK-V-NEXT:    lhu s1, 16(a0)
+; CHECK-V-NEXT:    lhu s2, 0(a0)
+; CHECK-V-NEXT:    lhu a0, 8(a0)
 ; CHECK-V-NEXT:    fmv.w.x fa0, a0
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-V-NEXT:    fmv.w.x fa0, s2
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    fmv.w.x fa0, s2
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; CHECK-V-NEXT:    vsetivli zero, 2, e64, m2, tu, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vl1r.v v9, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v8, v9, 1
+; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v8, v10, 1
 ; CHECK-V-NEXT:    csrr a0, vlenb
+; CHECK-V-NEXT:    slli a0, a0, 1
 ; CHECK-V-NEXT:    add a0, sp, a0
 ; CHECK-V-NEXT:    addi a0, a0, 16
 ; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-V-NEXT:    fmv.w.x fa0, s1
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-V-NEXT:    fmv.w.x fa0, s0
+; CHECK-V-NEXT:    vsetivli zero, 3, e64, m2, tu, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
-; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    csrr a0, vlenb
+; CHECK-V-NEXT:    slli a0, a0, 1
+; CHECK-V-NEXT:    add a0, sp, a0
+; CHECK-V-NEXT:    addi a0, a0, 16
+; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v10, v8, 2
+; CHECK-V-NEXT:    csrr a0, vlenb
+; CHECK-V-NEXT:    slli a0, a0, 1
+; CHECK-V-NEXT:    add a0, sp, a0
+; CHECK-V-NEXT:    addi a0, a0, 16
+; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    fmv.w.x fa0, s0
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-V-NEXT:    vmv.s.x v10, a0
-; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vl1r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v10, v8, 1
 ; CHECK-V-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    csrr a0, vlenb
+; CHECK-V-NEXT:    slli a0, a0, 1
 ; CHECK-V-NEXT:    add a0, sp, a0
 ; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vl2r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v10, v8, 2
+; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v10, v8, 3
 ; CHECK-V-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-V-NEXT:    vnclip.wi v8, v10, 0
 ; CHECK-V-NEXT:    csrr a0, vlenb
@@ -602,50 +609,57 @@ define <4 x i32> @utesth_f16i32(<4 x half> %x) {
 ; CHECK-V-NEXT:    slli a1, a1, 2
 ; CHECK-V-NEXT:    sub sp, sp, a1
 ; CHECK-V-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 4 * vlenb
-; CHECK-V-NEXT:    lhu s0, 0(a0)
-; CHECK-V-NEXT:    lhu s1, 8(a0)
-; CHECK-V-NEXT:    lhu s2, 16(a0)
-; CHECK-V-NEXT:    lhu a0, 24(a0)
+; CHECK-V-NEXT:    lhu s0, 24(a0)
+; CHECK-V-NEXT:    lhu s1, 16(a0)
+; CHECK-V-NEXT:    lhu s2, 0(a0)
+; CHECK-V-NEXT:    lhu a0, 8(a0)
 ; CHECK-V-NEXT:    fmv.w.x fa0, a0
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-V-NEXT:    fmv.w.x fa0, s2
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    fmv.w.x fa0, s2
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; CHECK-V-NEXT:    vsetivli zero, 2, e64, m2, tu, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vl1r.v v9, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v8, v9, 1
+; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v8, v10, 1
 ; CHECK-V-NEXT:    csrr a0, vlenb
+; CHECK-V-NEXT:    slli a0, a0, 1
 ; CHECK-V-NEXT:    add a0, sp, a0
 ; CHECK-V-NEXT:    addi a0, a0, 16
 ; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-V-NEXT:    fmv.w.x fa0, s1
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-V-NEXT:    fmv.w.x fa0, s0
+; CHECK-V-NEXT:    vsetivli zero, 3, e64, m2, tu, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
-; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    csrr a0, vlenb
+; CHECK-V-NEXT:    slli a0, a0, 1
+; CHECK-V-NEXT:    add a0, sp, a0
+; CHECK-V-NEXT:    addi a0, a0, 16
+; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v10, v8, 2
+; CHECK-V-NEXT:    csrr a0, vlenb
+; CHECK-V-NEXT:    slli a0, a0, 1
+; CHECK-V-NEXT:    add a0, sp, a0
+; CHECK-V-NEXT:    addi a0, a0, 16
+; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    fmv.w.x fa0, s0
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-V-NEXT:    vmv.s.x v10, a0
-; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vl1r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v10, v8, 1
 ; CHECK-V-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    csrr a0, vlenb
+; CHECK-V-NEXT:    slli a0, a0, 1
 ; CHECK-V-NEXT:    add a0, sp, a0
 ; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vl2r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v10, v8, 2
+; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v10, v8, 3
 ; CHECK-V-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-V-NEXT:    vnclipu.wi v8, v10, 0
 ; CHECK-V-NEXT:    csrr a0, vlenb
@@ -773,53 +787,60 @@ define <4 x i32> @ustest_f16i32(<4 x half> %x) {
 ; CHECK-V-NEXT:    slli a1, a1, 2
 ; CHECK-V-NEXT:    sub sp, sp, a1
 ; CHECK-V-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 4 * vlenb
-; CHECK-V-NEXT:    lhu s0, 0(a0)
-; CHECK-V-NEXT:    lhu s1, 8(a0)
-; CHECK-V-NEXT:    lhu s2, 16(a0)
-; CHECK-V-NEXT:    lhu a0, 24(a0)
+; CHECK-V-NEXT:    lhu s0, 24(a0)
+; CHECK-V-NEXT:    lhu s1, 16(a0)
+; CHECK-V-NEXT:    lhu s2, 0(a0)
+; CHECK-V-NEXT:    lhu a0, 8(a0)
 ; CHECK-V-NEXT:    fmv.w.x fa0, a0
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-V-NEXT:    fmv.w.x fa0, s2
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    fmv.w.x fa0, s2
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; CHECK-V-NEXT:    vsetivli zero, 2, e64, m2, tu, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vl1r.v v9, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v8, v9, 1
+; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v8, v10, 1
 ; CHECK-V-NEXT:    csrr a0, vlenb
+; CHECK-V-NEXT:    slli a0, a0, 1
 ; CHECK-V-NEXT:    add a0, sp, a0
 ; CHECK-V-NEXT:    addi a0, a0, 16
 ; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-V-NEXT:    fmv.w.x fa0, s1
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-V-NEXT:    fmv.w.x fa0, s0
+; CHECK-V-NEXT:    vsetivli zero, 3, e64, m2, tu, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
-; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    csrr a0, vlenb
+; CHECK-V-NEXT:    slli a0, a0, 1
+; CHECK-V-NEXT:    add a0, sp, a0
+; CHECK-V-NEXT:    addi a0, a0, 16
+; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v10, v8, 2
+; CHECK-V-NEXT:    csrr a0, vlenb
+; CHECK-V-NEXT:    slli a0, a0, 1
+; CHECK-V-NEXT:    add a0, sp, a0
+; CHECK-V-NEXT:    addi a0, a0, 16
+; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    fmv.w.x fa0, s0
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-V-NEXT:    vmv.s.x v8, a0
-; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vl1r.v v9, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v8, v9, 1
 ; CHECK-V-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    csrr a0, vlenb
+; CHECK-V-NEXT:    slli a0, a0, 1
 ; CHECK-V-NEXT:    add a0, sp, a0
 ; CHECK-V-NEXT:    addi a0, a0, 16
 ; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v8, v10, 2
+; CHECK-V-NEXT:    vslideup.vi v10, v8, 3
 ; CHECK-V-NEXT:    li a0, -1
 ; CHECK-V-NEXT:    srli a0, a0, 32
-; CHECK-V-NEXT:    vmin.vx v8, v8, a0
+; CHECK-V-NEXT:    vmin.vx v8, v10, a0
 ; CHECK-V-NEXT:    vmax.vx v10, v8, zero
 ; CHECK-V-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-V-NEXT:    vnsrl.wi v8, v10, 0
@@ -1383,125 +1404,90 @@ define <8 x i16> @stest_f16i16(<8 x half> %x) {
 ; CHECK-V-NEXT:    .cfi_offset s5, -56
 ; CHECK-V-NEXT:    .cfi_offset s6, -64
 ; CHECK-V-NEXT:    csrr a1, vlenb
-; CHECK-V-NEXT:    slli a1, a1, 2
+; CHECK-V-NEXT:    slli a1, a1, 1
 ; CHECK-V-NEXT:    sub sp, sp, a1
-; CHECK-V-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xd0, 0x00, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 80 + 4 * vlenb
-; CHECK-V-NEXT:    lhu s0, 0(a0)
-; CHECK-V-NEXT:    lhu s1, 8(a0)
-; CHECK-V-NEXT:    lhu s2, 16(a0)
-; CHECK-V-NEXT:    lhu s3, 24(a0)
-; CHECK-V-NEXT:    lhu s4, 32(a0)
-; CHECK-V-NEXT:    lhu s5, 40(a0)
-; CHECK-V-NEXT:    lhu s6, 48(a0)
-; CHECK-V-NEXT:    lhu a0, 56(a0)
+; CHECK-V-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xd0, 0x00, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 80 + 2 * vlenb
+; CHECK-V-NEXT:    lhu s0, 56(a0)
+; CHECK-V-NEXT:    lhu s1, 48(a0)
+; CHECK-V-NEXT:    lhu s2, 40(a0)
+; CHECK-V-NEXT:    lhu s3, 32(a0)
+; CHECK-V-NEXT:    lhu s4, 24(a0)
+; CHECK-V-NEXT:    lhu s5, 16(a0)
+; CHECK-V-NEXT:    lhu s6, 0(a0)
+; CHECK-V-NEXT:    lhu a0, 8(a0)
 ; CHECK-V-NEXT:    fmv.w.x fa0, a0
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; CHECK-V-NEXT:    fmv.w.x fa0, s6
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    slli a0, a0, 1
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    addi a0, sp, 16
+; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    fmv.w.x fa0, s6
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; CHECK-V-NEXT:    vsetivli zero, 2, e32, m2, tu, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    slli a0, a0, 1
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vl1r.v v9, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v8, v9, 1
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    addi a0, sp, 16
+; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v8, v10, 1
+; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-V-NEXT:    fmv.w.x fa0, s5
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; CHECK-V-NEXT:    fmv.w.x fa0, s4
+; CHECK-V-NEXT:    vsetivli zero, 3, e32, m2, tu, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    slli a0, a0, 1
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    addi a0, sp, 16
+; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v10, v8, 2
+; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    fmv.w.x fa0, s4
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; CHECK-V-NEXT:    vsetivli zero, 4, e32, m2, tu, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    slli a0, a0, 1
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vl1r.v v9, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v8, v9, 1
-; CHECK-V-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vl1r.v v9, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v8, v9, 2
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    slli a0, a0, 1
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    addi a0, sp, 16
+; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v10, v8, 3
+; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
 ; CHECK-V-NEXT:    fmv.w.x fa0, s3
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; CHECK-V-NEXT:    fmv.w.x fa0, s2
+; CHECK-V-NEXT:    vsetivli zero, 5, e32, m2, tu, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v10, v8, 4
+; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    fmv.w.x fa0, s2
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; CHECK-V-NEXT:    vsetivli zero, 6, e32, m2, tu, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vl1r.v v9, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v8, v9, 1
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v10, v8, 5
+; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
 ; CHECK-V-NEXT:    fmv.w.x fa0, s1
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; CHECK-V-NEXT:    fmv.w.x fa0, s0
+; CHECK-V-NEXT:    vsetivli zero, 7, e32, m2, tu, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v10, v8, 6
+; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    fmv.w.x fa0, s0
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-V-NEXT:    vmv.s.x v10, a0
-; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vl1r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v10, v8, 1
-; CHECK-V-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vl1r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v10, v8, 2
 ; CHECK-V-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    slli a0, a0, 1
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vl2r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v10, v8, 4
+; CHECK-V-NEXT:    vmv.s.x v8, a0
+; CHECK-V-NEXT:    addi a0, sp, 16
+; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v10, v8, 7
 ; CHECK-V-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; CHECK-V-NEXT:    vnclip.wi v8, v10, 0
 ; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    slli a0, a0, 2
+; CHECK-V-NEXT:    slli a0, a0, 1
 ; CHECK-V-NEXT:    add sp, sp, a0
 ; CHECK-V-NEXT:    ld ra, 72(sp) # 8-byte Folded Reload
 ; CHECK-V-NEXT:    ld s0, 64(sp) # 8-byte Folded Reload
@@ -1696,125 +1682,90 @@ define <8 x i16> @utesth_f16i16(<8 x half> %x) {
 ; CHECK-V-NEXT:    .cfi_offset s5, -56
 ; CHECK-V-NEXT:    .cfi_offset s6, -64
 ; CHECK-V-NEXT:    csrr a1, vlenb
-; CHECK-V-NEXT:    slli a1, a1, 2
+; CHECK-V-NEXT:    slli a1, a1, 1
 ; CHECK-V-NEXT:    sub sp, sp, a1
-; CHECK-V-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xd0, 0x00, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 80 + 4 * vlenb
-; CHECK-V-NEXT:    lhu s0, 0(a0)
-; CHECK-V-NEXT:    lhu s1, 8(a0)
-; CHECK-V-NEXT:    lhu s2, 16(a0)
-; CHECK-V-NEXT:    lhu s3, 24(a0)
-; CHECK-V-NEXT:    lhu s4, 32(a0)
-; CHECK-V-NEXT:    lhu s5, 40(a0)
-; CHECK-V-NEXT:    lhu s6, 48(a0)
-; CHECK-V-NEXT:    lhu a0, 56(a0)
+; CHECK-V-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xd0, 0x00, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 80 + 2 * vlenb
+; CHECK-V-NEXT:    lhu s0, 56(a0)
+; CHECK-V-NEXT:    lhu s1, 48(a0)
+; CHECK-V-NEXT:    lhu s2, 40(a0)
+; CHECK-V-NEXT:    lhu s3, 32(a0)
+; CHECK-V-NEXT:    lhu s4, 24(a0)
+; CHECK-V-NEXT:    lhu s5, 16(a0)
+; CHECK-V-NEXT:    lhu s6, 0(a0)
+; CHECK-V-NEXT:    lhu a0, 8(a0)
 ; CHECK-V-NEXT:    fmv.w.x fa0, a0
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; CHECK-V-NEXT:    fmv.w.x fa0, s6
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    slli a0, a0, 1
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    addi a0, sp, 16
+; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    fmv.w.x fa0, s6
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; CHECK-V-NEXT:    vsetivli zero, 2, e32, m2, tu, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    slli a0, a0, 1
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vl1r.v v9, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v8, v9, 1
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    addi a0, sp, 16
+; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v8, v10, 1
+; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-V-NEXT:    fmv.w.x fa0, s5
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; CHECK-V-NEXT:    fmv.w.x fa0, s4
+; CHECK-V-NEXT:    vsetivli zero, 3, e32, m2, tu, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    slli a0, a0, 1
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    addi a0, sp, 16
+; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v10, v8, 2
+; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    fmv.w.x fa0, s4
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; CHECK-V-NEXT:    vsetivli zero, 4, e32, m2, tu, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    slli a0, a0, 1
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vl1r.v v9, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v8, v9, 1
-; CHECK-V-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vl1r.v v9, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v8, v9, 2
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    slli a0, a0, 1
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    addi a0, sp, 16
+; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v10, v8, 3
+; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
 ; CHECK-V-NEXT:    fmv.w.x fa0, s3
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; CHECK-V-NEXT:    fmv.w.x fa0, s2
+; CHECK-V-NEXT:    vsetivli zero, 5, e32, m2, tu, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v10, v8, 4
+; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    fmv.w.x fa0, s2
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; CHECK-V-NEXT:    vsetivli zero, 6, e32, m2, tu, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vl1r.v v9, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v8, v9, 1
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v10, v8, 5
+; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
 ; CHECK-V-NEXT:    fmv.w.x fa0, s1
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; CHECK-V-NEXT:    fmv.w.x fa0, s0
+; CHECK-V-NEXT:    vsetivli zero, 7, e32, m2, tu, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v10, v8, 6
+; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    fmv.w.x fa0, s0
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-V-NEXT:    vmv.s.x v10, a0
+; CHECK-V-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vl1r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v10, v8, 1
-; CHECK-V-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vl1r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v10, v8, 2
-; CHECK-V-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v10, v8, 7
+; CHECK-V-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
+; CHECK-V-NEXT:    vnclipu.wi v8, v10, 0
 ; CHECK-V-NEXT:    csrr a0, vlenb
 ; CHECK-V-NEXT:    slli a0, a0, 1
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vl2r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v10, v8, 4
-; CHECK-V-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; CHECK-V-NEXT:    vnclipu.wi v8, v10, 0
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    slli a0, a0, 2
 ; CHECK-V-NEXT:    add sp, sp, a0
 ; CHECK-V-NEXT:    ld ra, 72(sp) # 8-byte Folded Reload
 ; CHECK-V-NEXT:    ld s0, 64(sp) # 8-byte Folded Reload
@@ -2031,129 +1982,94 @@ define <8 x i16> @ustest_f16i16(<8 x half> %x) {
 ; CHECK-V-NEXT:    .cfi_offset s5, -56
 ; CHECK-V-NEXT:    .cfi_offset s6, -64
 ; CHECK-V-NEXT:    csrr a1, vlenb
-; CHECK-V-NEXT:    slli a1, a1, 2
+; CHECK-V-NEXT:    slli a1, a1, 1
 ; CHECK-V-NEXT:    sub sp, sp, a1
-; CHECK-V-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xd0, 0x00, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 80 + 4 * vlenb
-; CHECK-V-NEXT:    lhu s0, 0(a0)
-; CHECK-V-NEXT:    lhu s1, 8(a0)
-; CHECK-V-NEXT:    lhu s2, 16(a0)
-; CHECK-V-NEXT:    lhu s3, 24(a0)
-; CHECK-V-NEXT:    lhu s4, 32(a0)
-; CHECK-V-NEXT:    lhu s5, 40(a0)
-; CHECK-V-NEXT:    lhu s6, 48(a0)
-; CHECK-V-NEXT:    lhu a0, 56(a0)
+; CHECK-V-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xd0, 0x00, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 80 + 2 * vlenb
+; CHECK-V-NEXT:    lhu s0, 56(a0)
+; CHECK-V-NEXT:    lhu s1, 48(a0)
+; CHECK-V-NEXT:    lhu s2, 40(a0)
+; CHECK-V-NEXT:    lhu s3, 32(a0)
+; CHECK-V-NEXT:    lhu s4, 24(a0)
+; CHECK-V-NEXT:    lhu s5, 16(a0)
+; CHECK-V-NEXT:    lhu s6, 0(a0)
+; CHECK-V-NEXT:    lhu a0, 8(a0)
 ; CHECK-V-NEXT:    fmv.w.x fa0, a0
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; CHECK-V-NEXT:    fmv.w.x fa0, s6
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    slli a0, a0, 1
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    addi a0, sp, 16
+; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    fmv.w.x fa0, s6
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; CHECK-V-NEXT:    vsetivli zero, 2, e32, m2, tu, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    slli a0, a0, 1
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vl1r.v v9, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v8, v9, 1
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    addi a0, sp, 16
+; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v8, v10, 1
+; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-V-NEXT:    fmv.w.x fa0, s5
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; CHECK-V-NEXT:    fmv.w.x fa0, s4
+; CHECK-V-NEXT:    vsetivli zero, 3, e32, m2, tu, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    slli a0, a0, 1
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    addi a0, sp, 16
+; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v10, v8, 2
+; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    fmv.w.x fa0, s4
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; CHECK-V-NEXT:    vsetivli zero, 4, e32, m2, tu, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    slli a0, a0, 1
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vl1r.v v9, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v8, v9, 1
-; CHECK-V-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vl1r.v v9, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v8, v9, 2
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    slli a0, a0, 1
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    addi a0, sp, 16
+; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v10, v8, 3
+; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
 ; CHECK-V-NEXT:    fmv.w.x fa0, s3
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; CHECK-V-NEXT:    fmv.w.x fa0, s2
+; CHECK-V-NEXT:    vsetivli zero, 5, e32, m2, tu, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v10, v8, 4
+; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    fmv.w.x fa0, s2
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; CHECK-V-NEXT:    vsetivli zero, 6, e32, m2, tu, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vl1r.v v9, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v8, v9, 1
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v10, v8, 5
+; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
 ; CHECK-V-NEXT:    fmv.w.x fa0, s1
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; CHECK-V-NEXT:    fmv.w.x fa0, s0
+; CHECK-V-NEXT:    vsetivli zero, 7, e32, m2, tu, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v10, v8, 6
+; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    fmv.w.x fa0, s0
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; CHECK-V-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vl1r.v v9, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v8, v9, 1
-; CHECK-V-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vl1r.v v9, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v8, v9, 2
-; CHECK-V-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    slli a0, a0, 1
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
 ; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v8, v10, 4
+; CHECK-V-NEXT:    vslideup.vi v10, v8, 7
 ; CHECK-V-NEXT:    lui a0, 16
 ; CHECK-V-NEXT:    addi a0, a0, -1
-; CHECK-V-NEXT:    vmin.vx v8, v8, a0
+; CHECK-V-NEXT:    vmin.vx v8, v10, a0
 ; CHECK-V-NEXT:    vmax.vx v10, v8, zero
 ; CHECK-V-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; CHECK-V-NEXT:    vnsrl.wi v8, v10, 0
 ; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    slli a0, a0, 2
+; CHECK-V-NEXT:    slli a0, a0, 1
 ; CHECK-V-NEXT:    add sp, sp, a0
 ; CHECK-V-NEXT:    ld ra, 72(sp) # 8-byte Folded Reload
 ; CHECK-V-NEXT:    ld s0, 64(sp) # 8-byte Folded Reload
@@ -3807,50 +3723,57 @@ define <4 x i32> @stest_f16i32_mm(<4 x half> %x) {
 ; CHECK-V-NEXT:    slli a1, a1, 2
 ; CHECK-V-NEXT:    sub sp, sp, a1
 ; CHECK-V-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 4 * vlenb
-; CHECK-V-NEXT:    lhu s0, 0(a0)
-; CHECK-V-NEXT:    lhu s1, 8(a0)
-; CHECK-V-NEXT:    lhu s2, 16(a0)
-; CHECK-V-NEXT:    lhu a0, 24(a0)
+; CHECK-V-NEXT:    lhu s0, 24(a0)
+; CHECK-V-NEXT:    lhu s1, 16(a0)
+; CHECK-V-NEXT:    lhu s2, 0(a0)
+; CHECK-V-NEXT:    lhu a0, 8(a0)
 ; CHECK-V-NEXT:    fmv.w.x fa0, a0
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-V-NEXT:    fmv.w.x fa0, s2
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    fmv.w.x fa0, s2
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; CHECK-V-NEXT:    vsetivli zero, 2, e64, m2, tu, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vl1r.v v9, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v8, v9, 1
+; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v8, v10, 1
 ; CHECK-V-NEXT:    csrr a0, vlenb
+; CHECK-V-NEXT:    slli a0, a0, 1
 ; CHECK-V-NEXT:    add a0, sp, a0
 ; CHECK-V-NEXT:    addi a0, a0, 16
 ; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-V-NEXT:    fmv.w.x fa0, s1
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-V-NEXT:    fmv.w.x fa0, s0
+; CHECK-V-NEXT:    vsetivli zero, 3, e64, m2, tu, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
-; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    csrr a0, vlenb
+; CHECK-V-NEXT:    slli a0, a0, 1
+; CHECK-V-NEXT:    add a0, sp, a0
+; CHECK-V-NEXT:    addi a0, a0, 16
+; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v10, v8, 2
+; CHECK-V-NEXT:    csrr a0, vlenb
+; CHECK-V-NEXT:    slli a0, a0, 1
+; CHECK-V-NEXT:    add a0, sp, a0
+; CHECK-V-NEXT:    addi a0, a0, 16
+; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    fmv.w.x fa0, s0
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-V-NEXT:    vmv.s.x v10, a0
-; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vl1r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v10, v8, 1
 ; CHECK-V-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    csrr a0, vlenb
+; CHECK-V-NEXT:    slli a0, a0, 1
 ; CHECK-V-NEXT:    add a0, sp, a0
 ; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vl2r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v10, v8, 2
+; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v10, v8, 3
 ; CHECK-V-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-V-NEXT:    vnclip.wi v8, v10, 0
 ; CHECK-V-NEXT:    csrr a0, vlenb
@@ -3966,50 +3889,57 @@ define <4 x i32> @utesth_f16i32_mm(<4 x half> %x) {
 ; CHECK-V-NEXT:    slli a1, a1, 2
 ; CHECK-V-NEXT:    sub sp, sp, a1
 ; CHECK-V-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 4 * vlenb
-; CHECK-V-NEXT:    lhu s0, 0(a0)
-; CHECK-V-NEXT:    lhu s1, 8(a0)
-; CHECK-V-NEXT:    lhu s2, 16(a0)
-; CHECK-V-NEXT:    lhu a0, 24(a0)
+; CHECK-V-NEXT:    lhu s0, 24(a0)
+; CHECK-V-NEXT:    lhu s1, 16(a0)
+; CHECK-V-NEXT:    lhu s2, 0(a0)
+; CHECK-V-NEXT:    lhu a0, 8(a0)
 ; CHECK-V-NEXT:    fmv.w.x fa0, a0
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-V-NEXT:    fmv.w.x fa0, s2
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    fmv.w.x fa0, s2
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; CHECK-V-NEXT:    vsetivli zero, 2, e64, m2, tu, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vl1r.v v9, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v8, v9, 1
+; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v8, v10, 1
 ; CHECK-V-NEXT:    csrr a0, vlenb
+; CHECK-V-NEXT:    slli a0, a0, 1
 ; CHECK-V-NEXT:    add a0, sp, a0
 ; CHECK-V-NEXT:    addi a0, a0, 16
 ; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-V-NEXT:    fmv.w.x fa0, s1
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-V-NEXT:    fmv.w.x fa0, s0
+; CHECK-V-NEXT:    vsetivli zero, 3, e64, m2, tu, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
-; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    csrr a0, vlenb
+; CHECK-V-NEXT:    slli a0, a0, 1
+; CHECK-V-NEXT:    add a0, sp, a0
+; CHECK-V-NEXT:    addi a0, a0, 16
+; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v10, v8, 2
+; CHECK-V-NEXT:    csrr a0, vlenb
+; CHECK-V-NEXT:    slli a0, a0, 1
+; CHECK-V-NEXT:    add a0, sp, a0
+; CHECK-V-NEXT:    addi a0, a0, 16
+; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    fmv.w.x fa0, s0
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-V-NEXT:    vmv.s.x v10, a0
-; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vl1r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v10, v8, 1
 ; CHECK-V-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    csrr a0, vlenb
+; CHECK-V-NEXT:    slli a0, a0, 1
 ; CHECK-V-NEXT:    add a0, sp, a0
 ; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vl2r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v10, v8, 2
+; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v10, v8, 3
 ; CHECK-V-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-V-NEXT:    vnclipu.wi v8, v10, 0
 ; CHECK-V-NEXT:    csrr a0, vlenb
@@ -4136,53 +4066,60 @@ define <4 x i32> @ustest_f16i32_mm(<4 x half> %x) {
 ; CHECK-V-NEXT:    slli a1, a1, 2
 ; CHECK-V-NEXT:    sub sp, sp, a1
 ; CHECK-V-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 4 * vlenb
-; CHECK-V-NEXT:    lhu s0, 0(a0)
-; CHECK-V-NEXT:    lhu s1, 8(a0)
-; CHECK-V-NEXT:    lhu s2, 16(a0)
-; CHECK-V-NEXT:    lhu a0, 24(a0)
+; CHECK-V-NEXT:    lhu s0, 24(a0)
+; CHECK-V-NEXT:    lhu s1, 16(a0)
+; CHECK-V-NEXT:    lhu s2, 0(a0)
+; CHECK-V-NEXT:    lhu a0, 8(a0)
 ; CHECK-V-NEXT:    fmv.w.x fa0, a0
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-V-NEXT:    fmv.w.x fa0, s2
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    fmv.w.x fa0, s2
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; CHECK-V-NEXT:    vsetivli zero, 2, e64, m2, tu, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vl1r.v v9, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v8, v9, 1
+; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v8, v10, 1
 ; CHECK-V-NEXT:    csrr a0, vlenb
+; CHECK-V-NEXT:    slli a0, a0, 1
 ; CHECK-V-NEXT:    add a0, sp, a0
 ; CHECK-V-NEXT:    addi a0, a0, 16
 ; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-V-NEXT:    fmv.w.x fa0, s1
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-V-NEXT:    fmv.w.x fa0, s0
+; CHECK-V-NEXT:    vsetivli zero, 3, e64, m2, tu, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
-; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    csrr a0, vlenb
+; CHECK-V-NEXT:    slli a0, a0, 1
+; CHECK-V-NEXT:    add a0, sp, a0
+; CHECK-V-NEXT:    addi a0, a0, 16
+; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v10, v8, 2
+; CHECK-V-NEXT:    csrr a0, vlenb
+; CHECK-V-NEXT:    slli a0, a0, 1
+; CHECK-V-NEXT:    add a0, sp, a0
+; CHECK-V-NEXT:    addi a0, a0, 16
+; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    fmv.w.x fa0, s0
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-V-NEXT:    vmv.s.x v8, a0
-; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vl1r.v v9, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v8, v9, 1
 ; CHECK-V-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    csrr a0, vlenb
+; CHECK-V-NEXT:    slli a0, a0, 1
 ; CHECK-V-NEXT:    add a0, sp, a0
 ; CHECK-V-NEXT:    addi a0, a0, 16
 ; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v8, v10, 2
+; CHECK-V-NEXT:    vslideup.vi v10, v8, 3
 ; CHECK-V-NEXT:    li a0, -1
 ; CHECK-V-NEXT:    srli a0, a0, 32
-; CHECK-V-NEXT:    vmin.vx v8, v8, a0
+; CHECK-V-NEXT:    vmin.vx v8, v10, a0
 ; CHECK-V-NEXT:    vmax.vx v10, v8, zero
 ; CHECK-V-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-V-NEXT:    vnsrl.wi v8, v10, 0
@@ -4734,125 +4671,90 @@ define <8 x i16> @stest_f16i16_mm(<8 x half> %x) {
 ; CHECK-V-NEXT:    .cfi_offset s5, -56
 ; CHECK-V-NEXT:    .cfi_offset s6, -64
 ; CHECK-V-NEXT:    csrr a1, vlenb
-; CHECK-V-NEXT:    slli a1, a1, 2
+; CHECK-V-NEXT:    slli a1, a1, 1
 ; CHECK-V-NEXT:    sub sp, sp, a1
-; CHECK-V-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xd0, 0x00, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 80 + 4 * vlenb
-; CHECK-V-NEXT:    lhu s0, 0(a0)
-; CHECK-V-NEXT:    lhu s1, 8(a0)
-; CHECK-V-NEXT:    lhu s2, 16(a0)
-; CHECK-V-NEXT:    lhu s3, 24(a0)
-; CHECK-V-NEXT:    lhu s4, 32(a0)
-; CHECK-V-NEXT:    lhu s5, 40(a0)
-; CHECK-V-NEXT:    lhu s6, 48(a0)
-; CHECK-V-NEXT:    lhu a0, 56(a0)
+; CHECK-V-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xd0, 0x00, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 80 + 2 * vlenb
+; CHECK-V-NEXT:    lhu s0, 56(a0)
+; CHECK-V-NEXT:    lhu s1, 48(a0)
+; CHECK-V-NEXT:    lhu s2, 40(a0)
+; CHECK-V-NEXT:    lhu s3, 32(a0)
+; CHECK-V-NEXT:    lhu s4, 24(a0)
+; CHECK-V-NEXT:    lhu s5, 16(a0)
+; CHECK-V-NEXT:    lhu s6, 0(a0)
+; CHECK-V-NEXT:    lhu a0, 8(a0)
 ; CHECK-V-NEXT:    fmv.w.x fa0, a0
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; CHECK-V-NEXT:    fmv.w.x fa0, s6
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    slli a0, a0, 1
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    addi a0, sp, 16
+; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    fmv.w.x fa0, s6
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; CHECK-V-NEXT:    vsetivli zero, 2, e32, m2, tu, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    slli a0, a0, 1
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vl1r.v v9, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v8, v9, 1
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    addi a0, sp, 16
+; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v8, v10, 1
+; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-V-NEXT:    fmv.w.x fa0, s5
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; CHECK-V-NEXT:    fmv.w.x fa0, s4
+; CHECK-V-NEXT:    vsetivli zero, 3, e32, m2, tu, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    slli a0, a0, 1
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    addi a0, sp, 16
+; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v10, v8, 2
+; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    fmv.w.x fa0, s4
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; CHECK-V-NEXT:    vsetivli zero, 4, e32, m2, tu, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    slli a0, a0, 1
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vl1r.v v9, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v8, v9, 1
-; CHECK-V-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vl1r.v v9, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v8, v9, 2
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    slli a0, a0, 1
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    addi a0, sp, 16
+; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v10, v8, 3
+; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
 ; CHECK-V-NEXT:    fmv.w.x fa0, s3
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; CHECK-V-NEXT:    fmv.w.x fa0, s2
+; CHECK-V-NEXT:    vsetivli zero, 5, e32, m2, tu, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v10, v8, 4
+; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    fmv.w.x fa0, s2
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; CHECK-V-NEXT:    vsetivli zero, 6, e32, m2, tu, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vl1r.v v9, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v8, v9, 1
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v10, v8, 5
+; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
 ; CHECK-V-NEXT:    fmv.w.x fa0, s1
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; CHECK-V-NEXT:    fmv.w.x fa0, s0
+; CHECK-V-NEXT:    vsetivli zero, 7, e32, m2, tu, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v10, v8, 6
+; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    fmv.w.x fa0, s0
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-V-NEXT:    vmv.s.x v10, a0
-; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vl1r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v10, v8, 1
-; CHECK-V-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vl1r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v10, v8, 2
 ; CHECK-V-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    slli a0, a0, 1
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vl2r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v10, v8, 4
+; CHECK-V-NEXT:    vmv.s.x v8, a0
+; CHECK-V-NEXT:    addi a0, sp, 16
+; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v10, v8, 7
 ; CHECK-V-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; CHECK-V-NEXT:    vnclip.wi v8, v10, 0
 ; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    slli a0, a0, 2
+; CHECK-V-NEXT:    slli a0, a0, 1
 ; CHECK-V-NEXT:    add sp, sp, a0
 ; CHECK-V-NEXT:    ld ra, 72(sp) # 8-byte Folded Reload
 ; CHECK-V-NEXT:    ld s0, 64(sp) # 8-byte Folded Reload
@@ -5045,125 +4947,90 @@ define <8 x i16> @utesth_f16i16_mm(<8 x half> %x) {
 ; CHECK-V-NEXT:    .cfi_offset s5, -56
 ; CHECK-V-NEXT:    .cfi_offset s6, -64
 ; CHECK-V-NEXT:    csrr a1, vlenb
-; CHECK-V-NEXT:    slli a1, a1, 2
+; CHECK-V-NEXT:    slli a1, a1, 1
 ; CHECK-V-NEXT:    sub sp, sp, a1
-; CHECK-V-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xd0, 0x00, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 80 + 4 * vlenb
-; CHECK-V-NEXT:    lhu s0, 0(a0)
-; CHECK-V-NEXT:    lhu s1, 8(a0)
-; CHECK-V-NEXT:    lhu s2, 16(a0)
-; CHECK-V-NEXT:    lhu s3, 24(a0)
-; CHECK-V-NEXT:    lhu s4, 32(a0)
-; CHECK-V-NEXT:    lhu s5, 40(a0)
-; CHECK-V-NEXT:    lhu s6, 48(a0)
-; CHECK-V-NEXT:    lhu a0, 56(a0)
+; CHECK-V-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xd0, 0x00, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 80 + 2 * vlenb
+; CHECK-V-NEXT:    lhu s0, 56(a0)
+; CHECK-V-NEXT:    lhu s1, 48(a0)
+; CHECK-V-NEXT:    lhu s2, 40(a0)
+; CHECK-V-NEXT:    lhu s3, 32(a0)
+; CHECK-V-NEXT:    lhu s4, 24(a0)
+; CHECK-V-NEXT:    lhu s5, 16(a0)
+; CHECK-V-NEXT:    lhu s6, 0(a0)
+; CHECK-V-NEXT:    lhu a0, 8(a0)
 ; CHECK-V-NEXT:    fmv.w.x fa0, a0
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; CHECK-V-NEXT:    fmv.w.x fa0, s6
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    slli a0, a0, 1
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    addi a0, sp, 16
+; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    fmv.w.x fa0, s6
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; CHECK-V-NEXT:    vsetivli zero, 2, e32, m2, tu, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    slli a0, a0, 1
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vl1r.v v9, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v8, v9, 1
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    addi a0, sp, 16
+; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v8, v10, 1
+; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-V-NEXT:    fmv.w.x fa0, s5
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; CHECK-V-NEXT:    fmv.w.x fa0, s4
+; CHECK-V-NEXT:    vsetivli zero, 3, e32, m2, tu, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    slli a0, a0, 1
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    addi a0, sp, 16
+; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v10, v8, 2
+; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    fmv.w.x fa0, s4
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; CHECK-V-NEXT:    vsetivli zero, 4, e32, m2, tu, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    slli a0, a0, 1
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vl1r.v v9, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v8, v9, 1
-; CHECK-V-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vl1r.v v9, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v8, v9, 2
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    slli a0, a0, 1
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    addi a0, sp, 16
+; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v10, v8, 3
+; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
 ; CHECK-V-NEXT:    fmv.w.x fa0, s3
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; CHECK-V-NEXT:    fmv.w.x fa0, s2
+; CHECK-V-NEXT:    vsetivli zero, 5, e32, m2, tu, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v10, v8, 4
+; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    fmv.w.x fa0, s2
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; CHECK-V-NEXT:    vsetivli zero, 6, e32, m2, tu, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vl1r.v v9, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v8, v9, 1
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v10, v8, 5
+; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
 ; CHECK-V-NEXT:    fmv.w.x fa0, s1
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; CHECK-V-NEXT:    fmv.w.x fa0, s0
+; CHECK-V-NEXT:    vsetivli zero, 7, e32, m2, tu, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v10, v8, 6
+; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    fmv.w.x fa0, s0
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.lu.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-V-NEXT:    vmv.s.x v10, a0
-; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vl1r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v10, v8, 1
-; CHECK-V-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vl1r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v10, v8, 2
 ; CHECK-V-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    slli a0, a0, 1
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vl2r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v10, v8, 4
+; CHECK-V-NEXT:    vmv.s.x v8, a0
+; CHECK-V-NEXT:    addi a0, sp, 16
+; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v10, v8, 7
 ; CHECK-V-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; CHECK-V-NEXT:    vnclipu.wi v8, v10, 0
 ; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    slli a0, a0, 2
+; CHECK-V-NEXT:    slli a0, a0, 1
 ; CHECK-V-NEXT:    add sp, sp, a0
 ; CHECK-V-NEXT:    ld ra, 72(sp) # 8-byte Folded Reload
 ; CHECK-V-NEXT:    ld s0, 64(sp) # 8-byte Folded Reload
@@ -5379,129 +5246,94 @@ define <8 x i16> @ustest_f16i16_mm(<8 x half> %x) {
 ; CHECK-V-NEXT:    .cfi_offset s5, -56
 ; CHECK-V-NEXT:    .cfi_offset s6, -64
 ; CHECK-V-NEXT:    csrr a1, vlenb
-; CHECK-V-NEXT:    slli a1, a1, 2
+; CHECK-V-NEXT:    slli a1, a1, 1
 ; CHECK-V-NEXT:    sub sp, sp, a1
-; CHECK-V-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xd0, 0x00, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 80 + 4 * vlenb
-; CHECK-V-NEXT:    lhu s0, 0(a0)
-; CHECK-V-NEXT:    lhu s1, 8(a0)
-; CHECK-V-NEXT:    lhu s2, 16(a0)
-; CHECK-V-NEXT:    lhu s3, 24(a0)
-; CHECK-V-NEXT:    lhu s4, 32(a0)
-; CHECK-V-NEXT:    lhu s5, 40(a0)
-; CHECK-V-NEXT:    lhu s6, 48(a0)
-; CHECK-V-NEXT:    lhu a0, 56(a0)
+; CHECK-V-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xd0, 0x00, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 80 + 2 * vlenb
+; CHECK-V-NEXT:    lhu s0, 56(a0)
+; CHECK-V-NEXT:    lhu s1, 48(a0)
+; CHECK-V-NEXT:    lhu s2, 40(a0)
+; CHECK-V-NEXT:    lhu s3, 32(a0)
+; CHECK-V-NEXT:    lhu s4, 24(a0)
+; CHECK-V-NEXT:    lhu s5, 16(a0)
+; CHECK-V-NEXT:    lhu s6, 0(a0)
+; CHECK-V-NEXT:    lhu a0, 8(a0)
 ; CHECK-V-NEXT:    fmv.w.x fa0, a0
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
 ; CHECK-V-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; CHECK-V-NEXT:    fmv.w.x fa0, s6
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    slli a0, a0, 1
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    addi a0, sp, 16
+; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    fmv.w.x fa0, s6
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; CHECK-V-NEXT:    vsetivli zero, 2, e32, m2, tu, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    slli a0, a0, 1
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vl1r.v v9, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v8, v9, 1
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    addi a0, sp, 16
+; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v8, v10, 1
+; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-V-NEXT:    fmv.w.x fa0, s5
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; CHECK-V-NEXT:    fmv.w.x fa0, s4
+; CHECK-V-NEXT:    vsetivli zero, 3, e32, m2, tu, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    slli a0, a0, 1
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    addi a0, sp, 16
+; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v10, v8, 2
+; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    fmv.w.x fa0, s4
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; CHECK-V-NEXT:    vsetivli zero, 4, e32, m2, tu, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    slli a0, a0, 1
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vl1r.v v9, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v8, v9, 1
-; CHECK-V-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vl1r.v v9, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v8, v9, 2
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    slli a0, a0, 1
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    addi a0, sp, 16
+; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v10, v8, 3
+; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
 ; CHECK-V-NEXT:    fmv.w.x fa0, s3
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; CHECK-V-NEXT:    fmv.w.x fa0, s2
+; CHECK-V-NEXT:    vsetivli zero, 5, e32, m2, tu, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v10, v8, 4
+; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    fmv.w.x fa0, s2
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; CHECK-V-NEXT:    vsetivli zero, 6, e32, m2, tu, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vl1r.v v9, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v8, v9, 1
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v10, v8, 5
+; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
 ; CHECK-V-NEXT:    fmv.w.x fa0, s1
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; CHECK-V-NEXT:    fmv.w.x fa0, s0
+; CHECK-V-NEXT:    vsetivli zero, 7, e32, m2, tu, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vs1r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
+; CHECK-V-NEXT:    vslideup.vi v10, v8, 6
+; CHECK-V-NEXT:    vs2r.v v10, (a0) # Unknown-size Folded Spill
+; CHECK-V-NEXT:    fmv.w.x fa0, s0
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-V-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; CHECK-V-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    addi a0, sp, 16
-; CHECK-V-NEXT:    vl1r.v v9, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v8, v9, 1
-; CHECK-V-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
-; CHECK-V-NEXT:    vl1r.v v9, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v8, v9, 2
-; CHECK-V-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    slli a0, a0, 1
-; CHECK-V-NEXT:    add a0, sp, a0
-; CHECK-V-NEXT:    addi a0, a0, 16
 ; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
-; CHECK-V-NEXT:    vslideup.vi v8, v10, 4
+; CHECK-V-NEXT:    vslideup.vi v10, v8, 7
 ; CHECK-V-NEXT:    lui a0, 16
 ; CHECK-V-NEXT:    addi a0, a0, -1
-; CHECK-V-NEXT:    vmin.vx v8, v8, a0
+; CHECK-V-NEXT:    vmin.vx v8, v10, a0
 ; CHECK-V-NEXT:    vmax.vx v10, v8, zero
 ; CHECK-V-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; CHECK-V-NEXT:    vnsrl.wi v8, v10, 0
 ; CHECK-V-NEXT:    csrr a0, vlenb
-; CHECK-V-NEXT:    slli a0, a0, 2
+; CHECK-V-NEXT:    slli a0, a0, 1
 ; CHECK-V-NEXT:    add sp, sp, a0
 ; CHECK-V-NEXT:    ld ra, 72(sp) # 8-byte Folded Reload
 ; CHECK-V-NEXT:    ld s0, 64(sp) # 8-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/rvv/mgather-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/mgather-sdnode.ll
index 0b236f6d3ff388..f3ae03af7c7868 100644
--- a/llvm/test/CodeGen/RISCV/rvv/mgather-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/mgather-sdnode.ll
@@ -2136,18 +2136,17 @@ define <vscale x 32 x i8> @mgather_baseidx_nxv32i8(ptr %base, <vscale x 32 x i8>
 ; RV64-NEXT:    vluxei64.v v13, (a0), v24, v0.t
 ; RV64-NEXT:    srli a1, a1, 2
 ; RV64-NEXT:    vsetvli a3, zero, e8, mf2, ta, ma
-; RV64-NEXT:    vslidedown.vx v8, v16, a1
+; RV64-NEXT:    vslidedown.vx v0, v16, a1
+; RV64-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
+; RV64-NEXT:    vsext.vf8 v16, v10
+; RV64-NEXT:    vsetvli zero, zero, e8, m1, ta, mu
+; RV64-NEXT:    vluxei64.v v14, (a0), v16, v0.t
 ; RV64-NEXT:    vsetvli a1, zero, e8, mf4, ta, ma
-; RV64-NEXT:    vslidedown.vx v0, v8, a2
+; RV64-NEXT:    vslidedown.vx v0, v0, a2
 ; RV64-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
 ; RV64-NEXT:    vsext.vf8 v16, v11
 ; RV64-NEXT:    vsetvli zero, zero, e8, m1, ta, mu
 ; RV64-NEXT:    vluxei64.v v15, (a0), v16, v0.t
-; RV64-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; RV64-NEXT:    vsext.vf8 v16, v10
-; RV64-NEXT:    vsetvli zero, zero, e8, m1, ta, mu
-; RV64-NEXT:    vmv1r.v v0, v8
-; RV64-NEXT:    vluxei64.v v14, (a0), v16, v0.t
 ; RV64-NEXT:    vmv4r.v v8, v12
 ; RV64-NEXT:    ret
   %ptrs = getelementptr inbounds i8, ptr %base, <vscale x 32 x i8> %idxs
diff --git a/llvm/test/CodeGen/RISCV/rvv/pr63596.ll b/llvm/test/CodeGen/RISCV/rvv/pr63596.ll
index d13d67fd0a8824..c27488b18a017a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/pr63596.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/pr63596.ll
@@ -9,38 +9,39 @@ define <4 x float> @foo(ptr %0) nounwind {
 ; CHECK-NEXT:    sd s0, 32(sp) # 8-byte Folded Spill
 ; CHECK-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
 ; CHECK-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
-; CHECK-NEXT:    lhu s0, 0(a0)
-; CHECK-NEXT:    lhu s1, 2(a0)
-; CHECK-NEXT:    lhu s2, 4(a0)
-; CHECK-NEXT:    lhu a0, 6(a0)
+; CHECK-NEXT:    lhu s0, 6(a0)
+; CHECK-NEXT:    lhu s1, 4(a0)
+; CHECK-NEXT:    lhu s2, 0(a0)
+; CHECK-NEXT:    lhu a0, 2(a0)
 ; CHECK-NEXT:    fmv.w.x fa0, a0
 ; CHECK-NEXT:    call __extendhfsf2
-; CHECK-NEXT:    fsw fa0, 4(sp)
+; CHECK-NEXT:    fsw fa0, 8(sp)
 ; CHECK-NEXT:    fmv.w.x fa0, s2
 ; CHECK-NEXT:    call __extendhfsf2
-; CHECK-NEXT:    fsw fa0, 12(sp)
+; CHECK-NEXT:    fsw fa0, 0(sp)
 ; CHECK-NEXT:    fmv.w.x fa0, s1
 ; CHECK-NEXT:    call __extendhfsf2
-; CHECK-NEXT:    fsw fa0, 8(sp)
+; CHECK-NEXT:    fsw fa0, 12(sp)
 ; CHECK-NEXT:    fmv.w.x fa0, s0
 ; CHECK-NEXT:    call __extendhfsf2
-; CHECK-NEXT:    fsw fa0, 0(sp)
-; CHECK-NEXT:    addi a0, sp, 4
+; CHECK-NEXT:    fsw fa0, 4(sp)
+; CHECK-NEXT:    addi a0, sp, 8
 ; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
+; CHECK-NEXT:    vle32.v v9, (a0)
+; CHECK-NEXT:    mv a0, sp
 ; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    vsetivli zero, 2, e32, m1, tu, ma
+; CHECK-NEXT:    vslideup.vi v8, v9, 1
 ; CHECK-NEXT:    addi a0, sp, 12
+; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
 ; CHECK-NEXT:    vle32.v v9, (a0)
-; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vslideup.vi v9, v8, 1
-; CHECK-NEXT:    addi a0, sp, 8
+; CHECK-NEXT:    vsetivli zero, 3, e32, m1, tu, ma
+; CHECK-NEXT:    vslideup.vi v8, v9, 2
+; CHECK-NEXT:    addi a0, sp, 4
 ; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; CHECK-NEXT:    vle32.v v10, (a0)
-; CHECK-NEXT:    mv a0, sp
-; CHECK-NEXT:    vle32.v v8, (a0)
-; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vslideup.vi v8, v10, 1
+; CHECK-NEXT:    vle32.v v9, (a0)
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vslideup.vi v8, v9, 2
+; CHECK-NEXT:    vslideup.vi v8, v9, 3
 ; CHECK-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
 ; CHECK-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
 ; CHECK-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/rvv/rvv-stack-align.mir b/llvm/test/CodeGen/RISCV/rvv/rvv-stack-align.mir
index 6ea6fb183a7fdf..749bd4c13879b6 100644
--- a/llvm/test/CodeGen/RISCV/rvv/rvv-stack-align.mir
+++ b/llvm/test/CodeGen/RISCV/rvv/rvv-stack-align.mir
@@ -159,7 +159,7 @@ frameInfo:
   stackSize:       0
   offsetAdjustment: 0
   maxAlignment:    8
-  adjustsStack:    false
+  adjustsStack:    true
   hasCalls:        true
   stackProtector:  ''
   maxCallFrameSize: 4294967295
@@ -204,7 +204,7 @@ frameInfo:
   stackSize:       0
   offsetAdjustment: 0
   maxAlignment:    16
-  adjustsStack:    false
+  adjustsStack:    true
   hasCalls:        true
   stackProtector:  ''
   maxCallFrameSize: 4294967295
@@ -249,7 +249,7 @@ frameInfo:
   stackSize:       0
   offsetAdjustment: 0
   maxAlignment:    32
-  adjustsStack:    false
+  adjustsStack:    true
   hasCalls:        true
   stackProtector:  ''
   maxCallFrameSize: 4294967295
diff --git a/llvm/test/CodeGen/RISCV/rvv/wrong-stack-offset-for-rvv-object.mir b/llvm/test/CodeGen/RISCV/rvv/wrong-stack-offset-for-rvv-object.mir
index 06ed46f291a832..8248c26636793e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/wrong-stack-offset-for-rvv-object.mir
+++ b/llvm/test/CodeGen/RISCV/rvv/wrong-stack-offset-for-rvv-object.mir
@@ -83,7 +83,7 @@ frameInfo:
   stackSize:       0
   offsetAdjustment: 0
   maxAlignment:    8
-  adjustsStack:    false
+  adjustsStack:    true
   hasCalls:        true
   stackProtector:  ''
   maxCallFrameSize: 4294967295
diff --git a/llvm/test/CodeGen/RISCV/stack-inst-compress.mir b/llvm/test/CodeGen/RISCV/stack-inst-compress.mir
index 6721ff11d99b7b..5cc4615bb64a13 100644
--- a/llvm/test/CodeGen/RISCV/stack-inst-compress.mir
+++ b/llvm/test/CodeGen/RISCV/stack-inst-compress.mir
@@ -32,6 +32,7 @@ alignment:       2
 tracksRegLiveness: true
 frameInfo:
   maxAlignment:    4
+  adjustsStack:    true
   hasCalls:        true
   localFrameSize:  2048
 stack:
@@ -117,6 +118,7 @@ alignment:       2
 tracksRegLiveness: true
 frameInfo:
   maxAlignment:    4
+  adjustsStack:    true
   hasCalls:        true
   localFrameSize:  4096
 stack:
@@ -210,6 +212,7 @@ alignment:       2
 tracksRegLiveness: true
 frameInfo:
   maxAlignment:    4
+  adjustsStack:    true
   hasCalls:        true
   localFrameSize:  8192
 stack:
diff --git a/llvm/test/CodeGen/RISCV/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/RISCV/umulo-128-legalisation-lowering.ll
index 8c0d97afe6c21d..f1ae3200175636 100644
--- a/llvm/test/CodeGen/RISCV/umulo-128-legalisation-lowering.ll
+++ b/llvm/test/CodeGen/RISCV/umulo-128-legalisation-lowering.ll
@@ -89,8 +89,8 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) #0 {
 ; RISCV32-NEXT:    snez a3, a3
 ; RISCV32-NEXT:    and a3, a3, a7
 ; RISCV32-NEXT:    or a2, a3, a2
-; RISCV32-NEXT:    or a3, t2, t3
-; RISCV32-NEXT:    or a2, a2, a3
+; RISCV32-NEXT:    or a2, a2, t2
+; RISCV32-NEXT:    or a2, a2, t3
 ; RISCV32-NEXT:    mul a3, a5, a4
 ; RISCV32-NEXT:    andi a2, a2, 1
 ; RISCV32-NEXT:    sw a3, 0(a0)
diff --git a/llvm/test/CodeGen/SPIRV/ComparePointers.ll b/llvm/test/CodeGen/SPIRV/ComparePointers.ll
index 9be05944789b6f..6777fc38024b32 100644
--- a/llvm/test/CodeGen/SPIRV/ComparePointers.ll
+++ b/llvm/test/CodeGen/SPIRV/ComparePointers.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -O0 -mtriple=spirv64-unknown-unknown --mattr=+spirv1.3  %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
-; TODO: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
 
 ;; kernel void test(int global *in, int global *in2) {
 ;;   if (!in)
diff --git a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/lifetime.ll b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/lifetime.ll
new file mode 100644
index 00000000000000..710a1581f760ca
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/lifetime.ll
@@ -0,0 +1,25 @@
+; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s
+
+; CHECK: OpFunction
+; CHECK: %[[FooArg:.*]] = OpVariable
+; CHECK: OpLifetimeStart %[[FooArg]], 0
+; CHECK: OpCopyMemorySized
+; CHECK: OpBitcast
+; CHECK: OpInBoundsPtrAccessChain
+; CHECK: OpLifetimeStop %[[FooArg]], 0
+
+%tprange = type { %tparray }
+%tparray = type { [2 x i64] }
+
+define spir_func void @foo(ptr noundef byval(%tprange) align 8 %_arg_UserRange) {
+  %RoundedRangeKernel = alloca %tprange, align 8
+  call void @llvm.lifetime.start.p0(i64 72, ptr nonnull %RoundedRangeKernel) #7
+  call void @llvm.memcpy.p0.p0.i64(ptr align 8 %RoundedRangeKernel, ptr align 8 %_arg_UserRange, i64 16, i1 false)
+  %KernelFunc = getelementptr inbounds i8, ptr %RoundedRangeKernel, i64 16
+  call void @llvm.lifetime.end.p0(i64 72, ptr nonnull %RoundedRangeKernel) #7
+  ret void
+}
+
+declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture)
+declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg)
+declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture)
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/AtomicCompareExchangeExplicit_cl20.ll b/llvm/test/CodeGen/SPIRV/transcoding/AtomicCompareExchangeExplicit_cl20.ll
index 55cfcea999d84b..e0c47798cc6d09 100644
--- a/llvm/test/CodeGen/SPIRV/transcoding/AtomicCompareExchangeExplicit_cl20.ll
+++ b/llvm/test/CodeGen/SPIRV/transcoding/AtomicCompareExchangeExplicit_cl20.ll
@@ -1,5 +1,5 @@
-; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
-; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
+; RUN: llc -O0 -mtriple=spirv32-unknown-unknown --mattr=+spirv1.3 %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown --mattr=+spirv1.3 %s -o - -filetype=obj | spirv-val %}
 
 ;; __kernel void testAtomicCompareExchangeExplicit_cl20(
 ;;     volatile global atomic_int* object,
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/builtin_vars.ll b/llvm/test/CodeGen/SPIRV/transcoding/builtin_vars.ll
index f18f27a6de51d4..50748931635656 100644
--- a/llvm/test/CodeGen/SPIRV/transcoding/builtin_vars.ll
+++ b/llvm/test/CodeGen/SPIRV/transcoding/builtin_vars.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
-; TODO: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
 
 ; CHECK-SPIRV: OpDecorate %[[#Id:]] BuiltIn GlobalLinearId
 ; CHECK-SPIRV: %[[#Id:]] = OpVariable %[[#]]
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/builtin_vars_arithmetics.ll b/llvm/test/CodeGen/SPIRV/transcoding/builtin_vars_arithmetics.ll
index d39ca3c39383c0..d0c4dff43121cc 100644
--- a/llvm/test/CodeGen/SPIRV/transcoding/builtin_vars_arithmetics.ll
+++ b/llvm/test/CodeGen/SPIRV/transcoding/builtin_vars_arithmetics.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -O0 -mtriple=spirv64-unknown-linux %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
-; TODO: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
 
 ;; The IR was generated from the following source:
 ;; #include <CL/sycl.hpp>
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/builtin_vars_opt.ll b/llvm/test/CodeGen/SPIRV/transcoding/builtin_vars_opt.ll
index 03456aef6b6b2e..3885f070231442 100644
--- a/llvm/test/CodeGen/SPIRV/transcoding/builtin_vars_opt.ll
+++ b/llvm/test/CodeGen/SPIRV/transcoding/builtin_vars_opt.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -O0 -mtriple=spirv64-unknown-linux %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
-; TODO: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
 
 ;; The IR was generated from the following source:
 ;; #include <CL/sycl.hpp>
diff --git a/llvm/test/CodeGen/SystemZ/atomic-load-06.ll b/llvm/test/CodeGen/SystemZ/atomic-load-06.ll
index c9c5504520345c..60ff780df87b0e 100644
--- a/llvm/test/CodeGen/SystemZ/atomic-load-06.ll
+++ b/llvm/test/CodeGen/SystemZ/atomic-load-06.ll
@@ -4,9 +4,7 @@
 
 define float @f1(ptr %src) {
 ; CHECK-LABEL: f1:
-; CHECK: lgf [[R:%r[0-9]+]], 0(%r2)
-; CHECK: sllg [[R]], [[R]], 32
-; CHECK: ldgr %f0, [[R]]
+; CHECK: le %f0, 0(%r2)
 ; CHECK: br %r14
   %val = load atomic float, ptr %src seq_cst, align 4
   ret float %val
diff --git a/llvm/test/CodeGen/SystemZ/atomic-memops-fp128.ll b/llvm/test/CodeGen/SystemZ/atomic-memops-fp128.ll
new file mode 100644
index 00000000000000..8038329c0e09a0
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/atomic-memops-fp128.ll
@@ -0,0 +1,31 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
+;
+; Test fpext of atomic loads to fp128 without VectorEnhancements1 (using FP register pairs).
+
+define fp128 @f1(ptr %src) {
+; CHECK-LABEL: f1:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lxeb %f0, 0(%r3)
+; CHECK-NEXT:    std %f0, 0(%r2)
+; CHECK-NEXT:    std %f2, 8(%r2)
+; CHECK-NEXT:    br %r14
+  %V  = load atomic float, ptr %src seq_cst, align 4
+  %Res = fpext float %V to fp128
+  ret fp128 %Res
+}
+
+define fp128 @f2(ptr %src) {
+; CHECK-LABEL: f2:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lxdb %f0, 0(%r3)
+; CHECK-NEXT:    std %f0, 0(%r2)
+; CHECK-NEXT:    std %f2, 8(%r2)
+; CHECK-NEXT:    br %r14
+  %V  = load atomic double, ptr %src seq_cst, align 8
+  %Res = fpext double %V to fp128
+  ret fp128 %Res
+}
+
+
+
diff --git a/llvm/test/CodeGen/SystemZ/atomic-memops.ll b/llvm/test/CodeGen/SystemZ/atomic-memops.ll
new file mode 100644
index 00000000000000..0bc647aa0e0f7c
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/atomic-memops.ll
@@ -0,0 +1,739 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z16 | FileCheck %s
+
+; Sign-extending atomic loads.
+define void @f1(ptr %src, ptr %dst) {
+; CHECK-LABEL: f1:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lb %r0, 0(%r2)
+; CHECK-NEXT:    sth %r0, 0(%r3)
+; CHECK-NEXT:    br %r14
+  %b = load atomic i8, ptr %src seq_cst, align 1
+  %s = sext i8 %b to i16
+  store volatile i16 %s, ptr %dst
+  ret void
+}
+
+define void @f2(ptr %src, ptr %dst) {
+; CHECK-LABEL: f2:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lb %r0, 0(%r2)
+; CHECK-NEXT:    st %r0, 0(%r3)
+; CHECK-NEXT:    br %r14
+  %b = load atomic i8, ptr %src seq_cst, align 1
+  %s = sext i8 %b to i32
+  store volatile i32 %s, ptr %dst
+  ret void
+}
+
+define void @f3(ptr %src, ptr %dst) {
+; CHECK-LABEL: f3:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lgb %r0, 0(%r2)
+; CHECK-NEXT:    stg %r0, 0(%r3)
+; CHECK-NEXT:    br %r14
+  %b = load atomic i8, ptr %src seq_cst, align 1
+  %s = sext i8 %b to i64
+  store volatile i64 %s, ptr %dst
+  ret void
+}
+
+define void @f4(ptr %src, ptr %dst) {
+; CHECK-LABEL: f4:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lh %r0, 0(%r2)
+; CHECK-NEXT:    st %r0, 0(%r3)
+; CHECK-NEXT:    br %r14
+  %b = load atomic i16, ptr %src seq_cst, align 2
+  %s = sext i16 %b to i32
+  store volatile i32 %s, ptr %dst
+  ret void
+}
+
+define void @f5(ptr %src, ptr %dst) {
+; CHECK-LABEL: f5:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lgh %r0, 0(%r2)
+; CHECK-NEXT:    stg %r0, 0(%r3)
+; CHECK-NEXT:    br %r14
+  %b = load atomic i16, ptr %src seq_cst, align 2
+  %s = sext i16 %b to i64
+  store volatile i64 %s, ptr %dst
+  ret void
+}
+
+define void @f6(ptr %src, ptr %dst) {
+; CHECK-LABEL: f6:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lgf %r0, 0(%r2)
+; CHECK-NEXT:    stg %r0, 0(%r3)
+; CHECK-NEXT:    br %r14
+  %b = load atomic i32, ptr %src seq_cst, align 4
+  %s = sext i32 %b to i64
+  store volatile i64 %s, ptr %dst
+  ret void
+}
+
+; Zero-extending atomic loads.
+define void @f7(ptr %src, ptr %dst) {
+; CHECK-LABEL: f7:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    llc %r0, 0(%r2)
+; CHECK-NEXT:    sth %r0, 0(%r3)
+; CHECK-NEXT:    br %r14
+  %b = load atomic i8, ptr %src seq_cst, align 1
+  %z = zext i8 %b to i16
+  store volatile i16 %z, ptr %dst
+  ret void
+}
+
+define void @f8(ptr %src, ptr %dst) {
+; CHECK-LABEL: f8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    llc %r0, 0(%r2)
+; CHECK-NEXT:    st %r0, 0(%r3)
+; CHECK-NEXT:    br %r14
+  %b = load atomic i8, ptr %src seq_cst, align 1
+  %z = zext i8 %b to i32
+  store volatile i32 %z, ptr %dst
+  ret void
+}
+
+define void @f9(ptr %src, ptr %dst) {
+; CHECK-LABEL: f9:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    llgc %r0, 0(%r2)
+; CHECK-NEXT:    stg %r0, 0(%r3)
+; CHECK-NEXT:    br %r14
+  %b = load atomic i8, ptr %src seq_cst, align 1
+  %z = zext i8 %b to i64
+  store volatile i64 %z, ptr %dst
+  ret void
+}
+
+define void @f10(ptr %src, ptr %dst) {
+; CHECK-LABEL: f10:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    llh %r0, 0(%r2)
+; CHECK-NEXT:    st %r0, 0(%r3)
+; CHECK-NEXT:    br %r14
+  %b = load atomic i16, ptr %src seq_cst, align 2
+  %z = zext i16 %b to i32
+  store volatile i32 %z, ptr %dst
+  ret void
+}
+
+define void @f11(ptr %src, ptr %dst) {
+; CHECK-LABEL: f11:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    llgh %r0, 0(%r2)
+; CHECK-NEXT:    stg %r0, 0(%r3)
+; CHECK-NEXT:    br %r14
+  %b = load atomic i16, ptr %src seq_cst, align 2
+  %z = zext i16 %b to i64
+  store volatile i64 %z, ptr %dst
+  ret void
+}
+
+define void @f12(ptr %src, ptr %dst) {
+; CHECK-LABEL: f12:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    llgf %r0, 0(%r2)
+; CHECK-NEXT:    stg %r0, 0(%r3)
+; CHECK-NEXT:    br %r14
+  %b = load atomic i32, ptr %src seq_cst, align 4
+  %z = zext i32 %b to i64
+  store volatile i64 %z, ptr %dst
+  ret void
+}
+
+; reg/mem
+define i64 @f13(i64 %a, ptr %src) {
+; CHECK-LABEL: f13:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    ag %r2, 0(%r3)
+; CHECK-NEXT:    br %r14
+  %b = load atomic i64, ptr %src seq_cst, align 8
+  %add = add i64 %a, %b
+  ret i64 %add
+}
+
+; reg/mem op with extension from memory.
+define i64 @f14(i64 %a, ptr %src) {
+; CHECK-LABEL: f14:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    slgf %r2, 0(%r3)
+; CHECK-NEXT:    br %r14
+  %b = load atomic i32, ptr %src seq_cst, align 4
+  %bext = zext i32 %b to i64
+  %sub = sub i64 %a, %bext
+  ret i64 %sub
+}
+
+define float @f15(float %f1, ptr %ptr, float %acc) {
+; CHECK-LABEL: f15:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    maeb %f2, %f0, 0(%r2)
+; CHECK-NEXT:    ldr %f0, %f2
+; CHECK-NEXT:    br %r14
+  %f2 = load atomic float, ptr %ptr seq_cst, align 4
+  %res = call float @llvm.fma.f32 (float %f1, float %f2, float %acc)
+  ret float %res
+}
+declare float @llvm.fma.f32(float %f1, float %f2, float %f3)
+
+define double @f15_b(ptr %src) {
+; CHECK-LABEL: f15_b:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    ldeb %f0, 0(%r2)
+; CHECK-NEXT:    br %r14
+  %V  = load atomic float, ptr %src seq_cst, align 4
+  %Res = fpext float %V to double
+  ret double %Res
+}
+
+define fp128 @f15_c(ptr %src) {
+; CHECK-LABEL: f15_c:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lde %f0, 0(%r3)
+; CHECK-NEXT:    ldebr %f0, %f0
+; CHECK-NEXT:    wflld %v0, %f0
+; CHECK-NEXT:    vst %v0, 0(%r2), 3
+; CHECK-NEXT:    br %r14
+  %V  = load atomic float, ptr %src seq_cst, align 4
+  %Res = fpext float %V to fp128
+  ret fp128 %Res
+}
+
+define fp128 @f15_d(ptr %src) {
+; CHECK-LABEL: f15_d:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    ld %f0, 0(%r3)
+; CHECK-NEXT:    wflld %v0, %f0
+; CHECK-NEXT:    vst %v0, 0(%r2), 3
+; CHECK-NEXT:    br %r14
+  %V  = load atomic double, ptr %src seq_cst, align 8
+  %Res = fpext double %V to fp128
+  ret fp128 %Res
+}
+
+; Do it twice for good measure given the involved DAG combines.
+define void @f16(ptr %src, ptr %dst) {
+; CHECK-LABEL: f16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    llgc %r0, 0(%r2)
+; CHECK-NEXT:    lgbr %r1, %r0
+; CHECK-NEXT:    stg %r1, 0(%r3)
+; CHECK-NEXT:    stg %r0, 0(%r3)
+; CHECK-NEXT:    llgc %r0, 0(%r2)
+; CHECK-NEXT:    lgbr %r1, %r0
+; CHECK-NEXT:    stg %r1, 0(%r3)
+; CHECK-NEXT:    stg %r0, 0(%r3)
+; CHECK-NEXT:    br %r14
+  %b = load atomic i8, ptr %src seq_cst, align 1
+  %s = sext i8 %b to i64
+  %z = zext i8 %b to i64
+  store volatile i64 %s, ptr %dst
+  store volatile i64 %z, ptr %dst
+
+  %b2 = load atomic i8, ptr %src seq_cst, align 1
+  %s2 = sext i8 %b2 to i64
+  %z2 = zext i8 %b2 to i64
+  store volatile i64 %s2, ptr %dst
+  store volatile i64 %z2, ptr %dst
+
+  ret void
+}
+
+define void @f16_b(ptr %src, ptr %dst) {
+; CHECK-LABEL: f16_b:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lgb %r0, 0(%r2)
+; CHECK-NEXT:    sth %r0, 0(%r3)
+; CHECK-NEXT:    stg %r0, 0(%r3)
+; CHECK-NEXT:    br %r14
+  %b = load atomic i8, ptr %src seq_cst, align 1
+  %s = sext i8 %b to i16
+  store volatile i16 %s, ptr %dst
+
+  %s2 = sext i8 %b to i64
+  store volatile i64 %s2, ptr %dst
+
+  ret void
+}
+
+define void @f16_c(ptr %src, ptr %dst) {
+; CHECK-LABEL: f16_c:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    llgc %r0, 0(%r2)
+; CHECK-NEXT:    sth %r0, 0(%r3)
+; CHECK-NEXT:    stg %r0, 0(%r3)
+; CHECK-NEXT:    br %r14
+  %b = load atomic i8, ptr %src seq_cst, align 1
+  %z = zext i8 %b to i16
+  store volatile i16 %z, ptr %dst
+
+  %z2 = zext i8 %b to i64
+  store volatile i64 %z2, ptr %dst
+
+  ret void
+}
+
+; Check that two i8 loads use a reg/reg op.
+define i8 @f16_d(ptr %src, ptr %src2) {
+; CHECK-LABEL: f16_d:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lb %r2, 0(%r2)
+; CHECK-NEXT:    lb %r0, 0(%r3)
+; CHECK-NEXT:    ar %r2, %r0
+; CHECK-NEXT:    br %r14
+  %b = load atomic i8, ptr %src seq_cst, align 1
+  %b2 = load atomic i8, ptr %src2 seq_cst, align 1
+  %add = add i8 %b, %b2
+  ret i8 %add
+}
+
+; Binary operations on a byte in memory, with an atomic load.
+define void @f17(ptr %ptr) {
+; CHECK-LABEL: f17:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    ni 0(%r2), 1
+; CHECK-NEXT:    br %r14
+  %val = load atomic i8, ptr %ptr seq_cst, align 1
+  %xor = and i8 %val, -255
+  store i8 %xor, ptr %ptr
+  ret void
+}
+
+define void @f18(ptr %src) {
+; CHECK-LABEL: f18:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    oiy 4096(%r2), 1
+; CHECK-NEXT:    br %r14
+  %ptr = getelementptr i8, ptr %src, i64 4096
+  %val = load atomic i8, ptr %ptr seq_cst, align 1
+  %xor = or i8 %val, -255
+  store i8 %xor, ptr %ptr
+  ret void
+}
+
+define void @f19(ptr %src) {
+; CHECK-LABEL: f19:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xi 4095(%r2), 1
+; CHECK-NEXT:    br %r14
+  %ptr = getelementptr i8, ptr %src, i64 4095
+  %val = load atomic i8, ptr %ptr seq_cst, align 1
+  %xor = xor i8 %val, -255
+  store i8 %xor, ptr %ptr
+  ret void
+}
+
+; TM
+define double @f20(ptr %src, double %a, double %b) {
+; CHECK-LABEL: f20:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    tm 0(%r2), 1
+; CHECK-NEXT:    je .LBB25_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    ldr %f2, %f0
+; CHECK-NEXT:  .LBB25_2:
+; CHECK-NEXT:    ldr %f0, %f2
+; CHECK-NEXT:    br %r14
+  %byte = load atomic i8, ptr %src seq_cst, align 1
+  %and = and i8 %byte, 1
+  %cmp = icmp eq i8 %and, 0
+  %res = select i1 %cmp, double %b, double %a
+  ret double %res
+}
+
+; vector load and replicate
+define void @f21(ptr %src, ptr %dst) {
+; CHECK-LABEL: f21:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vlrepb %v0, 0(%r2)
+; CHECK-NEXT:    vst %v0, 0(%r3), 3
+; CHECK-NEXT:    br %r14
+  %b = load atomic i8, ptr %src seq_cst, align 1
+  %v = insertelement <16 x i8> undef, i8 %b, i32 1
+  store volatile <16 x i8> %v, ptr %dst
+  ret void
+}
+
+define void @f22(ptr %src, ptr %dst) {
+; CHECK-LABEL: f22:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vlreph %v0, 0(%r2)
+; CHECK-NEXT:    vst %v0, 0(%r3), 3
+; CHECK-NEXT:    br %r14
+  %b = load atomic i16, ptr %src seq_cst, align 2
+  %v = insertelement <8 x i16> undef, i16 %b, i32 1
+  store volatile <8 x i16> %v, ptr %dst
+  ret void
+}
+
+define void @f23(ptr %src, ptr %dst) {
+; CHECK-LABEL: f23:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vlrepf %v0, 0(%r2)
+; CHECK-NEXT:    vst %v0, 0(%r3), 3
+; CHECK-NEXT:    br %r14
+  %b = load atomic i32, ptr %src seq_cst, align 4
+  %v = insertelement <4 x i32> undef, i32 %b, i32 2
+  store volatile <4 x i32> %v, ptr %dst
+  ret void
+}
+
+define void @f24(ptr %src, ptr %dst) {
+; CHECK-LABEL: f24:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vlrepg %v0, 0(%r2)
+; CHECK-NEXT:    vst %v0, 0(%r3), 3
+; CHECK-NEXT:    br %r14
+  %b = load atomic i64, ptr %src seq_cst, align 8
+  %v = insertelement <2 x i64> undef, i64 %b, i32 0
+  store volatile <2 x i64> %v, ptr %dst
+  ret void
+}
+
+define void @f25(ptr %src, ptr %dst) {
+; CHECK-LABEL: f25:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vlrepf %v0, 0(%r2)
+; CHECK-NEXT:    vst %v0, 0(%r3), 3
+; CHECK-NEXT:    br %r14
+  %b = load atomic float, ptr %src seq_cst, align 4
+  %v = insertelement <4 x float> undef, float %b, i32 1
+  store volatile <4 x float> %v, ptr %dst
+  ret void
+}
+
+; Do *not* use vlrep for an extending load.
+define <4 x i32> @f25_c(ptr %ptr) {
+; CHECK-LABEL: f25_c:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lb %r0, 0(%r2)
+; CHECK-NEXT:    vlvgp %v0, %r0, %r0
+; CHECK-NEXT:    vrepf %v24, %v0, 1
+; CHECK-NEXT:    br %r14
+  %L = load atomic i8, ptr %ptr seq_cst, align 4
+  %S = sext i8 %L to i32
+  %val = insertelement <4 x i32> undef, i32 %S, i32 0
+  %ret = shufflevector <4 x i32> %val, <4 x i32> undef,
+                       <4 x i32> zeroinitializer
+  ret <4 x i32> %ret
+}
+
+; Do *not* use vlrep if there is another scalar use.
+define <4 x i32> @f25_d(ptr %ptr, ptr %dst) {
+; CHECK-LABEL: f25_d:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    l %r0, 0(%r2)
+; CHECK-NEXT:    vlvgp %v0, %r0, %r0
+; CHECK-NEXT:    vrepf %v24, %v0, 1
+; CHECK-NEXT:    st %r0, 0(%r3)
+; CHECK-NEXT:    br %r14
+  %L = load atomic i32, ptr %ptr seq_cst, align 4
+  store i32 %L, ptr %dst, align 4
+  %val = insertelement <4 x i32> undef, i32 %L, i32 0
+  %ret = shufflevector <4 x i32> %val, <4 x i32> undef,
+                       <4 x i32> zeroinitializer
+  ret <4 x i32> %ret
+}
+
+define void @f26(ptr %src, ptr %dst) {
+; CHECK-LABEL: f26:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vlrepg %v0, 0(%r2)
+; CHECK-NEXT:    vst %v0, 0(%r3), 3
+; CHECK-NEXT:    br %r14
+  %b = load atomic double, ptr %src seq_cst, align 8
+  %v = insertelement <2 x double> undef, double %b, i32 0
+  store volatile <2 x double> %v, ptr %dst
+  ret void
+}
+
+; Vector Load logical element and zero.
+define <16 x i8> @f27(ptr %ptr) {
+; CHECK-LABEL: f27:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vllezb %v24, 0(%r2)
+; CHECK-NEXT:    br %r14
+  %val = load atomic i8, ptr %ptr seq_cst, align 1
+  %ret = insertelement <16 x i8> zeroinitializer, i8 %val, i32 7
+  ret <16 x i8> %ret
+}
+
+define <8 x i16> @f28(ptr %ptr) {
+; CHECK-LABEL: f28:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vllezh %v24, 0(%r2)
+; CHECK-NEXT:    br %r14
+  %val = load atomic i16, ptr %ptr seq_cst, align 2
+  %ret = insertelement <8 x i16> zeroinitializer, i16 %val, i32 3
+  ret <8 x i16> %ret
+}
+
+define <4 x i32> @f29(ptr %ptr) {
+; CHECK-LABEL: f29:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vllezf %v24, 0(%r2)
+; CHECK-NEXT:    br %r14
+  %val = load atomic i32, ptr %ptr seq_cst, align 4
+  %ret = insertelement <4 x i32> zeroinitializer, i32 %val, i32 1
+  ret <4 x i32> %ret
+}
+
+define <2 x i64> @f30(ptr %ptr) {
+; CHECK-LABEL: f30:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vllezg %v24, 0(%r2)
+; CHECK-NEXT:    br %r14
+  %val = load atomic i64, ptr %ptr seq_cst, align 8
+  %ret = insertelement <2 x i64> zeroinitializer, i64 %val, i32 0
+  ret <2 x i64> %ret
+}
+
+define <4 x i32> @f31(ptr %ptr) {
+; CHECK-LABEL: f31:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vllezlf %v24, 0(%r2)
+; CHECK-NEXT:    br %r14
+  %val = load atomic i32, ptr %ptr seq_cst, align 4
+  %ret = insertelement <4 x i32> zeroinitializer, i32 %val, i32 0
+  ret <4 x i32> %ret
+}
+
+define <4 x float> @f32(ptr %ptr) {
+; CHECK-LABEL: f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vllezlf %v24, 0(%r2)
+; CHECK-NEXT:    br %r14
+  %val = load atomic float, ptr %ptr seq_cst, align 4
+  %ret = insertelement <4 x float> zeroinitializer, float %val, i32 0
+  ret <4 x float> %ret
+}
+
+; Vector Load element.
+define <16 x i8> @f33(<16 x i8> %val, ptr %ptr) {
+; CHECK-LABEL: f33:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vleb %v24, 0(%r2), 0
+; CHECK-NEXT:    br %r14
+  %element = load atomic i8, ptr %ptr seq_cst, align 1
+  %ret = insertelement <16 x i8> %val, i8 %element, i32 0
+  ret <16 x i8> %ret
+}
+
+define <8 x i16> @f34(<8 x i16> %val, ptr %ptr) {
+; CHECK-LABEL: f34:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vleh %v24, 0(%r2), 0
+; CHECK-NEXT:    br %r14
+  %element = load atomic i16, ptr %ptr seq_cst, align 2
+  %ret = insertelement <8 x i16> %val, i16 %element, i32 0
+  ret <8 x i16> %ret
+}
+
+define <4 x i32> @f35(<4 x i32> %val, ptr %ptr) {
+; CHECK-LABEL: f35:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vlef %v24, 0(%r2), 0
+; CHECK-NEXT:    br %r14
+  %element = load atomic i32, ptr %ptr seq_cst, align 4
+  %ret = insertelement <4 x i32> %val, i32 %element, i32 0
+  ret <4 x i32> %ret
+}
+
+define <2 x i64> @f36(<2 x i64> %val, ptr %ptr) {
+; CHECK-LABEL: f36:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vleg %v24, 0(%r2), 0
+; CHECK-NEXT:    br %r14
+  %element = load atomic i64, ptr %ptr seq_cst, align 8
+  %ret = insertelement <2 x i64> %val, i64 %element, i32 0
+  ret <2 x i64> %ret
+}
+
+; Test operation on memory involving atomic load and store.
+define void @f39(ptr %ptr) {
+; CHECK-LABEL: f39:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    oi 0(%r2), 1
+; CHECK-NEXT:    bcr 14, %r0
+; CHECK-NEXT:    br %r14
+  %val = load atomic i8, ptr %ptr seq_cst, align 1
+  %or = or i8 %val, -255
+  store atomic i8 %or, ptr %ptr seq_cst, align 1
+  ret void
+}
+
+; Some atomic stores of immediates.
+define void @f40(ptr %ptr) {
+; CHECK-LABEL: f40:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    mvi 0(%r2), 128
+; CHECK-NEXT:    bcr 14, %r0
+; CHECK-NEXT:    br %r14
+  store atomic i8 128, ptr %ptr seq_cst, align 1
+  ret void
+}
+
+define void @f41(ptr %ptr) {
+; CHECK-LABEL: f41:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    mvhi 0(%r2), -1
+; CHECK-NEXT:    bcr 14, %r0
+; CHECK-NEXT:    br %r14
+  store atomic i32 4294967295, ptr %ptr seq_cst, align 4
+  ret void
+}
+
+define void @f42(ptr %ptr) {
+; CHECK-LABEL: f42:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    mvhi 0(%r2), -1
+; CHECK-NEXT:    bcr 14, %r0
+; CHECK-NEXT:    br %r14
+  store atomic i32 4294967295, ptr %ptr seq_cst, align 4
+  ret void
+}
+
+define void @f43(ptr %ptr) {
+; CHECK-LABEL: f43:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    llihl %r0, 255
+; CHECK-NEXT:    oilf %r0, 4294967295
+; CHECK-NEXT:    stg %r0, 0(%r2)
+; CHECK-NEXT:    bcr 14, %r0
+; CHECK-NEXT:    br %r14
+  store atomic i64 1099511627775, ptr %ptr seq_cst, align 8
+  ret void
+}
+
+define void @f44(ptr %ptr) {
+; CHECK-LABEL: f44:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    larl %r1, .LCPI49_0
+; CHECK-NEXT:    ld %f0, 0(%r1)
+; CHECK-NEXT:    std %f0, 0(%r2)
+; CHECK-NEXT:    bcr 14, %r0
+; CHECK-NEXT:    br %r14
+  store atomic double 0x3ff0000020000000, ptr %ptr seq_cst, align 8
+  ret void
+}
+
+; Vector Store Element.
+define void @f45(<16 x i8> %val, ptr %ptr) {
+; CHECK-LABEL: f45:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsteb %v24, 0(%r2), 0
+; CHECK-NEXT:    bcr 14, %r0
+; CHECK-NEXT:    br %r14
+  %element = extractelement <16 x i8> %val, i32 0
+  store atomic i8 %element, ptr %ptr seq_cst, align 1
+  ret void
+}
+
+define void @f46(<8 x i16> %val, ptr %base) {
+; CHECK-LABEL: f46:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsteh %v24, 4094(%r2), 5
+; CHECK-NEXT:    bcr 14, %r0
+; CHECK-NEXT:    br %r14
+  %ptr = getelementptr i16, ptr %base, i32 2047
+  %element = extractelement <8 x i16> %val, i32 5
+  store atomic i16 %element, ptr %ptr seq_cst, align 2
+  ret void
+}
+
+define void @f47(<4 x i32> %val, ptr %ptr) {
+; CHECK-LABEL: f47:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vstef %v24, 0(%r2), 3
+; CHECK-NEXT:    bcr 14, %r0
+; CHECK-NEXT:    br %r14
+  %element = extractelement <4 x i32> %val, i32 3
+  store atomic i32 %element, ptr %ptr seq_cst, align 4
+  ret void
+}
+
+define void @f48(<2 x i64> %val, ptr %ptr) {
+; CHECK-LABEL: f48:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsteg %v24, 0(%r2), 1
+; CHECK-NEXT:    bcr 14, %r0
+; CHECK-NEXT:    br %r14
+  %element = extractelement <2 x i64> %val, i32 1
+  store atomic i64 %element, ptr %ptr seq_cst, align 8
+  ret void
+}
+
+define void @f49(<4 x float> %val, ptr %ptr) {
+; CHECK-LABEL: f49:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vstef %v24, 0(%r2), 0
+; CHECK-NEXT:    bcr 14, %r0
+; CHECK-NEXT:    br %r14
+  %element = extractelement <4 x float> %val, i32 0
+  store atomic float %element, ptr %ptr seq_cst, align 4
+  ret void
+}
+
+define void @f50(<2 x double> %val, ptr %ptr) {
+; CHECK-LABEL: f50:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsteg %v24, 0(%r2), 1
+; CHECK-NEXT:    bcr 14, %r0
+; CHECK-NEXT:    br %r14
+  %element = extractelement <2 x double> %val, i32 1
+  store atomic double %element, ptr %ptr seq_cst, align 8
+  ret void
+}
+
+define void @f51(ptr %src, ptr %dst) {
+; CHECK-LABEL: f51:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lpq %r0, 0(%r2)
+; CHECK-NEXT:    vlvgp %v0, %r0, %r1
+; CHECK-NEXT:    vgmf %v1, 2, 8
+; CHECK-NEXT:    aebr %f0, %f1
+; CHECK-NEXT:    ste %f0, 0(%r3)
+; CHECK-NEXT:    bcr 14, %r0
+; CHECK-NEXT:    br %r14
+  %atomic-load = load atomic i128, ptr %src seq_cst, align 16
+  %b0 = bitcast i128 %atomic-load to <4 x float>
+  %vecext = extractelement <4 x float> %b0, i64 0
+  %add = fadd float %vecext, 1.000000e+00
+  store atomic float %add, ptr %dst seq_cst, align 4
+  ret void
+}
+
+define void @f52(ptr %src, ptr %dst) {
+; CHECK-LABEL: f52:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lpq %r0, 0(%r2)
+; CHECK-NEXT:    vlvgp %v0, %r0, %r1
+; CHECK-NEXT:    vgmg %v1, 2, 11
+; CHECK-NEXT:    adbr %f0, %f1
+; CHECK-NEXT:    std %f0, 0(%r3)
+; CHECK-NEXT:    bcr 14, %r0
+; CHECK-NEXT:    br %r14
+  %atomic-load = load atomic i128, ptr %src seq_cst, align 16
+  %b0 = bitcast i128 %atomic-load to <2 x double>
+  %vecext = extractelement <2 x double> %b0, i64 0
+  %add = fadd double %vecext, 1.000000e+00
+  store atomic double %add, ptr %dst seq_cst, align 8
+  ret void
+}
+
+define void @fun58(ptr %ptr, i64 %arg) {
+; CHECK-LABEL: fun58:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    st %r3, 0(%r2)
+; CHECK-NEXT:    bcr 14, %r0
+; CHECK-NEXT:    br %r14
+  %res = trunc i64 %arg to i32
+  store atomic i32 %res, ptr %ptr seq_cst, align 4
+  ret void
+}
diff --git a/llvm/test/CodeGen/SystemZ/atomic-store-06.ll b/llvm/test/CodeGen/SystemZ/atomic-store-06.ll
index b748bfc767a4db..91e324b0af1a97 100644
--- a/llvm/test/CodeGen/SystemZ/atomic-store-06.ll
+++ b/llvm/test/CodeGen/SystemZ/atomic-store-06.ll
@@ -6,10 +6,7 @@
 define void @f1(ptr %src, float %val) {
 ; CHECK-LABEL: f1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    # kill: def $f0s killed $f0s def $f0d
-; CHECK-NEXT:    lgdr %r0, %f0
-; CHECK-NEXT:    srlg %r0, %r0, 32
-; CHECK-NEXT:    st %r0, 0(%r2)
+; CHECK-NEXT:    ste %f0, 0(%r2)
 ; CHECK-NEXT:    bcr 15, %r0
 ; CHECK-NEXT:    br %r14
   store atomic float %val, ptr %src seq_cst, align 4
diff --git a/llvm/test/CodeGen/SystemZ/cond-move-04.mir b/llvm/test/CodeGen/SystemZ/cond-move-04.mir
index 97aa00f582921d..23fd2739698a40 100644
--- a/llvm/test/CodeGen/SystemZ/cond-move-04.mir
+++ b/llvm/test/CodeGen/SystemZ/cond-move-04.mir
@@ -53,6 +53,7 @@ registers:
   - { id: 10, class: gr64bit }
   - { id: 11, class: gr32bit }
 frameInfo:       
+  adjustsStack:    true
   hasCalls:        true
 body:             |
   bb.0 (%ir-block.1):
diff --git a/llvm/test/CodeGen/SystemZ/cond-move-08.mir b/llvm/test/CodeGen/SystemZ/cond-move-08.mir
index 93aa5626b8e894..64c6d069799282 100644
--- a/llvm/test/CodeGen/SystemZ/cond-move-08.mir
+++ b/llvm/test/CodeGen/SystemZ/cond-move-08.mir
@@ -116,6 +116,7 @@ registers:
   - { id: 27, class: grx32bit }
   - { id: 28, class: addr64bit }
 frameInfo:
+  adjustsStack:    true
   hasCalls:        true
 body:             |
   bb.0.bb5:
diff --git a/llvm/test/CodeGen/SystemZ/cond-move-regalloc-hints-02.mir b/llvm/test/CodeGen/SystemZ/cond-move-regalloc-hints-02.mir
index 37e29800fb1d62..2701a1dc034a22 100644
--- a/llvm/test/CodeGen/SystemZ/cond-move-regalloc-hints-02.mir
+++ b/llvm/test/CodeGen/SystemZ/cond-move-regalloc-hints-02.mir
@@ -30,6 +30,7 @@ registers:
   - { id: 11, class: gr32bit }
 frameInfo:
   maxAlignment:    1
+  adjustsStack:    true
   hasCalls:        true
 machineFunctionInfo: {}
 body:             |
diff --git a/llvm/test/CodeGen/SystemZ/cond-move-regalloc-hints.mir b/llvm/test/CodeGen/SystemZ/cond-move-regalloc-hints.mir
index e7e1eaf8f8fdcd..c98ffda8372721 100644
--- a/llvm/test/CodeGen/SystemZ/cond-move-regalloc-hints.mir
+++ b/llvm/test/CodeGen/SystemZ/cond-move-regalloc-hints.mir
@@ -192,6 +192,7 @@ liveins:
   - { reg: '$r2d', virtual-reg: '%31' }
   - { reg: '$r3d', virtual-reg: '%32' }
 frameInfo:       
+  adjustsStack:    true
   hasCalls:        true
 body:             |
   bb.0.bb:
diff --git a/llvm/test/CodeGen/SystemZ/frame-28.mir b/llvm/test/CodeGen/SystemZ/frame-28.mir
index dd5933a9c7b4b4..13337dba6ec53f 100644
--- a/llvm/test/CodeGen/SystemZ/frame-28.mir
+++ b/llvm/test/CodeGen/SystemZ/frame-28.mir
@@ -162,6 +162,8 @@ body:             |
 ---
 name:            fun4
 tracksRegLiveness: true
+frameInfo:
+  adjustsStack:    true
 stack:
   - { id: 0, size: 5000 }
   - { id: 1, size: 2500 }
diff --git a/llvm/test/CodeGen/SystemZ/int-usub-12.ll b/llvm/test/CodeGen/SystemZ/int-usub-12.ll
index c39a6da37048d3..147fbfd920a9dc 100644
--- a/llvm/test/CodeGen/SystemZ/int-usub-12.ll
+++ b/llvm/test/CodeGen/SystemZ/int-usub-12.ll
@@ -11,6 +11,7 @@ define zeroext i1 @f1(i128 %a, i128 %b, ptr %res) {
 ; CHECK-NEXT:    vscbiq %v2, %v1, %v0
 ; CHECK-NEXT:    vlgvg %r2, %v2, 1
 ; CHECK-NEXT:    vsq %v0, %v1, %v0
+; CHECK-NEXT:    xilf %r2, 1
 ; CHECK-NEXT:    vst %v0, 0(%r4), 3
 ; CHECK-NEXT:    br %r14
   %t = call {i128, i1} @llvm.usub.with.overflow.i128(i128 %a, i128 %b)
@@ -27,6 +28,7 @@ define zeroext i1 @f2(i128 %a, i128 %b) {
 ; CHECK-NEXT:    vl %v1, 0(%r2), 3
 ; CHECK-NEXT:    vscbiq %v0, %v1, %v0
 ; CHECK-NEXT:    vlgvg %r2, %v0, 1
+; CHECK-NEXT:    xilf %r2, 1
 ; CHECK-NEXT:    br %r14
   %t = call {i128, i1} @llvm.usub.with.overflow.i128(i128 %a, i128 %b)
   %obit = extractvalue {i128, i1} %t, 1
@@ -46,5 +48,25 @@ define i128 @f3(i128 %a, i128 %b) {
   ret i128 %val
 }
 
+define i128 @f4(i128 %a, i128 %b) {
+; CHECK-LABEL: f4:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vl %v0, 0(%r4), 3
+; CHECK-NEXT:    vl %v1, 0(%r3), 3
+; CHECK-NEXT:    vscbiq %v2, %v1, %v0
+; CHECK-NEXT:    vlgvf %r0, %v2, 3
+; CHECK-NEXT:    vgbm %v2, 0
+; CHECK-NEXT:    xilf %r0, 1
+; CHECK-NEXT:    jl .LBB3_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    vsq %v2, %v1, %v0
+; CHECK-NEXT:  .LBB3_2:
+; CHECK-NEXT:    vst %v2, 0(%r2), 3
+; CHECK-NEXT:    br %r14
+  %val = call i128 @llvm.usub.sat.i128(i128 %a, i128 %b)
+  ret i128 %val
+}
+
 declare {i128, i1} @llvm.usub.with.overflow.i128(i128, i128) nounwind readnone
+declare i128 @llvm.usub.sat.i128(i128, i128) nounwind readnone
 
diff --git a/llvm/test/CodeGen/SystemZ/int-usub-13.ll b/llvm/test/CodeGen/SystemZ/int-usub-13.ll
index 637e1a81de996f..794af3b73fbe2a 100644
--- a/llvm/test/CodeGen/SystemZ/int-usub-13.ll
+++ b/llvm/test/CodeGen/SystemZ/int-usub-13.ll
@@ -15,6 +15,7 @@ define zeroext i1 @f1(i256 %a, i256 %b, ptr %res) {
 ; CHECK-NEXT:    vlgvg %r2, %v5, 1
 ; CHECK-NEXT:    vsbiq %v0, %v1, %v0, %v4
 ; CHECK-NEXT:    vsq %v1, %v3, %v2
+; CHECK-NEXT:    xilf %r2, 1
 ; CHECK-NEXT:    vst %v1, 16(%r4), 3
 ; CHECK-NEXT:    vst %v0, 0(%r4), 3
 ; CHECK-NEXT:    br %r14
@@ -35,6 +36,7 @@ define zeroext i1 @f2(i256 %a, i256 %b) {
 ; CHECK-NEXT:    vscbiq %v2, %v3, %v2
 ; CHECK-NEXT:    vsbcbiq %v0, %v1, %v0, %v2
 ; CHECK-NEXT:    vlgvg %r2, %v0, 1
+; CHECK-NEXT:    xilf %r2, 1
 ; CHECK-NEXT:    br %r14
   %t = call {i256, i1} @llvm.usub.with.overflow.i256(i256 %a, i256 %b)
   %obit = extractvalue {i256, i1} %t, 1
diff --git a/llvm/test/CodeGen/X86/apx/add.ll b/llvm/test/CodeGen/X86/apx/add.ll
index cdb29a70770e4b..d3301ecdb72d0f 100644
--- a/llvm/test/CodeGen/X86/apx/add.ll
+++ b/llvm/test/CodeGen/X86/apx/add.ll
@@ -298,9 +298,9 @@ define i8 @addflag8rr(i8 noundef %a, i8 noundef %b) {
 ; CHECK-LABEL: addflag8rr:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    addb %sil, %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0x00,0xf7]
-; CHECK-NEXT:    movzbl %al, %ecx # encoding: [0x0f,0xb6,0xc8]
-; CHECK-NEXT:    movl $255, %eax # encoding: [0xb8,0xff,0x00,0x00,0x00]
-; CHECK-NEXT:    cmovael %ecx, %eax # encoding: [0x0f,0x43,0xc1]
+; CHECK-NEXT:    movzbl %al, %eax # encoding: [0x0f,0xb6,0xc0]
+; CHECK-NEXT:    movl $255, %ecx # encoding: [0xb9,0xff,0x00,0x00,0x00]
+; CHECK-NEXT:    cmovbl %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x42,0xc1]
 ; CHECK-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
 entry:
@@ -311,10 +311,10 @@ entry:
 define i16 @addflag16rr(i16 noundef %a, i16 noundef %b) {
 ; CHECK-LABEL: addflag16rr:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addw %si, %di, %cx # encoding: [0x62,0xf4,0x75,0x18,0x01,0xf7]
-; CHECK-NEXT:    movl $65535, %eax # encoding: [0xb8,0xff,0xff,0x00,0x00]
+; CHECK-NEXT:    addw %si, %di, %ax # encoding: [0x62,0xf4,0x7d,0x18,0x01,0xf7]
+; CHECK-NEXT:    movl $65535, %ecx # encoding: [0xb9,0xff,0xff,0x00,0x00]
 ; CHECK-NEXT:    # imm = 0xFFFF
-; CHECK-NEXT:    cmovael %ecx, %eax # encoding: [0x0f,0x43,0xc1]
+; CHECK-NEXT:    cmovbl %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x42,0xc1]
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
 entry:
@@ -325,9 +325,9 @@ entry:
 define i32 @addflag32rr(i32 noundef %a, i32 noundef %b) {
 ; CHECK-LABEL: addflag32rr:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addl %esi, %edi, %ecx # encoding: [0x62,0xf4,0x74,0x18,0x01,0xf7]
-; CHECK-NEXT:    movl $-1, %eax # encoding: [0xb8,0xff,0xff,0xff,0xff]
-; CHECK-NEXT:    cmovael %ecx, %eax # encoding: [0x0f,0x43,0xc1]
+; CHECK-NEXT:    addl %esi, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x01,0xf7]
+; CHECK-NEXT:    movl $-1, %ecx # encoding: [0xb9,0xff,0xff,0xff,0xff]
+; CHECK-NEXT:    cmovbl %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x42,0xc1]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
 entry:
     %add = call i32 @llvm.uadd.sat.i32(i32 %a, i32 %b)
@@ -337,9 +337,9 @@ entry:
 define i64 @addflag64rr(i64 noundef %a, i64 noundef %b) {
 ; CHECK-LABEL: addflag64rr:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addq %rsi, %rdi, %rcx # encoding: [0x62,0xf4,0xf4,0x18,0x01,0xf7]
-; CHECK-NEXT:    movq $-1, %rax # encoding: [0x48,0xc7,0xc0,0xff,0xff,0xff,0xff]
-; CHECK-NEXT:    cmovaeq %rcx, %rax # encoding: [0x48,0x0f,0x43,0xc1]
+; CHECK-NEXT:    addq %rsi, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x01,0xf7]
+; CHECK-NEXT:    movq $-1, %rcx # encoding: [0x48,0xc7,0xc1,0xff,0xff,0xff,0xff]
+; CHECK-NEXT:    cmovbq %rcx, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x42,0xc1]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
 entry:
     %add = call i64 @llvm.uadd.sat.i64(i64 %a, i64 %b)
@@ -350,9 +350,9 @@ define i8 @addflag8rm(i8 noundef %a, ptr %b) {
 ; CHECK-LABEL: addflag8rm:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    addb (%rsi), %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0x02,0x3e]
-; CHECK-NEXT:    movzbl %al, %ecx # encoding: [0x0f,0xb6,0xc8]
-; CHECK-NEXT:    movl $255, %eax # encoding: [0xb8,0xff,0x00,0x00,0x00]
-; CHECK-NEXT:    cmovael %ecx, %eax # encoding: [0x0f,0x43,0xc1]
+; CHECK-NEXT:    movzbl %al, %eax # encoding: [0x0f,0xb6,0xc0]
+; CHECK-NEXT:    movl $255, %ecx # encoding: [0xb9,0xff,0x00,0x00,0x00]
+; CHECK-NEXT:    cmovbl %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x42,0xc1]
 ; CHECK-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
 entry:
@@ -364,10 +364,10 @@ entry:
 define i16 @addflag16rm(i16 noundef %a, ptr %b) {
 ; CHECK-LABEL: addflag16rm:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addw (%rsi), %di, %cx # encoding: [0x62,0xf4,0x75,0x18,0x03,0x3e]
-; CHECK-NEXT:    movl $65535, %eax # encoding: [0xb8,0xff,0xff,0x00,0x00]
+; CHECK-NEXT:    addw (%rsi), %di, %ax # encoding: [0x62,0xf4,0x7d,0x18,0x03,0x3e]
+; CHECK-NEXT:    movl $65535, %ecx # encoding: [0xb9,0xff,0xff,0x00,0x00]
 ; CHECK-NEXT:    # imm = 0xFFFF
-; CHECK-NEXT:    cmovael %ecx, %eax # encoding: [0x0f,0x43,0xc1]
+; CHECK-NEXT:    cmovbl %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x42,0xc1]
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
 entry:
@@ -379,9 +379,9 @@ entry:
 define i32 @addflag32rm(i32 noundef %a, ptr %b) {
 ; CHECK-LABEL: addflag32rm:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addl (%rsi), %edi, %ecx # encoding: [0x62,0xf4,0x74,0x18,0x03,0x3e]
-; CHECK-NEXT:    movl $-1, %eax # encoding: [0xb8,0xff,0xff,0xff,0xff]
-; CHECK-NEXT:    cmovael %ecx, %eax # encoding: [0x0f,0x43,0xc1]
+; CHECK-NEXT:    addl (%rsi), %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x03,0x3e]
+; CHECK-NEXT:    movl $-1, %ecx # encoding: [0xb9,0xff,0xff,0xff,0xff]
+; CHECK-NEXT:    cmovbl %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x42,0xc1]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
 entry:
     %t = load i32, ptr %b
@@ -392,9 +392,9 @@ entry:
 define i64 @addflag64rm(i64 noundef %a, ptr %b) {
 ; CHECK-LABEL: addflag64rm:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addq (%rsi), %rdi, %rcx # encoding: [0x62,0xf4,0xf4,0x18,0x03,0x3e]
-; CHECK-NEXT:    movq $-1, %rax # encoding: [0x48,0xc7,0xc0,0xff,0xff,0xff,0xff]
-; CHECK-NEXT:    cmovaeq %rcx, %rax # encoding: [0x48,0x0f,0x43,0xc1]
+; CHECK-NEXT:    addq (%rsi), %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x03,0x3e]
+; CHECK-NEXT:    movq $-1, %rcx # encoding: [0x48,0xc7,0xc1,0xff,0xff,0xff,0xff]
+; CHECK-NEXT:    cmovbq %rcx, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x42,0xc1]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
 entry:
     %t = load i64, ptr %b
@@ -405,10 +405,10 @@ entry:
 define i16 @addflag16ri8(i16 noundef %a) {
 ; CHECK-LABEL: addflag16ri8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addw $123, %di, %cx # encoding: [0x62,0xf4,0x75,0x18,0x83,0xc7,0x7b]
-; CHECK-NEXT:    movl $65535, %eax # encoding: [0xb8,0xff,0xff,0x00,0x00]
+; CHECK-NEXT:    addw $123, %di, %ax # encoding: [0x62,0xf4,0x7d,0x18,0x83,0xc7,0x7b]
+; CHECK-NEXT:    movl $65535, %ecx # encoding: [0xb9,0xff,0xff,0x00,0x00]
 ; CHECK-NEXT:    # imm = 0xFFFF
-; CHECK-NEXT:    cmovael %ecx, %eax # encoding: [0x0f,0x43,0xc1]
+; CHECK-NEXT:    cmovbl %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x42,0xc1]
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
 entry:
@@ -419,9 +419,9 @@ entry:
 define i32 @addflag32ri8(i32 noundef %a) {
 ; CHECK-LABEL: addflag32ri8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addl $123, %edi, %ecx # encoding: [0x62,0xf4,0x74,0x18,0x83,0xc7,0x7b]
-; CHECK-NEXT:    movl $-1, %eax # encoding: [0xb8,0xff,0xff,0xff,0xff]
-; CHECK-NEXT:    cmovael %ecx, %eax # encoding: [0x0f,0x43,0xc1]
+; CHECK-NEXT:    addl $123, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x83,0xc7,0x7b]
+; CHECK-NEXT:    movl $-1, %ecx # encoding: [0xb9,0xff,0xff,0xff,0xff]
+; CHECK-NEXT:    cmovbl %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x42,0xc1]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
 entry:
     %add = call i32 @llvm.uadd.sat.i32(i32 %a, i32 123)
@@ -431,9 +431,9 @@ entry:
 define i64 @addflag64ri8(i64 noundef %a) {
 ; CHECK-LABEL: addflag64ri8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addq $123, %rdi, %rcx # encoding: [0x62,0xf4,0xf4,0x18,0x83,0xc7,0x7b]
-; CHECK-NEXT:    movq $-1, %rax # encoding: [0x48,0xc7,0xc0,0xff,0xff,0xff,0xff]
-; CHECK-NEXT:    cmovaeq %rcx, %rax # encoding: [0x48,0x0f,0x43,0xc1]
+; CHECK-NEXT:    addq $123, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x83,0xc7,0x7b]
+; CHECK-NEXT:    movq $-1, %rcx # encoding: [0x48,0xc7,0xc1,0xff,0xff,0xff,0xff]
+; CHECK-NEXT:    cmovbq %rcx, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x42,0xc1]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
 entry:
     %add = call i64 @llvm.uadd.sat.i64(i64 %a, i64 123)
@@ -444,9 +444,9 @@ define i8 @addflag8ri(i8 noundef %a) {
 ; CHECK-LABEL: addflag8ri:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    addb $123, %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0x80,0xc7,0x7b]
-; CHECK-NEXT:    movzbl %al, %ecx # encoding: [0x0f,0xb6,0xc8]
-; CHECK-NEXT:    movl $255, %eax # encoding: [0xb8,0xff,0x00,0x00,0x00]
-; CHECK-NEXT:    cmovael %ecx, %eax # encoding: [0x0f,0x43,0xc1]
+; CHECK-NEXT:    movzbl %al, %eax # encoding: [0x0f,0xb6,0xc0]
+; CHECK-NEXT:    movl $255, %ecx # encoding: [0xb9,0xff,0x00,0x00,0x00]
+; CHECK-NEXT:    cmovbl %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x42,0xc1]
 ; CHECK-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
 entry:
@@ -457,11 +457,11 @@ entry:
 define i16 @addflag16ri(i16 noundef %a) {
 ; CHECK-LABEL: addflag16ri:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addw $1234, %di, %cx # encoding: [0x62,0xf4,0x75,0x18,0x81,0xc7,0xd2,0x04]
+; CHECK-NEXT:    addw $1234, %di, %ax # encoding: [0x62,0xf4,0x7d,0x18,0x81,0xc7,0xd2,0x04]
 ; CHECK-NEXT:    # imm = 0x4D2
-; CHECK-NEXT:    movl $65535, %eax # encoding: [0xb8,0xff,0xff,0x00,0x00]
+; CHECK-NEXT:    movl $65535, %ecx # encoding: [0xb9,0xff,0xff,0x00,0x00]
 ; CHECK-NEXT:    # imm = 0xFFFF
-; CHECK-NEXT:    cmovael %ecx, %eax # encoding: [0x0f,0x43,0xc1]
+; CHECK-NEXT:    cmovbl %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x42,0xc1]
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
 entry:
@@ -472,10 +472,10 @@ entry:
 define i32 @addflag32ri(i32 noundef %a) {
 ; CHECK-LABEL: addflag32ri:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addl $123456, %edi, %ecx # encoding: [0x62,0xf4,0x74,0x18,0x81,0xc7,0x40,0xe2,0x01,0x00]
+; CHECK-NEXT:    addl $123456, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x81,0xc7,0x40,0xe2,0x01,0x00]
 ; CHECK-NEXT:    # imm = 0x1E240
-; CHECK-NEXT:    movl $-1, %eax # encoding: [0xb8,0xff,0xff,0xff,0xff]
-; CHECK-NEXT:    cmovael %ecx, %eax # encoding: [0x0f,0x43,0xc1]
+; CHECK-NEXT:    movl $-1, %ecx # encoding: [0xb9,0xff,0xff,0xff,0xff]
+; CHECK-NEXT:    cmovbl %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x42,0xc1]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
 entry:
     %add = call i32 @llvm.uadd.sat.i32(i32 %a, i32 123456)
@@ -485,10 +485,10 @@ entry:
 define i64 @addflag64ri(i64 noundef %a) {
 ; CHECK-LABEL: addflag64ri:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addq $123456, %rdi, %rcx # encoding: [0x62,0xf4,0xf4,0x18,0x81,0xc7,0x40,0xe2,0x01,0x00]
+; CHECK-NEXT:    addq $123456, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x81,0xc7,0x40,0xe2,0x01,0x00]
 ; CHECK-NEXT:    # imm = 0x1E240
-; CHECK-NEXT:    movq $-1, %rax # encoding: [0x48,0xc7,0xc0,0xff,0xff,0xff,0xff]
-; CHECK-NEXT:    cmovaeq %rcx, %rax # encoding: [0x48,0x0f,0x43,0xc1]
+; CHECK-NEXT:    movq $-1, %rcx # encoding: [0x48,0xc7,0xc1,0xff,0xff,0xff,0xff]
+; CHECK-NEXT:    cmovbq %rcx, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x42,0xc1]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
 entry:
     %add = call i64 @llvm.uadd.sat.i64(i64 %a, i64 123456)
diff --git a/llvm/test/CodeGen/X86/apx/cfcmov.ll b/llvm/test/CodeGen/X86/apx/cfcmov.ll
new file mode 100644
index 00000000000000..f643120c9b50ff
--- /dev/null
+++ b/llvm/test/CodeGen/X86/apx/cfcmov.ll
@@ -0,0 +1,95 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+cf -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+cf -x86-cmov-converter=false -verify-machineinstrs | FileCheck %s
+
+define i8 @cfcmov8rr(i8 %0) {
+; CHECK-LABEL: cfcmov8rr:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    cmpb $1, %dil
+; CHECK-NEXT:    cfcmovel %edi, %eax
+; CHECK-NEXT:    # kill: def $al killed $al killed $eax
+; CHECK-NEXT:    retq
+  %2 = icmp eq i8 %0, 1
+  %3 = select i1 %2, i8 %0, i8 0
+  ret i8 %3
+}
+
+define i16 @cfcmov16rr(i16 %0) {
+; CHECK-LABEL: cfcmov16rr:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    cmpw $1, %di
+; CHECK-NEXT:    cfcmovnel %edi, %eax
+; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
+; CHECK-NEXT:    retq
+  %2 = icmp ne i16 %0, 1
+  %3 = select i1 %2, i16 %0, i16 0
+  ret i16 %3
+}
+
+define i32 @cfcmov32rr(i32 %0) {
+; CHECK-LABEL: cfcmov32rr:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    cmpl $2, %edi
+; CHECK-NEXT:    cfcmovael %edi, %eax
+; CHECK-NEXT:    retq
+  %2 = icmp ugt i32 %0, 1
+  %3 = select i1 %2, i32 %0, i32 0
+  ret i32 %3
+}
+
+define i64 @cfcmov64rr(i64 %0) {
+; CHECK-LABEL: cfcmov64rr:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    testq %rdi, %rdi
+; CHECK-NEXT:    cfcmoveq %rdi, %rax
+; CHECK-NEXT:    retq
+  %2 = icmp ult i64 %0, 1
+  %3 = select i1 %2, i64 %0, i64 0
+  ret i64 %3
+}
+
+define i8 @cfcmov8rr_inv(i8 %0) {
+; CHECK-LABEL: cfcmov8rr_inv:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    cmpb $1, %dil
+; CHECK-NEXT:    cfcmovnel %edi, %eax
+; CHECK-NEXT:    # kill: def $al killed $al killed $eax
+; CHECK-NEXT:    retq
+  %2 = icmp eq i8 %0, 1
+  %3 = select i1 %2, i8 0, i8 %0
+  ret i8 %3
+}
+
+define i16 @cfcmov16rr_inv(i16 %0) {
+; CHECK-LABEL: cfcmov16rr_inv:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    cmpw $1, %di
+; CHECK-NEXT:    cfcmovel %edi, %eax
+; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
+; CHECK-NEXT:    retq
+  %2 = icmp ne i16 %0, 1
+  %3 = select i1 %2, i16 0, i16 %0
+  ret i16 %3
+}
+
+define i32 @cfcmov32rr_inv(i32 %0) {
+; CHECK-LABEL: cfcmov32rr_inv:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    cmpl $2, %edi
+; CHECK-NEXT:    cfcmovbl %edi, %eax
+; CHECK-NEXT:    retq
+  %2 = icmp ugt i32 %0, 1
+  %3 = select i1 %2, i32 0, i32 %0
+  ret i32 %3
+}
+
+define i64 @cfcmov64rr_inv(i64 %0) {
+; CHECK-LABEL: cfcmov64rr_inv:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    cmpq $2, %rdi
+; CHECK-NEXT:    cfcmovaeq %rdi, %rax
+; CHECK-NEXT:    retq
+  %2 = icmp ule i64 %0, 1
+  %3 = select i1 %2, i64 0, i64 %0
+  ret i64 %3
+}
diff --git a/llvm/test/CodeGen/X86/apx/flags-copy-lowering.mir b/llvm/test/CodeGen/X86/apx/flags-copy-lowering.mir
index d6a9cda1dc8162..e81a4480ba44cb 100644
--- a/llvm/test/CodeGen/X86/apx/flags-copy-lowering.mir
+++ b/llvm/test/CodeGen/X86/apx/flags-copy-lowering.mir
@@ -29,6 +29,18 @@
     call void @foo()
     ret void
   }
+
+  define void @test_cmov(i64 %a, i64 %b) {
+  entry:
+    call void @foo()
+    ret void
+  }
+
+  define void @test_cfcmov(i64 %a, i64 %b) {
+  entry:
+    call void @foo()
+    ret void
+  }
 ...
 ---
 name:            test_adc
@@ -166,3 +178,93 @@ body:             |
     RET 0
 
 ...
+---
+name:            test_cmov
+# CHECK-LABEL: name: test_cmov
+liveins:
+  - { reg: '$rdi', virtual-reg: '%0' }
+  - { reg: '$rsi', virtual-reg: '%1' }
+body:             |
+  bb.0:
+    liveins: $rdi, $rsi
+
+    %0:gr64 = COPY $rdi
+    %1:gr64 = COPY $rsi
+    CMP64rr %0, %1, implicit-def $eflags
+    %2:gr64 = COPY $eflags
+  ; CHECK-NOT:  COPY{{( killed)?}} $eflags
+  ; CHECK:      %[[A_REG:[^:]*]]:gr8 = SETCCr 7, implicit $eflags
+  ; CHECK-NEXT: %[[B_REG:[^:]*]]:gr8 = SETCCr 2, implicit $eflags
+  ; CHECK-NEXT: %[[E_REG:[^:]*]]:gr8 = SETCCr 4, implicit $eflags
+  ; CHECK-NOT:  COPY{{( killed)?}} $eflags
+
+    ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+    CALL64pcrel32 @foo, csr_64, implicit $rsp, implicit $ssp, implicit $rdi, implicit-def $rsp, implicit-def $ssp, implicit-def $eax
+    ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+
+    $eflags = COPY %2
+    %3:gr64 = CMOV64rr_ND %0, %1, 7, implicit $eflags
+    %4:gr64 = CMOV64rr_ND %0, %1, 2, implicit $eflags
+    %5:gr64 = CMOV64rr_ND %0, %1, 4, implicit $eflags
+    %6:gr64 = CMOV64rr_ND %0, %1, 5, implicit killed $eflags
+  ; CHECK-NOT:     $eflags =
+  ; CHECK:         TEST8rr %[[A_REG]], %[[A_REG]], implicit-def $eflags
+  ; CHECK-NEXT:    %3:gr64 = CMOV64rr_ND %0, %1, 5, implicit killed $eflags
+  ; CHECK-NEXT:    TEST8rr %[[B_REG]], %[[B_REG]], implicit-def $eflags
+  ; CHECK-NEXT:    %4:gr64 = CMOV64rr_ND %0, %1, 5, implicit killed $eflags
+  ; CHECK-NEXT:    TEST8rr %[[E_REG]], %[[E_REG]], implicit-def $eflags
+  ; CHECK-NEXT:    %5:gr64 = CMOV64rr_ND %0, %1, 5, implicit killed $eflags
+  ; CHECK-NEXT:    TEST8rr %[[E_REG]], %[[E_REG]], implicit-def $eflags
+  ; CHECK-NEXT:    %6:gr64 = CMOV64rr_ND %0, %1, 4, implicit killed $eflags
+    MOV64mr $rsp, 1, $noreg, -16, $noreg, killed %3
+    MOV64mr $rsp, 1, $noreg, -16, $noreg, killed %4
+    MOV64mr $rsp, 1, $noreg, -16, $noreg, killed %5
+    MOV64mr $rsp, 1, $noreg, -16, $noreg, killed %6
+
+    RET 0
+...
+---
+name:            test_cfcmov
+# CHECK-LABEL: name: test_cfcmov
+liveins:
+  - { reg: '$rdi', virtual-reg: '%0' }
+  - { reg: '$rsi', virtual-reg: '%1' }
+body:             |
+  bb.0:
+    liveins: $rdi, $rsi
+
+    %0:gr64 = COPY $rdi
+    %1:gr64 = COPY $rsi
+    CMP64rr %0, %1, implicit-def $eflags
+    %2:gr64 = COPY $eflags
+  ; CHECK-NOT:  COPY{{( killed)?}} $eflags
+  ; CHECK:      %[[A_REG:[^:]*]]:gr8 = SETCCr 7, implicit $eflags
+  ; CHECK-NEXT: %[[B_REG:[^:]*]]:gr8 = SETCCr 2, implicit $eflags
+  ; CHECK-NEXT: %[[E_REG:[^:]*]]:gr8 = SETCCr 4, implicit $eflags
+  ; CHECK-NOT:  COPY{{( killed)?}} $eflags
+
+    ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+    CALL64pcrel32 @foo, csr_64, implicit $rsp, implicit $ssp, implicit $rdi, implicit-def $rsp, implicit-def $ssp, implicit-def $eax
+    ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+
+    $eflags = COPY %2
+    %3:gr64 = CFCMOV64rr %1, 7, implicit $eflags
+    %4:gr64 = CFCMOV64rr %1, 2, implicit $eflags
+    %5:gr64 = CFCMOV64rr_ND %0, %1, 4, implicit $eflags
+    %6:gr64 = CFCMOV64rr_ND %0, %1, 5, implicit killed $eflags
+  ; CHECK-NOT:     $eflags =
+  ; CHECK:         TEST8rr %[[A_REG]], %[[A_REG]], implicit-def $eflags
+  ; CHECK-NEXT:    %3:gr64 = CFCMOV64rr %1, 5, implicit killed $eflags
+  ; CHECK-NEXT:    TEST8rr %[[B_REG]], %[[B_REG]], implicit-def $eflags
+  ; CHECK-NEXT:    %4:gr64 = CFCMOV64rr %1, 5, implicit killed $eflags
+  ; CHECK-NEXT:    TEST8rr %[[E_REG]], %[[E_REG]], implicit-def $eflags
+  ; CHECK-NEXT:    %5:gr64 = CFCMOV64rr_ND %0, %1, 5, implicit killed $eflags
+  ; CHECK-NEXT:    TEST8rr %[[E_REG]], %[[E_REG]], implicit-def $eflags
+  ; CHECK-NEXT:    %6:gr64 = CFCMOV64rr_ND %0, %1, 4, implicit killed $eflags
+    MOV64mr $rsp, 1, $noreg, -16, $noreg, killed %3
+    MOV64mr $rsp, 1, $noreg, -16, $noreg, killed %4
+    MOV64mr $rsp, 1, $noreg, -16, $noreg, killed %5
+    MOV64mr $rsp, 1, $noreg, -16, $noreg, killed %6
+
+    RET 0
+...
diff --git a/llvm/test/CodeGen/X86/apx/inc.ll b/llvm/test/CodeGen/X86/apx/inc.ll
index 613f7866c9ac5c..a9c6d740cf2cee 100644
--- a/llvm/test/CodeGen/X86/apx/inc.ll
+++ b/llvm/test/CodeGen/X86/apx/inc.ll
@@ -92,9 +92,9 @@ define i8 @uinc8r(i8 noundef %a) {
 ; CHECK-LABEL: uinc8r:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    incb %dil, %al
-; CHECK-NEXT:    movzbl %al, %ecx
-; CHECK-NEXT:    movl $255, %eax
-; CHECK-NEXT:    cmovnel %ecx, %eax
+; CHECK-NEXT:    movzbl %al, %eax
+; CHECK-NEXT:    movl $255, %ecx
+; CHECK-NEXT:    cmovel %ecx, %eax
 ; CHECK-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq
 entry:
@@ -105,9 +105,9 @@ entry:
 define i16 @uinc16r(i16 noundef %a) {
 ; CHECK-LABEL: uinc16r:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    incw %di, %cx
-; CHECK-NEXT:    movl $65535, %eax # imm = 0xFFFF
-; CHECK-NEXT:    cmovnel %ecx, %eax
+; CHECK-NEXT:    incw %di, %ax
+; CHECK-NEXT:    movl $65535, %ecx # imm = 0xFFFF
+; CHECK-NEXT:    cmovel %ecx, %eax
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq
 entry:
@@ -118,9 +118,9 @@ entry:
 define i32 @uinc32r(i32 noundef %a) {
 ; CHECK-LABEL: uinc32r:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    incl %edi, %ecx
-; CHECK-NEXT:    movl $-1, %eax
-; CHECK-NEXT:    cmovnel %ecx, %eax
+; CHECK-NEXT:    incl %edi, %eax
+; CHECK-NEXT:    movl $-1, %ecx
+; CHECK-NEXT:    cmovel %ecx, %eax
 ; CHECK-NEXT:    retq
 entry:
   %inc = call i32 @llvm.uadd.sat.i32(i32 %a, i32 1)
@@ -130,9 +130,9 @@ entry:
 define i64 @uinc64r(i64 noundef %a) {
 ; CHECK-LABEL: uinc64r:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    incq %rdi, %rcx
-; CHECK-NEXT:    movq $-1, %rax
-; CHECK-NEXT:    cmovneq %rcx, %rax
+; CHECK-NEXT:    incq %rdi, %rax
+; CHECK-NEXT:    movq $-1, %rcx
+; CHECK-NEXT:    cmoveq %rcx, %rax
 ; CHECK-NEXT:    retq
 entry:
   %inc = call i64 @llvm.uadd.sat.i64(i64 %a, i64 1)
diff --git a/llvm/test/CodeGen/X86/apx/shift-eflags.ll b/llvm/test/CodeGen/X86/apx/shift-eflags.ll
index f34dc6c05dad98..932cdc189bf9f8 100644
--- a/llvm/test/CodeGen/X86/apx/shift-eflags.ll
+++ b/llvm/test/CodeGen/X86/apx/shift-eflags.ll
@@ -7,9 +7,8 @@
 define i32 @ashr_const(i32 %a0, i32 %a1, i32 %a2, i32 %a3) {
 ; CHECK-LABEL: ashr_const:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movl %edx, %eax
-; CHECK-NEXT:    sarl $14, %edi, %edx
-; CHECK-NEXT:    cmovnel %ecx, %eax
+; CHECK-NEXT:    sarl $14, %edi, %eax
+; CHECK-NEXT:    cmovel %edx, %ecx, %eax
 ; CHECK-NEXT:    retq
   %s = ashr i32 %a0, 14
   %c = icmp eq i32 %s, 0
@@ -21,9 +20,8 @@ define i32 @ashr_const(i32 %a0, i32 %a1, i32 %a2, i32 %a3) {
 define i32 @lshr_const(i32 %a0, i32 %a1, i32 %a2, i32 %a3) {
 ; CHECK-LABEL: lshr_const:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movl %edx, %eax
 ; CHECK-NEXT:    testl $-16384, %edi # imm = 0xC000
-; CHECK-NEXT:    cmovnel %ecx, %eax
+; CHECK-NEXT:    cmovel %edx, %ecx, %eax
 ; CHECK-NEXT:    retq
   %s = lshr i32 %a0, 14
   %c = icmp eq i32 %s, 0
@@ -35,9 +33,8 @@ define i32 @lshr_const(i32 %a0, i32 %a1, i32 %a2, i32 %a3) {
 define i32 @shl_const(i32 %a0, i32 %a1, i32 %a2, i32 %a3) {
 ; CHECK-LABEL: shl_const:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movl %edx, %eax
 ; CHECK-NEXT:    testl $262143, %edi # imm = 0x3FFFF
-; CHECK-NEXT:    cmovnel %ecx, %eax
+; CHECK-NEXT:    cmovel %edx, %ecx, %eax
 ; CHECK-NEXT:    retq
   %s = shl i32 %a0, 14
   %c = icmp eq i32 %s, 0
@@ -88,9 +85,8 @@ define i32 @shl_const_self_select(i32 %a0, i32 %a1, i32 %a2, i32 %a3) {
 define i32 @ashr_const1(i32 %a0, i32 %a1, i32 %a2, i32 %a3) {
 ; CHECK-LABEL: ashr_const1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movl %edx, %eax
-; CHECK-NEXT:    sarl %edi, %edx
-; CHECK-NEXT:    cmovnel %ecx, %eax
+; CHECK-NEXT:    sarl %edi, %eax
+; CHECK-NEXT:    cmovel %edx, %ecx, %eax
 ; CHECK-NEXT:    retq
   %s = ashr i32 %a0, 1
   %c = icmp eq i32 %s, 0
@@ -102,9 +98,8 @@ define i32 @ashr_const1(i32 %a0, i32 %a1, i32 %a2, i32 %a3) {
 define i32 @lshr_const1(i32 %a0, i32 %a1, i32 %a2, i32 %a3) {
 ; CHECK-LABEL: lshr_const1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movl %edx, %eax
 ; CHECK-NEXT:    testl $-2, %edi
-; CHECK-NEXT:    cmovnel %ecx, %eax
+; CHECK-NEXT:    cmovel %edx, %ecx, %eax
 ; CHECK-NEXT:    retq
   %s = lshr i32 %a0, 1
   %c = icmp eq i32 %s, 0
@@ -116,9 +111,8 @@ define i32 @lshr_const1(i32 %a0, i32 %a1, i32 %a2, i32 %a3) {
 define i32 @shl_const1(i32 %a0, i32 %a1, i32 %a2, i32 %a3) {
 ; CHECK-LABEL: shl_const1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movl %edx, %eax
 ; CHECK-NEXT:    testl $2147483647, %edi # imm = 0x7FFFFFFF
-; CHECK-NEXT:    cmovnel %ecx, %eax
+; CHECK-NEXT:    cmovel %edx, %ecx, %eax
 ; CHECK-NEXT:    retq
   %s = shl i32 %a0, 1
   %c = icmp eq i32 %s, 0
diff --git a/llvm/test/CodeGen/X86/apx/sub.ll b/llvm/test/CodeGen/X86/apx/sub.ll
index 4b0bd14872141b..be0914c90b9faf 100644
--- a/llvm/test/CodeGen/X86/apx/sub.ll
+++ b/llvm/test/CodeGen/X86/apx/sub.ll
@@ -299,10 +299,10 @@ declare i64 @llvm.usub.sat.i64(i64, i64)
 define i8 @subflag8rr(i8 noundef %a, i8 noundef %b) {
 ; CHECK-LABEL: subflag8rr:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    xorl %ecx, %ecx # encoding: [0x31,0xc9]
-; CHECK-NEXT:    subb %sil, %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0x28,0xf7]
-; CHECK-NEXT:    movzbl %al, %eax # encoding: [0x0f,0xb6,0xc0]
-; CHECK-NEXT:    cmovbl %ecx, %eax # encoding: [0x0f,0x42,0xc1]
+; CHECK-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
+; CHECK-NEXT:    subb %sil, %dil, %cl # encoding: [0x62,0xf4,0x74,0x18,0x28,0xf7]
+; CHECK-NEXT:    movzbl %cl, %ecx # encoding: [0x0f,0xb6,0xc9]
+; CHECK-NEXT:    cmovael %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc1]
 ; CHECK-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
 entry:
@@ -313,9 +313,9 @@ entry:
 define i16 @subflag16rr(i16 noundef %a, i16 noundef %b) {
 ; CHECK-LABEL: subflag16rr:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    xorl %ecx, %ecx # encoding: [0x31,0xc9]
-; CHECK-NEXT:    subw %si, %di, %ax # encoding: [0x62,0xf4,0x7d,0x18,0x29,0xf7]
-; CHECK-NEXT:    cmovbl %ecx, %eax # encoding: [0x0f,0x42,0xc1]
+; CHECK-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
+; CHECK-NEXT:    subw %si, %di, %cx # encoding: [0x62,0xf4,0x75,0x18,0x29,0xf7]
+; CHECK-NEXT:    cmovael %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc1]
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
 entry:
@@ -326,9 +326,9 @@ entry:
 define i32 @subflag32rr(i32 noundef %a, i32 noundef %b) {
 ; CHECK-LABEL: subflag32rr:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    xorl %ecx, %ecx # encoding: [0x31,0xc9]
-; CHECK-NEXT:    subl %esi, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x29,0xf7]
-; CHECK-NEXT:    cmovbl %ecx, %eax # encoding: [0x0f,0x42,0xc1]
+; CHECK-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
+; CHECK-NEXT:    subl %esi, %edi, %ecx # encoding: [0x62,0xf4,0x74,0x18,0x29,0xf7]
+; CHECK-NEXT:    cmovael %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc1]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
 entry:
     %sub = call i32 @llvm.usub.sat.i32(i32 %a, i32 %b)
@@ -340,7 +340,7 @@ define i64 @subflag64rr(i64 noundef %a, i64 noundef %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
 ; CHECK-NEXT:    subq %rsi, %rdi, %rcx # encoding: [0x62,0xf4,0xf4,0x18,0x29,0xf7]
-; CHECK-NEXT:    cmovaeq %rcx, %rax # encoding: [0x48,0x0f,0x43,0xc1]
+; CHECK-NEXT:    cmovaeq %rcx, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x43,0xc1]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
 entry:
     %sub = call i64 @llvm.usub.sat.i64(i64 %a, i64 %b)
@@ -350,10 +350,10 @@ entry:
 define i8 @subflag8rm(i8 noundef %a, ptr %b) {
 ; CHECK-LABEL: subflag8rm:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    xorl %ecx, %ecx # encoding: [0x31,0xc9]
-; CHECK-NEXT:    subb (%rsi), %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0x2a,0x3e]
-; CHECK-NEXT:    movzbl %al, %eax # encoding: [0x0f,0xb6,0xc0]
-; CHECK-NEXT:    cmovbl %ecx, %eax # encoding: [0x0f,0x42,0xc1]
+; CHECK-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
+; CHECK-NEXT:    subb (%rsi), %dil, %cl # encoding: [0x62,0xf4,0x74,0x18,0x2a,0x3e]
+; CHECK-NEXT:    movzbl %cl, %ecx # encoding: [0x0f,0xb6,0xc9]
+; CHECK-NEXT:    cmovael %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc1]
 ; CHECK-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
 entry:
@@ -365,9 +365,9 @@ entry:
 define i16 @subflag16rm(i16 noundef %a, ptr %b) {
 ; CHECK-LABEL: subflag16rm:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    xorl %ecx, %ecx # encoding: [0x31,0xc9]
-; CHECK-NEXT:    subw (%rsi), %di, %ax # encoding: [0x62,0xf4,0x7d,0x18,0x2b,0x3e]
-; CHECK-NEXT:    cmovbl %ecx, %eax # encoding: [0x0f,0x42,0xc1]
+; CHECK-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
+; CHECK-NEXT:    subw (%rsi), %di, %cx # encoding: [0x62,0xf4,0x75,0x18,0x2b,0x3e]
+; CHECK-NEXT:    cmovael %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc1]
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
 entry:
@@ -379,9 +379,9 @@ entry:
 define i32 @subflag32rm(i32 noundef %a, ptr %b) {
 ; CHECK-LABEL: subflag32rm:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    xorl %ecx, %ecx # encoding: [0x31,0xc9]
-; CHECK-NEXT:    subl (%rsi), %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x2b,0x3e]
-; CHECK-NEXT:    cmovbl %ecx, %eax # encoding: [0x0f,0x42,0xc1]
+; CHECK-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
+; CHECK-NEXT:    subl (%rsi), %edi, %ecx # encoding: [0x62,0xf4,0x74,0x18,0x2b,0x3e]
+; CHECK-NEXT:    cmovael %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc1]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
 entry:
     %t = load i32, ptr %b
@@ -394,7 +394,7 @@ define i64 @subflag64rm(i64 noundef %a, ptr %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
 ; CHECK-NEXT:    subq (%rsi), %rdi, %rcx # encoding: [0x62,0xf4,0xf4,0x18,0x2b,0x3e]
-; CHECK-NEXT:    cmovaeq %rcx, %rax # encoding: [0x48,0x0f,0x43,0xc1]
+; CHECK-NEXT:    cmovaeq %rcx, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x43,0xc1]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
 entry:
     %t = load i64, ptr %b
@@ -405,9 +405,9 @@ entry:
 define i16 @subflag16ri8(i16 noundef %a) {
 ; CHECK-LABEL: subflag16ri8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    xorl %ecx, %ecx # encoding: [0x31,0xc9]
-; CHECK-NEXT:    subw $123, %di, %ax # encoding: [0x62,0xf4,0x7d,0x18,0x83,0xef,0x7b]
-; CHECK-NEXT:    cmovbl %ecx, %eax # encoding: [0x0f,0x42,0xc1]
+; CHECK-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
+; CHECK-NEXT:    subw $123, %di, %cx # encoding: [0x62,0xf4,0x75,0x18,0x83,0xef,0x7b]
+; CHECK-NEXT:    cmovael %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc1]
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
 entry:
@@ -418,9 +418,9 @@ entry:
 define i32 @subflag32ri8(i32 noundef %a) {
 ; CHECK-LABEL: subflag32ri8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    xorl %ecx, %ecx # encoding: [0x31,0xc9]
-; CHECK-NEXT:    subl $123, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x83,0xef,0x7b]
-; CHECK-NEXT:    cmovbl %ecx, %eax # encoding: [0x0f,0x42,0xc1]
+; CHECK-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
+; CHECK-NEXT:    subl $123, %edi, %ecx # encoding: [0x62,0xf4,0x74,0x18,0x83,0xef,0x7b]
+; CHECK-NEXT:    cmovael %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc1]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
 entry:
     %sub = call i32 @llvm.usub.sat.i32(i32 %a, i32 123)
@@ -432,7 +432,7 @@ define i64 @subflag64ri8(i64 noundef %a) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
 ; CHECK-NEXT:    subq $123, %rdi, %rcx # encoding: [0x62,0xf4,0xf4,0x18,0x83,0xef,0x7b]
-; CHECK-NEXT:    cmovaeq %rcx, %rax # encoding: [0x48,0x0f,0x43,0xc1]
+; CHECK-NEXT:    cmovaeq %rcx, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x43,0xc1]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
 entry:
     %sub = call i64 @llvm.usub.sat.i64(i64 %a, i64 123)
@@ -442,10 +442,10 @@ entry:
 define i8 @subflag8ri(i8 noundef %a) {
 ; CHECK-LABEL: subflag8ri:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    xorl %ecx, %ecx # encoding: [0x31,0xc9]
-; CHECK-NEXT:    subb $123, %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0x80,0xef,0x7b]
-; CHECK-NEXT:    movzbl %al, %eax # encoding: [0x0f,0xb6,0xc0]
-; CHECK-NEXT:    cmovbl %ecx, %eax # encoding: [0x0f,0x42,0xc1]
+; CHECK-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
+; CHECK-NEXT:    subb $123, %dil, %cl # encoding: [0x62,0xf4,0x74,0x18,0x80,0xef,0x7b]
+; CHECK-NEXT:    movzbl %cl, %ecx # encoding: [0x0f,0xb6,0xc9]
+; CHECK-NEXT:    cmovael %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc1]
 ; CHECK-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
 entry:
@@ -456,10 +456,10 @@ entry:
 define i16 @subflag16ri(i16 noundef %a) {
 ; CHECK-LABEL: subflag16ri:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    xorl %ecx, %ecx # encoding: [0x31,0xc9]
-; CHECK-NEXT:    subw $1234, %di, %ax # encoding: [0x62,0xf4,0x7d,0x18,0x81,0xef,0xd2,0x04]
+; CHECK-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
+; CHECK-NEXT:    subw $1234, %di, %cx # encoding: [0x62,0xf4,0x75,0x18,0x81,0xef,0xd2,0x04]
 ; CHECK-NEXT:    # imm = 0x4D2
-; CHECK-NEXT:    cmovbl %ecx, %eax # encoding: [0x0f,0x42,0xc1]
+; CHECK-NEXT:    cmovael %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc1]
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq # encoding: [0xc3]
 entry:
@@ -470,10 +470,10 @@ entry:
 define i32 @subflag32ri(i32 noundef %a) {
 ; CHECK-LABEL: subflag32ri:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    xorl %ecx, %ecx # encoding: [0x31,0xc9]
-; CHECK-NEXT:    subl $123456, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x81,0xef,0x40,0xe2,0x01,0x00]
+; CHECK-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
+; CHECK-NEXT:    subl $123456, %edi, %ecx # encoding: [0x62,0xf4,0x74,0x18,0x81,0xef,0x40,0xe2,0x01,0x00]
 ; CHECK-NEXT:    # imm = 0x1E240
-; CHECK-NEXT:    cmovbl %ecx, %eax # encoding: [0x0f,0x42,0xc1]
+; CHECK-NEXT:    cmovael %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc1]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
 entry:
     %sub = call i32 @llvm.usub.sat.i32(i32 %a, i32 123456)
@@ -486,7 +486,7 @@ define i64 @subflag64ri(i64 noundef %a) {
 ; CHECK-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
 ; CHECK-NEXT:    subq $123456, %rdi, %rcx # encoding: [0x62,0xf4,0xf4,0x18,0x81,0xef,0x40,0xe2,0x01,0x00]
 ; CHECK-NEXT:    # imm = 0x1E240
-; CHECK-NEXT:    cmovaeq %rcx, %rax # encoding: [0x48,0x0f,0x43,0xc1]
+; CHECK-NEXT:    cmovaeq %rcx, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x43,0xc1]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
 entry:
     %sub = call i64 @llvm.usub.sat.i64(i64 %a, i64 123456)
diff --git a/llvm/test/CodeGen/X86/asm-dialect-module.ll b/llvm/test/CodeGen/X86/asm-dialect-module.ll
new file mode 100644
index 00000000000000..2c00a44424c2c3
--- /dev/null
+++ b/llvm/test/CodeGen/X86/asm-dialect-module.ll
@@ -0,0 +1,10 @@
+;; Test that we respect the assembler dialect when parsing module-level inline asm.
+; RUN: not llc < %s -mtriple=x86_64 2>&1 | FileCheck %s --check-prefix=ERR
+; RUN: llc < %s -mtriple=x86_64 -x86-asm-syntax=intel | FileCheck %s
+
+; ERR: <inline asm>:1:1: error: unknown use of instruction mnemonic without a size suffix
+
+; CHECK: .intel_syntax noprefix
+; CHECK: mov eax, eax
+
+module asm "mov eax, eax"
diff --git a/llvm/test/CodeGen/X86/avgceils.ll b/llvm/test/CodeGen/X86/avgceils.ll
new file mode 100644
index 00000000000000..4529ea275df9c7
--- /dev/null
+++ b/llvm/test/CodeGen/X86/avgceils.ll
@@ -0,0 +1,3821 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE4
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX,AVX512
+
+;
+; 128-bit vectors
+;
+
+define <16 x i8> @test_fixed_v16i8(<16 x i8> %a0, <16 x i8> %a1) {
+; SSE-LABEL: test_fixed_v16i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa %xmm0, %xmm2
+; SSE-NEXT:    por %xmm1, %xmm2
+; SSE-NEXT:    pxor %xmm1, %xmm0
+; SSE-NEXT:    psrlw $1, %xmm0
+; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; SSE-NEXT:    pxor %xmm1, %xmm0
+; SSE-NEXT:    psubb %xmm0, %xmm2
+; SSE-NEXT:    paddb %xmm1, %xmm2
+; SSE-NEXT:    movdqa %xmm2, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: test_fixed_v16i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm2
+; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm0
+; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsubb %xmm0, %xmm2, %xmm0
+; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_fixed_v16i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm2
+; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpsrlw $1, %xmm0, %xmm0
+; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpsubb %xmm0, %xmm2, %xmm0
+; AVX2-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_fixed_v16i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpor %xmm1, %xmm0, %xmm2
+; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpsrlw $1, %xmm0, %xmm0
+; AVX512-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; AVX512-NEXT:    vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0
+; AVX512-NEXT:    vpsubb %xmm0, %xmm2, %xmm0
+; AVX512-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+  %or = or <16 x i8> %a0, %a1
+  %xor = xor <16 x i8> %a0, %a1
+  %shift = ashr <16 x i8> %xor, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  %res = sub <16 x i8> %or, %shift
+  ret <16 x i8> %res
+}
+
+define <16 x i8> @test_ext_v16i8(<16 x i8> %a0, <16 x i8> %a1) {
+; SSE2-LABEL: test_ext_v16i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
+; SSE2-NEXT:    psraw $8, %xmm2
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; SSE2-NEXT:    psraw $8, %xmm3
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15]
+; SSE2-NEXT:    psraw $8, %xmm4
+; SSE2-NEXT:    paddw %xmm2, %xmm4
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT:    psraw $8, %xmm0
+; SSE2-NEXT:    paddw %xmm3, %xmm0
+; SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSE2-NEXT:    psubw %xmm1, %xmm4
+; SSE2-NEXT:    psubw %xmm1, %xmm0
+; SSE2-NEXT:    psrlw $1, %xmm0
+; SSE2-NEXT:    psrlw $1, %xmm4
+; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; SSE2-NEXT:    pand %xmm1, %xmm4
+; SSE2-NEXT:    pand %xmm1, %xmm0
+; SSE2-NEXT:    packuswb %xmm4, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_ext_v16i8:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    pmovsxbw %xmm0, %xmm2
+; SSE4-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; SSE4-NEXT:    pmovsxbw %xmm0, %xmm3
+; SSE4-NEXT:    pmovsxbw %xmm1, %xmm0
+; SSE4-NEXT:    paddw %xmm2, %xmm0
+; SSE4-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; SSE4-NEXT:    pmovsxbw %xmm1, %xmm1
+; SSE4-NEXT:    paddw %xmm3, %xmm1
+; SSE4-NEXT:    pcmpeqd %xmm2, %xmm2
+; SSE4-NEXT:    psubw %xmm2, %xmm0
+; SSE4-NEXT:    psubw %xmm2, %xmm1
+; SSE4-NEXT:    psrlw $1, %xmm1
+; SSE4-NEXT:    psrlw $1, %xmm0
+; SSE4-NEXT:    pmovzxbw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; SSE4-NEXT:    pand %xmm2, %xmm0
+; SSE4-NEXT:    pand %xmm2, %xmm1
+; SSE4-NEXT:    packuswb %xmm1, %xmm0
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: test_ext_v16i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpmovsxbw %xmm0, %xmm2
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX1-NEXT:    vpmovsxbw %xmm0, %xmm0
+; AVX1-NEXT:    vpmovsxbw %xmm1, %xmm3
+; AVX1-NEXT:    vpaddw %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; AVX1-NEXT:    vpmovsxbw %xmm1, %xmm1
+; AVX1-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpsubw %xmm1, %xmm2, %xmm2
+; AVX1-NEXT:    vpsubw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlw $1, %xmm2, %xmm1
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpackuswb %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_ext_v16i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
+; AVX2-NEXT:    vpmovsxbw %xmm1, %ymm1
+; AVX2-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX2-NEXT:    vpsubw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrlw $1, %ymm0, %ymm0
+; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_ext_v16i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovsxbw %xmm0, %ymm0
+; AVX512-NEXT:    vpmovsxbw %xmm1, %ymm1
+; AVX512-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX512-NEXT:    vpsubw %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpsrlw $1, %ymm0, %ymm0
+; AVX512-NEXT:    vpmovwb %ymm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %x0 = sext <16 x i8> %a0 to <16 x i16>
+  %x1 = sext <16 x i8> %a1 to <16 x i16>
+  %sum = add <16 x i16> %x0, %x1
+  %inc = add <16 x i16> %sum, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %shift = ashr <16 x i16> %inc, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %res = trunc <16 x i16> %shift to <16 x i8>
+  ret <16 x i8> %res
+}
+
+define <8 x i16> @test_fixed_v8i16(<8 x i16> %a0, <8 x i16> %a1) {
+; SSE-LABEL: test_fixed_v8i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa %xmm0, %xmm2
+; SSE-NEXT:    por %xmm1, %xmm2
+; SSE-NEXT:    pxor %xmm0, %xmm1
+; SSE-NEXT:    psraw $1, %xmm1
+; SSE-NEXT:    psubw %xmm1, %xmm2
+; SSE-NEXT:    movdqa %xmm2, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test_fixed_v8i16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm2
+; AVX-NEXT:    vpxor %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vpsraw $1, %xmm0, %xmm0
+; AVX-NEXT:    vpsubw %xmm0, %xmm2, %xmm0
+; AVX-NEXT:    retq
+  %or = or <8 x i16> %a0, %a1
+  %xor = xor <8 x i16> %a1, %a0
+  %shift = ashr <8 x i16> %xor, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %res = sub <8 x i16> %or, %shift
+  ret <8 x i16> %res
+}
+
+define <8 x i16> @test_ext_v8i16(<8 x i16> %a0, <8 x i16> %a1) {
+; SSE2-LABEL: test_ext_v8i16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSE2-NEXT:    psrad $16, %xmm2
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; SSE2-NEXT:    psrad $16, %xmm3
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT:    psrad $16, %xmm0
+; SSE2-NEXT:    paddd %xmm2, %xmm0
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    psrad $16, %xmm1
+; SSE2-NEXT:    paddd %xmm3, %xmm1
+; SSE2-NEXT:    pcmpeqd %xmm2, %xmm2
+; SSE2-NEXT:    psubd %xmm2, %xmm0
+; SSE2-NEXT:    psubd %xmm2, %xmm1
+; SSE2-NEXT:    pslld $15, %xmm1
+; SSE2-NEXT:    psrad $16, %xmm1
+; SSE2-NEXT:    pslld $15, %xmm0
+; SSE2-NEXT:    psrad $16, %xmm0
+; SSE2-NEXT:    packssdw %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_ext_v8i16:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    pmovsxwd %xmm0, %xmm2
+; SSE4-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; SSE4-NEXT:    pmovsxwd %xmm0, %xmm3
+; SSE4-NEXT:    pmovsxwd %xmm1, %xmm0
+; SSE4-NEXT:    paddd %xmm2, %xmm0
+; SSE4-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; SSE4-NEXT:    pmovsxwd %xmm1, %xmm1
+; SSE4-NEXT:    paddd %xmm3, %xmm1
+; SSE4-NEXT:    pcmpeqd %xmm2, %xmm2
+; SSE4-NEXT:    psubd %xmm2, %xmm0
+; SSE4-NEXT:    psubd %xmm2, %xmm1
+; SSE4-NEXT:    psrld $1, %xmm1
+; SSE4-NEXT:    psrld $1, %xmm0
+; SSE4-NEXT:    pxor %xmm2, %xmm2
+; SSE4-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
+; SSE4-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
+; SSE4-NEXT:    packusdw %xmm1, %xmm0
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: test_ext_v8i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm2
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm0
+; AVX1-NEXT:    vpmovsxwd %xmm1, %xmm3
+; AVX1-NEXT:    vpaddd %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; AVX1-NEXT:    vpmovsxwd %xmm1, %xmm1
+; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpsubd %xmm1, %xmm2, %xmm2
+; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrld $1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrld $1, %xmm2, %xmm1
+; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
+; AVX1-NEXT:    vpackusdw %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_ext_v8i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0
+; AVX2-NEXT:    vpmovsxwd %xmm1, %ymm1
+; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX2-NEXT:    vpsubd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrld $1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_ext_v8i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovsxwd %xmm0, %ymm0
+; AVX512-NEXT:    vpmovsxwd %xmm1, %ymm1
+; AVX512-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX512-NEXT:    vpsubd %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpsrld $1, %ymm0, %ymm0
+; AVX512-NEXT:    vpmovdw %ymm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %x0 = sext <8 x i16> %a0 to <8 x i32>
+  %x1 = sext <8 x i16> %a1 to <8 x i32>
+  %sum = add <8 x i32> %x0, %x1
+  %inc = add <8 x i32> %sum, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %shift = ashr <8 x i32> %inc, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %res = trunc <8 x i32> %shift to <8 x i16>
+  ret <8 x i16> %res
+}
+
+define <4 x i32> @test_fixed_v4i32(<4 x i32> %a0, <4 x i32> %a1) {
+; SSE-LABEL: test_fixed_v4i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa %xmm0, %xmm2
+; SSE-NEXT:    por %xmm1, %xmm2
+; SSE-NEXT:    pxor %xmm0, %xmm1
+; SSE-NEXT:    psrad $1, %xmm1
+; SSE-NEXT:    psubd %xmm1, %xmm2
+; SSE-NEXT:    movdqa %xmm2, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test_fixed_v4i32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm2
+; AVX-NEXT:    vpxor %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vpsrad $1, %xmm0, %xmm0
+; AVX-NEXT:    vpsubd %xmm0, %xmm2, %xmm0
+; AVX-NEXT:    retq
+  %or = or <4 x i32> %a0, %a1
+  %xor = xor <4 x i32> %a1, %a0
+  %shift = ashr <4 x i32> %xor, <i32 1, i32 1, i32 1, i32 1>
+  %res = sub <4 x i32> %or, %shift
+  ret <4 x i32> %res
+}
+
+define <4 x i32> @test_ext_v4i32(<4 x i32> %a0, <4 x i32> %a1) {
+; SSE2-LABEL: test_ext_v4i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
+; SSE2-NEXT:    pxor %xmm3, %xmm3
+; SSE2-NEXT:    pxor %xmm4, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm4
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
+; SSE2-NEXT:    pxor %xmm4, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm4
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[2,3,2,3]
+; SSE2-NEXT:    pxor %xmm5, %xmm5
+; SSE2-NEXT:    pcmpgtd %xmm4, %xmm5
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
+; SSE2-NEXT:    paddq %xmm2, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm3
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; SSE2-NEXT:    paddq %xmm1, %xmm0
+; SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSE2-NEXT:    psubq %xmm1, %xmm4
+; SSE2-NEXT:    psubq %xmm1, %xmm0
+; SSE2-NEXT:    psrlq $1, %xmm0
+; SSE2-NEXT:    psrlq $1, %xmm4
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm4[0,2]
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_ext_v4i32:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
+; SSE4-NEXT:    pmovsxdq %xmm2, %xmm2
+; SSE4-NEXT:    pmovsxdq %xmm0, %xmm3
+; SSE4-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE4-NEXT:    pmovsxdq %xmm0, %xmm4
+; SSE4-NEXT:    paddq %xmm2, %xmm4
+; SSE4-NEXT:    pmovsxdq %xmm1, %xmm0
+; SSE4-NEXT:    paddq %xmm3, %xmm0
+; SSE4-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSE4-NEXT:    psubq %xmm1, %xmm4
+; SSE4-NEXT:    psubq %xmm1, %xmm0
+; SSE4-NEXT:    psrlq $1, %xmm0
+; SSE4-NEXT:    psrlq $1, %xmm4
+; SSE4-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm4[0,2]
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: test_ext_v4i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
+; AVX1-NEXT:    vpmovsxdq %xmm2, %xmm2
+; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
+; AVX1-NEXT:    vpmovsxdq %xmm3, %xmm3
+; AVX1-NEXT:    vpaddq %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpmovsxdq %xmm1, %xmm1
+; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpsubq %xmm1, %xmm2, %xmm2
+; AVX1-NEXT:    vpsubq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlq $1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlq $1, %xmm2, %xmm1
+; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_ext_v4i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmovsxdq %xmm0, %ymm0
+; AVX2-NEXT:    vpmovsxdq %xmm1, %ymm1
+; AVX2-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX2-NEXT:    vpsubq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrlq $1, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_ext_v4i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovsxdq %xmm0, %ymm0
+; AVX512-NEXT:    vpmovsxdq %xmm1, %ymm1
+; AVX512-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX512-NEXT:    vpsubq %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpsrlq $1, %ymm0, %ymm0
+; AVX512-NEXT:    vpmovqd %ymm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %x0 = sext <4 x i32> %a0 to <4 x i64>
+  %x1 = sext <4 x i32> %a1 to <4 x i64>
+  %sum = add <4 x i64> %x0, %x1
+  %inc = add <4 x i64> %sum, <i64 1, i64 1, i64 1, i64 1>
+  %shift = ashr <4 x i64> %inc, <i64 1, i64 1, i64 1, i64 1>
+  %res = trunc <4 x i64> %shift to <4 x i32>
+  ret <4 x i32> %res
+}
+
+define <2 x i64> @test_fixed_v2i64(<2 x i64> %a0, <2 x i64> %a1) {
+; SSE2-LABEL: test_fixed_v2i64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    por %xmm1, %xmm2
+; SSE2-NEXT:    pxor %xmm0, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3]
+; SSE2-NEXT:    psrad $1, %xmm0
+; SSE2-NEXT:    psrlq $1, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2-NEXT:    psubq %xmm1, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_fixed_v2i64:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    movdqa %xmm0, %xmm2
+; SSE4-NEXT:    por %xmm1, %xmm2
+; SSE4-NEXT:    pxor %xmm0, %xmm1
+; SSE4-NEXT:    movdqa %xmm1, %xmm0
+; SSE4-NEXT:    psrad $1, %xmm0
+; SSE4-NEXT:    psrlq $1, %xmm1
+; SSE4-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
+; SSE4-NEXT:    psubq %xmm1, %xmm2
+; SSE4-NEXT:    movdqa %xmm2, %xmm0
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: test_fixed_v2i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm2
+; AVX1-NEXT:    vpxor %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vpsrad $1, %xmm0, %xmm1
+; AVX1-NEXT:    vpsrlq $1, %xmm0, %xmm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; AVX1-NEXT:    vpsubq %xmm0, %xmm2, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_fixed_v2i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm2
+; AVX2-NEXT:    vpxor %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vpsrad $1, %xmm0, %xmm1
+; AVX2-NEXT:    vpsrlq $1, %xmm0, %xmm0
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; AVX2-NEXT:    vpsubq %xmm0, %xmm2, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_fixed_v2i64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpor %xmm1, %xmm0, %xmm2
+; AVX512-NEXT:    vpxor %xmm0, %xmm1, %xmm0
+; AVX512-NEXT:    vpsraq $1, %xmm0, %xmm0
+; AVX512-NEXT:    vpsubq %xmm0, %xmm2, %xmm0
+; AVX512-NEXT:    retq
+  %or = or <2 x i64> %a0, %a1
+  %xor = xor <2 x i64> %a1, %a0
+  %shift = ashr <2 x i64> %xor, <i64 1, i64 1>
+  %res = sub <2 x i64> %or, %shift
+  ret <2 x i64> %res
+}
+
+define <2 x i64> @test_ext_v2i64(<2 x i64> %a0, <2 x i64> %a1) {
+; SSE2-LABEL: test_ext_v2i64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movq %xmm0, %rax
+; SSE2-NEXT:    movq %rax, %rcx
+; SSE2-NEXT:    sarq $63, %rcx
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; SSE2-NEXT:    movq %xmm0, %rdx
+; SSE2-NEXT:    movq %rdx, %rsi
+; SSE2-NEXT:    sarq $63, %rsi
+; SSE2-NEXT:    movq %xmm1, %rdi
+; SSE2-NEXT:    movq %rdi, %r8
+; SSE2-NEXT:    sarq $63, %r8
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE2-NEXT:    movq %xmm0, %r9
+; SSE2-NEXT:    movq %r9, %r10
+; SSE2-NEXT:    sarq $63, %r10
+; SSE2-NEXT:    addq %r9, %rdx
+; SSE2-NEXT:    adcq %rsi, %r10
+; SSE2-NEXT:    addq %rdi, %rax
+; SSE2-NEXT:    adcq %rcx, %r8
+; SSE2-NEXT:    addq $1, %rax
+; SSE2-NEXT:    adcq $0, %r8
+; SSE2-NEXT:    addq $1, %rdx
+; SSE2-NEXT:    adcq $0, %r10
+; SSE2-NEXT:    shldq $63, %rdx, %r10
+; SSE2-NEXT:    shldq $63, %rax, %r8
+; SSE2-NEXT:    movq %r8, %xmm0
+; SSE2-NEXT:    movq %r10, %xmm1
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_ext_v2i64:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    pextrq $1, %xmm0, %rax
+; SSE4-NEXT:    movq %rax, %rcx
+; SSE4-NEXT:    sarq $63, %rcx
+; SSE4-NEXT:    movq %xmm0, %rdx
+; SSE4-NEXT:    movq %rdx, %rsi
+; SSE4-NEXT:    sarq $63, %rsi
+; SSE4-NEXT:    pextrq $1, %xmm1, %rdi
+; SSE4-NEXT:    movq %rdi, %r8
+; SSE4-NEXT:    sarq $63, %r8
+; SSE4-NEXT:    movq %xmm1, %r9
+; SSE4-NEXT:    movq %r9, %r10
+; SSE4-NEXT:    sarq $63, %r10
+; SSE4-NEXT:    addq %r9, %rdx
+; SSE4-NEXT:    adcq %rsi, %r10
+; SSE4-NEXT:    addq %rdi, %rax
+; SSE4-NEXT:    adcq %rcx, %r8
+; SSE4-NEXT:    addq $1, %rax
+; SSE4-NEXT:    adcq $0, %r8
+; SSE4-NEXT:    addq $1, %rdx
+; SSE4-NEXT:    adcq $0, %r10
+; SSE4-NEXT:    shldq $63, %rdx, %r10
+; SSE4-NEXT:    shldq $63, %rax, %r8
+; SSE4-NEXT:    movq %r8, %xmm1
+; SSE4-NEXT:    movq %r10, %xmm0
+; SSE4-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE4-NEXT:    retq
+;
+; AVX-LABEL: test_ext_v2i64:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX-NEXT:    movq %rax, %rcx
+; AVX-NEXT:    sarq $63, %rcx
+; AVX-NEXT:    vmovq %xmm0, %rdx
+; AVX-NEXT:    movq %rdx, %rsi
+; AVX-NEXT:    sarq $63, %rsi
+; AVX-NEXT:    vpextrq $1, %xmm1, %rdi
+; AVX-NEXT:    movq %rdi, %r8
+; AVX-NEXT:    sarq $63, %r8
+; AVX-NEXT:    vmovq %xmm1, %r9
+; AVX-NEXT:    movq %r9, %r10
+; AVX-NEXT:    sarq $63, %r10
+; AVX-NEXT:    addq %r9, %rdx
+; AVX-NEXT:    adcq %rsi, %r10
+; AVX-NEXT:    addq %rdi, %rax
+; AVX-NEXT:    adcq %rcx, %r8
+; AVX-NEXT:    addq $1, %rax
+; AVX-NEXT:    adcq $0, %r8
+; AVX-NEXT:    addq $1, %rdx
+; AVX-NEXT:    adcq $0, %r10
+; AVX-NEXT:    shldq $63, %rdx, %r10
+; AVX-NEXT:    shldq $63, %rax, %r8
+; AVX-NEXT:    vmovq %r8, %xmm0
+; AVX-NEXT:    vmovq %r10, %xmm1
+; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX-NEXT:    retq
+  %x0 = sext <2 x i64> %a0 to <2 x i128>
+  %x1 = sext <2 x i64> %a1 to <2 x i128>
+  %sum = add <2 x i128> %x0, %x1
+  %inc = add <2 x i128> %sum, <i128 1, i128 1>
+  %shift = ashr <2 x i128> %inc, <i128 1, i128 1>
+  %res = trunc <2 x i128> %shift to <2 x i64>
+  ret <2 x i64> %res
+}
+
+;
+; 256-bit vectors
+;
+
+define <32 x i8> @test_fixed_v32i8(<32 x i8> %a0, <32 x i8> %a1) {
+; SSE-LABEL: test_fixed_v32i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa %xmm1, %xmm4
+; SSE-NEXT:    por %xmm3, %xmm4
+; SSE-NEXT:    movdqa %xmm0, %xmm5
+; SSE-NEXT:    por %xmm2, %xmm5
+; SSE-NEXT:    pxor %xmm2, %xmm0
+; SSE-NEXT:    pxor %xmm3, %xmm1
+; SSE-NEXT:    psrlw $1, %xmm1
+; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; SSE-NEXT:    pand %xmm2, %xmm1
+; SSE-NEXT:    movdqa {{.*#+}} xmm3 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; SSE-NEXT:    pxor %xmm3, %xmm1
+; SSE-NEXT:    psubb %xmm1, %xmm4
+; SSE-NEXT:    psrlw $1, %xmm0
+; SSE-NEXT:    pand %xmm2, %xmm0
+; SSE-NEXT:    pxor %xmm3, %xmm0
+; SSE-NEXT:    psubb %xmm0, %xmm5
+; SSE-NEXT:    paddb %xmm3, %xmm5
+; SSE-NEXT:    paddb %xmm3, %xmm4
+; SSE-NEXT:    movdqa %xmm5, %xmm0
+; SSE-NEXT:    movdqa %xmm4, %xmm1
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: test_fixed_v32i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm2
+; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm1
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm3 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm4 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; AVX1-NEXT:    vpxor %xmm4, %xmm1, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpxor %xmm4, %xmm0, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
+; AVX1-NEXT:    vpsubb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT:    vpaddb %xmm4, %xmm0, %xmm0
+; AVX1-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vpaddb %xmm4, %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_fixed_v32i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm2
+; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrlw $1, %ymm0, %ymm0
+; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsubb %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_fixed_v32i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm2
+; AVX512-NEXT:    vpxor %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpsrlw $1, %ymm0, %ymm0
+; AVX512-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; AVX512-NEXT:    vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm0
+; AVX512-NEXT:    vpsubb %ymm0, %ymm2, %ymm0
+; AVX512-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    retq
+  %or = or <32 x i8> %a0, %a1
+  %xor = xor <32 x i8> %a0, %a1
+  %shift = ashr <32 x i8> %xor, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  %res = sub <32 x i8> %or, %shift
+  ret <32 x i8> %res
+}
+
+define <32 x i8> @test_ext_v32i8(<32 x i8> %a0, <32 x i8> %a1) {
+; SSE2-LABEL: test_ext_v32i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15]
+; SSE2-NEXT:    psraw $8, %xmm5
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3],xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
+; SSE2-NEXT:    psraw $8, %xmm6
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm1[8],xmm7[9],xmm1[9],xmm7[10],xmm1[10],xmm7[11],xmm1[11],xmm7[12],xmm1[12],xmm7[13],xmm1[13],xmm7[14],xmm1[14],xmm7[15],xmm1[15]
+; SSE2-NEXT:    psraw $8, %xmm7
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3],xmm8[4],xmm1[4],xmm8[5],xmm1[5],xmm8[6],xmm1[6],xmm8[7],xmm1[7]
+; SSE2-NEXT:    psraw $8, %xmm8
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15]
+; SSE2-NEXT:    psraw $8, %xmm4
+; SSE2-NEXT:    paddw %xmm5, %xmm4
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; SSE2-NEXT:    psraw $8, %xmm0
+; SSE2-NEXT:    paddw %xmm6, %xmm0
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
+; SSE2-NEXT:    psraw $8, %xmm2
+; SSE2-NEXT:    paddw %xmm7, %xmm2
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
+; SSE2-NEXT:    psraw $8, %xmm1
+; SSE2-NEXT:    paddw %xmm8, %xmm1
+; SSE2-NEXT:    pcmpeqd %xmm3, %xmm3
+; SSE2-NEXT:    psubw %xmm3, %xmm4
+; SSE2-NEXT:    psubw %xmm3, %xmm0
+; SSE2-NEXT:    psubw %xmm3, %xmm2
+; SSE2-NEXT:    psubw %xmm3, %xmm1
+; SSE2-NEXT:    psrlw $1, %xmm1
+; SSE2-NEXT:    psrlw $1, %xmm2
+; SSE2-NEXT:    psrlw $1, %xmm0
+; SSE2-NEXT:    psrlw $1, %xmm4
+; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; SSE2-NEXT:    pand %xmm3, %xmm4
+; SSE2-NEXT:    pand %xmm3, %xmm0
+; SSE2-NEXT:    packuswb %xmm4, %xmm0
+; SSE2-NEXT:    pand %xmm3, %xmm2
+; SSE2-NEXT:    pand %xmm3, %xmm1
+; SSE2-NEXT:    packuswb %xmm2, %xmm1
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_ext_v32i8:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    pmovsxbw %xmm0, %xmm4
+; SSE4-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; SSE4-NEXT:    pmovsxbw %xmm0, %xmm5
+; SSE4-NEXT:    pmovsxbw %xmm1, %xmm6
+; SSE4-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE4-NEXT:    pmovsxbw %xmm0, %xmm7
+; SSE4-NEXT:    pmovsxbw %xmm2, %xmm0
+; SSE4-NEXT:    paddw %xmm4, %xmm0
+; SSE4-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
+; SSE4-NEXT:    pmovsxbw %xmm1, %xmm2
+; SSE4-NEXT:    paddw %xmm5, %xmm2
+; SSE4-NEXT:    pmovsxbw %xmm3, %xmm1
+; SSE4-NEXT:    paddw %xmm6, %xmm1
+; SSE4-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
+; SSE4-NEXT:    pmovsxbw %xmm3, %xmm3
+; SSE4-NEXT:    paddw %xmm7, %xmm3
+; SSE4-NEXT:    pcmpeqd %xmm4, %xmm4
+; SSE4-NEXT:    psubw %xmm4, %xmm0
+; SSE4-NEXT:    psubw %xmm4, %xmm2
+; SSE4-NEXT:    psubw %xmm4, %xmm1
+; SSE4-NEXT:    psubw %xmm4, %xmm3
+; SSE4-NEXT:    psrlw $1, %xmm3
+; SSE4-NEXT:    psrlw $1, %xmm1
+; SSE4-NEXT:    psrlw $1, %xmm2
+; SSE4-NEXT:    psrlw $1, %xmm0
+; SSE4-NEXT:    pmovzxbw {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; SSE4-NEXT:    pand %xmm4, %xmm0
+; SSE4-NEXT:    pand %xmm4, %xmm2
+; SSE4-NEXT:    packuswb %xmm2, %xmm0
+; SSE4-NEXT:    pand %xmm4, %xmm1
+; SSE4-NEXT:    pand %xmm4, %xmm3
+; SSE4-NEXT:    packuswb %xmm3, %xmm1
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: test_ext_v32i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpmovsxbw %xmm2, %xmm3
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
+; AVX1-NEXT:    vpmovsxbw %xmm2, %xmm2
+; AVX1-NEXT:    vpmovsxbw %xmm0, %xmm4
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX1-NEXT:    vpmovsxbw %xmm0, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
+; AVX1-NEXT:    vpmovsxbw %xmm5, %xmm6
+; AVX1-NEXT:    vpaddw %xmm6, %xmm3, %xmm3
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[2,3,2,3]
+; AVX1-NEXT:    vpmovsxbw %xmm5, %xmm5
+; AVX1-NEXT:    vpaddw %xmm5, %xmm2, %xmm2
+; AVX1-NEXT:    vpmovsxbw %xmm1, %xmm5
+; AVX1-NEXT:    vpaddw %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; AVX1-NEXT:    vpmovsxbw %xmm1, %xmm1
+; AVX1-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpsubw %xmm1, %xmm3, %xmm3
+; AVX1-NEXT:    vpsubw %xmm1, %xmm2, %xmm2
+; AVX1-NEXT:    vpsubw %xmm1, %xmm4, %xmm4
+; AVX1-NEXT:    vpsubw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlw $1, %xmm4, %xmm1
+; AVX1-NEXT:    vpsrlw $1, %xmm2, %xmm2
+; AVX1-NEXT:    vpsrlw $1, %xmm3, %xmm3
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT:    vpand %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
+; AVX1-NEXT:    vpackuswb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
+; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
+; AVX1-NEXT:    vpackuswb %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_ext_v32i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT:    vpmovsxbw %xmm2, %ymm2
+; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm3
+; AVX2-NEXT:    vpmovsxbw %xmm3, %ymm3
+; AVX2-NEXT:    vpaddw %ymm3, %ymm2, %ymm2
+; AVX2-NEXT:    vpmovsxbw %xmm1, %ymm1
+; AVX2-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX2-NEXT:    vpsubw %ymm1, %ymm2, %ymm2
+; AVX2-NEXT:    vpsubw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrlw $1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrlw $1, %ymm2, %ymm1
+; AVX2-NEXT:    vpbroadcastw {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_ext_v32i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovsxbw %ymm0, %zmm0
+; AVX512-NEXT:    vpmovsxbw %ymm1, %zmm1
+; AVX512-NEXT:    vpaddw %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
+; AVX512-NEXT:    vpsubw %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpsrlw $1, %zmm0, %zmm0
+; AVX512-NEXT:    vpmovwb %zmm0, %ymm0
+; AVX512-NEXT:    retq
+  %x0 = sext <32 x i8> %a0 to <32 x i16>
+  %x1 = sext <32 x i8> %a1 to <32 x i16>
+  %sum = add <32 x i16> %x0, %x1
+  %inc = add <32 x i16> %sum, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %shift = ashr <32 x i16> %inc, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %res = trunc <32 x i16> %shift to <32 x i8>
+  ret <32 x i8> %res
+}
+
+define <16 x i16> @test_fixed_v16i16(<16 x i16> %a0, <16 x i16> %a1) {
+; SSE-LABEL: test_fixed_v16i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa %xmm1, %xmm4
+; SSE-NEXT:    por %xmm3, %xmm4
+; SSE-NEXT:    movdqa %xmm0, %xmm5
+; SSE-NEXT:    por %xmm2, %xmm5
+; SSE-NEXT:    pxor %xmm0, %xmm2
+; SSE-NEXT:    pxor %xmm1, %xmm3
+; SSE-NEXT:    psraw $1, %xmm3
+; SSE-NEXT:    psubw %xmm3, %xmm4
+; SSE-NEXT:    psraw $1, %xmm2
+; SSE-NEXT:    psubw %xmm2, %xmm5
+; SSE-NEXT:    movdqa %xmm5, %xmm0
+; SSE-NEXT:    movdqa %xmm4, %xmm1
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: test_fixed_v16i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm2
+; AVX1-NEXT:    vxorps %ymm0, %ymm1, %ymm0
+; AVX1-NEXT:    vpsraw $1, %xmm0, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpsraw $1, %xmm0, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
+; AVX1-NEXT:    vpsubw %xmm0, %xmm3, %xmm0
+; AVX1-NEXT:    vpsubw %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_fixed_v16i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm2
+; AVX2-NEXT:    vpxor %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpsraw $1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsubw %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_fixed_v16i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm2
+; AVX512-NEXT:    vpxor %ymm0, %ymm1, %ymm0
+; AVX512-NEXT:    vpsraw $1, %ymm0, %ymm0
+; AVX512-NEXT:    vpsubw %ymm0, %ymm2, %ymm0
+; AVX512-NEXT:    retq
+  %or = or <16 x i16> %a0, %a1
+  %xor = xor <16 x i16> %a1, %a0
+  %shift = ashr <16 x i16> %xor, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %res = sub <16 x i16> %or, %shift
+  ret <16 x i16> %res
+}
+
+define <16 x i16> @test_ext_v16i16(<16 x i16> %a0, <16 x i16> %a1) {
+; SSE2-LABEL: test_ext_v16i16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
+; SSE2-NEXT:    psrad $16, %xmm4
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
+; SSE2-NEXT:    psrad $16, %xmm5
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3]
+; SSE2-NEXT:    psrad $16, %xmm6
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7]
+; SSE2-NEXT:    psrad $16, %xmm7
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; SSE2-NEXT:    psrad $16, %xmm1
+; SSE2-NEXT:    paddd %xmm4, %xmm1
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    psrad $16, %xmm3
+; SSE2-NEXT:    paddd %xmm5, %xmm3
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-NEXT:    psrad $16, %xmm0
+; SSE2-NEXT:    paddd %xmm6, %xmm0
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    psrad $16, %xmm2
+; SSE2-NEXT:    paddd %xmm7, %xmm2
+; SSE2-NEXT:    pcmpeqd %xmm4, %xmm4
+; SSE2-NEXT:    psubd %xmm4, %xmm1
+; SSE2-NEXT:    psubd %xmm4, %xmm3
+; SSE2-NEXT:    psubd %xmm4, %xmm0
+; SSE2-NEXT:    psubd %xmm4, %xmm2
+; SSE2-NEXT:    pslld $15, %xmm2
+; SSE2-NEXT:    psrad $16, %xmm2
+; SSE2-NEXT:    pslld $15, %xmm0
+; SSE2-NEXT:    psrad $16, %xmm0
+; SSE2-NEXT:    packssdw %xmm2, %xmm0
+; SSE2-NEXT:    pslld $15, %xmm3
+; SSE2-NEXT:    psrad $16, %xmm3
+; SSE2-NEXT:    pslld $15, %xmm1
+; SSE2-NEXT:    psrad $16, %xmm1
+; SSE2-NEXT:    packssdw %xmm3, %xmm1
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_ext_v16i16:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    pmovsxwd %xmm0, %xmm4
+; SSE4-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; SSE4-NEXT:    pmovsxwd %xmm0, %xmm5
+; SSE4-NEXT:    pmovsxwd %xmm1, %xmm6
+; SSE4-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE4-NEXT:    pmovsxwd %xmm0, %xmm7
+; SSE4-NEXT:    pmovsxwd %xmm2, %xmm0
+; SSE4-NEXT:    paddd %xmm4, %xmm0
+; SSE4-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
+; SSE4-NEXT:    pmovsxwd %xmm1, %xmm2
+; SSE4-NEXT:    paddd %xmm5, %xmm2
+; SSE4-NEXT:    pmovsxwd %xmm3, %xmm1
+; SSE4-NEXT:    paddd %xmm6, %xmm1
+; SSE4-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
+; SSE4-NEXT:    pmovsxwd %xmm3, %xmm3
+; SSE4-NEXT:    paddd %xmm7, %xmm3
+; SSE4-NEXT:    pcmpeqd %xmm4, %xmm4
+; SSE4-NEXT:    psubd %xmm4, %xmm0
+; SSE4-NEXT:    psubd %xmm4, %xmm2
+; SSE4-NEXT:    psubd %xmm4, %xmm1
+; SSE4-NEXT:    psubd %xmm4, %xmm3
+; SSE4-NEXT:    psrld $1, %xmm3
+; SSE4-NEXT:    psrld $1, %xmm1
+; SSE4-NEXT:    psrld $1, %xmm2
+; SSE4-NEXT:    psrld $1, %xmm0
+; SSE4-NEXT:    pxor %xmm4, %xmm4
+; SSE4-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2],xmm4[3],xmm0[4],xmm4[5],xmm0[6],xmm4[7]
+; SSE4-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2],xmm4[3],xmm2[4],xmm4[5],xmm2[6],xmm4[7]
+; SSE4-NEXT:    packusdw %xmm2, %xmm0
+; SSE4-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2],xmm4[3],xmm1[4],xmm4[5],xmm1[6],xmm4[7]
+; SSE4-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2],xmm4[3],xmm3[4],xmm4[5],xmm3[6],xmm4[7]
+; SSE4-NEXT:    packusdw %xmm3, %xmm1
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: test_ext_v16i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpmovsxwd %xmm2, %xmm3
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
+; AVX1-NEXT:    vpmovsxwd %xmm2, %xmm2
+; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm4
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
+; AVX1-NEXT:    vpmovsxwd %xmm5, %xmm6
+; AVX1-NEXT:    vpaddd %xmm6, %xmm3, %xmm3
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[2,3,2,3]
+; AVX1-NEXT:    vpmovsxwd %xmm5, %xmm5
+; AVX1-NEXT:    vpaddd %xmm5, %xmm2, %xmm2
+; AVX1-NEXT:    vpmovsxwd %xmm1, %xmm5
+; AVX1-NEXT:    vpaddd %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; AVX1-NEXT:    vpmovsxwd %xmm1, %xmm1
+; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpsubd %xmm1, %xmm3, %xmm3
+; AVX1-NEXT:    vpsubd %xmm1, %xmm2, %xmm2
+; AVX1-NEXT:    vpsubd %xmm1, %xmm4, %xmm4
+; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrld $1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrld $1, %xmm4, %xmm1
+; AVX1-NEXT:    vpsrld $1, %xmm2, %xmm2
+; AVX1-NEXT:    vpsrld $1, %xmm3, %xmm3
+; AVX1-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2],xmm4[3],xmm3[4],xmm4[5],xmm3[6],xmm4[7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2],xmm4[3],xmm2[4],xmm4[5],xmm2[6],xmm4[7]
+; AVX1-NEXT:    vpackusdw %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2],xmm4[3],xmm1[4],xmm4[5],xmm1[6],xmm4[7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2],xmm4[3],xmm0[4],xmm4[5],xmm0[6],xmm4[7]
+; AVX1-NEXT:    vpackusdw %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_ext_v16i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT:    vpmovsxwd %xmm2, %ymm2
+; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm3
+; AVX2-NEXT:    vpmovsxwd %xmm3, %ymm3
+; AVX2-NEXT:    vpaddd %ymm3, %ymm2, %ymm2
+; AVX2-NEXT:    vpmovsxwd %xmm1, %ymm1
+; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX2-NEXT:    vpsubd %ymm1, %ymm2, %ymm2
+; AVX2-NEXT:    vpsubd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrld $1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrld $1, %ymm2, %ymm1
+; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7],ymm1[8],ymm2[9],ymm1[10],ymm2[11],ymm1[12],ymm2[13],ymm1[14],ymm2[15]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7],ymm0[8],ymm2[9],ymm0[10],ymm2[11],ymm0[12],ymm2[13],ymm0[14],ymm2[15]
+; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_ext_v16i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovsxwd %ymm0, %zmm0
+; AVX512-NEXT:    vpmovsxwd %ymm1, %zmm1
+; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
+; AVX512-NEXT:    vpsubd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpsrld $1, %zmm0, %zmm0
+; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
+; AVX512-NEXT:    retq
+  %x0 = sext <16 x i16> %a0 to <16 x i32>
+  %x1 = sext <16 x i16> %a1 to <16 x i32>
+  %sum = add <16 x i32> %x0, %x1
+  %inc = add <16 x i32> %sum, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %shift = ashr <16 x i32> %inc, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %res = trunc <16 x i32> %shift to <16 x i16>
+  ret <16 x i16> %res
+}
+
+define <8 x i32> @test_fixed_v8i32(<8 x i32> %a0, <8 x i32> %a1) {
+; SSE-LABEL: test_fixed_v8i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa %xmm1, %xmm4
+; SSE-NEXT:    por %xmm3, %xmm4
+; SSE-NEXT:    movdqa %xmm0, %xmm5
+; SSE-NEXT:    por %xmm2, %xmm5
+; SSE-NEXT:    pxor %xmm0, %xmm2
+; SSE-NEXT:    pxor %xmm1, %xmm3
+; SSE-NEXT:    psrad $1, %xmm3
+; SSE-NEXT:    psubd %xmm3, %xmm4
+; SSE-NEXT:    psrad $1, %xmm2
+; SSE-NEXT:    psubd %xmm2, %xmm5
+; SSE-NEXT:    movdqa %xmm5, %xmm0
+; SSE-NEXT:    movdqa %xmm4, %xmm1
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: test_fixed_v8i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm2
+; AVX1-NEXT:    vxorps %ymm0, %ymm1, %ymm0
+; AVX1-NEXT:    vpsrad $1, %xmm0, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpsrad $1, %xmm0, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
+; AVX1-NEXT:    vpsubd %xmm0, %xmm3, %xmm0
+; AVX1-NEXT:    vpsubd %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_fixed_v8i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm2
+; AVX2-NEXT:    vpxor %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpsrad $1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsubd %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_fixed_v8i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm2
+; AVX512-NEXT:    vpxor %ymm0, %ymm1, %ymm0
+; AVX512-NEXT:    vpsrad $1, %ymm0, %ymm0
+; AVX512-NEXT:    vpsubd %ymm0, %ymm2, %ymm0
+; AVX512-NEXT:    retq
+  %or = or <8 x i32> %a0, %a1
+  %xor = xor <8 x i32> %a1, %a0
+  %shift = ashr <8 x i32> %xor, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %res = sub <8 x i32> %or, %shift
+  ret <8 x i32> %res
+}
+
+define <8 x i32> @test_ext_v8i32(<8 x i32> %a0, <8 x i32> %a1) {
+; SSE2-LABEL: test_ext_v8i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm0[2,3,2,3]
+; SSE2-NEXT:    pxor %xmm5, %xmm5
+; SSE2-NEXT:    pxor %xmm4, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm6, %xmm4
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1]
+; SSE2-NEXT:    pxor %xmm4, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm4
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm1[2,3,2,3]
+; SSE2-NEXT:    pxor %xmm4, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm7, %xmm4
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1]
+; SSE2-NEXT:    pxor %xmm4, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm4
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[2,3,2,3]
+; SSE2-NEXT:    pxor %xmm8, %xmm8
+; SSE2-NEXT:    pcmpgtd %xmm4, %xmm8
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1]
+; SSE2-NEXT:    paddq %xmm6, %xmm4
+; SSE2-NEXT:    pxor %xmm6, %xmm6
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm6
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1]
+; SSE2-NEXT:    paddq %xmm2, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
+; SSE2-NEXT:    pxor %xmm6, %xmm6
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm6
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1]
+; SSE2-NEXT:    paddq %xmm7, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm3, %xmm5
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1]
+; SSE2-NEXT:    paddq %xmm3, %xmm1
+; SSE2-NEXT:    pcmpeqd %xmm3, %xmm3
+; SSE2-NEXT:    psubq %xmm3, %xmm4
+; SSE2-NEXT:    psubq %xmm3, %xmm0
+; SSE2-NEXT:    psubq %xmm3, %xmm2
+; SSE2-NEXT:    psubq %xmm3, %xmm1
+; SSE2-NEXT:    psrlq $1, %xmm1
+; SSE2-NEXT:    psrlq $1, %xmm2
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; SSE2-NEXT:    psrlq $1, %xmm0
+; SSE2-NEXT:    psrlq $1, %xmm4
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm4[0,2]
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_ext_v8i32:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
+; SSE4-NEXT:    pmovsxdq %xmm4, %xmm5
+; SSE4-NEXT:    pmovsxdq %xmm0, %xmm6
+; SSE4-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE4-NEXT:    pmovsxdq %xmm0, %xmm7
+; SSE4-NEXT:    pmovsxdq %xmm1, %xmm8
+; SSE4-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
+; SSE4-NEXT:    pmovsxdq %xmm0, %xmm4
+; SSE4-NEXT:    paddq %xmm5, %xmm4
+; SSE4-NEXT:    pmovsxdq %xmm2, %xmm0
+; SSE4-NEXT:    paddq %xmm6, %xmm0
+; SSE4-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3]
+; SSE4-NEXT:    pmovsxdq %xmm1, %xmm2
+; SSE4-NEXT:    paddq %xmm7, %xmm2
+; SSE4-NEXT:    pmovsxdq %xmm3, %xmm1
+; SSE4-NEXT:    paddq %xmm8, %xmm1
+; SSE4-NEXT:    pcmpeqd %xmm3, %xmm3
+; SSE4-NEXT:    psubq %xmm3, %xmm4
+; SSE4-NEXT:    psubq %xmm3, %xmm0
+; SSE4-NEXT:    psubq %xmm3, %xmm2
+; SSE4-NEXT:    psubq %xmm3, %xmm1
+; SSE4-NEXT:    psrlq $1, %xmm1
+; SSE4-NEXT:    psrlq $1, %xmm2
+; SSE4-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; SSE4-NEXT:    psrlq $1, %xmm0
+; SSE4-NEXT:    psrlq $1, %xmm4
+; SSE4-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm4[0,2]
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: test_ext_v8i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpmovsxdq %xmm3, %xmm4
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
+; AVX1-NEXT:    vpmovsxdq %xmm3, %xmm3
+; AVX1-NEXT:    vpmovsxdq %xmm1, %xmm5
+; AVX1-NEXT:    vpaddq %xmm5, %xmm2, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
+; AVX1-NEXT:    vpmovsxdq %xmm5, %xmm6
+; AVX1-NEXT:    vpaddq %xmm6, %xmm4, %xmm4
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; AVX1-NEXT:    vpmovsxdq %xmm1, %xmm1
+; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm5[2,3,2,3]
+; AVX1-NEXT:    vpmovsxdq %xmm1, %xmm1
+; AVX1-NEXT:    vpaddq %xmm1, %xmm3, %xmm1
+; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
+; AVX1-NEXT:    vpsubq %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpsubq %xmm3, %xmm4, %xmm4
+; AVX1-NEXT:    vpsubq %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpsubq %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrlq $1, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrlq $1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlq $1, %xmm4, %xmm3
+; AVX1-NEXT:    vpsrlq $1, %xmm2, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm2[0,2],ymm0[0,2],ymm2[4,6],ymm0[4,6]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_ext_v8i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmovsxdq %xmm0, %ymm2
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT:    vpmovsxdq %xmm0, %ymm0
+; AVX2-NEXT:    vpmovsxdq %xmm1, %ymm3
+; AVX2-NEXT:    vpaddq %ymm3, %ymm2, %ymm2
+; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm1
+; AVX2-NEXT:    vpmovsxdq %xmm1, %ymm1
+; AVX2-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX2-NEXT:    vpsubq %ymm1, %ymm2, %ymm2
+; AVX2-NEXT:    vpsubq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm0[2,3]
+; AVX2-NEXT:    vpsrlq $1, %ymm1, %ymm1
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm2, %ymm0
+; AVX2-NEXT:    vpsrlq $1, %ymm0, %ymm0
+; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_ext_v8i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovsxdq %ymm0, %zmm0
+; AVX512-NEXT:    vpmovsxdq %ymm1, %zmm1
+; AVX512-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
+; AVX512-NEXT:    vpsubq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpsrlq $1, %zmm0, %zmm0
+; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
+; AVX512-NEXT:    retq
+  %x0 = sext <8 x i32> %a0 to <8 x i64>
+  %x1 = sext <8 x i32> %a1 to <8 x i64>
+  %sum = add <8 x i64> %x0, %x1
+  %inc = add <8 x i64> %sum, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
+  %shift = ashr <8 x i64> %inc, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
+  %res = trunc <8 x i64> %shift to <8 x i32>
+  ret <8 x i32> %res
+}
+
+define <4 x i64> @test_fixed_v4i64(<4 x i64> %a0, <4 x i64> %a1) {
+; SSE2-LABEL: test_fixed_v4i64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa %xmm1, %xmm4
+; SSE2-NEXT:    por %xmm3, %xmm4
+; SSE2-NEXT:    movdqa %xmm0, %xmm5
+; SSE2-NEXT:    por %xmm2, %xmm5
+; SSE2-NEXT:    pxor %xmm0, %xmm2
+; SSE2-NEXT:    pxor %xmm1, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,3,2,3]
+; SSE2-NEXT:    psrad $1, %xmm0
+; SSE2-NEXT:    psrlq $1, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2-NEXT:    psubq %xmm1, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,3,2,3]
+; SSE2-NEXT:    psrad $1, %xmm0
+; SSE2-NEXT:    psrlq $1, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2-NEXT:    psubq %xmm1, %xmm5
+; SSE2-NEXT:    movdqa %xmm5, %xmm0
+; SSE2-NEXT:    movdqa %xmm4, %xmm1
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_fixed_v4i64:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    movdqa %xmm1, %xmm4
+; SSE4-NEXT:    por %xmm3, %xmm4
+; SSE4-NEXT:    movdqa %xmm0, %xmm5
+; SSE4-NEXT:    por %xmm2, %xmm5
+; SSE4-NEXT:    pxor %xmm0, %xmm2
+; SSE4-NEXT:    pxor %xmm1, %xmm3
+; SSE4-NEXT:    movdqa %xmm3, %xmm0
+; SSE4-NEXT:    psrad $1, %xmm0
+; SSE4-NEXT:    psrlq $1, %xmm3
+; SSE4-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,3],xmm3[4,5],xmm0[6,7]
+; SSE4-NEXT:    psubq %xmm3, %xmm4
+; SSE4-NEXT:    movdqa %xmm2, %xmm0
+; SSE4-NEXT:    psrad $1, %xmm0
+; SSE4-NEXT:    psrlq $1, %xmm2
+; SSE4-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
+; SSE4-NEXT:    psubq %xmm2, %xmm5
+; SSE4-NEXT:    movdqa %xmm5, %xmm0
+; SSE4-NEXT:    movdqa %xmm4, %xmm1
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: test_fixed_v4i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm2
+; AVX1-NEXT:    vxorps %ymm0, %ymm1, %ymm0
+; AVX1-NEXT:    vpsrad $1, %xmm0, %xmm1
+; AVX1-NEXT:    vpsrlq $1, %xmm0, %xmm3
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3],xmm3[4,5],xmm1[6,7]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpsrad $1, %xmm0, %xmm3
+; AVX1-NEXT:    vpsrlq $1, %xmm0, %xmm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
+; AVX1-NEXT:    vpsubq %xmm0, %xmm3, %xmm0
+; AVX1-NEXT:    vpsubq %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_fixed_v4i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm2
+; AVX2-NEXT:    vpxor %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpsrad $1, %ymm0, %ymm1
+; AVX2-NEXT:    vpsrlq $1, %ymm0, %ymm0
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX2-NEXT:    vpsubq %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_fixed_v4i64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm2
+; AVX512-NEXT:    vpxor %ymm0, %ymm1, %ymm0
+; AVX512-NEXT:    vpsraq $1, %ymm0, %ymm0
+; AVX512-NEXT:    vpsubq %ymm0, %ymm2, %ymm0
+; AVX512-NEXT:    retq
+  %or = or <4 x i64> %a0, %a1
+  %xor = xor <4 x i64> %a1, %a0
+  %shift = ashr <4 x i64> %xor, <i64 1, i64 1, i64 1, i64 1>
+  %res = sub <4 x i64> %or, %shift
+  ret <4 x i64> %res
+}
+
+define <4 x i64> @test_ext_v4i64(<4 x i64> %a0, <4 x i64> %a1) {
+; SSE2-LABEL: test_ext_v4i64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pushq %rbp
+; SSE2-NEXT:    .cfi_def_cfa_offset 16
+; SSE2-NEXT:    pushq %r15
+; SSE2-NEXT:    .cfi_def_cfa_offset 24
+; SSE2-NEXT:    pushq %r14
+; SSE2-NEXT:    .cfi_def_cfa_offset 32
+; SSE2-NEXT:    pushq %r13
+; SSE2-NEXT:    .cfi_def_cfa_offset 40
+; SSE2-NEXT:    pushq %r12
+; SSE2-NEXT:    .cfi_def_cfa_offset 48
+; SSE2-NEXT:    pushq %rbx
+; SSE2-NEXT:    .cfi_def_cfa_offset 56
+; SSE2-NEXT:    .cfi_offset %rbx, -56
+; SSE2-NEXT:    .cfi_offset %r12, -48
+; SSE2-NEXT:    .cfi_offset %r13, -40
+; SSE2-NEXT:    .cfi_offset %r14, -32
+; SSE2-NEXT:    .cfi_offset %r15, -24
+; SSE2-NEXT:    .cfi_offset %rbp, -16
+; SSE2-NEXT:    movq %xmm0, %r11
+; SSE2-NEXT:    movq %r11, %r12
+; SSE2-NEXT:    sarq $63, %r12
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; SSE2-NEXT:    movq %xmm0, %rcx
+; SSE2-NEXT:    movq %rcx, %rbx
+; SSE2-NEXT:    sarq $63, %rbx
+; SSE2-NEXT:    movq %xmm1, %rdx
+; SSE2-NEXT:    movq %rdx, %r14
+; SSE2-NEXT:    sarq $63, %r14
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE2-NEXT:    movq %xmm0, %r9
+; SSE2-NEXT:    movq %r9, %r15
+; SSE2-NEXT:    sarq $63, %r15
+; SSE2-NEXT:    movq %xmm2, %rsi
+; SSE2-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT:    sarq $63, %rsi
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
+; SSE2-NEXT:    movq %xmm0, %r13
+; SSE2-NEXT:    movq %r13, %r8
+; SSE2-NEXT:    sarq $63, %r8
+; SSE2-NEXT:    movq %xmm3, %rbp
+; SSE2-NEXT:    movq %rbp, %rdi
+; SSE2-NEXT:    sarq $63, %rdi
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3]
+; SSE2-NEXT:    movq %xmm0, %rax
+; SSE2-NEXT:    movq %rax, %r10
+; SSE2-NEXT:    sarq $63, %r10
+; SSE2-NEXT:    addq %rax, %r9
+; SSE2-NEXT:    adcq %r15, %r10
+; SSE2-NEXT:    addq %rbp, %rdx
+; SSE2-NEXT:    adcq %r14, %rdi
+; SSE2-NEXT:    addq %r13, %rcx
+; SSE2-NEXT:    adcq %rbx, %r8
+; SSE2-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload
+; SSE2-NEXT:    adcq %r12, %rsi
+; SSE2-NEXT:    addq $1, %r11
+; SSE2-NEXT:    adcq $0, %rsi
+; SSE2-NEXT:    addq $1, %rcx
+; SSE2-NEXT:    adcq $0, %r8
+; SSE2-NEXT:    addq $1, %rdx
+; SSE2-NEXT:    adcq $0, %rdi
+; SSE2-NEXT:    addq $1, %r9
+; SSE2-NEXT:    adcq $0, %r10
+; SSE2-NEXT:    shldq $63, %r9, %r10
+; SSE2-NEXT:    shldq $63, %rdx, %rdi
+; SSE2-NEXT:    shldq $63, %rcx, %r8
+; SSE2-NEXT:    shldq $63, %r11, %rsi
+; SSE2-NEXT:    movq %rsi, %xmm0
+; SSE2-NEXT:    movq %r8, %xmm2
+; SSE2-NEXT:    movq %rdi, %xmm1
+; SSE2-NEXT:    movq %r10, %xmm3
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
+; SSE2-NEXT:    popq %rbx
+; SSE2-NEXT:    .cfi_def_cfa_offset 48
+; SSE2-NEXT:    popq %r12
+; SSE2-NEXT:    .cfi_def_cfa_offset 40
+; SSE2-NEXT:    popq %r13
+; SSE2-NEXT:    .cfi_def_cfa_offset 32
+; SSE2-NEXT:    popq %r14
+; SSE2-NEXT:    .cfi_def_cfa_offset 24
+; SSE2-NEXT:    popq %r15
+; SSE2-NEXT:    .cfi_def_cfa_offset 16
+; SSE2-NEXT:    popq %rbp
+; SSE2-NEXT:    .cfi_def_cfa_offset 8
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_ext_v4i64:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    pushq %rbp
+; SSE4-NEXT:    .cfi_def_cfa_offset 16
+; SSE4-NEXT:    pushq %r15
+; SSE4-NEXT:    .cfi_def_cfa_offset 24
+; SSE4-NEXT:    pushq %r14
+; SSE4-NEXT:    .cfi_def_cfa_offset 32
+; SSE4-NEXT:    pushq %r13
+; SSE4-NEXT:    .cfi_def_cfa_offset 40
+; SSE4-NEXT:    pushq %r12
+; SSE4-NEXT:    .cfi_def_cfa_offset 48
+; SSE4-NEXT:    pushq %rbx
+; SSE4-NEXT:    .cfi_def_cfa_offset 56
+; SSE4-NEXT:    .cfi_offset %rbx, -56
+; SSE4-NEXT:    .cfi_offset %r12, -48
+; SSE4-NEXT:    .cfi_offset %r13, -40
+; SSE4-NEXT:    .cfi_offset %r14, -32
+; SSE4-NEXT:    .cfi_offset %r15, -24
+; SSE4-NEXT:    .cfi_offset %rbp, -16
+; SSE4-NEXT:    pextrq $1, %xmm0, %r11
+; SSE4-NEXT:    movq %r11, %r12
+; SSE4-NEXT:    sarq $63, %r12
+; SSE4-NEXT:    movq %xmm0, %rcx
+; SSE4-NEXT:    movq %rcx, %rbx
+; SSE4-NEXT:    sarq $63, %rbx
+; SSE4-NEXT:    pextrq $1, %xmm1, %rdx
+; SSE4-NEXT:    movq %rdx, %r14
+; SSE4-NEXT:    sarq $63, %r14
+; SSE4-NEXT:    movq %xmm1, %r9
+; SSE4-NEXT:    movq %r9, %r15
+; SSE4-NEXT:    sarq $63, %r15
+; SSE4-NEXT:    pextrq $1, %xmm2, %rsi
+; SSE4-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE4-NEXT:    sarq $63, %rsi
+; SSE4-NEXT:    movq %xmm2, %r13
+; SSE4-NEXT:    movq %r13, %r8
+; SSE4-NEXT:    sarq $63, %r8
+; SSE4-NEXT:    pextrq $1, %xmm3, %rbp
+; SSE4-NEXT:    movq %rbp, %rdi
+; SSE4-NEXT:    sarq $63, %rdi
+; SSE4-NEXT:    movq %xmm3, %rax
+; SSE4-NEXT:    movq %rax, %r10
+; SSE4-NEXT:    sarq $63, %r10
+; SSE4-NEXT:    addq %rax, %r9
+; SSE4-NEXT:    adcq %r15, %r10
+; SSE4-NEXT:    addq %rbp, %rdx
+; SSE4-NEXT:    adcq %r14, %rdi
+; SSE4-NEXT:    addq %r13, %rcx
+; SSE4-NEXT:    adcq %rbx, %r8
+; SSE4-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload
+; SSE4-NEXT:    adcq %r12, %rsi
+; SSE4-NEXT:    addq $1, %r11
+; SSE4-NEXT:    adcq $0, %rsi
+; SSE4-NEXT:    addq $1, %rcx
+; SSE4-NEXT:    adcq $0, %r8
+; SSE4-NEXT:    addq $1, %rdx
+; SSE4-NEXT:    adcq $0, %rdi
+; SSE4-NEXT:    addq $1, %r9
+; SSE4-NEXT:    adcq $0, %r10
+; SSE4-NEXT:    shldq $63, %r9, %r10
+; SSE4-NEXT:    shldq $63, %rdx, %rdi
+; SSE4-NEXT:    shldq $63, %rcx, %r8
+; SSE4-NEXT:    shldq $63, %r11, %rsi
+; SSE4-NEXT:    movq %rsi, %xmm2
+; SSE4-NEXT:    movq %r8, %xmm0
+; SSE4-NEXT:    movq %rdi, %xmm3
+; SSE4-NEXT:    movq %r10, %xmm1
+; SSE4-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSE4-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
+; SSE4-NEXT:    popq %rbx
+; SSE4-NEXT:    .cfi_def_cfa_offset 48
+; SSE4-NEXT:    popq %r12
+; SSE4-NEXT:    .cfi_def_cfa_offset 40
+; SSE4-NEXT:    popq %r13
+; SSE4-NEXT:    .cfi_def_cfa_offset 32
+; SSE4-NEXT:    popq %r14
+; SSE4-NEXT:    .cfi_def_cfa_offset 24
+; SSE4-NEXT:    popq %r15
+; SSE4-NEXT:    .cfi_def_cfa_offset 16
+; SSE4-NEXT:    popq %rbp
+; SSE4-NEXT:    .cfi_def_cfa_offset 8
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: test_ext_v4i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    pushq %rbp
+; AVX1-NEXT:    .cfi_def_cfa_offset 16
+; AVX1-NEXT:    pushq %r15
+; AVX1-NEXT:    .cfi_def_cfa_offset 24
+; AVX1-NEXT:    pushq %r14
+; AVX1-NEXT:    .cfi_def_cfa_offset 32
+; AVX1-NEXT:    pushq %r13
+; AVX1-NEXT:    .cfi_def_cfa_offset 40
+; AVX1-NEXT:    pushq %r12
+; AVX1-NEXT:    .cfi_def_cfa_offset 48
+; AVX1-NEXT:    pushq %rbx
+; AVX1-NEXT:    .cfi_def_cfa_offset 56
+; AVX1-NEXT:    .cfi_offset %rbx, -56
+; AVX1-NEXT:    .cfi_offset %r12, -48
+; AVX1-NEXT:    .cfi_offset %r13, -40
+; AVX1-NEXT:    .cfi_offset %r14, -32
+; AVX1-NEXT:    .cfi_offset %r15, -24
+; AVX1-NEXT:    .cfi_offset %rbp, -16
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpextrq $1, %xmm2, %r11
+; AVX1-NEXT:    movq %r11, %r12
+; AVX1-NEXT:    sarq $63, %r12
+; AVX1-NEXT:    vmovq %xmm2, %rcx
+; AVX1-NEXT:    movq %rcx, %rbx
+; AVX1-NEXT:    sarq $63, %rbx
+; AVX1-NEXT:    vpextrq $1, %xmm0, %rdx
+; AVX1-NEXT:    movq %rdx, %r14
+; AVX1-NEXT:    sarq $63, %r14
+; AVX1-NEXT:    vmovq %xmm0, %r8
+; AVX1-NEXT:    movq %r8, %r15
+; AVX1-NEXT:    sarq $63, %r15
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm0
+; AVX1-NEXT:    vpextrq $1, %xmm0, %rsi
+; AVX1-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT:    sarq $63, %rsi
+; AVX1-NEXT:    vmovq %xmm0, %r13
+; AVX1-NEXT:    movq %r13, %rdi
+; AVX1-NEXT:    sarq $63, %rdi
+; AVX1-NEXT:    vpextrq $1, %xmm1, %rbp
+; AVX1-NEXT:    movq %rbp, %r9
+; AVX1-NEXT:    sarq $63, %r9
+; AVX1-NEXT:    vmovq %xmm1, %rax
+; AVX1-NEXT:    movq %rax, %r10
+; AVX1-NEXT:    sarq $63, %r10
+; AVX1-NEXT:    addq %rax, %r8
+; AVX1-NEXT:    adcq %r15, %r10
+; AVX1-NEXT:    addq %rbp, %rdx
+; AVX1-NEXT:    adcq %r14, %r9
+; AVX1-NEXT:    addq %r13, %rcx
+; AVX1-NEXT:    adcq %rbx, %rdi
+; AVX1-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload
+; AVX1-NEXT:    adcq %r12, %rsi
+; AVX1-NEXT:    addq $1, %r11
+; AVX1-NEXT:    adcq $0, %rsi
+; AVX1-NEXT:    addq $1, %rcx
+; AVX1-NEXT:    adcq $0, %rdi
+; AVX1-NEXT:    addq $1, %rdx
+; AVX1-NEXT:    adcq $0, %r9
+; AVX1-NEXT:    addq $1, %r8
+; AVX1-NEXT:    adcq $0, %r10
+; AVX1-NEXT:    shldq $63, %r8, %r10
+; AVX1-NEXT:    shldq $63, %rdx, %r9
+; AVX1-NEXT:    shldq $63, %rcx, %rdi
+; AVX1-NEXT:    shldq $63, %r11, %rsi
+; AVX1-NEXT:    vmovq %rsi, %xmm0
+; AVX1-NEXT:    vmovq %rdi, %xmm1
+; AVX1-NEXT:    vmovq %r9, %xmm2
+; AVX1-NEXT:    vmovq %r10, %xmm3
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    popq %rbx
+; AVX1-NEXT:    .cfi_def_cfa_offset 48
+; AVX1-NEXT:    popq %r12
+; AVX1-NEXT:    .cfi_def_cfa_offset 40
+; AVX1-NEXT:    popq %r13
+; AVX1-NEXT:    .cfi_def_cfa_offset 32
+; AVX1-NEXT:    popq %r14
+; AVX1-NEXT:    .cfi_def_cfa_offset 24
+; AVX1-NEXT:    popq %r15
+; AVX1-NEXT:    .cfi_def_cfa_offset 16
+; AVX1-NEXT:    popq %rbp
+; AVX1-NEXT:    .cfi_def_cfa_offset 8
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_ext_v4i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    pushq %rbp
+; AVX2-NEXT:    .cfi_def_cfa_offset 16
+; AVX2-NEXT:    pushq %r15
+; AVX2-NEXT:    .cfi_def_cfa_offset 24
+; AVX2-NEXT:    pushq %r14
+; AVX2-NEXT:    .cfi_def_cfa_offset 32
+; AVX2-NEXT:    pushq %r13
+; AVX2-NEXT:    .cfi_def_cfa_offset 40
+; AVX2-NEXT:    pushq %r12
+; AVX2-NEXT:    .cfi_def_cfa_offset 48
+; AVX2-NEXT:    pushq %rbx
+; AVX2-NEXT:    .cfi_def_cfa_offset 56
+; AVX2-NEXT:    .cfi_offset %rbx, -56
+; AVX2-NEXT:    .cfi_offset %r12, -48
+; AVX2-NEXT:    .cfi_offset %r13, -40
+; AVX2-NEXT:    .cfi_offset %r14, -32
+; AVX2-NEXT:    .cfi_offset %r15, -24
+; AVX2-NEXT:    .cfi_offset %rbp, -16
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT:    vpextrq $1, %xmm2, %r11
+; AVX2-NEXT:    movq %r11, %r12
+; AVX2-NEXT:    sarq $63, %r12
+; AVX2-NEXT:    vmovq %xmm2, %rcx
+; AVX2-NEXT:    movq %rcx, %rbx
+; AVX2-NEXT:    sarq $63, %rbx
+; AVX2-NEXT:    vpextrq $1, %xmm0, %rdx
+; AVX2-NEXT:    movq %rdx, %r14
+; AVX2-NEXT:    sarq $63, %r14
+; AVX2-NEXT:    vmovq %xmm0, %r8
+; AVX2-NEXT:    movq %r8, %r15
+; AVX2-NEXT:    sarq $63, %r15
+; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm0
+; AVX2-NEXT:    vpextrq $1, %xmm0, %rsi
+; AVX2-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    sarq $63, %rsi
+; AVX2-NEXT:    vmovq %xmm0, %r13
+; AVX2-NEXT:    movq %r13, %rdi
+; AVX2-NEXT:    sarq $63, %rdi
+; AVX2-NEXT:    vpextrq $1, %xmm1, %rbp
+; AVX2-NEXT:    movq %rbp, %r9
+; AVX2-NEXT:    sarq $63, %r9
+; AVX2-NEXT:    vmovq %xmm1, %rax
+; AVX2-NEXT:    movq %rax, %r10
+; AVX2-NEXT:    sarq $63, %r10
+; AVX2-NEXT:    addq %rax, %r8
+; AVX2-NEXT:    adcq %r15, %r10
+; AVX2-NEXT:    addq %rbp, %rdx
+; AVX2-NEXT:    adcq %r14, %r9
+; AVX2-NEXT:    addq %r13, %rcx
+; AVX2-NEXT:    adcq %rbx, %rdi
+; AVX2-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload
+; AVX2-NEXT:    adcq %r12, %rsi
+; AVX2-NEXT:    addq $1, %r11
+; AVX2-NEXT:    adcq $0, %rsi
+; AVX2-NEXT:    addq $1, %rcx
+; AVX2-NEXT:    adcq $0, %rdi
+; AVX2-NEXT:    addq $1, %rdx
+; AVX2-NEXT:    adcq $0, %r9
+; AVX2-NEXT:    addq $1, %r8
+; AVX2-NEXT:    adcq $0, %r10
+; AVX2-NEXT:    shldq $63, %r8, %r10
+; AVX2-NEXT:    shldq $63, %rdx, %r9
+; AVX2-NEXT:    shldq $63, %rcx, %rdi
+; AVX2-NEXT:    shldq $63, %r11, %rsi
+; AVX2-NEXT:    vmovq %rsi, %xmm0
+; AVX2-NEXT:    vmovq %rdi, %xmm1
+; AVX2-NEXT:    vmovq %r9, %xmm2
+; AVX2-NEXT:    vmovq %r10, %xmm3
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0]
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT:    popq %rbx
+; AVX2-NEXT:    .cfi_def_cfa_offset 48
+; AVX2-NEXT:    popq %r12
+; AVX2-NEXT:    .cfi_def_cfa_offset 40
+; AVX2-NEXT:    popq %r13
+; AVX2-NEXT:    .cfi_def_cfa_offset 32
+; AVX2-NEXT:    popq %r14
+; AVX2-NEXT:    .cfi_def_cfa_offset 24
+; AVX2-NEXT:    popq %r15
+; AVX2-NEXT:    .cfi_def_cfa_offset 16
+; AVX2-NEXT:    popq %rbp
+; AVX2-NEXT:    .cfi_def_cfa_offset 8
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_ext_v4i64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    pushq %rbp
+; AVX512-NEXT:    .cfi_def_cfa_offset 16
+; AVX512-NEXT:    pushq %r15
+; AVX512-NEXT:    .cfi_def_cfa_offset 24
+; AVX512-NEXT:    pushq %r14
+; AVX512-NEXT:    .cfi_def_cfa_offset 32
+; AVX512-NEXT:    pushq %r13
+; AVX512-NEXT:    .cfi_def_cfa_offset 40
+; AVX512-NEXT:    pushq %r12
+; AVX512-NEXT:    .cfi_def_cfa_offset 48
+; AVX512-NEXT:    pushq %rbx
+; AVX512-NEXT:    .cfi_def_cfa_offset 56
+; AVX512-NEXT:    .cfi_offset %rbx, -56
+; AVX512-NEXT:    .cfi_offset %r12, -48
+; AVX512-NEXT:    .cfi_offset %r13, -40
+; AVX512-NEXT:    .cfi_offset %r14, -32
+; AVX512-NEXT:    .cfi_offset %r15, -24
+; AVX512-NEXT:    .cfi_offset %rbp, -16
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX512-NEXT:    vpextrq $1, %xmm2, %r11
+; AVX512-NEXT:    movq %r11, %r12
+; AVX512-NEXT:    sarq $63, %r12
+; AVX512-NEXT:    vmovq %xmm2, %rcx
+; AVX512-NEXT:    movq %rcx, %rbx
+; AVX512-NEXT:    vpextrq $1, %xmm0, %rdx
+; AVX512-NEXT:    sarq $63, %rbx
+; AVX512-NEXT:    movq %rdx, %r14
+; AVX512-NEXT:    sarq $63, %r14
+; AVX512-NEXT:    vmovq %xmm0, %rdi
+; AVX512-NEXT:    movq %rdi, %r15
+; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm0
+; AVX512-NEXT:    vpextrq $1, %xmm0, %rsi
+; AVX512-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT:    sarq $63, %r15
+; AVX512-NEXT:    sarq $63, %rsi
+; AVX512-NEXT:    vmovq %xmm0, %r13
+; AVX512-NEXT:    movq %r13, %r8
+; AVX512-NEXT:    sarq $63, %r8
+; AVX512-NEXT:    vpextrq $1, %xmm1, %rbp
+; AVX512-NEXT:    movq %rbp, %r9
+; AVX512-NEXT:    sarq $63, %r9
+; AVX512-NEXT:    vmovq %xmm1, %rax
+; AVX512-NEXT:    movq %rax, %r10
+; AVX512-NEXT:    sarq $63, %r10
+; AVX512-NEXT:    addq %rax, %rdi
+; AVX512-NEXT:    adcq %r15, %r10
+; AVX512-NEXT:    addq %rbp, %rdx
+; AVX512-NEXT:    adcq %r14, %r9
+; AVX512-NEXT:    addq %r13, %rcx
+; AVX512-NEXT:    adcq %rbx, %r8
+; AVX512-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload
+; AVX512-NEXT:    adcq %r12, %rsi
+; AVX512-NEXT:    addq $1, %r11
+; AVX512-NEXT:    adcq $0, %rsi
+; AVX512-NEXT:    addq $1, %rcx
+; AVX512-NEXT:    adcq $0, %r8
+; AVX512-NEXT:    addq $1, %rdx
+; AVX512-NEXT:    adcq $0, %r9
+; AVX512-NEXT:    addq $1, %rdi
+; AVX512-NEXT:    adcq $0, %r10
+; AVX512-NEXT:    shldq $63, %rdi, %r10
+; AVX512-NEXT:    shldq $63, %rdx, %r9
+; AVX512-NEXT:    shldq $63, %rcx, %r8
+; AVX512-NEXT:    shldq $63, %r11, %rsi
+; AVX512-NEXT:    vmovq %rsi, %xmm0
+; AVX512-NEXT:    vmovq %r8, %xmm1
+; AVX512-NEXT:    vmovq %r9, %xmm2
+; AVX512-NEXT:    vmovq %r10, %xmm3
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0]
+; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512-NEXT:    popq %rbx
+; AVX512-NEXT:    .cfi_def_cfa_offset 48
+; AVX512-NEXT:    popq %r12
+; AVX512-NEXT:    .cfi_def_cfa_offset 40
+; AVX512-NEXT:    popq %r13
+; AVX512-NEXT:    .cfi_def_cfa_offset 32
+; AVX512-NEXT:    popq %r14
+; AVX512-NEXT:    .cfi_def_cfa_offset 24
+; AVX512-NEXT:    popq %r15
+; AVX512-NEXT:    .cfi_def_cfa_offset 16
+; AVX512-NEXT:    popq %rbp
+; AVX512-NEXT:    .cfi_def_cfa_offset 8
+; AVX512-NEXT:    retq
+  %x0 = sext <4 x i64> %a0 to <4 x i128>
+  %x1 = sext <4 x i64> %a1 to <4 x i128>
+  %sum = add <4 x i128> %x0, %x1
+  %inc = add <4 x i128> %sum, <i128 1, i128 1, i128 1, i128 1>
+  %shift = ashr <4 x i128> %inc, <i128 1, i128 1, i128 1, i128 1>
+  %res = trunc <4 x i128> %shift to <4 x i64>
+  ret <4 x i64> %res
+}
+
+;
+; 512-bit vectors
+;
+
+define <64 x i8> @test_fixed_v64i8(<64 x i8> %a0, <64 x i8> %a1) {
+; SSE-LABEL: test_fixed_v64i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa %xmm3, %xmm11
+; SSE-NEXT:    movdqa %xmm2, %xmm8
+; SSE-NEXT:    movdqa %xmm1, %xmm9
+; SSE-NEXT:    movdqa %xmm0, %xmm10
+; SSE-NEXT:    por %xmm7, %xmm3
+; SSE-NEXT:    por %xmm6, %xmm2
+; SSE-NEXT:    por %xmm5, %xmm1
+; SSE-NEXT:    por %xmm4, %xmm0
+; SSE-NEXT:    pxor %xmm4, %xmm10
+; SSE-NEXT:    pxor %xmm5, %xmm9
+; SSE-NEXT:    pxor %xmm6, %xmm8
+; SSE-NEXT:    pxor %xmm7, %xmm11
+; SSE-NEXT:    psrlw $1, %xmm11
+; SSE-NEXT:    movdqa {{.*#+}} xmm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; SSE-NEXT:    pand %xmm5, %xmm11
+; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; SSE-NEXT:    pxor %xmm4, %xmm11
+; SSE-NEXT:    psubb %xmm11, %xmm3
+; SSE-NEXT:    psrlw $1, %xmm8
+; SSE-NEXT:    pand %xmm5, %xmm8
+; SSE-NEXT:    pxor %xmm4, %xmm8
+; SSE-NEXT:    psubb %xmm8, %xmm2
+; SSE-NEXT:    psrlw $1, %xmm9
+; SSE-NEXT:    pand %xmm5, %xmm9
+; SSE-NEXT:    pxor %xmm4, %xmm9
+; SSE-NEXT:    psubb %xmm9, %xmm1
+; SSE-NEXT:    psrlw $1, %xmm10
+; SSE-NEXT:    pand %xmm5, %xmm10
+; SSE-NEXT:    pxor %xmm4, %xmm10
+; SSE-NEXT:    psubb %xmm10, %xmm0
+; SSE-NEXT:    paddb %xmm4, %xmm0
+; SSE-NEXT:    paddb %xmm4, %xmm1
+; SSE-NEXT:    paddb %xmm4, %xmm2
+; SSE-NEXT:    paddb %xmm4, %xmm3
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: test_fixed_v64i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vorps %ymm3, %ymm1, %ymm4
+; AVX1-NEXT:    vorps %ymm2, %ymm0, %ymm5
+; AVX1-NEXT:    vxorps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT:    vxorps %ymm3, %ymm1, %ymm1
+; AVX1-NEXT:    vpsrlw $1, %xmm1, %xmm2
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm3 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm6 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; AVX1-NEXT:    vpxor %xmm6, %xmm2, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vpsrlw $1, %xmm1, %xmm1
+; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpxor %xmm6, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm7
+; AVX1-NEXT:    vpand %xmm3, %xmm7, %xmm7
+; AVX1-NEXT:    vpxor %xmm6, %xmm7, %xmm7
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpxor %xmm6, %xmm0, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm5, %xmm3
+; AVX1-NEXT:    vpsubb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT:    vpaddb %xmm6, %xmm0, %xmm0
+; AVX1-NEXT:    vpsubb %xmm7, %xmm5, %xmm3
+; AVX1-NEXT:    vpaddb %xmm6, %xmm3, %xmm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm3, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm4, %xmm3
+; AVX1-NEXT:    vpsubb %xmm1, %xmm3, %xmm1
+; AVX1-NEXT:    vpaddb %xmm6, %xmm1, %xmm1
+; AVX1-NEXT:    vpsubb %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vpaddb %xmm6, %xmm2, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_fixed_v64i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpor %ymm3, %ymm1, %ymm4
+; AVX2-NEXT:    vpor %ymm2, %ymm0, %ymm5
+; AVX2-NEXT:    vpxor %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpxor %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    vpsrlw $1, %ymm1, %ymm1
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm3 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; AVX2-NEXT:    vpxor %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    vpsubb %ymm1, %ymm4, %ymm1
+; AVX2-NEXT:    vpsrlw $1, %ymm0, %ymm0
+; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpxor %ymm3, %ymm0, %ymm0
+; AVX2-NEXT:    vpsubb %ymm0, %ymm5, %ymm0
+; AVX2-NEXT:    vpaddb %ymm3, %ymm0, %ymm0
+; AVX2-NEXT:    vpaddb %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_fixed_v64i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vporq %zmm1, %zmm0, %zmm2
+; AVX512-NEXT:    vpxorq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpsrlw $1, %zmm0, %zmm0
+; AVX512-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; AVX512-NEXT:    vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0
+; AVX512-NEXT:    vpsubb %zmm0, %zmm2, %zmm0
+; AVX512-NEXT:    vpaddb %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    retq
+  %or = or <64 x i8> %a0, %a1
+  %xor = xor <64 x i8> %a0, %a1
+  %shift = ashr <64 x i8> %xor, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  %res = sub <64 x i8> %or, %shift
+  ret <64 x i8> %res
+}
+
+define <64 x i8> @test_ext_v64i8(<64 x i8> %a0, <64 x i8> %a1) {
+; SSE2-LABEL: test_ext_v64i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa %xmm3, %xmm8
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm0[8],xmm14[9],xmm0[9],xmm14[10],xmm0[10],xmm14[11],xmm0[11],xmm14[12],xmm0[12],xmm14[13],xmm0[13],xmm14[14],xmm0[14],xmm14[15],xmm0[15]
+; SSE2-NEXT:    psraw $8, %xmm14
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3],xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7]
+; SSE2-NEXT:    psraw $8, %xmm15
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
+; SSE2-NEXT:    psraw $8, %xmm3
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1],xmm13[2],xmm1[2],xmm13[3],xmm1[3],xmm13[4],xmm1[4],xmm13[5],xmm1[5],xmm13[6],xmm1[6],xmm13[7],xmm1[7]
+; SSE2-NEXT:    psraw $8, %xmm13
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm2[8],xmm12[9],xmm2[9],xmm12[10],xmm2[10],xmm12[11],xmm2[11],xmm12[12],xmm2[12],xmm12[13],xmm2[13],xmm12[14],xmm2[14],xmm12[15],xmm2[15]
+; SSE2-NEXT:    psraw $8, %xmm12
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm2[0],xmm11[1],xmm2[1],xmm11[2],xmm2[2],xmm11[3],xmm2[3],xmm11[4],xmm2[4],xmm11[5],xmm2[5],xmm11[6],xmm2[6],xmm11[7],xmm2[7]
+; SSE2-NEXT:    psraw $8, %xmm11
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm8[8],xmm10[9],xmm8[9],xmm10[10],xmm8[10],xmm10[11],xmm8[11],xmm10[12],xmm8[12],xmm10[13],xmm8[13],xmm10[14],xmm8[14],xmm10[15],xmm8[15]
+; SSE2-NEXT:    psraw $8, %xmm10
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7]
+; SSE2-NEXT:    psraw $8, %xmm9
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm4[8],xmm8[9],xmm4[9],xmm8[10],xmm4[10],xmm8[11],xmm4[11],xmm8[12],xmm4[12],xmm8[13],xmm4[13],xmm8[14],xmm4[14],xmm8[15],xmm4[15]
+; SSE2-NEXT:    psraw $8, %xmm8
+; SSE2-NEXT:    paddw %xmm14, %xmm8
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
+; SSE2-NEXT:    psraw $8, %xmm0
+; SSE2-NEXT:    paddw %xmm15, %xmm0
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15]
+; SSE2-NEXT:    psraw $8, %xmm4
+; SSE2-NEXT:    paddw %xmm3, %xmm4
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7]
+; SSE2-NEXT:    psraw $8, %xmm1
+; SSE2-NEXT:    paddw %xmm13, %xmm1
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15]
+; SSE2-NEXT:    psraw $8, %xmm5
+; SSE2-NEXT:    paddw %xmm12, %xmm5
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7]
+; SSE2-NEXT:    psraw $8, %xmm2
+; SSE2-NEXT:    paddw %xmm11, %xmm2
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm7[8],xmm6[9],xmm7[9],xmm6[10],xmm7[10],xmm6[11],xmm7[11],xmm6[12],xmm7[12],xmm6[13],xmm7[13],xmm6[14],xmm7[14],xmm6[15],xmm7[15]
+; SSE2-NEXT:    psraw $8, %xmm6
+; SSE2-NEXT:    paddw %xmm10, %xmm6
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3],xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7]
+; SSE2-NEXT:    psraw $8, %xmm3
+; SSE2-NEXT:    paddw %xmm9, %xmm3
+; SSE2-NEXT:    pcmpeqd %xmm7, %xmm7
+; SSE2-NEXT:    psubw %xmm7, %xmm8
+; SSE2-NEXT:    psubw %xmm7, %xmm0
+; SSE2-NEXT:    psubw %xmm7, %xmm4
+; SSE2-NEXT:    psubw %xmm7, %xmm1
+; SSE2-NEXT:    psubw %xmm7, %xmm5
+; SSE2-NEXT:    psubw %xmm7, %xmm2
+; SSE2-NEXT:    psubw %xmm7, %xmm6
+; SSE2-NEXT:    psubw %xmm7, %xmm3
+; SSE2-NEXT:    psrlw $1, %xmm3
+; SSE2-NEXT:    psrlw $1, %xmm6
+; SSE2-NEXT:    psrlw $1, %xmm2
+; SSE2-NEXT:    psrlw $1, %xmm5
+; SSE2-NEXT:    psrlw $1, %xmm1
+; SSE2-NEXT:    psrlw $1, %xmm4
+; SSE2-NEXT:    psrlw $1, %xmm0
+; SSE2-NEXT:    psrlw $1, %xmm8
+; SSE2-NEXT:    movdqa {{.*#+}} xmm7 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; SSE2-NEXT:    pand %xmm7, %xmm8
+; SSE2-NEXT:    pand %xmm7, %xmm0
+; SSE2-NEXT:    packuswb %xmm8, %xmm0
+; SSE2-NEXT:    pand %xmm7, %xmm4
+; SSE2-NEXT:    pand %xmm7, %xmm1
+; SSE2-NEXT:    packuswb %xmm4, %xmm1
+; SSE2-NEXT:    pand %xmm7, %xmm5
+; SSE2-NEXT:    pand %xmm7, %xmm2
+; SSE2-NEXT:    packuswb %xmm5, %xmm2
+; SSE2-NEXT:    pand %xmm7, %xmm6
+; SSE2-NEXT:    pand %xmm7, %xmm3
+; SSE2-NEXT:    packuswb %xmm6, %xmm3
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_ext_v64i8:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    pmovsxbw %xmm0, %xmm8
+; SSE4-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; SSE4-NEXT:    pmovsxbw %xmm0, %xmm9
+; SSE4-NEXT:    pmovsxbw %xmm1, %xmm10
+; SSE4-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE4-NEXT:    pmovsxbw %xmm0, %xmm11
+; SSE4-NEXT:    pmovsxbw %xmm2, %xmm12
+; SSE4-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
+; SSE4-NEXT:    pmovsxbw %xmm0, %xmm13
+; SSE4-NEXT:    pmovsxbw %xmm3, %xmm14
+; SSE4-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3]
+; SSE4-NEXT:    pmovsxbw %xmm0, %xmm15
+; SSE4-NEXT:    pmovsxbw %xmm4, %xmm0
+; SSE4-NEXT:    paddw %xmm8, %xmm0
+; SSE4-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[2,3,2,3]
+; SSE4-NEXT:    pmovsxbw %xmm1, %xmm4
+; SSE4-NEXT:    paddw %xmm9, %xmm4
+; SSE4-NEXT:    pmovsxbw %xmm5, %xmm1
+; SSE4-NEXT:    paddw %xmm10, %xmm1
+; SSE4-NEXT:    pshufd {{.*#+}} xmm2 = xmm5[2,3,2,3]
+; SSE4-NEXT:    pmovsxbw %xmm2, %xmm5
+; SSE4-NEXT:    paddw %xmm11, %xmm5
+; SSE4-NEXT:    pmovsxbw %xmm6, %xmm2
+; SSE4-NEXT:    paddw %xmm12, %xmm2
+; SSE4-NEXT:    pshufd {{.*#+}} xmm3 = xmm6[2,3,2,3]
+; SSE4-NEXT:    pmovsxbw %xmm3, %xmm6
+; SSE4-NEXT:    paddw %xmm13, %xmm6
+; SSE4-NEXT:    pmovsxbw %xmm7, %xmm3
+; SSE4-NEXT:    paddw %xmm14, %xmm3
+; SSE4-NEXT:    pshufd {{.*#+}} xmm7 = xmm7[2,3,2,3]
+; SSE4-NEXT:    pmovsxbw %xmm7, %xmm7
+; SSE4-NEXT:    paddw %xmm15, %xmm7
+; SSE4-NEXT:    pcmpeqd %xmm8, %xmm8
+; SSE4-NEXT:    psubw %xmm8, %xmm0
+; SSE4-NEXT:    psubw %xmm8, %xmm4
+; SSE4-NEXT:    psubw %xmm8, %xmm1
+; SSE4-NEXT:    psubw %xmm8, %xmm5
+; SSE4-NEXT:    psubw %xmm8, %xmm2
+; SSE4-NEXT:    psubw %xmm8, %xmm6
+; SSE4-NEXT:    psubw %xmm8, %xmm3
+; SSE4-NEXT:    psubw %xmm8, %xmm7
+; SSE4-NEXT:    psrlw $1, %xmm7
+; SSE4-NEXT:    psrlw $1, %xmm3
+; SSE4-NEXT:    psrlw $1, %xmm6
+; SSE4-NEXT:    psrlw $1, %xmm2
+; SSE4-NEXT:    psrlw $1, %xmm5
+; SSE4-NEXT:    psrlw $1, %xmm1
+; SSE4-NEXT:    psrlw $1, %xmm4
+; SSE4-NEXT:    psrlw $1, %xmm0
+; SSE4-NEXT:    pmovzxbw {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255]
+; SSE4-NEXT:    pand %xmm8, %xmm0
+; SSE4-NEXT:    pand %xmm8, %xmm4
+; SSE4-NEXT:    packuswb %xmm4, %xmm0
+; SSE4-NEXT:    pand %xmm8, %xmm1
+; SSE4-NEXT:    pand %xmm8, %xmm5
+; SSE4-NEXT:    packuswb %xmm5, %xmm1
+; SSE4-NEXT:    pand %xmm8, %xmm2
+; SSE4-NEXT:    pand %xmm8, %xmm6
+; SSE4-NEXT:    packuswb %xmm6, %xmm2
+; SSE4-NEXT:    pand %xmm8, %xmm3
+; SSE4-NEXT:    pand %xmm8, %xmm7
+; SSE4-NEXT:    packuswb %xmm7, %xmm3
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: test_ext_v64i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT:    vpmovsxbw %xmm4, %xmm5
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
+; AVX1-NEXT:    vpmovsxbw %xmm4, %xmm4
+; AVX1-NEXT:    vpmovsxbw %xmm0, %xmm6
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX1-NEXT:    vpmovsxbw %xmm0, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm7
+; AVX1-NEXT:    vpmovsxbw %xmm7, %xmm8
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm7 = xmm7[2,3,2,3]
+; AVX1-NEXT:    vpmovsxbw %xmm7, %xmm7
+; AVX1-NEXT:    vpmovsxbw %xmm1, %xmm9
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; AVX1-NEXT:    vpmovsxbw %xmm1, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm10
+; AVX1-NEXT:    vpmovsxbw %xmm10, %xmm11
+; AVX1-NEXT:    vpaddw %xmm5, %xmm11, %xmm5
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm10 = xmm10[2,3,2,3]
+; AVX1-NEXT:    vpmovsxbw %xmm10, %xmm10
+; AVX1-NEXT:    vpaddw %xmm4, %xmm10, %xmm4
+; AVX1-NEXT:    vpmovsxbw %xmm2, %xmm10
+; AVX1-NEXT:    vpaddw %xmm6, %xmm10, %xmm6
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
+; AVX1-NEXT:    vpmovsxbw %xmm2, %xmm2
+; AVX1-NEXT:    vpaddw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm2
+; AVX1-NEXT:    vpmovsxbw %xmm2, %xmm10
+; AVX1-NEXT:    vpaddw %xmm10, %xmm8, %xmm8
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
+; AVX1-NEXT:    vpmovsxbw %xmm2, %xmm2
+; AVX1-NEXT:    vpaddw %xmm2, %xmm7, %xmm2
+; AVX1-NEXT:    vpmovsxbw %xmm3, %xmm7
+; AVX1-NEXT:    vpaddw %xmm7, %xmm9, %xmm7
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
+; AVX1-NEXT:    vpmovsxbw %xmm3, %xmm3
+; AVX1-NEXT:    vpaddw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
+; AVX1-NEXT:    vpsubw %xmm3, %xmm5, %xmm5
+; AVX1-NEXT:    vpsubw %xmm3, %xmm4, %xmm4
+; AVX1-NEXT:    vpsubw %xmm3, %xmm6, %xmm6
+; AVX1-NEXT:    vpsubw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpsubw %xmm3, %xmm8, %xmm8
+; AVX1-NEXT:    vpsubw %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpsubw %xmm3, %xmm7, %xmm7
+; AVX1-NEXT:    vpsubw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrlw $1, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrlw $1, %xmm7, %xmm3
+; AVX1-NEXT:    vpsrlw $1, %xmm2, %xmm2
+; AVX1-NEXT:    vpsrlw $1, %xmm8, %xmm7
+; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlw $1, %xmm6, %xmm6
+; AVX1-NEXT:    vpsrlw $1, %xmm4, %xmm4
+; AVX1-NEXT:    vpsrlw $1, %xmm5, %xmm5
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT:    vpand %xmm5, %xmm8, %xmm5
+; AVX1-NEXT:    vpand %xmm4, %xmm8, %xmm4
+; AVX1-NEXT:    vpackuswb %xmm4, %xmm5, %xmm4
+; AVX1-NEXT:    vpand %xmm6, %xmm8, %xmm5
+; AVX1-NEXT:    vpand %xmm0, %xmm8, %xmm0
+; AVX1-NEXT:    vpackuswb %xmm0, %xmm5, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX1-NEXT:    vpand %xmm7, %xmm8, %xmm4
+; AVX1-NEXT:    vpand %xmm2, %xmm8, %xmm2
+; AVX1-NEXT:    vpackuswb %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vpand %xmm3, %xmm8, %xmm3
+; AVX1-NEXT:    vpand %xmm1, %xmm8, %xmm1
+; AVX1-NEXT:    vpackuswb %xmm1, %xmm3, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_ext_v64i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm4
+; AVX2-NEXT:    vpmovsxbw %xmm4, %ymm4
+; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm5
+; AVX2-NEXT:    vpmovsxbw %xmm5, %ymm5
+; AVX2-NEXT:    vpmovsxbw %xmm1, %ymm1
+; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm6
+; AVX2-NEXT:    vpmovsxbw %xmm6, %ymm6
+; AVX2-NEXT:    vpaddw %ymm6, %ymm4, %ymm4
+; AVX2-NEXT:    vpmovsxbw %xmm2, %ymm2
+; AVX2-NEXT:    vpaddw %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm2
+; AVX2-NEXT:    vpmovsxbw %xmm2, %ymm2
+; AVX2-NEXT:    vpaddw %ymm2, %ymm5, %ymm2
+; AVX2-NEXT:    vpmovsxbw %xmm3, %ymm3
+; AVX2-NEXT:    vpaddw %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    vpcmpeqd %ymm3, %ymm3, %ymm3
+; AVX2-NEXT:    vpsubw %ymm3, %ymm4, %ymm4
+; AVX2-NEXT:    vpsubw %ymm3, %ymm0, %ymm0
+; AVX2-NEXT:    vpsubw %ymm3, %ymm2, %ymm2
+; AVX2-NEXT:    vpsubw %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    vpsrlw $1, %ymm1, %ymm1
+; AVX2-NEXT:    vpsrlw $1, %ymm2, %ymm2
+; AVX2-NEXT:    vpsrlw $1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrlw $1, %ymm4, %ymm3
+; AVX2-NEXT:    vpbroadcastw {{.*#+}} ymm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; AVX2-NEXT:    vpand %ymm4, %ymm3, %ymm3
+; AVX2-NEXT:    vpand %ymm4, %ymm0, %ymm0
+; AVX2-NEXT:    vpackuswb %ymm3, %ymm0, %ymm0
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT:    vpand %ymm4, %ymm2, %ymm2
+; AVX2-NEXT:    vpand %ymm4, %ymm1, %ymm1
+; AVX2-NEXT:    vpackuswb %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3]
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_ext_v64i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovsxbw %ymm0, %zmm2
+; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
+; AVX512-NEXT:    vpmovsxbw %ymm0, %zmm0
+; AVX512-NEXT:    vpmovsxbw %ymm1, %zmm3
+; AVX512-NEXT:    vpaddw %zmm3, %zmm2, %zmm2
+; AVX512-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
+; AVX512-NEXT:    vpmovsxbw %ymm1, %zmm1
+; AVX512-NEXT:    vpaddw %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
+; AVX512-NEXT:    vpsubw %zmm1, %zmm2, %zmm2
+; AVX512-NEXT:    vpsubw %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpsrlw $1, %zmm0, %zmm0
+; AVX512-NEXT:    vpsrlw $1, %zmm2, %zmm1
+; AVX512-NEXT:    vpmovwb %zmm1, %ymm1
+; AVX512-NEXT:    vpmovwb %zmm0, %ymm0
+; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512-NEXT:    retq
+  %x0 = sext <64 x i8> %a0 to <64 x i16>
+  %x1 = sext <64 x i8> %a1 to <64 x i16>
+  %sum = add <64 x i16> %x0, %x1
+  %inc = add <64 x i16> %sum, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %shift = ashr <64 x i16> %inc, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %res = trunc <64 x i16> %shift to <64 x i8>
+  ret <64 x i8> %res
+}
+
+define <32 x i16> @test_fixed_v32i16(<32 x i16> %a0, <32 x i16> %a1) {
+; SSE-LABEL: test_fixed_v32i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa %xmm3, %xmm8
+; SSE-NEXT:    por %xmm7, %xmm3
+; SSE-NEXT:    movdqa %xmm2, %xmm9
+; SSE-NEXT:    por %xmm6, %xmm9
+; SSE-NEXT:    movdqa %xmm1, %xmm10
+; SSE-NEXT:    por %xmm5, %xmm10
+; SSE-NEXT:    movdqa %xmm0, %xmm11
+; SSE-NEXT:    por %xmm4, %xmm11
+; SSE-NEXT:    pxor %xmm0, %xmm4
+; SSE-NEXT:    pxor %xmm1, %xmm5
+; SSE-NEXT:    pxor %xmm2, %xmm6
+; SSE-NEXT:    pxor %xmm8, %xmm7
+; SSE-NEXT:    psraw $1, %xmm7
+; SSE-NEXT:    psubw %xmm7, %xmm3
+; SSE-NEXT:    psraw $1, %xmm6
+; SSE-NEXT:    psubw %xmm6, %xmm9
+; SSE-NEXT:    psraw $1, %xmm5
+; SSE-NEXT:    psubw %xmm5, %xmm10
+; SSE-NEXT:    psraw $1, %xmm4
+; SSE-NEXT:    psubw %xmm4, %xmm11
+; SSE-NEXT:    movdqa %xmm11, %xmm0
+; SSE-NEXT:    movdqa %xmm10, %xmm1
+; SSE-NEXT:    movdqa %xmm9, %xmm2
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: test_fixed_v32i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vorps %ymm3, %ymm1, %ymm4
+; AVX1-NEXT:    vorps %ymm2, %ymm0, %ymm5
+; AVX1-NEXT:    vxorps %ymm0, %ymm2, %ymm0
+; AVX1-NEXT:    vxorps %ymm1, %ymm3, %ymm1
+; AVX1-NEXT:    vpsraw $1, %xmm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vpsraw $1, %xmm1, %xmm1
+; AVX1-NEXT:    vpsraw $1, %xmm0, %xmm3
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpsraw $1, %xmm0, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm5, %xmm6
+; AVX1-NEXT:    vpsubw %xmm0, %xmm6, %xmm0
+; AVX1-NEXT:    vpsubw %xmm3, %xmm5, %xmm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm3, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm4, %xmm3
+; AVX1-NEXT:    vpsubw %xmm1, %xmm3, %xmm1
+; AVX1-NEXT:    vpsubw %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_fixed_v32i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpor %ymm3, %ymm1, %ymm4
+; AVX2-NEXT:    vpor %ymm2, %ymm0, %ymm5
+; AVX2-NEXT:    vpxor %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vpxor %ymm1, %ymm3, %ymm1
+; AVX2-NEXT:    vpsraw $1, %ymm1, %ymm1
+; AVX2-NEXT:    vpsubw %ymm1, %ymm4, %ymm1
+; AVX2-NEXT:    vpsraw $1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsubw %ymm0, %ymm5, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_fixed_v32i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vporq %zmm1, %zmm0, %zmm2
+; AVX512-NEXT:    vpxorq %zmm0, %zmm1, %zmm0
+; AVX512-NEXT:    vpsraw $1, %zmm0, %zmm0
+; AVX512-NEXT:    vpsubw %zmm0, %zmm2, %zmm0
+; AVX512-NEXT:    retq
+  %or = or <32 x i16> %a0, %a1
+  %xor = xor <32 x i16> %a1, %a0
+  %shift = ashr <32 x i16> %xor, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %res = sub <32 x i16> %or, %shift
+  ret <32 x i16> %res
+}
+
+define <32 x i16> @test_ext_v32i16(<32 x i16> %a0, <32 x i16> %a1) {
+; SSE2-LABEL: test_ext_v32i16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm3[0],xmm13[1],xmm3[1],xmm13[2],xmm3[2],xmm13[3],xmm3[3]
+; SSE2-NEXT:    psrad $16, %xmm13
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm3[4],xmm14[5],xmm3[5],xmm14[6],xmm3[6],xmm14[7],xmm3[7]
+; SSE2-NEXT:    psrad $16, %xmm14
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm2[0],xmm15[1],xmm2[1],xmm15[2],xmm2[2],xmm15[3],xmm2[3]
+; SSE2-NEXT:    psrad $16, %xmm15
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm2[4],xmm12[5],xmm2[5],xmm12[6],xmm2[6],xmm12[7],xmm2[7]
+; SSE2-NEXT:    psrad $16, %xmm12
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm1[0],xmm11[1],xmm1[1],xmm11[2],xmm1[2],xmm11[3],xmm1[3]
+; SSE2-NEXT:    psrad $16, %xmm11
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm1[4],xmm10[5],xmm1[5],xmm10[6],xmm1[6],xmm10[7],xmm1[7]
+; SSE2-NEXT:    psrad $16, %xmm10
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3]
+; SSE2-NEXT:    psrad $16, %xmm9
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm0[4],xmm8[5],xmm0[5],xmm8[6],xmm0[6],xmm8[7],xmm0[7]
+; SSE2-NEXT:    psrad $16, %xmm8
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3]
+; SSE2-NEXT:    psrad $16, %xmm3
+; SSE2-NEXT:    paddd %xmm13, %xmm3
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm7 = xmm7[4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    psrad $16, %xmm7
+; SSE2-NEXT:    paddd %xmm14, %xmm7
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3]
+; SSE2-NEXT:    psrad $16, %xmm2
+; SSE2-NEXT:    paddd %xmm15, %xmm2
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm6 = xmm6[4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    psrad $16, %xmm6
+; SSE2-NEXT:    paddd %xmm12, %xmm6
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3]
+; SSE2-NEXT:    psrad $16, %xmm1
+; SSE2-NEXT:    paddd %xmm11, %xmm1
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    psrad $16, %xmm5
+; SSE2-NEXT:    paddd %xmm10, %xmm5
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
+; SSE2-NEXT:    psrad $16, %xmm0
+; SSE2-NEXT:    paddd %xmm9, %xmm0
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    psrad $16, %xmm4
+; SSE2-NEXT:    paddd %xmm8, %xmm4
+; SSE2-NEXT:    pcmpeqd %xmm8, %xmm8
+; SSE2-NEXT:    psubd %xmm8, %xmm3
+; SSE2-NEXT:    psubd %xmm8, %xmm7
+; SSE2-NEXT:    psubd %xmm8, %xmm2
+; SSE2-NEXT:    psubd %xmm8, %xmm6
+; SSE2-NEXT:    psubd %xmm8, %xmm1
+; SSE2-NEXT:    psubd %xmm8, %xmm5
+; SSE2-NEXT:    psubd %xmm8, %xmm0
+; SSE2-NEXT:    psubd %xmm8, %xmm4
+; SSE2-NEXT:    pslld $15, %xmm4
+; SSE2-NEXT:    psrad $16, %xmm4
+; SSE2-NEXT:    pslld $15, %xmm0
+; SSE2-NEXT:    psrad $16, %xmm0
+; SSE2-NEXT:    packssdw %xmm4, %xmm0
+; SSE2-NEXT:    pslld $15, %xmm5
+; SSE2-NEXT:    psrad $16, %xmm5
+; SSE2-NEXT:    pslld $15, %xmm1
+; SSE2-NEXT:    psrad $16, %xmm1
+; SSE2-NEXT:    packssdw %xmm5, %xmm1
+; SSE2-NEXT:    pslld $15, %xmm6
+; SSE2-NEXT:    psrad $16, %xmm6
+; SSE2-NEXT:    pslld $15, %xmm2
+; SSE2-NEXT:    psrad $16, %xmm2
+; SSE2-NEXT:    packssdw %xmm6, %xmm2
+; SSE2-NEXT:    pslld $15, %xmm7
+; SSE2-NEXT:    psrad $16, %xmm7
+; SSE2-NEXT:    pslld $15, %xmm3
+; SSE2-NEXT:    psrad $16, %xmm3
+; SSE2-NEXT:    packssdw %xmm7, %xmm3
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_ext_v32i16:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    pmovsxwd %xmm0, %xmm8
+; SSE4-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; SSE4-NEXT:    pmovsxwd %xmm0, %xmm9
+; SSE4-NEXT:    pmovsxwd %xmm1, %xmm10
+; SSE4-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE4-NEXT:    pmovsxwd %xmm0, %xmm11
+; SSE4-NEXT:    pmovsxwd %xmm2, %xmm12
+; SSE4-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
+; SSE4-NEXT:    pmovsxwd %xmm0, %xmm13
+; SSE4-NEXT:    pmovsxwd %xmm3, %xmm14
+; SSE4-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3]
+; SSE4-NEXT:    pmovsxwd %xmm0, %xmm15
+; SSE4-NEXT:    pmovsxwd %xmm4, %xmm0
+; SSE4-NEXT:    paddd %xmm8, %xmm0
+; SSE4-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[2,3,2,3]
+; SSE4-NEXT:    pmovsxwd %xmm1, %xmm4
+; SSE4-NEXT:    paddd %xmm9, %xmm4
+; SSE4-NEXT:    pmovsxwd %xmm5, %xmm1
+; SSE4-NEXT:    paddd %xmm10, %xmm1
+; SSE4-NEXT:    pshufd {{.*#+}} xmm2 = xmm5[2,3,2,3]
+; SSE4-NEXT:    pmovsxwd %xmm2, %xmm5
+; SSE4-NEXT:    paddd %xmm11, %xmm5
+; SSE4-NEXT:    pmovsxwd %xmm6, %xmm2
+; SSE4-NEXT:    paddd %xmm12, %xmm2
+; SSE4-NEXT:    pshufd {{.*#+}} xmm3 = xmm6[2,3,2,3]
+; SSE4-NEXT:    pmovsxwd %xmm3, %xmm6
+; SSE4-NEXT:    paddd %xmm13, %xmm6
+; SSE4-NEXT:    pmovsxwd %xmm7, %xmm3
+; SSE4-NEXT:    paddd %xmm14, %xmm3
+; SSE4-NEXT:    pshufd {{.*#+}} xmm7 = xmm7[2,3,2,3]
+; SSE4-NEXT:    pmovsxwd %xmm7, %xmm7
+; SSE4-NEXT:    paddd %xmm15, %xmm7
+; SSE4-NEXT:    pcmpeqd %xmm8, %xmm8
+; SSE4-NEXT:    psubd %xmm8, %xmm0
+; SSE4-NEXT:    psubd %xmm8, %xmm4
+; SSE4-NEXT:    psubd %xmm8, %xmm1
+; SSE4-NEXT:    psubd %xmm8, %xmm5
+; SSE4-NEXT:    psubd %xmm8, %xmm2
+; SSE4-NEXT:    psubd %xmm8, %xmm6
+; SSE4-NEXT:    psubd %xmm8, %xmm3
+; SSE4-NEXT:    psubd %xmm8, %xmm7
+; SSE4-NEXT:    psrld $1, %xmm7
+; SSE4-NEXT:    psrld $1, %xmm3
+; SSE4-NEXT:    psrld $1, %xmm6
+; SSE4-NEXT:    psrld $1, %xmm2
+; SSE4-NEXT:    psrld $1, %xmm5
+; SSE4-NEXT:    psrld $1, %xmm1
+; SSE4-NEXT:    psrld $1, %xmm4
+; SSE4-NEXT:    psrld $1, %xmm0
+; SSE4-NEXT:    pxor %xmm8, %xmm8
+; SSE4-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm8[1],xmm0[2],xmm8[3],xmm0[4],xmm8[5],xmm0[6],xmm8[7]
+; SSE4-NEXT:    pblendw {{.*#+}} xmm4 = xmm4[0],xmm8[1],xmm4[2],xmm8[3],xmm4[4],xmm8[5],xmm4[6],xmm8[7]
+; SSE4-NEXT:    packusdw %xmm4, %xmm0
+; SSE4-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0],xmm8[1],xmm1[2],xmm8[3],xmm1[4],xmm8[5],xmm1[6],xmm8[7]
+; SSE4-NEXT:    pblendw {{.*#+}} xmm5 = xmm5[0],xmm8[1],xmm5[2],xmm8[3],xmm5[4],xmm8[5],xmm5[6],xmm8[7]
+; SSE4-NEXT:    packusdw %xmm5, %xmm1
+; SSE4-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0],xmm8[1],xmm2[2],xmm8[3],xmm2[4],xmm8[5],xmm2[6],xmm8[7]
+; SSE4-NEXT:    pblendw {{.*#+}} xmm6 = xmm6[0],xmm8[1],xmm6[2],xmm8[3],xmm6[4],xmm8[5],xmm6[6],xmm8[7]
+; SSE4-NEXT:    packusdw %xmm6, %xmm2
+; SSE4-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0],xmm8[1],xmm3[2],xmm8[3],xmm3[4],xmm8[5],xmm3[6],xmm8[7]
+; SSE4-NEXT:    pblendw {{.*#+}} xmm7 = xmm7[0],xmm8[1],xmm7[2],xmm8[3],xmm7[4],xmm8[5],xmm7[6],xmm8[7]
+; SSE4-NEXT:    packusdw %xmm7, %xmm3
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: test_ext_v32i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT:    vpmovsxwd %xmm4, %xmm5
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
+; AVX1-NEXT:    vpmovsxwd %xmm4, %xmm4
+; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm6
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm7
+; AVX1-NEXT:    vpmovsxwd %xmm7, %xmm8
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm7 = xmm7[2,3,2,3]
+; AVX1-NEXT:    vpmovsxwd %xmm7, %xmm7
+; AVX1-NEXT:    vpmovsxwd %xmm1, %xmm9
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; AVX1-NEXT:    vpmovsxwd %xmm1, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm10
+; AVX1-NEXT:    vpmovsxwd %xmm10, %xmm11
+; AVX1-NEXT:    vpaddd %xmm5, %xmm11, %xmm5
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm10 = xmm10[2,3,2,3]
+; AVX1-NEXT:    vpmovsxwd %xmm10, %xmm10
+; AVX1-NEXT:    vpaddd %xmm4, %xmm10, %xmm4
+; AVX1-NEXT:    vpmovsxwd %xmm2, %xmm10
+; AVX1-NEXT:    vpaddd %xmm6, %xmm10, %xmm6
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
+; AVX1-NEXT:    vpmovsxwd %xmm2, %xmm2
+; AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm2
+; AVX1-NEXT:    vpmovsxwd %xmm2, %xmm10
+; AVX1-NEXT:    vpaddd %xmm10, %xmm8, %xmm8
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
+; AVX1-NEXT:    vpmovsxwd %xmm2, %xmm2
+; AVX1-NEXT:    vpaddd %xmm2, %xmm7, %xmm2
+; AVX1-NEXT:    vpmovsxwd %xmm3, %xmm7
+; AVX1-NEXT:    vpaddd %xmm7, %xmm9, %xmm7
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
+; AVX1-NEXT:    vpmovsxwd %xmm3, %xmm3
+; AVX1-NEXT:    vpaddd %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
+; AVX1-NEXT:    vpsubd %xmm3, %xmm5, %xmm5
+; AVX1-NEXT:    vpsubd %xmm3, %xmm4, %xmm4
+; AVX1-NEXT:    vpsubd %xmm3, %xmm6, %xmm6
+; AVX1-NEXT:    vpsubd %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpsubd %xmm3, %xmm8, %xmm8
+; AVX1-NEXT:    vpsubd %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpsubd %xmm3, %xmm7, %xmm7
+; AVX1-NEXT:    vpsubd %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrld $1, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrld $1, %xmm7, %xmm3
+; AVX1-NEXT:    vpsrld $1, %xmm2, %xmm2
+; AVX1-NEXT:    vpsrld $1, %xmm8, %xmm7
+; AVX1-NEXT:    vpsrld $1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrld $1, %xmm6, %xmm6
+; AVX1-NEXT:    vpsrld $1, %xmm4, %xmm4
+; AVX1-NEXT:    vpsrld $1, %xmm5, %xmm5
+; AVX1-NEXT:    vpxor %xmm8, %xmm8, %xmm8
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0],xmm8[1],xmm5[2],xmm8[3],xmm5[4],xmm8[5],xmm5[6],xmm8[7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0],xmm8[1],xmm4[2],xmm8[3],xmm4[4],xmm8[5],xmm4[6],xmm8[7]
+; AVX1-NEXT:    vpackusdw %xmm4, %xmm5, %xmm4
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm5 = xmm6[0],xmm8[1],xmm6[2],xmm8[3],xmm6[4],xmm8[5],xmm6[6],xmm8[7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm8[1],xmm0[2],xmm8[3],xmm0[4],xmm8[5],xmm0[6],xmm8[7]
+; AVX1-NEXT:    vpackusdw %xmm0, %xmm5, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm7[0],xmm8[1],xmm7[2],xmm8[3],xmm7[4],xmm8[5],xmm7[6],xmm8[7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm8[1],xmm2[2],xmm8[3],xmm2[4],xmm8[5],xmm2[6],xmm8[7]
+; AVX1-NEXT:    vpackusdw %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0],xmm8[1],xmm3[2],xmm8[3],xmm3[4],xmm8[5],xmm3[6],xmm8[7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm8[1],xmm1[2],xmm8[3],xmm1[4],xmm8[5],xmm1[6],xmm8[7]
+; AVX1-NEXT:    vpackusdw %xmm1, %xmm3, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_ext_v32i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm4
+; AVX2-NEXT:    vpmovsxwd %xmm4, %ymm4
+; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm5
+; AVX2-NEXT:    vpmovsxwd %xmm5, %ymm5
+; AVX2-NEXT:    vpmovsxwd %xmm1, %ymm1
+; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm6
+; AVX2-NEXT:    vpmovsxwd %xmm6, %ymm6
+; AVX2-NEXT:    vpaddd %ymm6, %ymm4, %ymm4
+; AVX2-NEXT:    vpmovsxwd %xmm2, %ymm2
+; AVX2-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm2
+; AVX2-NEXT:    vpmovsxwd %xmm2, %ymm2
+; AVX2-NEXT:    vpaddd %ymm2, %ymm5, %ymm2
+; AVX2-NEXT:    vpmovsxwd %xmm3, %ymm3
+; AVX2-NEXT:    vpaddd %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    vpcmpeqd %ymm3, %ymm3, %ymm3
+; AVX2-NEXT:    vpsubd %ymm3, %ymm4, %ymm4
+; AVX2-NEXT:    vpsubd %ymm3, %ymm0, %ymm0
+; AVX2-NEXT:    vpsubd %ymm3, %ymm2, %ymm2
+; AVX2-NEXT:    vpsubd %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    vpsrld $1, %ymm1, %ymm1
+; AVX2-NEXT:    vpsrld $1, %ymm2, %ymm2
+; AVX2-NEXT:    vpsrld $1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrld $1, %ymm4, %ymm3
+; AVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7],ymm3[8],ymm4[9],ymm3[10],ymm4[11],ymm3[12],ymm4[13],ymm3[14],ymm4[15]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2],ymm4[3],ymm0[4],ymm4[5],ymm0[6],ymm4[7],ymm0[8],ymm4[9],ymm0[10],ymm4[11],ymm0[12],ymm4[13],ymm0[14],ymm4[15]
+; AVX2-NEXT:    vpackusdw %ymm3, %ymm0, %ymm0
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2],ymm4[3],ymm2[4],ymm4[5],ymm2[6],ymm4[7],ymm2[8],ymm4[9],ymm2[10],ymm4[11],ymm2[12],ymm4[13],ymm2[14],ymm4[15]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2],ymm4[3],ymm1[4],ymm4[5],ymm1[6],ymm4[7],ymm1[8],ymm4[9],ymm1[10],ymm4[11],ymm1[12],ymm4[13],ymm1[14],ymm4[15]
+; AVX2-NEXT:    vpackusdw %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3]
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_ext_v32i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovsxwd %ymm0, %zmm2
+; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
+; AVX512-NEXT:    vpmovsxwd %ymm0, %zmm0
+; AVX512-NEXT:    vpmovsxwd %ymm1, %zmm3
+; AVX512-NEXT:    vpaddd %zmm3, %zmm2, %zmm2
+; AVX512-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
+; AVX512-NEXT:    vpmovsxwd %ymm1, %zmm1
+; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
+; AVX512-NEXT:    vpsubd %zmm1, %zmm2, %zmm2
+; AVX512-NEXT:    vpsubd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpsrld $1, %zmm0, %zmm0
+; AVX512-NEXT:    vpsrld $1, %zmm2, %zmm1
+; AVX512-NEXT:    vpmovdw %zmm1, %ymm1
+; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
+; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512-NEXT:    retq
+  %x0 = sext <32 x i16> %a0 to <32 x i32>
+  %x1 = sext <32 x i16> %a1 to <32 x i32>
+  %sum = add <32 x i32> %x0, %x1
+  %inc = add <32 x i32> %sum, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %shift = ashr <32 x i32> %inc, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %res = trunc <32 x i32> %shift to <32 x i16>
+  ret <32 x i16> %res
+}
+
+define <16 x i32> @test_fixed_v16i32(<16 x i32> %a0, <16 x i32> %a1) {
+; SSE-LABEL: test_fixed_v16i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa %xmm3, %xmm8
+; SSE-NEXT:    por %xmm7, %xmm3
+; SSE-NEXT:    movdqa %xmm2, %xmm9
+; SSE-NEXT:    por %xmm6, %xmm9
+; SSE-NEXT:    movdqa %xmm1, %xmm10
+; SSE-NEXT:    por %xmm5, %xmm10
+; SSE-NEXT:    movdqa %xmm0, %xmm11
+; SSE-NEXT:    por %xmm4, %xmm11
+; SSE-NEXT:    pxor %xmm0, %xmm4
+; SSE-NEXT:    pxor %xmm1, %xmm5
+; SSE-NEXT:    pxor %xmm2, %xmm6
+; SSE-NEXT:    pxor %xmm8, %xmm7
+; SSE-NEXT:    psrad $1, %xmm7
+; SSE-NEXT:    psubd %xmm7, %xmm3
+; SSE-NEXT:    psrad $1, %xmm6
+; SSE-NEXT:    psubd %xmm6, %xmm9
+; SSE-NEXT:    psrad $1, %xmm5
+; SSE-NEXT:    psubd %xmm5, %xmm10
+; SSE-NEXT:    psrad $1, %xmm4
+; SSE-NEXT:    psubd %xmm4, %xmm11
+; SSE-NEXT:    movdqa %xmm11, %xmm0
+; SSE-NEXT:    movdqa %xmm10, %xmm1
+; SSE-NEXT:    movdqa %xmm9, %xmm2
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: test_fixed_v16i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vorps %ymm3, %ymm1, %ymm4
+; AVX1-NEXT:    vorps %ymm2, %ymm0, %ymm5
+; AVX1-NEXT:    vxorps %ymm0, %ymm2, %ymm0
+; AVX1-NEXT:    vxorps %ymm1, %ymm3, %ymm1
+; AVX1-NEXT:    vpsrad $1, %xmm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vpsrad $1, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrad $1, %xmm0, %xmm3
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpsrad $1, %xmm0, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm5, %xmm6
+; AVX1-NEXT:    vpsubd %xmm0, %xmm6, %xmm0
+; AVX1-NEXT:    vpsubd %xmm3, %xmm5, %xmm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm3, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm4, %xmm3
+; AVX1-NEXT:    vpsubd %xmm1, %xmm3, %xmm1
+; AVX1-NEXT:    vpsubd %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_fixed_v16i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpor %ymm3, %ymm1, %ymm4
+; AVX2-NEXT:    vpor %ymm2, %ymm0, %ymm5
+; AVX2-NEXT:    vpxor %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vpxor %ymm1, %ymm3, %ymm1
+; AVX2-NEXT:    vpsrad $1, %ymm1, %ymm1
+; AVX2-NEXT:    vpsubd %ymm1, %ymm4, %ymm1
+; AVX2-NEXT:    vpsrad $1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsubd %ymm0, %ymm5, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_fixed_v16i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpord %zmm1, %zmm0, %zmm2
+; AVX512-NEXT:    vpxord %zmm0, %zmm1, %zmm0
+; AVX512-NEXT:    vpsrad $1, %zmm0, %zmm0
+; AVX512-NEXT:    vpsubd %zmm0, %zmm2, %zmm0
+; AVX512-NEXT:    retq
+  %or = or <16 x i32> %a0, %a1
+  %xor = xor <16 x i32> %a1, %a0
+  %shift = ashr <16 x i32> %xor, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %res = sub <16 x i32> %or, %shift
+  ret <16 x i32> %res
+}
+
+define <16 x i32> @test_ext_v16i32(<16 x i32> %a0, <16 x i32> %a1) {
+; SSE2-LABEL: test_ext_v16i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm13 = xmm0[2,3,2,3]
+; SSE2-NEXT:    pxor %xmm9, %xmm9
+; SSE2-NEXT:    pxor %xmm8, %xmm8
+; SSE2-NEXT:    pcmpgtd %xmm13, %xmm8
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm13 = xmm13[0],xmm8[0],xmm13[1],xmm8[1]
+; SSE2-NEXT:    pxor %xmm8, %xmm8
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm8
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm12 = xmm1[2,3,2,3]
+; SSE2-NEXT:    pxor %xmm8, %xmm8
+; SSE2-NEXT:    pcmpgtd %xmm12, %xmm8
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm12 = xmm12[0],xmm8[0],xmm12[1],xmm8[1]
+; SSE2-NEXT:    pxor %xmm8, %xmm8
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm8
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm11 = xmm2[2,3,2,3]
+; SSE2-NEXT:    pxor %xmm8, %xmm8
+; SSE2-NEXT:    pcmpgtd %xmm11, %xmm8
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm11 = xmm11[0],xmm8[0],xmm11[1],xmm8[1]
+; SSE2-NEXT:    pxor %xmm8, %xmm8
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm8
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm10 = xmm3[2,3,2,3]
+; SSE2-NEXT:    pxor %xmm8, %xmm8
+; SSE2-NEXT:    pcmpgtd %xmm10, %xmm8
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1]
+; SSE2-NEXT:    pxor %xmm8, %xmm8
+; SSE2-NEXT:    pcmpgtd %xmm3, %xmm8
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm4[2,3,2,3]
+; SSE2-NEXT:    pxor %xmm14, %xmm14
+; SSE2-NEXT:    pcmpgtd %xmm8, %xmm14
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm8 = xmm8[0],xmm14[0],xmm8[1],xmm14[1]
+; SSE2-NEXT:    paddq %xmm13, %xmm8
+; SSE2-NEXT:    pxor %xmm13, %xmm13
+; SSE2-NEXT:    pcmpgtd %xmm4, %xmm13
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1]
+; SSE2-NEXT:    paddq %xmm4, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm5[2,3,2,3]
+; SSE2-NEXT:    pxor %xmm13, %xmm13
+; SSE2-NEXT:    pcmpgtd %xmm4, %xmm13
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1]
+; SSE2-NEXT:    paddq %xmm12, %xmm4
+; SSE2-NEXT:    pxor %xmm12, %xmm12
+; SSE2-NEXT:    pcmpgtd %xmm5, %xmm12
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1]
+; SSE2-NEXT:    paddq %xmm5, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm6[2,3,2,3]
+; SSE2-NEXT:    pxor %xmm12, %xmm12
+; SSE2-NEXT:    pcmpgtd %xmm5, %xmm12
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1]
+; SSE2-NEXT:    paddq %xmm11, %xmm5
+; SSE2-NEXT:    pxor %xmm11, %xmm11
+; SSE2-NEXT:    pcmpgtd %xmm6, %xmm11
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1]
+; SSE2-NEXT:    paddq %xmm6, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm7[2,3,2,3]
+; SSE2-NEXT:    pxor %xmm11, %xmm11
+; SSE2-NEXT:    pcmpgtd %xmm6, %xmm11
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1]
+; SSE2-NEXT:    paddq %xmm10, %xmm6
+; SSE2-NEXT:    pcmpgtd %xmm7, %xmm9
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1]
+; SSE2-NEXT:    paddq %xmm7, %xmm3
+; SSE2-NEXT:    pcmpeqd %xmm7, %xmm7
+; SSE2-NEXT:    psubq %xmm7, %xmm8
+; SSE2-NEXT:    psubq %xmm7, %xmm0
+; SSE2-NEXT:    psubq %xmm7, %xmm4
+; SSE2-NEXT:    psubq %xmm7, %xmm1
+; SSE2-NEXT:    psubq %xmm7, %xmm5
+; SSE2-NEXT:    psubq %xmm7, %xmm2
+; SSE2-NEXT:    psubq %xmm7, %xmm6
+; SSE2-NEXT:    psubq %xmm7, %xmm3
+; SSE2-NEXT:    psrlq $1, %xmm3
+; SSE2-NEXT:    psrlq $1, %xmm6
+; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,2],xmm6[0,2]
+; SSE2-NEXT:    psrlq $1, %xmm2
+; SSE2-NEXT:    psrlq $1, %xmm5
+; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm5[0,2]
+; SSE2-NEXT:    psrlq $1, %xmm1
+; SSE2-NEXT:    psrlq $1, %xmm4
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm4[0,2]
+; SSE2-NEXT:    psrlq $1, %xmm0
+; SSE2-NEXT:    psrlq $1, %xmm8
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm8[0,2]
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_ext_v16i32:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    pshufd {{.*#+}} xmm8 = xmm0[2,3,2,3]
+; SSE4-NEXT:    pmovsxdq %xmm8, %xmm9
+; SSE4-NEXT:    pmovsxdq %xmm0, %xmm10
+; SSE4-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE4-NEXT:    pmovsxdq %xmm0, %xmm11
+; SSE4-NEXT:    pmovsxdq %xmm1, %xmm12
+; SSE4-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
+; SSE4-NEXT:    pmovsxdq %xmm0, %xmm13
+; SSE4-NEXT:    pmovsxdq %xmm2, %xmm14
+; SSE4-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3]
+; SSE4-NEXT:    pmovsxdq %xmm0, %xmm15
+; SSE4-NEXT:    pmovsxdq %xmm3, %xmm0
+; SSE4-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE4-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3]
+; SSE4-NEXT:    pmovsxdq %xmm0, %xmm8
+; SSE4-NEXT:    paddq %xmm9, %xmm8
+; SSE4-NEXT:    pmovsxdq %xmm4, %xmm0
+; SSE4-NEXT:    paddq %xmm10, %xmm0
+; SSE4-NEXT:    pshufd {{.*#+}} xmm1 = xmm5[2,3,2,3]
+; SSE4-NEXT:    pmovsxdq %xmm1, %xmm4
+; SSE4-NEXT:    paddq %xmm11, %xmm4
+; SSE4-NEXT:    pmovsxdq %xmm5, %xmm1
+; SSE4-NEXT:    paddq %xmm12, %xmm1
+; SSE4-NEXT:    pshufd {{.*#+}} xmm2 = xmm6[2,3,2,3]
+; SSE4-NEXT:    pmovsxdq %xmm2, %xmm5
+; SSE4-NEXT:    paddq %xmm13, %xmm5
+; SSE4-NEXT:    pmovsxdq %xmm6, %xmm2
+; SSE4-NEXT:    paddq %xmm14, %xmm2
+; SSE4-NEXT:    pshufd {{.*#+}} xmm3 = xmm7[2,3,2,3]
+; SSE4-NEXT:    pmovsxdq %xmm3, %xmm6
+; SSE4-NEXT:    paddq %xmm15, %xmm6
+; SSE4-NEXT:    pmovsxdq %xmm7, %xmm3
+; SSE4-NEXT:    paddq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
+; SSE4-NEXT:    pcmpeqd %xmm7, %xmm7
+; SSE4-NEXT:    psubq %xmm7, %xmm8
+; SSE4-NEXT:    psubq %xmm7, %xmm0
+; SSE4-NEXT:    psubq %xmm7, %xmm4
+; SSE4-NEXT:    psubq %xmm7, %xmm1
+; SSE4-NEXT:    psubq %xmm7, %xmm5
+; SSE4-NEXT:    psubq %xmm7, %xmm2
+; SSE4-NEXT:    psubq %xmm7, %xmm6
+; SSE4-NEXT:    psubq %xmm7, %xmm3
+; SSE4-NEXT:    psrlq $1, %xmm3
+; SSE4-NEXT:    psrlq $1, %xmm6
+; SSE4-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,2],xmm6[0,2]
+; SSE4-NEXT:    psrlq $1, %xmm2
+; SSE4-NEXT:    psrlq $1, %xmm5
+; SSE4-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm5[0,2]
+; SSE4-NEXT:    psrlq $1, %xmm1
+; SSE4-NEXT:    psrlq $1, %xmm4
+; SSE4-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm4[0,2]
+; SSE4-NEXT:    psrlq $1, %xmm0
+; SSE4-NEXT:    psrlq $1, %xmm8
+; SSE4-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm8[0,2]
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: test_ext_v16i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm4
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
+; AVX1-NEXT:    vpmovsxdq %xmm5, %xmm6
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[2,3,2,3]
+; AVX1-NEXT:    vpmovsxdq %xmm5, %xmm5
+; AVX1-NEXT:    vpmovsxdq %xmm1, %xmm7
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm8
+; AVX1-NEXT:    vpmovsxdq %xmm8, %xmm9
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; AVX1-NEXT:    vpmovsxdq %xmm1, %xmm1
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm8 = xmm8[2,3,2,3]
+; AVX1-NEXT:    vpmovsxdq %xmm8, %xmm8
+; AVX1-NEXT:    vpmovsxdq %xmm2, %xmm10
+; AVX1-NEXT:    vpaddq %xmm4, %xmm10, %xmm4
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm10
+; AVX1-NEXT:    vpmovsxdq %xmm10, %xmm11
+; AVX1-NEXT:    vpaddq %xmm6, %xmm11, %xmm6
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
+; AVX1-NEXT:    vpmovsxdq %xmm2, %xmm2
+; AVX1-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm10[2,3,2,3]
+; AVX1-NEXT:    vpmovsxdq %xmm2, %xmm2
+; AVX1-NEXT:    vpaddq %xmm2, %xmm5, %xmm2
+; AVX1-NEXT:    vpmovsxdq %xmm3, %xmm5
+; AVX1-NEXT:    vpaddq %xmm5, %xmm7, %xmm5
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm7
+; AVX1-NEXT:    vpmovsxdq %xmm7, %xmm10
+; AVX1-NEXT:    vpaddq %xmm10, %xmm9, %xmm9
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
+; AVX1-NEXT:    vpmovsxdq %xmm3, %xmm3
+; AVX1-NEXT:    vpaddq %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm7[2,3,2,3]
+; AVX1-NEXT:    vpmovsxdq %xmm3, %xmm3
+; AVX1-NEXT:    vpaddq %xmm3, %xmm8, %xmm3
+; AVX1-NEXT:    vpcmpeqd %xmm7, %xmm7, %xmm7
+; AVX1-NEXT:    vpsubq %xmm7, %xmm4, %xmm4
+; AVX1-NEXT:    vpsubq %xmm7, %xmm6, %xmm6
+; AVX1-NEXT:    vpsubq %xmm7, %xmm0, %xmm0
+; AVX1-NEXT:    vpsubq %xmm7, %xmm2, %xmm2
+; AVX1-NEXT:    vpsubq %xmm7, %xmm5, %xmm5
+; AVX1-NEXT:    vpsubq %xmm7, %xmm9, %xmm8
+; AVX1-NEXT:    vpsubq %xmm7, %xmm1, %xmm1
+; AVX1-NEXT:    vpsubq %xmm7, %xmm3, %xmm3
+; AVX1-NEXT:    vpsrlq $1, %xmm3, %xmm3
+; AVX1-NEXT:    vpsrlq $1, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrlq $1, %xmm8, %xmm7
+; AVX1-NEXT:    vpsrlq $1, %xmm5, %xmm5
+; AVX1-NEXT:    vpsrlq $1, %xmm2, %xmm2
+; AVX1-NEXT:    vpsrlq $1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlq $1, %xmm6, %xmm6
+; AVX1-NEXT:    vpsrlq $1, %xmm4, %xmm4
+; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm4, %ymm4
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm4[0,2],ymm0[0,2],ymm4[4,6],ymm0[4,6]
+; AVX1-NEXT:    vinsertf128 $1, %xmm7, %ymm5, %ymm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX1-NEXT:    vshufps {{.*#+}} ymm1 = ymm2[0,2],ymm1[0,2],ymm2[4,6],ymm1[4,6]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_ext_v16i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmovsxdq %xmm1, %ymm4
+; AVX2-NEXT:    vpmovsxdq %xmm0, %ymm5
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT:    vpmovsxdq %xmm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm1
+; AVX2-NEXT:    vpmovsxdq %xmm1, %ymm1
+; AVX2-NEXT:    vpmovsxdq %xmm3, %ymm6
+; AVX2-NEXT:    vpaddq %ymm6, %ymm4, %ymm4
+; AVX2-NEXT:    vpmovsxdq %xmm2, %ymm6
+; AVX2-NEXT:    vpaddq %ymm6, %ymm5, %ymm5
+; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm2
+; AVX2-NEXT:    vpmovsxdq %xmm2, %ymm2
+; AVX2-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm2
+; AVX2-NEXT:    vpmovsxdq %xmm2, %ymm2
+; AVX2-NEXT:    vpaddq %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
+; AVX2-NEXT:    vpsubq %ymm2, %ymm4, %ymm3
+; AVX2-NEXT:    vpsubq %ymm2, %ymm5, %ymm4
+; AVX2-NEXT:    vpsubq %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpsubq %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm4[2,3],ymm0[2,3]
+; AVX2-NEXT:    vpsrlq $1, %ymm2, %ymm2
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm4, %ymm0
+; AVX2-NEXT:    vpsrlq $1, %ymm0, %ymm0
+; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
+; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm3[2,3],ymm1[2,3]
+; AVX2-NEXT:    vpsrlq $1, %ymm2, %ymm2
+; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm3, %ymm1
+; AVX2-NEXT:    vpsrlq $1, %ymm1, %ymm1
+; AVX2-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,2],ymm2[0,2],ymm1[4,6],ymm2[4,6]
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_ext_v16i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovsxdq %ymm0, %zmm2
+; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
+; AVX512-NEXT:    vpmovsxdq %ymm0, %zmm0
+; AVX512-NEXT:    vpmovsxdq %ymm1, %zmm3
+; AVX512-NEXT:    vpaddq %zmm3, %zmm2, %zmm2
+; AVX512-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
+; AVX512-NEXT:    vpmovsxdq %ymm1, %zmm1
+; AVX512-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
+; AVX512-NEXT:    vpsubq %zmm1, %zmm2, %zmm2
+; AVX512-NEXT:    vpsubq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpsrlq $1, %zmm0, %zmm0
+; AVX512-NEXT:    vpsrlq $1, %zmm2, %zmm1
+; AVX512-NEXT:    vpmovqd %zmm1, %ymm1
+; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
+; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512-NEXT:    retq
+  %x0 = sext <16 x i32> %a0 to <16 x i64>
+  %x1 = sext <16 x i32> %a1 to <16 x i64>
+  %sum = add <16 x i64> %x0, %x1
+  %inc = add <16 x i64> %sum, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
+  %shift = ashr <16 x i64> %inc, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
+  %res = trunc <16 x i64> %shift to <16 x i32>
+  ret <16 x i32> %res
+}
+
+define <8 x i64> @test_fixed_v8i64(<8 x i64> %a0, <8 x i64> %a1) {
+; SSE2-LABEL: test_fixed_v8i64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa %xmm3, %xmm8
+; SSE2-NEXT:    movdqa %xmm2, %xmm9
+; SSE2-NEXT:    movdqa %xmm1, %xmm10
+; SSE2-NEXT:    movdqa %xmm0, %xmm11
+; SSE2-NEXT:    por %xmm7, %xmm3
+; SSE2-NEXT:    por %xmm6, %xmm2
+; SSE2-NEXT:    por %xmm5, %xmm1
+; SSE2-NEXT:    por %xmm4, %xmm0
+; SSE2-NEXT:    pxor %xmm11, %xmm4
+; SSE2-NEXT:    pxor %xmm10, %xmm5
+; SSE2-NEXT:    pxor %xmm9, %xmm6
+; SSE2-NEXT:    pxor %xmm8, %xmm7
+; SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm7[1,3,2,3]
+; SSE2-NEXT:    psrad $1, %xmm8
+; SSE2-NEXT:    psrlq $1, %xmm7
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm7[0,2,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1]
+; SSE2-NEXT:    psubq %xmm7, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[1,3,2,3]
+; SSE2-NEXT:    psrad $1, %xmm7
+; SSE2-NEXT:    psrlq $1, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
+; SSE2-NEXT:    psubq %xmm6, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[1,3,2,3]
+; SSE2-NEXT:    psrad $1, %xmm6
+; SSE2-NEXT:    psrlq $1, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
+; SSE2-NEXT:    psubq %xmm5, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[1,3,2,3]
+; SSE2-NEXT:    psrad $1, %xmm5
+; SSE2-NEXT:    psrlq $1, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
+; SSE2-NEXT:    psubq %xmm4, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_fixed_v8i64:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    movdqa %xmm3, %xmm8
+; SSE4-NEXT:    movdqa %xmm2, %xmm9
+; SSE4-NEXT:    movdqa %xmm1, %xmm10
+; SSE4-NEXT:    movdqa %xmm0, %xmm11
+; SSE4-NEXT:    por %xmm7, %xmm3
+; SSE4-NEXT:    por %xmm6, %xmm2
+; SSE4-NEXT:    por %xmm5, %xmm1
+; SSE4-NEXT:    por %xmm4, %xmm0
+; SSE4-NEXT:    pxor %xmm11, %xmm4
+; SSE4-NEXT:    pxor %xmm10, %xmm5
+; SSE4-NEXT:    pxor %xmm9, %xmm6
+; SSE4-NEXT:    pxor %xmm8, %xmm7
+; SSE4-NEXT:    movdqa %xmm7, %xmm8
+; SSE4-NEXT:    psrad $1, %xmm8
+; SSE4-NEXT:    psrlq $1, %xmm7
+; SSE4-NEXT:    pblendw {{.*#+}} xmm7 = xmm7[0,1],xmm8[2,3],xmm7[4,5],xmm8[6,7]
+; SSE4-NEXT:    psubq %xmm7, %xmm3
+; SSE4-NEXT:    movdqa %xmm6, %xmm7
+; SSE4-NEXT:    psrad $1, %xmm7
+; SSE4-NEXT:    psrlq $1, %xmm6
+; SSE4-NEXT:    pblendw {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3],xmm6[4,5],xmm7[6,7]
+; SSE4-NEXT:    psubq %xmm6, %xmm2
+; SSE4-NEXT:    movdqa %xmm5, %xmm6
+; SSE4-NEXT:    psrad $1, %xmm6
+; SSE4-NEXT:    psrlq $1, %xmm5
+; SSE4-NEXT:    pblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3],xmm5[4,5],xmm6[6,7]
+; SSE4-NEXT:    psubq %xmm5, %xmm1
+; SSE4-NEXT:    movdqa %xmm4, %xmm5
+; SSE4-NEXT:    psrad $1, %xmm5
+; SSE4-NEXT:    psrlq $1, %xmm4
+; SSE4-NEXT:    pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3],xmm4[4,5],xmm5[6,7]
+; SSE4-NEXT:    psubq %xmm4, %xmm0
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: test_fixed_v8i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vorps %ymm3, %ymm1, %ymm4
+; AVX1-NEXT:    vorps %ymm2, %ymm0, %ymm5
+; AVX1-NEXT:    vxorps %ymm0, %ymm2, %ymm0
+; AVX1-NEXT:    vxorps %ymm1, %ymm3, %ymm1
+; AVX1-NEXT:    vpsrad $1, %xmm1, %xmm2
+; AVX1-NEXT:    vpsrlq $1, %xmm1, %xmm3
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vpsrad $1, %xmm1, %xmm3
+; AVX1-NEXT:    vpsrlq $1, %xmm1, %xmm1
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
+; AVX1-NEXT:    vpsrad $1, %xmm0, %xmm3
+; AVX1-NEXT:    vpsrlq $1, %xmm0, %xmm6
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm6[0,1],xmm3[2,3],xmm6[4,5],xmm3[6,7]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpsrad $1, %xmm0, %xmm6
+; AVX1-NEXT:    vpsrlq $1, %xmm0, %xmm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3],xmm0[4,5],xmm6[6,7]
+; AVX1-NEXT:    vextractf128 $1, %ymm5, %xmm6
+; AVX1-NEXT:    vpsubq %xmm0, %xmm6, %xmm0
+; AVX1-NEXT:    vpsubq %xmm3, %xmm5, %xmm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm3, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm4, %xmm3
+; AVX1-NEXT:    vpsubq %xmm1, %xmm3, %xmm1
+; AVX1-NEXT:    vpsubq %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_fixed_v8i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpor %ymm3, %ymm1, %ymm4
+; AVX2-NEXT:    vpor %ymm2, %ymm0, %ymm5
+; AVX2-NEXT:    vpxor %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vpxor %ymm1, %ymm3, %ymm1
+; AVX2-NEXT:    vpsrad $1, %ymm1, %ymm2
+; AVX2-NEXT:    vpsrlq $1, %ymm1, %ymm1
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
+; AVX2-NEXT:    vpsubq %ymm1, %ymm4, %ymm1
+; AVX2-NEXT:    vpsrad $1, %ymm0, %ymm2
+; AVX2-NEXT:    vpsrlq $1, %ymm0, %ymm0
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7]
+; AVX2-NEXT:    vpsubq %ymm0, %ymm5, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_fixed_v8i64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vporq %zmm1, %zmm0, %zmm2
+; AVX512-NEXT:    vpxorq %zmm0, %zmm1, %zmm0
+; AVX512-NEXT:    vpsraq $1, %zmm0, %zmm0
+; AVX512-NEXT:    vpsubq %zmm0, %zmm2, %zmm0
+; AVX512-NEXT:    retq
+  %or = or <8 x i64> %a0, %a1
+  %xor = xor <8 x i64> %a1, %a0
+  %shift = ashr <8 x i64> %xor, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
+  %res = sub <8 x i64> %or, %shift
+  ret <8 x i64> %res
+}
+
+define <8 x i64> @test_ext_v8i64(<8 x i64> %a0, <8 x i64> %a1) {
+; SSE2-LABEL: test_ext_v8i64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pushq %rbp
+; SSE2-NEXT:    .cfi_def_cfa_offset 16
+; SSE2-NEXT:    pushq %r15
+; SSE2-NEXT:    .cfi_def_cfa_offset 24
+; SSE2-NEXT:    pushq %r14
+; SSE2-NEXT:    .cfi_def_cfa_offset 32
+; SSE2-NEXT:    pushq %r13
+; SSE2-NEXT:    .cfi_def_cfa_offset 40
+; SSE2-NEXT:    pushq %r12
+; SSE2-NEXT:    .cfi_def_cfa_offset 48
+; SSE2-NEXT:    pushq %rbx
+; SSE2-NEXT:    .cfi_def_cfa_offset 56
+; SSE2-NEXT:    pushq %rax
+; SSE2-NEXT:    .cfi_def_cfa_offset 64
+; SSE2-NEXT:    .cfi_offset %rbx, -56
+; SSE2-NEXT:    .cfi_offset %r12, -48
+; SSE2-NEXT:    .cfi_offset %r13, -40
+; SSE2-NEXT:    .cfi_offset %r14, -32
+; SSE2-NEXT:    .cfi_offset %r15, -24
+; SSE2-NEXT:    .cfi_offset %rbp, -16
+; SSE2-NEXT:    movq %xmm0, %rax
+; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT:    sarq $63, %rax
+; SSE2-NEXT:    movq %rax, (%rsp) # 8-byte Spill
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; SSE2-NEXT:    movq %xmm0, %rax
+; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT:    movq %rax, %rcx
+; SSE2-NEXT:    sarq $63, %rcx
+; SSE2-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT:    movq %xmm1, %rax
+; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT:    movq %rax, %rcx
+; SSE2-NEXT:    sarq $63, %rcx
+; SSE2-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE2-NEXT:    movq %xmm0, %rax
+; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT:    movq %rax, %rcx
+; SSE2-NEXT:    sarq $63, %rcx
+; SSE2-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT:    movq %xmm2, %rax
+; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT:    movq %rax, %rcx
+; SSE2-NEXT:    sarq $63, %rcx
+; SSE2-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
+; SSE2-NEXT:    movq %xmm0, %rax
+; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT:    movq %rax, %rcx
+; SSE2-NEXT:    sarq $63, %rcx
+; SSE2-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT:    movq %xmm3, %rbx
+; SSE2-NEXT:    movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT:    sarq $63, %rbx
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3]
+; SSE2-NEXT:    movq %xmm0, %rdi
+; SSE2-NEXT:    movq %rdi, %rbp
+; SSE2-NEXT:    sarq $63, %rbp
+; SSE2-NEXT:    movq %xmm4, %r8
+; SSE2-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT:    sarq $63, %r8
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3]
+; SSE2-NEXT:    movq %xmm0, %r11
+; SSE2-NEXT:    movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT:    sarq $63, %r11
+; SSE2-NEXT:    movq %xmm5, %r10
+; SSE2-NEXT:    movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT:    sarq $63, %r10
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[2,3,2,3]
+; SSE2-NEXT:    movq %xmm0, %r15
+; SSE2-NEXT:    movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT:    sarq $63, %r15
+; SSE2-NEXT:    movq %xmm6, %r9
+; SSE2-NEXT:    movq %r9, %r14
+; SSE2-NEXT:    sarq $63, %r14
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[2,3,2,3]
+; SSE2-NEXT:    movq %xmm0, %rsi
+; SSE2-NEXT:    movq %rsi, %r13
+; SSE2-NEXT:    sarq $63, %r13
+; SSE2-NEXT:    movq %xmm7, %rdx
+; SSE2-NEXT:    movq %rdx, %r12
+; SSE2-NEXT:    sarq $63, %r12
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm7[2,3,2,3]
+; SSE2-NEXT:    movq %xmm0, %rax
+; SSE2-NEXT:    movq %rax, %rcx
+; SSE2-NEXT:    sarq $63, %rcx
+; SSE2-NEXT:    addq %rax, %rdi
+; SSE2-NEXT:    adcq %rbp, %rcx
+; SSE2-NEXT:    addq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; SSE2-NEXT:    adcq %rbx, %r12
+; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
+; SSE2-NEXT:    addq %rsi, %rbp
+; SSE2-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
+; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; SSE2-NEXT:    addq %r9, %rbx
+; SSE2-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload
+; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; SSE2-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload
+; SSE2-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload
+; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; SSE2-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
+; SSE2-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Folded Reload
+; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; SSE2-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload
+; SSE2-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload
+; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; SSE2-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
+; SSE2-NEXT:    adcq (%rsp), %r8 # 8-byte Folded Reload
+; SSE2-NEXT:    addq $1, %rax
+; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT:    adcq $0, %r8
+; SSE2-NEXT:    addq $1, %rdx
+; SSE2-NEXT:    adcq $0, %r11
+; SSE2-NEXT:    addq $1, %rsi
+; SSE2-NEXT:    adcq $0, %r10
+; SSE2-NEXT:    addq $1, %r9
+; SSE2-NEXT:    adcq $0, %r15
+; SSE2-NEXT:    addq $1, %rbx
+; SSE2-NEXT:    adcq $0, %r14
+; SSE2-NEXT:    addq $1, %rbp
+; SSE2-NEXT:    adcq $0, %r13
+; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; SSE2-NEXT:    addq $1, %rax
+; SSE2-NEXT:    adcq $0, %r12
+; SSE2-NEXT:    addq $1, %rdi
+; SSE2-NEXT:    adcq $0, %rcx
+; SSE2-NEXT:    shldq $63, %rdi, %rcx
+; SSE2-NEXT:    shldq $63, %rax, %r12
+; SSE2-NEXT:    shldq $63, %rbp, %r13
+; SSE2-NEXT:    shldq $63, %rbx, %r14
+; SSE2-NEXT:    shldq $63, %r9, %r15
+; SSE2-NEXT:    shldq $63, %rsi, %r10
+; SSE2-NEXT:    shldq $63, %rdx, %r11
+; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; SSE2-NEXT:    shldq $63, %rax, %r8
+; SSE2-NEXT:    movq %r8, %xmm0
+; SSE2-NEXT:    movq %r11, %xmm4
+; SSE2-NEXT:    movq %r10, %xmm1
+; SSE2-NEXT:    movq %r15, %xmm5
+; SSE2-NEXT:    movq %r14, %xmm2
+; SSE2-NEXT:    movq %r13, %xmm6
+; SSE2-NEXT:    movq %r12, %xmm3
+; SSE2-NEXT:    movq %rcx, %xmm7
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0]
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm6[0]
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm7[0]
+; SSE2-NEXT:    addq $8, %rsp
+; SSE2-NEXT:    .cfi_def_cfa_offset 56
+; SSE2-NEXT:    popq %rbx
+; SSE2-NEXT:    .cfi_def_cfa_offset 48
+; SSE2-NEXT:    popq %r12
+; SSE2-NEXT:    .cfi_def_cfa_offset 40
+; SSE2-NEXT:    popq %r13
+; SSE2-NEXT:    .cfi_def_cfa_offset 32
+; SSE2-NEXT:    popq %r14
+; SSE2-NEXT:    .cfi_def_cfa_offset 24
+; SSE2-NEXT:    popq %r15
+; SSE2-NEXT:    .cfi_def_cfa_offset 16
+; SSE2-NEXT:    popq %rbp
+; SSE2-NEXT:    .cfi_def_cfa_offset 8
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_ext_v8i64:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    pushq %rbp
+; SSE4-NEXT:    .cfi_def_cfa_offset 16
+; SSE4-NEXT:    pushq %r15
+; SSE4-NEXT:    .cfi_def_cfa_offset 24
+; SSE4-NEXT:    pushq %r14
+; SSE4-NEXT:    .cfi_def_cfa_offset 32
+; SSE4-NEXT:    pushq %r13
+; SSE4-NEXT:    .cfi_def_cfa_offset 40
+; SSE4-NEXT:    pushq %r12
+; SSE4-NEXT:    .cfi_def_cfa_offset 48
+; SSE4-NEXT:    pushq %rbx
+; SSE4-NEXT:    .cfi_def_cfa_offset 56
+; SSE4-NEXT:    subq $16, %rsp
+; SSE4-NEXT:    .cfi_def_cfa_offset 72
+; SSE4-NEXT:    .cfi_offset %rbx, -56
+; SSE4-NEXT:    .cfi_offset %r12, -48
+; SSE4-NEXT:    .cfi_offset %r13, -40
+; SSE4-NEXT:    .cfi_offset %r14, -32
+; SSE4-NEXT:    .cfi_offset %r15, -24
+; SSE4-NEXT:    .cfi_offset %rbp, -16
+; SSE4-NEXT:    pextrq $1, %xmm0, %rax
+; SSE4-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE4-NEXT:    sarq $63, %rax
+; SSE4-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE4-NEXT:    movq %xmm0, %rax
+; SSE4-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE4-NEXT:    movq %rax, %rcx
+; SSE4-NEXT:    sarq $63, %rcx
+; SSE4-NEXT:    movq %rcx, (%rsp) # 8-byte Spill
+; SSE4-NEXT:    pextrq $1, %xmm1, %rax
+; SSE4-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE4-NEXT:    movq %rax, %rcx
+; SSE4-NEXT:    sarq $63, %rcx
+; SSE4-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE4-NEXT:    movq %xmm1, %rax
+; SSE4-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE4-NEXT:    movq %rax, %rcx
+; SSE4-NEXT:    sarq $63, %rcx
+; SSE4-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE4-NEXT:    pextrq $1, %xmm2, %rax
+; SSE4-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE4-NEXT:    movq %rax, %rcx
+; SSE4-NEXT:    sarq $63, %rcx
+; SSE4-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE4-NEXT:    movq %xmm2, %rax
+; SSE4-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE4-NEXT:    movq %rax, %rcx
+; SSE4-NEXT:    sarq $63, %rcx
+; SSE4-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE4-NEXT:    pextrq $1, %xmm3, %r13
+; SSE4-NEXT:    movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE4-NEXT:    sarq $63, %r13
+; SSE4-NEXT:    movq %xmm3, %rax
+; SSE4-NEXT:    movq %rax, %rsi
+; SSE4-NEXT:    movq %rax, %rcx
+; SSE4-NEXT:    sarq $63, %rsi
+; SSE4-NEXT:    pextrq $1, %xmm4, %rax
+; SSE4-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE4-NEXT:    sarq $63, %rax
+; SSE4-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE4-NEXT:    movq %xmm4, %r11
+; SSE4-NEXT:    movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE4-NEXT:    sarq $63, %r11
+; SSE4-NEXT:    pextrq $1, %xmm5, %r10
+; SSE4-NEXT:    movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE4-NEXT:    sarq $63, %r10
+; SSE4-NEXT:    movq %xmm5, %rax
+; SSE4-NEXT:    movq %rax, %r14
+; SSE4-NEXT:    sarq $63, %r14
+; SSE4-NEXT:    pextrq $1, %xmm6, %rdi
+; SSE4-NEXT:    movq %rdi, %rbx
+; SSE4-NEXT:    sarq $63, %rbx
+; SSE4-NEXT:    movq %xmm6, %rdx
+; SSE4-NEXT:    movq %rdx, %r12
+; SSE4-NEXT:    sarq $63, %r12
+; SSE4-NEXT:    pextrq $1, %xmm7, %r15
+; SSE4-NEXT:    movq %r15, %r9
+; SSE4-NEXT:    sarq $63, %r9
+; SSE4-NEXT:    movq %xmm7, %rbp
+; SSE4-NEXT:    movq %rbp, %r8
+; SSE4-NEXT:    sarq $63, %r8
+; SSE4-NEXT:    addq %rbp, %rcx
+; SSE4-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE4-NEXT:    adcq %rsi, %r8
+; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; SSE4-NEXT:    addq %r15, %rcx
+; SSE4-NEXT:    adcq %r13, %r9
+; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
+; SSE4-NEXT:    addq %rdx, %rbp
+; SSE4-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload
+; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
+; SSE4-NEXT:    addq %rdi, %r13
+; SSE4-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload
+; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; SSE4-NEXT:    addq %rax, %r15
+; SSE4-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload
+; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; SSE4-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Folded Reload
+; SSE4-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Folded Reload
+; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; SSE4-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
+; SSE4-NEXT:    adcq (%rsp), %r11 # 8-byte Folded Reload
+; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; SSE4-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload
+; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; SSE4-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
+; SSE4-NEXT:    addq $1, %rdx
+; SSE4-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE4-NEXT:    adcq $0, %rax
+; SSE4-NEXT:    movq %rax, %rdx
+; SSE4-NEXT:    addq $1, %rsi
+; SSE4-NEXT:    adcq $0, %r11
+; SSE4-NEXT:    addq $1, %rdi
+; SSE4-NEXT:    adcq $0, %r10
+; SSE4-NEXT:    addq $1, %r15
+; SSE4-NEXT:    adcq $0, %r14
+; SSE4-NEXT:    addq $1, %r13
+; SSE4-NEXT:    adcq $0, %rbx
+; SSE4-NEXT:    addq $1, %rbp
+; SSE4-NEXT:    adcq $0, %r12
+; SSE4-NEXT:    addq $1, %rcx
+; SSE4-NEXT:    adcq $0, %r9
+; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; SSE4-NEXT:    addq $1, %rax
+; SSE4-NEXT:    adcq $0, %r8
+; SSE4-NEXT:    shldq $63, %rax, %r8
+; SSE4-NEXT:    shldq $63, %rcx, %r9
+; SSE4-NEXT:    shldq $63, %rbp, %r12
+; SSE4-NEXT:    shldq $63, %r13, %rbx
+; SSE4-NEXT:    shldq $63, %r15, %r14
+; SSE4-NEXT:    shldq $63, %rdi, %r10
+; SSE4-NEXT:    shldq $63, %rsi, %r11
+; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; SSE4-NEXT:    shldq $63, %rcx, %rdx
+; SSE4-NEXT:    movq %rdx, %xmm4
+; SSE4-NEXT:    movq %r11, %xmm0
+; SSE4-NEXT:    movq %r10, %xmm5
+; SSE4-NEXT:    movq %r14, %xmm1
+; SSE4-NEXT:    movq %rbx, %xmm6
+; SSE4-NEXT:    movq %r12, %xmm2
+; SSE4-NEXT:    movq %r9, %xmm7
+; SSE4-NEXT:    movq %r8, %xmm3
+; SSE4-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
+; SSE4-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0]
+; SSE4-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm6[0]
+; SSE4-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm7[0]
+; SSE4-NEXT:    addq $16, %rsp
+; SSE4-NEXT:    .cfi_def_cfa_offset 56
+; SSE4-NEXT:    popq %rbx
+; SSE4-NEXT:    .cfi_def_cfa_offset 48
+; SSE4-NEXT:    popq %r12
+; SSE4-NEXT:    .cfi_def_cfa_offset 40
+; SSE4-NEXT:    popq %r13
+; SSE4-NEXT:    .cfi_def_cfa_offset 32
+; SSE4-NEXT:    popq %r14
+; SSE4-NEXT:    .cfi_def_cfa_offset 24
+; SSE4-NEXT:    popq %r15
+; SSE4-NEXT:    .cfi_def_cfa_offset 16
+; SSE4-NEXT:    popq %rbp
+; SSE4-NEXT:    .cfi_def_cfa_offset 8
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: test_ext_v8i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    pushq %rbp
+; AVX1-NEXT:    .cfi_def_cfa_offset 16
+; AVX1-NEXT:    pushq %r15
+; AVX1-NEXT:    .cfi_def_cfa_offset 24
+; AVX1-NEXT:    pushq %r14
+; AVX1-NEXT:    .cfi_def_cfa_offset 32
+; AVX1-NEXT:    pushq %r13
+; AVX1-NEXT:    .cfi_def_cfa_offset 40
+; AVX1-NEXT:    pushq %r12
+; AVX1-NEXT:    .cfi_def_cfa_offset 48
+; AVX1-NEXT:    pushq %rbx
+; AVX1-NEXT:    .cfi_def_cfa_offset 56
+; AVX1-NEXT:    pushq %rax
+; AVX1-NEXT:    .cfi_def_cfa_offset 64
+; AVX1-NEXT:    .cfi_offset %rbx, -56
+; AVX1-NEXT:    .cfi_offset %r12, -48
+; AVX1-NEXT:    .cfi_offset %r13, -40
+; AVX1-NEXT:    .cfi_offset %r14, -32
+; AVX1-NEXT:    .cfi_offset %r15, -24
+; AVX1-NEXT:    .cfi_offset %rbp, -16
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT:    vpextrq $1, %xmm4, %rax
+; AVX1-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT:    sarq $63, %rax
+; AVX1-NEXT:    movq %rax, (%rsp) # 8-byte Spill
+; AVX1-NEXT:    vmovq %xmm4, %rax
+; AVX1-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT:    movq %rax, %rcx
+; AVX1-NEXT:    sarq $63, %rcx
+; AVX1-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX1-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT:    movq %rax, %rcx
+; AVX1-NEXT:    sarq $63, %rcx
+; AVX1-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT:    vmovq %xmm0, %rax
+; AVX1-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT:    movq %rax, %rcx
+; AVX1-NEXT:    sarq $63, %rcx
+; AVX1-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm0
+; AVX1-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX1-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT:    movq %rax, %rcx
+; AVX1-NEXT:    sarq $63, %rcx
+; AVX1-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT:    vmovq %xmm0, %rax
+; AVX1-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT:    movq %rax, %rcx
+; AVX1-NEXT:    sarq $63, %rcx
+; AVX1-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT:    vpextrq $1, %xmm1, %rbx
+; AVX1-NEXT:    movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT:    sarq $63, %rbx
+; AVX1-NEXT:    vmovq %xmm1, %r8
+; AVX1-NEXT:    movq %r8, %rbp
+; AVX1-NEXT:    sarq $63, %rbp
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm0
+; AVX1-NEXT:    vpextrq $1, %xmm0, %r9
+; AVX1-NEXT:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT:    sarq $63, %r9
+; AVX1-NEXT:    vmovq %xmm0, %r10
+; AVX1-NEXT:    movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT:    sarq $63, %r10
+; AVX1-NEXT:    vpextrq $1, %xmm2, %r11
+; AVX1-NEXT:    movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT:    sarq $63, %r11
+; AVX1-NEXT:    vmovq %xmm2, %r15
+; AVX1-NEXT:    movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT:    sarq $63, %r15
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm0
+; AVX1-NEXT:    vpextrq $1, %xmm0, %rdi
+; AVX1-NEXT:    movq %rdi, %r14
+; AVX1-NEXT:    sarq $63, %r14
+; AVX1-NEXT:    vmovq %xmm0, %rsi
+; AVX1-NEXT:    movq %rsi, %r12
+; AVX1-NEXT:    sarq $63, %r12
+; AVX1-NEXT:    vpextrq $1, %xmm3, %r13
+; AVX1-NEXT:    movq %r13, %rdx
+; AVX1-NEXT:    sarq $63, %rdx
+; AVX1-NEXT:    vmovq %xmm3, %rax
+; AVX1-NEXT:    movq %rax, %rcx
+; AVX1-NEXT:    sarq $63, %rcx
+; AVX1-NEXT:    addq %rax, %r8
+; AVX1-NEXT:    adcq %rbp, %rcx
+; AVX1-NEXT:    addq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX1-NEXT:    adcq %rbx, %rdx
+; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
+; AVX1-NEXT:    addq %rsi, %rbp
+; AVX1-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload
+; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
+; AVX1-NEXT:    addq %rdi, %r13
+; AVX1-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload
+; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; AVX1-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload
+; AVX1-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload
+; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; AVX1-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Folded Reload
+; AVX1-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload
+; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; AVX1-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
+; AVX1-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Folded Reload
+; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX1-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
+; AVX1-NEXT:    adcq (%rsp), %r9 # 8-byte Folded Reload
+; AVX1-NEXT:    addq $1, %rax
+; AVX1-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT:    adcq $0, %r9
+; AVX1-NEXT:    addq $1, %rsi
+; AVX1-NEXT:    adcq $0, %r10
+; AVX1-NEXT:    addq $1, %rdi
+; AVX1-NEXT:    adcq $0, %r11
+; AVX1-NEXT:    addq $1, %rbx
+; AVX1-NEXT:    adcq $0, %r15
+; AVX1-NEXT:    addq $1, %r13
+; AVX1-NEXT:    adcq $0, %r14
+; AVX1-NEXT:    addq $1, %rbp
+; AVX1-NEXT:    adcq $0, %r12
+; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX1-NEXT:    addq $1, %rax
+; AVX1-NEXT:    adcq $0, %rdx
+; AVX1-NEXT:    addq $1, %r8
+; AVX1-NEXT:    adcq $0, %rcx
+; AVX1-NEXT:    shldq $63, %r8, %rcx
+; AVX1-NEXT:    shldq $63, %rax, %rdx
+; AVX1-NEXT:    shldq $63, %rbp, %r12
+; AVX1-NEXT:    shldq $63, %r13, %r14
+; AVX1-NEXT:    shldq $63, %rbx, %r15
+; AVX1-NEXT:    shldq $63, %rdi, %r11
+; AVX1-NEXT:    shldq $63, %rsi, %r10
+; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX1-NEXT:    shldq $63, %rax, %r9
+; AVX1-NEXT:    vmovq %r9, %xmm0
+; AVX1-NEXT:    vmovq %r10, %xmm1
+; AVX1-NEXT:    vmovq %r11, %xmm2
+; AVX1-NEXT:    vmovq %r15, %xmm3
+; AVX1-NEXT:    vmovq %r14, %xmm4
+; AVX1-NEXT:    vmovq %r12, %xmm5
+; AVX1-NEXT:    vmovq %rdx, %xmm6
+; AVX1-NEXT:    vmovq %rcx, %xmm7
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm5[0],xmm4[0]
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm7[0],xmm6[0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT:    addq $8, %rsp
+; AVX1-NEXT:    .cfi_def_cfa_offset 56
+; AVX1-NEXT:    popq %rbx
+; AVX1-NEXT:    .cfi_def_cfa_offset 48
+; AVX1-NEXT:    popq %r12
+; AVX1-NEXT:    .cfi_def_cfa_offset 40
+; AVX1-NEXT:    popq %r13
+; AVX1-NEXT:    .cfi_def_cfa_offset 32
+; AVX1-NEXT:    popq %r14
+; AVX1-NEXT:    .cfi_def_cfa_offset 24
+; AVX1-NEXT:    popq %r15
+; AVX1-NEXT:    .cfi_def_cfa_offset 16
+; AVX1-NEXT:    popq %rbp
+; AVX1-NEXT:    .cfi_def_cfa_offset 8
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_ext_v8i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    pushq %rbp
+; AVX2-NEXT:    .cfi_def_cfa_offset 16
+; AVX2-NEXT:    pushq %r15
+; AVX2-NEXT:    .cfi_def_cfa_offset 24
+; AVX2-NEXT:    pushq %r14
+; AVX2-NEXT:    .cfi_def_cfa_offset 32
+; AVX2-NEXT:    pushq %r13
+; AVX2-NEXT:    .cfi_def_cfa_offset 40
+; AVX2-NEXT:    pushq %r12
+; AVX2-NEXT:    .cfi_def_cfa_offset 48
+; AVX2-NEXT:    pushq %rbx
+; AVX2-NEXT:    .cfi_def_cfa_offset 56
+; AVX2-NEXT:    pushq %rax
+; AVX2-NEXT:    .cfi_def_cfa_offset 64
+; AVX2-NEXT:    .cfi_offset %rbx, -56
+; AVX2-NEXT:    .cfi_offset %r12, -48
+; AVX2-NEXT:    .cfi_offset %r13, -40
+; AVX2-NEXT:    .cfi_offset %r14, -32
+; AVX2-NEXT:    .cfi_offset %r15, -24
+; AVX2-NEXT:    .cfi_offset %rbp, -16
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm4
+; AVX2-NEXT:    vpextrq $1, %xmm4, %rax
+; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    sarq $63, %rax
+; AVX2-NEXT:    movq %rax, (%rsp) # 8-byte Spill
+; AVX2-NEXT:    vmovq %xmm4, %rax
+; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    movq %rax, %rcx
+; AVX2-NEXT:    sarq $63, %rcx
+; AVX2-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    movq %rax, %rcx
+; AVX2-NEXT:    sarq $63, %rcx
+; AVX2-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    vmovq %xmm0, %rax
+; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    movq %rax, %rcx
+; AVX2-NEXT:    sarq $63, %rcx
+; AVX2-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm0
+; AVX2-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    movq %rax, %rcx
+; AVX2-NEXT:    sarq $63, %rcx
+; AVX2-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    vmovq %xmm0, %rax
+; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    movq %rax, %rcx
+; AVX2-NEXT:    sarq $63, %rcx
+; AVX2-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    vpextrq $1, %xmm1, %rbx
+; AVX2-NEXT:    movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    sarq $63, %rbx
+; AVX2-NEXT:    vmovq %xmm1, %r8
+; AVX2-NEXT:    movq %r8, %rbp
+; AVX2-NEXT:    sarq $63, %rbp
+; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm0
+; AVX2-NEXT:    vpextrq $1, %xmm0, %r9
+; AVX2-NEXT:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    sarq $63, %r9
+; AVX2-NEXT:    vmovq %xmm0, %r10
+; AVX2-NEXT:    movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    sarq $63, %r10
+; AVX2-NEXT:    vpextrq $1, %xmm2, %r11
+; AVX2-NEXT:    movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    sarq $63, %r11
+; AVX2-NEXT:    vmovq %xmm2, %r15
+; AVX2-NEXT:    movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    sarq $63, %r15
+; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm0
+; AVX2-NEXT:    vpextrq $1, %xmm0, %rdi
+; AVX2-NEXT:    movq %rdi, %r14
+; AVX2-NEXT:    sarq $63, %r14
+; AVX2-NEXT:    vmovq %xmm0, %rsi
+; AVX2-NEXT:    movq %rsi, %r12
+; AVX2-NEXT:    sarq $63, %r12
+; AVX2-NEXT:    vpextrq $1, %xmm3, %r13
+; AVX2-NEXT:    movq %r13, %rdx
+; AVX2-NEXT:    sarq $63, %rdx
+; AVX2-NEXT:    vmovq %xmm3, %rax
+; AVX2-NEXT:    movq %rax, %rcx
+; AVX2-NEXT:    sarq $63, %rcx
+; AVX2-NEXT:    addq %rax, %r8
+; AVX2-NEXT:    adcq %rbp, %rcx
+; AVX2-NEXT:    addq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX2-NEXT:    adcq %rbx, %rdx
+; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
+; AVX2-NEXT:    addq %rsi, %rbp
+; AVX2-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload
+; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
+; AVX2-NEXT:    addq %rdi, %r13
+; AVX2-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload
+; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; AVX2-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload
+; AVX2-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload
+; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; AVX2-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Folded Reload
+; AVX2-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload
+; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; AVX2-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
+; AVX2-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Folded Reload
+; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX2-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
+; AVX2-NEXT:    adcq (%rsp), %r9 # 8-byte Folded Reload
+; AVX2-NEXT:    addq $1, %rax
+; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    adcq $0, %r9
+; AVX2-NEXT:    addq $1, %rsi
+; AVX2-NEXT:    adcq $0, %r10
+; AVX2-NEXT:    addq $1, %rdi
+; AVX2-NEXT:    adcq $0, %r11
+; AVX2-NEXT:    addq $1, %rbx
+; AVX2-NEXT:    adcq $0, %r15
+; AVX2-NEXT:    addq $1, %r13
+; AVX2-NEXT:    adcq $0, %r14
+; AVX2-NEXT:    addq $1, %rbp
+; AVX2-NEXT:    adcq $0, %r12
+; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX2-NEXT:    addq $1, %rax
+; AVX2-NEXT:    adcq $0, %rdx
+; AVX2-NEXT:    addq $1, %r8
+; AVX2-NEXT:    adcq $0, %rcx
+; AVX2-NEXT:    shldq $63, %r8, %rcx
+; AVX2-NEXT:    shldq $63, %rax, %rdx
+; AVX2-NEXT:    shldq $63, %rbp, %r12
+; AVX2-NEXT:    shldq $63, %r13, %r14
+; AVX2-NEXT:    shldq $63, %rbx, %r15
+; AVX2-NEXT:    shldq $63, %rdi, %r11
+; AVX2-NEXT:    shldq $63, %rsi, %r10
+; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX2-NEXT:    shldq $63, %rax, %r9
+; AVX2-NEXT:    vmovq %r9, %xmm0
+; AVX2-NEXT:    vmovq %r10, %xmm1
+; AVX2-NEXT:    vmovq %r11, %xmm2
+; AVX2-NEXT:    vmovq %r15, %xmm3
+; AVX2-NEXT:    vmovq %r14, %xmm4
+; AVX2-NEXT:    vmovq %r12, %xmm5
+; AVX2-NEXT:    vmovq %rdx, %xmm6
+; AVX2-NEXT:    vmovq %rcx, %xmm7
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0]
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm5[0],xmm4[0]
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm7[0],xmm6[0]
+; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX2-NEXT:    addq $8, %rsp
+; AVX2-NEXT:    .cfi_def_cfa_offset 56
+; AVX2-NEXT:    popq %rbx
+; AVX2-NEXT:    .cfi_def_cfa_offset 48
+; AVX2-NEXT:    popq %r12
+; AVX2-NEXT:    .cfi_def_cfa_offset 40
+; AVX2-NEXT:    popq %r13
+; AVX2-NEXT:    .cfi_def_cfa_offset 32
+; AVX2-NEXT:    popq %r14
+; AVX2-NEXT:    .cfi_def_cfa_offset 24
+; AVX2-NEXT:    popq %r15
+; AVX2-NEXT:    .cfi_def_cfa_offset 16
+; AVX2-NEXT:    popq %rbp
+; AVX2-NEXT:    .cfi_def_cfa_offset 8
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_ext_v8i64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    pushq %rbp
+; AVX512-NEXT:    .cfi_def_cfa_offset 16
+; AVX512-NEXT:    pushq %r15
+; AVX512-NEXT:    .cfi_def_cfa_offset 24
+; AVX512-NEXT:    pushq %r14
+; AVX512-NEXT:    .cfi_def_cfa_offset 32
+; AVX512-NEXT:    pushq %r13
+; AVX512-NEXT:    .cfi_def_cfa_offset 40
+; AVX512-NEXT:    pushq %r12
+; AVX512-NEXT:    .cfi_def_cfa_offset 48
+; AVX512-NEXT:    pushq %rbx
+; AVX512-NEXT:    .cfi_def_cfa_offset 56
+; AVX512-NEXT:    pushq %rax
+; AVX512-NEXT:    .cfi_def_cfa_offset 64
+; AVX512-NEXT:    .cfi_offset %rbx, -56
+; AVX512-NEXT:    .cfi_offset %r12, -48
+; AVX512-NEXT:    .cfi_offset %r13, -40
+; AVX512-NEXT:    .cfi_offset %r14, -32
+; AVX512-NEXT:    .cfi_offset %r15, -24
+; AVX512-NEXT:    .cfi_offset %rbp, -16
+; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
+; AVX512-NEXT:    vextracti128 $1, %ymm2, %xmm3
+; AVX512-NEXT:    vpextrq $1, %xmm3, %rax
+; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT:    sarq $63, %rax
+; AVX512-NEXT:    movq %rax, (%rsp) # 8-byte Spill
+; AVX512-NEXT:    vmovq %xmm3, %rax
+; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT:    movq %rax, %rcx
+; AVX512-NEXT:    sarq $63, %rcx
+; AVX512-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT:    vpextrq $1, %xmm2, %rax
+; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT:    movq %rax, %rcx
+; AVX512-NEXT:    sarq $63, %rcx
+; AVX512-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT:    vmovq %xmm2, %rax
+; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT:    movq %rax, %rcx
+; AVX512-NEXT:    sarq $63, %rcx
+; AVX512-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX512-NEXT:    vpextrq $1, %xmm2, %rax
+; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT:    movq %rax, %rcx
+; AVX512-NEXT:    sarq $63, %rcx
+; AVX512-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT:    vmovq %xmm2, %rax
+; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT:    movq %rax, %rcx
+; AVX512-NEXT:    sarq $63, %rcx
+; AVX512-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT:    vpextrq $1, %xmm0, %rbx
+; AVX512-NEXT:    movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT:    sarq $63, %rbx
+; AVX512-NEXT:    vmovq %xmm0, %r8
+; AVX512-NEXT:    movq %r8, %r13
+; AVX512-NEXT:    sarq $63, %r13
+; AVX512-NEXT:    vextracti64x4 $1, %zmm1, %ymm0
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX512-NEXT:    vpextrq $1, %xmm2, %r9
+; AVX512-NEXT:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT:    sarq $63, %r9
+; AVX512-NEXT:    vmovq %xmm2, %r10
+; AVX512-NEXT:    movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT:    sarq $63, %r10
+; AVX512-NEXT:    vpextrq $1, %xmm0, %r11
+; AVX512-NEXT:    movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT:    sarq $63, %r11
+; AVX512-NEXT:    vmovq %xmm0, %r14
+; AVX512-NEXT:    movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT:    sarq $63, %r14
+; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm0
+; AVX512-NEXT:    vpextrq $1, %xmm0, %rdi
+; AVX512-NEXT:    movq %rdi, %r15
+; AVX512-NEXT:    sarq $63, %r15
+; AVX512-NEXT:    vmovq %xmm0, %rsi
+; AVX512-NEXT:    movq %rsi, %r12
+; AVX512-NEXT:    sarq $63, %r12
+; AVX512-NEXT:    vpextrq $1, %xmm1, %rbp
+; AVX512-NEXT:    movq %rbp, %rdx
+; AVX512-NEXT:    sarq $63, %rdx
+; AVX512-NEXT:    vmovq %xmm1, %rax
+; AVX512-NEXT:    movq %rax, %rcx
+; AVX512-NEXT:    sarq $63, %rcx
+; AVX512-NEXT:    addq %rax, %r8
+; AVX512-NEXT:    adcq %r13, %rcx
+; AVX512-NEXT:    addq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX512-NEXT:    adcq %rbx, %rdx
+; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
+; AVX512-NEXT:    addq %rsi, %rbp
+; AVX512-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload
+; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
+; AVX512-NEXT:    addq %rdi, %r13
+; AVX512-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload
+; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; AVX512-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload
+; AVX512-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload
+; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; AVX512-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Folded Reload
+; AVX512-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload
+; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; AVX512-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
+; AVX512-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Folded Reload
+; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
+; AVX512-NEXT:    adcq (%rsp), %r9 # 8-byte Folded Reload
+; AVX512-NEXT:    addq $1, %rax
+; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT:    adcq $0, %r9
+; AVX512-NEXT:    addq $1, %rsi
+; AVX512-NEXT:    adcq $0, %r10
+; AVX512-NEXT:    addq $1, %rdi
+; AVX512-NEXT:    adcq $0, %r11
+; AVX512-NEXT:    addq $1, %rbx
+; AVX512-NEXT:    adcq $0, %r14
+; AVX512-NEXT:    addq $1, %r13
+; AVX512-NEXT:    adcq $0, %r15
+; AVX512-NEXT:    addq $1, %rbp
+; AVX512-NEXT:    adcq $0, %r12
+; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512-NEXT:    addq $1, %rax
+; AVX512-NEXT:    adcq $0, %rdx
+; AVX512-NEXT:    addq $1, %r8
+; AVX512-NEXT:    adcq $0, %rcx
+; AVX512-NEXT:    shldq $63, %r8, %rcx
+; AVX512-NEXT:    shldq $63, %rax, %rdx
+; AVX512-NEXT:    shldq $63, %rbp, %r12
+; AVX512-NEXT:    shldq $63, %r13, %r15
+; AVX512-NEXT:    shldq $63, %rbx, %r14
+; AVX512-NEXT:    shldq $63, %rdi, %r11
+; AVX512-NEXT:    shldq $63, %rsi, %r10
+; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512-NEXT:    shldq $63, %rax, %r9
+; AVX512-NEXT:    vmovq %r9, %xmm0
+; AVX512-NEXT:    vmovq %r10, %xmm1
+; AVX512-NEXT:    vmovq %r11, %xmm2
+; AVX512-NEXT:    vmovq %r14, %xmm3
+; AVX512-NEXT:    vmovq %r15, %xmm4
+; AVX512-NEXT:    vmovq %r12, %xmm5
+; AVX512-NEXT:    vmovq %rdx, %xmm6
+; AVX512-NEXT:    vmovq %rcx, %xmm7
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0]
+; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm5[0],xmm4[0]
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm7[0],xmm6[0]
+; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512-NEXT:    addq $8, %rsp
+; AVX512-NEXT:    .cfi_def_cfa_offset 56
+; AVX512-NEXT:    popq %rbx
+; AVX512-NEXT:    .cfi_def_cfa_offset 48
+; AVX512-NEXT:    popq %r12
+; AVX512-NEXT:    .cfi_def_cfa_offset 40
+; AVX512-NEXT:    popq %r13
+; AVX512-NEXT:    .cfi_def_cfa_offset 32
+; AVX512-NEXT:    popq %r14
+; AVX512-NEXT:    .cfi_def_cfa_offset 24
+; AVX512-NEXT:    popq %r15
+; AVX512-NEXT:    .cfi_def_cfa_offset 16
+; AVX512-NEXT:    popq %rbp
+; AVX512-NEXT:    .cfi_def_cfa_offset 8
+; AVX512-NEXT:    retq
+  %x0 = sext <8 x i64> %a0 to <8 x i128>
+  %x1 = sext <8 x i64> %a1 to <8 x i128>
+  %sum = add <8 x i128> %x0, %x1
+  %inc = add <8 x i128> %sum, <i128 1, i128 1, i128 1, i128 1, i128 1, i128 1, i128 1, i128 1>
+  %shift = ashr <8 x i128> %inc, <i128 1, i128 1, i128 1, i128 1, i128 1, i128 1, i128 1, i128 1>
+  %res = trunc <8 x i128> %shift to <8 x i64>
+  ret <8 x i64> %res
+}
+
diff --git a/llvm/test/CodeGen/X86/avgceilu.ll b/llvm/test/CodeGen/X86/avgceilu.ll
new file mode 100644
index 00000000000000..3a74fca2377370
--- /dev/null
+++ b/llvm/test/CodeGen/X86/avgceilu.ll
@@ -0,0 +1,2219 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE4
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX,AVX512
+
+;
+; 128-bit vectors
+;
+
+define <16 x i8> @test_fixed_v16i8(<16 x i8> %a0, <16 x i8> %a1) {
+; SSE-LABEL: test_fixed_v16i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pavgb %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test_fixed_v16i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpavgb %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %or = or <16 x i8> %a0, %a1
+  %xor = xor <16 x i8> %a0, %a1
+  %shift = lshr <16 x i8> %xor, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  %res = sub <16 x i8> %or, %shift
+  ret <16 x i8> %res
+}
+
+define <16 x i8> @test_ext_v16i8(<16 x i8> %a0, <16 x i8> %a1) {
+; SSE-LABEL: test_ext_v16i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pavgb %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test_ext_v16i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpavgb %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %x0 = zext <16 x i8> %a0 to <16 x i16>
+  %x1 = zext <16 x i8> %a1 to <16 x i16>
+  %sum = add <16 x i16> %x0, %x1
+  %inc = add <16 x i16> %sum, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %shift = lshr <16 x i16> %inc, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %res = trunc <16 x i16> %shift to <16 x i8>
+  ret <16 x i8> %res
+}
+
+define <8 x i16> @test_fixed_v8i16(<8 x i16> %a0, <8 x i16> %a1) {
+; SSE-LABEL: test_fixed_v8i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pavgw %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test_fixed_v8i16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpavgw %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %or = or <8 x i16> %a0, %a1
+  %xor = xor <8 x i16> %a1, %a0
+  %shift = lshr <8 x i16> %xor, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %res = sub <8 x i16> %or, %shift
+  ret <8 x i16> %res
+}
+
+define <8 x i16> @test_ext_v8i16(<8 x i16> %a0, <8 x i16> %a1) {
+; SSE-LABEL: test_ext_v8i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pavgw %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test_ext_v8i16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpavgw %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %x0 = zext <8 x i16> %a0 to <8 x i32>
+  %x1 = zext <8 x i16> %a1 to <8 x i32>
+  %sum = add <8 x i32> %x0, %x1
+  %inc = add <8 x i32> %sum, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %shift = lshr <8 x i32> %inc, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %res = trunc <8 x i32> %shift to <8 x i16>
+  ret <8 x i16> %res
+}
+
+define <4 x i32> @test_fixed_v4i32(<4 x i32> %a0, <4 x i32> %a1) {
+; SSE-LABEL: test_fixed_v4i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa %xmm0, %xmm2
+; SSE-NEXT:    por %xmm1, %xmm2
+; SSE-NEXT:    pxor %xmm0, %xmm1
+; SSE-NEXT:    psrld $1, %xmm1
+; SSE-NEXT:    psubd %xmm1, %xmm2
+; SSE-NEXT:    movdqa %xmm2, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test_fixed_v4i32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm2
+; AVX-NEXT:    vpxor %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vpsrld $1, %xmm0, %xmm0
+; AVX-NEXT:    vpsubd %xmm0, %xmm2, %xmm0
+; AVX-NEXT:    retq
+  %or = or <4 x i32> %a0, %a1
+  %xor = xor <4 x i32> %a1, %a0
+  %shift = lshr <4 x i32> %xor, <i32 1, i32 1, i32 1, i32 1>
+  %res = sub <4 x i32> %or, %shift
+  ret <4 x i32> %res
+}
+
+define <4 x i32> @test_ext_v4i32(<4 x i32> %a0, <4 x i32> %a1) {
+; SSE2-LABEL: test_ext_v4i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pxor %xmm3, %xmm3
+; SSE2-NEXT:    movdqa %xmm0, %xmm4
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; SSE2-NEXT:    paddq %xmm4, %xmm2
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; SSE2-NEXT:    paddq %xmm0, %xmm1
+; SSE2-NEXT:    pcmpeqd %xmm0, %xmm0
+; SSE2-NEXT:    psubq %xmm0, %xmm2
+; SSE2-NEXT:    psubq %xmm0, %xmm1
+; SSE2-NEXT:    psrlq $1, %xmm1
+; SSE2-NEXT:    psrlq $1, %xmm2
+; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[0,2]
+; SSE2-NEXT:    movaps %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_ext_v4i32:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    pxor %xmm3, %xmm3
+; SSE4-NEXT:    pmovzxdq {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero
+; SSE4-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; SSE4-NEXT:    pmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero
+; SSE4-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; SSE4-NEXT:    paddq %xmm0, %xmm1
+; SSE4-NEXT:    paddq %xmm4, %xmm2
+; SSE4-NEXT:    pcmpeqd %xmm0, %xmm0
+; SSE4-NEXT:    psubq %xmm0, %xmm1
+; SSE4-NEXT:    psubq %xmm0, %xmm2
+; SSE4-NEXT:    psrlq $1, %xmm1
+; SSE4-NEXT:    psrlq $1, %xmm2
+; SSE4-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[0,2]
+; SSE4-NEXT:    movaps %xmm2, %xmm0
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: test_ext_v4i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; AVX1-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpsubq %xmm1, %xmm2, %xmm2
+; AVX1-NEXT:    vpsubq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlq $1, %xmm2, %xmm1
+; AVX1-NEXT:    vpsrlq $1, %xmm0, %xmm0
+; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_ext_v4i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; AVX2-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX2-NEXT:    vpsubq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrlq $1, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_ext_v4i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX512-NEXT:    vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; AVX512-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX512-NEXT:    vpsubq %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpsrlq $1, %ymm0, %ymm0
+; AVX512-NEXT:    vpmovqd %ymm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %x0 = zext <4 x i32> %a0 to <4 x i64>
+  %x1 = zext <4 x i32> %a1 to <4 x i64>
+  %sum = add <4 x i64> %x0, %x1
+  %inc = add <4 x i64> %sum, <i64 1, i64 1, i64 1, i64 1>
+  %shift = lshr <4 x i64> %inc, <i64 1, i64 1, i64 1, i64 1>
+  %res = trunc <4 x i64> %shift to <4 x i32>
+  ret <4 x i32> %res
+}
+
+define <2 x i64> @test_fixed_v2i64(<2 x i64> %a0, <2 x i64> %a1) {
+; SSE-LABEL: test_fixed_v2i64:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa %xmm0, %xmm2
+; SSE-NEXT:    por %xmm1, %xmm2
+; SSE-NEXT:    pxor %xmm0, %xmm1
+; SSE-NEXT:    psrlq $1, %xmm1
+; SSE-NEXT:    psubq %xmm1, %xmm2
+; SSE-NEXT:    movdqa %xmm2, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test_fixed_v2i64:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm2
+; AVX-NEXT:    vpxor %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vpsrlq $1, %xmm0, %xmm0
+; AVX-NEXT:    vpsubq %xmm0, %xmm2, %xmm0
+; AVX-NEXT:    retq
+  %or = or <2 x i64> %a0, %a1
+  %xor = xor <2 x i64> %a1, %a0
+  %shift = lshr <2 x i64> %xor, <i64 1, i64 1>
+  %res = sub <2 x i64> %or, %shift
+  ret <2 x i64> %res
+}
+
+define <2 x i64> @test_ext_v2i64(<2 x i64> %a0, <2 x i64> %a1) {
+; SSE2-LABEL: test_ext_v2i64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
+; SSE2-NEXT:    movq %xmm2, %rax
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
+; SSE2-NEXT:    movq %xmm2, %rcx
+; SSE2-NEXT:    movb $1, %dl
+; SSE2-NEXT:    movb $1, %sil
+; SSE2-NEXT:    addb $-1, %sil
+; SSE2-NEXT:    leaq 1(%rax,%rcx), %rsi
+; SSE2-NEXT:    adcq %rcx, %rax
+; SSE2-NEXT:    setb %al
+; SSE2-NEXT:    addb $-1, %dl
+; SSE2-NEXT:    movq %xmm0, %rcx
+; SSE2-NEXT:    movq %xmm1, %rdx
+; SSE2-NEXT:    leaq 1(%rcx,%rdx), %rdi
+; SSE2-NEXT:    adcq %rdx, %rcx
+; SSE2-NEXT:    setb %cl
+; SSE2-NEXT:    movzbl %cl, %ecx
+; SSE2-NEXT:    movzbl %al, %eax
+; SSE2-NEXT:    shrdq $1, %rcx, %rdi
+; SSE2-NEXT:    shrdq $1, %rax, %rsi
+; SSE2-NEXT:    movq %rdi, %xmm0
+; SSE2-NEXT:    movq %rsi, %xmm1
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_ext_v2i64:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    movq %xmm0, %rax
+; SSE4-NEXT:    movq %xmm1, %rcx
+; SSE4-NEXT:    movb $1, %dl
+; SSE4-NEXT:    movb $1, %sil
+; SSE4-NEXT:    addb $-1, %sil
+; SSE4-NEXT:    leaq 1(%rax,%rcx), %rsi
+; SSE4-NEXT:    adcq %rcx, %rax
+; SSE4-NEXT:    setb %al
+; SSE4-NEXT:    addb $-1, %dl
+; SSE4-NEXT:    pextrq $1, %xmm0, %rcx
+; SSE4-NEXT:    pextrq $1, %xmm1, %rdx
+; SSE4-NEXT:    leaq 1(%rcx,%rdx), %rdi
+; SSE4-NEXT:    adcq %rdx, %rcx
+; SSE4-NEXT:    setb %cl
+; SSE4-NEXT:    movzbl %cl, %ecx
+; SSE4-NEXT:    movzbl %al, %eax
+; SSE4-NEXT:    shrdq $1, %rcx, %rdi
+; SSE4-NEXT:    shrdq $1, %rax, %rsi
+; SSE4-NEXT:    movq %rdi, %xmm1
+; SSE4-NEXT:    movq %rsi, %xmm0
+; SSE4-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE4-NEXT:    retq
+;
+; AVX-LABEL: test_ext_v2i64:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovq %xmm0, %rax
+; AVX-NEXT:    vmovq %xmm1, %rcx
+; AVX-NEXT:    movb $1, %dl
+; AVX-NEXT:    movb $1, %sil
+; AVX-NEXT:    addb $-1, %sil
+; AVX-NEXT:    leaq 1(%rax,%rcx), %rsi
+; AVX-NEXT:    adcq %rcx, %rax
+; AVX-NEXT:    setb %al
+; AVX-NEXT:    addb $-1, %dl
+; AVX-NEXT:    vpextrq $1, %xmm0, %rcx
+; AVX-NEXT:    vpextrq $1, %xmm1, %rdx
+; AVX-NEXT:    leaq 1(%rcx,%rdx), %rdi
+; AVX-NEXT:    adcq %rdx, %rcx
+; AVX-NEXT:    setb %cl
+; AVX-NEXT:    movzbl %cl, %ecx
+; AVX-NEXT:    movzbl %al, %eax
+; AVX-NEXT:    shrdq $1, %rcx, %rdi
+; AVX-NEXT:    shrdq $1, %rax, %rsi
+; AVX-NEXT:    vmovq %rdi, %xmm0
+; AVX-NEXT:    vmovq %rsi, %xmm1
+; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX-NEXT:    retq
+  %x0 = zext <2 x i64> %a0 to <2 x i128>
+  %x1 = zext <2 x i64> %a1 to <2 x i128>
+  %sum = add <2 x i128> %x0, %x1
+  %inc = add <2 x i128> %sum, <i128 1, i128 1>
+  %shift = lshr <2 x i128> %inc, <i128 1, i128 1>
+  %res = trunc <2 x i128> %shift to <2 x i64>
+  ret <2 x i64> %res
+}
+
+;
+; 256-bit vectors
+;
+
+define <32 x i8> @test_fixed_v32i8(<32 x i8> %a0, <32 x i8> %a1) {
+; SSE-LABEL: test_fixed_v32i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pavgb %xmm2, %xmm0
+; SSE-NEXT:    pavgb %xmm3, %xmm1
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: test_fixed_v32i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm2
+; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm1
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm3 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
+; AVX1-NEXT:    vpsubb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_fixed_v32i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpavgb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_fixed_v32i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpavgb %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    retq
+  %or = or <32 x i8> %a0, %a1
+  %xor = xor <32 x i8> %a0, %a1
+  %shift = lshr <32 x i8> %xor, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  %res = sub <32 x i8> %or, %shift
+  ret <32 x i8> %res
+}
+
+define <32 x i8> @test_ext_v32i8(<32 x i8> %a0, <32 x i8> %a1) {
+; SSE-LABEL: test_ext_v32i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pavgb %xmm2, %xmm0
+; SSE-NEXT:    pavgb %xmm3, %xmm1
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: test_ext_v32i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpavgb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpavgb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_ext_v32i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpavgb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_ext_v32i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpavgb %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    retq
+  %x0 = zext <32 x i8> %a0 to <32 x i16>
+  %x1 = zext <32 x i8> %a1 to <32 x i16>
+  %sum = add <32 x i16> %x0, %x1
+  %inc = add <32 x i16> %sum, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %shift = lshr <32 x i16> %inc, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %res = trunc <32 x i16> %shift to <32 x i8>
+  ret <32 x i8> %res
+}
+
+define <16 x i16> @test_fixed_v16i16(<16 x i16> %a0, <16 x i16> %a1) {
+; SSE-LABEL: test_fixed_v16i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pavgw %xmm2, %xmm0
+; SSE-NEXT:    pavgw %xmm3, %xmm1
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: test_fixed_v16i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm2
+; AVX1-NEXT:    vxorps %ymm0, %ymm1, %ymm0
+; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
+; AVX1-NEXT:    vpsubw %xmm0, %xmm3, %xmm0
+; AVX1-NEXT:    vpsubw %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_fixed_v16i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpavgw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_fixed_v16i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpavgw %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    retq
+  %or = or <16 x i16> %a0, %a1
+  %xor = xor <16 x i16> %a1, %a0
+  %shift = lshr <16 x i16> %xor, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %res = sub <16 x i16> %or, %shift
+  ret <16 x i16> %res
+}
+
+define <16 x i16> @test_ext_v16i16(<16 x i16> %a0, <16 x i16> %a1) {
+; SSE-LABEL: test_ext_v16i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pavgw %xmm2, %xmm0
+; SSE-NEXT:    pavgw %xmm3, %xmm1
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: test_ext_v16i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpavgw %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpavgw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_ext_v16i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpavgw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_ext_v16i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpavgw %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    retq
+  %x0 = zext <16 x i16> %a0 to <16 x i32>
+  %x1 = zext <16 x i16> %a1 to <16 x i32>
+  %sum = add <16 x i32> %x0, %x1
+  %inc = add <16 x i32> %sum, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %shift = lshr <16 x i32> %inc, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %res = trunc <16 x i32> %shift to <16 x i16>
+  ret <16 x i16> %res
+}
+
+define <8 x i32> @test_fixed_v8i32(<8 x i32> %a0, <8 x i32> %a1) {
+; SSE-LABEL: test_fixed_v8i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa %xmm1, %xmm4
+; SSE-NEXT:    por %xmm3, %xmm4
+; SSE-NEXT:    movdqa %xmm0, %xmm5
+; SSE-NEXT:    por %xmm2, %xmm5
+; SSE-NEXT:    pxor %xmm0, %xmm2
+; SSE-NEXT:    pxor %xmm1, %xmm3
+; SSE-NEXT:    psrld $1, %xmm3
+; SSE-NEXT:    psubd %xmm3, %xmm4
+; SSE-NEXT:    psrld $1, %xmm2
+; SSE-NEXT:    psubd %xmm2, %xmm5
+; SSE-NEXT:    movdqa %xmm5, %xmm0
+; SSE-NEXT:    movdqa %xmm4, %xmm1
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: test_fixed_v8i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm2
+; AVX1-NEXT:    vxorps %ymm0, %ymm1, %ymm0
+; AVX1-NEXT:    vpsrld $1, %xmm0, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpsrld $1, %xmm0, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
+; AVX1-NEXT:    vpsubd %xmm0, %xmm3, %xmm0
+; AVX1-NEXT:    vpsubd %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_fixed_v8i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm2
+; AVX2-NEXT:    vpxor %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpsrld $1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsubd %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_fixed_v8i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm2
+; AVX512-NEXT:    vpxor %ymm0, %ymm1, %ymm0
+; AVX512-NEXT:    vpsrld $1, %ymm0, %ymm0
+; AVX512-NEXT:    vpsubd %ymm0, %ymm2, %ymm0
+; AVX512-NEXT:    retq
+  %or = or <8 x i32> %a0, %a1
+  %xor = xor <8 x i32> %a1, %a0
+  %shift = lshr <8 x i32> %xor, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %res = sub <8 x i32> %or, %shift
+  ret <8 x i32> %res
+}
+
+define <8 x i32> @test_ext_v8i32(<8 x i32> %a0, <8 x i32> %a1) {
+; SSE2-LABEL: test_ext_v8i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa %xmm0, %xmm4
+; SSE2-NEXT:    pxor %xmm5, %xmm5
+; SSE2-NEXT:    movdqa %xmm0, %xmm6
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm5[2],xmm4[3],xmm5[3]
+; SSE2-NEXT:    movdqa %xmm1, %xmm7
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1]
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm5[2],xmm1[3],xmm5[3]
+; SSE2-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
+; SSE2-NEXT:    paddq %xmm6, %xmm0
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm5[2],xmm2[3],xmm5[3]
+; SSE2-NEXT:    paddq %xmm4, %xmm2
+; SSE2-NEXT:    movdqa %xmm3, %xmm4
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
+; SSE2-NEXT:    paddq %xmm7, %xmm4
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm5[2],xmm3[3],xmm5[3]
+; SSE2-NEXT:    paddq %xmm1, %xmm3
+; SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSE2-NEXT:    psubq %xmm1, %xmm0
+; SSE2-NEXT:    psubq %xmm1, %xmm2
+; SSE2-NEXT:    psubq %xmm1, %xmm4
+; SSE2-NEXT:    psubq %xmm1, %xmm3
+; SSE2-NEXT:    psrlq $1, %xmm3
+; SSE2-NEXT:    psrlq $1, %xmm4
+; SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,2],xmm3[0,2]
+; SSE2-NEXT:    psrlq $1, %xmm2
+; SSE2-NEXT:    psrlq $1, %xmm0
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
+; SSE2-NEXT:    movaps %xmm4, %xmm1
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_ext_v8i32:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    movdqa %xmm0, %xmm4
+; SSE4-NEXT:    pxor %xmm5, %xmm5
+; SSE4-NEXT:    pmovzxdq {{.*#+}} xmm6 = xmm0[0],zero,xmm0[1],zero
+; SSE4-NEXT:    punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm5[2],xmm4[3],xmm5[3]
+; SSE4-NEXT:    pmovzxdq {{.*#+}} xmm7 = xmm1[0],zero,xmm1[1],zero
+; SSE4-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm5[2],xmm1[3],xmm5[3]
+; SSE4-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero
+; SSE4-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm5[2],xmm2[3],xmm5[3]
+; SSE4-NEXT:    paddq %xmm4, %xmm2
+; SSE4-NEXT:    pmovzxdq {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero
+; SSE4-NEXT:    punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm5[2],xmm3[3],xmm5[3]
+; SSE4-NEXT:    paddq %xmm1, %xmm3
+; SSE4-NEXT:    paddq %xmm6, %xmm0
+; SSE4-NEXT:    paddq %xmm7, %xmm4
+; SSE4-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSE4-NEXT:    psubq %xmm1, %xmm2
+; SSE4-NEXT:    psubq %xmm1, %xmm3
+; SSE4-NEXT:    psubq %xmm1, %xmm0
+; SSE4-NEXT:    psubq %xmm1, %xmm4
+; SSE4-NEXT:    psrlq $1, %xmm3
+; SSE4-NEXT:    psrlq $1, %xmm2
+; SSE4-NEXT:    psrlq $1, %xmm4
+; SSE4-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,2],xmm3[0,2]
+; SSE4-NEXT:    psrlq $1, %xmm0
+; SSE4-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
+; SSE4-NEXT:    movaps %xmm4, %xmm1
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: test_ext_v8i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm5 = xmm4[2],xmm2[2],xmm4[3],xmm2[3]
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm6 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; AVX1-NEXT:    vpaddq %xmm6, %xmm3, %xmm3
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm6
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm6[2],xmm2[2],xmm6[3],xmm2[3]
+; AVX1-NEXT:    vpaddq %xmm2, %xmm5, %xmm2
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm6[0],zero,xmm6[1],zero
+; AVX1-NEXT:    vpaddq %xmm1, %xmm4, %xmm1
+; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4
+; AVX1-NEXT:    vpsubq %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vpsubq %xmm4, %xmm2, %xmm2
+; AVX1-NEXT:    vpsubq %xmm4, %xmm0, %xmm0
+; AVX1-NEXT:    vpsubq %xmm4, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrlq $1, %xmm2, %xmm2
+; AVX1-NEXT:    vpsrlq $1, %xmm3, %xmm3
+; AVX1-NEXT:    vpsrlq $1, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrlq $1, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_ext_v8i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; AVX2-NEXT:    vpaddq %ymm3, %ymm2, %ymm2
+; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm1
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; AVX2-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX2-NEXT:    vpsubq %ymm1, %ymm2, %ymm2
+; AVX2-NEXT:    vpsubq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm0[2,3]
+; AVX2-NEXT:    vpsrlq $1, %ymm1, %ymm1
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm2, %ymm0
+; AVX2-NEXT:    vpsrlq $1, %ymm0, %ymm0
+; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_ext_v8i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
+; AVX512-NEXT:    vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
+; AVX512-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
+; AVX512-NEXT:    vpsubq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpsrlq $1, %zmm0, %zmm0
+; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
+; AVX512-NEXT:    retq
+  %x0 = zext <8 x i32> %a0 to <8 x i64>
+  %x1 = zext <8 x i32> %a1 to <8 x i64>
+  %sum = add <8 x i64> %x0, %x1
+  %inc = add <8 x i64> %sum, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
+  %shift = lshr <8 x i64> %inc, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
+  %res = trunc <8 x i64> %shift to <8 x i32>
+  ret <8 x i32> %res
+}
+
+define <4 x i64> @test_fixed_v4i64(<4 x i64> %a0, <4 x i64> %a1) {
+; SSE-LABEL: test_fixed_v4i64:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa %xmm1, %xmm4
+; SSE-NEXT:    por %xmm3, %xmm4
+; SSE-NEXT:    movdqa %xmm0, %xmm5
+; SSE-NEXT:    por %xmm2, %xmm5
+; SSE-NEXT:    pxor %xmm0, %xmm2
+; SSE-NEXT:    pxor %xmm1, %xmm3
+; SSE-NEXT:    psrlq $1, %xmm3
+; SSE-NEXT:    psubq %xmm3, %xmm4
+; SSE-NEXT:    psrlq $1, %xmm2
+; SSE-NEXT:    psubq %xmm2, %xmm5
+; SSE-NEXT:    movdqa %xmm5, %xmm0
+; SSE-NEXT:    movdqa %xmm4, %xmm1
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: test_fixed_v4i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm2
+; AVX1-NEXT:    vxorps %ymm0, %ymm1, %ymm0
+; AVX1-NEXT:    vpsrlq $1, %xmm0, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpsrlq $1, %xmm0, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
+; AVX1-NEXT:    vpsubq %xmm0, %xmm3, %xmm0
+; AVX1-NEXT:    vpsubq %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_fixed_v4i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm2
+; AVX2-NEXT:    vpxor %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpsrlq $1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsubq %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_fixed_v4i64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm2
+; AVX512-NEXT:    vpxor %ymm0, %ymm1, %ymm0
+; AVX512-NEXT:    vpsrlq $1, %ymm0, %ymm0
+; AVX512-NEXT:    vpsubq %ymm0, %ymm2, %ymm0
+; AVX512-NEXT:    retq
+  %or = or <4 x i64> %a0, %a1
+  %xor = xor <4 x i64> %a1, %a0
+  %shift = lshr <4 x i64> %xor, <i64 1, i64 1, i64 1, i64 1>
+  %res = sub <4 x i64> %or, %shift
+  ret <4 x i64> %res
+}
+
+define <4 x i64> @test_ext_v4i64(<4 x i64> %a0, <4 x i64> %a1) {
+; SSE2-LABEL: test_ext_v4i64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[2,3,2,3]
+; SSE2-NEXT:    movq %xmm4, %rcx
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[2,3,2,3]
+; SSE2-NEXT:    movq %xmm4, %rdx
+; SSE2-NEXT:    movb $1, %al
+; SSE2-NEXT:    movb $1, %sil
+; SSE2-NEXT:    addb $-1, %sil
+; SSE2-NEXT:    leaq 1(%rcx,%rdx), %rsi
+; SSE2-NEXT:    adcq %rdx, %rcx
+; SSE2-NEXT:    setb %dl
+; SSE2-NEXT:    movb $1, %cl
+; SSE2-NEXT:    addb $-1, %cl
+; SSE2-NEXT:    movq %xmm1, %rdi
+; SSE2-NEXT:    movq %xmm3, %r8
+; SSE2-NEXT:    leaq 1(%rdi,%r8), %rcx
+; SSE2-NEXT:    adcq %r8, %rdi
+; SSE2-NEXT:    setb %dil
+; SSE2-NEXT:    movb $1, %r8b
+; SSE2-NEXT:    addb $-1, %r8b
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; SSE2-NEXT:    movq %xmm1, %r8
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
+; SSE2-NEXT:    movq %xmm1, %r9
+; SSE2-NEXT:    leaq 1(%r8,%r9), %r10
+; SSE2-NEXT:    adcq %r9, %r8
+; SSE2-NEXT:    setb %r8b
+; SSE2-NEXT:    addb $-1, %al
+; SSE2-NEXT:    movq %xmm0, %rax
+; SSE2-NEXT:    movq %xmm2, %r9
+; SSE2-NEXT:    leaq 1(%rax,%r9), %r11
+; SSE2-NEXT:    adcq %r9, %rax
+; SSE2-NEXT:    setb %al
+; SSE2-NEXT:    movzbl %al, %eax
+; SSE2-NEXT:    movzbl %r8b, %r8d
+; SSE2-NEXT:    movzbl %dil, %edi
+; SSE2-NEXT:    movzbl %dl, %edx
+; SSE2-NEXT:    shrdq $1, %rax, %r11
+; SSE2-NEXT:    shrdq $1, %r8, %r10
+; SSE2-NEXT:    shrdq $1, %rdi, %rcx
+; SSE2-NEXT:    shrdq $1, %rdx, %rsi
+; SSE2-NEXT:    movq %r11, %xmm0
+; SSE2-NEXT:    movq %r10, %xmm1
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT:    movq %rcx, %xmm1
+; SSE2-NEXT:    movq %rsi, %xmm2
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_ext_v4i64:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    movq %xmm1, %rcx
+; SSE4-NEXT:    movq %xmm3, %rdx
+; SSE4-NEXT:    movb $1, %al
+; SSE4-NEXT:    movb $1, %sil
+; SSE4-NEXT:    addb $-1, %sil
+; SSE4-NEXT:    leaq 1(%rcx,%rdx), %rsi
+; SSE4-NEXT:    adcq %rdx, %rcx
+; SSE4-NEXT:    setb %dl
+; SSE4-NEXT:    movb $1, %cl
+; SSE4-NEXT:    addb $-1, %cl
+; SSE4-NEXT:    pextrq $1, %xmm1, %rdi
+; SSE4-NEXT:    pextrq $1, %xmm3, %r8
+; SSE4-NEXT:    leaq 1(%rdi,%r8), %rcx
+; SSE4-NEXT:    adcq %r8, %rdi
+; SSE4-NEXT:    setb %dil
+; SSE4-NEXT:    movb $1, %r8b
+; SSE4-NEXT:    addb $-1, %r8b
+; SSE4-NEXT:    movq %xmm0, %r8
+; SSE4-NEXT:    movq %xmm2, %r9
+; SSE4-NEXT:    leaq 1(%r8,%r9), %r10
+; SSE4-NEXT:    adcq %r9, %r8
+; SSE4-NEXT:    setb %r8b
+; SSE4-NEXT:    addb $-1, %al
+; SSE4-NEXT:    pextrq $1, %xmm0, %rax
+; SSE4-NEXT:    pextrq $1, %xmm2, %r9
+; SSE4-NEXT:    leaq 1(%rax,%r9), %r11
+; SSE4-NEXT:    adcq %r9, %rax
+; SSE4-NEXT:    setb %al
+; SSE4-NEXT:    movzbl %al, %eax
+; SSE4-NEXT:    movzbl %r8b, %r8d
+; SSE4-NEXT:    movzbl %dil, %edi
+; SSE4-NEXT:    movzbl %dl, %edx
+; SSE4-NEXT:    shrdq $1, %rax, %r11
+; SSE4-NEXT:    shrdq $1, %r8, %r10
+; SSE4-NEXT:    shrdq $1, %rdi, %rcx
+; SSE4-NEXT:    shrdq $1, %rdx, %rsi
+; SSE4-NEXT:    movq %r11, %xmm1
+; SSE4-NEXT:    movq %r10, %xmm0
+; SSE4-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE4-NEXT:    movq %rcx, %xmm2
+; SSE4-NEXT:    movq %rsi, %xmm1
+; SSE4-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: test_ext_v4i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovq %xmm0, %rcx
+; AVX1-NEXT:    vmovq %xmm1, %rdx
+; AVX1-NEXT:    movb $1, %al
+; AVX1-NEXT:    movb $1, %sil
+; AVX1-NEXT:    addb $-1, %sil
+; AVX1-NEXT:    leaq 1(%rcx,%rdx), %rsi
+; AVX1-NEXT:    adcq %rdx, %rcx
+; AVX1-NEXT:    setb %dl
+; AVX1-NEXT:    movb $1, %cl
+; AVX1-NEXT:    addb $-1, %cl
+; AVX1-NEXT:    vpextrq $1, %xmm0, %rdi
+; AVX1-NEXT:    vpextrq $1, %xmm1, %r8
+; AVX1-NEXT:    leaq 1(%rdi,%r8), %rcx
+; AVX1-NEXT:    adcq %r8, %rdi
+; AVX1-NEXT:    setb %dil
+; AVX1-NEXT:    movb $1, %r8b
+; AVX1-NEXT:    addb $-1, %r8b
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vmovq %xmm0, %r8
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vmovq %xmm1, %r9
+; AVX1-NEXT:    leaq 1(%r8,%r9), %r10
+; AVX1-NEXT:    adcq %r9, %r8
+; AVX1-NEXT:    setb %r8b
+; AVX1-NEXT:    addb $-1, %al
+; AVX1-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX1-NEXT:    vpextrq $1, %xmm1, %r9
+; AVX1-NEXT:    leaq 1(%rax,%r9), %r11
+; AVX1-NEXT:    adcq %r9, %rax
+; AVX1-NEXT:    setb %al
+; AVX1-NEXT:    movzbl %al, %eax
+; AVX1-NEXT:    movzbl %r8b, %r8d
+; AVX1-NEXT:    movzbl %dil, %edi
+; AVX1-NEXT:    movzbl %dl, %edx
+; AVX1-NEXT:    shrdq $1, %rax, %r11
+; AVX1-NEXT:    shrdq $1, %r8, %r10
+; AVX1-NEXT:    shrdq $1, %rdi, %rcx
+; AVX1-NEXT:    shrdq $1, %rdx, %rsi
+; AVX1-NEXT:    vmovq %r11, %xmm0
+; AVX1-NEXT:    vmovq %r10, %xmm1
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT:    vmovq %rcx, %xmm1
+; AVX1-NEXT:    vmovq %rsi, %xmm2
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_ext_v4i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovq %xmm0, %rcx
+; AVX2-NEXT:    vmovq %xmm1, %rdx
+; AVX2-NEXT:    movb $1, %al
+; AVX2-NEXT:    movb $1, %sil
+; AVX2-NEXT:    addb $-1, %sil
+; AVX2-NEXT:    leaq 1(%rcx,%rdx), %rsi
+; AVX2-NEXT:    adcq %rdx, %rcx
+; AVX2-NEXT:    setb %dl
+; AVX2-NEXT:    movb $1, %cl
+; AVX2-NEXT:    addb $-1, %cl
+; AVX2-NEXT:    vpextrq $1, %xmm0, %rdi
+; AVX2-NEXT:    vpextrq $1, %xmm1, %r8
+; AVX2-NEXT:    leaq 1(%rdi,%r8), %rcx
+; AVX2-NEXT:    adcq %r8, %rdi
+; AVX2-NEXT:    setb %dil
+; AVX2-NEXT:    movb $1, %r8b
+; AVX2-NEXT:    addb $-1, %r8b
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT:    vmovq %xmm0, %r8
+; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm1
+; AVX2-NEXT:    vmovq %xmm1, %r9
+; AVX2-NEXT:    leaq 1(%r8,%r9), %r10
+; AVX2-NEXT:    adcq %r9, %r8
+; AVX2-NEXT:    setb %r8b
+; AVX2-NEXT:    addb $-1, %al
+; AVX2-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX2-NEXT:    vpextrq $1, %xmm1, %r9
+; AVX2-NEXT:    leaq 1(%rax,%r9), %r11
+; AVX2-NEXT:    adcq %r9, %rax
+; AVX2-NEXT:    setb %al
+; AVX2-NEXT:    movzbl %al, %eax
+; AVX2-NEXT:    movzbl %r8b, %r8d
+; AVX2-NEXT:    movzbl %dil, %edi
+; AVX2-NEXT:    movzbl %dl, %edx
+; AVX2-NEXT:    shrdq $1, %rax, %r11
+; AVX2-NEXT:    shrdq $1, %r8, %r10
+; AVX2-NEXT:    shrdq $1, %rdi, %rcx
+; AVX2-NEXT:    shrdq $1, %rdx, %rsi
+; AVX2-NEXT:    vmovq %r11, %xmm0
+; AVX2-NEXT:    vmovq %r10, %xmm1
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX2-NEXT:    vmovq %rcx, %xmm1
+; AVX2-NEXT:    vmovq %rsi, %xmm2
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_ext_v4i64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vmovq %xmm0, %rcx
+; AVX512-NEXT:    vmovq %xmm1, %rdx
+; AVX512-NEXT:    movb $1, %al
+; AVX512-NEXT:    movb $1, %sil
+; AVX512-NEXT:    addb $-1, %sil
+; AVX512-NEXT:    leaq 1(%rcx,%rdx), %rsi
+; AVX512-NEXT:    adcq %rdx, %rcx
+; AVX512-NEXT:    setb %dl
+; AVX512-NEXT:    movb $1, %cl
+; AVX512-NEXT:    vpextrq $1, %xmm0, %rdi
+; AVX512-NEXT:    vpextrq $1, %xmm1, %r8
+; AVX512-NEXT:    addb $-1, %cl
+; AVX512-NEXT:    leaq 1(%rdi,%r8), %rcx
+; AVX512-NEXT:    adcq %r8, %rdi
+; AVX512-NEXT:    setb %dil
+; AVX512-NEXT:    movb $1, %r8b
+; AVX512-NEXT:    addb $-1, %r8b
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX512-NEXT:    vmovq %xmm0, %r8
+; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm1
+; AVX512-NEXT:    vmovq %xmm1, %r9
+; AVX512-NEXT:    leaq 1(%r8,%r9), %r10
+; AVX512-NEXT:    adcq %r9, %r8
+; AVX512-NEXT:    setb %r8b
+; AVX512-NEXT:    addb $-1, %al
+; AVX512-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX512-NEXT:    vpextrq $1, %xmm1, %r9
+; AVX512-NEXT:    leaq 1(%rax,%r9), %r11
+; AVX512-NEXT:    adcq %r9, %rax
+; AVX512-NEXT:    setb %al
+; AVX512-NEXT:    movzbl %al, %eax
+; AVX512-NEXT:    movzbl %r8b, %r8d
+; AVX512-NEXT:    movzbl %dil, %edi
+; AVX512-NEXT:    movzbl %dl, %edx
+; AVX512-NEXT:    shrdq $1, %rax, %r11
+; AVX512-NEXT:    shrdq $1, %r8, %r10
+; AVX512-NEXT:    shrdq $1, %rdi, %rcx
+; AVX512-NEXT:    shrdq $1, %rdx, %rsi
+; AVX512-NEXT:    vmovq %r11, %xmm0
+; AVX512-NEXT:    vmovq %r10, %xmm1
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512-NEXT:    vmovq %rcx, %xmm1
+; AVX512-NEXT:    vmovq %rsi, %xmm2
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512-NEXT:    retq
+  %x0 = zext <4 x i64> %a0 to <4 x i128>
+  %x1 = zext <4 x i64> %a1 to <4 x i128>
+  %sum = add <4 x i128> %x0, %x1
+  %inc = add <4 x i128> %sum, <i128 1, i128 1, i128 1, i128 1>
+  %shift = lshr <4 x i128> %inc, <i128 1, i128 1, i128 1, i128 1>
+  %res = trunc <4 x i128> %shift to <4 x i64>
+  ret <4 x i64> %res
+}
+
+;
+; 512-bit vectors
+;
+
+define <64 x i8> @test_fixed_v64i8(<64 x i8> %a0, <64 x i8> %a1) {
+; SSE-LABEL: test_fixed_v64i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pavgb %xmm4, %xmm0
+; SSE-NEXT:    pavgb %xmm5, %xmm1
+; SSE-NEXT:    pavgb %xmm6, %xmm2
+; SSE-NEXT:    pavgb %xmm7, %xmm3
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: test_fixed_v64i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vorps %ymm3, %ymm1, %ymm4
+; AVX1-NEXT:    vorps %ymm2, %ymm0, %ymm5
+; AVX1-NEXT:    vxorps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT:    vxorps %ymm3, %ymm1, %ymm1
+; AVX1-NEXT:    vpsrlw $1, %xmm1, %xmm2
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm3 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vpsrlw $1, %xmm1, %xmm1
+; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm6
+; AVX1-NEXT:    vpand %xmm3, %xmm6, %xmm6
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm5, %xmm3
+; AVX1-NEXT:    vpsubb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT:    vpsubb %xmm6, %xmm5, %xmm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm3, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm4, %xmm3
+; AVX1-NEXT:    vpsubb %xmm1, %xmm3, %xmm1
+; AVX1-NEXT:    vpsubb %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_fixed_v64i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpavgb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpavgb %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_fixed_v64i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpavgb %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    retq
+  %or = or <64 x i8> %a0, %a1
+  %xor = xor <64 x i8> %a0, %a1
+  %shift = lshr <64 x i8> %xor, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  %res = sub <64 x i8> %or, %shift
+  ret <64 x i8> %res
+}
+
+define <64 x i8> @test_ext_v64i8(<64 x i8> %a0, <64 x i8> %a1) {
+; SSE-LABEL: test_ext_v64i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pavgb %xmm4, %xmm0
+; SSE-NEXT:    pavgb %xmm5, %xmm1
+; SSE-NEXT:    pavgb %xmm6, %xmm2
+; SSE-NEXT:    pavgb %xmm7, %xmm3
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: test_ext_v64i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
+; AVX1-NEXT:    vpavgb %xmm4, %xmm5, %xmm4
+; AVX1-NEXT:    vpavgb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT:    vpavgb %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vpavgb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_ext_v64i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpavgb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpavgb %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_ext_v64i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpavgb %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    retq
+  %x0 = zext <64 x i8> %a0 to <64 x i16>
+  %x1 = zext <64 x i8> %a1 to <64 x i16>
+  %sum = add <64 x i16> %x0, %x1
+  %inc = add <64 x i16> %sum, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %shift = lshr <64 x i16> %inc, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %res = trunc <64 x i16> %shift to <64 x i8>
+  ret <64 x i8> %res
+}
+
+define <32 x i16> @test_fixed_v32i16(<32 x i16> %a0, <32 x i16> %a1) {
+; SSE-LABEL: test_fixed_v32i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pavgw %xmm4, %xmm0
+; SSE-NEXT:    pavgw %xmm5, %xmm1
+; SSE-NEXT:    pavgw %xmm6, %xmm2
+; SSE-NEXT:    pavgw %xmm7, %xmm3
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: test_fixed_v32i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vorps %ymm3, %ymm1, %ymm4
+; AVX1-NEXT:    vorps %ymm2, %ymm0, %ymm5
+; AVX1-NEXT:    vxorps %ymm0, %ymm2, %ymm0
+; AVX1-NEXT:    vxorps %ymm1, %ymm3, %ymm1
+; AVX1-NEXT:    vpsrlw $1, %xmm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vpsrlw $1, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm3
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm5, %xmm6
+; AVX1-NEXT:    vpsubw %xmm0, %xmm6, %xmm0
+; AVX1-NEXT:    vpsubw %xmm3, %xmm5, %xmm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm3, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm4, %xmm3
+; AVX1-NEXT:    vpsubw %xmm1, %xmm3, %xmm1
+; AVX1-NEXT:    vpsubw %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_fixed_v32i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpavgw %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpavgw %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_fixed_v32i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpavgw %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    retq
+  %or = or <32 x i16> %a0, %a1
+  %xor = xor <32 x i16> %a1, %a0
+  %shift = lshr <32 x i16> %xor, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %res = sub <32 x i16> %or, %shift
+  ret <32 x i16> %res
+}
+
+define <32 x i16> @test_ext_v32i16(<32 x i16> %a0, <32 x i16> %a1) {
+; SSE-LABEL: test_ext_v32i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pavgw %xmm4, %xmm0
+; SSE-NEXT:    pavgw %xmm5, %xmm1
+; SSE-NEXT:    pavgw %xmm6, %xmm2
+; SSE-NEXT:    pavgw %xmm7, %xmm3
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: test_ext_v32i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
+; AVX1-NEXT:    vpavgw %xmm4, %xmm5, %xmm4
+; AVX1-NEXT:    vpavgw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT:    vpavgw %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vpavgw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_ext_v32i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpavgw %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpavgw %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_ext_v32i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpavgw %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    retq
+  %x0 = zext <32 x i16> %a0 to <32 x i32>
+  %x1 = zext <32 x i16> %a1 to <32 x i32>
+  %sum = add <32 x i32> %x0, %x1
+  %inc = add <32 x i32> %sum, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %shift = lshr <32 x i32> %inc, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %res = trunc <32 x i32> %shift to <32 x i16>
+  ret <32 x i16> %res
+}
+
+define <16 x i32> @test_fixed_v16i32(<16 x i32> %a0, <16 x i32> %a1) {
+; SSE-LABEL: test_fixed_v16i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa %xmm3, %xmm8
+; SSE-NEXT:    por %xmm7, %xmm3
+; SSE-NEXT:    movdqa %xmm2, %xmm9
+; SSE-NEXT:    por %xmm6, %xmm9
+; SSE-NEXT:    movdqa %xmm1, %xmm10
+; SSE-NEXT:    por %xmm5, %xmm10
+; SSE-NEXT:    movdqa %xmm0, %xmm11
+; SSE-NEXT:    por %xmm4, %xmm11
+; SSE-NEXT:    pxor %xmm0, %xmm4
+; SSE-NEXT:    pxor %xmm1, %xmm5
+; SSE-NEXT:    pxor %xmm2, %xmm6
+; SSE-NEXT:    pxor %xmm8, %xmm7
+; SSE-NEXT:    psrld $1, %xmm7
+; SSE-NEXT:    psubd %xmm7, %xmm3
+; SSE-NEXT:    psrld $1, %xmm6
+; SSE-NEXT:    psubd %xmm6, %xmm9
+; SSE-NEXT:    psrld $1, %xmm5
+; SSE-NEXT:    psubd %xmm5, %xmm10
+; SSE-NEXT:    psrld $1, %xmm4
+; SSE-NEXT:    psubd %xmm4, %xmm11
+; SSE-NEXT:    movdqa %xmm11, %xmm0
+; SSE-NEXT:    movdqa %xmm10, %xmm1
+; SSE-NEXT:    movdqa %xmm9, %xmm2
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: test_fixed_v16i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vorps %ymm3, %ymm1, %ymm4
+; AVX1-NEXT:    vorps %ymm2, %ymm0, %ymm5
+; AVX1-NEXT:    vxorps %ymm0, %ymm2, %ymm0
+; AVX1-NEXT:    vxorps %ymm1, %ymm3, %ymm1
+; AVX1-NEXT:    vpsrld $1, %xmm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vpsrld $1, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrld $1, %xmm0, %xmm3
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpsrld $1, %xmm0, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm5, %xmm6
+; AVX1-NEXT:    vpsubd %xmm0, %xmm6, %xmm0
+; AVX1-NEXT:    vpsubd %xmm3, %xmm5, %xmm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm3, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm4, %xmm3
+; AVX1-NEXT:    vpsubd %xmm1, %xmm3, %xmm1
+; AVX1-NEXT:    vpsubd %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_fixed_v16i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpor %ymm3, %ymm1, %ymm4
+; AVX2-NEXT:    vpor %ymm2, %ymm0, %ymm5
+; AVX2-NEXT:    vpxor %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vpxor %ymm1, %ymm3, %ymm1
+; AVX2-NEXT:    vpsrld $1, %ymm1, %ymm1
+; AVX2-NEXT:    vpsubd %ymm1, %ymm4, %ymm1
+; AVX2-NEXT:    vpsrld $1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsubd %ymm0, %ymm5, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_fixed_v16i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpord %zmm1, %zmm0, %zmm2
+; AVX512-NEXT:    vpxord %zmm0, %zmm1, %zmm0
+; AVX512-NEXT:    vpsrld $1, %zmm0, %zmm0
+; AVX512-NEXT:    vpsubd %zmm0, %zmm2, %zmm0
+; AVX512-NEXT:    retq
+  %or = or <16 x i32> %a0, %a1
+  %xor = xor <16 x i32> %a1, %a0
+  %shift = lshr <16 x i32> %xor, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %res = sub <16 x i32> %or, %shift
+  ret <16 x i32> %res
+}
+
+define <16 x i32> @test_ext_v16i32(<16 x i32> %a0, <16 x i32> %a1) {
+; SSE2-LABEL: test_ext_v16i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa %xmm2, %xmm8
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    pxor %xmm9, %xmm9
+; SSE2-NEXT:    movdqa %xmm0, %xmm10
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1]
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm9[2],xmm1[3],xmm9[3]
+; SSE2-NEXT:    movdqa %xmm2, %xmm11
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm11 = xmm11[0],xmm9[0],xmm11[1],xmm9[1]
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm9[2],xmm2[3],xmm9[3]
+; SSE2-NEXT:    movdqa %xmm8, %xmm12
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm12 = xmm12[0],xmm9[0],xmm12[1],xmm9[1]
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm9[2],xmm8[3],xmm9[3]
+; SSE2-NEXT:    movdqa %xmm3, %xmm13
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm13 = xmm13[0],xmm9[0],xmm13[1],xmm9[1]
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm9[2],xmm3[3],xmm9[3]
+; SSE2-NEXT:    movdqa %xmm4, %xmm0
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1]
+; SSE2-NEXT:    paddq %xmm10, %xmm0
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm9[2],xmm4[3],xmm9[3]
+; SSE2-NEXT:    paddq %xmm1, %xmm4
+; SSE2-NEXT:    movdqa %xmm5, %xmm1
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1]
+; SSE2-NEXT:    paddq %xmm11, %xmm1
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm9[2],xmm5[3],xmm9[3]
+; SSE2-NEXT:    paddq %xmm2, %xmm5
+; SSE2-NEXT:    movdqa %xmm6, %xmm2
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1]
+; SSE2-NEXT:    paddq %xmm12, %xmm2
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm9[2],xmm6[3],xmm9[3]
+; SSE2-NEXT:    paddq %xmm8, %xmm6
+; SSE2-NEXT:    movdqa %xmm7, %xmm8
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1]
+; SSE2-NEXT:    paddq %xmm13, %xmm8
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm9[2],xmm7[3],xmm9[3]
+; SSE2-NEXT:    paddq %xmm3, %xmm7
+; SSE2-NEXT:    pcmpeqd %xmm3, %xmm3
+; SSE2-NEXT:    psubq %xmm3, %xmm0
+; SSE2-NEXT:    psubq %xmm3, %xmm4
+; SSE2-NEXT:    psubq %xmm3, %xmm1
+; SSE2-NEXT:    psubq %xmm3, %xmm5
+; SSE2-NEXT:    psubq %xmm3, %xmm2
+; SSE2-NEXT:    psubq %xmm3, %xmm6
+; SSE2-NEXT:    psubq %xmm3, %xmm8
+; SSE2-NEXT:    psubq %xmm3, %xmm7
+; SSE2-NEXT:    psrlq $1, %xmm7
+; SSE2-NEXT:    psrlq $1, %xmm8
+; SSE2-NEXT:    shufps {{.*#+}} xmm8 = xmm8[0,2],xmm7[0,2]
+; SSE2-NEXT:    psrlq $1, %xmm6
+; SSE2-NEXT:    psrlq $1, %xmm2
+; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm6[0,2]
+; SSE2-NEXT:    psrlq $1, %xmm5
+; SSE2-NEXT:    psrlq $1, %xmm1
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2]
+; SSE2-NEXT:    psrlq $1, %xmm4
+; SSE2-NEXT:    psrlq $1, %xmm0
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm4[0,2]
+; SSE2-NEXT:    movaps %xmm8, %xmm3
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_ext_v16i32:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    movdqa %xmm3, %xmm8
+; SSE4-NEXT:    movdqa %xmm2, %xmm3
+; SSE4-NEXT:    movdqa %xmm1, %xmm2
+; SSE4-NEXT:    movdqa %xmm0, %xmm1
+; SSE4-NEXT:    pxor %xmm10, %xmm10
+; SSE4-NEXT:    pmovzxdq {{.*#+}} xmm9 = xmm0[0],zero,xmm0[1],zero
+; SSE4-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm10[2],xmm1[3],xmm10[3]
+; SSE4-NEXT:    pmovzxdq {{.*#+}} xmm11 = xmm2[0],zero,xmm2[1],zero
+; SSE4-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm10[2],xmm2[3],xmm10[3]
+; SSE4-NEXT:    pmovzxdq {{.*#+}} xmm12 = xmm3[0],zero,xmm3[1],zero
+; SSE4-NEXT:    punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm10[2],xmm3[3],xmm10[3]
+; SSE4-NEXT:    pmovzxdq {{.*#+}} xmm13 = xmm8[0],zero,xmm8[1],zero
+; SSE4-NEXT:    punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm10[2],xmm8[3],xmm10[3]
+; SSE4-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm4[0],zero,xmm4[1],zero
+; SSE4-NEXT:    punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm10[2],xmm4[3],xmm10[3]
+; SSE4-NEXT:    paddq %xmm1, %xmm4
+; SSE4-NEXT:    pmovzxdq {{.*#+}} xmm1 = xmm5[0],zero,xmm5[1],zero
+; SSE4-NEXT:    punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm10[2],xmm5[3],xmm10[3]
+; SSE4-NEXT:    paddq %xmm2, %xmm5
+; SSE4-NEXT:    pmovzxdq {{.*#+}} xmm2 = xmm6[0],zero,xmm6[1],zero
+; SSE4-NEXT:    punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm10[2],xmm6[3],xmm10[3]
+; SSE4-NEXT:    paddq %xmm3, %xmm6
+; SSE4-NEXT:    pmovzxdq {{.*#+}} xmm3 = xmm7[0],zero,xmm7[1],zero
+; SSE4-NEXT:    punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm10[2],xmm7[3],xmm10[3]
+; SSE4-NEXT:    paddq %xmm8, %xmm7
+; SSE4-NEXT:    paddq %xmm9, %xmm0
+; SSE4-NEXT:    paddq %xmm11, %xmm1
+; SSE4-NEXT:    paddq %xmm12, %xmm2
+; SSE4-NEXT:    paddq %xmm13, %xmm3
+; SSE4-NEXT:    pcmpeqd %xmm8, %xmm8
+; SSE4-NEXT:    psubq %xmm8, %xmm4
+; SSE4-NEXT:    psubq %xmm8, %xmm5
+; SSE4-NEXT:    psubq %xmm8, %xmm6
+; SSE4-NEXT:    psubq %xmm8, %xmm7
+; SSE4-NEXT:    psubq %xmm8, %xmm0
+; SSE4-NEXT:    psubq %xmm8, %xmm1
+; SSE4-NEXT:    psubq %xmm8, %xmm2
+; SSE4-NEXT:    psubq %xmm8, %xmm3
+; SSE4-NEXT:    psrlq $1, %xmm7
+; SSE4-NEXT:    psrlq $1, %xmm6
+; SSE4-NEXT:    psrlq $1, %xmm5
+; SSE4-NEXT:    psrlq $1, %xmm4
+; SSE4-NEXT:    psrlq $1, %xmm3
+; SSE4-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,2],xmm7[0,2]
+; SSE4-NEXT:    psrlq $1, %xmm2
+; SSE4-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm6[0,2]
+; SSE4-NEXT:    psrlq $1, %xmm1
+; SSE4-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2]
+; SSE4-NEXT:    psrlq $1, %xmm0
+; SSE4-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm4[0,2]
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: test_ext_v16i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm5 = xmm0[2],xmm4[2],xmm0[3],xmm4[3]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm6
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm7 = xmm6[2],xmm4[2],xmm6[3],xmm4[3]
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm8 = xmm1[2],xmm4[2],xmm1[3],xmm4[3]
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm9
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm10 = xmm9[2],xmm4[2],xmm9[3],xmm4[3]
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm9 = xmm9[0],zero,xmm9[1],zero
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm11 = xmm2[2],xmm4[2],xmm2[3],xmm4[3]
+; AVX1-NEXT:    vpaddq %xmm5, %xmm11, %xmm5
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm11
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm12 = xmm11[2],xmm4[2],xmm11[3],xmm4[3]
+; AVX1-NEXT:    vpaddq %xmm7, %xmm12, %xmm7
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm12 = xmm3[2],xmm4[2],xmm3[3],xmm4[3]
+; AVX1-NEXT:    vpaddq %xmm12, %xmm8, %xmm8
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm12
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm4 = xmm12[2],xmm4[2],xmm12[3],xmm4[3]
+; AVX1-NEXT:    vpaddq %xmm4, %xmm10, %xmm4
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
+; AVX1-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm2 = xmm11[0],zero,xmm11[1],zero
+; AVX1-NEXT:    vpaddq %xmm2, %xmm6, %xmm2
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero
+; AVX1-NEXT:    vpaddq %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm12[0],zero,xmm12[1],zero
+; AVX1-NEXT:    vpaddq %xmm3, %xmm9, %xmm3
+; AVX1-NEXT:    vpcmpeqd %xmm6, %xmm6, %xmm6
+; AVX1-NEXT:    vpsubq %xmm6, %xmm5, %xmm5
+; AVX1-NEXT:    vpsubq %xmm6, %xmm7, %xmm7
+; AVX1-NEXT:    vpsubq %xmm6, %xmm8, %xmm8
+; AVX1-NEXT:    vpsubq %xmm6, %xmm4, %xmm4
+; AVX1-NEXT:    vpsubq %xmm6, %xmm0, %xmm0
+; AVX1-NEXT:    vpsubq %xmm6, %xmm2, %xmm2
+; AVX1-NEXT:    vpsubq %xmm6, %xmm1, %xmm1
+; AVX1-NEXT:    vpsubq %xmm6, %xmm3, %xmm3
+; AVX1-NEXT:    vpsrlq $1, %xmm4, %xmm4
+; AVX1-NEXT:    vpsrlq $1, %xmm8, %xmm6
+; AVX1-NEXT:    vpsrlq $1, %xmm7, %xmm7
+; AVX1-NEXT:    vpsrlq $1, %xmm5, %xmm5
+; AVX1-NEXT:    vpsrlq $1, %xmm3, %xmm3
+; AVX1-NEXT:    vpsrlq $1, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrlq $1, %xmm2, %xmm2
+; AVX1-NEXT:    vpsrlq $1, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm7, %ymm5, %ymm5
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm5[0,2],ymm0[4,6],ymm5[4,6]
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm6, %ymm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX1-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,2],ymm2[0,2],ymm1[4,6],ymm2[4,6]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_ext_v16i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm4 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm5 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm1
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm6 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
+; AVX2-NEXT:    vpaddq %ymm6, %ymm4, %ymm4
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm6 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
+; AVX2-NEXT:    vpaddq %ymm6, %ymm5, %ymm5
+; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm2
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
+; AVX2-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm2
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
+; AVX2-NEXT:    vpaddq %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
+; AVX2-NEXT:    vpsubq %ymm2, %ymm4, %ymm3
+; AVX2-NEXT:    vpsubq %ymm2, %ymm5, %ymm4
+; AVX2-NEXT:    vpsubq %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpsubq %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm4[2,3],ymm0[2,3]
+; AVX2-NEXT:    vpsrlq $1, %ymm2, %ymm2
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm4, %ymm0
+; AVX2-NEXT:    vpsrlq $1, %ymm0, %ymm0
+; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
+; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm3[2,3],ymm1[2,3]
+; AVX2-NEXT:    vpsrlq $1, %ymm2, %ymm2
+; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm3, %ymm1
+; AVX2-NEXT:    vpsrlq $1, %ymm1, %ymm1
+; AVX2-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,2],ymm2[0,2],ymm1[4,6],ymm2[4,6]
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_ext_v16i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovzxdq {{.*#+}} zmm2 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
+; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
+; AVX512-NEXT:    vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
+; AVX512-NEXT:    vpmovzxdq {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
+; AVX512-NEXT:    vpaddq %zmm3, %zmm2, %zmm2
+; AVX512-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
+; AVX512-NEXT:    vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
+; AVX512-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
+; AVX512-NEXT:    vpsubq %zmm1, %zmm2, %zmm2
+; AVX512-NEXT:    vpsubq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpsrlq $1, %zmm0, %zmm0
+; AVX512-NEXT:    vpsrlq $1, %zmm2, %zmm1
+; AVX512-NEXT:    vpmovqd %zmm1, %ymm1
+; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
+; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512-NEXT:    retq
+  %x0 = zext <16 x i32> %a0 to <16 x i64>
+  %x1 = zext <16 x i32> %a1 to <16 x i64>
+  %sum = add <16 x i64> %x0, %x1
+  %inc = add <16 x i64> %sum, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
+  %shift = lshr <16 x i64> %inc, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
+  %res = trunc <16 x i64> %shift to <16 x i32>
+  ret <16 x i32> %res
+}
+
+define <8 x i64> @test_fixed_v8i64(<8 x i64> %a0, <8 x i64> %a1) {
+; SSE-LABEL: test_fixed_v8i64:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa %xmm3, %xmm8
+; SSE-NEXT:    por %xmm7, %xmm3
+; SSE-NEXT:    movdqa %xmm2, %xmm9
+; SSE-NEXT:    por %xmm6, %xmm9
+; SSE-NEXT:    movdqa %xmm1, %xmm10
+; SSE-NEXT:    por %xmm5, %xmm10
+; SSE-NEXT:    movdqa %xmm0, %xmm11
+; SSE-NEXT:    por %xmm4, %xmm11
+; SSE-NEXT:    pxor %xmm0, %xmm4
+; SSE-NEXT:    pxor %xmm1, %xmm5
+; SSE-NEXT:    pxor %xmm2, %xmm6
+; SSE-NEXT:    pxor %xmm8, %xmm7
+; SSE-NEXT:    psrlq $1, %xmm7
+; SSE-NEXT:    psubq %xmm7, %xmm3
+; SSE-NEXT:    psrlq $1, %xmm6
+; SSE-NEXT:    psubq %xmm6, %xmm9
+; SSE-NEXT:    psrlq $1, %xmm5
+; SSE-NEXT:    psubq %xmm5, %xmm10
+; SSE-NEXT:    psrlq $1, %xmm4
+; SSE-NEXT:    psubq %xmm4, %xmm11
+; SSE-NEXT:    movdqa %xmm11, %xmm0
+; SSE-NEXT:    movdqa %xmm10, %xmm1
+; SSE-NEXT:    movdqa %xmm9, %xmm2
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: test_fixed_v8i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vorps %ymm3, %ymm1, %ymm4
+; AVX1-NEXT:    vorps %ymm2, %ymm0, %ymm5
+; AVX1-NEXT:    vxorps %ymm0, %ymm2, %ymm0
+; AVX1-NEXT:    vxorps %ymm1, %ymm3, %ymm1
+; AVX1-NEXT:    vpsrlq $1, %xmm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vpsrlq $1, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrlq $1, %xmm0, %xmm3
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpsrlq $1, %xmm0, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm5, %xmm6
+; AVX1-NEXT:    vpsubq %xmm0, %xmm6, %xmm0
+; AVX1-NEXT:    vpsubq %xmm3, %xmm5, %xmm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm3, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm4, %xmm3
+; AVX1-NEXT:    vpsubq %xmm1, %xmm3, %xmm1
+; AVX1-NEXT:    vpsubq %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_fixed_v8i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpor %ymm3, %ymm1, %ymm4
+; AVX2-NEXT:    vpor %ymm2, %ymm0, %ymm5
+; AVX2-NEXT:    vpxor %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vpxor %ymm1, %ymm3, %ymm1
+; AVX2-NEXT:    vpsrlq $1, %ymm1, %ymm1
+; AVX2-NEXT:    vpsubq %ymm1, %ymm4, %ymm1
+; AVX2-NEXT:    vpsrlq $1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsubq %ymm0, %ymm5, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_fixed_v8i64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vporq %zmm1, %zmm0, %zmm2
+; AVX512-NEXT:    vpxorq %zmm0, %zmm1, %zmm0
+; AVX512-NEXT:    vpsrlq $1, %zmm0, %zmm0
+; AVX512-NEXT:    vpsubq %zmm0, %zmm2, %zmm0
+; AVX512-NEXT:    retq
+  %or = or <8 x i64> %a0, %a1
+  %xor = xor <8 x i64> %a1, %a0
+  %shift = lshr <8 x i64> %xor, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
+  %res = sub <8 x i64> %or, %shift
+  ret <8 x i64> %res
+}
+
+define <8 x i64> @test_ext_v8i64(<8 x i64> %a0, <8 x i64> %a1) {
+; SSE2-LABEL: test_ext_v8i64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pushq %rbp
+; SSE2-NEXT:    .cfi_def_cfa_offset 16
+; SSE2-NEXT:    pushq %r15
+; SSE2-NEXT:    .cfi_def_cfa_offset 24
+; SSE2-NEXT:    pushq %r14
+; SSE2-NEXT:    .cfi_def_cfa_offset 32
+; SSE2-NEXT:    pushq %r13
+; SSE2-NEXT:    .cfi_def_cfa_offset 40
+; SSE2-NEXT:    pushq %r12
+; SSE2-NEXT:    .cfi_def_cfa_offset 48
+; SSE2-NEXT:    pushq %rbx
+; SSE2-NEXT:    .cfi_def_cfa_offset 56
+; SSE2-NEXT:    .cfi_offset %rbx, -56
+; SSE2-NEXT:    .cfi_offset %r12, -48
+; SSE2-NEXT:    .cfi_offset %r13, -40
+; SSE2-NEXT:    .cfi_offset %r14, -32
+; SSE2-NEXT:    .cfi_offset %r15, -24
+; SSE2-NEXT:    .cfi_offset %rbp, -16
+; SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm3[2,3,2,3]
+; SSE2-NEXT:    movq %xmm8, %rcx
+; SSE2-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm7[2,3,2,3]
+; SSE2-NEXT:    movq %xmm8, %rdx
+; SSE2-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT:    movb $1, %al
+; SSE2-NEXT:    addb $-1, %al
+; SSE2-NEXT:    movq %rcx, %rax
+; SSE2-NEXT:    adcq %rdx, %rax
+; SSE2-NEXT:    setb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill
+; SSE2-NEXT:    movb $1, %al
+; SSE2-NEXT:    addb $-1, %al
+; SSE2-NEXT:    movq %xmm3, %r12
+; SSE2-NEXT:    movq %xmm7, %rcx
+; SSE2-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT:    movq %r12, %rax
+; SSE2-NEXT:    adcq %rcx, %rax
+; SSE2-NEXT:    setb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill
+; SSE2-NEXT:    movb $1, %al
+; SSE2-NEXT:    addb $-1, %al
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3]
+; SSE2-NEXT:    movq %xmm3, %r11
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm6[2,3,2,3]
+; SSE2-NEXT:    movq %xmm3, %rbx
+; SSE2-NEXT:    movq %r11, %rax
+; SSE2-NEXT:    adcq %rbx, %rax
+; SSE2-NEXT:    setb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill
+; SSE2-NEXT:    movb $1, %al
+; SSE2-NEXT:    addb $-1, %al
+; SSE2-NEXT:    movq %xmm2, %r14
+; SSE2-NEXT:    movq %xmm6, %r15
+; SSE2-NEXT:    movq %r14, %rax
+; SSE2-NEXT:    adcq %r15, %rax
+; SSE2-NEXT:    setb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill
+; SSE2-NEXT:    movb $1, %al
+; SSE2-NEXT:    addb $-1, %al
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
+; SSE2-NEXT:    movq %xmm2, %r13
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm5[2,3,2,3]
+; SSE2-NEXT:    movq %xmm2, %r10
+; SSE2-NEXT:    movq %r13, %rax
+; SSE2-NEXT:    adcq %r10, %rax
+; SSE2-NEXT:    setb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill
+; SSE2-NEXT:    movb $1, %al
+; SSE2-NEXT:    addb $-1, %al
+; SSE2-NEXT:    movq %xmm1, %r9
+; SSE2-NEXT:    movq %xmm5, %r8
+; SSE2-NEXT:    movq %r9, %rax
+; SSE2-NEXT:    adcq %r8, %rax
+; SSE2-NEXT:    setb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill
+; SSE2-NEXT:    movb $1, %al
+; SSE2-NEXT:    addb $-1, %al
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[2,3,2,3]
+; SSE2-NEXT:    movq %xmm1, %rdi
+; SSE2-NEXT:    movq %xmm2, %rsi
+; SSE2-NEXT:    movq %rdi, %rdx
+; SSE2-NEXT:    adcq %rsi, %rdx
+; SSE2-NEXT:    movb $1, %dl
+; SSE2-NEXT:    setb %bpl
+; SSE2-NEXT:    addb $-1, %dl
+; SSE2-NEXT:    movq %xmm0, %rcx
+; SSE2-NEXT:    movq %xmm4, %rax
+; SSE2-NEXT:    movq %rcx, %rdx
+; SSE2-NEXT:    adcq %rax, %rdx
+; SSE2-NEXT:    leaq 1(%rcx,%rax), %rdx
+; SSE2-NEXT:    leaq 1(%rdi,%rsi), %rax
+; SSE2-NEXT:    leaq 1(%r9,%r8), %rcx
+; SSE2-NEXT:    leaq 1(%r13,%r10), %rdi
+; SSE2-NEXT:    leaq 1(%r14,%r15), %rsi
+; SSE2-NEXT:    leaq 1(%r11,%rbx), %r11
+; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; SSE2-NEXT:    leaq 1(%r12,%r8), %r9
+; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; SSE2-NEXT:    leaq 1(%r8,%r10), %r10
+; SSE2-NEXT:    setb %r8b
+; SSE2-NEXT:    movzbl %r8b, %r8d
+; SSE2-NEXT:    shrdq $1, %r8, %rdx
+; SSE2-NEXT:    movzbl %bpl, %r8d
+; SSE2-NEXT:    shrdq $1, %r8, %rax
+; SSE2-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload
+; SSE2-NEXT:    shrdq $1, %r8, %rcx
+; SSE2-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload
+; SSE2-NEXT:    shrdq $1, %r8, %rdi
+; SSE2-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload
+; SSE2-NEXT:    shrdq $1, %r8, %rsi
+; SSE2-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload
+; SSE2-NEXT:    shrdq $1, %r8, %r11
+; SSE2-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload
+; SSE2-NEXT:    shrdq $1, %r8, %r9
+; SSE2-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload
+; SSE2-NEXT:    shrdq $1, %r8, %r10
+; SSE2-NEXT:    movq %rdx, %xmm0
+; SSE2-NEXT:    movq %rax, %xmm4
+; SSE2-NEXT:    movq %rcx, %xmm1
+; SSE2-NEXT:    movq %rdi, %xmm5
+; SSE2-NEXT:    movq %rsi, %xmm2
+; SSE2-NEXT:    movq %r11, %xmm6
+; SSE2-NEXT:    movq %r9, %xmm3
+; SSE2-NEXT:    movq %r10, %xmm7
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0]
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm6[0]
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm7[0]
+; SSE2-NEXT:    popq %rbx
+; SSE2-NEXT:    .cfi_def_cfa_offset 48
+; SSE2-NEXT:    popq %r12
+; SSE2-NEXT:    .cfi_def_cfa_offset 40
+; SSE2-NEXT:    popq %r13
+; SSE2-NEXT:    .cfi_def_cfa_offset 32
+; SSE2-NEXT:    popq %r14
+; SSE2-NEXT:    .cfi_def_cfa_offset 24
+; SSE2-NEXT:    popq %r15
+; SSE2-NEXT:    .cfi_def_cfa_offset 16
+; SSE2-NEXT:    popq %rbp
+; SSE2-NEXT:    .cfi_def_cfa_offset 8
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_ext_v8i64:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    pushq %rbp
+; SSE4-NEXT:    .cfi_def_cfa_offset 16
+; SSE4-NEXT:    pushq %r15
+; SSE4-NEXT:    .cfi_def_cfa_offset 24
+; SSE4-NEXT:    pushq %r14
+; SSE4-NEXT:    .cfi_def_cfa_offset 32
+; SSE4-NEXT:    pushq %r13
+; SSE4-NEXT:    .cfi_def_cfa_offset 40
+; SSE4-NEXT:    pushq %r12
+; SSE4-NEXT:    .cfi_def_cfa_offset 48
+; SSE4-NEXT:    pushq %rbx
+; SSE4-NEXT:    .cfi_def_cfa_offset 56
+; SSE4-NEXT:    .cfi_offset %rbx, -56
+; SSE4-NEXT:    .cfi_offset %r12, -48
+; SSE4-NEXT:    .cfi_offset %r13, -40
+; SSE4-NEXT:    .cfi_offset %r14, -32
+; SSE4-NEXT:    .cfi_offset %r15, -24
+; SSE4-NEXT:    .cfi_offset %rbp, -16
+; SSE4-NEXT:    movq %xmm3, %rcx
+; SSE4-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE4-NEXT:    movq %xmm7, %rdx
+; SSE4-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE4-NEXT:    movb $1, %al
+; SSE4-NEXT:    addb $-1, %al
+; SSE4-NEXT:    movq %rcx, %rax
+; SSE4-NEXT:    adcq %rdx, %rax
+; SSE4-NEXT:    setb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill
+; SSE4-NEXT:    movb $1, %al
+; SSE4-NEXT:    addb $-1, %al
+; SSE4-NEXT:    pextrq $1, %xmm3, %r12
+; SSE4-NEXT:    pextrq $1, %xmm7, %rbp
+; SSE4-NEXT:    movq %r12, %rax
+; SSE4-NEXT:    adcq %rbp, %rax
+; SSE4-NEXT:    setb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill
+; SSE4-NEXT:    movb $1, %al
+; SSE4-NEXT:    addb $-1, %al
+; SSE4-NEXT:    movq %xmm2, %r11
+; SSE4-NEXT:    movq %xmm6, %rbx
+; SSE4-NEXT:    movq %r11, %rax
+; SSE4-NEXT:    adcq %rbx, %rax
+; SSE4-NEXT:    setb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill
+; SSE4-NEXT:    movb $1, %al
+; SSE4-NEXT:    addb $-1, %al
+; SSE4-NEXT:    pextrq $1, %xmm2, %r14
+; SSE4-NEXT:    pextrq $1, %xmm6, %r15
+; SSE4-NEXT:    movq %r14, %rax
+; SSE4-NEXT:    adcq %r15, %rax
+; SSE4-NEXT:    setb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill
+; SSE4-NEXT:    movb $1, %al
+; SSE4-NEXT:    addb $-1, %al
+; SSE4-NEXT:    movq %xmm1, %r13
+; SSE4-NEXT:    movq %xmm5, %r10
+; SSE4-NEXT:    movq %r13, %rax
+; SSE4-NEXT:    adcq %r10, %rax
+; SSE4-NEXT:    setb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill
+; SSE4-NEXT:    movb $1, %al
+; SSE4-NEXT:    addb $-1, %al
+; SSE4-NEXT:    pextrq $1, %xmm1, %r9
+; SSE4-NEXT:    pextrq $1, %xmm5, %r8
+; SSE4-NEXT:    movq %r9, %rax
+; SSE4-NEXT:    adcq %r8, %rax
+; SSE4-NEXT:    setb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill
+; SSE4-NEXT:    movb $1, %al
+; SSE4-NEXT:    addb $-1, %al
+; SSE4-NEXT:    movq %xmm0, %rdi
+; SSE4-NEXT:    movq %xmm4, %rsi
+; SSE4-NEXT:    movq %rdi, %rdx
+; SSE4-NEXT:    adcq %rsi, %rdx
+; SSE4-NEXT:    movb $1, %dl
+; SSE4-NEXT:    setb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill
+; SSE4-NEXT:    addb $-1, %dl
+; SSE4-NEXT:    pextrq $1, %xmm0, %rcx
+; SSE4-NEXT:    pextrq $1, %xmm4, %rax
+; SSE4-NEXT:    movq %rcx, %rdx
+; SSE4-NEXT:    adcq %rax, %rdx
+; SSE4-NEXT:    leaq 1(%rcx,%rax), %rdx
+; SSE4-NEXT:    leaq 1(%rdi,%rsi), %rax
+; SSE4-NEXT:    leaq 1(%r9,%r8), %rcx
+; SSE4-NEXT:    leaq 1(%r13,%r10), %rdi
+; SSE4-NEXT:    leaq 1(%r14,%r15), %rsi
+; SSE4-NEXT:    leaq 1(%r11,%rbx), %r11
+; SSE4-NEXT:    leaq 1(%r12,%rbp), %r8
+; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; SSE4-NEXT:    leaq 1(%r9,%r10), %r9
+; SSE4-NEXT:    setb %r10b
+; SSE4-NEXT:    movzbl %r10b, %r10d
+; SSE4-NEXT:    shrdq $1, %r10, %rdx
+; SSE4-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload
+; SSE4-NEXT:    shrdq $1, %r10, %rax
+; SSE4-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload
+; SSE4-NEXT:    shrdq $1, %r10, %rcx
+; SSE4-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload
+; SSE4-NEXT:    shrdq $1, %r10, %rdi
+; SSE4-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload
+; SSE4-NEXT:    shrdq $1, %r10, %rsi
+; SSE4-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload
+; SSE4-NEXT:    shrdq $1, %r10, %r11
+; SSE4-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload
+; SSE4-NEXT:    shrdq $1, %r10, %r8
+; SSE4-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload
+; SSE4-NEXT:    shrdq $1, %r10, %r9
+; SSE4-NEXT:    movq %rdx, %xmm4
+; SSE4-NEXT:    movq %rax, %xmm0
+; SSE4-NEXT:    movq %rcx, %xmm5
+; SSE4-NEXT:    movq %rdi, %xmm1
+; SSE4-NEXT:    movq %rsi, %xmm6
+; SSE4-NEXT:    movq %r11, %xmm2
+; SSE4-NEXT:    movq %r8, %xmm7
+; SSE4-NEXT:    movq %r9, %xmm3
+; SSE4-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
+; SSE4-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0]
+; SSE4-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm6[0]
+; SSE4-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm7[0]
+; SSE4-NEXT:    popq %rbx
+; SSE4-NEXT:    .cfi_def_cfa_offset 48
+; SSE4-NEXT:    popq %r12
+; SSE4-NEXT:    .cfi_def_cfa_offset 40
+; SSE4-NEXT:    popq %r13
+; SSE4-NEXT:    .cfi_def_cfa_offset 32
+; SSE4-NEXT:    popq %r14
+; SSE4-NEXT:    .cfi_def_cfa_offset 24
+; SSE4-NEXT:    popq %r15
+; SSE4-NEXT:    .cfi_def_cfa_offset 16
+; SSE4-NEXT:    popq %rbp
+; SSE4-NEXT:    .cfi_def_cfa_offset 8
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: test_ext_v8i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    pushq %rbp
+; AVX1-NEXT:    .cfi_def_cfa_offset 16
+; AVX1-NEXT:    pushq %r15
+; AVX1-NEXT:    .cfi_def_cfa_offset 24
+; AVX1-NEXT:    pushq %r14
+; AVX1-NEXT:    .cfi_def_cfa_offset 32
+; AVX1-NEXT:    pushq %r13
+; AVX1-NEXT:    .cfi_def_cfa_offset 40
+; AVX1-NEXT:    pushq %r12
+; AVX1-NEXT:    .cfi_def_cfa_offset 48
+; AVX1-NEXT:    pushq %rbx
+; AVX1-NEXT:    .cfi_def_cfa_offset 56
+; AVX1-NEXT:    .cfi_offset %rbx, -56
+; AVX1-NEXT:    .cfi_offset %r12, -48
+; AVX1-NEXT:    .cfi_offset %r13, -40
+; AVX1-NEXT:    .cfi_offset %r14, -32
+; AVX1-NEXT:    .cfi_offset %r15, -24
+; AVX1-NEXT:    .cfi_offset %rbp, -16
+; AVX1-NEXT:    vmovq %xmm1, %rcx
+; AVX1-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT:    vmovq %xmm3, %rdx
+; AVX1-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT:    movb $1, %al
+; AVX1-NEXT:    addb $-1, %al
+; AVX1-NEXT:    movq %rcx, %rax
+; AVX1-NEXT:    adcq %rdx, %rax
+; AVX1-NEXT:    setb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill
+; AVX1-NEXT:    movb $1, %al
+; AVX1-NEXT:    addb $-1, %al
+; AVX1-NEXT:    vpextrq $1, %xmm1, %r12
+; AVX1-NEXT:    vpextrq $1, %xmm3, %rcx
+; AVX1-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT:    movq %r12, %rax
+; AVX1-NEXT:    adcq %rcx, %rax
+; AVX1-NEXT:    setb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill
+; AVX1-NEXT:    movb $1, %al
+; AVX1-NEXT:    addb $-1, %al
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vmovq %xmm1, %r11
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
+; AVX1-NEXT:    vmovq %xmm3, %rbx
+; AVX1-NEXT:    movq %r11, %rax
+; AVX1-NEXT:    adcq %rbx, %rax
+; AVX1-NEXT:    setb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill
+; AVX1-NEXT:    movb $1, %al
+; AVX1-NEXT:    addb $-1, %al
+; AVX1-NEXT:    vpextrq $1, %xmm1, %r14
+; AVX1-NEXT:    vpextrq $1, %xmm3, %r15
+; AVX1-NEXT:    movq %r14, %rax
+; AVX1-NEXT:    adcq %r15, %rax
+; AVX1-NEXT:    setb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill
+; AVX1-NEXT:    movb $1, %al
+; AVX1-NEXT:    addb $-1, %al
+; AVX1-NEXT:    vmovq %xmm0, %r13
+; AVX1-NEXT:    vmovq %xmm2, %r10
+; AVX1-NEXT:    movq %r13, %rax
+; AVX1-NEXT:    adcq %r10, %rax
+; AVX1-NEXT:    setb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill
+; AVX1-NEXT:    movb $1, %al
+; AVX1-NEXT:    addb $-1, %al
+; AVX1-NEXT:    vpextrq $1, %xmm0, %r9
+; AVX1-NEXT:    vpextrq $1, %xmm2, %r8
+; AVX1-NEXT:    movq %r9, %rax
+; AVX1-NEXT:    adcq %r8, %rax
+; AVX1-NEXT:    setb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill
+; AVX1-NEXT:    movb $1, %al
+; AVX1-NEXT:    addb $-1, %al
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm1
+; AVX1-NEXT:    vmovq %xmm0, %rdi
+; AVX1-NEXT:    vmovq %xmm1, %rsi
+; AVX1-NEXT:    movq %rdi, %rcx
+; AVX1-NEXT:    adcq %rsi, %rcx
+; AVX1-NEXT:    movb $1, %cl
+; AVX1-NEXT:    setb %bpl
+; AVX1-NEXT:    addb $-1, %cl
+; AVX1-NEXT:    vpextrq $1, %xmm0, %rdx
+; AVX1-NEXT:    vpextrq $1, %xmm1, %rax
+; AVX1-NEXT:    movq %rdx, %rcx
+; AVX1-NEXT:    adcq %rax, %rcx
+; AVX1-NEXT:    leaq 1(%rdx,%rax), %rcx
+; AVX1-NEXT:    leaq 1(%rdi,%rsi), %rax
+; AVX1-NEXT:    leaq 1(%r9,%r8), %rdx
+; AVX1-NEXT:    leaq 1(%r13,%r10), %rdi
+; AVX1-NEXT:    leaq 1(%r14,%r15), %rsi
+; AVX1-NEXT:    leaq 1(%r11,%rbx), %r11
+; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; AVX1-NEXT:    leaq 1(%r12,%r8), %r9
+; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; AVX1-NEXT:    leaq 1(%r8,%r10), %r8
+; AVX1-NEXT:    setb %r10b
+; AVX1-NEXT:    movzbl %r10b, %r10d
+; AVX1-NEXT:    shrdq $1, %r10, %rcx
+; AVX1-NEXT:    movzbl %bpl, %r10d
+; AVX1-NEXT:    shrdq $1, %r10, %rax
+; AVX1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload
+; AVX1-NEXT:    shrdq $1, %r10, %rdx
+; AVX1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload
+; AVX1-NEXT:    shrdq $1, %r10, %rdi
+; AVX1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload
+; AVX1-NEXT:    shrdq $1, %r10, %rsi
+; AVX1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload
+; AVX1-NEXT:    shrdq $1, %r10, %r11
+; AVX1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload
+; AVX1-NEXT:    shrdq $1, %r10, %r9
+; AVX1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload
+; AVX1-NEXT:    shrdq $1, %r10, %r8
+; AVX1-NEXT:    vmovq %rcx, %xmm0
+; AVX1-NEXT:    vmovq %rax, %xmm1
+; AVX1-NEXT:    vmovq %rdx, %xmm2
+; AVX1-NEXT:    vmovq %rdi, %xmm3
+; AVX1-NEXT:    vmovq %rsi, %xmm4
+; AVX1-NEXT:    vmovq %r11, %xmm5
+; AVX1-NEXT:    vmovq %r9, %xmm6
+; AVX1-NEXT:    vmovq %r8, %xmm7
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm5[0],xmm4[0]
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm7[0],xmm6[0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT:    popq %rbx
+; AVX1-NEXT:    .cfi_def_cfa_offset 48
+; AVX1-NEXT:    popq %r12
+; AVX1-NEXT:    .cfi_def_cfa_offset 40
+; AVX1-NEXT:    popq %r13
+; AVX1-NEXT:    .cfi_def_cfa_offset 32
+; AVX1-NEXT:    popq %r14
+; AVX1-NEXT:    .cfi_def_cfa_offset 24
+; AVX1-NEXT:    popq %r15
+; AVX1-NEXT:    .cfi_def_cfa_offset 16
+; AVX1-NEXT:    popq %rbp
+; AVX1-NEXT:    .cfi_def_cfa_offset 8
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_ext_v8i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    pushq %rbp
+; AVX2-NEXT:    .cfi_def_cfa_offset 16
+; AVX2-NEXT:    pushq %r15
+; AVX2-NEXT:    .cfi_def_cfa_offset 24
+; AVX2-NEXT:    pushq %r14
+; AVX2-NEXT:    .cfi_def_cfa_offset 32
+; AVX2-NEXT:    pushq %r13
+; AVX2-NEXT:    .cfi_def_cfa_offset 40
+; AVX2-NEXT:    pushq %r12
+; AVX2-NEXT:    .cfi_def_cfa_offset 48
+; AVX2-NEXT:    pushq %rbx
+; AVX2-NEXT:    .cfi_def_cfa_offset 56
+; AVX2-NEXT:    .cfi_offset %rbx, -56
+; AVX2-NEXT:    .cfi_offset %r12, -48
+; AVX2-NEXT:    .cfi_offset %r13, -40
+; AVX2-NEXT:    .cfi_offset %r14, -32
+; AVX2-NEXT:    .cfi_offset %r15, -24
+; AVX2-NEXT:    .cfi_offset %rbp, -16
+; AVX2-NEXT:    vmovq %xmm1, %rcx
+; AVX2-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    vmovq %xmm3, %rdx
+; AVX2-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    movb $1, %al
+; AVX2-NEXT:    addb $-1, %al
+; AVX2-NEXT:    movq %rcx, %rax
+; AVX2-NEXT:    adcq %rdx, %rax
+; AVX2-NEXT:    setb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill
+; AVX2-NEXT:    movb $1, %al
+; AVX2-NEXT:    addb $-1, %al
+; AVX2-NEXT:    vpextrq $1, %xmm1, %r12
+; AVX2-NEXT:    vpextrq $1, %xmm3, %rcx
+; AVX2-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    movq %r12, %rax
+; AVX2-NEXT:    adcq %rcx, %rax
+; AVX2-NEXT:    setb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill
+; AVX2-NEXT:    movb $1, %al
+; AVX2-NEXT:    addb $-1, %al
+; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm1
+; AVX2-NEXT:    vmovq %xmm1, %r11
+; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm3
+; AVX2-NEXT:    vmovq %xmm3, %rbx
+; AVX2-NEXT:    movq %r11, %rax
+; AVX2-NEXT:    adcq %rbx, %rax
+; AVX2-NEXT:    setb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill
+; AVX2-NEXT:    movb $1, %al
+; AVX2-NEXT:    addb $-1, %al
+; AVX2-NEXT:    vpextrq $1, %xmm1, %r14
+; AVX2-NEXT:    vpextrq $1, %xmm3, %r15
+; AVX2-NEXT:    movq %r14, %rax
+; AVX2-NEXT:    adcq %r15, %rax
+; AVX2-NEXT:    setb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill
+; AVX2-NEXT:    movb $1, %al
+; AVX2-NEXT:    addb $-1, %al
+; AVX2-NEXT:    vmovq %xmm0, %r13
+; AVX2-NEXT:    vmovq %xmm2, %r10
+; AVX2-NEXT:    movq %r13, %rax
+; AVX2-NEXT:    adcq %r10, %rax
+; AVX2-NEXT:    setb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill
+; AVX2-NEXT:    movb $1, %al
+; AVX2-NEXT:    addb $-1, %al
+; AVX2-NEXT:    vpextrq $1, %xmm0, %r9
+; AVX2-NEXT:    vpextrq $1, %xmm2, %r8
+; AVX2-NEXT:    movq %r9, %rax
+; AVX2-NEXT:    adcq %r8, %rax
+; AVX2-NEXT:    setb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill
+; AVX2-NEXT:    movb $1, %al
+; AVX2-NEXT:    addb $-1, %al
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm1
+; AVX2-NEXT:    vmovq %xmm0, %rdi
+; AVX2-NEXT:    vmovq %xmm1, %rsi
+; AVX2-NEXT:    movq %rdi, %rcx
+; AVX2-NEXT:    adcq %rsi, %rcx
+; AVX2-NEXT:    movb $1, %cl
+; AVX2-NEXT:    setb %bpl
+; AVX2-NEXT:    addb $-1, %cl
+; AVX2-NEXT:    vpextrq $1, %xmm0, %rdx
+; AVX2-NEXT:    vpextrq $1, %xmm1, %rax
+; AVX2-NEXT:    movq %rdx, %rcx
+; AVX2-NEXT:    adcq %rax, %rcx
+; AVX2-NEXT:    leaq 1(%rdx,%rax), %rcx
+; AVX2-NEXT:    leaq 1(%rdi,%rsi), %rax
+; AVX2-NEXT:    leaq 1(%r9,%r8), %rdx
+; AVX2-NEXT:    leaq 1(%r13,%r10), %rdi
+; AVX2-NEXT:    leaq 1(%r14,%r15), %rsi
+; AVX2-NEXT:    leaq 1(%r11,%rbx), %r11
+; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; AVX2-NEXT:    leaq 1(%r12,%r8), %r9
+; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; AVX2-NEXT:    leaq 1(%r8,%r10), %r8
+; AVX2-NEXT:    setb %r10b
+; AVX2-NEXT:    movzbl %r10b, %r10d
+; AVX2-NEXT:    shrdq $1, %r10, %rcx
+; AVX2-NEXT:    movzbl %bpl, %r10d
+; AVX2-NEXT:    shrdq $1, %r10, %rax
+; AVX2-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload
+; AVX2-NEXT:    shrdq $1, %r10, %rdx
+; AVX2-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload
+; AVX2-NEXT:    shrdq $1, %r10, %rdi
+; AVX2-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload
+; AVX2-NEXT:    shrdq $1, %r10, %rsi
+; AVX2-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload
+; AVX2-NEXT:    shrdq $1, %r10, %r11
+; AVX2-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload
+; AVX2-NEXT:    shrdq $1, %r10, %r9
+; AVX2-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload
+; AVX2-NEXT:    shrdq $1, %r10, %r8
+; AVX2-NEXT:    vmovq %rcx, %xmm0
+; AVX2-NEXT:    vmovq %rax, %xmm1
+; AVX2-NEXT:    vmovq %rdx, %xmm2
+; AVX2-NEXT:    vmovq %rdi, %xmm3
+; AVX2-NEXT:    vmovq %rsi, %xmm4
+; AVX2-NEXT:    vmovq %r11, %xmm5
+; AVX2-NEXT:    vmovq %r9, %xmm6
+; AVX2-NEXT:    vmovq %r8, %xmm7
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0]
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm5[0],xmm4[0]
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm7[0],xmm6[0]
+; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX2-NEXT:    popq %rbx
+; AVX2-NEXT:    .cfi_def_cfa_offset 48
+; AVX2-NEXT:    popq %r12
+; AVX2-NEXT:    .cfi_def_cfa_offset 40
+; AVX2-NEXT:    popq %r13
+; AVX2-NEXT:    .cfi_def_cfa_offset 32
+; AVX2-NEXT:    popq %r14
+; AVX2-NEXT:    .cfi_def_cfa_offset 24
+; AVX2-NEXT:    popq %r15
+; AVX2-NEXT:    .cfi_def_cfa_offset 16
+; AVX2-NEXT:    popq %rbp
+; AVX2-NEXT:    .cfi_def_cfa_offset 8
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_ext_v8i64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    pushq %rbp
+; AVX512-NEXT:    .cfi_def_cfa_offset 16
+; AVX512-NEXT:    pushq %r15
+; AVX512-NEXT:    .cfi_def_cfa_offset 24
+; AVX512-NEXT:    pushq %r14
+; AVX512-NEXT:    .cfi_def_cfa_offset 32
+; AVX512-NEXT:    pushq %r13
+; AVX512-NEXT:    .cfi_def_cfa_offset 40
+; AVX512-NEXT:    pushq %r12
+; AVX512-NEXT:    .cfi_def_cfa_offset 48
+; AVX512-NEXT:    pushq %rbx
+; AVX512-NEXT:    .cfi_def_cfa_offset 56
+; AVX512-NEXT:    .cfi_offset %rbx, -56
+; AVX512-NEXT:    .cfi_offset %r12, -48
+; AVX512-NEXT:    .cfi_offset %r13, -40
+; AVX512-NEXT:    .cfi_offset %r14, -32
+; AVX512-NEXT:    .cfi_offset %r15, -24
+; AVX512-NEXT:    .cfi_offset %rbp, -16
+; AVX512-NEXT:    vmovq %xmm0, %rcx
+; AVX512-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT:    vmovq %xmm1, %rdx
+; AVX512-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT:    movb $1, %al
+; AVX512-NEXT:    addb $-1, %al
+; AVX512-NEXT:    movq %rcx, %rax
+; AVX512-NEXT:    adcq %rdx, %rax
+; AVX512-NEXT:    setb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill
+; AVX512-NEXT:    movb $1, %al
+; AVX512-NEXT:    vpextrq $1, %xmm0, %r12
+; AVX512-NEXT:    vpextrq $1, %xmm1, %rcx
+; AVX512-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT:    addb $-1, %al
+; AVX512-NEXT:    movq %r12, %rax
+; AVX512-NEXT:    adcq %rcx, %rax
+; AVX512-NEXT:    setb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill
+; AVX512-NEXT:    movb $1, %al
+; AVX512-NEXT:    addb $-1, %al
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX512-NEXT:    vmovq %xmm2, %r11
+; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm3
+; AVX512-NEXT:    vmovq %xmm3, %rbx
+; AVX512-NEXT:    movq %r11, %rax
+; AVX512-NEXT:    adcq %rbx, %rax
+; AVX512-NEXT:    setb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill
+; AVX512-NEXT:    movb $1, %al
+; AVX512-NEXT:    addb $-1, %al
+; AVX512-NEXT:    vpextrq $1, %xmm2, %r14
+; AVX512-NEXT:    vpextrq $1, %xmm3, %r15
+; AVX512-NEXT:    movq %r14, %rax
+; AVX512-NEXT:    adcq %r15, %rax
+; AVX512-NEXT:    setb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill
+; AVX512-NEXT:    movb $1, %al
+; AVX512-NEXT:    addb $-1, %al
+; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
+; AVX512-NEXT:    vmovq %xmm0, %r13
+; AVX512-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
+; AVX512-NEXT:    vmovq %xmm1, %r10
+; AVX512-NEXT:    movq %r13, %rax
+; AVX512-NEXT:    adcq %r10, %rax
+; AVX512-NEXT:    setb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill
+; AVX512-NEXT:    movb $1, %al
+; AVX512-NEXT:    vpextrq $1, %xmm0, %r9
+; AVX512-NEXT:    addb $-1, %al
+; AVX512-NEXT:    vpextrq $1, %xmm1, %r8
+; AVX512-NEXT:    movq %r9, %rax
+; AVX512-NEXT:    adcq %r8, %rax
+; AVX512-NEXT:    setb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill
+; AVX512-NEXT:    movb $1, %al
+; AVX512-NEXT:    addb $-1, %al
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm1
+; AVX512-NEXT:    vmovq %xmm0, %rdi
+; AVX512-NEXT:    vmovq %xmm1, %rsi
+; AVX512-NEXT:    movq %rdi, %rcx
+; AVX512-NEXT:    adcq %rsi, %rcx
+; AVX512-NEXT:    movb $1, %cl
+; AVX512-NEXT:    setb %bpl
+; AVX512-NEXT:    vpextrq $1, %xmm0, %rdx
+; AVX512-NEXT:    vpextrq $1, %xmm1, %rax
+; AVX512-NEXT:    addb $-1, %cl
+; AVX512-NEXT:    movq %rdx, %rcx
+; AVX512-NEXT:    adcq %rax, %rcx
+; AVX512-NEXT:    leaq 1(%rdx,%rax), %rcx
+; AVX512-NEXT:    leaq 1(%rdi,%rsi), %rax
+; AVX512-NEXT:    leaq 1(%r9,%r8), %rdx
+; AVX512-NEXT:    leaq 1(%r13,%r10), %rdi
+; AVX512-NEXT:    leaq 1(%r14,%r15), %rsi
+; AVX512-NEXT:    leaq 1(%r11,%rbx), %r11
+; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; AVX512-NEXT:    leaq 1(%r12,%r8), %r9
+; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; AVX512-NEXT:    leaq 1(%r8,%r10), %r8
+; AVX512-NEXT:    setb %r10b
+; AVX512-NEXT:    movzbl %r10b, %r10d
+; AVX512-NEXT:    shrdq $1, %r10, %rcx
+; AVX512-NEXT:    movzbl %bpl, %r10d
+; AVX512-NEXT:    shrdq $1, %r10, %rax
+; AVX512-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload
+; AVX512-NEXT:    shrdq $1, %r10, %rdx
+; AVX512-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload
+; AVX512-NEXT:    shrdq $1, %r10, %rdi
+; AVX512-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload
+; AVX512-NEXT:    shrdq $1, %r10, %rsi
+; AVX512-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload
+; AVX512-NEXT:    shrdq $1, %r10, %r11
+; AVX512-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload
+; AVX512-NEXT:    shrdq $1, %r10, %r9
+; AVX512-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload
+; AVX512-NEXT:    shrdq $1, %r10, %r8
+; AVX512-NEXT:    vmovq %rcx, %xmm0
+; AVX512-NEXT:    vmovq %rax, %xmm1
+; AVX512-NEXT:    vmovq %rdx, %xmm2
+; AVX512-NEXT:    vmovq %rdi, %xmm3
+; AVX512-NEXT:    vmovq %rsi, %xmm4
+; AVX512-NEXT:    vmovq %r11, %xmm5
+; AVX512-NEXT:    vmovq %r9, %xmm6
+; AVX512-NEXT:    vmovq %r8, %xmm7
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0]
+; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm5[0],xmm4[0]
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm7[0],xmm6[0]
+; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512-NEXT:    popq %rbx
+; AVX512-NEXT:    .cfi_def_cfa_offset 48
+; AVX512-NEXT:    popq %r12
+; AVX512-NEXT:    .cfi_def_cfa_offset 40
+; AVX512-NEXT:    popq %r13
+; AVX512-NEXT:    .cfi_def_cfa_offset 32
+; AVX512-NEXT:    popq %r14
+; AVX512-NEXT:    .cfi_def_cfa_offset 24
+; AVX512-NEXT:    popq %r15
+; AVX512-NEXT:    .cfi_def_cfa_offset 16
+; AVX512-NEXT:    popq %rbp
+; AVX512-NEXT:    .cfi_def_cfa_offset 8
+; AVX512-NEXT:    retq
+  %x0 = zext <8 x i64> %a0 to <8 x i128>
+  %x1 = zext <8 x i64> %a1 to <8 x i128>
+  %sum = add <8 x i128> %x0, %x1
+  %inc = add <8 x i128> %sum, <i128 1, i128 1, i128 1, i128 1, i128 1, i128 1, i128 1, i128 1>
+  %shift = lshr <8 x i128> %inc, <i128 1, i128 1, i128 1, i128 1, i128 1, i128 1, i128 1, i128 1>
+  %res = trunc <8 x i128> %shift to <8 x i64>
+  ret <8 x i64> %res
+}
+
diff --git a/llvm/test/CodeGen/X86/avgfloors.ll b/llvm/test/CodeGen/X86/avgfloors.ll
new file mode 100644
index 00000000000000..a3864ab4bb44e4
--- /dev/null
+++ b/llvm/test/CodeGen/X86/avgfloors.ll
@@ -0,0 +1,3437 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE4
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX,AVX512
+
+;
+; 128-bit vectors
+;
+
+define <16 x i8> @test_fixed_v16i8(<16 x i8> %a0, <16 x i8> %a1) {
+; SSE-LABEL: test_fixed_v16i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa %xmm0, %xmm2
+; SSE-NEXT:    pand %xmm1, %xmm2
+; SSE-NEXT:    pxor %xmm1, %xmm0
+; SSE-NEXT:    psrlw $1, %xmm0
+; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; SSE-NEXT:    pxor %xmm1, %xmm0
+; SSE-NEXT:    paddb %xmm2, %xmm0
+; SSE-NEXT:    psubb %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: test_fixed_v16i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm0
+; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_fixed_v16i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpsrlw $1, %xmm0, %xmm0
+; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_fixed_v16i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpsrlw $1, %xmm0, %xmm0
+; AVX512-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; AVX512-NEXT:    vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0
+; AVX512-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
+; AVX512-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+  %and = and <16 x i8> %a0, %a1
+  %xor = xor <16 x i8> %a0, %a1
+  %shift = ashr <16 x i8> %xor, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  %res = add <16 x i8> %and, %shift
+  ret <16 x i8> %res
+}
+
+define <16 x i8> @test_ext_v16i8(<16 x i8> %a0, <16 x i8> %a1) {
+; SSE2-LABEL: test_ext_v16i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-NEXT:    psraw $8, %xmm2
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
+; SSE2-NEXT:    psraw $8, %xmm3
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT:    psraw $8, %xmm0
+; SSE2-NEXT:    paddw %xmm2, %xmm0
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE2-NEXT:    psraw $8, %xmm1
+; SSE2-NEXT:    paddw %xmm3, %xmm1
+; SSE2-NEXT:    psrlw $1, %xmm0
+; SSE2-NEXT:    psrlw $1, %xmm1
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; SSE2-NEXT:    pand %xmm2, %xmm1
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    packuswb %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_ext_v16i8:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
+; SSE4-NEXT:    pmovsxbw %xmm2, %xmm2
+; SSE4-NEXT:    pmovsxbw %xmm0, %xmm3
+; SSE4-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE4-NEXT:    pmovsxbw %xmm0, %xmm4
+; SSE4-NEXT:    paddw %xmm2, %xmm4
+; SSE4-NEXT:    pmovsxbw %xmm1, %xmm0
+; SSE4-NEXT:    paddw %xmm3, %xmm0
+; SSE4-NEXT:    psrlw $1, %xmm4
+; SSE4-NEXT:    psrlw $1, %xmm0
+; SSE4-NEXT:    pmovzxbw {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
+; SSE4-NEXT:    pand %xmm1, %xmm0
+; SSE4-NEXT:    pand %xmm1, %xmm4
+; SSE4-NEXT:    packuswb %xmm4, %xmm0
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: test_ext_v16i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
+; AVX1-NEXT:    vpmovsxbw %xmm2, %xmm2
+; AVX1-NEXT:    vpmovsxbw %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
+; AVX1-NEXT:    vpmovsxbw %xmm3, %xmm3
+; AVX1-NEXT:    vpaddw %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpmovsxbw %xmm1, %xmm1
+; AVX1-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlw $1, %xmm2, %xmm1
+; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm0
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_ext_v16i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
+; AVX2-NEXT:    vpmovsxbw %xmm1, %ymm1
+; AVX2-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrlw $1, %ymm0, %ymm0
+; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_ext_v16i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovsxbw %xmm0, %ymm0
+; AVX512-NEXT:    vpmovsxbw %xmm1, %ymm1
+; AVX512-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpsrlw $1, %ymm0, %ymm0
+; AVX512-NEXT:    vpmovwb %ymm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %x0 = sext <16 x i8> %a0 to <16 x i16>
+  %x1 = sext <16 x i8> %a1 to <16 x i16>
+  %sum = add <16 x i16> %x0, %x1
+  %shift = ashr <16 x i16> %sum, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %res = trunc <16 x i16> %shift to <16 x i8>
+  ret <16 x i8> %res
+}
+
+define <8 x i16> @test_fixed_v8i16(<8 x i16> %a0, <8 x i16> %a1) {
+; SSE-LABEL: test_fixed_v8i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa %xmm0, %xmm2
+; SSE-NEXT:    pand %xmm1, %xmm2
+; SSE-NEXT:    pxor %xmm1, %xmm0
+; SSE-NEXT:    psraw $1, %xmm0
+; SSE-NEXT:    paddw %xmm2, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test_fixed_v8i16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; AVX-NEXT:    vpxor %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vpsraw $1, %xmm0, %xmm0
+; AVX-NEXT:    vpaddw %xmm0, %xmm2, %xmm0
+; AVX-NEXT:    retq
+  %and = and <8 x i16> %a0, %a1
+  %xor = xor <8 x i16> %a1, %a0
+  %shift = ashr <8 x i16> %xor, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %res = add <8 x i16> %and, %shift
+  ret <8 x i16> %res
+}
+
+define <8 x i16> @test_ext_v8i16(<8 x i16> %a0, <8 x i16> %a1) {
+; SSE2-LABEL: test_ext_v8i16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-NEXT:    psrad $16, %xmm2
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
+; SSE2-NEXT:    psrad $16, %xmm3
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
+; SSE2-NEXT:    psrad $16, %xmm4
+; SSE2-NEXT:    paddd %xmm2, %xmm4
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT:    psrad $16, %xmm0
+; SSE2-NEXT:    paddd %xmm3, %xmm0
+; SSE2-NEXT:    pslld $15, %xmm4
+; SSE2-NEXT:    psrad $16, %xmm4
+; SSE2-NEXT:    pslld $15, %xmm0
+; SSE2-NEXT:    psrad $16, %xmm0
+; SSE2-NEXT:    packssdw %xmm4, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_ext_v8i16:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
+; SSE4-NEXT:    pmovsxwd %xmm2, %xmm2
+; SSE4-NEXT:    pmovsxwd %xmm0, %xmm3
+; SSE4-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE4-NEXT:    pmovsxwd %xmm0, %xmm4
+; SSE4-NEXT:    paddd %xmm2, %xmm4
+; SSE4-NEXT:    pmovsxwd %xmm1, %xmm0
+; SSE4-NEXT:    paddd %xmm3, %xmm0
+; SSE4-NEXT:    psrld $1, %xmm4
+; SSE4-NEXT:    psrld $1, %xmm0
+; SSE4-NEXT:    pxor %xmm1, %xmm1
+; SSE4-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; SSE4-NEXT:    pblendw {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2],xmm1[3],xmm4[4],xmm1[5],xmm4[6],xmm1[7]
+; SSE4-NEXT:    packusdw %xmm4, %xmm0
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: test_ext_v8i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
+; AVX1-NEXT:    vpmovsxwd %xmm2, %xmm2
+; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
+; AVX1-NEXT:    vpmovsxwd %xmm3, %xmm3
+; AVX1-NEXT:    vpaddd %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpmovsxwd %xmm1, %xmm1
+; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrld $1, %xmm2, %xmm1
+; AVX1-NEXT:    vpsrld $1, %xmm0, %xmm0
+; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
+; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_ext_v8i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0
+; AVX2-NEXT:    vpmovsxwd %xmm1, %ymm1
+; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrld $1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_ext_v8i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovsxwd %xmm0, %ymm0
+; AVX512-NEXT:    vpmovsxwd %xmm1, %ymm1
+; AVX512-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpsrld $1, %ymm0, %ymm0
+; AVX512-NEXT:    vpmovdw %ymm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %x0 = sext <8 x i16> %a0 to <8 x i32>
+  %x1 = sext <8 x i16> %a1 to <8 x i32>
+  %sum = add <8 x i32> %x0, %x1
+  %shift = ashr <8 x i32> %sum, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %res = trunc <8 x i32> %shift to <8 x i16>
+  ret <8 x i16> %res
+}
+
+define <4 x i32> @test_fixed_v4i32(<4 x i32> %a0, <4 x i32> %a1) {
+; SSE-LABEL: test_fixed_v4i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa %xmm0, %xmm2
+; SSE-NEXT:    pand %xmm1, %xmm2
+; SSE-NEXT:    pxor %xmm1, %xmm0
+; SSE-NEXT:    psrad $1, %xmm0
+; SSE-NEXT:    paddd %xmm2, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test_fixed_v4i32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; AVX-NEXT:    vpxor %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vpsrad $1, %xmm0, %xmm0
+; AVX-NEXT:    vpaddd %xmm0, %xmm2, %xmm0
+; AVX-NEXT:    retq
+  %and = and <4 x i32> %a0, %a1
+  %xor = xor <4 x i32> %a1, %a0
+  %shift = ashr <4 x i32> %xor, <i32 1, i32 1, i32 1, i32 1>
+  %res = add <4 x i32> %and, %shift
+  ret <4 x i32> %res
+}
+
+define <4 x i32> @test_ext_v4i32(<4 x i32> %a0, <4 x i32> %a1) {
+; SSE2-LABEL: test_ext_v4i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    pxor %xmm3, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; SSE2-NEXT:    pxor %xmm3, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm4, %xmm3
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
+; SSE2-NEXT:    pxor %xmm3, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[2,3,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; SSE2-NEXT:    paddq %xmm1, %xmm0
+; SSE2-NEXT:    pcmpgtd %xmm5, %xmm2
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1]
+; SSE2-NEXT:    paddq %xmm4, %xmm5
+; SSE2-NEXT:    psrlq $1, %xmm0
+; SSE2-NEXT:    psrlq $1, %xmm5
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2]
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_ext_v4i32:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    pmovsxdq %xmm0, %xmm2
+; SSE4-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; SSE4-NEXT:    pmovsxdq %xmm0, %xmm3
+; SSE4-NEXT:    pmovsxdq %xmm1, %xmm0
+; SSE4-NEXT:    paddq %xmm2, %xmm0
+; SSE4-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; SSE4-NEXT:    pmovsxdq %xmm1, %xmm1
+; SSE4-NEXT:    paddq %xmm3, %xmm1
+; SSE4-NEXT:    psrlq $1, %xmm0
+; SSE4-NEXT:    psrlq $1, %xmm1
+; SSE4-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: test_ext_v4i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm2
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm0
+; AVX1-NEXT:    vpmovsxdq %xmm1, %xmm3
+; AVX1-NEXT:    vpaddq %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; AVX1-NEXT:    vpmovsxdq %xmm1, %xmm1
+; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlq $1, %xmm2, %xmm1
+; AVX1-NEXT:    vpsrlq $1, %xmm0, %xmm0
+; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[0,2],xmm0[0,2]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_ext_v4i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmovsxdq %xmm0, %ymm0
+; AVX2-NEXT:    vpmovsxdq %xmm1, %ymm1
+; AVX2-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrlq $1, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_ext_v4i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovsxdq %xmm0, %ymm0
+; AVX512-NEXT:    vpmovsxdq %xmm1, %ymm1
+; AVX512-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpsrlq $1, %ymm0, %ymm0
+; AVX512-NEXT:    vpmovqd %ymm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %x0 = sext <4 x i32> %a0 to <4 x i64>
+  %x1 = sext <4 x i32> %a1 to <4 x i64>
+  %sum = add <4 x i64> %x0, %x1
+  %shift = ashr <4 x i64> %sum, <i64 1, i64 1, i64 1, i64 1>
+  %res = trunc <4 x i64> %shift to <4 x i32>
+  ret <4 x i32> %res
+}
+
+define <2 x i64> @test_fixed_v2i64(<2 x i64> %a0, <2 x i64> %a1) {
+; SSE2-LABEL: test_fixed_v2i64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    pand %xmm1, %xmm2
+; SSE2-NEXT:    pxor %xmm0, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
+; SSE2-NEXT:    psrad $1, %xmm3
+; SSE2-NEXT:    psrlq $1, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; SSE2-NEXT:    paddq %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_fixed_v2i64:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    movdqa %xmm0, %xmm2
+; SSE4-NEXT:    pand %xmm1, %xmm2
+; SSE4-NEXT:    pxor %xmm1, %xmm0
+; SSE4-NEXT:    movdqa %xmm0, %xmm1
+; SSE4-NEXT:    psrad $1, %xmm1
+; SSE4-NEXT:    psrlq $1, %xmm0
+; SSE4-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; SSE4-NEXT:    paddq %xmm2, %xmm0
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: test_fixed_v2i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; AVX1-NEXT:    vpxor %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vpsrad $1, %xmm0, %xmm1
+; AVX1-NEXT:    vpsrlq $1, %xmm0, %xmm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; AVX1-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_fixed_v2i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; AVX2-NEXT:    vpxor %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vpsrad $1, %xmm0, %xmm1
+; AVX2-NEXT:    vpsrlq $1, %xmm0, %xmm0
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; AVX2-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_fixed_v2i64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; AVX512-NEXT:    vpxor %xmm0, %xmm1, %xmm0
+; AVX512-NEXT:    vpsraq $1, %xmm0, %xmm0
+; AVX512-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
+; AVX512-NEXT:    retq
+  %and = and <2 x i64> %a0, %a1
+  %xor = xor <2 x i64> %a1, %a0
+  %shift = ashr <2 x i64> %xor, <i64 1, i64 1>
+  %res = add <2 x i64> %and, %shift
+  ret <2 x i64> %res
+}
+
+define <2 x i64> @test_ext_v2i64(<2 x i64> %a0, <2 x i64> %a1) {
+; SSE2-LABEL: test_ext_v2i64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
+; SSE2-NEXT:    movq %xmm2, %rax
+; SSE2-NEXT:    movq %rax, %rcx
+; SSE2-NEXT:    sarq $63, %rcx
+; SSE2-NEXT:    movq %xmm0, %rdx
+; SSE2-NEXT:    movq %rdx, %rsi
+; SSE2-NEXT:    sarq $63, %rsi
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE2-NEXT:    movq %xmm0, %rdi
+; SSE2-NEXT:    movq %rdi, %r8
+; SSE2-NEXT:    sarq $63, %r8
+; SSE2-NEXT:    movq %xmm1, %r9
+; SSE2-NEXT:    movq %r9, %r10
+; SSE2-NEXT:    sarq $63, %r10
+; SSE2-NEXT:    addq %r9, %rdx
+; SSE2-NEXT:    adcq %rsi, %r10
+; SSE2-NEXT:    addq %rdi, %rax
+; SSE2-NEXT:    adcq %rcx, %r8
+; SSE2-NEXT:    shldq $63, %rax, %r8
+; SSE2-NEXT:    shldq $63, %rdx, %r10
+; SSE2-NEXT:    movq %r10, %xmm0
+; SSE2-NEXT:    movq %r8, %xmm1
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_ext_v2i64:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    movq %xmm0, %rax
+; SSE4-NEXT:    movq %rax, %rcx
+; SSE4-NEXT:    sarq $63, %rcx
+; SSE4-NEXT:    pextrq $1, %xmm0, %rdx
+; SSE4-NEXT:    movq %rdx, %rsi
+; SSE4-NEXT:    sarq $63, %rsi
+; SSE4-NEXT:    movq %xmm1, %rdi
+; SSE4-NEXT:    movq %rdi, %r8
+; SSE4-NEXT:    sarq $63, %r8
+; SSE4-NEXT:    pextrq $1, %xmm1, %r9
+; SSE4-NEXT:    movq %r9, %r10
+; SSE4-NEXT:    sarq $63, %r10
+; SSE4-NEXT:    addq %r9, %rdx
+; SSE4-NEXT:    adcq %rsi, %r10
+; SSE4-NEXT:    addq %rdi, %rax
+; SSE4-NEXT:    adcq %rcx, %r8
+; SSE4-NEXT:    shldq $63, %rax, %r8
+; SSE4-NEXT:    shldq $63, %rdx, %r10
+; SSE4-NEXT:    movq %r10, %xmm1
+; SSE4-NEXT:    movq %r8, %xmm0
+; SSE4-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE4-NEXT:    retq
+;
+; AVX-LABEL: test_ext_v2i64:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovq %xmm0, %rax
+; AVX-NEXT:    movq %rax, %rcx
+; AVX-NEXT:    sarq $63, %rcx
+; AVX-NEXT:    vpextrq $1, %xmm0, %rdx
+; AVX-NEXT:    movq %rdx, %rsi
+; AVX-NEXT:    sarq $63, %rsi
+; AVX-NEXT:    vmovq %xmm1, %rdi
+; AVX-NEXT:    movq %rdi, %r8
+; AVX-NEXT:    sarq $63, %r8
+; AVX-NEXT:    vpextrq $1, %xmm1, %r9
+; AVX-NEXT:    movq %r9, %r10
+; AVX-NEXT:    sarq $63, %r10
+; AVX-NEXT:    addq %r9, %rdx
+; AVX-NEXT:    adcq %rsi, %r10
+; AVX-NEXT:    addq %rdi, %rax
+; AVX-NEXT:    adcq %rcx, %r8
+; AVX-NEXT:    shldq $63, %rax, %r8
+; AVX-NEXT:    shldq $63, %rdx, %r10
+; AVX-NEXT:    vmovq %r10, %xmm0
+; AVX-NEXT:    vmovq %r8, %xmm1
+; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX-NEXT:    retq
+  %x0 = sext <2 x i64> %a0 to <2 x i128>
+  %x1 = sext <2 x i64> %a1 to <2 x i128>
+  %sum = add <2 x i128> %x0, %x1
+  %shift = ashr <2 x i128> %sum, <i128 1, i128 1>
+  %res = trunc <2 x i128> %shift to <2 x i64>
+  ret <2 x i64> %res
+}
+
+;
+; 256-bit vectors
+;
+
+define <32 x i8> @test_fixed_v32i8(<32 x i8> %a0, <32 x i8> %a1) {
+; SSE-LABEL: test_fixed_v32i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa %xmm1, %xmm4
+; SSE-NEXT:    pand %xmm3, %xmm4
+; SSE-NEXT:    movdqa %xmm0, %xmm5
+; SSE-NEXT:    pand %xmm2, %xmm5
+; SSE-NEXT:    pxor %xmm2, %xmm0
+; SSE-NEXT:    pxor %xmm3, %xmm1
+; SSE-NEXT:    psrlw $1, %xmm1
+; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; SSE-NEXT:    pand %xmm2, %xmm1
+; SSE-NEXT:    movdqa {{.*#+}} xmm3 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; SSE-NEXT:    pxor %xmm3, %xmm1
+; SSE-NEXT:    paddb %xmm4, %xmm1
+; SSE-NEXT:    psrlw $1, %xmm0
+; SSE-NEXT:    pand %xmm2, %xmm0
+; SSE-NEXT:    pxor %xmm3, %xmm0
+; SSE-NEXT:    paddb %xmm5, %xmm0
+; SSE-NEXT:    psubb %xmm3, %xmm0
+; SSE-NEXT:    psubb %xmm3, %xmm1
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: test_fixed_v32i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm2
+; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm1
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm3 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm4 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; AVX1-NEXT:    vpxor %xmm4, %xmm1, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpxor %xmm4, %xmm0, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
+; AVX1-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpsubb %xmm4, %xmm0, %xmm0
+; AVX1-NEXT:    vpaddb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpsubb %xmm4, %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_fixed_v32i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
+; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrlw $1, %ymm0, %ymm0
+; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpsubb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_fixed_v32i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm2
+; AVX512-NEXT:    vpxor %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpsrlw $1, %ymm0, %ymm0
+; AVX512-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; AVX512-NEXT:    vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm0
+; AVX512-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
+; AVX512-NEXT:    vpsubb %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    retq
+  %and = and <32 x i8> %a0, %a1
+  %xor = xor <32 x i8> %a0, %a1
+  %shift = ashr <32 x i8> %xor, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  %res = add <32 x i8> %and, %shift
+  ret <32 x i8> %res
+}
+
+define <32 x i8> @test_ext_v32i8(<32 x i8> %a0, <32 x i8> %a1) {
+; SSE2-LABEL: test_ext_v32i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
+; SSE2-NEXT:    psraw $8, %xmm4
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm1[8],xmm5[9],xmm1[9],xmm5[10],xmm1[10],xmm5[11],xmm1[11],xmm5[12],xmm1[12],xmm5[13],xmm1[13],xmm5[14],xmm1[14],xmm5[15],xmm1[15]
+; SSE2-NEXT:    psraw $8, %xmm5
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3],xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
+; SSE2-NEXT:    psraw $8, %xmm6
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm0[8],xmm7[9],xmm0[9],xmm7[10],xmm0[10],xmm7[11],xmm0[11],xmm7[12],xmm0[12],xmm7[13],xmm0[13],xmm7[14],xmm0[14],xmm7[15],xmm0[15]
+; SSE2-NEXT:    psraw $8, %xmm7
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
+; SSE2-NEXT:    psraw $8, %xmm1
+; SSE2-NEXT:    paddw %xmm4, %xmm1
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE2-NEXT:    psraw $8, %xmm3
+; SSE2-NEXT:    paddw %xmm5, %xmm3
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; SSE2-NEXT:    psraw $8, %xmm0
+; SSE2-NEXT:    paddw %xmm6, %xmm0
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE2-NEXT:    psraw $8, %xmm2
+; SSE2-NEXT:    paddw %xmm7, %xmm2
+; SSE2-NEXT:    psrlw $1, %xmm1
+; SSE2-NEXT:    psrlw $1, %xmm3
+; SSE2-NEXT:    psrlw $1, %xmm0
+; SSE2-NEXT:    psrlw $1, %xmm2
+; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; SSE2-NEXT:    pand %xmm4, %xmm2
+; SSE2-NEXT:    pand %xmm4, %xmm0
+; SSE2-NEXT:    packuswb %xmm2, %xmm0
+; SSE2-NEXT:    pand %xmm4, %xmm3
+; SSE2-NEXT:    pand %xmm4, %xmm1
+; SSE2-NEXT:    packuswb %xmm3, %xmm1
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_ext_v32i8:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[2,3,2,3]
+; SSE4-NEXT:    pmovsxbw %xmm4, %xmm5
+; SSE4-NEXT:    pmovsxbw %xmm1, %xmm6
+; SSE4-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; SSE4-NEXT:    pmovsxbw %xmm1, %xmm7
+; SSE4-NEXT:    pmovsxbw %xmm0, %xmm8
+; SSE4-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3]
+; SSE4-NEXT:    pmovsxbw %xmm0, %xmm4
+; SSE4-NEXT:    paddw %xmm5, %xmm4
+; SSE4-NEXT:    pmovsxbw %xmm3, %xmm1
+; SSE4-NEXT:    paddw %xmm6, %xmm1
+; SSE4-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
+; SSE4-NEXT:    pmovsxbw %xmm0, %xmm3
+; SSE4-NEXT:    paddw %xmm7, %xmm3
+; SSE4-NEXT:    pmovsxbw %xmm2, %xmm0
+; SSE4-NEXT:    paddw %xmm8, %xmm0
+; SSE4-NEXT:    psrlw $1, %xmm4
+; SSE4-NEXT:    psrlw $1, %xmm1
+; SSE4-NEXT:    psrlw $1, %xmm3
+; SSE4-NEXT:    psrlw $1, %xmm0
+; SSE4-NEXT:    pmovzxbw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; SSE4-NEXT:    pand %xmm2, %xmm0
+; SSE4-NEXT:    pand %xmm2, %xmm3
+; SSE4-NEXT:    packuswb %xmm3, %xmm0
+; SSE4-NEXT:    pand %xmm2, %xmm1
+; SSE4-NEXT:    pand %xmm2, %xmm4
+; SSE4-NEXT:    packuswb %xmm4, %xmm1
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: test_ext_v32i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
+; AVX1-NEXT:    vpmovsxbw %xmm2, %xmm2
+; AVX1-NEXT:    vpmovsxbw %xmm0, %xmm3
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
+; AVX1-NEXT:    vpmovsxbw %xmm4, %xmm4
+; AVX1-NEXT:    vpmovsxbw %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm1[2,3,2,3]
+; AVX1-NEXT:    vpmovsxbw %xmm5, %xmm5
+; AVX1-NEXT:    vpaddw %xmm5, %xmm2, %xmm2
+; AVX1-NEXT:    vpmovsxbw %xmm1, %xmm5
+; AVX1-NEXT:    vpaddw %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm1[2,3,2,3]
+; AVX1-NEXT:    vpmovsxbw %xmm5, %xmm5
+; AVX1-NEXT:    vpaddw %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vpmovsxbw %xmm1, %xmm1
+; AVX1-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlw $1, %xmm2, %xmm1
+; AVX1-NEXT:    vpsrlw $1, %xmm3, %xmm2
+; AVX1-NEXT:    vpsrlw $1, %xmm4, %xmm3
+; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm0
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vpackuswb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
+; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
+; AVX1-NEXT:    vpackuswb %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_ext_v32i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm2
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
+; AVX2-NEXT:    vpmovsxbw %xmm1, %ymm3
+; AVX2-NEXT:    vpaddw %ymm3, %ymm2, %ymm2
+; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm1
+; AVX2-NEXT:    vpmovsxbw %xmm1, %ymm1
+; AVX2-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrlw $1, %ymm2, %ymm1
+; AVX2-NEXT:    vpsrlw $1, %ymm0, %ymm0
+; AVX2-NEXT:    vpbroadcastw {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vpackuswb %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_ext_v32i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovsxbw %ymm0, %zmm0
+; AVX512-NEXT:    vpmovsxbw %ymm1, %zmm1
+; AVX512-NEXT:    vpaddw %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpsrlw $1, %zmm0, %zmm0
+; AVX512-NEXT:    vpmovwb %zmm0, %ymm0
+; AVX512-NEXT:    retq
+  %x0 = sext <32 x i8> %a0 to <32 x i16>
+  %x1 = sext <32 x i8> %a1 to <32 x i16>
+  %sum = add <32 x i16> %x0, %x1
+  %shift = ashr <32 x i16> %sum, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %res = trunc <32 x i16> %shift to <32 x i8>
+  ret <32 x i8> %res
+}
+
+define <16 x i16> @test_fixed_v16i16(<16 x i16> %a0, <16 x i16> %a1) {
+; SSE-LABEL: test_fixed_v16i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa %xmm1, %xmm4
+; SSE-NEXT:    pand %xmm3, %xmm4
+; SSE-NEXT:    movdqa %xmm0, %xmm5
+; SSE-NEXT:    pand %xmm2, %xmm5
+; SSE-NEXT:    pxor %xmm2, %xmm0
+; SSE-NEXT:    pxor %xmm3, %xmm1
+; SSE-NEXT:    psraw $1, %xmm1
+; SSE-NEXT:    paddw %xmm4, %xmm1
+; SSE-NEXT:    psraw $1, %xmm0
+; SSE-NEXT:    paddw %xmm5, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: test_fixed_v16i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm2
+; AVX1-NEXT:    vxorps %ymm0, %ymm1, %ymm0
+; AVX1-NEXT:    vpsraw $1, %xmm0, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpsraw $1, %xmm0, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
+; AVX1-NEXT:    vpaddw %xmm0, %xmm3, %xmm0
+; AVX1-NEXT:    vpaddw %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_fixed_v16i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
+; AVX2-NEXT:    vpxor %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpsraw $1, %ymm0, %ymm0
+; AVX2-NEXT:    vpaddw %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_fixed_v16i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm2
+; AVX512-NEXT:    vpxor %ymm0, %ymm1, %ymm0
+; AVX512-NEXT:    vpsraw $1, %ymm0, %ymm0
+; AVX512-NEXT:    vpaddw %ymm0, %ymm2, %ymm0
+; AVX512-NEXT:    retq
+  %and = and <16 x i16> %a0, %a1
+  %xor = xor <16 x i16> %a1, %a0
+  %shift = ashr <16 x i16> %xor, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %res = add <16 x i16> %and, %shift
+  ret <16 x i16> %res
+}
+
+define <16 x i16> @test_ext_v16i16(<16 x i16> %a0, <16 x i16> %a1) {
+; SSE2-LABEL: test_ext_v16i16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
+; SSE2-NEXT:    psrad $16, %xmm4
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3]
+; SSE2-NEXT:    psrad $16, %xmm5
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm1[4],xmm6[5],xmm1[5],xmm6[6],xmm1[6],xmm6[7],xmm1[7]
+; SSE2-NEXT:    psrad $16, %xmm6
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3]
+; SSE2-NEXT:    psrad $16, %xmm7
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm2[4],xmm8[5],xmm2[5],xmm8[6],xmm2[6],xmm8[7],xmm2[7]
+; SSE2-NEXT:    psrad $16, %xmm8
+; SSE2-NEXT:    paddd %xmm4, %xmm8
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-NEXT:    psrad $16, %xmm0
+; SSE2-NEXT:    paddd %xmm5, %xmm0
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
+; SSE2-NEXT:    psrad $16, %xmm2
+; SSE2-NEXT:    paddd %xmm6, %xmm2
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; SSE2-NEXT:    psrad $16, %xmm1
+; SSE2-NEXT:    paddd %xmm7, %xmm1
+; SSE2-NEXT:    pslld $15, %xmm8
+; SSE2-NEXT:    psrad $16, %xmm8
+; SSE2-NEXT:    pslld $15, %xmm0
+; SSE2-NEXT:    psrad $16, %xmm0
+; SSE2-NEXT:    packssdw %xmm8, %xmm0
+; SSE2-NEXT:    pslld $15, %xmm2
+; SSE2-NEXT:    psrad $16, %xmm2
+; SSE2-NEXT:    pslld $15, %xmm1
+; SSE2-NEXT:    psrad $16, %xmm1
+; SSE2-NEXT:    packssdw %xmm2, %xmm1
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_ext_v16i16:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[2,3,2,3]
+; SSE4-NEXT:    pmovsxwd %xmm4, %xmm5
+; SSE4-NEXT:    pmovsxwd %xmm1, %xmm6
+; SSE4-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; SSE4-NEXT:    pmovsxwd %xmm1, %xmm7
+; SSE4-NEXT:    pmovsxwd %xmm0, %xmm8
+; SSE4-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3]
+; SSE4-NEXT:    pmovsxwd %xmm0, %xmm4
+; SSE4-NEXT:    paddd %xmm5, %xmm4
+; SSE4-NEXT:    pmovsxwd %xmm3, %xmm1
+; SSE4-NEXT:    paddd %xmm6, %xmm1
+; SSE4-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
+; SSE4-NEXT:    pmovsxwd %xmm0, %xmm3
+; SSE4-NEXT:    paddd %xmm7, %xmm3
+; SSE4-NEXT:    pmovsxwd %xmm2, %xmm0
+; SSE4-NEXT:    paddd %xmm8, %xmm0
+; SSE4-NEXT:    psrld $1, %xmm4
+; SSE4-NEXT:    psrld $1, %xmm1
+; SSE4-NEXT:    psrld $1, %xmm3
+; SSE4-NEXT:    psrld $1, %xmm0
+; SSE4-NEXT:    pxor %xmm2, %xmm2
+; SSE4-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
+; SSE4-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4],xmm2[5],xmm3[6],xmm2[7]
+; SSE4-NEXT:    packusdw %xmm3, %xmm0
+; SSE4-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
+; SSE4-NEXT:    pblendw {{.*#+}} xmm4 = xmm4[0],xmm2[1],xmm4[2],xmm2[3],xmm4[4],xmm2[5],xmm4[6],xmm2[7]
+; SSE4-NEXT:    packusdw %xmm4, %xmm1
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: test_ext_v16i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
+; AVX1-NEXT:    vpmovsxwd %xmm2, %xmm2
+; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm3
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
+; AVX1-NEXT:    vpmovsxwd %xmm4, %xmm4
+; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm1[2,3,2,3]
+; AVX1-NEXT:    vpmovsxwd %xmm5, %xmm5
+; AVX1-NEXT:    vpaddd %xmm5, %xmm2, %xmm2
+; AVX1-NEXT:    vpmovsxwd %xmm1, %xmm5
+; AVX1-NEXT:    vpaddd %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm1[2,3,2,3]
+; AVX1-NEXT:    vpmovsxwd %xmm5, %xmm5
+; AVX1-NEXT:    vpaddd %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vpmovsxwd %xmm1, %xmm1
+; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrld $1, %xmm2, %xmm1
+; AVX1-NEXT:    vpsrld $1, %xmm3, %xmm2
+; AVX1-NEXT:    vpsrld $1, %xmm4, %xmm3
+; AVX1-NEXT:    vpsrld $1, %xmm0, %xmm0
+; AVX1-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2],xmm4[3],xmm0[4],xmm4[5],xmm0[6],xmm4[7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2],xmm4[3],xmm3[4],xmm4[5],xmm3[6],xmm4[7]
+; AVX1-NEXT:    vpackusdw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2],xmm4[3],xmm2[4],xmm4[5],xmm2[6],xmm4[7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2],xmm4[3],xmm1[4],xmm4[5],xmm1[6],xmm4[7]
+; AVX1-NEXT:    vpackusdw %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_ext_v16i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm2
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0
+; AVX2-NEXT:    vpmovsxwd %xmm1, %ymm3
+; AVX2-NEXT:    vpaddd %ymm3, %ymm2, %ymm2
+; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm1
+; AVX2-NEXT:    vpmovsxwd %xmm1, %ymm1
+; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrld $1, %ymm2, %ymm1
+; AVX2-NEXT:    vpsrld $1, %ymm0, %ymm0
+; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7],ymm0[8],ymm2[9],ymm0[10],ymm2[11],ymm0[12],ymm2[13],ymm0[14],ymm2[15]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7],ymm1[8],ymm2[9],ymm1[10],ymm2[11],ymm1[12],ymm2[13],ymm1[14],ymm2[15]
+; AVX2-NEXT:    vpackusdw %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_ext_v16i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovsxwd %ymm0, %zmm0
+; AVX512-NEXT:    vpmovsxwd %ymm1, %zmm1
+; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpsrld $1, %zmm0, %zmm0
+; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
+; AVX512-NEXT:    retq
+  %x0 = sext <16 x i16> %a0 to <16 x i32>
+  %x1 = sext <16 x i16> %a1 to <16 x i32>
+  %sum = add <16 x i32> %x0, %x1
+  %shift = ashr <16 x i32> %sum, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %res = trunc <16 x i32> %shift to <16 x i16>
+  ret <16 x i16> %res
+}
+
+define <8 x i32> @test_fixed_v8i32(<8 x i32> %a0, <8 x i32> %a1) {
+; SSE-LABEL: test_fixed_v8i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa %xmm1, %xmm4
+; SSE-NEXT:    pand %xmm3, %xmm4
+; SSE-NEXT:    movdqa %xmm0, %xmm5
+; SSE-NEXT:    pand %xmm2, %xmm5
+; SSE-NEXT:    pxor %xmm2, %xmm0
+; SSE-NEXT:    pxor %xmm3, %xmm1
+; SSE-NEXT:    psrad $1, %xmm1
+; SSE-NEXT:    paddd %xmm4, %xmm1
+; SSE-NEXT:    psrad $1, %xmm0
+; SSE-NEXT:    paddd %xmm5, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: test_fixed_v8i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm2
+; AVX1-NEXT:    vxorps %ymm0, %ymm1, %ymm0
+; AVX1-NEXT:    vpsrad $1, %xmm0, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpsrad $1, %xmm0, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
+; AVX1-NEXT:    vpaddd %xmm0, %xmm3, %xmm0
+; AVX1-NEXT:    vpaddd %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_fixed_v8i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
+; AVX2-NEXT:    vpxor %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpsrad $1, %ymm0, %ymm0
+; AVX2-NEXT:    vpaddd %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_fixed_v8i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm2
+; AVX512-NEXT:    vpxor %ymm0, %ymm1, %ymm0
+; AVX512-NEXT:    vpsrad $1, %ymm0, %ymm0
+; AVX512-NEXT:    vpaddd %ymm0, %ymm2, %ymm0
+; AVX512-NEXT:    retq
+  %and = and <8 x i32> %a0, %a1
+  %xor = xor <8 x i32> %a1, %a0
+  %shift = ashr <8 x i32> %xor, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %res = add <8 x i32> %and, %shift
+  ret <8 x i32> %res
+}
+
+define <8 x i32> @test_ext_v8i32(<8 x i32> %a0, <8 x i32> %a1) {
+; SSE2-LABEL: test_ext_v8i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pxor %xmm4, %xmm4
+; SSE2-NEXT:    pxor %xmm5, %xmm5
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm1[2,3,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1]
+; SSE2-NEXT:    pxor %xmm5, %xmm5
+; SSE2-NEXT:    pcmpgtd %xmm6, %xmm5
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
+; SSE2-NEXT:    pxor %xmm5, %xmm5
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm0[2,3,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
+; SSE2-NEXT:    pxor %xmm5, %xmm5
+; SSE2-NEXT:    pcmpgtd %xmm7, %xmm5
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1]
+; SSE2-NEXT:    pxor %xmm5, %xmm5
+; SSE2-NEXT:    pcmpgtd %xmm3, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm3[2,3,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1]
+; SSE2-NEXT:    paddq %xmm3, %xmm1
+; SSE2-NEXT:    pxor %xmm3, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm8, %xmm3
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1]
+; SSE2-NEXT:    paddq %xmm6, %xmm8
+; SSE2-NEXT:    pxor %xmm3, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm2[2,3,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; SSE2-NEXT:    paddq %xmm2, %xmm0
+; SSE2-NEXT:    pcmpgtd %xmm5, %xmm4
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
+; SSE2-NEXT:    paddq %xmm7, %xmm5
+; SSE2-NEXT:    psrlq $1, %xmm1
+; SSE2-NEXT:    psrlq $1, %xmm8
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm8[0,2]
+; SSE2-NEXT:    psrlq $1, %xmm0
+; SSE2-NEXT:    psrlq $1, %xmm5
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2]
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_ext_v8i32:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    pmovsxdq %xmm1, %xmm4
+; SSE4-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; SSE4-NEXT:    pmovsxdq %xmm1, %xmm5
+; SSE4-NEXT:    pmovsxdq %xmm0, %xmm6
+; SSE4-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; SSE4-NEXT:    pmovsxdq %xmm0, %xmm7
+; SSE4-NEXT:    pmovsxdq %xmm3, %xmm1
+; SSE4-NEXT:    paddq %xmm4, %xmm1
+; SSE4-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3]
+; SSE4-NEXT:    pmovsxdq %xmm0, %xmm3
+; SSE4-NEXT:    paddq %xmm5, %xmm3
+; SSE4-NEXT:    pmovsxdq %xmm2, %xmm0
+; SSE4-NEXT:    paddq %xmm6, %xmm0
+; SSE4-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
+; SSE4-NEXT:    pmovsxdq %xmm2, %xmm2
+; SSE4-NEXT:    paddq %xmm7, %xmm2
+; SSE4-NEXT:    psrlq $1, %xmm1
+; SSE4-NEXT:    psrlq $1, %xmm3
+; SSE4-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm3[0,2]
+; SSE4-NEXT:    psrlq $1, %xmm0
+; SSE4-NEXT:    psrlq $1, %xmm2
+; SSE4-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: test_ext_v8i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[2,3,2,3]
+; AVX1-NEXT:    vpmovsxdq %xmm3, %xmm3
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
+; AVX1-NEXT:    vpmovsxdq %xmm4, %xmm4
+; AVX1-NEXT:    vpmovsxdq %xmm2, %xmm2
+; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm6 = xmm5[2,3,2,3]
+; AVX1-NEXT:    vpmovsxdq %xmm6, %xmm6
+; AVX1-NEXT:    vpaddq %xmm6, %xmm3, %xmm3
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm6 = xmm1[2,3,2,3]
+; AVX1-NEXT:    vpmovsxdq %xmm6, %xmm6
+; AVX1-NEXT:    vpaddq %xmm6, %xmm4, %xmm4
+; AVX1-NEXT:    vpmovsxdq %xmm5, %xmm5
+; AVX1-NEXT:    vpaddq %xmm5, %xmm2, %xmm2
+; AVX1-NEXT:    vpmovsxdq %xmm1, %xmm1
+; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlq $1, %xmm3, %xmm1
+; AVX1-NEXT:    vpsrlq $1, %xmm4, %xmm3
+; AVX1-NEXT:    vpsrlq $1, %xmm2, %xmm2
+; AVX1-NEXT:    vpsrlq $1, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm3, %ymm1
+; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_ext_v8i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT:    vpmovsxdq %xmm2, %ymm2
+; AVX2-NEXT:    vpmovsxdq %xmm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm3
+; AVX2-NEXT:    vpmovsxdq %xmm3, %ymm3
+; AVX2-NEXT:    vpaddq %ymm3, %ymm2, %ymm2
+; AVX2-NEXT:    vpmovsxdq %xmm1, %ymm1
+; AVX2-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3],ymm2[2,3]
+; AVX2-NEXT:    vpsrlq $1, %ymm1, %ymm1
+; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrlq $1, %ymm0, %ymm0
+; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_ext_v8i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovsxdq %ymm0, %zmm0
+; AVX512-NEXT:    vpmovsxdq %ymm1, %zmm1
+; AVX512-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpsrlq $1, %zmm0, %zmm0
+; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
+; AVX512-NEXT:    retq
+  %x0 = sext <8 x i32> %a0 to <8 x i64>
+  %x1 = sext <8 x i32> %a1 to <8 x i64>
+  %sum = add <8 x i64> %x0, %x1
+  %shift = ashr <8 x i64> %sum, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
+  %res = trunc <8 x i64> %shift to <8 x i32>
+  ret <8 x i32> %res
+}
+
+define <4 x i64> @test_fixed_v4i64(<4 x i64> %a0, <4 x i64> %a1) {
+; SSE2-LABEL: test_fixed_v4i64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa %xmm1, %xmm4
+; SSE2-NEXT:    pand %xmm3, %xmm4
+; SSE2-NEXT:    movdqa %xmm0, %xmm5
+; SSE2-NEXT:    pand %xmm2, %xmm5
+; SSE2-NEXT:    pxor %xmm0, %xmm2
+; SSE2-NEXT:    pxor %xmm1, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,3,2,3]
+; SSE2-NEXT:    psrad $1, %xmm0
+; SSE2-NEXT:    psrlq $1, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2-NEXT:    paddq %xmm4, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,3,2,3]
+; SSE2-NEXT:    psrad $1, %xmm3
+; SSE2-NEXT:    psrlq $1, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; SSE2-NEXT:    paddq %xmm5, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_fixed_v4i64:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    movdqa %xmm1, %xmm4
+; SSE4-NEXT:    pand %xmm3, %xmm4
+; SSE4-NEXT:    movdqa %xmm0, %xmm5
+; SSE4-NEXT:    pand %xmm2, %xmm5
+; SSE4-NEXT:    pxor %xmm2, %xmm0
+; SSE4-NEXT:    pxor %xmm3, %xmm1
+; SSE4-NEXT:    movdqa %xmm1, %xmm2
+; SSE4-NEXT:    psrad $1, %xmm2
+; SSE4-NEXT:    psrlq $1, %xmm1
+; SSE4-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; SSE4-NEXT:    paddq %xmm4, %xmm1
+; SSE4-NEXT:    movdqa %xmm0, %xmm2
+; SSE4-NEXT:    psrad $1, %xmm2
+; SSE4-NEXT:    psrlq $1, %xmm0
+; SSE4-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; SSE4-NEXT:    paddq %xmm5, %xmm0
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: test_fixed_v4i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm2
+; AVX1-NEXT:    vxorps %ymm0, %ymm1, %ymm0
+; AVX1-NEXT:    vpsrad $1, %xmm0, %xmm1
+; AVX1-NEXT:    vpsrlq $1, %xmm0, %xmm3
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3],xmm3[4,5],xmm1[6,7]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpsrad $1, %xmm0, %xmm3
+; AVX1-NEXT:    vpsrlq $1, %xmm0, %xmm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
+; AVX1-NEXT:    vpaddq %xmm0, %xmm3, %xmm0
+; AVX1-NEXT:    vpaddq %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_fixed_v4i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
+; AVX2-NEXT:    vpxor %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpsrad $1, %ymm0, %ymm1
+; AVX2-NEXT:    vpsrlq $1, %ymm0, %ymm0
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX2-NEXT:    vpaddq %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_fixed_v4i64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm2
+; AVX512-NEXT:    vpxor %ymm0, %ymm1, %ymm0
+; AVX512-NEXT:    vpsraq $1, %ymm0, %ymm0
+; AVX512-NEXT:    vpaddq %ymm0, %ymm2, %ymm0
+; AVX512-NEXT:    retq
+  %and = and <4 x i64> %a0, %a1
+  %xor = xor <4 x i64> %a1, %a0
+  %shift = ashr <4 x i64> %xor, <i64 1, i64 1, i64 1, i64 1>
+  %res = add <4 x i64> %and, %shift
+  ret <4 x i64> %res
+}
+
+define <4 x i64> @test_ext_v4i64(<4 x i64> %a0, <4 x i64> %a1) {
+; SSE2-LABEL: test_ext_v4i64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pushq %rbp
+; SSE2-NEXT:    .cfi_def_cfa_offset 16
+; SSE2-NEXT:    pushq %r15
+; SSE2-NEXT:    .cfi_def_cfa_offset 24
+; SSE2-NEXT:    pushq %r14
+; SSE2-NEXT:    .cfi_def_cfa_offset 32
+; SSE2-NEXT:    pushq %r13
+; SSE2-NEXT:    .cfi_def_cfa_offset 40
+; SSE2-NEXT:    pushq %r12
+; SSE2-NEXT:    .cfi_def_cfa_offset 48
+; SSE2-NEXT:    pushq %rbx
+; SSE2-NEXT:    .cfi_def_cfa_offset 56
+; SSE2-NEXT:    .cfi_offset %rbx, -56
+; SSE2-NEXT:    .cfi_offset %r12, -48
+; SSE2-NEXT:    .cfi_offset %r13, -40
+; SSE2-NEXT:    .cfi_offset %r14, -32
+; SSE2-NEXT:    .cfi_offset %r15, -24
+; SSE2-NEXT:    .cfi_offset %rbp, -16
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[2,3,2,3]
+; SSE2-NEXT:    movq %xmm4, %rdx
+; SSE2-NEXT:    movq %rdx, %r14
+; SSE2-NEXT:    sarq $63, %r14
+; SSE2-NEXT:    movq %xmm1, %rcx
+; SSE2-NEXT:    movq %rcx, %r10
+; SSE2-NEXT:    sarq $63, %r10
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; SSE2-NEXT:    movq %xmm1, %rsi
+; SSE2-NEXT:    movq %rsi, %r11
+; SSE2-NEXT:    sarq $63, %r11
+; SSE2-NEXT:    movq %xmm0, %r8
+; SSE2-NEXT:    movq %r8, %rbx
+; SSE2-NEXT:    sarq $63, %rbx
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3]
+; SSE2-NEXT:    movq %xmm0, %rdi
+; SSE2-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT:    sarq $63, %rdi
+; SSE2-NEXT:    movq %xmm3, %r15
+; SSE2-NEXT:    movq %r15, %r9
+; SSE2-NEXT:    sarq $63, %r9
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
+; SSE2-NEXT:    movq %xmm0, %r12
+; SSE2-NEXT:    movq %r12, %r13
+; SSE2-NEXT:    sarq $63, %r13
+; SSE2-NEXT:    movq %xmm2, %rax
+; SSE2-NEXT:    movq %rax, %rbp
+; SSE2-NEXT:    sarq $63, %rbp
+; SSE2-NEXT:    addq %rax, %r8
+; SSE2-NEXT:    adcq %rbx, %rbp
+; SSE2-NEXT:    addq %r12, %rsi
+; SSE2-NEXT:    adcq %r11, %r13
+; SSE2-NEXT:    addq %r15, %rcx
+; SSE2-NEXT:    adcq %r10, %r9
+; SSE2-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload
+; SSE2-NEXT:    adcq %r14, %rdi
+; SSE2-NEXT:    shldq $63, %rdx, %rdi
+; SSE2-NEXT:    shldq $63, %rcx, %r9
+; SSE2-NEXT:    shldq $63, %rsi, %r13
+; SSE2-NEXT:    shldq $63, %r8, %rbp
+; SSE2-NEXT:    movq %rbp, %xmm0
+; SSE2-NEXT:    movq %r13, %xmm2
+; SSE2-NEXT:    movq %r9, %xmm1
+; SSE2-NEXT:    movq %rdi, %xmm3
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
+; SSE2-NEXT:    popq %rbx
+; SSE2-NEXT:    .cfi_def_cfa_offset 48
+; SSE2-NEXT:    popq %r12
+; SSE2-NEXT:    .cfi_def_cfa_offset 40
+; SSE2-NEXT:    popq %r13
+; SSE2-NEXT:    .cfi_def_cfa_offset 32
+; SSE2-NEXT:    popq %r14
+; SSE2-NEXT:    .cfi_def_cfa_offset 24
+; SSE2-NEXT:    popq %r15
+; SSE2-NEXT:    .cfi_def_cfa_offset 16
+; SSE2-NEXT:    popq %rbp
+; SSE2-NEXT:    .cfi_def_cfa_offset 8
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_ext_v4i64:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    pushq %rbp
+; SSE4-NEXT:    .cfi_def_cfa_offset 16
+; SSE4-NEXT:    pushq %r15
+; SSE4-NEXT:    .cfi_def_cfa_offset 24
+; SSE4-NEXT:    pushq %r14
+; SSE4-NEXT:    .cfi_def_cfa_offset 32
+; SSE4-NEXT:    pushq %r13
+; SSE4-NEXT:    .cfi_def_cfa_offset 40
+; SSE4-NEXT:    pushq %r12
+; SSE4-NEXT:    .cfi_def_cfa_offset 48
+; SSE4-NEXT:    pushq %rbx
+; SSE4-NEXT:    .cfi_def_cfa_offset 56
+; SSE4-NEXT:    .cfi_offset %rbx, -56
+; SSE4-NEXT:    .cfi_offset %r12, -48
+; SSE4-NEXT:    .cfi_offset %r13, -40
+; SSE4-NEXT:    .cfi_offset %r14, -32
+; SSE4-NEXT:    .cfi_offset %r15, -24
+; SSE4-NEXT:    .cfi_offset %rbp, -16
+; SSE4-NEXT:    movq %xmm1, %rdi
+; SSE4-NEXT:    movq %rdi, %r14
+; SSE4-NEXT:    sarq $63, %r14
+; SSE4-NEXT:    pextrq $1, %xmm1, %rcx
+; SSE4-NEXT:    movq %rcx, %r10
+; SSE4-NEXT:    sarq $63, %r10
+; SSE4-NEXT:    movq %xmm0, %rsi
+; SSE4-NEXT:    movq %rsi, %r11
+; SSE4-NEXT:    sarq $63, %r11
+; SSE4-NEXT:    pextrq $1, %xmm0, %r8
+; SSE4-NEXT:    movq %r8, %rbx
+; SSE4-NEXT:    sarq $63, %rbx
+; SSE4-NEXT:    movq %xmm3, %rdx
+; SSE4-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE4-NEXT:    sarq $63, %rdx
+; SSE4-NEXT:    pextrq $1, %xmm3, %r15
+; SSE4-NEXT:    movq %r15, %r9
+; SSE4-NEXT:    sarq $63, %r9
+; SSE4-NEXT:    movq %xmm2, %r12
+; SSE4-NEXT:    movq %r12, %r13
+; SSE4-NEXT:    sarq $63, %r13
+; SSE4-NEXT:    pextrq $1, %xmm2, %rax
+; SSE4-NEXT:    movq %rax, %rbp
+; SSE4-NEXT:    sarq $63, %rbp
+; SSE4-NEXT:    addq %rax, %r8
+; SSE4-NEXT:    adcq %rbx, %rbp
+; SSE4-NEXT:    addq %r12, %rsi
+; SSE4-NEXT:    adcq %r11, %r13
+; SSE4-NEXT:    addq %r15, %rcx
+; SSE4-NEXT:    adcq %r10, %r9
+; SSE4-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Folded Reload
+; SSE4-NEXT:    adcq %r14, %rdx
+; SSE4-NEXT:    shldq $63, %rdi, %rdx
+; SSE4-NEXT:    shldq $63, %rcx, %r9
+; SSE4-NEXT:    shldq $63, %rsi, %r13
+; SSE4-NEXT:    shldq $63, %r8, %rbp
+; SSE4-NEXT:    movq %rbp, %xmm2
+; SSE4-NEXT:    movq %r13, %xmm0
+; SSE4-NEXT:    movq %r9, %xmm3
+; SSE4-NEXT:    movq %rdx, %xmm1
+; SSE4-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSE4-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
+; SSE4-NEXT:    popq %rbx
+; SSE4-NEXT:    .cfi_def_cfa_offset 48
+; SSE4-NEXT:    popq %r12
+; SSE4-NEXT:    .cfi_def_cfa_offset 40
+; SSE4-NEXT:    popq %r13
+; SSE4-NEXT:    .cfi_def_cfa_offset 32
+; SSE4-NEXT:    popq %r14
+; SSE4-NEXT:    .cfi_def_cfa_offset 24
+; SSE4-NEXT:    popq %r15
+; SSE4-NEXT:    .cfi_def_cfa_offset 16
+; SSE4-NEXT:    popq %rbp
+; SSE4-NEXT:    .cfi_def_cfa_offset 8
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: test_ext_v4i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    pushq %rbp
+; AVX1-NEXT:    .cfi_def_cfa_offset 16
+; AVX1-NEXT:    pushq %r15
+; AVX1-NEXT:    .cfi_def_cfa_offset 24
+; AVX1-NEXT:    pushq %r14
+; AVX1-NEXT:    .cfi_def_cfa_offset 32
+; AVX1-NEXT:    pushq %r13
+; AVX1-NEXT:    .cfi_def_cfa_offset 40
+; AVX1-NEXT:    pushq %r12
+; AVX1-NEXT:    .cfi_def_cfa_offset 48
+; AVX1-NEXT:    pushq %rbx
+; AVX1-NEXT:    .cfi_def_cfa_offset 56
+; AVX1-NEXT:    .cfi_offset %rbx, -56
+; AVX1-NEXT:    .cfi_offset %r12, -48
+; AVX1-NEXT:    .cfi_offset %r13, -40
+; AVX1-NEXT:    .cfi_offset %r14, -32
+; AVX1-NEXT:    .cfi_offset %r15, -24
+; AVX1-NEXT:    .cfi_offset %rbp, -16
+; AVX1-NEXT:    vmovq %xmm0, %rdx
+; AVX1-NEXT:    movq %rdx, %r14
+; AVX1-NEXT:    sarq $63, %r14
+; AVX1-NEXT:    vpextrq $1, %xmm0, %rcx
+; AVX1-NEXT:    movq %rcx, %r10
+; AVX1-NEXT:    sarq $63, %r10
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vmovq %xmm0, %rsi
+; AVX1-NEXT:    movq %rsi, %r11
+; AVX1-NEXT:    sarq $63, %r11
+; AVX1-NEXT:    vpextrq $1, %xmm0, %rdi
+; AVX1-NEXT:    movq %rdi, %rbx
+; AVX1-NEXT:    sarq $63, %rbx
+; AVX1-NEXT:    vmovq %xmm1, %r8
+; AVX1-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT:    sarq $63, %r8
+; AVX1-NEXT:    vpextrq $1, %xmm1, %r15
+; AVX1-NEXT:    movq %r15, %r9
+; AVX1-NEXT:    sarq $63, %r9
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm0
+; AVX1-NEXT:    vmovq %xmm0, %r12
+; AVX1-NEXT:    movq %r12, %r13
+; AVX1-NEXT:    sarq $63, %r13
+; AVX1-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX1-NEXT:    movq %rax, %rbp
+; AVX1-NEXT:    sarq $63, %rbp
+; AVX1-NEXT:    addq %rax, %rdi
+; AVX1-NEXT:    adcq %rbx, %rbp
+; AVX1-NEXT:    addq %r12, %rsi
+; AVX1-NEXT:    adcq %r11, %r13
+; AVX1-NEXT:    addq %r15, %rcx
+; AVX1-NEXT:    adcq %r10, %r9
+; AVX1-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload
+; AVX1-NEXT:    adcq %r14, %r8
+; AVX1-NEXT:    shldq $63, %rdx, %r8
+; AVX1-NEXT:    shldq $63, %rcx, %r9
+; AVX1-NEXT:    shldq $63, %rsi, %r13
+; AVX1-NEXT:    shldq $63, %rdi, %rbp
+; AVX1-NEXT:    vmovq %rbp, %xmm0
+; AVX1-NEXT:    vmovq %r13, %xmm1
+; AVX1-NEXT:    vmovq %r9, %xmm2
+; AVX1-NEXT:    vmovq %r8, %xmm3
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    popq %rbx
+; AVX1-NEXT:    .cfi_def_cfa_offset 48
+; AVX1-NEXT:    popq %r12
+; AVX1-NEXT:    .cfi_def_cfa_offset 40
+; AVX1-NEXT:    popq %r13
+; AVX1-NEXT:    .cfi_def_cfa_offset 32
+; AVX1-NEXT:    popq %r14
+; AVX1-NEXT:    .cfi_def_cfa_offset 24
+; AVX1-NEXT:    popq %r15
+; AVX1-NEXT:    .cfi_def_cfa_offset 16
+; AVX1-NEXT:    popq %rbp
+; AVX1-NEXT:    .cfi_def_cfa_offset 8
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_ext_v4i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    pushq %rbp
+; AVX2-NEXT:    .cfi_def_cfa_offset 16
+; AVX2-NEXT:    pushq %r15
+; AVX2-NEXT:    .cfi_def_cfa_offset 24
+; AVX2-NEXT:    pushq %r14
+; AVX2-NEXT:    .cfi_def_cfa_offset 32
+; AVX2-NEXT:    pushq %r13
+; AVX2-NEXT:    .cfi_def_cfa_offset 40
+; AVX2-NEXT:    pushq %r12
+; AVX2-NEXT:    .cfi_def_cfa_offset 48
+; AVX2-NEXT:    pushq %rbx
+; AVX2-NEXT:    .cfi_def_cfa_offset 56
+; AVX2-NEXT:    .cfi_offset %rbx, -56
+; AVX2-NEXT:    .cfi_offset %r12, -48
+; AVX2-NEXT:    .cfi_offset %r13, -40
+; AVX2-NEXT:    .cfi_offset %r14, -32
+; AVX2-NEXT:    .cfi_offset %r15, -24
+; AVX2-NEXT:    .cfi_offset %rbp, -16
+; AVX2-NEXT:    vmovq %xmm0, %rdx
+; AVX2-NEXT:    movq %rdx, %r14
+; AVX2-NEXT:    sarq $63, %r14
+; AVX2-NEXT:    vpextrq $1, %xmm0, %rcx
+; AVX2-NEXT:    movq %rcx, %r10
+; AVX2-NEXT:    sarq $63, %r10
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT:    vmovq %xmm0, %rsi
+; AVX2-NEXT:    movq %rsi, %r11
+; AVX2-NEXT:    sarq $63, %r11
+; AVX2-NEXT:    vpextrq $1, %xmm0, %rdi
+; AVX2-NEXT:    movq %rdi, %rbx
+; AVX2-NEXT:    sarq $63, %rbx
+; AVX2-NEXT:    vmovq %xmm1, %r8
+; AVX2-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    sarq $63, %r8
+; AVX2-NEXT:    vpextrq $1, %xmm1, %r15
+; AVX2-NEXT:    movq %r15, %r9
+; AVX2-NEXT:    sarq $63, %r9
+; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm0
+; AVX2-NEXT:    vmovq %xmm0, %r12
+; AVX2-NEXT:    movq %r12, %r13
+; AVX2-NEXT:    sarq $63, %r13
+; AVX2-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX2-NEXT:    movq %rax, %rbp
+; AVX2-NEXT:    sarq $63, %rbp
+; AVX2-NEXT:    addq %rax, %rdi
+; AVX2-NEXT:    adcq %rbx, %rbp
+; AVX2-NEXT:    addq %r12, %rsi
+; AVX2-NEXT:    adcq %r11, %r13
+; AVX2-NEXT:    addq %r15, %rcx
+; AVX2-NEXT:    adcq %r10, %r9
+; AVX2-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload
+; AVX2-NEXT:    adcq %r14, %r8
+; AVX2-NEXT:    shldq $63, %rdx, %r8
+; AVX2-NEXT:    shldq $63, %rcx, %r9
+; AVX2-NEXT:    shldq $63, %rsi, %r13
+; AVX2-NEXT:    shldq $63, %rdi, %rbp
+; AVX2-NEXT:    vmovq %rbp, %xmm0
+; AVX2-NEXT:    vmovq %r13, %xmm1
+; AVX2-NEXT:    vmovq %r9, %xmm2
+; AVX2-NEXT:    vmovq %r8, %xmm3
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0]
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT:    popq %rbx
+; AVX2-NEXT:    .cfi_def_cfa_offset 48
+; AVX2-NEXT:    popq %r12
+; AVX2-NEXT:    .cfi_def_cfa_offset 40
+; AVX2-NEXT:    popq %r13
+; AVX2-NEXT:    .cfi_def_cfa_offset 32
+; AVX2-NEXT:    popq %r14
+; AVX2-NEXT:    .cfi_def_cfa_offset 24
+; AVX2-NEXT:    popq %r15
+; AVX2-NEXT:    .cfi_def_cfa_offset 16
+; AVX2-NEXT:    popq %rbp
+; AVX2-NEXT:    .cfi_def_cfa_offset 8
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_ext_v4i64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    pushq %rbp
+; AVX512-NEXT:    .cfi_def_cfa_offset 16
+; AVX512-NEXT:    pushq %r15
+; AVX512-NEXT:    .cfi_def_cfa_offset 24
+; AVX512-NEXT:    pushq %r14
+; AVX512-NEXT:    .cfi_def_cfa_offset 32
+; AVX512-NEXT:    pushq %r13
+; AVX512-NEXT:    .cfi_def_cfa_offset 40
+; AVX512-NEXT:    pushq %r12
+; AVX512-NEXT:    .cfi_def_cfa_offset 48
+; AVX512-NEXT:    pushq %rbx
+; AVX512-NEXT:    .cfi_def_cfa_offset 56
+; AVX512-NEXT:    .cfi_offset %rbx, -56
+; AVX512-NEXT:    .cfi_offset %r12, -48
+; AVX512-NEXT:    .cfi_offset %r13, -40
+; AVX512-NEXT:    .cfi_offset %r14, -32
+; AVX512-NEXT:    .cfi_offset %r15, -24
+; AVX512-NEXT:    .cfi_offset %rbp, -16
+; AVX512-NEXT:    vmovq %xmm0, %rdx
+; AVX512-NEXT:    movq %rdx, %r14
+; AVX512-NEXT:    sarq $63, %r14
+; AVX512-NEXT:    vpextrq $1, %xmm0, %rcx
+; AVX512-NEXT:    movq %rcx, %r10
+; AVX512-NEXT:    sarq $63, %r10
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX512-NEXT:    vmovq %xmm0, %rsi
+; AVX512-NEXT:    movq %rsi, %r11
+; AVX512-NEXT:    sarq $63, %r11
+; AVX512-NEXT:    vpextrq $1, %xmm0, %rdi
+; AVX512-NEXT:    movq %rdi, %rbx
+; AVX512-NEXT:    sarq $63, %rbx
+; AVX512-NEXT:    vmovq %xmm1, %r8
+; AVX512-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT:    sarq $63, %r8
+; AVX512-NEXT:    vpextrq $1, %xmm1, %r15
+; AVX512-NEXT:    movq %r15, %r9
+; AVX512-NEXT:    sarq $63, %r9
+; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm0
+; AVX512-NEXT:    vmovq %xmm0, %r12
+; AVX512-NEXT:    movq %r12, %r13
+; AVX512-NEXT:    sarq $63, %r13
+; AVX512-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX512-NEXT:    movq %rax, %rbp
+; AVX512-NEXT:    sarq $63, %rbp
+; AVX512-NEXT:    addq %rax, %rdi
+; AVX512-NEXT:    adcq %rbx, %rbp
+; AVX512-NEXT:    addq %r12, %rsi
+; AVX512-NEXT:    adcq %r11, %r13
+; AVX512-NEXT:    addq %r15, %rcx
+; AVX512-NEXT:    adcq %r10, %r9
+; AVX512-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload
+; AVX512-NEXT:    adcq %r14, %r8
+; AVX512-NEXT:    shldq $63, %rdx, %r8
+; AVX512-NEXT:    shldq $63, %rcx, %r9
+; AVX512-NEXT:    shldq $63, %rsi, %r13
+; AVX512-NEXT:    shldq $63, %rdi, %rbp
+; AVX512-NEXT:    vmovq %rbp, %xmm0
+; AVX512-NEXT:    vmovq %r13, %xmm1
+; AVX512-NEXT:    vmovq %r9, %xmm2
+; AVX512-NEXT:    vmovq %r8, %xmm3
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0]
+; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512-NEXT:    popq %rbx
+; AVX512-NEXT:    .cfi_def_cfa_offset 48
+; AVX512-NEXT:    popq %r12
+; AVX512-NEXT:    .cfi_def_cfa_offset 40
+; AVX512-NEXT:    popq %r13
+; AVX512-NEXT:    .cfi_def_cfa_offset 32
+; AVX512-NEXT:    popq %r14
+; AVX512-NEXT:    .cfi_def_cfa_offset 24
+; AVX512-NEXT:    popq %r15
+; AVX512-NEXT:    .cfi_def_cfa_offset 16
+; AVX512-NEXT:    popq %rbp
+; AVX512-NEXT:    .cfi_def_cfa_offset 8
+; AVX512-NEXT:    retq
+  %x0 = sext <4 x i64> %a0 to <4 x i128>
+  %x1 = sext <4 x i64> %a1 to <4 x i128>
+  %sum = add <4 x i128> %x0, %x1
+  %shift = ashr <4 x i128> %sum, <i128 1, i128 1, i128 1, i128 1>
+  %res = trunc <4 x i128> %shift to <4 x i64>
+  ret <4 x i64> %res
+}
+
+;
+; 512-bit vectors
+;
+
+define <64 x i8> @test_fixed_v64i8(<64 x i8> %a0, <64 x i8> %a1) {
+; SSE-LABEL: test_fixed_v64i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa %xmm3, %xmm10
+; SSE-NEXT:    pand %xmm7, %xmm10
+; SSE-NEXT:    movdqa %xmm2, %xmm11
+; SSE-NEXT:    pand %xmm6, %xmm11
+; SSE-NEXT:    movdqa %xmm1, %xmm9
+; SSE-NEXT:    pand %xmm5, %xmm9
+; SSE-NEXT:    movdqa %xmm0, %xmm8
+; SSE-NEXT:    pand %xmm4, %xmm8
+; SSE-NEXT:    pxor %xmm4, %xmm0
+; SSE-NEXT:    pxor %xmm5, %xmm1
+; SSE-NEXT:    pxor %xmm6, %xmm2
+; SSE-NEXT:    pxor %xmm7, %xmm3
+; SSE-NEXT:    psrlw $1, %xmm3
+; SSE-NEXT:    movdqa {{.*#+}} xmm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; SSE-NEXT:    pand %xmm5, %xmm3
+; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; SSE-NEXT:    pxor %xmm4, %xmm3
+; SSE-NEXT:    paddb %xmm10, %xmm3
+; SSE-NEXT:    psrlw $1, %xmm2
+; SSE-NEXT:    pand %xmm5, %xmm2
+; SSE-NEXT:    pxor %xmm4, %xmm2
+; SSE-NEXT:    paddb %xmm11, %xmm2
+; SSE-NEXT:    psrlw $1, %xmm1
+; SSE-NEXT:    pand %xmm5, %xmm1
+; SSE-NEXT:    pxor %xmm4, %xmm1
+; SSE-NEXT:    paddb %xmm9, %xmm1
+; SSE-NEXT:    psrlw $1, %xmm0
+; SSE-NEXT:    pand %xmm5, %xmm0
+; SSE-NEXT:    pxor %xmm4, %xmm0
+; SSE-NEXT:    paddb %xmm8, %xmm0
+; SSE-NEXT:    psubb %xmm4, %xmm0
+; SSE-NEXT:    psubb %xmm4, %xmm1
+; SSE-NEXT:    psubb %xmm4, %xmm2
+; SSE-NEXT:    psubb %xmm4, %xmm3
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: test_fixed_v64i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vandps %ymm3, %ymm1, %ymm4
+; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm5
+; AVX1-NEXT:    vxorps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT:    vxorps %ymm3, %ymm1, %ymm1
+; AVX1-NEXT:    vpsrlw $1, %xmm1, %xmm2
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm3 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm6 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; AVX1-NEXT:    vpxor %xmm6, %xmm2, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vpsrlw $1, %xmm1, %xmm1
+; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpxor %xmm6, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm7
+; AVX1-NEXT:    vpand %xmm3, %xmm7, %xmm7
+; AVX1-NEXT:    vpxor %xmm6, %xmm7, %xmm7
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpxor %xmm6, %xmm0, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm5, %xmm3
+; AVX1-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpsubb %xmm6, %xmm0, %xmm0
+; AVX1-NEXT:    vpaddb %xmm5, %xmm7, %xmm3
+; AVX1-NEXT:    vpsubb %xmm6, %xmm3, %xmm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm3, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm4, %xmm3
+; AVX1-NEXT:    vpaddb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpsubb %xmm6, %xmm1, %xmm1
+; AVX1-NEXT:    vpaddb %xmm4, %xmm2, %xmm2
+; AVX1-NEXT:    vpsubb %xmm6, %xmm2, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_fixed_v64i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm4
+; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm5
+; AVX2-NEXT:    vpxor %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpxor %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    vpsrlw $1, %ymm1, %ymm1
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm3 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; AVX2-NEXT:    vpxor %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    vpaddb %ymm4, %ymm1, %ymm1
+; AVX2-NEXT:    vpsrlw $1, %ymm0, %ymm0
+; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpxor %ymm3, %ymm0, %ymm0
+; AVX2-NEXT:    vpaddb %ymm5, %ymm0, %ymm0
+; AVX2-NEXT:    vpsubb %ymm3, %ymm0, %ymm0
+; AVX2-NEXT:    vpsubb %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_fixed_v64i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpandq %zmm1, %zmm0, %zmm2
+; AVX512-NEXT:    vpxorq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpsrlw $1, %zmm0, %zmm0
+; AVX512-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; AVX512-NEXT:    vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0
+; AVX512-NEXT:    vpaddb %zmm2, %zmm0, %zmm0
+; AVX512-NEXT:    vpsubb %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    retq
+  %and = and <64 x i8> %a0, %a1
+  %xor = xor <64 x i8> %a0, %a1
+  %shift = ashr <64 x i8> %xor, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  %res = add <64 x i8> %and, %shift
+  ret <64 x i8> %res
+}
+
+define <64 x i8> @test_ext_v64i8(<64 x i8> %a0, <64 x i8> %a1) {
+; SSE2-LABEL: test_ext_v64i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm3[0],xmm13[1],xmm3[1],xmm13[2],xmm3[2],xmm13[3],xmm3[3],xmm13[4],xmm3[4],xmm13[5],xmm3[5],xmm13[6],xmm3[6],xmm13[7],xmm3[7]
+; SSE2-NEXT:    psraw $8, %xmm13
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm3[8],xmm14[9],xmm3[9],xmm14[10],xmm3[10],xmm14[11],xmm3[11],xmm14[12],xmm3[12],xmm14[13],xmm3[13],xmm14[14],xmm3[14],xmm14[15],xmm3[15]
+; SSE2-NEXT:    psraw $8, %xmm14
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm2[0],xmm15[1],xmm2[1],xmm15[2],xmm2[2],xmm15[3],xmm2[3],xmm15[4],xmm2[4],xmm15[5],xmm2[5],xmm15[6],xmm2[6],xmm15[7],xmm2[7]
+; SSE2-NEXT:    psraw $8, %xmm15
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm2[8],xmm12[9],xmm2[9],xmm12[10],xmm2[10],xmm12[11],xmm2[11],xmm12[12],xmm2[12],xmm12[13],xmm2[13],xmm12[14],xmm2[14],xmm12[15],xmm2[15]
+; SSE2-NEXT:    psraw $8, %xmm12
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm1[0],xmm11[1],xmm1[1],xmm11[2],xmm1[2],xmm11[3],xmm1[3],xmm11[4],xmm1[4],xmm11[5],xmm1[5],xmm11[6],xmm1[6],xmm11[7],xmm1[7]
+; SSE2-NEXT:    psraw $8, %xmm11
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm1[8],xmm10[9],xmm1[9],xmm10[10],xmm1[10],xmm10[11],xmm1[11],xmm10[12],xmm1[12],xmm10[13],xmm1[13],xmm10[14],xmm1[14],xmm10[15],xmm1[15]
+; SSE2-NEXT:    psraw $8, %xmm10
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3],xmm9[4],xmm0[4],xmm9[5],xmm0[5],xmm9[6],xmm0[6],xmm9[7],xmm0[7]
+; SSE2-NEXT:    psraw $8, %xmm9
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm0[8],xmm8[9],xmm0[9],xmm8[10],xmm0[10],xmm8[11],xmm0[11],xmm8[12],xmm0[12],xmm8[13],xmm0[13],xmm8[14],xmm0[14],xmm8[15],xmm0[15]
+; SSE2-NEXT:    psraw $8, %xmm8
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3],xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7]
+; SSE2-NEXT:    psraw $8, %xmm3
+; SSE2-NEXT:    paddw %xmm13, %xmm3
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm7 = xmm7[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE2-NEXT:    psraw $8, %xmm7
+; SSE2-NEXT:    paddw %xmm14, %xmm7
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7]
+; SSE2-NEXT:    psraw $8, %xmm2
+; SSE2-NEXT:    paddw %xmm15, %xmm2
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE2-NEXT:    psraw $8, %xmm6
+; SSE2-NEXT:    paddw %xmm12, %xmm6
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7]
+; SSE2-NEXT:    psraw $8, %xmm1
+; SSE2-NEXT:    paddw %xmm11, %xmm1
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE2-NEXT:    psraw $8, %xmm5
+; SSE2-NEXT:    paddw %xmm10, %xmm5
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
+; SSE2-NEXT:    psraw $8, %xmm0
+; SSE2-NEXT:    paddw %xmm9, %xmm0
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE2-NEXT:    psraw $8, %xmm4
+; SSE2-NEXT:    paddw %xmm8, %xmm4
+; SSE2-NEXT:    psrlw $1, %xmm3
+; SSE2-NEXT:    psrlw $1, %xmm7
+; SSE2-NEXT:    psrlw $1, %xmm2
+; SSE2-NEXT:    psrlw $1, %xmm6
+; SSE2-NEXT:    psrlw $1, %xmm1
+; SSE2-NEXT:    psrlw $1, %xmm5
+; SSE2-NEXT:    psrlw $1, %xmm0
+; SSE2-NEXT:    psrlw $1, %xmm4
+; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; SSE2-NEXT:    pand %xmm8, %xmm4
+; SSE2-NEXT:    pand %xmm8, %xmm0
+; SSE2-NEXT:    packuswb %xmm4, %xmm0
+; SSE2-NEXT:    pand %xmm8, %xmm5
+; SSE2-NEXT:    pand %xmm8, %xmm1
+; SSE2-NEXT:    packuswb %xmm5, %xmm1
+; SSE2-NEXT:    pand %xmm8, %xmm6
+; SSE2-NEXT:    pand %xmm8, %xmm2
+; SSE2-NEXT:    packuswb %xmm6, %xmm2
+; SSE2-NEXT:    pand %xmm8, %xmm7
+; SSE2-NEXT:    pand %xmm8, %xmm3
+; SSE2-NEXT:    packuswb %xmm7, %xmm3
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_ext_v64i8:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    pshufd {{.*#+}} xmm8 = xmm3[2,3,2,3]
+; SSE4-NEXT:    pmovsxbw %xmm8, %xmm9
+; SSE4-NEXT:    pmovsxbw %xmm3, %xmm10
+; SSE4-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3]
+; SSE4-NEXT:    pmovsxbw %xmm3, %xmm11
+; SSE4-NEXT:    pmovsxbw %xmm2, %xmm12
+; SSE4-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
+; SSE4-NEXT:    pmovsxbw %xmm2, %xmm13
+; SSE4-NEXT:    pmovsxbw %xmm1, %xmm14
+; SSE4-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; SSE4-NEXT:    pmovsxbw %xmm1, %xmm15
+; SSE4-NEXT:    pmovsxbw %xmm0, %xmm0
+; SSE4-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE4-NEXT:    pshufd {{.*#+}} xmm0 = xmm7[2,3,2,3]
+; SSE4-NEXT:    pmovsxbw %xmm0, %xmm8
+; SSE4-NEXT:    paddw %xmm9, %xmm8
+; SSE4-NEXT:    pmovsxbw %xmm7, %xmm3
+; SSE4-NEXT:    paddw %xmm10, %xmm3
+; SSE4-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[2,3,2,3]
+; SSE4-NEXT:    pmovsxbw %xmm0, %xmm7
+; SSE4-NEXT:    paddw %xmm11, %xmm7
+; SSE4-NEXT:    pmovsxbw %xmm6, %xmm2
+; SSE4-NEXT:    paddw %xmm12, %xmm2
+; SSE4-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[2,3,2,3]
+; SSE4-NEXT:    pmovsxbw %xmm0, %xmm6
+; SSE4-NEXT:    paddw %xmm13, %xmm6
+; SSE4-NEXT:    pmovsxbw %xmm5, %xmm1
+; SSE4-NEXT:    paddw %xmm14, %xmm1
+; SSE4-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3]
+; SSE4-NEXT:    pmovsxbw %xmm0, %xmm5
+; SSE4-NEXT:    paddw %xmm15, %xmm5
+; SSE4-NEXT:    pmovsxbw %xmm4, %xmm0
+; SSE4-NEXT:    paddw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; SSE4-NEXT:    psrlw $1, %xmm8
+; SSE4-NEXT:    psrlw $1, %xmm3
+; SSE4-NEXT:    psrlw $1, %xmm7
+; SSE4-NEXT:    psrlw $1, %xmm2
+; SSE4-NEXT:    psrlw $1, %xmm6
+; SSE4-NEXT:    psrlw $1, %xmm1
+; SSE4-NEXT:    psrlw $1, %xmm5
+; SSE4-NEXT:    psrlw $1, %xmm0
+; SSE4-NEXT:    pmovzxbw {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; SSE4-NEXT:    pand %xmm4, %xmm0
+; SSE4-NEXT:    pand %xmm4, %xmm5
+; SSE4-NEXT:    packuswb %xmm5, %xmm0
+; SSE4-NEXT:    pand %xmm4, %xmm1
+; SSE4-NEXT:    pand %xmm4, %xmm6
+; SSE4-NEXT:    packuswb %xmm6, %xmm1
+; SSE4-NEXT:    pand %xmm4, %xmm2
+; SSE4-NEXT:    pand %xmm4, %xmm7
+; SSE4-NEXT:    packuswb %xmm7, %xmm2
+; SSE4-NEXT:    pand %xmm4, %xmm3
+; SSE4-NEXT:    pand %xmm4, %xmm8
+; SSE4-NEXT:    packuswb %xmm8, %xmm3
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: test_ext_v64i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm1[2,3,2,3]
+; AVX1-NEXT:    vpmovsxbw %xmm4, %xmm4
+; AVX1-NEXT:    vpmovsxbw %xmm1, %xmm5
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm6 = xmm1[2,3,2,3]
+; AVX1-NEXT:    vpmovsxbw %xmm6, %xmm6
+; AVX1-NEXT:    vpmovsxbw %xmm1, %xmm1
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm7 = xmm0[2,3,2,3]
+; AVX1-NEXT:    vpmovsxbw %xmm7, %xmm7
+; AVX1-NEXT:    vpmovsxbw %xmm0, %xmm8
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm9 = xmm0[2,3,2,3]
+; AVX1-NEXT:    vpmovsxbw %xmm9, %xmm9
+; AVX1-NEXT:    vpmovsxbw %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm10 = xmm3[2,3,2,3]
+; AVX1-NEXT:    vpmovsxbw %xmm10, %xmm10
+; AVX1-NEXT:    vpaddw %xmm4, %xmm10, %xmm4
+; AVX1-NEXT:    vpmovsxbw %xmm3, %xmm10
+; AVX1-NEXT:    vpaddw %xmm5, %xmm10, %xmm5
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm10 = xmm3[2,3,2,3]
+; AVX1-NEXT:    vpmovsxbw %xmm10, %xmm10
+; AVX1-NEXT:    vpaddw %xmm6, %xmm10, %xmm6
+; AVX1-NEXT:    vpmovsxbw %xmm3, %xmm3
+; AVX1-NEXT:    vpaddw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[2,3,2,3]
+; AVX1-NEXT:    vpmovsxbw %xmm3, %xmm3
+; AVX1-NEXT:    vpaddw %xmm3, %xmm7, %xmm3
+; AVX1-NEXT:    vpmovsxbw %xmm2, %xmm7
+; AVX1-NEXT:    vpaddw %xmm7, %xmm8, %xmm7
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm8 = xmm2[2,3,2,3]
+; AVX1-NEXT:    vpmovsxbw %xmm8, %xmm8
+; AVX1-NEXT:    vpaddw %xmm8, %xmm9, %xmm8
+; AVX1-NEXT:    vpmovsxbw %xmm2, %xmm2
+; AVX1-NEXT:    vpaddw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlw $1, %xmm4, %xmm2
+; AVX1-NEXT:    vpsrlw $1, %xmm5, %xmm4
+; AVX1-NEXT:    vpsrlw $1, %xmm6, %xmm5
+; AVX1-NEXT:    vpsrlw $1, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrlw $1, %xmm3, %xmm3
+; AVX1-NEXT:    vpsrlw $1, %xmm7, %xmm6
+; AVX1-NEXT:    vpsrlw $1, %xmm8, %xmm7
+; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm0
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT:    vpand %xmm0, %xmm8, %xmm0
+; AVX1-NEXT:    vpand %xmm7, %xmm8, %xmm7
+; AVX1-NEXT:    vpackuswb %xmm7, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm6, %xmm8, %xmm6
+; AVX1-NEXT:    vpand %xmm3, %xmm8, %xmm3
+; AVX1-NEXT:    vpackuswb %xmm3, %xmm6, %xmm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm3, %ymm0
+; AVX1-NEXT:    vpand %xmm1, %xmm8, %xmm1
+; AVX1-NEXT:    vpand %xmm5, %xmm8, %xmm3
+; AVX1-NEXT:    vpackuswb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpand %xmm4, %xmm8, %xmm3
+; AVX1-NEXT:    vpand %xmm2, %xmm8, %xmm2
+; AVX1-NEXT:    vpackuswb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_ext_v64i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmovsxbw %xmm1, %ymm4
+; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm1
+; AVX2-NEXT:    vpmovsxbw %xmm1, %ymm1
+; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm5
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
+; AVX2-NEXT:    vpmovsxbw %xmm3, %ymm6
+; AVX2-NEXT:    vpaddw %ymm6, %ymm4, %ymm4
+; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm3
+; AVX2-NEXT:    vpmovsxbw %xmm3, %ymm3
+; AVX2-NEXT:    vpaddw %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    vpmovsxbw %xmm2, %ymm3
+; AVX2-NEXT:    vpaddw %ymm3, %ymm5, %ymm3
+; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm2
+; AVX2-NEXT:    vpmovsxbw %xmm2, %ymm2
+; AVX2-NEXT:    vpaddw %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrlw $1, %ymm4, %ymm2
+; AVX2-NEXT:    vpsrlw $1, %ymm1, %ymm1
+; AVX2-NEXT:    vpsrlw $1, %ymm3, %ymm3
+; AVX2-NEXT:    vpsrlw $1, %ymm0, %ymm0
+; AVX2-NEXT:    vpbroadcastw {{.*#+}} ymm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; AVX2-NEXT:    vpand %ymm4, %ymm0, %ymm0
+; AVX2-NEXT:    vpand %ymm4, %ymm3, %ymm3
+; AVX2-NEXT:    vpackuswb %ymm0, %ymm3, %ymm0
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT:    vpand %ymm4, %ymm1, %ymm1
+; AVX2-NEXT:    vpand %ymm4, %ymm2, %ymm2
+; AVX2-NEXT:    vpackuswb %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3]
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_ext_v64i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
+; AVX512-NEXT:    vpmovsxbw %ymm2, %zmm2
+; AVX512-NEXT:    vpmovsxbw %ymm0, %zmm0
+; AVX512-NEXT:    vextracti64x4 $1, %zmm1, %ymm3
+; AVX512-NEXT:    vpmovsxbw %ymm3, %zmm3
+; AVX512-NEXT:    vpaddw %zmm3, %zmm2, %zmm2
+; AVX512-NEXT:    vpmovsxbw %ymm1, %zmm1
+; AVX512-NEXT:    vpaddw %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpsrlw $1, %zmm2, %zmm1
+; AVX512-NEXT:    vpsrlw $1, %zmm0, %zmm0
+; AVX512-NEXT:    vpmovwb %zmm0, %ymm0
+; AVX512-NEXT:    vpmovwb %zmm1, %ymm1
+; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT:    retq
+  %x0 = sext <64 x i8> %a0 to <64 x i16>
+  %x1 = sext <64 x i8> %a1 to <64 x i16>
+  %sum = add <64 x i16> %x0, %x1
+  %shift = ashr <64 x i16> %sum, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %res = trunc <64 x i16> %shift to <64 x i8>
+  ret <64 x i8> %res
+}
+
+define <32 x i16> @test_fixed_v32i16(<32 x i16> %a0, <32 x i16> %a1) {
+; SSE-LABEL: test_fixed_v32i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa %xmm3, %xmm8
+; SSE-NEXT:    pand %xmm7, %xmm8
+; SSE-NEXT:    movdqa %xmm2, %xmm9
+; SSE-NEXT:    pand %xmm6, %xmm9
+; SSE-NEXT:    movdqa %xmm1, %xmm10
+; SSE-NEXT:    pand %xmm5, %xmm10
+; SSE-NEXT:    movdqa %xmm0, %xmm11
+; SSE-NEXT:    pand %xmm4, %xmm11
+; SSE-NEXT:    pxor %xmm4, %xmm0
+; SSE-NEXT:    pxor %xmm5, %xmm1
+; SSE-NEXT:    pxor %xmm6, %xmm2
+; SSE-NEXT:    pxor %xmm7, %xmm3
+; SSE-NEXT:    psraw $1, %xmm3
+; SSE-NEXT:    paddw %xmm8, %xmm3
+; SSE-NEXT:    psraw $1, %xmm2
+; SSE-NEXT:    paddw %xmm9, %xmm2
+; SSE-NEXT:    psraw $1, %xmm1
+; SSE-NEXT:    paddw %xmm10, %xmm1
+; SSE-NEXT:    psraw $1, %xmm0
+; SSE-NEXT:    paddw %xmm11, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: test_fixed_v32i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vandps %ymm3, %ymm1, %ymm4
+; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm5
+; AVX1-NEXT:    vxorps %ymm0, %ymm2, %ymm0
+; AVX1-NEXT:    vxorps %ymm1, %ymm3, %ymm1
+; AVX1-NEXT:    vpsraw $1, %xmm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vpsraw $1, %xmm1, %xmm1
+; AVX1-NEXT:    vpsraw $1, %xmm0, %xmm3
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpsraw $1, %xmm0, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm5, %xmm6
+; AVX1-NEXT:    vpaddw %xmm0, %xmm6, %xmm0
+; AVX1-NEXT:    vpaddw %xmm3, %xmm5, %xmm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm3, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm4, %xmm3
+; AVX1-NEXT:    vpaddw %xmm1, %xmm3, %xmm1
+; AVX1-NEXT:    vpaddw %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_fixed_v32i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm4
+; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm5
+; AVX2-NEXT:    vpxor %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vpxor %ymm1, %ymm3, %ymm1
+; AVX2-NEXT:    vpsraw $1, %ymm1, %ymm1
+; AVX2-NEXT:    vpaddw %ymm1, %ymm4, %ymm1
+; AVX2-NEXT:    vpsraw $1, %ymm0, %ymm0
+; AVX2-NEXT:    vpaddw %ymm0, %ymm5, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_fixed_v32i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpandq %zmm1, %zmm0, %zmm2
+; AVX512-NEXT:    vpxorq %zmm0, %zmm1, %zmm0
+; AVX512-NEXT:    vpsraw $1, %zmm0, %zmm0
+; AVX512-NEXT:    vpaddw %zmm0, %zmm2, %zmm0
+; AVX512-NEXT:    retq
+  %and = and <32 x i16> %a0, %a1
+  %xor = xor <32 x i16> %a1, %a0
+  %shift = ashr <32 x i16> %xor, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %res = add <32 x i16> %and, %shift
+  ret <32 x i16> %res
+}
+
+define <32 x i16> @test_ext_v32i16(<32 x i16> %a0, <32 x i16> %a1) {
+; SSE2-LABEL: test_ext_v32i16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa %xmm3, %xmm9
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm0[4],xmm8[5],xmm0[5],xmm8[6],xmm0[6],xmm8[7],xmm0[7]
+; SSE2-NEXT:    psrad $16, %xmm8
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3]
+; SSE2-NEXT:    psrad $16, %xmm15
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; SSE2-NEXT:    psrad $16, %xmm3
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1],xmm14[2],xmm1[2],xmm14[3],xmm1[3]
+; SSE2-NEXT:    psrad $16, %xmm14
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm2[4],xmm13[5],xmm2[5],xmm13[6],xmm2[6],xmm13[7],xmm2[7]
+; SSE2-NEXT:    psrad $16, %xmm13
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm2[0],xmm12[1],xmm2[1],xmm12[2],xmm2[2],xmm12[3],xmm2[3]
+; SSE2-NEXT:    psrad $16, %xmm12
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7]
+; SSE2-NEXT:    psrad $16, %xmm11
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3]
+; SSE2-NEXT:    psrad $16, %xmm10
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm4[4],xmm9[5],xmm4[5],xmm9[6],xmm4[6],xmm9[7],xmm4[7]
+; SSE2-NEXT:    psrad $16, %xmm9
+; SSE2-NEXT:    paddd %xmm8, %xmm9
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
+; SSE2-NEXT:    psrad $16, %xmm0
+; SSE2-NEXT:    paddd %xmm15, %xmm0
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7]
+; SSE2-NEXT:    psrad $16, %xmm8
+; SSE2-NEXT:    paddd %xmm3, %xmm8
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3]
+; SSE2-NEXT:    psrad $16, %xmm1
+; SSE2-NEXT:    paddd %xmm14, %xmm1
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7]
+; SSE2-NEXT:    psrad $16, %xmm5
+; SSE2-NEXT:    paddd %xmm13, %xmm5
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3]
+; SSE2-NEXT:    psrad $16, %xmm2
+; SSE2-NEXT:    paddd %xmm12, %xmm2
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7]
+; SSE2-NEXT:    psrad $16, %xmm4
+; SSE2-NEXT:    paddd %xmm11, %xmm4
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3]
+; SSE2-NEXT:    psrad $16, %xmm3
+; SSE2-NEXT:    paddd %xmm10, %xmm3
+; SSE2-NEXT:    pslld $15, %xmm9
+; SSE2-NEXT:    psrad $16, %xmm9
+; SSE2-NEXT:    pslld $15, %xmm0
+; SSE2-NEXT:    psrad $16, %xmm0
+; SSE2-NEXT:    packssdw %xmm9, %xmm0
+; SSE2-NEXT:    pslld $15, %xmm8
+; SSE2-NEXT:    psrad $16, %xmm8
+; SSE2-NEXT:    pslld $15, %xmm1
+; SSE2-NEXT:    psrad $16, %xmm1
+; SSE2-NEXT:    packssdw %xmm8, %xmm1
+; SSE2-NEXT:    pslld $15, %xmm5
+; SSE2-NEXT:    psrad $16, %xmm5
+; SSE2-NEXT:    pslld $15, %xmm2
+; SSE2-NEXT:    psrad $16, %xmm2
+; SSE2-NEXT:    packssdw %xmm5, %xmm2
+; SSE2-NEXT:    pslld $15, %xmm4
+; SSE2-NEXT:    psrad $16, %xmm4
+; SSE2-NEXT:    pslld $15, %xmm3
+; SSE2-NEXT:    psrad $16, %xmm3
+; SSE2-NEXT:    packssdw %xmm4, %xmm3
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_ext_v32i16:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    pshufd {{.*#+}} xmm8 = xmm3[2,3,2,3]
+; SSE4-NEXT:    pmovsxwd %xmm8, %xmm9
+; SSE4-NEXT:    pmovsxwd %xmm3, %xmm10
+; SSE4-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3]
+; SSE4-NEXT:    pmovsxwd %xmm3, %xmm11
+; SSE4-NEXT:    pmovsxwd %xmm2, %xmm12
+; SSE4-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
+; SSE4-NEXT:    pmovsxwd %xmm2, %xmm13
+; SSE4-NEXT:    pmovsxwd %xmm1, %xmm14
+; SSE4-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; SSE4-NEXT:    pmovsxwd %xmm1, %xmm15
+; SSE4-NEXT:    pmovsxwd %xmm0, %xmm0
+; SSE4-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE4-NEXT:    pshufd {{.*#+}} xmm0 = xmm7[2,3,2,3]
+; SSE4-NEXT:    pmovsxwd %xmm0, %xmm8
+; SSE4-NEXT:    paddd %xmm9, %xmm8
+; SSE4-NEXT:    pmovsxwd %xmm7, %xmm3
+; SSE4-NEXT:    paddd %xmm10, %xmm3
+; SSE4-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[2,3,2,3]
+; SSE4-NEXT:    pmovsxwd %xmm0, %xmm7
+; SSE4-NEXT:    paddd %xmm11, %xmm7
+; SSE4-NEXT:    pmovsxwd %xmm6, %xmm2
+; SSE4-NEXT:    paddd %xmm12, %xmm2
+; SSE4-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[2,3,2,3]
+; SSE4-NEXT:    pmovsxwd %xmm0, %xmm6
+; SSE4-NEXT:    paddd %xmm13, %xmm6
+; SSE4-NEXT:    pmovsxwd %xmm5, %xmm1
+; SSE4-NEXT:    paddd %xmm14, %xmm1
+; SSE4-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3]
+; SSE4-NEXT:    pmovsxwd %xmm0, %xmm5
+; SSE4-NEXT:    paddd %xmm15, %xmm5
+; SSE4-NEXT:    pmovsxwd %xmm4, %xmm0
+; SSE4-NEXT:    paddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; SSE4-NEXT:    psrld $1, %xmm8
+; SSE4-NEXT:    psrld $1, %xmm3
+; SSE4-NEXT:    psrld $1, %xmm7
+; SSE4-NEXT:    psrld $1, %xmm2
+; SSE4-NEXT:    psrld $1, %xmm6
+; SSE4-NEXT:    psrld $1, %xmm1
+; SSE4-NEXT:    psrld $1, %xmm5
+; SSE4-NEXT:    psrld $1, %xmm0
+; SSE4-NEXT:    pxor %xmm4, %xmm4
+; SSE4-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2],xmm4[3],xmm0[4],xmm4[5],xmm0[6],xmm4[7]
+; SSE4-NEXT:    pblendw {{.*#+}} xmm5 = xmm5[0],xmm4[1],xmm5[2],xmm4[3],xmm5[4],xmm4[5],xmm5[6],xmm4[7]
+; SSE4-NEXT:    packusdw %xmm5, %xmm0
+; SSE4-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2],xmm4[3],xmm1[4],xmm4[5],xmm1[6],xmm4[7]
+; SSE4-NEXT:    pblendw {{.*#+}} xmm6 = xmm6[0],xmm4[1],xmm6[2],xmm4[3],xmm6[4],xmm4[5],xmm6[6],xmm4[7]
+; SSE4-NEXT:    packusdw %xmm6, %xmm1
+; SSE4-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2],xmm4[3],xmm2[4],xmm4[5],xmm2[6],xmm4[7]
+; SSE4-NEXT:    pblendw {{.*#+}} xmm7 = xmm7[0],xmm4[1],xmm7[2],xmm4[3],xmm7[4],xmm4[5],xmm7[6],xmm4[7]
+; SSE4-NEXT:    packusdw %xmm7, %xmm2
+; SSE4-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2],xmm4[3],xmm3[4],xmm4[5],xmm3[6],xmm4[7]
+; SSE4-NEXT:    pblendw {{.*#+}} xmm8 = xmm8[0],xmm4[1],xmm8[2],xmm4[3],xmm8[4],xmm4[5],xmm8[6],xmm4[7]
+; SSE4-NEXT:    packusdw %xmm8, %xmm3
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: test_ext_v32i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm1[2,3,2,3]
+; AVX1-NEXT:    vpmovsxwd %xmm4, %xmm4
+; AVX1-NEXT:    vpmovsxwd %xmm1, %xmm5
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm6 = xmm1[2,3,2,3]
+; AVX1-NEXT:    vpmovsxwd %xmm6, %xmm6
+; AVX1-NEXT:    vpmovsxwd %xmm1, %xmm1
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm7 = xmm0[2,3,2,3]
+; AVX1-NEXT:    vpmovsxwd %xmm7, %xmm7
+; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm8
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm9 = xmm0[2,3,2,3]
+; AVX1-NEXT:    vpmovsxwd %xmm9, %xmm9
+; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm10 = xmm3[2,3,2,3]
+; AVX1-NEXT:    vpmovsxwd %xmm10, %xmm10
+; AVX1-NEXT:    vpaddd %xmm4, %xmm10, %xmm4
+; AVX1-NEXT:    vpmovsxwd %xmm3, %xmm10
+; AVX1-NEXT:    vpaddd %xmm5, %xmm10, %xmm5
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm10 = xmm3[2,3,2,3]
+; AVX1-NEXT:    vpmovsxwd %xmm10, %xmm10
+; AVX1-NEXT:    vpaddd %xmm6, %xmm10, %xmm6
+; AVX1-NEXT:    vpmovsxwd %xmm3, %xmm3
+; AVX1-NEXT:    vpaddd %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[2,3,2,3]
+; AVX1-NEXT:    vpmovsxwd %xmm3, %xmm3
+; AVX1-NEXT:    vpaddd %xmm3, %xmm7, %xmm3
+; AVX1-NEXT:    vpmovsxwd %xmm2, %xmm7
+; AVX1-NEXT:    vpaddd %xmm7, %xmm8, %xmm7
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm8 = xmm2[2,3,2,3]
+; AVX1-NEXT:    vpmovsxwd %xmm8, %xmm8
+; AVX1-NEXT:    vpaddd %xmm8, %xmm9, %xmm8
+; AVX1-NEXT:    vpmovsxwd %xmm2, %xmm2
+; AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrld $1, %xmm4, %xmm2
+; AVX1-NEXT:    vpsrld $1, %xmm5, %xmm4
+; AVX1-NEXT:    vpsrld $1, %xmm6, %xmm5
+; AVX1-NEXT:    vpsrld $1, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrld $1, %xmm3, %xmm3
+; AVX1-NEXT:    vpsrld $1, %xmm7, %xmm6
+; AVX1-NEXT:    vpsrld $1, %xmm8, %xmm7
+; AVX1-NEXT:    vpsrld $1, %xmm0, %xmm0
+; AVX1-NEXT:    vpxor %xmm8, %xmm8, %xmm8
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm8[1],xmm0[2],xmm8[3],xmm0[4],xmm8[5],xmm0[6],xmm8[7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0],xmm8[1],xmm7[2],xmm8[3],xmm7[4],xmm8[5],xmm7[6],xmm8[7]
+; AVX1-NEXT:    vpackusdw %xmm7, %xmm0, %xmm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0],xmm8[1],xmm6[2],xmm8[3],xmm6[4],xmm8[5],xmm6[6],xmm8[7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0],xmm8[1],xmm3[2],xmm8[3],xmm3[4],xmm8[5],xmm3[6],xmm8[7]
+; AVX1-NEXT:    vpackusdw %xmm3, %xmm6, %xmm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm3, %ymm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm8[1],xmm1[2],xmm8[3],xmm1[4],xmm8[5],xmm1[6],xmm8[7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm5[0],xmm8[1],xmm5[2],xmm8[3],xmm5[4],xmm8[5],xmm5[6],xmm8[7]
+; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0],xmm8[1],xmm4[2],xmm8[3],xmm4[4],xmm8[5],xmm4[6],xmm8[7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm8[1],xmm2[2],xmm8[3],xmm2[4],xmm8[5],xmm2[6],xmm8[7]
+; AVX1-NEXT:    vpackusdw %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_ext_v32i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmovsxwd %xmm1, %ymm4
+; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm1
+; AVX2-NEXT:    vpmovsxwd %xmm1, %ymm1
+; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm5
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0
+; AVX2-NEXT:    vpmovsxwd %xmm3, %ymm6
+; AVX2-NEXT:    vpaddd %ymm6, %ymm4, %ymm4
+; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm3
+; AVX2-NEXT:    vpmovsxwd %xmm3, %ymm3
+; AVX2-NEXT:    vpaddd %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    vpmovsxwd %xmm2, %ymm3
+; AVX2-NEXT:    vpaddd %ymm3, %ymm5, %ymm3
+; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm2
+; AVX2-NEXT:    vpmovsxwd %xmm2, %ymm2
+; AVX2-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrld $1, %ymm4, %ymm2
+; AVX2-NEXT:    vpsrld $1, %ymm1, %ymm1
+; AVX2-NEXT:    vpsrld $1, %ymm3, %ymm3
+; AVX2-NEXT:    vpsrld $1, %ymm0, %ymm0
+; AVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2],ymm4[3],ymm0[4],ymm4[5],ymm0[6],ymm4[7],ymm0[8],ymm4[9],ymm0[10],ymm4[11],ymm0[12],ymm4[13],ymm0[14],ymm4[15]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7],ymm3[8],ymm4[9],ymm3[10],ymm4[11],ymm3[12],ymm4[13],ymm3[14],ymm4[15]
+; AVX2-NEXT:    vpackusdw %ymm0, %ymm3, %ymm0
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2],ymm4[3],ymm1[4],ymm4[5],ymm1[6],ymm4[7],ymm1[8],ymm4[9],ymm1[10],ymm4[11],ymm1[12],ymm4[13],ymm1[14],ymm4[15]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2],ymm4[3],ymm2[4],ymm4[5],ymm2[6],ymm4[7],ymm2[8],ymm4[9],ymm2[10],ymm4[11],ymm2[12],ymm4[13],ymm2[14],ymm4[15]
+; AVX2-NEXT:    vpackusdw %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3]
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_ext_v32i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
+; AVX512-NEXT:    vpmovsxwd %ymm2, %zmm2
+; AVX512-NEXT:    vpmovsxwd %ymm0, %zmm0
+; AVX512-NEXT:    vextracti64x4 $1, %zmm1, %ymm3
+; AVX512-NEXT:    vpmovsxwd %ymm3, %zmm3
+; AVX512-NEXT:    vpaddd %zmm3, %zmm2, %zmm2
+; AVX512-NEXT:    vpmovsxwd %ymm1, %zmm1
+; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpsrld $1, %zmm2, %zmm1
+; AVX512-NEXT:    vpsrld $1, %zmm0, %zmm0
+; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
+; AVX512-NEXT:    vpmovdw %zmm1, %ymm1
+; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT:    retq
+  %x0 = sext <32 x i16> %a0 to <32 x i32>
+  %x1 = sext <32 x i16> %a1 to <32 x i32>
+  %sum = add <32 x i32> %x0, %x1
+  %shift = ashr <32 x i32> %sum, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %res = trunc <32 x i32> %shift to <32 x i16>
+  ret <32 x i16> %res
+}
+
+define <16 x i32> @test_fixed_v16i32(<16 x i32> %a0, <16 x i32> %a1) {
+; SSE-LABEL: test_fixed_v16i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa %xmm3, %xmm8
+; SSE-NEXT:    pand %xmm7, %xmm8
+; SSE-NEXT:    movdqa %xmm2, %xmm9
+; SSE-NEXT:    pand %xmm6, %xmm9
+; SSE-NEXT:    movdqa %xmm1, %xmm10
+; SSE-NEXT:    pand %xmm5, %xmm10
+; SSE-NEXT:    movdqa %xmm0, %xmm11
+; SSE-NEXT:    pand %xmm4, %xmm11
+; SSE-NEXT:    pxor %xmm4, %xmm0
+; SSE-NEXT:    pxor %xmm5, %xmm1
+; SSE-NEXT:    pxor %xmm6, %xmm2
+; SSE-NEXT:    pxor %xmm7, %xmm3
+; SSE-NEXT:    psrad $1, %xmm3
+; SSE-NEXT:    paddd %xmm8, %xmm3
+; SSE-NEXT:    psrad $1, %xmm2
+; SSE-NEXT:    paddd %xmm9, %xmm2
+; SSE-NEXT:    psrad $1, %xmm1
+; SSE-NEXT:    paddd %xmm10, %xmm1
+; SSE-NEXT:    psrad $1, %xmm0
+; SSE-NEXT:    paddd %xmm11, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: test_fixed_v16i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vandps %ymm3, %ymm1, %ymm4
+; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm5
+; AVX1-NEXT:    vxorps %ymm0, %ymm2, %ymm0
+; AVX1-NEXT:    vxorps %ymm1, %ymm3, %ymm1
+; AVX1-NEXT:    vpsrad $1, %xmm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vpsrad $1, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrad $1, %xmm0, %xmm3
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpsrad $1, %xmm0, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm5, %xmm6
+; AVX1-NEXT:    vpaddd %xmm0, %xmm6, %xmm0
+; AVX1-NEXT:    vpaddd %xmm3, %xmm5, %xmm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm3, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm4, %xmm3
+; AVX1-NEXT:    vpaddd %xmm1, %xmm3, %xmm1
+; AVX1-NEXT:    vpaddd %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_fixed_v16i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm4
+; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm5
+; AVX2-NEXT:    vpxor %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vpxor %ymm1, %ymm3, %ymm1
+; AVX2-NEXT:    vpsrad $1, %ymm1, %ymm1
+; AVX2-NEXT:    vpaddd %ymm1, %ymm4, %ymm1
+; AVX2-NEXT:    vpsrad $1, %ymm0, %ymm0
+; AVX2-NEXT:    vpaddd %ymm0, %ymm5, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_fixed_v16i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpandd %zmm1, %zmm0, %zmm2
+; AVX512-NEXT:    vpxord %zmm0, %zmm1, %zmm0
+; AVX512-NEXT:    vpsrad $1, %zmm0, %zmm0
+; AVX512-NEXT:    vpaddd %zmm0, %zmm2, %zmm0
+; AVX512-NEXT:    retq
+  %and = and <16 x i32> %a0, %a1
+  %xor = xor <16 x i32> %a1, %a0
+  %shift = ashr <16 x i32> %xor, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %res = add <16 x i32> %and, %shift
+  ret <16 x i32> %res
+}
+
+define <16 x i32> @test_ext_v16i32(<16 x i32> %a0, <16 x i32> %a1) {
+; SSE2-LABEL: test_ext_v16i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pxor %xmm8, %xmm8
+; SSE2-NEXT:    pxor %xmm9, %xmm9
+; SSE2-NEXT:    pcmpgtd %xmm3, %xmm9
+; SSE2-NEXT:    pshufd {{.*#+}} xmm13 = xmm3[2,3,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1]
+; SSE2-NEXT:    pxor %xmm9, %xmm9
+; SSE2-NEXT:    pcmpgtd %xmm13, %xmm9
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm13 = xmm13[0],xmm9[0],xmm13[1],xmm9[1]
+; SSE2-NEXT:    pxor %xmm9, %xmm9
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm9
+; SSE2-NEXT:    pshufd {{.*#+}} xmm12 = xmm2[2,3,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1]
+; SSE2-NEXT:    pxor %xmm9, %xmm9
+; SSE2-NEXT:    pcmpgtd %xmm12, %xmm9
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm12 = xmm12[0],xmm9[0],xmm12[1],xmm9[1]
+; SSE2-NEXT:    pxor %xmm9, %xmm9
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm9
+; SSE2-NEXT:    pshufd {{.*#+}} xmm11 = xmm1[2,3,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1]
+; SSE2-NEXT:    pxor %xmm9, %xmm9
+; SSE2-NEXT:    pcmpgtd %xmm11, %xmm9
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm11 = xmm11[0],xmm9[0],xmm11[1],xmm9[1]
+; SSE2-NEXT:    pxor %xmm10, %xmm10
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm10
+; SSE2-NEXT:    pshufd {{.*#+}} xmm9 = xmm0[2,3,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1]
+; SSE2-NEXT:    pxor %xmm10, %xmm10
+; SSE2-NEXT:    pcmpgtd %xmm9, %xmm10
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1]
+; SSE2-NEXT:    pxor %xmm14, %xmm14
+; SSE2-NEXT:    pcmpgtd %xmm7, %xmm14
+; SSE2-NEXT:    pshufd {{.*#+}} xmm10 = xmm7[2,3,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm14[0],xmm7[1],xmm14[1]
+; SSE2-NEXT:    paddq %xmm7, %xmm3
+; SSE2-NEXT:    pxor %xmm7, %xmm7
+; SSE2-NEXT:    pcmpgtd %xmm10, %xmm7
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm10 = xmm10[0],xmm7[0],xmm10[1],xmm7[1]
+; SSE2-NEXT:    paddq %xmm13, %xmm10
+; SSE2-NEXT:    pxor %xmm13, %xmm13
+; SSE2-NEXT:    pcmpgtd %xmm6, %xmm13
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[2,3,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm6 = xmm6[0],xmm13[0],xmm6[1],xmm13[1]
+; SSE2-NEXT:    paddq %xmm6, %xmm2
+; SSE2-NEXT:    pxor %xmm6, %xmm6
+; SSE2-NEXT:    pcmpgtd %xmm7, %xmm6
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
+; SSE2-NEXT:    paddq %xmm12, %xmm7
+; SSE2-NEXT:    pxor %xmm12, %xmm12
+; SSE2-NEXT:    pcmpgtd %xmm5, %xmm12
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[2,3,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1]
+; SSE2-NEXT:    paddq %xmm5, %xmm1
+; SSE2-NEXT:    pxor %xmm5, %xmm5
+; SSE2-NEXT:    pcmpgtd %xmm6, %xmm5
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
+; SSE2-NEXT:    paddq %xmm11, %xmm6
+; SSE2-NEXT:    pxor %xmm11, %xmm11
+; SSE2-NEXT:    pcmpgtd %xmm4, %xmm11
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[2,3,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm11[0],xmm4[1],xmm11[1]
+; SSE2-NEXT:    paddq %xmm4, %xmm0
+; SSE2-NEXT:    pcmpgtd %xmm5, %xmm8
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1]
+; SSE2-NEXT:    paddq %xmm9, %xmm5
+; SSE2-NEXT:    psrlq $1, %xmm3
+; SSE2-NEXT:    psrlq $1, %xmm10
+; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,2],xmm10[0,2]
+; SSE2-NEXT:    psrlq $1, %xmm2
+; SSE2-NEXT:    psrlq $1, %xmm7
+; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm7[0,2]
+; SSE2-NEXT:    psrlq $1, %xmm1
+; SSE2-NEXT:    psrlq $1, %xmm6
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm6[0,2]
+; SSE2-NEXT:    psrlq $1, %xmm0
+; SSE2-NEXT:    psrlq $1, %xmm5
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2]
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_ext_v16i32:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    pmovsxdq %xmm3, %xmm8
+; SSE4-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
+; SSE4-NEXT:    pmovsxdq %xmm3, %xmm9
+; SSE4-NEXT:    pmovsxdq %xmm2, %xmm10
+; SSE4-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
+; SSE4-NEXT:    pmovsxdq %xmm2, %xmm11
+; SSE4-NEXT:    pmovsxdq %xmm1, %xmm12
+; SSE4-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; SSE4-NEXT:    pmovsxdq %xmm1, %xmm13
+; SSE4-NEXT:    pmovsxdq %xmm0, %xmm14
+; SSE4-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; SSE4-NEXT:    pmovsxdq %xmm0, %xmm15
+; SSE4-NEXT:    pmovsxdq %xmm7, %xmm3
+; SSE4-NEXT:    paddq %xmm8, %xmm3
+; SSE4-NEXT:    pshufd {{.*#+}} xmm0 = xmm7[2,3,2,3]
+; SSE4-NEXT:    pmovsxdq %xmm0, %xmm7
+; SSE4-NEXT:    paddq %xmm9, %xmm7
+; SSE4-NEXT:    pmovsxdq %xmm6, %xmm2
+; SSE4-NEXT:    paddq %xmm10, %xmm2
+; SSE4-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[2,3,2,3]
+; SSE4-NEXT:    pmovsxdq %xmm0, %xmm6
+; SSE4-NEXT:    paddq %xmm11, %xmm6
+; SSE4-NEXT:    pmovsxdq %xmm5, %xmm1
+; SSE4-NEXT:    paddq %xmm12, %xmm1
+; SSE4-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[2,3,2,3]
+; SSE4-NEXT:    pmovsxdq %xmm0, %xmm5
+; SSE4-NEXT:    paddq %xmm13, %xmm5
+; SSE4-NEXT:    pmovsxdq %xmm4, %xmm0
+; SSE4-NEXT:    paddq %xmm14, %xmm0
+; SSE4-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
+; SSE4-NEXT:    pmovsxdq %xmm4, %xmm4
+; SSE4-NEXT:    paddq %xmm15, %xmm4
+; SSE4-NEXT:    psrlq $1, %xmm3
+; SSE4-NEXT:    psrlq $1, %xmm7
+; SSE4-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,2],xmm7[0,2]
+; SSE4-NEXT:    psrlq $1, %xmm2
+; SSE4-NEXT:    psrlq $1, %xmm6
+; SSE4-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm6[0,2]
+; SSE4-NEXT:    psrlq $1, %xmm1
+; SSE4-NEXT:    psrlq $1, %xmm5
+; SSE4-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2]
+; SSE4-NEXT:    psrlq $1, %xmm0
+; SSE4-NEXT:    psrlq $1, %xmm4
+; SSE4-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm4[0,2]
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: test_ext_v16i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm4[2,3,2,3]
+; AVX1-NEXT:    vpmovsxdq %xmm5, %xmm5
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm6 = xmm1[2,3,2,3]
+; AVX1-NEXT:    vpmovsxdq %xmm6, %xmm6
+; AVX1-NEXT:    vpmovsxdq %xmm4, %xmm4
+; AVX1-NEXT:    vpmovsxdq %xmm1, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm7
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm8 = xmm7[2,3,2,3]
+; AVX1-NEXT:    vpmovsxdq %xmm8, %xmm8
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm9 = xmm0[2,3,2,3]
+; AVX1-NEXT:    vpmovsxdq %xmm9, %xmm9
+; AVX1-NEXT:    vpmovsxdq %xmm7, %xmm7
+; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm10
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm11 = xmm10[2,3,2,3]
+; AVX1-NEXT:    vpmovsxdq %xmm11, %xmm11
+; AVX1-NEXT:    vpaddq %xmm5, %xmm11, %xmm5
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm11 = xmm3[2,3,2,3]
+; AVX1-NEXT:    vpmovsxdq %xmm11, %xmm11
+; AVX1-NEXT:    vpaddq %xmm6, %xmm11, %xmm6
+; AVX1-NEXT:    vpmovsxdq %xmm10, %xmm10
+; AVX1-NEXT:    vpaddq %xmm4, %xmm10, %xmm4
+; AVX1-NEXT:    vpmovsxdq %xmm3, %xmm3
+; AVX1-NEXT:    vpaddq %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm10 = xmm3[2,3,2,3]
+; AVX1-NEXT:    vpmovsxdq %xmm10, %xmm10
+; AVX1-NEXT:    vpaddq %xmm10, %xmm8, %xmm8
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm10 = xmm2[2,3,2,3]
+; AVX1-NEXT:    vpmovsxdq %xmm10, %xmm10
+; AVX1-NEXT:    vpaddq %xmm10, %xmm9, %xmm9
+; AVX1-NEXT:    vpmovsxdq %xmm3, %xmm3
+; AVX1-NEXT:    vpaddq %xmm3, %xmm7, %xmm3
+; AVX1-NEXT:    vpmovsxdq %xmm2, %xmm2
+; AVX1-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlq $1, %xmm5, %xmm2
+; AVX1-NEXT:    vpsrlq $1, %xmm6, %xmm5
+; AVX1-NEXT:    vpsrlq $1, %xmm4, %xmm4
+; AVX1-NEXT:    vpsrlq $1, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrlq $1, %xmm8, %xmm6
+; AVX1-NEXT:    vpsrlq $1, %xmm9, %xmm7
+; AVX1-NEXT:    vpsrlq $1, %xmm3, %xmm3
+; AVX1-NEXT:    vpsrlq $1, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm7, %ymm3
+; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm3[0,2],ymm0[4,6],ymm3[4,6]
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm1, %ymm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm5, %ymm2
+; AVX1-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,2],ymm2[0,2],ymm1[4,6],ymm2[4,6]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_ext_v16i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm4
+; AVX2-NEXT:    vpmovsxdq %xmm4, %ymm4
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm5
+; AVX2-NEXT:    vpmovsxdq %xmm5, %ymm5
+; AVX2-NEXT:    vpmovsxdq %xmm0, %ymm0
+; AVX2-NEXT:    vpmovsxdq %xmm1, %ymm1
+; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm6
+; AVX2-NEXT:    vpmovsxdq %xmm6, %ymm6
+; AVX2-NEXT:    vpaddq %ymm6, %ymm4, %ymm4
+; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm6
+; AVX2-NEXT:    vpmovsxdq %xmm6, %ymm6
+; AVX2-NEXT:    vpaddq %ymm6, %ymm5, %ymm5
+; AVX2-NEXT:    vpmovsxdq %xmm2, %ymm2
+; AVX2-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpmovsxdq %xmm3, %ymm2
+; AVX2-NEXT:    vpaddq %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm5[2,3]
+; AVX2-NEXT:    vpsrlq $1, %ymm2, %ymm2
+; AVX2-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrlq $1, %ymm0, %ymm0
+; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
+; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm1[2,3],ymm4[2,3]
+; AVX2-NEXT:    vpsrlq $1, %ymm2, %ymm2
+; AVX2-NEXT:    vinserti128 $1, %xmm4, %ymm1, %ymm1
+; AVX2-NEXT:    vpsrlq $1, %ymm1, %ymm1
+; AVX2-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,2],ymm2[0,2],ymm1[4,6],ymm2[4,6]
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_ext_v16i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
+; AVX512-NEXT:    vpmovsxdq %ymm2, %zmm2
+; AVX512-NEXT:    vpmovsxdq %ymm0, %zmm0
+; AVX512-NEXT:    vextracti64x4 $1, %zmm1, %ymm3
+; AVX512-NEXT:    vpmovsxdq %ymm3, %zmm3
+; AVX512-NEXT:    vpaddq %zmm3, %zmm2, %zmm2
+; AVX512-NEXT:    vpmovsxdq %ymm1, %zmm1
+; AVX512-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpsrlq $1, %zmm2, %zmm1
+; AVX512-NEXT:    vpsrlq $1, %zmm0, %zmm0
+; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
+; AVX512-NEXT:    vpmovqd %zmm1, %ymm1
+; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT:    retq
+  %x0 = sext <16 x i32> %a0 to <16 x i64>
+  %x1 = sext <16 x i32> %a1 to <16 x i64>
+  %sum = add <16 x i64> %x0, %x1
+  %shift = ashr <16 x i64> %sum, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
+  %res = trunc <16 x i64> %shift to <16 x i32>
+  ret <16 x i32> %res
+}
+
+define <8 x i64> @test_fixed_v8i64(<8 x i64> %a0, <8 x i64> %a1) {
+; SSE2-LABEL: test_fixed_v8i64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa %xmm3, %xmm11
+; SSE2-NEXT:    pand %xmm7, %xmm11
+; SSE2-NEXT:    movdqa %xmm2, %xmm10
+; SSE2-NEXT:    pand %xmm6, %xmm10
+; SSE2-NEXT:    movdqa %xmm1, %xmm9
+; SSE2-NEXT:    pand %xmm5, %xmm9
+; SSE2-NEXT:    movdqa %xmm0, %xmm8
+; SSE2-NEXT:    pand %xmm4, %xmm8
+; SSE2-NEXT:    pxor %xmm0, %xmm4
+; SSE2-NEXT:    pxor %xmm1, %xmm5
+; SSE2-NEXT:    pxor %xmm2, %xmm6
+; SSE2-NEXT:    pxor %xmm3, %xmm7
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm7[1,3,2,3]
+; SSE2-NEXT:    psrad $1, %xmm0
+; SSE2-NEXT:    psrlq $1, %xmm7
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm7[0,2,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
+; SSE2-NEXT:    paddq %xmm11, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[1,3,2,3]
+; SSE2-NEXT:    psrad $1, %xmm0
+; SSE2-NEXT:    psrlq $1, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm6[0,2,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; SSE2-NEXT:    paddq %xmm10, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[1,3,2,3]
+; SSE2-NEXT:    psrad $1, %xmm0
+; SSE2-NEXT:    psrlq $1, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm5[0,2,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2-NEXT:    paddq %xmm9, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[1,3,2,3]
+; SSE2-NEXT:    psrad $1, %xmm5
+; SSE2-NEXT:    psrlq $1, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[0,2,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
+; SSE2-NEXT:    paddq %xmm8, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_fixed_v8i64:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    movdqa %xmm3, %xmm10
+; SSE4-NEXT:    pand %xmm7, %xmm10
+; SSE4-NEXT:    movdqa %xmm2, %xmm11
+; SSE4-NEXT:    pand %xmm6, %xmm11
+; SSE4-NEXT:    movdqa %xmm1, %xmm9
+; SSE4-NEXT:    pand %xmm5, %xmm9
+; SSE4-NEXT:    movdqa %xmm0, %xmm8
+; SSE4-NEXT:    pand %xmm4, %xmm8
+; SSE4-NEXT:    pxor %xmm4, %xmm0
+; SSE4-NEXT:    pxor %xmm5, %xmm1
+; SSE4-NEXT:    pxor %xmm6, %xmm2
+; SSE4-NEXT:    pxor %xmm7, %xmm3
+; SSE4-NEXT:    movdqa %xmm3, %xmm4
+; SSE4-NEXT:    psrad $1, %xmm4
+; SSE4-NEXT:    psrlq $1, %xmm3
+; SSE4-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7]
+; SSE4-NEXT:    paddq %xmm10, %xmm3
+; SSE4-NEXT:    movdqa %xmm2, %xmm4
+; SSE4-NEXT:    psrad $1, %xmm4
+; SSE4-NEXT:    psrlq $1, %xmm2
+; SSE4-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
+; SSE4-NEXT:    paddq %xmm11, %xmm2
+; SSE4-NEXT:    movdqa %xmm1, %xmm4
+; SSE4-NEXT:    psrad $1, %xmm4
+; SSE4-NEXT:    psrlq $1, %xmm1
+; SSE4-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3],xmm1[4,5],xmm4[6,7]
+; SSE4-NEXT:    paddq %xmm9, %xmm1
+; SSE4-NEXT:    movdqa %xmm0, %xmm4
+; SSE4-NEXT:    psrad $1, %xmm4
+; SSE4-NEXT:    psrlq $1, %xmm0
+; SSE4-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3],xmm0[4,5],xmm4[6,7]
+; SSE4-NEXT:    paddq %xmm8, %xmm0
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: test_fixed_v8i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vandps %ymm3, %ymm1, %ymm4
+; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm5
+; AVX1-NEXT:    vxorps %ymm0, %ymm2, %ymm0
+; AVX1-NEXT:    vxorps %ymm1, %ymm3, %ymm1
+; AVX1-NEXT:    vpsrad $1, %xmm1, %xmm2
+; AVX1-NEXT:    vpsrlq $1, %xmm1, %xmm3
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vpsrad $1, %xmm1, %xmm3
+; AVX1-NEXT:    vpsrlq $1, %xmm1, %xmm1
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
+; AVX1-NEXT:    vpsrad $1, %xmm0, %xmm3
+; AVX1-NEXT:    vpsrlq $1, %xmm0, %xmm6
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm6[0,1],xmm3[2,3],xmm6[4,5],xmm3[6,7]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpsrad $1, %xmm0, %xmm6
+; AVX1-NEXT:    vpsrlq $1, %xmm0, %xmm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3],xmm0[4,5],xmm6[6,7]
+; AVX1-NEXT:    vextractf128 $1, %ymm5, %xmm6
+; AVX1-NEXT:    vpaddq %xmm0, %xmm6, %xmm0
+; AVX1-NEXT:    vpaddq %xmm3, %xmm5, %xmm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm3, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm4, %xmm3
+; AVX1-NEXT:    vpaddq %xmm1, %xmm3, %xmm1
+; AVX1-NEXT:    vpaddq %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_fixed_v8i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm4
+; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm5
+; AVX2-NEXT:    vpxor %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vpxor %ymm1, %ymm3, %ymm1
+; AVX2-NEXT:    vpsrad $1, %ymm1, %ymm2
+; AVX2-NEXT:    vpsrlq $1, %ymm1, %ymm1
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
+; AVX2-NEXT:    vpaddq %ymm1, %ymm4, %ymm1
+; AVX2-NEXT:    vpsrad $1, %ymm0, %ymm2
+; AVX2-NEXT:    vpsrlq $1, %ymm0, %ymm0
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7]
+; AVX2-NEXT:    vpaddq %ymm0, %ymm5, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_fixed_v8i64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpandq %zmm1, %zmm0, %zmm2
+; AVX512-NEXT:    vpxorq %zmm0, %zmm1, %zmm0
+; AVX512-NEXT:    vpsraq $1, %zmm0, %zmm0
+; AVX512-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
+; AVX512-NEXT:    retq
+  %and = and <8 x i64> %a0, %a1
+  %xor = xor <8 x i64> %a1, %a0
+  %shift = ashr <8 x i64> %xor, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
+  %res = add <8 x i64> %and, %shift
+  ret <8 x i64> %res
+}
+
+define <8 x i64> @test_ext_v8i64(<8 x i64> %a0, <8 x i64> %a1) {
+; SSE2-LABEL: test_ext_v8i64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pushq %rbp
+; SSE2-NEXT:    .cfi_def_cfa_offset 16
+; SSE2-NEXT:    pushq %r15
+; SSE2-NEXT:    .cfi_def_cfa_offset 24
+; SSE2-NEXT:    pushq %r14
+; SSE2-NEXT:    .cfi_def_cfa_offset 32
+; SSE2-NEXT:    pushq %r13
+; SSE2-NEXT:    .cfi_def_cfa_offset 40
+; SSE2-NEXT:    pushq %r12
+; SSE2-NEXT:    .cfi_def_cfa_offset 48
+; SSE2-NEXT:    pushq %rbx
+; SSE2-NEXT:    .cfi_def_cfa_offset 56
+; SSE2-NEXT:    pushq %rax
+; SSE2-NEXT:    .cfi_def_cfa_offset 64
+; SSE2-NEXT:    .cfi_offset %rbx, -56
+; SSE2-NEXT:    .cfi_offset %r12, -48
+; SSE2-NEXT:    .cfi_offset %r13, -40
+; SSE2-NEXT:    .cfi_offset %r14, -32
+; SSE2-NEXT:    .cfi_offset %r15, -24
+; SSE2-NEXT:    .cfi_offset %rbp, -16
+; SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm3[2,3,2,3]
+; SSE2-NEXT:    movq %xmm8, %rax
+; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT:    movq %rax, %rcx
+; SSE2-NEXT:    sarq $63, %rcx
+; SSE2-NEXT:    movq %rcx, (%rsp) # 8-byte Spill
+; SSE2-NEXT:    movq %xmm3, %rax
+; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT:    movq %rax, %rcx
+; SSE2-NEXT:    sarq $63, %rcx
+; SSE2-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3]
+; SSE2-NEXT:    movq %xmm3, %rax
+; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT:    movq %rax, %rcx
+; SSE2-NEXT:    sarq $63, %rcx
+; SSE2-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT:    movq %xmm2, %rax
+; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT:    movq %rax, %rcx
+; SSE2-NEXT:    sarq $63, %rcx
+; SSE2-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
+; SSE2-NEXT:    movq %xmm2, %rax
+; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT:    movq %rax, %rcx
+; SSE2-NEXT:    sarq $63, %rcx
+; SSE2-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT:    movq %xmm1, %rbp
+; SSE2-NEXT:    movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT:    sarq $63, %rbp
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; SSE2-NEXT:    movq %xmm1, %rbx
+; SSE2-NEXT:    movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT:    sarq $63, %rbx
+; SSE2-NEXT:    movq %xmm0, %r15
+; SSE2-NEXT:    movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT:    sarq $63, %r15
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm7[2,3,2,3]
+; SSE2-NEXT:    movq %xmm0, %r10
+; SSE2-NEXT:    movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT:    sarq $63, %r10
+; SSE2-NEXT:    movq %xmm7, %r9
+; SSE2-NEXT:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT:    sarq $63, %r9
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[2,3,2,3]
+; SSE2-NEXT:    movq %xmm0, %r12
+; SSE2-NEXT:    movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT:    sarq $63, %r12
+; SSE2-NEXT:    movq %xmm6, %r13
+; SSE2-NEXT:    movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT:    sarq $63, %r13
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[2,3,2,3]
+; SSE2-NEXT:    movq %xmm0, %r14
+; SSE2-NEXT:    movq %r14, %rsi
+; SSE2-NEXT:    sarq $63, %rsi
+; SSE2-NEXT:    movq %xmm5, %r11
+; SSE2-NEXT:    movq %r11, %rdx
+; SSE2-NEXT:    sarq $63, %rdx
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3]
+; SSE2-NEXT:    movq %xmm0, %r8
+; SSE2-NEXT:    movq %r8, %rdi
+; SSE2-NEXT:    sarq $63, %rdi
+; SSE2-NEXT:    movq %xmm4, %rcx
+; SSE2-NEXT:    movq %rcx, %rax
+; SSE2-NEXT:    sarq $63, %rax
+; SSE2-NEXT:    addq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; SSE2-NEXT:    adcq %r15, %rax
+; SSE2-NEXT:    addq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; SSE2-NEXT:    adcq %rbx, %rdi
+; SSE2-NEXT:    addq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; SSE2-NEXT:    adcq %rbp, %rdx
+; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; SSE2-NEXT:    addq %r14, %r15
+; SSE2-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
+; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; SSE2-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload
+; SSE2-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
+; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; SSE2-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload
+; SSE2-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload
+; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; SSE2-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload
+; SSE2-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload
+; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; SSE2-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
+; SSE2-NEXT:    adcq (%rsp), %r10 # 8-byte Folded Reload
+; SSE2-NEXT:    shldq $63, %rcx, %r10
+; SSE2-NEXT:    shldq $63, %r8, %r9
+; SSE2-NEXT:    shldq $63, %r11, %r12
+; SSE2-NEXT:    shldq $63, %rbx, %r13
+; SSE2-NEXT:    shldq $63, %r15, %rsi
+; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; SSE2-NEXT:    shldq $63, %rcx, %rdx
+; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; SSE2-NEXT:    shldq $63, %rcx, %rdi
+; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; SSE2-NEXT:    shldq $63, %rcx, %rax
+; SSE2-NEXT:    movq %rax, %xmm0
+; SSE2-NEXT:    movq %rdi, %xmm4
+; SSE2-NEXT:    movq %rdx, %xmm1
+; SSE2-NEXT:    movq %rsi, %xmm5
+; SSE2-NEXT:    movq %r13, %xmm2
+; SSE2-NEXT:    movq %r12, %xmm6
+; SSE2-NEXT:    movq %r9, %xmm3
+; SSE2-NEXT:    movq %r10, %xmm7
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0]
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm6[0]
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm7[0]
+; SSE2-NEXT:    addq $8, %rsp
+; SSE2-NEXT:    .cfi_def_cfa_offset 56
+; SSE2-NEXT:    popq %rbx
+; SSE2-NEXT:    .cfi_def_cfa_offset 48
+; SSE2-NEXT:    popq %r12
+; SSE2-NEXT:    .cfi_def_cfa_offset 40
+; SSE2-NEXT:    popq %r13
+; SSE2-NEXT:    .cfi_def_cfa_offset 32
+; SSE2-NEXT:    popq %r14
+; SSE2-NEXT:    .cfi_def_cfa_offset 24
+; SSE2-NEXT:    popq %r15
+; SSE2-NEXT:    .cfi_def_cfa_offset 16
+; SSE2-NEXT:    popq %rbp
+; SSE2-NEXT:    .cfi_def_cfa_offset 8
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_ext_v8i64:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    pushq %rbp
+; SSE4-NEXT:    .cfi_def_cfa_offset 16
+; SSE4-NEXT:    pushq %r15
+; SSE4-NEXT:    .cfi_def_cfa_offset 24
+; SSE4-NEXT:    pushq %r14
+; SSE4-NEXT:    .cfi_def_cfa_offset 32
+; SSE4-NEXT:    pushq %r13
+; SSE4-NEXT:    .cfi_def_cfa_offset 40
+; SSE4-NEXT:    pushq %r12
+; SSE4-NEXT:    .cfi_def_cfa_offset 48
+; SSE4-NEXT:    pushq %rbx
+; SSE4-NEXT:    .cfi_def_cfa_offset 56
+; SSE4-NEXT:    pushq %rax
+; SSE4-NEXT:    .cfi_def_cfa_offset 64
+; SSE4-NEXT:    .cfi_offset %rbx, -56
+; SSE4-NEXT:    .cfi_offset %r12, -48
+; SSE4-NEXT:    .cfi_offset %r13, -40
+; SSE4-NEXT:    .cfi_offset %r14, -32
+; SSE4-NEXT:    .cfi_offset %r15, -24
+; SSE4-NEXT:    .cfi_offset %rbp, -16
+; SSE4-NEXT:    movq %xmm3, %rax
+; SSE4-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE4-NEXT:    movq %rax, %rcx
+; SSE4-NEXT:    sarq $63, %rcx
+; SSE4-NEXT:    movq %rcx, (%rsp) # 8-byte Spill
+; SSE4-NEXT:    pextrq $1, %xmm3, %rax
+; SSE4-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE4-NEXT:    movq %rax, %rcx
+; SSE4-NEXT:    sarq $63, %rcx
+; SSE4-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE4-NEXT:    movq %xmm2, %rax
+; SSE4-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE4-NEXT:    movq %rax, %rcx
+; SSE4-NEXT:    sarq $63, %rcx
+; SSE4-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE4-NEXT:    pextrq $1, %xmm2, %rax
+; SSE4-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE4-NEXT:    movq %rax, %rcx
+; SSE4-NEXT:    sarq $63, %rcx
+; SSE4-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE4-NEXT:    movq %xmm1, %rax
+; SSE4-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE4-NEXT:    movq %rax, %rcx
+; SSE4-NEXT:    sarq $63, %rcx
+; SSE4-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE4-NEXT:    pextrq $1, %xmm1, %rbp
+; SSE4-NEXT:    movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE4-NEXT:    sarq $63, %rbp
+; SSE4-NEXT:    movq %xmm0, %rbx
+; SSE4-NEXT:    movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE4-NEXT:    sarq $63, %rbx
+; SSE4-NEXT:    pextrq $1, %xmm0, %r14
+; SSE4-NEXT:    movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE4-NEXT:    sarq $63, %r14
+; SSE4-NEXT:    movq %xmm7, %r10
+; SSE4-NEXT:    movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE4-NEXT:    sarq $63, %r10
+; SSE4-NEXT:    pextrq $1, %xmm7, %r9
+; SSE4-NEXT:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE4-NEXT:    sarq $63, %r9
+; SSE4-NEXT:    movq %xmm6, %r15
+; SSE4-NEXT:    movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE4-NEXT:    sarq $63, %r15
+; SSE4-NEXT:    pextrq $1, %xmm6, %r13
+; SSE4-NEXT:    movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE4-NEXT:    sarq $63, %r13
+; SSE4-NEXT:    movq %xmm5, %r12
+; SSE4-NEXT:    movq %r12, %rsi
+; SSE4-NEXT:    sarq $63, %rsi
+; SSE4-NEXT:    pextrq $1, %xmm5, %r11
+; SSE4-NEXT:    movq %r11, %rdx
+; SSE4-NEXT:    sarq $63, %rdx
+; SSE4-NEXT:    movq %xmm4, %r8
+; SSE4-NEXT:    movq %r8, %rdi
+; SSE4-NEXT:    sarq $63, %rdi
+; SSE4-NEXT:    pextrq $1, %xmm4, %rcx
+; SSE4-NEXT:    movq %rcx, %rax
+; SSE4-NEXT:    sarq $63, %rax
+; SSE4-NEXT:    addq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; SSE4-NEXT:    adcq %r14, %rax
+; SSE4-NEXT:    addq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; SSE4-NEXT:    adcq %rbx, %rdi
+; SSE4-NEXT:    addq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; SSE4-NEXT:    adcq %rbp, %rdx
+; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; SSE4-NEXT:    addq %r12, %r14
+; SSE4-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
+; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; SSE4-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload
+; SSE4-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
+; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; SSE4-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload
+; SSE4-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload
+; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; SSE4-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload
+; SSE4-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload
+; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; SSE4-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
+; SSE4-NEXT:    adcq (%rsp), %r10 # 8-byte Folded Reload
+; SSE4-NEXT:    shldq $63, %rcx, %r10
+; SSE4-NEXT:    shldq $63, %r8, %r9
+; SSE4-NEXT:    shldq $63, %r11, %r15
+; SSE4-NEXT:    shldq $63, %rbx, %r13
+; SSE4-NEXT:    shldq $63, %r14, %rsi
+; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; SSE4-NEXT:    shldq $63, %rcx, %rdx
+; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; SSE4-NEXT:    shldq $63, %rcx, %rdi
+; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; SSE4-NEXT:    shldq $63, %rcx, %rax
+; SSE4-NEXT:    movq %rax, %xmm4
+; SSE4-NEXT:    movq %rdi, %xmm0
+; SSE4-NEXT:    movq %rdx, %xmm5
+; SSE4-NEXT:    movq %rsi, %xmm1
+; SSE4-NEXT:    movq %r13, %xmm6
+; SSE4-NEXT:    movq %r15, %xmm2
+; SSE4-NEXT:    movq %r9, %xmm7
+; SSE4-NEXT:    movq %r10, %xmm3
+; SSE4-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
+; SSE4-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0]
+; SSE4-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm6[0]
+; SSE4-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm7[0]
+; SSE4-NEXT:    addq $8, %rsp
+; SSE4-NEXT:    .cfi_def_cfa_offset 56
+; SSE4-NEXT:    popq %rbx
+; SSE4-NEXT:    .cfi_def_cfa_offset 48
+; SSE4-NEXT:    popq %r12
+; SSE4-NEXT:    .cfi_def_cfa_offset 40
+; SSE4-NEXT:    popq %r13
+; SSE4-NEXT:    .cfi_def_cfa_offset 32
+; SSE4-NEXT:    popq %r14
+; SSE4-NEXT:    .cfi_def_cfa_offset 24
+; SSE4-NEXT:    popq %r15
+; SSE4-NEXT:    .cfi_def_cfa_offset 16
+; SSE4-NEXT:    popq %rbp
+; SSE4-NEXT:    .cfi_def_cfa_offset 8
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: test_ext_v8i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    pushq %rbp
+; AVX1-NEXT:    .cfi_def_cfa_offset 16
+; AVX1-NEXT:    pushq %r15
+; AVX1-NEXT:    .cfi_def_cfa_offset 24
+; AVX1-NEXT:    pushq %r14
+; AVX1-NEXT:    .cfi_def_cfa_offset 32
+; AVX1-NEXT:    pushq %r13
+; AVX1-NEXT:    .cfi_def_cfa_offset 40
+; AVX1-NEXT:    pushq %r12
+; AVX1-NEXT:    .cfi_def_cfa_offset 48
+; AVX1-NEXT:    pushq %rbx
+; AVX1-NEXT:    .cfi_def_cfa_offset 56
+; AVX1-NEXT:    pushq %rax
+; AVX1-NEXT:    .cfi_def_cfa_offset 64
+; AVX1-NEXT:    .cfi_offset %rbx, -56
+; AVX1-NEXT:    .cfi_offset %r12, -48
+; AVX1-NEXT:    .cfi_offset %r13, -40
+; AVX1-NEXT:    .cfi_offset %r14, -32
+; AVX1-NEXT:    .cfi_offset %r15, -24
+; AVX1-NEXT:    .cfi_offset %rbp, -16
+; AVX1-NEXT:    vmovq %xmm1, %rax
+; AVX1-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT:    movq %rax, %rcx
+; AVX1-NEXT:    sarq $63, %rcx
+; AVX1-NEXT:    movq %rcx, (%rsp) # 8-byte Spill
+; AVX1-NEXT:    vpextrq $1, %xmm1, %rax
+; AVX1-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT:    movq %rax, %rcx
+; AVX1-NEXT:    sarq $63, %rcx
+; AVX1-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vmovq %xmm1, %rax
+; AVX1-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT:    movq %rax, %rcx
+; AVX1-NEXT:    sarq $63, %rcx
+; AVX1-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT:    vpextrq $1, %xmm1, %rax
+; AVX1-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT:    movq %rax, %rcx
+; AVX1-NEXT:    sarq $63, %rcx
+; AVX1-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT:    vmovq %xmm0, %rax
+; AVX1-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT:    movq %rax, %rcx
+; AVX1-NEXT:    sarq $63, %rcx
+; AVX1-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT:    vpextrq $1, %xmm0, %rbp
+; AVX1-NEXT:    movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT:    sarq $63, %rbp
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vmovq %xmm0, %rbx
+; AVX1-NEXT:    movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT:    sarq $63, %rbx
+; AVX1-NEXT:    vpextrq $1, %xmm0, %r15
+; AVX1-NEXT:    movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT:    sarq $63, %r15
+; AVX1-NEXT:    vmovq %xmm3, %r9
+; AVX1-NEXT:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT:    sarq $63, %r9
+; AVX1-NEXT:    vpextrq $1, %xmm3, %r10
+; AVX1-NEXT:    movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT:    sarq $63, %r10
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm0
+; AVX1-NEXT:    vmovq %xmm0, %r12
+; AVX1-NEXT:    movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT:    sarq $63, %r12
+; AVX1-NEXT:    vpextrq $1, %xmm0, %r13
+; AVX1-NEXT:    movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT:    sarq $63, %r13
+; AVX1-NEXT:    vmovq %xmm2, %r14
+; AVX1-NEXT:    movq %r14, %rsi
+; AVX1-NEXT:    sarq $63, %rsi
+; AVX1-NEXT:    vpextrq $1, %xmm2, %r11
+; AVX1-NEXT:    movq %r11, %rdx
+; AVX1-NEXT:    sarq $63, %rdx
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm0
+; AVX1-NEXT:    vmovq %xmm0, %r8
+; AVX1-NEXT:    movq %r8, %rdi
+; AVX1-NEXT:    sarq $63, %rdi
+; AVX1-NEXT:    vpextrq $1, %xmm0, %rcx
+; AVX1-NEXT:    movq %rcx, %rax
+; AVX1-NEXT:    sarq $63, %rax
+; AVX1-NEXT:    addq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX1-NEXT:    adcq %r15, %rax
+; AVX1-NEXT:    addq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX1-NEXT:    adcq %rbx, %rdi
+; AVX1-NEXT:    addq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX1-NEXT:    adcq %rbp, %rdx
+; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; AVX1-NEXT:    addq %r14, %r15
+; AVX1-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
+; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; AVX1-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload
+; AVX1-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
+; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; AVX1-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload
+; AVX1-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload
+; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; AVX1-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload
+; AVX1-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Folded Reload
+; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; AVX1-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
+; AVX1-NEXT:    adcq (%rsp), %r9 # 8-byte Folded Reload
+; AVX1-NEXT:    shldq $63, %rcx, %r9
+; AVX1-NEXT:    shldq $63, %r8, %r10
+; AVX1-NEXT:    shldq $63, %r11, %r12
+; AVX1-NEXT:    shldq $63, %rbx, %r13
+; AVX1-NEXT:    shldq $63, %r15, %rsi
+; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; AVX1-NEXT:    shldq $63, %rcx, %rdx
+; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; AVX1-NEXT:    shldq $63, %rcx, %rdi
+; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; AVX1-NEXT:    shldq $63, %rcx, %rax
+; AVX1-NEXT:    vmovq %rax, %xmm0
+; AVX1-NEXT:    vmovq %rdi, %xmm1
+; AVX1-NEXT:    vmovq %rdx, %xmm2
+; AVX1-NEXT:    vmovq %rsi, %xmm3
+; AVX1-NEXT:    vmovq %r13, %xmm4
+; AVX1-NEXT:    vmovq %r12, %xmm5
+; AVX1-NEXT:    vmovq %r10, %xmm6
+; AVX1-NEXT:    vmovq %r9, %xmm7
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm5[0],xmm4[0]
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm7[0],xmm6[0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT:    addq $8, %rsp
+; AVX1-NEXT:    .cfi_def_cfa_offset 56
+; AVX1-NEXT:    popq %rbx
+; AVX1-NEXT:    .cfi_def_cfa_offset 48
+; AVX1-NEXT:    popq %r12
+; AVX1-NEXT:    .cfi_def_cfa_offset 40
+; AVX1-NEXT:    popq %r13
+; AVX1-NEXT:    .cfi_def_cfa_offset 32
+; AVX1-NEXT:    popq %r14
+; AVX1-NEXT:    .cfi_def_cfa_offset 24
+; AVX1-NEXT:    popq %r15
+; AVX1-NEXT:    .cfi_def_cfa_offset 16
+; AVX1-NEXT:    popq %rbp
+; AVX1-NEXT:    .cfi_def_cfa_offset 8
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_ext_v8i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    pushq %rbp
+; AVX2-NEXT:    .cfi_def_cfa_offset 16
+; AVX2-NEXT:    pushq %r15
+; AVX2-NEXT:    .cfi_def_cfa_offset 24
+; AVX2-NEXT:    pushq %r14
+; AVX2-NEXT:    .cfi_def_cfa_offset 32
+; AVX2-NEXT:    pushq %r13
+; AVX2-NEXT:    .cfi_def_cfa_offset 40
+; AVX2-NEXT:    pushq %r12
+; AVX2-NEXT:    .cfi_def_cfa_offset 48
+; AVX2-NEXT:    pushq %rbx
+; AVX2-NEXT:    .cfi_def_cfa_offset 56
+; AVX2-NEXT:    pushq %rax
+; AVX2-NEXT:    .cfi_def_cfa_offset 64
+; AVX2-NEXT:    .cfi_offset %rbx, -56
+; AVX2-NEXT:    .cfi_offset %r12, -48
+; AVX2-NEXT:    .cfi_offset %r13, -40
+; AVX2-NEXT:    .cfi_offset %r14, -32
+; AVX2-NEXT:    .cfi_offset %r15, -24
+; AVX2-NEXT:    .cfi_offset %rbp, -16
+; AVX2-NEXT:    vmovq %xmm1, %rax
+; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    movq %rax, %rcx
+; AVX2-NEXT:    sarq $63, %rcx
+; AVX2-NEXT:    movq %rcx, (%rsp) # 8-byte Spill
+; AVX2-NEXT:    vpextrq $1, %xmm1, %rax
+; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    movq %rax, %rcx
+; AVX2-NEXT:    sarq $63, %rcx
+; AVX2-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm1
+; AVX2-NEXT:    vmovq %xmm1, %rax
+; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    movq %rax, %rcx
+; AVX2-NEXT:    sarq $63, %rcx
+; AVX2-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    vpextrq $1, %xmm1, %rax
+; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    movq %rax, %rcx
+; AVX2-NEXT:    sarq $63, %rcx
+; AVX2-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    vmovq %xmm0, %rax
+; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    movq %rax, %rcx
+; AVX2-NEXT:    sarq $63, %rcx
+; AVX2-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    vpextrq $1, %xmm0, %rbp
+; AVX2-NEXT:    movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    sarq $63, %rbp
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT:    vmovq %xmm0, %rbx
+; AVX2-NEXT:    movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    sarq $63, %rbx
+; AVX2-NEXT:    vpextrq $1, %xmm0, %r15
+; AVX2-NEXT:    movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    sarq $63, %r15
+; AVX2-NEXT:    vmovq %xmm3, %r9
+; AVX2-NEXT:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    sarq $63, %r9
+; AVX2-NEXT:    vpextrq $1, %xmm3, %r10
+; AVX2-NEXT:    movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    sarq $63, %r10
+; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm0
+; AVX2-NEXT:    vmovq %xmm0, %r12
+; AVX2-NEXT:    movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    sarq $63, %r12
+; AVX2-NEXT:    vpextrq $1, %xmm0, %r13
+; AVX2-NEXT:    movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    sarq $63, %r13
+; AVX2-NEXT:    vmovq %xmm2, %r14
+; AVX2-NEXT:    movq %r14, %rsi
+; AVX2-NEXT:    sarq $63, %rsi
+; AVX2-NEXT:    vpextrq $1, %xmm2, %r11
+; AVX2-NEXT:    movq %r11, %rdx
+; AVX2-NEXT:    sarq $63, %rdx
+; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm0
+; AVX2-NEXT:    vmovq %xmm0, %r8
+; AVX2-NEXT:    movq %r8, %rdi
+; AVX2-NEXT:    sarq $63, %rdi
+; AVX2-NEXT:    vpextrq $1, %xmm0, %rcx
+; AVX2-NEXT:    movq %rcx, %rax
+; AVX2-NEXT:    sarq $63, %rax
+; AVX2-NEXT:    addq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX2-NEXT:    adcq %r15, %rax
+; AVX2-NEXT:    addq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX2-NEXT:    adcq %rbx, %rdi
+; AVX2-NEXT:    addq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX2-NEXT:    adcq %rbp, %rdx
+; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; AVX2-NEXT:    addq %r14, %r15
+; AVX2-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
+; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; AVX2-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload
+; AVX2-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
+; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; AVX2-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload
+; AVX2-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload
+; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; AVX2-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload
+; AVX2-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Folded Reload
+; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; AVX2-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
+; AVX2-NEXT:    adcq (%rsp), %r9 # 8-byte Folded Reload
+; AVX2-NEXT:    shldq $63, %rcx, %r9
+; AVX2-NEXT:    shldq $63, %r8, %r10
+; AVX2-NEXT:    shldq $63, %r11, %r12
+; AVX2-NEXT:    shldq $63, %rbx, %r13
+; AVX2-NEXT:    shldq $63, %r15, %rsi
+; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; AVX2-NEXT:    shldq $63, %rcx, %rdx
+; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; AVX2-NEXT:    shldq $63, %rcx, %rdi
+; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; AVX2-NEXT:    shldq $63, %rcx, %rax
+; AVX2-NEXT:    vmovq %rax, %xmm0
+; AVX2-NEXT:    vmovq %rdi, %xmm1
+; AVX2-NEXT:    vmovq %rdx, %xmm2
+; AVX2-NEXT:    vmovq %rsi, %xmm3
+; AVX2-NEXT:    vmovq %r13, %xmm4
+; AVX2-NEXT:    vmovq %r12, %xmm5
+; AVX2-NEXT:    vmovq %r10, %xmm6
+; AVX2-NEXT:    vmovq %r9, %xmm7
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0]
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm5[0],xmm4[0]
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm7[0],xmm6[0]
+; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX2-NEXT:    addq $8, %rsp
+; AVX2-NEXT:    .cfi_def_cfa_offset 56
+; AVX2-NEXT:    popq %rbx
+; AVX2-NEXT:    .cfi_def_cfa_offset 48
+; AVX2-NEXT:    popq %r12
+; AVX2-NEXT:    .cfi_def_cfa_offset 40
+; AVX2-NEXT:    popq %r13
+; AVX2-NEXT:    .cfi_def_cfa_offset 32
+; AVX2-NEXT:    popq %r14
+; AVX2-NEXT:    .cfi_def_cfa_offset 24
+; AVX2-NEXT:    popq %r15
+; AVX2-NEXT:    .cfi_def_cfa_offset 16
+; AVX2-NEXT:    popq %rbp
+; AVX2-NEXT:    .cfi_def_cfa_offset 8
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_ext_v8i64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    pushq %rbp
+; AVX512-NEXT:    .cfi_def_cfa_offset 16
+; AVX512-NEXT:    pushq %r15
+; AVX512-NEXT:    .cfi_def_cfa_offset 24
+; AVX512-NEXT:    pushq %r14
+; AVX512-NEXT:    .cfi_def_cfa_offset 32
+; AVX512-NEXT:    pushq %r13
+; AVX512-NEXT:    .cfi_def_cfa_offset 40
+; AVX512-NEXT:    pushq %r12
+; AVX512-NEXT:    .cfi_def_cfa_offset 48
+; AVX512-NEXT:    pushq %rbx
+; AVX512-NEXT:    .cfi_def_cfa_offset 56
+; AVX512-NEXT:    pushq %rax
+; AVX512-NEXT:    .cfi_def_cfa_offset 64
+; AVX512-NEXT:    .cfi_offset %rbx, -56
+; AVX512-NEXT:    .cfi_offset %r12, -48
+; AVX512-NEXT:    .cfi_offset %r13, -40
+; AVX512-NEXT:    .cfi_offset %r14, -32
+; AVX512-NEXT:    .cfi_offset %r15, -24
+; AVX512-NEXT:    .cfi_offset %rbp, -16
+; AVX512-NEXT:    vmovq %xmm0, %rax
+; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT:    movq %rax, %rcx
+; AVX512-NEXT:    sarq $63, %rcx
+; AVX512-NEXT:    movq %rcx, (%rsp) # 8-byte Spill
+; AVX512-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT:    movq %rax, %rcx
+; AVX512-NEXT:    sarq $63, %rcx
+; AVX512-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX512-NEXT:    vmovq %xmm2, %rax
+; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT:    movq %rax, %rcx
+; AVX512-NEXT:    sarq $63, %rcx
+; AVX512-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT:    vpextrq $1, %xmm2, %rax
+; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT:    movq %rax, %rcx
+; AVX512-NEXT:    sarq $63, %rcx
+; AVX512-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
+; AVX512-NEXT:    vmovq %xmm0, %rax
+; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT:    movq %rax, %rcx
+; AVX512-NEXT:    sarq $63, %rcx
+; AVX512-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT:    vpextrq $1, %xmm0, %r13
+; AVX512-NEXT:    movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT:    sarq $63, %r13
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX512-NEXT:    vmovq %xmm0, %r14
+; AVX512-NEXT:    movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT:    sarq $63, %r14
+; AVX512-NEXT:    vpextrq $1, %xmm0, %r15
+; AVX512-NEXT:    movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT:    sarq $63, %r15
+; AVX512-NEXT:    vmovq %xmm1, %r9
+; AVX512-NEXT:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT:    sarq $63, %r9
+; AVX512-NEXT:    vpextrq $1, %xmm1, %r11
+; AVX512-NEXT:    movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT:    sarq $63, %r11
+; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm0
+; AVX512-NEXT:    vmovq %xmm0, %r12
+; AVX512-NEXT:    movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT:    sarq $63, %r12
+; AVX512-NEXT:    vpextrq $1, %xmm0, %rbp
+; AVX512-NEXT:    movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT:    sarq $63, %rbp
+; AVX512-NEXT:    vextracti64x4 $1, %zmm1, %ymm0
+; AVX512-NEXT:    vmovq %xmm0, %rbx
+; AVX512-NEXT:    movq %rbx, %rsi
+; AVX512-NEXT:    sarq $63, %rsi
+; AVX512-NEXT:    vpextrq $1, %xmm0, %r10
+; AVX512-NEXT:    movq %r10, %rdx
+; AVX512-NEXT:    sarq $63, %rdx
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX512-NEXT:    vmovq %xmm0, %r8
+; AVX512-NEXT:    movq %r8, %rdi
+; AVX512-NEXT:    sarq $63, %rdi
+; AVX512-NEXT:    vpextrq $1, %xmm0, %rcx
+; AVX512-NEXT:    movq %rcx, %rax
+; AVX512-NEXT:    sarq $63, %rax
+; AVX512-NEXT:    addq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX512-NEXT:    adcq %r15, %rax
+; AVX512-NEXT:    addq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX512-NEXT:    adcq %r14, %rdi
+; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; AVX512-NEXT:    addq %r10, %r15
+; AVX512-NEXT:    adcq %r13, %rdx
+; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; AVX512-NEXT:    addq %rbx, %r14
+; AVX512-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
+; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; AVX512-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload
+; AVX512-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload
+; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; AVX512-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Folded Reload
+; AVX512-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload
+; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; AVX512-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload
+; AVX512-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload
+; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; AVX512-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
+; AVX512-NEXT:    adcq (%rsp), %r9 # 8-byte Folded Reload
+; AVX512-NEXT:    shldq $63, %rcx, %r9
+; AVX512-NEXT:    shldq $63, %r8, %r11
+; AVX512-NEXT:    shldq $63, %r10, %r12
+; AVX512-NEXT:    shldq $63, %rbx, %rbp
+; AVX512-NEXT:    shldq $63, %r14, %rsi
+; AVX512-NEXT:    shldq $63, %r15, %rdx
+; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; AVX512-NEXT:    shldq $63, %rcx, %rdi
+; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; AVX512-NEXT:    shldq $63, %rcx, %rax
+; AVX512-NEXT:    vmovq %rax, %xmm0
+; AVX512-NEXT:    vmovq %rdi, %xmm1
+; AVX512-NEXT:    vmovq %rdx, %xmm2
+; AVX512-NEXT:    vmovq %rsi, %xmm3
+; AVX512-NEXT:    vmovq %rbp, %xmm4
+; AVX512-NEXT:    vmovq %r12, %xmm5
+; AVX512-NEXT:    vmovq %r11, %xmm6
+; AVX512-NEXT:    vmovq %r9, %xmm7
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0]
+; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm5[0],xmm4[0]
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm7[0],xmm6[0]
+; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512-NEXT:    addq $8, %rsp
+; AVX512-NEXT:    .cfi_def_cfa_offset 56
+; AVX512-NEXT:    popq %rbx
+; AVX512-NEXT:    .cfi_def_cfa_offset 48
+; AVX512-NEXT:    popq %r12
+; AVX512-NEXT:    .cfi_def_cfa_offset 40
+; AVX512-NEXT:    popq %r13
+; AVX512-NEXT:    .cfi_def_cfa_offset 32
+; AVX512-NEXT:    popq %r14
+; AVX512-NEXT:    .cfi_def_cfa_offset 24
+; AVX512-NEXT:    popq %r15
+; AVX512-NEXT:    .cfi_def_cfa_offset 16
+; AVX512-NEXT:    popq %rbp
+; AVX512-NEXT:    .cfi_def_cfa_offset 8
+; AVX512-NEXT:    retq
+  %x0 = sext <8 x i64> %a0 to <8 x i128>
+  %x1 = sext <8 x i64> %a1 to <8 x i128>
+  %sum = add <8 x i128> %x0, %x1
+  %shift = ashr <8 x i128> %sum, <i128 1, i128 1, i128 1, i128 1, i128 1, i128 1, i128 1, i128 1>
+  %res = trunc <8 x i128> %shift to <8 x i64>
+  ret <8 x i64> %res
+}
+
diff --git a/llvm/test/CodeGen/X86/avgflooru.ll b/llvm/test/CodeGen/X86/avgflooru.ll
new file mode 100644
index 00000000000000..e07c1f55991e84
--- /dev/null
+++ b/llvm/test/CodeGen/X86/avgflooru.ll
@@ -0,0 +1,2629 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE4
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX,AVX512
+
+;
+; 128-bit vectors
+;
+
+define <16 x i8> @test_fixed_v16i8(<16 x i8> %a0, <16 x i8> %a1) {
+; SSE-LABEL: test_fixed_v16i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa %xmm0, %xmm2
+; SSE-NEXT:    pand %xmm1, %xmm2
+; SSE-NEXT:    pxor %xmm1, %xmm0
+; SSE-NEXT:    psrlw $1, %xmm0
+; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE-NEXT:    paddb %xmm2, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: test_fixed_v16i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm0
+; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vpaddb %xmm0, %xmm2, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_fixed_v16i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpsrlw $1, %xmm0, %xmm0
+; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    vpaddb %xmm0, %xmm2, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_fixed_v16i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpsrlw $1, %xmm0, %xmm0
+; AVX512-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
+; AVX512-NEXT:    vpaddb %xmm0, %xmm2, %xmm0
+; AVX512-NEXT:    retq
+  %and = and <16 x i8> %a0, %a1
+  %xor = xor <16 x i8> %a0, %a1
+  %shift = lshr <16 x i8> %xor, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  %res = add <16 x i8> %and, %shift
+  ret <16 x i8> %res
+}
+
+define <16 x i8> @test_ext_v16i8(<16 x i8> %a0, <16 x i8> %a1) {
+; SSE2-LABEL: test_ext_v16i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; SSE2-NEXT:    movdqa %xmm1, %xmm4
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15]
+; SSE2-NEXT:    paddw %xmm3, %xmm4
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE2-NEXT:    paddw %xmm1, %xmm0
+; SSE2-NEXT:    psrlw $1, %xmm4
+; SSE2-NEXT:    psrlw $1, %xmm0
+; SSE2-NEXT:    packuswb %xmm4, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_ext_v16i8:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    pxor %xmm3, %xmm3
+; SSE4-NEXT:    pmovzxbw {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SSE4-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15]
+; SSE4-NEXT:    pmovzxbw {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; SSE4-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15]
+; SSE4-NEXT:    paddw %xmm0, %xmm1
+; SSE4-NEXT:    paddw %xmm4, %xmm2
+; SSE4-NEXT:    psrlw $1, %xmm1
+; SSE4-NEXT:    psrlw $1, %xmm2
+; SSE4-NEXT:    packuswb %xmm1, %xmm2
+; SSE4-NEXT:    movdqa %xmm2, %xmm0
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: test_ext_v16i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
+; AVX1-NEXT:    vpaddw %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX1-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlw $1, %xmm2, %xmm1
+; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm0
+; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_ext_v16i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX2-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrlw $1, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_ext_v16i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpsrlw $1, %ymm0, %ymm0
+; AVX512-NEXT:    vpmovwb %ymm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %x0 = zext <16 x i8> %a0 to <16 x i16>
+  %x1 = zext <16 x i8> %a1 to <16 x i16>
+  %sum = add <16 x i16> %x0, %x1
+  %shift = lshr <16 x i16> %sum, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %res = trunc <16 x i16> %shift to <16 x i8>
+  ret <16 x i8> %res
+}
+
+define <8 x i16> @test_fixed_v8i16(<8 x i16> %a0, <8 x i16> %a1) {
+; SSE-LABEL: test_fixed_v8i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa %xmm0, %xmm2
+; SSE-NEXT:    pand %xmm1, %xmm2
+; SSE-NEXT:    pxor %xmm1, %xmm0
+; SSE-NEXT:    psrlw $1, %xmm0
+; SSE-NEXT:    paddw %xmm2, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test_fixed_v8i16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; AVX-NEXT:    vpxor %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vpsrlw $1, %xmm0, %xmm0
+; AVX-NEXT:    vpaddw %xmm0, %xmm2, %xmm0
+; AVX-NEXT:    retq
+  %and = and <8 x i16> %a0, %a1
+  %xor = xor <8 x i16> %a1, %a0
+  %shift = lshr <8 x i16> %xor, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %res = add <8 x i16> %and, %shift
+  ret <8 x i16> %res
+}
+
+define <8 x i16> @test_ext_v8i16(<8 x i16> %a0, <8 x i16> %a1) {
+; SSE2-LABEL: test_ext_v8i16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-NEXT:    movdqa %xmm1, %xmm4
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
+; SSE2-NEXT:    paddd %xmm3, %xmm4
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSE2-NEXT:    paddd %xmm1, %xmm0
+; SSE2-NEXT:    pslld $15, %xmm4
+; SSE2-NEXT:    psrad $16, %xmm4
+; SSE2-NEXT:    pslld $15, %xmm0
+; SSE2-NEXT:    psrad $16, %xmm0
+; SSE2-NEXT:    packssdw %xmm4, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_ext_v8i16:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    pxor %xmm3, %xmm3
+; SSE4-NEXT:    pmovzxwd {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; SSE4-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
+; SSE4-NEXT:    pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; SSE4-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
+; SSE4-NEXT:    paddd %xmm0, %xmm1
+; SSE4-NEXT:    paddd %xmm4, %xmm2
+; SSE4-NEXT:    psrld $1, %xmm1
+; SSE4-NEXT:    psrld $1, %xmm2
+; SSE4-NEXT:    packusdw %xmm1, %xmm2
+; SSE4-NEXT:    movdqa %xmm2, %xmm0
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: test_ext_v8i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; AVX1-NEXT:    vpaddd %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrld $1, %xmm2, %xmm1
+; AVX1-NEXT:    vpsrld $1, %xmm0, %xmm0
+; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_ext_v8i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrld $1, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_ext_v8i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX512-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpsrld $1, %ymm0, %ymm0
+; AVX512-NEXT:    vpmovdw %ymm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %x0 = zext <8 x i16> %a0 to <8 x i32>
+  %x1 = zext <8 x i16> %a1 to <8 x i32>
+  %sum = add <8 x i32> %x0, %x1
+  %shift = lshr <8 x i32> %sum, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %res = trunc <8 x i32> %shift to <8 x i16>
+  ret <8 x i16> %res
+}
+
+define <4 x i32> @test_fixed_v4i32(<4 x i32> %a0, <4 x i32> %a1) {
+; SSE-LABEL: test_fixed_v4i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa %xmm0, %xmm2
+; SSE-NEXT:    pand %xmm1, %xmm2
+; SSE-NEXT:    pxor %xmm1, %xmm0
+; SSE-NEXT:    psrld $1, %xmm0
+; SSE-NEXT:    paddd %xmm2, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test_fixed_v4i32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; AVX-NEXT:    vpxor %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vpsrld $1, %xmm0, %xmm0
+; AVX-NEXT:    vpaddd %xmm0, %xmm2, %xmm0
+; AVX-NEXT:    retq
+  %and = and <4 x i32> %a0, %a1
+  %xor = xor <4 x i32> %a1, %a0
+  %shift = lshr <4 x i32> %xor, <i32 1, i32 1, i32 1, i32 1>
+  %res = add <4 x i32> %and, %shift
+  ret <4 x i32> %res
+}
+
+define <4 x i32> @test_ext_v4i32(<4 x i32> %a0, <4 x i32> %a1) {
+; SSE2-LABEL: test_ext_v4i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT:    movdqa %xmm1, %xmm4
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm2[2],xmm4[3],xmm2[3]
+; SSE2-NEXT:    paddq %xmm3, %xmm4
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE2-NEXT:    paddq %xmm1, %xmm0
+; SSE2-NEXT:    psrlq $1, %xmm4
+; SSE2-NEXT:    psrlq $1, %xmm0
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm4[0,2]
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_ext_v4i32:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    pxor %xmm3, %xmm3
+; SSE4-NEXT:    pmovzxdq {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero
+; SSE4-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; SSE4-NEXT:    pmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero
+; SSE4-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; SSE4-NEXT:    paddq %xmm0, %xmm1
+; SSE4-NEXT:    paddq %xmm4, %xmm2
+; SSE4-NEXT:    psrlq $1, %xmm1
+; SSE4-NEXT:    psrlq $1, %xmm2
+; SSE4-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[0,2]
+; SSE4-NEXT:    movaps %xmm2, %xmm0
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: test_ext_v4i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; AVX1-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlq $1, %xmm2, %xmm1
+; AVX1-NEXT:    vpsrlq $1, %xmm0, %xmm0
+; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_ext_v4i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; AVX2-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrlq $1, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_ext_v4i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX512-NEXT:    vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; AVX512-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpsrlq $1, %ymm0, %ymm0
+; AVX512-NEXT:    vpmovqd %ymm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %x0 = zext <4 x i32> %a0 to <4 x i64>
+  %x1 = zext <4 x i32> %a1 to <4 x i64>
+  %sum = add <4 x i64> %x0, %x1
+  %shift = lshr <4 x i64> %sum, <i64 1, i64 1, i64 1, i64 1>
+  %res = trunc <4 x i64> %shift to <4 x i32>
+  ret <4 x i32> %res
+}
+
+define <2 x i64> @test_fixed_v2i64(<2 x i64> %a0, <2 x i64> %a1) {
+; SSE-LABEL: test_fixed_v2i64:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa %xmm0, %xmm2
+; SSE-NEXT:    pand %xmm1, %xmm2
+; SSE-NEXT:    pxor %xmm1, %xmm0
+; SSE-NEXT:    psrlq $1, %xmm0
+; SSE-NEXT:    paddq %xmm2, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test_fixed_v2i64:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; AVX-NEXT:    vpxor %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vpsrlq $1, %xmm0, %xmm0
+; AVX-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
+; AVX-NEXT:    retq
+  %and = and <2 x i64> %a0, %a1
+  %xor = xor <2 x i64> %a1, %a0
+  %shift = lshr <2 x i64> %xor, <i64 1, i64 1>
+  %res = add <2 x i64> %and, %shift
+  ret <2 x i64> %res
+}
+
+define <2 x i64> @test_ext_v2i64(<2 x i64> %a0, <2 x i64> %a1) {
+; SSE2-LABEL: test_ext_v2i64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
+; SSE2-NEXT:    movq %xmm2, %rax
+; SSE2-NEXT:    movq %xmm0, %rcx
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE2-NEXT:    movq %xmm0, %rdx
+; SSE2-NEXT:    movq %xmm1, %rsi
+; SSE2-NEXT:    xorl %edi, %edi
+; SSE2-NEXT:    addq %rcx, %rsi
+; SSE2-NEXT:    setb %dil
+; SSE2-NEXT:    xorl %ecx, %ecx
+; SSE2-NEXT:    addq %rax, %rdx
+; SSE2-NEXT:    setb %cl
+; SSE2-NEXT:    shldq $63, %rdx, %rcx
+; SSE2-NEXT:    shldq $63, %rsi, %rdi
+; SSE2-NEXT:    movq %rdi, %xmm0
+; SSE2-NEXT:    movq %rcx, %xmm1
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_ext_v2i64:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    movq %xmm0, %rax
+; SSE4-NEXT:    pextrq $1, %xmm0, %rcx
+; SSE4-NEXT:    movq %xmm1, %rdx
+; SSE4-NEXT:    pextrq $1, %xmm1, %rsi
+; SSE4-NEXT:    xorl %edi, %edi
+; SSE4-NEXT:    addq %rcx, %rsi
+; SSE4-NEXT:    setb %dil
+; SSE4-NEXT:    xorl %ecx, %ecx
+; SSE4-NEXT:    addq %rax, %rdx
+; SSE4-NEXT:    setb %cl
+; SSE4-NEXT:    shldq $63, %rdx, %rcx
+; SSE4-NEXT:    shldq $63, %rsi, %rdi
+; SSE4-NEXT:    movq %rdi, %xmm1
+; SSE4-NEXT:    movq %rcx, %xmm0
+; SSE4-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: test_ext_v2i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovq %xmm0, %rax
+; AVX1-NEXT:    vpextrq $1, %xmm0, %rcx
+; AVX1-NEXT:    vmovq %xmm1, %rdx
+; AVX1-NEXT:    vpextrq $1, %xmm1, %rsi
+; AVX1-NEXT:    xorl %edi, %edi
+; AVX1-NEXT:    addq %rcx, %rsi
+; AVX1-NEXT:    setb %dil
+; AVX1-NEXT:    xorl %ecx, %ecx
+; AVX1-NEXT:    addq %rax, %rdx
+; AVX1-NEXT:    setb %cl
+; AVX1-NEXT:    shldq $63, %rdx, %rcx
+; AVX1-NEXT:    shldq $63, %rsi, %rdi
+; AVX1-NEXT:    vmovq %rdi, %xmm0
+; AVX1-NEXT:    vmovq %rcx, %xmm1
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_ext_v2i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovq %xmm0, %rax
+; AVX2-NEXT:    vpextrq $1, %xmm0, %rcx
+; AVX2-NEXT:    vmovq %xmm1, %rdx
+; AVX2-NEXT:    vpextrq $1, %xmm1, %rsi
+; AVX2-NEXT:    xorl %edi, %edi
+; AVX2-NEXT:    addq %rcx, %rsi
+; AVX2-NEXT:    setb %dil
+; AVX2-NEXT:    xorl %ecx, %ecx
+; AVX2-NEXT:    addq %rax, %rdx
+; AVX2-NEXT:    setb %cl
+; AVX2-NEXT:    shldq $63, %rdx, %rcx
+; AVX2-NEXT:    shldq $63, %rsi, %rdi
+; AVX2-NEXT:    vmovq %rdi, %xmm0
+; AVX2-NEXT:    vmovq %rcx, %xmm1
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_ext_v2i64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vmovq %xmm0, %rax
+; AVX512-NEXT:    vpextrq $1, %xmm0, %rcx
+; AVX512-NEXT:    vpextrq $1, %xmm1, %rdx
+; AVX512-NEXT:    vmovq %xmm1, %rsi
+; AVX512-NEXT:    xorl %edi, %edi
+; AVX512-NEXT:    addq %rcx, %rdx
+; AVX512-NEXT:    setb %dil
+; AVX512-NEXT:    xorl %ecx, %ecx
+; AVX512-NEXT:    addq %rax, %rsi
+; AVX512-NEXT:    setb %cl
+; AVX512-NEXT:    shldq $63, %rsi, %rcx
+; AVX512-NEXT:    shldq $63, %rdx, %rdi
+; AVX512-NEXT:    vmovq %rdi, %xmm0
+; AVX512-NEXT:    vmovq %rcx, %xmm1
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512-NEXT:    retq
+  %x0 = zext <2 x i64> %a0 to <2 x i128>
+  %x1 = zext <2 x i64> %a1 to <2 x i128>
+  %sum = add <2 x i128> %x0, %x1
+  %shift = lshr <2 x i128> %sum, <i128 1, i128 1>
+  %res = trunc <2 x i128> %shift to <2 x i64>
+  ret <2 x i64> %res
+}
+
+;
+; 256-bit vectors
+;
+
+define <32 x i8> @test_fixed_v32i8(<32 x i8> %a0, <32 x i8> %a1) {
+; SSE-LABEL: test_fixed_v32i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa %xmm1, %xmm4
+; SSE-NEXT:    pand %xmm3, %xmm4
+; SSE-NEXT:    movdqa %xmm0, %xmm5
+; SSE-NEXT:    pand %xmm2, %xmm5
+; SSE-NEXT:    pxor %xmm2, %xmm0
+; SSE-NEXT:    pxor %xmm3, %xmm1
+; SSE-NEXT:    psrlw $1, %xmm1
+; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; SSE-NEXT:    pand %xmm2, %xmm1
+; SSE-NEXT:    paddb %xmm4, %xmm1
+; SSE-NEXT:    psrlw $1, %xmm0
+; SSE-NEXT:    pand %xmm2, %xmm0
+; SSE-NEXT:    paddb %xmm5, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: test_fixed_v32i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm2
+; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm1
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm3 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
+; AVX1-NEXT:    vpaddb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT:    vpaddb %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_fixed_v32i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
+; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrlw $1, %ymm0, %ymm0
+; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT:    vpaddb %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_fixed_v32i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm2
+; AVX512-NEXT:    vpxor %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpsrlw $1, %ymm0, %ymm0
+; AVX512-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
+; AVX512-NEXT:    vpaddb %ymm0, %ymm2, %ymm0
+; AVX512-NEXT:    retq
+  %and = and <32 x i8> %a0, %a1
+  %xor = xor <32 x i8> %a0, %a1
+  %shift = lshr <32 x i8> %xor, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  %res = add <32 x i8> %and, %shift
+  ret <32 x i8> %res
+}
+
+define <32 x i8> @test_ext_v32i8(<32 x i8> %a0, <32 x i8> %a1) {
+; SSE2-LABEL: test_ext_v32i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pxor %xmm4, %xmm4
+; SSE2-NEXT:    movdqa %xmm1, %xmm5
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
+; SSE2-NEXT:    movdqa %xmm0, %xmm6
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm4[8],xmm6[9],xmm4[9],xmm6[10],xmm4[10],xmm6[11],xmm4[11],xmm6[12],xmm4[12],xmm6[13],xmm4[13],xmm6[14],xmm4[14],xmm6[15],xmm4[15]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
+; SSE2-NEXT:    movdqa %xmm3, %xmm7
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15]
+; SSE2-NEXT:    paddw %xmm5, %xmm7
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; SSE2-NEXT:    paddw %xmm3, %xmm1
+; SSE2-NEXT:    movdqa %xmm2, %xmm3
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15]
+; SSE2-NEXT:    paddw %xmm6, %xmm3
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
+; SSE2-NEXT:    paddw %xmm2, %xmm0
+; SSE2-NEXT:    psrlw $1, %xmm7
+; SSE2-NEXT:    psrlw $1, %xmm1
+; SSE2-NEXT:    packuswb %xmm7, %xmm1
+; SSE2-NEXT:    psrlw $1, %xmm3
+; SSE2-NEXT:    psrlw $1, %xmm0
+; SSE2-NEXT:    packuswb %xmm3, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_ext_v32i8:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    pxor %xmm5, %xmm5
+; SSE4-NEXT:    pmovzxbw {{.*#+}} xmm6 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; SSE4-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15]
+; SSE4-NEXT:    pmovzxbw {{.*#+}} xmm7 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SSE4-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15]
+; SSE4-NEXT:    pmovzxbw {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
+; SSE4-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15]
+; SSE4-NEXT:    paddw %xmm1, %xmm3
+; SSE4-NEXT:    pmovzxbw {{.*#+}} xmm1 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
+; SSE4-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15]
+; SSE4-NEXT:    paddw %xmm0, %xmm2
+; SSE4-NEXT:    paddw %xmm6, %xmm4
+; SSE4-NEXT:    paddw %xmm7, %xmm1
+; SSE4-NEXT:    psrlw $1, %xmm3
+; SSE4-NEXT:    psrlw $1, %xmm2
+; SSE4-NEXT:    psrlw $1, %xmm4
+; SSE4-NEXT:    packuswb %xmm3, %xmm4
+; SSE4-NEXT:    psrlw $1, %xmm1
+; SSE4-NEXT:    packuswb %xmm2, %xmm1
+; SSE4-NEXT:    movdqa %xmm1, %xmm0
+; SSE4-NEXT:    movdqa %xmm4, %xmm1
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: test_ext_v32i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm5 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15]
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm6 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
+; AVX1-NEXT:    vpaddw %xmm6, %xmm3, %xmm3
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm6
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm6[8],xmm2[8],xmm6[9],xmm2[9],xmm6[10],xmm2[10],xmm6[11],xmm2[11],xmm6[12],xmm2[12],xmm6[13],xmm2[13],xmm6[14],xmm2[14],xmm6[15],xmm2[15]
+; AVX1-NEXT:    vpaddw %xmm2, %xmm5, %xmm2
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX1-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero
+; AVX1-NEXT:    vpaddw %xmm1, %xmm4, %xmm1
+; AVX1-NEXT:    vpsrlw $1, %xmm3, %xmm3
+; AVX1-NEXT:    vpsrlw $1, %xmm2, %xmm2
+; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm0
+; AVX1-NEXT:    vpackuswb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlw $1, %xmm1, %xmm1
+; AVX1-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_ext_v32i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
+; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm3
+; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero
+; AVX2-NEXT:    vpaddw %ymm3, %ymm2, %ymm2
+; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX2-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrlw $1, %ymm2, %ymm1
+; AVX2-NEXT:    vpsrlw $1, %ymm0, %ymm0
+; AVX2-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_ext_v32i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512-NEXT:    vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
+; AVX512-NEXT:    vpaddw %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpsrlw $1, %zmm0, %zmm0
+; AVX512-NEXT:    vpmovwb %zmm0, %ymm0
+; AVX512-NEXT:    retq
+  %x0 = zext <32 x i8> %a0 to <32 x i16>
+  %x1 = zext <32 x i8> %a1 to <32 x i16>
+  %sum = add <32 x i16> %x0, %x1
+  %shift = lshr <32 x i16> %sum, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %res = trunc <32 x i16> %shift to <32 x i8>
+  ret <32 x i8> %res
+}
+
+define <16 x i16> @test_fixed_v16i16(<16 x i16> %a0, <16 x i16> %a1) {
+; SSE-LABEL: test_fixed_v16i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa %xmm1, %xmm4
+; SSE-NEXT:    pand %xmm3, %xmm4
+; SSE-NEXT:    movdqa %xmm0, %xmm5
+; SSE-NEXT:    pand %xmm2, %xmm5
+; SSE-NEXT:    pxor %xmm2, %xmm0
+; SSE-NEXT:    pxor %xmm3, %xmm1
+; SSE-NEXT:    psrlw $1, %xmm1
+; SSE-NEXT:    paddw %xmm4, %xmm1
+; SSE-NEXT:    psrlw $1, %xmm0
+; SSE-NEXT:    paddw %xmm5, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: test_fixed_v16i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm2
+; AVX1-NEXT:    vxorps %ymm0, %ymm1, %ymm0
+; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
+; AVX1-NEXT:    vpaddw %xmm0, %xmm3, %xmm0
+; AVX1-NEXT:    vpaddw %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_fixed_v16i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
+; AVX2-NEXT:    vpxor %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpsrlw $1, %ymm0, %ymm0
+; AVX2-NEXT:    vpaddw %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_fixed_v16i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm2
+; AVX512-NEXT:    vpxor %ymm0, %ymm1, %ymm0
+; AVX512-NEXT:    vpsrlw $1, %ymm0, %ymm0
+; AVX512-NEXT:    vpaddw %ymm0, %ymm2, %ymm0
+; AVX512-NEXT:    retq
+  %and = and <16 x i16> %a0, %a1
+  %xor = xor <16 x i16> %a1, %a0
+  %shift = lshr <16 x i16> %xor, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %res = add <16 x i16> %and, %shift
+  ret <16 x i16> %res
+}
+
+define <16 x i16> @test_ext_v16i16(<16 x i16> %a0, <16 x i16> %a1) {
+; SSE2-LABEL: test_ext_v16i16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pxor %xmm4, %xmm4
+; SSE2-NEXT:    movdqa %xmm0, %xmm5
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
+; SSE2-NEXT:    movdqa %xmm1, %xmm6
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
+; SSE2-NEXT:    movdqa %xmm2, %xmm7
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7]
+; SSE2-NEXT:    paddd %xmm5, %xmm7
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
+; SSE2-NEXT:    paddd %xmm2, %xmm0
+; SSE2-NEXT:    movdqa %xmm3, %xmm2
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
+; SSE2-NEXT:    paddd %xmm6, %xmm2
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
+; SSE2-NEXT:    paddd %xmm3, %xmm1
+; SSE2-NEXT:    pslld $15, %xmm7
+; SSE2-NEXT:    psrad $16, %xmm7
+; SSE2-NEXT:    pslld $15, %xmm0
+; SSE2-NEXT:    psrad $16, %xmm0
+; SSE2-NEXT:    packssdw %xmm7, %xmm0
+; SSE2-NEXT:    pslld $15, %xmm2
+; SSE2-NEXT:    psrad $16, %xmm2
+; SSE2-NEXT:    pslld $15, %xmm1
+; SSE2-NEXT:    psrad $16, %xmm1
+; SSE2-NEXT:    packssdw %xmm2, %xmm1
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_ext_v16i16:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    pxor %xmm5, %xmm5
+; SSE4-NEXT:    pmovzxwd {{.*#+}} xmm6 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; SSE4-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7]
+; SSE4-NEXT:    pmovzxwd {{.*#+}} xmm7 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; SSE4-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7]
+; SSE4-NEXT:    pmovzxwd {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
+; SSE4-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
+; SSE4-NEXT:    paddd %xmm1, %xmm3
+; SSE4-NEXT:    pmovzxwd {{.*#+}} xmm1 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
+; SSE4-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7]
+; SSE4-NEXT:    paddd %xmm0, %xmm2
+; SSE4-NEXT:    paddd %xmm6, %xmm4
+; SSE4-NEXT:    paddd %xmm7, %xmm1
+; SSE4-NEXT:    psrld $1, %xmm3
+; SSE4-NEXT:    psrld $1, %xmm2
+; SSE4-NEXT:    psrld $1, %xmm4
+; SSE4-NEXT:    packusdw %xmm3, %xmm4
+; SSE4-NEXT:    psrld $1, %xmm1
+; SSE4-NEXT:    packusdw %xmm2, %xmm1
+; SSE4-NEXT:    movdqa %xmm1, %xmm0
+; SSE4-NEXT:    movdqa %xmm4, %xmm1
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: test_ext_v16i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm5 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
+; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm6 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; AVX1-NEXT:    vpaddd %xmm6, %xmm3, %xmm3
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm6
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
+; AVX1-NEXT:    vpaddd %xmm2, %xmm5, %xmm2
+; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero
+; AVX1-NEXT:    vpaddd %xmm1, %xmm4, %xmm1
+; AVX1-NEXT:    vpsrld $1, %xmm3, %xmm3
+; AVX1-NEXT:    vpsrld $1, %xmm2, %xmm2
+; AVX1-NEXT:    vpsrld $1, %xmm0, %xmm0
+; AVX1-NEXT:    vpackusdw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrld $1, %xmm1, %xmm1
+; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_ext_v16i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
+; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm3
+; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
+; AVX2-NEXT:    vpaddd %ymm3, %ymm2, %ymm2
+; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrld $1, %ymm2, %ymm1
+; AVX2-NEXT:    vpsrld $1, %ymm0, %ymm0
+; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_ext_v16i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpsrld $1, %zmm0, %zmm0
+; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
+; AVX512-NEXT:    retq
+  %x0 = zext <16 x i16> %a0 to <16 x i32>
+  %x1 = zext <16 x i16> %a1 to <16 x i32>
+  %sum = add <16 x i32> %x0, %x1
+  %shift = lshr <16 x i32> %sum, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %res = trunc <16 x i32> %shift to <16 x i16>
+  ret <16 x i16> %res
+}
+
+define <8 x i32> @test_fixed_v8i32(<8 x i32> %a0, <8 x i32> %a1) {
+; SSE-LABEL: test_fixed_v8i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa %xmm1, %xmm4
+; SSE-NEXT:    pand %xmm3, %xmm4
+; SSE-NEXT:    movdqa %xmm0, %xmm5
+; SSE-NEXT:    pand %xmm2, %xmm5
+; SSE-NEXT:    pxor %xmm2, %xmm0
+; SSE-NEXT:    pxor %xmm3, %xmm1
+; SSE-NEXT:    psrld $1, %xmm1
+; SSE-NEXT:    paddd %xmm4, %xmm1
+; SSE-NEXT:    psrld $1, %xmm0
+; SSE-NEXT:    paddd %xmm5, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: test_fixed_v8i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm2
+; AVX1-NEXT:    vxorps %ymm0, %ymm1, %ymm0
+; AVX1-NEXT:    vpsrld $1, %xmm0, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpsrld $1, %xmm0, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
+; AVX1-NEXT:    vpaddd %xmm0, %xmm3, %xmm0
+; AVX1-NEXT:    vpaddd %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_fixed_v8i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
+; AVX2-NEXT:    vpxor %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpsrld $1, %ymm0, %ymm0
+; AVX2-NEXT:    vpaddd %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_fixed_v8i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm2
+; AVX512-NEXT:    vpxor %ymm0, %ymm1, %ymm0
+; AVX512-NEXT:    vpsrld $1, %ymm0, %ymm0
+; AVX512-NEXT:    vpaddd %ymm0, %ymm2, %ymm0
+; AVX512-NEXT:    retq
+  %and = and <8 x i32> %a0, %a1
+  %xor = xor <8 x i32> %a1, %a0
+  %shift = lshr <8 x i32> %xor, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %res = add <8 x i32> %and, %shift
+  ret <8 x i32> %res
+}
+
+define <8 x i32> @test_ext_v8i32(<8 x i32> %a0, <8 x i32> %a1) {
+; SSE2-LABEL: test_ext_v8i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pxor %xmm4, %xmm4
+; SSE2-NEXT:    movdqa %xmm1, %xmm5
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
+; SSE2-NEXT:    movdqa %xmm0, %xmm6
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm4[2],xmm6[3],xmm4[3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
+; SSE2-NEXT:    movdqa %xmm3, %xmm7
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm4[2],xmm7[3],xmm4[3]
+; SSE2-NEXT:    paddq %xmm5, %xmm7
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
+; SSE2-NEXT:    paddq %xmm3, %xmm1
+; SSE2-NEXT:    movdqa %xmm2, %xmm3
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3]
+; SSE2-NEXT:    paddq %xmm6, %xmm3
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
+; SSE2-NEXT:    paddq %xmm2, %xmm0
+; SSE2-NEXT:    psrlq $1, %xmm7
+; SSE2-NEXT:    psrlq $1, %xmm1
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm7[0,2]
+; SSE2-NEXT:    psrlq $1, %xmm3
+; SSE2-NEXT:    psrlq $1, %xmm0
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2]
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_ext_v8i32:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    pxor %xmm5, %xmm5
+; SSE4-NEXT:    pmovzxdq {{.*#+}} xmm6 = xmm1[0],zero,xmm1[1],zero
+; SSE4-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm5[2],xmm1[3],xmm5[3]
+; SSE4-NEXT:    pmovzxdq {{.*#+}} xmm7 = xmm0[0],zero,xmm0[1],zero
+; SSE4-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3]
+; SSE4-NEXT:    pmovzxdq {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero
+; SSE4-NEXT:    punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm5[2],xmm3[3],xmm5[3]
+; SSE4-NEXT:    paddq %xmm1, %xmm3
+; SSE4-NEXT:    pmovzxdq {{.*#+}} xmm1 = xmm2[0],zero,xmm2[1],zero
+; SSE4-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm5[2],xmm2[3],xmm5[3]
+; SSE4-NEXT:    paddq %xmm0, %xmm2
+; SSE4-NEXT:    paddq %xmm6, %xmm4
+; SSE4-NEXT:    paddq %xmm7, %xmm1
+; SSE4-NEXT:    psrlq $1, %xmm3
+; SSE4-NEXT:    psrlq $1, %xmm2
+; SSE4-NEXT:    psrlq $1, %xmm4
+; SSE4-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,2],xmm3[0,2]
+; SSE4-NEXT:    psrlq $1, %xmm1
+; SSE4-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; SSE4-NEXT:    movaps %xmm1, %xmm0
+; SSE4-NEXT:    movaps %xmm4, %xmm1
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: test_ext_v8i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm4 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm5 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm6
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm7 = xmm6[2],xmm3[2],xmm6[3],xmm3[3]
+; AVX1-NEXT:    vpaddq %xmm7, %xmm4, %xmm4
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; AVX1-NEXT:    vpaddq %xmm3, %xmm5, %xmm3
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm5 = xmm6[0],zero,xmm6[1],zero
+; AVX1-NEXT:    vpaddq %xmm5, %xmm2, %xmm2
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlq $1, %xmm4, %xmm1
+; AVX1-NEXT:    vpsrlq $1, %xmm3, %xmm3
+; AVX1-NEXT:    vpsrlq $1, %xmm2, %xmm2
+; AVX1-NEXT:    vpsrlq $1, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm3, %ymm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_ext_v8i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm3
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
+; AVX2-NEXT:    vpaddq %ymm3, %ymm2, %ymm2
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; AVX2-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3],ymm2[2,3]
+; AVX2-NEXT:    vpsrlq $1, %ymm1, %ymm1
+; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrlq $1, %ymm0, %ymm0
+; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_ext_v8i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
+; AVX512-NEXT:    vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
+; AVX512-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpsrlq $1, %zmm0, %zmm0
+; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
+; AVX512-NEXT:    retq
+  %x0 = zext <8 x i32> %a0 to <8 x i64>
+  %x1 = zext <8 x i32> %a1 to <8 x i64>
+  %sum = add <8 x i64> %x0, %x1
+  %shift = lshr <8 x i64> %sum, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
+  %res = trunc <8 x i64> %shift to <8 x i32>
+  ret <8 x i32> %res
+}
+
+define <4 x i64> @test_fixed_v4i64(<4 x i64> %a0, <4 x i64> %a1) {
+; SSE-LABEL: test_fixed_v4i64:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa %xmm1, %xmm4
+; SSE-NEXT:    pand %xmm3, %xmm4
+; SSE-NEXT:    movdqa %xmm0, %xmm5
+; SSE-NEXT:    pand %xmm2, %xmm5
+; SSE-NEXT:    pxor %xmm2, %xmm0
+; SSE-NEXT:    pxor %xmm3, %xmm1
+; SSE-NEXT:    psrlq $1, %xmm1
+; SSE-NEXT:    paddq %xmm4, %xmm1
+; SSE-NEXT:    psrlq $1, %xmm0
+; SSE-NEXT:    paddq %xmm5, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: test_fixed_v4i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm2
+; AVX1-NEXT:    vxorps %ymm0, %ymm1, %ymm0
+; AVX1-NEXT:    vpsrlq $1, %xmm0, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpsrlq $1, %xmm0, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
+; AVX1-NEXT:    vpaddq %xmm0, %xmm3, %xmm0
+; AVX1-NEXT:    vpaddq %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_fixed_v4i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
+; AVX2-NEXT:    vpxor %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpsrlq $1, %ymm0, %ymm0
+; AVX2-NEXT:    vpaddq %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_fixed_v4i64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm2
+; AVX512-NEXT:    vpxor %ymm0, %ymm1, %ymm0
+; AVX512-NEXT:    vpsrlq $1, %ymm0, %ymm0
+; AVX512-NEXT:    vpaddq %ymm0, %ymm2, %ymm0
+; AVX512-NEXT:    retq
+  %and = and <4 x i64> %a0, %a1
+  %xor = xor <4 x i64> %a1, %a0
+  %shift = lshr <4 x i64> %xor, <i64 1, i64 1, i64 1, i64 1>
+  %res = add <4 x i64> %and, %shift
+  ret <4 x i64> %res
+}
+
+define <4 x i64> @test_ext_v4i64(<4 x i64> %a0, <4 x i64> %a1) {
+; SSE2-LABEL: test_ext_v4i64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[2,3,2,3]
+; SSE2-NEXT:    movq %xmm4, %rdi
+; SSE2-NEXT:    movq %xmm1, %r9
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; SSE2-NEXT:    movq %xmm1, %r10
+; SSE2-NEXT:    movq %xmm0, %r11
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3]
+; SSE2-NEXT:    movq %xmm0, %r8
+; SSE2-NEXT:    movq %xmm3, %rsi
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
+; SSE2-NEXT:    movq %xmm0, %rdx
+; SSE2-NEXT:    movq %xmm2, %rax
+; SSE2-NEXT:    xorl %ecx, %ecx
+; SSE2-NEXT:    addq %r11, %rax
+; SSE2-NEXT:    setb %cl
+; SSE2-NEXT:    xorl %r11d, %r11d
+; SSE2-NEXT:    addq %r10, %rdx
+; SSE2-NEXT:    setb %r11b
+; SSE2-NEXT:    xorl %r10d, %r10d
+; SSE2-NEXT:    addq %r9, %rsi
+; SSE2-NEXT:    setb %r10b
+; SSE2-NEXT:    xorl %r9d, %r9d
+; SSE2-NEXT:    addq %rdi, %r8
+; SSE2-NEXT:    setb %r9b
+; SSE2-NEXT:    shldq $63, %r8, %r9
+; SSE2-NEXT:    shldq $63, %rsi, %r10
+; SSE2-NEXT:    shldq $63, %rdx, %r11
+; SSE2-NEXT:    shldq $63, %rax, %rcx
+; SSE2-NEXT:    movq %rcx, %xmm0
+; SSE2-NEXT:    movq %r11, %xmm1
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT:    movq %r10, %xmm1
+; SSE2-NEXT:    movq %r9, %xmm2
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_ext_v4i64:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    movq %xmm1, %r8
+; SSE4-NEXT:    pextrq $1, %xmm1, %r9
+; SSE4-NEXT:    movq %xmm0, %r10
+; SSE4-NEXT:    pextrq $1, %xmm0, %r11
+; SSE4-NEXT:    movq %xmm3, %rdi
+; SSE4-NEXT:    pextrq $1, %xmm3, %rsi
+; SSE4-NEXT:    movq %xmm2, %rdx
+; SSE4-NEXT:    pextrq $1, %xmm2, %rax
+; SSE4-NEXT:    xorl %ecx, %ecx
+; SSE4-NEXT:    addq %r11, %rax
+; SSE4-NEXT:    setb %cl
+; SSE4-NEXT:    xorl %r11d, %r11d
+; SSE4-NEXT:    addq %r10, %rdx
+; SSE4-NEXT:    setb %r11b
+; SSE4-NEXT:    xorl %r10d, %r10d
+; SSE4-NEXT:    addq %r9, %rsi
+; SSE4-NEXT:    setb %r10b
+; SSE4-NEXT:    xorl %r9d, %r9d
+; SSE4-NEXT:    addq %r8, %rdi
+; SSE4-NEXT:    setb %r9b
+; SSE4-NEXT:    shldq $63, %rdi, %r9
+; SSE4-NEXT:    shldq $63, %rsi, %r10
+; SSE4-NEXT:    shldq $63, %rdx, %r11
+; SSE4-NEXT:    shldq $63, %rax, %rcx
+; SSE4-NEXT:    movq %rcx, %xmm1
+; SSE4-NEXT:    movq %r11, %xmm0
+; SSE4-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE4-NEXT:    movq %r10, %xmm2
+; SSE4-NEXT:    movq %r9, %xmm1
+; SSE4-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: test_ext_v4i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovq %xmm0, %rdi
+; AVX1-NEXT:    vpextrq $1, %xmm0, %r9
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vmovq %xmm0, %r10
+; AVX1-NEXT:    vpextrq $1, %xmm0, %r11
+; AVX1-NEXT:    vmovq %xmm1, %r8
+; AVX1-NEXT:    vpextrq $1, %xmm1, %rsi
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm0
+; AVX1-NEXT:    vmovq %xmm0, %rdx
+; AVX1-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX1-NEXT:    xorl %ecx, %ecx
+; AVX1-NEXT:    addq %r11, %rax
+; AVX1-NEXT:    setb %cl
+; AVX1-NEXT:    xorl %r11d, %r11d
+; AVX1-NEXT:    addq %r10, %rdx
+; AVX1-NEXT:    setb %r11b
+; AVX1-NEXT:    xorl %r10d, %r10d
+; AVX1-NEXT:    addq %r9, %rsi
+; AVX1-NEXT:    setb %r10b
+; AVX1-NEXT:    xorl %r9d, %r9d
+; AVX1-NEXT:    addq %rdi, %r8
+; AVX1-NEXT:    setb %r9b
+; AVX1-NEXT:    shldq $63, %r8, %r9
+; AVX1-NEXT:    shldq $63, %rsi, %r10
+; AVX1-NEXT:    shldq $63, %rdx, %r11
+; AVX1-NEXT:    shldq $63, %rax, %rcx
+; AVX1-NEXT:    vmovq %rcx, %xmm0
+; AVX1-NEXT:    vmovq %r11, %xmm1
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT:    vmovq %r10, %xmm1
+; AVX1-NEXT:    vmovq %r9, %xmm2
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_ext_v4i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovq %xmm0, %rdi
+; AVX2-NEXT:    vpextrq $1, %xmm0, %r9
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT:    vmovq %xmm0, %r10
+; AVX2-NEXT:    vpextrq $1, %xmm0, %r11
+; AVX2-NEXT:    vmovq %xmm1, %r8
+; AVX2-NEXT:    vpextrq $1, %xmm1, %rsi
+; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm0
+; AVX2-NEXT:    vmovq %xmm0, %rdx
+; AVX2-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX2-NEXT:    xorl %ecx, %ecx
+; AVX2-NEXT:    addq %r11, %rax
+; AVX2-NEXT:    setb %cl
+; AVX2-NEXT:    xorl %r11d, %r11d
+; AVX2-NEXT:    addq %r10, %rdx
+; AVX2-NEXT:    setb %r11b
+; AVX2-NEXT:    xorl %r10d, %r10d
+; AVX2-NEXT:    addq %r9, %rsi
+; AVX2-NEXT:    setb %r10b
+; AVX2-NEXT:    xorl %r9d, %r9d
+; AVX2-NEXT:    addq %rdi, %r8
+; AVX2-NEXT:    setb %r9b
+; AVX2-NEXT:    shldq $63, %r8, %r9
+; AVX2-NEXT:    shldq $63, %rsi, %r10
+; AVX2-NEXT:    shldq $63, %rdx, %r11
+; AVX2-NEXT:    shldq $63, %rax, %rcx
+; AVX2-NEXT:    vmovq %rcx, %xmm0
+; AVX2-NEXT:    vmovq %r11, %xmm1
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX2-NEXT:    vmovq %r10, %xmm1
+; AVX2-NEXT:    vmovq %r9, %xmm2
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_ext_v4i64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vmovq %xmm0, %rsi
+; AVX512-NEXT:    vpextrq $1, %xmm0, %r9
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX512-NEXT:    vmovq %xmm0, %r10
+; AVX512-NEXT:    vpextrq $1, %xmm0, %r11
+; AVX512-NEXT:    vmovq %xmm1, %rdi
+; AVX512-NEXT:    vpextrq $1, %xmm1, %rcx
+; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm0
+; AVX512-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX512-NEXT:    vmovq %xmm0, %r8
+; AVX512-NEXT:    xorl %edx, %edx
+; AVX512-NEXT:    addq %r11, %rax
+; AVX512-NEXT:    setb %dl
+; AVX512-NEXT:    xorl %r11d, %r11d
+; AVX512-NEXT:    addq %r10, %r8
+; AVX512-NEXT:    setb %r11b
+; AVX512-NEXT:    xorl %r10d, %r10d
+; AVX512-NEXT:    addq %r9, %rcx
+; AVX512-NEXT:    setb %r10b
+; AVX512-NEXT:    xorl %r9d, %r9d
+; AVX512-NEXT:    addq %rsi, %rdi
+; AVX512-NEXT:    setb %r9b
+; AVX512-NEXT:    shldq $63, %rdi, %r9
+; AVX512-NEXT:    shldq $63, %rcx, %r10
+; AVX512-NEXT:    shldq $63, %r8, %r11
+; AVX512-NEXT:    shldq $63, %rax, %rdx
+; AVX512-NEXT:    vmovq %rdx, %xmm0
+; AVX512-NEXT:    vmovq %r11, %xmm1
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512-NEXT:    vmovq %r10, %xmm1
+; AVX512-NEXT:    vmovq %r9, %xmm2
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512-NEXT:    retq
+  %x0 = zext <4 x i64> %a0 to <4 x i128>
+  %x1 = zext <4 x i64> %a1 to <4 x i128>
+  %sum = add <4 x i128> %x0, %x1
+  %shift = lshr <4 x i128> %sum, <i128 1, i128 1, i128 1, i128 1>
+  %res = trunc <4 x i128> %shift to <4 x i64>
+  ret <4 x i64> %res
+}
+
+;
+; 512-bit vectors
+;
+
+define <64 x i8> @test_fixed_v64i8(<64 x i8> %a0, <64 x i8> %a1) {
+; SSE-LABEL: test_fixed_v64i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa %xmm3, %xmm9
+; SSE-NEXT:    pand %xmm7, %xmm9
+; SSE-NEXT:    movdqa %xmm2, %xmm10
+; SSE-NEXT:    pand %xmm6, %xmm10
+; SSE-NEXT:    movdqa %xmm1, %xmm11
+; SSE-NEXT:    pand %xmm5, %xmm11
+; SSE-NEXT:    movdqa %xmm0, %xmm8
+; SSE-NEXT:    pand %xmm4, %xmm8
+; SSE-NEXT:    pxor %xmm4, %xmm0
+; SSE-NEXT:    pxor %xmm5, %xmm1
+; SSE-NEXT:    pxor %xmm6, %xmm2
+; SSE-NEXT:    pxor %xmm7, %xmm3
+; SSE-NEXT:    psrlw $1, %xmm3
+; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; SSE-NEXT:    pand %xmm4, %xmm3
+; SSE-NEXT:    paddb %xmm9, %xmm3
+; SSE-NEXT:    psrlw $1, %xmm2
+; SSE-NEXT:    pand %xmm4, %xmm2
+; SSE-NEXT:    paddb %xmm10, %xmm2
+; SSE-NEXT:    psrlw $1, %xmm1
+; SSE-NEXT:    pand %xmm4, %xmm1
+; SSE-NEXT:    paddb %xmm11, %xmm1
+; SSE-NEXT:    psrlw $1, %xmm0
+; SSE-NEXT:    pand %xmm4, %xmm0
+; SSE-NEXT:    paddb %xmm8, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: test_fixed_v64i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vandps %ymm3, %ymm1, %ymm4
+; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm5
+; AVX1-NEXT:    vxorps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT:    vxorps %ymm3, %ymm1, %ymm1
+; AVX1-NEXT:    vpsrlw $1, %xmm1, %xmm2
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm3 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vpsrlw $1, %xmm1, %xmm1
+; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm6
+; AVX1-NEXT:    vpand %xmm3, %xmm6, %xmm6
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm5, %xmm3
+; AVX1-NEXT:    vpaddb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT:    vpaddb %xmm6, %xmm5, %xmm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm3, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm4, %xmm3
+; AVX1-NEXT:    vpaddb %xmm1, %xmm3, %xmm1
+; AVX1-NEXT:    vpaddb %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_fixed_v64i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm4
+; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm5
+; AVX2-NEXT:    vpxor %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpxor %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    vpsrlw $1, %ymm1, %ymm1
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vpaddb %ymm1, %ymm4, %ymm1
+; AVX2-NEXT:    vpsrlw $1, %ymm0, %ymm0
+; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpaddb %ymm0, %ymm5, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_fixed_v64i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpandq %zmm1, %zmm0, %zmm2
+; AVX512-NEXT:    vpxorq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpsrlw $1, %zmm0, %zmm0
+; AVX512-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
+; AVX512-NEXT:    vpaddb %zmm0, %zmm2, %zmm0
+; AVX512-NEXT:    retq
+  %and = and <64 x i8> %a0, %a1
+  %xor = xor <64 x i8> %a0, %a1
+  %shift = lshr <64 x i8> %xor, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  %res = add <64 x i8> %and, %shift
+  ret <64 x i8> %res
+}
+
+define <64 x i8> @test_ext_v64i8(<64 x i8> %a0, <64 x i8> %a1) {
+; SSE2-LABEL: test_ext_v64i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pxor %xmm8, %xmm8
+; SSE2-NEXT:    movdqa %xmm3, %xmm10
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm8[8],xmm10[9],xmm8[9],xmm10[10],xmm8[10],xmm10[11],xmm8[11],xmm10[12],xmm8[12],xmm10[13],xmm8[13],xmm10[14],xmm8[14],xmm10[15],xmm8[15]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3],xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7]
+; SSE2-NEXT:    movdqa %xmm2, %xmm11
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm8[8],xmm11[9],xmm8[9],xmm11[10],xmm8[10],xmm11[11],xmm8[11],xmm11[12],xmm8[12],xmm11[13],xmm8[13],xmm11[14],xmm8[14],xmm11[15],xmm8[15]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3],xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7]
+; SSE2-NEXT:    movdqa %xmm1, %xmm12
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm8[8],xmm12[9],xmm8[9],xmm12[10],xmm8[10],xmm12[11],xmm8[11],xmm12[12],xmm8[12],xmm12[13],xmm8[13],xmm12[14],xmm8[14],xmm12[15],xmm8[15]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7]
+; SSE2-NEXT:    movdqa %xmm0, %xmm13
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm8[8],xmm13[9],xmm8[9],xmm13[10],xmm8[10],xmm13[11],xmm8[11],xmm13[12],xmm8[12],xmm13[13],xmm8[13],xmm13[14],xmm8[14],xmm13[15],xmm8[15]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7]
+; SSE2-NEXT:    movdqa %xmm7, %xmm9
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm8[8],xmm9[9],xmm8[9],xmm9[10],xmm8[10],xmm9[11],xmm8[11],xmm9[12],xmm8[12],xmm9[13],xmm8[13],xmm9[14],xmm8[14],xmm9[15],xmm8[15]
+; SSE2-NEXT:    paddw %xmm10, %xmm9
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3],xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7]
+; SSE2-NEXT:    paddw %xmm7, %xmm3
+; SSE2-NEXT:    movdqa %xmm6, %xmm7
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm8[8],xmm7[9],xmm8[9],xmm7[10],xmm8[10],xmm7[11],xmm8[11],xmm7[12],xmm8[12],xmm7[13],xmm8[13],xmm7[14],xmm8[14],xmm7[15],xmm8[15]
+; SSE2-NEXT:    paddw %xmm11, %xmm7
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3],xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7]
+; SSE2-NEXT:    paddw %xmm6, %xmm2
+; SSE2-NEXT:    movdqa %xmm5, %xmm6
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm8[8],xmm6[9],xmm8[9],xmm6[10],xmm8[10],xmm6[11],xmm8[11],xmm6[12],xmm8[12],xmm6[13],xmm8[13],xmm6[14],xmm8[14],xmm6[15],xmm8[15]
+; SSE2-NEXT:    paddw %xmm12, %xmm6
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3],xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7]
+; SSE2-NEXT:    paddw %xmm5, %xmm1
+; SSE2-NEXT:    movdqa %xmm4, %xmm5
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm8[8],xmm5[9],xmm8[9],xmm5[10],xmm8[10],xmm5[11],xmm8[11],xmm5[12],xmm8[12],xmm5[13],xmm8[13],xmm5[14],xmm8[14],xmm5[15],xmm8[15]
+; SSE2-NEXT:    paddw %xmm13, %xmm5
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3],xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7]
+; SSE2-NEXT:    paddw %xmm4, %xmm0
+; SSE2-NEXT:    psrlw $1, %xmm9
+; SSE2-NEXT:    psrlw $1, %xmm3
+; SSE2-NEXT:    packuswb %xmm9, %xmm3
+; SSE2-NEXT:    psrlw $1, %xmm7
+; SSE2-NEXT:    psrlw $1, %xmm2
+; SSE2-NEXT:    packuswb %xmm7, %xmm2
+; SSE2-NEXT:    psrlw $1, %xmm6
+; SSE2-NEXT:    psrlw $1, %xmm1
+; SSE2-NEXT:    packuswb %xmm6, %xmm1
+; SSE2-NEXT:    psrlw $1, %xmm5
+; SSE2-NEXT:    psrlw $1, %xmm0
+; SSE2-NEXT:    packuswb %xmm5, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_ext_v64i8:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    movdqa %xmm3, %xmm9
+; SSE4-NEXT:    movdqa %xmm2, %xmm10
+; SSE4-NEXT:    movdqa %xmm1, %xmm11
+; SSE4-NEXT:    movdqa %xmm0, %xmm8
+; SSE4-NEXT:    pxor %xmm13, %xmm13
+; SSE4-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
+; SSE4-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE4-NEXT:    punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm13[8],xmm9[9],xmm13[9],xmm9[10],xmm13[10],xmm9[11],xmm13[11],xmm9[12],xmm13[12],xmm9[13],xmm13[13],xmm9[14],xmm13[14],xmm9[15],xmm13[15]
+; SSE4-NEXT:    pmovzxbw {{.*#+}} xmm14 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
+; SSE4-NEXT:    punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm13[8],xmm10[9],xmm13[9],xmm10[10],xmm13[10],xmm10[11],xmm13[11],xmm10[12],xmm13[12],xmm10[13],xmm13[13],xmm10[14],xmm13[14],xmm10[15],xmm13[15]
+; SSE4-NEXT:    pmovzxbw {{.*#+}} xmm15 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; SSE4-NEXT:    punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm13[8],xmm11[9],xmm13[9],xmm11[10],xmm13[10],xmm11[11],xmm13[11],xmm11[12],xmm13[12],xmm11[13],xmm13[13],xmm11[14],xmm13[14],xmm11[15],xmm13[15]
+; SSE4-NEXT:    pmovzxbw {{.*#+}} xmm12 = xmm8[0],zero,xmm8[1],zero,xmm8[2],zero,xmm8[3],zero,xmm8[4],zero,xmm8[5],zero,xmm8[6],zero,xmm8[7],zero
+; SSE4-NEXT:    punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm13[8],xmm8[9],xmm13[9],xmm8[10],xmm13[10],xmm8[11],xmm13[11],xmm8[12],xmm13[12],xmm8[13],xmm13[13],xmm8[14],xmm13[14],xmm8[15],xmm13[15]
+; SSE4-NEXT:    pmovzxbw {{.*#+}} xmm3 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero,xmm7[4],zero,xmm7[5],zero,xmm7[6],zero,xmm7[7],zero
+; SSE4-NEXT:    punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm13[8],xmm7[9],xmm13[9],xmm7[10],xmm13[10],xmm7[11],xmm13[11],xmm7[12],xmm13[12],xmm7[13],xmm13[13],xmm7[14],xmm13[14],xmm7[15],xmm13[15]
+; SSE4-NEXT:    paddw %xmm9, %xmm7
+; SSE4-NEXT:    pmovzxbw {{.*#+}} xmm2 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero
+; SSE4-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm13[8],xmm6[9],xmm13[9],xmm6[10],xmm13[10],xmm6[11],xmm13[11],xmm6[12],xmm13[12],xmm6[13],xmm13[13],xmm6[14],xmm13[14],xmm6[15],xmm13[15]
+; SSE4-NEXT:    paddw %xmm10, %xmm6
+; SSE4-NEXT:    pmovzxbw {{.*#+}} xmm1 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
+; SSE4-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm13[8],xmm5[9],xmm13[9],xmm5[10],xmm13[10],xmm5[11],xmm13[11],xmm5[12],xmm13[12],xmm5[13],xmm13[13],xmm5[14],xmm13[14],xmm5[15],xmm13[15]
+; SSE4-NEXT:    paddw %xmm11, %xmm5
+; SSE4-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
+; SSE4-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm13[8],xmm4[9],xmm13[9],xmm4[10],xmm13[10],xmm4[11],xmm13[11],xmm4[12],xmm13[12],xmm4[13],xmm13[13],xmm4[14],xmm13[14],xmm4[15],xmm13[15]
+; SSE4-NEXT:    paddw %xmm8, %xmm4
+; SSE4-NEXT:    paddw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
+; SSE4-NEXT:    paddw %xmm14, %xmm2
+; SSE4-NEXT:    paddw %xmm15, %xmm1
+; SSE4-NEXT:    paddw %xmm12, %xmm0
+; SSE4-NEXT:    psrlw $1, %xmm7
+; SSE4-NEXT:    psrlw $1, %xmm6
+; SSE4-NEXT:    psrlw $1, %xmm5
+; SSE4-NEXT:    psrlw $1, %xmm4
+; SSE4-NEXT:    psrlw $1, %xmm3
+; SSE4-NEXT:    packuswb %xmm7, %xmm3
+; SSE4-NEXT:    psrlw $1, %xmm2
+; SSE4-NEXT:    packuswb %xmm6, %xmm2
+; SSE4-NEXT:    psrlw $1, %xmm1
+; SSE4-NEXT:    packuswb %xmm5, %xmm1
+; SSE4-NEXT:    psrlw $1, %xmm0
+; SSE4-NEXT:    packuswb %xmm4, %xmm0
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: test_ext_v64i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm5 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15]
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm6
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm7 = xmm6[8],xmm4[8],xmm6[9],xmm4[9],xmm6[10],xmm4[10],xmm6[11],xmm4[11],xmm6[12],xmm4[12],xmm6[13],xmm4[13],xmm6[14],xmm4[14],xmm6[15],xmm4[15]
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm8 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm9
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm10 = xmm9[8],xmm4[8],xmm9[9],xmm4[9],xmm9[10],xmm4[10],xmm9[11],xmm4[11],xmm9[12],xmm4[12],xmm9[13],xmm4[13],xmm9[14],xmm4[14],xmm9[15],xmm4[15]
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm9 = xmm9[0],zero,xmm9[1],zero,xmm9[2],zero,xmm9[3],zero,xmm9[4],zero,xmm9[5],zero,xmm9[6],zero,xmm9[7],zero
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm11 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15]
+; AVX1-NEXT:    vpaddw %xmm5, %xmm11, %xmm5
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm11
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm12 = xmm11[8],xmm4[8],xmm11[9],xmm4[9],xmm11[10],xmm4[10],xmm11[11],xmm4[11],xmm11[12],xmm4[12],xmm11[13],xmm4[13],xmm11[14],xmm4[14],xmm11[15],xmm4[15]
+; AVX1-NEXT:    vpaddw %xmm7, %xmm12, %xmm7
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm12 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15]
+; AVX1-NEXT:    vpaddw %xmm12, %xmm8, %xmm8
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm12
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm12[8],xmm4[8],xmm12[9],xmm4[9],xmm12[10],xmm4[10],xmm12[11],xmm4[11],xmm12[12],xmm4[12],xmm12[13],xmm4[13],xmm12[14],xmm4[14],xmm12[15],xmm4[15]
+; AVX1-NEXT:    vpaddw %xmm4, %xmm10, %xmm4
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
+; AVX1-NEXT:    vpaddw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm3 = xmm11[0],zero,xmm11[1],zero,xmm11[2],zero,xmm11[3],zero,xmm11[4],zero,xmm11[5],zero,xmm11[6],zero,xmm11[7],zero
+; AVX1-NEXT:    vpaddw %xmm3, %xmm6, %xmm3
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
+; AVX1-NEXT:    vpaddw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm2 = xmm12[0],zero,xmm12[1],zero,xmm12[2],zero,xmm12[3],zero,xmm12[4],zero,xmm12[5],zero,xmm12[6],zero,xmm12[7],zero
+; AVX1-NEXT:    vpaddw %xmm2, %xmm9, %xmm2
+; AVX1-NEXT:    vpsrlw $1, %xmm5, %xmm5
+; AVX1-NEXT:    vpsrlw $1, %xmm7, %xmm6
+; AVX1-NEXT:    vpsrlw $1, %xmm8, %xmm7
+; AVX1-NEXT:    vpsrlw $1, %xmm4, %xmm4
+; AVX1-NEXT:    vpsrlw $1, %xmm1, %xmm1
+; AVX1-NEXT:    vpackuswb %xmm5, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrlw $1, %xmm3, %xmm3
+; AVX1-NEXT:    vpackuswb %xmm6, %xmm3, %xmm3
+; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm0
+; AVX1-NEXT:    vpackuswb %xmm7, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlw $1, %xmm2, %xmm2
+; AVX1-NEXT:    vpackuswb %xmm4, %xmm2, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_ext_v64i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm4
+; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
+; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm5
+; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero
+; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm6
+; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero,xmm6[8],zero,xmm6[9],zero,xmm6[10],zero,xmm6[11],zero,xmm6[12],zero,xmm6[13],zero,xmm6[14],zero,xmm6[15],zero
+; AVX2-NEXT:    vpaddw %ymm6, %ymm4, %ymm4
+; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero
+; AVX2-NEXT:    vpaddw %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm3
+; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero
+; AVX2-NEXT:    vpaddw %ymm3, %ymm5, %ymm3
+; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
+; AVX2-NEXT:    vpaddw %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrlw $1, %ymm4, %ymm2
+; AVX2-NEXT:    vpsrlw $1, %ymm1, %ymm1
+; AVX2-NEXT:    vpackuswb %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vpsrlw $1, %ymm3, %ymm2
+; AVX2-NEXT:    vpsrlw $1, %ymm0, %ymm0
+; AVX2-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3]
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_ext_v64i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
+; AVX512-NEXT:    vpmovzxbw {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
+; AVX512-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512-NEXT:    vextracti64x4 $1, %zmm1, %ymm3
+; AVX512-NEXT:    vpmovzxbw {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero,ymm3[16],zero,ymm3[17],zero,ymm3[18],zero,ymm3[19],zero,ymm3[20],zero,ymm3[21],zero,ymm3[22],zero,ymm3[23],zero,ymm3[24],zero,ymm3[25],zero,ymm3[26],zero,ymm3[27],zero,ymm3[28],zero,ymm3[29],zero,ymm3[30],zero,ymm3[31],zero
+; AVX512-NEXT:    vpaddw %zmm3, %zmm2, %zmm2
+; AVX512-NEXT:    vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
+; AVX512-NEXT:    vpaddw %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpsrlw $1, %zmm2, %zmm1
+; AVX512-NEXT:    vpsrlw $1, %zmm0, %zmm0
+; AVX512-NEXT:    vpmovwb %zmm0, %ymm0
+; AVX512-NEXT:    vpmovwb %zmm1, %ymm1
+; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT:    retq
+  %x0 = zext <64 x i8> %a0 to <64 x i16>
+  %x1 = zext <64 x i8> %a1 to <64 x i16>
+  %sum = add <64 x i16> %x0, %x1
+  %shift = lshr <64 x i16> %sum, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %res = trunc <64 x i16> %shift to <64 x i8>
+  ret <64 x i8> %res
+}
+
+define <32 x i16> @test_fixed_v32i16(<32 x i16> %a0, <32 x i16> %a1) {
+; SSE-LABEL: test_fixed_v32i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa %xmm3, %xmm8
+; SSE-NEXT:    pand %xmm7, %xmm8
+; SSE-NEXT:    movdqa %xmm2, %xmm9
+; SSE-NEXT:    pand %xmm6, %xmm9
+; SSE-NEXT:    movdqa %xmm1, %xmm10
+; SSE-NEXT:    pand %xmm5, %xmm10
+; SSE-NEXT:    movdqa %xmm0, %xmm11
+; SSE-NEXT:    pand %xmm4, %xmm11
+; SSE-NEXT:    pxor %xmm4, %xmm0
+; SSE-NEXT:    pxor %xmm5, %xmm1
+; SSE-NEXT:    pxor %xmm6, %xmm2
+; SSE-NEXT:    pxor %xmm7, %xmm3
+; SSE-NEXT:    psrlw $1, %xmm3
+; SSE-NEXT:    paddw %xmm8, %xmm3
+; SSE-NEXT:    psrlw $1, %xmm2
+; SSE-NEXT:    paddw %xmm9, %xmm2
+; SSE-NEXT:    psrlw $1, %xmm1
+; SSE-NEXT:    paddw %xmm10, %xmm1
+; SSE-NEXT:    psrlw $1, %xmm0
+; SSE-NEXT:    paddw %xmm11, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: test_fixed_v32i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vandps %ymm3, %ymm1, %ymm4
+; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm5
+; AVX1-NEXT:    vxorps %ymm0, %ymm2, %ymm0
+; AVX1-NEXT:    vxorps %ymm1, %ymm3, %ymm1
+; AVX1-NEXT:    vpsrlw $1, %xmm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vpsrlw $1, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm3
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm5, %xmm6
+; AVX1-NEXT:    vpaddw %xmm0, %xmm6, %xmm0
+; AVX1-NEXT:    vpaddw %xmm3, %xmm5, %xmm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm3, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm4, %xmm3
+; AVX1-NEXT:    vpaddw %xmm1, %xmm3, %xmm1
+; AVX1-NEXT:    vpaddw %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_fixed_v32i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm4
+; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm5
+; AVX2-NEXT:    vpxor %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vpxor %ymm1, %ymm3, %ymm1
+; AVX2-NEXT:    vpsrlw $1, %ymm1, %ymm1
+; AVX2-NEXT:    vpaddw %ymm1, %ymm4, %ymm1
+; AVX2-NEXT:    vpsrlw $1, %ymm0, %ymm0
+; AVX2-NEXT:    vpaddw %ymm0, %ymm5, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_fixed_v32i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpandq %zmm1, %zmm0, %zmm2
+; AVX512-NEXT:    vpxorq %zmm0, %zmm1, %zmm0
+; AVX512-NEXT:    vpsrlw $1, %zmm0, %zmm0
+; AVX512-NEXT:    vpaddw %zmm0, %zmm2, %zmm0
+; AVX512-NEXT:    retq
+  %and = and <32 x i16> %a0, %a1
+  %xor = xor <32 x i16> %a1, %a0
+  %shift = lshr <32 x i16> %xor, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %res = add <32 x i16> %and, %shift
+  ret <32 x i16> %res
+}
+
+define <32 x i16> @test_ext_v32i16(<32 x i16> %a0, <32 x i16> %a1) {
+; SSE2-LABEL: test_ext_v32i16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pxor %xmm8, %xmm8
+; SSE2-NEXT:    movdqa %xmm0, %xmm9
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3]
+; SSE2-NEXT:    movdqa %xmm1, %xmm11
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm8[4],xmm11[5],xmm8[5],xmm11[6],xmm8[6],xmm11[7],xmm8[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3]
+; SSE2-NEXT:    movdqa %xmm2, %xmm12
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm8[4],xmm12[5],xmm8[5],xmm12[6],xmm8[6],xmm12[7],xmm8[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3]
+; SSE2-NEXT:    movdqa %xmm3, %xmm13
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm8[4],xmm13[5],xmm8[5],xmm13[6],xmm8[6],xmm13[7],xmm8[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3]
+; SSE2-NEXT:    movdqa %xmm4, %xmm10
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7]
+; SSE2-NEXT:    paddd %xmm9, %xmm10
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3]
+; SSE2-NEXT:    paddd %xmm4, %xmm0
+; SSE2-NEXT:    movdqa %xmm5, %xmm9
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7]
+; SSE2-NEXT:    paddd %xmm11, %xmm9
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3]
+; SSE2-NEXT:    paddd %xmm5, %xmm1
+; SSE2-NEXT:    movdqa %xmm6, %xmm5
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7]
+; SSE2-NEXT:    paddd %xmm12, %xmm5
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3]
+; SSE2-NEXT:    paddd %xmm6, %xmm2
+; SSE2-NEXT:    movdqa %xmm7, %xmm4
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7]
+; SSE2-NEXT:    paddd %xmm13, %xmm4
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
+; SSE2-NEXT:    paddd %xmm7, %xmm3
+; SSE2-NEXT:    pslld $15, %xmm10
+; SSE2-NEXT:    psrad $16, %xmm10
+; SSE2-NEXT:    pslld $15, %xmm0
+; SSE2-NEXT:    psrad $16, %xmm0
+; SSE2-NEXT:    packssdw %xmm10, %xmm0
+; SSE2-NEXT:    pslld $15, %xmm9
+; SSE2-NEXT:    psrad $16, %xmm9
+; SSE2-NEXT:    pslld $15, %xmm1
+; SSE2-NEXT:    psrad $16, %xmm1
+; SSE2-NEXT:    packssdw %xmm9, %xmm1
+; SSE2-NEXT:    pslld $15, %xmm5
+; SSE2-NEXT:    psrad $16, %xmm5
+; SSE2-NEXT:    pslld $15, %xmm2
+; SSE2-NEXT:    psrad $16, %xmm2
+; SSE2-NEXT:    packssdw %xmm5, %xmm2
+; SSE2-NEXT:    pslld $15, %xmm4
+; SSE2-NEXT:    psrad $16, %xmm4
+; SSE2-NEXT:    pslld $15, %xmm3
+; SSE2-NEXT:    psrad $16, %xmm3
+; SSE2-NEXT:    packssdw %xmm4, %xmm3
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_ext_v32i16:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    movdqa %xmm3, %xmm9
+; SSE4-NEXT:    movdqa %xmm2, %xmm10
+; SSE4-NEXT:    movdqa %xmm1, %xmm11
+; SSE4-NEXT:    movdqa %xmm0, %xmm8
+; SSE4-NEXT:    pxor %xmm13, %xmm13
+; SSE4-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
+; SSE4-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE4-NEXT:    punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm13[4],xmm9[5],xmm13[5],xmm9[6],xmm13[6],xmm9[7],xmm13[7]
+; SSE4-NEXT:    pmovzxwd {{.*#+}} xmm14 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
+; SSE4-NEXT:    punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm13[4],xmm10[5],xmm13[5],xmm10[6],xmm13[6],xmm10[7],xmm13[7]
+; SSE4-NEXT:    pmovzxwd {{.*#+}} xmm15 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; SSE4-NEXT:    punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm13[4],xmm11[5],xmm13[5],xmm11[6],xmm13[6],xmm11[7],xmm13[7]
+; SSE4-NEXT:    pmovzxwd {{.*#+}} xmm12 = xmm8[0],zero,xmm8[1],zero,xmm8[2],zero,xmm8[3],zero
+; SSE4-NEXT:    punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm13[4],xmm8[5],xmm13[5],xmm8[6],xmm13[6],xmm8[7],xmm13[7]
+; SSE4-NEXT:    pmovzxwd {{.*#+}} xmm3 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero
+; SSE4-NEXT:    punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm13[4],xmm7[5],xmm13[5],xmm7[6],xmm13[6],xmm7[7],xmm13[7]
+; SSE4-NEXT:    paddd %xmm9, %xmm7
+; SSE4-NEXT:    pmovzxwd {{.*#+}} xmm2 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero
+; SSE4-NEXT:    punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm13[4],xmm6[5],xmm13[5],xmm6[6],xmm13[6],xmm6[7],xmm13[7]
+; SSE4-NEXT:    paddd %xmm10, %xmm6
+; SSE4-NEXT:    pmovzxwd {{.*#+}} xmm1 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero
+; SSE4-NEXT:    punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm13[4],xmm5[5],xmm13[5],xmm5[6],xmm13[6],xmm5[7],xmm13[7]
+; SSE4-NEXT:    paddd %xmm11, %xmm5
+; SSE4-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero
+; SSE4-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm13[4],xmm4[5],xmm13[5],xmm4[6],xmm13[6],xmm4[7],xmm13[7]
+; SSE4-NEXT:    paddd %xmm8, %xmm4
+; SSE4-NEXT:    paddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
+; SSE4-NEXT:    paddd %xmm14, %xmm2
+; SSE4-NEXT:    paddd %xmm15, %xmm1
+; SSE4-NEXT:    paddd %xmm12, %xmm0
+; SSE4-NEXT:    psrld $1, %xmm7
+; SSE4-NEXT:    psrld $1, %xmm6
+; SSE4-NEXT:    psrld $1, %xmm5
+; SSE4-NEXT:    psrld $1, %xmm4
+; SSE4-NEXT:    psrld $1, %xmm3
+; SSE4-NEXT:    packusdw %xmm7, %xmm3
+; SSE4-NEXT:    psrld $1, %xmm2
+; SSE4-NEXT:    packusdw %xmm6, %xmm2
+; SSE4-NEXT:    psrld $1, %xmm1
+; SSE4-NEXT:    packusdw %xmm5, %xmm1
+; SSE4-NEXT:    psrld $1, %xmm0
+; SSE4-NEXT:    packusdw %xmm4, %xmm0
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: test_ext_v32i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm5 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm6
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm7 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm8 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm9
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm10 = xmm9[4],xmm4[4],xmm9[5],xmm4[5],xmm9[6],xmm4[6],xmm9[7],xmm4[7]
+; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero
+; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm9 = xmm9[0],zero,xmm9[1],zero,xmm9[2],zero,xmm9[3],zero
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm11 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; AVX1-NEXT:    vpaddd %xmm5, %xmm11, %xmm5
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm11
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm12 = xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7]
+; AVX1-NEXT:    vpaddd %xmm7, %xmm12, %xmm7
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm12 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
+; AVX1-NEXT:    vpaddd %xmm12, %xmm8, %xmm8
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm12
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm12[4],xmm4[4],xmm12[5],xmm4[5],xmm12[6],xmm4[6],xmm12[7],xmm4[7]
+; AVX1-NEXT:    vpaddd %xmm4, %xmm10, %xmm4
+; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
+; AVX1-NEXT:    vpaddd %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm3 = xmm11[0],zero,xmm11[1],zero,xmm11[2],zero,xmm11[3],zero
+; AVX1-NEXT:    vpaddd %xmm3, %xmm6, %xmm3
+; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
+; AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = xmm12[0],zero,xmm12[1],zero,xmm12[2],zero,xmm12[3],zero
+; AVX1-NEXT:    vpaddd %xmm2, %xmm9, %xmm2
+; AVX1-NEXT:    vpsrld $1, %xmm5, %xmm5
+; AVX1-NEXT:    vpsrld $1, %xmm7, %xmm6
+; AVX1-NEXT:    vpsrld $1, %xmm8, %xmm7
+; AVX1-NEXT:    vpsrld $1, %xmm4, %xmm4
+; AVX1-NEXT:    vpsrld $1, %xmm1, %xmm1
+; AVX1-NEXT:    vpackusdw %xmm5, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrld $1, %xmm3, %xmm3
+; AVX1-NEXT:    vpackusdw %xmm6, %xmm3, %xmm3
+; AVX1-NEXT:    vpsrld $1, %xmm0, %xmm0
+; AVX1-NEXT:    vpackusdw %xmm7, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrld $1, %xmm2, %xmm2
+; AVX1-NEXT:    vpackusdw %xmm4, %xmm2, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_ext_v32i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm4
+; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
+; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm5
+; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
+; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm6
+; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero
+; AVX2-NEXT:    vpaddd %ymm6, %ymm4, %ymm4
+; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
+; AVX2-NEXT:    vpaddd %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm3
+; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
+; AVX2-NEXT:    vpaddd %ymm3, %ymm5, %ymm3
+; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
+; AVX2-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrld $1, %ymm4, %ymm2
+; AVX2-NEXT:    vpsrld $1, %ymm1, %ymm1
+; AVX2-NEXT:    vpackusdw %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vpsrld $1, %ymm3, %ymm2
+; AVX2-NEXT:    vpsrld $1, %ymm0, %ymm0
+; AVX2-NEXT:    vpackusdw %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3]
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_ext_v32i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
+; AVX512-NEXT:    vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
+; AVX512-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512-NEXT:    vextracti64x4 $1, %zmm1, %ymm3
+; AVX512-NEXT:    vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
+; AVX512-NEXT:    vpaddd %zmm3, %zmm2, %zmm2
+; AVX512-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpsrld $1, %zmm2, %zmm1
+; AVX512-NEXT:    vpsrld $1, %zmm0, %zmm0
+; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
+; AVX512-NEXT:    vpmovdw %zmm1, %ymm1
+; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT:    retq
+  %x0 = zext <32 x i16> %a0 to <32 x i32>
+  %x1 = zext <32 x i16> %a1 to <32 x i32>
+  %sum = add <32 x i32> %x0, %x1
+  %shift = lshr <32 x i32> %sum, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %res = trunc <32 x i32> %shift to <32 x i16>
+  ret <32 x i16> %res
+}
+
+define <16 x i32> @test_fixed_v16i32(<16 x i32> %a0, <16 x i32> %a1) {
+; SSE-LABEL: test_fixed_v16i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa %xmm3, %xmm8
+; SSE-NEXT:    pand %xmm7, %xmm8
+; SSE-NEXT:    movdqa %xmm2, %xmm9
+; SSE-NEXT:    pand %xmm6, %xmm9
+; SSE-NEXT:    movdqa %xmm1, %xmm10
+; SSE-NEXT:    pand %xmm5, %xmm10
+; SSE-NEXT:    movdqa %xmm0, %xmm11
+; SSE-NEXT:    pand %xmm4, %xmm11
+; SSE-NEXT:    pxor %xmm4, %xmm0
+; SSE-NEXT:    pxor %xmm5, %xmm1
+; SSE-NEXT:    pxor %xmm6, %xmm2
+; SSE-NEXT:    pxor %xmm7, %xmm3
+; SSE-NEXT:    psrld $1, %xmm3
+; SSE-NEXT:    paddd %xmm8, %xmm3
+; SSE-NEXT:    psrld $1, %xmm2
+; SSE-NEXT:    paddd %xmm9, %xmm2
+; SSE-NEXT:    psrld $1, %xmm1
+; SSE-NEXT:    paddd %xmm10, %xmm1
+; SSE-NEXT:    psrld $1, %xmm0
+; SSE-NEXT:    paddd %xmm11, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: test_fixed_v16i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vandps %ymm3, %ymm1, %ymm4
+; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm5
+; AVX1-NEXT:    vxorps %ymm0, %ymm2, %ymm0
+; AVX1-NEXT:    vxorps %ymm1, %ymm3, %ymm1
+; AVX1-NEXT:    vpsrld $1, %xmm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vpsrld $1, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrld $1, %xmm0, %xmm3
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpsrld $1, %xmm0, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm5, %xmm6
+; AVX1-NEXT:    vpaddd %xmm0, %xmm6, %xmm0
+; AVX1-NEXT:    vpaddd %xmm3, %xmm5, %xmm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm3, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm4, %xmm3
+; AVX1-NEXT:    vpaddd %xmm1, %xmm3, %xmm1
+; AVX1-NEXT:    vpaddd %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_fixed_v16i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm4
+; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm5
+; AVX2-NEXT:    vpxor %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vpxor %ymm1, %ymm3, %ymm1
+; AVX2-NEXT:    vpsrld $1, %ymm1, %ymm1
+; AVX2-NEXT:    vpaddd %ymm1, %ymm4, %ymm1
+; AVX2-NEXT:    vpsrld $1, %ymm0, %ymm0
+; AVX2-NEXT:    vpaddd %ymm0, %ymm5, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_fixed_v16i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpandd %zmm1, %zmm0, %zmm2
+; AVX512-NEXT:    vpxord %zmm0, %zmm1, %zmm0
+; AVX512-NEXT:    vpsrld $1, %zmm0, %zmm0
+; AVX512-NEXT:    vpaddd %zmm0, %zmm2, %zmm0
+; AVX512-NEXT:    retq
+  %and = and <16 x i32> %a0, %a1
+  %xor = xor <16 x i32> %a1, %a0
+  %shift = lshr <16 x i32> %xor, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %res = add <16 x i32> %and, %shift
+  ret <16 x i32> %res
+}
+
+define <16 x i32> @test_ext_v16i32(<16 x i32> %a0, <16 x i32> %a1) {
+; SSE2-LABEL: test_ext_v16i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pxor %xmm8, %xmm8
+; SSE2-NEXT:    movdqa %xmm3, %xmm10
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm8[2],xmm10[3],xmm8[3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1]
+; SSE2-NEXT:    movdqa %xmm2, %xmm11
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm8[2],xmm11[3],xmm8[3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1]
+; SSE2-NEXT:    movdqa %xmm1, %xmm12
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm12 = xmm12[2],xmm8[2],xmm12[3],xmm8[3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1]
+; SSE2-NEXT:    movdqa %xmm0, %xmm13
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm8[2],xmm13[3],xmm8[3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1]
+; SSE2-NEXT:    movdqa %xmm7, %xmm9
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm8[2],xmm9[3],xmm8[3]
+; SSE2-NEXT:    paddq %xmm10, %xmm9
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1]
+; SSE2-NEXT:    paddq %xmm7, %xmm3
+; SSE2-NEXT:    movdqa %xmm6, %xmm7
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm8[2],xmm7[3],xmm8[3]
+; SSE2-NEXT:    paddq %xmm11, %xmm7
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1]
+; SSE2-NEXT:    paddq %xmm6, %xmm2
+; SSE2-NEXT:    movdqa %xmm5, %xmm6
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm8[2],xmm6[3],xmm8[3]
+; SSE2-NEXT:    paddq %xmm12, %xmm6
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1]
+; SSE2-NEXT:    paddq %xmm5, %xmm1
+; SSE2-NEXT:    movdqa %xmm4, %xmm5
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm8[2],xmm5[3],xmm8[3]
+; SSE2-NEXT:    paddq %xmm13, %xmm5
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1]
+; SSE2-NEXT:    paddq %xmm4, %xmm0
+; SSE2-NEXT:    psrlq $1, %xmm9
+; SSE2-NEXT:    psrlq $1, %xmm3
+; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,2],xmm9[0,2]
+; SSE2-NEXT:    psrlq $1, %xmm7
+; SSE2-NEXT:    psrlq $1, %xmm2
+; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm7[0,2]
+; SSE2-NEXT:    psrlq $1, %xmm6
+; SSE2-NEXT:    psrlq $1, %xmm1
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm6[0,2]
+; SSE2-NEXT:    psrlq $1, %xmm5
+; SSE2-NEXT:    psrlq $1, %xmm0
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2]
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_ext_v16i32:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    movdqa %xmm3, %xmm9
+; SSE4-NEXT:    movdqa %xmm2, %xmm10
+; SSE4-NEXT:    movdqa %xmm1, %xmm11
+; SSE4-NEXT:    movdqa %xmm0, %xmm8
+; SSE4-NEXT:    pxor %xmm13, %xmm13
+; SSE4-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm3[0],zero,xmm3[1],zero
+; SSE4-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE4-NEXT:    punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm13[2],xmm9[3],xmm13[3]
+; SSE4-NEXT:    pmovzxdq {{.*#+}} xmm14 = xmm2[0],zero,xmm2[1],zero
+; SSE4-NEXT:    punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm13[2],xmm10[3],xmm13[3]
+; SSE4-NEXT:    pmovzxdq {{.*#+}} xmm15 = xmm1[0],zero,xmm1[1],zero
+; SSE4-NEXT:    punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm13[2],xmm11[3],xmm13[3]
+; SSE4-NEXT:    pmovzxdq {{.*#+}} xmm12 = xmm8[0],zero,xmm8[1],zero
+; SSE4-NEXT:    punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm13[2],xmm8[3],xmm13[3]
+; SSE4-NEXT:    pmovzxdq {{.*#+}} xmm3 = xmm7[0],zero,xmm7[1],zero
+; SSE4-NEXT:    punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm13[2],xmm7[3],xmm13[3]
+; SSE4-NEXT:    paddq %xmm9, %xmm7
+; SSE4-NEXT:    pmovzxdq {{.*#+}} xmm2 = xmm6[0],zero,xmm6[1],zero
+; SSE4-NEXT:    punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm13[2],xmm6[3],xmm13[3]
+; SSE4-NEXT:    paddq %xmm10, %xmm6
+; SSE4-NEXT:    pmovzxdq {{.*#+}} xmm1 = xmm5[0],zero,xmm5[1],zero
+; SSE4-NEXT:    punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm13[2],xmm5[3],xmm13[3]
+; SSE4-NEXT:    paddq %xmm11, %xmm5
+; SSE4-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm4[0],zero,xmm4[1],zero
+; SSE4-NEXT:    punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm13[2],xmm4[3],xmm13[3]
+; SSE4-NEXT:    paddq %xmm8, %xmm4
+; SSE4-NEXT:    paddq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
+; SSE4-NEXT:    paddq %xmm14, %xmm2
+; SSE4-NEXT:    paddq %xmm15, %xmm1
+; SSE4-NEXT:    paddq %xmm12, %xmm0
+; SSE4-NEXT:    psrlq $1, %xmm7
+; SSE4-NEXT:    psrlq $1, %xmm6
+; SSE4-NEXT:    psrlq $1, %xmm5
+; SSE4-NEXT:    psrlq $1, %xmm4
+; SSE4-NEXT:    psrlq $1, %xmm3
+; SSE4-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,2],xmm7[0,2]
+; SSE4-NEXT:    psrlq $1, %xmm2
+; SSE4-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm6[0,2]
+; SSE4-NEXT:    psrlq $1, %xmm1
+; SSE4-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2]
+; SSE4-NEXT:    psrlq $1, %xmm0
+; SSE4-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm4[0,2]
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: test_ext_v16i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT:    vpxor %xmm5, %xmm5, %xmm5
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm6 = xmm4[2],xmm5[2],xmm4[3],xmm5[3]
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm7 = xmm1[2],xmm5[2],xmm1[3],xmm5[3]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm8
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm9 = xmm8[2],xmm5[2],xmm8[3],xmm5[3]
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm10 = xmm0[2],xmm5[2],xmm0[3],xmm5[3]
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm8 = xmm8[0],zero,xmm8[1],zero
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm11
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm12 = xmm11[2],xmm5[2],xmm11[3],xmm5[3]
+; AVX1-NEXT:    vpaddq %xmm6, %xmm12, %xmm6
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm12 = xmm3[2],xmm5[2],xmm3[3],xmm5[3]
+; AVX1-NEXT:    vpaddq %xmm7, %xmm12, %xmm7
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm12
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm13 = xmm12[2],xmm5[2],xmm12[3],xmm5[3]
+; AVX1-NEXT:    vpaddq %xmm13, %xmm9, %xmm9
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm5[2],xmm2[3],xmm5[3]
+; AVX1-NEXT:    vpaddq %xmm5, %xmm10, %xmm5
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm10 = xmm11[0],zero,xmm11[1],zero
+; AVX1-NEXT:    vpaddq %xmm4, %xmm10, %xmm4
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero
+; AVX1-NEXT:    vpaddq %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm12[0],zero,xmm12[1],zero
+; AVX1-NEXT:    vpaddq %xmm3, %xmm8, %xmm3
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
+; AVX1-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrlq $1, %xmm6, %xmm2
+; AVX1-NEXT:    vpsrlq $1, %xmm7, %xmm6
+; AVX1-NEXT:    vpsrlq $1, %xmm9, %xmm7
+; AVX1-NEXT:    vpsrlq $1, %xmm5, %xmm5
+; AVX1-NEXT:    vpsrlq $1, %xmm4, %xmm4
+; AVX1-NEXT:    vpsrlq $1, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrlq $1, %xmm3, %xmm3
+; AVX1-NEXT:    vpsrlq $1, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm7, %ymm5, %ymm5
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
+; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm5[0,2],ymm0[4,6],ymm5[4,6]
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm6, %ymm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm1, %ymm1
+; AVX1-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,2],ymm2[0,2],ymm1[4,6],ymm2[4,6]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_ext_v16i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm4
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm5
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm6
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero
+; AVX2-NEXT:    vpaddq %ymm6, %ymm4, %ymm4
+; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm6
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero
+; AVX2-NEXT:    vpaddq %ymm6, %ymm5, %ymm5
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
+; AVX2-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
+; AVX2-NEXT:    vpaddq %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm5[2,3]
+; AVX2-NEXT:    vpsrlq $1, %ymm2, %ymm2
+; AVX2-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrlq $1, %ymm0, %ymm0
+; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
+; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm1[2,3],ymm4[2,3]
+; AVX2-NEXT:    vpsrlq $1, %ymm2, %ymm2
+; AVX2-NEXT:    vinserti128 $1, %xmm4, %ymm1, %ymm1
+; AVX2-NEXT:    vpsrlq $1, %ymm1, %ymm1
+; AVX2-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,2],ymm2[0,2],ymm1[4,6],ymm2[4,6]
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_ext_v16i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
+; AVX512-NEXT:    vpmovzxdq {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero
+; AVX512-NEXT:    vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
+; AVX512-NEXT:    vextracti64x4 $1, %zmm1, %ymm3
+; AVX512-NEXT:    vpmovzxdq {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero
+; AVX512-NEXT:    vpaddq %zmm3, %zmm2, %zmm2
+; AVX512-NEXT:    vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
+; AVX512-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpsrlq $1, %zmm2, %zmm1
+; AVX512-NEXT:    vpsrlq $1, %zmm0, %zmm0
+; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
+; AVX512-NEXT:    vpmovqd %zmm1, %ymm1
+; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT:    retq
+  %x0 = zext <16 x i32> %a0 to <16 x i64>
+  %x1 = zext <16 x i32> %a1 to <16 x i64>
+  %sum = add <16 x i64> %x0, %x1
+  %shift = lshr <16 x i64> %sum, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
+  %res = trunc <16 x i64> %shift to <16 x i32>
+  ret <16 x i32> %res
+}
+
+define <8 x i64> @test_fixed_v8i64(<8 x i64> %a0, <8 x i64> %a1) {
+; SSE-LABEL: test_fixed_v8i64:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa %xmm3, %xmm8
+; SSE-NEXT:    pand %xmm7, %xmm8
+; SSE-NEXT:    movdqa %xmm2, %xmm9
+; SSE-NEXT:    pand %xmm6, %xmm9
+; SSE-NEXT:    movdqa %xmm1, %xmm10
+; SSE-NEXT:    pand %xmm5, %xmm10
+; SSE-NEXT:    movdqa %xmm0, %xmm11
+; SSE-NEXT:    pand %xmm4, %xmm11
+; SSE-NEXT:    pxor %xmm4, %xmm0
+; SSE-NEXT:    pxor %xmm5, %xmm1
+; SSE-NEXT:    pxor %xmm6, %xmm2
+; SSE-NEXT:    pxor %xmm7, %xmm3
+; SSE-NEXT:    psrlq $1, %xmm3
+; SSE-NEXT:    paddq %xmm8, %xmm3
+; SSE-NEXT:    psrlq $1, %xmm2
+; SSE-NEXT:    paddq %xmm9, %xmm2
+; SSE-NEXT:    psrlq $1, %xmm1
+; SSE-NEXT:    paddq %xmm10, %xmm1
+; SSE-NEXT:    psrlq $1, %xmm0
+; SSE-NEXT:    paddq %xmm11, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: test_fixed_v8i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vandps %ymm3, %ymm1, %ymm4
+; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm5
+; AVX1-NEXT:    vxorps %ymm0, %ymm2, %ymm0
+; AVX1-NEXT:    vxorps %ymm1, %ymm3, %ymm1
+; AVX1-NEXT:    vpsrlq $1, %xmm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vpsrlq $1, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrlq $1, %xmm0, %xmm3
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpsrlq $1, %xmm0, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm5, %xmm6
+; AVX1-NEXT:    vpaddq %xmm0, %xmm6, %xmm0
+; AVX1-NEXT:    vpaddq %xmm3, %xmm5, %xmm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm3, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm4, %xmm3
+; AVX1-NEXT:    vpaddq %xmm1, %xmm3, %xmm1
+; AVX1-NEXT:    vpaddq %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_fixed_v8i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm4
+; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm5
+; AVX2-NEXT:    vpxor %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vpxor %ymm1, %ymm3, %ymm1
+; AVX2-NEXT:    vpsrlq $1, %ymm1, %ymm1
+; AVX2-NEXT:    vpaddq %ymm1, %ymm4, %ymm1
+; AVX2-NEXT:    vpsrlq $1, %ymm0, %ymm0
+; AVX2-NEXT:    vpaddq %ymm0, %ymm5, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_fixed_v8i64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpandq %zmm1, %zmm0, %zmm2
+; AVX512-NEXT:    vpxorq %zmm0, %zmm1, %zmm0
+; AVX512-NEXT:    vpsrlq $1, %zmm0, %zmm0
+; AVX512-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
+; AVX512-NEXT:    retq
+  %and = and <8 x i64> %a0, %a1
+  %xor = xor <8 x i64> %a1, %a0
+  %shift = lshr <8 x i64> %xor, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
+  %res = add <8 x i64> %and, %shift
+  ret <8 x i64> %res
+}
+
+define <8 x i64> @test_ext_v8i64(<8 x i64> %a0, <8 x i64> %a1) {
+; SSE2-LABEL: test_ext_v8i64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pushq %rbp
+; SSE2-NEXT:    .cfi_def_cfa_offset 16
+; SSE2-NEXT:    pushq %r15
+; SSE2-NEXT:    .cfi_def_cfa_offset 24
+; SSE2-NEXT:    pushq %r14
+; SSE2-NEXT:    .cfi_def_cfa_offset 32
+; SSE2-NEXT:    pushq %r13
+; SSE2-NEXT:    .cfi_def_cfa_offset 40
+; SSE2-NEXT:    pushq %r12
+; SSE2-NEXT:    .cfi_def_cfa_offset 48
+; SSE2-NEXT:    pushq %rbx
+; SSE2-NEXT:    .cfi_def_cfa_offset 56
+; SSE2-NEXT:    .cfi_offset %rbx, -56
+; SSE2-NEXT:    .cfi_offset %r12, -48
+; SSE2-NEXT:    .cfi_offset %r13, -40
+; SSE2-NEXT:    .cfi_offset %r14, -32
+; SSE2-NEXT:    .cfi_offset %r15, -24
+; SSE2-NEXT:    .cfi_offset %rbp, -16
+; SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm3[2,3,2,3]
+; SSE2-NEXT:    movq %xmm3, %rbx
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3]
+; SSE2-NEXT:    movq %xmm3, %r12
+; SSE2-NEXT:    movq %xmm2, %rbp
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
+; SSE2-NEXT:    movq %xmm2, %r13
+; SSE2-NEXT:    movq %xmm1, %r15
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; SSE2-NEXT:    movq %xmm1, %r14
+; SSE2-NEXT:    movq %xmm0, %r11
+; SSE2-NEXT:    movq %xmm7, %r10
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[2,3,2,3]
+; SSE2-NEXT:    movq %xmm0, %r9
+; SSE2-NEXT:    movq %xmm6, %r8
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[2,3,2,3]
+; SSE2-NEXT:    movq %xmm0, %rdi
+; SSE2-NEXT:    movq %xmm5, %rsi
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3]
+; SSE2-NEXT:    movq %xmm0, %rdx
+; SSE2-NEXT:    movq %xmm4, %rax
+; SSE2-NEXT:    xorl %ecx, %ecx
+; SSE2-NEXT:    addq %r11, %rax
+; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT:    setb %cl
+; SSE2-NEXT:    xorl %r11d, %r11d
+; SSE2-NEXT:    addq %r14, %rdx
+; SSE2-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT:    setb %r11b
+; SSE2-NEXT:    xorl %r14d, %r14d
+; SSE2-NEXT:    addq %r15, %rsi
+; SSE2-NEXT:    setb %r14b
+; SSE2-NEXT:    xorl %r15d, %r15d
+; SSE2-NEXT:    addq %r13, %rdi
+; SSE2-NEXT:    setb %r15b
+; SSE2-NEXT:    xorl %r13d, %r13d
+; SSE2-NEXT:    addq %rbp, %r8
+; SSE2-NEXT:    setb %r13b
+; SSE2-NEXT:    xorl %ebp, %ebp
+; SSE2-NEXT:    addq %r12, %r9
+; SSE2-NEXT:    setb %bpl
+; SSE2-NEXT:    xorl %r12d, %r12d
+; SSE2-NEXT:    addq %rbx, %r10
+; SSE2-NEXT:    movq %xmm8, %rdx
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm7[2,3,2,3]
+; SSE2-NEXT:    setb %r12b
+; SSE2-NEXT:    movq %xmm0, %rax
+; SSE2-NEXT:    xorl %ebx, %ebx
+; SSE2-NEXT:    addq %rdx, %rax
+; SSE2-NEXT:    setb %bl
+; SSE2-NEXT:    shldq $63, %rax, %rbx
+; SSE2-NEXT:    shldq $63, %r10, %r12
+; SSE2-NEXT:    shldq $63, %r9, %rbp
+; SSE2-NEXT:    shldq $63, %r8, %r13
+; SSE2-NEXT:    shldq $63, %rdi, %r15
+; SSE2-NEXT:    shldq $63, %rsi, %r14
+; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; SSE2-NEXT:    shldq $63, %rax, %r11
+; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; SSE2-NEXT:    shldq $63, %rax, %rcx
+; SSE2-NEXT:    movq %rcx, %xmm0
+; SSE2-NEXT:    movq %r11, %xmm4
+; SSE2-NEXT:    movq %r14, %xmm1
+; SSE2-NEXT:    movq %r15, %xmm5
+; SSE2-NEXT:    movq %r13, %xmm2
+; SSE2-NEXT:    movq %rbp, %xmm6
+; SSE2-NEXT:    movq %r12, %xmm3
+; SSE2-NEXT:    movq %rbx, %xmm7
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0]
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm6[0]
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm7[0]
+; SSE2-NEXT:    popq %rbx
+; SSE2-NEXT:    .cfi_def_cfa_offset 48
+; SSE2-NEXT:    popq %r12
+; SSE2-NEXT:    .cfi_def_cfa_offset 40
+; SSE2-NEXT:    popq %r13
+; SSE2-NEXT:    .cfi_def_cfa_offset 32
+; SSE2-NEXT:    popq %r14
+; SSE2-NEXT:    .cfi_def_cfa_offset 24
+; SSE2-NEXT:    popq %r15
+; SSE2-NEXT:    .cfi_def_cfa_offset 16
+; SSE2-NEXT:    popq %rbp
+; SSE2-NEXT:    .cfi_def_cfa_offset 8
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_ext_v8i64:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    pushq %rbp
+; SSE4-NEXT:    .cfi_def_cfa_offset 16
+; SSE4-NEXT:    pushq %r15
+; SSE4-NEXT:    .cfi_def_cfa_offset 24
+; SSE4-NEXT:    pushq %r14
+; SSE4-NEXT:    .cfi_def_cfa_offset 32
+; SSE4-NEXT:    pushq %r13
+; SSE4-NEXT:    .cfi_def_cfa_offset 40
+; SSE4-NEXT:    pushq %r12
+; SSE4-NEXT:    .cfi_def_cfa_offset 48
+; SSE4-NEXT:    pushq %rbx
+; SSE4-NEXT:    .cfi_def_cfa_offset 56
+; SSE4-NEXT:    .cfi_offset %rbx, -56
+; SSE4-NEXT:    .cfi_offset %r12, -48
+; SSE4-NEXT:    .cfi_offset %r13, -40
+; SSE4-NEXT:    .cfi_offset %r14, -32
+; SSE4-NEXT:    .cfi_offset %r15, -24
+; SSE4-NEXT:    .cfi_offset %rbp, -16
+; SSE4-NEXT:    pextrq $1, %xmm3, %r14
+; SSE4-NEXT:    movq %xmm2, %r13
+; SSE4-NEXT:    pextrq $1, %xmm2, %rbp
+; SSE4-NEXT:    movq %xmm1, %r12
+; SSE4-NEXT:    pextrq $1, %xmm1, %r15
+; SSE4-NEXT:    movq %xmm0, %rbx
+; SSE4-NEXT:    pextrq $1, %xmm0, %r11
+; SSE4-NEXT:    pextrq $1, %xmm7, %r10
+; SSE4-NEXT:    movq %xmm6, %r9
+; SSE4-NEXT:    pextrq $1, %xmm6, %r8
+; SSE4-NEXT:    movq %xmm5, %rdi
+; SSE4-NEXT:    pextrq $1, %xmm5, %rsi
+; SSE4-NEXT:    movq %xmm4, %rdx
+; SSE4-NEXT:    pextrq $1, %xmm4, %rax
+; SSE4-NEXT:    xorl %ecx, %ecx
+; SSE4-NEXT:    addq %r11, %rax
+; SSE4-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE4-NEXT:    setb %cl
+; SSE4-NEXT:    xorl %r11d, %r11d
+; SSE4-NEXT:    addq %rbx, %rdx
+; SSE4-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE4-NEXT:    setb %r11b
+; SSE4-NEXT:    xorl %ebx, %ebx
+; SSE4-NEXT:    addq %r15, %rsi
+; SSE4-NEXT:    setb %bl
+; SSE4-NEXT:    xorl %r15d, %r15d
+; SSE4-NEXT:    addq %r12, %rdi
+; SSE4-NEXT:    setb %r15b
+; SSE4-NEXT:    xorl %r12d, %r12d
+; SSE4-NEXT:    addq %rbp, %r8
+; SSE4-NEXT:    setb %r12b
+; SSE4-NEXT:    xorl %ebp, %ebp
+; SSE4-NEXT:    addq %r13, %r9
+; SSE4-NEXT:    setb %bpl
+; SSE4-NEXT:    xorl %r13d, %r13d
+; SSE4-NEXT:    addq %r14, %r10
+; SSE4-NEXT:    movq %xmm3, %rdx
+; SSE4-NEXT:    setb %r13b
+; SSE4-NEXT:    movq %xmm7, %rax
+; SSE4-NEXT:    xorl %r14d, %r14d
+; SSE4-NEXT:    addq %rdx, %rax
+; SSE4-NEXT:    setb %r14b
+; SSE4-NEXT:    shldq $63, %rax, %r14
+; SSE4-NEXT:    shldq $63, %r10, %r13
+; SSE4-NEXT:    shldq $63, %r9, %rbp
+; SSE4-NEXT:    shldq $63, %r8, %r12
+; SSE4-NEXT:    shldq $63, %rdi, %r15
+; SSE4-NEXT:    shldq $63, %rsi, %rbx
+; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; SSE4-NEXT:    shldq $63, %rax, %r11
+; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; SSE4-NEXT:    shldq $63, %rax, %rcx
+; SSE4-NEXT:    movq %rcx, %xmm4
+; SSE4-NEXT:    movq %r11, %xmm0
+; SSE4-NEXT:    movq %rbx, %xmm5
+; SSE4-NEXT:    movq %r15, %xmm1
+; SSE4-NEXT:    movq %r12, %xmm6
+; SSE4-NEXT:    movq %rbp, %xmm2
+; SSE4-NEXT:    movq %r13, %xmm7
+; SSE4-NEXT:    movq %r14, %xmm3
+; SSE4-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
+; SSE4-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0]
+; SSE4-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm6[0]
+; SSE4-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm7[0]
+; SSE4-NEXT:    popq %rbx
+; SSE4-NEXT:    .cfi_def_cfa_offset 48
+; SSE4-NEXT:    popq %r12
+; SSE4-NEXT:    .cfi_def_cfa_offset 40
+; SSE4-NEXT:    popq %r13
+; SSE4-NEXT:    .cfi_def_cfa_offset 32
+; SSE4-NEXT:    popq %r14
+; SSE4-NEXT:    .cfi_def_cfa_offset 24
+; SSE4-NEXT:    popq %r15
+; SSE4-NEXT:    .cfi_def_cfa_offset 16
+; SSE4-NEXT:    popq %rbp
+; SSE4-NEXT:    .cfi_def_cfa_offset 8
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: test_ext_v8i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    pushq %rbp
+; AVX1-NEXT:    .cfi_def_cfa_offset 16
+; AVX1-NEXT:    pushq %r15
+; AVX1-NEXT:    .cfi_def_cfa_offset 24
+; AVX1-NEXT:    pushq %r14
+; AVX1-NEXT:    .cfi_def_cfa_offset 32
+; AVX1-NEXT:    pushq %r13
+; AVX1-NEXT:    .cfi_def_cfa_offset 40
+; AVX1-NEXT:    pushq %r12
+; AVX1-NEXT:    .cfi_def_cfa_offset 48
+; AVX1-NEXT:    pushq %rbx
+; AVX1-NEXT:    .cfi_def_cfa_offset 56
+; AVX1-NEXT:    .cfi_offset %rbx, -56
+; AVX1-NEXT:    .cfi_offset %r12, -48
+; AVX1-NEXT:    .cfi_offset %r13, -40
+; AVX1-NEXT:    .cfi_offset %r14, -32
+; AVX1-NEXT:    .cfi_offset %r15, -24
+; AVX1-NEXT:    .cfi_offset %rbp, -16
+; AVX1-NEXT:    vpextrq $1, %xmm1, %rbx
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT:    vmovq %xmm4, %r15
+; AVX1-NEXT:    vpextrq $1, %xmm4, %rbp
+; AVX1-NEXT:    vmovq %xmm0, %r13
+; AVX1-NEXT:    vpextrq $1, %xmm0, %r12
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vmovq %xmm0, %r14
+; AVX1-NEXT:    vpextrq $1, %xmm0, %r11
+; AVX1-NEXT:    vpextrq $1, %xmm3, %r10
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm0
+; AVX1-NEXT:    vmovq %xmm0, %r9
+; AVX1-NEXT:    vpextrq $1, %xmm0, %r8
+; AVX1-NEXT:    vmovq %xmm2, %rdi
+; AVX1-NEXT:    vpextrq $1, %xmm2, %rsi
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm0
+; AVX1-NEXT:    vmovq %xmm0, %rdx
+; AVX1-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX1-NEXT:    xorl %ecx, %ecx
+; AVX1-NEXT:    addq %r11, %rax
+; AVX1-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT:    setb %cl
+; AVX1-NEXT:    xorl %r11d, %r11d
+; AVX1-NEXT:    addq %r14, %rdx
+; AVX1-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT:    setb %r11b
+; AVX1-NEXT:    xorl %r14d, %r14d
+; AVX1-NEXT:    addq %r12, %rsi
+; AVX1-NEXT:    setb %r14b
+; AVX1-NEXT:    xorl %r12d, %r12d
+; AVX1-NEXT:    addq %r13, %rdi
+; AVX1-NEXT:    setb %r12b
+; AVX1-NEXT:    xorl %r13d, %r13d
+; AVX1-NEXT:    addq %rbp, %r8
+; AVX1-NEXT:    setb %r13b
+; AVX1-NEXT:    xorl %ebp, %ebp
+; AVX1-NEXT:    addq %r15, %r9
+; AVX1-NEXT:    setb %bpl
+; AVX1-NEXT:    xorl %r15d, %r15d
+; AVX1-NEXT:    addq %rbx, %r10
+; AVX1-NEXT:    vmovq %xmm1, %rdx
+; AVX1-NEXT:    setb %r15b
+; AVX1-NEXT:    vmovq %xmm3, %rax
+; AVX1-NEXT:    xorl %ebx, %ebx
+; AVX1-NEXT:    addq %rdx, %rax
+; AVX1-NEXT:    setb %bl
+; AVX1-NEXT:    shldq $63, %rax, %rbx
+; AVX1-NEXT:    shldq $63, %r10, %r15
+; AVX1-NEXT:    shldq $63, %r9, %rbp
+; AVX1-NEXT:    shldq $63, %r8, %r13
+; AVX1-NEXT:    shldq $63, %rdi, %r12
+; AVX1-NEXT:    shldq $63, %rsi, %r14
+; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX1-NEXT:    shldq $63, %rax, %r11
+; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX1-NEXT:    shldq $63, %rax, %rcx
+; AVX1-NEXT:    vmovq %rcx, %xmm0
+; AVX1-NEXT:    vmovq %r11, %xmm1
+; AVX1-NEXT:    vmovq %r14, %xmm2
+; AVX1-NEXT:    vmovq %r12, %xmm3
+; AVX1-NEXT:    vmovq %r13, %xmm4
+; AVX1-NEXT:    vmovq %rbp, %xmm5
+; AVX1-NEXT:    vmovq %r15, %xmm6
+; AVX1-NEXT:    vmovq %rbx, %xmm7
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm5[0],xmm4[0]
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm7[0],xmm6[0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT:    popq %rbx
+; AVX1-NEXT:    .cfi_def_cfa_offset 48
+; AVX1-NEXT:    popq %r12
+; AVX1-NEXT:    .cfi_def_cfa_offset 40
+; AVX1-NEXT:    popq %r13
+; AVX1-NEXT:    .cfi_def_cfa_offset 32
+; AVX1-NEXT:    popq %r14
+; AVX1-NEXT:    .cfi_def_cfa_offset 24
+; AVX1-NEXT:    popq %r15
+; AVX1-NEXT:    .cfi_def_cfa_offset 16
+; AVX1-NEXT:    popq %rbp
+; AVX1-NEXT:    .cfi_def_cfa_offset 8
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_ext_v8i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    pushq %rbp
+; AVX2-NEXT:    .cfi_def_cfa_offset 16
+; AVX2-NEXT:    pushq %r15
+; AVX2-NEXT:    .cfi_def_cfa_offset 24
+; AVX2-NEXT:    pushq %r14
+; AVX2-NEXT:    .cfi_def_cfa_offset 32
+; AVX2-NEXT:    pushq %r13
+; AVX2-NEXT:    .cfi_def_cfa_offset 40
+; AVX2-NEXT:    pushq %r12
+; AVX2-NEXT:    .cfi_def_cfa_offset 48
+; AVX2-NEXT:    pushq %rbx
+; AVX2-NEXT:    .cfi_def_cfa_offset 56
+; AVX2-NEXT:    .cfi_offset %rbx, -56
+; AVX2-NEXT:    .cfi_offset %r12, -48
+; AVX2-NEXT:    .cfi_offset %r13, -40
+; AVX2-NEXT:    .cfi_offset %r14, -32
+; AVX2-NEXT:    .cfi_offset %r15, -24
+; AVX2-NEXT:    .cfi_offset %rbp, -16
+; AVX2-NEXT:    vpextrq $1, %xmm1, %rbx
+; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm4
+; AVX2-NEXT:    vmovq %xmm4, %r15
+; AVX2-NEXT:    vpextrq $1, %xmm4, %rbp
+; AVX2-NEXT:    vmovq %xmm0, %r13
+; AVX2-NEXT:    vpextrq $1, %xmm0, %r12
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT:    vmovq %xmm0, %r14
+; AVX2-NEXT:    vpextrq $1, %xmm0, %r11
+; AVX2-NEXT:    vpextrq $1, %xmm3, %r10
+; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm0
+; AVX2-NEXT:    vmovq %xmm0, %r9
+; AVX2-NEXT:    vpextrq $1, %xmm0, %r8
+; AVX2-NEXT:    vmovq %xmm2, %rdi
+; AVX2-NEXT:    vpextrq $1, %xmm2, %rsi
+; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm0
+; AVX2-NEXT:    vmovq %xmm0, %rdx
+; AVX2-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX2-NEXT:    xorl %ecx, %ecx
+; AVX2-NEXT:    addq %r11, %rax
+; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    setb %cl
+; AVX2-NEXT:    xorl %r11d, %r11d
+; AVX2-NEXT:    addq %r14, %rdx
+; AVX2-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    setb %r11b
+; AVX2-NEXT:    xorl %r14d, %r14d
+; AVX2-NEXT:    addq %r12, %rsi
+; AVX2-NEXT:    setb %r14b
+; AVX2-NEXT:    xorl %r12d, %r12d
+; AVX2-NEXT:    addq %r13, %rdi
+; AVX2-NEXT:    setb %r12b
+; AVX2-NEXT:    xorl %r13d, %r13d
+; AVX2-NEXT:    addq %rbp, %r8
+; AVX2-NEXT:    setb %r13b
+; AVX2-NEXT:    xorl %ebp, %ebp
+; AVX2-NEXT:    addq %r15, %r9
+; AVX2-NEXT:    setb %bpl
+; AVX2-NEXT:    xorl %r15d, %r15d
+; AVX2-NEXT:    addq %rbx, %r10
+; AVX2-NEXT:    vmovq %xmm1, %rdx
+; AVX2-NEXT:    setb %r15b
+; AVX2-NEXT:    vmovq %xmm3, %rax
+; AVX2-NEXT:    xorl %ebx, %ebx
+; AVX2-NEXT:    addq %rdx, %rax
+; AVX2-NEXT:    setb %bl
+; AVX2-NEXT:    shldq $63, %rax, %rbx
+; AVX2-NEXT:    shldq $63, %r10, %r15
+; AVX2-NEXT:    shldq $63, %r9, %rbp
+; AVX2-NEXT:    shldq $63, %r8, %r13
+; AVX2-NEXT:    shldq $63, %rdi, %r12
+; AVX2-NEXT:    shldq $63, %rsi, %r14
+; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX2-NEXT:    shldq $63, %rax, %r11
+; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX2-NEXT:    shldq $63, %rax, %rcx
+; AVX2-NEXT:    vmovq %rcx, %xmm0
+; AVX2-NEXT:    vmovq %r11, %xmm1
+; AVX2-NEXT:    vmovq %r14, %xmm2
+; AVX2-NEXT:    vmovq %r12, %xmm3
+; AVX2-NEXT:    vmovq %r13, %xmm4
+; AVX2-NEXT:    vmovq %rbp, %xmm5
+; AVX2-NEXT:    vmovq %r15, %xmm6
+; AVX2-NEXT:    vmovq %rbx, %xmm7
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0]
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm5[0],xmm4[0]
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm7[0],xmm6[0]
+; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX2-NEXT:    popq %rbx
+; AVX2-NEXT:    .cfi_def_cfa_offset 48
+; AVX2-NEXT:    popq %r12
+; AVX2-NEXT:    .cfi_def_cfa_offset 40
+; AVX2-NEXT:    popq %r13
+; AVX2-NEXT:    .cfi_def_cfa_offset 32
+; AVX2-NEXT:    popq %r14
+; AVX2-NEXT:    .cfi_def_cfa_offset 24
+; AVX2-NEXT:    popq %r15
+; AVX2-NEXT:    .cfi_def_cfa_offset 16
+; AVX2-NEXT:    popq %rbp
+; AVX2-NEXT:    .cfi_def_cfa_offset 8
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_ext_v8i64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    pushq %rbp
+; AVX512-NEXT:    .cfi_def_cfa_offset 16
+; AVX512-NEXT:    pushq %r15
+; AVX512-NEXT:    .cfi_def_cfa_offset 24
+; AVX512-NEXT:    pushq %r14
+; AVX512-NEXT:    .cfi_def_cfa_offset 32
+; AVX512-NEXT:    pushq %r13
+; AVX512-NEXT:    .cfi_def_cfa_offset 40
+; AVX512-NEXT:    pushq %r12
+; AVX512-NEXT:    .cfi_def_cfa_offset 48
+; AVX512-NEXT:    pushq %rbx
+; AVX512-NEXT:    .cfi_def_cfa_offset 56
+; AVX512-NEXT:    .cfi_offset %rbx, -56
+; AVX512-NEXT:    .cfi_offset %r12, -48
+; AVX512-NEXT:    .cfi_offset %r13, -40
+; AVX512-NEXT:    .cfi_offset %r14, -32
+; AVX512-NEXT:    .cfi_offset %r15, -24
+; AVX512-NEXT:    .cfi_offset %rbp, -16
+; AVX512-NEXT:    vpextrq $1, %xmm0, %r10
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX512-NEXT:    vpextrq $1, %xmm2, %r13
+; AVX512-NEXT:    vmovq %xmm2, %r15
+; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
+; AVX512-NEXT:    vmovq %xmm2, %rbp
+; AVX512-NEXT:    vpextrq $1, %xmm2, %r12
+; AVX512-NEXT:    vextracti128 $1, %ymm2, %xmm2
+; AVX512-NEXT:    vmovq %xmm2, %r14
+; AVX512-NEXT:    vpextrq $1, %xmm2, %rbx
+; AVX512-NEXT:    vpextrq $1, %xmm1, %r9
+; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; AVX512-NEXT:    vmovq %xmm2, %r8
+; AVX512-NEXT:    vpextrq $1, %xmm2, %rdx
+; AVX512-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
+; AVX512-NEXT:    vpextrq $1, %xmm2, %rcx
+; AVX512-NEXT:    vmovq %xmm2, %r11
+; AVX512-NEXT:    vextracti128 $1, %ymm2, %xmm2
+; AVX512-NEXT:    vmovq %xmm2, %rdi
+; AVX512-NEXT:    vpextrq $1, %xmm2, %rax
+; AVX512-NEXT:    xorl %esi, %esi
+; AVX512-NEXT:    addq %rbx, %rax
+; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT:    setb %sil
+; AVX512-NEXT:    xorl %ebx, %ebx
+; AVX512-NEXT:    addq %r14, %rdi
+; AVX512-NEXT:    setb %bl
+; AVX512-NEXT:    xorl %r14d, %r14d
+; AVX512-NEXT:    addq %r12, %rcx
+; AVX512-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT:    setb %r14b
+; AVX512-NEXT:    xorl %r12d, %r12d
+; AVX512-NEXT:    addq %rbp, %r11
+; AVX512-NEXT:    setb %r12b
+; AVX512-NEXT:    xorl %ebp, %ebp
+; AVX512-NEXT:    addq %r13, %rdx
+; AVX512-NEXT:    setb %bpl
+; AVX512-NEXT:    xorl %r13d, %r13d
+; AVX512-NEXT:    addq %r15, %r8
+; AVX512-NEXT:    setb %r13b
+; AVX512-NEXT:    xorl %r15d, %r15d
+; AVX512-NEXT:    addq %r10, %r9
+; AVX512-NEXT:    vmovq %xmm0, %rcx
+; AVX512-NEXT:    setb %r15b
+; AVX512-NEXT:    vmovq %xmm1, %rax
+; AVX512-NEXT:    xorl %r10d, %r10d
+; AVX512-NEXT:    addq %rcx, %rax
+; AVX512-NEXT:    setb %r10b
+; AVX512-NEXT:    shldq $63, %rax, %r10
+; AVX512-NEXT:    shldq $63, %r9, %r15
+; AVX512-NEXT:    shldq $63, %r8, %r13
+; AVX512-NEXT:    shldq $63, %rdx, %rbp
+; AVX512-NEXT:    shldq $63, %r11, %r12
+; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512-NEXT:    shldq $63, %rax, %r14
+; AVX512-NEXT:    shldq $63, %rdi, %rbx
+; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512-NEXT:    shldq $63, %rax, %rsi
+; AVX512-NEXT:    vmovq %rsi, %xmm0
+; AVX512-NEXT:    vmovq %rbx, %xmm1
+; AVX512-NEXT:    vmovq %r14, %xmm2
+; AVX512-NEXT:    vmovq %r12, %xmm3
+; AVX512-NEXT:    vmovq %rbp, %xmm4
+; AVX512-NEXT:    vmovq %r13, %xmm5
+; AVX512-NEXT:    vmovq %r15, %xmm6
+; AVX512-NEXT:    vmovq %r10, %xmm7
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0]
+; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm5[0],xmm4[0]
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm7[0],xmm6[0]
+; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512-NEXT:    popq %rbx
+; AVX512-NEXT:    .cfi_def_cfa_offset 48
+; AVX512-NEXT:    popq %r12
+; AVX512-NEXT:    .cfi_def_cfa_offset 40
+; AVX512-NEXT:    popq %r13
+; AVX512-NEXT:    .cfi_def_cfa_offset 32
+; AVX512-NEXT:    popq %r14
+; AVX512-NEXT:    .cfi_def_cfa_offset 24
+; AVX512-NEXT:    popq %r15
+; AVX512-NEXT:    .cfi_def_cfa_offset 16
+; AVX512-NEXT:    popq %rbp
+; AVX512-NEXT:    .cfi_def_cfa_offset 8
+; AVX512-NEXT:    retq
+  %x0 = zext <8 x i64> %a0 to <8 x i128>
+  %x1 = zext <8 x i64> %a1 to <8 x i128>
+  %sum = add <8 x i128> %x0, %x1
+  %shift = lshr <8 x i128> %sum, <i128 1, i128 1, i128 1, i128 1, i128 1, i128 1, i128 1, i128 1>
+  %res = trunc <8 x i128> %shift to <8 x i64>
+  ret <8 x i64> %res
+}
+
diff --git a/llvm/test/CodeGen/X86/bitcast-and-setcc-256.ll b/llvm/test/CodeGen/X86/bitcast-and-setcc-256.ll
index 34ef23db345755..234c7a0a500d30 100644
--- a/llvm/test/CodeGen/X86/bitcast-and-setcc-256.ll
+++ b/llvm/test/CodeGen/X86/bitcast-and-setcc-256.ll
@@ -553,8 +553,8 @@ define i8 @v8i32_or_select(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2, <8 x i32
 ; AVX1-NEXT:    vpcmpeqd %xmm5, %xmm4, %xmm4
 ; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
-; AVX1-NEXT:    vorps %ymm2, %ymm3, %ymm1
-; AVX1-NEXT:    vorps %ymm0, %ymm1, %ymm0
+; AVX1-NEXT:    vorps %ymm0, %ymm3, %ymm0
+; AVX1-NEXT:    vorps %ymm2, %ymm0, %ymm0
 ; AVX1-NEXT:    vmovmskps %ymm0, %eax
 ; AVX1-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX1-NEXT:    vzeroupper
@@ -571,8 +571,8 @@ define i8 @v8i32_or_select(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2, <8 x i32
 ; AVX2-NEXT:    vpcmpeqd %ymm2, %ymm0, %ymm2
 ; AVX2-NEXT:  .LBB7_3:
 ; AVX2-NEXT:    vpcmpeqd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpor %ymm2, %ymm3, %ymm1
-; AVX2-NEXT:    vpor %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpor %ymm0, %ymm3, %ymm0
+; AVX2-NEXT:    vpor %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vmovmskps %ymm0, %eax
 ; AVX2-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX2-NEXT:    vzeroupper
diff --git a/llvm/test/CodeGen/X86/cmov.ll b/llvm/test/CodeGen/X86/cmov.ll
index 374e75967d52fc..a8c068fc5b8650 100644
--- a/llvm/test/CodeGen/X86/cmov.ll
+++ b/llvm/test/CodeGen/X86/cmov.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-unknown-unknown -disable-cgp-select2branch -x86-cmov-converter=false | FileCheck %s
+; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-unknown-unknown -disable-cgp-select2branch -x86-cmov-converter=false -mattr=+ndd --show-mc-encoding | FileCheck %s --check-prefix=NDD
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
 
 define i32 @test1(i32 %x, i32 %n, i32 %w, ptr %vp) nounwind readnone {
@@ -9,6 +10,13 @@ define i32 @test1(i32 %x, i32 %n, i32 %w, ptr %vp) nounwind readnone {
 ; CHECK-NEXT:    movl $12, %eax
 ; CHECK-NEXT:    cmovael (%rcx), %eax
 ; CHECK-NEXT:    retq
+;
+; NDD-LABEL: test1:
+; NDD:       # %bb.0: # %entry
+; NDD-NEXT:    btl %esi, %edi # encoding: [0x0f,0xa3,0xf7]
+; NDD-NEXT:    movl $12, %eax # encoding: [0xb8,0x0c,0x00,0x00,0x00]
+; NDD-NEXT:    cmovael (%rcx), %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0x01]
+; NDD-NEXT:    retq # encoding: [0xc3]
 entry:
 	%0 = lshr i32 %x, %n
 	%1 = and i32 %0, 1
@@ -25,6 +33,13 @@ define i32 @test2(i32 %x, i32 %n, i32 %w, ptr %vp) nounwind readnone {
 ; CHECK-NEXT:    movl $12, %eax
 ; CHECK-NEXT:    cmovbl (%rcx), %eax
 ; CHECK-NEXT:    retq
+;
+; NDD-LABEL: test2:
+; NDD:       # %bb.0: # %entry
+; NDD-NEXT:    btl %esi, %edi # encoding: [0x0f,0xa3,0xf7]
+; NDD-NEXT:    movl $12, %eax # encoding: [0xb8,0x0c,0x00,0x00,0x00]
+; NDD-NEXT:    cmovbl (%rcx), %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x42,0x01]
+; NDD-NEXT:    retq # encoding: [0xc3]
 entry:
 	%0 = lshr i32 %x, %n
 	%1 = and i32 %0, 1
@@ -50,6 +65,16 @@ define void @test3(i64 %a, i64 %b, i1 %p) nounwind {
 ; CHECK-NEXT:    callq bar@PLT
 ; CHECK-NEXT:    popq %rax
 ; CHECK-NEXT:    retq
+;
+; NDD-LABEL: test3:
+; NDD:       # %bb.0:
+; NDD-NEXT:    pushq %rax # encoding: [0x50]
+; NDD-NEXT:    testb $1, %dl # encoding: [0xf6,0xc2,0x01]
+; NDD-NEXT:    cmovel %esi, %edi # EVEX TO LEGACY Compression encoding: [0x0f,0x44,0xfe]
+; NDD-NEXT:    callq bar@PLT # encoding: [0xe8,A,A,A,A]
+; NDD-NEXT:    # fixup A - offset: 1, value: bar@PLT-4, kind: FK_PCRel_4
+; NDD-NEXT:    popq %rax # encoding: [0x58]
+; NDD-NEXT:    retq # encoding: [0xc3]
   %c = trunc i64 %a to i32
   %d = trunc i64 %b to i32
   %e = select i1 %p, i32 %c, i32 %d
@@ -114,6 +139,54 @@ define i1 @test4() nounwind {
 ; CHECK-NEXT:    movl %ebx, %eax
 ; CHECK-NEXT:    popq %rbx
 ; CHECK-NEXT:    retq
+;
+; NDD-LABEL: test4:
+; NDD:       # %bb.0: # %entry
+; NDD-NEXT:    movsbl g_3(%rip), %eax # encoding: [0x0f,0xbe,0x05,A,A,A,A]
+; NDD-NEXT:    # fixup A - offset: 3, value: g_3-4, kind: reloc_riprel_4byte
+; NDD-NEXT:    movzbl %al, %ecx # encoding: [0x0f,0xb6,0xc8]
+; NDD-NEXT:    shrl $7, %ecx # EVEX TO LEGACY Compression encoding: [0xc1,0xe9,0x07]
+; NDD-NEXT:    xorb $1, %cl # EVEX TO LEGACY Compression encoding: [0x80,0xf1,0x01]
+; NDD-NEXT:    sarl %cl, %eax, %ecx # encoding: [0x62,0xf4,0x74,0x18,0xd3,0xf8]
+; NDD-NEXT:    movzbl g_96(%rip), %eax # encoding: [0x0f,0xb6,0x05,A,A,A,A]
+; NDD-NEXT:    # fixup A - offset: 3, value: g_96-4, kind: reloc_riprel_4byte
+; NDD-NEXT:    testb %al, %al # encoding: [0x84,0xc0]
+; NDD-NEXT:    je .LBB3_2 # encoding: [0x74,A]
+; NDD-NEXT:    # fixup A - offset: 1, value: .LBB3_2-1, kind: FK_PCRel_1
+; NDD-NEXT:  # %bb.1: # %bb.i.i.i
+; NDD-NEXT:    movzbl g_100(%rip), %edx # encoding: [0x0f,0xb6,0x15,A,A,A,A]
+; NDD-NEXT:    # fixup A - offset: 3, value: g_100-4, kind: reloc_riprel_4byte
+; NDD-NEXT:  .LBB3_2: # %func_4.exit.i
+; NDD-NEXT:    pushq %rbx # encoding: [0x53]
+; NDD-NEXT:    xorl %edx, %edx # encoding: [0x31,0xd2]
+; NDD-NEXT:    testb %cl, %cl # encoding: [0x84,0xc9]
+; NDD-NEXT:    setne %bl # encoding: [0x0f,0x95,0xc3]
+; NDD-NEXT:    movzbl %al, %ecx # encoding: [0x0f,0xb6,0xc8]
+; NDD-NEXT:    cmovnel %edx, %ecx # EVEX TO LEGACY Compression encoding: [0x0f,0x45,0xca]
+; NDD-NEXT:    testb %al, %al # encoding: [0x84,0xc0]
+; NDD-NEXT:    je .LBB3_5 # encoding: [0x74,A]
+; NDD-NEXT:    # fixup A - offset: 1, value: .LBB3_5-1, kind: FK_PCRel_1
+; NDD-NEXT:  # %bb.3: # %func_4.exit.i
+; NDD-NEXT:    testb %bl, %bl # encoding: [0x84,0xdb]
+; NDD-NEXT:    jne .LBB3_5 # encoding: [0x75,A]
+; NDD-NEXT:    # fixup A - offset: 1, value: .LBB3_5-1, kind: FK_PCRel_1
+; NDD-NEXT:  # %bb.4: # %bb.i.i
+; NDD-NEXT:    movzbl g_100(%rip), %ecx # encoding: [0x0f,0xb6,0x0d,A,A,A,A]
+; NDD-NEXT:    # fixup A - offset: 3, value: g_100-4, kind: reloc_riprel_4byte
+; NDD-NEXT:    xorl %ebx, %ebx # encoding: [0x31,0xdb]
+; NDD-NEXT:    movl %eax, %ecx # encoding: [0x89,0xc1]
+; NDD-NEXT:  .LBB3_5: # %func_1.exit
+; NDD-NEXT:    movb %cl, g_96(%rip) # encoding: [0x88,0x0d,A,A,A,A]
+; NDD-NEXT:    # fixup A - offset: 2, value: g_96-4, kind: reloc_riprel_4byte
+; NDD-NEXT:    movzbl %cl, %esi # encoding: [0x0f,0xb6,0xf1]
+; NDD-NEXT:    movl $_2E_str, %edi # encoding: [0xbf,A,A,A,A]
+; NDD-NEXT:    # fixup A - offset: 1, value: _2E_str, kind: FK_Data_4
+; NDD-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
+; NDD-NEXT:    callq printf@PLT # encoding: [0xe8,A,A,A,A]
+; NDD-NEXT:    # fixup A - offset: 1, value: printf@PLT-4, kind: FK_PCRel_4
+; NDD-NEXT:    movl %ebx, %eax # encoding: [0x89,0xd8]
+; NDD-NEXT:    popq %rbx # encoding: [0x5b]
+; NDD-NEXT:    retq # encoding: [0xc3]
 entry:
   %0 = load i8, ptr @g_3, align 1
   %1 = sext i8 %0 to i32
@@ -163,6 +236,14 @@ define i32 @test5(ptr nocapture %P) nounwind readonly {
 ; CHECK-NEXT:    setge %al
 ; CHECK-NEXT:    orl $-2, %eax
 ; CHECK-NEXT:    retq
+;
+; NDD-LABEL: test5:
+; NDD:       # %bb.0: # %entry
+; NDD-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
+; NDD-NEXT:    cmpl $42, (%rdi) # encoding: [0x83,0x3f,0x2a]
+; NDD-NEXT:    setge %al # encoding: [0x0f,0x9d,0xc0]
+; NDD-NEXT:    orl $-2, %eax # EVEX TO LEGACY Compression encoding: [0x83,0xc8,0xfe]
+; NDD-NEXT:    retq # encoding: [0xc3]
 entry:
 	%0 = load i32, ptr %P, align 4
 	%1 = icmp sgt i32 %0, 41
@@ -178,6 +259,14 @@ define i32 @test6(ptr nocapture %P) nounwind readonly {
 ; CHECK-NEXT:    setl %al
 ; CHECK-NEXT:    leal 4(%rax,%rax,8), %eax
 ; CHECK-NEXT:    retq
+;
+; NDD-LABEL: test6:
+; NDD:       # %bb.0: # %entry
+; NDD-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
+; NDD-NEXT:    cmpl $42, (%rdi) # encoding: [0x83,0x3f,0x2a]
+; NDD-NEXT:    setl %al # encoding: [0x0f,0x9c,0xc0]
+; NDD-NEXT:    leal 4(%rax,%rax,8), %eax # encoding: [0x8d,0x44,0xc0,0x04]
+; NDD-NEXT:    retq # encoding: [0xc3]
 entry:
 	%0 = load i32, ptr %P, align 4
 	%1 = icmp sgt i32 %0, 41
@@ -194,6 +283,13 @@ define i8 @test7(i1 inreg %c, i8 inreg %a, i8 inreg %b) nounwind {
 ; CHECK-NEXT:    cmovel %edx, %eax
 ; CHECK-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq
+;
+; NDD-LABEL: test7:
+; NDD:       # %bb.0:
+; NDD-NEXT:    testb $1, %dil # encoding: [0x40,0xf6,0xc7,0x01]
+; NDD-NEXT:    cmovnel %esi, %edx, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x45,0xd6]
+; NDD-NEXT:    # kill: def $al killed $al killed $eax
+; NDD-NEXT:    retq # encoding: [0xc3]
   %d = select i1 %c, i8 %a, i8 %b
   ret i8 %d
 }
@@ -205,6 +301,13 @@ define i64 @test8(i64 %0, i64 %1, i64 %2) {
 ; CHECK-NEXT:    cmpq $-2147483648, %rdi # imm = 0x80000000
 ; CHECK-NEXT:    cmovlq %rdx, %rax
 ; CHECK-NEXT:    retq
+;
+; NDD-LABEL: test8:
+; NDD:       # %bb.0:
+; NDD-NEXT:    cmpq $-2147483648, %rdi # encoding: [0x48,0x81,0xff,0x00,0x00,0x00,0x80]
+; NDD-NEXT:    # imm = 0x80000000
+; NDD-NEXT:    cmovgeq %rsi, %rdx, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x4d,0xd6]
+; NDD-NEXT:    retq # encoding: [0xc3]
   %4 = icmp sgt i64 %0, -2147483649
   %5 = select i1 %4, i64 %1, i64 %2
   ret i64 %5
@@ -218,6 +321,14 @@ define i32 @smin(i32 %x) {
 ; CHECK-NEXT:    movl $-1, %eax
 ; CHECK-NEXT:    cmovnsl %edi, %eax
 ; CHECK-NEXT:    retq
+;
+; NDD-LABEL: smin:
+; NDD:       # %bb.0:
+; NDD-NEXT:    notl %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0xf7,0xd7]
+; NDD-NEXT:    testl %edi, %edi # encoding: [0x85,0xff]
+; NDD-NEXT:    movl $-1, %ecx # encoding: [0xb9,0xff,0xff,0xff,0xff]
+; NDD-NEXT:    cmovsl %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x48,0xc1]
+; NDD-NEXT:    retq # encoding: [0xc3]
   %not_x = xor i32 %x, -1
   %1 = icmp slt i32 %not_x, -1
   %sel = select i1 %1, i32 %not_x, i32 -1
@@ -231,6 +342,13 @@ define i32 @pr47049_1(i32 %0) {
 ; CHECK-NEXT:    movl $1, %eax
 ; CHECK-NEXT:    cmovlel %edi, %eax
 ; CHECK-NEXT:    retq
+;
+; NDD-LABEL: pr47049_1:
+; NDD:       # %bb.0:
+; NDD-NEXT:    testl %edi, %edi # encoding: [0x85,0xff]
+; NDD-NEXT:    movl $1, %eax # encoding: [0xb8,0x01,0x00,0x00,0x00]
+; NDD-NEXT:    cmovlel %edi, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x4e,0xc7]
+; NDD-NEXT:    retq # encoding: [0xc3]
   %2 = icmp slt i32 %0, 1
   %3 = select i1 %2, i32 %0, i32 1
   ret i32 %3
@@ -243,6 +361,13 @@ define i32 @pr47049_2(i32 %0) {
 ; CHECK-NEXT:    movl $-1, %eax
 ; CHECK-NEXT:    cmovnsl %edi, %eax
 ; CHECK-NEXT:    retq
+;
+; NDD-LABEL: pr47049_2:
+; NDD:       # %bb.0:
+; NDD-NEXT:    testl %edi, %edi # encoding: [0x85,0xff]
+; NDD-NEXT:    movl $-1, %eax # encoding: [0xb8,0xff,0xff,0xff,0xff]
+; NDD-NEXT:    cmovnsl %edi, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x49,0xc7]
+; NDD-NEXT:    retq # encoding: [0xc3]
   %2 = icmp sgt i32 %0, -1
   %3 = select i1 %2, i32 %0, i32 -1
   ret i32 %3
@@ -255,6 +380,13 @@ define i32 @pr47049_3(i32 %0) {
 ; CHECK-NEXT:    movl $1, %eax
 ; CHECK-NEXT:    cmovgl %edi, %eax
 ; CHECK-NEXT:    retq
+;
+; NDD-LABEL: pr47049_3:
+; NDD:       # %bb.0:
+; NDD-NEXT:    testl %edi, %edi # encoding: [0x85,0xff]
+; NDD-NEXT:    movl $1, %eax # encoding: [0xb8,0x01,0x00,0x00,0x00]
+; NDD-NEXT:    cmovgl %edi, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x4f,0xc7]
+; NDD-NEXT:    retq # encoding: [0xc3]
   %2 = icmp sgt i32 %0, 1
   %3 = select i1 %2, i32 %0, i32 1
   ret i32 %3
@@ -267,6 +399,13 @@ define i32 @pr47049_4(i32 %0) {
 ; CHECK-NEXT:    movl $1, %eax
 ; CHECK-NEXT:    cmovnel %edi, %eax
 ; CHECK-NEXT:    retq
+;
+; NDD-LABEL: pr47049_4:
+; NDD:       # %bb.0:
+; NDD-NEXT:    testl %edi, %edi # encoding: [0x85,0xff]
+; NDD-NEXT:    movl $1, %eax # encoding: [0xb8,0x01,0x00,0x00,0x00]
+; NDD-NEXT:    cmovnel %edi, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x45,0xc7]
+; NDD-NEXT:    retq # encoding: [0xc3]
   %2 = icmp ugt i32 %0, 1
   %3 = select i1 %2, i32 %0, i32 1
   ret i32 %3
diff --git a/llvm/test/CodeGen/X86/cmp.ll b/llvm/test/CodeGen/X86/cmp.ll
index cd1953bec774d9..30e52f06307599 100644
--- a/llvm/test/CodeGen/X86/cmp.ll
+++ b/llvm/test/CodeGen/X86/cmp.ll
@@ -416,9 +416,8 @@ define i32 @test13(i32 %mask, i32 %base, i32 %intra) {
 ;
 ; NDD-LABEL: test13:
 ; NDD:       # %bb.0:
-; NDD-NEXT:    movl %esi, %eax # encoding: [0x89,0xf0]
 ; NDD-NEXT:    testb $8, %dil # encoding: [0x40,0xf6,0xc7,0x08]
-; NDD-NEXT:    cmovnel %edx, %eax # encoding: [0x0f,0x45,0xc2]
+; NDD-NEXT:    cmovnel %edx, %esi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x45,0xf2]
 ; NDD-NEXT:    retq # encoding: [0xc3]
   %and = and i32 %mask, 8
   %tobool = icmp ne i32 %and, 0
@@ -436,9 +435,8 @@ define i32 @test14(i32 %mask, i32 %base, i32 %intra) {
 ;
 ; NDD-LABEL: test14:
 ; NDD:       # %bb.0:
-; NDD-NEXT:    movl %esi, %eax # encoding: [0x89,0xf0]
-; NDD-NEXT:    shrl $7, %edi, %ecx # encoding: [0x62,0xf4,0x74,0x18,0xc1,0xef,0x07]
-; NDD-NEXT:    cmovnsl %edx, %eax # encoding: [0x0f,0x49,0xc2]
+; NDD-NEXT:    shrl $7, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0xc1,0xef,0x07]
+; NDD-NEXT:    cmovnsl %edx, %esi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x49,0xf2]
 ; NDD-NEXT:    retq # encoding: [0xc3]
   %s = lshr i32 %mask, 7
   %tobool = icmp sgt i32 %s, -1
@@ -1100,9 +1098,8 @@ define { i64, i64 } @pr39968(i64, i64, i32) {
 ; NDD:       # %bb.0:
 ; NDD-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
 ; NDD-NEXT:    testb $64, %dl # encoding: [0xf6,0xc2,0x40]
-; NDD-NEXT:    cmovneq %rdi, %rsi # encoding: [0x48,0x0f,0x45,0xf7]
-; NDD-NEXT:    cmovneq %rdi, %rax # encoding: [0x48,0x0f,0x45,0xc7]
-; NDD-NEXT:    movq %rsi, %rdx # encoding: [0x48,0x89,0xf2]
+; NDD-NEXT:    cmovneq %rdi, %rsi, %rdx # encoding: [0x62,0xf4,0xec,0x18,0x45,0xf7]
+; NDD-NEXT:    cmovneq %rdi, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x45,0xc7]
 ; NDD-NEXT:    retq # encoding: [0xc3]
   %4 = and i32 %2, 64
   %5 = icmp ne i32 %4, 0
diff --git a/llvm/test/CodeGen/X86/combine-pavg.ll b/llvm/test/CodeGen/X86/combine-pavg.ll
index 0743592f3ca11b..9bb7fec7eeacbe 100644
--- a/llvm/test/CodeGen/X86/combine-pavg.ll
+++ b/llvm/test/CodeGen/X86/combine-pavg.ll
@@ -18,6 +18,22 @@ define <16 x i8> @combine_pavgb_self(<16 x i8> %a0) {
   ret <16 x i8> %1
 }
 
+define <16 x i8> @combine_pavgb_zero(<16 x i8> %a0) {
+; SSE-LABEL: combine_pavgb_zero:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pxor %xmm1, %xmm1
+; SSE-NEXT:    pavgb %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_pavgb_zero:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vpavgb %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %1 = call <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> zeroinitializer, <16 x i8> %a0)
+  ret <16 x i8> %1
+}
+
 define <16 x i8> @combine_pavgw_knownbits(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> %a2, <8 x i16> %a3) {
 ; SSE-LABEL: combine_pavgw_knownbits:
 ; SSE:       # %bb.0:
diff --git a/llvm/test/CodeGen/X86/combine-sra.ll b/llvm/test/CodeGen/X86/combine-sra.ll
index 0675ced68d7a7a..7eee418742ddb5 100644
--- a/llvm/test/CodeGen/X86/combine-sra.ll
+++ b/llvm/test/CodeGen/X86/combine-sra.ll
@@ -521,3 +521,276 @@ define <4 x i32> @combine_vec_ashr_positive_splat(<4 x i32> %x, <4 x i32> %y) {
   %2 = ashr <4 x i32> %1, <i32 10, i32 10, i32 10, i32 10>
   ret <4 x i32> %2
 }
+
+define <8 x i16> @combine_vec8i16_ashr_clamped(<8 x i16> %x, <8 x i16> %y) {
+; SSE2-LABEL: combine_vec8i16_ashr_clamped:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    psubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE2-NEXT:    psubw %xmm2, %xmm1
+; SSE2-NEXT:    psllw $12, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    psraw $15, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm3
+; SSE2-NEXT:    pandn %xmm0, %xmm3
+; SSE2-NEXT:    psraw $8, %xmm0
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    paddw %xmm1, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    psraw $15, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm3
+; SSE2-NEXT:    pandn %xmm0, %xmm3
+; SSE2-NEXT:    psraw $4, %xmm0
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    paddw %xmm1, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    psraw $15, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm3
+; SSE2-NEXT:    pandn %xmm0, %xmm3
+; SSE2-NEXT:    psraw $2, %xmm0
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    paddw %xmm1, %xmm1
+; SSE2-NEXT:    psraw $15, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    pandn %xmm0, %xmm2
+; SSE2-NEXT:    psraw $1, %xmm0
+; SSE2-NEXT:    pand %xmm1, %xmm0
+; SSE2-NEXT:    por %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: combine_vec8i16_ashr_clamped:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    pminuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    psllw $12, %xmm0
+; SSE41-NEXT:    psllw $4, %xmm1
+; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    paddw %xmm0, %xmm1
+; SSE41-NEXT:    movdqa %xmm2, %xmm3
+; SSE41-NEXT:    psraw $8, %xmm3
+; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT:    movdqa %xmm2, %xmm3
+; SSE41-NEXT:    psraw $4, %xmm3
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT:    movdqa %xmm2, %xmm3
+; SSE41-NEXT:    psraw $2, %xmm3
+; SSE41-NEXT:    paddw %xmm1, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT:    movdqa %xmm2, %xmm3
+; SSE41-NEXT:    psraw $1, %xmm3
+; SSE41-NEXT:    paddw %xmm1, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX2-LABEL: combine_vec8i16_ashr_clamped:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpminuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0
+; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX2-NEXT:    vpsravd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: combine_vec8i16_ashr_clamped:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsravw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+  %1 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %y, <8 x i16> <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>)
+  %2 = ashr <8 x i16> %x, %1
+  ret <8 x i16> %2
+}
+
+define <4 x i32> @combine_vec4i32_ashr_clamped(<4 x i32> %x, <4 x i32> %y) {
+; SSE2-LABEL: combine_vec4i32_ashr_clamped:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT:    pxor %xmm1, %xmm2
+; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm3
+; SSE2-NEXT:    pandn %xmm1, %xmm3
+; SSE2-NEXT:    psrld $27, %xmm2
+; SSE2-NEXT:    por %xmm3, %xmm2
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm2[2,3,3,3,4,5,6,7]
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    psrad %xmm1, %xmm3
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm2[0,1,1,1,4,5,6,7]
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrad %xmm4, %xmm1
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm2[2,3,3,3,4,5,6,7]
+; SSE2-NEXT:    movdqa %xmm0, %xmm4
+; SSE2-NEXT:    psrad %xmm3, %xmm4
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,1,4,5,6,7]
+; SSE2-NEXT:    psrad %xmm2, %xmm0
+; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1]
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,3]
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: combine_vec4i32_ashr_clamped:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
+; SSE41-NEXT:    movdqa %xmm0, %xmm3
+; SSE41-NEXT:    psrad %xmm2, %xmm3
+; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
+; SSE41-NEXT:    pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7]
+; SSE41-NEXT:    movdqa %xmm0, %xmm5
+; SSE41-NEXT:    psrad %xmm4, %xmm5
+; SSE41-NEXT:    pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7]
+; SSE41-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
+; SSE41-NEXT:    movdqa %xmm0, %xmm3
+; SSE41-NEXT:    psrad %xmm1, %xmm3
+; SSE41-NEXT:    pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,1,4,5,6,7]
+; SSE41-NEXT:    psrad %xmm1, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: combine_vec4i32_ashr_clamped:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpsravd %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %1 = tail call <4 x i32> @llvm.umin.v4i32(<4 x i32> %y, <4 x i32> <i32 31, i32 31, i32 31, i32 31>)
+  %2 = ashr <4 x i32> %x, %1
+  ret <4 x i32> %2
+}
+
+define <4 x i64> @combine_vec4i64_ashr_clamped(<4 x i64> %x, <4 x i64> %y) {
+; SSE2-LABEL: combine_vec4i64_ashr_clamped:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456]
+; SSE2-NEXT:    movdqa %xmm3, %xmm4
+; SSE2-NEXT:    pxor %xmm5, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm7 = [2147483711,2147483711,2147483711,2147483711]
+; SSE2-NEXT:    movdqa %xmm7, %xmm8
+; SSE2-NEXT:    pcmpgtd %xmm6, %xmm8
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE2-NEXT:    pcmpeqd %xmm5, %xmm4
+; SSE2-NEXT:    pand %xmm8, %xmm4
+; SSE2-NEXT:    movdqa {{.*#+}} xmm6 = [63,63]
+; SSE2-NEXT:    pand %xmm4, %xmm3
+; SSE2-NEXT:    pandn %xmm6, %xmm4
+; SSE2-NEXT:    por %xmm3, %xmm4
+; SSE2-NEXT:    movdqa %xmm2, %xmm3
+; SSE2-NEXT:    pxor %xmm5, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm3[0,0,2,2]
+; SSE2-NEXT:    pcmpgtd %xmm8, %xmm7
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSE2-NEXT:    pcmpeqd %xmm5, %xmm3
+; SSE2-NEXT:    pand %xmm7, %xmm3
+; SSE2-NEXT:    pand %xmm3, %xmm2
+; SSE2-NEXT:    pandn %xmm6, %xmm3
+; SSE2-NEXT:    por %xmm2, %xmm3
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; SSE2-NEXT:    movdqa %xmm2, %xmm5
+; SSE2-NEXT:    psrlq %xmm3, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm3[2,3,2,3]
+; SSE2-NEXT:    movdqa %xmm2, %xmm7
+; SSE2-NEXT:    psrlq %xmm6, %xmm7
+; SSE2-NEXT:    movsd {{.*#+}} xmm7 = xmm5[0],xmm7[1]
+; SSE2-NEXT:    movdqa %xmm0, %xmm5
+; SSE2-NEXT:    psrlq %xmm3, %xmm5
+; SSE2-NEXT:    psrlq %xmm6, %xmm0
+; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm5[0],xmm0[1]
+; SSE2-NEXT:    xorpd %xmm7, %xmm0
+; SSE2-NEXT:    psubq %xmm7, %xmm0
+; SSE2-NEXT:    movdqa %xmm2, %xmm3
+; SSE2-NEXT:    psrlq %xmm4, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[2,3,2,3]
+; SSE2-NEXT:    psrlq %xmm5, %xmm2
+; SSE2-NEXT:    movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1]
+; SSE2-NEXT:    movdqa %xmm1, %xmm3
+; SSE2-NEXT:    psrlq %xmm4, %xmm3
+; SSE2-NEXT:    psrlq %xmm5, %xmm1
+; SSE2-NEXT:    movsd {{.*#+}} xmm1 = xmm3[0],xmm1[1]
+; SSE2-NEXT:    xorpd %xmm2, %xmm1
+; SSE2-NEXT:    psubq %xmm2, %xmm1
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: combine_vec4i64_ashr_clamped:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm4
+; SSE41-NEXT:    movdqa {{.*#+}} xmm7 = [9223372039002259456,9223372039002259456]
+; SSE41-NEXT:    movdqa %xmm3, %xmm0
+; SSE41-NEXT:    pxor %xmm7, %xmm0
+; SSE41-NEXT:    movdqa {{.*#+}} xmm8 = [9223372039002259519,9223372039002259519]
+; SSE41-NEXT:    movdqa %xmm8, %xmm6
+; SSE41-NEXT:    pcmpeqd %xmm0, %xmm6
+; SSE41-NEXT:    pshufd {{.*#+}} xmm9 = xmm0[0,0,2,2]
+; SSE41-NEXT:    movdqa {{.*#+}} xmm5 = [2147483711,2147483711,2147483711,2147483711]
+; SSE41-NEXT:    movdqa %xmm5, %xmm0
+; SSE41-NEXT:    pcmpgtd %xmm9, %xmm0
+; SSE41-NEXT:    pand %xmm6, %xmm0
+; SSE41-NEXT:    movapd {{.*#+}} xmm9 = [63,63]
+; SSE41-NEXT:    movapd %xmm9, %xmm6
+; SSE41-NEXT:    blendvpd %xmm0, %xmm3, %xmm6
+; SSE41-NEXT:    pxor %xmm2, %xmm7
+; SSE41-NEXT:    pcmpeqd %xmm7, %xmm8
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2]
+; SSE41-NEXT:    pcmpgtd %xmm0, %xmm5
+; SSE41-NEXT:    pand %xmm8, %xmm5
+; SSE41-NEXT:    movdqa %xmm5, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm9
+; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [9223372036854775808,9223372036854775808]
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    psrlq %xmm9, %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm9[2,3,2,3]
+; SSE41-NEXT:    movdqa %xmm0, %xmm5
+; SSE41-NEXT:    psrlq %xmm3, %xmm5
+; SSE41-NEXT:    pblendw {{.*#+}} xmm5 = xmm2[0,1,2,3],xmm5[4,5,6,7]
+; SSE41-NEXT:    movdqa %xmm4, %xmm2
+; SSE41-NEXT:    psrlq %xmm9, %xmm2
+; SSE41-NEXT:    psrlq %xmm3, %xmm4
+; SSE41-NEXT:    pblendw {{.*#+}} xmm4 = xmm2[0,1,2,3],xmm4[4,5,6,7]
+; SSE41-NEXT:    pxor %xmm5, %xmm4
+; SSE41-NEXT:    psubq %xmm5, %xmm4
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    psrlq %xmm6, %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm6[2,3,2,3]
+; SSE41-NEXT:    psrlq %xmm3, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
+; SSE41-NEXT:    movdqa %xmm1, %xmm2
+; SSE41-NEXT:    psrlq %xmm6, %xmm2
+; SSE41-NEXT:    psrlq %xmm3, %xmm1
+; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
+; SSE41-NEXT:    pxor %xmm0, %xmm1
+; SSE41-NEXT:    psubq %xmm0, %xmm1
+; SSE41-NEXT:    movdqa %xmm4, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX2-LABEL: combine_vec4i64_ashr_clamped:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; AVX2-NEXT:    vpxor %ymm2, %ymm1, %ymm3
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [9223372036854775870,9223372036854775870,9223372036854775870,9223372036854775870]
+; AVX2-NEXT:    vpcmpgtq %ymm4, %ymm3, %ymm3
+; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm4 = [63,63,63,63]
+; AVX2-NEXT:    vblendvpd %ymm3, %ymm4, %ymm1, %ymm1
+; AVX2-NEXT:    vpsrlvq %ymm1, %ymm2, %ymm2
+; AVX2-NEXT:    vpsrlvq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpxor %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpsubq %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: combine_vec4i64_ashr_clamped:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsravq %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    retq
+  %1 = tail call <4 x i64> @llvm.umin.v4i64(<4 x i64> %y, <4 x i64> <i64 63, i64 63, i64 63, i64 63>)
+  %2 = ashr <4 x i64> %x, %1
+  ret <4 x i64> %2
+}
diff --git a/llvm/test/CodeGen/X86/combine-sse41-intrinsics.ll b/llvm/test/CodeGen/X86/combine-sse41-intrinsics.ll
index cbb5bd09c2399a..a332b3e8908003 100644
--- a/llvm/test/CodeGen/X86/combine-sse41-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/combine-sse41-intrinsics.ll
@@ -164,14 +164,13 @@ define <4 x float> @demandedbits_sitofp_blendvps(<4 x float> %a0, <4 x float> %a
 ; SSE-LABEL: demandedbits_sitofp_blendvps:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    movaps %xmm0, %xmm3
-; SSE-NEXT:    cvtdq2ps %xmm2, %xmm0
+; SSE-NEXT:    movaps %xmm2, %xmm0
 ; SSE-NEXT:    blendvps %xmm0, %xmm1, %xmm3
 ; SSE-NEXT:    movaps %xmm3, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: demandedbits_sitofp_blendvps:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vcvtdq2ps %xmm2, %xmm2
 ; AVX-NEXT:    vblendvps %xmm2, %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %cvt = sitofp <4 x i32> %a2 to <4 x float>
diff --git a/llvm/test/CodeGen/X86/fast-regalloc-live-out-debug-values.mir b/llvm/test/CodeGen/X86/fast-regalloc-live-out-debug-values.mir
index 56cbe3f7b56388..37a90a2f16d671 100644
--- a/llvm/test/CodeGen/X86/fast-regalloc-live-out-debug-values.mir
+++ b/llvm/test/CodeGen/X86/fast-regalloc-live-out-debug-values.mir
@@ -119,6 +119,7 @@
 name:            foo
 tracksRegLiveness: true
 frameInfo:
+  adjustsStack:    true
   hasCalls:        true
 stack:
   - { id: 0, name: a.addr, size: 4, alignment: 4, debug-info-variable: '!11',
diff --git a/llvm/test/CodeGen/X86/heap-alloc-markers.mir b/llvm/test/CodeGen/X86/heap-alloc-markers.mir
index 0bf83657cb06c0..6e0dc50bac0e19 100644
--- a/llvm/test/CodeGen/X86/heap-alloc-markers.mir
+++ b/llvm/test/CodeGen/X86/heap-alloc-markers.mir
@@ -34,6 +34,7 @@ name: test
 # CHECK-LABEL: {{^}}test:
 tracksRegLiveness: true
 frameInfo:
+  adjustsStack:    true
   hasCalls: true
 body: |
   bb.0.entry:
diff --git a/llvm/test/CodeGen/X86/instr-symbols.mir b/llvm/test/CodeGen/X86/instr-symbols.mir
index a900288d70869a..7af6ca81810123 100644
--- a/llvm/test/CodeGen/X86/instr-symbols.mir
+++ b/llvm/test/CodeGen/X86/instr-symbols.mir
@@ -23,6 +23,7 @@ name: test
 # CHECK-LABEL: {{^}}test:
 tracksRegLiveness: true
 frameInfo:
+  adjustsStack:    true
   hasCalls: true
 body: |
   bb.0.entry:
diff --git a/llvm/test/CodeGen/X86/int-to-fp-demanded.ll b/llvm/test/CodeGen/X86/int-to-fp-demanded.ll
new file mode 100644
index 00000000000000..8652136ae5cd94
--- /dev/null
+++ b/llvm/test/CodeGen/X86/int-to-fp-demanded.ll
@@ -0,0 +1,375 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc < %s -mtriple=i686-unknown-unknown | FileCheck %s --check-prefix=X86
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=X64
+
+declare void @use.i1(i1)
+declare void @use.i32(i32)
+define i32 @sitofp_signbit_only(i32 %i_in) nounwind {
+; X86-LABEL: sitofp_signbit_only:
+; X86:       # %bb.0:
+; X86-NEXT:    movl $-2147483648, %eax # imm = 0x80000000
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: sitofp_signbit_only:
+; X64:       # %bb.0:
+; X64-NEXT:    movd %edi, %xmm0
+; X64-NEXT:    movmskps %xmm0, %eax
+; X64-NEXT:    shll $31, %eax
+; X64-NEXT:    retq
+  %f = sitofp i32 %i_in to float
+  %i = bitcast float %f to i32
+  %r = and i32 %i, 2147483648
+  ret i32 %r
+}
+
+define i32 @sitofp_signbit_only_okay_width(i16 %i_in) nounwind {
+; X86-LABEL: sitofp_signbit_only_okay_width:
+; X86:       # %bb.0:
+; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movw %ax, {{[0-9]+}}(%esp)
+; X86-NEXT:    filds {{[0-9]+}}(%esp)
+; X86-NEXT:    fstps {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $-2147483648, %eax # imm = 0x80000000
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    addl $8, %esp
+; X86-NEXT:    retl
+;
+; X64-LABEL: sitofp_signbit_only_okay_width:
+; X64:       # %bb.0:
+; X64-NEXT:    shll $16, %edi
+; X64-NEXT:    movd %edi, %xmm0
+; X64-NEXT:    movmskps %xmm0, %eax
+; X64-NEXT:    shll $31, %eax
+; X64-NEXT:    retq
+  %f = sitofp i16 %i_in to float
+  %i = bitcast float %f to i32
+  %r = and i32 %i, 2147483648
+  ret i32 %r
+}
+
+define i32 @sitofp_signbit_only_fail_bad_width1(i64 %i_in) nounwind {
+; X86-LABEL: sitofp_signbit_only_fail_bad_width1:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    fildll {{[0-9]+}}(%esp)
+; X86-NEXT:    fstps (%esp)
+; X86-NEXT:    movl $-2147483648, %eax # imm = 0x80000000
+; X86-NEXT:    andl (%esp), %eax
+; X86-NEXT:    popl %ecx
+; X86-NEXT:    retl
+;
+; X64-LABEL: sitofp_signbit_only_fail_bad_width1:
+; X64:       # %bb.0:
+; X64-NEXT:    cvtsi2ss %rdi, %xmm0
+; X64-NEXT:    movmskps %xmm0, %eax
+; X64-NEXT:    shll $31, %eax
+; X64-NEXT:    retq
+  %f = sitofp i64 %i_in to float
+  %i = bitcast float %f to i32
+  %r = and i32 %i, 2147483648
+  ret i32 %r
+}
+
+define <2 x i16> @sitofp_signbit_only_fail_bad_width2(i32 %i_in) nounwind {
+; X86-LABEL: sitofp_signbit_only_fail_bad_width2:
+; X86:       # %bb.0:
+; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %edx, (%esp)
+; X86-NEXT:    fildl (%esp)
+; X86-NEXT:    fstps {{[0-9]+}}(%esp)
+; X86-NEXT:    shrl $16, %edx
+; X86-NEXT:    andl $32768, %edx # imm = 0x8000
+; X86-NEXT:    movl $32768, %eax # imm = 0x8000
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    # kill: def $ax killed $ax killed $eax
+; X86-NEXT:    # kill: def $dx killed $dx killed $edx
+; X86-NEXT:    addl $8, %esp
+; X86-NEXT:    retl
+;
+; X64-LABEL: sitofp_signbit_only_fail_bad_width2:
+; X64:       # %bb.0:
+; X64-NEXT:    cvtsi2ss %edi, %xmm0
+; X64-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-NEXT:    retq
+  %f = sitofp i32 %i_in to float
+  %i2xi16 = bitcast float %f to <2 x i16>
+  %r = and <2 x i16> %i2xi16, <i16 32768, i16 32768>
+  ret <2 x i16> %r
+}
+
+define i32 @sitofp_many_bits_fail(i32 %i_in) nounwind {
+; X86-LABEL: sitofp_many_bits_fail:
+; X86:       # %bb.0:
+; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, (%esp)
+; X86-NEXT:    fildl (%esp)
+; X86-NEXT:    fstps {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $-2147483647, %eax # imm = 0x80000001
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    addl $8, %esp
+; X86-NEXT:    retl
+;
+; X64-LABEL: sitofp_many_bits_fail:
+; X64:       # %bb.0:
+; X64-NEXT:    cvtsi2ss %edi, %xmm0
+; X64-NEXT:    movd %xmm0, %eax
+; X64-NEXT:    andl $-2147483647, %eax # imm = 0x80000001
+; X64-NEXT:    retq
+  %f = sitofp i32 %i_in to float
+  %i = bitcast float %f to i32
+  %r = and i32 %i, 2147483649
+  ret i32 %r
+}
+
+define i32 @sitofp_multiuse_fail(i32 %i_in) nounwind {
+; X86-LABEL: sitofp_multiuse_fail:
+; X86:       # %bb.0:
+; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    fildl {{[0-9]+}}(%esp)
+; X86-NEXT:    fsts {{[0-9]+}}(%esp)
+; X86-NEXT:    fstps (%esp)
+; X86-NEXT:    calll use.i32@PLT
+; X86-NEXT:    movl $-2147483648, %eax # imm = 0x80000000
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    addl $12, %esp
+; X86-NEXT:    retl
+;
+; X64-LABEL: sitofp_multiuse_fail:
+; X64:       # %bb.0:
+; X64-NEXT:    pushq %rbx
+; X64-NEXT:    cvtsi2ss %edi, %xmm0
+; X64-NEXT:    movd %xmm0, %ebx
+; X64-NEXT:    movl %ebx, %edi
+; X64-NEXT:    callq use.i32@PLT
+; X64-NEXT:    andl $-2147483648, %ebx # imm = 0x80000000
+; X64-NEXT:    movl %ebx, %eax
+; X64-NEXT:    popq %rbx
+; X64-NEXT:    retq
+  %f = sitofp i32 %i_in to float
+  %i = bitcast float %f to i32
+  call void @use.i32(i32 %i)
+  %r = and i32 %i, 2147483648
+  ret i32 %r
+}
+
+define i32 @sitofp_multiuse_okay(i32 %i_in) nounwind {
+; X86-LABEL: sitofp_multiuse_okay:
+; X86:       # %bb.0:
+; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    fildl {{[0-9]+}}(%esp)
+; X86-NEXT:    fsts {{[0-9]+}}(%esp)
+; X86-NEXT:    fstps (%esp)
+; X86-NEXT:    calll use.i1@PLT
+; X86-NEXT:    movl $-2147483648, %eax # imm = 0x80000000
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    addl $12, %esp
+; X86-NEXT:    retl
+;
+; X64-LABEL: sitofp_multiuse_okay:
+; X64:       # %bb.0:
+; X64-NEXT:    pushq %rbx
+; X64-NEXT:    cvtsi2ss %edi, %xmm0
+; X64-NEXT:    movd %xmm0, %ebx
+; X64-NEXT:    movl %ebx, %edi
+; X64-NEXT:    callq use.i1@PLT
+; X64-NEXT:    andl $-2147483648, %ebx # imm = 0x80000000
+; X64-NEXT:    movl %ebx, %eax
+; X64-NEXT:    popq %rbx
+; X64-NEXT:    retq
+  %f = sitofp i32 %i_in to float
+  %i = bitcast float %f to i32
+  %cmp = icmp slt i32 %i, 0
+  call void @use.i1(i32 %i)
+  %r = and i32 %i, 2147483648
+  ret i32 %r
+}
+
+define i32 @uitofp_signbit_only(i32 %i_in) nounwind {
+; X86-LABEL: uitofp_signbit_only:
+; X86:       # %bb.0:
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: uitofp_signbit_only:
+; X64:       # %bb.0:
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    retq
+  %f = uitofp i32 %i_in to float
+  %i = bitcast float %f to i32
+  %r = and i32 %i, 2147483648
+  ret i32 %r
+}
+
+define i32 @uitofp_signbit_only_okay_width(i16 %i_in) nounwind {
+; X86-LABEL: uitofp_signbit_only_okay_width:
+; X86:       # %bb.0:
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: uitofp_signbit_only_okay_width:
+; X64:       # %bb.0:
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    retq
+  %f = uitofp i16 %i_in to float
+  %i = bitcast float %f to i32
+  %r = and i32 %i, 2147483648
+  ret i32 %r
+}
+
+define i32 @uitofp_signbit_only_okay_width1(i64 %i_in) nounwind {
+; X86-LABEL: uitofp_signbit_only_okay_width1:
+; X86:       # %bb.0:
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: uitofp_signbit_only_okay_width1:
+; X64:       # %bb.0:
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    retq
+  %f = uitofp i64 %i_in to float
+  %i = bitcast float %f to i32
+  %r = and i32 %i, 2147483648
+  ret i32 %r
+}
+
+define <2 x i16> @uitofp_signbit_only_fail_bad_width2(i32 %i_in) nounwind {
+; X86-LABEL: uitofp_signbit_only_fail_bad_width2:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    andl $-8, %esp
+; X86-NEXT:    subl $16, %esp
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    fildll {{[0-9]+}}(%esp)
+; X86-NEXT:    fstps {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $32768, %eax # imm = 0x8000
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    # kill: def $ax killed $ax killed $eax
+; X86-NEXT:    xorl %edx, %edx
+; X86-NEXT:    movl %ebp, %esp
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    retl
+;
+; X64-LABEL: uitofp_signbit_only_fail_bad_width2:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    cvtsi2ss %rax, %xmm0
+; X64-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-NEXT:    retq
+  %f = uitofp i32 %i_in to float
+  %i2xi16 = bitcast float %f to <2 x i16>
+  %r = and <2 x i16> %i2xi16, <i16 32768, i16 32768>
+  ret <2 x i16> %r
+}
+
+define i32 @uitofp_many_bits_fail(i32 %i_in) nounwind {
+; X86-LABEL: uitofp_many_bits_fail:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    andl $-8, %esp
+; X86-NEXT:    subl $16, %esp
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    fildll {{[0-9]+}}(%esp)
+; X86-NEXT:    fstps {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    andl $1, %eax
+; X86-NEXT:    movl %ebp, %esp
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    retl
+;
+; X64-LABEL: uitofp_many_bits_fail:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    cvtsi2ss %rax, %xmm0
+; X64-NEXT:    movd %xmm0, %eax
+; X64-NEXT:    andl $1, %eax
+; X64-NEXT:    retq
+  %f = uitofp i32 %i_in to float
+  %i = bitcast float %f to i32
+  %r = and i32 %i, 2147483649
+  ret i32 %r
+}
+
+define i32 @uitofp_multiuse_fail(i32 %i_in) nounwind {
+; X86-LABEL: uitofp_multiuse_fail:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    andl $-8, %esp
+; X86-NEXT:    subl $16, %esp
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    fildll {{[0-9]+}}(%esp)
+; X86-NEXT:    fstps (%esp)
+; X86-NEXT:    calll use.i32@PLT
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    movl %ebp, %esp
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    retl
+;
+; X64-LABEL: uitofp_multiuse_fail:
+; X64:       # %bb.0:
+; X64-NEXT:    pushq %rax
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    cvtsi2ss %rax, %xmm0
+; X64-NEXT:    movd %xmm0, %edi
+; X64-NEXT:    callq use.i32@PLT
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    popq %rcx
+; X64-NEXT:    retq
+  %f = uitofp i32 %i_in to float
+  %i = bitcast float %f to i32
+  call void @use.i32(i32 %i)
+  %r = and i32 %i, 2147483648
+  ret i32 %r
+}
+
+define i32 @uitofp_multiuse_okay(i32 %i_in) nounwind {
+; X86-LABEL: uitofp_multiuse_okay:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    andl $-8, %esp
+; X86-NEXT:    subl $16, %esp
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    fildll {{[0-9]+}}(%esp)
+; X86-NEXT:    fstps (%esp)
+; X86-NEXT:    calll use.i1@PLT
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    movl %ebp, %esp
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    retl
+;
+; X64-LABEL: uitofp_multiuse_okay:
+; X64:       # %bb.0:
+; X64-NEXT:    pushq %rax
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    cvtsi2ss %rax, %xmm0
+; X64-NEXT:    movd %xmm0, %edi
+; X64-NEXT:    callq use.i1@PLT
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    popq %rcx
+; X64-NEXT:    retq
+  %f = uitofp i32 %i_in to float
+  %i = bitcast float %f to i32
+  %cmp = icmp slt i32 %i, 0
+  call void @use.i1(i32 %i)
+  %r = and i32 %i, 2147483648
+  ret i32 %r
+}
diff --git a/llvm/test/CodeGen/X86/isel-select-cmov.ll b/llvm/test/CodeGen/X86/isel-select-cmov.ll
index 0e5293c9000f04..39a20bf6637bb8 100644
--- a/llvm/test/CodeGen/X86/isel-select-cmov.ll
+++ b/llvm/test/CodeGen/X86/isel-select-cmov.ll
@@ -13,6 +13,8 @@
 ; RUN: llc < %s -global-isel -global-isel-abort=1 -mtriple=i686-apple-darwin10 -verify-machineinstrs              | FileCheck %s --check-prefix=GISEL-X86
 ; RUN: llc < %s -global-isel -global-isel-abort=1 -mtriple=i686-apple-darwin10 -verify-machineinstrs -mattr=+cmov | FileCheck %s --check-prefix=GISEL-X86-CMOV
 
+; RUN: llc < %s -fast-isel -fast-isel-abort=1 -mtriple=x86_64-apple-darwin10 -verify-machineinstrs -mattr=+ndd    | FileCheck %s --check-prefix=NDD
+
 ; Test conditional move for the supported types (i16, i32, and i32) and
 ; conditon input (argument or cmp).
 ; When cmov is not available (i8 type or X86), the branch is expected.
@@ -114,6 +116,16 @@ define zeroext i8 @select_cmov_i8(i1 zeroext %cond, i8 zeroext %a, i8 zeroext %b
 ; GISEL-X86-CMOV-NEXT:    cmovnew %dx, %ax
 ; GISEL-X86-CMOV-NEXT:    ## kill: def $al killed $al killed $eax
 ; GISEL-X86-CMOV-NEXT:    retl
+;
+; NDD-LABEL: select_cmov_i8:
+; NDD:       ## %bb.0:
+; NDD-NEXT:    testb $1, %dil
+; NDD-NEXT:    jne LBB0_2
+; NDD-NEXT:  ## %bb.1:
+; NDD-NEXT:    movl %edx, %esi
+; NDD-NEXT:  LBB0_2:
+; NDD-NEXT:    movzbl %sil, %eax
+; NDD-NEXT:    retq
   %1 = select i1 %cond, i8 %a, i8 %b
   ret i8 %1
 }
@@ -207,6 +219,13 @@ define zeroext i16 @select_cmov_i16(i1 zeroext %cond, i16 zeroext %a, i16 zeroex
 ; GISEL-X86-CMOV-NEXT:    cmovnew %dx, %ax
 ; GISEL-X86-CMOV-NEXT:    ## kill: def $ax killed $ax killed $eax
 ; GISEL-X86-CMOV-NEXT:    retl
+;
+; NDD-LABEL: select_cmov_i16:
+; NDD:       ## %bb.0:
+; NDD-NEXT:    testb $1, %dil
+; NDD-NEXT:    cmovnew %si, %dx, %ax
+; NDD-NEXT:    movzwl %ax, %eax
+; NDD-NEXT:    retq
   %1 = select i1 %cond, i16 %a, i16 %b
   ret i16 %1
 }
@@ -305,6 +324,13 @@ define zeroext i16 @select_cmp_cmov_i16(i16 zeroext %a, i16 zeroext %b) {
 ; GISEL-X86-CMOV-NEXT:    cmovew %cx, %ax
 ; GISEL-X86-CMOV-NEXT:    ## kill: def $ax killed $ax killed $eax
 ; GISEL-X86-CMOV-NEXT:    retl
+;
+; NDD-LABEL: select_cmp_cmov_i16:
+; NDD:       ## %bb.0:
+; NDD-NEXT:    cmpw %si, %di
+; NDD-NEXT:    cmovbw %di, %si, %ax
+; NDD-NEXT:    movzwl %ax, %eax
+; NDD-NEXT:    retq
   %1 = icmp ult i16 %a, %b
   %2 = select i1 %1, i16 %a, i16 %b
   ret i16 %2
@@ -391,6 +417,12 @@ define i32 @select_cmov_i32(i1 zeroext %cond, i32 %a, i32 %b) {
 ; GISEL-X86-CMOV-NEXT:    testl %ecx, %ecx
 ; GISEL-X86-CMOV-NEXT:    cmovnel {{[0-9]+}}(%esp), %eax
 ; GISEL-X86-CMOV-NEXT:    retl
+;
+; NDD-LABEL: select_cmov_i32:
+; NDD:       ## %bb.0:
+; NDD-NEXT:    testb $1, %dil
+; NDD-NEXT:    cmovnel %esi, %edx, %eax
+; NDD-NEXT:    retq
   %1 = select i1 %cond, i32 %a, i32 %b
   ret i32 %1
 }
@@ -482,6 +514,12 @@ define i32 @select_cmp_cmov_i32(i32 %a, i32 %b) {
 ; GISEL-X86-CMOV-NEXT:    andl $1, %edx
 ; GISEL-X86-CMOV-NEXT:    cmovel %ecx, %eax
 ; GISEL-X86-CMOV-NEXT:    retl
+;
+; NDD-LABEL: select_cmp_cmov_i32:
+; NDD:       ## %bb.0:
+; NDD-NEXT:    cmpl %esi, %edi
+; NDD-NEXT:    cmovbl %edi, %esi, %eax
+; NDD-NEXT:    retq
   %1 = icmp ult i32 %a, %b
   %2 = select i1 %1, i32 %a, i32 %b
   ret i32 %2
@@ -584,6 +622,12 @@ define i64 @select_cmov_i64(i1 zeroext %cond, i64 %a, i64 %b) {
 ; GISEL-X86-CMOV-NEXT:    cmovnel {{[0-9]+}}(%esp), %eax
 ; GISEL-X86-CMOV-NEXT:    cmovnel {{[0-9]+}}(%esp), %edx
 ; GISEL-X86-CMOV-NEXT:    retl
+;
+; NDD-LABEL: select_cmov_i64:
+; NDD:       ## %bb.0:
+; NDD-NEXT:    testb $1, %dil
+; NDD-NEXT:    cmovneq %rsi, %rdx, %rax
+; NDD-NEXT:    retq
   %1 = select i1 %cond, i64 %a, i64 %b
   ret i64 %1
 }
@@ -754,6 +798,12 @@ define i64 @select_cmp_cmov_i64(i64 %a, i64 %b) nounwind {
 ; GISEL-X86-CMOV-NEXT:    popl %ebx
 ; GISEL-X86-CMOV-NEXT:    popl %ebp
 ; GISEL-X86-CMOV-NEXT:    retl
+;
+; NDD-LABEL: select_cmp_cmov_i64:
+; NDD:       ## %bb.0:
+; NDD-NEXT:    cmpq %rsi, %rdi
+; NDD-NEXT:    cmovbq %rdi, %rsi, %rax
+; NDD-NEXT:    retq
   %1 = icmp ult i64 %a, %b
   %2 = select i1 %1, i64 %a, i64 %b
   ret i64 %2
diff --git a/llvm/test/CodeGen/X86/pr85681.ll b/llvm/test/CodeGen/X86/pr85681.ll
new file mode 100644
index 00000000000000..3b27a0257ab43d
--- /dev/null
+++ b/llvm/test/CodeGen/X86/pr85681.ll
@@ -0,0 +1,41 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=emeraldrapids | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64    | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s
+
+; PR85681 - shift i1/vXi1 X, Y -> X as only Y==0 is defined
+
+define i32 @shl(i32 %a0) {
+; CHECK-LABEL: shl:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl $-1, %eax
+; CHECK-NEXT:    retq
+  %v0 = bitcast i32 %a0 to <32 x i1>
+  %s = shl <32 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, %v0
+  %r = bitcast <32 x i1> %s to i32
+  ret i32 %r
+}
+
+define i32 @lshr(i32 %a0) {
+; CHECK-LABEL: lshr:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl $-1, %eax
+; CHECK-NEXT:    retq
+  %v0 = bitcast i32 %a0 to <32 x i1>
+  %s = lshr <32 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, %v0
+  %r = bitcast <32 x i1> %s to i32
+  ret i32 %r
+}
+
+define i32 @ashr(i32 %a0) {
+; CHECK-LABEL: ashr:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl $-1, %eax
+; CHECK-NEXT:    retq
+  %v0 = bitcast i32 %a0 to <32 x i1>
+  %s = ashr <32 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, %v0
+  %r = bitcast <32 x i1> %s to i32
+  ret i32 %r
+}
diff --git a/llvm/test/CodeGen/X86/statepoint-fixup-undef.mir b/llvm/test/CodeGen/X86/statepoint-fixup-undef.mir
index 30a68e6c2efd2a..4a18351bde493d 100644
--- a/llvm/test/CodeGen/X86/statepoint-fixup-undef.mir
+++ b/llvm/test/CodeGen/X86/statepoint-fixup-undef.mir
@@ -61,7 +61,7 @@ frameInfo:
   stackSize:       0
   offsetAdjustment: 0
   maxAlignment:    8
-  adjustsStack:    false
+  adjustsStack:    true
   hasCalls:        true
   stackProtector:  ''
   maxCallFrameSize: 4294967295
diff --git a/llvm/test/CodeGen/X86/statepoint-vreg.mir b/llvm/test/CodeGen/X86/statepoint-vreg.mir
index bfeadfc93da8f6..a0c596f249931c 100644
--- a/llvm/test/CodeGen/X86/statepoint-vreg.mir
+++ b/llvm/test/CodeGen/X86/statepoint-vreg.mir
@@ -134,6 +134,8 @@ registers:
 liveins:
   - { reg: '$rdi', virtual-reg: '%0' }
   - { reg: '$rsi', virtual-reg: '%1' }
+frameInfo:
+  adjustsStack:    true
 fixedStack:      []
 stack:           []
 callSites:       []
diff --git a/llvm/test/CodeGen/X86/tls-desc.ll b/llvm/test/CodeGen/X86/tls-desc.ll
new file mode 100644
index 00000000000000..c73986e69e7918
--- /dev/null
+++ b/llvm/test/CodeGen/X86/tls-desc.ll
@@ -0,0 +1,199 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc < %s -mtriple=i686 --relocation-model=pic -enable-tlsdesc | FileCheck %s --check-prefix=X86
+; RUN: llc < %s -mtriple=x86_64-pc-linux-gnux32 --relocation-model=pic -enable-tlsdesc | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64 --relocation-model=pic -enable-tlsdesc | FileCheck %s --check-prefix=X64
+
+@x = thread_local global i32 0, align 4
+@y = internal thread_local global i32 1, align 4
+@z = external hidden thread_local global i32, align 4
+
+define ptr @f1() nounwind {
+; X86-LABEL: f1:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    calll .L0$pb
+; X86-NEXT:  .L0$pb:
+; X86-NEXT:    popl %ebx
+; X86-NEXT:  .Ltmp0:
+; X86-NEXT:    addl $_GLOBAL_OFFSET_TABLE_+(.Ltmp0-.L0$pb), %ebx
+; X86-NEXT:    #APP
+; X86-NEXT:    #NO_APP
+; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT:    leal x@tlsdesc(%ebx), %eax
+; X86-NEXT:    calll *x@tlscall(%eax)
+; X86-NEXT:    addl %gs:0, %eax
+; X86-NEXT:    movl (%esp), %ebx # 4-byte Reload
+; X86-NEXT:    #APP
+; X86-NEXT:    #NO_APP
+; X86-NEXT:    addl $4, %esp
+; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    retl
+;
+; X32-LABEL: f1:
+; X32:       # %bb.0:
+; X32-NEXT:    pushq %rax
+; X32-NEXT:    #APP
+; X32-NEXT:    #NO_APP
+; X32-NEXT:    leal x@tlsdesc(%rip), %eax
+; X32-NEXT:    callq *x@tlscall(%eax)
+; X32-NEXT:    # kill: def $eax killed $eax def $rax
+; X32-NEXT:    addl %fs:0, %eax
+; X32-NEXT:    #APP
+; X32-NEXT:    #NO_APP
+; X32-NEXT:    popq %rcx
+; X32-NEXT:    retq
+;
+; X64-LABEL: f1:
+; X64:       # %bb.0:
+; X64-NEXT:    pushq %rax
+; X64-NEXT:    #APP
+; X64-NEXT:    #NO_APP
+; X64-NEXT:    leaq x@tlsdesc(%rip), %rax
+; X64-NEXT:    callq *x@tlscall(%rax)
+; X64-NEXT:    addq %fs:0, %rax
+; X64-NEXT:    #APP
+; X64-NEXT:    #NO_APP
+; X64-NEXT:    popq %rcx
+; X64-NEXT:    retq
+  %a = call { i32, i32, i32, i32, i32, i32 } asm sideeffect "", "=r,=r,=r,=r,=r,=r,~{dirflag},~{fpsr},~{flags}"()
+  %b = call ptr @llvm.threadlocal.address.p0(ptr @x)
+  %a.0 = extractvalue { i32, i32, i32, i32, i32, i32 } %a, 0
+  %a.1 = extractvalue { i32, i32, i32, i32, i32, i32 } %a, 1
+  %a.2 = extractvalue { i32, i32, i32, i32, i32, i32 } %a, 2
+  %a.3 = extractvalue { i32, i32, i32, i32, i32, i32 } %a, 3
+  %a.4 = extractvalue { i32, i32, i32, i32, i32, i32 } %a, 4
+  %a.5 = extractvalue { i32, i32, i32, i32, i32, i32 } %a, 5
+  call void asm sideeffect "", "r,r,r,r,r,r,~{dirflag},~{fpsr},~{flags}"(i32 %a.0, i32 %a.1, i32 %a.2, i32 %a.3, i32 %a.4, i32 %a.5)
+  ret ptr %b
+}
+
+define i32 @f2() nounwind {
+; X86-LABEL: f2:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    calll .L1$pb
+; X86-NEXT:  .L1$pb:
+; X86-NEXT:    popl %ebx
+; X86-NEXT:  .Ltmp1:
+; X86-NEXT:    addl $_GLOBAL_OFFSET_TABLE_+(.Ltmp1-.L1$pb), %ebx
+; X86-NEXT:    movl %gs:0, %ecx
+; X86-NEXT:    leal x@tlsdesc(%ebx), %eax
+; X86-NEXT:    calll *x@tlscall(%eax)
+; X86-NEXT:    movl (%eax,%ecx), %eax
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    retl
+;
+; X32-LABEL: f2:
+; X32:       # %bb.0:
+; X32-NEXT:    pushq %rax
+; X32-NEXT:    movl %fs:0, %ecx
+; X32-NEXT:    leal x@tlsdesc(%rip), %eax
+; X32-NEXT:    callq *x@tlscall(%eax)
+; X32-NEXT:    movl (%eax,%ecx), %eax
+; X32-NEXT:    popq %rcx
+; X32-NEXT:    retq
+;
+; X64-LABEL: f2:
+; X64:       # %bb.0:
+; X64-NEXT:    pushq %rax
+; X64-NEXT:    movq %fs:0, %rcx
+; X64-NEXT:    leaq x@tlsdesc(%rip), %rax
+; X64-NEXT:    callq *x@tlscall(%rax)
+; X64-NEXT:    movl (%rax,%rcx), %eax
+; X64-NEXT:    popq %rcx
+; X64-NEXT:    retq
+  %1 = tail call ptr @llvm.threadlocal.address.p0(ptr @x)
+  %2 = load i32, ptr %1
+  ret i32 %2
+}
+
+define ptr @f3() nounwind {
+; X86-LABEL: f3:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    calll .L2$pb
+; X86-NEXT:  .L2$pb:
+; X86-NEXT:    popl %ebx
+; X86-NEXT:  .Ltmp2:
+; X86-NEXT:    addl $_GLOBAL_OFFSET_TABLE_+(.Ltmp2-.L2$pb), %ebx
+; X86-NEXT:    leal x@tlsdesc(%ebx), %eax
+; X86-NEXT:    calll *x@tlscall(%eax)
+; X86-NEXT:    addl %gs:0, %eax
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    retl
+;
+; X32-LABEL: f3:
+; X32:       # %bb.0:
+; X32-NEXT:    pushq %rax
+; X32-NEXT:    leal x@tlsdesc(%rip), %eax
+; X32-NEXT:    callq *x@tlscall(%eax)
+; X32-NEXT:    # kill: def $eax killed $eax def $rax
+; X32-NEXT:    addl %fs:0, %eax
+; X32-NEXT:    popq %rcx
+; X32-NEXT:    retq
+;
+; X64-LABEL: f3:
+; X64:       # %bb.0:
+; X64-NEXT:    pushq %rax
+; X64-NEXT:    leaq x@tlsdesc(%rip), %rax
+; X64-NEXT:    callq *x@tlscall(%rax)
+; X64-NEXT:    addq %fs:0, %rax
+; X64-NEXT:    popq %rcx
+; X64-NEXT:    retq
+  %1 = tail call ptr @llvm.threadlocal.address.p0(ptr @x)
+  ret ptr %1
+}
+
+define i32 @f4() nounwind {
+; X86-LABEL: f4:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    calll .L3$pb
+; X86-NEXT:  .L3$pb:
+; X86-NEXT:    popl %ebx
+; X86-NEXT:  .Ltmp3:
+; X86-NEXT:    addl $_GLOBAL_OFFSET_TABLE_+(.Ltmp3-.L3$pb), %ebx
+; X86-NEXT:    movl %gs:0, %edx
+; X86-NEXT:    leal _TLS_MODULE_BASE_@tlsdesc(%ebx), %eax
+; X86-NEXT:    calll *_TLS_MODULE_BASE_@tlscall(%eax)
+; X86-NEXT:    movl y@DTPOFF(%eax,%edx), %ecx
+; X86-NEXT:    addl z@DTPOFF(%eax,%edx), %ecx
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    retl
+;
+; X32-LABEL: f4:
+; X32:       # %bb.0:
+; X32-NEXT:    pushq %rax
+; X32-NEXT:    movl %fs:0, %edx
+; X32-NEXT:    leal _TLS_MODULE_BASE_@tlsdesc(%rip), %eax
+; X32-NEXT:    callq *_TLS_MODULE_BASE_@tlscall(%eax)
+; X32-NEXT:    movl y@DTPOFF(%eax,%edx), %ecx
+; X32-NEXT:    addl z@DTPOFF(%eax,%edx), %ecx
+; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    popq %rcx
+; X32-NEXT:    retq
+;
+; X64-LABEL: f4:
+; X64:       # %bb.0:
+; X64-NEXT:    pushq %rax
+; X64-NEXT:    movq %fs:0, %rdx
+; X64-NEXT:    leaq _TLS_MODULE_BASE_@tlsdesc(%rip), %rax
+; X64-NEXT:    callq *_TLS_MODULE_BASE_@tlscall(%rax)
+; X64-NEXT:    movl y@DTPOFF(%rax,%rdx), %ecx
+; X64-NEXT:    addl z@DTPOFF(%rax,%rdx), %ecx
+; X64-NEXT:    movl %ecx, %eax
+; X64-NEXT:    popq %rcx
+; X64-NEXT:    retq
+  %1 = load i32, ptr @y, align 4
+  %2 = load i32, ptr @z, align 4
+  %3 = add nsw i32 %1, %2
+  ret i32 %3
+}
diff --git a/llvm/test/CodeGen/X86/vpdpwssd.ll b/llvm/test/CodeGen/X86/vpdpwssd.ll
new file mode 100644
index 00000000000000..e6a07b4aeb2719
--- /dev/null
+++ b/llvm/test/CodeGen/X86/vpdpwssd.ll
@@ -0,0 +1,12 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=znver4 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vnni,+fast-dpwssd | FileCheck %s
+
+define <16 x i32> @vpdpwssd_test(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2) {
+; CHECK-LABEL: vpdpwssd_test:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpdpwssd %zmm2, %zmm1, %zmm0
+; CHECK-NEXT:    retq
+  %4 = tail call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2)
+  ret <16 x i32> %4
+}
diff --git a/llvm/test/DebugInfo/AArch64/ptrauth.ll b/llvm/test/DebugInfo/AArch64/ptrauth.ll
new file mode 100644
index 00000000000000..5d9099f05704ba
--- /dev/null
+++ b/llvm/test/DebugInfo/AArch64/ptrauth.ll
@@ -0,0 +1,70 @@
+; RUN: llc %s -filetype=obj -mtriple arm64e-apple-darwin -o - \
+; RUN:   | llvm-dwarfdump - | FileCheck %s
+
+; CHECK: DW_AT_type	(0x{{0+}}[[TY:.*]] "void *__ptrauth(4, 0, 0x04d2)")
+; CHECK: 0x{{0+}}[[TY]]: DW_TAG_LLVM_ptrauth_type
+; CHECK-NEXT: DW_AT_type {{.*}}"void *"
+; CHECK-NEXT: DW_AT_LLVM_ptrauth_key (0x04)
+; CHECK-NEXT: DW_AT_LLVM_ptrauth_extra_discriminator (0x04d2)
+
+; CHECK: DW_AT_type	(0x{{0+}}[[TY:.*]] "void *__ptrauth(4, 1, 0x04d3)")
+; CHECK: 0x{{0+}}[[TY]]: DW_TAG_LLVM_ptrauth_type
+; CHECK-NEXT: DW_AT_type {{.*}}"void *"
+; CHECK-NEXT: DW_AT_LLVM_ptrauth_key (0x04)
+; CHECK-NEXT: DW_AT_LLVM_ptrauth_address_discriminated (true)
+; CHECK-NEXT: DW_AT_LLVM_ptrauth_extra_discriminator (0x04d3)
+
+; CHECK: DW_AT_type	(0x{{0+}}[[TY:.*]] "void *__ptrauth(4, 1, 0x04d4, "isa-pointer")")
+; CHECK: 0x{{0+}}[[TY]]: DW_TAG_LLVM_ptrauth_type
+; CHECK-NEXT: DW_AT_type {{.*}}"void *"
+; CHECK-NEXT: DW_AT_LLVM_ptrauth_key (0x04)
+; CHECK-NEXT: DW_AT_LLVM_ptrauth_address_discriminated (true)
+; CHECK-NEXT: DW_AT_LLVM_ptrauth_extra_discriminator (0x04d4)
+; CHECK-NEXT: DW_AT_LLVM_ptrauth_isa_pointer	(true)
+
+; CHECK: DW_AT_type	(0x{{0+}}[[TY:.*]] "void *__ptrauth(4, 1, 0x04d5, "authenticates-null-values")")
+; CHECK: 0x{{0+}}[[TY]]: DW_TAG_LLVM_ptrauth_type
+; CHECK-NEXT: DW_AT_type {{.*}}"void *"
+; CHECK-NEXT: DW_AT_LLVM_ptrauth_key (0x04)
+; CHECK-NEXT: DW_AT_LLVM_ptrauth_address_discriminated (true)
+; CHECK-NEXT: DW_AT_LLVM_ptrauth_extra_discriminator (0x04d5)
+; CHECK-NEXT: DW_AT_LLVM_ptrauth_authenticates_null_values	(true)
+
+; CHECK: DW_AT_type	(0x{{0+}}[[TY:.*]] "void *__ptrauth(4, 1, 0x04d6, "isa-pointer,authenticates-null-values")")
+; CHECK: 0x{{0+}}[[TY]]: DW_TAG_LLVM_ptrauth_type
+; CHECK-NEXT: DW_AT_type {{.*}}"void *"
+; CHECK-NEXT: DW_AT_LLVM_ptrauth_key (0x04)
+; CHECK-NEXT: DW_AT_LLVM_ptrauth_address_discriminated (true)
+; CHECK-NEXT: DW_AT_LLVM_ptrauth_extra_discriminator (0x04d6)
+; CHECK-NEXT: DW_AT_LLVM_ptrauth_isa_pointer	(true)
+; CHECK-NEXT: DW_AT_LLVM_ptrauth_authenticates_null_values	(true)
+
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+
+@p = global ptr null, align 8, !dbg !0
+
+!llvm.dbg.cu = !{!10}
+!llvm.module.flags = !{!19, !20}
+
+!0 = !DIGlobalVariableExpression(var: !5, expr: !DIExpression())
+!1 = !DIGlobalVariableExpression(var: !6, expr: !DIExpression())
+!2 = !DIGlobalVariableExpression(var: !7, expr: !DIExpression())
+!3 = !DIGlobalVariableExpression(var: !8, expr: !DIExpression())
+!4 = !DIGlobalVariableExpression(var: !9, expr: !DIExpression())
+!5 = distinct !DIGlobalVariable(name: "p1", scope: !10, file: !11, line: 1, type: !14, isLocal: false, isDefinition: true)
+!6 = distinct !DIGlobalVariable(name: "p2", scope: !10, file: !11, line: 1, type: !15, isLocal: false, isDefinition: true)
+!7 = distinct !DIGlobalVariable(name: "p3", scope: !10, file: !11, line: 1, type: !16, isLocal: false, isDefinition: true)
+!8 = distinct !DIGlobalVariable(name: "p4", scope: !10, file: !11, line: 1, type: !17, isLocal: false, isDefinition: true)
+!9 = distinct !DIGlobalVariable(name: "p5", scope: !10, file: !11, line: 1, type: !18, isLocal: false, isDefinition: true)
+!10 = distinct !DICompileUnit(language: DW_LANG_C99, file: !11, emissionKind: FullDebug, globals: !13)
+!11 = !DIFile(filename: "/tmp/p.c", directory: "/")
+!12 = !{}
+!13 = !{!0,!1,!2,!3,!4}
+!14 = !DIDerivedType(tag: DW_TAG_LLVM_ptrauth_type, baseType: !21, ptrAuthKey: 4, ptrAuthIsAddressDiscriminated: false, ptrAuthExtraDiscriminator: 1234)
+!15 = !DIDerivedType(tag: DW_TAG_LLVM_ptrauth_type, baseType: !21, ptrAuthKey: 4, ptrAuthIsAddressDiscriminated: true, ptrAuthExtraDiscriminator: 1235)
+!16 = !DIDerivedType(tag: DW_TAG_LLVM_ptrauth_type, baseType: !21, ptrAuthKey: 4, ptrAuthIsAddressDiscriminated: true, ptrAuthExtraDiscriminator: 1236, ptrAuthIsaPointer: true)
+!17 = !DIDerivedType(tag: DW_TAG_LLVM_ptrauth_type, baseType: !21, ptrAuthKey: 4, ptrAuthIsAddressDiscriminated: true, ptrAuthExtraDiscriminator: 1237, ptrAuthAuthenticatesNullValues: true)
+!18 = !DIDerivedType(tag: DW_TAG_LLVM_ptrauth_type, baseType: !21, ptrAuthKey: 4, ptrAuthIsAddressDiscriminated: true, ptrAuthExtraDiscriminator: 1238, ptrAuthIsaPointer: true, ptrAuthAuthenticatesNullValues: true)
+!19 = !{i32 2, !"Dwarf Version", i32 4}
+!20 = !{i32 2, !"Debug Info Version", i32 3}
+!21 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: null)
diff --git a/llvm/test/DebugInfo/ARM/hardware-loop-phi-insertion.ll b/llvm/test/DebugInfo/ARM/hardware-loop-phi-insertion.ll
new file mode 100644
index 00000000000000..9240bf25b6f63f
--- /dev/null
+++ b/llvm/test/DebugInfo/ARM/hardware-loop-phi-insertion.ll
@@ -0,0 +1,84 @@
+; RUN: llc --stop-after=hardware-loops < %s | FileCheck %s
+
+;; Tests that Hardware Loop Insertion does not insert new phi nodes after debug
+;; records when they appear immediately after the last existing phi node.
+
+; CHECK-LABEL: for.body:
+; CHECK-NEXT: = phi i32
+; CHECK-NEXT: = phi i32
+; CHECK-NEXT: call void @llvm.dbg.value
+
+source_filename = "repro.c"
+target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "thumbv8.1m.main-arm-none-eabi"
+
+@z = dso_local local_unnamed_addr global i32 42, align 4, !dbg !0
+@arr = dso_local local_unnamed_addr global [10 x i32] zeroinitializer, align 4, !dbg !5
+
+define dso_local void @func1() local_unnamed_addr #0 !dbg !18 {
+entry:
+  %0 = load i32, ptr @z, align 4, !tbaa !26
+  br label %for.body, !dbg !30
+
+for.body:                                         ; preds = %entry, %for.body
+  %p1.04 = phi ptr [ @arr, %entry ], [ %incdec.ptr, %for.body ]
+  %i.03 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  tail call void @llvm.dbg.value(metadata ptr %p1.04, metadata !23, metadata !DIExpression()), !dbg !25
+  store i32 %0, ptr %p1.04, align 4, !dbg !32, !tbaa !26
+  %inc = add nuw nsw i32 %i.03, 1, !dbg !34
+  %incdec.ptr = getelementptr inbounds i8, ptr %p1.04, i32 4, !dbg !35
+  %exitcond.not = icmp eq i32 %inc, 10, !dbg !36
+  br i1 %exitcond.not, label %for.end, label %for.body, !dbg !30, !llvm.loop !37
+
+for.end:                                          ; preds = %for.body
+  ret void, !dbg !41
+}
+
+declare void @llvm.dbg.value(metadata, metadata, metadata)
+
+!llvm.dbg.cu = !{!2}
+!llvm.module.flags = !{!11, !12, !13, !14, !15, !16}
+!llvm.ident = !{!17}
+
+!0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression())
+!1 = distinct !DIGlobalVariable(name: "z", scope: !2, file: !3, line: 2, type: !8, isLocal: false, isDefinition: true)
+!2 = distinct !DICompileUnit(language: DW_LANG_C11, file: !3, producer: "clang version 19.0.0git", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, globals: !4, splitDebugInlining: false, nameTableKind: None)
+!3 = !DIFile(filename: "repro.c", directory: "/home/gbtozers/dev/upstream-llvm")
+!4 = !{!0, !5}
+!5 = !DIGlobalVariableExpression(var: !6, expr: !DIExpression())
+!6 = distinct !DIGlobalVariable(name: "arr", scope: !2, file: !3, line: 1, type: !7, isLocal: false, isDefinition: true)
+!7 = !DICompositeType(tag: DW_TAG_array_type, baseType: !8, size: 320, elements: !9)
+!8 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!9 = !{!10}
+!10 = !DISubrange(count: 10)
+!11 = !{i32 7, !"Dwarf Version", i32 5}
+!12 = !{i32 2, !"Debug Info Version", i32 3}
+!13 = !{i32 1, !"wchar_size", i32 4}
+!14 = !{i32 1, !"min_enum_size", i32 4}
+!15 = !{i32 7, !"frame-pointer", i32 2}
+!16 = !{i32 7, !"debug-info-assignment-tracking", i1 true}
+!17 = !{!"clang version 19.0.0git"}
+!18 = distinct !DISubprogram(name: "func1", scope: !3, file: !3, line: 4, type: !19, scopeLine: 5, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2, retainedNodes: !21)
+!19 = !DISubroutineType(types: !20)
+!20 = !{null}
+!21 = !{!23}
+!22 = !DILocalVariable(name: "i", scope: !18, file: !3, line: 6, type: !8)
+!23 = !DILocalVariable(name: "p1", scope: !18, file: !3, line: 7, type: !24)
+!24 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !8, size: 32)
+!25 = !DILocation(line: 0, scope: !18)
+!26 = !{!27, !27, i64 0}
+!27 = !{!"int", !28, i64 0}
+!28 = !{!"omnipotent char", !29, i64 0}
+!29 = !{!"Simple C/C++ TBAA"}
+!30 = !DILocation(line: 8, column: 3, scope: !31)
+!31 = distinct !DILexicalBlock(scope: !18, file: !3, line: 8, column: 3)
+!32 = !DILocation(line: 9, column: 10, scope: !33)
+!33 = distinct !DILexicalBlock(scope: !31, file: !3, line: 8, column: 3)
+!34 = !DILocation(line: 8, column: 27, scope: !33)
+!35 = !DILocation(line: 8, column: 32, scope: !33)
+!36 = !DILocation(line: 8, column: 21, scope: !33)
+!37 = distinct !{!37, !30, !38, !39, !40}
+!38 = !DILocation(line: 9, column: 12, scope: !31)
+!39 = !{!"llvm.loop.mustprogress"}
+!40 = !{!"llvm.loop.unroll.disable"}
+!41 = !DILocation(line: 10, column: 1, scope: !18)
diff --git a/llvm/test/DebugInfo/MIR/X86/debug-loc-0.mir b/llvm/test/DebugInfo/MIR/X86/debug-loc-0.mir
index 56a4d835aaa597..0b007456be1e6f 100644
--- a/llvm/test/DebugInfo/MIR/X86/debug-loc-0.mir
+++ b/llvm/test/DebugInfo/MIR/X86/debug-loc-0.mir
@@ -75,7 +75,7 @@ frameInfo:
   stackSize:       0
   offsetAdjustment: 0
   maxAlignment:    8
-  adjustsStack:    false
+  adjustsStack:    true
   hasCalls:        true
   stackProtector:  ''
   maxCallFrameSize: 4294967295
diff --git a/llvm/test/DebugInfo/MIR/X86/prolog-epilog-indirection.mir b/llvm/test/DebugInfo/MIR/X86/prolog-epilog-indirection.mir
index 6941467fe0e4a3..4df967ce034939 100644
--- a/llvm/test/DebugInfo/MIR/X86/prolog-epilog-indirection.mir
+++ b/llvm/test/DebugInfo/MIR/X86/prolog-epilog-indirection.mir
@@ -104,6 +104,7 @@ alignment:       16
 tracksRegLiveness: true
 frameInfo:       
   maxAlignment:    4
+  adjustsStack:    true
   hasCalls:        true
 stack:           
   - { id: 0, name: l_1081, type: default, offset: 0, size: 4, alignment: 4, 
diff --git a/llvm/test/DebugInfo/NVPTX/no-extra-loc.ll b/llvm/test/DebugInfo/NVPTX/no-extra-loc.ll
new file mode 100644
index 00000000000000..6d3a69beffe875
--- /dev/null
+++ b/llvm/test/DebugInfo/NVPTX/no-extra-loc.ll
@@ -0,0 +1,26 @@
+; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64-nvidia-cuda | %ptxas-verify %}
+
+
+define i32 @foo(i32 %a, i32 %b) !dbg !3 {
+
+; CHECK:     .loc    [[FILE:[0-9]+]] 26 0          // extra-lineinfo.cu:26:0
+; CHECK-NOT: .loc    [[FILE]]        26 0          // extra-lineinfo.cu:26:0
+; CHECK:     .file   [[FILE]] "/test/directory/extra-lineinfo.cu"
+
+  %add = add i32 %b, %a, !dbg !6
+  ret i32 %add, !dbg !6
+}
+
+!llvm.dbg.cu = !{!0}
+!nvvm.annotations = !{}
+!llvm.module.flags = !{!2}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang", isOptimized: true, runtimeVersion: 0, emissionKind: DebugDirectivesOnly)
+!1 = !DIFile(filename: "extra-lineinfo.cu", directory: "/test/directory/")
+!2 = !{i32 1, !"Debug Info Version", i32 3}
+!3 = distinct !DISubprogram(name: "kernel", linkageName: "foo", scope: !1, file: !1, line: 123, type: !4, scopeLine: 26, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!4 = !DISubroutineType(types: !5)
+!5 = !{}
+!6 = !DILocation(line: 40, column: 22, scope: !31)
+!31 = distinct !DILexicalBlock(scope: !3, file: !1, line: 3, column: 17)
diff --git a/llvm/test/DebugInfo/X86/live-debug-vars-dse.mir b/llvm/test/DebugInfo/X86/live-debug-vars-dse.mir
index 9443ed5e332c13..908889063584c2 100644
--- a/llvm/test/DebugInfo/X86/live-debug-vars-dse.mir
+++ b/llvm/test/DebugInfo/X86/live-debug-vars-dse.mir
@@ -107,7 +107,7 @@ frameInfo:
   stackSize:       0
   offsetAdjustment: 0
   maxAlignment:    8
-  adjustsStack:    false
+  adjustsStack:    true
   hasCalls:        true
   stackProtector:  ''
   maxCallFrameSize: 4294967295
diff --git a/llvm/test/DebugInfo/X86/prolog-params.mir b/llvm/test/DebugInfo/X86/prolog-params.mir
index af21bc85ccd746..6629dca810f954 100644
--- a/llvm/test/DebugInfo/X86/prolog-params.mir
+++ b/llvm/test/DebugInfo/X86/prolog-params.mir
@@ -98,6 +98,8 @@ fixedStack:
       isImmutable: true, isAliased: false, callee-saved-register: '', callee-saved-restored: true }
   - { id: 2, type: default, offset: 0, size: 4, alignment: 16, stack-id: default,
       isImmutable: true, isAliased: false, callee-saved-register: '', callee-saved-restored: true }
+frameInfo:
+  adjustsStack:    true
 stack:
   - { id: 0, name: arr, type: default, offset: 0, size: 8, alignment: 4,
       stack-id: default, callee-saved-register: '', callee-saved-restored: true }
diff --git a/llvm/test/DebugInfo/dpvalue-print-nocrash.ll b/llvm/test/DebugInfo/dpvalue-print-nocrash.ll
index 0a618c6780d1d8..d8cb542285c89b 100755
--- a/llvm/test/DebugInfo/dpvalue-print-nocrash.ll
+++ b/llvm/test/DebugInfo/dpvalue-print-nocrash.ll
@@ -1,4 +1,4 @@
-;; Tests that we can debug-print DPValues that have no markers attached.
+;; Tests that we can debug-print DbgVariableRecords that have no markers attached.
 ; RUN: opt -passes="instcombine" -debug %s -o /dev/null 2>&1 | FileCheck %s
 ; REQUIRES: asserts
 
diff --git a/llvm/test/DebugInfo/print-non-instruction-debug-info.ll b/llvm/test/DebugInfo/print-non-instruction-debug-info.ll
index 2e765619fcb896..490f24ff76ff56 100644
--- a/llvm/test/DebugInfo/print-non-instruction-debug-info.ll
+++ b/llvm/test/DebugInfo/print-non-instruction-debug-info.ll
@@ -26,6 +26,8 @@
 ; CHECK-NEXT: {{^}}  store i32 %[[VAL_ADD]]{{.+}}, !DIAssignID ![[ASSIGNID:[0-9]+]]
 ; OLDDBG-NEXT: call void @llvm.dbg.assign(metadata i32 %[[VAL_ADD]], metadata ![[VAR_B]], metadata !DIExpression(), metadata ![[ASSIGNID]], metadata ptr %[[VAL_B]], metadata !DIExpression()), !dbg ![[LOC_4:[0-9]+]]
 ; NEWDBG-NEXT: {{^}}    #dbg_assign(i32 %[[VAL_ADD]], ![[VAR_B]], !DIExpression(), ![[ASSIGNID]], ptr %[[VAL_B]], !DIExpression(), ![[LOC_4:[0-9]+]])
+; OLDDBG-NEXT: call void @llvm.dbg.assign(metadata ![[EMPTY:[0-9]+]], metadata ![[VAR_B]], metadata !DIExpression(), metadata ![[ASSIGNID]], metadata ![[EMPTY]], metadata !DIExpression()), !dbg ![[LOC_4]]
+; NEWDBG-NEXT: {{^}}    #dbg_assign(![[EMPTY:[0-9]+]], ![[VAR_B]], !DIExpression(), ![[ASSIGNID]], ![[EMPTY]], !DIExpression(), ![[LOC_4]])
 ; CHECK-NEXT: {{^}}  ret i32
 
 ; OLDDBG-DAG: declare void @llvm.dbg.value
@@ -40,6 +42,7 @@
 ; CHECK-DAG: ![[LOC_3]] = !DILocation(line: 3, column: 25
 ; CHECK-DAG: ![[LOC_4]] = !DILocation(line: 3, column: 30
 ; CHECK-DAG: ![[LABEL_ID]] = !DILabel(
+; CHECK-DAG: ![[EMPTY]] = !{}
 
 define dso_local i32 @f(i32 %a) !dbg !7 {
 entry:
@@ -51,6 +54,7 @@ entry:
   call void @llvm.dbg.label(metadata !50), !dbg !32
   store i32 %add, ptr %b, !dbg !32, !DIAssignID !40
   call void @llvm.dbg.assign(metadata i32 %add, metadata !21, metadata !DIExpression(), metadata !40, metadata ptr %b, metadata !DIExpression()), !dbg !33
+  call void @llvm.dbg.assign(metadata !2, metadata !21, metadata !DIExpression(), metadata !40, metadata !2, metadata !DIExpression()), !dbg !33
   ret i32 %add, !dbg !33
 
 }
diff --git a/llvm/test/Instrumentation/InstrProfiling/Coro/coro-split-musttail6.ll b/llvm/test/Instrumentation/InstrProfiling/Coro/coro-split-musttail6.ll
index 5d068872fcace0..4359d5305d4d91 100644
--- a/llvm/test/Instrumentation/InstrProfiling/Coro/coro-split-musttail6.ll
+++ b/llvm/test/Instrumentation/InstrProfiling/Coro/coro-split-musttail6.ll
@@ -85,7 +85,6 @@ exit:
   ret void
 }
 
-; FIXME: The fakeresume1 here should be marked as musttail.
 ; Verify that in the resume part resume call is marked with musttail.
 ; CHECK-LABEL: @f.resume(
 ; CHECK:      musttail call fastcc void @fakeresume1(ptr align 8 null)
diff --git a/llvm/test/Instrumentation/InstrProfiling/Coro/coro-split-musttail7.ll b/llvm/test/Instrumentation/InstrProfiling/Coro/coro-split-musttail7.ll
index 6ea81c6ff0b096..2a14be0f921806 100644
--- a/llvm/test/Instrumentation/InstrProfiling/Coro/coro-split-musttail7.ll
+++ b/llvm/test/Instrumentation/InstrProfiling/Coro/coro-split-musttail7.ll
@@ -88,7 +88,6 @@ exit:
   ret void
 }
 
-; FIXME: The fakeresume1 here should be marked as musttail.
 ; Verify that in the resume part resume call is marked with musttail.
 ; CHECK-LABEL: @f.resume(
 ; CHECK:         musttail call fastcc void @fakeresume1(ptr align 8 null)
diff --git a/llvm/test/Instrumentation/ThreadSanitizer/atomic.ll b/llvm/test/Instrumentation/ThreadSanitizer/atomic.ll
index 76afc4bf007c2d..8b387cd4962979 100644
--- a/llvm/test/Instrumentation/ThreadSanitizer/atomic.ll
+++ b/llvm/test/Instrumentation/ThreadSanitizer/atomic.ll
@@ -78,6 +78,26 @@ entry:
 ; CHECK-LABEL: atomic8_xchg_monotonic
 ; CHECK: call i8 @__tsan_atomic8_exchange(ptr %a, i8 0, i32 0), !dbg
 
+define void @atomic8_xchg_monotonic_ptr(ptr %a, ptr %b) nounwind uwtable {
+entry:
+  atomicrmw xchg ptr %a, ptr %b monotonic, !dbg !7
+  ret void, !dbg !7
+}
+; CHECK-LABEL: atomic8_xchg_monotonic_ptr
+; CHECK: [[ARG:%.*]] = ptrtoint ptr %b to i64, !dbg
+; CHECK: [[RES:%.*]] = call i64 @__tsan_atomic64_exchange(ptr %a, i64 [[ARG]], i32 0), !dbg
+; CHECK: [[CAST:%.*]] = inttoptr i64 [[RES]] to ptr, !dbg
+
+define void @atomic8_xchg_monotonic_float(ptr %a, float %b) nounwind uwtable {
+entry:
+  atomicrmw xchg ptr %a, float %b monotonic, !dbg !7
+  ret void, !dbg !7
+}
+; CHECK-LABEL: atomic8_xchg_monotonic_float
+; CHECK: [[ARG:%.*]] = bitcast float %b to i32, !dbg
+; CHECK: [[RES:%.*]] = call i32 @__tsan_atomic32_exchange(ptr %a, i32 [[ARG]], i32 0), !dbg
+; CHECK: [[CAST:%.*]] = bitcast i32 [[RES]] to float, !dbg
+
 define void @atomic8_add_monotonic(ptr %a) nounwind uwtable {
 entry:
   atomicrmw add ptr %a, i8 0 monotonic, !dbg !7
diff --git a/llvm/test/LTO/AArch64/link-branch-target-enforcement.ll b/llvm/test/LTO/AArch64/link-branch-target-enforcement.ll
index 74d9c86881d52c..ccf8cf67ede6dc 100644
--- a/llvm/test/LTO/AArch64/link-branch-target-enforcement.ll
+++ b/llvm/test/LTO/AArch64/link-branch-target-enforcement.ll
@@ -32,7 +32,6 @@ entry:
 ; CHECK-DUMP: <main>:
 ; CHECK-DUMP:      bl      0x8 <main+0x8>
 ; CHECK-DUMP: <foo>:
-; CHECK-DUMP:     paciasp
 
 ; `main` doesn't support BTI while `foo` does, so in the binary
 ; we should see only PAC which is supported by both.
diff --git a/llvm/test/LTO/AArch64/link-sign-return-address.ll b/llvm/test/LTO/AArch64/link-sign-return-address.ll
deleted file mode 100644
index c25857ceed7b40..00000000000000
--- a/llvm/test/LTO/AArch64/link-sign-return-address.ll
+++ /dev/null
@@ -1,43 +0,0 @@
-; Testcase to check that module with different branch-target-enforcement can
-; be mixed.
-;
-; RUN: llvm-as %s -o %t1.bc
-; RUN: llvm-as %p/Inputs/foo.ll -o %t2.bc
-; RUN: llvm-lto -exported-symbol main \
-; RUN:          -exported-symbol foo \
-; RUN:          -filetype=obj \
-; RUN:           %t2.bc %t1.bc \
-; RUN:           -o %t1.exe 2>&1
-; RUN: llvm-objdump -d %t1.exe | FileCheck --check-prefix=CHECK-DUMP %s
-; RUN: llvm-readelf -n %t1.exe | FileCheck --allow-empty --check-prefix=CHECK-PROP %s
-
-target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
-target triple = "aarch64-unknown-linux-gnu"
-
-declare i32 @foo();
-
-define i32 @main() {
-entry:
-  %add = call i32 @foo()
-  ret i32 %add
-}
-
-!llvm.module.flags = !{!0, !1, !2, !3 }
-!0 = !{i32 8, !"branch-target-enforcement", i32 0}
-!1 = !{i32 8, !"sign-return-address", i32 0}
-!2 = !{i32 8, !"sign-return-address-all", i32 0}
-!3 = !{i32 8, !"sign-return-address-with-bkey", i32 0}
-
-; CHECK-DUMP: <foo>:
-; CHECK-DUMP:     paciasp
-; CHECK-DUMP:     mov     w0, #0x2a
-; CHECK-DUMP:     autiasp
-; CHECK-DUMP:     ret
-; CHECK-DUMP: <main>:
-; CHECK-DUMP-NOT:  paciasp
-; CHECK-DUMP:      str     x30,
-; CHECK-DUMP:      bl      0x14 <main+0x4>
-
-; `main` doesn't support PAC sign-return-address while `foo` does, so in the binary
-; we should not see anything.
-; CHECK-PROP-NOT:   Properties: aarch64 feature: PAC
\ No newline at end of file
diff --git a/llvm/test/Linker/link-arm-and-thumb.ll b/llvm/test/Linker/link-arm-and-thumb.ll
index 37bd8c37f8b5e5..a90f2128e4430a 100644
--- a/llvm/test/Linker/link-arm-and-thumb.ll
+++ b/llvm/test/Linker/link-arm-and-thumb.ll
@@ -13,12 +13,11 @@ entry:
   ret i32 %add
 }
 
-; CHECK: define i32 @main() [[MAIN_ATTRS:#[0-9]+]]
+; CHECK: define i32 @main() {
 ; CHECK: define i32 @foo(i32 %a, i32 %b) [[ARM_ATTRS:#[0-9]+]]
 ; CHECK: define i32 @bar(i32 %a, i32 %b) [[THUMB_ATTRS:#[0-9]+]]
 
-; CHECK: attributes [[MAIN_ATTRS]] = { {{.*}} }
-; CHECK: attributes [[ARM_ATTRS]] = { {{.*}} "target-features"="-thumb-mode" }
-; CHECK: attributes [[THUMB_ATTRS]] = { {{.*}} "target-features"="+thumb-mode" }
+; CHECK: attributes [[ARM_ATTRS]] = { "target-features"="-thumb-mode" }
+; CHECK: attributes [[THUMB_ATTRS]] = { "target-features"="+thumb-mode" }
 
 ; STDERR-NOT: warning: Linking two modules of different target triples:
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_sop1.s b/llvm/test/MC/AMDGPU/gfx11_asm_sop1.s
index 8a7f64331317ed..c4029b0658b2e2 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_sop1.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_sop1.s
@@ -2964,6 +2964,9 @@ s_sendmsg_rtn_b32 s0, sendmsg(MSG_RTN_SAVE_WAVE)
 s_sendmsg_rtn_b32 s0, sendmsg(MSG_RTN_GET_TBA)
 // GFX11: encoding: [0x85,0x4c,0x80,0xbe]
 
+s_sendmsg_rtn_b32 s0, sendmsg(MSG_RTN_GET_TBA_TO_PC)
+// GFX11: encoding: [0x86,0x4c,0x80,0xbe]
+
 s_ctz_i32_b32 s5, s1
 // GFX11: encoding: [0x01,0x08,0x85,0xbe]
 
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_sop1.s b/llvm/test/MC/AMDGPU/gfx12_asm_sop1.s
index 4fd355f10f341a..939320e9ef2dce 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_sop1.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_sop1.s
@@ -3708,9 +3708,12 @@ s_sendmsg_rtn_b32 s0, sendmsg(MSG_RTN_SAVE_WAVE)
 s_sendmsg_rtn_b32 s0, sendmsg(MSG_RTN_GET_TBA)
 // GFX12: encoding: [0x85,0x4c,0x80,0xbe]
 
-s_sendmsg_rtn_b32 s0, sendmsg(MSG_RTN_GET_SE_AID_ID)
+s_sendmsg_rtn_b32 s0, sendmsg(MSG_RTN_GET_TBA_TO_PC)
 // GFX12: encoding: [0x86,0x4c,0x80,0xbe]
 
+s_sendmsg_rtn_b32 s0, sendmsg(MSG_RTN_GET_SE_AID_ID)
+// GFX12: encoding: [0x87,0x4c,0x80,0xbe]
+
 s_ctz_i32_b32 s5, s1
 // GFX12: encoding: [0x01,0x08,0x85,0xbe]
 
diff --git a/llvm/test/MC/AMDGPU/hsa-gfx12-v4.s b/llvm/test/MC/AMDGPU/hsa-gfx12-v4.s
index 8b90e20bb87d1f..7b591904e877fd 100644
--- a/llvm/test/MC/AMDGPU/hsa-gfx12-v4.s
+++ b/llvm/test/MC/AMDGPU/hsa-gfx12-v4.s
@@ -29,7 +29,7 @@
 // OBJDUMP-NEXT: 0000 00000000 00000000 00000000 00000000
 // OBJDUMP-NEXT: 0010 00000000 00000000 00000000 00000000
 // OBJDUMP-NEXT: 0020 00000000 00000000 00000000 00000000
-// OBJDUMP-NEXT: 0030 00000c60 80000000 00000000 00000000
+// OBJDUMP-NEXT: 0030 00000c60 80000000 00040000 00000000
 // complete
 // OBJDUMP-NEXT: 0040 01000000 01000000 08000000 00000000
 // OBJDUMP-NEXT: 0050 00000000 00000000 00000000 00000000
@@ -39,12 +39,12 @@
 // OBJDUMP-NEXT: 0080 00000000 00000000 00000000 00000000
 // OBJDUMP-NEXT: 0090 00000000 00000000 00000000 00000000
 // OBJDUMP-NEXT: 00a0 00000000 00000000 00000000 00000000
-// OBJDUMP-NEXT: 00b0 00000060 80000000 00000000 00000000
+// OBJDUMP-NEXT: 00b0 00000060 80000000 00040000 00000000
 // disabled_user_sgpr
 // OBJDUMP-NEXT: 00c0 00000000 00000000 00000000 00000000
 // OBJDUMP-NEXT: 00d0 00000000 00000000 00000000 00000000
 // OBJDUMP-NEXT: 00e0 00000000 00000000 00000000 00000000
-// OBJDUMP-NEXT: 00f0 00000c60 80000000 00000000 00000000
+// OBJDUMP-NEXT: 00f0 00000c60 80000000 00040000 00000000
 
 .text
 // ASM: .text
diff --git a/llvm/test/MC/AMDGPU/mcexpr_amd.s b/llvm/test/MC/AMDGPU/mcexpr_amd.s
new file mode 100644
index 00000000000000..a9639c3acc305a
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/mcexpr_amd.s
@@ -0,0 +1,130 @@
+// RUN: llvm-mc -triple amdgcn-amd-amdhsa < %s | FileCheck --check-prefix=ASM %s
+// RUN: llvm-mc -triple amdgcn-amd-amdhsa -filetype=obj < %s > %t
+// RUN: llvm-objdump --syms %t | FileCheck --check-prefix=OBJDUMP %s
+
+// OBJDUMP: SYMBOL TABLE:
+// OBJDUMP-NEXT: 0000000000000000 l       *ABS*  0000000000000000 zero
+// OBJDUMP-NEXT: 0000000000000001 l       *ABS*  0000000000000000 one
+// OBJDUMP-NEXT: 0000000000000002 l       *ABS*  0000000000000000 two
+// OBJDUMP-NEXT: 0000000000000003 l       *ABS*  0000000000000000 three
+// OBJDUMP-NEXT: 7fffffffffffffff l       *ABS*  0000000000000000 i64_max
+// OBJDUMP-NEXT: 8000000000000000 l       *ABS*  0000000000000000 i64_min
+// OBJDUMP-NEXT: 0000000000000005 l       *ABS*  0000000000000000 max_expression_all
+// OBJDUMP-NEXT: 0000000000000005 l       *ABS*  0000000000000000 five
+// OBJDUMP-NEXT: 0000000000000004 l       *ABS*  0000000000000000 four
+// OBJDUMP-NEXT: 0000000000000002 l       *ABS*  0000000000000000 max_expression_two
+// OBJDUMP-NEXT: 0000000000000001 l       *ABS*  0000000000000000 max_expression_one
+// OBJDUMP-NEXT: 000000000000000a l       *ABS*  0000000000000000 max_literals
+// OBJDUMP-NEXT: 000000000000000f l       *ABS*  0000000000000000 max_with_max_sym
+// OBJDUMP-NEXT: 000000000000000f l       *ABS*  0000000000000000 max
+// OBJDUMP-NEXT: ffffffffffffffff l       *ABS*  0000000000000000 neg_one
+// OBJDUMP-NEXT: ffffffffffffffff l       *ABS*  0000000000000000 max_neg_numbers
+// OBJDUMP-NEXT: ffffffffffffffff l       *ABS*  0000000000000000 max_neg_number
+// OBJDUMP-NEXT: 0000000000000003 l       *ABS*  0000000000000000 max_with_subexpr
+// OBJDUMP-NEXT: 0000000000000006 l       *ABS*  0000000000000000 max_as_subexpr
+// OBJDUMP-NEXT: 0000000000000005 l       *ABS*  0000000000000000 max_recursive_subexpr
+// OBJDUMP-NEXT: 7fffffffffffffff l       *ABS*  0000000000000000 max_expr_one_max
+// OBJDUMP-NEXT: 7fffffffffffffff l       *ABS*  0000000000000000 max_expr_two_max
+// OBJDUMP-NEXT: 7fffffffffffffff l       *ABS*  0000000000000000 max_expr_three_max
+// OBJDUMP-NEXT: 8000000000000000 l       *ABS*  0000000000000000 max_expr_one_min
+// OBJDUMP-NEXT: 0000000000000003 l       *ABS*  0000000000000000 max_expr_two_min
+// OBJDUMP-NEXT: 0000000000989680 l       *ABS*  0000000000000000 max_expr_three_min
+// OBJDUMP-NEXT: 0000000000000007 l       *ABS*  0000000000000000 or_expression_all
+// OBJDUMP-NEXT: 0000000000000003 l       *ABS*  0000000000000000 or_expression_two
+// OBJDUMP-NEXT: 0000000000000001 l       *ABS*  0000000000000000 or_expression_one
+// OBJDUMP-NEXT: 000000000000000f l       *ABS*  0000000000000000 or_literals
+// OBJDUMP-NEXT: 0000000000000000 l       *ABS*  0000000000000000 or_false
+// OBJDUMP-NEXT: 00000000000000ff l       *ABS*  0000000000000000 or_with_or_sym
+// OBJDUMP-NEXT: 00000000000000ff l       *ABS*  0000000000000000 or
+// OBJDUMP-NEXT: 0000000000000003 l       *ABS*  0000000000000000 or_with_subexpr
+// OBJDUMP-NEXT: 0000000000000008 l       *ABS*  0000000000000000 or_as_subexpr
+// OBJDUMP-NEXT: 0000000000000007 l       *ABS*  0000000000000000 or_recursive_subexpr
+
+// ASM: .set zero, 0
+// ASM: .set one, 1
+// ASM: .set two, 2
+// ASM: .set three, 3
+// ASM: .set i64_max, 9223372036854775807
+// ASM: .set i64_min, -9223372036854775808
+
+.set zero, 0
+.set one, 1
+.set two, 2
+.set three, 3
+.set i64_max, 0x7FFFFFFFFFFFFFFF
+.set i64_min, 0x8000000000000000
+
+// ASM: .set max_expression_all, max(1, 2, five, 3, four)
+// ASM: .set max_expression_two, 2
+// ASM: .set max_expression_one, 1
+// ASM: .set max_literals, 10
+// ASM: .set max_with_max_sym, max(max, 4, 3, 1, 2)
+
+.set max_expression_all, max(one, two, five, three, four)
+.set max_expression_two, max(one, two)
+.set max_expression_one, max(one)
+.set max_literals, max(1,2,3,4,5,6,7,8,9,10)
+.set max_with_max_sym, max(max, 4, 3, one, two)
+
+// ASM: .set max_neg_numbers, -1
+// ASM: .set max_neg_number, -1
+
+.set neg_one, -1
+.set max_neg_numbers, max(-5, -4, -3, -2, neg_one)
+.set max_neg_number, max(neg_one)
+
+// ASM: .set max_with_subexpr, 3
+// ASM: .set max_as_subexpr, 1+(max(4, 3, five))
+// ASM: .set max_recursive_subexpr, max(max(1, four), 3, max_expression_all)
+
+.set max_with_subexpr, max(((one | 3) << 3) / 8)
+.set max_as_subexpr, 1 + max(4, 3, five)
+.set max_recursive_subexpr, max(max(one, four), three, max_expression_all)
+
+// ASM: .set max_expr_one_max, 9223372036854775807
+// ASM: .set max_expr_two_max, max(9223372036854775807, five)
+// ASM: .set max_expr_three_max, max(9223372036854775807, five, 10000000)
+
+.set max_expr_one_max, max(i64_max)
+.set max_expr_two_max, max(i64_max, five)
+.set max_expr_three_max, max(i64_max, five, 10000000)
+
+// ASM: .set max_expr_one_min, -9223372036854775808
+// ASM: .set max_expr_two_min, 3
+// ASM: .set max_expr_three_min, 10000000
+
+.set max_expr_one_min, max(i64_min)
+.set max_expr_two_min, max(i64_min, three)
+.set max_expr_three_min, max(i64_min, three, 10000000)
+
+// ASM: .set or_expression_all, or(1, 2, five, 3, four)
+// ASM: .set or_expression_two, 3
+// ASM: .set or_expression_one, 1
+// ASM: .set or_literals, 15
+// ASM: .set or_false, 0
+// ASM: .set or_with_or_sym, or(or, 4, 3, 1, 2)
+
+.set or_expression_all, or(one, two, five, three, four)
+.set or_expression_two, or(one, two)
+.set or_expression_one, or(one)
+.set or_literals, or(1,2,3,4,5,6,7,8,9,10)
+.set or_false, or(zero, 0, (2-2), 5 > 6)
+.set or_with_or_sym, or(or, 4, 3, one, two)
+
+// ASM: .set or_with_subexpr, 3
+// ASM: .set or_as_subexpr, 1+(or(4, 3, five))
+// ASM: .set or_recursive_subexpr, or(or(1, four), 3, or_expression_all)
+
+.set or_with_subexpr, or(((one | 3) << 3) / 8)
+.set or_as_subexpr, 1 + or(4, 3, five)
+.set or_recursive_subexpr, or(or(one, four), three, or_expression_all)
+
+// ASM: .set four, 4
+// ASM: .set five, 5
+// ASM: .set max, 15
+// ASM: .set or, 255
+
+.set four, 4
+.set five, 5
+.set max, 0xF
+.set or, 0xFF
diff --git a/llvm/test/MC/AMDGPU/mcexpr_amd_err.s b/llvm/test/MC/AMDGPU/mcexpr_amd_err.s
new file mode 100644
index 00000000000000..ea02e013627218
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/mcexpr_amd_err.s
@@ -0,0 +1,53 @@
+// RUN: not llvm-mc -triple amdgcn-amd-amdhsa %s 2>&1 | FileCheck --check-prefix=ASM %s
+
+.set one, 1
+.set two, 2
+.set three, 3
+
+.set max_empty, max()
+// ASM: :[[@LINE-1]]:{{[0-9]+}}: error: empty max expression
+// ASM: :[[@LINE-2]]:{{[0-9]+}}: error: missing expression
+
+.set or_empty, or()
+// ASM: :[[@LINE-1]]:{{[0-9]+}}: error: empty or expression
+// ASM: :[[@LINE-2]]:{{[0-9]+}}: error: missing expression
+
+.set max_post_aux_comma, max(one,)
+// ASM: :[[@LINE-1]]:{{[0-9]+}}: error: mismatch of commas in max expression
+// ASM: :[[@LINE-2]]:{{[0-9]+}}: error: missing expression
+
+.set max_pre_aux_comma, max(,one)
+// asm: :[[@line-1]]:{{[0-9]+}}: error: unknown token in expression
+// ASM: :[[@LINE-2]]:{{[0-9]+}}: error: missing expression
+
+.set max_double_comma, max(one,, two)
+// ASM: :[[@LINE-1]]:{{[0-9]+}}: error: unknown token in expression
+// ASM: :[[@LINE-2]]:{{[0-9]+}}: error: missing expression
+
+.set max_no_comma, max(one two)
+// ASM: :[[@LINE-1]]:{{[0-9]+}}: error: unexpected token in max expression
+// ASM: :[[@LINE-2]]:{{[0-9]+}}: error: missing expression
+
+.set max_missing_paren, max(two
+// ASM: :[[@LINE-1]]:{{[0-9]+}}: error: unexpected token in max expression
+// ASM: :[[@LINE-2]]:{{[0-9]+}}: error: missing expression
+
+.set max_expression_one, max(three, four,
+// ASM: :[[@LINE-1]]:{{[0-9]+}}: error: unknown token in expression
+// ASM: :[[@LINE-2]]:{{[0-9]+}}: error: missing expression
+
+.set or_expression_one, or(four, five
+// ASM: :[[@LINE-1]]:{{[0-9]+}}: error: unexpected token in or expression
+// ASM: :[[@LINE-2]]:{{[0-9]+}}: error: missing expression
+
+.set max_no_lparen, max four, five)
+// ASM: :[[@LINE-1]]:{{[0-9]+}}: error: expected newline
+
+.set max_no_paren, max one, two, three
+// ASM: :[[@LINE-1]]:{{[0-9]+}}: error: expected newline
+
+.set max_rparen_only, max)
+// ASM: :[[@LINE-1]]:{{[0-9]+}}: error: expected newline
+
+.set four, 4
+.set five, 5
diff --git a/llvm/test/MC/ARM/arm-branch-errors.s b/llvm/test/MC/ARM/arm-branch-errors.s
index bbf6445f5c18a4..5d7ae12fb3bc00 100644
--- a/llvm/test/MC/ARM/arm-branch-errors.s
+++ b/llvm/test/MC/ARM/arm-branch-errors.s
@@ -10,13 +10,13 @@
 
 @ CHECK: error: invalid instruction, any one of the following would fix this:
 @ CHECK:        b #2
-@ CHECK: note: instruction requires: thumb
 @ CHECK: note: invalid operand for instruction
+@ CHECK: note: instruction requires: thumb
 @ CHECK: error: invalid instruction, any one of the following would fix this:
 @ CHECK:        bl #2
 @ CHECK: note: instruction requires: thumb
 @ CHECK: note: invalid operand for instruction
 @ CHECK: error: invalid instruction, any one of the following would fix this:
 @ CHECK:        beq #2
-@ CHECK: note: instruction requires: thumb
 @ CHECK: note: invalid operand for instruction
+@ CHECK: note: instruction requires: thumb
diff --git a/llvm/test/MC/ARM/arm11-hint-instr.s b/llvm/test/MC/ARM/arm11-hint-instr.s
index 4193a686870aba..d9eaa5a89ab230 100644
--- a/llvm/test/MC/ARM/arm11-hint-instr.s
+++ b/llvm/test/MC/ARM/arm11-hint-instr.s
@@ -65,7 +65,13 @@
 @ CHECK-THUMB: wfe                             @ encoding: [0x20,0xbf]
 @ CHECK-THUMB: wfi                             @ encoding: [0x30,0xbf]
 @ CHECK-THUMB: sev                             @ encoding: [0x40,0xbf]
-@ CHECK-ERROR-THUMB: error: instruction requires: v7 clrex
+@ CHECK-ERROR-THUMB: error: invalid instruction, any one of the following would fix this:
+@ CHECK-ERROR-THUMB: clrex
+@ CHECK-ERROR-THUMB: ^
+@ CHECK-ERROR-THUMB: note: instruction requires: v7 clrex
+@ CHECK-ERROR-THUMB: clrex
+@ CHECK-ERROR-THUMB: ^
+@ CHECK-ERROR-THUMB: note: instruction requires: arm-mode
 @ CHECK-ERROR-THUMB: clrex
 @ CHECK-ERROR-THUMB: ^
 
diff --git a/llvm/test/MC/ARM/basic-arm-instructions.s b/llvm/test/MC/ARM/basic-arm-instructions.s
index 055f3ce316153d..84a7cf52fa30e0 100644
--- a/llvm/test/MC/ARM/basic-arm-instructions.s
+++ b/llvm/test/MC/ARM/basic-arm-instructions.s
@@ -1774,6 +1774,9 @@ Lforward:
         pkhtb r2, r2, r3, asr #31
         pkhtb r2, r2, r3, asr #15
 
+        it ne
+        pkhtbne r2, r2, r3, asr #15
+
 @ CHECK: pkhbt	r2, r2, r3              @ encoding: [0x13,0x20,0x82,0xe6]
 @ CHECK: pkhbt	r2, r2, r3, lsl #31     @ encoding: [0x93,0x2f,0x82,0xe6]
 @ CHECK: pkhbt	r2, r2, r3              @ encoding: [0x13,0x20,0x82,0xe6]
@@ -1782,6 +1785,7 @@ Lforward:
 @ CHECK: pkhbt	r2, r3, r2              @ encoding: [0x12,0x20,0x83,0xe6]
 @ CHECK: pkhtb	r2, r2, r3, asr #31     @ encoding: [0xd3,0x2f,0x82,0xe6]
 @ CHECK: pkhtb	r2, r2, r3, asr #15     @ encoding: [0xd3,0x27,0x82,0xe6]
+@ CHECK: pkhtbne r2, r2, r3, asr #15    @ encoding: [0xd3,0x27,0x82,0x16]
 
 @------------------------------------------------------------------------------
 @ FIXME: PLD
diff --git a/llvm/test/MC/ARM/cde-fp-vec.s b/llvm/test/MC/ARM/cde-fp-vec.s
index 4b139579b719ba..fa18ffa766e2e3 100644
--- a/llvm/test/MC/ARM/cde-fp-vec.s
+++ b/llvm/test/MC/ARM/cde-fp-vec.s
@@ -12,7 +12,7 @@ ittt eq
 vcx1a   p1, s7, #2047
 // ERROR: [[@LINE+1]]:{{[0-9]+}}: error: instructions in IT block must be predicable
 vcx2    p0, d0, d15, #0
-// ERROR-FP: [[@LINE+2]]:{{[0-9]+}}: error: invalid instruction
+// ERROR-FP: [[@LINE+2]]:{{[0-9]+}}: error: instruction requires: mve
 // ERROR-MVE: [[@LINE+1]]:{{[0-9]+}}: error: instructions in IT block must be predicable
 vcx3    p0, q0, q7, q0, #12
 nop
@@ -33,12 +33,15 @@ vcx1a   p1, d3, #2047
 // ERROR-FP: [[@LINE+1]]:{{[0-9]+}}: error: invalid instruction, any one of the following would fix this:
 vcx1    p0, q1, #1234
 // CHECK-MVE-NEXT: vcx1a p1, q5, #4095 @ encoding: [0x2f,0xfd,0xff,0xa1]
-// ERROR-FP: [[@LINE+1]]:{{[0-9]+}}: error: invalid instruction
+// ERROR-FP: [[@LINE+1]]:{{[0-9]+}}: error: instruction requires: mve
 vcx1a   p1, q5, #4095
 
 // ERROR: [[@LINE+1]]:{{[0-9]+}}: error: invalid instruction
 vcx1a   p1, s7, s7, #2047
-// ERROR: [[@LINE+1]]:{{[0-9]+}}: error: operand must be an immediate in the range [0,2047]
+// ERROR-FP: [[@LINE+4]]:{{[0-9]+}}: error: operand must be an immediate in the range [0,2047]
+// ERROR-MVE: [[@LINE+3]]:{{[0-9]+}}: error: invalid instruction, any one of the following would fix this
+// ERROR-MVE: [[@LINE+2]]:{{[0-9]+}}: note: operand must be a register in range [q0, q7]
+// ERROR-MVE: [[@LINE+1]]:{{[0-9]+}}: note: operand must be an immediate in the range [0,2047]
 vcx1    p0, d0, #2048
 // ERROR-FP: [[@LINE+1]]:{{[0-9]+}}: error: operand must be an immediate in the range [0,2047]
 vcx1a   p1, s0, #2048
@@ -51,10 +54,13 @@ vcx1    p8, d0, #1234
 vcx1    p0, d16, #1234
 // ERROR: [[@LINE+1]]:{{[0-9]+}}: error: invalid instruction
 vcx1    p0, s32, #1234
-// ERROR-FP: [[@LINE+4]]:{{[0-9]+}}: error: invalid instruction, any one of the following would fix this:
-// ERROR-FP: [[@LINE+3]]:{{[0-9]+}}: note: operand must be a register in range [s0, s31]
-// ERROR-FP: [[@LINE+2]]:{{[0-9]+}}: note: operand must be a register in range [d0, d15]
-// ERROR-MVE: [[@LINE+1]]:{{[0-9]+}}: error: operand must be a register in range [q0, q7]
+// ERROR-FP: [[@LINE+7]]:{{[0-9]+}}: error: invalid instruction, any one of the following would fix this:
+// ERROR-FP: [[@LINE+6]]:{{[0-9]+}}: note: operand must be a register in range [s0, s31]
+// ERROR-FP: [[@LINE+5]]:{{[0-9]+}}: note: operand must be a register in range [d0, d15]
+// ERROR-MVE: [[@LINE+4]]:{{[0-9]+}}: error: invalid instruction, any one of the following would fix this:
+// ERROR-MVE: [[@LINE+3]]:{{[0-9]+}}: note: operand must be a register in range [q0, q7]
+// ERROR-MVE: [[@LINE+2]]:{{[0-9]+}}: note: operand must be a register in range [s0, s31]
+// ERROR-MVE: [[@LINE+1]]:{{[0-9]+}}: note: operand must be a register in range [d0, d15]
 vcx1    p0, q8, #1234
 // ERROR: [[@LINE+3]]:{{[0-9]+}}: error: invalid instruction, any one of the following would fix this:
 // ERROR: [[@LINE+2]]:{{[0-9]+}}: note: operand must be a register in range [s0, s31]
@@ -116,7 +122,7 @@ vcx3a   p1, d1, d11, d12, #8
 // ERROR-MVE: [[@LINE+2]]:{{[0-9]+}}: error: operand must be an immediate in the range [0,15]
 // ERROR-FP: error: invalid instruction
 vcx3a   p1, q1, q2, q3, #16
-// ERROR-MVE: [[@LINE+2]]:{{[0-9]+}}: error: invalid instruction
+// ERROR-MVE: [[@LINE+2]]:{{[0-9]+}}: error: operand must be a register in range [d0, d15]
 // ERROR-FP: [[@LINE+1]]:{{[0-9]+}}: error: operand must be a register in range [d0, d15]
 vcx3    p0, d0, q0, d7, #1
 // ERROR: [[@LINE+1]]:{{[0-9]+}}: error: operand must be a register in range [s0, s31]
diff --git a/llvm/test/MC/ARM/cde-vec-pred.s b/llvm/test/MC/ARM/cde-vec-pred.s
index 6274fafa1224a4..9932f8d000337c 100644
--- a/llvm/test/MC/ARM/cde-vec-pred.s
+++ b/llvm/test/MC/ARM/cde-vec-pred.s
@@ -19,7 +19,7 @@ vcx3at   p1, q3, q7, q6, #15
 vcx3e    p0, q0, q2, q0, #12
 
 vpt.i8 eq, q0, q0
-// ERROR: [[@LINE+1]]:{{[0-9]+}}: error: incorrect predication in VPT block; got 'none', but expected 't'
+// ERROR: error: incorrect predication in VPT block; got 'none', but expected 't'
 vcx1    p0, q1, #1234
 
 vpt.i8 eq, q0, q0
diff --git a/llvm/test/MC/ARM/cps.s b/llvm/test/MC/ARM/cps.s
index bafdfdea537b8b..1034ed93810dd4 100644
--- a/llvm/test/MC/ARM/cps.s
+++ b/llvm/test/MC/ARM/cps.s
@@ -26,6 +26,6 @@
 @ V6-ERRORS: note: too many operands for instruction
 @ V6-ERRORS: error: invalid instruction, any one of the following would fix this:
 @ V6-ERRORS: cps #0
-@ V6-ERRORS: note: too few operands for instruction
 @ V6-ERRORS: note: instruction requires: arm-mode
 @ V6-ERRORS: note: instruction requires: thumb2
+@ V6-ERRORS: note: too few operands for instruction
diff --git a/llvm/test/MC/ARM/diagnostics.s b/llvm/test/MC/ARM/diagnostics.s
index e6d80ea7a6280c..fa23a7da1e4048 100644
--- a/llvm/test/MC/ARM/diagnostics.s
+++ b/llvm/test/MC/ARM/diagnostics.s
@@ -288,7 +288,7 @@
 @ CHECK-ERRORS: error: 'asr' shift amount must be in range [1,32]
 @ CHECK-ERRORS: 	ssat	r8, #1, r10, asr #33
 @ CHECK-ERRORS: 	    	                  ^
-@ CHECK-ERRORS: error: shift operator 'asr' or 'lsl' expected
+@ CHECK-ERRORS: error: operand must be a register in range [r0, r14]
 @ CHECK-ERRORS:         ssat    r8, #1, r10, lsr #5
 @ CHECK-ERRORS:                              ^
 @ CHECK-ERRORS: error: '#' expected
diff --git a/llvm/test/MC/ARM/directive-arch_extension-crypto.s b/llvm/test/MC/ARM/directive-arch_extension-crypto.s
index 8d3cd9e5e1d0da..05b6d9e040188e 100644
--- a/llvm/test/MC/ARM/directive-arch_extension-crypto.s
+++ b/llvm/test/MC/ARM/directive-arch_extension-crypto.s
@@ -10,15 +10,16 @@
 	.syntax unified
 
 	.arch_extension crypto
-@ CHECK-V7: error: architectural extension 'crypto' is not allowed for the current base architecture
+@ CHECK-V7: architectural extension 'crypto' is not allowed for the current base architecture
 @ CHECK-V7-NEXT: 	.arch_extension crypto
 @ CHECK-V7-NEXT:                     ^
 
 	.type crypto,%function
 crypto:
 	vmull.p64 q0, d0, d1
-@ CHECK-V7: error: instruction requires: aes armv8
-
+@ CHECK-V7: error: invalid instruction, any one of the following would fix this:
+@ CHECK-V7: note: invalid operand for instruction
+@ CHECK-V7: note: instruction requires: aes armv8
 	aesd.8 q0, q1
 @ CHECK-V7: error: instruction requires: aes armv8
 	aese.8 q0, q1
@@ -51,14 +52,18 @@ crypto:
 @ CHECK-V7: error: instruction requires: sha2 armv8
 
 	.arch_extension nocrypto
+@ CHECK-V7: error: architectural extension 'sha2' is not allowed for the current base architecture
+@ CHECK-V7: error: architectural extension 'aes' is not allowed for the current base architecture
 @ CHECK-V7: error: architectural extension 'crypto' is not allowed for the current base architecture
-@ CHECK-V7-NEXT: 	.arch_extension nocrypto
+@ CHECK-V7-NEXT:     .arch_extension nocrypto
 @ CHECK-V7-NEXT:                     ^
 
 	.type nocrypto,%function
 nocrypto:
 	vmull.p64 q0, d0, d1
-@ CHECK-V7: error: instruction requires: aes armv8
+@ CHECK-V7: error: invalid instruction, any one of the following
+@ CHECK-V7: note: invalid operand for instruction
+@ CHECK-V7: note: instruction requires: aes armv8
 @ CHECK-V8: error: instruction requires: aes
 
 	aesd.8 q0, q1
diff --git a/llvm/test/MC/ARM/invalid-fp-armv8.s b/llvm/test/MC/ARM/invalid-fp-armv8.s
index dca0e448d1140f..c8ce261791cfd0 100644
--- a/llvm/test/MC/ARM/invalid-fp-armv8.s
+++ b/llvm/test/MC/ARM/invalid-fp-armv8.s
@@ -88,6 +88,8 @@ vrinta.f64.f64 s3, q0
 vrintn.f32.f32 d3, d0
 @ V8: error: instruction requires: NEON
 vrintp.f32 q3, q0
-@ V8: error: instruction requires: NEON
+@ V8: error: invalid instruction, any one of the following would fix this:
+@ V8: note: instruction requires: mve.fp
+@ V8: note: instruction requires: NEON
 vrintmlt.f32 q3, q0
 @ V8: error: instruction 'vrintm' is not predicable, but condition code specified
diff --git a/llvm/test/MC/ARM/lsl-zero-errors.s b/llvm/test/MC/ARM/lsl-zero-errors.s
index e021aa9eb986d5..1e51c587211d1b 100644
--- a/llvm/test/MC/ARM/lsl-zero-errors.s
+++ b/llvm/test/MC/ARM/lsl-zero-errors.s
@@ -55,22 +55,22 @@
 
 // CHECK-NONARM: error: invalid instruction, any one of the following would fix this:
 // CHECK-NONARM-NEXT: mov pc, r0, lsl #0
-// CHECK-NONARM: note: operand must be a register in range [r0, r15]
 // CHECK-THUMBV7: note: operand must be a register in range [r0, r12] or r14
 // CHECK-THUMBV8: note: operand must be a register in range [r0, r14]
+// CHECK-NONARM: note: operand must be a register in range [r0, r15]
 
 // CHECK-NONARM: error: invalid instruction, any one of the following would fix this:
 // CHECK-NONARM-NEXT: mov r0, pc, lsl #0
-// CHECK-NONARM: note: operand must be a register in range [r0, r15]
 // CHECK-NONARM: note: invalid operand for instruction
 // CHECK-NONARM: note: invalid operand for instruction
 // CHECK-NONARM: note: operand must be an immediate in the range [256,65535]
+// CHECK-NONARM: note: operand must be a register in range [r0, r15]
 
 // CHECK-NONARM: error: invalid instruction, any one of the following would fix this:
 // CHECK-NONARM-NEXT: mov pc, pc, lsl #0
-// CHECK-NONARM: note: operand must be a register in range [r0, r15]
 // CHECK-THUMBV7: note: operand must be a register in range [r0, r12] or r14
 // CHECK-THUMBV8: note: operand must be a register in range [r0, r14]
+// CHECK-NONARM: note: operand must be a register in range [r0, r15]
 
 // CHECK-NONARM: error: invalid instruction, any one of the following would fix this:
 // CHECK-NONARM-NEXT: movs pc, r0, lsl #0
@@ -134,8 +134,8 @@
 // FIXME: We should consistently have the "requires ARMv8" error here
 // CHECK-THUMBV7: error: invalid instruction, any one of the following would fix this:
 // CHECK-THUMBV7-NEXT: mov sp, sp, lsl #0
-// CHECK-THUMBV7: note: operand must be a register in range [r0, r15]
 // CHECK-THUMBV7: note: operand must be a register in range [r0, r12] or r14
+// CHECK-THUMBV7: note: operand must be a register in range [r0, r15]
 
 // CHECK-THUMBV7: error: invalid instruction, any one of the following would fix this:
 // CHECK-THUMBV7-NEXT: movs sp, sp, lsl #0
diff --git a/llvm/test/MC/ARM/mve-load-store.s b/llvm/test/MC/ARM/mve-load-store.s
index 5c6d2a172b252c..797ab1e22dd010 100644
--- a/llvm/test/MC/ARM/mve-load-store.s
+++ b/llvm/test/MC/ARM/mve-load-store.s
@@ -5,55 +5,55 @@
 # RUN:     FileCheck --check-prefix=ERROR-NOMVE < %t %s
 
 # CHECK: vldrb.u8 q0, [r0] @ encoding: [0x90,0xed,0x00,0x1e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.u8 q0, [r0]
 
 # CHECK: vldrb.u8 q1, [r0] @ encoding: [0x90,0xed,0x00,0x3e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.u8 q1, [r0]
 
 # CHECK: vldrb.u8 q0, [r11] @ encoding: [0x9b,0xed,0x00,0x1e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.u8 q0, [r11]
 
 # CHECK: vldrb.u8 q3, [r11] @ encoding: [0x9b,0xed,0x00,0x7e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.u8 q3, [r11]
 
 # CHECK: vldrb.u8 q0, [r4, #56] @ encoding: [0x94,0xed,0x38,0x1e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.u8 q0, [r4, #56]
 
 # CHECK: vldrb.u8 q4, [r4, #56] @ encoding: [0x94,0xed,0x38,0x9e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.u8 q4, [r4, #56]
 
 # CHECK: vldrb.u8 q0, [r8, #56] @ encoding: [0x98,0xed,0x38,0x1e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.u8 q0, [r8, #56]
 
 # CHECK: vldrb.u8 q5, [r4, #56]! @ encoding: [0xb4,0xed,0x38,0xbe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.u8 q5, [r4, #56]!
 
 # CHECK: vldrb.u8 q5, [r4, #56]! @ encoding: [0xb4,0xed,0x38,0xbe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.u8 q5, [r4, #56]!
 
 # CHECK: vldrb.u8 q5, [r4], #-25 @ encoding: [0x34,0xec,0x19,0xbe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.u8 q5, [r4], #-25
 
 # CHECK: vldrb.u8 q5, [r10], #-25 @ encoding: [0x3a,0xec,0x19,0xbe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.u8 q5, [r10], #-25
 
 # CHECK: vldrb.u8 q5, [sp, #-25] @ encoding: [0x1d,0xed,0x19,0xbe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.u8 q5, [sp, #-25]
 
 # CHECK: vldrb.u8 q5, [sp, #-127] @ encoding: [0x1d,0xed,0x7f,0xbe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.u8 q5, [sp, #-127]
 
 # ERROR: [[@LINE+2]]:{{[0-9]+}}: {{error|note}}: invalid operand for instruction
@@ -69,55 +69,55 @@ vldrb.u8 q0, [r0, #-128]!
 vldrb.u8 q0, [r0], #128
 
 # CHECK: vstrb.8 q0, [r0] @ encoding: [0x80,0xed,0x00,0x1e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrb.8 q0, [r0]
 
 # CHECK: vstrb.8 q1, [r0] @ encoding: [0x80,0xed,0x00,0x3e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrb.8 q1, [r0]
 
 # CHECK: vstrb.8 q0, [r11] @ encoding: [0x8b,0xed,0x00,0x1e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrb.8 q0, [r11]
 
 # CHECK: vstrb.8 q3, [r11] @ encoding: [0x8b,0xed,0x00,0x7e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrb.8 q3, [r11]
 
 # CHECK: vstrb.8 q0, [r4, #56] @ encoding: [0x84,0xed,0x38,0x1e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrb.8 q0, [r4, #56]
 
 # CHECK: vstrb.8 q4, [r4, #56] @ encoding: [0x84,0xed,0x38,0x9e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrb.8 q4, [r4, #56]
 
 # CHECK: vstrb.8 q0, [r8, #56] @ encoding: [0x88,0xed,0x38,0x1e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrb.8 q0, [r8, #56]
 
 # CHECK: vstrb.8 q5, [r4, #56]! @ encoding: [0xa4,0xed,0x38,0xbe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrb.8 q5, [r4, #56]!
 
 # CHECK: vstrb.8 q5, [r4, #56]! @ encoding: [0xa4,0xed,0x38,0xbe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrb.8 q5, [r4, #56]!
 
 # CHECK: vstrb.8 q5, [r4], #-25 @ encoding: [0x24,0xec,0x19,0xbe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrb.8 q5, [r4], #-25
 
 # CHECK: vstrb.8 q5, [r10], #-25 @ encoding: [0x2a,0xec,0x19,0xbe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrb.8 q5, [r10], #-25
 
 # CHECK: vstrb.8 q5, [sp, #-25] @ encoding: [0x0d,0xed,0x19,0xbe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrb.8 q5, [sp, #-25]
 
 # CHECK: vstrb.8 q5, [sp, #127] @ encoding: [0x8d,0xed,0x7f,0xbe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrb.8 q5, [sp, #127]
 
 # ERROR: [[@LINE+2]]:{{[0-9]+}}: {{error|note}}: invalid operand for instruction
@@ -133,735 +133,735 @@ vstrb.u8 q0, [r0, #-128]!
 vstrb.u8 q0, [r0], #128
 
 # CHECK: vldrb.u16 q0, [r0] @ encoding: [0x90,0xfd,0x80,0x0e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.u16 q0, [r0]
 
 # CHECK: vldrb.u16 q1, [r0] @ encoding: [0x90,0xfd,0x80,0x2e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.u16 q1, [r0]
 
 # CHECK: vldrb.u16 q0, [r7] @ encoding: [0x97,0xfd,0x80,0x0e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.u16 q0, [r7]
 
 # CHECK: vldrb.u16 q3, [r7] @ encoding: [0x97,0xfd,0x80,0x6e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.u16 q3, [r7]
 
 # CHECK: vldrb.u16 q0, [r4, #56] @ encoding: [0x94,0xfd,0xb8,0x0e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.u16 q0, [r4, #56]
 
 # CHECK: vldrb.u16 q4, [r4, #56] @ encoding: [0x94,0xfd,0xb8,0x8e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.u16 q4, [r4, #56]
 
 # CHECK: vldrb.u16 q0, [r2, #56] @ encoding: [0x92,0xfd,0xb8,0x0e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.u16 q0, [r2, #56]
 
 # CHECK: vldrb.u16 q5, [r4, #56]! @ encoding: [0xb4,0xfd,0xb8,0xae]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.u16 q5, [r4, #56]!
 
 # CHECK: vldrb.u16 q5, [r4, #56]! @ encoding: [0xb4,0xfd,0xb8,0xae]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.u16 q5, [r4, #56]!
 
 # CHECK: vldrb.u16 q5, [r4], #-1 @ encoding: [0x34,0xfc,0x81,0xae]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.u16 q5, [r4], #-1
 
 # CHECK: vldrb.u16 q5, [r3], #-25 @ encoding: [0x33,0xfc,0x99,0xae]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.u16 q5, [r3], #-25
 
 # CHECK: vldrb.u16 q5, [r6, #-25] @ encoding: [0x16,0xfd,0x99,0xae]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.u16 q5, [r6, #-25]
 
 # CHECK: vldrb.u16 q5, [r6, #-64] @ encoding: [0x16,0xfd,0xc0,0xae]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.u16 q5, [r6, #-64]
 
 # CHECK: vldrb.s16 q0, [r0] @ encoding: [0x90,0xed,0x80,0x0e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.s16 q0, [r0]
 
 # CHECK: vldrb.s16 q1, [r0] @ encoding: [0x90,0xed,0x80,0x2e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.s16 q1, [r0]
 
 # CHECK: vldrb.s16 q0, [r7] @ encoding: [0x97,0xed,0x80,0x0e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.s16 q0, [r7]
 
 # CHECK: vldrb.s16 q3, [r7] @ encoding: [0x97,0xed,0x80,0x6e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.s16 q3, [r7]
 
 # CHECK: vldrb.s16 q0, [r4, #56] @ encoding: [0x94,0xed,0xb8,0x0e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.s16 q0, [r4, #56]
 
 # CHECK: vldrb.s16 q4, [r4, #56] @ encoding: [0x94,0xed,0xb8,0x8e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.s16 q4, [r4, #56]
 
 # CHECK: vldrb.s16 q0, [r2, #56] @ encoding: [0x92,0xed,0xb8,0x0e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.s16 q0, [r2, #56]
 
 # CHECK: vldrb.s16 q5, [r4, #56]! @ encoding: [0xb4,0xed,0xb8,0xae]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.s16 q5, [r4, #56]!
 
 # CHECK: vldrb.s16 q5, [r4, #56]! @ encoding: [0xb4,0xed,0xb8,0xae]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.s16 q5, [r4, #56]!
 
 # CHECK: vldrb.s16 q5, [r4], #-25 @ encoding: [0x34,0xec,0x99,0xae]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.s16 q5, [r4], #-25
 
 # CHECK: vldrb.s16 q5, [r3], #-25 @ encoding: [0x33,0xec,0x99,0xae]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.s16 q5, [r3], #-25
 
 # CHECK: vldrb.s16 q5, [r6, #-25] @ encoding: [0x16,0xed,0x99,0xae]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.s16 q5, [r6, #-25]
 
 # CHECK: vldrb.s16 q5, [r6, #-64] @ encoding: [0x16,0xed,0xc0,0xae]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.s16 q5, [r6, #-64]
 
 # CHECK: vstrb.16 q0, [r0] @ encoding: [0x80,0xed,0x80,0x0e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrb.16 q0, [r0]
 
 # CHECK: vstrb.16 q1, [r0] @ encoding: [0x80,0xed,0x80,0x2e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrb.16 q1, [r0]
 
 # CHECK: vstrb.16 q0, [r7] @ encoding: [0x87,0xed,0x80,0x0e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrb.16 q0, [r7]
 
 # CHECK: vstrb.16 q3, [r7] @ encoding: [0x87,0xed,0x80,0x6e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrb.16 q3, [r7]
 
 # CHECK: vstrb.16 q0, [r4, #56] @ encoding: [0x84,0xed,0xb8,0x0e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrb.16 q0, [r4, #56]
 
 # CHECK: vstrb.16 q4, [r4, #56] @ encoding: [0x84,0xed,0xb8,0x8e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrb.16 q4, [r4, #56]
 
 # CHECK: vstrb.16 q0, [r5, #56] @ encoding: [0x85,0xed,0xb8,0x0e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrb.16 q0, [r5, #56]
 
 # CHECK: vstrb.16 q5, [r4, #56]! @ encoding: [0xa4,0xed,0xb8,0xae]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrb.16 q5, [r4, #56]!
 
 # CHECK: vstrb.16 q5, [r4, #56]! @ encoding: [0xa4,0xed,0xb8,0xae]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrb.16 q5, [r4, #56]!
 
 # CHECK: vstrb.16 q5, [r4], #-25 @ encoding: [0x24,0xec,0x99,0xae]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrb.16 q5, [r4], #-25
 
 # CHECK: vstrb.16 q5, [r3], #-25 @ encoding: [0x23,0xec,0x99,0xae]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrb.16 q5, [r3], #-25
 
 # CHECK: vstrb.16 q5, [r2, #-25] @ encoding: [0x02,0xed,0x99,0xae]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrb.16 q5, [r2, #-25]
 
 # CHECK: vstrb.16 q5, [r2, #-64] @ encoding: [0x02,0xed,0xc0,0xae]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrb.16 q5, [r2, #-64]
 
 # CHECK: vldrb.u32 q0, [r0] @ encoding: [0x90,0xfd,0x00,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.u32 q0, [r0]
 
 # CHECK: vldrb.u32 q1, [r0] @ encoding: [0x90,0xfd,0x00,0x2f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.u32 q1, [r0]
 
 # CHECK: vldrb.u32 q0, [r7] @ encoding: [0x97,0xfd,0x00,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.u32 q0, [r7]
 
 # CHECK: vldrb.u32 q3, [r7] @ encoding: [0x97,0xfd,0x00,0x6f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.u32 q3, [r7]
 
 # CHECK: vldrb.u32 q0, [r4, #56] @ encoding: [0x94,0xfd,0x38,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.u32 q0, [r4, #56]
 
 # CHECK: vldrb.u32 q4, [r4, #56] @ encoding: [0x94,0xfd,0x38,0x8f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.u32 q4, [r4, #56]
 
 # CHECK: vldrb.u32 q0, [r2, #56] @ encoding: [0x92,0xfd,0x38,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.u32 q0, [r2, #56]
 
 # CHECK: vldrb.u32 q5, [r4, #56]! @ encoding: [0xb4,0xfd,0x38,0xaf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.u32 q5, [r4, #56]!
 
 # CHECK: vldrb.u32 q5, [r4, #56]! @ encoding: [0xb4,0xfd,0x38,0xaf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.u32 q5, [r4, #56]!
 
 # CHECK: vldrb.u32 q5, [r4], #-25 @ encoding: [0x34,0xfc,0x19,0xaf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.u32 q5, [r4], #-25
 
 # CHECK: vldrb.u32 q5, [r3], #-25 @ encoding: [0x33,0xfc,0x19,0xaf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.u32 q5, [r3], #-25
 
 # CHECK: vldrb.u32 q5, [r6, #-25] @ encoding: [0x16,0xfd,0x19,0xaf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.u32 q5, [r6, #-25]
 
 # CHECK: vldrb.u32 q5, [r6, #-64] @ encoding: [0x16,0xfd,0x40,0xaf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.u32 q5, [r6, #-64]
 
 # CHECK: vldrb.s32 q0, [r0] @ encoding: [0x90,0xed,0x00,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.s32 q0, [r0]
 
 # CHECK: vldrb.s32 q1, [r0] @ encoding: [0x90,0xed,0x00,0x2f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.s32 q1, [r0]
 
 # CHECK: vldrb.s32 q0, [r7] @ encoding: [0x97,0xed,0x00,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.s32 q0, [r7]
 
 # CHECK: vldrb.s32 q3, [r7] @ encoding: [0x97,0xed,0x00,0x6f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.s32 q3, [r7]
 
 # CHECK: vldrb.s32 q0, [r4, #56] @ encoding: [0x94,0xed,0x38,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.s32 q0, [r4, #56]
 
 # CHECK: vldrb.s32 q4, [r4, #56] @ encoding: [0x94,0xed,0x38,0x8f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.s32 q4, [r4, #56]
 
 # CHECK: vldrb.s32 q0, [r2, #56] @ encoding: [0x92,0xed,0x38,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.s32 q0, [r2, #56]
 
 # CHECK: vldrb.s32 q5, [r4, #56]! @ encoding: [0xb4,0xed,0x38,0xaf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.s32 q5, [r4, #56]!
 
 # CHECK: vldrb.s32 q5, [r4, #56]! @ encoding: [0xb4,0xed,0x38,0xaf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.s32 q5, [r4, #56]!
 
 # CHECK: vldrb.s32 q5, [r4], #-25 @ encoding: [0x34,0xec,0x19,0xaf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.s32 q5, [r4], #-25
 
 # CHECK: vldrb.s32 q5, [r3], #-25 @ encoding: [0x33,0xec,0x19,0xaf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.s32 q5, [r3], #-25
 
 # CHECK: vldrb.s32 q5, [r6, #-25] @ encoding: [0x16,0xed,0x19,0xaf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.s32 q5, [r6, #-25]
 
 # CHECK: vldrb.s32 q5, [r6, #-64] @ encoding: [0x16,0xed,0x40,0xaf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.s32 q5, [r6, #-64]
 
 # CHECK: vstrb.32 q0, [r0] @ encoding: [0x80,0xed,0x00,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrb.32 q0, [r0]
 
 # CHECK: vstrb.32 q1, [r0] @ encoding: [0x80,0xed,0x00,0x2f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrb.32 q1, [r0]
 
 # CHECK: vstrb.32 q0, [r7] @ encoding: [0x87,0xed,0x00,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrb.32 q0, [r7]
 
 # CHECK: vstrb.32 q3, [r7] @ encoding: [0x87,0xed,0x00,0x6f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrb.32 q3, [r7]
 
 # CHECK: vstrb.32 q0, [r4, #56] @ encoding: [0x84,0xed,0x38,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrb.32 q0, [r4, #56]
 
 # CHECK: vstrb.32 q4, [r4, #56] @ encoding: [0x84,0xed,0x38,0x8f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrb.32 q4, [r4, #56]
 
 # CHECK: vstrb.32 q0, [r5, #56] @ encoding: [0x85,0xed,0x38,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrb.32 q0, [r5, #56]
 
 # CHECK: vstrb.32 q5, [r4, #56]! @ encoding: [0xa4,0xed,0x38,0xaf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrb.32 q5, [r4, #56]!
 
 # CHECK: vstrb.32 q5, [r4, #56]! @ encoding: [0xa4,0xed,0x38,0xaf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrb.32 q5, [r4, #56]!
 
 # CHECK: vstrb.32 q5, [r4], #-25 @ encoding: [0x24,0xec,0x19,0xaf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrb.32 q5, [r4], #-25
 
 # CHECK: vstrb.32 q5, [r3], #-25 @ encoding: [0x23,0xec,0x19,0xaf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrb.32 q5, [r3], #-25
 
 # CHECK: vstrb.32 q5, [r2, #-25] @ encoding: [0x02,0xed,0x19,0xaf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrb.32 q5, [r2, #-25]
 
 # CHECK: vstrb.32 q5, [r2, #-64] @ encoding: [0x02,0xed,0x40,0xaf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrb.32 q5, [r2, #-64]
 
 # CHECK: vldrh.u16 q0, [r0] @ encoding: [0x90,0xed,0x80,0x1e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.u16 q0, [r0]
 
 # CHECK: vldrh.u16 q1, [r0] @ encoding: [0x90,0xed,0x80,0x3e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.u16 q1, [r0]
 
 # CHECK: vldrh.u16 q0, [r11] @ encoding: [0x9b,0xed,0x80,0x1e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.u16 q0, [r11]
 
 # CHECK: vldrh.u16 q3, [r11] @ encoding: [0x9b,0xed,0x80,0x7e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.u16 q3, [r11]
 
 # CHECK: vldrh.u16 q0, [r4, #56] @ encoding: [0x94,0xed,0x9c,0x1e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.u16 q0, [r4, #56]
 
 # CHECK: vldrh.u16 q4, [r4, #56] @ encoding: [0x94,0xed,0x9c,0x9e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.u16 q4, [r4, #56]
 
 # CHECK: vldrh.u16 q0, [r8, #56] @ encoding: [0x98,0xed,0x9c,0x1e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.u16 q0, [r8, #56]
 
 # CHECK: vldrh.u16 q5, [r4, #56]! @ encoding: [0xb4,0xed,0x9c,0xbe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.u16 q5, [r4, #56]!
 
 # CHECK: vldrh.u16 q5, [r4, #56]! @ encoding: [0xb4,0xed,0x9c,0xbe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.u16 q5, [r4, #56]!
 
 # CHECK: vldrh.u16 q5, [r4], #-26 @ encoding: [0x34,0xec,0x8d,0xbe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.u16 q5, [r4], #-26
 
 # CHECK: vldrh.u16 q5, [r10], #-26 @ encoding: [0x3a,0xec,0x8d,0xbe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.u16 q5, [r10], #-26
 
 # CHECK: vldrh.u16 q5, [sp, #-26] @ encoding: [0x1d,0xed,0x8d,0xbe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.u16 q5, [sp, #-26]
 
 # CHECK: vldrh.u16 q5, [sp, #-64] @ encoding: [0x1d,0xed,0xa0,0xbe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.u16 q5, [sp, #-64]
 
 # CHECK: vldrh.u16 q5, [sp, #-254] @ encoding: [0x1d,0xed,0xff,0xbe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.u16 q5, [sp, #-254]
 
 # CHECK: vldrh.u16 q5, [r10], #254 @ encoding: [0xba,0xec,0xff,0xbe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.u16 q5, [r10], #254
 
 # CHECK: vstrh.16 q0, [r0] @ encoding: [0x80,0xed,0x80,0x1e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrh.16 q0, [r0]
 
 # CHECK: vstrh.16 q1, [r0] @ encoding: [0x80,0xed,0x80,0x3e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrh.16 q1, [r0]
 
 # CHECK: vstrh.16 q0, [r11] @ encoding: [0x8b,0xed,0x80,0x1e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrh.16 q0, [r11]
 
 # CHECK: vstrh.16 q3, [r11] @ encoding: [0x8b,0xed,0x80,0x7e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrh.16 q3, [r11]
 
 # CHECK: vstrh.16 q0, [r4, #56] @ encoding: [0x84,0xed,0x9c,0x1e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrh.16 q0, [r4, #56]
 
 # CHECK: vstrh.16 q4, [r4, #56] @ encoding: [0x84,0xed,0x9c,0x9e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrh.16 q4, [r4, #56]
 
 # CHECK: vstrh.16 q0, [r8, #56] @ encoding: [0x88,0xed,0x9c,0x1e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrh.16 q0, [r8, #56]
 
 # CHECK: vstrh.16 q5, [r4, #56]! @ encoding: [0xa4,0xed,0x9c,0xbe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrh.16 q5, [r4, #56]!
 
 # CHECK: vstrh.16 q5, [r4, #56]! @ encoding: [0xa4,0xed,0x9c,0xbe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrh.16 q5, [r4, #56]!
 
 # CHECK: vstrh.16 q5, [r4], #-26 @ encoding: [0x24,0xec,0x8d,0xbe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrh.16 q5, [r4], #-26
 
 # CHECK: vstrh.16 q5, [r10], #-26 @ encoding: [0x2a,0xec,0x8d,0xbe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrh.16 q5, [r10], #-26
 
 # CHECK: vstrh.16 q5, [sp, #-26] @ encoding: [0x0d,0xed,0x8d,0xbe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrh.16 q5, [sp, #-26]
 
 # CHECK: vstrh.16 q5, [sp, #-64] @ encoding: [0x0d,0xed,0xa0,0xbe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrh.16 q5, [sp, #-64]
 
 # CHECK: vstrh.16 q5, [sp, #-254] @ encoding: [0x0d,0xed,0xff,0xbe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrh.16 q5, [sp, #-254]
 
 # CHECK: vstrh.16 q5, [r10], #254 @ encoding: [0xaa,0xec,0xff,0xbe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrh.16 q5, [r10], #254
 
 # CHECK: vldrh.u32 q0, [r0] @ encoding: [0x98,0xfd,0x00,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.u32 q0, [r0]
 
 # CHECK: vldrh.u32 q1, [r0] @ encoding: [0x98,0xfd,0x00,0x2f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.u32 q1, [r0]
 
 # CHECK: vldrh.u32 q0, [r7] @ encoding: [0x9f,0xfd,0x00,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.u32 q0, [r7]
 
 # CHECK: vldrh.u32 q3, [r7] @ encoding: [0x9f,0xfd,0x00,0x6f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.u32 q3, [r7]
 
 # CHECK: vldrh.u32 q0, [r4, #56] @ encoding: [0x9c,0xfd,0x1c,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.u32 q0, [r4, #56]
 
 # CHECK: vldrh.u32 q4, [r4, #56] @ encoding: [0x9c,0xfd,0x1c,0x8f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.u32 q4, [r4, #56]
 
 # CHECK: vldrh.u32 q0, [r2, #56] @ encoding: [0x9a,0xfd,0x1c,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.u32 q0, [r2, #56]
 
 # CHECK: vldrh.u32 q5, [r4, #56]! @ encoding: [0xbc,0xfd,0x1c,0xaf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.u32 q5, [r4, #56]!
 
 # CHECK: vldrh.u32 q5, [r4, #56]! @ encoding: [0xbc,0xfd,0x1c,0xaf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.u32 q5, [r4, #56]!
 
 # CHECK: vldrh.u32 q5, [r4], #-26 @ encoding: [0x3c,0xfc,0x0d,0xaf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.u32 q5, [r4], #-26
 
 # CHECK: vldrh.u32 q5, [r3], #-26 @ encoding: [0x3b,0xfc,0x0d,0xaf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.u32 q5, [r3], #-26
 
 # CHECK: vldrh.u32 q5, [r6, #-26] @ encoding: [0x1e,0xfd,0x0d,0xaf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.u32 q5, [r6, #-26]
 
 # CHECK: vldrh.u32 q5, [r6, #-64] @ encoding: [0x1e,0xfd,0x20,0xaf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.u32 q5, [r6, #-64]
 
 # CHECK: vldrh.u32 q5, [r6, #-254] @ encoding: [0x1e,0xfd,0x7f,0xaf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.u32 q5, [r6, #-254]
 
 # CHECK: vldrh.u32 q5, [r4, #254]! @ encoding: [0xbc,0xfd,0x7f,0xaf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.u32 q5, [r4, #254]!
 
 # CHECK: vldrh.s32 q0, [r0] @ encoding: [0x98,0xed,0x00,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.s32 q0, [r0]
 
 # CHECK: vldrh.s32 q1, [r0] @ encoding: [0x98,0xed,0x00,0x2f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.s32 q1, [r0]
 
 # CHECK: vldrh.s32 q0, [r7] @ encoding: [0x9f,0xed,0x00,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.s32 q0, [r7]
 
 # CHECK: vldrh.s32 q3, [r7] @ encoding: [0x9f,0xed,0x00,0x6f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.s32 q3, [r7]
 
 # CHECK: vldrh.s32 q0, [r4, #56] @ encoding: [0x9c,0xed,0x1c,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.s32 q0, [r4, #56]
 
 # CHECK: vldrh.s32 q4, [r4, #56] @ encoding: [0x9c,0xed,0x1c,0x8f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.s32 q4, [r4, #56]
 
 # CHECK: vldrh.s32 q0, [r2, #56] @ encoding: [0x9a,0xed,0x1c,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.s32 q0, [r2, #56]
 
 # CHECK: vldrh.s32 q5, [r4, #56]! @ encoding: [0xbc,0xed,0x1c,0xaf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.s32 q5, [r4, #56]!
 
 # CHECK: vldrh.s32 q5, [r4, #56]! @ encoding: [0xbc,0xed,0x1c,0xaf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.s32 q5, [r4, #56]!
 
 # CHECK: vldrh.s32 q5, [r4], #-26 @ encoding: [0x3c,0xec,0x0d,0xaf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.s32 q5, [r4], #-26
 
 # CHECK: vldrh.s32 q5, [r3], #-26 @ encoding: [0x3b,0xec,0x0d,0xaf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.s32 q5, [r3], #-26
 
 # CHECK: vldrh.s32 q5, [r6, #-26] @ encoding: [0x1e,0xed,0x0d,0xaf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.s32 q5, [r6, #-26]
 
 # CHECK: vldrh.s32 q5, [r6, #-64] @ encoding: [0x1e,0xed,0x20,0xaf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.s32 q5, [r6, #-64]
 
 # CHECK: vldrh.s32 q5, [r6, #-254] @ encoding: [0x1e,0xed,0x7f,0xaf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.s32 q5, [r6, #-254]
 
 # CHECK: vldrh.s32 q5, [r4, #254]! @ encoding: [0xbc,0xed,0x7f,0xaf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.s32 q5, [r4, #254]!
 
 # CHECK: vstrh.32 q0, [r0] @ encoding: [0x88,0xed,0x00,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrh.32 q0, [r0]
 
 # CHECK: vstrh.32 q1, [r0] @ encoding: [0x88,0xed,0x00,0x2f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrh.32 q1, [r0]
 
 # CHECK: vstrh.32 q0, [r7] @ encoding: [0x8f,0xed,0x00,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrh.32 q0, [r7]
 
 # CHECK: vstrh.32 q3, [r7] @ encoding: [0x8f,0xed,0x00,0x6f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrh.32 q3, [r7]
 
 # CHECK: vstrh.32 q0, [r4, #56] @ encoding: [0x8c,0xed,0x1c,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrh.32 q0, [r4, #56]
 
 # CHECK: vstrh.32 q4, [r4, #56] @ encoding: [0x8c,0xed,0x1c,0x8f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrh.32 q4, [r4, #56]
 
 # CHECK: vstrh.32 q0, [r5, #56] @ encoding: [0x8d,0xed,0x1c,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrh.32 q0, [r5, #56]
 
 # CHECK: vstrh.32 q5, [r4, #56]! @ encoding: [0xac,0xed,0x1c,0xaf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrh.32 q5, [r4, #56]!
 
 # CHECK: vstrh.32 q5, [r4, #56]! @ encoding: [0xac,0xed,0x1c,0xaf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrh.32 q5, [r4, #56]!
 
 # CHECK: vstrh.32 q5, [r4], #-26 @ encoding: [0x2c,0xec,0x0d,0xaf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrh.32 q5, [r4], #-26
 
 # CHECK: vstrh.32 q5, [r3], #-26 @ encoding: [0x2b,0xec,0x0d,0xaf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrh.32 q5, [r3], #-26
 
 # CHECK: vstrh.32 q5, [r2, #-26] @ encoding: [0x0a,0xed,0x0d,0xaf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrh.32 q5, [r2, #-26]
 
 # CHECK: vstrh.32 q5, [r2, #-64] @ encoding: [0x0a,0xed,0x20,0xaf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrh.32 q5, [r2, #-64]
 
 # CHECK: vstrh.32 q5, [r2, #-254] @ encoding: [0x0a,0xed,0x7f,0xaf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrh.32 q5, [r2, #-254]
 
 # CHECK: vstrh.32 q5, [r4, #254]! @ encoding: [0xac,0xed,0x7f,0xaf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrh.32 q5, [r4, #254]!
 
 # CHECK: vldrw.u32 q0, [r0] @ encoding: [0x90,0xed,0x00,0x1f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrw.u32 q0, [r0]
 
 # CHECK: vldrw.u32 q1, [r0] @ encoding: [0x90,0xed,0x00,0x3f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrw.u32 q1, [r0]
 
 # CHECK: vldrw.u32 q0, [r11] @ encoding: [0x9b,0xed,0x00,0x1f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrw.u32 q0, [r11]
 
 # CHECK: vldrw.u32 q3, [r11] @ encoding: [0x9b,0xed,0x00,0x7f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrw.u32 q3, [r11]
 
 # CHECK: vldrw.u32 q0, [r4, #56] @ encoding: [0x94,0xed,0x0e,0x1f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrw.u32 q0, [r4, #56]
 
 # CHECK: vldrw.u32 q4, [r4, #56] @ encoding: [0x94,0xed,0x0e,0x9f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrw.u32 q4, [r4, #56]
 
 # CHECK: vldrw.u32 q0, [r8, #56] @ encoding: [0x98,0xed,0x0e,0x1f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrw.u32 q0, [r8, #56]
 
 # CHECK: vldrw.u32 q5, [r4, #56]! @ encoding: [0xb4,0xed,0x0e,0xbf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrw.u32 q5, [r4, #56]!
 
 # CHECK: vldrw.u32 q5, [r4, #56]! @ encoding: [0xb4,0xed,0x0e,0xbf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrw.u32 q5, [r4, #56]!
 
 # CHECK: vldrw.u32 q5, [r4], #-28 @ encoding: [0x34,0xec,0x07,0xbf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrw.u32 q5, [r4], #-28
 
 # CHECK: vldrw.u32 q5, [r10], #-28 @ encoding: [0x3a,0xec,0x07,0xbf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrw.u32 q5, [r10], #-28
 
 # CHECK: vldrw.u32 q5, [sp, #-28] @ encoding: [0x1d,0xed,0x07,0xbf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrw.u32 q5, [sp, #-28]
 
 # CHECK: vldrw.u32 q5, [sp, #-64] @ encoding: [0x1d,0xed,0x10,0xbf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrw.u32 q5, [sp, #-64]
 
 # CHECK: vldrw.u32 q5, [sp, #-508] @ encoding: [0x1d,0xed,0x7f,0xbf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrw.u32 q5, [sp, #-508]
 
 # CHECK: vldrw.u32 q5, [r4, #508]! @ encoding: [0xb4,0xed,0x7f,0xbf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrw.u32 q5, [r4, #508]!
 
 # CHECK: vstrw.32 q0, [r0] @ encoding: [0x80,0xed,0x00,0x1f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrw.32 q0, [r0]
 
 # CHECK: vstrw.32 q1, [r0] @ encoding: [0x80,0xed,0x00,0x3f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrw.32 q1, [r0]
 
 # CHECK: vstrw.32 q0, [r11] @ encoding: [0x8b,0xed,0x00,0x1f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrw.32 q0, [r11]
 
 # CHECK: vstrw.32 q3, [r11] @ encoding: [0x8b,0xed,0x00,0x7f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrw.32 q3, [r11]
 
 # CHECK: vstrw.32 q0, [r4, #56] @ encoding: [0x84,0xed,0x0e,0x1f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrw.32 q0, [r4, #56]
 
 # CHECK: vstrw.32 q4, [r4, #56] @ encoding: [0x84,0xed,0x0e,0x9f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrw.32 q4, [r4, #56]
 
 # CHECK: vstrw.32 q0, [r8, #56] @ encoding: [0x88,0xed,0x0e,0x1f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrw.32 q0, [r8, #56]
 
 # CHECK: vstrw.32 q5, [r4, #56]! @ encoding: [0xa4,0xed,0x0e,0xbf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrw.32 q5, [r4, #56]!
 
 # CHECK: vstrw.32 q5, [r4, #56]! @ encoding: [0xa4,0xed,0x0e,0xbf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrw.32 q5, [r4, #56]!
 
 # CHECK: vstrw.32 q5, [r4], #-28 @ encoding: [0x24,0xec,0x07,0xbf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrw.32 q5, [r4], #-28
 
 # CHECK: vstrw.32 q5, [r10], #-28 @ encoding: [0x2a,0xec,0x07,0xbf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrw.32 q5, [r10], #-28
 
 # CHECK: vstrw.32 q5, [sp, #-28] @ encoding: [0x0d,0xed,0x07,0xbf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrw.32 q5, [sp, #-28]
 
 # CHECK: vstrw.32 q5, [sp, #-64] @ encoding: [0x0d,0xed,0x10,0xbf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrw.32 q5, [sp, #-64]
 
 # CHECK: vstrw.32 q5, [sp, #-508] @ encoding: [0x0d,0xed,0x7f,0xbf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrw.32 q5, [sp, #-508]
 
 # CHECK: vstrw.32 q5, [r4, #508]! @ encoding: [0xa4,0xed,0x7f,0xbf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrw.32 q5, [r4, #508]!
 
 # ERROR: [[@LINE+2]]:{{[0-9]+}}: {{error|note}}: invalid operand for instruction
@@ -885,271 +885,271 @@ vstrw.32 q5, [sp, #-3]
 vstrw.32 q5, [sp, #512]
 
 # CHECK: vldrb.u8 q0, [r0, q1] @ encoding: [0x90,0xfc,0x02,0x0e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.u8 q0, [r0, q1]
 
 # CHECK: vldrb.u8 q3, [r10, q1] @ encoding: [0x9a,0xfc,0x02,0x6e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.u8 q3, [r10, q1]
 
 # CHECK: vldrb.u16 q0, [r0, q1] @ encoding: [0x90,0xfc,0x82,0x0e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.u16 q0, [r0, q1]
 
 # CHECK: vldrb.u16 q3, [r9, q1] @ encoding: [0x99,0xfc,0x82,0x6e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.u16 q3, [r9, q1]
 
 # CHECK: vldrb.s16 q0, [r0, q1] @ encoding: [0x90,0xec,0x82,0x0e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.s16 q0, [r0, q1]
 
 # CHECK: vldrb.s16 q3, [sp, q1] @ encoding: [0x9d,0xec,0x82,0x6e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.s16 q3, [sp, q1]
 
 # CHECK: vldrb.u32 q0, [r0, q1] @ encoding: [0x90,0xfc,0x02,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.u32 q0, [r0, q1]
 
 # CHECK: vldrb.u32 q3, [r0, q1] @ encoding: [0x90,0xfc,0x02,0x6f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.u32 q3, [r0, q1]
 
 # CHECK: vldrb.s32 q0, [r0, q1] @ encoding: [0x90,0xec,0x02,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.s32 q0, [r0, q1]
 
 # CHECK: vldrb.s32 q3, [r0, q1] @ encoding: [0x90,0xec,0x02,0x6f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.s32 q3, [r0, q1]
 
 # CHECK: vldrh.u16 q0, [r0, q1] @ encoding: [0x90,0xfc,0x92,0x0e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.u16 q0, [r0, q1]
 
 # CHECK: vldrh.u16 q3, [r0, q1] @ encoding: [0x90,0xfc,0x92,0x6e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.u16 q3, [r0, q1]
 
 # CHECK: vldrh.u32 q0, [r0, q1] @ encoding: [0x90,0xfc,0x12,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.u32 q0, [r0, q1]
 
 # CHECK: vldrh.u32 q3, [r0, q1] @ encoding: [0x90,0xfc,0x12,0x6f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.u32 q3, [r0, q1]
 
 # CHECK: vldrh.s32 q0, [r0, q1] @ encoding: [0x90,0xec,0x12,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.s32 q0, [r0, q1]
 
 # CHECK: vldrh.s32 q3, [r0, q1] @ encoding: [0x90,0xec,0x12,0x6f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.s32 q3, [r0, q1]
 
 # ERROR: [[@LINE+2]]:{{[0-9]+}}: {{error|note}}: destination vector register and vector offset register can't be identical
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.u8 q0, [r0, q0]
 
 # ERROR: [[@LINE+2]]:{{[0-9]+}}: {{error|note}}: destination vector register and vector offset register can't be identical
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.u16 q0, [r0, q0]
 
 # ERROR: [[@LINE+2]]:{{[0-9]+}}: {{error|note}}: destination vector register and vector offset register can't be identical
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.s16 q0, [r0, q0]
 
 # ERROR: [[@LINE+2]]:{{[0-9]+}}: {{error|note}}: destination vector register and vector offset register can't be identical
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.u32 q0, [r0, q0]
 
 # ERROR: [[@LINE+2]]:{{[0-9]+}}: {{error|note}}: destination vector register and vector offset register can't be identical
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.s32 q0, [r0, q0]
 
 # ERROR: [[@LINE+2]]:{{[0-9]+}}: {{error|note}}: destination vector register and vector offset register can't be identical
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.u16 q0, [r0, q0]
 
 # ERROR: [[@LINE+2]]:{{[0-9]+}}: {{error|note}}: destination vector register and vector offset register can't be identical
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.u32 q0, [r0, q0]
 
 # ERROR: [[@LINE+2]]:{{[0-9]+}}: {{error|note}}: destination vector register and vector offset register can't be identical
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.s32 q0, [r0, q0]
 
 # ERROR: [[@LINE+2]]:{{[0-9]+}}: {{error|note}}: destination vector register and vector offset register can't be identical
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.u16 q0, [r0, q0, uxtw #1]
 
 # ERROR: [[@LINE+2]]:{{[0-9]+}}: {{error|note}}: destination vector register and vector offset register can't be identical
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.u32 q0, [r0, q0, uxtw #1]
 
 # ERROR: [[@LINE+2]]:{{[0-9]+}}: {{error|note}}: destination vector register and vector offset register can't be identical
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.s32 q0, [r0, q0, uxtw #1]
 
 # ERROR: [[@LINE+2]]:{{[0-9]+}}: {{error|note}}: destination vector register and vector offset register can't be identical
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrw.u32 q0, [r0, q0]
 
 # ERROR: [[@LINE+2]]:{{[0-9]+}}: {{error|note}}: destination vector register and vector offset register can't be identical
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrw.u32 q0, [r0, q0, uxtw #2]
 
 # ERROR: [[@LINE+2]]:{{[0-9]+}}: {{error|note}}: destination vector register and vector offset register can't be identical
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrd.u64 q0, [r0, q0]
 
 # ERROR: [[@LINE+2]]:{{[0-9]+}}: {{error|note}}: destination vector register and vector offset register can't be identical
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrd.u64 q0, [r0, q0, uxtw #3]
 
 # CHECK: vldrh.u16 q0, [r0, q1, uxtw #1] @ encoding: [0x90,0xfc,0x93,0x0e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.u16 q0, [r0, q1, uxtw #1]
 
 # CHECK: vldrw.u32 q0, [r0, q1] @ encoding: [0x90,0xfc,0x42,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrw.u32 q0, [r0, q1]
 
 # CHECK: vldrw.u32 q3, [r0, q1] @ encoding: [0x90,0xfc,0x42,0x6f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrw.u32 q3, [r0, q1]
 
 # CHECK: vldrw.u32 q0, [r0, q1, uxtw #2] @ encoding: [0x90,0xfc,0x43,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrw.u32 q0, [r0, q1, uxtw #2]
 
 # CHECK: vldrw.u32 q0, [sp, q1, uxtw #2] @ encoding: [0x9d,0xfc,0x43,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrw.u32 q0, [sp, q1, uxtw #2]
 
 # CHECK: vldrd.u64 q0, [r0, q1] @ encoding: [0x90,0xfc,0xd2,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrd.u64 q0, [r0, q1]
 
 # CHECK: vldrd.u64 q3, [r0, q1] @ encoding: [0x90,0xfc,0xd2,0x6f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrd.u64 q3, [r0, q1]
 
 # CHECK: vldrd.u64 q0, [r0, q1, uxtw #3] @ encoding: [0x90,0xfc,0xd3,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrd.u64 q0, [r0, q1, uxtw #3]
 
 # CHECK: vldrd.u64 q0, [sp, q1, uxtw #3] @ encoding: [0x9d,0xfc,0xd3,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrd.u64 q0, [sp, q1, uxtw #3]
 
 # CHECK: vstrb.8 q0, [r0, q1] @ encoding: [0x80,0xec,0x02,0x0e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrb.8 q0, [r0, q1]
 
 # CHECK: vstrb.8 q3, [r10, q1] @ encoding: [0x8a,0xec,0x02,0x6e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrb.8 q3, [r10, q1]
 
 # CHECK: vstrb.8 q3, [r0, q3] @ encoding: [0x80,0xec,0x06,0x6e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrb.8 q3, [r0, q3]
 
 # CHECK: vstrb.16 q0, [r0, q1] @ encoding: [0x80,0xec,0x82,0x0e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrb.16 q0, [r0, q1]
 
 # CHECK: vstrb.16 q3, [sp, q1] @ encoding: [0x8d,0xec,0x82,0x6e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrb.16 q3, [sp, q1]
 
 # CHECK: vstrb.16 q3, [r0, q3] @ encoding: [0x80,0xec,0x86,0x6e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrb.16 q3, [r0, q3]
 
 # CHECK: vstrb.32 q0, [r0, q1] @ encoding: [0x80,0xec,0x02,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrb.32 q0, [r0, q1]
 
 # CHECK: vstrb.32 q3, [r0, q1] @ encoding: [0x80,0xec,0x02,0x6f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrb.32 q3, [r0, q1]
 
 # CHECK: vstrb.32 q3, [r0, q3] @ encoding: [0x80,0xec,0x06,0x6f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrb.32 q3, [r0, q3]
 
 # CHECK: vstrh.16 q0, [r0, q1] @ encoding: [0x80,0xec,0x92,0x0e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrh.16 q0, [r0, q1]
 
 # CHECK: vstrh.16 q3, [r0, q1] @ encoding: [0x80,0xec,0x92,0x6e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrh.16 q3, [r0, q1]
 
 # CHECK: vstrh.16 q3, [r0, q3] @ encoding: [0x80,0xec,0x96,0x6e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrh.16 q3, [r0, q3]
 
 # CHECK: vstrh.32 q0, [r0, q1] @ encoding: [0x80,0xec,0x12,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrh.32 q0, [r0, q1]
 
 # CHECK: vstrh.32 q3, [r0, q1] @ encoding: [0x80,0xec,0x12,0x6f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrh.32 q3, [r0, q1]
 
 # CHECK: vstrh.32 q3, [r0, q3] @ encoding: [0x80,0xec,0x16,0x6f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrh.32 q3, [r0, q3]
 
 # CHECK: vstrh.16 q0, [r0, q1, uxtw #1] @ encoding: [0x80,0xec,0x93,0x0e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrh.16 q0, [r0, q1, uxtw #1]
 
 # CHECK: vstrh.32 q3, [r8, q3, uxtw #1] @ encoding: [0x88,0xec,0x17,0x6f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrh.32 q3, [r8, q3, uxtw #1]
 
 # CHECK: vstrw.32 q0, [r0, q1] @ encoding: [0x80,0xec,0x42,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrw.32 q0, [r0, q1]
 
 # CHECK: vstrw.32 q3, [r0, q1] @ encoding: [0x80,0xec,0x42,0x6f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrw.32 q3, [r0, q1]
 
 # CHECK: vstrw.32 q3, [r0, q3] @ encoding: [0x80,0xec,0x46,0x6f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrw.32 q3, [r0, q3]
 
 # CHECK: vstrw.32 q0, [r0, q1, uxtw #2] @ encoding: [0x80,0xec,0x43,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrw.32 q0, [r0, q1, uxtw #2]
 
 # CHECK: vstrw.32 q0, [sp, q1, uxtw #2] @ encoding: [0x8d,0xec,0x43,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrw.32 q0, [sp, q1, uxtw #2]
 
 # CHECK: vstrd.64 q0, [r0, q1] @ encoding: [0x80,0xec,0xd2,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrd.64 q0, [r0, q1]
 
 # CHECK: vstrd.64 q3, [r0, q1] @ encoding: [0x80,0xec,0xd2,0x6f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrd.64 q3, [r0, q1]
 
 # CHECK: vstrd.64 q3, [r0, q3] @ encoding: [0x80,0xec,0xd6,0x6f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrd.64 q3, [r0, q3]
 
 # CHECK: vstrd.64 q0, [r0, q1, uxtw #3] @ encoding: [0x80,0xec,0xd3,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrd.64 q0, [r0, q1, uxtw #3]
 
 # CHECK: vstrd.64 q0, [sp, q1, uxtw #3] @ encoding: [0x8d,0xec,0xd3,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrd.64 q0, [sp, q1, uxtw #3]
 
 # ERROR: [[@LINE+2]]:{{[0-9]+}}: {{error|note}}: operand must be a register in range [q0, q7]
@@ -1189,79 +1189,79 @@ vstrh.16 q0, [r0, q1, uxtw #2]
 vstrb.32 q0, [r11, q1, uxtw #1]
 
 # CHECK: vldrw.u32 q0, [q1] @ encoding: [0x92,0xfd,0x00,0x1e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrw.u32 q0, [q1]
 
 # CHECK: vldrw.u32 q7, [q1] @ encoding: [0x92,0xfd,0x00,0xfe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrw.u32 q7, [q1]
 
 # CHECK: vldrw.u32 q7, [q1]! @ encoding: [0xb2,0xfd,0x00,0xfe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrw.u32 q7, [q1]!
 
 # CHECK: vldrw.u32 q7, [q1, #4] @ encoding: [0x92,0xfd,0x01,0xfe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrw.u32 q7, [q1, #4]
 
 # CHECK: vldrw.u32 q7, [q1, #-4] @ encoding: [0x12,0xfd,0x01,0xfe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrw.u32 q7, [q1, #-4]
 
 # CHECK: vldrw.u32 q7, [q1, #508] @ encoding: [0x92,0xfd,0x7f,0xfe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrw.u32 q7, [q1, #508]
 
 # CHECK: vldrw.u32 q7, [q1, #-508] @ encoding: [0x12,0xfd,0x7f,0xfe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrw.u32 q7, [q1, #-508]
 
 # CHECK: vldrw.u32 q7, [q1, #264] @ encoding: [0x92,0xfd,0x42,0xfe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrw.u32 q7, [q1, #264]
 
 # CHECK: vldrw.u32 q7, [q1, #4]! @ encoding: [0xb2,0xfd,0x01,0xfe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrw.u32 q7, [q1, #4]!
 
 # CHECK: vstrw.32 q0, [q1] @ encoding: [0x82,0xfd,0x00,0x1e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrw.32 q0, [q1]
 
 # CHECK: vstrw.32 q1, [q1] @ encoding: [0x82,0xfd,0x00,0x3e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrw.32 q1, [q1]
 
 # CHECK: vstrw.32 q7, [q1] @ encoding: [0x82,0xfd,0x00,0xfe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrw.32 q7, [q1]
 
 # CHECK: vstrw.32 q7, [q1]! @ encoding: [0xa2,0xfd,0x00,0xfe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrw.32 q7, [q1]!
 
 # CHECK: vstrw.32 q7, [q7] @ encoding: [0x8e,0xfd,0x00,0xfe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrw.32 q7, [q7]
 
 # CHECK: vstrw.32 q7, [q1, #4] @ encoding: [0x82,0xfd,0x01,0xfe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrw.32 q7, [q1, #4]
 
 # CHECK: vstrw.32 q7, [q1, #-4] @ encoding: [0x02,0xfd,0x01,0xfe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrw.32 q7, [q1, #-4]
 
 # CHECK: vstrw.32 q7, [q1, #508] @ encoding: [0x82,0xfd,0x7f,0xfe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrw.32 q7, [q1, #508]
 
 # CHECK: vstrw.32 q7, [q1, #-508] @ encoding: [0x02,0xfd,0x7f,0xfe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrw.32 q7, [q1, #-508]
 
 # CHECK: vstrw.32 q7, [q1, #264]! @ encoding: [0xa2,0xfd,0x42,0xfe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrw.32 q7, [q1, #264]!
 
 # ERROR: [[@LINE+2]]:{{[0-9]+}}: {{error|note}}: operand must be a register in range [q0, q7]
@@ -1277,103 +1277,103 @@ vstrw.32 q4, [q1, #3]!
 vldrw.u32 q7, [q1, #512]
 
 # ERROR: [[@LINE+2]]:{{[0-9]+}}: {{error|note}}: destination vector register and vector pointer register can't be identical
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrw.32 q1, [q1, #264]
 
 # CHECK: vldrd.u64 q0, [q1] @ encoding: [0x92,0xfd,0x00,0x1f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrd.u64 q0, [q1]
 
 # CHECK: vldrd.u64 q7, [q1] @ encoding: [0x92,0xfd,0x00,0xff]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrd.u64 q7, [q1]
 
 # CHECK: vldrd.u64 q7, [q1]! @ encoding: [0xb2,0xfd,0x00,0xff]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrd.u64 q7, [q1]!
 
 # CHECK: vldrd.u64 q7, [q1, #8] @ encoding: [0x92,0xfd,0x01,0xff]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrd.u64 q7, [q1, #8]
 
 # CHECK: vldrd.u64 q7, [q1, #-8] @ encoding: [0x12,0xfd,0x01,0xff]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrd.u64 q7, [q1, #-8]
 
 # CHECK: vldrd.u64 q7, [q1, #1016] @ encoding: [0x92,0xfd,0x7f,0xff]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrd.u64 q7, [q1, #1016]
 
 # CHECK: vldrd.u64 q7, [q1, #-1016] @ encoding: [0x12,0xfd,0x7f,0xff]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrd.u64 q7, [q1, #-1016]
 
 # CHECK: vldrd.u64 q7, [q1, #264] @ encoding: [0x92,0xfd,0x21,0xff]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrd.u64 q7, [q1, #264]
 
 # CHECK: vldrd.u64 q7, [q1, #624] @ encoding: [0x92,0xfd,0x4e,0xff]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrd.u64 q7, [q1, #624]
 
 # CHECK: vldrd.u64 q7, [q1, #264] @ encoding: [0x92,0xfd,0x21,0xff]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrd.u64 q7, [q1, #264]
 
 # CHECK: vldrd.u64 q7, [q1, #-1016]! @ encoding: [0x32,0xfd,0x7f,0xff]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrd.u64 q7, [q1, #-1016]!
 
 # ERROR: [[@LINE+2]]:{{[0-9]+}}: {{error|note}}: destination vector register and vector pointer register can't be identical
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrd.u64 q6, [q6]
 
 # CHECK: vstrd.64 q0, [q1] @ encoding: [0x82,0xfd,0x00,0x1f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrd.64 q0, [q1]
 
 # CHECK: vstrd.64 q1, [q1] @ encoding: [0x82,0xfd,0x00,0x3f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrd.64 q1, [q1]
 
 # CHECK: vstrd.64 q7, [q1] @ encoding: [0x82,0xfd,0x00,0xff]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrd.64 q7, [q1]
 
 # CHECK: vstrd.64 q7, [q1]! @ encoding: [0xa2,0xfd,0x00,0xff]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrd.64 q7, [q1]!
 
 # CHECK: vstrd.64 q7, [q7] @ encoding: [0x8e,0xfd,0x00,0xff]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrd.64 q7, [q7]
 
 # CHECK: vstrd.64 q7, [q1, #8] @ encoding: [0x82,0xfd,0x01,0xff]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrd.64 q7, [q1, #8]
 
 # CHECK: vstrd.64 q7, [q1, #-8]! @ encoding: [0x22,0xfd,0x01,0xff]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrd.64 q7, [q1, #-8]!
 
 # CHECK: vstrd.64 q7, [q1, #1016] @ encoding: [0x82,0xfd,0x7f,0xff]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrd.64 q7, [q1, #1016]
 
 # CHECK: vstrd.64 q7, [q1, #-1016] @ encoding: [0x02,0xfd,0x7f,0xff]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrd.64 q7, [q1, #-1016]
 
 # CHECK: vstrd.64 q7, [q1, #264] @ encoding: [0x82,0xfd,0x21,0xff]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrd.64 q7, [q1, #264]
 
 # CHECK: vstrd.64 q7, [q1, #624] @ encoding: [0x82,0xfd,0x4e,0xff]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrd.64 q7, [q1, #624]
 
 # CHECK: vstrd.64 q7, [q1, #264] @ encoding: [0x82,0xfd,0x21,0xff]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrd.64 q7, [q1, #264]
 
 # ERROR: [[@LINE+2]]:{{[0-9]+}}: {{error|note}}: operand must be a register in range [q0, q7]
@@ -1393,547 +1393,547 @@ vstrd.64 q4, [q1, #3]
 vstrd.64 q4, [q1, #4]
 
 # CHECK: vldrb.u8 q0, [r0] @ encoding: [0x90,0xed,0x00,0x1e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.s8 q0, [r0]
 
 # CHECK: vldrb.u8 q0, [r0] @ encoding: [0x90,0xed,0x00,0x1e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.8 q0, [r0]
 
 # CHECK: vldrb.u8 q0, [r8, #56] @ encoding: [0x98,0xed,0x38,0x1e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.s8 q0, [r8, #56]
 
 # CHECK: vldrb.u8 q0, [r8, #56] @ encoding: [0x98,0xed,0x38,0x1e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.8 q0, [r8, #56]
 
 # CHECK: vldrb.u8 q5, [r4, #56]! @ encoding: [0xb4,0xed,0x38,0xbe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.s8 q5, [r4, #56]!
 
 # CHECK: vldrb.u8 q5, [r4, #56]! @ encoding: [0xb4,0xed,0x38,0xbe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.8 q5, [r4, #56]!
 
 # CHECK: vstrb.8 q0, [r0] @ encoding: [0x80,0xed,0x00,0x1e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrb.u8 q0, [r0]
 
 # CHECK: vstrb.8 q0, [r0] @ encoding: [0x80,0xed,0x00,0x1e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrb.s8 q0, [r0]
 
 # CHECK: vstrb.8 q4, [r4, #56] @ encoding: [0x84,0xed,0x38,0x9e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrb.u8 q4, [r4, #56]
 
 # CHECK: vstrb.8 q4, [r4, #56] @ encoding: [0x84,0xed,0x38,0x9e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrb.s8 q4, [r4, #56]
 
 # CHECK: vstrb.8 q5, [r4, #56]! @ encoding: [0xa4,0xed,0x38,0xbe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrb.u8 q5, [r4, #56]!
 
 # CHECK: vstrb.8 q5, [r4, #56]! @ encoding: [0xa4,0xed,0x38,0xbe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrb.s8 q5, [r4, #56]!
 
 # CHECK: vldrh.u16 q0, [r0] @ encoding: [0x90,0xed,0x80,0x1e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.s16 q0, [r0]
 
 # CHECK: vldrh.u16 q0, [r0] @ encoding: [0x90,0xed,0x80,0x1e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.f16 q0, [r0]
 
 # CHECK: vldrh.u16 q0, [r0] @ encoding: [0x90,0xed,0x80,0x1e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.16 q0, [r0]
 
 # CHECK: vldrh.u16 q0, [r4, #56] @ encoding: [0x94,0xed,0x9c,0x1e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.s16 q0, [r4, #56]
 
 # CHECK: vldrh.u16 q0, [r4, #56] @ encoding: [0x94,0xed,0x9c,0x1e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.f16 q0, [r4, #56]
 
 # CHECK: vldrh.u16 q0, [r4, #56] @ encoding: [0x94,0xed,0x9c,0x1e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.16 q0, [r4, #56]
 
 # CHECK: vldrh.u16 q5, [r4, #56]! @ encoding: [0xb4,0xed,0x9c,0xbe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.s16 q5, [r4, #56]!
 
 # CHECK: vldrh.u16 q5, [r4, #56]! @ encoding: [0xb4,0xed,0x9c,0xbe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.f16 q5, [r4, #56]!
 
 # CHECK: vldrh.u16 q5, [r4, #56]! @ encoding: [0xb4,0xed,0x9c,0xbe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.16 q5, [r4, #56]!
 
 # CHECK: vstrh.16 q0, [r0] @ encoding: [0x80,0xed,0x80,0x1e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrh.u16 q0, [r0]
 
 # CHECK: vstrh.16 q0, [r0] @ encoding: [0x80,0xed,0x80,0x1e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrh.s16 q0, [r0]
 
 # CHECK: vstrh.16 q0, [r0] @ encoding: [0x80,0xed,0x80,0x1e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrh.f16 q0, [r0]
 
 # CHECK: vstrh.16 q0, [r4, #56] @ encoding: [0x84,0xed,0x9c,0x1e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrh.u16 q0, [r4, #56]
 
 # CHECK: vstrh.16 q0, [r4, #56] @ encoding: [0x84,0xed,0x9c,0x1e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrh.s16 q0, [r4, #56]
 
 # CHECK: vstrh.16 q0, [r4, #56] @ encoding: [0x84,0xed,0x9c,0x1e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrh.f16 q0, [r4, #56]
 
 # CHECK: vstrh.16 q5, [r4, #56]! @ encoding: [0xa4,0xed,0x9c,0xbe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrh.u16 q5, [r4, #56]!
 
 # CHECK: vstrh.16 q5, [r4, #56]! @ encoding: [0xa4,0xed,0x9c,0xbe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrh.s16 q5, [r4, #56]!
 
 # CHECK: vstrh.16 q5, [r4, #56]! @ encoding: [0xa4,0xed,0x9c,0xbe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrh.f16 q5, [r4, #56]!
 
 # CHECK: vldrw.u32 q0, [r0] @ encoding: [0x90,0xed,0x00,0x1f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrw.s32 q0, [r0]
 
 # CHECK: vldrw.u32 q0, [r0] @ encoding: [0x90,0xed,0x00,0x1f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrw.f32 q0, [r0]
 
 # CHECK: vldrw.u32 q0, [r0] @ encoding: [0x90,0xed,0x00,0x1f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrw.32 q0, [r0]
 
 # CHECK: vldrw.u32 q0, [r4, #56] @ encoding: [0x94,0xed,0x0e,0x1f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrw.s32 q0, [r4, #56]
 
 # CHECK: vldrw.u32 q0, [r4, #56] @ encoding: [0x94,0xed,0x0e,0x1f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrw.f32 q0, [r4, #56]
 
 # CHECK: vldrw.u32 q0, [r4, #56] @ encoding: [0x94,0xed,0x0e,0x1f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrw.32 q0, [r4, #56]
 
 # CHECK: vldrw.u32 q5, [r4, #56]! @ encoding: [0xb4,0xed,0x0e,0xbf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrw.s32 q5, [r4, #56]!
 
 # CHECK: vldrw.u32 q5, [r4, #56]! @ encoding: [0xb4,0xed,0x0e,0xbf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrw.f32 q5, [r4, #56]!
 
 # CHECK: vldrw.u32 q5, [r4, #56]! @ encoding: [0xb4,0xed,0x0e,0xbf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrw.32 q5, [r4, #56]!
 
 # CHECK: vstrw.32 q0, [r0] @ encoding: [0x80,0xed,0x00,0x1f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrw.u32 q0, [r0]
 
 # CHECK: vstrw.32 q0, [r0] @ encoding: [0x80,0xed,0x00,0x1f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrw.s32 q0, [r0]
 
 # CHECK: vstrw.32 q0, [r0] @ encoding: [0x80,0xed,0x00,0x1f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrw.f32 q0, [r0]
 
 # CHECK: vstrw.32 q0, [r4, #56] @ encoding: [0x84,0xed,0x0e,0x1f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrw.u32 q0, [r4, #56]
 
 # CHECK: vstrw.32 q0, [r4, #56] @ encoding: [0x84,0xed,0x0e,0x1f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrw.s32 q0, [r4, #56]
 
 # CHECK: vstrw.32 q0, [r4, #56] @ encoding: [0x84,0xed,0x0e,0x1f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrw.f32 q0, [r4, #56]
 
 # CHECK: vstrw.32 q5, [r4, #56]! @ encoding: [0xa4,0xed,0x0e,0xbf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrw.u32 q5, [r4, #56]!
 
 # CHECK: vstrw.32 q5, [r4, #56]! @ encoding: [0xa4,0xed,0x0e,0xbf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrw.s32 q5, [r4, #56]!
 
 # CHECK: vstrw.32 q5, [r4, #56]! @ encoding: [0xa4,0xed,0x0e,0xbf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrw.f32 q5, [r4, #56]!
 
 # CHECK: vldrb.u8 q0, [r0, q1] @ encoding: [0x90,0xfc,0x02,0x0e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.s8 q0, [r0, q1]
 
 # CHECK: vldrb.u8 q0, [r0, q1] @ encoding: [0x90,0xfc,0x02,0x0e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrb.8 q0, [r0, q1]
 
 # CHECK: vldrh.u16 q3, [r0, q1] @ encoding: [0x90,0xfc,0x92,0x6e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.s16 q3, [r0, q1]
 
 # CHECK: vldrh.u16 q3, [r0, q1] @ encoding: [0x90,0xfc,0x92,0x6e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.f16 q3, [r0, q1]
 
 # CHECK: vldrh.u16 q3, [r0, q1] @ encoding: [0x90,0xfc,0x92,0x6e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.16 q3, [r0, q1]
 
 # CHECK: vldrh.u16 q0, [r0, q1, uxtw #1] @ encoding: [0x90,0xfc,0x93,0x0e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.s16 q0, [r0, q1, uxtw #1]
 
 # CHECK: vldrh.u16 q0, [r0, q1, uxtw #1] @ encoding: [0x90,0xfc,0x93,0x0e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.f16 q0, [r0, q1, uxtw #1]
 
 # CHECK: vldrh.u16 q0, [r0, q1, uxtw #1] @ encoding: [0x90,0xfc,0x93,0x0e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrh.16 q0, [r0, q1, uxtw #1]
 
 # CHECK: vldrw.u32 q0, [r0, q1] @ encoding: [0x90,0xfc,0x42,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrw.s32 q0, [r0, q1]
 
 # CHECK: vldrw.u32 q0, [r0, q1] @ encoding: [0x90,0xfc,0x42,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrw.f32 q0, [r0, q1]
 
 # CHECK: vldrw.u32 q0, [r0, q1] @ encoding: [0x90,0xfc,0x42,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrw.32 q0, [r0, q1]
 
 # CHECK: vldrw.u32 q0, [r0, q1, uxtw #2] @ encoding: [0x90,0xfc,0x43,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrw.s32 q0, [r0, q1, uxtw #2]
 
 # CHECK: vldrw.u32 q0, [r0, q1, uxtw #2] @ encoding: [0x90,0xfc,0x43,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrw.f32 q0, [r0, q1, uxtw #2]
 
 # CHECK: vldrw.u32 q0, [r0, q1, uxtw #2] @ encoding: [0x90,0xfc,0x43,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrw.32 q0, [r0, q1, uxtw #2]
 
 # CHECK: vldrd.u64 q0, [r0, q1] @ encoding: [0x90,0xfc,0xd2,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrd.s64 q0, [r0, q1]
 
 # CHECK: vldrd.u64 q0, [r0, q1] @ encoding: [0x90,0xfc,0xd2,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrd.f64 q0, [r0, q1]
 
 # CHECK: vldrd.u64 q0, [r0, q1] @ encoding: [0x90,0xfc,0xd2,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrd.64 q0, [r0, q1]
 
 # CHECK: vldrd.u64 q0, [r0, q1, uxtw #3] @ encoding: [0x90,0xfc,0xd3,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrd.s64 q0, [r0, q1, uxtw #3]
 
 # CHECK: vldrd.u64 q0, [r0, q1, uxtw #3] @ encoding: [0x90,0xfc,0xd3,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrd.f64 q0, [r0, q1, uxtw #3]
 
 # CHECK: vldrd.u64 q0, [r0, q1, uxtw #3] @ encoding: [0x90,0xfc,0xd3,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrd.64 q0, [r0, q1, uxtw #3]
 
 # CHECK: vstrb.8 q0, [r0, q1] @ encoding: [0x80,0xec,0x02,0x0e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrb.u8 q0, [r0, q1]
 
 # CHECK: vstrb.8 q0, [r0, q1] @ encoding: [0x80,0xec,0x02,0x0e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrb.s8 q0, [r0, q1]
 
 # CHECK: vstrh.16 q3, [r0, q1] @ encoding: [0x80,0xec,0x92,0x6e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrh.u16 q3, [r0, q1]
 
 # CHECK: vstrh.16 q3, [r0, q1] @ encoding: [0x80,0xec,0x92,0x6e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrh.s16 q3, [r0, q1]
 
 # CHECK: vstrh.16 q3, [r0, q1] @ encoding: [0x80,0xec,0x92,0x6e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrh.f16 q3, [r0, q1]
 
 # CHECK: vstrh.16 q0, [r0, q1, uxtw #1] @ encoding: [0x80,0xec,0x93,0x0e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrh.u16 q0, [r0, q1, uxtw #1]
 
 # CHECK: vstrh.16 q0, [r0, q1, uxtw #1] @ encoding: [0x80,0xec,0x93,0x0e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrh.s16 q0, [r0, q1, uxtw #1]
 
 # CHECK: vstrh.16 q0, [r0, q1, uxtw #1] @ encoding: [0x80,0xec,0x93,0x0e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrh.f16 q0, [r0, q1, uxtw #1]
 
 # CHECK: vstrw.32 q0, [r0, q1] @ encoding: [0x80,0xec,0x42,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrw.u32 q0, [r0, q1]
 
 # CHECK: vstrw.32 q0, [r0, q1] @ encoding: [0x80,0xec,0x42,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrw.s32 q0, [r0, q1]
 
 # CHECK: vstrw.32 q0, [r0, q1] @ encoding: [0x80,0xec,0x42,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrw.f32 q0, [r0, q1]
 
 # CHECK: vstrw.32 q0, [r0, q1, uxtw #2] @ encoding: [0x80,0xec,0x43,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrw.u32 q0, [r0, q1, uxtw #2]
 
 # CHECK: vstrw.32 q0, [r0, q1, uxtw #2] @ encoding: [0x80,0xec,0x43,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrw.s32 q0, [r0, q1, uxtw #2]
 
 # CHECK: vstrw.32 q0, [r0, q1, uxtw #2] @ encoding: [0x80,0xec,0x43,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrw.f32 q0, [r0, q1, uxtw #2]
 
 # CHECK: vstrd.64 q3, [r0, q1] @ encoding: [0x80,0xec,0xd2,0x6f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrd.u64 q3, [r0, q1]
 
 # CHECK: vstrd.64 q3, [r0, q1] @ encoding: [0x80,0xec,0xd2,0x6f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrd.s64 q3, [r0, q1]
 
 # CHECK: vstrd.64 q3, [r0, q1] @ encoding: [0x80,0xec,0xd2,0x6f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrd.f64 q3, [r0, q1]
 
 # CHECK: vstrd.64 q0, [r0, q1, uxtw #3] @ encoding: [0x80,0xec,0xd3,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrd.u64 q0, [r0, q1, uxtw #3]
 
 # CHECK: vstrd.64 q0, [r0, q1, uxtw #3] @ encoding: [0x80,0xec,0xd3,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrd.s64 q0, [r0, q1, uxtw #3]
 
 # CHECK: vstrd.64 q0, [r0, q1, uxtw #3] @ encoding: [0x80,0xec,0xd3,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrd.f64 q0, [r0, q1, uxtw #3]
 
 # CHECK: vldrw.u32 q0, [q1] @ encoding: [0x92,0xfd,0x00,0x1e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrw.s32 q0, [q1]
 
 # CHECK: vldrw.u32 q0, [q1] @ encoding: [0x92,0xfd,0x00,0x1e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrw.f32 q0, [q1]
 
 # CHECK: vldrw.u32 q0, [q1] @ encoding: [0x92,0xfd,0x00,0x1e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrw.32 q0, [q1]
 
 # CHECK: vldrw.u32 q7, [q1]! @ encoding: [0xb2,0xfd,0x00,0xfe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrw.s32 q7, [q1]!
 
 # CHECK: vldrw.u32 q7, [q1]! @ encoding: [0xb2,0xfd,0x00,0xfe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrw.f32 q7, [q1]!
 
 # CHECK: vldrw.u32 q7, [q1]! @ encoding: [0xb2,0xfd,0x00,0xfe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrw.32 q7, [q1]!
 
 # CHECK: vldrw.u32 q7, [q1, #4] @ encoding: [0x92,0xfd,0x01,0xfe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrw.s32 q7, [q1, #4]
 
 # CHECK: vldrw.u32 q7, [q1, #4] @ encoding: [0x92,0xfd,0x01,0xfe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrw.f32 q7, [q1, #4]
 
 # CHECK: vldrw.u32 q7, [q1, #4] @ encoding: [0x92,0xfd,0x01,0xfe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrw.32 q7, [q1, #4]
 
 # CHECK: vldrw.u32 q7, [q1, #4]! @ encoding: [0xb2,0xfd,0x01,0xfe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrw.s32 q7, [q1, #4]!
 
 # CHECK: vldrw.u32 q7, [q1, #4]! @ encoding: [0xb2,0xfd,0x01,0xfe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrw.f32 q7, [q1, #4]!
 
 # CHECK: vldrw.u32 q7, [q1, #4]! @ encoding: [0xb2,0xfd,0x01,0xfe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrw.u32 q7, [q1, #4]!
 
 # CHECK: vstrw.32 q0, [q1] @ encoding: [0x82,0xfd,0x00,0x1e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrw.u32 q0, [q1]
 
 # CHECK: vstrw.32 q0, [q1] @ encoding: [0x82,0xfd,0x00,0x1e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrw.s32 q0, [q1]
 
 # CHECK: vstrw.32 q0, [q1] @ encoding: [0x82,0xfd,0x00,0x1e]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrw.f32 q0, [q1]
 
 # CHECK: vstrw.32 q7, [q1]! @ encoding: [0xa2,0xfd,0x00,0xfe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrw.u32 q7, [q1]!
 
 # CHECK: vstrw.32 q7, [q1]! @ encoding: [0xa2,0xfd,0x00,0xfe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrw.s32 q7, [q1]!
 
 # CHECK: vstrw.32 q7, [q1]! @ encoding: [0xa2,0xfd,0x00,0xfe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrw.f32 q7, [q1]!
 
 # CHECK: vstrw.32 q7, [q1, #508] @ encoding: [0x82,0xfd,0x7f,0xfe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrw.u32 q7, [q1, #508]
 
 # CHECK: vstrw.32 q7, [q1, #508] @ encoding: [0x82,0xfd,0x7f,0xfe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrw.s32 q7, [q1, #508]
 
 # CHECK: vstrw.32 q7, [q1, #508] @ encoding: [0x82,0xfd,0x7f,0xfe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrw.f32 q7, [q1, #508]
 
 # CHECK: vstrw.32 q7, [q1, #264]! @ encoding: [0xa2,0xfd,0x42,0xfe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrw.u32 q7, [q1, #264]!
 
 # CHECK: vstrw.32 q7, [q1, #264]! @ encoding: [0xa2,0xfd,0x42,0xfe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrw.s32 q7, [q1, #264]!
 
 # CHECK: vstrw.32 q7, [q1, #264]! @ encoding: [0xa2,0xfd,0x42,0xfe]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrw.f32 q7, [q1, #264]!
 
 # CHECK: vldrd.u64 q0, [q1] @ encoding: [0x92,0xfd,0x00,0x1f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrd.s64 q0, [q1]
 
 # CHECK: vldrd.u64 q0, [q1] @ encoding: [0x92,0xfd,0x00,0x1f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrd.f64 q0, [q1]
 
 # CHECK: vldrd.u64 q0, [q1] @ encoding: [0x92,0xfd,0x00,0x1f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrd.64 q0, [q1]
 
 # CHECK: vldrd.u64 q7, [q1]! @ encoding: [0xb2,0xfd,0x00,0xff]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrd.s64 q7, [q1]!
 
 # CHECK: vldrd.u64 q7, [q1]! @ encoding: [0xb2,0xfd,0x00,0xff]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrd.f64 q7, [q1]!
 
 # CHECK: vldrd.u64 q7, [q1]! @ encoding: [0xb2,0xfd,0x00,0xff]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrd.64 q7, [q1]!
 
 # CHECK: vldrd.u64 q7, [q1, #8] @ encoding: [0x92,0xfd,0x01,0xff]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrd.s64 q7, [q1, #8]
 
 # CHECK: vldrd.u64 q7, [q1, #8] @ encoding: [0x92,0xfd,0x01,0xff]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrd.f64 q7, [q1, #8]
 
 # CHECK: vldrd.u64 q7, [q1, #8] @ encoding: [0x92,0xfd,0x01,0xff]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrd.64 q7, [q1, #8]
 
 # CHECK: vldrd.u64 q7, [q1, #-1016]! @ encoding: [0x32,0xfd,0x7f,0xff]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrd.s64 q7, [q1, #-1016]!
 
 # CHECK: vldrd.u64 q7, [q1, #-1016]! @ encoding: [0x32,0xfd,0x7f,0xff]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrd.f64 q7, [q1, #-1016]!
 
 # CHECK: vldrd.u64 q7, [q1, #-1016]! @ encoding: [0x32,0xfd,0x7f,0xff]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vldrd.64 q7, [q1, #-1016]!
 
 # CHECK: vstrd.64 q0, [q1] @ encoding: [0x82,0xfd,0x00,0x1f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrd.u64 q0, [q1]
 
 # CHECK: vstrd.64 q0, [q1] @ encoding: [0x82,0xfd,0x00,0x1f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrd.s64 q0, [q1]
 
 # CHECK: vstrd.64 q0, [q1] @ encoding: [0x82,0xfd,0x00,0x1f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrd.f64 q0, [q1]
 
 # CHECK: vstrd.64 q7, [q1]! @ encoding: [0xa2,0xfd,0x00,0xff]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrd.u64 q7, [q1]!
 
 # CHECK: vstrd.64 q7, [q1]! @ encoding: [0xa2,0xfd,0x00,0xff]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrd.s64 q7, [q1]!
 
 # CHECK: vstrd.64 q7, [q1]! @ encoding: [0xa2,0xfd,0x00,0xff]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrd.f64 q7, [q1]!
 
 # CHECK: vstrd.64 q7, [q1, #1016] @ encoding: [0x82,0xfd,0x7f,0xff]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrd.u64 q7, [q1, #1016]
 
 # CHECK: vstrd.64 q7, [q1, #1016] @ encoding: [0x82,0xfd,0x7f,0xff]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrd.s64 q7, [q1, #1016]
 
 # CHECK: vstrd.64 q7, [q1, #1016] @ encoding: [0x82,0xfd,0x7f,0xff]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrd.f64 q7, [q1, #1016]
 
 # CHECK: vstrd.64 q7, [q1, #-8]! @ encoding: [0x22,0xfd,0x01,0xff]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrd.u64 q7, [q1, #-8]!
 
 # CHECK: vstrd.64 q7, [q1, #-8]! @ encoding: [0x22,0xfd,0x01,0xff]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrd.s64 q7, [q1, #-8]!
 
 # CHECK: vstrd.64 q7, [q1, #-8]! @ encoding: [0x22,0xfd,0x01,0xff]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vstrd.f64 q7, [q1, #-8]!
 
 vpste
diff --git a/llvm/test/MC/ARM/mve-misc.s b/llvm/test/MC/ARM/mve-misc.s
index f3af9e0afe64fb..251a77b4710dcd 100644
--- a/llvm/test/MC/ARM/mve-misc.s
+++ b/llvm/test/MC/ARM/mve-misc.s
@@ -7,63 +7,63 @@
 # RUN:     FileCheck --check-prefix=ERROR-NOMVE < %t %s
 
 # CHECK: vpsel   q0, q5, q2  @ encoding: [0x3b,0xfe,0x05,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vpsel   q0, q5, q2
 
 # CHECK: vpnot  @ encoding: [0x31,0xfe,0x4d,0x0f]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vpnot
 
 # CHECK: wlstp.8     lr, r0, #1668  @ encoding: [0x00,0xf0,0x43,0xc3]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 wlstp.8     lr, r0, #1668
 
 # CHECK: wlstp.16     lr, r0, #1668  @ encoding: [0x10,0xf0,0x43,0xc3]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 wlstp.16     lr, r0, #1668
 
 # CHECK: wlstp.32     lr, r4, #2706  @ encoding: [0x24,0xf0,0x49,0xcd]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 wlstp.32     lr, r4, #2706
 
 # CHECK: wlstp.64     lr, lr, #3026  @ encoding: [0x3e,0xf0,0xe9,0xcd]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 wlstp.64     lr, lr, #3026
 
 # CHECK: wlstp.8     lr, r5, #3436  @ encoding: [0x05,0xf0,0xb7,0xc6]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 wlstp.8     lr, r5, #3436
 
 # CHECK: wlstp.16     lr, r1, #1060  @ encoding: [0x11,0xf0,0x13,0xc2]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 wlstp.16     lr, r1, #1060
 
 # CHECK: wlstp.32     lr, r7, #4036  @ encoding: [0x27,0xf0,0xe3,0xc7]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 wlstp.32     lr, r7, #4036
 
 # CHECK: wlstp.8     lr, r1, #538  @ encoding: [0x01,0xf0,0x0d,0xc9]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 wlstp.8     lr, r1, #538
 
 # CHECK: wlstp.8     lr, r10, #1404  @ encoding: [0x0a,0xf0,0xbf,0xc2]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 wlstp.8     lr, r10, #1404
 
 # CHECK: wlstp.8     lr, r10, #1408  @ encoding: [0x0a,0xf0,0xc1,0xc2]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 wlstp.8     lr, r10, #1408
 
 # CHECK: wlstp.8     lr, r10, #2358  @ encoding: [0x0a,0xf0,0x9b,0xcc]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 wlstp.8     lr, r10, #2358
 
 # CHECK: wlstp.8     lr, r10, #4086  @ encoding: [0x0a,0xf0,0xfb,0xcf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 wlstp.8     lr, r10, #4086
 
 # CHECK: wlstp.8     lr, r11, #1442  @ encoding: [0x0b,0xf0,0xd1,0xca]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 wlstp.8     lr, r11, #1442
 
 # ERROR: [[@LINE+2]]:{{[0-9]+}}: {{error|note}}: loop end is out of range or not a positive multiple of 2
@@ -87,39 +87,39 @@ wlstp.16     lr, sp, #1442
 wlstp.32     r10, r11, #1442
 
 # CHECK: wlstp.8     lr, r1, .Lendloop  @ encoding: [0x01'A',0xf0'A',0x01'A',0xc0'A']
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 wlstp.8     lr, r1, .Lendloop
 
 # CHECK: wlstp.16     lr, r2, .Lendloop  @ encoding: [0x12'A',0xf0'A',0x01'A',0xc0'A']
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 wlstp.16     lr, r2, .Lendloop
 
 # CHECK: wlstp.32     lr, r3, .Lendloop  @ encoding: [0x23'A',0xf0'A',0x01'A',0xc0'A']
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 wlstp.32     lr, r3, .Lendloop
 
 # CHECK: wlstp.64     lr, r5, .Lendloop  @ encoding: [0x35'A',0xf0'A',0x01'A',0xc0'A']
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 wlstp.64     lr, r5, .Lendloop
 
 # CHECK: wlstp.64     lr, r5, #0  @ encoding: [0x35,0xf0,0x01,0xc0]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 wlstp.64     lr, r5, #0
 
 # CHECK: dlstp.8     lr, r5  @ encoding: [0x05,0xf0,0x01,0xe0]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 dlstp.8     lr, r5
 
 # CHECK: dlstp.16     lr, r5  @ encoding: [0x15,0xf0,0x01,0xe0]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 dlstp.16     lr, r5
 
 # CHECK: dlstp.32     lr, r7  @ encoding: [0x27,0xf0,0x01,0xe0]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 dlstp.32     lr, r7
 
 # CHECK: dlstp.64     lr, r2  @ encoding: [0x32,0xf0,0x01,0xe0]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 dlstp.64     lr, r2
 
 # ERROR: [[@LINE+2]]:{{[0-9]+}}: {{error|note}}: operand must be a register in range [r0, r12] or r14
@@ -135,15 +135,15 @@ dlstp.64     r10, r0
 dlstp.64     lr, pc
 
 # CHECK: letp lr, #-2 @ encoding: [0x1f,0xf0,0x01,0xc8]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 letp lr, #-2
 
 # CHECK: letp lr, #-8 @ encoding: [0x1f,0xf0,0x05,0xc0]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 letp lr, #-8
 
 # CHECK: letp lr, #-4094 @ encoding: [0x1f,0xf0,0xff,0xcf]
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 letp lr, #-4094
 
 # ERROR: [[@LINE+2]]:{{[0-9]+}}: {{error|note}}: invalid operand for instruction
@@ -159,7 +159,7 @@ letp lr, #8
 letp lr, #-4096
 
 # CHECK: letp lr, .Lstartloop @ encoding: [0x1f'A',0xf0'A',0x01'A',0xc0'A']
-# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 letp lr, .Lstartloop
 
 # CHECK: lctp @ encoding: [0x0f,0xf0,0x01,0xe0]
@@ -172,8 +172,11 @@ it eq
 # ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 lctpeq
 
+# ERROR-NOMVE: [[@LINE+1]]:1: error: instruction requires: mve
 vpste
+# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
 vpselt.s16 q0, q1, q2
+# ERROR-NOMVE: [[@LINE+1]]:1: error: invalid instruction
 vpsele.i32 q0, q1, q2
 # CHECK: vpste @ encoding: [0x71,0xfe,0x4d,0x8f]
 # CHECK: vpselt q0, q1, q2 @ encoding: [0x33,0xfe,0x05,0x0f]
diff --git a/llvm/test/MC/ARM/neon-complex.s b/llvm/test/MC/ARM/neon-complex.s
index 0d428b596c94c0..6054a08dc82e89 100644
--- a/llvm/test/MC/ARM/neon-complex.s
+++ b/llvm/test/MC/ARM/neon-complex.s
@@ -29,8 +29,10 @@
 // FP16-ARM: vcmla.f16       q0, q1, q2, #0  @ encoding: [0x44,0x08,0x22,0xfc]
 // FP16-THUMB: vcmla.f16       q0, q1, q2, #0  @ encoding: [0x22,0xfc,0x44,0x08]
 // NO-FP16-STDERR: :[[@LINE-3]]:{{[0-9]*}}: note: instruction requires: full half-float
-// V82A: :[[@LINE-4]]:{{[0-9]*}}: error: instruction requires: armv8.3a
-// NO-NEON_STDERR: :[[@LINE-5]]:{{[0-9]*}}: error: instruction requires: NEON
+// V82A: :[[@LINE-4]]:{{[0-9]*}}: error: invalid instruction, any one of the following would fix this:
+// V82A: :[[@LINE-5]]:{{[0-9]*}}: note: instruction requires: mve.fp
+// V82A: :[[@LINE-6]]:{{[0-9]*}}: note: instruction requires: armv8.3a
+// NO-NEON_STDERR: :[[@LINE-7]]:{{[0-9]*}}: error: instruction requires: NEON
   vcmla.f32 d0, d1, d2, #0
 // ARM: vcmla.f32       d0, d1, d2, #0  @ encoding: [0x02,0x08,0x31,0xfc]
 // THUMB: vcmla.f32       d0, d1, d2, #0  @ encoding: [0x31,0xfc,0x02,0x08]
@@ -39,8 +41,10 @@
   vcmla.f32 q0, q1, q2, #0
 // ARM: vcmla.f32       q0, q1, q2, #0  @ encoding: [0x44,0x08,0x32,0xfc]
 // THUMB: vcmla.f32       q0, q1, q2, #0  @ encoding: [0x32,0xfc,0x44,0x08]
-// V82A: :[[@LINE-3]]:{{[0-9]*}}: error: instruction requires: armv8.3a
-// NO-NEON_STDERR: :[[@LINE-4]]:{{[0-9]*}}: error: instruction requires: NEON
+// V82A: :[[@LINE-3]]:{{[0-9]*}}: error: invalid instruction, any one of the following would fix this:
+// V82A: :[[@LINE-4]]:{{[0-9]*}}: note: instruction requires: mve.fp
+// V82A: :[[@LINE-5]]:{{[0-9]*}}: note: instruction requires: armv8.3a
+// NO-NEON_STDERR: :[[@LINE-6]]:{{[0-9]*}}: error: instruction requires: NEON
 
 // Valid rotations
   vcmla.f32 d0, d1, d2, #90
@@ -83,8 +87,10 @@
 // FP16-ARM: vcadd.f16       q0, q1, q2, #90 @ encoding: [0x44,0x08,0x82,0xfc]
 // FP16-THUMB: vcadd.f16       q0, q1, q2, #90 @ encoding: [0x82,0xfc,0x44,0x08]
 // NO-FP16-STDERR: :[[@LINE-3]]:{{[0-9]*}}: note: instruction requires: full half-float
-// V82A: :[[@LINE-4]]:{{[0-9]*}}: error: instruction requires: armv8.3a
-// NO-NEON_STDERR: :[[@LINE-5]]:{{[0-9]*}}: error: instruction requires: NEON
+// V82A: :[[@LINE-4]]:{{[0-9]*}}: error: invalid instruction, any one of the following would fix this:
+// V82A: :[[@LINE-5]]:{{[0-9]*}}: note: instruction requires: mve.fp
+// V82A: :[[@LINE-6]]:{{[0-9]*}}: note: instruction requires: armv8.3a
+// NO-NEON_STDERR: :[[@LINE-7]]:{{[0-9]*}}: error: instruction requires: NEON
   vcadd.f32 d0, d1, d2, #90
 // ARM: vcadd.f32       d0, d1, d2, #90 @ encoding: [0x02,0x08,0x91,0xfc]
 // THUMB: vcadd.f32       d0, d1, d2, #90 @ encoding: [0x91,0xfc,0x02,0x08]
@@ -93,8 +99,10 @@
   vcadd.f32 q0, q1, q2, #90
 // ARM: vcadd.f32       q0, q1, q2, #90 @ encoding: [0x44,0x08,0x92,0xfc]
 // THUMB: vcadd.f32       q0, q1, q2, #90 @ encoding: [0x92,0xfc,0x44,0x08]
-// V82A: :[[@LINE-3]]:{{[0-9]*}}: error: instruction requires: armv8.3a
-// NO-NEON_STDERR: :[[@LINE-4]]:{{[0-9]*}}: error: instruction requires: NEON
+// V82A: :[[@LINE-3]]:{{[0-9]*}}: error: invalid instruction, any one of the following would fix this:
+// V82A: :[[@LINE-4]]:{{[0-9]*}}: note: instruction requires: mve.fp
+// V82A: :[[@LINE-5]]:{{[0-9]*}}: note: instruction requires: armv8.3a
+// NO-NEON_STDERR: :[[@LINE-6]]:{{[0-9]*}}: error: instruction requires: NEON
 
 // Valid rotations
   vcadd.f32 d0, d1, d2, #270
diff --git a/llvm/test/MC/ARM/no-mve.s b/llvm/test/MC/ARM/no-mve.s
index 668db402dc66dd..2435ced45a370e 100644
--- a/llvm/test/MC/ARM/no-mve.s
+++ b/llvm/test/MC/ARM/no-mve.s
@@ -4,13 +4,13 @@
 # RUN: FileCheck --check-prefix=CHECK-MVE < %t %s
 
 # CHECK-MVE: instruction requires: mve.fp
-# CHECK: invalid instruction
+# CHECK: instruction requires: mve.fp
 vcadd.f32 q1, q2, q3, #270
 
 # CHECK-MVE: instruction requires: mve.fp
-# CHECK: invalid instruction
+# CHECK: instruction requires: mve.fp
 vadd.f32 q1, q2, q3
 
 # CHECK-MVE: vadd.i16 q1, q2, q3 @ encoding: [0x14,0xef,0x46,0x28]
-# CHECK: invalid instruction
+# CHECK: instruction requires: mve
 vadd.i16 q1, q2, q3
diff --git a/llvm/test/MC/ARM/not-armv4.s b/llvm/test/MC/ARM/not-armv4.s
index c62c50c26c31d5..b65b12976f78a8 100644
--- a/llvm/test/MC/ARM/not-armv4.s
+++ b/llvm/test/MC/ARM/not-armv4.s
@@ -1,13 +1,21 @@
 @ RUN: not llvm-mc < %s -triple armv4-unknown-unknown -show-encoding 2>&1 | FileCheck %s
 
 @ PR18524
-@ CHECK: instruction requires: armv5t
+@ CHECK: error: invalid instruction, any one of the following would fix this:
+@ CHECK: note: instruction requires: armv5t
+@ CHECK: note: instruction requires: thumb2
 clz r4,r9
 
-@ CHECK: instruction requires: armv6t2
+@ CHECK: error: invalid instruction, any one of the following would fix this:
+@ CHECK: note: instruction requires: armv6t2
+@ CHECK: note: instruction requires: thumb2
 rbit r4,r9
 
 @ CHECK: error: instruction requires: armv6t2
 movw r4,#0x1234
-@ CHECK: error: instruction requires: armv6t2
+
+@ CHECK: error: invalid instruction, any one of the following would fix this:
+@ CHECK: note: invalid operand for instruction
+@ CHECK: note: operand must be a register in range [r0, r15]
+@ CHECK: note: instruction requires: armv6t2
 mov  r4,#0x1234
diff --git a/llvm/test/MC/ARM/register-token-source-loc.s b/llvm/test/MC/ARM/register-token-source-loc.s
index afb6ba0c61a39e..7560f95999e71b 100644
--- a/llvm/test/MC/ARM/register-token-source-loc.s
+++ b/llvm/test/MC/ARM/register-token-source-loc.s
@@ -3,6 +3,9 @@
 // CHECK:     error: invalid instruction, any one of the following would fix this:
 // CHECK-NEXT:  add sp, r0, #4
 // CHECK-NEXT:  ^
+// CHECK-NEXT: note: operand must be a register in range [r0, r7]
+// CHECK-NEXT:   add sp, r0, #4
+// CHECK-NEXT:       ^
 // CHECK-NEXT: note: operand must be a register sp
 // CHECK-NEXT:  add sp, r0, #4
 // CHECK-NEXT:          ^
diff --git a/llvm/test/MC/ARM/tMOVSr.s b/llvm/test/MC/ARM/tMOVSr.s
index 198c90aa5ceb46..09602fecd68513 100644
--- a/llvm/test/MC/ARM/tMOVSr.s
+++ b/llvm/test/MC/ARM/tMOVSr.s
@@ -1,6 +1,7 @@
 @ REQUIRES: asserts
-@ RUN: llvm-mc --triple=thumbv8 --debug %s 2>&1 | FileCheck %s --match-full-lines
+@ RUN: llvm-mc --triple=thumbv8 %s --show-encoding 2>&1 | FileCheck %s --match-full-lines
 
-@ CHECK: Changed to: <MCInst #{{[0-9]+}} tMOVSr <MCOperand Reg:{{[0-9]+}}> <MCOperand Reg:{{[0-9]+}}>>
+// Note this makes sure the narrow instruciton is selected
+@ CHECK: movs r2, r3 @ encoding: [0x1a,0x00]
 .text
   movs r2, r3
diff --git a/llvm/test/MC/ARM/thumb-diagnostics.s b/llvm/test/MC/ARM/thumb-diagnostics.s
index cacd7f21cb9d36..171d60ac13f9a8 100644
--- a/llvm/test/MC/ARM/thumb-diagnostics.s
+++ b/llvm/test/MC/ARM/thumb-diagnostics.s
@@ -28,9 +28,12 @@
 @ CHECK-ERRORS:         ^
 @ CHECK-ERRORS: note: instruction variant requires Thumb2
 @ CHECK-ERRORS: note: operand must be a register sp
-@ CHECK-ERRORS-V5: error: instruction variant requires ARMv6 or later
+@ CHECK-ERRORS-V5: error: invalid instruction, any one of the following would fix this:
 @ CHECK-ERRORS-V5:         mov r2, r3
 @ CHECK-ERRORS-V5:         ^
+@ CHECK-ERRORS-V5: note: instruction requires: arm-mode
+@ CHECK-ERRORS-V5: note: operand must be an immediate in the range [0,255] or a relocatable expression
+@ CHECK-ERRORS-V5: note: instruction variant requires ARMv6 or later
 
 @ Immediates where registers were expected
         adds #0, r1, r2
@@ -225,10 +228,11 @@
 
 @ Mismatched source/destination operands for MUL instruction.
         muls r1, r2, r3
-@ CHECK-ERRORS: error: destination register must match source register
+@ CHECK-ERRORS: error: invalid instruction, any one of the following would fix this:
 @ CHECK-ERRORS:         muls r1, r2, r3
-@ CHECK-ERRORS:              ^
-
+@ CHECK-ERRORS:         ^
+@ CHECK-ERRORS: note: destination register must match a source register
+@ CHECK-ERRORS: note: too many operands for instruction
 
 @ Out of range immediates for STR instruction.
         str r2, [r7, #-1]
@@ -274,30 +278,33 @@
 @ CHECK-ERRORS: error: invalid instruction, any one of the following would fix this:
 @ CHECK-ERRORS: add sp, #-1
 @ CHECK-ERRORS: ^
+@ CHECK-ERRORS: note: instruction requires: thumb2
+@ CHECK-ERRORS: add sp, #-1
+@ CHECK-ERRORS: ^
 @ CHECK-ERRORS: note: operand must be a register in range [r0, r15]
 @ CHECK-ERRORS: add sp, #-1
 @ CHECK-ERRORS:         ^
 @ CHECK-ERRORS: note: invalid operand for instruction
 @ CHECK-ERRORS: add sp, #-1
 @ CHECK-ERRORS:         ^
-@ CHECK-ERRORS: note: instruction requires: thumb2
-@ CHECK-ERRORS: add sp, #-1
-@ CHECK-ERRORS: ^
 @ CHECK-ERRORS: error: invalid instruction, any one of the following would fix this:
 @ CHECK-ERRORS: add sp, #3
 @ CHECK-ERRORS: ^
+@ CHECK-ERRORS: note: instruction requires: thumb2
+@ CHECK-ERRORS: add sp, #3
+@ CHECK-ERRORS: ^
 @ CHECK-ERRORS: note: operand must be a register in range [r0, r15]
 @ CHECK-ERRORS: add sp, #3
 @ CHECK-ERRORS:         ^
 @ CHECK-ERRORS: note: invalid operand for instruction
 @ CHECK-ERRORS: add sp, #3
 @ CHECK-ERRORS:         ^
-@ CHECK-ERRORS: note: instruction requires: thumb2
-@ CHECK-ERRORS: add sp, #3
-@ CHECK-ERRORS: ^
 @ CHECK-ERRORS: error: invalid instruction, any one of the following would fix this:
 @ CHECK-ERRORS: add sp, sp, #512
 @ CHECK-ERRORS: ^
+@ CHECK-ERRORS: note: instruction requires: thumb2
+@ CHECK-ERRORS: add sp, sp, #512
+@ CHECK-ERRORS: ^
 @ CHECK-ERRORS: note: operand must be a register in range [r0, r15]
 @ CHECK-ERRORS: add sp, sp, #512
 @ CHECK-ERRORS:             ^
@@ -305,9 +312,6 @@
 @ CHECK-ERRORS: add sp, sp, #512
 @ CHECK-ERRORS:             ^
 @ CHECK-ERRORS: note: instruction requires: thumb2
-@ CHECK-ERRORS: add sp, sp, #512
-@ CHECK-ERRORS: ^
-@ CHECK-ERRORS: error: instruction requires: thumb2
 @ CHECK-ERRORS: add r2, sp, #1024
 @ CHECK-ERRORS: ^
         add r2, sp, ip
@@ -407,7 +411,8 @@
         adds
         adds r0
 @ CHECK-ERRORS: error: too few operands for instruction
-@ CHECK-ERRORS: error: too few operands for instruction
+@ CHECK-ERRORS: error: invalid instruction, any one of the following would fix this:
+@ CHECK-ERRORS: note: too few operands for instruction
 
 @------------------------------------------------------------------------------
 @ Out of range width for SBFX/UBFX
diff --git a/llvm/test/MC/ARM/thumb-mov.s b/llvm/test/MC/ARM/thumb-mov.s
index 6f662f3ee2c720..e910722f2edf49 100644
--- a/llvm/test/MC/ARM/thumb-mov.s
+++ b/llvm/test/MC/ARM/thumb-mov.s
@@ -58,10 +58,16 @@
         movs sp, r0
         movs r0, sp
         movs sp, sp
-// CHECK-V7: error: instruction variant requires ARMv8 or later
+// CHECK-V7: error: invalid instruction, any one of the following would fix this:
 // CHECK-V7-NEXT: movs sp, r0
-// CHECK-V7: instruction variant requires ARMv8 or later
+// CHECK-V7: note: instruction variant requires ARMv8 or later
+// CHECK-V7: note: operand must be a register in range [r0, r7]
+// CHECK-V7: error: invalid instruction, any one of the following would fix this:
 // CHECK-V7-NEXT: movs r0, sp
+// CHECK-V7: note: instruction variant requires ARMv8 or later
+// CHECK-V7: note: invalid operand for instruction
+// CHECK-V7: note: operand must be an immediate in the range [0,255] or a relocatable expression
+// CHECK-V7: note: operand must be a register in range [r0, r7]
 // CHECK-V7: error: instruction variant requires ARMv8 or later
 // CHECK-V7-NEXT: movs sp, sp
 // CHECK-V8: movs.w sp, r0            @ encoding: [0x5f,0xea,0x00,0x0d]
@@ -69,8 +75,9 @@
 // CHECK-V8: movs.w sp, sp            @ encoding: [0x5f,0xea,0x0d,0x0d]
 
         mov.w sp, sp
-// CHECK-V7: error: instruction variant requires ARMv8 or later
+// CHECK-V7: error: invalid instruction, any one of the following would fix this:
 // CHECK-V7-NEXT: mov.w sp, sp
+// CHECK-V7: note: instruction variant requires ARMv8 or later
 // CHECK-V8: mov.w sp, sp             @ encoding: [0x4f,0xea,0x0d,0x0d]
 
         movs.w sp, r0
@@ -78,8 +85,9 @@
         movs.w sp, sp
 // CHECK-V7: error: instruction variant requires ARMv8 or later
 // CHECK-V7-NEXT: movs.w sp, r0
-// CHECK-V7: instruction variant requires ARMv8 or later
+// CHECK-V7: error: invalid instruction, any one of the following would fix this:
 // CHECK-V7-NEXT: movs.w r0, sp
+// CHECK-V7: note: instruction variant requires ARMv8 or later
 // CHECK-V7: error: instruction variant requires ARMv8 or later
 // CHECK-V7-NEXT: movs.w sp, sp
 // CHECK-V8: movs.w sp, r0            @ encoding: [0x5f,0xea,0x00,0x0d]
diff --git a/llvm/test/MC/ARM/thumb2-diagnostics.s b/llvm/test/MC/ARM/thumb2-diagnostics.s
index 45efd3c62120c1..afb12ce4309812 100644
--- a/llvm/test/MC/ARM/thumb2-diagnostics.s
+++ b/llvm/test/MC/ARM/thumb2-diagnostics.s
@@ -156,7 +156,9 @@ foo2:
         adds
         adds r0
 @ CHECK-ERRORS: error: too few operands for instruction
-@ CHECK-ERRORS: error: too few operands for instruction
+@ CHECK-ERRORS: error: invalid instruction, any one of the following would fix this:
+@ CHECK-ERRORS: note: too few operands for instruction
+@ CHECK-ERRORS: note: operand must be a register in range [r0, r15]
 
         tst sp, #3
         tst sp, r5
diff --git a/llvm/test/MC/ARM/vfp4.s b/llvm/test/MC/ARM/vfp4.s
index 37d03bb53de8e9..f3bc5750f43128 100644
--- a/llvm/test/MC/ARM/vfp4.s
+++ b/llvm/test/MC/ARM/vfp4.s
@@ -23,7 +23,7 @@ vfma.f32 d16, d18, d17
 
 @ ARM: vfma.f32 q2, q4, q0 @ encoding: [0x50,0x4c,0x08,0xf2]
 @ THUMB: vfma.f32	q2, q4, q0 @ encoding: [0x08,0xef,0x50,0x4c]
-@ THUMB_V7EM-ERRORS: error: invalid instruction
+@ THUMB_V7EM-ERRORS: error: instruction requires: mve.fp
 @ THUMB_V7EM-ERRORS-NEXT: vfma.f32 q2, q4, q0
 vfma.f32 q2, q4, q0
 
@@ -57,7 +57,7 @@ vfms.f32 d16, d18, d17
 
 @ ARM: vfms.f32 q2, q4, q0 @ encoding: [0x50,0x4c,0x28,0xf2]
 @ THUMB: vfms.f32	q2, q4, q0 @ encoding: [0x28,0xef,0x50,0x4c]
-@ THUMB_V7EM-ERRORS: error: invalid instruction
+@ THUMB_V7EM-ERRORS: error: instruction requires: mve.fp
 @ THUMB_V7EM-ERRORS-NEXT: vfms.f32 q2, q4, q0
 vfms.f32 q2, q4, q0
 
diff --git a/llvm/test/MC/BPF/insn-unit.s b/llvm/test/MC/BPF/insn-unit.s
index 224eb7381aa234..84735d196030d6 100644
--- a/llvm/test/MC/BPF/insn-unit.s
+++ b/llvm/test/MC/BPF/insn-unit.s
@@ -65,8 +65,10 @@
 // CHECK: 8d 02 00 00 00 00 00 00 	callx r2
 
 // ======== BPF_JMP Class ========
+  may_goto Llabel0 // BPF_JCOND | BPF_K
   if r1 & r2 goto Llabel0    // BPF_JSET  | BPF_X
   if r1 & 0xffff goto Llabel0    // BPF_JSET  | BPF_K
+// CHECK: e5 00 1e 00 00 00 00 00	may_goto +30
 // CHECK: 4d 21 1d 00 00 00 00 00 	if r1 & r2 goto +29
 // CHECK: 45 01 1c 00 ff ff 00 00 	if r1 & 65535 goto +28
 
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_sop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_sop1.txt
index fbb95450dbbe2f..929f3d2129be92 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_sop1.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_sop1.txt
@@ -2536,6 +2536,9 @@
 # GFX11: s_sendmsg_rtn_b32 s0, sendmsg(MSG_RTN_GET_TBA) ; encoding: [0x85,0x4c,0x80,0xbe]
 0x85,0x4c,0x80,0xbe
 
+# GFX11: s_sendmsg_rtn_b32 s0, sendmsg(MSG_RTN_GET_TBA_TO_PC) ; encoding: [0x86,0x4c,0x80,0xbe]
+0x86,0x4c,0x80,0xbe
+
 # GFX11: s_setpc_b64 s[0:1]                      ; encoding: [0x00,0x48,0x80,0xbe]
 0x00,0x48,0x80,0xbe
 
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_sop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_sop1.txt
index c87cea1205681d..f8c235f77b5f56 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_sop1.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_sop1.txt
@@ -3279,9 +3279,12 @@
 # GFX12: s_sendmsg_rtn_b32 s0, sendmsg(MSG_RTN_GET_TBA) ; encoding: [0x85,0x4c,0x80,0xbe]
 0x85,0x4c,0x80,0xbe
 
-# GFX12: s_sendmsg_rtn_b32 s0, sendmsg(MSG_RTN_GET_SE_AID_ID) ; encoding: [0x86,0x4c,0x80,0xbe]
+# GFX12: s_sendmsg_rtn_b32 s0, sendmsg(MSG_RTN_GET_TBA_TO_PC) ; encoding: [0x86,0x4c,0x80,0xbe]
 0x86,0x4c,0x80,0xbe
 
+# GFX12: s_sendmsg_rtn_b32 s0, sendmsg(MSG_RTN_GET_SE_AID_ID) ; encoding: [0x87,0x4c,0x80,0xbe]
+0x87,0x4c,0x80,0xbe
+
 # GFX12: s_setpc_b64 s[0:1]                      ; encoding: [0x00,0x48,0x80,0xbe]
 0x00,0x48,0x80,0xbe
 
diff --git a/llvm/test/MC/Disassembler/X86/apx/cfcmov.txt b/llvm/test/MC/Disassembler/X86/apx/cfcmov.txt
new file mode 100644
index 00000000000000..4ecaa4b6a14a91
--- /dev/null
+++ b/llvm/test/MC/Disassembler/X86/apx/cfcmov.txt
@@ -0,0 +1,842 @@
+# RUN: llvm-mc --disassemble %s -triple=x86_64 | FileCheck %s --check-prefixes=ATT
+# RUN: llvm-mc --disassemble %s -triple=x86_64 -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck %s --check-prefixes=INTEL
+
+# ATT:   cfcmovbw	%r17w, %r21w, %r25w
+# INTEL: cfcmovb	r25w, r21w, r17w
+0x62,0xec,0x35,0x14,0x42,0xe9
+
+# ATT:   cfcmovbw	%r17w, %r21w
+# INTEL: cfcmovb	r21w, r17w
+0x62,0xec,0x7d,0x0c,0x42,0xcd
+
+# ATT:   cfcmovbw	%r17w, 291(%r28,%r29,4)
+# INTEL: cfcmovb	word ptr [r28 + 4*r29 + 291], r17w
+0x62,0x8c,0x79,0x0c,0x42,0x8c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovbl	%r18d, %r22d, %r26d
+# INTEL: cfcmovb	r26d, r22d, r18d
+0x62,0xec,0x2c,0x14,0x42,0xf2
+
+# ATT:   cfcmovbl	%r18d, %r22d
+# INTEL: cfcmovb	r22d, r18d
+0x62,0xec,0x7c,0x0c,0x42,0xd6
+
+# ATT:   cfcmovbl	%r18d, 291(%r28,%r29,4)
+# INTEL: cfcmovb	dword ptr [r28 + 4*r29 + 291], r18d
+0x62,0x8c,0x78,0x0c,0x42,0x94,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovbq	%r19, %r23, %r27
+# INTEL: cfcmovb	r27, r23, r19
+0x62,0xec,0xa4,0x14,0x42,0xfb
+
+# ATT:   cfcmovbq	%r19, %r23
+# INTEL: cfcmovb	r23, r19
+0x62,0xec,0xfc,0x0c,0x42,0xdf
+
+# ATT:   cfcmovbq	%r19, 291(%r28,%r29,4)
+# INTEL: cfcmovb	qword ptr [r28 + 4*r29 + 291], r19
+0x62,0x8c,0xf8,0x0c,0x42,0x9c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovbw	291(%r28,%r29,4), %r17w, %r21w
+# INTEL: cfcmovb	r21w, r17w, word ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0x51,0x14,0x42,0x8c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovbw	291(%r28,%r29,4), %r17w
+# INTEL: cfcmovb	r17w, word ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0x79,0x08,0x42,0x8c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovbl	291(%r28,%r29,4), %r18d, %r22d
+# INTEL: cfcmovb	r22d, r18d, dword ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0x48,0x14,0x42,0x94,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovbl	291(%r28,%r29,4), %r18d
+# INTEL: cfcmovb	r18d, dword ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0x78,0x08,0x42,0x94,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovbq	291(%r28,%r29,4), %r19, %r23
+# INTEL: cfcmovb	r23, r19, qword ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0xc0,0x14,0x42,0x9c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovbq	291(%r28,%r29,4), %r19
+# INTEL: cfcmovb	r19, qword ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0xf8,0x08,0x42,0x9c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovbew	%r17w, %r21w, %r25w
+# INTEL: cfcmovbe	r25w, r21w, r17w
+0x62,0xec,0x35,0x14,0x46,0xe9
+
+# ATT:   cfcmovbew	%r17w, %r21w
+# INTEL: cfcmovbe	r21w, r17w
+0x62,0xec,0x7d,0x0c,0x46,0xcd
+
+# ATT:   cfcmovbew	%r17w, 291(%r28,%r29,4)
+# INTEL: cfcmovbe	word ptr [r28 + 4*r29 + 291], r17w
+0x62,0x8c,0x79,0x0c,0x46,0x8c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovbel	%r18d, %r22d, %r26d
+# INTEL: cfcmovbe	r26d, r22d, r18d
+0x62,0xec,0x2c,0x14,0x46,0xf2
+
+# ATT:   cfcmovbel	%r18d, %r22d
+# INTEL: cfcmovbe	r22d, r18d
+0x62,0xec,0x7c,0x0c,0x46,0xd6
+
+# ATT:   cfcmovbel	%r18d, 291(%r28,%r29,4)
+# INTEL: cfcmovbe	dword ptr [r28 + 4*r29 + 291], r18d
+0x62,0x8c,0x78,0x0c,0x46,0x94,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovbeq	%r19, %r23, %r27
+# INTEL: cfcmovbe	r27, r23, r19
+0x62,0xec,0xa4,0x14,0x46,0xfb
+
+# ATT:   cfcmovbeq	%r19, %r23
+# INTEL: cfcmovbe	r23, r19
+0x62,0xec,0xfc,0x0c,0x46,0xdf
+
+# ATT:   cfcmovbeq	%r19, 291(%r28,%r29,4)
+# INTEL: cfcmovbe	qword ptr [r28 + 4*r29 + 291], r19
+0x62,0x8c,0xf8,0x0c,0x46,0x9c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovbew	291(%r28,%r29,4), %r17w, %r21w
+# INTEL: cfcmovbe	r21w, r17w, word ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0x51,0x14,0x46,0x8c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovbew	291(%r28,%r29,4), %r17w
+# INTEL: cfcmovbe	r17w, word ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0x79,0x08,0x46,0x8c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovbel	291(%r28,%r29,4), %r18d, %r22d
+# INTEL: cfcmovbe	r22d, r18d, dword ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0x48,0x14,0x46,0x94,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovbel	291(%r28,%r29,4), %r18d
+# INTEL: cfcmovbe	r18d, dword ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0x78,0x08,0x46,0x94,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovbeq	291(%r28,%r29,4), %r19, %r23
+# INTEL: cfcmovbe	r23, r19, qword ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0xc0,0x14,0x46,0x9c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovbeq	291(%r28,%r29,4), %r19
+# INTEL: cfcmovbe	r19, qword ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0xf8,0x08,0x46,0x9c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovlw	%r17w, %r21w, %r25w
+# INTEL: cfcmovl	r25w, r21w, r17w
+0x62,0xec,0x35,0x14,0x4c,0xe9
+
+# ATT:   cfcmovlw	%r17w, %r21w
+# INTEL: cfcmovl	r21w, r17w
+0x62,0xec,0x7d,0x0c,0x4c,0xcd
+
+# ATT:   cfcmovlw	%r17w, 291(%r28,%r29,4)
+# INTEL: cfcmovl	word ptr [r28 + 4*r29 + 291], r17w
+0x62,0x8c,0x79,0x0c,0x4c,0x8c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovll	%r18d, %r22d, %r26d
+# INTEL: cfcmovl	r26d, r22d, r18d
+0x62,0xec,0x2c,0x14,0x4c,0xf2
+
+# ATT:   cfcmovll	%r18d, %r22d
+# INTEL: cfcmovl	r22d, r18d
+0x62,0xec,0x7c,0x0c,0x4c,0xd6
+
+# ATT:   cfcmovll	%r18d, 291(%r28,%r29,4)
+# INTEL: cfcmovl	dword ptr [r28 + 4*r29 + 291], r18d
+0x62,0x8c,0x78,0x0c,0x4c,0x94,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovlq	%r19, %r23, %r27
+# INTEL: cfcmovl	r27, r23, r19
+0x62,0xec,0xa4,0x14,0x4c,0xfb
+
+# ATT:   cfcmovlq	%r19, %r23
+# INTEL: cfcmovl	r23, r19
+0x62,0xec,0xfc,0x0c,0x4c,0xdf
+
+# ATT:   cfcmovlq	%r19, 291(%r28,%r29,4)
+# INTEL: cfcmovl	qword ptr [r28 + 4*r29 + 291], r19
+0x62,0x8c,0xf8,0x0c,0x4c,0x9c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovlw	291(%r28,%r29,4), %r17w, %r21w
+# INTEL: cfcmovl	r21w, r17w, word ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0x51,0x14,0x4c,0x8c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovlw	291(%r28,%r29,4), %r17w
+# INTEL: cfcmovl	r17w, word ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0x79,0x08,0x4c,0x8c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovll	291(%r28,%r29,4), %r18d, %r22d
+# INTEL: cfcmovl	r22d, r18d, dword ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0x48,0x14,0x4c,0x94,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovll	291(%r28,%r29,4), %r18d
+# INTEL: cfcmovl	r18d, dword ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0x78,0x08,0x4c,0x94,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovlq	291(%r28,%r29,4), %r19, %r23
+# INTEL: cfcmovl	r23, r19, qword ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0xc0,0x14,0x4c,0x9c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovlq	291(%r28,%r29,4), %r19
+# INTEL: cfcmovl	r19, qword ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0xf8,0x08,0x4c,0x9c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovlew	%r17w, %r21w, %r25w
+# INTEL: cfcmovle	r25w, r21w, r17w
+0x62,0xec,0x35,0x14,0x4e,0xe9
+
+# ATT:   cfcmovlew	%r17w, %r21w
+# INTEL: cfcmovle	r21w, r17w
+0x62,0xec,0x7d,0x0c,0x4e,0xcd
+
+# ATT:   cfcmovlew	%r17w, 291(%r28,%r29,4)
+# INTEL: cfcmovle	word ptr [r28 + 4*r29 + 291], r17w
+0x62,0x8c,0x79,0x0c,0x4e,0x8c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovlel	%r18d, %r22d, %r26d
+# INTEL: cfcmovle	r26d, r22d, r18d
+0x62,0xec,0x2c,0x14,0x4e,0xf2
+
+# ATT:   cfcmovlel	%r18d, %r22d
+# INTEL: cfcmovle	r22d, r18d
+0x62,0xec,0x7c,0x0c,0x4e,0xd6
+
+# ATT:   cfcmovlel	%r18d, 291(%r28,%r29,4)
+# INTEL: cfcmovle	dword ptr [r28 + 4*r29 + 291], r18d
+0x62,0x8c,0x78,0x0c,0x4e,0x94,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovleq	%r19, %r23, %r27
+# INTEL: cfcmovle	r27, r23, r19
+0x62,0xec,0xa4,0x14,0x4e,0xfb
+
+# ATT:   cfcmovleq	%r19, %r23
+# INTEL: cfcmovle	r23, r19
+0x62,0xec,0xfc,0x0c,0x4e,0xdf
+
+# ATT:   cfcmovleq	%r19, 291(%r28,%r29,4)
+# INTEL: cfcmovle	qword ptr [r28 + 4*r29 + 291], r19
+0x62,0x8c,0xf8,0x0c,0x4e,0x9c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovlew	291(%r28,%r29,4), %r17w, %r21w
+# INTEL: cfcmovle	r21w, r17w, word ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0x51,0x14,0x4e,0x8c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovlew	291(%r28,%r29,4), %r17w
+# INTEL: cfcmovle	r17w, word ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0x79,0x08,0x4e,0x8c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovlel	291(%r28,%r29,4), %r18d, %r22d
+# INTEL: cfcmovle	r22d, r18d, dword ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0x48,0x14,0x4e,0x94,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovlel	291(%r28,%r29,4), %r18d
+# INTEL: cfcmovle	r18d, dword ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0x78,0x08,0x4e,0x94,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovleq	291(%r28,%r29,4), %r19, %r23
+# INTEL: cfcmovle	r23, r19, qword ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0xc0,0x14,0x4e,0x9c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovleq	291(%r28,%r29,4), %r19
+# INTEL: cfcmovle	r19, qword ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0xf8,0x08,0x4e,0x9c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovaew	%r17w, %r21w, %r25w
+# INTEL: cfcmovae	r25w, r21w, r17w
+0x62,0xec,0x35,0x14,0x43,0xe9
+
+# ATT:   cfcmovaew	%r17w, %r21w
+# INTEL: cfcmovae	r21w, r17w
+0x62,0xec,0x7d,0x0c,0x43,0xcd
+
+# ATT:   cfcmovaew	%r17w, 291(%r28,%r29,4)
+# INTEL: cfcmovae	word ptr [r28 + 4*r29 + 291], r17w
+0x62,0x8c,0x79,0x0c,0x43,0x8c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovael	%r18d, %r22d, %r26d
+# INTEL: cfcmovae	r26d, r22d, r18d
+0x62,0xec,0x2c,0x14,0x43,0xf2
+
+# ATT:   cfcmovael	%r18d, %r22d
+# INTEL: cfcmovae	r22d, r18d
+0x62,0xec,0x7c,0x0c,0x43,0xd6
+
+# ATT:   cfcmovael	%r18d, 291(%r28,%r29,4)
+# INTEL: cfcmovae	dword ptr [r28 + 4*r29 + 291], r18d
+0x62,0x8c,0x78,0x0c,0x43,0x94,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovaeq	%r19, %r23, %r27
+# INTEL: cfcmovae	r27, r23, r19
+0x62,0xec,0xa4,0x14,0x43,0xfb
+
+# ATT:   cfcmovaeq	%r19, %r23
+# INTEL: cfcmovae	r23, r19
+0x62,0xec,0xfc,0x0c,0x43,0xdf
+
+# ATT:   cfcmovaeq	%r19, 291(%r28,%r29,4)
+# INTEL: cfcmovae	qword ptr [r28 + 4*r29 + 291], r19
+0x62,0x8c,0xf8,0x0c,0x43,0x9c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovaew	291(%r28,%r29,4), %r17w, %r21w
+# INTEL: cfcmovae	r21w, r17w, word ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0x51,0x14,0x43,0x8c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovaew	291(%r28,%r29,4), %r17w
+# INTEL: cfcmovae	r17w, word ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0x79,0x08,0x43,0x8c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovael	291(%r28,%r29,4), %r18d, %r22d
+# INTEL: cfcmovae	r22d, r18d, dword ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0x48,0x14,0x43,0x94,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovael	291(%r28,%r29,4), %r18d
+# INTEL: cfcmovae	r18d, dword ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0x78,0x08,0x43,0x94,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovaeq	291(%r28,%r29,4), %r19, %r23
+# INTEL: cfcmovae	r23, r19, qword ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0xc0,0x14,0x43,0x9c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovaeq	291(%r28,%r29,4), %r19
+# INTEL: cfcmovae	r19, qword ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0xf8,0x08,0x43,0x9c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovaw	%r17w, %r21w, %r25w
+# INTEL: cfcmova	r25w, r21w, r17w
+0x62,0xec,0x35,0x14,0x47,0xe9
+
+# ATT:   cfcmovaw	%r17w, %r21w
+# INTEL: cfcmova	r21w, r17w
+0x62,0xec,0x7d,0x0c,0x47,0xcd
+
+# ATT:   cfcmovaw	%r17w, 291(%r28,%r29,4)
+# INTEL: cfcmova	word ptr [r28 + 4*r29 + 291], r17w
+0x62,0x8c,0x79,0x0c,0x47,0x8c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmoval	%r18d, %r22d, %r26d
+# INTEL: cfcmova	r26d, r22d, r18d
+0x62,0xec,0x2c,0x14,0x47,0xf2
+
+# ATT:   cfcmoval	%r18d, %r22d
+# INTEL: cfcmova	r22d, r18d
+0x62,0xec,0x7c,0x0c,0x47,0xd6
+
+# ATT:   cfcmoval	%r18d, 291(%r28,%r29,4)
+# INTEL: cfcmova	dword ptr [r28 + 4*r29 + 291], r18d
+0x62,0x8c,0x78,0x0c,0x47,0x94,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovaq	%r19, %r23, %r27
+# INTEL: cfcmova	r27, r23, r19
+0x62,0xec,0xa4,0x14,0x47,0xfb
+
+# ATT:   cfcmovaq	%r19, %r23
+# INTEL: cfcmova	r23, r19
+0x62,0xec,0xfc,0x0c,0x47,0xdf
+
+# ATT:   cfcmovaq	%r19, 291(%r28,%r29,4)
+# INTEL: cfcmova	qword ptr [r28 + 4*r29 + 291], r19
+0x62,0x8c,0xf8,0x0c,0x47,0x9c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovaw	291(%r28,%r29,4), %r17w, %r21w
+# INTEL: cfcmova	r21w, r17w, word ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0x51,0x14,0x47,0x8c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovaw	291(%r28,%r29,4), %r17w
+# INTEL: cfcmova	r17w, word ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0x79,0x08,0x47,0x8c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmoval	291(%r28,%r29,4), %r18d, %r22d
+# INTEL: cfcmova	r22d, r18d, dword ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0x48,0x14,0x47,0x94,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmoval	291(%r28,%r29,4), %r18d
+# INTEL: cfcmova	r18d, dword ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0x78,0x08,0x47,0x94,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovaq	291(%r28,%r29,4), %r19, %r23
+# INTEL: cfcmova	r23, r19, qword ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0xc0,0x14,0x47,0x9c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovaq	291(%r28,%r29,4), %r19
+# INTEL: cfcmova	r19, qword ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0xf8,0x08,0x47,0x9c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovgew	%r17w, %r21w, %r25w
+# INTEL: cfcmovge	r25w, r21w, r17w
+0x62,0xec,0x35,0x14,0x4d,0xe9
+
+# ATT:   cfcmovgew	%r17w, %r21w
+# INTEL: cfcmovge	r21w, r17w
+0x62,0xec,0x7d,0x0c,0x4d,0xcd
+
+# ATT:   cfcmovgew	%r17w, 291(%r28,%r29,4)
+# INTEL: cfcmovge	word ptr [r28 + 4*r29 + 291], r17w
+0x62,0x8c,0x79,0x0c,0x4d,0x8c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovgel	%r18d, %r22d, %r26d
+# INTEL: cfcmovge	r26d, r22d, r18d
+0x62,0xec,0x2c,0x14,0x4d,0xf2
+
+# ATT:   cfcmovgel	%r18d, %r22d
+# INTEL: cfcmovge	r22d, r18d
+0x62,0xec,0x7c,0x0c,0x4d,0xd6
+
+# ATT:   cfcmovgel	%r18d, 291(%r28,%r29,4)
+# INTEL: cfcmovge	dword ptr [r28 + 4*r29 + 291], r18d
+0x62,0x8c,0x78,0x0c,0x4d,0x94,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovgeq	%r19, %r23, %r27
+# INTEL: cfcmovge	r27, r23, r19
+0x62,0xec,0xa4,0x14,0x4d,0xfb
+
+# ATT:   cfcmovgeq	%r19, %r23
+# INTEL: cfcmovge	r23, r19
+0x62,0xec,0xfc,0x0c,0x4d,0xdf
+
+# ATT:   cfcmovgeq	%r19, 291(%r28,%r29,4)
+# INTEL: cfcmovge	qword ptr [r28 + 4*r29 + 291], r19
+0x62,0x8c,0xf8,0x0c,0x4d,0x9c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovgew	291(%r28,%r29,4), %r17w, %r21w
+# INTEL: cfcmovge	r21w, r17w, word ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0x51,0x14,0x4d,0x8c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovgew	291(%r28,%r29,4), %r17w
+# INTEL: cfcmovge	r17w, word ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0x79,0x08,0x4d,0x8c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovgel	291(%r28,%r29,4), %r18d, %r22d
+# INTEL: cfcmovge	r22d, r18d, dword ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0x48,0x14,0x4d,0x94,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovgel	291(%r28,%r29,4), %r18d
+# INTEL: cfcmovge	r18d, dword ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0x78,0x08,0x4d,0x94,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovgeq	291(%r28,%r29,4), %r19, %r23
+# INTEL: cfcmovge	r23, r19, qword ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0xc0,0x14,0x4d,0x9c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovgeq	291(%r28,%r29,4), %r19
+# INTEL: cfcmovge	r19, qword ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0xf8,0x08,0x4d,0x9c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovnow	%r17w, %r21w, %r25w
+# INTEL: cfcmovno	r25w, r21w, r17w
+0x62,0xec,0x35,0x14,0x41,0xe9
+
+# ATT:   cfcmovnow	%r17w, %r21w
+# INTEL: cfcmovno	r21w, r17w
+0x62,0xec,0x7d,0x0c,0x41,0xcd
+
+# ATT:   cfcmovnow	%r17w, 291(%r28,%r29,4)
+# INTEL: cfcmovno	word ptr [r28 + 4*r29 + 291], r17w
+0x62,0x8c,0x79,0x0c,0x41,0x8c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovnol	%r18d, %r22d, %r26d
+# INTEL: cfcmovno	r26d, r22d, r18d
+0x62,0xec,0x2c,0x14,0x41,0xf2
+
+# ATT:   cfcmovnol	%r18d, %r22d
+# INTEL: cfcmovno	r22d, r18d
+0x62,0xec,0x7c,0x0c,0x41,0xd6
+
+# ATT:   cfcmovnol	%r18d, 291(%r28,%r29,4)
+# INTEL: cfcmovno	dword ptr [r28 + 4*r29 + 291], r18d
+0x62,0x8c,0x78,0x0c,0x41,0x94,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovnoq	%r19, %r23, %r27
+# INTEL: cfcmovno	r27, r23, r19
+0x62,0xec,0xa4,0x14,0x41,0xfb
+
+# ATT:   cfcmovnoq	%r19, %r23
+# INTEL: cfcmovno	r23, r19
+0x62,0xec,0xfc,0x0c,0x41,0xdf
+
+# ATT:   cfcmovnoq	%r19, 291(%r28,%r29,4)
+# INTEL: cfcmovno	qword ptr [r28 + 4*r29 + 291], r19
+0x62,0x8c,0xf8,0x0c,0x41,0x9c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovnow	291(%r28,%r29,4), %r17w, %r21w
+# INTEL: cfcmovno	r21w, r17w, word ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0x51,0x14,0x41,0x8c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovnow	291(%r28,%r29,4), %r17w
+# INTEL: cfcmovno	r17w, word ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0x79,0x08,0x41,0x8c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovnol	291(%r28,%r29,4), %r18d, %r22d
+# INTEL: cfcmovno	r22d, r18d, dword ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0x48,0x14,0x41,0x94,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovnol	291(%r28,%r29,4), %r18d
+# INTEL: cfcmovno	r18d, dword ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0x78,0x08,0x41,0x94,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovnoq	291(%r28,%r29,4), %r19, %r23
+# INTEL: cfcmovno	r23, r19, qword ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0xc0,0x14,0x41,0x9c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovnoq	291(%r28,%r29,4), %r19
+# INTEL: cfcmovno	r19, qword ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0xf8,0x08,0x41,0x9c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovnpw	%r17w, %r21w, %r25w
+# INTEL: cfcmovnp	r25w, r21w, r17w
+0x62,0xec,0x35,0x14,0x4b,0xe9
+
+# ATT:   cfcmovnpw	%r17w, %r21w
+# INTEL: cfcmovnp	r21w, r17w
+0x62,0xec,0x7d,0x0c,0x4b,0xcd
+
+# ATT:   cfcmovnpw	%r17w, 291(%r28,%r29,4)
+# INTEL: cfcmovnp	word ptr [r28 + 4*r29 + 291], r17w
+0x62,0x8c,0x79,0x0c,0x4b,0x8c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovnpl	%r18d, %r22d, %r26d
+# INTEL: cfcmovnp	r26d, r22d, r18d
+0x62,0xec,0x2c,0x14,0x4b,0xf2
+
+# ATT:   cfcmovnpl	%r18d, %r22d
+# INTEL: cfcmovnp	r22d, r18d
+0x62,0xec,0x7c,0x0c,0x4b,0xd6
+
+# ATT:   cfcmovnpl	%r18d, 291(%r28,%r29,4)
+# INTEL: cfcmovnp	dword ptr [r28 + 4*r29 + 291], r18d
+0x62,0x8c,0x78,0x0c,0x4b,0x94,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovnpq	%r19, %r23, %r27
+# INTEL: cfcmovnp	r27, r23, r19
+0x62,0xec,0xa4,0x14,0x4b,0xfb
+
+# ATT:   cfcmovnpq	%r19, %r23
+# INTEL: cfcmovnp	r23, r19
+0x62,0xec,0xfc,0x0c,0x4b,0xdf
+
+# ATT:   cfcmovnpq	%r19, 291(%r28,%r29,4)
+# INTEL: cfcmovnp	qword ptr [r28 + 4*r29 + 291], r19
+0x62,0x8c,0xf8,0x0c,0x4b,0x9c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovnpw	291(%r28,%r29,4), %r17w, %r21w
+# INTEL: cfcmovnp	r21w, r17w, word ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0x51,0x14,0x4b,0x8c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovnpw	291(%r28,%r29,4), %r17w
+# INTEL: cfcmovnp	r17w, word ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0x79,0x08,0x4b,0x8c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovnpl	291(%r28,%r29,4), %r18d, %r22d
+# INTEL: cfcmovnp	r22d, r18d, dword ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0x48,0x14,0x4b,0x94,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovnpl	291(%r28,%r29,4), %r18d
+# INTEL: cfcmovnp	r18d, dword ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0x78,0x08,0x4b,0x94,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovnpq	291(%r28,%r29,4), %r19, %r23
+# INTEL: cfcmovnp	r23, r19, qword ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0xc0,0x14,0x4b,0x9c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovnpq	291(%r28,%r29,4), %r19
+# INTEL: cfcmovnp	r19, qword ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0xf8,0x08,0x4b,0x9c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovnsw	%r17w, %r21w, %r25w
+# INTEL: cfcmovns	r25w, r21w, r17w
+0x62,0xec,0x35,0x14,0x49,0xe9
+
+# ATT:   cfcmovnsw	%r17w, %r21w
+# INTEL: cfcmovns	r21w, r17w
+0x62,0xec,0x7d,0x0c,0x49,0xcd
+
+# ATT:   cfcmovnsw	%r17w, 291(%r28,%r29,4)
+# INTEL: cfcmovns	word ptr [r28 + 4*r29 + 291], r17w
+0x62,0x8c,0x79,0x0c,0x49,0x8c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovnsl	%r18d, %r22d, %r26d
+# INTEL: cfcmovns	r26d, r22d, r18d
+0x62,0xec,0x2c,0x14,0x49,0xf2
+
+# ATT:   cfcmovnsl	%r18d, %r22d
+# INTEL: cfcmovns	r22d, r18d
+0x62,0xec,0x7c,0x0c,0x49,0xd6
+
+# ATT:   cfcmovnsl	%r18d, 291(%r28,%r29,4)
+# INTEL: cfcmovns	dword ptr [r28 + 4*r29 + 291], r18d
+0x62,0x8c,0x78,0x0c,0x49,0x94,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovnsq	%r19, %r23, %r27
+# INTEL: cfcmovns	r27, r23, r19
+0x62,0xec,0xa4,0x14,0x49,0xfb
+
+# ATT:   cfcmovnsq	%r19, %r23
+# INTEL: cfcmovns	r23, r19
+0x62,0xec,0xfc,0x0c,0x49,0xdf
+
+# ATT:   cfcmovnsq	%r19, 291(%r28,%r29,4)
+# INTEL: cfcmovns	qword ptr [r28 + 4*r29 + 291], r19
+0x62,0x8c,0xf8,0x0c,0x49,0x9c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovnsw	291(%r28,%r29,4), %r17w, %r21w
+# INTEL: cfcmovns	r21w, r17w, word ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0x51,0x14,0x49,0x8c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovnsw	291(%r28,%r29,4), %r17w
+# INTEL: cfcmovns	r17w, word ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0x79,0x08,0x49,0x8c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovnsl	291(%r28,%r29,4), %r18d, %r22d
+# INTEL: cfcmovns	r22d, r18d, dword ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0x48,0x14,0x49,0x94,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovnsl	291(%r28,%r29,4), %r18d
+# INTEL: cfcmovns	r18d, dword ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0x78,0x08,0x49,0x94,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovnsq	291(%r28,%r29,4), %r19, %r23
+# INTEL: cfcmovns	r23, r19, qword ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0xc0,0x14,0x49,0x9c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovnsq	291(%r28,%r29,4), %r19
+# INTEL: cfcmovns	r19, qword ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0xf8,0x08,0x49,0x9c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovnew	%r17w, %r21w, %r25w
+# INTEL: cfcmovne	r25w, r21w, r17w
+0x62,0xec,0x35,0x14,0x45,0xe9
+
+# ATT:   cfcmovnew	%r17w, %r21w
+# INTEL: cfcmovne	r21w, r17w
+0x62,0xec,0x7d,0x0c,0x45,0xcd
+
+# ATT:   cfcmovnew	%r17w, 291(%r28,%r29,4)
+# INTEL: cfcmovne	word ptr [r28 + 4*r29 + 291], r17w
+0x62,0x8c,0x79,0x0c,0x45,0x8c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovnel	%r18d, %r22d, %r26d
+# INTEL: cfcmovne	r26d, r22d, r18d
+0x62,0xec,0x2c,0x14,0x45,0xf2
+
+# ATT:   cfcmovnel	%r18d, %r22d
+# INTEL: cfcmovne	r22d, r18d
+0x62,0xec,0x7c,0x0c,0x45,0xd6
+
+# ATT:   cfcmovnel	%r18d, 291(%r28,%r29,4)
+# INTEL: cfcmovne	dword ptr [r28 + 4*r29 + 291], r18d
+0x62,0x8c,0x78,0x0c,0x45,0x94,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovneq	%r19, %r23, %r27
+# INTEL: cfcmovne	r27, r23, r19
+0x62,0xec,0xa4,0x14,0x45,0xfb
+
+# ATT:   cfcmovneq	%r19, %r23
+# INTEL: cfcmovne	r23, r19
+0x62,0xec,0xfc,0x0c,0x45,0xdf
+
+# ATT:   cfcmovneq	%r19, 291(%r28,%r29,4)
+# INTEL: cfcmovne	qword ptr [r28 + 4*r29 + 291], r19
+0x62,0x8c,0xf8,0x0c,0x45,0x9c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovnew	291(%r28,%r29,4), %r17w, %r21w
+# INTEL: cfcmovne	r21w, r17w, word ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0x51,0x14,0x45,0x8c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovnew	291(%r28,%r29,4), %r17w
+# INTEL: cfcmovne	r17w, word ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0x79,0x08,0x45,0x8c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovnel	291(%r28,%r29,4), %r18d, %r22d
+# INTEL: cfcmovne	r22d, r18d, dword ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0x48,0x14,0x45,0x94,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovnel	291(%r28,%r29,4), %r18d
+# INTEL: cfcmovne	r18d, dword ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0x78,0x08,0x45,0x94,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovneq	291(%r28,%r29,4), %r19, %r23
+# INTEL: cfcmovne	r23, r19, qword ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0xc0,0x14,0x45,0x9c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovneq	291(%r28,%r29,4), %r19
+# INTEL: cfcmovne	r19, qword ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0xf8,0x08,0x45,0x9c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovpw	%r17w, %r21w, %r25w
+# INTEL: cfcmovp	r25w, r21w, r17w
+0x62,0xec,0x35,0x14,0x4a,0xe9
+
+# ATT:   cfcmovpw	%r17w, %r21w
+# INTEL: cfcmovp	r21w, r17w
+0x62,0xec,0x7d,0x0c,0x4a,0xcd
+
+# ATT:   cfcmovpw	%r17w, 291(%r28,%r29,4)
+# INTEL: cfcmovp	word ptr [r28 + 4*r29 + 291], r17w
+0x62,0x8c,0x79,0x0c,0x4a,0x8c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovpl	%r18d, %r22d, %r26d
+# INTEL: cfcmovp	r26d, r22d, r18d
+0x62,0xec,0x2c,0x14,0x4a,0xf2
+
+# ATT:   cfcmovpl	%r18d, %r22d
+# INTEL: cfcmovp	r22d, r18d
+0x62,0xec,0x7c,0x0c,0x4a,0xd6
+
+# ATT:   cfcmovpl	%r18d, 291(%r28,%r29,4)
+# INTEL: cfcmovp	dword ptr [r28 + 4*r29 + 291], r18d
+0x62,0x8c,0x78,0x0c,0x4a,0x94,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovpq	%r19, %r23, %r27
+# INTEL: cfcmovp	r27, r23, r19
+0x62,0xec,0xa4,0x14,0x4a,0xfb
+
+# ATT:   cfcmovpq	%r19, %r23
+# INTEL: cfcmovp	r23, r19
+0x62,0xec,0xfc,0x0c,0x4a,0xdf
+
+# ATT:   cfcmovpq	%r19, 291(%r28,%r29,4)
+# INTEL: cfcmovp	qword ptr [r28 + 4*r29 + 291], r19
+0x62,0x8c,0xf8,0x0c,0x4a,0x9c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovpw	291(%r28,%r29,4), %r17w, %r21w
+# INTEL: cfcmovp	r21w, r17w, word ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0x51,0x14,0x4a,0x8c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovpw	291(%r28,%r29,4), %r17w
+# INTEL: cfcmovp	r17w, word ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0x79,0x08,0x4a,0x8c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovpl	291(%r28,%r29,4), %r18d, %r22d
+# INTEL: cfcmovp	r22d, r18d, dword ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0x48,0x14,0x4a,0x94,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovpl	291(%r28,%r29,4), %r18d
+# INTEL: cfcmovp	r18d, dword ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0x78,0x08,0x4a,0x94,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovpq	291(%r28,%r29,4), %r19, %r23
+# INTEL: cfcmovp	r23, r19, qword ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0xc0,0x14,0x4a,0x9c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovpq	291(%r28,%r29,4), %r19
+# INTEL: cfcmovp	r19, qword ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0xf8,0x08,0x4a,0x9c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovsw	%r17w, %r21w, %r25w
+# INTEL: cfcmovs	r25w, r21w, r17w
+0x62,0xec,0x35,0x14,0x48,0xe9
+
+# ATT:   cfcmovsw	%r17w, %r21w
+# INTEL: cfcmovs	r21w, r17w
+0x62,0xec,0x7d,0x0c,0x48,0xcd
+
+# ATT:   cfcmovsw	%r17w, 291(%r28,%r29,4)
+# INTEL: cfcmovs	word ptr [r28 + 4*r29 + 291], r17w
+0x62,0x8c,0x79,0x0c,0x48,0x8c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovsl	%r18d, %r22d, %r26d
+# INTEL: cfcmovs	r26d, r22d, r18d
+0x62,0xec,0x2c,0x14,0x48,0xf2
+
+# ATT:   cfcmovsl	%r18d, %r22d
+# INTEL: cfcmovs	r22d, r18d
+0x62,0xec,0x7c,0x0c,0x48,0xd6
+
+# ATT:   cfcmovsl	%r18d, 291(%r28,%r29,4)
+# INTEL: cfcmovs	dword ptr [r28 + 4*r29 + 291], r18d
+0x62,0x8c,0x78,0x0c,0x48,0x94,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovsq	%r19, %r23, %r27
+# INTEL: cfcmovs	r27, r23, r19
+0x62,0xec,0xa4,0x14,0x48,0xfb
+
+# ATT:   cfcmovsq	%r19, %r23
+# INTEL: cfcmovs	r23, r19
+0x62,0xec,0xfc,0x0c,0x48,0xdf
+
+# ATT:   cfcmovsq	%r19, 291(%r28,%r29,4)
+# INTEL: cfcmovs	qword ptr [r28 + 4*r29 + 291], r19
+0x62,0x8c,0xf8,0x0c,0x48,0x9c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovsw	291(%r28,%r29,4), %r17w, %r21w
+# INTEL: cfcmovs	r21w, r17w, word ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0x51,0x14,0x48,0x8c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovsw	291(%r28,%r29,4), %r17w
+# INTEL: cfcmovs	r17w, word ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0x79,0x08,0x48,0x8c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovsl	291(%r28,%r29,4), %r18d, %r22d
+# INTEL: cfcmovs	r22d, r18d, dword ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0x48,0x14,0x48,0x94,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovsl	291(%r28,%r29,4), %r18d
+# INTEL: cfcmovs	r18d, dword ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0x78,0x08,0x48,0x94,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovsq	291(%r28,%r29,4), %r19, %r23
+# INTEL: cfcmovs	r23, r19, qword ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0xc0,0x14,0x48,0x9c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovsq	291(%r28,%r29,4), %r19
+# INTEL: cfcmovs	r19, qword ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0xf8,0x08,0x48,0x9c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovew	%r17w, %r21w, %r25w
+# INTEL: cfcmove	r25w, r21w, r17w
+0x62,0xec,0x35,0x14,0x44,0xe9
+
+# ATT:   cfcmovew	%r17w, %r21w
+# INTEL: cfcmove	r21w, r17w
+0x62,0xec,0x7d,0x0c,0x44,0xcd
+
+# ATT:   cfcmovew	%r17w, 291(%r28,%r29,4)
+# INTEL: cfcmove	word ptr [r28 + 4*r29 + 291], r17w
+0x62,0x8c,0x79,0x0c,0x44,0x8c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovel	%r18d, %r22d, %r26d
+# INTEL: cfcmove	r26d, r22d, r18d
+0x62,0xec,0x2c,0x14,0x44,0xf2
+
+# ATT:   cfcmovel	%r18d, %r22d
+# INTEL: cfcmove	r22d, r18d
+0x62,0xec,0x7c,0x0c,0x44,0xd6
+
+# ATT:   cfcmovel	%r18d, 291(%r28,%r29,4)
+# INTEL: cfcmove	dword ptr [r28 + 4*r29 + 291], r18d
+0x62,0x8c,0x78,0x0c,0x44,0x94,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmoveq	%r19, %r23, %r27
+# INTEL: cfcmove	r27, r23, r19
+0x62,0xec,0xa4,0x14,0x44,0xfb
+
+# ATT:   cfcmoveq	%r19, %r23
+# INTEL: cfcmove	r23, r19
+0x62,0xec,0xfc,0x0c,0x44,0xdf
+
+# ATT:   cfcmoveq	%r19, 291(%r28,%r29,4)
+# INTEL: cfcmove	qword ptr [r28 + 4*r29 + 291], r19
+0x62,0x8c,0xf8,0x0c,0x44,0x9c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovew	291(%r28,%r29,4), %r17w, %r21w
+# INTEL: cfcmove	r21w, r17w, word ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0x51,0x14,0x44,0x8c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovew	291(%r28,%r29,4), %r17w
+# INTEL: cfcmove	r17w, word ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0x79,0x08,0x44,0x8c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovel	291(%r28,%r29,4), %r18d, %r22d
+# INTEL: cfcmove	r22d, r18d, dword ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0x48,0x14,0x44,0x94,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmovel	291(%r28,%r29,4), %r18d
+# INTEL: cfcmove	r18d, dword ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0x78,0x08,0x44,0x94,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmoveq	291(%r28,%r29,4), %r19, %r23
+# INTEL: cfcmove	r23, r19, qword ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0xc0,0x14,0x44,0x9c,0xac,0x23,0x01,0x00,0x00
+
+# ATT:   cfcmoveq	291(%r28,%r29,4), %r19
+# INTEL: cfcmove	r19, qword ptr [r28 + 4*r29 + 291]
+0x62,0x8c,0xf8,0x08,0x44,0x9c,0xac,0x23,0x01,0x00,0x00
diff --git a/llvm/test/MC/Disassembler/X86/apx/cmov.txt b/llvm/test/MC/Disassembler/X86/apx/cmov.txt
new file mode 100644
index 00000000000000..cc96fb17e0085e
--- /dev/null
+++ b/llvm/test/MC/Disassembler/X86/apx/cmov.txt
@@ -0,0 +1,386 @@
+# RUN: llvm-mc -triple x86_64 -disassemble %s | FileCheck %s --check-prefix=ATT
+# RUN: llvm-mc -triple x86_64 -disassemble -output-asm-variant=1 %s | FileCheck %s --check-prefix=INTEL
+
+# ATT:   cmovbw	%dx, %ax, %r9w
+# INTEL: cmovb	r9w, ax, dx
+0x62,0xf4,0x35,0x18,0x42,0xc2
+
+# ATT:   cmovbl	%ecx, %edx, %r10d
+# INTEL: cmovb	r10d, edx, ecx
+0x62,0xf4,0x2c,0x18,0x42,0xd1
+
+# ATT:   cmovbq	%r9, %r15, %r11
+# INTEL: cmovb	r11, r15, r9
+0x62,0x54,0xa4,0x18,0x42,0xf9
+
+# ATT:   cmovbw	123(%r8,%rax,4), %dx, %ax
+# INTEL: cmovb	ax, dx, word ptr [r8 + 4*rax + 123]
+0x62,0xd4,0x7d,0x18,0x42,0x54,0x80,0x7b
+
+# ATT:   cmovbl	123(%r8,%rax,4), %ecx, %edx
+# INTEL: cmovb	edx, ecx, dword ptr [r8 + 4*rax + 123]
+0x62,0xd4,0x6c,0x18,0x42,0x4c,0x80,0x7b
+
+# ATT:   cmovbq	123(%r8,%rax,4), %r9, %r15
+# INTEL: cmovb	r15, r9, qword ptr [r8 + 4*rax + 123]
+0x62,0x54,0x84,0x18,0x42,0x4c,0x80,0x7b
+
+# ATT:   cmovbew	%dx, %ax, %r9w
+# INTEL: cmovbe	r9w, ax, dx
+0x62,0xf4,0x35,0x18,0x46,0xc2
+
+# ATT:   cmovbel	%ecx, %edx, %r10d
+# INTEL: cmovbe	r10d, edx, ecx
+0x62,0xf4,0x2c,0x18,0x46,0xd1
+
+# ATT:   cmovbeq	%r9, %r15, %r11
+# INTEL: cmovbe	r11, r15, r9
+0x62,0x54,0xa4,0x18,0x46,0xf9
+
+# ATT:   cmovbew	123(%r8,%rax,4), %dx, %ax
+# INTEL: cmovbe	ax, dx, word ptr [r8 + 4*rax + 123]
+0x62,0xd4,0x7d,0x18,0x46,0x54,0x80,0x7b
+
+# ATT:   cmovbel	123(%r8,%rax,4), %ecx, %edx
+# INTEL: cmovbe	edx, ecx, dword ptr [r8 + 4*rax + 123]
+0x62,0xd4,0x6c,0x18,0x46,0x4c,0x80,0x7b
+
+# ATT:   cmovbeq	123(%r8,%rax,4), %r9, %r15
+# INTEL: cmovbe	r15, r9, qword ptr [r8 + 4*rax + 123]
+0x62,0x54,0x84,0x18,0x46,0x4c,0x80,0x7b
+
+# ATT:   cmovlw	%dx, %ax, %r9w
+# INTEL: cmovl	r9w, ax, dx
+0x62,0xf4,0x35,0x18,0x4c,0xc2
+
+# ATT:   cmovll	%ecx, %edx, %r10d
+# INTEL: cmovl	r10d, edx, ecx
+0x62,0xf4,0x2c,0x18,0x4c,0xd1
+
+# ATT:   cmovlq	%r9, %r15, %r11
+# INTEL: cmovl	r11, r15, r9
+0x62,0x54,0xa4,0x18,0x4c,0xf9
+
+# ATT:   cmovlw	123(%r8,%rax,4), %dx, %ax
+# INTEL: cmovl	ax, dx, word ptr [r8 + 4*rax + 123]
+0x62,0xd4,0x7d,0x18,0x4c,0x54,0x80,0x7b
+
+# ATT:   cmovll	123(%r8,%rax,4), %ecx, %edx
+# INTEL: cmovl	edx, ecx, dword ptr [r8 + 4*rax + 123]
+0x62,0xd4,0x6c,0x18,0x4c,0x4c,0x80,0x7b
+
+# ATT:   cmovlq	123(%r8,%rax,4), %r9, %r15
+# INTEL: cmovl	r15, r9, qword ptr [r8 + 4*rax + 123]
+0x62,0x54,0x84,0x18,0x4c,0x4c,0x80,0x7b
+
+# ATT:   cmovlew	%dx, %ax, %r9w
+# INTEL: cmovle	r9w, ax, dx
+0x62,0xf4,0x35,0x18,0x4e,0xc2
+
+# ATT:   cmovlel	%ecx, %edx, %r10d
+# INTEL: cmovle	r10d, edx, ecx
+0x62,0xf4,0x2c,0x18,0x4e,0xd1
+
+# ATT:   cmovleq	%r9, %r15, %r11
+# INTEL: cmovle	r11, r15, r9
+0x62,0x54,0xa4,0x18,0x4e,0xf9
+
+# ATT:   cmovlew	123(%r8,%rax,4), %dx, %ax
+# INTEL: cmovle	ax, dx, word ptr [r8 + 4*rax + 123]
+0x62,0xd4,0x7d,0x18,0x4e,0x54,0x80,0x7b
+
+# ATT:   cmovlel	123(%r8,%rax,4), %ecx, %edx
+# INTEL: cmovle	edx, ecx, dword ptr [r8 + 4*rax + 123]
+0x62,0xd4,0x6c,0x18,0x4e,0x4c,0x80,0x7b
+
+# ATT:   cmovleq	123(%r8,%rax,4), %r9, %r15
+# INTEL: cmovle	r15, r9, qword ptr [r8 + 4*rax + 123]
+0x62,0x54,0x84,0x18,0x4e,0x4c,0x80,0x7b
+
+# ATT:   cmovaew	%dx, %ax, %r9w
+# INTEL: cmovae	r9w, ax, dx
+0x62,0xf4,0x35,0x18,0x43,0xc2
+
+# ATT:   cmovael	%ecx, %edx, %r10d
+# INTEL: cmovae	r10d, edx, ecx
+0x62,0xf4,0x2c,0x18,0x43,0xd1
+
+# ATT:   cmovaeq	%r9, %r15, %r11
+# INTEL: cmovae	r11, r15, r9
+0x62,0x54,0xa4,0x18,0x43,0xf9
+
+# ATT:   cmovaew	123(%r8,%rax,4), %dx, %ax
+# INTEL: cmovae	ax, dx, word ptr [r8 + 4*rax + 123]
+0x62,0xd4,0x7d,0x18,0x43,0x54,0x80,0x7b
+
+# ATT:   cmovael	123(%r8,%rax,4), %ecx, %edx
+# INTEL: cmovae	edx, ecx, dword ptr [r8 + 4*rax + 123]
+0x62,0xd4,0x6c,0x18,0x43,0x4c,0x80,0x7b
+
+# ATT:   cmovaeq	123(%r8,%rax,4), %r9, %r15
+# INTEL: cmovae	r15, r9, qword ptr [r8 + 4*rax + 123]
+0x62,0x54,0x84,0x18,0x43,0x4c,0x80,0x7b
+
+# ATT:   cmovaw	%dx, %ax, %r9w
+# INTEL: cmova	r9w, ax, dx
+0x62,0xf4,0x35,0x18,0x47,0xc2
+
+# ATT:   cmoval	%ecx, %edx, %r10d
+# INTEL: cmova	r10d, edx, ecx
+0x62,0xf4,0x2c,0x18,0x47,0xd1
+
+# ATT:   cmovaq	%r9, %r15, %r11
+# INTEL: cmova	r11, r15, r9
+0x62,0x54,0xa4,0x18,0x47,0xf9
+
+# ATT:   cmovaw	123(%r8,%rax,4), %dx, %ax
+# INTEL: cmova	ax, dx, word ptr [r8 + 4*rax + 123]
+0x62,0xd4,0x7d,0x18,0x47,0x54,0x80,0x7b
+
+# ATT:   cmoval	123(%r8,%rax,4), %ecx, %edx
+# INTEL: cmova	edx, ecx, dword ptr [r8 + 4*rax + 123]
+0x62,0xd4,0x6c,0x18,0x47,0x4c,0x80,0x7b
+
+# ATT:   cmovaq	123(%r8,%rax,4), %r9, %r15
+# INTEL: cmova	r15, r9, qword ptr [r8 + 4*rax + 123]
+0x62,0x54,0x84,0x18,0x47,0x4c,0x80,0x7b
+
+# ATT:   cmovgew	%dx, %ax, %r9w
+# INTEL: cmovge	r9w, ax, dx
+0x62,0xf4,0x35,0x18,0x4d,0xc2
+
+# ATT:   cmovgel	%ecx, %edx, %r10d
+# INTEL: cmovge	r10d, edx, ecx
+0x62,0xf4,0x2c,0x18,0x4d,0xd1
+
+# ATT:   cmovgeq	%r9, %r15, %r11
+# INTEL: cmovge	r11, r15, r9
+0x62,0x54,0xa4,0x18,0x4d,0xf9
+
+# ATT:   cmovgew	123(%r8,%rax,4), %dx, %ax
+# INTEL: cmovge	ax, dx, word ptr [r8 + 4*rax + 123]
+0x62,0xd4,0x7d,0x18,0x4d,0x54,0x80,0x7b
+
+# ATT:   cmovgel	123(%r8,%rax,4), %ecx, %edx
+# INTEL: cmovge	edx, ecx, dword ptr [r8 + 4*rax + 123]
+0x62,0xd4,0x6c,0x18,0x4d,0x4c,0x80,0x7b
+
+# ATT:   cmovgeq	123(%r8,%rax,4), %r9, %r15
+# INTEL: cmovge	r15, r9, qword ptr [r8 + 4*rax + 123]
+0x62,0x54,0x84,0x18,0x4d,0x4c,0x80,0x7b
+
+# ATT:   cmovgw	%dx, %ax, %r9w
+# INTEL: cmovg	r9w, ax, dx
+0x62,0xf4,0x35,0x18,0x4f,0xc2
+
+# ATT:   cmovgl	%ecx, %edx, %r10d
+# INTEL: cmovg	r10d, edx, ecx
+0x62,0xf4,0x2c,0x18,0x4f,0xd1
+
+# ATT:   cmovgq	%r9, %r15, %r11
+# INTEL: cmovg	r11, r15, r9
+0x62,0x54,0xa4,0x18,0x4f,0xf9
+
+# ATT:   cmovgw	123(%r8,%rax,4), %dx, %ax
+# INTEL: cmovg	ax, dx, word ptr [r8 + 4*rax + 123]
+0x62,0xd4,0x7d,0x18,0x4f,0x54,0x80,0x7b
+
+# ATT:   cmovgl	123(%r8,%rax,4), %ecx, %edx
+# INTEL: cmovg	edx, ecx, dword ptr [r8 + 4*rax + 123]
+0x62,0xd4,0x6c,0x18,0x4f,0x4c,0x80,0x7b
+
+# ATT:   cmovgq	123(%r8,%rax,4), %r9, %r15
+# INTEL: cmovg	r15, r9, qword ptr [r8 + 4*rax + 123]
+0x62,0x54,0x84,0x18,0x4f,0x4c,0x80,0x7b
+
+# ATT:   cmovnow	%dx, %ax, %r9w
+# INTEL: cmovno	r9w, ax, dx
+0x62,0xf4,0x35,0x18,0x41,0xc2
+
+# ATT:   cmovnol	%ecx, %edx, %r10d
+# INTEL: cmovno	r10d, edx, ecx
+0x62,0xf4,0x2c,0x18,0x41,0xd1
+
+# ATT:   cmovnoq	%r9, %r15, %r11
+# INTEL: cmovno	r11, r15, r9
+0x62,0x54,0xa4,0x18,0x41,0xf9
+
+# ATT:   cmovnow	123(%r8,%rax,4), %dx, %ax
+# INTEL: cmovno	ax, dx, word ptr [r8 + 4*rax + 123]
+0x62,0xd4,0x7d,0x18,0x41,0x54,0x80,0x7b
+
+# ATT:   cmovnol	123(%r8,%rax,4), %ecx, %edx
+# INTEL: cmovno	edx, ecx, dword ptr [r8 + 4*rax + 123]
+0x62,0xd4,0x6c,0x18,0x41,0x4c,0x80,0x7b
+
+# ATT:   cmovnoq	123(%r8,%rax,4), %r9, %r15
+# INTEL: cmovno	r15, r9, qword ptr [r8 + 4*rax + 123]
+0x62,0x54,0x84,0x18,0x41,0x4c,0x80,0x7b
+
+# ATT:   cmovnpw	%dx, %ax, %r9w
+# INTEL: cmovnp	r9w, ax, dx
+0x62,0xf4,0x35,0x18,0x4b,0xc2
+
+# ATT:   cmovnpl	%ecx, %edx, %r10d
+# INTEL: cmovnp	r10d, edx, ecx
+0x62,0xf4,0x2c,0x18,0x4b,0xd1
+
+# ATT:   cmovnpq	%r9, %r15, %r11
+# INTEL: cmovnp	r11, r15, r9
+0x62,0x54,0xa4,0x18,0x4b,0xf9
+
+# ATT:   cmovnpw	123(%r8,%rax,4), %dx, %ax
+# INTEL: cmovnp	ax, dx, word ptr [r8 + 4*rax + 123]
+0x62,0xd4,0x7d,0x18,0x4b,0x54,0x80,0x7b
+
+# ATT:   cmovnpl	123(%r8,%rax,4), %ecx, %edx
+# INTEL: cmovnp	edx, ecx, dword ptr [r8 + 4*rax + 123]
+0x62,0xd4,0x6c,0x18,0x4b,0x4c,0x80,0x7b
+
+# ATT:   cmovnpq	123(%r8,%rax,4), %r9, %r15
+# INTEL: cmovnp	r15, r9, qword ptr [r8 + 4*rax + 123]
+0x62,0x54,0x84,0x18,0x4b,0x4c,0x80,0x7b
+
+# ATT:   cmovnsw	%dx, %ax, %r9w
+# INTEL: cmovns	r9w, ax, dx
+0x62,0xf4,0x35,0x18,0x49,0xc2
+
+# ATT:   cmovnsl	%ecx, %edx, %r10d
+# INTEL: cmovns	r10d, edx, ecx
+0x62,0xf4,0x2c,0x18,0x49,0xd1
+
+# ATT:   cmovnsq	%r9, %r15, %r11
+# INTEL: cmovns	r11, r15, r9
+0x62,0x54,0xa4,0x18,0x49,0xf9
+
+# ATT:   cmovnsw	123(%r8,%rax,4), %dx, %ax
+# INTEL: cmovns	ax, dx, word ptr [r8 + 4*rax + 123]
+0x62,0xd4,0x7d,0x18,0x49,0x54,0x80,0x7b
+
+# ATT:   cmovnsl	123(%r8,%rax,4), %ecx, %edx
+# INTEL: cmovns	edx, ecx, dword ptr [r8 + 4*rax + 123]
+0x62,0xd4,0x6c,0x18,0x49,0x4c,0x80,0x7b
+
+# ATT:   cmovnsq	123(%r8,%rax,4), %r9, %r15
+# INTEL: cmovns	r15, r9, qword ptr [r8 + 4*rax + 123]
+0x62,0x54,0x84,0x18,0x49,0x4c,0x80,0x7b
+
+# ATT:   cmovnew	%dx, %ax, %r9w
+# INTEL: cmovne	r9w, ax, dx
+0x62,0xf4,0x35,0x18,0x45,0xc2
+
+# ATT:   cmovnel	%ecx, %edx, %r10d
+# INTEL: cmovne	r10d, edx, ecx
+0x62,0xf4,0x2c,0x18,0x45,0xd1
+
+# ATT:   cmovneq	%r9, %r15, %r11
+# INTEL: cmovne	r11, r15, r9
+0x62,0x54,0xa4,0x18,0x45,0xf9
+
+# ATT:   cmovnew	123(%r8,%rax,4), %dx, %ax
+# INTEL: cmovne	ax, dx, word ptr [r8 + 4*rax + 123]
+0x62,0xd4,0x7d,0x18,0x45,0x54,0x80,0x7b
+
+# ATT:   cmovnel	123(%r8,%rax,4), %ecx, %edx
+# INTEL: cmovne	edx, ecx, dword ptr [r8 + 4*rax + 123]
+0x62,0xd4,0x6c,0x18,0x45,0x4c,0x80,0x7b
+
+# ATT:   cmovneq	123(%r8,%rax,4), %r9, %r15
+# INTEL: cmovne	r15, r9, qword ptr [r8 + 4*rax + 123]
+0x62,0x54,0x84,0x18,0x45,0x4c,0x80,0x7b
+
+# ATT:   cmovow	%dx, %ax, %r9w
+# INTEL: cmovo	r9w, ax, dx
+0x62,0xf4,0x35,0x18,0x40,0xc2
+
+# ATT:   cmovol	%ecx, %edx, %r10d
+# INTEL: cmovo	r10d, edx, ecx
+0x62,0xf4,0x2c,0x18,0x40,0xd1
+
+# ATT:   cmovoq	%r9, %r15, %r11
+# INTEL: cmovo	r11, r15, r9
+0x62,0x54,0xa4,0x18,0x40,0xf9
+
+# ATT:   cmovow	123(%r8,%rax,4), %dx, %ax
+# INTEL: cmovo	ax, dx, word ptr [r8 + 4*rax + 123]
+0x62,0xd4,0x7d,0x18,0x40,0x54,0x80,0x7b
+
+# ATT:   cmovol	123(%r8,%rax,4), %ecx, %edx
+# INTEL: cmovo	edx, ecx, dword ptr [r8 + 4*rax + 123]
+0x62,0xd4,0x6c,0x18,0x40,0x4c,0x80,0x7b
+
+# ATT:   cmovoq	123(%r8,%rax,4), %r9, %r15
+# INTEL: cmovo	r15, r9, qword ptr [r8 + 4*rax + 123]
+0x62,0x54,0x84,0x18,0x40,0x4c,0x80,0x7b
+
+# ATT:   cmovpw	%dx, %ax, %r9w
+# INTEL: cmovp	r9w, ax, dx
+0x62,0xf4,0x35,0x18,0x4a,0xc2
+
+# ATT:   cmovpl	%ecx, %edx, %r10d
+# INTEL: cmovp	r10d, edx, ecx
+0x62,0xf4,0x2c,0x18,0x4a,0xd1
+
+# ATT:   cmovpq	%r9, %r15, %r11
+# INTEL: cmovp	r11, r15, r9
+0x62,0x54,0xa4,0x18,0x4a,0xf9
+
+# ATT:   cmovpw	123(%r8,%rax,4), %dx, %ax
+# INTEL: cmovp	ax, dx, word ptr [r8 + 4*rax + 123]
+0x62,0xd4,0x7d,0x18,0x4a,0x54,0x80,0x7b
+
+# ATT:   cmovpl	123(%r8,%rax,4), %ecx, %edx
+# INTEL: cmovp	edx, ecx, dword ptr [r8 + 4*rax + 123]
+0x62,0xd4,0x6c,0x18,0x4a,0x4c,0x80,0x7b
+
+# ATT:   cmovpq	123(%r8,%rax,4), %r9, %r15
+# INTEL: cmovp	r15, r9, qword ptr [r8 + 4*rax + 123]
+0x62,0x54,0x84,0x18,0x4a,0x4c,0x80,0x7b
+
+# ATT:   cmovsw	%dx, %ax, %r9w
+# INTEL: cmovs	r9w, ax, dx
+0x62,0xf4,0x35,0x18,0x48,0xc2
+
+# ATT:   cmovsl	%ecx, %edx, %r10d
+# INTEL: cmovs	r10d, edx, ecx
+0x62,0xf4,0x2c,0x18,0x48,0xd1
+
+# ATT:   cmovsq	%r9, %r15, %r11
+# INTEL: cmovs	r11, r15, r9
+0x62,0x54,0xa4,0x18,0x48,0xf9
+
+# ATT:   cmovsw	123(%r8,%rax,4), %dx, %ax
+# INTEL: cmovs	ax, dx, word ptr [r8 + 4*rax + 123]
+0x62,0xd4,0x7d,0x18,0x48,0x54,0x80,0x7b
+
+# ATT:   cmovsl	123(%r8,%rax,4), %ecx, %edx
+# INTEL: cmovs	edx, ecx, dword ptr [r8 + 4*rax + 123]
+0x62,0xd4,0x6c,0x18,0x48,0x4c,0x80,0x7b
+
+# ATT:   cmovsq	123(%r8,%rax,4), %r9, %r15
+# INTEL: cmovs	r15, r9, qword ptr [r8 + 4*rax + 123]
+0x62,0x54,0x84,0x18,0x48,0x4c,0x80,0x7b
+
+# ATT:   cmovew	%dx, %ax, %r9w
+# INTEL: cmove	r9w, ax, dx
+0x62,0xf4,0x35,0x18,0x44,0xc2
+
+# ATT:   cmovel	%ecx, %edx, %r10d
+# INTEL: cmove	r10d, edx, ecx
+0x62,0xf4,0x2c,0x18,0x44,0xd1
+
+# ATT:   cmoveq	%r9, %r15, %r11
+# INTEL: cmove	r11, r15, r9
+0x62,0x54,0xa4,0x18,0x44,0xf9
+
+# ATT:   cmovew	123(%r8,%rax,4), %dx, %ax
+# INTEL: cmove	ax, dx, word ptr [r8 + 4*rax + 123]
+0x62,0xd4,0x7d,0x18,0x44,0x54,0x80,0x7b
+
+# ATT:   cmovel	123(%r8,%rax,4), %ecx, %edx
+# INTEL: cmove	edx, ecx, dword ptr [r8 + 4*rax + 123]
+0x62,0xd4,0x6c,0x18,0x44,0x4c,0x80,0x7b
+
+# ATT:   cmoveq	123(%r8,%rax,4), %r9, %r15
+# INTEL: cmove	r15, r9, qword ptr [r8 + 4*rax + 123]
+0x62,0x54,0x84,0x18,0x44,0x4c,0x80,0x7b
diff --git a/llvm/test/MC/Disassembler/X86/apx/evex-format.txt b/llvm/test/MC/Disassembler/X86/apx/evex-format.txt
index 1c1f70b096bed9..1156f5c409922a 100644
--- a/llvm/test/MC/Disassembler/X86/apx/evex-format.txt
+++ b/llvm/test/MC/Disassembler/X86/apx/evex-format.txt
@@ -11,6 +11,12 @@
 # INTEL: add	r18, qword ptr [r17 + 123], r16
 0x62,0xec,0xec,0x10,0x01,0x41,0x7b
 
+## MRMDestMemCC
+
+# ATT:   cfcmovbq %r16, 123(%r17,%r18,4)
+# INTEL: cfcmovb	qword ptr [r17 + 4*r18 + 123], r16
+0x62,0xec,0xf8,0x0c,0x42,0x44,0x91,0x7b
+
 ## MRMSrcMem
 
 # ATT:   vbroadcasti32x4	(%r16,%r17), %zmm0
@@ -21,6 +27,16 @@
 # INTEL: sub	r18, r17, qword ptr [r16 + 123]
 0x62,0xec,0xec,0x10,0x2b,0x48,0x7b
 
+## MRMSrcMemCC
+
+# ATT:   cfcmovbq	123(%r16,%r17,4), %r18
+# INTEL: cfcmovb	r18, qword ptr [r16 + 4*r17 + 123]
+0x62,0xec,0xf8,0x08,0x42,0x54,0x88,0x7b
+
+# ATT:   cfcmovbq	123(%r16,%r17,4), %r18, %r19
+# INTEL: cfcmovb	r19, r18, qword ptr [r16 + 4*r17 + 123]
+0x62,0xec,0xe0,0x14,0x42,0x54,0x88,0x7b
+
 ## MRM0m
 
 # ATT:   vprorq	$0, (%r16,%r17), %zmm0
@@ -123,12 +139,28 @@
 # INTEL: {nf}	add	r17, r16
 0x62,0xec,0xfc,0x0c,0x01,0xc1
 
+## MRMDestRegCC
+
+# ATT:   cfcmovbq	%r16, %r17
+# INTEL: cfcmovb	r17, r16
+0x62,0xec,0xfc,0x0c,0x42,0xc1
+
 ## MRMSrcReg
 
 # ATT:   mulxq	%r16, %r17, %r18
 # INTEL: mulx	r18, r17, r16
 0x62,0xea,0xf7,0x00,0xf6,0xd0
 
+## MRMSrcRegCC
+
+# ATT:   cfcmovbq	%r16, %r17, %r18
+# INTEL: cfcmovb	r18, r17, r16
+0x62,0xec,0xec,0x14,0x42,0xc8
+
+# ATT:   cfcmovlq	%r16, %r17, %r18
+# INTEL: cfcmovl	r18, r17, r16
+0x62,0xec,0xec,0x14,0x4c,0xc8
+
 ## MRMSrcReg4VOp3
 
 # ATT:   bzhiq	%r19, %r23, %r27
diff --git a/llvm/test/MC/Disassembler/X86/apx/reverse-encoding.txt b/llvm/test/MC/Disassembler/X86/apx/reverse-encoding.txt
index 9e812e370146ba..fd12d904a4a46d 100644
--- a/llvm/test/MC/Disassembler/X86/apx/reverse-encoding.txt
+++ b/llvm/test/MC/Disassembler/X86/apx/reverse-encoding.txt
@@ -430,3 +430,17 @@
 # ATT:   ccmpoq {dfv=}	%r16, %r17
 # INTEL: ccmpo {dfv=}	r17, r16
 0x62,0xec,0x84,0x00,0x3b,0xc8
+
+## cfcmov
+
+# ATT:   cfcmovbew	%r16w, %r17w
+# INTEL: cfcmovbe	r17w, r16w
+0x62,0xec,0x7d,0x08,0x46,0xc8
+
+# ATT:   cfcmovbel	%r16d, %r17d
+# INTEL: cfcmovbe	r17d, r16d
+0x62,0xec,0x7c,0x08,0x46,0xc8
+
+# ATT:   cfcmovbeq	%r16, %r17
+# INTEL: cfcmovbe	r17, r16
+0x62,0xec,0xfc,0x08,0x46,0xc8
diff --git a/llvm/test/MC/Hexagon/directive-attribute-err.s b/llvm/test/MC/Hexagon/directive-attribute-err.s
new file mode 100644
index 00000000000000..52b145b1ff28a1
--- /dev/null
+++ b/llvm/test/MC/Hexagon/directive-attribute-err.s
@@ -0,0 +1,24 @@
+/// attribute parsing error cases.
+
+// RUN: not llvm-mc -triple=hexagon -filetype=asm %s 2>&1 \
+// RUN:   | FileCheck %s
+
+  .attribute Tag_unknown_name, 0
+// CHECK: [[#@LINE-1]]:14: error: attribute name not recognized: Tag_unknown_name
+// CHECK-NEXT:   .attribute Tag_unknown_name
+
+  .attribute [non_constant_expression], 0
+// CHECK: [[#@LINE-1]]:14: error: expected numeric constant
+// CHECK-NEXT:   .attribute [non_constant_expression], 0
+
+  .attribute 42, "forty two"
+// CHECK: [[#@LINE-1]]:18: error: expected numeric constant
+// CHECK-NEXT:   .attribute 42, "forty two"
+
+  .attribute Tag_arch, "v75"
+// CHECK: [[#@LINE-1]]:24: error: expected numeric constant
+// CHECK-NEXT:   .attribute Tag_arch, "v75"
+
+  .attribute 0
+// CHECK: :[[#@LINE-1]]:15: error: expected comma
+// CHECK-NEXT:   .attribute 0
diff --git a/llvm/test/MC/Hexagon/directive-attribute.s b/llvm/test/MC/Hexagon/directive-attribute.s
new file mode 100644
index 00000000000000..d7c893061e18ab
--- /dev/null
+++ b/llvm/test/MC/Hexagon/directive-attribute.s
@@ -0,0 +1,41 @@
+/// Check .attribute parsing.
+
+// RUN: llvm-mc -triple=hexagon -filetype=obj %s | llvm-readelf -A - | \
+// RUN:     FileCheck %s --match-full-lines --implicit-check-not={{.}}
+
+.attribute 4, 71 // Tag_arch
+.attribute Tag_cabac, 1
+.attribute Tag_hvx_arch, 68
+.attribute 7, 1 // Tag_hvx_qfloat
+
+//      CHECK: BuildAttributes {
+// CHECK-NEXT:   FormatVersion: 0x41
+// CHECK-NEXT:   Section 1 {
+// CHECK-NEXT:     SectionLength: 25
+// CHECK-NEXT:     Vendor: hexagon
+// CHECK-NEXT:     Tag: Tag_File (0x1)
+// CHECK-NEXT:     Size: 13
+// CHECK-NEXT:     FileAttributes {
+// CHECK-NEXT:       Attribute {
+// CHECK-NEXT:         Tag: 4
+// CHECK-NEXT:         TagName: arch
+// CHECK-NEXT:         Value: 71
+// CHECK-NEXT:       }
+// CHECK-NEXT:       Attribute {
+// CHECK-NEXT:         Tag: 10
+// CHECK-NEXT:         TagName: cabac
+// CHECK-NEXT:         Value: 1
+// CHECK-NEXT:       }
+// CHECK-NEXT:       Attribute {
+// CHECK-NEXT:         Tag: 5
+// CHECK-NEXT:         TagName: hvx_arch
+// CHECK-NEXT:         Value: 68
+// CHECK-NEXT:       }
+// CHECK-NEXT:       Attribute {
+// CHECK-NEXT:         Tag: 7
+// CHECK-NEXT:         TagName: hvx_qfloat
+// CHECK-NEXT:         Value: 1
+// CHECK-NEXT:       }
+// CHECK-NEXT:     }
+// CHECK-NEXT:   }
+// CHECK-NEXT: }
diff --git a/llvm/test/MC/Hexagon/hexagon_attributes.s b/llvm/test/MC/Hexagon/hexagon_attributes.s
new file mode 100644
index 00000000000000..e90536030d977f
--- /dev/null
+++ b/llvm/test/MC/Hexagon/hexagon_attributes.s
@@ -0,0 +1,94 @@
+/// Check that file attributes are recorded in a .hexagon.attributes section.
+
+q0&=vcmp.gt(v0.bf,v0.bf)     // hvxv73, hvx-qfloat
+r3:2=cround(r1:0,#0x0)       // v67, audio
+v3:0.w=vrmpyz(v0.b,r0.b)     // hvxv73, zreg
+v1:0.sf=vadd(v0.bf,v0.bf)    // hvxv73, hvx-ieee-fp
+
+// RUN: llvm-mc --mattr=+v67,+hvxv73,+hvx-qfloat,+hvx-ieee-fp,+zreg,+audio %s \
+// RUN:   -triple=hexagon -filetype=obj --hexagon-add-build-attributes -o %t.o
+
+// RUN: llvm-readelf -A %t.o | \
+// RUN:   FileCheck %s --match-full-lines --implicit-check-not={{.}} --check-prefix=READELF
+
+/// llvm-objudmp should be able to determine subtarget features
+/// without manually passing in features when an attribute section is present.
+// RUN: llvm-objdump -d %t.o | FileCheck %s --check-prefix=OBJDUMP
+
+// RUN: llvm-mc --mattr=+v67,+hvxv73,+hvx-qfloat,+hvx-ieee-fp,+zreg,+audio %s \
+// RUN:   -triple=hexagon -filetype=asm --hexagon-add-build-attributes | \
+// RUN:     FileCheck %s --match-full-lines --implicit-check-not={{.}} --check-prefix=ASM
+
+//      READELF: BuildAttributes {
+// READELF-NEXT:   FormatVersion: 0x41
+// READELF-NEXT:   Section 1 {
+// READELF-NEXT:     SectionLength: 31
+// READELF-NEXT:     Vendor: hexagon
+// READELF-NEXT:     Tag: Tag_File (0x1)
+// READELF-NEXT:     Size: 19
+// READELF-NEXT:     FileAttributes {
+// READELF-NEXT:       Attribute {
+// READELF-NEXT:         Tag: 4
+// READELF-NEXT:         TagName: arch
+// READELF-NEXT:         Value: 67
+// READELF-NEXT:       }
+// READELF-NEXT:       Attribute {
+// READELF-NEXT:         Tag: 5
+// READELF-NEXT:         TagName: hvx_arch
+// READELF-NEXT:         Value: 73
+// READELF-NEXT:       }
+// READELF-NEXT:       Attribute {
+// READELF-NEXT:         Tag: 6
+// READELF-NEXT:         TagName: hvx_ieeefp
+// READELF-NEXT:         Value: 1
+// READELF-NEXT:       }
+// READELF-NEXT:       Attribute {
+// READELF-NEXT:         Tag: 7
+// READELF-NEXT:         TagName: hvx_qfloat
+// READELF-NEXT:         Value: 1
+// READELF-NEXT:       }
+// READELF-NEXT:       Attribute {
+// READELF-NEXT:         Tag: 8
+// READELF-NEXT:         TagName: zreg
+// READELF-NEXT:         Value: 1
+// READELF-NEXT:       }
+// READELF-NEXT:       Attribute {
+// READELF-NEXT:         Tag: 9
+// READELF-NEXT:         TagName: audio
+// READELF-NEXT:         Value: 1
+// READELF-NEXT:       }
+// READELF-NEXT:       Attribute {
+// READELF-NEXT:         Tag: 10
+// READELF-NEXT:         TagName: cabac
+// READELF-NEXT:         Value: 1
+// READELF-NEXT:       }
+// READELF-NEXT:     }
+// READELF-NEXT:   }
+// READELF-NEXT: }
+
+//      OBJDUMP: 1c80e0d0 {      q0 &= vcmp.gt(v0.bf,v0.bf) }
+// OBJDUMP-NEXT: 8ce0c042 {      r3:2 = cround(r1:0,#0x0) }
+// OBJDUMP-NEXT: 19e8c000 {      v3:0.w = vrmpyz(v0.b,r0.b) }
+// OBJDUMP-NEXT: 1d40e0c0 {      v1:0.sf = vadd(v0.bf,v0.bf) }
+
+//      ASM: .attribute      4, 67   // Tag_arch
+// ASM-NEXT: .attribute      5, 73   // Tag_hvx_arch
+// ASM-NEXT: .attribute      6, 1    // Tag_hvx_ieeefp
+// ASM-NEXT: .attribute      7, 1    // Tag_hvx_qfloat
+// ASM-NEXT: .attribute      8, 1    // Tag_zreg
+// ASM-NEXT: .attribute      9, 1    // Tag_audio
+// ASM-NEXT: .attribute      10, 1   // Tag_cabac
+// ASM-NEXT:        .text
+// ASM-EMPTY:
+// ASM-NEXT:        {
+// ASM-NEXT:                q0 &= vcmp.gt(v0.bf,v0.bf)
+// ASM-NEXT:        }
+// ASM-NEXT:        {
+// ASM-NEXT:                r3:2 = cround(r1:0,#0)
+// ASM-NEXT:        }
+// ASM-NEXT:        {
+// ASM-NEXT:                v3:0.w = vrmpyz(v0.b,r0.b)
+// ASM-NEXT:        }
+// ASM-NEXT:        {
+// ASM-NEXT:                v1:0.sf = vadd(v0.bf,v0.bf)
+// ASM-NEXT:        }
diff --git a/llvm/test/MC/LoongArch/Macros/macros-li-bad.s b/llvm/test/MC/LoongArch/Macros/macros-li-bad.s
index 194b86bfed2733..c880a01020059b 100644
--- a/llvm/test/MC/LoongArch/Macros/macros-li-bad.s
+++ b/llvm/test/MC/LoongArch/Macros/macros-li-bad.s
@@ -5,3 +5,9 @@ li.w $a0, 0x100000000
 
 li.d $a0, 0x10000000000000000
 # CHECK: :[[#@LINE-1]]:11: error: unknown operand
+
+li.w $a0, non_const_val
+# CHECK: :[[#@LINE-1]]:11: error: operand must be a 32 bit immediate
+
+li.d $a0, non_const_val
+# CHECK: :[[#@LINE-1]]:11: error: operand must be a 64 bit immediate
diff --git a/llvm/test/MC/RISCV/rv32-machine-csr-names.s b/llvm/test/MC/RISCV/rv32-machine-csr-names.s
index e7a6d9ce718f2c..016f448d8dac63 100644
--- a/llvm/test/MC/RISCV/rv32-machine-csr-names.s
+++ b/llvm/test/MC/RISCV/rv32-machine-csr-names.s
@@ -1149,3 +1149,59 @@ csrrs t2, 0x319, zero
 csrrs t1, miph, zero
 # uimm12
 csrrs t2, 0x354, zero
+
+################################################
+# Resumable Non-Maskable Interrupts(Smrnmi) CSRs
+################################################
+
+# mnscratch
+# name
+# CHECK-INST: csrrs t1, mnscratch, zero
+# CHECK-ENC: encoding: [0x73,0x23,0x00,0x74]
+# CHECK-INST-ALIAS: csrr t1, mnscratch
+# uimm12
+# CHECK-INST: csrrs t2, mnscratch, zero
+# CHECK-ENC: encoding: [0xf3,0x23,0x00,0x74]
+# CHECK-INST-ALIAS: csrr t2, mnscratch
+csrrs t1, mnscratch, zero
+# uimm12
+csrrs t2, 0x740, zero
+
+# mnepc
+# name
+# CHECK-INST: csrrs t1, mnepc, zero
+# CHECK-ENC: encoding: [0x73,0x23,0x10,0x74]
+# CHECK-INST-ALIAS: csrr t1, mnepc
+# uimm12
+# CHECK-INST: csrrs t2, mnepc, zero
+# CHECK-ENC: encoding: [0xf3,0x23,0x10,0x74]
+# CHECK-INST-ALIAS: csrr t2, mnepc
+csrrs t1, mnepc, zero
+# uimm12
+csrrs t2, 0x741, zero
+
+# mncause
+# name
+# CHECK-INST: csrrs t1, mncause, zero
+# CHECK-ENC: encoding: [0x73,0x23,0x20,0x74]
+# CHECK-INST-ALIAS: csrr t1, mncause
+# uimm12
+# CHECK-INST: csrrs t2, mncause, zero
+# CHECK-ENC: encoding: [0xf3,0x23,0x20,0x74]
+# CHECK-INST-ALIAS: csrr t2, mncause
+csrrs t1, mncause, zero
+# uimm12
+csrrs t2, 0x742, zero
+
+# mnstatus
+# name
+# CHECK-INST: csrrs t1, mnstatus, zero
+# CHECK-ENC: encoding: [0x73,0x23,0x40,0x74]
+# CHECK-INST-ALIAS: csrr t1, mnstatus
+# uimm12
+# CHECK-INST: csrrs t2, mnstatus, zero
+# CHECK-ENC: encoding: [0xf3,0x23,0x40,0x74]
+# CHECK-INST-ALIAS: csrr t2, mnstatus
+csrrs t1, mnstatus, zero
+# uimm12
+csrrs t2, 0x744, zero
diff --git a/llvm/test/MC/X86/apx/cfcmov-att.s b/llvm/test/MC/X86/apx/cfcmov-att.s
new file mode 100644
index 00000000000000..a6947ca7bc0deb
--- /dev/null
+++ b/llvm/test/MC/X86/apx/cfcmov-att.s
@@ -0,0 +1,841 @@
+# RUN: llvm-mc -triple x86_64 --show-encoding %s | FileCheck %s
+
+# CHECK: cfcmovbw	%r17w, %r21w, %r25w
+# CHECK: encoding: [0x62,0xec,0x35,0x14,0x42,0xe9]
+         cfcmovbw	%r17w, %r21w, %r25w
+
+# CHECK: cfcmovbw	%r17w, %r21w
+# CHECK: encoding: [0x62,0xec,0x7d,0x0c,0x42,0xcd]
+         cfcmovbw	%r17w, %r21w
+
+# CHECK: cfcmovbw	%r17w, 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x8c,0x79,0x0c,0x42,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovbw	%r17w, 291(%r28,%r29,4)
+
+# CHECK: cfcmovbl	%r18d, %r22d, %r26d
+# CHECK: encoding: [0x62,0xec,0x2c,0x14,0x42,0xf2]
+         cfcmovbl	%r18d, %r22d, %r26d
+
+# CHECK: cfcmovbl	%r18d, %r22d
+# CHECK: encoding: [0x62,0xec,0x7c,0x0c,0x42,0xd6]
+         cfcmovbl	%r18d, %r22d
+
+# CHECK: cfcmovbl	%r18d, 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x8c,0x78,0x0c,0x42,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovbl	%r18d, 291(%r28,%r29,4)
+
+# CHECK: cfcmovbq	%r19, %r23, %r27
+# CHECK: encoding: [0x62,0xec,0xa4,0x14,0x42,0xfb]
+         cfcmovbq	%r19, %r23, %r27
+
+# CHECK: cfcmovbq	%r19, %r23
+# CHECK: encoding: [0x62,0xec,0xfc,0x0c,0x42,0xdf]
+         cfcmovbq	%r19, %r23
+
+# CHECK: cfcmovbq	%r19, 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x8c,0xf8,0x0c,0x42,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovbq	%r19, 291(%r28,%r29,4)
+
+# CHECK: cfcmovbw	291(%r28,%r29,4), %r17w, %r21w
+# CHECK: encoding: [0x62,0x8c,0x51,0x14,0x42,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovbw	291(%r28,%r29,4), %r17w, %r21w
+
+# CHECK: cfcmovbw	291(%r28,%r29,4), %r17w
+# CHECK: encoding: [0x62,0x8c,0x79,0x08,0x42,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovbw	291(%r28,%r29,4), %r17w
+
+# CHECK: cfcmovbl	291(%r28,%r29,4), %r18d, %r22d
+# CHECK: encoding: [0x62,0x8c,0x48,0x14,0x42,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovbl	291(%r28,%r29,4), %r18d, %r22d
+
+# CHECK: cfcmovbl	291(%r28,%r29,4), %r18d
+# CHECK: encoding: [0x62,0x8c,0x78,0x08,0x42,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovbl	291(%r28,%r29,4), %r18d
+
+# CHECK: cfcmovbq	291(%r28,%r29,4), %r19, %r23
+# CHECK: encoding: [0x62,0x8c,0xc0,0x14,0x42,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovbq	291(%r28,%r29,4), %r19, %r23
+
+# CHECK: cfcmovbq	291(%r28,%r29,4), %r19
+# CHECK: encoding: [0x62,0x8c,0xf8,0x08,0x42,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovbq	291(%r28,%r29,4), %r19
+
+# CHECK: cfcmovbew	%r17w, %r21w, %r25w
+# CHECK: encoding: [0x62,0xec,0x35,0x14,0x46,0xe9]
+         cfcmovbew	%r17w, %r21w, %r25w
+
+# CHECK: cfcmovbew	%r17w, %r21w
+# CHECK: encoding: [0x62,0xec,0x7d,0x0c,0x46,0xcd]
+         cfcmovbew	%r17w, %r21w
+
+# CHECK: cfcmovbew	%r17w, 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x8c,0x79,0x0c,0x46,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovbew	%r17w, 291(%r28,%r29,4)
+
+# CHECK: cfcmovbel	%r18d, %r22d, %r26d
+# CHECK: encoding: [0x62,0xec,0x2c,0x14,0x46,0xf2]
+         cfcmovbel	%r18d, %r22d, %r26d
+
+# CHECK: cfcmovbel	%r18d, %r22d
+# CHECK: encoding: [0x62,0xec,0x7c,0x0c,0x46,0xd6]
+         cfcmovbel	%r18d, %r22d
+
+# CHECK: cfcmovbel	%r18d, 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x8c,0x78,0x0c,0x46,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovbel	%r18d, 291(%r28,%r29,4)
+
+# CHECK: cfcmovbeq	%r19, %r23, %r27
+# CHECK: encoding: [0x62,0xec,0xa4,0x14,0x46,0xfb]
+         cfcmovbeq	%r19, %r23, %r27
+
+# CHECK: cfcmovbeq	%r19, %r23
+# CHECK: encoding: [0x62,0xec,0xfc,0x0c,0x46,0xdf]
+         cfcmovbeq	%r19, %r23
+
+# CHECK: cfcmovbeq	%r19, 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x8c,0xf8,0x0c,0x46,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovbeq	%r19, 291(%r28,%r29,4)
+
+# CHECK: cfcmovbew	291(%r28,%r29,4), %r17w, %r21w
+# CHECK: encoding: [0x62,0x8c,0x51,0x14,0x46,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovbew	291(%r28,%r29,4), %r17w, %r21w
+
+# CHECK: cfcmovbew	291(%r28,%r29,4), %r17w
+# CHECK: encoding: [0x62,0x8c,0x79,0x08,0x46,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovbew	291(%r28,%r29,4), %r17w
+
+# CHECK: cfcmovbel	291(%r28,%r29,4), %r18d, %r22d
+# CHECK: encoding: [0x62,0x8c,0x48,0x14,0x46,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovbel	291(%r28,%r29,4), %r18d, %r22d
+
+# CHECK: cfcmovbel	291(%r28,%r29,4), %r18d
+# CHECK: encoding: [0x62,0x8c,0x78,0x08,0x46,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovbel	291(%r28,%r29,4), %r18d
+
+# CHECK: cfcmovbeq	291(%r28,%r29,4), %r19, %r23
+# CHECK: encoding: [0x62,0x8c,0xc0,0x14,0x46,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovbeq	291(%r28,%r29,4), %r19, %r23
+
+# CHECK: cfcmovbeq	291(%r28,%r29,4), %r19
+# CHECK: encoding: [0x62,0x8c,0xf8,0x08,0x46,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovbeq	291(%r28,%r29,4), %r19
+
+# CHECK: cfcmovlw	%r17w, %r21w, %r25w
+# CHECK: encoding: [0x62,0xec,0x35,0x14,0x4c,0xe9]
+         cfcmovlw	%r17w, %r21w, %r25w
+
+# CHECK: cfcmovlw	%r17w, %r21w
+# CHECK: encoding: [0x62,0xec,0x7d,0x0c,0x4c,0xcd]
+         cfcmovlw	%r17w, %r21w
+
+# CHECK: cfcmovlw	%r17w, 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x8c,0x79,0x0c,0x4c,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovlw	%r17w, 291(%r28,%r29,4)
+
+# CHECK: cfcmovll	%r18d, %r22d, %r26d
+# CHECK: encoding: [0x62,0xec,0x2c,0x14,0x4c,0xf2]
+         cfcmovll	%r18d, %r22d, %r26d
+
+# CHECK: cfcmovll	%r18d, %r22d
+# CHECK: encoding: [0x62,0xec,0x7c,0x0c,0x4c,0xd6]
+         cfcmovll	%r18d, %r22d
+
+# CHECK: cfcmovll	%r18d, 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x8c,0x78,0x0c,0x4c,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovll	%r18d, 291(%r28,%r29,4)
+
+# CHECK: cfcmovlq	%r19, %r23, %r27
+# CHECK: encoding: [0x62,0xec,0xa4,0x14,0x4c,0xfb]
+         cfcmovlq	%r19, %r23, %r27
+
+# CHECK: cfcmovlq	%r19, %r23
+# CHECK: encoding: [0x62,0xec,0xfc,0x0c,0x4c,0xdf]
+         cfcmovlq	%r19, %r23
+
+# CHECK: cfcmovlq	%r19, 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x8c,0xf8,0x0c,0x4c,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovlq	%r19, 291(%r28,%r29,4)
+
+# CHECK: cfcmovlw	291(%r28,%r29,4), %r17w, %r21w
+# CHECK: encoding: [0x62,0x8c,0x51,0x14,0x4c,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovlw	291(%r28,%r29,4), %r17w, %r21w
+
+# CHECK: cfcmovlw	291(%r28,%r29,4), %r17w
+# CHECK: encoding: [0x62,0x8c,0x79,0x08,0x4c,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovlw	291(%r28,%r29,4), %r17w
+
+# CHECK: cfcmovll	291(%r28,%r29,4), %r18d, %r22d
+# CHECK: encoding: [0x62,0x8c,0x48,0x14,0x4c,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovll	291(%r28,%r29,4), %r18d, %r22d
+
+# CHECK: cfcmovll	291(%r28,%r29,4), %r18d
+# CHECK: encoding: [0x62,0x8c,0x78,0x08,0x4c,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovll	291(%r28,%r29,4), %r18d
+
+# CHECK: cfcmovlq	291(%r28,%r29,4), %r19, %r23
+# CHECK: encoding: [0x62,0x8c,0xc0,0x14,0x4c,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovlq	291(%r28,%r29,4), %r19, %r23
+
+# CHECK: cfcmovlq	291(%r28,%r29,4), %r19
+# CHECK: encoding: [0x62,0x8c,0xf8,0x08,0x4c,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovlq	291(%r28,%r29,4), %r19
+
+# CHECK: cfcmovlew	%r17w, %r21w, %r25w
+# CHECK: encoding: [0x62,0xec,0x35,0x14,0x4e,0xe9]
+         cfcmovlew	%r17w, %r21w, %r25w
+
+# CHECK: cfcmovlew	%r17w, %r21w
+# CHECK: encoding: [0x62,0xec,0x7d,0x0c,0x4e,0xcd]
+         cfcmovlew	%r17w, %r21w
+
+# CHECK: cfcmovlew	%r17w, 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x8c,0x79,0x0c,0x4e,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovlew	%r17w, 291(%r28,%r29,4)
+
+# CHECK: cfcmovlel	%r18d, %r22d, %r26d
+# CHECK: encoding: [0x62,0xec,0x2c,0x14,0x4e,0xf2]
+         cfcmovlel	%r18d, %r22d, %r26d
+
+# CHECK: cfcmovlel	%r18d, %r22d
+# CHECK: encoding: [0x62,0xec,0x7c,0x0c,0x4e,0xd6]
+         cfcmovlel	%r18d, %r22d
+
+# CHECK: cfcmovlel	%r18d, 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x8c,0x78,0x0c,0x4e,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovlel	%r18d, 291(%r28,%r29,4)
+
+# CHECK: cfcmovleq	%r19, %r23, %r27
+# CHECK: encoding: [0x62,0xec,0xa4,0x14,0x4e,0xfb]
+         cfcmovleq	%r19, %r23, %r27
+
+# CHECK: cfcmovleq	%r19, %r23
+# CHECK: encoding: [0x62,0xec,0xfc,0x0c,0x4e,0xdf]
+         cfcmovleq	%r19, %r23
+
+# CHECK: cfcmovleq	%r19, 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x8c,0xf8,0x0c,0x4e,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovleq	%r19, 291(%r28,%r29,4)
+
+# CHECK: cfcmovlew	291(%r28,%r29,4), %r17w, %r21w
+# CHECK: encoding: [0x62,0x8c,0x51,0x14,0x4e,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovlew	291(%r28,%r29,4), %r17w, %r21w
+
+# CHECK: cfcmovlew	291(%r28,%r29,4), %r17w
+# CHECK: encoding: [0x62,0x8c,0x79,0x08,0x4e,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovlew	291(%r28,%r29,4), %r17w
+
+# CHECK: cfcmovlel	291(%r28,%r29,4), %r18d, %r22d
+# CHECK: encoding: [0x62,0x8c,0x48,0x14,0x4e,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovlel	291(%r28,%r29,4), %r18d, %r22d
+
+# CHECK: cfcmovlel	291(%r28,%r29,4), %r18d
+# CHECK: encoding: [0x62,0x8c,0x78,0x08,0x4e,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovlel	291(%r28,%r29,4), %r18d
+
+# CHECK: cfcmovleq	291(%r28,%r29,4), %r19, %r23
+# CHECK: encoding: [0x62,0x8c,0xc0,0x14,0x4e,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovleq	291(%r28,%r29,4), %r19, %r23
+
+# CHECK: cfcmovleq	291(%r28,%r29,4), %r19
+# CHECK: encoding: [0x62,0x8c,0xf8,0x08,0x4e,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovleq	291(%r28,%r29,4), %r19
+
+# CHECK: cfcmovaew	%r17w, %r21w, %r25w
+# CHECK: encoding: [0x62,0xec,0x35,0x14,0x43,0xe9]
+         cfcmovaew	%r17w, %r21w, %r25w
+
+# CHECK: cfcmovaew	%r17w, %r21w
+# CHECK: encoding: [0x62,0xec,0x7d,0x0c,0x43,0xcd]
+         cfcmovaew	%r17w, %r21w
+
+# CHECK: cfcmovaew	%r17w, 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x8c,0x79,0x0c,0x43,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovaew	%r17w, 291(%r28,%r29,4)
+
+# CHECK: cfcmovael	%r18d, %r22d, %r26d
+# CHECK: encoding: [0x62,0xec,0x2c,0x14,0x43,0xf2]
+         cfcmovael	%r18d, %r22d, %r26d
+
+# CHECK: cfcmovael	%r18d, %r22d
+# CHECK: encoding: [0x62,0xec,0x7c,0x0c,0x43,0xd6]
+         cfcmovael	%r18d, %r22d
+
+# CHECK: cfcmovael	%r18d, 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x8c,0x78,0x0c,0x43,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovael	%r18d, 291(%r28,%r29,4)
+
+# CHECK: cfcmovaeq	%r19, %r23, %r27
+# CHECK: encoding: [0x62,0xec,0xa4,0x14,0x43,0xfb]
+         cfcmovaeq	%r19, %r23, %r27
+
+# CHECK: cfcmovaeq	%r19, %r23
+# CHECK: encoding: [0x62,0xec,0xfc,0x0c,0x43,0xdf]
+         cfcmovaeq	%r19, %r23
+
+# CHECK: cfcmovaeq	%r19, 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x8c,0xf8,0x0c,0x43,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovaeq	%r19, 291(%r28,%r29,4)
+
+# CHECK: cfcmovaew	291(%r28,%r29,4), %r17w, %r21w
+# CHECK: encoding: [0x62,0x8c,0x51,0x14,0x43,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovaew	291(%r28,%r29,4), %r17w, %r21w
+
+# CHECK: cfcmovaew	291(%r28,%r29,4), %r17w
+# CHECK: encoding: [0x62,0x8c,0x79,0x08,0x43,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovaew	291(%r28,%r29,4), %r17w
+
+# CHECK: cfcmovael	291(%r28,%r29,4), %r18d, %r22d
+# CHECK: encoding: [0x62,0x8c,0x48,0x14,0x43,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovael	291(%r28,%r29,4), %r18d, %r22d
+
+# CHECK: cfcmovael	291(%r28,%r29,4), %r18d
+# CHECK: encoding: [0x62,0x8c,0x78,0x08,0x43,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovael	291(%r28,%r29,4), %r18d
+
+# CHECK: cfcmovaeq	291(%r28,%r29,4), %r19, %r23
+# CHECK: encoding: [0x62,0x8c,0xc0,0x14,0x43,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovaeq	291(%r28,%r29,4), %r19, %r23
+
+# CHECK: cfcmovaeq	291(%r28,%r29,4), %r19
+# CHECK: encoding: [0x62,0x8c,0xf8,0x08,0x43,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovaeq	291(%r28,%r29,4), %r19
+
+# CHECK: cfcmovaw	%r17w, %r21w, %r25w
+# CHECK: encoding: [0x62,0xec,0x35,0x14,0x47,0xe9]
+         cfcmovaw	%r17w, %r21w, %r25w
+
+# CHECK: cfcmovaw	%r17w, %r21w
+# CHECK: encoding: [0x62,0xec,0x7d,0x0c,0x47,0xcd]
+         cfcmovaw	%r17w, %r21w
+
+# CHECK: cfcmovaw	%r17w, 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x8c,0x79,0x0c,0x47,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovaw	%r17w, 291(%r28,%r29,4)
+
+# CHECK: cfcmoval	%r18d, %r22d, %r26d
+# CHECK: encoding: [0x62,0xec,0x2c,0x14,0x47,0xf2]
+         cfcmoval	%r18d, %r22d, %r26d
+
+# CHECK: cfcmoval	%r18d, %r22d
+# CHECK: encoding: [0x62,0xec,0x7c,0x0c,0x47,0xd6]
+         cfcmoval	%r18d, %r22d
+
+# CHECK: cfcmoval	%r18d, 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x8c,0x78,0x0c,0x47,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmoval	%r18d, 291(%r28,%r29,4)
+
+# CHECK: cfcmovaq	%r19, %r23, %r27
+# CHECK: encoding: [0x62,0xec,0xa4,0x14,0x47,0xfb]
+         cfcmovaq	%r19, %r23, %r27
+
+# CHECK: cfcmovaq	%r19, %r23
+# CHECK: encoding: [0x62,0xec,0xfc,0x0c,0x47,0xdf]
+         cfcmovaq	%r19, %r23
+
+# CHECK: cfcmovaq	%r19, 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x8c,0xf8,0x0c,0x47,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovaq	%r19, 291(%r28,%r29,4)
+
+# CHECK: cfcmovaw	291(%r28,%r29,4), %r17w, %r21w
+# CHECK: encoding: [0x62,0x8c,0x51,0x14,0x47,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovaw	291(%r28,%r29,4), %r17w, %r21w
+
+# CHECK: cfcmovaw	291(%r28,%r29,4), %r17w
+# CHECK: encoding: [0x62,0x8c,0x79,0x08,0x47,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovaw	291(%r28,%r29,4), %r17w
+
+# CHECK: cfcmoval	291(%r28,%r29,4), %r18d, %r22d
+# CHECK: encoding: [0x62,0x8c,0x48,0x14,0x47,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmoval	291(%r28,%r29,4), %r18d, %r22d
+
+# CHECK: cfcmoval	291(%r28,%r29,4), %r18d
+# CHECK: encoding: [0x62,0x8c,0x78,0x08,0x47,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmoval	291(%r28,%r29,4), %r18d
+
+# CHECK: cfcmovaq	291(%r28,%r29,4), %r19, %r23
+# CHECK: encoding: [0x62,0x8c,0xc0,0x14,0x47,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovaq	291(%r28,%r29,4), %r19, %r23
+
+# CHECK: cfcmovaq	291(%r28,%r29,4), %r19
+# CHECK: encoding: [0x62,0x8c,0xf8,0x08,0x47,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovaq	291(%r28,%r29,4), %r19
+
+# CHECK: cfcmovgew	%r17w, %r21w, %r25w
+# CHECK: encoding: [0x62,0xec,0x35,0x14,0x4d,0xe9]
+         cfcmovgew	%r17w, %r21w, %r25w
+
+# CHECK: cfcmovgew	%r17w, %r21w
+# CHECK: encoding: [0x62,0xec,0x7d,0x0c,0x4d,0xcd]
+         cfcmovgew	%r17w, %r21w
+
+# CHECK: cfcmovgew	%r17w, 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x8c,0x79,0x0c,0x4d,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovgew	%r17w, 291(%r28,%r29,4)
+
+# CHECK: cfcmovgel	%r18d, %r22d, %r26d
+# CHECK: encoding: [0x62,0xec,0x2c,0x14,0x4d,0xf2]
+         cfcmovgel	%r18d, %r22d, %r26d
+
+# CHECK: cfcmovgel	%r18d, %r22d
+# CHECK: encoding: [0x62,0xec,0x7c,0x0c,0x4d,0xd6]
+         cfcmovgel	%r18d, %r22d
+
+# CHECK: cfcmovgel	%r18d, 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x8c,0x78,0x0c,0x4d,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovgel	%r18d, 291(%r28,%r29,4)
+
+# CHECK: cfcmovgeq	%r19, %r23, %r27
+# CHECK: encoding: [0x62,0xec,0xa4,0x14,0x4d,0xfb]
+         cfcmovgeq	%r19, %r23, %r27
+
+# CHECK: cfcmovgeq	%r19, %r23
+# CHECK: encoding: [0x62,0xec,0xfc,0x0c,0x4d,0xdf]
+         cfcmovgeq	%r19, %r23
+
+# CHECK: cfcmovgeq	%r19, 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x8c,0xf8,0x0c,0x4d,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovgeq	%r19, 291(%r28,%r29,4)
+
+# CHECK: cfcmovgew	291(%r28,%r29,4), %r17w, %r21w
+# CHECK: encoding: [0x62,0x8c,0x51,0x14,0x4d,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovgew	291(%r28,%r29,4), %r17w, %r21w
+
+# CHECK: cfcmovgew	291(%r28,%r29,4), %r17w
+# CHECK: encoding: [0x62,0x8c,0x79,0x08,0x4d,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovgew	291(%r28,%r29,4), %r17w
+
+# CHECK: cfcmovgel	291(%r28,%r29,4), %r18d, %r22d
+# CHECK: encoding: [0x62,0x8c,0x48,0x14,0x4d,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovgel	291(%r28,%r29,4), %r18d, %r22d
+
+# CHECK: cfcmovgel	291(%r28,%r29,4), %r18d
+# CHECK: encoding: [0x62,0x8c,0x78,0x08,0x4d,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovgel	291(%r28,%r29,4), %r18d
+
+# CHECK: cfcmovgeq	291(%r28,%r29,4), %r19, %r23
+# CHECK: encoding: [0x62,0x8c,0xc0,0x14,0x4d,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovgeq	291(%r28,%r29,4), %r19, %r23
+
+# CHECK: cfcmovgeq	291(%r28,%r29,4), %r19
+# CHECK: encoding: [0x62,0x8c,0xf8,0x08,0x4d,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovgeq	291(%r28,%r29,4), %r19
+
+# CHECK: cfcmovnow	%r17w, %r21w, %r25w
+# CHECK: encoding: [0x62,0xec,0x35,0x14,0x41,0xe9]
+         cfcmovnow	%r17w, %r21w, %r25w
+
+# CHECK: cfcmovnow	%r17w, %r21w
+# CHECK: encoding: [0x62,0xec,0x7d,0x0c,0x41,0xcd]
+         cfcmovnow	%r17w, %r21w
+
+# CHECK: cfcmovnow	%r17w, 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x8c,0x79,0x0c,0x41,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovnow	%r17w, 291(%r28,%r29,4)
+
+# CHECK: cfcmovnol	%r18d, %r22d, %r26d
+# CHECK: encoding: [0x62,0xec,0x2c,0x14,0x41,0xf2]
+         cfcmovnol	%r18d, %r22d, %r26d
+
+# CHECK: cfcmovnol	%r18d, %r22d
+# CHECK: encoding: [0x62,0xec,0x7c,0x0c,0x41,0xd6]
+         cfcmovnol	%r18d, %r22d
+
+# CHECK: cfcmovnol	%r18d, 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x8c,0x78,0x0c,0x41,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovnol	%r18d, 291(%r28,%r29,4)
+
+# CHECK: cfcmovnoq	%r19, %r23, %r27
+# CHECK: encoding: [0x62,0xec,0xa4,0x14,0x41,0xfb]
+         cfcmovnoq	%r19, %r23, %r27
+
+# CHECK: cfcmovnoq	%r19, %r23
+# CHECK: encoding: [0x62,0xec,0xfc,0x0c,0x41,0xdf]
+         cfcmovnoq	%r19, %r23
+
+# CHECK: cfcmovnoq	%r19, 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x8c,0xf8,0x0c,0x41,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovnoq	%r19, 291(%r28,%r29,4)
+
+# CHECK: cfcmovnow	291(%r28,%r29,4), %r17w, %r21w
+# CHECK: encoding: [0x62,0x8c,0x51,0x14,0x41,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovnow	291(%r28,%r29,4), %r17w, %r21w
+
+# CHECK: cfcmovnow	291(%r28,%r29,4), %r17w
+# CHECK: encoding: [0x62,0x8c,0x79,0x08,0x41,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovnow	291(%r28,%r29,4), %r17w
+
+# CHECK: cfcmovnol	291(%r28,%r29,4), %r18d, %r22d
+# CHECK: encoding: [0x62,0x8c,0x48,0x14,0x41,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovnol	291(%r28,%r29,4), %r18d, %r22d
+
+# CHECK: cfcmovnol	291(%r28,%r29,4), %r18d
+# CHECK: encoding: [0x62,0x8c,0x78,0x08,0x41,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovnol	291(%r28,%r29,4), %r18d
+
+# CHECK: cfcmovnoq	291(%r28,%r29,4), %r19, %r23
+# CHECK: encoding: [0x62,0x8c,0xc0,0x14,0x41,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovnoq	291(%r28,%r29,4), %r19, %r23
+
+# CHECK: cfcmovnoq	291(%r28,%r29,4), %r19
+# CHECK: encoding: [0x62,0x8c,0xf8,0x08,0x41,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovnoq	291(%r28,%r29,4), %r19
+
+# CHECK: cfcmovnpw	%r17w, %r21w, %r25w
+# CHECK: encoding: [0x62,0xec,0x35,0x14,0x4b,0xe9]
+         cfcmovnpw	%r17w, %r21w, %r25w
+
+# CHECK: cfcmovnpw	%r17w, %r21w
+# CHECK: encoding: [0x62,0xec,0x7d,0x0c,0x4b,0xcd]
+         cfcmovnpw	%r17w, %r21w
+
+# CHECK: cfcmovnpw	%r17w, 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x8c,0x79,0x0c,0x4b,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovnpw	%r17w, 291(%r28,%r29,4)
+
+# CHECK: cfcmovnpl	%r18d, %r22d, %r26d
+# CHECK: encoding: [0x62,0xec,0x2c,0x14,0x4b,0xf2]
+         cfcmovnpl	%r18d, %r22d, %r26d
+
+# CHECK: cfcmovnpl	%r18d, %r22d
+# CHECK: encoding: [0x62,0xec,0x7c,0x0c,0x4b,0xd6]
+         cfcmovnpl	%r18d, %r22d
+
+# CHECK: cfcmovnpl	%r18d, 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x8c,0x78,0x0c,0x4b,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovnpl	%r18d, 291(%r28,%r29,4)
+
+# CHECK: cfcmovnpq	%r19, %r23, %r27
+# CHECK: encoding: [0x62,0xec,0xa4,0x14,0x4b,0xfb]
+         cfcmovnpq	%r19, %r23, %r27
+
+# CHECK: cfcmovnpq	%r19, %r23
+# CHECK: encoding: [0x62,0xec,0xfc,0x0c,0x4b,0xdf]
+         cfcmovnpq	%r19, %r23
+
+# CHECK: cfcmovnpq	%r19, 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x8c,0xf8,0x0c,0x4b,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovnpq	%r19, 291(%r28,%r29,4)
+
+# CHECK: cfcmovnpw	291(%r28,%r29,4), %r17w, %r21w
+# CHECK: encoding: [0x62,0x8c,0x51,0x14,0x4b,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovnpw	291(%r28,%r29,4), %r17w, %r21w
+
+# CHECK: cfcmovnpw	291(%r28,%r29,4), %r17w
+# CHECK: encoding: [0x62,0x8c,0x79,0x08,0x4b,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovnpw	291(%r28,%r29,4), %r17w
+
+# CHECK: cfcmovnpl	291(%r28,%r29,4), %r18d, %r22d
+# CHECK: encoding: [0x62,0x8c,0x48,0x14,0x4b,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovnpl	291(%r28,%r29,4), %r18d, %r22d
+
+# CHECK: cfcmovnpl	291(%r28,%r29,4), %r18d
+# CHECK: encoding: [0x62,0x8c,0x78,0x08,0x4b,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovnpl	291(%r28,%r29,4), %r18d
+
+# CHECK: cfcmovnpq	291(%r28,%r29,4), %r19, %r23
+# CHECK: encoding: [0x62,0x8c,0xc0,0x14,0x4b,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovnpq	291(%r28,%r29,4), %r19, %r23
+
+# CHECK: cfcmovnpq	291(%r28,%r29,4), %r19
+# CHECK: encoding: [0x62,0x8c,0xf8,0x08,0x4b,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovnpq	291(%r28,%r29,4), %r19
+
+# CHECK: cfcmovnsw	%r17w, %r21w, %r25w
+# CHECK: encoding: [0x62,0xec,0x35,0x14,0x49,0xe9]
+         cfcmovnsw	%r17w, %r21w, %r25w
+
+# CHECK: cfcmovnsw	%r17w, %r21w
+# CHECK: encoding: [0x62,0xec,0x7d,0x0c,0x49,0xcd]
+         cfcmovnsw	%r17w, %r21w
+
+# CHECK: cfcmovnsw	%r17w, 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x8c,0x79,0x0c,0x49,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovnsw	%r17w, 291(%r28,%r29,4)
+
+# CHECK: cfcmovnsl	%r18d, %r22d, %r26d
+# CHECK: encoding: [0x62,0xec,0x2c,0x14,0x49,0xf2]
+         cfcmovnsl	%r18d, %r22d, %r26d
+
+# CHECK: cfcmovnsl	%r18d, %r22d
+# CHECK: encoding: [0x62,0xec,0x7c,0x0c,0x49,0xd6]
+         cfcmovnsl	%r18d, %r22d
+
+# CHECK: cfcmovnsl	%r18d, 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x8c,0x78,0x0c,0x49,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovnsl	%r18d, 291(%r28,%r29,4)
+
+# CHECK: cfcmovnsq	%r19, %r23, %r27
+# CHECK: encoding: [0x62,0xec,0xa4,0x14,0x49,0xfb]
+         cfcmovnsq	%r19, %r23, %r27
+
+# CHECK: cfcmovnsq	%r19, %r23
+# CHECK: encoding: [0x62,0xec,0xfc,0x0c,0x49,0xdf]
+         cfcmovnsq	%r19, %r23
+
+# CHECK: cfcmovnsq	%r19, 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x8c,0xf8,0x0c,0x49,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovnsq	%r19, 291(%r28,%r29,4)
+
+# CHECK: cfcmovnsw	291(%r28,%r29,4), %r17w, %r21w
+# CHECK: encoding: [0x62,0x8c,0x51,0x14,0x49,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovnsw	291(%r28,%r29,4), %r17w, %r21w
+
+# CHECK: cfcmovnsw	291(%r28,%r29,4), %r17w
+# CHECK: encoding: [0x62,0x8c,0x79,0x08,0x49,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovnsw	291(%r28,%r29,4), %r17w
+
+# CHECK: cfcmovnsl	291(%r28,%r29,4), %r18d, %r22d
+# CHECK: encoding: [0x62,0x8c,0x48,0x14,0x49,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovnsl	291(%r28,%r29,4), %r18d, %r22d
+
+# CHECK: cfcmovnsl	291(%r28,%r29,4), %r18d
+# CHECK: encoding: [0x62,0x8c,0x78,0x08,0x49,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovnsl	291(%r28,%r29,4), %r18d
+
+# CHECK: cfcmovnsq	291(%r28,%r29,4), %r19, %r23
+# CHECK: encoding: [0x62,0x8c,0xc0,0x14,0x49,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovnsq	291(%r28,%r29,4), %r19, %r23
+
+# CHECK: cfcmovnsq	291(%r28,%r29,4), %r19
+# CHECK: encoding: [0x62,0x8c,0xf8,0x08,0x49,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovnsq	291(%r28,%r29,4), %r19
+
+# CHECK: cfcmovnew	%r17w, %r21w, %r25w
+# CHECK: encoding: [0x62,0xec,0x35,0x14,0x45,0xe9]
+         cfcmovnew	%r17w, %r21w, %r25w
+
+# CHECK: cfcmovnew	%r17w, %r21w
+# CHECK: encoding: [0x62,0xec,0x7d,0x0c,0x45,0xcd]
+         cfcmovnew	%r17w, %r21w
+
+# CHECK: cfcmovnew	%r17w, 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x8c,0x79,0x0c,0x45,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovnew	%r17w, 291(%r28,%r29,4)
+
+# CHECK: cfcmovnel	%r18d, %r22d, %r26d
+# CHECK: encoding: [0x62,0xec,0x2c,0x14,0x45,0xf2]
+         cfcmovnel	%r18d, %r22d, %r26d
+
+# CHECK: cfcmovnel	%r18d, %r22d
+# CHECK: encoding: [0x62,0xec,0x7c,0x0c,0x45,0xd6]
+         cfcmovnel	%r18d, %r22d
+
+# CHECK: cfcmovnel	%r18d, 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x8c,0x78,0x0c,0x45,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovnel	%r18d, 291(%r28,%r29,4)
+
+# CHECK: cfcmovneq	%r19, %r23, %r27
+# CHECK: encoding: [0x62,0xec,0xa4,0x14,0x45,0xfb]
+         cfcmovneq	%r19, %r23, %r27
+
+# CHECK: cfcmovneq	%r19, %r23
+# CHECK: encoding: [0x62,0xec,0xfc,0x0c,0x45,0xdf]
+         cfcmovneq	%r19, %r23
+
+# CHECK: cfcmovneq	%r19, 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x8c,0xf8,0x0c,0x45,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovneq	%r19, 291(%r28,%r29,4)
+
+# CHECK: cfcmovnew	291(%r28,%r29,4), %r17w, %r21w
+# CHECK: encoding: [0x62,0x8c,0x51,0x14,0x45,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovnew	291(%r28,%r29,4), %r17w, %r21w
+
+# CHECK: cfcmovnew	291(%r28,%r29,4), %r17w
+# CHECK: encoding: [0x62,0x8c,0x79,0x08,0x45,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovnew	291(%r28,%r29,4), %r17w
+
+# CHECK: cfcmovnel	291(%r28,%r29,4), %r18d, %r22d
+# CHECK: encoding: [0x62,0x8c,0x48,0x14,0x45,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovnel	291(%r28,%r29,4), %r18d, %r22d
+
+# CHECK: cfcmovnel	291(%r28,%r29,4), %r18d
+# CHECK: encoding: [0x62,0x8c,0x78,0x08,0x45,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovnel	291(%r28,%r29,4), %r18d
+
+# CHECK: cfcmovneq	291(%r28,%r29,4), %r19, %r23
+# CHECK: encoding: [0x62,0x8c,0xc0,0x14,0x45,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovneq	291(%r28,%r29,4), %r19, %r23
+
+# CHECK: cfcmovneq	291(%r28,%r29,4), %r19
+# CHECK: encoding: [0x62,0x8c,0xf8,0x08,0x45,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovneq	291(%r28,%r29,4), %r19
+
+# CHECK: cfcmovpw	%r17w, %r21w, %r25w
+# CHECK: encoding: [0x62,0xec,0x35,0x14,0x4a,0xe9]
+         cfcmovpw	%r17w, %r21w, %r25w
+
+# CHECK: cfcmovpw	%r17w, %r21w
+# CHECK: encoding: [0x62,0xec,0x7d,0x0c,0x4a,0xcd]
+         cfcmovpw	%r17w, %r21w
+
+# CHECK: cfcmovpw	%r17w, 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x8c,0x79,0x0c,0x4a,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovpw	%r17w, 291(%r28,%r29,4)
+
+# CHECK: cfcmovpl	%r18d, %r22d, %r26d
+# CHECK: encoding: [0x62,0xec,0x2c,0x14,0x4a,0xf2]
+         cfcmovpl	%r18d, %r22d, %r26d
+
+# CHECK: cfcmovpl	%r18d, %r22d
+# CHECK: encoding: [0x62,0xec,0x7c,0x0c,0x4a,0xd6]
+         cfcmovpl	%r18d, %r22d
+
+# CHECK: cfcmovpl	%r18d, 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x8c,0x78,0x0c,0x4a,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovpl	%r18d, 291(%r28,%r29,4)
+
+# CHECK: cfcmovpq	%r19, %r23, %r27
+# CHECK: encoding: [0x62,0xec,0xa4,0x14,0x4a,0xfb]
+         cfcmovpq	%r19, %r23, %r27
+
+# CHECK: cfcmovpq	%r19, %r23
+# CHECK: encoding: [0x62,0xec,0xfc,0x0c,0x4a,0xdf]
+         cfcmovpq	%r19, %r23
+
+# CHECK: cfcmovpq	%r19, 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x8c,0xf8,0x0c,0x4a,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovpq	%r19, 291(%r28,%r29,4)
+
+# CHECK: cfcmovpw	291(%r28,%r29,4), %r17w, %r21w
+# CHECK: encoding: [0x62,0x8c,0x51,0x14,0x4a,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovpw	291(%r28,%r29,4), %r17w, %r21w
+
+# CHECK: cfcmovpw	291(%r28,%r29,4), %r17w
+# CHECK: encoding: [0x62,0x8c,0x79,0x08,0x4a,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovpw	291(%r28,%r29,4), %r17w
+
+# CHECK: cfcmovpl	291(%r28,%r29,4), %r18d, %r22d
+# CHECK: encoding: [0x62,0x8c,0x48,0x14,0x4a,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovpl	291(%r28,%r29,4), %r18d, %r22d
+
+# CHECK: cfcmovpl	291(%r28,%r29,4), %r18d
+# CHECK: encoding: [0x62,0x8c,0x78,0x08,0x4a,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovpl	291(%r28,%r29,4), %r18d
+
+# CHECK: cfcmovpq	291(%r28,%r29,4), %r19, %r23
+# CHECK: encoding: [0x62,0x8c,0xc0,0x14,0x4a,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovpq	291(%r28,%r29,4), %r19, %r23
+
+# CHECK: cfcmovpq	291(%r28,%r29,4), %r19
+# CHECK: encoding: [0x62,0x8c,0xf8,0x08,0x4a,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovpq	291(%r28,%r29,4), %r19
+
+# CHECK: cfcmovsw	%r17w, %r21w, %r25w
+# CHECK: encoding: [0x62,0xec,0x35,0x14,0x48,0xe9]
+         cfcmovsw	%r17w, %r21w, %r25w
+
+# CHECK: cfcmovsw	%r17w, %r21w
+# CHECK: encoding: [0x62,0xec,0x7d,0x0c,0x48,0xcd]
+         cfcmovsw	%r17w, %r21w
+
+# CHECK: cfcmovsw	%r17w, 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x8c,0x79,0x0c,0x48,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovsw	%r17w, 291(%r28,%r29,4)
+
+# CHECK: cfcmovsl	%r18d, %r22d, %r26d
+# CHECK: encoding: [0x62,0xec,0x2c,0x14,0x48,0xf2]
+         cfcmovsl	%r18d, %r22d, %r26d
+
+# CHECK: cfcmovsl	%r18d, %r22d
+# CHECK: encoding: [0x62,0xec,0x7c,0x0c,0x48,0xd6]
+         cfcmovsl	%r18d, %r22d
+
+# CHECK: cfcmovsl	%r18d, 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x8c,0x78,0x0c,0x48,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovsl	%r18d, 291(%r28,%r29,4)
+
+# CHECK: cfcmovsq	%r19, %r23, %r27
+# CHECK: encoding: [0x62,0xec,0xa4,0x14,0x48,0xfb]
+         cfcmovsq	%r19, %r23, %r27
+
+# CHECK: cfcmovsq	%r19, %r23
+# CHECK: encoding: [0x62,0xec,0xfc,0x0c,0x48,0xdf]
+         cfcmovsq	%r19, %r23
+
+# CHECK: cfcmovsq	%r19, 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x8c,0xf8,0x0c,0x48,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovsq	%r19, 291(%r28,%r29,4)
+
+# CHECK: cfcmovsw	291(%r28,%r29,4), %r17w, %r21w
+# CHECK: encoding: [0x62,0x8c,0x51,0x14,0x48,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovsw	291(%r28,%r29,4), %r17w, %r21w
+
+# CHECK: cfcmovsw	291(%r28,%r29,4), %r17w
+# CHECK: encoding: [0x62,0x8c,0x79,0x08,0x48,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovsw	291(%r28,%r29,4), %r17w
+
+# CHECK: cfcmovsl	291(%r28,%r29,4), %r18d, %r22d
+# CHECK: encoding: [0x62,0x8c,0x48,0x14,0x48,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovsl	291(%r28,%r29,4), %r18d, %r22d
+
+# CHECK: cfcmovsl	291(%r28,%r29,4), %r18d
+# CHECK: encoding: [0x62,0x8c,0x78,0x08,0x48,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovsl	291(%r28,%r29,4), %r18d
+
+# CHECK: cfcmovsq	291(%r28,%r29,4), %r19, %r23
+# CHECK: encoding: [0x62,0x8c,0xc0,0x14,0x48,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovsq	291(%r28,%r29,4), %r19, %r23
+
+# CHECK: cfcmovsq	291(%r28,%r29,4), %r19
+# CHECK: encoding: [0x62,0x8c,0xf8,0x08,0x48,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovsq	291(%r28,%r29,4), %r19
+
+# CHECK: cfcmovew	%r17w, %r21w, %r25w
+# CHECK: encoding: [0x62,0xec,0x35,0x14,0x44,0xe9]
+         cfcmovew	%r17w, %r21w, %r25w
+
+# CHECK: cfcmovew	%r17w, %r21w
+# CHECK: encoding: [0x62,0xec,0x7d,0x0c,0x44,0xcd]
+         cfcmovew	%r17w, %r21w
+
+# CHECK: cfcmovew	%r17w, 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x8c,0x79,0x0c,0x44,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovew	%r17w, 291(%r28,%r29,4)
+
+# CHECK: cfcmovel	%r18d, %r22d, %r26d
+# CHECK: encoding: [0x62,0xec,0x2c,0x14,0x44,0xf2]
+         cfcmovel	%r18d, %r22d, %r26d
+
+# CHECK: cfcmovel	%r18d, %r22d
+# CHECK: encoding: [0x62,0xec,0x7c,0x0c,0x44,0xd6]
+         cfcmovel	%r18d, %r22d
+
+# CHECK: cfcmovel	%r18d, 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x8c,0x78,0x0c,0x44,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovel	%r18d, 291(%r28,%r29,4)
+
+# CHECK: cfcmoveq	%r19, %r23, %r27
+# CHECK: encoding: [0x62,0xec,0xa4,0x14,0x44,0xfb]
+         cfcmoveq	%r19, %r23, %r27
+
+# CHECK: cfcmoveq	%r19, %r23
+# CHECK: encoding: [0x62,0xec,0xfc,0x0c,0x44,0xdf]
+         cfcmoveq	%r19, %r23
+
+# CHECK: cfcmoveq	%r19, 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x8c,0xf8,0x0c,0x44,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmoveq	%r19, 291(%r28,%r29,4)
+
+# CHECK: cfcmovew	291(%r28,%r29,4), %r17w, %r21w
+# CHECK: encoding: [0x62,0x8c,0x51,0x14,0x44,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovew	291(%r28,%r29,4), %r17w, %r21w
+
+# CHECK: cfcmovew	291(%r28,%r29,4), %r17w
+# CHECK: encoding: [0x62,0x8c,0x79,0x08,0x44,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovew	291(%r28,%r29,4), %r17w
+
+# CHECK: cfcmovel	291(%r28,%r29,4), %r18d, %r22d
+# CHECK: encoding: [0x62,0x8c,0x48,0x14,0x44,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovel	291(%r28,%r29,4), %r18d, %r22d
+
+# CHECK: cfcmovel	291(%r28,%r29,4), %r18d
+# CHECK: encoding: [0x62,0x8c,0x78,0x08,0x44,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovel	291(%r28,%r29,4), %r18d
+
+# CHECK: cfcmoveq	291(%r28,%r29,4), %r19, %r23
+# CHECK: encoding: [0x62,0x8c,0xc0,0x14,0x44,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmoveq	291(%r28,%r29,4), %r19, %r23
+
+# CHECK: cfcmoveq	291(%r28,%r29,4), %r19
+# CHECK: encoding: [0x62,0x8c,0xf8,0x08,0x44,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmoveq	291(%r28,%r29,4), %r19
diff --git a/llvm/test/MC/X86/apx/cfcmov-intel.s b/llvm/test/MC/X86/apx/cfcmov-intel.s
new file mode 100644
index 00000000000000..58b145d7fef2e5
--- /dev/null
+++ b/llvm/test/MC/X86/apx/cfcmov-intel.s
@@ -0,0 +1,841 @@
+# RUN: llvm-mc -triple x86_64 -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s
+
+# CHECK: cfcmovb	r25w, r21w, r17w
+# CHECK: encoding: [0x62,0xec,0x35,0x14,0x42,0xe9]
+         cfcmovb	r25w, r21w, r17w
+
+# CHECK: cfcmovb	r21w, r17w
+# CHECK: encoding: [0x62,0xec,0x7d,0x0c,0x42,0xcd]
+         cfcmovb	r21w, r17w
+
+# CHECK: cfcmovb	word ptr [r28 + 4*r29 + 291], r17w
+# CHECK: encoding: [0x62,0x8c,0x79,0x0c,0x42,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovb	word ptr [r28 + 4*r29 + 291], r17w
+
+# CHECK: cfcmovb	r26d, r22d, r18d
+# CHECK: encoding: [0x62,0xec,0x2c,0x14,0x42,0xf2]
+         cfcmovb	r26d, r22d, r18d
+
+# CHECK: cfcmovb	r22d, r18d
+# CHECK: encoding: [0x62,0xec,0x7c,0x0c,0x42,0xd6]
+         cfcmovb	r22d, r18d
+
+# CHECK: cfcmovb	dword ptr [r28 + 4*r29 + 291], r18d
+# CHECK: encoding: [0x62,0x8c,0x78,0x0c,0x42,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovb	dword ptr [r28 + 4*r29 + 291], r18d
+
+# CHECK: cfcmovb	r27, r23, r19
+# CHECK: encoding: [0x62,0xec,0xa4,0x14,0x42,0xfb]
+         cfcmovb	r27, r23, r19
+
+# CHECK: cfcmovb	r23, r19
+# CHECK: encoding: [0x62,0xec,0xfc,0x0c,0x42,0xdf]
+         cfcmovb	r23, r19
+
+# CHECK: cfcmovb	qword ptr [r28 + 4*r29 + 291], r19
+# CHECK: encoding: [0x62,0x8c,0xf8,0x0c,0x42,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovb	qword ptr [r28 + 4*r29 + 291], r19
+
+# CHECK: cfcmovb	r21w, r17w, word ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0x51,0x14,0x42,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovb	r21w, r17w, word ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovb	r17w, word ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0x79,0x08,0x42,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovb	r17w, word ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovb	r22d, r18d, dword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0x48,0x14,0x42,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovb	r22d, r18d, dword ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovb	r18d, dword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0x78,0x08,0x42,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovb	r18d, dword ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovb	r23, r19, qword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0xc0,0x14,0x42,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovb	r23, r19, qword ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovb	r19, qword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0xf8,0x08,0x42,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovb	r19, qword ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovbe	r25w, r21w, r17w
+# CHECK: encoding: [0x62,0xec,0x35,0x14,0x46,0xe9]
+         cfcmovbe	r25w, r21w, r17w
+
+# CHECK: cfcmovbe	r21w, r17w
+# CHECK: encoding: [0x62,0xec,0x7d,0x0c,0x46,0xcd]
+         cfcmovbe	r21w, r17w
+
+# CHECK: cfcmovbe	word ptr [r28 + 4*r29 + 291], r17w
+# CHECK: encoding: [0x62,0x8c,0x79,0x0c,0x46,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovbe	word ptr [r28 + 4*r29 + 291], r17w
+
+# CHECK: cfcmovbe	r26d, r22d, r18d
+# CHECK: encoding: [0x62,0xec,0x2c,0x14,0x46,0xf2]
+         cfcmovbe	r26d, r22d, r18d
+
+# CHECK: cfcmovbe	r22d, r18d
+# CHECK: encoding: [0x62,0xec,0x7c,0x0c,0x46,0xd6]
+         cfcmovbe	r22d, r18d
+
+# CHECK: cfcmovbe	dword ptr [r28 + 4*r29 + 291], r18d
+# CHECK: encoding: [0x62,0x8c,0x78,0x0c,0x46,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovbe	dword ptr [r28 + 4*r29 + 291], r18d
+
+# CHECK: cfcmovbe	r27, r23, r19
+# CHECK: encoding: [0x62,0xec,0xa4,0x14,0x46,0xfb]
+         cfcmovbe	r27, r23, r19
+
+# CHECK: cfcmovbe	r23, r19
+# CHECK: encoding: [0x62,0xec,0xfc,0x0c,0x46,0xdf]
+         cfcmovbe	r23, r19
+
+# CHECK: cfcmovbe	qword ptr [r28 + 4*r29 + 291], r19
+# CHECK: encoding: [0x62,0x8c,0xf8,0x0c,0x46,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovbe	qword ptr [r28 + 4*r29 + 291], r19
+
+# CHECK: cfcmovbe	r21w, r17w, word ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0x51,0x14,0x46,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovbe	r21w, r17w, word ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovbe	r17w, word ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0x79,0x08,0x46,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovbe	r17w, word ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovbe	r22d, r18d, dword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0x48,0x14,0x46,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovbe	r22d, r18d, dword ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovbe	r18d, dword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0x78,0x08,0x46,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovbe	r18d, dword ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovbe	r23, r19, qword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0xc0,0x14,0x46,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovbe	r23, r19, qword ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovbe	r19, qword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0xf8,0x08,0x46,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovbe	r19, qword ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovl	r25w, r21w, r17w
+# CHECK: encoding: [0x62,0xec,0x35,0x14,0x4c,0xe9]
+         cfcmovl	r25w, r21w, r17w
+
+# CHECK: cfcmovl	r21w, r17w
+# CHECK: encoding: [0x62,0xec,0x7d,0x0c,0x4c,0xcd]
+         cfcmovl	r21w, r17w
+
+# CHECK: cfcmovl	word ptr [r28 + 4*r29 + 291], r17w
+# CHECK: encoding: [0x62,0x8c,0x79,0x0c,0x4c,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovl	word ptr [r28 + 4*r29 + 291], r17w
+
+# CHECK: cfcmovl	r26d, r22d, r18d
+# CHECK: encoding: [0x62,0xec,0x2c,0x14,0x4c,0xf2]
+         cfcmovl	r26d, r22d, r18d
+
+# CHECK: cfcmovl	r22d, r18d
+# CHECK: encoding: [0x62,0xec,0x7c,0x0c,0x4c,0xd6]
+         cfcmovl	r22d, r18d
+
+# CHECK: cfcmovl	dword ptr [r28 + 4*r29 + 291], r18d
+# CHECK: encoding: [0x62,0x8c,0x78,0x0c,0x4c,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovl	dword ptr [r28 + 4*r29 + 291], r18d
+
+# CHECK: cfcmovl	r27, r23, r19
+# CHECK: encoding: [0x62,0xec,0xa4,0x14,0x4c,0xfb]
+         cfcmovl	r27, r23, r19
+
+# CHECK: cfcmovl	r23, r19
+# CHECK: encoding: [0x62,0xec,0xfc,0x0c,0x4c,0xdf]
+         cfcmovl	r23, r19
+
+# CHECK: cfcmovl	qword ptr [r28 + 4*r29 + 291], r19
+# CHECK: encoding: [0x62,0x8c,0xf8,0x0c,0x4c,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovl	qword ptr [r28 + 4*r29 + 291], r19
+
+# CHECK: cfcmovl	r21w, r17w, word ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0x51,0x14,0x4c,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovl	r21w, r17w, word ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovl	r17w, word ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0x79,0x08,0x4c,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovl	r17w, word ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovl	r22d, r18d, dword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0x48,0x14,0x4c,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovl	r22d, r18d, dword ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovl	r18d, dword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0x78,0x08,0x4c,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovl	r18d, dword ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovl	r23, r19, qword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0xc0,0x14,0x4c,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovl	r23, r19, qword ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovl	r19, qword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0xf8,0x08,0x4c,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovl	r19, qword ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovle	r25w, r21w, r17w
+# CHECK: encoding: [0x62,0xec,0x35,0x14,0x4e,0xe9]
+         cfcmovle	r25w, r21w, r17w
+
+# CHECK: cfcmovle	r21w, r17w
+# CHECK: encoding: [0x62,0xec,0x7d,0x0c,0x4e,0xcd]
+         cfcmovle	r21w, r17w
+
+# CHECK: cfcmovle	word ptr [r28 + 4*r29 + 291], r17w
+# CHECK: encoding: [0x62,0x8c,0x79,0x0c,0x4e,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovle	word ptr [r28 + 4*r29 + 291], r17w
+
+# CHECK: cfcmovle	r26d, r22d, r18d
+# CHECK: encoding: [0x62,0xec,0x2c,0x14,0x4e,0xf2]
+         cfcmovle	r26d, r22d, r18d
+
+# CHECK: cfcmovle	r22d, r18d
+# CHECK: encoding: [0x62,0xec,0x7c,0x0c,0x4e,0xd6]
+         cfcmovle	r22d, r18d
+
+# CHECK: cfcmovle	dword ptr [r28 + 4*r29 + 291], r18d
+# CHECK: encoding: [0x62,0x8c,0x78,0x0c,0x4e,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovle	dword ptr [r28 + 4*r29 + 291], r18d
+
+# CHECK: cfcmovle	r27, r23, r19
+# CHECK: encoding: [0x62,0xec,0xa4,0x14,0x4e,0xfb]
+         cfcmovle	r27, r23, r19
+
+# CHECK: cfcmovle	r23, r19
+# CHECK: encoding: [0x62,0xec,0xfc,0x0c,0x4e,0xdf]
+         cfcmovle	r23, r19
+
+# CHECK: cfcmovle	qword ptr [r28 + 4*r29 + 291], r19
+# CHECK: encoding: [0x62,0x8c,0xf8,0x0c,0x4e,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovle	qword ptr [r28 + 4*r29 + 291], r19
+
+# CHECK: cfcmovle	r21w, r17w, word ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0x51,0x14,0x4e,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovle	r21w, r17w, word ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovle	r17w, word ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0x79,0x08,0x4e,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovle	r17w, word ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovle	r22d, r18d, dword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0x48,0x14,0x4e,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovle	r22d, r18d, dword ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovle	r18d, dword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0x78,0x08,0x4e,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovle	r18d, dword ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovle	r23, r19, qword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0xc0,0x14,0x4e,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovle	r23, r19, qword ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovle	r19, qword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0xf8,0x08,0x4e,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovle	r19, qword ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovae	r25w, r21w, r17w
+# CHECK: encoding: [0x62,0xec,0x35,0x14,0x43,0xe9]
+         cfcmovae	r25w, r21w, r17w
+
+# CHECK: cfcmovae	r21w, r17w
+# CHECK: encoding: [0x62,0xec,0x7d,0x0c,0x43,0xcd]
+         cfcmovae	r21w, r17w
+
+# CHECK: cfcmovae	word ptr [r28 + 4*r29 + 291], r17w
+# CHECK: encoding: [0x62,0x8c,0x79,0x0c,0x43,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovae	word ptr [r28 + 4*r29 + 291], r17w
+
+# CHECK: cfcmovae	r26d, r22d, r18d
+# CHECK: encoding: [0x62,0xec,0x2c,0x14,0x43,0xf2]
+         cfcmovae	r26d, r22d, r18d
+
+# CHECK: cfcmovae	r22d, r18d
+# CHECK: encoding: [0x62,0xec,0x7c,0x0c,0x43,0xd6]
+         cfcmovae	r22d, r18d
+
+# CHECK: cfcmovae	dword ptr [r28 + 4*r29 + 291], r18d
+# CHECK: encoding: [0x62,0x8c,0x78,0x0c,0x43,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovae	dword ptr [r28 + 4*r29 + 291], r18d
+
+# CHECK: cfcmovae	r27, r23, r19
+# CHECK: encoding: [0x62,0xec,0xa4,0x14,0x43,0xfb]
+         cfcmovae	r27, r23, r19
+
+# CHECK: cfcmovae	r23, r19
+# CHECK: encoding: [0x62,0xec,0xfc,0x0c,0x43,0xdf]
+         cfcmovae	r23, r19
+
+# CHECK: cfcmovae	qword ptr [r28 + 4*r29 + 291], r19
+# CHECK: encoding: [0x62,0x8c,0xf8,0x0c,0x43,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovae	qword ptr [r28 + 4*r29 + 291], r19
+
+# CHECK: cfcmovae	r21w, r17w, word ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0x51,0x14,0x43,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovae	r21w, r17w, word ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovae	r17w, word ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0x79,0x08,0x43,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovae	r17w, word ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovae	r22d, r18d, dword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0x48,0x14,0x43,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovae	r22d, r18d, dword ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovae	r18d, dword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0x78,0x08,0x43,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovae	r18d, dword ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovae	r23, r19, qword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0xc0,0x14,0x43,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovae	r23, r19, qword ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovae	r19, qword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0xf8,0x08,0x43,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovae	r19, qword ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmova	r25w, r21w, r17w
+# CHECK: encoding: [0x62,0xec,0x35,0x14,0x47,0xe9]
+         cfcmova	r25w, r21w, r17w
+
+# CHECK: cfcmova	r21w, r17w
+# CHECK: encoding: [0x62,0xec,0x7d,0x0c,0x47,0xcd]
+         cfcmova	r21w, r17w
+
+# CHECK: cfcmova	word ptr [r28 + 4*r29 + 291], r17w
+# CHECK: encoding: [0x62,0x8c,0x79,0x0c,0x47,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmova	word ptr [r28 + 4*r29 + 291], r17w
+
+# CHECK: cfcmova	r26d, r22d, r18d
+# CHECK: encoding: [0x62,0xec,0x2c,0x14,0x47,0xf2]
+         cfcmova	r26d, r22d, r18d
+
+# CHECK: cfcmova	r22d, r18d
+# CHECK: encoding: [0x62,0xec,0x7c,0x0c,0x47,0xd6]
+         cfcmova	r22d, r18d
+
+# CHECK: cfcmova	dword ptr [r28 + 4*r29 + 291], r18d
+# CHECK: encoding: [0x62,0x8c,0x78,0x0c,0x47,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmova	dword ptr [r28 + 4*r29 + 291], r18d
+
+# CHECK: cfcmova	r27, r23, r19
+# CHECK: encoding: [0x62,0xec,0xa4,0x14,0x47,0xfb]
+         cfcmova	r27, r23, r19
+
+# CHECK: cfcmova	r23, r19
+# CHECK: encoding: [0x62,0xec,0xfc,0x0c,0x47,0xdf]
+         cfcmova	r23, r19
+
+# CHECK: cfcmova	qword ptr [r28 + 4*r29 + 291], r19
+# CHECK: encoding: [0x62,0x8c,0xf8,0x0c,0x47,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmova	qword ptr [r28 + 4*r29 + 291], r19
+
+# CHECK: cfcmova	r21w, r17w, word ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0x51,0x14,0x47,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmova	r21w, r17w, word ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmova	r17w, word ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0x79,0x08,0x47,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmova	r17w, word ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmova	r22d, r18d, dword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0x48,0x14,0x47,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmova	r22d, r18d, dword ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmova	r18d, dword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0x78,0x08,0x47,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmova	r18d, dword ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmova	r23, r19, qword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0xc0,0x14,0x47,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmova	r23, r19, qword ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmova	r19, qword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0xf8,0x08,0x47,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmova	r19, qword ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovge	r25w, r21w, r17w
+# CHECK: encoding: [0x62,0xec,0x35,0x14,0x4d,0xe9]
+         cfcmovge	r25w, r21w, r17w
+
+# CHECK: cfcmovge	r21w, r17w
+# CHECK: encoding: [0x62,0xec,0x7d,0x0c,0x4d,0xcd]
+         cfcmovge	r21w, r17w
+
+# CHECK: cfcmovge	word ptr [r28 + 4*r29 + 291], r17w
+# CHECK: encoding: [0x62,0x8c,0x79,0x0c,0x4d,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovge	word ptr [r28 + 4*r29 + 291], r17w
+
+# CHECK: cfcmovge	r26d, r22d, r18d
+# CHECK: encoding: [0x62,0xec,0x2c,0x14,0x4d,0xf2]
+         cfcmovge	r26d, r22d, r18d
+
+# CHECK: cfcmovge	r22d, r18d
+# CHECK: encoding: [0x62,0xec,0x7c,0x0c,0x4d,0xd6]
+         cfcmovge	r22d, r18d
+
+# CHECK: cfcmovge	dword ptr [r28 + 4*r29 + 291], r18d
+# CHECK: encoding: [0x62,0x8c,0x78,0x0c,0x4d,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovge	dword ptr [r28 + 4*r29 + 291], r18d
+
+# CHECK: cfcmovge	r27, r23, r19
+# CHECK: encoding: [0x62,0xec,0xa4,0x14,0x4d,0xfb]
+         cfcmovge	r27, r23, r19
+
+# CHECK: cfcmovge	r23, r19
+# CHECK: encoding: [0x62,0xec,0xfc,0x0c,0x4d,0xdf]
+         cfcmovge	r23, r19
+
+# CHECK: cfcmovge	qword ptr [r28 + 4*r29 + 291], r19
+# CHECK: encoding: [0x62,0x8c,0xf8,0x0c,0x4d,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovge	qword ptr [r28 + 4*r29 + 291], r19
+
+# CHECK: cfcmovge	r21w, r17w, word ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0x51,0x14,0x4d,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovge	r21w, r17w, word ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovge	r17w, word ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0x79,0x08,0x4d,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovge	r17w, word ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovge	r22d, r18d, dword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0x48,0x14,0x4d,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovge	r22d, r18d, dword ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovge	r18d, dword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0x78,0x08,0x4d,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovge	r18d, dword ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovge	r23, r19, qword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0xc0,0x14,0x4d,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovge	r23, r19, qword ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovge	r19, qword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0xf8,0x08,0x4d,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovge	r19, qword ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovno	r25w, r21w, r17w
+# CHECK: encoding: [0x62,0xec,0x35,0x14,0x41,0xe9]
+         cfcmovno	r25w, r21w, r17w
+
+# CHECK: cfcmovno	r21w, r17w
+# CHECK: encoding: [0x62,0xec,0x7d,0x0c,0x41,0xcd]
+         cfcmovno	r21w, r17w
+
+# CHECK: cfcmovno	word ptr [r28 + 4*r29 + 291], r17w
+# CHECK: encoding: [0x62,0x8c,0x79,0x0c,0x41,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovno	word ptr [r28 + 4*r29 + 291], r17w
+
+# CHECK: cfcmovno	r26d, r22d, r18d
+# CHECK: encoding: [0x62,0xec,0x2c,0x14,0x41,0xf2]
+         cfcmovno	r26d, r22d, r18d
+
+# CHECK: cfcmovno	r22d, r18d
+# CHECK: encoding: [0x62,0xec,0x7c,0x0c,0x41,0xd6]
+         cfcmovno	r22d, r18d
+
+# CHECK: cfcmovno	dword ptr [r28 + 4*r29 + 291], r18d
+# CHECK: encoding: [0x62,0x8c,0x78,0x0c,0x41,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovno	dword ptr [r28 + 4*r29 + 291], r18d
+
+# CHECK: cfcmovno	r27, r23, r19
+# CHECK: encoding: [0x62,0xec,0xa4,0x14,0x41,0xfb]
+         cfcmovno	r27, r23, r19
+
+# CHECK: cfcmovno	r23, r19
+# CHECK: encoding: [0x62,0xec,0xfc,0x0c,0x41,0xdf]
+         cfcmovno	r23, r19
+
+# CHECK: cfcmovno	qword ptr [r28 + 4*r29 + 291], r19
+# CHECK: encoding: [0x62,0x8c,0xf8,0x0c,0x41,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovno	qword ptr [r28 + 4*r29 + 291], r19
+
+# CHECK: cfcmovno	r21w, r17w, word ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0x51,0x14,0x41,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovno	r21w, r17w, word ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovno	r17w, word ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0x79,0x08,0x41,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovno	r17w, word ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovno	r22d, r18d, dword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0x48,0x14,0x41,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovno	r22d, r18d, dword ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovno	r18d, dword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0x78,0x08,0x41,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovno	r18d, dword ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovno	r23, r19, qword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0xc0,0x14,0x41,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovno	r23, r19, qword ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovno	r19, qword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0xf8,0x08,0x41,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovno	r19, qword ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovnp	r25w, r21w, r17w
+# CHECK: encoding: [0x62,0xec,0x35,0x14,0x4b,0xe9]
+         cfcmovnp	r25w, r21w, r17w
+
+# CHECK: cfcmovnp	r21w, r17w
+# CHECK: encoding: [0x62,0xec,0x7d,0x0c,0x4b,0xcd]
+         cfcmovnp	r21w, r17w
+
+# CHECK: cfcmovnp	word ptr [r28 + 4*r29 + 291], r17w
+# CHECK: encoding: [0x62,0x8c,0x79,0x0c,0x4b,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovnp	word ptr [r28 + 4*r29 + 291], r17w
+
+# CHECK: cfcmovnp	r26d, r22d, r18d
+# CHECK: encoding: [0x62,0xec,0x2c,0x14,0x4b,0xf2]
+         cfcmovnp	r26d, r22d, r18d
+
+# CHECK: cfcmovnp	r22d, r18d
+# CHECK: encoding: [0x62,0xec,0x7c,0x0c,0x4b,0xd6]
+         cfcmovnp	r22d, r18d
+
+# CHECK: cfcmovnp	dword ptr [r28 + 4*r29 + 291], r18d
+# CHECK: encoding: [0x62,0x8c,0x78,0x0c,0x4b,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovnp	dword ptr [r28 + 4*r29 + 291], r18d
+
+# CHECK: cfcmovnp	r27, r23, r19
+# CHECK: encoding: [0x62,0xec,0xa4,0x14,0x4b,0xfb]
+         cfcmovnp	r27, r23, r19
+
+# CHECK: cfcmovnp	r23, r19
+# CHECK: encoding: [0x62,0xec,0xfc,0x0c,0x4b,0xdf]
+         cfcmovnp	r23, r19
+
+# CHECK: cfcmovnp	qword ptr [r28 + 4*r29 + 291], r19
+# CHECK: encoding: [0x62,0x8c,0xf8,0x0c,0x4b,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovnp	qword ptr [r28 + 4*r29 + 291], r19
+
+# CHECK: cfcmovnp	r21w, r17w, word ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0x51,0x14,0x4b,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovnp	r21w, r17w, word ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovnp	r17w, word ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0x79,0x08,0x4b,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovnp	r17w, word ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovnp	r22d, r18d, dword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0x48,0x14,0x4b,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovnp	r22d, r18d, dword ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovnp	r18d, dword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0x78,0x08,0x4b,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovnp	r18d, dword ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovnp	r23, r19, qword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0xc0,0x14,0x4b,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovnp	r23, r19, qword ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovnp	r19, qword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0xf8,0x08,0x4b,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovnp	r19, qword ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovns	r25w, r21w, r17w
+# CHECK: encoding: [0x62,0xec,0x35,0x14,0x49,0xe9]
+         cfcmovns	r25w, r21w, r17w
+
+# CHECK: cfcmovns	r21w, r17w
+# CHECK: encoding: [0x62,0xec,0x7d,0x0c,0x49,0xcd]
+         cfcmovns	r21w, r17w
+
+# CHECK: cfcmovns	word ptr [r28 + 4*r29 + 291], r17w
+# CHECK: encoding: [0x62,0x8c,0x79,0x0c,0x49,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovns	word ptr [r28 + 4*r29 + 291], r17w
+
+# CHECK: cfcmovns	r26d, r22d, r18d
+# CHECK: encoding: [0x62,0xec,0x2c,0x14,0x49,0xf2]
+         cfcmovns	r26d, r22d, r18d
+
+# CHECK: cfcmovns	r22d, r18d
+# CHECK: encoding: [0x62,0xec,0x7c,0x0c,0x49,0xd6]
+         cfcmovns	r22d, r18d
+
+# CHECK: cfcmovns	dword ptr [r28 + 4*r29 + 291], r18d
+# CHECK: encoding: [0x62,0x8c,0x78,0x0c,0x49,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovns	dword ptr [r28 + 4*r29 + 291], r18d
+
+# CHECK: cfcmovns	r27, r23, r19
+# CHECK: encoding: [0x62,0xec,0xa4,0x14,0x49,0xfb]
+         cfcmovns	r27, r23, r19
+
+# CHECK: cfcmovns	r23, r19
+# CHECK: encoding: [0x62,0xec,0xfc,0x0c,0x49,0xdf]
+         cfcmovns	r23, r19
+
+# CHECK: cfcmovns	qword ptr [r28 + 4*r29 + 291], r19
+# CHECK: encoding: [0x62,0x8c,0xf8,0x0c,0x49,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovns	qword ptr [r28 + 4*r29 + 291], r19
+
+# CHECK: cfcmovns	r21w, r17w, word ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0x51,0x14,0x49,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovns	r21w, r17w, word ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovns	r17w, word ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0x79,0x08,0x49,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovns	r17w, word ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovns	r22d, r18d, dword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0x48,0x14,0x49,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovns	r22d, r18d, dword ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovns	r18d, dword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0x78,0x08,0x49,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovns	r18d, dword ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovns	r23, r19, qword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0xc0,0x14,0x49,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovns	r23, r19, qword ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovns	r19, qword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0xf8,0x08,0x49,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovns	r19, qword ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovne	r25w, r21w, r17w
+# CHECK: encoding: [0x62,0xec,0x35,0x14,0x45,0xe9]
+         cfcmovne	r25w, r21w, r17w
+
+# CHECK: cfcmovne	r21w, r17w
+# CHECK: encoding: [0x62,0xec,0x7d,0x0c,0x45,0xcd]
+         cfcmovne	r21w, r17w
+
+# CHECK: cfcmovne	word ptr [r28 + 4*r29 + 291], r17w
+# CHECK: encoding: [0x62,0x8c,0x79,0x0c,0x45,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovne	word ptr [r28 + 4*r29 + 291], r17w
+
+# CHECK: cfcmovne	r26d, r22d, r18d
+# CHECK: encoding: [0x62,0xec,0x2c,0x14,0x45,0xf2]
+         cfcmovne	r26d, r22d, r18d
+
+# CHECK: cfcmovne	r22d, r18d
+# CHECK: encoding: [0x62,0xec,0x7c,0x0c,0x45,0xd6]
+         cfcmovne	r22d, r18d
+
+# CHECK: cfcmovne	dword ptr [r28 + 4*r29 + 291], r18d
+# CHECK: encoding: [0x62,0x8c,0x78,0x0c,0x45,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovne	dword ptr [r28 + 4*r29 + 291], r18d
+
+# CHECK: cfcmovne	r27, r23, r19
+# CHECK: encoding: [0x62,0xec,0xa4,0x14,0x45,0xfb]
+         cfcmovne	r27, r23, r19
+
+# CHECK: cfcmovne	r23, r19
+# CHECK: encoding: [0x62,0xec,0xfc,0x0c,0x45,0xdf]
+         cfcmovne	r23, r19
+
+# CHECK: cfcmovne	qword ptr [r28 + 4*r29 + 291], r19
+# CHECK: encoding: [0x62,0x8c,0xf8,0x0c,0x45,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovne	qword ptr [r28 + 4*r29 + 291], r19
+
+# CHECK: cfcmovne	r21w, r17w, word ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0x51,0x14,0x45,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovne	r21w, r17w, word ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovne	r17w, word ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0x79,0x08,0x45,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovne	r17w, word ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovne	r22d, r18d, dword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0x48,0x14,0x45,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovne	r22d, r18d, dword ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovne	r18d, dword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0x78,0x08,0x45,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovne	r18d, dword ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovne	r23, r19, qword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0xc0,0x14,0x45,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovne	r23, r19, qword ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovne	r19, qword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0xf8,0x08,0x45,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovne	r19, qword ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovp	r25w, r21w, r17w
+# CHECK: encoding: [0x62,0xec,0x35,0x14,0x4a,0xe9]
+         cfcmovp	r25w, r21w, r17w
+
+# CHECK: cfcmovp	r21w, r17w
+# CHECK: encoding: [0x62,0xec,0x7d,0x0c,0x4a,0xcd]
+         cfcmovp	r21w, r17w
+
+# CHECK: cfcmovp	word ptr [r28 + 4*r29 + 291], r17w
+# CHECK: encoding: [0x62,0x8c,0x79,0x0c,0x4a,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovp	word ptr [r28 + 4*r29 + 291], r17w
+
+# CHECK: cfcmovp	r26d, r22d, r18d
+# CHECK: encoding: [0x62,0xec,0x2c,0x14,0x4a,0xf2]
+         cfcmovp	r26d, r22d, r18d
+
+# CHECK: cfcmovp	r22d, r18d
+# CHECK: encoding: [0x62,0xec,0x7c,0x0c,0x4a,0xd6]
+         cfcmovp	r22d, r18d
+
+# CHECK: cfcmovp	dword ptr [r28 + 4*r29 + 291], r18d
+# CHECK: encoding: [0x62,0x8c,0x78,0x0c,0x4a,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovp	dword ptr [r28 + 4*r29 + 291], r18d
+
+# CHECK: cfcmovp	r27, r23, r19
+# CHECK: encoding: [0x62,0xec,0xa4,0x14,0x4a,0xfb]
+         cfcmovp	r27, r23, r19
+
+# CHECK: cfcmovp	r23, r19
+# CHECK: encoding: [0x62,0xec,0xfc,0x0c,0x4a,0xdf]
+         cfcmovp	r23, r19
+
+# CHECK: cfcmovp	qword ptr [r28 + 4*r29 + 291], r19
+# CHECK: encoding: [0x62,0x8c,0xf8,0x0c,0x4a,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovp	qword ptr [r28 + 4*r29 + 291], r19
+
+# CHECK: cfcmovp	r21w, r17w, word ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0x51,0x14,0x4a,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovp	r21w, r17w, word ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovp	r17w, word ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0x79,0x08,0x4a,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovp	r17w, word ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovp	r22d, r18d, dword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0x48,0x14,0x4a,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovp	r22d, r18d, dword ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovp	r18d, dword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0x78,0x08,0x4a,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovp	r18d, dword ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovp	r23, r19, qword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0xc0,0x14,0x4a,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovp	r23, r19, qword ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovp	r19, qword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0xf8,0x08,0x4a,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovp	r19, qword ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovs	r25w, r21w, r17w
+# CHECK: encoding: [0x62,0xec,0x35,0x14,0x48,0xe9]
+         cfcmovs	r25w, r21w, r17w
+
+# CHECK: cfcmovs	r21w, r17w
+# CHECK: encoding: [0x62,0xec,0x7d,0x0c,0x48,0xcd]
+         cfcmovs	r21w, r17w
+
+# CHECK: cfcmovs	word ptr [r28 + 4*r29 + 291], r17w
+# CHECK: encoding: [0x62,0x8c,0x79,0x0c,0x48,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovs	word ptr [r28 + 4*r29 + 291], r17w
+
+# CHECK: cfcmovs	r26d, r22d, r18d
+# CHECK: encoding: [0x62,0xec,0x2c,0x14,0x48,0xf2]
+         cfcmovs	r26d, r22d, r18d
+
+# CHECK: cfcmovs	r22d, r18d
+# CHECK: encoding: [0x62,0xec,0x7c,0x0c,0x48,0xd6]
+         cfcmovs	r22d, r18d
+
+# CHECK: cfcmovs	dword ptr [r28 + 4*r29 + 291], r18d
+# CHECK: encoding: [0x62,0x8c,0x78,0x0c,0x48,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovs	dword ptr [r28 + 4*r29 + 291], r18d
+
+# CHECK: cfcmovs	r27, r23, r19
+# CHECK: encoding: [0x62,0xec,0xa4,0x14,0x48,0xfb]
+         cfcmovs	r27, r23, r19
+
+# CHECK: cfcmovs	r23, r19
+# CHECK: encoding: [0x62,0xec,0xfc,0x0c,0x48,0xdf]
+         cfcmovs	r23, r19
+
+# CHECK: cfcmovs	qword ptr [r28 + 4*r29 + 291], r19
+# CHECK: encoding: [0x62,0x8c,0xf8,0x0c,0x48,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovs	qword ptr [r28 + 4*r29 + 291], r19
+
+# CHECK: cfcmovs	r21w, r17w, word ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0x51,0x14,0x48,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovs	r21w, r17w, word ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovs	r17w, word ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0x79,0x08,0x48,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovs	r17w, word ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovs	r22d, r18d, dword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0x48,0x14,0x48,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovs	r22d, r18d, dword ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovs	r18d, dword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0x78,0x08,0x48,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmovs	r18d, dword ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovs	r23, r19, qword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0xc0,0x14,0x48,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovs	r23, r19, qword ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmovs	r19, qword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0xf8,0x08,0x48,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmovs	r19, qword ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmove	r25w, r21w, r17w
+# CHECK: encoding: [0x62,0xec,0x35,0x14,0x44,0xe9]
+         cfcmove	r25w, r21w, r17w
+
+# CHECK: cfcmove	r21w, r17w
+# CHECK: encoding: [0x62,0xec,0x7d,0x0c,0x44,0xcd]
+         cfcmove	r21w, r17w
+
+# CHECK: cfcmove	word ptr [r28 + 4*r29 + 291], r17w
+# CHECK: encoding: [0x62,0x8c,0x79,0x0c,0x44,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmove	word ptr [r28 + 4*r29 + 291], r17w
+
+# CHECK: cfcmove	r26d, r22d, r18d
+# CHECK: encoding: [0x62,0xec,0x2c,0x14,0x44,0xf2]
+         cfcmove	r26d, r22d, r18d
+
+# CHECK: cfcmove	r22d, r18d
+# CHECK: encoding: [0x62,0xec,0x7c,0x0c,0x44,0xd6]
+         cfcmove	r22d, r18d
+
+# CHECK: cfcmove	dword ptr [r28 + 4*r29 + 291], r18d
+# CHECK: encoding: [0x62,0x8c,0x78,0x0c,0x44,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmove	dword ptr [r28 + 4*r29 + 291], r18d
+
+# CHECK: cfcmove	r27, r23, r19
+# CHECK: encoding: [0x62,0xec,0xa4,0x14,0x44,0xfb]
+         cfcmove	r27, r23, r19
+
+# CHECK: cfcmove	r23, r19
+# CHECK: encoding: [0x62,0xec,0xfc,0x0c,0x44,0xdf]
+         cfcmove	r23, r19
+
+# CHECK: cfcmove	qword ptr [r28 + 4*r29 + 291], r19
+# CHECK: encoding: [0x62,0x8c,0xf8,0x0c,0x44,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmove	qword ptr [r28 + 4*r29 + 291], r19
+
+# CHECK: cfcmove	r21w, r17w, word ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0x51,0x14,0x44,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmove	r21w, r17w, word ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmove	r17w, word ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0x79,0x08,0x44,0x8c,0xac,0x23,0x01,0x00,0x00]
+         cfcmove	r17w, word ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmove	r22d, r18d, dword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0x48,0x14,0x44,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmove	r22d, r18d, dword ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmove	r18d, dword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0x78,0x08,0x44,0x94,0xac,0x23,0x01,0x00,0x00]
+         cfcmove	r18d, dword ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmove	r23, r19, qword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0xc0,0x14,0x44,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmove	r23, r19, qword ptr [r28 + 4*r29 + 291]
+
+# CHECK: cfcmove	r19, qword ptr [r28 + 4*r29 + 291]
+# CHECK: encoding: [0x62,0x8c,0xf8,0x08,0x44,0x9c,0xac,0x23,0x01,0x00,0x00]
+         cfcmove	r19, qword ptr [r28 + 4*r29 + 291]
diff --git a/llvm/test/MC/X86/apx/cmov-att.s b/llvm/test/MC/X86/apx/cmov-att.s
new file mode 100644
index 00000000000000..4b8e6786d80159
--- /dev/null
+++ b/llvm/test/MC/X86/apx/cmov-att.s
@@ -0,0 +1,293 @@
+# RUN: llvm-mc -triple x86_64 -show-encoding %s | FileCheck %s
+# RUN: not llvm-mc -triple i386 -show-encoding %s 2>&1 | FileCheck %s --check-prefix=ERROR
+
+# ERROR-COUNT-96: error:
+# ERROR-NOT: error:
+# CHECK: cmovbw	%dx, %ax, %r9w
+# CHECK: encoding: [0x62,0xf4,0x35,0x18,0x42,0xc2]
+         cmovbw	%dx, %ax, %r9w
+# CHECK: cmovbl	%ecx, %edx, %r10d
+# CHECK: encoding: [0x62,0xf4,0x2c,0x18,0x42,0xd1]
+         cmovbl	%ecx, %edx, %r10d
+# CHECK: cmovbq	%r9, %r15, %r11
+# CHECK: encoding: [0x62,0x54,0xa4,0x18,0x42,0xf9]
+         cmovbq	%r9, %r15, %r11
+# CHECK: cmovbw	123(%r8,%rax,4), %dx, %ax
+# CHECK: encoding: [0x62,0xd4,0x7d,0x18,0x42,0x54,0x80,0x7b]
+         cmovbw	123(%r8,%rax,4), %dx, %ax
+# CHECK: cmovbl	123(%r8,%rax,4), %ecx, %edx
+# CHECK: encoding: [0x62,0xd4,0x6c,0x18,0x42,0x4c,0x80,0x7b]
+         cmovbl	123(%r8,%rax,4), %ecx, %edx
+# CHECK: cmovbq	123(%r8,%rax,4), %r9, %r15
+# CHECK: encoding: [0x62,0x54,0x84,0x18,0x42,0x4c,0x80,0x7b]
+         cmovbq	123(%r8,%rax,4), %r9, %r15
+# CHECK: cmovbew	%dx, %ax, %r9w
+# CHECK: encoding: [0x62,0xf4,0x35,0x18,0x46,0xc2]
+         cmovbew	%dx, %ax, %r9w
+# CHECK: cmovbel	%ecx, %edx, %r10d
+# CHECK: encoding: [0x62,0xf4,0x2c,0x18,0x46,0xd1]
+         cmovbel	%ecx, %edx, %r10d
+# CHECK: cmovbeq	%r9, %r15, %r11
+# CHECK: encoding: [0x62,0x54,0xa4,0x18,0x46,0xf9]
+         cmovbeq	%r9, %r15, %r11
+# CHECK: cmovbew	123(%r8,%rax,4), %dx, %ax
+# CHECK: encoding: [0x62,0xd4,0x7d,0x18,0x46,0x54,0x80,0x7b]
+         cmovbew	123(%r8,%rax,4), %dx, %ax
+# CHECK: cmovbel	123(%r8,%rax,4), %ecx, %edx
+# CHECK: encoding: [0x62,0xd4,0x6c,0x18,0x46,0x4c,0x80,0x7b]
+         cmovbel	123(%r8,%rax,4), %ecx, %edx
+# CHECK: cmovbeq	123(%r8,%rax,4), %r9, %r15
+# CHECK: encoding: [0x62,0x54,0x84,0x18,0x46,0x4c,0x80,0x7b]
+         cmovbeq	123(%r8,%rax,4), %r9, %r15
+# CHECK: cmovlw	%dx, %ax, %r9w
+# CHECK: encoding: [0x62,0xf4,0x35,0x18,0x4c,0xc2]
+         cmovlw	%dx, %ax, %r9w
+# CHECK: cmovll	%ecx, %edx, %r10d
+# CHECK: encoding: [0x62,0xf4,0x2c,0x18,0x4c,0xd1]
+         cmovll	%ecx, %edx, %r10d
+# CHECK: cmovlq	%r9, %r15, %r11
+# CHECK: encoding: [0x62,0x54,0xa4,0x18,0x4c,0xf9]
+         cmovlq	%r9, %r15, %r11
+# CHECK: cmovlw	123(%r8,%rax,4), %dx, %ax
+# CHECK: encoding: [0x62,0xd4,0x7d,0x18,0x4c,0x54,0x80,0x7b]
+         cmovlw	123(%r8,%rax,4), %dx, %ax
+# CHECK: cmovll	123(%r8,%rax,4), %ecx, %edx
+# CHECK: encoding: [0x62,0xd4,0x6c,0x18,0x4c,0x4c,0x80,0x7b]
+         cmovll	123(%r8,%rax,4), %ecx, %edx
+# CHECK: cmovlq	123(%r8,%rax,4), %r9, %r15
+# CHECK: encoding: [0x62,0x54,0x84,0x18,0x4c,0x4c,0x80,0x7b]
+         cmovlq	123(%r8,%rax,4), %r9, %r15
+# CHECK: cmovlew	%dx, %ax, %r9w
+# CHECK: encoding: [0x62,0xf4,0x35,0x18,0x4e,0xc2]
+         cmovlew	%dx, %ax, %r9w
+# CHECK: cmovlel	%ecx, %edx, %r10d
+# CHECK: encoding: [0x62,0xf4,0x2c,0x18,0x4e,0xd1]
+         cmovlel	%ecx, %edx, %r10d
+# CHECK: cmovleq	%r9, %r15, %r11
+# CHECK: encoding: [0x62,0x54,0xa4,0x18,0x4e,0xf9]
+         cmovleq	%r9, %r15, %r11
+# CHECK: cmovlew	123(%r8,%rax,4), %dx, %ax
+# CHECK: encoding: [0x62,0xd4,0x7d,0x18,0x4e,0x54,0x80,0x7b]
+         cmovlew	123(%r8,%rax,4), %dx, %ax
+# CHECK: cmovlel	123(%r8,%rax,4), %ecx, %edx
+# CHECK: encoding: [0x62,0xd4,0x6c,0x18,0x4e,0x4c,0x80,0x7b]
+         cmovlel	123(%r8,%rax,4), %ecx, %edx
+# CHECK: cmovleq	123(%r8,%rax,4), %r9, %r15
+# CHECK: encoding: [0x62,0x54,0x84,0x18,0x4e,0x4c,0x80,0x7b]
+         cmovleq	123(%r8,%rax,4), %r9, %r15
+# CHECK: cmovaew	%dx, %ax, %r9w
+# CHECK: encoding: [0x62,0xf4,0x35,0x18,0x43,0xc2]
+         cmovaew	%dx, %ax, %r9w
+# CHECK: cmovael	%ecx, %edx, %r10d
+# CHECK: encoding: [0x62,0xf4,0x2c,0x18,0x43,0xd1]
+         cmovael	%ecx, %edx, %r10d
+# CHECK: cmovaeq	%r9, %r15, %r11
+# CHECK: encoding: [0x62,0x54,0xa4,0x18,0x43,0xf9]
+         cmovaeq	%r9, %r15, %r11
+# CHECK: cmovaew	123(%r8,%rax,4), %dx, %ax
+# CHECK: encoding: [0x62,0xd4,0x7d,0x18,0x43,0x54,0x80,0x7b]
+         cmovaew	123(%r8,%rax,4), %dx, %ax
+# CHECK: cmovael	123(%r8,%rax,4), %ecx, %edx
+# CHECK: encoding: [0x62,0xd4,0x6c,0x18,0x43,0x4c,0x80,0x7b]
+         cmovael	123(%r8,%rax,4), %ecx, %edx
+# CHECK: cmovaeq	123(%r8,%rax,4), %r9, %r15
+# CHECK: encoding: [0x62,0x54,0x84,0x18,0x43,0x4c,0x80,0x7b]
+         cmovaeq	123(%r8,%rax,4), %r9, %r15
+# CHECK: cmovaw	%dx, %ax, %r9w
+# CHECK: encoding: [0x62,0xf4,0x35,0x18,0x47,0xc2]
+         cmovaw	%dx, %ax, %r9w
+# CHECK: cmoval	%ecx, %edx, %r10d
+# CHECK: encoding: [0x62,0xf4,0x2c,0x18,0x47,0xd1]
+         cmoval	%ecx, %edx, %r10d
+# CHECK: cmovaq	%r9, %r15, %r11
+# CHECK: encoding: [0x62,0x54,0xa4,0x18,0x47,0xf9]
+         cmovaq	%r9, %r15, %r11
+# CHECK: cmovaw	123(%r8,%rax,4), %dx, %ax
+# CHECK: encoding: [0x62,0xd4,0x7d,0x18,0x47,0x54,0x80,0x7b]
+         cmovaw	123(%r8,%rax,4), %dx, %ax
+# CHECK: cmoval	123(%r8,%rax,4), %ecx, %edx
+# CHECK: encoding: [0x62,0xd4,0x6c,0x18,0x47,0x4c,0x80,0x7b]
+         cmoval	123(%r8,%rax,4), %ecx, %edx
+# CHECK: cmovaq	123(%r8,%rax,4), %r9, %r15
+# CHECK: encoding: [0x62,0x54,0x84,0x18,0x47,0x4c,0x80,0x7b]
+         cmovaq	123(%r8,%rax,4), %r9, %r15
+# CHECK: cmovgew	%dx, %ax, %r9w
+# CHECK: encoding: [0x62,0xf4,0x35,0x18,0x4d,0xc2]
+         cmovgew	%dx, %ax, %r9w
+# CHECK: cmovgel	%ecx, %edx, %r10d
+# CHECK: encoding: [0x62,0xf4,0x2c,0x18,0x4d,0xd1]
+         cmovgel	%ecx, %edx, %r10d
+# CHECK: cmovgeq	%r9, %r15, %r11
+# CHECK: encoding: [0x62,0x54,0xa4,0x18,0x4d,0xf9]
+         cmovgeq	%r9, %r15, %r11
+# CHECK: cmovgew	123(%r8,%rax,4), %dx, %ax
+# CHECK: encoding: [0x62,0xd4,0x7d,0x18,0x4d,0x54,0x80,0x7b]
+         cmovgew	123(%r8,%rax,4), %dx, %ax
+# CHECK: cmovgel	123(%r8,%rax,4), %ecx, %edx
+# CHECK: encoding: [0x62,0xd4,0x6c,0x18,0x4d,0x4c,0x80,0x7b]
+         cmovgel	123(%r8,%rax,4), %ecx, %edx
+# CHECK: cmovgeq	123(%r8,%rax,4), %r9, %r15
+# CHECK: encoding: [0x62,0x54,0x84,0x18,0x4d,0x4c,0x80,0x7b]
+         cmovgeq	123(%r8,%rax,4), %r9, %r15
+# CHECK: cmovgw	%dx, %ax, %r9w
+# CHECK: encoding: [0x62,0xf4,0x35,0x18,0x4f,0xc2]
+         cmovgw	%dx, %ax, %r9w
+# CHECK: cmovgl	%ecx, %edx, %r10d
+# CHECK: encoding: [0x62,0xf4,0x2c,0x18,0x4f,0xd1]
+         cmovgl	%ecx, %edx, %r10d
+# CHECK: cmovgq	%r9, %r15, %r11
+# CHECK: encoding: [0x62,0x54,0xa4,0x18,0x4f,0xf9]
+         cmovgq	%r9, %r15, %r11
+# CHECK: cmovgw	123(%r8,%rax,4), %dx, %ax
+# CHECK: encoding: [0x62,0xd4,0x7d,0x18,0x4f,0x54,0x80,0x7b]
+         cmovgw	123(%r8,%rax,4), %dx, %ax
+# CHECK: cmovgl	123(%r8,%rax,4), %ecx, %edx
+# CHECK: encoding: [0x62,0xd4,0x6c,0x18,0x4f,0x4c,0x80,0x7b]
+         cmovgl	123(%r8,%rax,4), %ecx, %edx
+# CHECK: cmovgq	123(%r8,%rax,4), %r9, %r15
+# CHECK: encoding: [0x62,0x54,0x84,0x18,0x4f,0x4c,0x80,0x7b]
+         cmovgq	123(%r8,%rax,4), %r9, %r15
+# CHECK: cmovnow	%dx, %ax, %r9w
+# CHECK: encoding: [0x62,0xf4,0x35,0x18,0x41,0xc2]
+         cmovnow	%dx, %ax, %r9w
+# CHECK: cmovnol	%ecx, %edx, %r10d
+# CHECK: encoding: [0x62,0xf4,0x2c,0x18,0x41,0xd1]
+         cmovnol	%ecx, %edx, %r10d
+# CHECK: cmovnoq	%r9, %r15, %r11
+# CHECK: encoding: [0x62,0x54,0xa4,0x18,0x41,0xf9]
+         cmovnoq	%r9, %r15, %r11
+# CHECK: cmovnow	123(%r8,%rax,4), %dx, %ax
+# CHECK: encoding: [0x62,0xd4,0x7d,0x18,0x41,0x54,0x80,0x7b]
+         cmovnow	123(%r8,%rax,4), %dx, %ax
+# CHECK: cmovnol	123(%r8,%rax,4), %ecx, %edx
+# CHECK: encoding: [0x62,0xd4,0x6c,0x18,0x41,0x4c,0x80,0x7b]
+         cmovnol	123(%r8,%rax,4), %ecx, %edx
+# CHECK: cmovnoq	123(%r8,%rax,4), %r9, %r15
+# CHECK: encoding: [0x62,0x54,0x84,0x18,0x41,0x4c,0x80,0x7b]
+         cmovnoq	123(%r8,%rax,4), %r9, %r15
+# CHECK: cmovnpw	%dx, %ax, %r9w
+# CHECK: encoding: [0x62,0xf4,0x35,0x18,0x4b,0xc2]
+         cmovnpw	%dx, %ax, %r9w
+# CHECK: cmovnpl	%ecx, %edx, %r10d
+# CHECK: encoding: [0x62,0xf4,0x2c,0x18,0x4b,0xd1]
+         cmovnpl	%ecx, %edx, %r10d
+# CHECK: cmovnpq	%r9, %r15, %r11
+# CHECK: encoding: [0x62,0x54,0xa4,0x18,0x4b,0xf9]
+         cmovnpq	%r9, %r15, %r11
+# CHECK: cmovnpw	123(%r8,%rax,4), %dx, %ax
+# CHECK: encoding: [0x62,0xd4,0x7d,0x18,0x4b,0x54,0x80,0x7b]
+         cmovnpw	123(%r8,%rax,4), %dx, %ax
+# CHECK: cmovnpl	123(%r8,%rax,4), %ecx, %edx
+# CHECK: encoding: [0x62,0xd4,0x6c,0x18,0x4b,0x4c,0x80,0x7b]
+         cmovnpl	123(%r8,%rax,4), %ecx, %edx
+# CHECK: cmovnpq	123(%r8,%rax,4), %r9, %r15
+# CHECK: encoding: [0x62,0x54,0x84,0x18,0x4b,0x4c,0x80,0x7b]
+         cmovnpq	123(%r8,%rax,4), %r9, %r15
+# CHECK: cmovnsw	%dx, %ax, %r9w
+# CHECK: encoding: [0x62,0xf4,0x35,0x18,0x49,0xc2]
+         cmovnsw	%dx, %ax, %r9w
+# CHECK: cmovnsl	%ecx, %edx, %r10d
+# CHECK: encoding: [0x62,0xf4,0x2c,0x18,0x49,0xd1]
+         cmovnsl	%ecx, %edx, %r10d
+# CHECK: cmovnsq	%r9, %r15, %r11
+# CHECK: encoding: [0x62,0x54,0xa4,0x18,0x49,0xf9]
+         cmovnsq	%r9, %r15, %r11
+# CHECK: cmovnsw	123(%r8,%rax,4), %dx, %ax
+# CHECK: encoding: [0x62,0xd4,0x7d,0x18,0x49,0x54,0x80,0x7b]
+         cmovnsw	123(%r8,%rax,4), %dx, %ax
+# CHECK: cmovnsl	123(%r8,%rax,4), %ecx, %edx
+# CHECK: encoding: [0x62,0xd4,0x6c,0x18,0x49,0x4c,0x80,0x7b]
+         cmovnsl	123(%r8,%rax,4), %ecx, %edx
+# CHECK: cmovnsq	123(%r8,%rax,4), %r9, %r15
+# CHECK: encoding: [0x62,0x54,0x84,0x18,0x49,0x4c,0x80,0x7b]
+         cmovnsq	123(%r8,%rax,4), %r9, %r15
+# CHECK: cmovnew	%dx, %ax, %r9w
+# CHECK: encoding: [0x62,0xf4,0x35,0x18,0x45,0xc2]
+         cmovnew	%dx, %ax, %r9w
+# CHECK: cmovnel	%ecx, %edx, %r10d
+# CHECK: encoding: [0x62,0xf4,0x2c,0x18,0x45,0xd1]
+         cmovnel	%ecx, %edx, %r10d
+# CHECK: cmovneq	%r9, %r15, %r11
+# CHECK: encoding: [0x62,0x54,0xa4,0x18,0x45,0xf9]
+         cmovneq	%r9, %r15, %r11
+# CHECK: cmovnew	123(%r8,%rax,4), %dx, %ax
+# CHECK: encoding: [0x62,0xd4,0x7d,0x18,0x45,0x54,0x80,0x7b]
+         cmovnew	123(%r8,%rax,4), %dx, %ax
+# CHECK: cmovnel	123(%r8,%rax,4), %ecx, %edx
+# CHECK: encoding: [0x62,0xd4,0x6c,0x18,0x45,0x4c,0x80,0x7b]
+         cmovnel	123(%r8,%rax,4), %ecx, %edx
+# CHECK: cmovneq	123(%r8,%rax,4), %r9, %r15
+# CHECK: encoding: [0x62,0x54,0x84,0x18,0x45,0x4c,0x80,0x7b]
+         cmovneq	123(%r8,%rax,4), %r9, %r15
+# CHECK: cmovow	%dx, %ax, %r9w
+# CHECK: encoding: [0x62,0xf4,0x35,0x18,0x40,0xc2]
+         cmovow	%dx, %ax, %r9w
+# CHECK: cmovol	%ecx, %edx, %r10d
+# CHECK: encoding: [0x62,0xf4,0x2c,0x18,0x40,0xd1]
+         cmovol	%ecx, %edx, %r10d
+# CHECK: cmovoq	%r9, %r15, %r11
+# CHECK: encoding: [0x62,0x54,0xa4,0x18,0x40,0xf9]
+         cmovoq	%r9, %r15, %r11
+# CHECK: cmovow	123(%r8,%rax,4), %dx, %ax
+# CHECK: encoding: [0x62,0xd4,0x7d,0x18,0x40,0x54,0x80,0x7b]
+         cmovow	123(%r8,%rax,4), %dx, %ax
+# CHECK: cmovol	123(%r8,%rax,4), %ecx, %edx
+# CHECK: encoding: [0x62,0xd4,0x6c,0x18,0x40,0x4c,0x80,0x7b]
+         cmovol	123(%r8,%rax,4), %ecx, %edx
+# CHECK: cmovoq	123(%r8,%rax,4), %r9, %r15
+# CHECK: encoding: [0x62,0x54,0x84,0x18,0x40,0x4c,0x80,0x7b]
+         cmovoq	123(%r8,%rax,4), %r9, %r15
+# CHECK: cmovpw	%dx, %ax, %r9w
+# CHECK: encoding: [0x62,0xf4,0x35,0x18,0x4a,0xc2]
+         cmovpw	%dx, %ax, %r9w
+# CHECK: cmovpl	%ecx, %edx, %r10d
+# CHECK: encoding: [0x62,0xf4,0x2c,0x18,0x4a,0xd1]
+         cmovpl	%ecx, %edx, %r10d
+# CHECK: cmovpq	%r9, %r15, %r11
+# CHECK: encoding: [0x62,0x54,0xa4,0x18,0x4a,0xf9]
+         cmovpq	%r9, %r15, %r11
+# CHECK: cmovpw	123(%r8,%rax,4), %dx, %ax
+# CHECK: encoding: [0x62,0xd4,0x7d,0x18,0x4a,0x54,0x80,0x7b]
+         cmovpw	123(%r8,%rax,4), %dx, %ax
+# CHECK: cmovpl	123(%r8,%rax,4), %ecx, %edx
+# CHECK: encoding: [0x62,0xd4,0x6c,0x18,0x4a,0x4c,0x80,0x7b]
+         cmovpl	123(%r8,%rax,4), %ecx, %edx
+# CHECK: cmovpq	123(%r8,%rax,4), %r9, %r15
+# CHECK: encoding: [0x62,0x54,0x84,0x18,0x4a,0x4c,0x80,0x7b]
+         cmovpq	123(%r8,%rax,4), %r9, %r15
+# CHECK: cmovsw	%dx, %ax, %r9w
+# CHECK: encoding: [0x62,0xf4,0x35,0x18,0x48,0xc2]
+         cmovsw	%dx, %ax, %r9w
+# CHECK: cmovsl	%ecx, %edx, %r10d
+# CHECK: encoding: [0x62,0xf4,0x2c,0x18,0x48,0xd1]
+         cmovsl	%ecx, %edx, %r10d
+# CHECK: cmovsq	%r9, %r15, %r11
+# CHECK: encoding: [0x62,0x54,0xa4,0x18,0x48,0xf9]
+         cmovsq	%r9, %r15, %r11
+# CHECK: cmovsw	123(%r8,%rax,4), %dx, %ax
+# CHECK: encoding: [0x62,0xd4,0x7d,0x18,0x48,0x54,0x80,0x7b]
+         cmovsw	123(%r8,%rax,4), %dx, %ax
+# CHECK: cmovsl	123(%r8,%rax,4), %ecx, %edx
+# CHECK: encoding: [0x62,0xd4,0x6c,0x18,0x48,0x4c,0x80,0x7b]
+         cmovsl	123(%r8,%rax,4), %ecx, %edx
+# CHECK: cmovsq	123(%r8,%rax,4), %r9, %r15
+# CHECK: encoding: [0x62,0x54,0x84,0x18,0x48,0x4c,0x80,0x7b]
+         cmovsq	123(%r8,%rax,4), %r9, %r15
+# CHECK: cmovew	%dx, %ax, %r9w
+# CHECK: encoding: [0x62,0xf4,0x35,0x18,0x44,0xc2]
+         cmovew	%dx, %ax, %r9w
+# CHECK: cmovel	%ecx, %edx, %r10d
+# CHECK: encoding: [0x62,0xf4,0x2c,0x18,0x44,0xd1]
+         cmovel	%ecx, %edx, %r10d
+# CHECK: cmoveq	%r9, %r15, %r11
+# CHECK: encoding: [0x62,0x54,0xa4,0x18,0x44,0xf9]
+         cmoveq	%r9, %r15, %r11
+# CHECK: cmovew	123(%r8,%rax,4), %dx, %ax
+# CHECK: encoding: [0x62,0xd4,0x7d,0x18,0x44,0x54,0x80,0x7b]
+         cmovew	123(%r8,%rax,4), %dx, %ax
+# CHECK: cmovel	123(%r8,%rax,4), %ecx, %edx
+# CHECK: encoding: [0x62,0xd4,0x6c,0x18,0x44,0x4c,0x80,0x7b]
+         cmovel	123(%r8,%rax,4), %ecx, %edx
+# CHECK: cmoveq	123(%r8,%rax,4), %r9, %r15
+# CHECK: encoding: [0x62,0x54,0x84,0x18,0x44,0x4c,0x80,0x7b]
+         cmoveq	123(%r8,%rax,4), %r9, %r15
diff --git a/llvm/test/MC/X86/apx/cmov-intel.s b/llvm/test/MC/X86/apx/cmov-intel.s
new file mode 100644
index 00000000000000..f481346bc34708
--- /dev/null
+++ b/llvm/test/MC/X86/apx/cmov-intel.s
@@ -0,0 +1,290 @@
+# RUN: llvm-mc -triple x86_64 -show-encoding -x86-asm-syntax=intel -output-asm-variant=1 %s | FileCheck %s
+
+# CHECK: cmovb	r9w, ax, dx
+# CHECK: encoding: [0x62,0xf4,0x35,0x18,0x42,0xc2]
+         cmovb	r9w, ax, dx
+# CHECK: cmovb	r10d, edx, ecx
+# CHECK: encoding: [0x62,0xf4,0x2c,0x18,0x42,0xd1]
+         cmovb	r10d, edx, ecx
+# CHECK: cmovb	r11, r15, r9
+# CHECK: encoding: [0x62,0x54,0xa4,0x18,0x42,0xf9]
+         cmovb	r11, r15, r9
+# CHECK: cmovb	ax, dx, word ptr [r8 + 4*rax + 123]
+# CHECK: encoding: [0x62,0xd4,0x7d,0x18,0x42,0x54,0x80,0x7b]
+         cmovb	ax, dx, word ptr [r8 + 4*rax + 123]
+# CHECK: cmovb	edx, ecx, dword ptr [r8 + 4*rax + 123]
+# CHECK: encoding: [0x62,0xd4,0x6c,0x18,0x42,0x4c,0x80,0x7b]
+         cmovb	edx, ecx, dword ptr [r8 + 4*rax + 123]
+# CHECK: cmovb	r15, r9, qword ptr [r8 + 4*rax + 123]
+# CHECK: encoding: [0x62,0x54,0x84,0x18,0x42,0x4c,0x80,0x7b]
+         cmovb	r15, r9, qword ptr [r8 + 4*rax + 123]
+# CHECK: cmovbe	r9w, ax, dx
+# CHECK: encoding: [0x62,0xf4,0x35,0x18,0x46,0xc2]
+         cmovbe	r9w, ax, dx
+# CHECK: cmovbe	r10d, edx, ecx
+# CHECK: encoding: [0x62,0xf4,0x2c,0x18,0x46,0xd1]
+         cmovbe	r10d, edx, ecx
+# CHECK: cmovbe	r11, r15, r9
+# CHECK: encoding: [0x62,0x54,0xa4,0x18,0x46,0xf9]
+         cmovbe	r11, r15, r9
+# CHECK: cmovbe	ax, dx, word ptr [r8 + 4*rax + 123]
+# CHECK: encoding: [0x62,0xd4,0x7d,0x18,0x46,0x54,0x80,0x7b]
+         cmovbe	ax, dx, word ptr [r8 + 4*rax + 123]
+# CHECK: cmovbe	edx, ecx, dword ptr [r8 + 4*rax + 123]
+# CHECK: encoding: [0x62,0xd4,0x6c,0x18,0x46,0x4c,0x80,0x7b]
+         cmovbe	edx, ecx, dword ptr [r8 + 4*rax + 123]
+# CHECK: cmovbe	r15, r9, qword ptr [r8 + 4*rax + 123]
+# CHECK: encoding: [0x62,0x54,0x84,0x18,0x46,0x4c,0x80,0x7b]
+         cmovbe	r15, r9, qword ptr [r8 + 4*rax + 123]
+# CHECK: cmovl	r9w, ax, dx
+# CHECK: encoding: [0x62,0xf4,0x35,0x18,0x4c,0xc2]
+         cmovl	r9w, ax, dx
+# CHECK: cmovl	r10d, edx, ecx
+# CHECK: encoding: [0x62,0xf4,0x2c,0x18,0x4c,0xd1]
+         cmovl	r10d, edx, ecx
+# CHECK: cmovl	r11, r15, r9
+# CHECK: encoding: [0x62,0x54,0xa4,0x18,0x4c,0xf9]
+         cmovl	r11, r15, r9
+# CHECK: cmovl	ax, dx, word ptr [r8 + 4*rax + 123]
+# CHECK: encoding: [0x62,0xd4,0x7d,0x18,0x4c,0x54,0x80,0x7b]
+         cmovl	ax, dx, word ptr [r8 + 4*rax + 123]
+# CHECK: cmovl	edx, ecx, dword ptr [r8 + 4*rax + 123]
+# CHECK: encoding: [0x62,0xd4,0x6c,0x18,0x4c,0x4c,0x80,0x7b]
+         cmovl	edx, ecx, dword ptr [r8 + 4*rax + 123]
+# CHECK: cmovl	r15, r9, qword ptr [r8 + 4*rax + 123]
+# CHECK: encoding: [0x62,0x54,0x84,0x18,0x4c,0x4c,0x80,0x7b]
+         cmovl	r15, r9, qword ptr [r8 + 4*rax + 123]
+# CHECK: cmovle	r9w, ax, dx
+# CHECK: encoding: [0x62,0xf4,0x35,0x18,0x4e,0xc2]
+         cmovle	r9w, ax, dx
+# CHECK: cmovle	r10d, edx, ecx
+# CHECK: encoding: [0x62,0xf4,0x2c,0x18,0x4e,0xd1]
+         cmovle	r10d, edx, ecx
+# CHECK: cmovle	r11, r15, r9
+# CHECK: encoding: [0x62,0x54,0xa4,0x18,0x4e,0xf9]
+         cmovle	r11, r15, r9
+# CHECK: cmovle	ax, dx, word ptr [r8 + 4*rax + 123]
+# CHECK: encoding: [0x62,0xd4,0x7d,0x18,0x4e,0x54,0x80,0x7b]
+         cmovle	ax, dx, word ptr [r8 + 4*rax + 123]
+# CHECK: cmovle	edx, ecx, dword ptr [r8 + 4*rax + 123]
+# CHECK: encoding: [0x62,0xd4,0x6c,0x18,0x4e,0x4c,0x80,0x7b]
+         cmovle	edx, ecx, dword ptr [r8 + 4*rax + 123]
+# CHECK: cmovle	r15, r9, qword ptr [r8 + 4*rax + 123]
+# CHECK: encoding: [0x62,0x54,0x84,0x18,0x4e,0x4c,0x80,0x7b]
+         cmovle	r15, r9, qword ptr [r8 + 4*rax + 123]
+# CHECK: cmovae	r9w, ax, dx
+# CHECK: encoding: [0x62,0xf4,0x35,0x18,0x43,0xc2]
+         cmovae	r9w, ax, dx
+# CHECK: cmovae	r10d, edx, ecx
+# CHECK: encoding: [0x62,0xf4,0x2c,0x18,0x43,0xd1]
+         cmovae	r10d, edx, ecx
+# CHECK: cmovae	r11, r15, r9
+# CHECK: encoding: [0x62,0x54,0xa4,0x18,0x43,0xf9]
+         cmovae	r11, r15, r9
+# CHECK: cmovae	ax, dx, word ptr [r8 + 4*rax + 123]
+# CHECK: encoding: [0x62,0xd4,0x7d,0x18,0x43,0x54,0x80,0x7b]
+         cmovae	ax, dx, word ptr [r8 + 4*rax + 123]
+# CHECK: cmovae	edx, ecx, dword ptr [r8 + 4*rax + 123]
+# CHECK: encoding: [0x62,0xd4,0x6c,0x18,0x43,0x4c,0x80,0x7b]
+         cmovae	edx, ecx, dword ptr [r8 + 4*rax + 123]
+# CHECK: cmovae	r15, r9, qword ptr [r8 + 4*rax + 123]
+# CHECK: encoding: [0x62,0x54,0x84,0x18,0x43,0x4c,0x80,0x7b]
+         cmovae	r15, r9, qword ptr [r8 + 4*rax + 123]
+# CHECK: cmova	r9w, ax, dx
+# CHECK: encoding: [0x62,0xf4,0x35,0x18,0x47,0xc2]
+         cmova	r9w, ax, dx
+# CHECK: cmova	r10d, edx, ecx
+# CHECK: encoding: [0x62,0xf4,0x2c,0x18,0x47,0xd1]
+         cmova	r10d, edx, ecx
+# CHECK: cmova	r11, r15, r9
+# CHECK: encoding: [0x62,0x54,0xa4,0x18,0x47,0xf9]
+         cmova	r11, r15, r9
+# CHECK: cmova	ax, dx, word ptr [r8 + 4*rax + 123]
+# CHECK: encoding: [0x62,0xd4,0x7d,0x18,0x47,0x54,0x80,0x7b]
+         cmova	ax, dx, word ptr [r8 + 4*rax + 123]
+# CHECK: cmova	edx, ecx, dword ptr [r8 + 4*rax + 123]
+# CHECK: encoding: [0x62,0xd4,0x6c,0x18,0x47,0x4c,0x80,0x7b]
+         cmova	edx, ecx, dword ptr [r8 + 4*rax + 123]
+# CHECK: cmova	r15, r9, qword ptr [r8 + 4*rax + 123]
+# CHECK: encoding: [0x62,0x54,0x84,0x18,0x47,0x4c,0x80,0x7b]
+         cmova	r15, r9, qword ptr [r8 + 4*rax + 123]
+# CHECK: cmovge	r9w, ax, dx
+# CHECK: encoding: [0x62,0xf4,0x35,0x18,0x4d,0xc2]
+         cmovge	r9w, ax, dx
+# CHECK: cmovge	r10d, edx, ecx
+# CHECK: encoding: [0x62,0xf4,0x2c,0x18,0x4d,0xd1]
+         cmovge	r10d, edx, ecx
+# CHECK: cmovge	r11, r15, r9
+# CHECK: encoding: [0x62,0x54,0xa4,0x18,0x4d,0xf9]
+         cmovge	r11, r15, r9
+# CHECK: cmovge	ax, dx, word ptr [r8 + 4*rax + 123]
+# CHECK: encoding: [0x62,0xd4,0x7d,0x18,0x4d,0x54,0x80,0x7b]
+         cmovge	ax, dx, word ptr [r8 + 4*rax + 123]
+# CHECK: cmovge	edx, ecx, dword ptr [r8 + 4*rax + 123]
+# CHECK: encoding: [0x62,0xd4,0x6c,0x18,0x4d,0x4c,0x80,0x7b]
+         cmovge	edx, ecx, dword ptr [r8 + 4*rax + 123]
+# CHECK: cmovge	r15, r9, qword ptr [r8 + 4*rax + 123]
+# CHECK: encoding: [0x62,0x54,0x84,0x18,0x4d,0x4c,0x80,0x7b]
+         cmovge	r15, r9, qword ptr [r8 + 4*rax + 123]
+# CHECK: cmovg	r9w, ax, dx
+# CHECK: encoding: [0x62,0xf4,0x35,0x18,0x4f,0xc2]
+         cmovg	r9w, ax, dx
+# CHECK: cmovg	r10d, edx, ecx
+# CHECK: encoding: [0x62,0xf4,0x2c,0x18,0x4f,0xd1]
+         cmovg	r10d, edx, ecx
+# CHECK: cmovg	r11, r15, r9
+# CHECK: encoding: [0x62,0x54,0xa4,0x18,0x4f,0xf9]
+         cmovg	r11, r15, r9
+# CHECK: cmovg	ax, dx, word ptr [r8 + 4*rax + 123]
+# CHECK: encoding: [0x62,0xd4,0x7d,0x18,0x4f,0x54,0x80,0x7b]
+         cmovg	ax, dx, word ptr [r8 + 4*rax + 123]
+# CHECK: cmovg	edx, ecx, dword ptr [r8 + 4*rax + 123]
+# CHECK: encoding: [0x62,0xd4,0x6c,0x18,0x4f,0x4c,0x80,0x7b]
+         cmovg	edx, ecx, dword ptr [r8 + 4*rax + 123]
+# CHECK: cmovg	r15, r9, qword ptr [r8 + 4*rax + 123]
+# CHECK: encoding: [0x62,0x54,0x84,0x18,0x4f,0x4c,0x80,0x7b]
+         cmovg	r15, r9, qword ptr [r8 + 4*rax + 123]
+# CHECK: cmovno	r9w, ax, dx
+# CHECK: encoding: [0x62,0xf4,0x35,0x18,0x41,0xc2]
+         cmovno	r9w, ax, dx
+# CHECK: cmovno	r10d, edx, ecx
+# CHECK: encoding: [0x62,0xf4,0x2c,0x18,0x41,0xd1]
+         cmovno	r10d, edx, ecx
+# CHECK: cmovno	r11, r15, r9
+# CHECK: encoding: [0x62,0x54,0xa4,0x18,0x41,0xf9]
+         cmovno	r11, r15, r9
+# CHECK: cmovno	ax, dx, word ptr [r8 + 4*rax + 123]
+# CHECK: encoding: [0x62,0xd4,0x7d,0x18,0x41,0x54,0x80,0x7b]
+         cmovno	ax, dx, word ptr [r8 + 4*rax + 123]
+# CHECK: cmovno	edx, ecx, dword ptr [r8 + 4*rax + 123]
+# CHECK: encoding: [0x62,0xd4,0x6c,0x18,0x41,0x4c,0x80,0x7b]
+         cmovno	edx, ecx, dword ptr [r8 + 4*rax + 123]
+# CHECK: cmovno	r15, r9, qword ptr [r8 + 4*rax + 123]
+# CHECK: encoding: [0x62,0x54,0x84,0x18,0x41,0x4c,0x80,0x7b]
+         cmovno	r15, r9, qword ptr [r8 + 4*rax + 123]
+# CHECK: cmovnp	r9w, ax, dx
+# CHECK: encoding: [0x62,0xf4,0x35,0x18,0x4b,0xc2]
+         cmovnp	r9w, ax, dx
+# CHECK: cmovnp	r10d, edx, ecx
+# CHECK: encoding: [0x62,0xf4,0x2c,0x18,0x4b,0xd1]
+         cmovnp	r10d, edx, ecx
+# CHECK: cmovnp	r11, r15, r9
+# CHECK: encoding: [0x62,0x54,0xa4,0x18,0x4b,0xf9]
+         cmovnp	r11, r15, r9
+# CHECK: cmovnp	ax, dx, word ptr [r8 + 4*rax + 123]
+# CHECK: encoding: [0x62,0xd4,0x7d,0x18,0x4b,0x54,0x80,0x7b]
+         cmovnp	ax, dx, word ptr [r8 + 4*rax + 123]
+# CHECK: cmovnp	edx, ecx, dword ptr [r8 + 4*rax + 123]
+# CHECK: encoding: [0x62,0xd4,0x6c,0x18,0x4b,0x4c,0x80,0x7b]
+         cmovnp	edx, ecx, dword ptr [r8 + 4*rax + 123]
+# CHECK: cmovnp	r15, r9, qword ptr [r8 + 4*rax + 123]
+# CHECK: encoding: [0x62,0x54,0x84,0x18,0x4b,0x4c,0x80,0x7b]
+         cmovnp	r15, r9, qword ptr [r8 + 4*rax + 123]
+# CHECK: cmovns	r9w, ax, dx
+# CHECK: encoding: [0x62,0xf4,0x35,0x18,0x49,0xc2]
+         cmovns	r9w, ax, dx
+# CHECK: cmovns	r10d, edx, ecx
+# CHECK: encoding: [0x62,0xf4,0x2c,0x18,0x49,0xd1]
+         cmovns	r10d, edx, ecx
+# CHECK: cmovns	r11, r15, r9
+# CHECK: encoding: [0x62,0x54,0xa4,0x18,0x49,0xf9]
+         cmovns	r11, r15, r9
+# CHECK: cmovns	ax, dx, word ptr [r8 + 4*rax + 123]
+# CHECK: encoding: [0x62,0xd4,0x7d,0x18,0x49,0x54,0x80,0x7b]
+         cmovns	ax, dx, word ptr [r8 + 4*rax + 123]
+# CHECK: cmovns	edx, ecx, dword ptr [r8 + 4*rax + 123]
+# CHECK: encoding: [0x62,0xd4,0x6c,0x18,0x49,0x4c,0x80,0x7b]
+         cmovns	edx, ecx, dword ptr [r8 + 4*rax + 123]
+# CHECK: cmovns	r15, r9, qword ptr [r8 + 4*rax + 123]
+# CHECK: encoding: [0x62,0x54,0x84,0x18,0x49,0x4c,0x80,0x7b]
+         cmovns	r15, r9, qword ptr [r8 + 4*rax + 123]
+# CHECK: cmovne	r9w, ax, dx
+# CHECK: encoding: [0x62,0xf4,0x35,0x18,0x45,0xc2]
+         cmovne	r9w, ax, dx
+# CHECK: cmovne	r10d, edx, ecx
+# CHECK: encoding: [0x62,0xf4,0x2c,0x18,0x45,0xd1]
+         cmovne	r10d, edx, ecx
+# CHECK: cmovne	r11, r15, r9
+# CHECK: encoding: [0x62,0x54,0xa4,0x18,0x45,0xf9]
+         cmovne	r11, r15, r9
+# CHECK: cmovne	ax, dx, word ptr [r8 + 4*rax + 123]
+# CHECK: encoding: [0x62,0xd4,0x7d,0x18,0x45,0x54,0x80,0x7b]
+         cmovne	ax, dx, word ptr [r8 + 4*rax + 123]
+# CHECK: cmovne	edx, ecx, dword ptr [r8 + 4*rax + 123]
+# CHECK: encoding: [0x62,0xd4,0x6c,0x18,0x45,0x4c,0x80,0x7b]
+         cmovne	edx, ecx, dword ptr [r8 + 4*rax + 123]
+# CHECK: cmovne	r15, r9, qword ptr [r8 + 4*rax + 123]
+# CHECK: encoding: [0x62,0x54,0x84,0x18,0x45,0x4c,0x80,0x7b]
+         cmovne	r15, r9, qword ptr [r8 + 4*rax + 123]
+# CHECK: cmovo	r9w, ax, dx
+# CHECK: encoding: [0x62,0xf4,0x35,0x18,0x40,0xc2]
+         cmovo	r9w, ax, dx
+# CHECK: cmovo	r10d, edx, ecx
+# CHECK: encoding: [0x62,0xf4,0x2c,0x18,0x40,0xd1]
+         cmovo	r10d, edx, ecx
+# CHECK: cmovo	r11, r15, r9
+# CHECK: encoding: [0x62,0x54,0xa4,0x18,0x40,0xf9]
+         cmovo	r11, r15, r9
+# CHECK: cmovo	ax, dx, word ptr [r8 + 4*rax + 123]
+# CHECK: encoding: [0x62,0xd4,0x7d,0x18,0x40,0x54,0x80,0x7b]
+         cmovo	ax, dx, word ptr [r8 + 4*rax + 123]
+# CHECK: cmovo	edx, ecx, dword ptr [r8 + 4*rax + 123]
+# CHECK: encoding: [0x62,0xd4,0x6c,0x18,0x40,0x4c,0x80,0x7b]
+         cmovo	edx, ecx, dword ptr [r8 + 4*rax + 123]
+# CHECK: cmovo	r15, r9, qword ptr [r8 + 4*rax + 123]
+# CHECK: encoding: [0x62,0x54,0x84,0x18,0x40,0x4c,0x80,0x7b]
+         cmovo	r15, r9, qword ptr [r8 + 4*rax + 123]
+# CHECK: cmovp	r9w, ax, dx
+# CHECK: encoding: [0x62,0xf4,0x35,0x18,0x4a,0xc2]
+         cmovp	r9w, ax, dx
+# CHECK: cmovp	r10d, edx, ecx
+# CHECK: encoding: [0x62,0xf4,0x2c,0x18,0x4a,0xd1]
+         cmovp	r10d, edx, ecx
+# CHECK: cmovp	r11, r15, r9
+# CHECK: encoding: [0x62,0x54,0xa4,0x18,0x4a,0xf9]
+         cmovp	r11, r15, r9
+# CHECK: cmovp	ax, dx, word ptr [r8 + 4*rax + 123]
+# CHECK: encoding: [0x62,0xd4,0x7d,0x18,0x4a,0x54,0x80,0x7b]
+         cmovp	ax, dx, word ptr [r8 + 4*rax + 123]
+# CHECK: cmovp	edx, ecx, dword ptr [r8 + 4*rax + 123]
+# CHECK: encoding: [0x62,0xd4,0x6c,0x18,0x4a,0x4c,0x80,0x7b]
+         cmovp	edx, ecx, dword ptr [r8 + 4*rax + 123]
+# CHECK: cmovp	r15, r9, qword ptr [r8 + 4*rax + 123]
+# CHECK: encoding: [0x62,0x54,0x84,0x18,0x4a,0x4c,0x80,0x7b]
+         cmovp	r15, r9, qword ptr [r8 + 4*rax + 123]
+# CHECK: cmovs	r9w, ax, dx
+# CHECK: encoding: [0x62,0xf4,0x35,0x18,0x48,0xc2]
+         cmovs	r9w, ax, dx
+# CHECK: cmovs	r10d, edx, ecx
+# CHECK: encoding: [0x62,0xf4,0x2c,0x18,0x48,0xd1]
+         cmovs	r10d, edx, ecx
+# CHECK: cmovs	r11, r15, r9
+# CHECK: encoding: [0x62,0x54,0xa4,0x18,0x48,0xf9]
+         cmovs	r11, r15, r9
+# CHECK: cmovs	ax, dx, word ptr [r8 + 4*rax + 123]
+# CHECK: encoding: [0x62,0xd4,0x7d,0x18,0x48,0x54,0x80,0x7b]
+         cmovs	ax, dx, word ptr [r8 + 4*rax + 123]
+# CHECK: cmovs	edx, ecx, dword ptr [r8 + 4*rax + 123]
+# CHECK: encoding: [0x62,0xd4,0x6c,0x18,0x48,0x4c,0x80,0x7b]
+         cmovs	edx, ecx, dword ptr [r8 + 4*rax + 123]
+# CHECK: cmovs	r15, r9, qword ptr [r8 + 4*rax + 123]
+# CHECK: encoding: [0x62,0x54,0x84,0x18,0x48,0x4c,0x80,0x7b]
+         cmovs	r15, r9, qword ptr [r8 + 4*rax + 123]
+# CHECK: cmove	r9w, ax, dx
+# CHECK: encoding: [0x62,0xf4,0x35,0x18,0x44,0xc2]
+         cmove	r9w, ax, dx
+# CHECK: cmove	r10d, edx, ecx
+# CHECK: encoding: [0x62,0xf4,0x2c,0x18,0x44,0xd1]
+         cmove	r10d, edx, ecx
+# CHECK: cmove	r11, r15, r9
+# CHECK: encoding: [0x62,0x54,0xa4,0x18,0x44,0xf9]
+         cmove	r11, r15, r9
+# CHECK: cmove	ax, dx, word ptr [r8 + 4*rax + 123]
+# CHECK: encoding: [0x62,0xd4,0x7d,0x18,0x44,0x54,0x80,0x7b]
+         cmove	ax, dx, word ptr [r8 + 4*rax + 123]
+# CHECK: cmove	edx, ecx, dword ptr [r8 + 4*rax + 123]
+# CHECK: encoding: [0x62,0xd4,0x6c,0x18,0x44,0x4c,0x80,0x7b]
+         cmove	edx, ecx, dword ptr [r8 + 4*rax + 123]
+# CHECK: cmove	r15, r9, qword ptr [r8 + 4*rax + 123]
+# CHECK: encoding: [0x62,0x54,0x84,0x18,0x44,0x4c,0x80,0x7b]
+         cmove	r15, r9, qword ptr [r8 + 4*rax + 123]
diff --git a/llvm/test/MC/X86/apx/evex-format-att.s b/llvm/test/MC/X86/apx/evex-format-att.s
index 055a29fe00f32c..36df3f3757dc3f 100644
--- a/llvm/test/MC/X86/apx/evex-format-att.s
+++ b/llvm/test/MC/X86/apx/evex-format-att.s
@@ -10,6 +10,12 @@
 # CHECK: encoding: [0x62,0xec,0xec,0x10,0x01,0x41,0x7b]
          addq	%r16, 123(%r17), %r18
 
+## MRMDestMemCC
+
+# CHECK: cfcmovbq %r16, 123(%r17,%r18,4)
+# CHECK: encoding: [0x62,0xec,0xf8,0x0c,0x42,0x44,0x91,0x7b]
+         cfcmovbq %r16, 123(%r17,%r18,4)
+
 ## MRMSrcMem
 
 # CHECK: vbroadcasti32x4	(%r16,%r17), %zmm0
@@ -20,6 +26,16 @@
 # CHECK: encoding: [0x62,0xec,0xec,0x10,0x2b,0x48,0x7b]
          subq	123(%r16), %r17, %r18
 
+## MRMSrcMemCC
+
+# CHECK: cfcmovbq	123(%r16,%r17,4), %r18
+# CHECK: encoding: [0x62,0xec,0xf8,0x08,0x42,0x54,0x88,0x7b]
+         cfcmovbq	123(%r16,%r17,4), %r18
+
+# CHECK: cfcmovbq	123(%r16,%r17,4), %r18, %r19
+# CHECK: encoding: [0x62,0xec,0xe0,0x14,0x42,0x54,0x88,0x7b]
+         cfcmovbq	123(%r16,%r17,4), %r18, %r19
+
 ## MRM0m
 
 # CHECK: vprorq	$0, (%r16,%r17), %zmm0
@@ -122,12 +138,24 @@
 # CHECK: encoding: [0x62,0xec,0xfc,0x0c,0x01,0xc1]
          {nf}	addq	%r16, %r17
 
+## MRMDestRegCC
+
+# CHECK: cfcmovbq	%r16, %r17
+# CHECK: encoding: [0x62,0xec,0xfc,0x0c,0x42,0xc1]
+         cfcmovbq	%r16, %r17
+
 ## MRMSrcReg
 
 # CHECK: mulxq	%r16, %r17, %r18
 # CHECK: encoding: [0x62,0xea,0xf7,0x00,0xf6,0xd0]
          mulxq	%r16, %r17, %r18
 
+## MRMSrcRegCC
+
+# CHECK: cfcmovbq	%r16, %r17, %r18
+# CHECK: encoding: [0x62,0xec,0xec,0x14,0x42,0xc8]
+         cfcmovbq	%r16, %r17, %r18
+
 ## MRMSrcReg4VOp3
 
 # CHECK: bzhiq	%r19, %r23, %r27
diff --git a/llvm/test/MC/X86/apx/evex-format-intel.s b/llvm/test/MC/X86/apx/evex-format-intel.s
index 06b56078aab904..2b346e0e858063 100644
--- a/llvm/test/MC/X86/apx/evex-format-intel.s
+++ b/llvm/test/MC/X86/apx/evex-format-intel.s
@@ -10,6 +10,12 @@
 # CHECK: encoding: [0x62,0xec,0xec,0x10,0x01,0x41,0x7b]
          add	r18, qword ptr [r17 + 123], r16
 
+## MRMDestMemCC
+
+# CHECK: cfcmovb	qword ptr [r17 + 4*r18 + 123], r16
+# CHECK: encoding: [0x62,0xec,0xf8,0x0c,0x42,0x44,0x91,0x7b]
+         cfcmovb	qword ptr [r17 + 4*r18 + 123], r16
+
 ## MRMSrcMem
 
 # CHECK: vbroadcasti32x4	zmm0, xmmword ptr [r16 + r17]
@@ -20,6 +26,16 @@
 # CHECK: encoding: [0x62,0xec,0xec,0x10,0x2b,0x48,0x7b]
          sub	r18, r17, qword ptr [r16 + 123]
 
+## MRMSrcMemCC
+
+# CHECK: cfcmovb	r18, qword ptr [r16 + 4*r17 + 123]
+# CHECK: encoding: [0x62,0xec,0xf8,0x08,0x42,0x54,0x88,0x7b]
+         cfcmovb	r18, qword ptr [r16 + 4*r17 + 123]
+
+# CHECK: cfcmovb	r19, r18, qword ptr [r16 + 4*r17 + 123]
+# CHECK: encoding: [0x62,0xec,0xe0,0x14,0x42,0x54,0x88,0x7b]
+         cfcmovb	r19, r18, qword ptr [r16 + 4*r17 + 123]
+
 ## MRM0m
 
 # CHECK: vprorq	zmm0, zmmword ptr [r16 + r17], 0
@@ -122,12 +138,24 @@
 # CHECK: encoding: [0x62,0xec,0xfc,0x0c,0x01,0xc1]
          {nf}	add	r17, r16
 
+## MRMDestRegCC
+
+# CHECK: cfcmovb	r17, r16
+# CHECK: encoding: [0x62,0xec,0xfc,0x0c,0x42,0xc1]
+         cfcmovb	r17, r16
+
 ## MRMSrcReg
 
 # CHECK: mulx	r18, r17, r16
 # CHECK: encoding: [0x62,0xea,0xf7,0x00,0xf6,0xd0]
          mulx	r18, r17, r16
 
+## MRMSrcRegCC
+
+# CHECK: cfcmovb	r18, r17, r16
+# CHECK: encoding: [0x62,0xec,0xec,0x14,0x42,0xc8]
+         cfcmovb	r18, r17, r16
+
 ## MRMSrcReg4VOp3
 
 # CHECK: bzhi	r27, r23, r19
diff --git a/llvm/test/Other/optimize-inrange-gep.ll b/llvm/test/Other/optimize-inrange-gep.ll
index cc2bd15b5b22cc..2eae34bdb09b1d 100644
--- a/llvm/test/Other/optimize-inrange-gep.ll
+++ b/llvm/test/Other/optimize-inrange-gep.ll
@@ -1,4 +1,5 @@
-; RUN: opt -O0 -S < %s | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -O0 -S < %s | FileCheck %s --check-prefix=O0
 ; RUN: opt -O1 -S < %s | FileCheck %s
 ; RUN: opt -O2 -S < %s | FileCheck %s
 ; RUN: opt -O3 -S < %s | FileCheck %s
@@ -7,12 +8,21 @@
 
 target datalayout = "e-p:64:64"
 
-; Make sure that optimizations do not optimize inrange GEP.
+; Make sure that optimizations do not lose inrange information.
 
 @vtable = constant { [3 x ptr] } { [3 x ptr] [ptr null, ptr null, ptr null] }
 
 define void @foo(ptr %p) {
-  ;CHECK: store ptr getelementptr {{.*}} ({ [3 x ptr] }, ptr @vtable, i{{.*}} 0, inrange i32 0, i{{.*}} 3), ptr %p
-  store ptr getelementptr ({ [3 x ptr] }, ptr @vtable, i32 0, inrange i32 0, i32 3), ptr %p
+; O0-LABEL: define void @foo(
+; O0-SAME: ptr [[P:%.*]]) {
+; O0-NEXT:    store ptr getelementptr inrange(-24, 0) ({ [3 x ptr], [3 x ptr] }, ptr @vtable, i32 0, i32 0, i32 3), ptr [[P]], align 8
+; O0-NEXT:    ret void
+;
+; CHECK-LABEL: define void @foo(
+; CHECK-SAME: ptr nocapture writeonly [[P:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    store ptr getelementptr inbounds inrange(-24, 0) ({ [3 x ptr] }, ptr @vtable, i64 1, i32 0, i64 0), ptr [[P]], align 8
+; CHECK-NEXT:    ret void
+;
+  store ptr getelementptr inrange(-24, 0) ({ [3 x ptr], [3 x ptr] }, ptr @vtable, i32 0, i32 0, i32 3), ptr %p
   ret void
 }
diff --git a/llvm/test/TableGen/MacroFusion.td b/llvm/test/TableGen/MacroFusion.td
index ce76e7f0f7fa64..b54b0506c56359 100644
--- a/llvm/test/TableGen/MacroFusion.td
+++ b/llvm/test/TableGen/MacroFusion.td
@@ -33,6 +33,8 @@ let Namespace = "Test" in {
 
 def Inst0 : TestInst<0>;
 def Inst1 : TestInst<1>;
+let isCommutable = true in
+def Inst2 : TestInst<2>;
 
 def BothFusionPredicate: BothFusionPredicateWithMCInstPredicate<CheckRegOperand<0, X0>>;
 def TestBothFusionPredicate: Fusion<"test-both-fusion-predicate", "HasBothFusionPredicate",
@@ -42,16 +44,32 @@ def TestBothFusionPredicate: Fusion<"test-both-fusion-predicate", "HasBothFusion
 def TestFusion: SimpleFusion<"test-fusion", "HasTestFusion", "Test Fusion",
                              CheckOpcode<[Inst0]>,
                              CheckAll<[
-                              CheckOpcode<[Inst1]>,
-                              CheckRegOperand<0, X0>
+                               CheckOpcode<[Inst1]>,
+                               CheckRegOperand<0, X0>
                              ]>>;
 
+let IsCommutable = 1 in
+def TestCommutableFusion: SimpleFusion<"test-commutable-fusion", "HasTestCommutableFusion",
+                                       "Test Commutable Fusion",
+                                       CheckOpcode<[Inst0]>,
+                                       CheckAll<[
+                                         CheckOpcode<[Inst1]>,
+                                         CheckRegOperand<0, X0>
+                                       ]>>;
+
+def TestSingleFusion: SingleFusion<"test-single-fusion", "HasTestSingleFusion",
+                                   "Test SingleFusion",
+                                   Inst0, Inst2,
+                                   secondInstPred=CheckRegOperand<0, X0>>;
+
 // CHECK-PREDICATOR:       #ifdef GET_Test_MACRO_FUSION_PRED_DECL
 // CHECK-PREDICATOR-NEXT:  #undef GET_Test_MACRO_FUSION_PRED_DECL
 // CHECK-PREDICATOR-EMPTY:
 // CHECK-PREDICATOR-NEXT:  namespace llvm {
 // CHECK-PREDICATOR-NEXT:  bool isTestBothFusionPredicate(const TargetInstrInfo &, const TargetSubtargetInfo &, const MachineInstr *, const MachineInstr &);
+// CHECK-PREDICATOR-NEXT:  bool isTestCommutableFusion(const TargetInstrInfo &, const TargetSubtargetInfo &, const MachineInstr *, const MachineInstr &);   
 // CHECK-PREDICATOR-NEXT:  bool isTestFusion(const TargetInstrInfo &, const TargetSubtargetInfo &, const MachineInstr *, const MachineInstr &);
+// CHECK-PREDICATOR-NEXT:  bool isTestSingleFusion(const TargetInstrInfo &, const TargetSubtargetInfo &, const MachineInstr *, const MachineInstr &);
 // CHECK-PREDICATOR-NEXT:  } // end namespace llvm
 // CHECK-PREDICATOR-EMPTY:
 // CHECK-PREDICATOR-NEXT:  #endif
@@ -78,7 +96,7 @@ def TestFusion: SimpleFusion<"test-fusion", "HasTestFusion", "Test Fusion",
 // CHECK-PREDICATOR-NEXT:    }
 // CHECK-PREDICATOR-NEXT:    return true;
 // CHECK-PREDICATOR-NEXT:  }
-// CHECK-PREDICATOR-NEXT:  bool isTestFusion(
+// CHECK-PREDICATOR-NEXT:  bool isTestCommutableFusion(
 // CHECK-PREDICATOR-NEXT:      const TargetInstrInfo &TII,
 // CHECK-PREDICATOR-NEXT:      const TargetSubtargetInfo &STI,
 // CHECK-PREDICATOR-NEXT:      const MachineInstr *FirstMI,
@@ -99,14 +117,58 @@ def TestFusion: SimpleFusion<"test-fusion", "HasTestFusion", "Test Fusion",
 // CHECK-PREDICATOR-NEXT:      if (( MI->getOpcode() != Test::Inst0 ))
 // CHECK-PREDICATOR-NEXT:        return false;
 // CHECK-PREDICATOR-NEXT:    }
+// CHECK-PREDICATOR-NEXT:    if (!SecondMI.getOperand(0).getReg().isVirtual()) {
+// CHECK-PREDICATOR-NEXT:      if (SecondMI.getOperand(0).getReg() != SecondMI.getOperand(1).getReg()) {
+// CHECK-PREDICATOR-NEXT:        if (!SecondMI.getDesc().isCommutable())
+// CHECK-PREDICATOR-NEXT:          return false;
+// CHECK-PREDICATOR-NEXT:        unsigned SrcOpIdx1 = 1, SrcOpIdx2 = TargetInstrInfo::CommuteAnyOperandIndex;
+// CHECK-PREDICATOR-NEXT:        if (TII.findCommutedOpIndices(SecondMI, SrcOpIdx1, SrcOpIdx2))
+// CHECK-PREDICATOR-NEXT:          if (SecondMI.getOperand(0).getReg() != SecondMI.getOperand(SrcOpIdx2).getReg())
+// CHECK-PREDICATOR-NEXT:            return false;
+// CHECK-PREDICATOR-NEXT:      }
+// CHECK-PREDICATOR-NEXT:    }
+// CHECK-PREDICATOR-NEXT:    {
+// CHECK-PREDICATOR-NEXT:      Register FirstDest = FirstMI->getOperand(0).getReg();
+// CHECK-PREDICATOR-NEXT:      if (FirstDest.isVirtual() && !MRI.hasOneNonDBGUse(FirstDest))
+// CHECK-PREDICATOR-NEXT:        return false;
+// CHECK-PREDICATOR-NEXT:    }
+// CHECK-PREDICATOR-NEXT:    if (!(FirstMI->getOperand(0).isReg() &&
+// CHECK-PREDICATOR-NEXT:          SecondMI.getOperand(1).isReg() &&
+// CHECK-PREDICATOR-NEXT:          FirstMI->getOperand(0).getReg() == SecondMI.getOperand(1).getReg())) {
+// CHECK-PREDICATOR-NEXT:      if (!SecondMI.getDesc().isCommutable())
+// CHECK-PREDICATOR-NEXT:        return false;
+// CHECK-PREDICATOR-NEXT:      unsigned SrcOpIdx1 = 1, SrcOpIdx2 = TargetInstrInfo::CommuteAnyOperandIndex;
+// CHECK-PREDICATOR-NEXT:      if (TII.findCommutedOpIndices(SecondMI, SrcOpIdx1, SrcOpIdx2))
+// CHECK-PREDICATOR-NEXT:        if (FirstMI->getOperand(0).getReg() != SecondMI.getOperand(SrcOpIdx2).getReg())
+// CHECK-PREDICATOR-NEXT:          return false;
+// CHECK-PREDICATOR-NEXT:    }
+// CHECK-PREDICATOR-NEXT:    return true;
+// CHECK-PREDICATOR-NEXT:  }
+// CHECK-PREDICATOR-NEXT:  bool isTestFusion(
+// CHECK-PREDICATOR-NEXT:      const TargetInstrInfo &TII,
+// CHECK-PREDICATOR-NEXT:      const TargetSubtargetInfo &STI,
+// CHECK-PREDICATOR-NEXT:      const MachineInstr *FirstMI,
+// CHECK-PREDICATOR-NEXT:      const MachineInstr &SecondMI) {
+// CHECK-PREDICATOR-NEXT:    auto &MRI = SecondMI.getMF()->getRegInfo();
 // CHECK-PREDICATOR-NEXT:    {
 // CHECK-PREDICATOR-NEXT:      const MachineInstr *MI = &SecondMI;
 // CHECK-PREDICATOR-NEXT:      if (!(
-// CHECK-PREDICATOR-NEXT:          MI->getOperand(0).getReg().isVirtual()
-// CHECK-PREDICATOR-NEXT:          || MI->getOperand(0).getReg() == MI->getOperand(1).getReg()
+// CHECK-PREDICATOR-NEXT:          ( MI->getOpcode() == Test::Inst1 )
+// CHECK-PREDICATOR-NEXT:          && MI->getOperand(0).getReg() == Test::X0
 // CHECK-PREDICATOR-NEXT:        ))
 // CHECK-PREDICATOR-NEXT:        return false;
 // CHECK-PREDICATOR-NEXT:    }
+// CHECK-PREDICATOR-NEXT:    if (!FirstMI)
+// CHECK-PREDICATOR-NEXT:      return true;
+// CHECK-PREDICATOR-NEXT:    {
+// CHECK-PREDICATOR-NEXT:      const MachineInstr *MI = FirstMI;
+// CHECK-PREDICATOR-NEXT:      if (( MI->getOpcode() != Test::Inst0 ))
+// CHECK-PREDICATOR-NEXT:        return false;
+// CHECK-PREDICATOR-NEXT:    }
+// CHECK-PREDICATOR-NEXT:    if (!SecondMI.getOperand(0).getReg().isVirtual()) {
+// CHECK-PREDICATOR-NEXT:      if (SecondMI.getOperand(0).getReg() != SecondMI.getOperand(1).getReg())
+// CHECK-PREDICATOR-NEXT:        return false;
+// CHECK-PREDICATOR-NEXT:    }
 // CHECK-PREDICATOR-NEXT:    {
 // CHECK-PREDICATOR-NEXT:      Register FirstDest = FirstMI->getOperand(0).getReg();
 // CHECK-PREDICATOR-NEXT:      if (FirstDest.isVirtual() && !MRI.hasOneNonDBGUse(FirstDest))
@@ -118,12 +180,66 @@ def TestFusion: SimpleFusion<"test-fusion", "HasTestFusion", "Test Fusion",
 // CHECK-PREDICATOR-NEXT:      return false;
 // CHECK-PREDICATOR-NEXT:    return true;
 // CHECK-PREDICATOR-NEXT:  }
+// CHECK-PREDICATOR-NEXT:  bool isTestSingleFusion(
+// CHECK-PREDICATOR-NEXT:      const TargetInstrInfo &TII,
+// CHECK-PREDICATOR-NEXT:      const TargetSubtargetInfo &STI,
+// CHECK-PREDICATOR-NEXT:      const MachineInstr *FirstMI,
+// CHECK-PREDICATOR-NEXT:      const MachineInstr &SecondMI) {
+// CHECK-PREDICATOR-NEXT:    auto &MRI = SecondMI.getMF()->getRegInfo();
+// CHECK-PREDICATOR-NEXT:    {
+// CHECK-PREDICATOR-NEXT:      const MachineInstr *MI = &SecondMI;
+// CHECK-PREDICATOR-NEXT:      if (!(
+// CHECK-PREDICATOR-NEXT:          ( MI->getOpcode() == Test::Inst2 )
+// CHECK-PREDICATOR-NEXT:          && MI->getOperand(0).getReg() == Test::X0
+// CHECK-PREDICATOR-NEXT:        ))
+// CHECK-PREDICATOR-NEXT:        return false;
+// CHECK-PREDICATOR-NEXT:    }
+// CHECK-PREDICATOR-NEXT:    if (!FirstMI)
+// CHECK-PREDICATOR-NEXT:      return true;
+// CHECK-PREDICATOR-NEXT:    {
+// CHECK-PREDICATOR-NEXT:      const MachineInstr *MI = FirstMI;
+// CHECK-PREDICATOR-NEXT:      if (!(
+// CHECK-PREDICATOR-NEXT:          ( MI->getOpcode() == Test::Inst0 )
+// CHECK-PREDICATOR-NEXT:          && true
+// CHECK-PREDICATOR-NEXT:        ))
+// CHECK-PREDICATOR-NEXT:        return false;
+// CHECK-PREDICATOR-NEXT:    }
+// CHECK-PREDICATOR-NEXT:    if (!SecondMI.getOperand(0).getReg().isVirtual()) {
+// CHECK-PREDICATOR-NEXT:      if (SecondMI.getOperand(0).getReg() != SecondMI.getOperand(1).getReg()) {
+// CHECK-PREDICATOR-NEXT:        if (!SecondMI.getDesc().isCommutable())
+// CHECK-PREDICATOR-NEXT:          return false;
+// CHECK-PREDICATOR-NEXT:        unsigned SrcOpIdx1 = 1, SrcOpIdx2 = TargetInstrInfo::CommuteAnyOperandIndex;
+// CHECK-PREDICATOR-NEXT:        if (TII.findCommutedOpIndices(SecondMI, SrcOpIdx1, SrcOpIdx2))
+// CHECK-PREDICATOR-NEXT:          if (SecondMI.getOperand(0).getReg() != SecondMI.getOperand(SrcOpIdx2).getReg())
+// CHECK-PREDICATOR-NEXT:            return false;
+// CHECK-PREDICATOR-NEXT:      }
+// CHECK-PREDICATOR-NEXT:    }
+// CHECK-PREDICATOR-NEXT:    {
+// CHECK-PREDICATOR-NEXT:      Register FirstDest = FirstMI->getOperand(0).getReg();
+// CHECK-PREDICATOR-NEXT:      if (FirstDest.isVirtual() && !MRI.hasOneNonDBGUse(FirstDest))
+// CHECK-PREDICATOR-NEXT:        return false;
+// CHECK-PREDICATOR-NEXT:    }
+// CHECK-PREDICATOR-NEXT:    if (!(FirstMI->getOperand(0).isReg() &&
+// CHECK-PREDICATOR-NEXT:          SecondMI.getOperand(1).isReg() &&
+// CHECK-PREDICATOR-NEXT:          FirstMI->getOperand(0).getReg() == SecondMI.getOperand(1).getReg())) {
+// CHECK-PREDICATOR-NEXT:      if (!SecondMI.getDesc().isCommutable())
+// CHECK-PREDICATOR-NEXT:        return false;
+// CHECK-PREDICATOR-NEXT:      unsigned SrcOpIdx1 = 1, SrcOpIdx2 = TargetInstrInfo::CommuteAnyOperandIndex;
+// CHECK-PREDICATOR-NEXT:      if (TII.findCommutedOpIndices(SecondMI, SrcOpIdx1, SrcOpIdx2))
+// CHECK-PREDICATOR-NEXT:        if (FirstMI->getOperand(0).getReg() != SecondMI.getOperand(SrcOpIdx2).getReg())
+// CHECK-PREDICATOR-NEXT:          return false;
+// CHECK-PREDICATOR-NEXT:    }
+// CHECK-PREDICATOR-NEXT:    return true;
+// CHECK-PREDICATOR-NEXT:  }
 // CHECK-PREDICATOR-NEXT:  } // end namespace llvm
 // CHECK-PREDICATOR-EMPTY:
 // CHECK-PREDICATOR-NEXT:  #endif
 
 // Check that we have generated target subfeature.
+// CHECK-SUBTARGET: { "test-both-fusion-predicate", "Test BothFusionPredicate", Test::TestBothFusionPredicate
+// CHECK-SUBTARGET: { "test-commutable-fusion", "Test Commutable Fusion", Test::TestCommutableFusion
 // CHECK-SUBTARGET: { "test-fusion", "Test Fusion", Test::TestFusion
+// CHECK-SUBTARGET: { "test-single-fusion", "Test SingleFusion", Test::TestSingleFusion
 
 // Check that we have generated `getMacroFusions()` function.
 // CHECK-SUBTARGET:      std::vector<MacroFusionPredTy> getMacroFusions() const override;
@@ -131,6 +247,8 @@ def TestFusion: SimpleFusion<"test-fusion", "HasTestFusion", "Test Fusion",
 // CHECK-SUBTARGET:      std::vector<MacroFusionPredTy> TestGenSubtargetInfo::getMacroFusions() const {
 // CHECK-SUBTARGET-NEXT:   std::vector<MacroFusionPredTy> Fusions;
 // CHECK-SUBTARGET-NEXT:   if (hasFeature(Test::TestBothFusionPredicate)) Fusions.push_back(llvm::isTestBothFusionPredicate); 
+// CHECK-SUBTARGET-NEXT:   if (hasFeature(Test::TestCommutableFusion)) Fusions.push_back(llvm::isTestCommutableFusion); 
 // CHECK-SUBTARGET-NEXT:   if (hasFeature(Test::TestFusion)) Fusions.push_back(llvm::isTestFusion);
+// CHECK-SUBTARGET-NEXT:   if (hasFeature(Test::TestSingleFusion)) Fusions.push_back(llvm::isTestSingleFusion);
 // CHECK-SUBTARGET-NEXT:   return Fusions;
 // CHECK-SUBTARGET-NEXT: }
diff --git a/llvm/test/TableGen/x86-fold-tables.inc b/llvm/test/TableGen/x86-fold-tables.inc
index eea4f87cae9cec..7b65e483c39d0d 100644
--- a/llvm/test/TableGen/x86-fold-tables.inc
+++ b/llvm/test/TableGen/x86-fold-tables.inc
@@ -1937,8 +1937,11 @@ static const X86FoldTableEntry Table2[] = {
   {X86::BLENDVPDrr0, X86::BLENDVPDrm0, TB_ALIGN_16},
   {X86::BLENDVPSrr0, X86::BLENDVPSrm0, TB_ALIGN_16},
   {X86::CMOV16rr, X86::CMOV16rm, 0},
+  {X86::CMOV16rr_ND, X86::CMOV16rm_ND, 0},
   {X86::CMOV32rr, X86::CMOV32rm, 0},
+  {X86::CMOV32rr_ND, X86::CMOV32rm_ND, 0},
   {X86::CMOV64rr, X86::CMOV64rm, 0},
+  {X86::CMOV64rr_ND, X86::CMOV64rm_ND, 0},
   {X86::CMPPDrri, X86::CMPPDrmi, TB_ALIGN_16},
   {X86::CMPPSrri, X86::CMPPSrmi, TB_ALIGN_16},
   {X86::CMPSDrri, X86::CMPSDrmi, 0},
diff --git a/llvm/test/Transforms/Attributor/nofpclass-implied-by-fcmp.ll b/llvm/test/Transforms/Attributor/nofpclass-implied-by-fcmp.ll
index d64f8cf3e56dc1..05c57052e674d7 100644
--- a/llvm/test/Transforms/Attributor/nofpclass-implied-by-fcmp.ll
+++ b/llvm/test/Transforms/Attributor/nofpclass-implied-by-fcmp.ll
@@ -2641,8 +2641,8 @@ define float @assume_false_smallest_normal(float %arg) {
 }
 
 define float @clamp_false_nan(float %arg) {
-; CHECK-LABEL: define nofpclass(nan inf nzero sub norm) float @clamp_false_nan(
-; CHECK-SAME: float returned nofpclass(nan inf nzero sub norm) [[ARG:%.*]]) #[[ATTR2]] {
+; CHECK-LABEL: define float @clamp_false_nan(
+; CHECK-SAME: float returned [[ARG:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    ret float [[ARG]]
 ;
   %fcmp = fcmp false float %arg, 0x7FF8000000000000
@@ -2784,12 +2784,12 @@ define float @clamp_true_smallest_normal_0.0(float %arg) {
 }
 
 define float @clamp_true_nan(float %arg) {
-; CHECK-LABEL: define noundef nofpclass(nan inf nzero sub norm) float @clamp_true_nan(
-; CHECK-SAME: float nofpclass(nan inf nzero sub norm) [[ARG:%.*]]) #[[ATTR2]] {
-; CHECK-NEXT:    ret float 0.000000e+00
+; CHECK-LABEL: define float @clamp_true_nan(
+; CHECK-SAME: float returned [[ARG:%.*]]) #[[ATTR2]] {
+; CHECK-NEXT:    ret float [[ARG]]
 ;
   %fcmp = fcmp true float %arg, 0x7FF8000000000000
-  %select = select i1 %fcmp, float 0.0, float %arg
+  %select = select i1 %fcmp, float %arg, float 0.0
   ret float %select
 }
 
diff --git a/llvm/test/Transforms/CodeGenPrepare/AArch64/fpclass-test.ll b/llvm/test/Transforms/CodeGenPrepare/AArch64/fpclass-test.ll
new file mode 100644
index 00000000000000..63ab22e96ad2ad
--- /dev/null
+++ b/llvm/test/Transforms/CodeGenPrepare/AArch64/fpclass-test.ll
@@ -0,0 +1,134 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -codegenprepare -S -mtriple=aarch64 < %s | FileCheck %s
+
+define i1 @test_is_inf_or_nan(double %arg) {
+; CHECK-LABEL: define i1 @test_is_inf_or_nan(
+; CHECK-SAME: double [[ARG:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i1 @llvm.is.fpclass.f64(double [[ARG]], i32 519)
+; CHECK-NEXT:    ret i1 [[TMP1]]
+;
+  %abs = tail call double @llvm.fabs.f64(double %arg)
+  %ret = fcmp ueq double %abs, 0x7FF0000000000000
+  ret i1 %ret
+}
+
+define i1 @test_is_not_inf_or_nan(double %arg) {
+; CHECK-LABEL: define i1 @test_is_not_inf_or_nan(
+; CHECK-SAME: double [[ARG:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i1 @llvm.is.fpclass.f64(double [[ARG]], i32 504)
+; CHECK-NEXT:    ret i1 [[TMP1]]
+;
+  %abs = tail call double @llvm.fabs.f64(double %arg)
+  %ret = fcmp one double %abs, 0x7FF0000000000000
+  ret i1 %ret
+}
+
+define i1 @test_is_inf(double %arg) {
+; CHECK-LABEL: define i1 @test_is_inf(
+; CHECK-SAME: double [[ARG:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i1 @llvm.is.fpclass.f64(double [[ARG]], i32 516)
+; CHECK-NEXT:    ret i1 [[TMP1]]
+;
+  %abs = tail call double @llvm.fabs.f64(double %arg)
+  %ret = fcmp oeq double %abs, 0x7FF0000000000000
+  ret i1 %ret
+}
+
+define i1 @test_is_not_inf(double %arg) {
+; CHECK-LABEL: define i1 @test_is_not_inf(
+; CHECK-SAME: double [[ARG:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i1 @llvm.is.fpclass.f64(double [[ARG]], i32 507)
+; CHECK-NEXT:    ret i1 [[TMP1]]
+;
+  %abs = tail call double @llvm.fabs.f64(double %arg)
+  %ret = fcmp une double %abs, 0x7FF0000000000000
+  ret i1 %ret
+}
+
+define <vscale x 2 x i1> @test_vec_is_inf_or_nan(<vscale x 2 x double> %arg) {
+; CHECK-LABEL: define <vscale x 2 x i1> @test_vec_is_inf_or_nan(
+; CHECK-SAME: <vscale x 2 x double> [[ARG:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 2 x i1> @llvm.is.fpclass.nxv2f64(<vscale x 2 x double> [[ARG]], i32 519)
+; CHECK-NEXT:    ret <vscale x 2 x i1> [[TMP1]]
+;
+  %abs = tail call <vscale x 2 x double> @llvm.fabs.nxv2f64(<vscale x 2 x double> %arg)
+  %ret = fcmp ueq <vscale x 2 x double> %abs, splat (double 0x7FF0000000000000)
+  ret <vscale x 2 x i1> %ret
+}
+
+define <vscale x 2 x i1> @test_vec_is_not_inf_or_nan(<vscale x 2 x double> %arg) {
+; CHECK-LABEL: define <vscale x 2 x i1> @test_vec_is_not_inf_or_nan(
+; CHECK-SAME: <vscale x 2 x double> [[ARG:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 2 x i1> @llvm.is.fpclass.nxv2f64(<vscale x 2 x double> [[ARG]], i32 504)
+; CHECK-NEXT:    ret <vscale x 2 x i1> [[TMP1]]
+;
+  %abs = tail call <vscale x 2 x double> @llvm.fabs.nxv2f64(<vscale x 2 x double> %arg)
+  %ret = fcmp one <vscale x 2 x double> %abs, splat (double 0x7FF0000000000000)
+  ret <vscale x 2 x i1> %ret
+}
+
+define <vscale x 2 x i1> @test_vec_is_inf(<vscale x 2 x double> %arg) {
+; CHECK-LABEL: define <vscale x 2 x i1> @test_vec_is_inf(
+; CHECK-SAME: <vscale x 2 x double> [[ARG:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 2 x i1> @llvm.is.fpclass.nxv2f64(<vscale x 2 x double> [[ARG]], i32 516)
+; CHECK-NEXT:    ret <vscale x 2 x i1> [[TMP1]]
+;
+  %abs = tail call <vscale x 2 x double> @llvm.fabs.nxv2f64(<vscale x 2 x double> %arg)
+  %ret = fcmp oeq <vscale x 2 x double> %abs, splat (double 0x7FF0000000000000)
+  ret <vscale x 2 x i1> %ret
+}
+
+define <vscale x 2 x i1> @test_vec_is_not_inf(<vscale x 2 x double> %arg) {
+; CHECK-LABEL: define <vscale x 2 x i1> @test_vec_is_not_inf(
+; CHECK-SAME: <vscale x 2 x double> [[ARG:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 2 x i1> @llvm.is.fpclass.nxv2f64(<vscale x 2 x double> [[ARG]], i32 507)
+; CHECK-NEXT:    ret <vscale x 2 x i1> [[TMP1]]
+;
+  %abs = tail call <vscale x 2 x double> @llvm.fabs.nxv2f64(<vscale x 2 x double> %arg)
+  %ret = fcmp une <vscale x 2 x double> %abs, splat (double 0x7FF0000000000000)
+  ret <vscale x 2 x i1> %ret
+}
+
+define i1 @test_fp128_is_inf_or_nan(fp128 %arg) {
+; CHECK-LABEL: define i1 @test_fp128_is_inf_or_nan(
+; CHECK-SAME: fp128 [[ARG:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i1 @llvm.is.fpclass.f128(fp128 [[ARG]], i32 519)
+; CHECK-NEXT:    ret i1 [[TMP1]]
+;
+  %abs = tail call fp128 @llvm.fabs.f128(fp128 %arg)
+  %ret = fcmp ueq fp128 %abs, 0xL00000000000000007FFF000000000000
+  ret i1 %ret
+}
+
+define i1 @test_fp128_is_not_inf_or_nan(fp128 %arg) {
+; CHECK-LABEL: define i1 @test_fp128_is_not_inf_or_nan(
+; CHECK-SAME: fp128 [[ARG:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i1 @llvm.is.fpclass.f128(fp128 [[ARG]], i32 504)
+; CHECK-NEXT:    ret i1 [[TMP1]]
+;
+  %abs = tail call fp128 @llvm.fabs.f128(fp128 %arg)
+  %ret = fcmp one fp128 %abs, 0xL00000000000000007FFF000000000000
+  ret i1 %ret
+}
+
+define i1 @test_fp128_is_inf(fp128 %arg) {
+; CHECK-LABEL: define i1 @test_fp128_is_inf(
+; CHECK-SAME: fp128 [[ARG:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i1 @llvm.is.fpclass.f128(fp128 [[ARG]], i32 516)
+; CHECK-NEXT:    ret i1 [[TMP1]]
+;
+  %abs = tail call fp128 @llvm.fabs.f128(fp128 %arg)
+  %ret = fcmp oeq fp128 %abs, 0xL00000000000000007FFF000000000000
+  ret i1 %ret
+}
+
+define i1 @test_fp128_is_not_inf(fp128 %arg) {
+; CHECK-LABEL: define i1 @test_fp128_is_not_inf(
+; CHECK-SAME: fp128 [[ARG:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i1 @llvm.is.fpclass.f128(fp128 [[ARG]], i32 507)
+; CHECK-NEXT:    ret i1 [[TMP1]]
+;
+  %abs = tail call fp128 @llvm.fabs.f128(fp128 %arg)
+  %ret = fcmp une fp128 %abs, 0xL00000000000000007FFF000000000000
+  ret i1 %ret
+}
diff --git a/llvm/test/Transforms/CodeGenPrepare/RISCV/fpclass-test.ll b/llvm/test/Transforms/CodeGenPrepare/RISCV/fpclass-test.ll
new file mode 100644
index 00000000000000..7c00218bdcce3d
--- /dev/null
+++ b/llvm/test/Transforms/CodeGenPrepare/RISCV/fpclass-test.ll
@@ -0,0 +1,134 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -codegenprepare -S -mtriple=riscv64 < %s | FileCheck %s
+
+define i1 @test_is_inf_or_nan(double %arg) {
+; CHECK-LABEL: define i1 @test_is_inf_or_nan(
+; CHECK-SAME: double [[ARG:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i1 @llvm.is.fpclass.f64(double [[ARG]], i32 519)
+; CHECK-NEXT:    ret i1 [[TMP1]]
+;
+  %abs = tail call double @llvm.fabs.f64(double %arg)
+  %ret = fcmp ueq double %abs, 0x7FF0000000000000
+  ret i1 %ret
+}
+
+define i1 @test_is_not_inf_or_nan(double %arg) {
+; CHECK-LABEL: define i1 @test_is_not_inf_or_nan(
+; CHECK-SAME: double [[ARG:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i1 @llvm.is.fpclass.f64(double [[ARG]], i32 504)
+; CHECK-NEXT:    ret i1 [[TMP1]]
+;
+  %abs = tail call double @llvm.fabs.f64(double %arg)
+  %ret = fcmp one double %abs, 0x7FF0000000000000
+  ret i1 %ret
+}
+
+define i1 @test_is_inf(double %arg) {
+; CHECK-LABEL: define i1 @test_is_inf(
+; CHECK-SAME: double [[ARG:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i1 @llvm.is.fpclass.f64(double [[ARG]], i32 516)
+; CHECK-NEXT:    ret i1 [[TMP1]]
+;
+  %abs = tail call double @llvm.fabs.f64(double %arg)
+  %ret = fcmp oeq double %abs, 0x7FF0000000000000
+  ret i1 %ret
+}
+
+define i1 @test_is_not_inf(double %arg) {
+; CHECK-LABEL: define i1 @test_is_not_inf(
+; CHECK-SAME: double [[ARG:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i1 @llvm.is.fpclass.f64(double [[ARG]], i32 507)
+; CHECK-NEXT:    ret i1 [[TMP1]]
+;
+  %abs = tail call double @llvm.fabs.f64(double %arg)
+  %ret = fcmp une double %abs, 0x7FF0000000000000
+  ret i1 %ret
+}
+
+define <vscale x 4 x i1> @test_vec_is_inf_or_nan(<vscale x 4 x double> %arg) {
+; CHECK-LABEL: define <vscale x 4 x i1> @test_vec_is_inf_or_nan(
+; CHECK-SAME: <vscale x 4 x double> [[ARG:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 4 x i1> @llvm.is.fpclass.nxv4f64(<vscale x 4 x double> [[ARG]], i32 519)
+; CHECK-NEXT:    ret <vscale x 4 x i1> [[TMP1]]
+;
+  %abs = tail call <vscale x 4 x double> @llvm.fabs.nxv4f64(<vscale x 4 x double> %arg)
+  %ret = fcmp ueq <vscale x 4 x double> %abs, splat (double 0x7FF0000000000000)
+  ret <vscale x 4 x i1> %ret
+}
+
+define <vscale x 4 x i1> @test_vec_is_not_inf_or_nan(<vscale x 4 x double> %arg) {
+; CHECK-LABEL: define <vscale x 4 x i1> @test_vec_is_not_inf_or_nan(
+; CHECK-SAME: <vscale x 4 x double> [[ARG:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 4 x i1> @llvm.is.fpclass.nxv4f64(<vscale x 4 x double> [[ARG]], i32 504)
+; CHECK-NEXT:    ret <vscale x 4 x i1> [[TMP1]]
+;
+  %abs = tail call <vscale x 4 x double> @llvm.fabs.nxv4f64(<vscale x 4 x double> %arg)
+  %ret = fcmp one <vscale x 4 x double> %abs, splat (double 0x7FF0000000000000)
+  ret <vscale x 4 x i1> %ret
+}
+
+define <vscale x 4 x i1> @test_vec_is_inf(<vscale x 4 x double> %arg) {
+; CHECK-LABEL: define <vscale x 4 x i1> @test_vec_is_inf(
+; CHECK-SAME: <vscale x 4 x double> [[ARG:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 4 x i1> @llvm.is.fpclass.nxv4f64(<vscale x 4 x double> [[ARG]], i32 516)
+; CHECK-NEXT:    ret <vscale x 4 x i1> [[TMP1]]
+;
+  %abs = tail call <vscale x 4 x double> @llvm.fabs.nxv4f64(<vscale x 4 x double> %arg)
+  %ret = fcmp oeq <vscale x 4 x double> %abs, splat (double 0x7FF0000000000000)
+  ret <vscale x 4 x i1> %ret
+}
+
+define <vscale x 4 x i1> @test_vec_is_not_inf(<vscale x 4 x double> %arg) {
+; CHECK-LABEL: define <vscale x 4 x i1> @test_vec_is_not_inf(
+; CHECK-SAME: <vscale x 4 x double> [[ARG:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 4 x i1> @llvm.is.fpclass.nxv4f64(<vscale x 4 x double> [[ARG]], i32 507)
+; CHECK-NEXT:    ret <vscale x 4 x i1> [[TMP1]]
+;
+  %abs = tail call <vscale x 4 x double> @llvm.fabs.nxv4f64(<vscale x 4 x double> %arg)
+  %ret = fcmp une <vscale x 4 x double> %abs, splat (double 0x7FF0000000000000)
+  ret <vscale x 4 x i1> %ret
+}
+
+define i1 @test_fp128_is_inf_or_nan(fp128 %arg) {
+; CHECK-LABEL: define i1 @test_fp128_is_inf_or_nan(
+; CHECK-SAME: fp128 [[ARG:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i1 @llvm.is.fpclass.f128(fp128 [[ARG]], i32 519)
+; CHECK-NEXT:    ret i1 [[TMP1]]
+;
+  %abs = tail call fp128 @llvm.fabs.f128(fp128 %arg)
+  %ret = fcmp ueq fp128 %abs, 0xL00000000000000007FFF000000000000
+  ret i1 %ret
+}
+
+define i1 @test_fp128_is_not_inf_or_nan(fp128 %arg) {
+; CHECK-LABEL: define i1 @test_fp128_is_not_inf_or_nan(
+; CHECK-SAME: fp128 [[ARG:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i1 @llvm.is.fpclass.f128(fp128 [[ARG]], i32 504)
+; CHECK-NEXT:    ret i1 [[TMP1]]
+;
+  %abs = tail call fp128 @llvm.fabs.f128(fp128 %arg)
+  %ret = fcmp one fp128 %abs, 0xL00000000000000007FFF000000000000
+  ret i1 %ret
+}
+
+define i1 @test_fp128_is_inf(fp128 %arg) {
+; CHECK-LABEL: define i1 @test_fp128_is_inf(
+; CHECK-SAME: fp128 [[ARG:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i1 @llvm.is.fpclass.f128(fp128 [[ARG]], i32 516)
+; CHECK-NEXT:    ret i1 [[TMP1]]
+;
+  %abs = tail call fp128 @llvm.fabs.f128(fp128 %arg)
+  %ret = fcmp oeq fp128 %abs, 0xL00000000000000007FFF000000000000
+  ret i1 %ret
+}
+
+define i1 @test_fp128_is_not_inf(fp128 %arg) {
+; CHECK-LABEL: define i1 @test_fp128_is_not_inf(
+; CHECK-SAME: fp128 [[ARG:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i1 @llvm.is.fpclass.f128(fp128 [[ARG]], i32 507)
+; CHECK-NEXT:    ret i1 [[TMP1]]
+;
+  %abs = tail call fp128 @llvm.fabs.f128(fp128 %arg)
+  %ret = fcmp une fp128 %abs, 0xL00000000000000007FFF000000000000
+  ret i1 %ret
+}
diff --git a/llvm/test/Transforms/CodeGenPrepare/X86/fpclass-test.ll b/llvm/test/Transforms/CodeGenPrepare/X86/fpclass-test.ll
new file mode 100644
index 00000000000000..525caeb3e79a10
--- /dev/null
+++ b/llvm/test/Transforms/CodeGenPrepare/X86/fpclass-test.ll
@@ -0,0 +1,178 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -codegenprepare -S -mtriple=x86_64-unknown-unknown < %s | FileCheck %s
+
+define i1 @test_is_inf_or_nan(double %arg) {
+; CHECK-LABEL: define i1 @test_is_inf_or_nan(
+; CHECK-SAME: double [[ARG:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i1 @llvm.is.fpclass.f64(double [[ARG]], i32 519)
+; CHECK-NEXT:    ret i1 [[TMP1]]
+;
+  %abs = tail call double @llvm.fabs.f64(double %arg)
+  %ret = fcmp ueq double %abs, 0x7FF0000000000000
+  ret i1 %ret
+}
+
+define i1 @test_is_not_inf_or_nan(double %arg) {
+; CHECK-LABEL: define i1 @test_is_not_inf_or_nan(
+; CHECK-SAME: double [[ARG:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i1 @llvm.is.fpclass.f64(double [[ARG]], i32 504)
+; CHECK-NEXT:    ret i1 [[TMP1]]
+;
+  %abs = tail call double @llvm.fabs.f64(double %arg)
+  %ret = fcmp one double %abs, 0x7FF0000000000000
+  ret i1 %ret
+}
+
+define i1 @test_is_inf(double %arg) {
+; CHECK-LABEL: define i1 @test_is_inf(
+; CHECK-SAME: double [[ARG:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i1 @llvm.is.fpclass.f64(double [[ARG]], i32 516)
+; CHECK-NEXT:    ret i1 [[TMP1]]
+;
+  %abs = tail call double @llvm.fabs.f64(double %arg)
+  %ret = fcmp oeq double %abs, 0x7FF0000000000000
+  ret i1 %ret
+}
+
+define i1 @test_is_not_inf(double %arg) {
+; CHECK-LABEL: define i1 @test_is_not_inf(
+; CHECK-SAME: double [[ARG:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i1 @llvm.is.fpclass.f64(double [[ARG]], i32 507)
+; CHECK-NEXT:    ret i1 [[TMP1]]
+;
+  %abs = tail call double @llvm.fabs.f64(double %arg)
+  %ret = fcmp une double %abs, 0x7FF0000000000000
+  ret i1 %ret
+}
+
+define <4 x i1> @test_vec_is_inf_or_nan(<4 x double> %arg) {
+; CHECK-LABEL: define <4 x i1> @test_vec_is_inf_or_nan(
+; CHECK-SAME: <4 x double> [[ARG:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.is.fpclass.v4f64(<4 x double> [[ARG]], i32 519)
+; CHECK-NEXT:    ret <4 x i1> [[TMP1]]
+;
+  %abs = tail call <4 x double> @llvm.fabs.v4f64(<4 x double> %arg)
+  %ret = fcmp ueq <4 x double> %abs, splat (double 0x7FF0000000000000)
+  ret <4 x i1> %ret
+}
+
+define <4 x i1> @test_vec_is_not_inf_or_nan(<4 x double> %arg) {
+; CHECK-LABEL: define <4 x i1> @test_vec_is_not_inf_or_nan(
+; CHECK-SAME: <4 x double> [[ARG:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.is.fpclass.v4f64(<4 x double> [[ARG]], i32 504)
+; CHECK-NEXT:    ret <4 x i1> [[TMP1]]
+;
+  %abs = tail call <4 x double> @llvm.fabs.v4f64(<4 x double> %arg)
+  %ret = fcmp one <4 x double> %abs, splat (double 0x7FF0000000000000)
+  ret <4 x i1> %ret
+}
+
+define <4 x i1> @test_vec_is_inf(<4 x double> %arg) {
+; CHECK-LABEL: define <4 x i1> @test_vec_is_inf(
+; CHECK-SAME: <4 x double> [[ARG:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.is.fpclass.v4f64(<4 x double> [[ARG]], i32 516)
+; CHECK-NEXT:    ret <4 x i1> [[TMP1]]
+;
+  %abs = tail call <4 x double> @llvm.fabs.v4f64(<4 x double> %arg)
+  %ret = fcmp oeq <4 x double> %abs, splat (double 0x7FF0000000000000)
+  ret <4 x i1> %ret
+}
+
+define <4 x i1> @test_vec_is_not_inf(<4 x double> %arg) {
+; CHECK-LABEL: define <4 x i1> @test_vec_is_not_inf(
+; CHECK-SAME: <4 x double> [[ARG:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.is.fpclass.v4f64(<4 x double> [[ARG]], i32 507)
+; CHECK-NEXT:    ret <4 x i1> [[TMP1]]
+;
+  %abs = tail call <4 x double> @llvm.fabs.v4f64(<4 x double> %arg)
+  %ret = fcmp une <4 x double> %abs, splat (double 0x7FF0000000000000)
+  ret <4 x i1> %ret
+}
+
+define i1 @test_fp128_is_inf_or_nan(fp128 %arg) {
+; CHECK-LABEL: define i1 @test_fp128_is_inf_or_nan(
+; CHECK-SAME: fp128 [[ARG:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i1 @llvm.is.fpclass.f128(fp128 [[ARG]], i32 519)
+; CHECK-NEXT:    ret i1 [[TMP1]]
+;
+  %abs = tail call fp128 @llvm.fabs.f128(fp128 %arg)
+  %ret = fcmp ueq fp128 %abs, 0xL00000000000000007FFF000000000000
+  ret i1 %ret
+}
+
+define i1 @test_fp128_is_not_inf_or_nan(fp128 %arg) {
+; CHECK-LABEL: define i1 @test_fp128_is_not_inf_or_nan(
+; CHECK-SAME: fp128 [[ARG:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i1 @llvm.is.fpclass.f128(fp128 [[ARG]], i32 504)
+; CHECK-NEXT:    ret i1 [[TMP1]]
+;
+  %abs = tail call fp128 @llvm.fabs.f128(fp128 %arg)
+  %ret = fcmp one fp128 %abs, 0xL00000000000000007FFF000000000000
+  ret i1 %ret
+}
+
+define i1 @test_fp128_is_inf(fp128 %arg) {
+; CHECK-LABEL: define i1 @test_fp128_is_inf(
+; CHECK-SAME: fp128 [[ARG:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i1 @llvm.is.fpclass.f128(fp128 [[ARG]], i32 516)
+; CHECK-NEXT:    ret i1 [[TMP1]]
+;
+  %abs = tail call fp128 @llvm.fabs.f128(fp128 %arg)
+  %ret = fcmp oeq fp128 %abs, 0xL00000000000000007FFF000000000000
+  ret i1 %ret
+}
+
+define i1 @test_fp128_is_not_inf(fp128 %arg) {
+; CHECK-LABEL: define i1 @test_fp128_is_not_inf(
+; CHECK-SAME: fp128 [[ARG:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i1 @llvm.is.fpclass.f128(fp128 [[ARG]], i32 507)
+; CHECK-NEXT:    ret i1 [[TMP1]]
+;
+  %abs = tail call fp128 @llvm.fabs.f128(fp128 %arg)
+  %ret = fcmp une fp128 %abs, 0xL00000000000000007FFF000000000000
+  ret i1 %ret
+}
+
+define i1 @test_x86_fp80_is_inf_or_nan(x86_fp80 %arg) {
+; CHECK-LABEL: define i1 @test_x86_fp80_is_inf_or_nan(
+; CHECK-SAME: x86_fp80 [[ARG:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i1 @llvm.is.fpclass.f80(x86_fp80 [[ARG]], i32 519)
+; CHECK-NEXT:    ret i1 [[TMP1]]
+;
+  %abs = tail call x86_fp80 @llvm.fabs.f80(x86_fp80 %arg)
+  %ret = fcmp ueq x86_fp80 %abs, 0xK7FFF8000000000000000
+  ret i1 %ret
+}
+
+define i1 @test_x86_fp80_is_not_inf_or_nan(x86_fp80 %arg) {
+; CHECK-LABEL: define i1 @test_x86_fp80_is_not_inf_or_nan(
+; CHECK-SAME: x86_fp80 [[ARG:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i1 @llvm.is.fpclass.f80(x86_fp80 [[ARG]], i32 504)
+; CHECK-NEXT:    ret i1 [[TMP1]]
+;
+  %abs = tail call x86_fp80 @llvm.fabs.f80(x86_fp80 %arg)
+  %ret = fcmp one x86_fp80 %abs, 0xK7FFF8000000000000000
+  ret i1 %ret
+}
+
+define i1 @test_x86_fp80_is_inf(x86_fp80 %arg) {
+; CHECK-LABEL: define i1 @test_x86_fp80_is_inf(
+; CHECK-SAME: x86_fp80 [[ARG:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i1 @llvm.is.fpclass.f80(x86_fp80 [[ARG]], i32 516)
+; CHECK-NEXT:    ret i1 [[TMP1]]
+;
+  %abs = tail call x86_fp80 @llvm.fabs.f80(x86_fp80 %arg)
+  %ret = fcmp oeq x86_fp80 %abs, 0xK7FFF8000000000000000
+  ret i1 %ret
+}
+
+define i1 @test_x86_fp80_is_not_inf(x86_fp80 %arg) {
+; CHECK-LABEL: define i1 @test_x86_fp80_is_not_inf(
+; CHECK-SAME: x86_fp80 [[ARG:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i1 @llvm.is.fpclass.f80(x86_fp80 [[ARG]], i32 507)
+; CHECK-NEXT:    ret i1 [[TMP1]]
+;
+  %abs = tail call x86_fp80 @llvm.fabs.f80(x86_fp80 %arg)
+  %ret = fcmp une x86_fp80 %abs, 0xK7FFF8000000000000000
+  ret i1 %ret
+}
diff --git a/llvm/test/Transforms/ConstantHoisting/AArch64/large-immediate.ll b/llvm/test/Transforms/ConstantHoisting/AArch64/large-immediate.ll
index 196a104adc0233..6d8890d71e2be9 100644
--- a/llvm/test/Transforms/ConstantHoisting/AArch64/large-immediate.ll
+++ b/llvm/test/Transforms/ConstantHoisting/AArch64/large-immediate.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
-; RUN: opt -mtriple=arm64-darwin-unknown -S -passes=consthoist < %s | FileCheck %s
+; RUN: opt -mtriple=arm64-darwin-unknown -S -passes=consthoist < %s | FileCheck %s --check-prefixes=CHECK,CV
+; RUN: opt -mtriple=arm64-darwin-unknown -S -passes=consthoist -use-constant-int-for-fixed-length-splat -use-constant-int-for-scalable-splat < %s | FileCheck %s --check-prefixes=CHECK,CI
 
 define i128 @test1(i128 %a) {
 ; CHECK-LABEL: define i128 @test1(
@@ -122,13 +123,37 @@ define i64 @sdiv_minsize(i64 %a) minsize {
 }
 
 define <2 x i64> @sdiv_v2i64(<2 x i64> %a) {
-; CHECK-LABEL: define <2 x i64> @sdiv_v2i64(
-; CHECK-SAME: <2 x i64> [[A:%.*]]) {
-; CHECK-NEXT:    [[TMP1:%.*]] = sdiv <2 x i64> [[A]], <i64 4294967087, i64 4294967087>
-; CHECK-NEXT:    [[TMP2:%.*]] = add <2 x i64> [[TMP1]], <i64 4294967087, i64 4294967087>
-; CHECK-NEXT:    ret <2 x i64> [[TMP2]]
+; CV-LABEL: define <2 x i64> @sdiv_v2i64(
+; CV-SAME: <2 x i64> [[A:%.*]]) {
+; CV-NEXT:    [[TMP1:%.*]] = sdiv <2 x i64> [[A]], <i64 4294967087, i64 4294967087>
+; CV-NEXT:    [[TMP2:%.*]] = add <2 x i64> [[TMP1]], <i64 4294967087, i64 4294967087>
+; CV-NEXT:    ret <2 x i64> [[TMP2]]
+;
+; CI-LABEL: define <2 x i64> @sdiv_v2i64(
+; CI-SAME: <2 x i64> [[A:%.*]]) {
+; CI-NEXT:    [[TMP1:%.*]] = sdiv <2 x i64> [[A]], splat (i64 4294967087)
+; CI-NEXT:    [[TMP2:%.*]] = add <2 x i64> [[TMP1]], splat (i64 4294967087)
+; CI-NEXT:    ret <2 x i64> [[TMP2]]
 ;
   %1 = sdiv <2 x i64> %a, <i64 4294967087, i64 4294967087>
   %2 = add <2 x i64> %1, <i64 4294967087, i64 4294967087>
   ret <2 x i64> %2
 }
+
+define <vscale x 2 x i64> @sdiv_nxv2i64(<vscale x 2 x i64> %a) {
+; CV-LABEL: define <vscale x 2 x i64> @sdiv_nxv2i64(
+; CV-SAME: <vscale x 2 x i64> [[A:%.*]]) {
+; CV-NEXT:    [[TMP1:%.*]] = sdiv <vscale x 2 x i64> [[A]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 4294967087, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; CV-NEXT:    [[TMP2:%.*]] = add <vscale x 2 x i64> [[TMP1]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 4294967087, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; CV-NEXT:    ret <vscale x 2 x i64> [[TMP2]]
+;
+; CI-LABEL: define <vscale x 2 x i64> @sdiv_nxv2i64(
+; CI-SAME: <vscale x 2 x i64> [[A:%.*]]) {
+; CI-NEXT:    [[TMP1:%.*]] = sdiv <vscale x 2 x i64> [[A]], splat (i64 4294967087)
+; CI-NEXT:    [[TMP2:%.*]] = add <vscale x 2 x i64> [[TMP1]], splat (i64 4294967087)
+; CI-NEXT:    ret <vscale x 2 x i64> [[TMP2]]
+;
+  %1 = sdiv <vscale x 2 x i64> %a, splat (i64 4294967087)
+  %2 = add <vscale x 2 x i64> %1, splat (i64 4294967087)
+  ret <vscale x 2 x i64> %2
+}
diff --git a/llvm/test/Transforms/Coroutines/coro-split-musttail6.ll b/llvm/test/Transforms/Coroutines/coro-split-musttail6.ll
index 9c2b1ece1624bc..d9dba92ec4eb7e 100644
--- a/llvm/test/Transforms/Coroutines/coro-split-musttail6.ll
+++ b/llvm/test/Transforms/Coroutines/coro-split-musttail6.ll
@@ -88,7 +88,6 @@ exit:
   ret void
 }
 
-; FIXME: The fakeresume1 here should be marked as musttail.
 ; Verify that in the resume part resume call is marked with musttail.
 ; CHECK-LABEL: @f.resume(
 ; CHECK:      musttail call fastcc void @fakeresume1(ptr align 8 null)
diff --git a/llvm/test/Transforms/Coroutines/coro-split-musttail7.ll b/llvm/test/Transforms/Coroutines/coro-split-musttail7.ll
index 860032bd3cf8e5..2257d5aee473ee 100644
--- a/llvm/test/Transforms/Coroutines/coro-split-musttail7.ll
+++ b/llvm/test/Transforms/Coroutines/coro-split-musttail7.ll
@@ -88,7 +88,6 @@ exit:
   ret void
 }
 
-; FIXME: The fakeresume1 here should be marked as musttail.
 ; Verify that in the resume part resume call is marked with musttail.
 ; CHECK-LABEL: @f.resume(
 ; CHECK:         musttail call fastcc void @fakeresume1(ptr align 8 null)
diff --git a/llvm/test/Transforms/DFAJumpThreading/dfa-unfold-select.ll b/llvm/test/Transforms/DFAJumpThreading/dfa-unfold-select.ll
index df725b9a7fa47d..696bd55d2dfdd9 100644
--- a/llvm/test/Transforms/DFAJumpThreading/dfa-unfold-select.ll
+++ b/llvm/test/Transforms/DFAJumpThreading/dfa-unfold-select.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -passes=dfa-jump-threading %s | FileCheck %s
+; RUN: opt -S -passes=dfa-jump-threading -dfa-early-exit-heuristic=false %s | FileCheck %s
 
 ; These tests check if selects are unfolded properly for jump threading
 ; opportunities. There are three different patterns to consider:
diff --git a/llvm/test/Transforms/DFAJumpThreading/unpredictable-heuristic.ll b/llvm/test/Transforms/DFAJumpThreading/unpredictable-heuristic.ll
new file mode 100644
index 00000000000000..9743f0acc8165e
--- /dev/null
+++ b/llvm/test/Transforms/DFAJumpThreading/unpredictable-heuristic.ll
@@ -0,0 +1,124 @@
+; REQUIRES: asserts
+; RUN: opt -S -passes=dfa-jump-threading %s -debug-only=dfa-jump-threading 2>&1 | FileCheck %s
+
+; CHECK-COUNT-3: Exiting early due to unpredictability heuristic.
+
+@.str.1 = private unnamed_addr constant [3 x i8] c"10\00", align 1
+@.str.2 = private unnamed_addr constant [3 x i8] c"30\00", align 1
+@.str.3 = private unnamed_addr constant [3 x i8] c"20\00", align 1
+@.str.4 = private unnamed_addr constant [3 x i8] c"40\00", align 1
+
+define void @test1(i32 noundef %num, i32 noundef %num2) {
+entry:
+  br label %while.body
+
+while.body:                                       ; preds = %entry, %sw.epilog
+  %num.addr.0 = phi i32 [ %num, %entry ], [ %num.addr.1, %sw.epilog ]
+  switch i32 %num.addr.0, label %sw.default [
+    i32 10, label %sw.bb
+    i32 30, label %sw.bb1
+    i32 20, label %sw.bb2
+    i32 40, label %sw.bb3
+  ]
+
+sw.bb:                                            ; preds = %while.body
+  %call.i = tail call i32 @bar(ptr noundef nonnull @.str.1)
+  br label %sw.epilog
+
+sw.bb1:                                           ; preds = %while.body
+  %call.i4 = tail call i32 @bar(ptr noundef nonnull @.str.2)
+  br label %sw.epilog
+
+sw.bb2:                                           ; preds = %while.body
+  %call.i5 = tail call i32 @bar(ptr noundef nonnull @.str.3)
+  br label %sw.epilog
+
+sw.bb3:                                           ; preds = %while.body
+  %call.i6 = tail call i32 @bar(ptr noundef nonnull @.str.4)
+  %call = tail call noundef i32 @foo()
+  %add = add nsw i32 %call, %num2
+  br label %sw.epilog
+
+sw.default:                                       ; preds = %while.body
+  ret void
+
+sw.epilog:                                        ; preds = %sw.bb3, %sw.bb2, %sw.bb1, %sw.bb
+  %num.addr.1 = phi i32 [ %add, %sw.bb3 ], [ 40, %sw.bb2 ], [ 20, %sw.bb1 ], [ 30, %sw.bb ]
+  br label %while.body
+}
+
+
+define void @test2(i32 noundef %num, i32 noundef %num2) {
+entry:
+  br label %while.body
+
+while.body:                                       ; preds = %entry, %sw.epilog
+  %num.addr.0 = phi i32 [ %num, %entry ], [ %num.addr.1, %sw.epilog ]
+  switch i32 %num.addr.0, label %sw.default [
+    i32 10, label %sw.epilog
+    i32 30, label %sw.bb1
+    i32 20, label %sw.bb2
+    i32 40, label %sw.bb3
+  ]
+
+sw.bb1:                                           ; preds = %while.body
+  br label %sw.epilog
+
+sw.bb2:                                           ; preds = %while.body
+  br label %sw.epilog
+
+sw.bb3:                                           ; preds = %while.body
+  br label %sw.epilog
+
+sw.default:                                       ; preds = %while.body
+  ret void
+
+sw.epilog:                                        ; preds = %while.body, %sw.bb3, %sw.bb2, %sw.bb1
+  %.str.4.sink = phi ptr [ @.str.4, %sw.bb3 ], [ @.str.3, %sw.bb2 ], [ @.str.2, %sw.bb1 ], [ @.str.1, %while.body ]
+  %num.addr.1 = phi i32 [ %num2, %sw.bb3 ], [ 40, %sw.bb2 ], [ 20, %sw.bb1 ], [ 30, %while.body ]
+  %call.i6 = tail call i32 @bar(ptr noundef nonnull %.str.4.sink)
+  br label %while.body
+}
+
+
+define void @test3(i32 noundef %num, i32 noundef %num2) {
+entry:
+  %add = add nsw i32 %num2, 40
+  br label %while.body
+
+while.body:                                       ; preds = %entry, %sw.epilog
+  %num.addr.0 = phi i32 [ %num, %entry ], [ %num.addr.1, %sw.epilog ]
+  switch i32 %num.addr.0, label %sw.default [
+    i32 10, label %sw.bb
+    i32 30, label %sw.bb1
+    i32 20, label %sw.bb2
+    i32 40, label %sw.bb3
+  ]
+
+sw.bb:                                            ; preds = %while.body
+  %call.i = tail call i32 @bar(ptr noundef nonnull @.str.1)
+  br label %sw.epilog
+
+sw.bb1:                                           ; preds = %while.body
+  %call.i5 = tail call i32 @bar(ptr noundef nonnull @.str.2)
+  br label %sw.epilog
+
+sw.bb2:                                           ; preds = %while.body
+  %call.i6 = tail call i32 @bar(ptr noundef nonnull @.str.3)
+  br label %sw.epilog
+
+sw.bb3:                                           ; preds = %while.body
+  %call.i7 = tail call i32 @bar(ptr noundef nonnull @.str.4)
+  br label %sw.epilog
+
+sw.default:                                       ; preds = %while.body
+  ret void
+
+sw.epilog:                                        ; preds = %sw.bb3, %sw.bb2, %sw.bb1, %sw.bb
+  %num.addr.1 = phi i32 [ %add, %sw.bb3 ], [ 40, %sw.bb2 ], [ 20, %sw.bb1 ], [ 30, %sw.bb ]
+  br label %while.body
+}
+
+
+declare noundef i32 @foo()
+declare noundef i32 @bar(ptr nocapture noundef readonly)
diff --git a/llvm/test/Transforms/ExpandLargeFpConvert/X86/expand-large-fp-convert-si129tofp.ll b/llvm/test/Transforms/ExpandLargeFpConvert/X86/expand-large-fp-convert-si129tofp.ll
index 3961fec5b3be18..76f5248302badc 100644
--- a/llvm/test/Transforms/ExpandLargeFpConvert/X86/expand-large-fp-convert-si129tofp.ll
+++ b/llvm/test/Transforms/ExpandLargeFpConvert/X86/expand-large-fp-convert-si129tofp.ll
@@ -15,12 +15,12 @@ define half @si129tohalf(i129 %a) {
 ; CHECK-NEXT:    [[TMP5:%.*]] = trunc i129 [[TMP4]] to i32
 ; CHECK-NEXT:    [[TMP6:%.*]] = sub i32 129, [[TMP5]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = sub i32 128, [[TMP5]]
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp sgt i32 [[TMP7]], 24
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp sgt i32 [[TMP6]], 24
 ; CHECK-NEXT:    br i1 [[TMP8]], label [[ITOFP_IF_THEN4:%.*]], label [[ITOFP_IF_ELSE:%.*]]
 ; CHECK:       itofp-if-then4:
 ; CHECK-NEXT:    switch i32 [[TMP6]], label [[ITOFP_SW_DEFAULT:%.*]] [
-; CHECK-NEXT:    i32 25, label [[ITOFP_SW_BB:%.*]]
-; CHECK-NEXT:    i32 26, label [[ITOFP_SW_EPILOG:%.*]]
+; CHECK-NEXT:      i32 25, label [[ITOFP_SW_BB:%.*]]
+; CHECK-NEXT:      i32 26, label [[ITOFP_SW_EPILOG:%.*]]
 ; CHECK-NEXT:    ]
 ; CHECK:       itofp-sw-bb:
 ; CHECK-NEXT:    [[TMP9:%.*]] = shl i129 [[TMP3]], 1
@@ -100,12 +100,12 @@ define float @si129tofloat(i129 %a) {
 ; CHECK-NEXT:    [[TMP5:%.*]] = trunc i129 [[TMP4]] to i32
 ; CHECK-NEXT:    [[TMP6:%.*]] = sub i32 129, [[TMP5]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = sub i32 128, [[TMP5]]
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp sgt i32 [[TMP7]], 24
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp sgt i32 [[TMP6]], 24
 ; CHECK-NEXT:    br i1 [[TMP8]], label [[ITOFP_IF_THEN4:%.*]], label [[ITOFP_IF_ELSE:%.*]]
 ; CHECK:       itofp-if-then4:
 ; CHECK-NEXT:    switch i32 [[TMP6]], label [[ITOFP_SW_DEFAULT:%.*]] [
-; CHECK-NEXT:    i32 25, label [[ITOFP_SW_BB:%.*]]
-; CHECK-NEXT:    i32 26, label [[ITOFP_SW_EPILOG:%.*]]
+; CHECK-NEXT:      i32 25, label [[ITOFP_SW_BB:%.*]]
+; CHECK-NEXT:      i32 26, label [[ITOFP_SW_EPILOG:%.*]]
 ; CHECK-NEXT:    ]
 ; CHECK:       itofp-sw-bb:
 ; CHECK-NEXT:    [[TMP9:%.*]] = shl i129 [[TMP3]], 1
@@ -184,12 +184,12 @@ define double @si129todouble(i129 %a) {
 ; CHECK-NEXT:    [[TMP5:%.*]] = trunc i129 [[TMP4]] to i32
 ; CHECK-NEXT:    [[TMP6:%.*]] = sub i32 129, [[TMP5]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = sub i32 128, [[TMP5]]
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp sgt i32 [[TMP7]], 53
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp sgt i32 [[TMP6]], 53
 ; CHECK-NEXT:    br i1 [[TMP8]], label [[ITOFP_IF_THEN4:%.*]], label [[ITOFP_IF_ELSE:%.*]]
 ; CHECK:       itofp-if-then4:
 ; CHECK-NEXT:    switch i32 [[TMP6]], label [[ITOFP_SW_DEFAULT:%.*]] [
-; CHECK-NEXT:    i32 54, label [[ITOFP_SW_BB:%.*]]
-; CHECK-NEXT:    i32 55, label [[ITOFP_SW_EPILOG:%.*]]
+; CHECK-NEXT:      i32 54, label [[ITOFP_SW_BB:%.*]]
+; CHECK-NEXT:      i32 55, label [[ITOFP_SW_EPILOG:%.*]]
 ; CHECK-NEXT:    ]
 ; CHECK:       itofp-sw-bb:
 ; CHECK-NEXT:    [[TMP9:%.*]] = shl i129 [[TMP3]], 1
@@ -273,12 +273,12 @@ define x86_fp80 @si129tox86_fp80(i129 %a) {
 ; CHECK-NEXT:    [[TMP5:%.*]] = trunc i129 [[TMP4]] to i32
 ; CHECK-NEXT:    [[TMP6:%.*]] = sub i129 129, [[TMP4]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = sub i129 128, [[TMP4]]
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp sgt i129 [[TMP7]], 113
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp sgt i129 [[TMP6]], 113
 ; CHECK-NEXT:    br i1 [[TMP8]], label [[ITOFP_IF_THEN4:%.*]], label [[ITOFP_IF_ELSE:%.*]]
 ; CHECK:       itofp-if-then4:
 ; CHECK-NEXT:    switch i129 [[TMP6]], label [[ITOFP_SW_DEFAULT:%.*]] [
-; CHECK-NEXT:    i129 114, label [[ITOFP_SW_BB:%.*]]
-; CHECK-NEXT:    i129 115, label [[ITOFP_SW_EPILOG:%.*]]
+; CHECK-NEXT:      i129 114, label [[ITOFP_SW_BB:%.*]]
+; CHECK-NEXT:      i129 115, label [[ITOFP_SW_EPILOG:%.*]]
 ; CHECK-NEXT:    ]
 ; CHECK:       itofp-sw-bb:
 ; CHECK-NEXT:    [[TMP9:%.*]] = shl i129 [[TMP3]], 1
@@ -357,12 +357,12 @@ define fp128 @si129tofp128(i129 %a) {
 ; CHECK-NEXT:    [[TMP5:%.*]] = trunc i129 [[TMP4]] to i32
 ; CHECK-NEXT:    [[TMP6:%.*]] = sub i129 129, [[TMP4]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = sub i129 128, [[TMP4]]
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp sgt i129 [[TMP7]], 113
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp sgt i129 [[TMP6]], 113
 ; CHECK-NEXT:    br i1 [[TMP8]], label [[ITOFP_IF_THEN4:%.*]], label [[ITOFP_IF_ELSE:%.*]]
 ; CHECK:       itofp-if-then4:
 ; CHECK-NEXT:    switch i129 [[TMP6]], label [[ITOFP_SW_DEFAULT:%.*]] [
-; CHECK-NEXT:    i129 114, label [[ITOFP_SW_BB:%.*]]
-; CHECK-NEXT:    i129 115, label [[ITOFP_SW_EPILOG:%.*]]
+; CHECK-NEXT:      i129 114, label [[ITOFP_SW_BB:%.*]]
+; CHECK-NEXT:      i129 115, label [[ITOFP_SW_EPILOG:%.*]]
 ; CHECK-NEXT:    ]
 ; CHECK:       itofp-sw-bb:
 ; CHECK-NEXT:    [[TMP9:%.*]] = shl i129 [[TMP3]], 1
diff --git a/llvm/test/Transforms/ExpandLargeFpConvert/X86/expand-large-fp-convert-ui129tofp.ll b/llvm/test/Transforms/ExpandLargeFpConvert/X86/expand-large-fp-convert-ui129tofp.ll
index e05ff198ecc33b..96d87a5cace98b 100644
--- a/llvm/test/Transforms/ExpandLargeFpConvert/X86/expand-large-fp-convert-ui129tofp.ll
+++ b/llvm/test/Transforms/ExpandLargeFpConvert/X86/expand-large-fp-convert-ui129tofp.ll
@@ -15,12 +15,12 @@ define half @ui129tohalf(i129 %a) {
 ; CHECK-NEXT:    [[TMP5:%.*]] = trunc i129 [[TMP4]] to i32
 ; CHECK-NEXT:    [[TMP6:%.*]] = sub i32 129, [[TMP5]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = sub i32 128, [[TMP5]]
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp sgt i32 [[TMP7]], 24
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp sgt i32 [[TMP6]], 24
 ; CHECK-NEXT:    br i1 [[TMP8]], label [[ITOFP_IF_THEN4:%.*]], label [[ITOFP_IF_ELSE:%.*]]
 ; CHECK:       itofp-if-then4:
 ; CHECK-NEXT:    switch i32 [[TMP6]], label [[ITOFP_SW_DEFAULT:%.*]] [
-; CHECK-NEXT:    i32 25, label [[ITOFP_SW_BB:%.*]]
-; CHECK-NEXT:    i32 26, label [[ITOFP_SW_EPILOG:%.*]]
+; CHECK-NEXT:      i32 25, label [[ITOFP_SW_BB:%.*]]
+; CHECK-NEXT:      i32 26, label [[ITOFP_SW_EPILOG:%.*]]
 ; CHECK-NEXT:    ]
 ; CHECK:       itofp-sw-bb:
 ; CHECK-NEXT:    [[TMP9:%.*]] = shl i129 [[A]], 1
@@ -100,12 +100,12 @@ define float @ui129tofloat(i129 %a) {
 ; CHECK-NEXT:    [[TMP5:%.*]] = trunc i129 [[TMP4]] to i32
 ; CHECK-NEXT:    [[TMP6:%.*]] = sub i32 129, [[TMP5]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = sub i32 128, [[TMP5]]
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp sgt i32 [[TMP7]], 24
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp sgt i32 [[TMP6]], 24
 ; CHECK-NEXT:    br i1 [[TMP8]], label [[ITOFP_IF_THEN4:%.*]], label [[ITOFP_IF_ELSE:%.*]]
 ; CHECK:       itofp-if-then4:
 ; CHECK-NEXT:    switch i32 [[TMP6]], label [[ITOFP_SW_DEFAULT:%.*]] [
-; CHECK-NEXT:    i32 25, label [[ITOFP_SW_BB:%.*]]
-; CHECK-NEXT:    i32 26, label [[ITOFP_SW_EPILOG:%.*]]
+; CHECK-NEXT:      i32 25, label [[ITOFP_SW_BB:%.*]]
+; CHECK-NEXT:      i32 26, label [[ITOFP_SW_EPILOG:%.*]]
 ; CHECK-NEXT:    ]
 ; CHECK:       itofp-sw-bb:
 ; CHECK-NEXT:    [[TMP9:%.*]] = shl i129 [[A]], 1
@@ -184,12 +184,12 @@ define double @ui129todouble(i129 %a) {
 ; CHECK-NEXT:    [[TMP5:%.*]] = trunc i129 [[TMP4]] to i32
 ; CHECK-NEXT:    [[TMP6:%.*]] = sub i32 129, [[TMP5]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = sub i32 128, [[TMP5]]
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp sgt i32 [[TMP7]], 53
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp sgt i32 [[TMP6]], 53
 ; CHECK-NEXT:    br i1 [[TMP8]], label [[ITOFP_IF_THEN4:%.*]], label [[ITOFP_IF_ELSE:%.*]]
 ; CHECK:       itofp-if-then4:
 ; CHECK-NEXT:    switch i32 [[TMP6]], label [[ITOFP_SW_DEFAULT:%.*]] [
-; CHECK-NEXT:    i32 54, label [[ITOFP_SW_BB:%.*]]
-; CHECK-NEXT:    i32 55, label [[ITOFP_SW_EPILOG:%.*]]
+; CHECK-NEXT:      i32 54, label [[ITOFP_SW_BB:%.*]]
+; CHECK-NEXT:      i32 55, label [[ITOFP_SW_EPILOG:%.*]]
 ; CHECK-NEXT:    ]
 ; CHECK:       itofp-sw-bb:
 ; CHECK-NEXT:    [[TMP9:%.*]] = shl i129 [[A]], 1
@@ -273,12 +273,12 @@ define x86_fp80 @ui129tox86_fp80(i129 %a) {
 ; CHECK-NEXT:    [[TMP5:%.*]] = trunc i129 [[TMP4]] to i32
 ; CHECK-NEXT:    [[TMP6:%.*]] = sub i129 129, [[TMP4]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = sub i129 128, [[TMP4]]
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp sgt i129 [[TMP7]], 113
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp sgt i129 [[TMP6]], 113
 ; CHECK-NEXT:    br i1 [[TMP8]], label [[ITOFP_IF_THEN4:%.*]], label [[ITOFP_IF_ELSE:%.*]]
 ; CHECK:       itofp-if-then4:
 ; CHECK-NEXT:    switch i129 [[TMP6]], label [[ITOFP_SW_DEFAULT:%.*]] [
-; CHECK-NEXT:    i129 114, label [[ITOFP_SW_BB:%.*]]
-; CHECK-NEXT:    i129 115, label [[ITOFP_SW_EPILOG:%.*]]
+; CHECK-NEXT:      i129 114, label [[ITOFP_SW_BB:%.*]]
+; CHECK-NEXT:      i129 115, label [[ITOFP_SW_EPILOG:%.*]]
 ; CHECK-NEXT:    ]
 ; CHECK:       itofp-sw-bb:
 ; CHECK-NEXT:    [[TMP9:%.*]] = shl i129 [[A]], 1
@@ -357,12 +357,12 @@ define fp128 @ui129tofp128(i129 %a) {
 ; CHECK-NEXT:    [[TMP5:%.*]] = trunc i129 [[TMP4]] to i32
 ; CHECK-NEXT:    [[TMP6:%.*]] = sub i129 129, [[TMP4]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = sub i129 128, [[TMP4]]
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp sgt i129 [[TMP7]], 113
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp sgt i129 [[TMP6]], 113
 ; CHECK-NEXT:    br i1 [[TMP8]], label [[ITOFP_IF_THEN4:%.*]], label [[ITOFP_IF_ELSE:%.*]]
 ; CHECK:       itofp-if-then4:
 ; CHECK-NEXT:    switch i129 [[TMP6]], label [[ITOFP_SW_DEFAULT:%.*]] [
-; CHECK-NEXT:    i129 114, label [[ITOFP_SW_BB:%.*]]
-; CHECK-NEXT:    i129 115, label [[ITOFP_SW_EPILOG:%.*]]
+; CHECK-NEXT:      i129 114, label [[ITOFP_SW_BB:%.*]]
+; CHECK-NEXT:      i129 115, label [[ITOFP_SW_EPILOG:%.*]]
 ; CHECK-NEXT:    ]
 ; CHECK:       itofp-sw-bb:
 ; CHECK-NEXT:    [[TMP9:%.*]] = shl i129 [[A]], 1
diff --git a/llvm/test/Transforms/Float2Int/basic.ll b/llvm/test/Transforms/Float2Int/basic.ll
index a454b773f4ebab..2854a83179b7eb 100644
--- a/llvm/test/Transforms/Float2Int/basic.ll
+++ b/llvm/test/Transforms/Float2Int/basic.ll
@@ -1,29 +1,16 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -passes=float2int -S | FileCheck %s -check-prefixes=CHECK,NONE
-; RUN: opt < %s -passes=float2int -S --data-layout="n64" | FileCheck %s -check-prefixes=CHECK,ONLY64
-; RUN: opt < %s -passes=float2int -S --data-layout="n8:16:32:64"| FileCheck %s -check-prefixes=CHECK,MULTIPLE
+; RUN: opt < %s -passes='float2int' -S | FileCheck %s
 
 ;
 ; Positive tests
 ;
 
 define i16 @simple1(i8 %a) {
-; NONE-LABEL: @simple1(
-; NONE-NEXT:    [[TMP1:%.*]] = zext i8 [[A:%.*]] to i32
-; NONE-NEXT:    [[T21:%.*]] = add i32 [[TMP1]], 1
-; NONE-NEXT:    [[TMP2:%.*]] = trunc i32 [[T21]] to i16
-; NONE-NEXT:    ret i16 [[TMP2]]
-;
-; ONLY64-LABEL: @simple1(
-; ONLY64-NEXT:    [[TMP1:%.*]] = zext i8 [[A:%.*]] to i64
-; ONLY64-NEXT:    [[T21:%.*]] = add i64 [[TMP1]], 1
-; ONLY64-NEXT:    [[TMP2:%.*]] = trunc i64 [[T21]] to i16
-; ONLY64-NEXT:    ret i16 [[TMP2]]
-;
-; MULTIPLE-LABEL: @simple1(
-; MULTIPLE-NEXT:    [[TMP1:%.*]] = zext i8 [[A:%.*]] to i16
-; MULTIPLE-NEXT:    [[T21:%.*]] = add i16 [[TMP1]], 1
-; MULTIPLE-NEXT:    ret i16 [[T21]]
+; CHECK-LABEL: @simple1(
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i8 [[A:%.*]] to i32
+; CHECK-NEXT:    [[T21:%.*]] = add i32 [[TMP1]], 1
+; CHECK-NEXT:    [[TMP2:%.*]] = trunc i32 [[T21]] to i16
+; CHECK-NEXT:    ret i16 [[TMP2]]
 ;
   %t1 = uitofp i8 %a to float
   %t2 = fadd float %t1, 1.0
@@ -32,23 +19,11 @@ define i16 @simple1(i8 %a) {
 }
 
 define i8 @simple2(i8 %a) {
-; NONE-LABEL: @simple2(
-; NONE-NEXT:    [[TMP1:%.*]] = zext i8 [[A:%.*]] to i32
-; NONE-NEXT:    [[T21:%.*]] = sub i32 [[TMP1]], 1
-; NONE-NEXT:    [[TMP2:%.*]] = trunc i32 [[T21]] to i8
-; NONE-NEXT:    ret i8 [[TMP2]]
-;
-; ONLY64-LABEL: @simple2(
-; ONLY64-NEXT:    [[TMP1:%.*]] = zext i8 [[A:%.*]] to i64
-; ONLY64-NEXT:    [[T21:%.*]] = sub i64 [[TMP1]], 1
-; ONLY64-NEXT:    [[TMP2:%.*]] = trunc i64 [[T21]] to i8
-; ONLY64-NEXT:    ret i8 [[TMP2]]
-;
-; MULTIPLE-LABEL: @simple2(
-; MULTIPLE-NEXT:    [[TMP1:%.*]] = zext i8 [[A:%.*]] to i16
-; MULTIPLE-NEXT:    [[T21:%.*]] = sub i16 [[TMP1]], 1
-; MULTIPLE-NEXT:    [[TMP2:%.*]] = trunc i16 [[T21]] to i8
-; MULTIPLE-NEXT:    ret i8 [[TMP2]]
+; CHECK-LABEL: @simple2(
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i8 [[A:%.*]] to i32
+; CHECK-NEXT:    [[T21:%.*]] = sub i32 [[TMP1]], 1
+; CHECK-NEXT:    [[TMP2:%.*]] = trunc i32 [[T21]] to i8
+; CHECK-NEXT:    ret i8 [[TMP2]]
 ;
   %t1 = uitofp i8 %a to float
   %t2 = fsub float %t1, 1.0
@@ -57,22 +32,10 @@ define i8 @simple2(i8 %a) {
 }
 
 define i32 @simple3(i8 %a) {
-; NONE-LABEL: @simple3(
-; NONE-NEXT:    [[TMP1:%.*]] = zext i8 [[A:%.*]] to i32
-; NONE-NEXT:    [[T21:%.*]] = sub i32 [[TMP1]], 1
-; NONE-NEXT:    ret i32 [[T21]]
-;
-; ONLY64-LABEL: @simple3(
-; ONLY64-NEXT:    [[TMP1:%.*]] = zext i8 [[A:%.*]] to i64
-; ONLY64-NEXT:    [[T21:%.*]] = sub i64 [[TMP1]], 1
-; ONLY64-NEXT:    [[TMP2:%.*]] = trunc i64 [[T21]] to i32
-; ONLY64-NEXT:    ret i32 [[TMP2]]
-;
-; MULTIPLE-LABEL: @simple3(
-; MULTIPLE-NEXT:    [[TMP1:%.*]] = zext i8 [[A:%.*]] to i16
-; MULTIPLE-NEXT:    [[T21:%.*]] = sub i16 [[TMP1]], 1
-; MULTIPLE-NEXT:    [[TMP2:%.*]] = zext i16 [[T21]] to i32
-; MULTIPLE-NEXT:    ret i32 [[TMP2]]
+; CHECK-LABEL: @simple3(
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i8 [[A:%.*]] to i32
+; CHECK-NEXT:    [[T21:%.*]] = sub i32 [[TMP1]], 1
+; CHECK-NEXT:    ret i32 [[T21]]
 ;
   %t1 = uitofp i8 %a to float
   %t2 = fsub float %t1, 1.0
@@ -81,23 +44,11 @@ define i32 @simple3(i8 %a) {
 }
 
 define i1 @cmp(i8 %a, i8 %b) {
-; NONE-LABEL: @cmp(
-; NONE-NEXT:    [[TMP1:%.*]] = zext i8 [[A:%.*]] to i32
-; NONE-NEXT:    [[TMP2:%.*]] = zext i8 [[B:%.*]] to i32
-; NONE-NEXT:    [[T31:%.*]] = icmp slt i32 [[TMP1]], [[TMP2]]
-; NONE-NEXT:    ret i1 [[T31]]
-;
-; ONLY64-LABEL: @cmp(
-; ONLY64-NEXT:    [[TMP1:%.*]] = zext i8 [[A:%.*]] to i64
-; ONLY64-NEXT:    [[TMP2:%.*]] = zext i8 [[B:%.*]] to i64
-; ONLY64-NEXT:    [[T31:%.*]] = icmp slt i64 [[TMP1]], [[TMP2]]
-; ONLY64-NEXT:    ret i1 [[T31]]
-;
-; MULTIPLE-LABEL: @cmp(
-; MULTIPLE-NEXT:    [[TMP1:%.*]] = zext i8 [[A:%.*]] to i16
-; MULTIPLE-NEXT:    [[TMP2:%.*]] = zext i8 [[B:%.*]] to i16
-; MULTIPLE-NEXT:    [[T31:%.*]] = icmp slt i16 [[TMP1]], [[TMP2]]
-; MULTIPLE-NEXT:    ret i1 [[T31]]
+; CHECK-LABEL: @cmp(
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i8 [[A:%.*]] to i32
+; CHECK-NEXT:    [[TMP2:%.*]] = zext i8 [[B:%.*]] to i32
+; CHECK-NEXT:    [[T31:%.*]] = icmp slt i32 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret i1 [[T31]]
 ;
   %t1 = uitofp i8 %a to float
   %t2 = uitofp i8 %b to float
@@ -119,27 +70,12 @@ define i32 @simple4(i32 %a) {
 }
 
 define i32 @simple5(i8 %a, i8 %b) {
-; NONE-LABEL: @simple5(
-; NONE-NEXT:    [[TMP1:%.*]] = zext i8 [[A:%.*]] to i32
-; NONE-NEXT:    [[TMP2:%.*]] = zext i8 [[B:%.*]] to i32
-; NONE-NEXT:    [[T31:%.*]] = add i32 [[TMP1]], 1
-; NONE-NEXT:    [[T42:%.*]] = mul i32 [[T31]], [[TMP2]]
-; NONE-NEXT:    ret i32 [[T42]]
-;
-; ONLY64-LABEL: @simple5(
-; ONLY64-NEXT:    [[TMP1:%.*]] = zext i8 [[A:%.*]] to i64
-; ONLY64-NEXT:    [[TMP2:%.*]] = zext i8 [[B:%.*]] to i64
-; ONLY64-NEXT:    [[T31:%.*]] = add i64 [[TMP1]], 1
-; ONLY64-NEXT:    [[T42:%.*]] = mul i64 [[T31]], [[TMP2]]
-; ONLY64-NEXT:    [[TMP3:%.*]] = trunc i64 [[T42]] to i32
-; ONLY64-NEXT:    ret i32 [[TMP3]]
-;
-; MULTIPLE-LABEL: @simple5(
-; MULTIPLE-NEXT:    [[TMP1:%.*]] = zext i8 [[A:%.*]] to i32
-; MULTIPLE-NEXT:    [[TMP2:%.*]] = zext i8 [[B:%.*]] to i32
-; MULTIPLE-NEXT:    [[T31:%.*]] = add i32 [[TMP1]], 1
-; MULTIPLE-NEXT:    [[T42:%.*]] = mul i32 [[T31]], [[TMP2]]
-; MULTIPLE-NEXT:    ret i32 [[T42]]
+; CHECK-LABEL: @simple5(
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i8 [[A:%.*]] to i32
+; CHECK-NEXT:    [[TMP2:%.*]] = zext i8 [[B:%.*]] to i32
+; CHECK-NEXT:    [[T31:%.*]] = add i32 [[TMP1]], 1
+; CHECK-NEXT:    [[T42:%.*]] = mul i32 [[T31]], [[TMP2]]
+; CHECK-NEXT:    ret i32 [[T42]]
 ;
   %t1 = uitofp i8 %a to float
   %t2 = uitofp i8 %b to float
@@ -150,27 +86,12 @@ define i32 @simple5(i8 %a, i8 %b) {
 }
 
 define i32 @simple6(i8 %a, i8 %b) {
-; NONE-LABEL: @simple6(
-; NONE-NEXT:    [[TMP1:%.*]] = zext i8 [[A:%.*]] to i32
-; NONE-NEXT:    [[TMP2:%.*]] = zext i8 [[B:%.*]] to i32
-; NONE-NEXT:    [[T31:%.*]] = sub i32 0, [[TMP1]]
-; NONE-NEXT:    [[T42:%.*]] = mul i32 [[T31]], [[TMP2]]
-; NONE-NEXT:    ret i32 [[T42]]
-;
-; ONLY64-LABEL: @simple6(
-; ONLY64-NEXT:    [[TMP1:%.*]] = zext i8 [[A:%.*]] to i64
-; ONLY64-NEXT:    [[TMP2:%.*]] = zext i8 [[B:%.*]] to i64
-; ONLY64-NEXT:    [[T31:%.*]] = sub i64 0, [[TMP1]]
-; ONLY64-NEXT:    [[T42:%.*]] = mul i64 [[T31]], [[TMP2]]
-; ONLY64-NEXT:    [[TMP3:%.*]] = trunc i64 [[T42]] to i32
-; ONLY64-NEXT:    ret i32 [[TMP3]]
-;
-; MULTIPLE-LABEL: @simple6(
-; MULTIPLE-NEXT:    [[TMP1:%.*]] = zext i8 [[A:%.*]] to i32
-; MULTIPLE-NEXT:    [[TMP2:%.*]] = zext i8 [[B:%.*]] to i32
-; MULTIPLE-NEXT:    [[T31:%.*]] = sub i32 0, [[TMP1]]
-; MULTIPLE-NEXT:    [[T42:%.*]] = mul i32 [[T31]], [[TMP2]]
-; MULTIPLE-NEXT:    ret i32 [[T42]]
+; CHECK-LABEL: @simple6(
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i8 [[A:%.*]] to i32
+; CHECK-NEXT:    [[TMP2:%.*]] = zext i8 [[B:%.*]] to i32
+; CHECK-NEXT:    [[T31:%.*]] = sub i32 0, [[TMP1]]
+; CHECK-NEXT:    [[T42:%.*]] = mul i32 [[T31]], [[TMP2]]
+; CHECK-NEXT:    ret i32 [[T42]]
 ;
   %t1 = uitofp i8 %a to float
   %t2 = uitofp i8 %b to float
@@ -184,37 +105,15 @@ define i32 @simple6(i8 %a, i8 %b) {
 ; cause failure of the other.
 
 define i32 @multi1(i8 %a, i8 %b, i8 %c, float %d) {
-; NONE-LABEL: @multi1(
-; NONE-NEXT:    [[TMP1:%.*]] = zext i8 [[A:%.*]] to i32
-; NONE-NEXT:    [[TMP2:%.*]] = zext i8 [[B:%.*]] to i32
-; NONE-NEXT:    [[FC:%.*]] = uitofp i8 [[C:%.*]] to float
-; NONE-NEXT:    [[X1:%.*]] = add i32 [[TMP1]], [[TMP2]]
-; NONE-NEXT:    [[Z:%.*]] = fadd float [[FC]], [[D:%.*]]
-; NONE-NEXT:    [[W:%.*]] = fptoui float [[Z]] to i32
-; NONE-NEXT:    [[R:%.*]] = add i32 [[X1]], [[W]]
-; NONE-NEXT:    ret i32 [[R]]
-;
-; ONLY64-LABEL: @multi1(
-; ONLY64-NEXT:    [[TMP1:%.*]] = zext i8 [[A:%.*]] to i64
-; ONLY64-NEXT:    [[TMP2:%.*]] = zext i8 [[B:%.*]] to i64
-; ONLY64-NEXT:    [[FC:%.*]] = uitofp i8 [[C:%.*]] to float
-; ONLY64-NEXT:    [[X1:%.*]] = add i64 [[TMP1]], [[TMP2]]
-; ONLY64-NEXT:    [[TMP3:%.*]] = trunc i64 [[X1]] to i32
-; ONLY64-NEXT:    [[Z:%.*]] = fadd float [[FC]], [[D:%.*]]
-; ONLY64-NEXT:    [[W:%.*]] = fptoui float [[Z]] to i32
-; ONLY64-NEXT:    [[R:%.*]] = add i32 [[TMP3]], [[W]]
-; ONLY64-NEXT:    ret i32 [[R]]
-;
-; MULTIPLE-LABEL: @multi1(
-; MULTIPLE-NEXT:    [[TMP1:%.*]] = zext i8 [[A:%.*]] to i16
-; MULTIPLE-NEXT:    [[TMP2:%.*]] = zext i8 [[B:%.*]] to i16
-; MULTIPLE-NEXT:    [[FC:%.*]] = uitofp i8 [[C:%.*]] to float
-; MULTIPLE-NEXT:    [[X1:%.*]] = add i16 [[TMP1]], [[TMP2]]
-; MULTIPLE-NEXT:    [[TMP3:%.*]] = zext i16 [[X1]] to i32
-; MULTIPLE-NEXT:    [[Z:%.*]] = fadd float [[FC]], [[D:%.*]]
-; MULTIPLE-NEXT:    [[W:%.*]] = fptoui float [[Z]] to i32
-; MULTIPLE-NEXT:    [[R:%.*]] = add i32 [[TMP3]], [[W]]
-; MULTIPLE-NEXT:    ret i32 [[R]]
+; CHECK-LABEL: @multi1(
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i8 [[A:%.*]] to i32
+; CHECK-NEXT:    [[TMP2:%.*]] = zext i8 [[B:%.*]] to i32
+; CHECK-NEXT:    [[FC:%.*]] = uitofp i8 [[C:%.*]] to float
+; CHECK-NEXT:    [[X1:%.*]] = add i32 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[Z:%.*]] = fadd float [[FC]], [[D:%.*]]
+; CHECK-NEXT:    [[W:%.*]] = fptoui float [[Z]] to i32
+; CHECK-NEXT:    [[R:%.*]] = add i32 [[X1]], [[W]]
+; CHECK-NEXT:    ret i32 [[R]]
 ;
   %fa = uitofp i8 %a to float
   %fb = uitofp i8 %b to float
@@ -228,22 +127,11 @@ define i32 @multi1(i8 %a, i8 %b, i8 %c, float %d) {
 }
 
 define i16 @simple_negzero(i8 %a) {
-; NONE-LABEL: @simple_negzero(
-; NONE-NEXT:    [[TMP1:%.*]] = zext i8 [[A:%.*]] to i32
-; NONE-NEXT:    [[T21:%.*]] = add i32 [[TMP1]], 0
-; NONE-NEXT:    [[TMP2:%.*]] = trunc i32 [[T21]] to i16
-; NONE-NEXT:    ret i16 [[TMP2]]
-;
-; ONLY64-LABEL: @simple_negzero(
-; ONLY64-NEXT:    [[TMP1:%.*]] = zext i8 [[A:%.*]] to i64
-; ONLY64-NEXT:    [[T21:%.*]] = add i64 [[TMP1]], 0
-; ONLY64-NEXT:    [[TMP2:%.*]] = trunc i64 [[T21]] to i16
-; ONLY64-NEXT:    ret i16 [[TMP2]]
-;
-; MULTIPLE-LABEL: @simple_negzero(
-; MULTIPLE-NEXT:    [[TMP1:%.*]] = zext i8 [[A:%.*]] to i16
-; MULTIPLE-NEXT:    [[T21:%.*]] = add i16 [[TMP1]], 0
-; MULTIPLE-NEXT:    ret i16 [[T21]]
+; CHECK-LABEL: @simple_negzero(
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i8 [[A:%.*]] to i32
+; CHECK-NEXT:    [[T21:%.*]] = add i32 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP2:%.*]] = trunc i32 [[T21]] to i16
+; CHECK-NEXT:    ret i16 [[TMP2]]
 ;
   %t1 = uitofp i8 %a to float
   %t2 = fadd fast float %t1, -0.0
@@ -252,26 +140,12 @@ define i16 @simple_negzero(i8 %a) {
 }
 
 define i32 @simple_negative(i8 %call) {
-; NONE-LABEL: @simple_negative(
-; NONE-NEXT:    [[TMP1:%.*]] = sext i8 [[CALL:%.*]] to i32
-; NONE-NEXT:    [[MUL1:%.*]] = mul i32 [[TMP1]], -3
-; NONE-NEXT:    [[TMP2:%.*]] = trunc i32 [[MUL1]] to i8
-; NONE-NEXT:    [[CONV3:%.*]] = sext i8 [[TMP2]] to i32
-; NONE-NEXT:    ret i32 [[CONV3]]
-;
-; ONLY64-LABEL: @simple_negative(
-; ONLY64-NEXT:    [[TMP1:%.*]] = sext i8 [[CALL:%.*]] to i64
-; ONLY64-NEXT:    [[MUL1:%.*]] = mul i64 [[TMP1]], -3
-; ONLY64-NEXT:    [[TMP2:%.*]] = trunc i64 [[MUL1]] to i8
-; ONLY64-NEXT:    [[CONV3:%.*]] = sext i8 [[TMP2]] to i32
-; ONLY64-NEXT:    ret i32 [[CONV3]]
-;
-; MULTIPLE-LABEL: @simple_negative(
-; MULTIPLE-NEXT:    [[TMP1:%.*]] = sext i8 [[CALL:%.*]] to i16
-; MULTIPLE-NEXT:    [[MUL1:%.*]] = mul i16 [[TMP1]], -3
-; MULTIPLE-NEXT:    [[TMP2:%.*]] = trunc i16 [[MUL1]] to i8
-; MULTIPLE-NEXT:    [[CONV3:%.*]] = sext i8 [[TMP2]] to i32
-; MULTIPLE-NEXT:    ret i32 [[CONV3]]
+; CHECK-LABEL: @simple_negative(
+; CHECK-NEXT:    [[TMP1:%.*]] = sext i8 [[CALL:%.*]] to i32
+; CHECK-NEXT:    [[MUL1:%.*]] = mul i32 [[TMP1]], -3
+; CHECK-NEXT:    [[TMP2:%.*]] = trunc i32 [[MUL1]] to i8
+; CHECK-NEXT:    [[CONV3:%.*]] = sext i8 [[TMP2]] to i32
+; CHECK-NEXT:    ret i32 [[CONV3]]
 ;
   %conv1 = sitofp i8 %call to float
   %mul = fmul float %conv1, -3.000000e+00
@@ -281,22 +155,11 @@ define i32 @simple_negative(i8 %call) {
 }
 
 define i16 @simple_fneg(i8 %a) {
-; NONE-LABEL: @simple_fneg(
-; NONE-NEXT:    [[TMP1:%.*]] = zext i8 [[A:%.*]] to i32
-; NONE-NEXT:    [[T21:%.*]] = sub i32 0, [[TMP1]]
-; NONE-NEXT:    [[TMP2:%.*]] = trunc i32 [[T21]] to i16
-; NONE-NEXT:    ret i16 [[TMP2]]
-;
-; ONLY64-LABEL: @simple_fneg(
-; ONLY64-NEXT:    [[TMP1:%.*]] = zext i8 [[A:%.*]] to i64
-; ONLY64-NEXT:    [[T21:%.*]] = sub i64 0, [[TMP1]]
-; ONLY64-NEXT:    [[TMP2:%.*]] = trunc i64 [[T21]] to i16
-; ONLY64-NEXT:    ret i16 [[TMP2]]
-;
-; MULTIPLE-LABEL: @simple_fneg(
-; MULTIPLE-NEXT:    [[TMP1:%.*]] = zext i8 [[A:%.*]] to i16
-; MULTIPLE-NEXT:    [[T21:%.*]] = sub i16 0, [[TMP1]]
-; MULTIPLE-NEXT:    ret i16 [[T21]]
+; CHECK-LABEL: @simple_fneg(
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i8 [[A:%.*]] to i32
+; CHECK-NEXT:    [[T21:%.*]] = sub i32 0, [[TMP1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = trunc i32 [[T21]] to i16
+; CHECK-NEXT:    ret i16 [[TMP2]]
 ;
   %t1 = uitofp i8 %a to float
   %t2 = fneg fast float %t1
diff --git a/llvm/test/Transforms/GlobalDCE/virtual-functions-base-call.ll b/llvm/test/Transforms/GlobalDCE/virtual-functions-base-call.ll
index 8ef2d895a02906..baf5c7272dfe33 100644
--- a/llvm/test/Transforms/GlobalDCE/virtual-functions-base-call.ll
+++ b/llvm/test/Transforms/GlobalDCE/virtual-functions-base-call.ll
@@ -45,13 +45,13 @@ entry:
 
 define hidden void @_ZN1AC2Ev(ptr nocapture %this) {
 entry:
-  store ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTV1A, i64 0, inrange i32 0, i64 2), ptr %this, align 8
+  store ptr getelementptr inbounds inrange(-16, 8) ({ [3 x ptr] }, ptr @_ZTV1A, i64 0, i32 0, i64 2), ptr %this, align 8
   ret void
 }
 
 define hidden void @_ZN1BC2Ev(ptr nocapture %this) {
 entry:
-  store ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTV1B, i64 0, inrange i32 0, i64 2), ptr %this, align 8
+  store ptr getelementptr inbounds inrange(-16, 8) ({ [3 x ptr] }, ptr @_ZTV1B, i64 0, i32 0, i64 2), ptr %this, align 8
   ret void
 }
 
diff --git a/llvm/test/Transforms/GlobalDCE/virtual-functions-base-pointer-call.ll b/llvm/test/Transforms/GlobalDCE/virtual-functions-base-pointer-call.ll
index 09979690da6d8b..47a80f843c93ee 100644
--- a/llvm/test/Transforms/GlobalDCE/virtual-functions-base-pointer-call.ll
+++ b/llvm/test/Transforms/GlobalDCE/virtual-functions-base-pointer-call.ll
@@ -65,13 +65,13 @@ entry:
 
 define hidden void @_ZN1AC2Ev(ptr nocapture %this) {
 entry:
-  store ptr getelementptr inbounds ({ [4 x ptr] }, ptr @_ZTV1A, i64 0, inrange i32 0, i64 2), ptr %this, align 8
+  store ptr getelementptr inbounds inrange(-16, 16) ({ [4 x ptr] }, ptr @_ZTV1A, i64 0, i32 0, i64 2), ptr %this, align 8
   ret void
 }
 
 define hidden void @_ZN1BC2Ev(ptr nocapture %this) {
 entry:
-  store ptr getelementptr inbounds ({ [4 x ptr] }, ptr @_ZTV1B, i64 0, inrange i32 0, i64 2), ptr %this, align 8
+  store ptr getelementptr inbounds inrange(-16, 16) ({ [4 x ptr] }, ptr @_ZTV1B, i64 0, i32 0, i64 2), ptr %this, align 8
   ret void
 }
 
diff --git a/llvm/test/Transforms/GlobalDCE/virtual-functions-derived-call.ll b/llvm/test/Transforms/GlobalDCE/virtual-functions-derived-call.ll
index 896675f7655f90..50b68254395603 100644
--- a/llvm/test/Transforms/GlobalDCE/virtual-functions-derived-call.ll
+++ b/llvm/test/Transforms/GlobalDCE/virtual-functions-derived-call.ll
@@ -45,13 +45,13 @@ entry:
 
 define hidden void @_ZN1AC2Ev(ptr nocapture %this) {
 entry:
-  store ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTV1A, i64 0, inrange i32 0, i64 2), ptr %this, align 8
+  store ptr getelementptr inbounds inrange(-16, 8) ({ [3 x ptr] }, ptr @_ZTV1A, i64 0, i32 0, i64 2), ptr %this, align 8
   ret void
 }
 
 define hidden void @_ZN1BC2Ev(ptr nocapture %this) {
 entry:
-  store ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTV1B, i64 0, inrange i32 0, i64 2), ptr %this, align 8
+  store ptr getelementptr inbounds inrange(-16, 8) ({ [3 x ptr] }, ptr @_ZTV1B, i64 0, i32 0, i64 2), ptr %this, align 8
   ret void
 }
 
diff --git a/llvm/test/Transforms/GlobalDCE/virtual-functions-derived-pointer-call.ll b/llvm/test/Transforms/GlobalDCE/virtual-functions-derived-pointer-call.ll
index bdf97d25ac3f1d..e5dea7ad9a81c3 100644
--- a/llvm/test/Transforms/GlobalDCE/virtual-functions-derived-pointer-call.ll
+++ b/llvm/test/Transforms/GlobalDCE/virtual-functions-derived-pointer-call.ll
@@ -67,13 +67,13 @@ entry:
 
 define hidden void @_ZN1AC2Ev(ptr nocapture %this) {
 entry:
-  store ptr getelementptr inbounds ({ [4 x ptr] }, ptr @_ZTV1A, i64 0, inrange i32 0, i64 2), ptr %this, align 8
+  store ptr getelementptr inbounds inrange(-16, 16) ({ [4 x ptr] }, ptr @_ZTV1A, i64 0, i32 0, i64 2), ptr %this, align 8
   ret void
 }
 
 define hidden void @_ZN1BC2Ev(ptr nocapture %this) {
 entry:
-  store ptr getelementptr inbounds ({ [4 x ptr] }, ptr @_ZTV1B, i64 0, inrange i32 0, i64 2), ptr %this, align 8
+  store ptr getelementptr inbounds inrange(-16, 16) ({ [4 x ptr] }, ptr @_ZTV1B, i64 0, i32 0, i64 2), ptr %this, align 8
   ret void
 }
 
diff --git a/llvm/test/Transforms/GlobalDCE/virtual-functions-novfe.ll b/llvm/test/Transforms/GlobalDCE/virtual-functions-novfe.ll
index 01475c8911fc2d..04f654c5e0b989 100644
--- a/llvm/test/Transforms/GlobalDCE/virtual-functions-novfe.ll
+++ b/llvm/test/Transforms/GlobalDCE/virtual-functions-novfe.ll
@@ -39,8 +39,8 @@ entry:
 define dso_local i32 @test_A() {
 entry:
   %call = tail call ptr @_Znwm(i64 8)
-  store ptr getelementptr inbounds ({ [4 x ptr] }, ptr @_ZTV1A, i64 0, inrange i32 0, i64 2), ptr %call, align 8
-  %0 = tail call { ptr, i1 } @llvm.type.checked.load(ptr getelementptr inbounds ({ [4 x ptr] }, ptr @_ZTV1A, i64 0, inrange i32 0, i64 2), i32 0, metadata !"_ZTS1A"), !nosanitize !9
+  store ptr getelementptr inbounds inrange(-16, 16) ({ [4 x ptr] }, ptr @_ZTV1A, i64 0, i32 0, i64 2), ptr %call, align 8
+  %0 = tail call { ptr, i1 } @llvm.type.checked.load(ptr getelementptr inbounds inrange(-16, 16) ({ [4 x ptr] }, ptr @_ZTV1A, i64 0, i32 0, i64 2), i32 0, metadata !"_ZTS1A"), !nosanitize !9
   %1 = extractvalue { ptr, i1 } %0, 0, !nosanitize !9
   %call1 = tail call i32 %1(ptr nonnull %call)
   ret i32 %call1
diff --git a/llvm/test/Transforms/GlobalDCE/virtual-functions-visibility-post-lto.ll b/llvm/test/Transforms/GlobalDCE/virtual-functions-visibility-post-lto.ll
index 0fac0fbca10992..dde36d46f6b976 100644
--- a/llvm/test/Transforms/GlobalDCE/virtual-functions-visibility-post-lto.ll
+++ b/llvm/test/Transforms/GlobalDCE/virtual-functions-visibility-post-lto.ll
@@ -18,7 +18,7 @@ target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 
 define internal void @_ZN1AC2Ev(ptr %this) {
 entry:
-  store ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTV1A, i64 0, inrange i32 0, i64 2), ptr %this, align 8
+  store ptr getelementptr inbounds inrange(-16, 8) ({ [3 x ptr] }, ptr @_ZTV1A, i64 0, i32 0, i64 2), ptr %this, align 8
   ret void
 }
 
@@ -43,7 +43,7 @@ entry:
 
 define internal void @_ZN1BC2Ev(ptr %this) {
 entry:
-  store ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTV1B, i64 0, inrange i32 0, i64 2), ptr %this, align 8
+  store ptr getelementptr inbounds inrange(-16, 8) ({ [3 x ptr] }, ptr @_ZTV1B, i64 0, i32 0, i64 2), ptr %this, align 8
   ret void
 }
 
@@ -68,7 +68,7 @@ entry:
 
 define internal void @_ZN1CC2Ev(ptr %this) {
 entry:
-  store ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTV1C, i64 0, inrange i32 0, i64 2), ptr %this, align 8
+  store ptr getelementptr inbounds inrange(-16, 8) ({ [3 x ptr] }, ptr @_ZTV1C, i64 0, i32 0, i64 2), ptr %this, align 8
   ret void
 }
 
diff --git a/llvm/test/Transforms/GlobalDCE/virtual-functions-visibility-pre-lto.ll b/llvm/test/Transforms/GlobalDCE/virtual-functions-visibility-pre-lto.ll
index 7875d38c8c0b17..ac9c362e1fd681 100644
--- a/llvm/test/Transforms/GlobalDCE/virtual-functions-visibility-pre-lto.ll
+++ b/llvm/test/Transforms/GlobalDCE/virtual-functions-visibility-pre-lto.ll
@@ -12,7 +12,7 @@ target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 
 define internal void @_ZN1AC2Ev(ptr %this) {
 entry:
-  store ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTV1A, i64 0, inrange i32 0, i64 2), ptr %this, align 8
+  store ptr getelementptr inbounds inrange(-16, 8) ({ [3 x ptr] }, ptr @_ZTV1A, i64 0, i32 0, i64 2), ptr %this, align 8
   ret void
 }
 
@@ -36,7 +36,7 @@ entry:
 
 define internal void @_ZN1BC2Ev(ptr %this) {
 entry:
-  store ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTV1B, i64 0, inrange i32 0, i64 2), ptr %this, align 8
+  store ptr getelementptr inbounds inrange(-16, 8) ({ [3 x ptr] }, ptr @_ZTV1B, i64 0, i32 0, i64 2), ptr %this, align 8
   ret void
 }
 
@@ -60,7 +60,7 @@ entry:
 
 define internal void @_ZN1CC2Ev(ptr %this) {
 entry:
-  store ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTV1C, i64 0, inrange i32 0, i64 2), ptr %this, align 8
+  store ptr getelementptr inbounds inrange(-16, 8) ({ [3 x ptr] }, ptr @_ZTV1C, i64 0, i32 0, i64 2), ptr %this, align 8
   ret void
 }
 
diff --git a/llvm/test/Transforms/GlobalDCE/virtual-functions.ll b/llvm/test/Transforms/GlobalDCE/virtual-functions.ll
index e00f8a7fdfecde..0d9b51ca8c5f68 100644
--- a/llvm/test/Transforms/GlobalDCE/virtual-functions.ll
+++ b/llvm/test/Transforms/GlobalDCE/virtual-functions.ll
@@ -38,8 +38,8 @@ entry:
 define dso_local i32 @test_A() {
 entry:
   %call = tail call ptr @_Znwm(i64 8)
-  store ptr getelementptr inbounds ({ [4 x ptr] }, ptr @_ZTV1A, i64 0, inrange i32 0, i64 2), ptr %call, align 8
-  %0 = tail call { ptr, i1 } @llvm.type.checked.load(ptr getelementptr inbounds ({ [4 x ptr] }, ptr @_ZTV1A, i64 0, inrange i32 0, i64 2), i32 0, metadata !"_ZTS1A"), !nosanitize !9
+  store ptr getelementptr inbounds inrange(-16, 16) ({ [4 x ptr] }, ptr @_ZTV1A, i64 0, i32 0, i64 2), ptr %call, align 8
+  %0 = tail call { ptr, i1 } @llvm.type.checked.load(ptr getelementptr inbounds inrange(-16, 16) ({ [4 x ptr] }, ptr @_ZTV1A, i64 0, i32 0, i64 2), i32 0, metadata !"_ZTS1A"), !nosanitize !9
   %1 = extractvalue { ptr, i1 } %0, 0, !nosanitize !9
   %call1 = tail call i32 %1(ptr nonnull %call)
   ret i32 %call1
diff --git a/llvm/test/Transforms/GlobalDCE/vtable-rtti.ll b/llvm/test/Transforms/GlobalDCE/vtable-rtti.ll
index 2eb761e8ab1af0..c4a45ffaf63a80 100644
--- a/llvm/test/Transforms/GlobalDCE/vtable-rtti.ll
+++ b/llvm/test/Transforms/GlobalDCE/vtable-rtti.ll
@@ -16,7 +16,7 @@ target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 
 define internal void @_ZN1AC2Ev(ptr %this) {
 entry:
-  store ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTV1A, i64 0, inrange i32 0, i64 2), ptr %this, align 8
+  store ptr getelementptr inbounds inrange(-16, 8) ({ [3 x ptr] }, ptr @_ZTV1A, i64 0, i32 0, i64 2), ptr %this, align 8
   ret void
 }
 
diff --git a/llvm/test/Transforms/GlobalSplit/basic.ll b/llvm/test/Transforms/GlobalSplit/basic.ll
index c47bdd64e02f51..eb1515741763cf 100644
--- a/llvm/test/Transforms/GlobalSplit/basic.ll
+++ b/llvm/test/Transforms/GlobalSplit/basic.ll
@@ -3,11 +3,11 @@
 target datalayout = "e-p:64:64"
 target triple = "x86_64-unknown-linux-gnu"
 
-; CHECK: @vtt = constant [3 x ptr] [ptr @global.0, ptr getelementptr inbounds ([2 x ptr], ptr @global.0, i32 0, i32 1), ptr @global.1]
+; CHECK: @vtt = constant [3 x ptr] [ptr @global.0, ptr getelementptr inbounds (i8, ptr @global.0, i64 8), ptr @global.1]
 @vtt = constant [3 x ptr] [
-  ptr getelementptr ({ [2 x ptr], [1 x ptr] }, ptr @global, i32 0, inrange i32 0, i32 0),
-  ptr getelementptr ({ [2 x ptr], [1 x ptr] }, ptr @global, i32 0, inrange i32 0, i32 1),
-  ptr getelementptr ({ [2 x ptr], [1 x ptr] }, ptr @global, i32 0, inrange i32 1, i32 0)
+  ptr getelementptr inrange(0, 16) ({ [2 x ptr], [1 x ptr] }, ptr @global, i32 0, i32 0, i32 0),
+  ptr getelementptr inrange(-8, 8) ({ [2 x ptr], [1 x ptr] }, ptr @global, i32 0, i32 0, i32 1),
+  ptr getelementptr inrange(0, 8) ({ [2 x ptr], [1 x ptr] }, ptr @global, i32 0, i32 1, i32 0)
 ]
 
 ; CHECK-NOT: @global =
@@ -22,25 +22,25 @@ target triple = "x86_64-unknown-linux-gnu"
 ; CHECK: define ptr @f1()
 define ptr @f1() {
   ; CHECK-NEXT: ret ptr @global.0
-  ret ptr getelementptr ({ [2 x ptr], [1 x ptr] }, ptr @global, i32 0, inrange i32 0, i32 0)
+  ret ptr getelementptr inrange(0, 16) ({ [2 x ptr], [1 x ptr] }, ptr @global, i32 0, i32 0, i32 0)
 }
 
 ; CHECK: define ptr @f2()
 define ptr @f2() {
-  ; CHECK-NEXT: ret ptr getelementptr inbounds ([2 x ptr], ptr @global.0, i32 0, i32 1)
-  ret ptr getelementptr ({ [2 x ptr], [1 x ptr] }, ptr @global, i32 0, inrange i32 0, i32 1)
+  ; CHECK-NEXT: ret ptr getelementptr inbounds (i8, ptr @global.0, i64 8)
+  ret ptr getelementptr inrange(-8, 8) ({ [2 x ptr], [1 x ptr] }, ptr @global, i32 0, i32 0, i32 1)
 }
 
 ; CHECK: define ptr @f3()
 define ptr @f3() {
-  ; CHECK-NEXT: ret ptr getelementptr inbounds ([2 x ptr], ptr @global.0, i64 1, i32 0)
-  ret ptr getelementptr ({ [2 x ptr], [1 x ptr] }, ptr @global, i32 0, inrange i32 0, i32 2)
+  ; CHECK-NEXT: ret ptr getelementptr (i8, ptr @global.0, i64 16)
+  ret ptr getelementptr inrange(-16, 0) ({ [2 x ptr], [1 x ptr] }, ptr @global, i32 0, i32 0, i32 2)
 }
 
 ; CHECK: define ptr @f4()
 define ptr @f4() {
   ; CHECK-NEXT: ret ptr @global.1
-  ret ptr getelementptr ({ [2 x ptr], [1 x ptr] }, ptr @global, i32 0, inrange i32 1, i32 0)
+  ret ptr getelementptr inrange(0, 8) ({ [2 x ptr], [1 x ptr] }, ptr @global, i32 0, i32 1, i32 0)
 }
 
 define void @foo() {
diff --git a/llvm/test/Transforms/GlobalSplit/non-beneficial.ll b/llvm/test/Transforms/GlobalSplit/non-beneficial.ll
index a7c50c985c907a..5e27623dd6e9c7 100644
--- a/llvm/test/Transforms/GlobalSplit/non-beneficial.ll
+++ b/llvm/test/Transforms/GlobalSplit/non-beneficial.ll
@@ -10,7 +10,7 @@ target triple = "x86_64-unknown-linux-gnu"
 }
 
 define ptr @f() {
-  ret ptr getelementptr ({ [2 x ptr], [1 x ptr] }, ptr @global, i32 0, inrange i32 0, i32 0)
+  ret ptr getelementptr inrange(0, 16) ({ [2 x ptr], [1 x ptr] }, ptr @global, i32 0, i32 0, i32 0)
 }
 
 define ptr @g() {
diff --git a/llvm/test/Transforms/GlobalSplit/nonlocal.ll b/llvm/test/Transforms/GlobalSplit/nonlocal.ll
index 38169a8bea479f..0e639135feb61d 100644
--- a/llvm/test/Transforms/GlobalSplit/nonlocal.ll
+++ b/llvm/test/Transforms/GlobalSplit/nonlocal.ll
@@ -10,7 +10,7 @@ target triple = "x86_64-unknown-linux-gnu"
 }
 
 define ptr @f() {
-  ret ptr getelementptr ({ [2 x ptr], [1 x ptr] }, ptr @global, i32 0, inrange i32 0, i32 0)
+  ret ptr getelementptr inrange(0, 16) ({ [2 x ptr], [1 x ptr] }, ptr @global, i32 0, i32 0, i32 0)
 }
 
 define ptr @g() {
diff --git a/llvm/test/Transforms/HotColdSplit/outline-disjoint-diamonds.ll b/llvm/test/Transforms/HotColdSplit/outline-disjoint-diamonds.ll
index 0c055981260b2b..55013aa96551d0 100644
--- a/llvm/test/Transforms/HotColdSplit/outline-disjoint-diamonds.ll
+++ b/llvm/test/Transforms/HotColdSplit/outline-disjoint-diamonds.ll
@@ -2,9 +2,9 @@
 
 ; CHECK-LABEL: define {{.*}}@fun
 ; CHECK: call {{.*}}@fun.cold.1(
-; CHECK-NEXT: ret void
+; CHECK-NEXT: unreachable
 ; CHECK: call {{.*}}@fun.cold.2(
-; CHECK-NEXT: ret void
+; CHECK-NEXT: unreachable
 define void @fun() {
 entry:
   br i1 undef, label %A.then, label %A.else
diff --git a/llvm/test/Transforms/IROutliner/outlining-no-return-functions.ll b/llvm/test/Transforms/IROutliner/outlining-no-return-functions.ll
index 6b12207a32dff8..d2c41376f6b2c8 100644
--- a/llvm/test/Transforms/IROutliner/outlining-no-return-functions.ll
+++ b/llvm/test/Transforms/IROutliner/outlining-no-return-functions.ll
@@ -29,19 +29,19 @@ bb1:
 ; CHECK-LABEL: @f1(
 ; CHECK-NEXT:  bb:
 ; CHECK-NEXT:    call void @outlined_ir_func_0()
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    unreachable
 ;
 ;
 ; CHECK-LABEL: @f2(
 ; CHECK-NEXT:  bb:
 ; CHECK-NEXT:    call void @outlined_ir_func_0()
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    unreachable
 ;
 ;
 ; CHECK-LABEL: @f3(
 ; CHECK-NEXT:  bb:
 ; CHECK-NEXT:    call void @outlined_ir_func_0()
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    unreachable
 ;
 ;
 ; CHECK-LABEL: define internal void @outlined_ir_func_0(
diff --git a/llvm/test/Transforms/IndVarSimplify/iv-widen-elim-ext.ll b/llvm/test/Transforms/IndVarSimplify/iv-widen-elim-ext.ll
index 0e21bf8ba347a8..59a0241bfe9fde 100644
--- a/llvm/test/Transforms/IndVarSimplify/iv-widen-elim-ext.ll
+++ b/llvm/test/Transforms/IndVarSimplify/iv-widen-elim-ext.ll
@@ -493,3 +493,55 @@ for.body:                                         ; preds = %for.body.lr.ph, %fo
   %cmp = icmp ult i32 %add, %length
   br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit
 }
+
+; Test that we can handle shl and disjoint or in getExtendedOperandRecurrence.
+define void @foo7(i32 %n, ptr %a, i32 %x) {
+; CHECK-LABEL: @foo7(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; CHECK-NEXT:    br i1 [[CMP6]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK:       for.body.lr.ph:
+; CHECK-NEXT:    [[ADD1:%.*]] = add nsw i32 [[X:%.*]], 2
+; CHECK-NEXT:    [[TMP0:%.*]] = sext i32 [[ADD1]] to i64
+; CHECK-NEXT:    [[TMP1:%.*]] = sext i32 [[N]] to i64
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.cond.cleanup.loopexit:
+; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret void
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_LR_PH]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = shl nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[TMP3:%.*]] = or disjoint i64 [[TMP2]], 1
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP4:%.*]] = trunc i64 [[INDVARS_IV]] to i32
+; CHECK-NEXT:    store i32 [[TMP4]], ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], [[TMP0]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i64 [[INDVARS_IV_NEXT]], [[TMP1]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]]
+;
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body.lr.ph, label %for.cond.cleanup
+
+for.body.lr.ph:                                   ; preds = %entry
+  %add1 = add nsw i32 %x, 2
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.body
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body.lr.ph, %for.body
+  %i.07 = phi i32 [ 0, %for.body.lr.ph ], [ %add2, %for.body ]
+  %mul = shl nsw i32 %i.07, 1
+  %add = or disjoint i32 %mul, 1
+  %idxprom = sext i32 %add to i64
+  %arrayidx = getelementptr inbounds i32, ptr %a, i64 %idxprom
+  store i32 %i.07, ptr %arrayidx, align 4
+  %add2 = add nsw i32 %add1, %i.07
+  %cmp = icmp slt i32 %add2, %n
+  br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit
+}
diff --git a/llvm/test/Transforms/Inline/devirtualize-4.ll b/llvm/test/Transforms/Inline/devirtualize-4.ll
index fae364313ea028..d29360f73b4739 100644
--- a/llvm/test/Transforms/Inline/devirtualize-4.ll
+++ b/llvm/test/Transforms/Inline/devirtualize-4.ll
@@ -60,7 +60,7 @@ declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture)
 define linkonce_odr dso_local void @_ZN4ImplC2Ev(ptr %this) unnamed_addr align 2 {
 entry:
   call void @_ZN9InterfaceC2Ev(ptr %this)
-  store ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTV4Impl, i64 0, inrange i32 0, i64 2), ptr %this, align 8
+  store ptr getelementptr inbounds inrange(-16, 8) ({ [3 x ptr] }, ptr @_ZTV4Impl, i64 0, i32 0, i64 2), ptr %this, align 8
   %f = getelementptr inbounds %class.Impl, ptr %this, i64 0, i32 1
   store i32 3, ptr %f, align 8
   ret void
@@ -78,7 +78,7 @@ declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture)
 
 define linkonce_odr dso_local void @_ZN9InterfaceC2Ev(ptr %this) unnamed_addr align 2 {
 entry:
-  store ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTV9Interface, i64 0, inrange i32 0, i64 2), ptr %this, align 8
+  store ptr getelementptr inbounds inrange(-16, 8) ({ [3 x ptr] }, ptr @_ZTV9Interface, i64 0, i32 0, i64 2), ptr %this, align 8
   ret void
 }
 
@@ -185,7 +185,7 @@ entry:
 
 define linkonce_odr void @_ZN1AC2Ev(ptr %this) align 2 {
 entry:
-  store ptr getelementptr inbounds ({ [4 x ptr] }, ptr @_ZTV1A, i64 0, inrange i32 0, i64 2), ptr %this, align 8
+  store ptr getelementptr inbounds inrange(-16, 8) ({ [4 x ptr] }, ptr @_ZTV1A, i64 0, i32 0, i64 2), ptr %this, align 8
   ret void
 }
 
diff --git a/llvm/test/Transforms/InstCombine/add-sitofp.ll b/llvm/test/Transforms/InstCombine/add-sitofp.ll
index 861525f424fbca..2bdc808d9771c4 100644
--- a/llvm/test/Transforms/InstCombine/add-sitofp.ll
+++ b/llvm/test/Transforms/InstCombine/add-sitofp.ll
@@ -33,7 +33,7 @@ define double @test(i32 %a) {
 define float @test_neg(i32 %a) {
 ; CHECK-LABEL: @test_neg(
 ; CHECK-NEXT:    [[A_AND:%.*]] = and i32 [[A:%.*]], 1073741823
-; CHECK-NEXT:    [[A_AND_FP:%.*]] = uitofp i32 [[A_AND]] to float
+; CHECK-NEXT:    [[A_AND_FP:%.*]] = sitofp i32 [[A_AND]] to float
 ; CHECK-NEXT:    [[RES:%.*]] = fadd float [[A_AND_FP]], 1.000000e+00
 ; CHECK-NEXT:    ret float [[RES]]
 ;
@@ -67,8 +67,8 @@ define float @test_2_neg(i32 %a, i32 %b) {
 ; CHECK-LABEL: @test_2_neg(
 ; CHECK-NEXT:    [[A_AND:%.*]] = and i32 [[A:%.*]], 1073741823
 ; CHECK-NEXT:    [[B_AND:%.*]] = and i32 [[B:%.*]], 1073741823
-; CHECK-NEXT:    [[A_AND_FP:%.*]] = uitofp i32 [[A_AND]] to float
-; CHECK-NEXT:    [[B_AND_FP:%.*]] = uitofp i32 [[B_AND]] to float
+; CHECK-NEXT:    [[A_AND_FP:%.*]] = sitofp i32 [[A_AND]] to float
+; CHECK-NEXT:    [[B_AND_FP:%.*]] = sitofp i32 [[B_AND]] to float
 ; CHECK-NEXT:    [[RES:%.*]] = fadd float [[A_AND_FP]], [[B_AND_FP]]
 ; CHECK-NEXT:    ret float [[RES]]
 ;
@@ -122,8 +122,8 @@ define <4 x float> @test_4_neg(<4 x i32> %a, <4 x i32> %b) {
 ; CHECK-LABEL: @test_4_neg(
 ; CHECK-NEXT:    [[A_AND:%.*]] = and <4 x i32> [[A:%.*]], <i32 1073741823, i32 1073741823, i32 1073741823, i32 1073741823>
 ; CHECK-NEXT:    [[B_AND:%.*]] = and <4 x i32> [[B:%.*]], <i32 1073741823, i32 1073741823, i32 1073741823, i32 1073741823>
-; CHECK-NEXT:    [[A_AND_FP:%.*]] = uitofp <4 x i32> [[A_AND]] to <4 x float>
-; CHECK-NEXT:    [[B_AND_FP:%.*]] = uitofp <4 x i32> [[B_AND]] to <4 x float>
+; CHECK-NEXT:    [[A_AND_FP:%.*]] = sitofp <4 x i32> [[A_AND]] to <4 x float>
+; CHECK-NEXT:    [[B_AND_FP:%.*]] = sitofp <4 x i32> [[B_AND]] to <4 x float>
 ; CHECK-NEXT:    [[RES:%.*]] = fadd <4 x float> [[A_AND_FP]], [[B_AND_FP]]
 ; CHECK-NEXT:    ret <4 x float> [[RES]]
 ;
diff --git a/llvm/test/Transforms/InstCombine/and-or-implied-cond-not.ll b/llvm/test/Transforms/InstCombine/and-or-implied-cond-not.ll
new file mode 100644
index 00000000000000..89b3164018ac9a
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/and-or-implied-cond-not.ll
@@ -0,0 +1,69 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt < %s -passes=instcombine -S | FileCheck %s
+
+define i1 @test_imply_not1(i32 %depth) {
+; CHECK-LABEL: define i1 @test_imply_not1(
+; CHECK-SAME: i32 [[DEPTH:%.*]]) {
+; CHECK-NEXT:    [[CMP1_NOT1:%.*]] = icmp eq i32 [[DEPTH]], 16
+; CHECK-NEXT:    call void @use(i1 [[CMP1_NOT1]])
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp slt i32 [[DEPTH]], 8
+; CHECK-NEXT:    call void @use(i1 [[CMP2]])
+; CHECK-NEXT:    br i1 [[CMP1_NOT1]], label [[IF_ELSE:%.*]], label [[IF_THEN:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    call void @func1()
+; CHECK-NEXT:    unreachable
+; CHECK:       if.else:
+; CHECK-NEXT:    call void @func2()
+; CHECK-NEXT:    unreachable
+;
+  %cmp1 = icmp eq i32 %depth, 16
+  call void @use(i1 %cmp1)
+  %cmp2 = icmp slt i32 %depth, 8
+  call void @use(i1 %cmp2)
+  %cmp.not = xor i1 %cmp1, true
+  %brmerge = or i1 %cmp2, %cmp.not
+  br i1 %brmerge, label %if.then, label %if.else
+if.then:
+  call void @func1()
+  unreachable
+
+if.else:
+  call void @func2()
+  unreachable
+}
+
+define i1 @test_imply_not2(i32 %a, i1 %cmp2) {
+; CHECK-LABEL: define i1 @test_imply_not2(
+; CHECK-SAME: i32 [[A:%.*]], i1 [[CMP2:%.*]]) {
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ne i32 [[A]], 0
+; CHECK-NEXT:    [[BRMERGE:%.*]] = select i1 [[CMP1]], i1 true, i1 [[CMP2]]
+; CHECK-NEXT:    ret i1 [[BRMERGE]]
+;
+  %cmp1 = icmp eq i32 %a, 0
+  %or.cond = select i1 %cmp1, i1 %cmp2, i1 false
+  %cmp.not = xor i1 %cmp1, true
+  %brmerge = or i1 %or.cond, %cmp.not
+  ret i1 %brmerge
+}
+
+define i1 @test_imply_not3(i32 %a, i32 %b, i1 %cond) {
+; CHECK-LABEL: define i1 @test_imply_not3(
+; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]], i1 [[COND:%.*]]) {
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i32 [[A]], [[B]]
+; CHECK-NEXT:    call void @use(i1 [[CMP1]])
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp slt i32 [[A]], [[B]]
+; CHECK-NEXT:    [[AND:%.*]] = select i1 [[CMP2]], i1 [[COND]], i1 false
+; CHECK-NEXT:    ret i1 [[AND]]
+;
+  %cmp1 = icmp eq i32 %a, %b
+  call void @use(i1 %cmp1)
+  %cmp2 = icmp slt i32 %a, %b
+  %cmp.not = xor i1 %cmp1, true
+  %sel = select i1 %cmp.not, i1 %cond, i1 false
+  %and = and i1 %cmp2, %sel
+  ret i1 %and
+}
+
+declare void @func1()
+declare void @func2()
+declare void @use(i1)
diff --git a/llvm/test/Transforms/InstCombine/assume.ll b/llvm/test/Transforms/InstCombine/assume.ll
index 927f0a86b0a252..87c75fb2b55592 100644
--- a/llvm/test/Transforms/InstCombine/assume.ll
+++ b/llvm/test/Transforms/InstCombine/assume.ll
@@ -386,7 +386,7 @@ define i1 @nonnull5(ptr %a) {
 define i32 @assumption_conflicts_with_known_bits(i32 %a, i32 %b) {
 ; CHECK-LABEL: @assumption_conflicts_with_known_bits(
 ; CHECK-NEXT:    store i1 true, ptr poison, align 1
-; CHECK-NEXT:    ret i32 1
+; CHECK-NEXT:    ret i32 poison
 ;
   %and1 = and i32 %b, 3
   %B1 = lshr i32 %and1, %and1
diff --git a/llvm/test/Transforms/InstCombine/binop-itofp.ll b/llvm/test/Transforms/InstCombine/binop-itofp.ll
index f796273c84e082..82cdb3ce6bee61 100644
--- a/llvm/test/Transforms/InstCombine/binop-itofp.ll
+++ b/llvm/test/Transforms/InstCombine/binop-itofp.ll
@@ -166,7 +166,7 @@ define half @test_si_si_i8_sub(i8 noundef %x_in, i8 noundef %y_in) {
 ; CHECK-NEXT:    [[X:%.*]] = and i8 [[X_IN:%.*]], 63
 ; CHECK-NEXT:    [[Y:%.*]] = or i8 [[Y_IN:%.*]], -64
 ; CHECK-NEXT:    [[TMP1:%.*]] = sub nsw i8 [[X]], [[Y]]
-; CHECK-NEXT:    [[R:%.*]] = uitofp i8 [[TMP1]] to half
+; CHECK-NEXT:    [[R:%.*]] = sitofp i8 [[TMP1]] to half
 ; CHECK-NEXT:    ret half [[R]]
 ;
   %x = and i8 %x_in, 63
@@ -181,7 +181,7 @@ define half @test_si_si_i8_sub_fail_overflow(i8 noundef %x_in, i8 noundef %y_in)
 ; CHECK-LABEL: @test_si_si_i8_sub_fail_overflow(
 ; CHECK-NEXT:    [[X:%.*]] = and i8 [[X_IN:%.*]], 63
 ; CHECK-NEXT:    [[Y:%.*]] = or i8 [[Y_IN:%.*]], -65
-; CHECK-NEXT:    [[XF:%.*]] = uitofp i8 [[X]] to half
+; CHECK-NEXT:    [[XF:%.*]] = sitofp i8 [[X]] to half
 ; CHECK-NEXT:    [[YF:%.*]] = sitofp i8 [[Y]] to half
 ; CHECK-NEXT:    [[R:%.*]] = fsub half [[XF]], [[YF]]
 ; CHECK-NEXT:    ret half [[R]]
@@ -198,7 +198,7 @@ define half @test_si_si_i8_sub_C(i8 noundef %x_in) {
 ; CHECK-LABEL: @test_si_si_i8_sub_C(
 ; CHECK-NEXT:    [[X:%.*]] = and i8 [[X_IN:%.*]], 63
 ; CHECK-NEXT:    [[TMP1:%.*]] = or disjoint i8 [[X]], 64
-; CHECK-NEXT:    [[R:%.*]] = uitofp i8 [[TMP1]] to half
+; CHECK-NEXT:    [[R:%.*]] = sitofp i8 [[TMP1]] to half
 ; CHECK-NEXT:    ret half [[R]]
 ;
   %x = and i8 %x_in, 63
@@ -315,7 +315,7 @@ define half @test_si_si_i8_mul_fail_maybe_zero(i8 noundef %x_in, i8 noundef %y_i
 ; CHECK-LABEL: @test_si_si_i8_mul_fail_maybe_zero(
 ; CHECK-NEXT:    [[X:%.*]] = and i8 [[X_IN:%.*]], 7
 ; CHECK-NEXT:    [[Y:%.*]] = or i8 [[Y_IN:%.*]], -8
-; CHECK-NEXT:    [[XF:%.*]] = uitofp i8 [[X]] to half
+; CHECK-NEXT:    [[XF:%.*]] = sitofp i8 [[X]] to half
 ; CHECK-NEXT:    [[YF:%.*]] = sitofp i8 [[Y]] to half
 ; CHECK-NEXT:    [[R:%.*]] = fmul half [[XF]], [[YF]]
 ; CHECK-NEXT:    ret half [[R]]
@@ -332,7 +332,7 @@ define half @test_si_si_i8_mul_C_fail_no_repr(i8 noundef %x_in) {
 ; CHECK-LABEL: @test_si_si_i8_mul_C_fail_no_repr(
 ; CHECK-NEXT:    [[XX:%.*]] = and i8 [[X_IN:%.*]], 6
 ; CHECK-NEXT:    [[X:%.*]] = or disjoint i8 [[XX]], 1
-; CHECK-NEXT:    [[XF:%.*]] = uitofp i8 [[X]] to half
+; CHECK-NEXT:    [[XF:%.*]] = sitofp i8 [[X]] to half
 ; CHECK-NEXT:    [[R:%.*]] = fmul half [[XF]], 0xHC780
 ; CHECK-NEXT:    ret half [[R]]
 ;
@@ -347,7 +347,7 @@ define half @test_si_si_i8_mul_C_fail_overflow(i8 noundef %x_in) {
 ; CHECK-LABEL: @test_si_si_i8_mul_C_fail_overflow(
 ; CHECK-NEXT:    [[XX:%.*]] = and i8 [[X_IN:%.*]], 6
 ; CHECK-NEXT:    [[X:%.*]] = or disjoint i8 [[XX]], 1
-; CHECK-NEXT:    [[XF:%.*]] = uitofp i8 [[X]] to half
+; CHECK-NEXT:    [[XF:%.*]] = sitofp i8 [[X]] to half
 ; CHECK-NEXT:    [[R:%.*]] = fmul half [[XF]], 0xHCCC0
 ; CHECK-NEXT:    ret half [[R]]
 ;
@@ -401,7 +401,7 @@ define half @test_ui_si_i8_mul_fail_signed(i8 noundef %x_in, i8 noundef %y_in) {
 ; CHECK-NEXT:    [[XX:%.*]] = and i8 [[X_IN:%.*]], 7
 ; CHECK-NEXT:    [[X:%.*]] = add nuw nsw i8 [[XX]], 1
 ; CHECK-NEXT:    [[Y:%.*]] = or i8 [[Y_IN:%.*]], -4
-; CHECK-NEXT:    [[XF:%.*]] = uitofp i8 [[X]] to half
+; CHECK-NEXT:    [[XF:%.*]] = sitofp i8 [[X]] to half
 ; CHECK-NEXT:    [[YF:%.*]] = uitofp i8 [[Y]] to half
 ; CHECK-NEXT:    [[R:%.*]] = fmul half [[XF]], [[YF]]
 ; CHECK-NEXT:    ret half [[R]]
@@ -541,7 +541,7 @@ define half @test_si_si_i16_sub_fail_no_promotion(i16 noundef %x_in, i16 noundef
 ; CHECK-LABEL: @test_si_si_i16_sub_fail_no_promotion(
 ; CHECK-NEXT:    [[X:%.*]] = and i16 [[X_IN:%.*]], 2047
 ; CHECK-NEXT:    [[Y:%.*]] = or i16 [[Y_IN:%.*]], -2049
-; CHECK-NEXT:    [[XF:%.*]] = uitofp i16 [[X]] to half
+; CHECK-NEXT:    [[XF:%.*]] = sitofp i16 [[X]] to half
 ; CHECK-NEXT:    [[YF:%.*]] = sitofp i16 [[Y]] to half
 ; CHECK-NEXT:    [[R:%.*]] = fsub half [[XF]], [[YF]]
 ; CHECK-NEXT:    ret half [[R]]
@@ -575,7 +575,7 @@ define half @test_ui_si_i16_sub_fail_maybe_signed(i16 noundef %x_in, i16 noundef
 ; CHECK-NEXT:    [[X:%.*]] = or i16 [[X_IN:%.*]], -2048
 ; CHECK-NEXT:    [[Y:%.*]] = and i16 [[Y_IN:%.*]], 2047
 ; CHECK-NEXT:    [[XF:%.*]] = uitofp i16 [[X]] to half
-; CHECK-NEXT:    [[YF:%.*]] = uitofp i16 [[Y]] to half
+; CHECK-NEXT:    [[YF:%.*]] = sitofp i16 [[Y]] to half
 ; CHECK-NEXT:    [[R:%.*]] = fsub half [[XF]], [[YF]]
 ; CHECK-NEXT:    ret half [[R]]
 ;
@@ -643,7 +643,7 @@ define half @test_si_si_i16_mul_fail_overflow(i16 noundef %x_in, i16 noundef %y_
 ; CHECK-NEXT:    [[XX:%.*]] = and i16 [[X_IN:%.*]], 126
 ; CHECK-NEXT:    [[X:%.*]] = or disjoint i16 [[XX]], 1
 ; CHECK-NEXT:    [[Y:%.*]] = or i16 [[Y_IN:%.*]], -257
-; CHECK-NEXT:    [[XF:%.*]] = uitofp i16 [[X]] to half
+; CHECK-NEXT:    [[XF:%.*]] = sitofp i16 [[X]] to half
 ; CHECK-NEXT:    [[YF:%.*]] = sitofp i16 [[Y]] to half
 ; CHECK-NEXT:    [[R:%.*]] = fmul half [[XF]], [[YF]]
 ; CHECK-NEXT:    ret half [[R]]
@@ -821,7 +821,7 @@ define half @test_si_si_i12_sub(i12 noundef %x_in, i12 noundef %y_in) {
 ; CHECK-NEXT:    [[X:%.*]] = and i12 [[X_IN:%.*]], 1023
 ; CHECK-NEXT:    [[Y:%.*]] = or i12 [[Y_IN:%.*]], -1024
 ; CHECK-NEXT:    [[TMP1:%.*]] = sub nsw i12 [[X]], [[Y]]
-; CHECK-NEXT:    [[R:%.*]] = uitofp i12 [[TMP1]] to half
+; CHECK-NEXT:    [[R:%.*]] = sitofp i12 [[TMP1]] to half
 ; CHECK-NEXT:    ret half [[R]]
 ;
   %x = and i12 %x_in, 1023
@@ -915,7 +915,7 @@ define half @test_si_si_i12_mul_fail_overflow(i12 noundef %x_in, i12 noundef %y_
 ; CHECK-NEXT:    [[XX:%.*]] = and i12 [[X_IN:%.*]], 30
 ; CHECK-NEXT:    [[X:%.*]] = or disjoint i12 [[XX]], 1
 ; CHECK-NEXT:    [[Y:%.*]] = or i12 [[Y_IN:%.*]], -128
-; CHECK-NEXT:    [[XF:%.*]] = uitofp i12 [[X]] to half
+; CHECK-NEXT:    [[XF:%.*]] = sitofp i12 [[X]] to half
 ; CHECK-NEXT:    [[YF:%.*]] = sitofp i12 [[Y]] to half
 ; CHECK-NEXT:    [[R:%.*]] = fmul half [[XF]], [[YF]]
 ; CHECK-NEXT:    ret half [[R]]
@@ -933,7 +933,7 @@ define half @test_si_si_i12_mul_fail_maybe_non_zero(i12 noundef %x_in, i12 nound
 ; CHECK-LABEL: @test_si_si_i12_mul_fail_maybe_non_zero(
 ; CHECK-NEXT:    [[X:%.*]] = and i12 [[X_IN:%.*]], 30
 ; CHECK-NEXT:    [[Y:%.*]] = or i12 [[Y_IN:%.*]], -128
-; CHECK-NEXT:    [[XF:%.*]] = uitofp i12 [[X]] to half
+; CHECK-NEXT:    [[XF:%.*]] = sitofp i12 [[X]] to half
 ; CHECK-NEXT:    [[YF:%.*]] = sitofp i12 [[Y]] to half
 ; CHECK-NEXT:    [[R:%.*]] = fmul half [[XF]], [[YF]]
 ; CHECK-NEXT:    ret half [[R]]
@@ -950,7 +950,7 @@ define half @test_si_si_i12_mul_C(i12 noundef %x_in) {
 ; CHECK-LABEL: @test_si_si_i12_mul_C(
 ; CHECK-NEXT:    [[X:%.*]] = or i12 [[X_IN:%.*]], -64
 ; CHECK-NEXT:    [[TMP1:%.*]] = mul nsw i12 [[X]], -16
-; CHECK-NEXT:    [[R:%.*]] = uitofp i12 [[TMP1]] to half
+; CHECK-NEXT:    [[R:%.*]] = sitofp i12 [[TMP1]] to half
 ; CHECK-NEXT:    ret half [[R]]
 ;
   %x = or i12 %x_in, -64
@@ -1004,3 +1004,123 @@ define float @test_ui_add_with_signed_constant(i32 %shr.i) {
   %add = fadd float %sub, -16383.0
   ret float %add
 }
+
+
+;; Reduced form of bug noticed due to #82555
+define float @missed_nonzero_check_on_constant_for_si_fmul(i1 %c, i1 %.b, ptr %g_2345) {
+; CHECK-LABEL: @missed_nonzero_check_on_constant_for_si_fmul(
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[C:%.*]], i32 65529, i32 53264
+; CHECK-NEXT:    [[CONV_I:%.*]] = trunc i32 [[SEL]] to i16
+; CHECK-NEXT:    [[CONV1_I:%.*]] = sitofp i16 [[CONV_I]] to float
+; CHECK-NEXT:    [[MUL3_I_I:%.*]] = fmul float [[CONV1_I]], 0.000000e+00
+; CHECK-NEXT:    store i32 [[SEL]], ptr [[G_2345:%.*]], align 4
+; CHECK-NEXT:    ret float [[MUL3_I_I]]
+;
+  %sel = select i1 %c, i32 65529, i32 53264
+  %conv.i = trunc i32 %sel to i16
+  %conv1.i = sitofp i16 %conv.i to float
+  %mul3.i.i = fmul float %conv1.i, 0.000000e+00
+  store i32 %sel, ptr %g_2345, align 4
+  ret float %mul3.i.i
+}
+
+define <2 x float> @missed_nonzero_check_on_constant_for_si_fmul_vec(i1 %c, i1 %.b, ptr %g_2345) {
+; CHECK-LABEL: @missed_nonzero_check_on_constant_for_si_fmul_vec(
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[C:%.*]], i32 65529, i32 53264
+; CHECK-NEXT:    [[CONV_I_S:%.*]] = trunc i32 [[SEL]] to i16
+; CHECK-NEXT:    [[CONV_I_V:%.*]] = insertelement <2 x i16> poison, i16 [[CONV_I_S]], i64 0
+; CHECK-NEXT:    [[CONV_I:%.*]] = shufflevector <2 x i16> [[CONV_I_V]], <2 x i16> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[CONV1_I:%.*]] = sitofp <2 x i16> [[CONV_I]] to <2 x float>
+; CHECK-NEXT:    [[MUL3_I_I:%.*]] = fmul <2 x float> [[CONV1_I]], zeroinitializer
+; CHECK-NEXT:    store i32 [[SEL]], ptr [[G_2345:%.*]], align 4
+; CHECK-NEXT:    ret <2 x float> [[MUL3_I_I]]
+;
+  %sel = select i1 %c, i32 65529, i32 53264
+  %conv.i.s = trunc i32 %sel to i16
+  %conv.i.v = insertelement <2 x i16> poison, i16 %conv.i.s, i64 0
+  %conv.i = insertelement <2 x i16> %conv.i.v, i16 %conv.i.s, i64 1
+  %conv1.i = sitofp <2 x i16> %conv.i to <2 x float>
+  %mul3.i.i = fmul <2 x float> %conv1.i, zeroinitializer
+  store i32 %sel, ptr %g_2345, align 4
+  ret <2 x float> %mul3.i.i
+}
+
+define float @negzero_check_on_constant_for_si_fmul(i1 %c, i1 %.b, ptr %g_2345) {
+; CHECK-LABEL: @negzero_check_on_constant_for_si_fmul(
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[C:%.*]], i32 65529, i32 53264
+; CHECK-NEXT:    [[CONV_I:%.*]] = trunc i32 [[SEL]] to i16
+; CHECK-NEXT:    [[CONV1_I:%.*]] = sitofp i16 [[CONV_I]] to float
+; CHECK-NEXT:    [[MUL3_I_I:%.*]] = fmul float [[CONV1_I]], -0.000000e+00
+; CHECK-NEXT:    store i32 [[SEL]], ptr [[G_2345:%.*]], align 4
+; CHECK-NEXT:    ret float [[MUL3_I_I]]
+;
+  %sel = select i1 %c, i32 65529, i32 53264
+  %conv.i = trunc i32 %sel to i16
+  %conv1.i = sitofp i16 %conv.i to float
+  %mul3.i.i = fmul float %conv1.i, -0.000000e+00
+  store i32 %sel, ptr %g_2345, align 4
+  ret float %mul3.i.i
+}
+
+define <2 x float> @nonzero_check_on_constant_for_si_fmul_vec_w_undef(i1 %c, i1 %.b, ptr %g_2345) {
+; CHECK-LABEL: @nonzero_check_on_constant_for_si_fmul_vec_w_undef(
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[C:%.*]], i32 65529, i32 53264
+; CHECK-NEXT:    [[CONV_I_S:%.*]] = trunc i32 [[SEL]] to i16
+; CHECK-NEXT:    [[CONV_I_V:%.*]] = insertelement <2 x i16> poison, i16 [[CONV_I_S]], i64 0
+; CHECK-NEXT:    [[CONV_I:%.*]] = shufflevector <2 x i16> [[CONV_I_V]], <2 x i16> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[CONV1_I:%.*]] = sitofp <2 x i16> [[CONV_I]] to <2 x float>
+; CHECK-NEXT:    [[MUL3_I_I:%.*]] = fmul <2 x float> [[CONV1_I]], <float undef, float 0.000000e+00>
+; CHECK-NEXT:    store i32 [[SEL]], ptr [[G_2345:%.*]], align 4
+; CHECK-NEXT:    ret <2 x float> [[MUL3_I_I]]
+;
+  %sel = select i1 %c, i32 65529, i32 53264
+  %conv.i.s = trunc i32 %sel to i16
+  %conv.i.v = insertelement <2 x i16> poison, i16 %conv.i.s, i64 0
+  %conv.i = insertelement <2 x i16> %conv.i.v, i16 %conv.i.s, i64 1
+  %conv1.i = sitofp <2 x i16> %conv.i to <2 x float>
+  %mul3.i.i = fmul <2 x float> %conv1.i, <float undef, float 0.000000e+00>
+  store i32 %sel, ptr %g_2345, align 4
+  ret <2 x float> %mul3.i.i
+}
+
+define <2 x float> @nonzero_check_on_constant_for_si_fmul_nz_vec_w_undef(i1 %c, i1 %.b, ptr %g_2345) {
+; CHECK-LABEL: @nonzero_check_on_constant_for_si_fmul_nz_vec_w_undef(
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[C:%.*]], i32 65529, i32 53264
+; CHECK-NEXT:    [[CONV_I_S:%.*]] = trunc i32 [[SEL]] to i16
+; CHECK-NEXT:    [[CONV_I_V:%.*]] = insertelement <2 x i16> poison, i16 [[CONV_I_S]], i64 0
+; CHECK-NEXT:    [[CONV_I:%.*]] = shufflevector <2 x i16> [[CONV_I_V]], <2 x i16> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[CONV1_I:%.*]] = sitofp <2 x i16> [[CONV_I]] to <2 x float>
+; CHECK-NEXT:    [[MUL3_I_I:%.*]] = fmul <2 x float> [[CONV1_I]], <float undef, float 1.000000e+00>
+; CHECK-NEXT:    store i32 [[SEL]], ptr [[G_2345:%.*]], align 4
+; CHECK-NEXT:    ret <2 x float> [[MUL3_I_I]]
+;
+  %sel = select i1 %c, i32 65529, i32 53264
+  %conv.i.s = trunc i32 %sel to i16
+  %conv.i.v = insertelement <2 x i16> poison, i16 %conv.i.s, i64 0
+  %conv.i = insertelement <2 x i16> %conv.i.v, i16 %conv.i.s, i64 1
+  %conv1.i = sitofp <2 x i16> %conv.i to <2 x float>
+  %mul3.i.i = fmul <2 x float> %conv1.i, <float undef, float 1.000000e+00>
+  store i32 %sel, ptr %g_2345, align 4
+  ret <2 x float> %mul3.i.i
+}
+
+define <2 x float> @nonzero_check_on_constant_for_si_fmul_negz_vec_w_undef(i1 %c, i1 %.b, ptr %g_2345) {
+; CHECK-LABEL: @nonzero_check_on_constant_for_si_fmul_negz_vec_w_undef(
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[C:%.*]], i32 65529, i32 53264
+; CHECK-NEXT:    [[CONV_I_S:%.*]] = trunc i32 [[SEL]] to i16
+; CHECK-NEXT:    [[CONV_I_V:%.*]] = insertelement <2 x i16> poison, i16 [[CONV_I_S]], i64 0
+; CHECK-NEXT:    [[CONV_I:%.*]] = shufflevector <2 x i16> [[CONV_I_V]], <2 x i16> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[CONV1_I:%.*]] = sitofp <2 x i16> [[CONV_I]] to <2 x float>
+; CHECK-NEXT:    [[MUL3_I_I:%.*]] = fmul <2 x float> [[CONV1_I]], <float undef, float -0.000000e+00>
+; CHECK-NEXT:    store i32 [[SEL]], ptr [[G_2345:%.*]], align 4
+; CHECK-NEXT:    ret <2 x float> [[MUL3_I_I]]
+;
+  %sel = select i1 %c, i32 65529, i32 53264
+  %conv.i.s = trunc i32 %sel to i16
+  %conv.i.v = insertelement <2 x i16> poison, i16 %conv.i.s, i64 0
+  %conv.i = insertelement <2 x i16> %conv.i.v, i16 %conv.i.s, i64 1
+  %conv1.i = sitofp <2 x i16> %conv.i to <2 x float>
+  %mul3.i.i = fmul <2 x float> %conv1.i, <float undef, float -0.000000e+00>
+  store i32 %sel, ptr %g_2345, align 4
+  ret <2 x float> %mul3.i.i
+}
diff --git a/llvm/test/Transforms/InstCombine/clamp-to-minmax.ll b/llvm/test/Transforms/InstCombine/clamp-to-minmax.ll
index 3f3939ea73bdc9..9da9eb36d381f0 100644
--- a/llvm/test/Transforms/InstCombine/clamp-to-minmax.ll
+++ b/llvm/test/Transforms/InstCombine/clamp-to-minmax.ll
@@ -504,7 +504,7 @@ define float @mixed_clamp_to_float_1(i32 %x) {
 ; CHECK-LABEL: @mixed_clamp_to_float_1(
 ; CHECK-NEXT:    [[SI_MIN:%.*]] = call i32 @llvm.smin.i32(i32 [[X:%.*]], i32 255)
 ; CHECK-NEXT:    [[R1:%.*]] = call i32 @llvm.smax.i32(i32 [[SI_MIN]], i32 1)
-; CHECK-NEXT:    [[R:%.*]] = uitofp i32 [[R1]] to float
+; CHECK-NEXT:    [[R:%.*]] = sitofp i32 [[R1]] to float
 ; CHECK-NEXT:    ret float [[R]]
 ;
   %si_min_cmp = icmp sgt i32 %x, 255
@@ -539,7 +539,7 @@ define float @mixed_clamp_to_float_2(i32 %x) {
 ; CHECK-LABEL: @mixed_clamp_to_float_2(
 ; CHECK-NEXT:    [[SI_MIN:%.*]] = call i32 @llvm.smin.i32(i32 [[X:%.*]], i32 255)
 ; CHECK-NEXT:    [[R1:%.*]] = call i32 @llvm.smax.i32(i32 [[SI_MIN]], i32 1)
-; CHECK-NEXT:    [[R:%.*]] = uitofp i32 [[R1]] to float
+; CHECK-NEXT:    [[R:%.*]] = sitofp i32 [[R1]] to float
 ; CHECK-NEXT:    ret float [[R]]
 ;
   %si_min_cmp = icmp sgt i32 %x, 255
@@ -572,7 +572,7 @@ define <2 x float> @mixed_clamp_to_float_vec(<2 x i32> %x) {
 ; CHECK-LABEL: @mixed_clamp_to_float_vec(
 ; CHECK-NEXT:    [[SI_MIN:%.*]] = call <2 x i32> @llvm.smin.v2i32(<2 x i32> [[X:%.*]], <2 x i32> <i32 255, i32 255>)
 ; CHECK-NEXT:    [[R1:%.*]] = call <2 x i32> @llvm.smax.v2i32(<2 x i32> [[SI_MIN]], <2 x i32> <i32 1, i32 1>)
-; CHECK-NEXT:    [[R:%.*]] = uitofp <2 x i32> [[R1]] to <2 x float>
+; CHECK-NEXT:    [[R:%.*]] = sitofp <2 x i32> [[R1]] to <2 x float>
 ; CHECK-NEXT:    ret <2 x float> [[R]]
 ;
   %si_min_cmp = icmp sgt <2 x i32> %x, <i32 255, i32 255>
diff --git a/llvm/test/Transforms/InstCombine/copysign-fneg-fabs.ll b/llvm/test/Transforms/InstCombine/copysign-fneg-fabs.ll
index af939cf74399ab..a63eeab6a4b1e9 100644
--- a/llvm/test/Transforms/InstCombine/copysign-fneg-fabs.ll
+++ b/llvm/test/Transforms/InstCombine/copysign-fneg-fabs.ll
@@ -275,6 +275,85 @@ define half @fneg_fabs_copysign_multi_use_fabs(half %x, half %y, ptr %ptr) {
   ret half %fabs.copysign
 }
 
+define half @copysign_pos(half %a) {
+; CHECK-LABEL: @copysign_pos(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[RET:%.*]] = call half @llvm.copysign.f16(half 0xH3C00, half [[A:%.*]])
+; CHECK-NEXT:    ret half [[RET]]
+;
+entry:
+  %ret = call half @llvm.copysign.f16(half 0xH3C00, half %a)
+  ret half %ret
+}
+
+define half @copysign_neg(half %a) {
+; CHECK-LABEL: @copysign_neg(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[RET:%.*]] = call half @llvm.copysign.f16(half 0xH3C00, half [[A:%.*]])
+; CHECK-NEXT:    ret half [[RET]]
+;
+entry:
+  %ret = call half @llvm.copysign.f16(half 0xHBC00, half %a)
+  ret half %ret
+}
+
+define half @copysign_negzero(half %a) {
+; CHECK-LABEL: @copysign_negzero(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[RET:%.*]] = call half @llvm.copysign.f16(half 0xH0000, half [[A:%.*]])
+; CHECK-NEXT:    ret half [[RET]]
+;
+entry:
+  %ret = call half @llvm.copysign.f16(half 0xH8000, half %a)
+  ret half %ret
+}
+
+define half @copysign_negnan(half %a) {
+; CHECK-LABEL: @copysign_negnan(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[RET:%.*]] = call half @llvm.copysign.f16(half 0xH7E00, half [[A:%.*]])
+; CHECK-NEXT:    ret half [[RET]]
+;
+entry:
+  %ret = call half @llvm.copysign.f16(half 0xHFE00, half %a)
+  ret half %ret
+}
+
+define half @copysign_neginf(half %a) {
+; CHECK-LABEL: @copysign_neginf(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[RET:%.*]] = call half @llvm.copysign.f16(half 0xH7C00, half [[A:%.*]])
+; CHECK-NEXT:    ret half [[RET]]
+;
+entry:
+  %ret = call half @llvm.copysign.f16(half 0xHFC00, half %a)
+  ret half %ret
+}
+
+define <4 x half> @copysign_splat(<4 x half> %a) {
+; CHECK-LABEL: @copysign_splat(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[RET:%.*]] = call <4 x half> @llvm.copysign.v4f16(<4 x half> <half 0xH3C00, half 0xH3C00, half 0xH3C00, half 0xH3C00>, <4 x half> [[A:%.*]])
+; CHECK-NEXT:    ret <4 x half> [[RET]]
+;
+entry:
+  %ret = call <4 x half> @llvm.copysign.v4f16(<4 x half> splat(half 0xHBC00), <4 x half> %a)
+  ret <4 x half> %ret
+}
+
+; TODO: Support constant folding of fabs
+
+define <4 x half> @copysign_vec4(<4 x half> %a) {
+; CHECK-LABEL: @copysign_vec4(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[RET:%.*]] = call <4 x half> @llvm.copysign.v4f16(<4 x half> <half 0xH3C00, half 0xHBC00, half undef, half poison>, <4 x half> [[A:%.*]])
+; CHECK-NEXT:    ret <4 x half> [[RET]]
+;
+entry:
+  %ret = call <4 x half> @llvm.copysign.v4f16(<4 x half> <half 0xH3C00, half 0xHBC00, half undef, half poison>, <4 x half> %a)
+  ret <4 x half> %ret
+}
+
 declare half @llvm.fabs.f16(half)
 declare <2 x half> @llvm.fabs.v2f16(<2 x half>)
 declare half @llvm.copysign.f16(half, half)
diff --git a/llvm/test/Transforms/InstCombine/fcmp.ll b/llvm/test/Transforms/InstCombine/fcmp.ll
index 159c84d0dd8aa9..f2701d16d0f3d1 100644
--- a/llvm/test/Transforms/InstCombine/fcmp.ll
+++ b/llvm/test/Transforms/InstCombine/fcmp.ll
@@ -644,7 +644,7 @@ define <2 x i1> @is_signbit_set_anyzero(<2 x double> %x) {
 
 define i1 @is_signbit_clear(double %x) {
 ; CHECK-LABEL: @is_signbit_clear(
-; CHECK-NEXT:    [[S:%.*]] = call double @llvm.copysign.f64(double -4.200000e+01, double [[X:%.*]])
+; CHECK-NEXT:    [[S:%.*]] = call double @llvm.copysign.f64(double 4.200000e+01, double [[X:%.*]])
 ; CHECK-NEXT:    [[R:%.*]] = fcmp ogt double [[S]], 0.000000e+00
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
@@ -655,7 +655,7 @@ define i1 @is_signbit_clear(double %x) {
 
 define i1 @is_signbit_clear_1(double %x) {
 ; CHECK-LABEL: @is_signbit_clear_1(
-; CHECK-NEXT:    [[S:%.*]] = call double @llvm.copysign.f64(double -4.200000e+01, double [[X:%.*]])
+; CHECK-NEXT:    [[S:%.*]] = call double @llvm.copysign.f64(double 4.200000e+01, double [[X:%.*]])
 ; CHECK-NEXT:    [[R:%.*]] = fcmp ugt double [[S]], 0.000000e+00
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
@@ -666,7 +666,7 @@ define i1 @is_signbit_clear_1(double %x) {
 
 define i1 @is_signbit_clear_2(double %x) {
 ; CHECK-LABEL: @is_signbit_clear_2(
-; CHECK-NEXT:    [[S:%.*]] = call double @llvm.copysign.f64(double -4.200000e+01, double [[X:%.*]])
+; CHECK-NEXT:    [[S:%.*]] = call double @llvm.copysign.f64(double 4.200000e+01, double [[X:%.*]])
 ; CHECK-NEXT:    [[R:%.*]] = fcmp oge double [[S]], 0.000000e+00
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
@@ -677,7 +677,7 @@ define i1 @is_signbit_clear_2(double %x) {
 
 define i1 @is_signbit_clear_3(double %x) {
 ; CHECK-LABEL: @is_signbit_clear_3(
-; CHECK-NEXT:    [[S:%.*]] = call double @llvm.copysign.f64(double -4.200000e+01, double [[X:%.*]])
+; CHECK-NEXT:    [[S:%.*]] = call double @llvm.copysign.f64(double 4.200000e+01, double [[X:%.*]])
 ; CHECK-NEXT:    [[R:%.*]] = fcmp uge double [[S]], 0.000000e+00
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
@@ -705,7 +705,7 @@ define i1 @is_signbit_set_extra_use(double %x, ptr %p) {
 
 define i1 @is_signbit_clear_nonzero(double %x) {
 ; CHECK-LABEL: @is_signbit_clear_nonzero(
-; CHECK-NEXT:    [[S:%.*]] = call double @llvm.copysign.f64(double -4.200000e+01, double [[X:%.*]])
+; CHECK-NEXT:    [[S:%.*]] = call double @llvm.copysign.f64(double 4.200000e+01, double [[X:%.*]])
 ; CHECK-NEXT:    [[R:%.*]] = fcmp ogt double [[S]], 1.000000e+00
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
diff --git a/llvm/test/Transforms/InstCombine/fmul.ll b/llvm/test/Transforms/InstCombine/fmul.ll
index 7e7373e6ef5bdd..96e57939d28505 100644
--- a/llvm/test/Transforms/InstCombine/fmul.ll
+++ b/llvm/test/Transforms/InstCombine/fmul.ll
@@ -1093,11 +1093,11 @@ for.body:
 
 define double @fmul_negated_constant_expression(double %x) {
 ; CHECK-LABEL: @fmul_negated_constant_expression(
-; CHECK-NEXT:    [[FSUB:%.*]] = fneg double bitcast (i64 ptrtoint (ptr getelementptr inbounds ({ [2 x ptr] }, ptr @g, i64 0, inrange i32 0, i64 2) to i64) to double)
+; CHECK-NEXT:    [[FSUB:%.*]] = fneg double bitcast (i64 ptrtoint (ptr getelementptr inbounds ({ [2 x ptr] }, ptr @g, i64 1, i32 0, i64 0) to i64) to double)
 ; CHECK-NEXT:    [[R:%.*]] = fmul double [[FSUB]], [[X:%.*]]
 ; CHECK-NEXT:    ret double [[R]]
 ;
-  %fsub = fsub double -0.000000e+00, bitcast (i64 ptrtoint (ptr getelementptr inbounds ({ [2 x ptr] }, ptr @g, i64 0, inrange i32 0, i64 2) to i64) to double)
+  %fsub = fsub double -0.000000e+00, bitcast (i64 ptrtoint (ptr getelementptr inbounds ({ [2 x ptr] }, ptr @g, i64 0, i32 0, i64 2) to i64) to double)
   %r = fmul double %x, %fsub
   ret double %r
 }
diff --git a/llvm/test/Transforms/InstCombine/fold-select-fmul-if-zero.ll b/llvm/test/Transforms/InstCombine/fold-select-fmul-if-zero.ll
index a7adcd1ac61e7f..dedd12f8cc7a3d 100644
--- a/llvm/test/Transforms/InstCombine/fold-select-fmul-if-zero.ll
+++ b/llvm/test/Transforms/InstCombine/fold-select-fmul-if-zero.ll
@@ -427,13 +427,13 @@ define float @fmul_by_snan_if_0_oeq_zero_f32(float %x) {
 define float @fmul_by_var_if_0_oeq_zero_f32(float %x, float %y) {
 ; CHECK-LABEL: @fmul_by_var_if_0_oeq_zero_f32(
 ; CHECK-NEXT:    [[X_IS_ZERO:%.*]] = fcmp oeq float [[X:%.*]], 0.000000e+00
-; CHECK-NEXT:    [[SCALED_X:%.*]] = select i1 [[X_IS_ZERO]], float [[Y:%.*]], float 1.000000e+00
+; CHECK-NEXT:    [[SCALED_X:%.*]] = select nnan i1 [[X_IS_ZERO]], float [[Y:%.*]], float 1.000000e+00
 ; CHECK-NEXT:    [[SCALED_IF_DENORMAL:%.*]] = fmul float [[SCALED_X]], [[X]]
 ; CHECK-NEXT:    ret float [[SCALED_IF_DENORMAL]]
 ;
   %x.is.zero = fcmp oeq float %x, 0.0
   %scaled.x = fmul float %x, %y
-  %scaled.if.denormal = select i1 %x.is.zero, float %scaled.x, float %x
+  %scaled.if.denormal = select nnan i1 %x.is.zero, float %scaled.x, float %x
   ret float %scaled.if.denormal
 }
 
@@ -441,14 +441,14 @@ define float @fmul_by_fabs_var_if_0_oeq_zero_f32(float %x, float %y) {
 ; CHECK-LABEL: @fmul_by_fabs_var_if_0_oeq_zero_f32(
 ; CHECK-NEXT:    [[Y_FABS:%.*]] = call float @llvm.fabs.f32(float [[Y:%.*]])
 ; CHECK-NEXT:    [[X_IS_ZERO:%.*]] = fcmp oeq float [[X:%.*]], 0.000000e+00
-; CHECK-NEXT:    [[SCALED_X:%.*]] = select i1 [[X_IS_ZERO]], float [[Y_FABS]], float 1.000000e+00
+; CHECK-NEXT:    [[SCALED_X:%.*]] = select nnan i1 [[X_IS_ZERO]], float [[Y_FABS]], float 1.000000e+00
 ; CHECK-NEXT:    [[SCALED_IF_DENORMAL:%.*]] = fmul float [[SCALED_X]], [[X]]
 ; CHECK-NEXT:    ret float [[SCALED_IF_DENORMAL]]
 ;
   %y.fabs = call float @llvm.fabs.f32(float %y)
   %x.is.zero = fcmp oeq float %x, 0.0
   %scaled.x = fmul float %x, %y.fabs
-  %scaled.if.denormal = select i1 %x.is.zero, float %scaled.x, float %x
+  %scaled.if.denormal = select nnan i1 %x.is.zero, float %scaled.x, float %x
   ret float %scaled.if.denormal
 }
 
@@ -467,13 +467,13 @@ define float @fmul_by_fabs_nnan_ninf_var_if_0_oeq_zero_f32(float %x, float %y) {
 define float @fmul_by_var_if_0_oeq_zero_f32_nsz_fmul(float %x, float %y) {
 ; CHECK-LABEL: @fmul_by_var_if_0_oeq_zero_f32_nsz_fmul(
 ; CHECK-NEXT:    [[X_IS_ZERO:%.*]] = fcmp oeq float [[X:%.*]], 0.000000e+00
-; CHECK-NEXT:    [[SCALED_X:%.*]] = select i1 [[X_IS_ZERO]], float [[Y:%.*]], float 1.000000e+00
+; CHECK-NEXT:    [[SCALED_X:%.*]] = select nnan i1 [[X_IS_ZERO]], float [[Y:%.*]], float 1.000000e+00
 ; CHECK-NEXT:    [[SCALED_IF_DENORMAL:%.*]] = fmul nsz float [[SCALED_X]], [[X]]
 ; CHECK-NEXT:    ret float [[SCALED_IF_DENORMAL]]
 ;
   %x.is.zero = fcmp oeq float %x, 0.0
   %scaled.x = fmul nsz float %x, %y
-  %scaled.if.denormal = select i1 %x.is.zero, float %scaled.x, float %x
+  %scaled.if.denormal = select nnan i1 %x.is.zero, float %scaled.x, float %x
   ret float %scaled.if.denormal
 }
 
@@ -481,13 +481,13 @@ define float @fmul_by_var_if_0_oeq_zero_f32_nsz_fmul(float %x, float %y) {
 define float @fmul_by_var_if_0_oeq_zero_f32_nsz_ninf_fmul(float %x, float %y) {
 ; CHECK-LABEL: @fmul_by_var_if_0_oeq_zero_f32_nsz_ninf_fmul(
 ; CHECK-NEXT:    [[X_IS_ZERO:%.*]] = fcmp oeq float [[X:%.*]], 0.000000e+00
-; CHECK-NEXT:    [[SCALED_X:%.*]] = select i1 [[X_IS_ZERO]], float [[Y:%.*]], float 1.000000e+00
+; CHECK-NEXT:    [[SCALED_X:%.*]] = select nnan i1 [[X_IS_ZERO]], float [[Y:%.*]], float 1.000000e+00
 ; CHECK-NEXT:    [[SCALED_IF_DENORMAL:%.*]] = fmul ninf nsz float [[SCALED_X]], [[X]]
 ; CHECK-NEXT:    ret float [[SCALED_IF_DENORMAL]]
 ;
   %x.is.zero = fcmp oeq float %x, 0.0
   %scaled.x = fmul nsz ninf float %x, %y
-  %scaled.if.denormal = select i1 %x.is.zero, float %scaled.x, float %x
+  %scaled.if.denormal = select nnan i1 %x.is.zero, float %scaled.x, float %x
   ret float %scaled.if.denormal
 }
 
@@ -495,13 +495,13 @@ define float @fmul_by_var_if_0_oeq_zero_f32_nsz_ninf_fmul(float %x, float %y) {
 define float @fmul_by_var_if_0_oeq_zero_f32_nsz_nnan_fmul(float %x, float %y) {
 ; CHECK-LABEL: @fmul_by_var_if_0_oeq_zero_f32_nsz_nnan_fmul(
 ; CHECK-NEXT:    [[X_IS_ZERO:%.*]] = fcmp oeq float [[X:%.*]], 0.000000e+00
-; CHECK-NEXT:    [[SCALED_X:%.*]] = select i1 [[X_IS_ZERO]], float [[Y:%.*]], float 1.000000e+00
+; CHECK-NEXT:    [[SCALED_X:%.*]] = select nnan i1 [[X_IS_ZERO]], float [[Y:%.*]], float 1.000000e+00
 ; CHECK-NEXT:    [[SCALED_IF_DENORMAL:%.*]] = fmul nnan nsz float [[SCALED_X]], [[X]]
 ; CHECK-NEXT:    ret float [[SCALED_IF_DENORMAL]]
 ;
   %x.is.zero = fcmp oeq float %x, 0.0
   %scaled.x = fmul nsz nnan float %x, %y
-  %scaled.if.denormal = select i1 %x.is.zero, float %scaled.x, float %x
+  %scaled.if.denormal = select nnan i1 %x.is.zero, float %scaled.x, float %x
   ret float %scaled.if.denormal
 }
 
@@ -509,13 +509,13 @@ define float @fmul_by_var_if_0_oeq_zero_f32_nsz_nnan_fmul(float %x, float %y) {
 define float @fmul_by_var_if_0_oeq_zero_f32_nnan_ninf_fmul(float %x, float %y) {
 ; CHECK-LABEL: @fmul_by_var_if_0_oeq_zero_f32_nnan_ninf_fmul(
 ; CHECK-NEXT:    [[X_IS_ZERO:%.*]] = fcmp oeq float [[X:%.*]], 0.000000e+00
-; CHECK-NEXT:    [[SCALED_X:%.*]] = select i1 [[X_IS_ZERO]], float [[Y:%.*]], float 1.000000e+00
+; CHECK-NEXT:    [[SCALED_X:%.*]] = select nnan i1 [[X_IS_ZERO]], float [[Y:%.*]], float 1.000000e+00
 ; CHECK-NEXT:    [[SCALED_IF_DENORMAL:%.*]] = fmul nnan ninf float [[SCALED_X]], [[X]]
 ; CHECK-NEXT:    ret float [[SCALED_IF_DENORMAL]]
 ;
   %x.is.zero = fcmp oeq float %x, 0.0
   %scaled.x = fmul nnan ninf float %x, %y
-  %scaled.if.denormal = select i1 %x.is.zero, float %scaled.x, float %x
+  %scaled.if.denormal = select nnan i1 %x.is.zero, float %scaled.x, float %x
   ret float %scaled.if.denormal
 }
 
@@ -558,26 +558,26 @@ define float @fmul_by_var_if_0_oeq_zero_f32_fmul_nnan_ninf_select_nsz_inverted(f
 define float @fmul_by_var_if_0_oeq_zero_f32_fmul_nnan_ninf_nsz(float %x, float %y) {
 ; CHECK-LABEL: @fmul_by_var_if_0_oeq_zero_f32_fmul_nnan_ninf_nsz(
 ; CHECK-NEXT:    [[X_IS_ZERO:%.*]] = fcmp oeq float [[X:%.*]], 0.000000e+00
-; CHECK-NEXT:    [[SCALED_X:%.*]] = select i1 [[X_IS_ZERO]], float [[Y:%.*]], float 1.000000e+00
+; CHECK-NEXT:    [[SCALED_X:%.*]] = select nnan i1 [[X_IS_ZERO]], float [[Y:%.*]], float 1.000000e+00
 ; CHECK-NEXT:    [[SCALED_IF_DENORMAL:%.*]] = fmul nnan ninf nsz float [[SCALED_X]], [[X]]
 ; CHECK-NEXT:    ret float [[SCALED_IF_DENORMAL]]
 ;
   %x.is.zero = fcmp oeq float %x, 0.0
   %scaled.x = fmul nnan ninf nsz float %x, %y
-  %scaled.if.denormal = select i1 %x.is.zero, float %scaled.x, float %x
+  %scaled.if.denormal = select nnan i1 %x.is.zero, float %scaled.x, float %x
   ret float %scaled.if.denormal
 }
 
 define float @fmul_by_var_if_0_oeq_zero_f32_fmul_nnan_ninf_nsz_commuted(float %x, float %y) {
 ; CHECK-LABEL: @fmul_by_var_if_0_oeq_zero_f32_fmul_nnan_ninf_nsz_commuted(
 ; CHECK-NEXT:    [[X_IS_ZERO:%.*]] = fcmp oeq float [[X:%.*]], 0.000000e+00
-; CHECK-NEXT:    [[SCALED_X:%.*]] = select i1 [[X_IS_ZERO]], float [[Y:%.*]], float 1.000000e+00
+; CHECK-NEXT:    [[SCALED_X:%.*]] = select nnan i1 [[X_IS_ZERO]], float [[Y:%.*]], float 1.000000e+00
 ; CHECK-NEXT:    [[SCALED_IF_DENORMAL:%.*]] = fmul nnan ninf nsz float [[SCALED_X]], [[X]]
 ; CHECK-NEXT:    ret float [[SCALED_IF_DENORMAL]]
 ;
   %x.is.zero = fcmp oeq float %x, 0.0
   %scaled.x = fmul nnan ninf nsz float %y, %x
-  %scaled.if.denormal = select i1 %x.is.zero, float %scaled.x, float %x
+  %scaled.if.denormal = select nnan i1 %x.is.zero, float %scaled.x, float %x
   ret float %scaled.if.denormal
 }
 
@@ -585,26 +585,26 @@ define float @fmul_by_var_if_0_oeq_zero_f32_fmul_nnan_ninf_nsz_commuted(float %x
 define float @fmul_by_var_if_0_oeq_zero_f32_fmul_nnan_ninf_select_known_never_negzero(float %x, float nofpclass(nzero) %y) {
 ; CHECK-LABEL: @fmul_by_var_if_0_oeq_zero_f32_fmul_nnan_ninf_select_known_never_negzero(
 ; CHECK-NEXT:    [[X_IS_ZERO:%.*]] = fcmp oeq float [[X:%.*]], 0.000000e+00
-; CHECK-NEXT:    [[SCALED_X:%.*]] = select i1 [[X_IS_ZERO]], float [[Y:%.*]], float 1.000000e+00
+; CHECK-NEXT:    [[SCALED_X:%.*]] = select nnan i1 [[X_IS_ZERO]], float [[Y:%.*]], float 1.000000e+00
 ; CHECK-NEXT:    [[SCALED_IF_DENORMAL:%.*]] = fmul nnan ninf float [[SCALED_X]], [[X]]
 ; CHECK-NEXT:    ret float [[SCALED_IF_DENORMAL]]
 ;
   %x.is.zero = fcmp oeq float %x, 0.0
   %scaled.x = fmul nnan ninf float %x, %y
-  %scaled.if.denormal = select i1 %x.is.zero, float %scaled.x, float %x
+  %scaled.if.denormal = select nnan i1 %x.is.zero, float %scaled.x, float %x
   ret float %scaled.if.denormal
 }
 
 define float @fmul_by_var_if_0_oeq_zero_f32_fmul_nnan_ninf_select_known_never_negzero_negsub(float %x, float nofpclass(nzero nsub) %y) {
 ; CHECK-LABEL: @fmul_by_var_if_0_oeq_zero_f32_fmul_nnan_ninf_select_known_never_negzero_negsub(
 ; CHECK-NEXT:    [[X_IS_ZERO:%.*]] = fcmp oeq float [[X:%.*]], 0.000000e+00
-; CHECK-NEXT:    [[SCALED_X:%.*]] = select i1 [[X_IS_ZERO]], float [[Y:%.*]], float 1.000000e+00
+; CHECK-NEXT:    [[SCALED_X:%.*]] = select nnan i1 [[X_IS_ZERO]], float [[Y:%.*]], float 1.000000e+00
 ; CHECK-NEXT:    [[SCALED_IF_DENORMAL:%.*]] = fmul nnan ninf float [[SCALED_X]], [[X]]
 ; CHECK-NEXT:    ret float [[SCALED_IF_DENORMAL]]
 ;
   %x.is.zero = fcmp oeq float %x, 0.0
   %scaled.x = fmul nnan ninf float %x, %y
-  %scaled.if.denormal = select i1 %x.is.zero, float %scaled.x, float %x
+  %scaled.if.denormal = select nnan i1 %x.is.zero, float %scaled.x, float %x
   ret float %scaled.if.denormal
 }
 
@@ -622,26 +622,26 @@ define float @fmul_by_var_if_0_oeq_zero_f32_known_never_nan_inf_select_nsz(float
 define float @fmul_by_var_if_0_oeq_zero_f32_fmul_known_never_nan_inf_negzero(float %x, float nofpclass(nan inf nzero) %y) {
 ; CHECK-LABEL: @fmul_by_var_if_0_oeq_zero_f32_fmul_known_never_nan_inf_negzero(
 ; CHECK-NEXT:    [[X_IS_ZERO:%.*]] = fcmp oeq float [[X:%.*]], 0.000000e+00
-; CHECK-NEXT:    [[SCALED_X:%.*]] = select i1 [[X_IS_ZERO]], float [[Y:%.*]], float 1.000000e+00
+; CHECK-NEXT:    [[SCALED_X:%.*]] = select nnan i1 [[X_IS_ZERO]], float [[Y:%.*]], float 1.000000e+00
 ; CHECK-NEXT:    [[SCALED_IF_DENORMAL:%.*]] = fmul float [[SCALED_X]], [[X]]
 ; CHECK-NEXT:    ret float [[SCALED_IF_DENORMAL]]
 ;
   %x.is.zero = fcmp oeq float %x, 0.0
   %scaled.x = fmul float %x, %y
-  %scaled.if.denormal = select i1 %x.is.zero, float %scaled.x, float %x
+  %scaled.if.denormal = select nnan i1 %x.is.zero, float %scaled.x, float %x
   ret float %scaled.if.denormal
 }
 
 define float @fmul_by_var_if_0_oeq_zero_f32_fmul_known_never_nan_inf_negzero_nsub(float %x, float nofpclass(nan inf nzero nsub) %y) {
 ; CHECK-LABEL: @fmul_by_var_if_0_oeq_zero_f32_fmul_known_never_nan_inf_negzero_nsub(
 ; CHECK-NEXT:    [[X_IS_ZERO:%.*]] = fcmp oeq float [[X:%.*]], 0.000000e+00
-; CHECK-NEXT:    [[SCALED_X:%.*]] = select i1 [[X_IS_ZERO]], float [[Y:%.*]], float 1.000000e+00
+; CHECK-NEXT:    [[SCALED_X:%.*]] = select nnan i1 [[X_IS_ZERO]], float [[Y:%.*]], float 1.000000e+00
 ; CHECK-NEXT:    [[SCALED_IF_DENORMAL:%.*]] = fmul float [[SCALED_X]], [[X]]
 ; CHECK-NEXT:    ret float [[SCALED_IF_DENORMAL]]
 ;
   %x.is.zero = fcmp oeq float %x, 0.0
   %scaled.x = fmul float %x, %y
-  %scaled.if.denormal = select i1 %x.is.zero, float %scaled.x, float %x
+  %scaled.if.denormal = select nnan i1 %x.is.zero, float %scaled.x, float %x
   ret float %scaled.if.denormal
 }
 
@@ -692,26 +692,26 @@ define float @fmul_by_var_if_not_one_0_zero_f32_assume_finite_fmul_nsz(float %x,
 define float @fmul_by_self_if_0_oeq_zero_f32(float %x) {
 ; CHECK-LABEL: @fmul_by_self_if_0_oeq_zero_f32(
 ; CHECK-NEXT:    [[X_IS_ZERO:%.*]] = fcmp oeq float [[X:%.*]], 0.000000e+00
-; CHECK-NEXT:    [[SCALED_X:%.*]] = select i1 [[X_IS_ZERO]], float [[X]], float 1.000000e+00
+; CHECK-NEXT:    [[SCALED_X:%.*]] = select nnan i1 [[X_IS_ZERO]], float [[X]], float 1.000000e+00
 ; CHECK-NEXT:    [[SCALED_IF_DENORMAL:%.*]] = fmul float [[SCALED_X]], [[X]]
 ; CHECK-NEXT:    ret float [[SCALED_IF_DENORMAL]]
 ;
   %x.is.zero = fcmp oeq float %x, 0.0
   %scaled.x = fmul float %x, %x
-  %scaled.if.denormal = select i1 %x.is.zero, float %scaled.x, float %x
+  %scaled.if.denormal = select nnan i1 %x.is.zero, float %scaled.x, float %x
   ret float %scaled.if.denormal
 }
 
 define float @fmul_by_self_if_0_oeq_zero_f32_fmul_nnan_ninf_nsz(float %x) {
 ; CHECK-LABEL: @fmul_by_self_if_0_oeq_zero_f32_fmul_nnan_ninf_nsz(
 ; CHECK-NEXT:    [[X_IS_ZERO:%.*]] = fcmp oeq float [[X:%.*]], 0.000000e+00
-; CHECK-NEXT:    [[SCALED_X:%.*]] = select i1 [[X_IS_ZERO]], float [[X]], float 1.000000e+00
+; CHECK-NEXT:    [[SCALED_X:%.*]] = select nnan i1 [[X_IS_ZERO]], float [[X]], float 1.000000e+00
 ; CHECK-NEXT:    [[SCALED_IF_DENORMAL:%.*]] = fmul nnan ninf nsz float [[SCALED_X]], [[X]]
 ; CHECK-NEXT:    ret float [[SCALED_IF_DENORMAL]]
 ;
   %x.is.zero = fcmp oeq float %x, 0.0
   %scaled.x = fmul nnan ninf nsz float %x, %x
-  %scaled.if.denormal = select i1 %x.is.zero, float %scaled.x, float %x
+  %scaled.if.denormal = select nnan i1 %x.is.zero, float %scaled.x, float %x
   ret float %scaled.if.denormal
 }
 
diff --git a/llvm/test/Transforms/InstCombine/fpcast.ll b/llvm/test/Transforms/InstCombine/fpcast.ll
index 88ca556f4d8f8b..32bfdb52bb5f7e 100644
--- a/llvm/test/Transforms/InstCombine/fpcast.ll
+++ b/llvm/test/Transforms/InstCombine/fpcast.ll
@@ -170,7 +170,7 @@ define half @sint_to_fptrunc(i32 %x) {
 define half @masked_sint_to_fptrunc1(i32 %x) {
 ; CHECK-LABEL: @masked_sint_to_fptrunc1(
 ; CHECK-NEXT:    [[M:%.*]] = and i32 [[X:%.*]], 16777215
-; CHECK-NEXT:    [[R:%.*]] = uitofp i32 [[M]] to half
+; CHECK-NEXT:    [[R:%.*]] = sitofp i32 [[M]] to half
 ; CHECK-NEXT:    ret half [[R]]
 ;
   %m = and i32 %x, 16777215
@@ -182,7 +182,7 @@ define half @masked_sint_to_fptrunc1(i32 %x) {
 define half @masked_sint_to_fptrunc2(i32 %x) {
 ; CHECK-LABEL: @masked_sint_to_fptrunc2(
 ; CHECK-NEXT:    [[M:%.*]] = lshr i32 [[X:%.*]], 8
-; CHECK-NEXT:    [[R:%.*]] = uitofp i32 [[M]] to half
+; CHECK-NEXT:    [[R:%.*]] = sitofp i32 [[M]] to half
 ; CHECK-NEXT:    ret half [[R]]
 ;
   %m = lshr i32 %x, 8
@@ -194,7 +194,7 @@ define half @masked_sint_to_fptrunc2(i32 %x) {
 define half @masked_sint_to_fptrunc3(i32 %x) {
 ; CHECK-LABEL: @masked_sint_to_fptrunc3(
 ; CHECK-NEXT:    [[M:%.*]] = lshr i32 [[X:%.*]], 7
-; CHECK-NEXT:    [[F:%.*]] = uitofp i32 [[M]] to float
+; CHECK-NEXT:    [[F:%.*]] = sitofp i32 [[M]] to float
 ; CHECK-NEXT:    [[R:%.*]] = fptrunc float [[F]] to half
 ; CHECK-NEXT:    ret half [[R]]
 ;
@@ -218,7 +218,7 @@ define double @sint_to_fpext(i32 %x) {
 define double @masked_sint_to_fpext1(i32 %x) {
 ; CHECK-LABEL: @masked_sint_to_fpext1(
 ; CHECK-NEXT:    [[M:%.*]] = and i32 [[X:%.*]], 16777215
-; CHECK-NEXT:    [[R:%.*]] = uitofp i32 [[M]] to double
+; CHECK-NEXT:    [[R:%.*]] = sitofp i32 [[M]] to double
 ; CHECK-NEXT:    ret double [[R]]
 ;
   %m = and i32 %x, 16777215
@@ -230,7 +230,7 @@ define double @masked_sint_to_fpext1(i32 %x) {
 define double @masked_sint_to_fpext2(i32 %x) {
 ; CHECK-LABEL: @masked_sint_to_fpext2(
 ; CHECK-NEXT:    [[M:%.*]] = lshr i32 [[X:%.*]], 8
-; CHECK-NEXT:    [[R:%.*]] = uitofp i32 [[M]] to double
+; CHECK-NEXT:    [[R:%.*]] = sitofp i32 [[M]] to double
 ; CHECK-NEXT:    ret double [[R]]
 ;
   %m = lshr i32 %x, 8
@@ -242,7 +242,7 @@ define double @masked_sint_to_fpext2(i32 %x) {
 define double @masked_sint_to_fpext3(i32 %x) {
 ; CHECK-LABEL: @masked_sint_to_fpext3(
 ; CHECK-NEXT:    [[M:%.*]] = lshr i32 [[X:%.*]], 7
-; CHECK-NEXT:    [[F:%.*]] = uitofp i32 [[M]] to float
+; CHECK-NEXT:    [[F:%.*]] = sitofp i32 [[M]] to float
 ; CHECK-NEXT:    [[R:%.*]] = fpext float [[F]] to double
 ; CHECK-NEXT:    ret double [[R]]
 ;
@@ -347,3 +347,91 @@ define double @masked_uint_to_fpext3(i32 %x) {
   %r = fpext float %f to double
   ret double %r
 }
+
+define i32 @fptosi_nonnorm(float nofpclass(norm) %x) {
+; CHECK-LABEL: @fptosi_nonnorm(
+; CHECK-NEXT:    ret i32 0
+;
+  %ret = fptosi float %x to i32
+  ret i32 %ret
+}
+
+define i32 @fptoui_nonnorm(float nofpclass(pnorm) %x) {
+; CHECK-LABEL: @fptoui_nonnorm(
+; CHECK-NEXT:    ret i32 0
+;
+  %ret = fptoui float %x to i32
+  ret i32 %ret
+}
+
+define i32 @fptosi_nonnnorm(float nofpclass(nnorm) %x) {
+; CHECK-LABEL: @fptosi_nonnnorm(
+; CHECK-NEXT:    [[RET:%.*]] = fptosi float [[X:%.*]] to i32
+; CHECK-NEXT:    ret i32 [[RET]]
+;
+  %ret = fptosi float %x to i32
+  ret i32 %ret
+}
+
+define i32 @fptoui_nonnnorm(float nofpclass(nnorm) %x) {
+; CHECK-LABEL: @fptoui_nonnnorm(
+; CHECK-NEXT:    [[RET:%.*]] = fptoui float [[X:%.*]] to i32
+; CHECK-NEXT:    ret i32 [[RET]]
+;
+  %ret = fptoui float %x to i32
+  ret i32 %ret
+}
+
+define i32 @fptosi_nonnorm_copysign(float %x) {
+; CHECK-LABEL: @fptosi_nonnorm_copysign(
+; CHECK-NEXT:    ret i32 0
+;
+  %val = call float @llvm.copysign.f32(float 0.0, float %x)
+  %ret = fptosi float %val to i32
+  ret i32 %ret
+}
+
+define <2 x i32> @fptosi_nonnorm_copysign_vec(<2 x float> %x) {
+; CHECK-LABEL: @fptosi_nonnorm_copysign_vec(
+; CHECK-NEXT:    ret <2 x i32> zeroinitializer
+;
+  %val = call <2 x float> @llvm.copysign.v2f32(<2 x float> zeroinitializer, <2 x float> %x)
+  %ret = fptosi <2 x float> %val to <2 x i32>
+  ret <2 x i32> %ret
+}
+
+define i32 @fptosi_nonnorm_fmul(float %x) {
+; CHECK-LABEL: @fptosi_nonnorm_fmul(
+; CHECK-NEXT:    [[SEL:%.*]] = fmul float [[X:%.*]], 0.000000e+00
+; CHECK-NEXT:    [[RET:%.*]] = fptosi float [[SEL]] to i32
+; CHECK-NEXT:    ret i32 [[RET]]
+;
+  %sel = fmul float %x, 0.000000e+00
+  %ret = fptosi float %sel to i32
+  ret i32 %ret
+}
+
+define i32 @fptosi_select(i1 %cond) {
+; CHECK-LABEL: @fptosi_select(
+; CHECK-NEXT:    [[RET:%.*]] = select i1 [[COND:%.*]], i32 1, i32 -1
+; CHECK-NEXT:    ret i32 [[RET]]
+;
+  %sel = select i1 %cond, float 1.0, float -1.0
+  %ret = fptosi float %sel to i32
+  ret i32 %ret
+}
+
+define i32 @mul_pos_zero_convert(i32 %a) {
+; CHECK-LABEL: @mul_pos_zero_convert(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[FP:%.*]] = sitofp i32 [[A:%.*]] to float
+; CHECK-NEXT:    [[RET:%.*]] = fmul float [[FP]], 0.000000e+00
+; CHECK-NEXT:    [[CONV:%.*]] = fptosi float [[RET]] to i32
+; CHECK-NEXT:    ret i32 [[CONV]]
+;
+entry:
+  %fp = sitofp i32 %a to float
+  %ret = fmul float %fp, 0.000000e+00
+  %conv = fptosi float %ret to i32
+  ret i32 %conv
+}
diff --git a/llvm/test/Transforms/InstCombine/icmp-and-lowbit-mask.ll b/llvm/test/Transforms/InstCombine/icmp-and-lowbit-mask.ll
index 07060922895846..5de3e89d7027ab 100644
--- a/llvm/test/Transforms/InstCombine/icmp-and-lowbit-mask.ll
+++ b/llvm/test/Transforms/InstCombine/icmp-and-lowbit-mask.ll
@@ -226,12 +226,11 @@ define i1 @src_is_mask_shl_lshr(i8 %x_in, i8 %y, i1 %cond) {
 
 define i1 @src_is_mask_shl_lshr_fail_not_allones(i8 %x_in, i8 %y, i1 %cond) {
 ; CHECK-LABEL: @src_is_mask_shl_lshr_fail_not_allones(
-; CHECK-NEXT:    [[X:%.*]] = xor i8 [[X_IN:%.*]], 123
 ; CHECK-NEXT:    [[TMP1:%.*]] = lshr i8 -1, [[Y:%.*]]
 ; CHECK-NEXT:    [[MASK:%.*]] = and i8 [[TMP1]], -2
-; CHECK-NEXT:    [[NOTMASK:%.*]] = xor i8 [[MASK]], -1
-; CHECK-NEXT:    [[AND:%.*]] = and i8 [[X]], [[NOTMASK]]
-; CHECK-NEXT:    [[R:%.*]] = icmp ne i8 [[AND]], 0
+; CHECK-NEXT:    [[TMP2:%.*]] = xor i8 [[X_IN:%.*]], -124
+; CHECK-NEXT:    [[TMP3:%.*]] = or i8 [[TMP2]], [[MASK]]
+; CHECK-NEXT:    [[R:%.*]] = icmp ne i8 [[TMP3]], -1
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
   %x = xor i8 %x_in, 123
@@ -471,6 +470,24 @@ define i1 @src_is_notmask_x_xor_neg_x(i8 %x_in, i8 %y, i1 %cond) {
   ret i1 %r
 }
 
+define i1 @src_is_notmask_x_xor_neg_x_inv(i8 %x_in, i8 %y, i1 %cond) {
+; CHECK-LABEL: @src_is_notmask_x_xor_neg_x_inv(
+; CHECK-NEXT:    [[X:%.*]] = xor i8 [[X_IN:%.*]], 123
+; CHECK-NEXT:    [[NEG_Y:%.*]] = add i8 [[Y:%.*]], -1
+; CHECK-NEXT:    [[NOTMASK0:%.*]] = xor i8 [[NEG_Y]], [[Y]]
+; CHECK-NEXT:    [[TMP3:%.*]] = select i1 [[COND:%.*]], i8 [[NOTMASK0]], i8 7
+; CHECK-NEXT:    [[R:%.*]] = icmp ule i8 [[X]], [[TMP3]]
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %x = xor i8 %x_in, 123
+  %neg_y = sub i8 0, %y
+  %nmask0 = xor i8 %y, %neg_y
+  %notmask = select i1 %cond, i8 %nmask0, i8 -8
+  %and = and i8 %notmask, %x
+  %r = icmp eq i8 %and, 0
+  ret i1 %r
+}
+
 define i1 @src_is_notmask_shl_fail_multiuse_invert(i8 %x_in, i8 %y, i1 %cond) {
 ; CHECK-LABEL: @src_is_notmask_shl_fail_multiuse_invert(
 ; CHECK-NEXT:    [[X:%.*]] = xor i8 [[X_IN:%.*]], 122
@@ -572,11 +589,10 @@ define i1 @src_is_notmask_neg_p2(i8 %x_in, i8 %y) {
 
 define i1 @src_is_notmask_neg_p2_fail_not_invertable(i8 %x_in, i8 %y) {
 ; CHECK-LABEL: @src_is_notmask_neg_p2_fail_not_invertable(
-; CHECK-NEXT:    [[X:%.*]] = xor i8 [[X_IN:%.*]], 123
-; CHECK-NEXT:    [[TMP1:%.*]] = add i8 [[Y:%.*]], -1
-; CHECK-NEXT:    [[TMP2:%.*]] = xor i8 [[Y]], -1
-; CHECK-NEXT:    [[TMP3:%.*]] = and i8 [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[R:%.*]] = icmp ule i8 [[X]], [[TMP3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = xor i8 [[X_IN:%.*]], -124
+; CHECK-NEXT:    [[TMP2:%.*]] = sub i8 0, [[Y:%.*]]
+; CHECK-NEXT:    [[TMP3:%.*]] = or i8 [[TMP2]], [[Y]]
+; CHECK-NEXT:    [[R:%.*]] = icmp uge i8 [[TMP1]], [[TMP3]]
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
   %x = xor i8 %x_in, 123
@@ -657,3 +673,238 @@ define i1 @src_is_mask_const_sge(i8 %x_in) {
   %r = icmp sge i8 %and, %x
   ret i1 %r
 }
+
+define i1 @src_x_and_mask_slt(i8 %x, i8 %y, i1 %cond) {
+; CHECK-LABEL: @src_x_and_mask_slt(
+; CHECK-NEXT:    [[MASK0:%.*]] = lshr i8 -1, [[Y:%.*]]
+; CHECK-NEXT:    [[MASK:%.*]] = select i1 [[COND:%.*]], i8 [[MASK0]], i8 0
+; CHECK-NEXT:    [[MASK_POS:%.*]] = icmp sgt i8 [[MASK]], -1
+; CHECK-NEXT:    call void @llvm.assume(i1 [[MASK_POS]])
+; CHECK-NEXT:    [[R:%.*]] = icmp slt i8 [[MASK]], [[X:%.*]]
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %mask0 = lshr i8 -1, %y
+  %mask = select i1 %cond, i8 %mask0, i8 0
+  %mask_pos = icmp sge i8 %mask, 0
+  call void @llvm.assume(i1 %mask_pos)
+  %and = and i8 %x, %mask
+  %r = icmp slt i8 %and, %x
+  ret i1 %r
+}
+
+define i1 @src_x_and_mask_sge(i8 %x, i8 %y, i1 %cond) {
+; CHECK-LABEL: @src_x_and_mask_sge(
+; CHECK-NEXT:    [[MASK0:%.*]] = lshr i8 -1, [[Y:%.*]]
+; CHECK-NEXT:    [[MASK:%.*]] = select i1 [[COND:%.*]], i8 [[MASK0]], i8 0
+; CHECK-NEXT:    [[MASK_POS:%.*]] = icmp sgt i8 [[MASK]], -1
+; CHECK-NEXT:    call void @llvm.assume(i1 [[MASK_POS]])
+; CHECK-NEXT:    [[R:%.*]] = icmp sge i8 [[MASK]], [[X:%.*]]
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %mask0 = lshr i8 -1, %y
+  %mask = select i1 %cond, i8 %mask0, i8 0
+  %mask_pos = icmp sge i8 %mask, 0
+  call void @llvm.assume(i1 %mask_pos)
+  %and = and i8 %x, %mask
+  %r = icmp sge i8 %and, %x
+  ret i1 %r
+}
+
+define i1 @src_x_and_mask_slt_fail_maybe_neg(i8 %x, i8 %y, i1 %cond) {
+; CHECK-LABEL: @src_x_and_mask_slt_fail_maybe_neg(
+; CHECK-NEXT:    [[MASK0:%.*]] = lshr i8 -1, [[Y:%.*]]
+; CHECK-NEXT:    [[MASK:%.*]] = select i1 [[COND:%.*]], i8 [[MASK0]], i8 0
+; CHECK-NEXT:    [[AND:%.*]] = and i8 [[MASK]], [[X:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = icmp slt i8 [[AND]], [[X]]
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %mask0 = lshr i8 -1, %y
+  %mask = select i1 %cond, i8 %mask0, i8 0
+  %and = and i8 %x, %mask
+  %r = icmp slt i8 %and, %x
+  ret i1 %r
+}
+
+define i1 @src_x_and_mask_sge_fail_maybe_neg(i8 %x, i8 %y, i1 %cond) {
+; CHECK-LABEL: @src_x_and_mask_sge_fail_maybe_neg(
+; CHECK-NEXT:    [[MASK0:%.*]] = lshr i8 -1, [[Y:%.*]]
+; CHECK-NEXT:    [[MASK:%.*]] = select i1 [[COND:%.*]], i8 [[MASK0]], i8 0
+; CHECK-NEXT:    [[AND:%.*]] = and i8 [[MASK]], [[X:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = icmp sge i8 [[AND]], [[X]]
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %mask0 = lshr i8 -1, %y
+  %mask = select i1 %cond, i8 %mask0, i8 0
+  %and = and i8 %x, %mask
+  %r = icmp sge i8 %and, %x
+  ret i1 %r
+}
+
+define i1 @src_x_and_nmask_eq(i8 %x, i8 %y, i1 %cond) {
+; CHECK-LABEL: @src_x_and_nmask_eq(
+; CHECK-NEXT:    [[NOT_MASK0:%.*]] = shl nsw i8 -1, [[Y:%.*]]
+; CHECK-NEXT:    [[R1:%.*]] = icmp ule i8 [[NOT_MASK0]], [[X:%.*]]
+; CHECK-NEXT:    [[NOT_COND:%.*]] = xor i1 [[COND:%.*]], true
+; CHECK-NEXT:    [[R:%.*]] = select i1 [[NOT_COND]], i1 true, i1 [[R1]]
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %not_mask0 = shl i8 -1, %y
+  %not_mask = select i1 %cond, i8 %not_mask0, i8 0
+  %and = and i8 %x, %not_mask
+  %r = icmp eq i8 %not_mask, %and
+  ret i1 %r
+}
+
+define i1 @src_x_and_nmask_ne(i8 %x, i8 %y, i1 %cond) {
+; CHECK-LABEL: @src_x_and_nmask_ne(
+; CHECK-NEXT:    [[NOT_MASK0:%.*]] = shl nsw i8 -1, [[Y:%.*]]
+; CHECK-NEXT:    [[R1:%.*]] = icmp ugt i8 [[NOT_MASK0]], [[X:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = select i1 [[COND:%.*]], i1 [[R1]], i1 false
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %not_mask0 = shl i8 -1, %y
+  %not_mask = select i1 %cond, i8 %not_mask0, i8 0
+  %and = and i8 %x, %not_mask
+  %r = icmp ne i8 %and, %not_mask
+  ret i1 %r
+}
+
+define i1 @src_x_and_nmask_ult(i8 %x, i8 %y, i1 %cond) {
+; CHECK-LABEL: @src_x_and_nmask_ult(
+; CHECK-NEXT:    [[NOT_MASK0:%.*]] = shl nsw i8 -1, [[Y:%.*]]
+; CHECK-NEXT:    [[R1:%.*]] = icmp ugt i8 [[NOT_MASK0]], [[X:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = select i1 [[COND:%.*]], i1 [[R1]], i1 false
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %not_mask0 = shl i8 -1, %y
+  %not_mask = select i1 %cond, i8 %not_mask0, i8 0
+  %and = and i8 %x, %not_mask
+  %r = icmp ult i8 %and, %not_mask
+  ret i1 %r
+}
+
+define i1 @src_x_and_nmask_uge(i8 %x, i8 %y, i1 %cond) {
+; CHECK-LABEL: @src_x_and_nmask_uge(
+; CHECK-NEXT:    [[NOT_MASK0:%.*]] = shl nsw i8 -1, [[Y:%.*]]
+; CHECK-NEXT:    [[R1:%.*]] = icmp ule i8 [[NOT_MASK0]], [[X:%.*]]
+; CHECK-NEXT:    [[NOT_COND:%.*]] = xor i1 [[COND:%.*]], true
+; CHECK-NEXT:    [[R:%.*]] = select i1 [[NOT_COND]], i1 true, i1 [[R1]]
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %not_mask0 = shl i8 -1, %y
+  %not_mask = select i1 %cond, i8 %not_mask0, i8 0
+  %and = and i8 %x, %not_mask
+  %r = icmp uge i8 %and, %not_mask
+  ret i1 %r
+}
+
+define i1 @src_x_and_nmask_slt(i8 %x, i8 %y) {
+; CHECK-LABEL: @src_x_and_nmask_slt(
+; CHECK-NEXT:    [[NOT_MASK:%.*]] = shl nsw i8 -1, [[Y:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = icmp sgt i8 [[NOT_MASK]], [[X:%.*]]
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %not_mask = shl i8 -1, %y
+  %and = and i8 %x, %not_mask
+  %r = icmp slt i8 %and, %not_mask
+  ret i1 %r
+}
+
+define i1 @src_x_and_nmask_sge(i8 %x, i8 %y) {
+; CHECK-LABEL: @src_x_and_nmask_sge(
+; CHECK-NEXT:    [[NOT_MASK:%.*]] = shl nsw i8 -1, [[Y:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = icmp sle i8 [[NOT_MASK]], [[X:%.*]]
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %not_mask = shl i8 -1, %y
+  %and = and i8 %x, %not_mask
+  %r = icmp sge i8 %and, %not_mask
+  ret i1 %r
+}
+
+define i1 @src_x_and_nmask_slt_fail_maybe_z(i8 %x, i8 %y, i1 %cond) {
+; CHECK-LABEL: @src_x_and_nmask_slt_fail_maybe_z(
+; CHECK-NEXT:    [[NOT_MASK0:%.*]] = shl nsw i8 -1, [[Y:%.*]]
+; CHECK-NEXT:    [[NOT_MASK:%.*]] = select i1 [[COND:%.*]], i8 [[NOT_MASK0]], i8 0
+; CHECK-NEXT:    [[AND:%.*]] = and i8 [[NOT_MASK]], [[X:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = icmp slt i8 [[AND]], [[NOT_MASK]]
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %not_mask0 = shl i8 -1, %y
+  %not_mask = select i1 %cond, i8 %not_mask0, i8 0
+  %and = and i8 %x, %not_mask
+  %r = icmp slt i8 %and, %not_mask
+  ret i1 %r
+}
+
+define i1 @src_x_and_nmask_sge_fail_maybe_z(i8 %x, i8 %y, i1 %cond) {
+; CHECK-LABEL: @src_x_and_nmask_sge_fail_maybe_z(
+; CHECK-NEXT:    [[NOT_MASK0:%.*]] = shl nsw i8 -1, [[Y:%.*]]
+; CHECK-NEXT:    [[NOT_MASK:%.*]] = select i1 [[COND:%.*]], i8 [[NOT_MASK0]], i8 0
+; CHECK-NEXT:    [[AND:%.*]] = and i8 [[NOT_MASK]], [[X:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = icmp sge i8 [[AND]], [[NOT_MASK]]
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %not_mask0 = shl i8 -1, %y
+  %not_mask = select i1 %cond, i8 %not_mask0, i8 0
+  %and = and i8 %x, %not_mask
+  %r = icmp sge i8 %and, %not_mask
+  ret i1 %r
+}
+
+define i1 @src_x_or_mask_eq(i8 %x, i8 %y, i8 %z, i1 %c2, i1 %cond) {
+; CHECK-LABEL: @src_x_or_mask_eq(
+; CHECK-NEXT:    [[MASK0:%.*]] = lshr i8 -1, [[Y:%.*]]
+; CHECK-NEXT:    [[MASK:%.*]] = select i1 [[COND:%.*]], i8 [[MASK0]], i8 0
+; CHECK-NEXT:    [[TMP1:%.*]] = xor i8 [[X:%.*]], -124
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[C2:%.*]], i8 [[TMP1]], i8 -46
+; CHECK-NEXT:    [[TMP3:%.*]] = call i8 @llvm.umax.i8(i8 [[Z:%.*]], i8 [[TMP2]])
+; CHECK-NEXT:    [[TMP4:%.*]] = add i8 [[TMP3]], -12
+; CHECK-NEXT:    [[R:%.*]] = icmp ule i8 [[TMP4]], [[MASK]]
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %mask0 = lshr i8 -1, %y
+  %mask = select i1 %cond, i8 %mask0, i8 0
+  %nx = xor i8 %x, 123
+  %nx_c = select i1 %c2, i8 %nx, i8 45
+  %nz = xor i8 %z, -1
+  %nx_cc = call i8 @llvm.umin.i8(i8 %nz, i8 %nx_c)
+  %nx_ccc = add i8 %nx_cc, 12
+  %or = or i8 %nx_ccc, %mask
+  %r = icmp eq i8 %or, -1
+  ret i1 %r
+}
+
+define i1 @src_x_or_mask_ne(i8 %x, i8 %y, i1 %cond) {
+; CHECK-LABEL: @src_x_or_mask_ne(
+; CHECK-NEXT:    [[MASK0:%.*]] = lshr i8 -1, [[Y:%.*]]
+; CHECK-NEXT:    [[MASK:%.*]] = select i1 [[COND:%.*]], i8 [[MASK0]], i8 0
+; CHECK-NEXT:    [[R:%.*]] = icmp ult i8 [[MASK]], [[X:%.*]]
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %mask0 = lshr i8 -1, %y
+  %mask = select i1 %cond, i8 %mask0, i8 0
+  %nx = xor i8 %x, -1
+  %or = or i8 %mask, %nx
+  %r = icmp ne i8 %or, -1
+  ret i1 %r
+}
+
+define i1 @src_x_or_mask_ne_fail_multiuse(i8 %x, i8 %y, i1 %cond) {
+; CHECK-LABEL: @src_x_or_mask_ne_fail_multiuse(
+; CHECK-NEXT:    [[MASK0:%.*]] = lshr i8 -1, [[Y:%.*]]
+; CHECK-NEXT:    [[MASK:%.*]] = select i1 [[COND:%.*]], i8 [[MASK0]], i8 0
+; CHECK-NEXT:    [[NX:%.*]] = xor i8 [[X:%.*]], -1
+; CHECK-NEXT:    [[OR:%.*]] = or i8 [[MASK]], [[NX]]
+; CHECK-NEXT:    call void @use.i8(i8 [[OR]])
+; CHECK-NEXT:    [[R:%.*]] = icmp ne i8 [[OR]], -1
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %mask0 = lshr i8 -1, %y
+  %mask = select i1 %cond, i8 %mask0, i8 0
+  %nx = xor i8 %x, -1
+  %or = or i8 %mask, %nx
+  call void @use.i8(i8 %or)
+  %r = icmp ne i8 %or, -1
+  ret i1 %r
+}
diff --git a/llvm/test/Transforms/InstCombine/icmp-select-implies-common-op.ll b/llvm/test/Transforms/InstCombine/icmp-select-implies-common-op.ll
new file mode 100644
index 00000000000000..bacdb54f787d6a
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/icmp-select-implies-common-op.ll
@@ -0,0 +1,93 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -passes=instcombine < %s | FileCheck %s
+
+define i1 @sgt_3_impliesF_eq_2(i8 %x, i8 %y) {
+; CHECK-LABEL: @sgt_3_impliesF_eq_2(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i8 [[X:%.*]], 4
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp eq i8 [[SEL:%.*]], [[X]]
+; CHECK-NEXT:    [[CMP3:%.*]] = select i1 [[CMP]], i1 [[CMP2]], i1 false
+; CHECK-NEXT:    ret i1 [[CMP3]]
+;
+  %cmp = icmp sgt i8 %x, 3
+  %sel = select i1 %cmp, i8 2, i8 %y
+  %cmp2 = icmp eq i8 %sel, %x
+  ret i1 %cmp2
+}
+
+define i1 @sgt_3_impliesT_sgt_2(i8 %x, i8 %y) {
+; CHECK-LABEL: @sgt_3_impliesT_sgt_2(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i8 [[X:%.*]], 4
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp sgt i8 [[SEL:%.*]], [[X]]
+; CHECK-NEXT:    [[CMP3:%.*]] = select i1 [[CMP]], i1 [[CMP2]], i1 false
+; CHECK-NEXT:    ret i1 [[CMP3]]
+;
+  %cmp = icmp sgt i8 %x, 3
+  %sel = select i1 %cmp, i8 2, i8 %y
+  %cmp2 = icmp sgt i8 %sel, %x
+  ret i1 %cmp2
+}
+
+define i1 @sgt_x_impliesF_eq_smin_todo(i8 %x, i8 %y, i8 %z) {
+; CHECK-LABEL: @sgt_x_impliesF_eq_smin_todo(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i8 [[X:%.*]], [[Z:%.*]]
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[CMP]], i8 -128, i8 [[Y:%.*]]
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp eq i8 [[SEL]], [[X]]
+; CHECK-NEXT:    ret i1 [[CMP2]]
+;
+  %cmp = icmp sgt i8 %x, %z
+  %sel = select i1 %cmp, i8 -128, i8 %y
+  %cmp2 = icmp eq i8 %sel, %x
+  ret i1 %cmp2
+}
+
+define i1 @slt_x_impliesT_ne_smin_todo(i8 %x, i8 %y, i8 %z) {
+; CHECK-LABEL: @slt_x_impliesT_ne_smin_todo(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i8 [[X:%.*]], [[Z:%.*]]
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[CMP]], i8 127, i8 [[Y:%.*]]
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ne i8 [[SEL]], [[X]]
+; CHECK-NEXT:    ret i1 [[CMP2]]
+;
+  %cmp = icmp slt i8 %x, %z
+  %sel = select i1 %cmp, i8 127, i8 %y
+  %cmp2 = icmp ne i8 %x, %sel
+  ret i1 %cmp2
+}
+
+define i1 @ult_x_impliesT_eq_umax_todo(i8 %x, i8 %y, i8 %z) {
+; CHECK-LABEL: @ult_x_impliesT_eq_umax_todo(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ugt i8 [[Z:%.*]], [[X:%.*]]
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[CMP]], i8 -1, i8 [[Y:%.*]]
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ne i8 [[SEL]], [[X]]
+; CHECK-NEXT:    ret i1 [[CMP2]]
+;
+  %cmp = icmp ugt i8 %z, %x
+  %sel = select i1 %cmp, i8 255, i8 %y
+  %cmp2 = icmp ne i8 %sel, %x
+  ret i1 %cmp2
+}
+
+define i1 @ult_1_impliesF_eq_1(i8 %x, i8 %y) {
+; CHECK-LABEL: @ult_1_impliesF_eq_1(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ne i8 [[SEL:%.*]], 0
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp eq i8 [[X:%.*]], [[SEL]]
+; CHECK-NEXT:    [[CMP3:%.*]] = select i1 [[CMP]], i1 [[CMP2]], i1 false
+; CHECK-NEXT:    ret i1 [[CMP3]]
+;
+  %cmp = icmp ult i8 %x, 1
+  %sel = select i1 %cmp, i8 1, i8 %y
+  %cmp2 = icmp eq i8 %x, %sel
+  ret i1 %cmp2
+}
+
+define i1 @ugt_x_impliesF_eq_umin_todo(i8 %x, i8 %y, i8 %z) {
+; CHECK-LABEL: @ugt_x_impliesF_eq_umin_todo(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ugt i8 [[Z:%.*]], [[X:%.*]]
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[CMP]], i8 0, i8 [[Y:%.*]]
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp eq i8 [[SEL]], [[X]]
+; CHECK-NEXT:    ret i1 [[CMP2]]
+;
+  %cmp = icmp ugt i8 %z, %x
+  %sel = select i1 %cmp, i8 0, i8 %y
+  %cmp2 = icmp eq i8 %x, %sel
+  ret i1 %cmp2
+}
diff --git a/llvm/test/Transforms/InstCombine/intrinsic-select.ll b/llvm/test/Transforms/InstCombine/intrinsic-select.ll
index a203b28bcb82a8..f37226bbd5b09c 100644
--- a/llvm/test/Transforms/InstCombine/intrinsic-select.ll
+++ b/llvm/test/Transforms/InstCombine/intrinsic-select.ll
@@ -240,3 +240,43 @@ define i32 @vec_to_scalar_select_vector(<2 x i1> %b) {
   %c = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %s)
   ret i32 %c
 }
+
+define i8 @test_drop_noundef(i1 %cond, i8 %val) {
+; CHECK-LABEL: @test_drop_noundef(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = call i8 @llvm.smin.i8(i8 [[VAL:%.*]], i8 0)
+; CHECK-NEXT:    [[RET:%.*]] = select i1 [[COND:%.*]], i8 -1, i8 [[TMP0]]
+; CHECK-NEXT:    ret i8 [[RET]]
+;
+entry:
+  %sel = select i1 %cond, i8 -1, i8 %val
+  %ret = call noundef i8 @llvm.smin.i8(i8 %sel, i8 0)
+  ret i8 %ret
+}
+
+define i1 @pr85536(i32 %a) {
+; CHECK-LABEL: @pr85536(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ult i32 [[A:%.*]], 31
+; CHECK-NEXT:    [[SHL1:%.*]] = shl nsw i32 -1, [[A]]
+; CHECK-NEXT:    [[ZEXT:%.*]] = zext i32 [[SHL1]] to i64
+; CHECK-NEXT:    [[SHL2:%.*]] = shl i64 [[ZEXT]], 48
+; CHECK-NEXT:    [[SHR:%.*]] = ashr exact i64 [[SHL2]], 48
+; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.smin.i64(i64 [[SHR]], i64 0)
+; CHECK-NEXT:    [[TMP1:%.*]] = and i64 [[TMP0]], 65535
+; CHECK-NEXT:    [[RET1:%.*]] = icmp eq i64 [[TMP1]], 0
+; CHECK-NEXT:    [[RET:%.*]] = select i1 [[CMP1]], i1 [[RET1]], i1 false
+; CHECK-NEXT:    ret i1 [[RET]]
+;
+entry:
+  %cmp1 = icmp ugt i32 %a, 30
+  %shl1 = shl nsw i32 -1, %a
+  %zext = zext i32 %shl1 to i64
+  %shl2 = shl i64 %zext, 48
+  %shr = ashr exact i64 %shl2, 48
+  %sel = select i1 %cmp1, i64 -1, i64 %shr
+  %smin = call noundef i64 @llvm.smin.i64(i64 %sel, i64 0)
+  %masked = and i64 %smin, 65535
+  %ret = icmp eq i64 %masked, 0
+  ret i1 %ret
+}
diff --git a/llvm/test/Transforms/InstCombine/minmax-fold.ll b/llvm/test/Transforms/InstCombine/minmax-fold.ll
index fff4bc5943da06..8391fe33eb9b59 100644
--- a/llvm/test/Transforms/InstCombine/minmax-fold.ll
+++ b/llvm/test/Transforms/InstCombine/minmax-fold.ll
@@ -131,7 +131,7 @@ define i64 @t9(i32 %a) {
 define float @t10(i32 %x) {
 ; CHECK-LABEL: @t10(
 ; CHECK-NEXT:    [[R1:%.*]] = call i32 @llvm.smax.i32(i32 [[X:%.*]], i32 255)
-; CHECK-NEXT:    [[R:%.*]] = uitofp i32 [[R1]] to float
+; CHECK-NEXT:    [[R:%.*]] = sitofp i32 [[R1]] to float
 ; CHECK-NEXT:    ret float [[R]]
 ;
   %f_x = sitofp i32 %x to float
@@ -143,7 +143,7 @@ define float @t10(i32 %x) {
 define float @t11(i64 %x) {
 ; CHECK-LABEL: @t11(
 ; CHECK-NEXT:    [[R1:%.*]] = call i64 @llvm.smax.i64(i64 [[X:%.*]], i64 255)
-; CHECK-NEXT:    [[R:%.*]] = uitofp i64 [[R1]] to float
+; CHECK-NEXT:    [[R:%.*]] = sitofp i64 [[R1]] to float
 ; CHECK-NEXT:    ret float [[R]]
 ;
   %f_x = sitofp i64 %x to float
@@ -526,7 +526,7 @@ falselabel:
 define double @PR31751_umin1(i32 %x) {
 ; CHECK-LABEL: @PR31751_umin1(
 ; CHECK-NEXT:    [[SEL:%.*]] = call i32 @llvm.umin.i32(i32 [[X:%.*]], i32 2147483647)
-; CHECK-NEXT:    [[CONV:%.*]] = uitofp i32 [[SEL]] to double
+; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[SEL]] to double
 ; CHECK-NEXT:    ret double [[CONV]]
 ;
   %cmp = icmp slt i32 %x, 0
@@ -538,7 +538,7 @@ define double @PR31751_umin1(i32 %x) {
 define double @PR31751_umin2(i32 %x) {
 ; CHECK-LABEL: @PR31751_umin2(
 ; CHECK-NEXT:    [[SEL:%.*]] = call i32 @llvm.umin.i32(i32 [[X:%.*]], i32 2147483647)
-; CHECK-NEXT:    [[CONV:%.*]] = uitofp i32 [[SEL]] to double
+; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[SEL]] to double
 ; CHECK-NEXT:    ret double [[CONV]]
 ;
   %cmp = icmp ult i32 %x, 2147483647
@@ -550,7 +550,7 @@ define double @PR31751_umin2(i32 %x) {
 define double @PR31751_umin3(i32 %x) {
 ; CHECK-LABEL: @PR31751_umin3(
 ; CHECK-NEXT:    [[SEL:%.*]] = call i32 @llvm.umin.i32(i32 [[X:%.*]], i32 2147483647)
-; CHECK-NEXT:    [[CONV:%.*]] = uitofp i32 [[SEL]] to double
+; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[SEL]] to double
 ; CHECK-NEXT:    ret double [[CONV]]
 ;
   %cmp = icmp ugt i32 %x, 2147483647
diff --git a/llvm/test/Transforms/InstCombine/minmax-fp.ll b/llvm/test/Transforms/InstCombine/minmax-fp.ll
index 728950ddb40c87..f89e8a18e63440 100644
--- a/llvm/test/Transforms/InstCombine/minmax-fp.ll
+++ b/llvm/test/Transforms/InstCombine/minmax-fp.ll
@@ -257,7 +257,7 @@ define double @t16(i32 %x) {
 define double @t17(i32 %x) {
 ; CHECK-LABEL: @t17(
 ; CHECK-NEXT:    [[SEL1:%.*]] = call i32 @llvm.smax.i32(i32 [[X:%.*]], i32 2)
-; CHECK-NEXT:    [[SEL:%.*]] = uitofp i32 [[SEL1]] to double
+; CHECK-NEXT:    [[SEL:%.*]] = sitofp i32 [[SEL1]] to double
 ; CHECK-NEXT:    ret double [[SEL]]
 ;
   %cmp = icmp sgt i32 %x, 2
diff --git a/llvm/test/Transforms/InstCombine/mul.ll b/llvm/test/Transforms/InstCombine/mul.ll
index e7141d7c25ad21..a176d16f2cdfb6 100644
--- a/llvm/test/Transforms/InstCombine/mul.ll
+++ b/llvm/test/Transforms/InstCombine/mul.ll
@@ -2049,3 +2049,94 @@ define i32 @zext_negpow2_use(i8 %x) {
   %r = mul i32 %zx, -16777216 ; -1 << 24
   ret i32 %r
 }
+
+define i32 @mul_sext_icmp_with_zero(i32 %x) {
+; CHECK-LABEL: @mul_sext_icmp_with_zero(
+; CHECK-NEXT:    ret i32 0
+;
+  %cmp = icmp eq i32 %x, 0
+  %sext = sext i1 %cmp to i32
+  %mul = mul i32 %sext, %x
+  ret i32 %mul
+}
+
+define i32 @test_mul_sext_bool(i1 %x, i32 %y) {
+; CHECK-LABEL: @test_mul_sext_bool(
+; CHECK-NEXT:    [[Y_NEG:%.*]] = sub i32 0, [[Y:%.*]]
+; CHECK-NEXT:    [[MUL:%.*]] = select i1 [[X:%.*]], i32 [[Y_NEG]], i32 0
+; CHECK-NEXT:    ret i32 [[MUL]]
+;
+  %sext = sext i1 %x to i32
+  %mul = mul i32 %sext, %y
+  ret i32 %mul
+}
+
+define i32 @test_mul_sext_bool_nuw(i1 %x, i32 %y) {
+; CHECK-LABEL: @test_mul_sext_bool_nuw(
+; CHECK-NEXT:    [[Y_NEG:%.*]] = sub i32 0, [[Y:%.*]]
+; CHECK-NEXT:    [[MUL:%.*]] = select i1 [[X:%.*]], i32 [[Y_NEG]], i32 0
+; CHECK-NEXT:    ret i32 [[MUL]]
+;
+  %sext = sext i1 %x to i32
+  %mul = mul nuw i32 %sext, %y
+  ret i32 %mul
+}
+
+define i32 @test_mul_sext_bool_nsw(i1 %x, i32 %y) {
+; CHECK-LABEL: @test_mul_sext_bool_nsw(
+; CHECK-NEXT:    [[Y_NEG:%.*]] = sub nsw i32 0, [[Y:%.*]]
+; CHECK-NEXT:    [[MUL:%.*]] = select i1 [[X:%.*]], i32 [[Y_NEG]], i32 0
+; CHECK-NEXT:    ret i32 [[MUL]]
+;
+  %sext = sext i1 %x to i32
+  %mul = mul nsw i32 %sext, %y
+  ret i32 %mul
+}
+
+define i32 @test_mul_sext_bool_nuw_nsw(i1 %x, i32 %y) {
+; CHECK-LABEL: @test_mul_sext_bool_nuw_nsw(
+; CHECK-NEXT:    [[Y_NEG:%.*]] = sub nsw i32 0, [[Y:%.*]]
+; CHECK-NEXT:    [[MUL:%.*]] = select i1 [[X:%.*]], i32 [[Y_NEG]], i32 0
+; CHECK-NEXT:    ret i32 [[MUL]]
+;
+  %sext = sext i1 %x to i32
+  %mul = mul nuw nsw i32 %sext, %y
+  ret i32 %mul
+}
+
+define i32 @test_mul_sext_bool_commuted(i1 %x, i32 %y) {
+; CHECK-LABEL: @test_mul_sext_bool_commuted(
+; CHECK-NEXT:    [[TMP1:%.*]] = xor i32 [[Y:%.*]], -2
+; CHECK-NEXT:    [[YY_NEG1:%.*]] = add i32 [[TMP1]], 1
+; CHECK-NEXT:    [[MUL:%.*]] = select i1 [[X:%.*]], i32 [[YY_NEG1]], i32 0
+; CHECK-NEXT:    ret i32 [[MUL]]
+;
+  %yy = xor i32 %y, 1
+  %sext = sext i1 %x to i32
+  %mul = mul i32 %yy, %sext
+  ret i32 %mul
+}
+
+define i32 @test_mul_sext_nonbool(i2 %x, i32 %y) {
+; CHECK-LABEL: @test_mul_sext_nonbool(
+; CHECK-NEXT:    [[SEXT:%.*]] = sext i2 [[X:%.*]] to i32
+; CHECK-NEXT:    [[MUL:%.*]] = mul i32 [[SEXT]], [[Y:%.*]]
+; CHECK-NEXT:    ret i32 [[MUL]]
+;
+  %sext = sext i2 %x to i32
+  %mul = mul i32 %sext, %y
+  ret i32 %mul
+}
+
+define i32 @test_mul_sext_multiuse(i1 %x, i32 %y) {
+; CHECK-LABEL: @test_mul_sext_multiuse(
+; CHECK-NEXT:    [[SEXT:%.*]] = sext i1 [[X:%.*]] to i32
+; CHECK-NEXT:    tail call void @use(i32 [[SEXT]])
+; CHECK-NEXT:    [[MUL:%.*]] = mul i32 [[SEXT]], [[Y:%.*]]
+; CHECK-NEXT:    ret i32 [[MUL]]
+;
+  %sext = sext i1 %x to i32
+  tail call void @use(i32 %sext)
+  %mul = mul i32 %sext, %y
+  ret i32 %mul
+}
diff --git a/llvm/test/Transforms/InstCombine/not.ll b/llvm/test/Transforms/InstCombine/not.ll
index f277d13eee930c..98b5d980415602 100644
--- a/llvm/test/Transforms/InstCombine/not.ll
+++ b/llvm/test/Transforms/InstCombine/not.ll
@@ -3,6 +3,8 @@
 
 declare void @use1(i1)
 declare void @use8(i8)
+declare void @f1()
+declare void @f2()
 
 define i32 @test1(i32 %A) {
 ; CHECK-LABEL: @test1(
@@ -858,3 +860,204 @@ define i32 @test_zext(i32 %a, i32 %b){
   %not = xor i32 %add, -1
   ret i32 %not
 }
+
+define void @test_invert_demorgan_or(i32 %a, i32 %b, i1 %cond) {
+; CHECK-LABEL: @test_invert_demorgan_or(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ne i32 [[B:%.*]], 0
+; CHECK-NEXT:    [[CMP3:%.*]] = icmp eq i32 [[B1:%.*]], 0
+; CHECK-NEXT:    [[OR_NOT1:%.*]] = and i1 [[CMP2]], [[CMP3]]
+; CHECK-NEXT:    [[MERGE:%.*]] = and i1 [[OR_NOT1]], [[COND:%.*]]
+; CHECK-NEXT:    br i1 [[MERGE]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    call void @f1()
+; CHECK-NEXT:    unreachable
+; CHECK:       if.else:
+; CHECK-NEXT:    call void @f2()
+; CHECK-NEXT:    unreachable
+;
+entry:
+  %cmp1 = icmp eq i32 %a, 0
+  %cmp2 = icmp ne i32 %b, 0
+  %or = or i1 %cmp1, %cmp2
+  %not = xor i1 %cond, true
+  %merge = or i1 %not, %or
+  br i1 %merge, label %if.then, label %if.else
+if.then:
+  call void @f1()
+  unreachable
+if.else:
+  call void @f2()
+  unreachable
+}
+
+define i1 @test_invert_demorgan_or2(i64 %a, i64 %b, i64 %c) {
+; CHECK-LABEL: @test_invert_demorgan_or2(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ult i64 [[A:%.*]], 24
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ult i64 [[B:%.*]], 60
+; CHECK-NEXT:    [[OR1_NOT1:%.*]] = and i1 [[CMP1]], [[CMP2]]
+; CHECK-NEXT:    [[CMP3:%.*]] = icmp ult i64 [[C:%.*]], 60
+; CHECK-NEXT:    [[NOT:%.*]] = and i1 [[OR1_NOT1]], [[CMP3]]
+; CHECK-NEXT:    ret i1 [[NOT]]
+;
+  %cmp1 = icmp ugt i64 %a, 23
+  %cmp2 = icmp ugt i64 %b, 59
+  %or1 = or i1 %cmp1, %cmp2
+  %cmp3 = icmp ugt i64 %c, 59
+  %or2 = or i1 %or1, %cmp3
+  %not = xor i1 %or2, true
+  ret i1 %not
+}
+
+define i1 @test_invert_demorgan_or3(i32 %a, i32 %b) {
+; CHECK-LABEL: @test_invert_demorgan_or3(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ne i32 [[A:%.*]], 178206
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[B:%.*]], -196608
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ult i32 [[TMP1]], -1506
+; CHECK-NEXT:    [[TMP2:%.*]] = add i32 [[B]], -917760
+; CHECK-NEXT:    [[CMP3:%.*]] = icmp ult i32 [[TMP2]], -716213
+; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[B]], -1114112
+; CHECK-NEXT:    [[CMP4:%.*]] = icmp ult i32 [[TMP3]], -196112
+; CHECK-NEXT:    [[OR1_NOT2:%.*]] = and i1 [[CMP1]], [[CMP2]]
+; CHECK-NEXT:    [[OR2_NOT1:%.*]] = and i1 [[OR1_NOT2]], [[CMP3]]
+; CHECK-NEXT:    [[NOT:%.*]] = and i1 [[OR2_NOT1]], [[CMP4]]
+; CHECK-NEXT:    ret i1 [[NOT]]
+;
+  %cmp1 = icmp eq i32 %a, 178206
+  %v1 = add i32 %b, -195102
+  %cmp2 = icmp ult i32 %v1, 1506
+  %v2 = add i32 %b, -201547
+  %cmp3 = icmp ult i32 %v2, 716213
+  %v3 = add i32 %b, -918000
+  %cmp4 = icmp ult i32 %v3, 196112
+  %or1 = or i1 %cmp1, %cmp2
+  %or2 = or i1 %or1, %cmp3
+  %or3 = or i1 %or2, %cmp4
+  %not = xor i1 %or3, true
+  ret i1 %not
+}
+
+define i1 @test_invert_demorgan_logical_or(i64 %x, i64 %y) {
+; CHECK-LABEL: @test_invert_demorgan_logical_or(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ne i64 [[X:%.*]], 27
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ne i64 [[Y:%.*]], 0
+; CHECK-NEXT:    [[SEL_NOT1:%.*]] = select i1 [[CMP1]], i1 [[CMP2]], i1 false
+; CHECK-NEXT:    [[CMP3:%.*]] = icmp ne i64 [[X]], 0
+; CHECK-NEXT:    [[NOT:%.*]] = and i1 [[CMP3]], [[SEL_NOT1]]
+; CHECK-NEXT:    ret i1 [[NOT]]
+;
+  %cmp1 = icmp eq i64 %x, 27
+  %cmp2 = icmp eq i64 %y, 0
+  %sel = select i1 %cmp1, i1 true, i1 %cmp2
+  %cmp3 = icmp eq i64 %x, 0
+  %or = or i1 %cmp3, %sel
+  %not = xor i1 %or, true
+  ret i1 %not
+}
+
+define i1 @test_invert_demorgan_and(i32 %a, i32 %b, i1 %cond) {
+; CHECK-LABEL: @test_invert_demorgan_and(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ne i32 [[B:%.*]], 0
+; CHECK-NEXT:    [[CMP3:%.*]] = icmp eq i32 [[B1:%.*]], 0
+; CHECK-NEXT:    [[AND_NOT1:%.*]] = or i1 [[CMP2]], [[CMP3]]
+; CHECK-NEXT:    [[MERGE:%.*]] = or i1 [[AND_NOT1]], [[COND:%.*]]
+; CHECK-NEXT:    br i1 [[MERGE]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    call void @f1()
+; CHECK-NEXT:    unreachable
+; CHECK:       if.else:
+; CHECK-NEXT:    call void @f2()
+; CHECK-NEXT:    unreachable
+;
+entry:
+  %cmp1 = icmp eq i32 %a, 0
+  %cmp2 = icmp ne i32 %b, 0
+  %and = and i1 %cmp1, %cmp2
+  %not = xor i1 %cond, true
+  %merge = and i1 %not, %and
+  br i1 %merge, label %if.then, label %if.else
+if.then:
+  call void @f1()
+  unreachable
+if.else:
+  call void @f2()
+  unreachable
+}
+
+define i64 @test_invert_demorgan_and2(i64 %x) {
+; CHECK-LABEL: @test_invert_demorgan_and2(
+; CHECK-NEXT:    [[TMP1:%.*]] = sub i64 0, [[X:%.*]]
+; CHECK-NEXT:    [[SUB:%.*]] = or i64 [[TMP1]], -9223372036854775808
+; CHECK-NEXT:    ret i64 [[SUB]]
+;
+  %add = add i64 %x, 9223372036854775807
+  %and = and i64 %add, 9223372036854775807
+  %sub = xor i64 %and, -1
+  ret i64 %sub
+}
+
+define i1 @test_invert_demorgan_and3(i32 %a, i32 %b) {
+; CHECK-LABEL: @test_invert_demorgan_and3(
+; CHECK-NEXT:    [[ADD:%.*]] = sub i32 [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    [[AND:%.*]] = and i32 [[ADD]], 4095
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[AND]], 4095
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %not = xor i32 %a, -1
+  %add = add i32 %b, %not
+  %and = and i32 %add, 4095
+  %cmp = icmp eq i32 %and, 0
+  ret i1 %cmp
+}
+
+define i1 @test_invert_demorgan_logical_and(i64 %x, i64 %y) {
+; CHECK-LABEL: @test_invert_demorgan_logical_and(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ne i64 [[X:%.*]], 27
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ne i64 [[Y:%.*]], 0
+; CHECK-NEXT:    [[SEL_NOT1:%.*]] = select i1 [[CMP1]], i1 true, i1 [[CMP2]]
+; CHECK-NEXT:    [[CMP3:%.*]] = icmp ne i64 [[X]], 0
+; CHECK-NEXT:    [[NOT:%.*]] = and i1 [[CMP3]], [[SEL_NOT1]]
+; CHECK-NEXT:    ret i1 [[NOT]]
+;
+  %cmp1 = icmp eq i64 %x, 27
+  %cmp2 = icmp eq i64 %y, 0
+  %sel = select i1 %cmp1, i1 %cmp2, i1 false
+  %cmp3 = icmp eq i64 %x, 0
+  %or = or i1 %cmp3, %sel
+  %not = xor i1 %or, true
+  ret i1 %not
+}
+
+define i1 @test_invert_demorgan_and_multiuse(i32 %a, i32 %b, i1 %cond) {
+; CHECK-LABEL: @test_invert_demorgan_and_multiuse(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i32 [[A:%.*]], 0
+; CHECK-NEXT:    call void @use1(i1 [[CMP1]])
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ne i32 [[B:%.*]], 0
+; CHECK-NEXT:    [[NOT:%.*]] = xor i1 [[COND:%.*]], true
+; CHECK-NEXT:    [[TMP0:%.*]] = and i1 [[CMP2]], [[NOT]]
+; CHECK-NEXT:    [[MERGE:%.*]] = and i1 [[TMP0]], [[CMP1]]
+; CHECK-NEXT:    br i1 [[MERGE]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    call void @f1()
+; CHECK-NEXT:    unreachable
+; CHECK:       if.else:
+; CHECK-NEXT:    call void @f2()
+; CHECK-NEXT:    unreachable
+;
+entry:
+  %cmp1 = icmp eq i32 %a, 0
+  call void @use1(i1 %cmp1)
+  %cmp2 = icmp ne i32 %b, 0
+  %and = and i1 %cmp1, %cmp2
+  %not = xor i1 %cond, true
+  %merge = and i1 %not, %and
+  br i1 %merge, label %if.then, label %if.else
+if.then:
+  call void @f1()
+  unreachable
+if.else:
+  call void @f2()
+  unreachable
+}
diff --git a/llvm/test/Transforms/InstCombine/powi.ll b/llvm/test/Transforms/InstCombine/powi.ll
index 89efbb6f453611..43e34c889106e1 100644
--- a/llvm/test/Transforms/InstCombine/powi.ll
+++ b/llvm/test/Transforms/InstCombine/powi.ll
@@ -125,22 +125,55 @@ entry:
   ret double %mul
 }
 
-define double @powi_fmul_powi_no_reassoc(double %x, i32 %y, i32 %z) {
-; CHECK-LABEL: @powi_fmul_powi_no_reassoc(
+; Negative test: Missing reassoc flag on fmul
+define double @powi_fmul_powi_no_reassoc1(double %x, i32 %y, i32 %z) {
+; CHECK-LABEL: @powi_fmul_powi_no_reassoc1(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[P1:%.*]] = tail call double @llvm.powi.f64.i32(double [[X:%.*]], i32 [[Y:%.*]])
-; CHECK-NEXT:    [[P2:%.*]] = tail call double @llvm.powi.f64.i32(double [[X]], i32 [[Z:%.*]])
+; CHECK-NEXT:    [[P1:%.*]] = tail call reassoc double @llvm.powi.f64.i32(double [[X:%.*]], i32 [[Y:%.*]])
+; CHECK-NEXT:    [[P2:%.*]] = tail call reassoc double @llvm.powi.f64.i32(double [[X]], i32 [[Z:%.*]])
 ; CHECK-NEXT:    [[MUL:%.*]] = fmul double [[P2]], [[P1]]
 ; CHECK-NEXT:    ret double [[MUL]]
 ;
 entry:
-  %p1 = tail call double @llvm.powi.f64.i32(double %x, i32 %y)
-  %p2 = tail call double @llvm.powi.f64.i32(double %x, i32 %z)
+  %p1 = tail call reassoc double @llvm.powi.f64.i32(double %x, i32 %y)
+  %p2 = tail call reassoc double @llvm.powi.f64.i32(double %x, i32 %z)
   %mul = fmul double %p2, %p1
   ret double %mul
 }
 
+; Negative test: Missing reassoc flag on 2nd operand
+define double @powi_fmul_powi_no_reassoc2(double %x, i32 %y, i32 %z) {
+; CHECK-LABEL: @powi_fmul_powi_no_reassoc2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[P1:%.*]] = tail call reassoc double @llvm.powi.f64.i32(double [[X:%.*]], i32 [[Y:%.*]])
+; CHECK-NEXT:    [[P2:%.*]] = tail call double @llvm.powi.f64.i32(double [[X]], i32 [[Z:%.*]])
+; CHECK-NEXT:    [[MUL:%.*]] = fmul reassoc double [[P2]], [[P1]]
+; CHECK-NEXT:    ret double [[MUL]]
+;
+entry:
+  %p1 = tail call reassoc double @llvm.powi.f64.i32(double %x, i32 %y)
+  %p2 = tail call double @llvm.powi.f64.i32(double %x, i32 %z)
+  %mul = fmul reassoc double %p2, %p1
+  ret double %mul
+}
 
+; Negative test: Missing reassoc flag on 1st operand
+define double @powi_fmul_powi_no_reassoc3(double %x, i32 %y, i32 %z) {
+; CHECK-LABEL: @powi_fmul_powi_no_reassoc3(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[P1:%.*]] = tail call double @llvm.powi.f64.i32(double [[X:%.*]], i32 [[Y:%.*]])
+; CHECK-NEXT:    [[P2:%.*]] = tail call reassoc double @llvm.powi.f64.i32(double [[X]], i32 [[Z:%.*]])
+; CHECK-NEXT:    [[MUL:%.*]] = fmul reassoc double [[P2]], [[P1]]
+; CHECK-NEXT:    ret double [[MUL]]
+;
+entry:
+  %p1 = tail call double @llvm.powi.f64.i32(double %x, i32 %y)
+  %p2 = tail call reassoc double @llvm.powi.f64.i32(double %x, i32 %z)
+  %mul = fmul reassoc double %p2, %p1
+  ret double %mul
+}
+
+; All of the fmul and its operands should have the reassoc flags
 define double @powi_fmul_powi(double %x, i32 %y, i32 %z) {
 ; CHECK-LABEL: @powi_fmul_powi(
 ; CHECK-NEXT:  entry:
@@ -149,8 +182,8 @@ define double @powi_fmul_powi(double %x, i32 %y, i32 %z) {
 ; CHECK-NEXT:    ret double [[MUL]]
 ;
 entry:
-  %p1 = tail call double @llvm.powi.f64.i32(double %x, i32 %y)
-  %p2 = tail call double @llvm.powi.f64.i32(double %x, i32 %z)
+  %p1 = tail call reassoc double @llvm.powi.f64.i32(double %x, i32 %y)
+  %p2 = tail call reassoc double @llvm.powi.f64.i32(double %x, i32 %z)
   %mul = fmul reassoc double %p2, %p1
   ret double %mul
 }
@@ -163,8 +196,8 @@ define double @powi_fmul_powi_fast_on_fmul(double %x, i32 %y, i32 %z) {
 ; CHECK-NEXT:    ret double [[MUL]]
 ;
 entry:
-  %p1 = tail call double @llvm.powi.f64.i32(double %x, i32 %y)
-  %p2 = tail call double @llvm.powi.f64.i32(double %x, i32 %z)
+  %p1 = tail call fast double @llvm.powi.f64.i32(double %x, i32 %y)
+  %p2 = tail call fast double @llvm.powi.f64.i32(double %x, i32 %z)
   %mul = fmul fast double %p2, %p1
   ret double %mul
 }
@@ -192,8 +225,23 @@ define double @powi_fmul_powi_same_power(double %x, i32 %y, i32 %z) {
 ; CHECK-NEXT:    ret double [[MUL]]
 ;
 entry:
-  %p1 = tail call double @llvm.powi.f64.i32(double %x, i32 %y)
-  %p2 = tail call double @llvm.powi.f64.i32(double %x, i32 %y)
+  %p1 = tail call reassoc double @llvm.powi.f64.i32(double %x, i32 %y)
+  %p2 = tail call reassoc double @llvm.powi.f64.i32(double %x, i32 %y)
+  %mul = fmul reassoc double %p2, %p1
+  ret double %mul
+}
+
+define double @powi_fmul_powi_different_integer_types(double %x, i32 %y, i16 %z) {
+; CHECK-LABEL: @powi_fmul_powi_different_integer_types(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[P1:%.*]] = tail call reassoc double @llvm.powi.f64.i32(double [[X:%.*]], i32 [[Y:%.*]])
+; CHECK-NEXT:    [[P2:%.*]] = tail call reassoc double @llvm.powi.f64.i16(double [[X]], i16 [[Z:%.*]])
+; CHECK-NEXT:    [[MUL:%.*]] = fmul reassoc double [[P2]], [[P1]]
+; CHECK-NEXT:    ret double [[MUL]]
+;
+entry:
+  %p1 = tail call reassoc double @llvm.powi.f64.i32(double %x, i32 %y)
+  %p2 = tail call reassoc double @llvm.powi.f64.i16(double %x, i16 %z)
   %mul = fmul reassoc double %p2, %p1
   ret double %mul
 }
@@ -201,16 +249,16 @@ entry:
 define double @powi_fmul_powi_use_first(double %x, i32 %y, i32 %z) {
 ; CHECK-LABEL: @powi_fmul_powi_use_first(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[P1:%.*]] = tail call double @llvm.powi.f64.i32(double [[X:%.*]], i32 [[Y:%.*]])
+; CHECK-NEXT:    [[P1:%.*]] = tail call reassoc double @llvm.powi.f64.i32(double [[X:%.*]], i32 [[Y:%.*]])
 ; CHECK-NEXT:    tail call void @use(double [[P1]])
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[Y]], [[Z:%.*]]
 ; CHECK-NEXT:    [[MUL:%.*]] = call reassoc double @llvm.powi.f64.i32(double [[X]], i32 [[TMP0]])
 ; CHECK-NEXT:    ret double [[MUL]]
 ;
 entry:
-  %p1 = tail call double @llvm.powi.f64.i32(double %x, i32 %y)
+  %p1 = tail call reassoc double @llvm.powi.f64.i32(double %x, i32 %y)
   tail call void @use(double %p1)
-  %p2 = tail call double @llvm.powi.f64.i32(double %x, i32 %z)
+  %p2 = tail call reassoc double @llvm.powi.f64.i32(double %x, i32 %z)
   %mul = fmul reassoc double %p1, %p2
   ret double %mul
 }
@@ -218,16 +266,16 @@ entry:
 define double @powi_fmul_powi_use_second(double %x, i32 %y, i32 %z) {
 ; CHECK-LABEL: @powi_fmul_powi_use_second(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[P1:%.*]] = tail call double @llvm.powi.f64.i32(double [[X:%.*]], i32 [[Z:%.*]])
+; CHECK-NEXT:    [[P1:%.*]] = tail call reassoc double @llvm.powi.f64.i32(double [[X:%.*]], i32 [[Z:%.*]])
 ; CHECK-NEXT:    tail call void @use(double [[P1]])
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[Y:%.*]], [[Z]]
 ; CHECK-NEXT:    [[MUL:%.*]] = call reassoc double @llvm.powi.f64.i32(double [[X]], i32 [[TMP0]])
 ; CHECK-NEXT:    ret double [[MUL]]
 ;
 entry:
-  %p1 = tail call double @llvm.powi.f64.i32(double %x, i32 %z)
+  %p1 = tail call reassoc double @llvm.powi.f64.i32(double %x, i32 %z)
   tail call void @use(double %p1)
-  %p2 = tail call double @llvm.powi.f64.i32(double %x, i32 %y)
+  %p2 = tail call reassoc double @llvm.powi.f64.i32(double %x, i32 %y)
   %mul = fmul reassoc double %p2, %p1
   ret double %mul
 }
@@ -333,11 +381,60 @@ define double @fdiv_pow_powi_negative(double %x) {
 ; Negative test: The 2nd powi argument is a variable
 define double @fdiv_pow_powi_negative_variable(double %x, i32 %y) {
 ; CHECK-LABEL: @fdiv_pow_powi_negative_variable(
-; CHECK-NEXT:    [[P1:%.*]] = call double @llvm.powi.f64.i32(double [[X:%.*]], i32 [[Y:%.*]])
+; CHECK-NEXT:    [[P1:%.*]] = call reassoc double @llvm.powi.f64.i32(double [[X:%.*]], i32 [[Y:%.*]])
 ; CHECK-NEXT:    [[DIV:%.*]] = fdiv reassoc nnan double [[P1]], [[X]]
 ; CHECK-NEXT:    ret double [[DIV]]
 ;
-  %p1 = call double @llvm.powi.f64.i32(double %x, i32 %y)
+  %p1 = call reassoc double @llvm.powi.f64.i32(double %x, i32 %y)
   %div = fdiv reassoc nnan double %p1, %x
   ret double %div
 }
+
+; powi(X, Y) * X --> powi(X, Y+1)
+define double @powi_fmul_powi_x(double noundef %x) {
+; CHECK-LABEL: @powi_fmul_powi_x(
+; CHECK-NEXT:    [[MUL:%.*]] = call reassoc double @llvm.powi.f64.i32(double [[X:%.*]], i32 4)
+; CHECK-NEXT:    ret double [[MUL]]
+;
+  %p1 = tail call reassoc double @llvm.powi.f64.i32(double %x, i32 3)
+  %mul = fmul reassoc double %p1, %x
+  ret double %mul
+}
+
+; Negative test: Multi-use
+define double @powi_fmul_powi_x_multi_use(double noundef %x) {
+; CHECK-LABEL: @powi_fmul_powi_x_multi_use(
+; CHECK-NEXT:    [[P1:%.*]] = tail call double @llvm.powi.f64.i32(double [[X:%.*]], i32 3)
+; CHECK-NEXT:    tail call void @use(double [[P1]])
+; CHECK-NEXT:    [[MUL:%.*]] = fmul reassoc double [[P1]], [[X]]
+; CHECK-NEXT:    ret double [[MUL]]
+;
+  %p1 = tail call double @llvm.powi.f64.i32(double %x, i32 3)
+  tail call void @use(double %p1)
+  %mul = fmul reassoc double %p1, %x
+  ret double %mul
+}
+
+; Negative test: Miss fmf flag
+define double @powi_fmul_powi_x_missing_reassoc(double noundef %x) {
+; CHECK-LABEL: @powi_fmul_powi_x_missing_reassoc(
+; CHECK-NEXT:    [[P1:%.*]] = tail call double @llvm.powi.f64.i32(double [[X:%.*]], i32 3)
+; CHECK-NEXT:    [[MUL:%.*]] = fmul double [[P1]], [[X]]
+; CHECK-NEXT:    ret double [[MUL]]
+;
+  %p1 = tail call double @llvm.powi.f64.i32(double %x, i32 3)
+  %mul = fmul double %p1, %x
+  ret double %mul
+}
+
+; Negative test: overflow
+define double @powi_fmul_powi_x_overflow(double noundef %x) {
+; CHECK-LABEL: @powi_fmul_powi_x_overflow(
+; CHECK-NEXT:    [[P1:%.*]] = tail call double @llvm.powi.f64.i32(double [[X:%.*]], i32 2147483647)
+; CHECK-NEXT:    [[MUL:%.*]] = fmul reassoc double [[P1]], [[X]]
+; CHECK-NEXT:    ret double [[MUL]]
+;
+  %p1 = tail call double @llvm.powi.f64.i32(double %x, i32 2147483647) ; INT_MAX
+  %mul = fmul reassoc double %p1, %x
+  ret double %mul
+}
diff --git a/llvm/test/Transforms/InstCombine/pr27236.ll b/llvm/test/Transforms/InstCombine/pr27236.ll
index 7aad0a90281a1a..61ea344b1bdbd4 100644
--- a/llvm/test/Transforms/InstCombine/pr27236.ll
+++ b/llvm/test/Transforms/InstCombine/pr27236.ll
@@ -4,7 +4,7 @@
 define float @test1(i32 %scale) {
 ; CHECK-LABEL: @test1(
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.smax.i32(i32 [[SCALE:%.*]], i32 1)
-; CHECK-NEXT:    [[TMP2:%.*]] = uitofp i32 [[TMP1]] to float
+; CHECK-NEXT:    [[TMP2:%.*]] = sitofp i32 [[TMP1]] to float
 ; CHECK-NEXT:    ret float [[TMP2]]
 ;
   %1 = icmp sgt i32 1, %scale
diff --git a/llvm/test/Transforms/InstCombine/pr63791.ll b/llvm/test/Transforms/InstCombine/pr63791.ll
index 78cc1130fb33f0..73a559f9892612 100644
--- a/llvm/test/Transforms/InstCombine/pr63791.ll
+++ b/llvm/test/Transforms/InstCombine/pr63791.ll
@@ -15,7 +15,7 @@ define void @y() {
 ; CHECK-NEXT:    store i1 true, ptr poison, align 1
 ; CHECK-NEXT:    br i1 poison, label [[FOR_COND_I]], label [[FOR_COND5_PREHEADER_I]]
 ; CHECK:       for.cond5.preheader.i:
-; CHECK-NEXT:    br i1 false, label [[FOR_INC19_I:%.*]], label [[FOR_COND1_LOOPEXIT_I:%.*]]
+; CHECK-NEXT:    br i1 true, label [[FOR_COND1_LOOPEXIT_I:%.*]], label [[FOR_INC19_I:%.*]]
 ; CHECK:       for.inc19.i:
 ; CHECK-NEXT:    br i1 poison, label [[FOR_INC19_I]], label [[FOR_COND1_LOOPEXIT_I]]
 ;
diff --git a/llvm/test/Transforms/InstCombine/select-binop-foldable-floating-point.ll b/llvm/test/Transforms/InstCombine/select-binop-foldable-floating-point.ll
index 496854c7d731ab..77ff16a8b2e3d8 100644
--- a/llvm/test/Transforms/InstCombine/select-binop-foldable-floating-point.ll
+++ b/llvm/test/Transforms/InstCombine/select-binop-foldable-floating-point.ll
@@ -1,8 +1,19 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -passes=instcombine -S | FileCheck %s
 
-define float @select_fadd(i1 %cond, float %A, float %B) {
-; CHECK-LABEL: @select_fadd(
+define float @select_maybe_nan_fadd(i1 %cond, float %A, float %B) {
+; CHECK-LABEL: @select_maybe_nan_fadd(
+; CHECK-NEXT:    [[C:%.*]] = fadd float [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    [[D:%.*]] = select i1 [[COND:%.*]], float [[C]], float [[A]]
+; CHECK-NEXT:    ret float [[D]]
+;
+  %C = fadd float %A, %B
+  %D = select i1 %cond, float %C, float %A
+  ret float %D
+}
+
+define float @select_fpclass_fadd(i1 %cond, float nofpclass(nan) %A, float %B) {
+; CHECK-LABEL: @select_fpclass_fadd(
 ; CHECK-NEXT:    [[C:%.*]] = select i1 [[COND:%.*]], float [[B:%.*]], float -0.000000e+00
 ; CHECK-NEXT:    [[D:%.*]] = fadd float [[C]], [[A:%.*]]
 ; CHECK-NEXT:    ret float [[D]]
@@ -12,41 +23,52 @@ define float @select_fadd(i1 %cond, float %A, float %B) {
   ret float %D
 }
 
-define float @select_fadd_swapped(i1 %cond, float %A, float %B) {
-; CHECK-LABEL: @select_fadd_swapped(
-; CHECK-NEXT:    [[C:%.*]] = select i1 [[COND:%.*]], float -0.000000e+00, float [[B:%.*]]
+define float @select_nnan_fadd(i1 %cond, float %A, float %B) {
+; CHECK-LABEL: @select_nnan_fadd(
+; CHECK-NEXT:    [[C:%.*]] = select nnan i1 [[COND:%.*]], float [[B:%.*]], float -0.000000e+00
 ; CHECK-NEXT:    [[D:%.*]] = fadd float [[C]], [[A:%.*]]
 ; CHECK-NEXT:    ret float [[D]]
 ;
   %C = fadd float %A, %B
-  %D = select i1 %cond, float %A, float %C
+  %D = select nnan i1 %cond, float %C, float %A
   ret float %D
 }
 
-define float @select_fadd_fast_math(i1 %cond, float %A, float %B) {
-; CHECK-LABEL: @select_fadd_fast_math(
-; CHECK-NEXT:    [[C:%.*]] = select i1 [[COND:%.*]], float [[B:%.*]], float -0.000000e+00
+define float @select_nnan_fadd_swapped(i1 %cond, float %A, float %B) {
+; CHECK-LABEL: @select_nnan_fadd_swapped(
+; CHECK-NEXT:    [[C:%.*]] = select nnan i1 [[COND:%.*]], float -0.000000e+00, float [[B:%.*]]
+; CHECK-NEXT:    [[D:%.*]] = fadd float [[C]], [[A:%.*]]
+; CHECK-NEXT:    ret float [[D]]
+;
+  %C = fadd float %A, %B
+  %D = select nnan i1 %cond, float %A, float %C
+  ret float %D
+}
+
+define float @select_nnan_fadd_fast_math(i1 %cond, float %A, float %B) {
+; CHECK-LABEL: @select_nnan_fadd_fast_math(
+; CHECK-NEXT:    [[C:%.*]] = select nnan i1 [[COND:%.*]], float [[B:%.*]], float -0.000000e+00
 ; CHECK-NEXT:    [[D:%.*]] = fadd fast float [[C]], [[A:%.*]]
 ; CHECK-NEXT:    ret float [[D]]
 ;
   %C = fadd fast float %A, %B
-  %D = select i1 %cond, float %C, float %A
+  %D = select nnan i1 %cond, float %C, float %A
   ret float %D
 }
 
-define float @select_fadd_swapped_fast_math(i1 %cond, float %A, float %B) {
-; CHECK-LABEL: @select_fadd_swapped_fast_math(
-; CHECK-NEXT:    [[C:%.*]] = select i1 [[COND:%.*]], float -0.000000e+00, float [[B:%.*]]
+define float @select_nnan_fadd_swapped_fast_math(i1 %cond, float %A, float %B) {
+; CHECK-LABEL: @select_nnan_fadd_swapped_fast_math(
+; CHECK-NEXT:    [[C:%.*]] = select nnan i1 [[COND:%.*]], float -0.000000e+00, float [[B:%.*]]
 ; CHECK-NEXT:    [[D:%.*]] = fadd fast float [[C]], [[A:%.*]]
 ; CHECK-NEXT:    ret float [[D]]
 ;
   %C = fadd fast float %A, %B
-  %D = select i1 %cond, float %A, float %C
+  %D = select nnan i1 %cond, float %A, float %C
   ret float %D
 }
 
-define <4 x float> @select_nsz_fadd_v4f32(<4 x i1> %cond, <4 x float> %A, <4 x float> %B) {
-; CHECK-LABEL: @select_nsz_fadd_v4f32(
+define <4 x float> @select_nnan_nsz_fadd_v4f32(<4 x i1> %cond, <4 x float> %A, <4 x float> %B) {
+; CHECK-LABEL: @select_nnan_nsz_fadd_v4f32(
 ; CHECK-NEXT:    [[C:%.*]] = select nnan nsz <4 x i1> [[COND:%.*]], <4 x float> [[B:%.*]], <4 x float> zeroinitializer
 ; CHECK-NEXT:    [[D:%.*]] = fadd nnan nsz <4 x float> [[C]], [[A:%.*]]
 ; CHECK-NEXT:    ret <4 x float> [[D]]
@@ -56,202 +78,202 @@ define <4 x float> @select_nsz_fadd_v4f32(<4 x i1> %cond, <4 x float> %A, <4 x f
   ret <4 x float> %D
 }
 
-define <vscale x 4 x float> @select_nsz_fadd_nxv4f32(<vscale x 4 x i1> %cond, <vscale x 4 x float> %A, <vscale x 4 x float> %B) {
-; CHECK-LABEL: @select_nsz_fadd_nxv4f32(
+define <vscale x 4 x float> @select_nnan_nsz_fadd_nxv4f32(<vscale x 4 x i1> %cond, <vscale x 4 x float> %A, <vscale x 4 x float> %B) {
+; CHECK-LABEL: @select_nnan_nsz_fadd_nxv4f32(
 ; CHECK-NEXT:    [[C:%.*]] = select nnan nsz <vscale x 4 x i1> [[COND:%.*]], <vscale x 4 x float> [[B:%.*]], <vscale x 4 x float> zeroinitializer
 ; CHECK-NEXT:    [[D:%.*]] = fadd nnan nsz <vscale x 4 x float> [[C]], [[A:%.*]]
 ; CHECK-NEXT:    ret <vscale x 4 x float> [[D]]
 ;
-  %C = fadd nsz nnan <vscale x 4 x float> %A, %B
-  %D = select nsz nnan <vscale x 4 x i1> %cond, <vscale x 4 x float> %C, <vscale x 4 x float> %A
+  %C = fadd nnan nsz <vscale x 4 x float> %A, %B
+  %D = select nnan nsz <vscale x 4 x i1> %cond, <vscale x 4 x float> %C, <vscale x 4 x float> %A
   ret <vscale x 4 x float> %D
 }
 
-define <vscale x 4 x float> @select_nsz_fadd_nxv4f32_swapops(<vscale x 4 x i1> %cond, <vscale x 4 x float> %A, <vscale x 4 x float> %B) {
-; CHECK-LABEL: @select_nsz_fadd_nxv4f32_swapops(
+define <vscale x 4 x float> @select_nnan_nsz_fadd_nxv4f32_swapops(<vscale x 4 x i1> %cond, <vscale x 4 x float> %A, <vscale x 4 x float> %B) {
+; CHECK-LABEL: @select_nnan_nsz_fadd_nxv4f32_swapops(
 ; CHECK-NEXT:    [[C:%.*]] = select fast <vscale x 4 x i1> [[COND:%.*]], <vscale x 4 x float> zeroinitializer, <vscale x 4 x float> [[B:%.*]]
 ; CHECK-NEXT:    [[D:%.*]] = fadd fast <vscale x 4 x float> [[C]], [[A:%.*]]
 ; CHECK-NEXT:    ret <vscale x 4 x float> [[D]]
 ;
   %C = fadd fast <vscale x 4 x float> %A, %B
-  %D = select fast <vscale x 4 x i1> %cond, <vscale x 4 x float> %A, <vscale x 4 x float> %C
+  %D = select nnan fast <vscale x 4 x i1> %cond, <vscale x 4 x float> %A, <vscale x 4 x float> %C
   ret <vscale x 4 x float> %D
 }
 
-define float @select_fmul(i1 %cond, float %A, float %B) {
-; CHECK-LABEL: @select_fmul(
-; CHECK-NEXT:    [[C:%.*]] = select i1 [[COND:%.*]], float [[B:%.*]], float 1.000000e+00
+define float @select_nnan_fmul(i1 %cond, float %A, float %B) {
+; CHECK-LABEL: @select_nnan_fmul(
+; CHECK-NEXT:    [[C:%.*]] = select nnan i1 [[COND:%.*]], float [[B:%.*]], float 1.000000e+00
 ; CHECK-NEXT:    [[D:%.*]] = fmul float [[C]], [[A:%.*]]
 ; CHECK-NEXT:    ret float [[D]]
 ;
   %C = fmul float %A, %B
-  %D = select i1 %cond, float %C, float %A
+  %D = select nnan i1 %cond, float %C, float %A
   ret float %D
 }
 
-define float @select_fmul_swapped(i1 %cond, float %A, float %B) {
-; CHECK-LABEL: @select_fmul_swapped(
-; CHECK-NEXT:    [[C:%.*]] = select i1 [[COND:%.*]], float 1.000000e+00, float [[B:%.*]]
+define float @select_nnan_fmul_swapped(i1 %cond, float %A, float %B) {
+; CHECK-LABEL: @select_nnan_fmul_swapped(
+; CHECK-NEXT:    [[C:%.*]] = select nnan i1 [[COND:%.*]], float 1.000000e+00, float [[B:%.*]]
 ; CHECK-NEXT:    [[D:%.*]] = fmul float [[C]], [[A:%.*]]
 ; CHECK-NEXT:    ret float [[D]]
 ;
   %C = fmul float %A, %B
-  %D = select i1 %cond, float %A, float %C
+  %D = select nnan i1 %cond, float %A, float %C
   ret float %D
 }
 
-define float @select_fmul_fast_math(i1 %cond, float %A, float %B) {
-; CHECK-LABEL: @select_fmul_fast_math(
-; CHECK-NEXT:    [[C:%.*]] = select i1 [[COND:%.*]], float [[B:%.*]], float 1.000000e+00
+define float @select_nnan_fmul_fast_math(i1 %cond, float %A, float %B) {
+; CHECK-LABEL: @select_nnan_fmul_fast_math(
+; CHECK-NEXT:    [[C:%.*]] = select nnan i1 [[COND:%.*]], float [[B:%.*]], float 1.000000e+00
 ; CHECK-NEXT:    [[D:%.*]] = fmul fast float [[C]], [[A:%.*]]
 ; CHECK-NEXT:    ret float [[D]]
 ;
   %C = fmul fast float %A, %B
-  %D = select i1 %cond, float %C, float %A
+  %D = select nnan i1 %cond, float %C, float %A
   ret float %D
 }
 
-define float @select_fmul_swapped_fast_math(i1 %cond, float %A, float %B) {
-; CHECK-LABEL: @select_fmul_swapped_fast_math(
-; CHECK-NEXT:    [[C:%.*]] = select i1 [[COND:%.*]], float 1.000000e+00, float [[B:%.*]]
+define float @select_nnan_fmul_swapped_fast_math(i1 %cond, float %A, float %B) {
+; CHECK-LABEL: @select_nnan_fmul_swapped_fast_math(
+; CHECK-NEXT:    [[C:%.*]] = select nnan i1 [[COND:%.*]], float 1.000000e+00, float [[B:%.*]]
 ; CHECK-NEXT:    [[D:%.*]] = fmul fast float [[C]], [[A:%.*]]
 ; CHECK-NEXT:    ret float [[D]]
 ;
   %C = fmul fast float %A, %B
-  %D = select i1 %cond, float %A, float %C
+  %D = select nnan i1 %cond, float %A, float %C
   ret float %D
 }
 
-define float @select_fsub(i1 %cond, float %A, float %B) {
-; CHECK-LABEL: @select_fsub(
-; CHECK-NEXT:    [[C:%.*]] = select i1 [[COND:%.*]], float [[B:%.*]], float 0.000000e+00
+define float @select_nnan_fsub(i1 %cond, float %A, float %B) {
+; CHECK-LABEL: @select_nnan_fsub(
+; CHECK-NEXT:    [[C:%.*]] = select nnan i1 [[COND:%.*]], float [[B:%.*]], float 0.000000e+00
 ; CHECK-NEXT:    [[D:%.*]] = fsub float [[A:%.*]], [[C]]
 ; CHECK-NEXT:    ret float [[D]]
 ;
   %C = fsub float %A, %B
-  %D = select i1 %cond, float %C, float %A
+  %D = select nnan i1 %cond, float %C, float %A
   ret float %D
 }
 
-define float @select_fsub_swapped(i1 %cond, float %A, float %B) {
-; CHECK-LABEL: @select_fsub_swapped(
-; CHECK-NEXT:    [[C:%.*]] = select i1 [[COND:%.*]], float 0.000000e+00, float [[B:%.*]]
+define float @select_nnan_fsub_swapped(i1 %cond, float %A, float %B) {
+; CHECK-LABEL: @select_nnan_fsub_swapped(
+; CHECK-NEXT:    [[C:%.*]] = select nnan i1 [[COND:%.*]], float 0.000000e+00, float [[B:%.*]]
 ; CHECK-NEXT:    [[D:%.*]] = fsub float [[A:%.*]], [[C]]
 ; CHECK-NEXT:    ret float [[D]]
 ;
   %C = fsub float %A, %B
-  %D = select i1 %cond, float %A, float %C
+  %D = select nnan i1 %cond, float %A, float %C
   ret float %D
 }
 
-define float @select_fsub_fast_math(i1 %cond, float %A, float %B) {
-; CHECK-LABEL: @select_fsub_fast_math(
-; CHECK-NEXT:    [[C:%.*]] = select i1 [[COND:%.*]], float [[B:%.*]], float 0.000000e+00
+define float @select_nnan_fsub_fast_math(i1 %cond, float %A, float %B) {
+; CHECK-LABEL: @select_nnan_fsub_fast_math(
+; CHECK-NEXT:    [[C:%.*]] = select nnan i1 [[COND:%.*]], float [[B:%.*]], float 0.000000e+00
 ; CHECK-NEXT:    [[D:%.*]] = fsub fast float [[A:%.*]], [[C]]
 ; CHECK-NEXT:    ret float [[D]]
 ;
   %C = fsub fast float %A, %B
-  %D = select i1 %cond, float %C, float %A
+  %D = select nnan i1 %cond, float %C, float %A
   ret float %D
 }
 
-define float @select_fsub_swapped_fast_math(i1 %cond, float %A, float %B) {
-; CHECK-LABEL: @select_fsub_swapped_fast_math(
-; CHECK-NEXT:    [[C:%.*]] = select i1 [[COND:%.*]], float 0.000000e+00, float [[B:%.*]]
+define float @select_nnan_fsub_swapped_fast_math(i1 %cond, float %A, float %B) {
+; CHECK-LABEL: @select_nnan_fsub_swapped_fast_math(
+; CHECK-NEXT:    [[C:%.*]] = select nnan i1 [[COND:%.*]], float 0.000000e+00, float [[B:%.*]]
 ; CHECK-NEXT:    [[D:%.*]] = fsub fast float [[A:%.*]], [[C]]
 ; CHECK-NEXT:    ret float [[D]]
 ;
   %C = fsub fast float %A, %B
-  %D = select i1 %cond, float %A, float %C
+  %D = select nnan i1 %cond, float %A, float %C
   ret float %D
 }
 
-define <4 x float> @select_nsz_fsub_v4f32(<4 x i1> %cond, <4 x float> %A, <4 x float> %B) {
-; CHECK-LABEL: @select_nsz_fsub_v4f32(
-; CHECK-NEXT:    [[C:%.*]] = select nsz <4 x i1> [[COND:%.*]], <4 x float> [[B:%.*]], <4 x float> zeroinitializer
+define <4 x float> @select_nnan_nsz_fsub_v4f32(<4 x i1> %cond, <4 x float> %A, <4 x float> %B) {
+; CHECK-LABEL: @select_nnan_nsz_fsub_v4f32(
+; CHECK-NEXT:    [[C:%.*]] = select nnan nsz <4 x i1> [[COND:%.*]], <4 x float> [[B:%.*]], <4 x float> zeroinitializer
 ; CHECK-NEXT:    [[D:%.*]] = fsub <4 x float> [[A:%.*]], [[C]]
 ; CHECK-NEXT:    ret <4 x float> [[D]]
 ;
   %C = fsub <4 x float> %A, %B
-  %D = select nsz <4 x i1> %cond, <4 x float> %C, <4 x float> %A
+  %D = select nnan nsz <4 x i1> %cond, <4 x float> %C, <4 x float> %A
   ret <4 x float> %D
 }
 
-define <vscale x 4 x float> @select_nsz_fsub_nxv4f32(<vscale x 4 x i1> %cond, <vscale x 4 x float> %A, <vscale x 4 x float> %B) {
-; CHECK-LABEL: @select_nsz_fsub_nxv4f32(
-; CHECK-NEXT:    [[C:%.*]] = select nsz <vscale x 4 x i1> [[COND:%.*]], <vscale x 4 x float> [[B:%.*]], <vscale x 4 x float> zeroinitializer
+define <vscale x 4 x float> @select_nnan_nsz_fsub_nxv4f32(<vscale x 4 x i1> %cond, <vscale x 4 x float> %A, <vscale x 4 x float> %B) {
+; CHECK-LABEL: @select_nnan_nsz_fsub_nxv4f32(
+; CHECK-NEXT:    [[C:%.*]] = select nnan nsz <vscale x 4 x i1> [[COND:%.*]], <vscale x 4 x float> [[B:%.*]], <vscale x 4 x float> zeroinitializer
 ; CHECK-NEXT:    [[D:%.*]] = fsub <vscale x 4 x float> [[A:%.*]], [[C]]
 ; CHECK-NEXT:    ret <vscale x 4 x float> [[D]]
 ;
   %C = fsub <vscale x 4 x float> %A, %B
-  %D = select nsz <vscale x 4 x i1> %cond, <vscale x 4 x float> %C, <vscale x 4 x float> %A
+  %D = select nnan nsz <vscale x 4 x i1> %cond, <vscale x 4 x float> %C, <vscale x 4 x float> %A
   ret <vscale x 4 x float> %D
 }
 
 ; 'fsub' can only fold on the amount subtracted.
-define float @select_fsub_invalid(i1 %cond, float %A, float %B) {
-; CHECK-LABEL: @select_fsub_invalid(
+define float @select_nnan_fsub_invalid(i1 %cond, float %A, float %B) {
+; CHECK-LABEL: @select_nnan_fsub_invalid(
 ; CHECK-NEXT:    [[C:%.*]] = fsub float [[B:%.*]], [[A:%.*]]
-; CHECK-NEXT:    [[D:%.*]] = select i1 [[COND:%.*]], float [[C]], float [[A]]
+; CHECK-NEXT:    [[D:%.*]] = select nnan i1 [[COND:%.*]], float [[C]], float [[A]]
 ; CHECK-NEXT:    ret float [[D]]
 ;
   %C = fsub float %B, %A
-  %D = select i1 %cond, float %C, float %A
+  %D = select nnan i1 %cond, float %C, float %A
   ret float %D
 }
 
-define float @select_fdiv(i1 %cond, float %A, float %B) {
-; CHECK-LABEL: @select_fdiv(
-; CHECK-NEXT:    [[C:%.*]] = select i1 [[COND:%.*]], float [[B:%.*]], float 1.000000e+00
+define float @select_nnan_fdiv(i1 %cond, float %A, float %B) {
+; CHECK-LABEL: @select_nnan_fdiv(
+; CHECK-NEXT:    [[C:%.*]] = select nnan i1 [[COND:%.*]], float [[B:%.*]], float 1.000000e+00
 ; CHECK-NEXT:    [[D:%.*]] = fdiv float [[A:%.*]], [[C]]
 ; CHECK-NEXT:    ret float [[D]]
 ;
   %C = fdiv float %A, %B
-  %D = select i1 %cond, float %C, float %A
+  %D = select nnan i1 %cond, float %C, float %A
   ret float %D
 }
 
-define float @select_fdiv_swapped(i1 %cond, float %A, float %B) {
-; CHECK-LABEL: @select_fdiv_swapped(
-; CHECK-NEXT:    [[C:%.*]] = select i1 [[COND:%.*]], float 1.000000e+00, float [[B:%.*]]
+define float @select_nnan_fdiv_swapped(i1 %cond, float %A, float %B) {
+; CHECK-LABEL: @select_nnan_fdiv_swapped(
+; CHECK-NEXT:    [[C:%.*]] = select nnan i1 [[COND:%.*]], float 1.000000e+00, float [[B:%.*]]
 ; CHECK-NEXT:    [[D:%.*]] = fdiv float [[A:%.*]], [[C]]
 ; CHECK-NEXT:    ret float [[D]]
 ;
   %C = fdiv float %A, %B
-  %D = select i1 %cond, float %A, float %C
+  %D = select nnan i1 %cond, float %A, float %C
   ret float %D
 }
 
-define float @select_fdiv_fast_math(i1 %cond, float %A, float %B) {
-; CHECK-LABEL: @select_fdiv_fast_math(
-; CHECK-NEXT:    [[C:%.*]] = select i1 [[COND:%.*]], float [[B:%.*]], float 1.000000e+00
+define float @select_nnan_fdiv_fast_math(i1 %cond, float %A, float %B) {
+; CHECK-LABEL: @select_nnan_fdiv_fast_math(
+; CHECK-NEXT:    [[C:%.*]] = select nnan i1 [[COND:%.*]], float [[B:%.*]], float 1.000000e+00
 ; CHECK-NEXT:    [[D:%.*]] = fdiv fast float [[A:%.*]], [[C]]
 ; CHECK-NEXT:    ret float [[D]]
 ;
   %C = fdiv fast float %A, %B
-  %D = select i1 %cond, float %C, float %A
+  %D = select nnan i1 %cond, float %C, float %A
   ret float %D
 }
 
-define float @select_fdiv_swapped_fast_math(i1 %cond, float %A, float %B) {
-; CHECK-LABEL: @select_fdiv_swapped_fast_math(
-; CHECK-NEXT:    [[C:%.*]] = select i1 [[COND:%.*]], float 1.000000e+00, float [[B:%.*]]
+define float @select_nnan_fdiv_swapped_fast_math(i1 %cond, float %A, float %B) {
+; CHECK-LABEL: @select_nnan_fdiv_swapped_fast_math(
+; CHECK-NEXT:    [[C:%.*]] = select nnan i1 [[COND:%.*]], float 1.000000e+00, float [[B:%.*]]
 ; CHECK-NEXT:    [[D:%.*]] = fdiv fast float [[A:%.*]], [[C]]
 ; CHECK-NEXT:    ret float [[D]]
 ;
   %C = fdiv fast float %A, %B
-  %D = select i1 %cond, float %A, float %C
+  %D = select nnan i1 %cond, float %A, float %C
   ret float %D
 }
 
 ; 'fdiv' can only fold on the divisor amount.
-define float @select_fdiv_invalid(i1 %cond, float %A, float %B) {
-; CHECK-LABEL: @select_fdiv_invalid(
+define float @select_nnan_fdiv_invalid(i1 %cond, float %A, float %B) {
+; CHECK-LABEL: @select_nnan_fdiv_invalid(
 ; CHECK-NEXT:    [[C:%.*]] = fdiv float [[B:%.*]], [[A:%.*]]
-; CHECK-NEXT:    [[D:%.*]] = select i1 [[COND:%.*]], float [[C]], float [[A]]
+; CHECK-NEXT:    [[D:%.*]] = select nnan i1 [[COND:%.*]], float [[C]], float [[A]]
 ; CHECK-NEXT:    ret float [[D]]
 ;
   %C = fdiv float %B, %A
-  %D = select i1 %cond, float %C, float %A
+  %D = select nnan i1 %cond, float %C, float %A
   ret float %D
 }
diff --git a/llvm/test/Transforms/InstCombine/select.ll b/llvm/test/Transforms/InstCombine/select.ll
index a84904106eced4..278cabdff9ed3e 100644
--- a/llvm/test/Transforms/InstCombine/select.ll
+++ b/llvm/test/Transforms/InstCombine/select.ll
@@ -2925,10 +2925,8 @@ define i8 @select_replacement_loop3(i32 noundef %x) {
 
 define i16 @select_replacement_loop4(i16 noundef %p_12) {
 ; CHECK-LABEL: @select_replacement_loop4(
-; CHECK-NEXT:    [[CMP1:%.*]] = icmp ult i16 [[P_12:%.*]], 2
-; CHECK-NEXT:    [[AND1:%.*]] = and i16 [[P_12]], 1
-; CHECK-NEXT:    [[AND2:%.*]] = select i1 [[CMP1]], i16 [[AND1]], i16 0
-; CHECK-NEXT:    [[CMP2:%.*]] = icmp eq i16 [[AND2]], [[P_12]]
+; CHECK-NEXT:    [[AND1:%.*]] = and i16 [[P_12:%.*]], 1
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ult i16 [[P_12]], 2
 ; CHECK-NEXT:    [[AND3:%.*]] = select i1 [[CMP2]], i16 [[AND1]], i16 0
 ; CHECK-NEXT:    ret i16 [[AND3]]
 ;
@@ -3708,3 +3706,59 @@ define i32 @src_select_xxory_eq0_xorxy_y(i32 %x, i32 %y) {
   %cond = select i1 %xor0, i32 %xor, i32 %y
   ret i32 %cond
 }
+
+define i32 @sequence_select_with_same_cond_false(i1 %c1, i1 %c2){
+; CHECK-LABEL: @sequence_select_with_same_cond_false(
+; CHECK-NEXT:    [[S1:%.*]] = select i1 [[C1:%.*]], i32 23, i32 45
+; CHECK-NEXT:    [[S2:%.*]] = select i1 [[C2:%.*]], i32 666, i32 [[S1]]
+; CHECK-NEXT:    [[S3:%.*]] = select i1 [[C1]], i32 789, i32 [[S2]]
+; CHECK-NEXT:    ret i32 [[S3]]
+;
+  %s1 = select i1 %c1, i32 23, i32 45
+  %s2 = select i1 %c2, i32 666, i32 %s1
+  %s3 = select i1 %c1, i32 789, i32 %s2
+  ret i32 %s3
+}
+
+define i32 @sequence_select_with_same_cond_true(i1 %c1, i1 %c2){
+; CHECK-LABEL: @sequence_select_with_same_cond_true(
+; CHECK-NEXT:    [[S1:%.*]] = select i1 [[C1:%.*]], i32 45, i32 23
+; CHECK-NEXT:    [[S2:%.*]] = select i1 [[C2:%.*]], i32 [[S1]], i32 666
+; CHECK-NEXT:    [[S3:%.*]] = select i1 [[C1]], i32 [[S2]], i32 789
+; CHECK-NEXT:    ret i32 [[S3]]
+;
+  %s1 = select i1 %c1, i32 45, i32 23
+  %s2 = select i1 %c2, i32 %s1, i32 666
+  %s3 = select i1 %c1, i32 %s2, i32 789
+  ret i32 %s3
+}
+
+define double @sequence_select_with_same_cond_double(double %a, i1 %c1, i1 %c2, double %r1, double %r2){
+; CHECK-LABEL: @sequence_select_with_same_cond_double(
+; CHECK-NEXT:    [[S1:%.*]] = select i1 [[C1:%.*]], double 1.000000e+00, double 0.000000e+00
+; CHECK-NEXT:    [[S2:%.*]] = select i1 [[C2:%.*]], double [[S1]], double 2.000000e+00
+; CHECK-NEXT:    [[S3:%.*]] = select i1 [[C1]], double [[S2]], double 3.000000e+00
+; CHECK-NEXT:    ret double [[S3]]
+;
+  %s1 = select i1 %c1, double 1.0, double 0.0
+  %s2 = select i1 %c2, double %s1, double 2.0
+  %s3 = select i1 %c1, double %s2, double 3.0
+  ret double %s3
+}
+
+declare void @use32(i32)
+
+define i32 @sequence_select_with_same_cond_extra_use(i1 %c1, i1 %c2){
+; CHECK-LABEL: @sequence_select_with_same_cond_extra_use(
+; CHECK-NEXT:    [[S1:%.*]] = select i1 [[C1:%.*]], i32 23, i32 45
+; CHECK-NEXT:    call void @use32(i32 [[S1]])
+; CHECK-NEXT:    [[S2:%.*]] = select i1 [[C2:%.*]], i32 666, i32 [[S1]]
+; CHECK-NEXT:    [[S3:%.*]] = select i1 [[C1]], i32 789, i32 [[S2]]
+; CHECK-NEXT:    ret i32 [[S3]]
+;
+  %s1 = select i1 %c1, i32 23, i32 45
+  call void @use32(i32 %s1)
+  %s2 = select i1 %c2, i32 666, i32 %s1
+  %s3 = select i1 %c1, i32 789, i32 %s2
+  ret i32 %s3
+}
diff --git a/llvm/test/Transforms/InstCombine/select_meta.ll b/llvm/test/Transforms/InstCombine/select_meta.ll
index cd133101736cfc..aa794e82e0fdc6 100644
--- a/llvm/test/Transforms/InstCombine/select_meta.ll
+++ b/llvm/test/Transforms/InstCombine/select_meta.ll
@@ -360,23 +360,23 @@ define i128 @select_ashr(i1 %cond, i128 %x, i128 %y) {
 
 define double @select_fmul(i1 %cond, double %x, double %y) {
 ; CHECK-LABEL: @select_fmul(
-; CHECK-NEXT:    [[OP:%.*]] = select i1 [[COND:%.*]], double [[Y:%.*]], double 1.000000e+00, !prof [[PROF0]], !unpredictable [[META2]]
+; CHECK-NEXT:    [[OP:%.*]] = select nnan i1 [[COND:%.*]], double [[Y:%.*]], double 1.000000e+00, !prof [[PROF0]], !unpredictable [[META2]]
 ; CHECK-NEXT:    [[RET:%.*]] = fmul double [[OP]], [[X:%.*]]
 ; CHECK-NEXT:    ret double [[RET]]
 ;
   %op = fmul double %x, %y
-  %ret = select i1 %cond, double %op, double %x, !prof !1, !unpredictable !3
+  %ret = select nnan i1 %cond, double %op, double %x, !prof !1, !unpredictable !3
   ret double %ret
 }
 
 define <2 x float> @select_fdiv(i1 %cond, <2 x float> %x, <2 x float> %y) {
 ; CHECK-LABEL: @select_fdiv(
-; CHECK-NEXT:    [[OP:%.*]] = select i1 [[COND:%.*]], <2 x float> [[Y:%.*]], <2 x float> <float 1.000000e+00, float 1.000000e+00>, !prof [[PROF0]], !unpredictable [[META2]]
+; CHECK-NEXT:    [[OP:%.*]] = select nnan i1 [[COND:%.*]], <2 x float> [[Y:%.*]], <2 x float> <float 1.000000e+00, float 1.000000e+00>, !prof [[PROF0]], !unpredictable [[META2]]
 ; CHECK-NEXT:    [[RET:%.*]] = fdiv <2 x float> [[X:%.*]], [[OP]]
 ; CHECK-NEXT:    ret <2 x float> [[RET]]
 ;
   %op = fdiv <2 x float> %x, %y
-  %ret = select i1 %cond, <2 x float> %op, <2 x float> %x, !prof !1, !unpredictable !3
+  %ret = select nnan i1 %cond, <2 x float> %op, <2 x float> %x, !prof !1, !unpredictable !3
   ret <2 x float> %ret
 }
 
diff --git a/llvm/test/Transforms/InstCombine/shift.ll b/llvm/test/Transforms/InstCombine/shift.ll
index 62f32c28683711..bef7fc81a7d1f9 100644
--- a/llvm/test/Transforms/InstCombine/shift.ll
+++ b/llvm/test/Transforms/InstCombine/shift.ll
@@ -1751,12 +1751,11 @@ define void @ashr_out_of_range_1(ptr %A) {
 ; CHECK-NEXT:    [[L:%.*]] = load i177, ptr [[A:%.*]], align 4
 ; CHECK-NEXT:    [[L_FROZEN:%.*]] = freeze i177 [[L]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i177 [[L_FROZEN]], -1
-; CHECK-NEXT:    [[B:%.*]] = select i1 [[TMP1]], i177 0, i177 [[L_FROZEN]]
-; CHECK-NEXT:    [[TMP2:%.*]] = trunc i177 [[B]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = trunc i177 [[L_FROZEN]] to i64
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i64 0, i64 [[TMP6]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i177, ptr [[A]], i64 [[TMP2]]
 ; CHECK-NEXT:    [[G11:%.*]] = getelementptr i8, ptr [[TMP3]], i64 -24
-; CHECK-NEXT:    [[C17:%.*]] = icmp sgt i177 [[B]], [[L_FROZEN]]
-; CHECK-NEXT:    [[TMP4:%.*]] = sext i1 [[C17]] to i64
+; CHECK-NEXT:    [[TMP4:%.*]] = sext i1 [[TMP1]] to i64
 ; CHECK-NEXT:    [[G62:%.*]] = getelementptr i177, ptr [[G11]], i64 [[TMP4]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i177 [[L_FROZEN]], -1
 ; CHECK-NEXT:    [[B28:%.*]] = select i1 [[TMP5]], i177 0, i177 [[L_FROZEN]]
diff --git a/llvm/test/Transforms/InstCombine/sub-xor-cmp.ll b/llvm/test/Transforms/InstCombine/sub-xor-cmp.ll
index 2e1ff0a21a3def..461c9b0fb1e0c0 100644
--- a/llvm/test/Transforms/InstCombine/sub-xor-cmp.ll
+++ b/llvm/test/Transforms/InstCombine/sub-xor-cmp.ll
@@ -117,11 +117,9 @@ define i64 @sext_diff_i1_xor_sub_1(i64 %a, i1 %b, i1 %c) {
 define i64 @sext_multi_uses(i64 %a, i1 %b, i64 %x) {
 ; CHECK-LABEL: define i64 @sext_multi_uses(
 ; CHECK-SAME: i64 [[A:%.*]], i1 [[B:%.*]], i64 [[X:%.*]]) {
-; CHECK-NEXT:    [[C:%.*]] = sext i1 [[B]] to i64
-; CHECK-NEXT:    [[TMP1:%.*]] = sub i64 0, [[A]]
-; CHECK-NEXT:    [[E:%.*]] = select i1 [[B]], i64 [[TMP1]], i64 [[A]]
-; CHECK-NEXT:    [[F:%.*]] = mul i64 [[C]], [[X]]
-; CHECK-NEXT:    [[R:%.*]] = add i64 [[F]], [[E]]
+; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[X]], [[A]]
+; CHECK-NEXT:    [[TMP2:%.*]] = sub i64 0, [[TMP1]]
+; CHECK-NEXT:    [[R:%.*]] = select i1 [[B]], i64 [[TMP2]], i64 [[A]]
 ; CHECK-NEXT:    ret i64 [[R]]
 ;
   %c = sext i1 %b to i64
diff --git a/llvm/test/Transforms/InstSimplify/ConstProp/gep.ll b/llvm/test/Transforms/InstSimplify/ConstProp/gep.ll
index 6dd34a481df134..d91349a570b715 100644
--- a/llvm/test/Transforms/InstSimplify/ConstProp/gep.ll
+++ b/llvm/test/Transforms/InstSimplify/ConstProp/gep.ll
@@ -11,21 +11,21 @@ target triple = "x86_64-unknown-linux-gnu"
 
 define ptr @f0() {
 ; CHECK-LABEL: @f0(
-; CHECK-NEXT:    ret ptr getelementptr inbounds ([3 x ptr], ptr @vt, inrange i64 0, i64 2)
+; CHECK-NEXT:    ret ptr getelementptr inbounds inrange(-16, 8) ([3 x ptr], ptr @vt, i64 0, i64 2)
 ;
-  ret ptr getelementptr (ptr, ptr getelementptr inbounds ([3 x ptr], ptr @vt, inrange i64 0, i64 1), i64 1)
+  ret ptr getelementptr (ptr, ptr getelementptr inbounds inrange(-8, 16) ([3 x ptr], ptr @vt, i64 0, i64 1), i64 1)
 }
 
 define ptr @f1() {
 ; CHECK-LABEL: @f1(
-; CHECK-NEXT:    ret ptr getelementptr inbounds ([3 x ptr], ptr @vt, i64 0, i64 2)
+; CHECK-NEXT:    ret ptr getelementptr inbounds inrange(-8, 0) ([3 x ptr], ptr @vt, i64 0, i64 2)
 ;
-  ret ptr getelementptr (ptr, ptr getelementptr inbounds ([3 x ptr], ptr @vt, i64 0, inrange i64 1), i64 1)
+  ret ptr getelementptr (ptr, ptr getelementptr inbounds inrange(0, 8) ([3 x ptr], ptr @vt, i64 0, i64 1), i64 1)
 }
 
 define ptr @f2() {
 ; CHECK-LABEL: @f2(
-; CHECK-NEXT:    ret ptr getelementptr ([3 x ptr], ptr @vt, i64 1, i64 1)
+; CHECK-NEXT:    ret ptr getelementptr inrange(-24, -16) ([3 x ptr], ptr @vt, i64 1, i64 1)
 ;
-  ret ptr getelementptr (ptr, ptr getelementptr inbounds ([3 x ptr], ptr @vt, i64 0, inrange i64 1), i64 3)
+  ret ptr getelementptr (ptr, ptr getelementptr inbounds inrange(0, 8) ([3 x ptr], ptr @vt, i64 0, i64 1), i64 3)
 }
diff --git a/llvm/test/Transforms/InstSimplify/ConstProp/icmp-global.ll b/llvm/test/Transforms/InstSimplify/ConstProp/icmp-global.ll
index 3851bd090aef9f..b4afb7bd4a2b0e 100644
--- a/llvm/test/Transforms/InstSimplify/ConstProp/icmp-global.ll
+++ b/llvm/test/Transforms/InstSimplify/ConstProp/icmp-global.ll
@@ -298,3 +298,11 @@ bb:
   %cmp = icmp eq ptr blockaddress(@blockaddr_no_cfi, %bb), no_cfi @func
   ret i1 %cmp
 }
+
+define i1 @global_no_cfi_dso_local_equivalent() {
+; CHECK-LABEL: @global_no_cfi_dso_local_equivalent(
+; CHECK-NEXT:    ret i1 icmp eq (ptr dso_local_equivalent @func, ptr no_cfi @func)
+;
+  %cmp = icmp eq ptr dso_local_equivalent @func, no_cfi @func
+  ret i1 %cmp
+}
diff --git a/llvm/test/Transforms/InstSimplify/floating-point-arithmetic.ll b/llvm/test/Transforms/InstSimplify/floating-point-arithmetic.ll
index b8244b1d508c62..5d17504c09df67 100644
--- a/llvm/test/Transforms/InstSimplify/floating-point-arithmetic.ll
+++ b/llvm/test/Transforms/InstSimplify/floating-point-arithmetic.ll
@@ -211,6 +211,50 @@ define double @fmul_nnan_ninf_nneg_n0.0_commute(i127 %x) {
   ret double %r
 }
 
+define float @src_mul_nzero_neg(float nofpclass(inf nan pzero psub pnorm) %f) {
+; CHECK-LABEL: @src_mul_nzero_neg(
+; CHECK-NEXT:    ret float 0.000000e+00
+;
+  %r = fmul float %f, -0.0
+  ret float %r
+}
+
+define <2 x float> @src_mul_zero_neg(<2 x float> nofpclass(inf nan pzero psub pnorm) %f) {
+; CHECK-LABEL: @src_mul_zero_neg(
+; CHECK-NEXT:    ret <2 x float> <float -0.000000e+00, float -0.000000e+00>
+;
+  %r = fmul <2 x float> <float 0.0, float 0.0>, %f
+  ret <2 x float> %r
+}
+
+define <2 x float> @src_mul_zero_and_nzero_neg(<2 x float> nofpclass(inf nan pzero psub pnorm) %f) {
+; CHECK-LABEL: @src_mul_zero_and_nzero_neg(
+; CHECK-NEXT:    ret <2 x float> <float 0.000000e+00, float -0.000000e+00>
+;
+  %r = fmul <2 x float> <float -0.0, float 0.0>, %f
+  ret <2 x float> %r
+}
+
+
+define float @src_muladd_zero_neg(float nofpclass(inf nan pzero psub pnorm) %f, float %add) {
+; CHECK-LABEL: @src_muladd_zero_neg(
+; CHECK-NEXT:    [[R:%.*]] = call float @llvm.fmuladd.f32(float [[F:%.*]], float 0.000000e+00, float [[ADD:%.*]])
+; CHECK-NEXT:    ret float [[R]]
+;
+  %r = call float @llvm.fmuladd.f32(float %f, float 0.0, float %add)
+  ret float %r
+}
+
+define float @src_fma_nzero_neg(float nofpclass(inf nan pzero psub pnorm) %f, float %add) {
+; CHECK-LABEL: @src_fma_nzero_neg(
+; CHECK-NEXT:    [[R:%.*]] = call float @llvm.fma.f32(float -0.000000e+00, float [[F:%.*]], float [[ADD:%.*]])
+; CHECK-NEXT:    ret float [[R]]
+;
+  %r = call float @llvm.fma.f32(float -0.0, float %f, float %add)
+  ret float %r
+}
+
+
 ; Make sure we can infer %x can't be 0 based on assumes.
 define { float, float } @test_fmul_0_assumed_finite(float %x) {
 ; CHECK-LABEL: @test_fmul_0_assumed_finite(
diff --git a/llvm/test/Transforms/Internalize/vcall-visibility.ll b/llvm/test/Transforms/Internalize/vcall-visibility.ll
index c2fe8c3a01e9a2..ee6753534419d3 100644
--- a/llvm/test/Transforms/Internalize/vcall-visibility.ll
+++ b/llvm/test/Transforms/Internalize/vcall-visibility.ll
@@ -42,7 +42,7 @@ entry:
 define hidden noalias nonnull ptr @_Z6make_dv() {
 entry:
   %call = tail call ptr @_Znwm(i64 8) #3
-  store ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTVN12_GLOBAL__N_11DE, i64 0, inrange i32 0, i64 2), ptr %call, align 8
+  store ptr getelementptr inbounds inrange(-16, 8) ({ [3 x ptr] }, ptr @_ZTVN12_GLOBAL__N_11DE, i64 0, i32 0, i64 2), ptr %call, align 8
   ret ptr %call
 }
 
diff --git a/llvm/test/Transforms/LICM/expr-reassociate-int.ll b/llvm/test/Transforms/LICM/expr-reassociate-int.ll
index 63548974fb3185..70288731e31c6d 100644
--- a/llvm/test/Transforms/LICM/expr-reassociate-int.ll
+++ b/llvm/test/Transforms/LICM/expr-reassociate-int.ll
@@ -23,7 +23,7 @@ define void @innermost_loop_1d_shouldhoist(i32 %i, i64 %d1, i64 %delta, ptr %cel
 ; CHECK-LABEL: define void @innermost_loop_1d_shouldhoist
 ; CHECK-SAME: (i32 [[I:%.*]], i64 [[D1:%.*]], i64 [[DELTA:%.*]], ptr [[CELLS:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[MUL_1:%.*]] = mul i64 [[DELTA]], [[D1]]
+; CHECK-NEXT:    [[MUL_1:%.*]] = mul nuw nsw i64 [[DELTA]], [[D1]]
 ; CHECK-NEXT:    br label [[FOR_COND:%.*]]
 ; CHECK:       for.cond:
 ; CHECK-NEXT:    [[J:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD_J_1:%.*]], [[FOR_BODY:%.*]] ]
@@ -55,7 +55,7 @@ for.body:
   %idxprom.j.1 = zext i32 %add.j.1 to i64
   %arrayidx.j.1 = getelementptr inbounds i64, ptr %cells, i64 %idxprom.j.1
   %cell.1 = load i64, ptr %arrayidx.j.1, align 8
-  %mul.1 = mul i64 %delta, %d1
+  %mul.1 = mul nsw nuw i64 %delta, %d1
   %mul.2 = mul i64 %mul.1, %cell.1
   %idxprom.j = zext i32 %j to i64
   %arrayidx.j = getelementptr inbounds i64, ptr %cells, i64 %idxprom.j
@@ -130,8 +130,8 @@ define void @innermost_loop_2d(i32 %i, i64 %d1, i64 %d2, i64 %delta, ptr %cells)
 ; CONSTRAINED-NEXT:    [[IDXPROM_J:%.*]] = zext i32 [[J]] to i64
 ; CONSTRAINED-NEXT:    [[ARRAYIDX_J:%.*]] = getelementptr inbounds i64, ptr [[CELLS]], i64 [[IDXPROM_J]]
 ; CONSTRAINED-NEXT:    [[CELL_2:%.*]] = load i64, ptr [[ARRAYIDX_J]], align 8
-; CONSTRAINED-NEXT:    [[MUL_2:%.*]] = mul i64 [[CELL_2]], [[D2]]
-; CONSTRAINED-NEXT:    [[REASS_ADD:%.*]] = add i64 [[MUL_2]], [[MUL_1]]
+; CONSTRAINED-NEXT:    [[MUL_2:%.*]] = mul nuw nsw i64 [[CELL_2]], [[D2]]
+; CONSTRAINED-NEXT:    [[REASS_ADD:%.*]] = add nuw nsw i64 [[MUL_2]], [[MUL_1]]
 ; CONSTRAINED-NEXT:    [[REASS_MUL:%.*]] = mul i64 [[REASS_ADD]], [[DELTA]]
 ; CONSTRAINED-NEXT:    store i64 [[REASS_MUL]], ptr [[ARRAYIDX_J]], align 8
 ; CONSTRAINED-NEXT:    br label [[FOR_COND]]
@@ -155,8 +155,8 @@ for.body:
   %idxprom.j = zext i32 %j to i64
   %arrayidx.j = getelementptr inbounds i64, ptr %cells, i64 %idxprom.j
   %cell.2 = load i64, ptr %arrayidx.j, align 8
-  %mul.2 = mul i64 %cell.2, %d2
-  %reass.add = add i64 %mul.2, %mul.1
+  %mul.2 = mul nsw nuw i64 %cell.2, %d2
+  %reass.add = add nsw nuw i64 %mul.2, %mul.1
   %reass.mul = mul i64 %reass.add, %delta
   store i64 %reass.mul, ptr %arrayidx.j, align 8
   br label %for.cond
@@ -243,10 +243,10 @@ define void @innermost_loop_3d(i32 %i, i64 %d1, i64 %d2, i64 %d3, i64 %delta, pt
 ; CONSTRAINED-NEXT:    [[IDXPROM_J_2:%.*]] = zext i32 [[ADD_J_2]] to i64
 ; CONSTRAINED-NEXT:    [[ARRAYIDX_J_2:%.*]] = getelementptr inbounds i64, ptr [[CELLS]], i64 [[IDXPROM_J_2]]
 ; CONSTRAINED-NEXT:    [[CELL_3:%.*]] = load i64, ptr [[ARRAYIDX_J_2]], align 8
-; CONSTRAINED-NEXT:    [[MUL_3:%.*]] = mul i64 [[CELL_3]], [[D3]]
-; CONSTRAINED-NEXT:    [[REASS_ADD:%.*]] = add i64 [[MUL_2]], [[MUL_1]]
-; CONSTRAINED-NEXT:    [[REASS_ADD1:%.*]] = add i64 [[REASS_ADD]], [[MUL_3]]
-; CONSTRAINED-NEXT:    [[REASS_MUL:%.*]] = mul i64 [[REASS_ADD1]], [[DELTA]]
+; CONSTRAINED-NEXT:    [[MUL_3:%.*]] = mul nuw nsw i64 [[CELL_3]], [[D3]]
+; CONSTRAINED-NEXT:    [[REASS_ADD:%.*]] = add nuw nsw i64 [[MUL_2]], [[MUL_1]]
+; CONSTRAINED-NEXT:    [[REASS_ADD1:%.*]] = add nuw nsw i64 [[REASS_ADD]], [[MUL_3]]
+; CONSTRAINED-NEXT:    [[REASS_MUL:%.*]] = mul nuw nsw i64 [[REASS_ADD1]], [[DELTA]]
 ; CONSTRAINED-NEXT:    store i64 [[REASS_MUL]], ptr [[ARRAYIDX_J_2]], align 8
 ; CONSTRAINED-NEXT:    br label [[FOR_COND]]
 ; CONSTRAINED:       for.end:
@@ -274,10 +274,10 @@ for.body:
   %idxprom.j.2 = zext i32 %add.j.2 to i64
   %arrayidx.j.2 = getelementptr inbounds i64, ptr %cells, i64 %idxprom.j.2
   %cell.3 = load i64, ptr %arrayidx.j.2, align 8
-  %mul.3 = mul i64 %cell.3, %d3
-  %reass.add = add i64 %mul.2, %mul.1
-  %reass.add1 = add i64 %reass.add, %mul.3
-  %reass.mul = mul i64 %reass.add1, %delta
+  %mul.3 = mul nsw nuw i64 %cell.3, %d3
+  %reass.add = add nsw nuw i64 %mul.2, %mul.1
+  %reass.add1 = add nsw nuw i64 %reass.add, %mul.3
+  %reass.mul = mul nsw nuw i64 %reass.add1, %delta
   store i64 %reass.mul, ptr %arrayidx.j.2, align 8
   br label %for.cond
 
@@ -362,3 +362,34 @@ for.body:
 for.end:
   ret void
 }
+
+; Make sure we drop poison flags on the mul in the loop.
+define i32 @pr85457(i32 %x, i32 %y) {
+; CHECK-LABEL: define i32 @pr85457
+; CHECK-SAME: (i32 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[FACTOR_OP_MUL:%.*]] = mul i32 [[X]], [[Y]]
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1
+; CHECK-NEXT:    [[MUL0:%.*]] = mul i32 [[FACTOR_OP_MUL]], [[IV]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[MUL0]], 1
+; CHECK-NEXT:    br i1 [[CMP]], label [[EXIT:%.*]], label [[LOOP]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret i32 0
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [ 1, %entry ], [ %iv.next, %loop ]
+  %iv.next = add nuw nsw i32 %iv, 1
+  %mul0 = mul nuw nsw i32 %x, %iv
+  %mul1 = mul nuw i32 %mul0, %y
+  %cmp = icmp slt i32 %mul1, 1
+  br i1 %cmp, label %exit, label %loop
+
+exit:
+  ret i32 0
+}
diff --git a/llvm/test/Transforms/LoopRotate/dbgvalue.ll b/llvm/test/Transforms/LoopRotate/dbgvalue.ll
index 92cc886bc81c10..9ecc31e1bd2d3b 100644
--- a/llvm/test/Transforms/LoopRotate/dbgvalue.ll
+++ b/llvm/test/Transforms/LoopRotate/dbgvalue.ll
@@ -220,7 +220,7 @@ for.end:
 
 ; Test that dbg.value intrinsincs adjacent to the `icmp slt i32 0, 0` get
 ; rotated as expected. The icmp is loop-invariant and so gets hoisted to the
-; preheader via a different code path. This is more difficult for DPValue
+; preheader via a different code path. This is more difficult for DbgVariableRecord
 ; debug-info records to handle, because they have to get detached and moved
 ; somewhere else during rotation.
 define void @invariant_hoist() !dbg !70 {
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-select-cmp.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-select-cmp.ll
index 2470bca1e17b99..1c26ee8479e578 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-select-cmp.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-select-cmp.ll
@@ -8,41 +8,39 @@ target triple = "aarch64-linux-gnu"
 define i32 @select_const_i32_from_icmp(ptr nocapture readonly %v, i64 %n) #0 {
 ; CHECK-VF4IC1-LABEL: @select_const_i32_from_icmp
 ; CHECK-VF4IC1:      vector.body:
-; CHECK-VF4IC1:        [[VEC_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ]
+; CHECK-VF4IC1:        [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ]
 ; CHECK-VF4IC1:        [[VEC_LOAD:%.*]] = load <vscale x 4 x i32>
 ; CHECK-VF4IC1-NEXT:   [[VEC_ICMP:%.*]] = icmp eq <vscale x 4 x i32> [[VEC_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC1-NEXT:   [[NOT:%*]] = xor <vscale x 4 x i1> [[VEC_ICMP]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC1-NEXT:   [[VEC_SEL]] = or <vscale x 4 x i1> [[VEC_PHI]], [[NOT]]
+; CHECK-VF4IC1-NEXT:   [[VEC_SEL]] = select <vscale x 4 x i1> [[VEC_ICMP]], <vscale x 4 x i32> [[VEC_PHI]], <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 7, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
 ; CHECK-VF4IC1:      middle.block:
-; CHECK-VF4IC1-NEXT:   [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[VEC_SEL]])
-; CHECK-VF4IC1-NEXT:   [[FR:%.*]] = freeze i1 [[OR_RDX]]
-; CHECK-VF4IC1-NEXT:   {{.*}} = select i1 [[FR]], i32 7, i32 3
+; CHECK-VF4IC1-NEXT:   [[FIN_ICMP:%.*]] = icmp ne <vscale x 4 x i32> [[VEC_SEL]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-VF4IC1-NEXT:   [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[FIN_ICMP]])
+; CHECK-VF4IC1-NEXT:   {{.*}} = select i1 [[OR_RDX]], i32 7, i32 3
 
 ; CHECK-VF4IC4-LABEL: @select_const_i32_from_icmp
 ; CHECK-VF4IC4:      vector.body:
-; CHECK-VF4IC4:        [[VEC_PHI1:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, %vector.ph ], [ [[VEC_SEL1:%.*]], %vector.body ]
-; CHECK-VF4IC4:        [[VEC_PHI2:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, %vector.ph ], [ [[VEC_SEL2:%.*]], %vector.body ]
-; CHECK-VF4IC4:        [[VEC_PHI3:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, %vector.ph ], [ [[VEC_SEL3:%.*]], %vector.body ]
-; CHECK-VF4IC4:        [[VEC_PHI4:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, %vector.ph ], [ [[VEC_SEL4:%.*]], %vector.body ]
+; CHECK-VF4IC4:        [[VEC_PHI1:%.*]] = phi <vscale x 4 x i32> [ shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), %vector.ph ], [ [[VEC_SEL1:%.*]], %vector.body ]
+; CHECK-VF4IC4:        [[VEC_PHI2:%.*]] = phi <vscale x 4 x i32> [ shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), %vector.ph ], [ [[VEC_SEL2:%.*]], %vector.body ]
+; CHECK-VF4IC4:        [[VEC_PHI3:%.*]] = phi <vscale x 4 x i32> [ shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), %vector.ph ], [ [[VEC_SEL3:%.*]], %vector.body ]
+; CHECK-VF4IC4:        [[VEC_PHI4:%.*]] = phi <vscale x 4 x i32> [ shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), %vector.ph ], [ [[VEC_SEL4:%.*]], %vector.body ]
 ; CHECK-VF4IC4:        [[VEC_ICMP1:%.*]] = icmp eq <vscale x 4 x i32> {{.*}}, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
 ; CHECK-VF4IC4-NEXT:   [[VEC_ICMP2:%.*]] = icmp eq <vscale x 4 x i32> {{.*}}, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
 ; CHECK-VF4IC4-NEXT:   [[VEC_ICMP3:%.*]] = icmp eq <vscale x 4 x i32> {{.*}}, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
 ; CHECK-VF4IC4-NEXT:   [[VEC_ICMP4:%.*]] = icmp eq <vscale x 4 x i32> {{.*}}, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC4-NEXT:    [[NOT1:%.*]] = xor <vscale x 4 x i1> [[VEC_ICMP1]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC4-NEXT:    [[NOT2:%.*]] = xor <vscale x 4 x i1> [[VEC_ICMP2]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC4-NEXT:    [[NOT3:%.*]] = xor <vscale x 4 x i1> [[VEC_ICMP3]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC4-NEXT:    [[NOT4:%.*]] = xor <vscale x 4 x i1> [[VEC_ICMP4]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC4-NEXT:   [[VEC_SEL1:%.*]] = or <vscale x 4 x i1> [[VEC_PHI1]], [[NOT1]]
-; CHECK-VF4IC4-NEXT:   [[VEC_SEL2:%.*]] = or <vscale x 4 x i1> [[VEC_PHI2]], [[NOT2]]
-; CHECK-VF4IC4-NEXT:   [[VEC_SEL3:%.*]] = or <vscale x 4 x i1> [[VEC_PHI3]], [[NOT3]]
-; CHECK-VF4IC4-NEXT:   [[VEC_SEL4:%.*]] = or <vscale x 4 x i1> [[VEC_PHI4]], [[NOT4]]
+; CHECK-VF4IC4-NEXT:   [[VEC_SEL1]] = select <vscale x 4 x i1> [[VEC_ICMP1]], <vscale x 4 x i32> [[VEC_PHI1]], <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 7, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-VF4IC4-NEXT:   [[VEC_SEL2]] = select <vscale x 4 x i1> [[VEC_ICMP2]], <vscale x 4 x i32> [[VEC_PHI2]], <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 7, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-VF4IC4-NEXT:   [[VEC_SEL3]] = select <vscale x 4 x i1> [[VEC_ICMP3]], <vscale x 4 x i32> [[VEC_PHI3]], <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 7, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-VF4IC4-NEXT:   [[VEC_SEL4]] = select <vscale x 4 x i1> [[VEC_ICMP4]], <vscale x 4 x i32> [[VEC_PHI4]], <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 7, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
 ; CHECK-VF4IC4:      middle.block:
-; CHECK-VF4IC4-NEXT:   [[OR1:%.*]] = or <vscale x 4 x i1> [[VEC_SEL2]], [[VEC_SEL1]]
-; CHECK-VF4IC4-NEXT:   [[OR2:%.*]] = or <vscale x 4 x i1> [[VEC_SEL3]], [[OR1]]
-; CHECK-VF4IC4-NEXT:   [[OR3:%.*]] = or <vscale x 4 x i1> [[VEC_SEL4]], [[OR2]]
-; CHECK-VF4IC4-NEXT:   [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[OR3]])
-; CHECK-VF4IC4-NEXT:   [[FR:%.*]] = freeze i1 [[OR_RDX]]
-; CHECK-VF4IC4-NEXT:   {{.*}} = select i1 [[FR]], i32 7, i32 3
+; CHECK-VF4IC4-NEXT:   [[VEC_ICMP5:%.*]] = icmp ne <vscale x 4 x i32> [[VEC_SEL1]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-VF4IC4-NEXT:   [[VEC_SEL5:%.*]] = select <vscale x 4 x i1> [[VEC_ICMP5]], <vscale x 4 x i32> [[VEC_SEL1]], <vscale x 4 x i32> [[VEC_SEL2]]
+; CHECK-VF4IC4-NEXT:   [[VEC_ICMP6:%.*]] = icmp ne <vscale x 4 x i32> [[VEC_SEL5]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-VF4IC4-NEXT:   [[VEC_SEL6:%.*]] = select <vscale x 4 x i1> [[VEC_ICMP6]], <vscale x 4 x i32> [[VEC_SEL5]], <vscale x 4 x i32> [[VEC_SEL3]]
+; CHECK-VF4IC4-NEXT:   [[VEC_ICMP7:%.*]] = icmp ne <vscale x 4 x i32> [[VEC_SEL6]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-VF4IC4-NEXT:   [[VEC_SEL7:%.*]] = select <vscale x 4 x i1> [[VEC_ICMP7]], <vscale x 4 x i32> [[VEC_SEL6]], <vscale x 4 x i32> [[VEC_SEL4]]
+; CHECK-VF4IC4-NEXT:   [[FIN_ICMP:%.*]] = icmp ne <vscale x 4 x i32> [[VEC_SEL7]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-VF4IC4-NEXT:   [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[FIN_ICMP]])
+; CHECK-VF4IC4-NEXT:   {{.*}} = select i1 [[OR_RDX]], i32 7, i32 3
 entry:
   br label %for.body
 
@@ -64,18 +62,21 @@ exit:                                     ; preds = %for.body
 define i32 @select_i32_from_icmp(ptr nocapture readonly %v, i32 %a, i32 %b, i64 %n) #0 {
 ; CHECK-VF4IC1-LABEL: @select_i32_from_icmp
 ; CHECK-VF4IC1:      vector.ph:
-; CHECK-VF4IC1-NOT:    shufflevector <vscale x 4 x i32>
-; CHECK-VF4IC1-NOT:    shufflevector <vscale x 4 x i32>
+; CHECK-VF4IC1:        [[TMP1:%.*]] = insertelement <vscale x 4 x i32> poison, i32 %a, i64 0
+; CHECK-VF4IC1-NEXT:   [[SPLAT_OF_A:%.*]] = shufflevector <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-VF4IC1-NEXT:   [[TMP2:%.*]] = insertelement <vscale x 4 x i32> poison, i32 %b, i64 0
+; CHECK-VF4IC1-NEXT:   [[SPLAT_OF_B:%.*]] = shufflevector <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
 ; CHECK-VF4IC1:      vector.body:
-; CHECK-VF4IC1:        [[VEC_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ]
+; CHECK-VF4IC1:        [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ [[SPLAT_OF_A]], %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ]
 ; CHECK-VF4IC1:        [[VEC_LOAD:%.*]] = load <vscale x 4 x i32>
 ; CHECK-VF4IC1-NEXT:   [[VEC_ICMP:%.*]] = icmp eq <vscale x 4 x i32> [[VEC_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC1-NEXT:   [[NOT:%*]] = xor <vscale x 4 x i1> [[VEC_ICMP]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC1-NEXT:   [[VEC_SEL]] = or <vscale x 4 x i1> [[VEC_PHI]], [[NOT]]
+; CHECK-VF4IC1-NEXT:   [[VEC_SEL]] = select <vscale x 4 x i1> [[VEC_ICMP]], <vscale x 4 x i32> [[VEC_PHI]], <vscale x 4 x i32> [[SPLAT_OF_B]]
 ; CHECK-VF4IC1:      middle.block:
-; CHECK-VF4IC1-NEXT:   [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[VEC_SEL]])
-; CHECK-VF4IC1-NEXT:   [[FR:%.*]] = freeze i1 [[OR_RDX]]
-; CHECK-VF4IC1-NEXT:   {{.*}} = select i1 [[FR]], i32 %b, i32 %a
+; CHECK-VF4IC1-NEXT:   [[FIN_INS:%.*]] = insertelement <vscale x 4 x i32> poison, i32 %a, i64 0
+; CHECK-VF4IC1-NEXT:   [[FIN_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[FIN_INS]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-VF4IC1-NEXT:   [[FIN_CMP:%.*]] = icmp ne <vscale x 4 x i32> [[VEC_SEL]], [[FIN_SPLAT]]
+; CHECK-VF4IC1-NEXT:   [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[FIN_CMP]])
+; CHECK-VF4IC1-NEXT:   {{.*}} = select i1 [[OR_RDX]], i32 %b, i32 %a
 
 ; CHECK-VF4IC4-LABEL: @select_i32_from_icmp
 ; CHECK-VF4IC4:      vector.body:
@@ -100,15 +101,14 @@ exit:                                     ; preds = %for.body
 define i32 @select_const_i32_from_fcmp(ptr nocapture readonly %v, i64 %n) #0 {
 ; CHECK-VF4IC1-LABEL: @select_const_i32_from_fcmp
 ; CHECK-VF4IC1:      vector.body:
-; CHECK-VF4IC1:        [[VEC_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ]
+; CHECK-VF4IC1:        [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ]
 ; CHECK-VF4IC1:        [[VEC_LOAD:%.*]] = load <vscale x 4 x float>
 ; CHECK-VF4IC1-NEXT:   [[VEC_ICMP:%.*]] = fcmp fast ueq <vscale x 4 x float> [[VEC_LOAD]], shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 3.000000e+00, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC1-NEXT:   [[NOT:%*]] = xor <vscale x 4 x i1> [[VEC_ICMP]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC1-NEXT:   [[VEC_SEL]] = or <vscale x 4 x i1> [[VEC_PHI]], [[NOT]]
+; CHECK-VF4IC1-NEXT:   [[VEC_SEL]] = select <vscale x 4 x i1> [[VEC_ICMP]], <vscale x 4 x i32> [[VEC_PHI]], <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
 ; CHECK-VF4IC1:      middle.block:
-; CHECK-VF4IC1-NEXT:   [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[VEC_SEL]])
-; CHECK-VF4IC1-NEXT:   [[FR:%.*]] = freeze i1 [[OR_RDX]]
-; CHECK-VF4IC1-NEXT:   {{.*}} = select i1 [[FR]], i32 1, i32 2
+; CHECK-VF4IC1-NEXT:   [[FIN_ICMP:%.*]] = icmp ne <vscale x 4 x i32> [[VEC_SEL]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-VF4IC1-NEXT:   [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[FIN_ICMP]])
+; CHECK-VF4IC1-NEXT:   {{.*}} = select i1 [[OR_RDX]], i32 1, i32 2
 
 ; CHECK-VF4IC4-LABEL: @select_const_i32_from_fcmp
 ; CHECK-VF4IC4:      vector.body:
@@ -156,17 +156,17 @@ exit:                                     ; preds = %for.body
 define i32 @pred_select_const_i32_from_icmp(ptr noalias nocapture readonly %src1, ptr noalias nocapture readonly %src2, i64 %n) #0 {
 ; CHECK-VF4IC1-LABEL: @pred_select_const_i32_from_icmp
 ; CHECK-VF4IC1:      vector.body:
-; CHECK-VF4IC1:        [[VEC_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ]
+; CHECK-VF4IC1:        [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ]
 ; CHECK-VF4IC1:        [[VEC_LOAD:%.*]] = load <vscale x 4 x i32>
 ; CHECK-VF4IC1:        [[MASK:%.*]] = icmp sgt <vscale x 4 x i32> [[VEC_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 35, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
 ; CHECK-VF4IC1:        [[MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr {{%.*}}, i32 4, <vscale x 4 x i1> [[MASK]], <vscale x 4 x i32> poison)
 ; CHECK-VF4IC1-NEXT:   [[VEC_ICMP:%.*]] = icmp eq <vscale x 4 x i32> [[MASKED_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC1-NEXT:   [[VEC_SEL_TMP:%.*]] = or <vscale x 4 x i1> [[VEC_PHI]], [[VEC_ICMP]]
-; CHECK-VF4IC1:        [[VEC_SEL:%.*]] = select <vscale x 4 x i1> [[MASK]], <vscale x 4 x i1> [[VEC_SEL_TMP]], <vscale x 4 x i1> [[VEC_PHI]]
+; CHECK-VF4IC1-NEXT:   [[VEC_SEL_TMP:%.*]] = select <vscale x 4 x i1> [[VEC_ICMP]], <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i32> [[VEC_PHI]]
+; CHECK-VF4IC1:        [[VEC_SEL:%.*]] = select <vscale x 4 x i1> [[MASK]], <vscale x 4 x i32> [[VEC_SEL_TMP]], <vscale x 4 x i32> [[VEC_PHI]]
 ; CHECK-VF4IC1:      middle.block:
-; CHECK-VF4IC1-NEXT:   [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[VEC_SEL]])
-; CHECK-VF4IC1-NEXT:   [[FR:%.*]] = freeze i1 [[OR_RDX]]
-; CHECK-VF4IC1-NEXT:   {{.*}} = select i1 [[FR]], i32 1, i32 0
+; CHECK-VF4IC1-NEXT:   [[FIN_ICMP:%.*]] = icmp ne <vscale x 4 x i32> [[VEC_SEL]], zeroinitializer
+; CHECK-VF4IC1-NEXT:   [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[FIN_ICMP]])
+; CHECK-VF4IC1-NEXT:   {{.*}} = select i1 [[OR_RDX]], i32 1, i32 0
 
 ; CHECK-VF4IC4-LABEL: @pred_select_const_i32_from_icmp
 ; CHECK-VF4IC4:      vector.body:
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-selectandorcost.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-selectandorcost.ll
index 938e1deab0b6c9..69538343356693 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/mve-selectandorcost.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-selectandorcost.ll
@@ -45,8 +45,8 @@ define float @test(ptr nocapture readonly %pA, ptr nocapture readonly %pB, i32 %
 ; CHECK-NEXT:    [[TMP9:%.*]] = fsub fast <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD6]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = call fast <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP9]])
 ; CHECK-NEXT:    [[TMP11:%.*]] = fdiv fast <4 x float> [[TMP10]], [[TMP8]]
-; CHECK-NEXT:    [[TMP12:%.*]] = select <4 x i1> [[DOTNOT8]], <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, <4 x float> [[TMP11]]
-; CHECK-NEXT:    [[PREDPHI]] = fadd fast <4 x float> [[VEC_PHI]], [[TMP12]]
+; CHECK-NEXT:    [[TMP12:%.*]] = fadd fast <4 x float> [[TMP11]], [[VEC_PHI]]
+; CHECK-NEXT:    [[PREDPHI]] = select <4 x i1> [[DOTNOT8]], <4 x float> [[VEC_PHI]], <4 x float> [[TMP12]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/select-cmp-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/select-cmp-reduction.ll
index a235da069b3252..34a7987bb40abe 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/select-cmp-reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/select-cmp-reduction.ll
@@ -12,24 +12,25 @@ define i32 @select_icmp(i32 %x, i32 %y, ptr nocapture readonly %c, i64 %n) #0 {
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 %n, [[N_MOD_VF]]
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[X:%.*]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> poison, i32 [[Y:%.*]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[C:%.*]], i64 [[TMP0]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
 ; CHECK-NEXT:    [[TMP4:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT:    [[NOT:%.*]] = xor <4 x i1> [[TMP4]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT:    [[TMP5]] = or <4 x i1> [[VEC_PHI]], [[NOT]]
+; CHECK-NEXT:    [[TMP5]] = select <4 x i1> [[TMP4]], <4 x i32> [[VEC_PHI]], <4 x i32> [[BROADCAST_SPLAT2]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]])
-; CHECK-NEXT:    [[FR:%.*]] = freeze i1 [[TMP7]]
-; CHECK-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[FR]], i32 %y, i32 0
+; CHECK-NEXT:    [[RDX_SELECT_CMP:%.*]] = icmp ne <4 x i32> [[TMP5]], zeroinitializer
+; CHECK-NEXT:    [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[RDX_SELECT_CMP]])
+; CHECK-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[TMP7]], i32 [[Y]], i32 0
 ;
 ; SCALABLE-LABEL: @select_icmp
 ; SCALABLE:       vector.ph:
@@ -41,24 +42,25 @@ define i32 @select_icmp(i32 %x, i32 %y, ptr nocapture readonly %c, i64 %n) #0 {
 ; SCALABLE-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP10]], 4
 ; SCALABLE-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[X:%.*]], i64 0
 ; SCALABLE-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; SCALABLE-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[Y:%.*]], i64 0
+; SCALABLE-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
 ; SCALABLE-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; SCALABLE:       vector.body:
 ; SCALABLE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; SCALABLE-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
+; SCALABLE-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
 ; SCALABLE-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
 ; SCALABLE-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[C:%.*]], i64 [[TMP4]]
 ; SCALABLE-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0
 ; SCALABLE-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP6]], align 4
 ; SCALABLE-NEXT:    [[TMP8:%.*]] = icmp slt <vscale x 4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
-; SCALABLE-NEXT:    [[NOT:%.*]] = xor <vscale x 4 x i1> [[TMP8]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
-; SCALABLE-NEXT:    [[TMP9]] = or <vscale x 4 x i1> [[VEC_PHI]], [[NOT]]
+; SCALABLE-NEXT:    [[TMP9]] = select <vscale x 4 x i1> [[TMP8]], <vscale x 4 x i32> [[VEC_PHI]], <vscale x 4 x i32> [[BROADCAST_SPLAT2]]
 ; SCALABLE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]]
 ; SCALABLE-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; SCALABLE-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; SCALABLE:       middle.block:
-; SCALABLE-NEXT:    [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP9]])
-; SCALABLE-NEXT:    [[FR:%.*]] = freeze i1 [[TMP13]]
-; SCALABLE-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[FR]], i32 %y, i32 0
+; SCALABLE-NEXT:    [[RDX_SELECT_CMP:%.*]] = icmp ne <vscale x 4 x i32> [[TMP9]], zeroinitializer
+; SCALABLE-NEXT:    [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[RDX_SELECT_CMP]])
+; SCALABLE-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[TMP13]], i32 [[Y]], i32 0
 ;
 entry:
   br label %for.body
@@ -85,24 +87,25 @@ define i32 @select_fcmp(float %x, i32 %y, ptr nocapture readonly %c, i64 %n) #0
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 %n, [[N_MOD_VF]]
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[X:%.*]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> poison, i32 [[Y:%.*]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds float, ptr [[C:%.*]], i64 [[TMP0]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
 ; CHECK-NEXT:    [[TMP4:%.*]] = fcmp fast olt <4 x float> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT:    [[NOT:%.*]] = xor <4 x i1> [[TMP4]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT:    [[TMP5]] = or <4 x i1> [[VEC_PHI]], [[NOT]]
+; CHECK-NEXT:    [[TMP5]] = select <4 x i1> [[TMP4]], <4 x i32> [[VEC_PHI]], <4 x i32> [[BROADCAST_SPLAT2]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]])
-; CHECK-NEXT:    [[FR:%.*]] = freeze i1 [[TMP7]]
-; CHECK-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[FR]], i32 %y, i32 0
+; CHECK-NEXT:    [[RDX_SELECT_CMP:%.*]] = icmp ne <4 x i32> [[TMP5]], zeroinitializer
+; CHECK-NEXT:    [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[RDX_SELECT_CMP]])
+; CHECK-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[TMP7]], i32 [[Y]], i32 0
 ;
 ; SCALABLE-LABEL: @select_fcmp
 ; SCALABLE:       vector.ph:
@@ -114,24 +117,25 @@ define i32 @select_fcmp(float %x, i32 %y, ptr nocapture readonly %c, i64 %n) #0
 ; SCALABLE-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP10]], 4
 ; SCALABLE-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x float> poison, float [[X:%.*]], i64 0
 ; SCALABLE-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x float> [[BROADCAST_SPLATINSERT]], <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer
+; SCALABLE-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[Y:%.*]], i64 0
+; SCALABLE-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
 ; SCALABLE-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; SCALABLE:       vector.body:
 ; SCALABLE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; SCALABLE-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
+; SCALABLE-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
 ; SCALABLE-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
 ; SCALABLE-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, ptr [[C:%.*]], i64 [[TMP4]]
 ; SCALABLE-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP5]], i32 0
 ; SCALABLE-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP6]], align 4
 ; SCALABLE-NEXT:    [[TMP8:%.*]] = fcmp fast olt <vscale x 4 x float> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
-; SCALABLE-NEXT:    [[NOT:%.*]] = xor <vscale x 4 x i1> [[TMP8]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
-; SCALABLE-NEXT:    [[TMP9]] = or <vscale x 4 x i1> [[VEC_PHI]], [[NOT]]
+; SCALABLE-NEXT:    [[TMP9]] = select <vscale x 4 x i1> [[TMP8]], <vscale x 4 x i32> [[VEC_PHI]], <vscale x 4 x i32> [[BROADCAST_SPLAT2]]
 ; SCALABLE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]]
 ; SCALABLE-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; SCALABLE-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; SCALABLE:       middle.block:
-; SCALABLE-NEXT:    [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP9]])
-; SCALABLE-NEXT:    [[FR:%.*]] = freeze i1 [[TMP13]]
-; SCALABLE-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[FR]], i32 %y, i32 0
+; SCALABLE-NEXT:    [[RDX_SELECT_CMP:%.*]] = icmp ne <vscale x 4 x i32> [[TMP9]], zeroinitializer
+; SCALABLE-NEXT:    [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[RDX_SELECT_CMP]])
+; SCALABLE-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[TMP13]], i32 [[Y]], i32 0
 ;
 entry:
   br label %for.body
@@ -159,21 +163,20 @@ define i32 @select_const_i32_from_icmp(ptr nocapture readonly %v, i64 %n) #0 {
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 3, i32 3, i32 3, i32 3>, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[V:%.*]], i64 [[TMP0]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
 ; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]], <i32 3, i32 3, i32 3, i32 3>
-; CHECK-NEXT:    [[NOT:%.*]] = xor <4 x i1> [[TMP4]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT:    [[TMP5]] = or <4 x i1> [[VEC_PHI]], [[NOT]]
+; CHECK-NEXT:    [[TMP5]] = select <4 x i1> [[TMP4]], <4 x i32> [[VEC_PHI]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]])
-; CHECK-NEXT:    [[FR:%.*]] = freeze i1 [[TMP7]]
-; CHECK-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[FR]], i32 7, i32 3
+; CHECK-NEXT:    [[RDX_SELECT_CMP:%.*]] = icmp ne <4 x i32> [[TMP5]], <i32 3, i32 3, i32 3, i32 3>
+; CHECK-NEXT:    [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[RDX_SELECT_CMP]])
+; CHECK-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[TMP7]], i32 7, i32 3
 ;
 ; SCALABLE-LABEL: @select_const_i32_from_icmp
 ; SCALABLE:       vector.ph:
@@ -186,21 +189,20 @@ define i32 @select_const_i32_from_icmp(ptr nocapture readonly %v, i64 %n) #0 {
 ; SCALABLE-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; SCALABLE:       vector.body:
 ; SCALABLE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; SCALABLE-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
+; SCALABLE-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
 ; SCALABLE-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
 ; SCALABLE-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[V:%.*]], i64 [[TMP4]]
 ; SCALABLE-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0
 ; SCALABLE-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP6]], align 4
 ; SCALABLE-NEXT:    [[TMP8:%.*]] = icmp eq <vscale x 4 x i32> [[WIDE_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; SCALABLE-NEXT:    [[NOT:%.*]] = xor <vscale x 4 x i1> [[TMP8]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
-; SCALABLE-NEXT:    [[TMP9]] = or <vscale x 4 x i1> [[VEC_PHI]], [[NOT]]
+; SCALABLE-NEXT:    [[TMP9]] = select <vscale x 4 x i1> [[TMP8]], <vscale x 4 x i32> [[VEC_PHI]], <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 7, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
 ; SCALABLE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]]
 ; SCALABLE-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; SCALABLE-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; SCALABLE:       middle.block:
-; SCALABLE-NEXT:    [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP9]])
-; SCALABLE-NEXT:    [[FR:%.*]] = freeze i1 [[TMP13]]
-; SCALABLE-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[FR]], i32 7, i32 3
+; SCALABLE-NEXT:    [[RDX_SELECT_CMP:%.*]] = icmp ne <vscale x 4 x i32> [[TMP9]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; SCALABLE-NEXT:    [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[RDX_SELECT_CMP]])
+; SCALABLE-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[TMP13]], i32 7, i32 3
 ;
 entry:
   br label %for.body
@@ -225,24 +227,29 @@ define i32 @select_i32_from_icmp(ptr nocapture readonly %v, i32 %a, i32 %b, i64
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 %n, 4
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 %n, [[N_MOD_VF]]
+; CHECK-NEXT:    [[MINMAX_IDENT_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[A:%.*]], i64 0
+; CHECK-NEXT:    [[MINMAX_IDENT_SPLAT:%.*]] = shufflevector <4 x i32> [[MINMAX_IDENT_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[B:%.*]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[V:%.*]], i64 [[TMP0]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
 ; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]], <i32 3, i32 3, i32 3, i32 3>
-; CHECK-NEXT:    [[NOT:%.*]] = xor <4 x i1> [[TMP4]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT:    [[TMP5]] = or <4 x i1> [[VEC_PHI]], [[NOT]]
+; CHECK-NEXT:    [[TMP5]] = select <4 x i1> [[TMP4]], <4 x i32> [[VEC_PHI]], <4 x i32> [[BROADCAST_SPLAT]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]])
-; CHECK-NEXT:    [[FR:%.*]] = freeze i1 [[TMP7]]
-; CHECK-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[FR]], i32 %b, i32 %a
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i64 0
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[RDX_SELECT_CMP:%.*]] = icmp ne <4 x i32> [[TMP5]], [[DOTSPLAT]]
+; CHECK-NEXT:    [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[RDX_SELECT_CMP]])
+; CHECK-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[TMP7]], i32 [[B]], i32 [[A]]
 ;
 ; SCALABLE-LABEL: @select_i32_from_icmp
 ; SCALABLE:       vector.ph:
@@ -252,24 +259,29 @@ define i32 @select_i32_from_icmp(ptr nocapture readonly %v, i32 %a, i32 %b, i64
 ; SCALABLE-NEXT:    [[N_VEC:%.*]] = sub i64 %n, [[N_MOD_VF]]
 ; SCALABLE-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
 ; SCALABLE-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP10]], 4
+; SCALABLE-NEXT:    [[MINMAX_IDENT_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[A:%.*]], i64 0
+; SCALABLE-NEXT:    [[MINMAX_IDENT_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[MINMAX_IDENT_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; SCALABLE-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[B:%.*]], i64 0
+; SCALABLE-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
 ; SCALABLE-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; SCALABLE:       vector.body:
 ; SCALABLE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; SCALABLE-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
+; SCALABLE-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
 ; SCALABLE-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
 ; SCALABLE-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[V:%.*]], i64 [[TMP4]]
 ; SCALABLE-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0
 ; SCALABLE-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP6]], align 4
 ; SCALABLE-NEXT:    [[TMP8:%.*]] = icmp eq <vscale x 4 x i32> [[WIDE_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; SCALABLE-NEXT:    [[NOT:%.*]] = xor <vscale x 4 x i1> [[TMP8]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
-; SCALABLE-NEXT:    [[TMP9]] = or <vscale x 4 x i1> [[VEC_PHI]], [[NOT]]
+; SCALABLE-NEXT:    [[TMP9]] = select <vscale x 4 x i1> [[TMP8]], <vscale x 4 x i32> [[VEC_PHI]], <vscale x 4 x i32> [[BROADCAST_SPLAT]]
 ; SCALABLE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]]
 ; SCALABLE-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; SCALABLE-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; SCALABLE:       middle.block:
-; SCALABLE-NEXT:    [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP9]])
-; SCALABLE-NEXT:    [[FR:%.*]] = freeze i1 [[TMP13]]
-; SCALABLE-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[FR]], i32 %b, i32 %a
+; SCALABLE-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[A]], i64 0
+; SCALABLE-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[DOTSPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; SCALABLE-NEXT:    [[RDX_SELECT_CMP:%.*]] = icmp ne <vscale x 4 x i32> [[TMP9]], [[DOTSPLAT]]
+; SCALABLE-NEXT:    [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[RDX_SELECT_CMP]])
+; SCALABLE-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[TMP13]], i32 [[B]], i32 [[A]]
 ;
 entry:
   br label %for.body
@@ -297,21 +309,20 @@ define i32 @select_const_i32_from_fcmp(ptr nocapture readonly %v, i64 %n) #0 {
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 2, i32 2, i32 2, i32 2>, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds float, ptr [[V:%.*]], i64 [[TMP0]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
 ; CHECK-NEXT:    [[TMP4:%.*]] = fcmp fast ueq <4 x float> [[WIDE_LOAD]], <float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00>
-; CHECK-NEXT:    [[NOT:%.*]] = xor <4 x i1> [[TMP4]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT:    [[TMP5]] = or <4 x i1> [[VEC_PHI]], [[NOT]]
+; CHECK-NEXT:    [[TMP5]] = select <4 x i1> [[TMP4]], <4 x i32> [[VEC_PHI]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]])
-; CHECK-NEXT:    [[FR:%.*]] = freeze i1 [[TMP7]]
-; CHECK-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[FR]], i32 1, i32 2
+; CHECK-NEXT:    [[RDX_SELECT_CMP:%.*]] = icmp ne <4 x i32> [[TMP5]], <i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT:    [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[RDX_SELECT_CMP]])
+; CHECK-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[TMP7]], i32 1, i32 2
 ;
 ; SCALABLE-LABEL: @select_const_i32_from_fcmp
 ; SCALABLE:       vector.ph:
@@ -324,21 +335,20 @@ define i32 @select_const_i32_from_fcmp(ptr nocapture readonly %v, i64 %n) #0 {
 ; SCALABLE-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; SCALABLE:       vector.body:
 ; SCALABLE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; SCALABLE-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
+; SCALABLE-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
 ; SCALABLE-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
 ; SCALABLE-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, ptr [[V:%.*]], i64 [[TMP4]]
 ; SCALABLE-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP5]], i32 0
 ; SCALABLE-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP6]], align 4
 ; SCALABLE-NEXT:    [[TMP8:%.*]] = fcmp fast ueq <vscale x 4 x float> [[WIDE_LOAD]], shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 3.000000e+00, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
-; SCALABLE-NEXT:    [[NOT:%.*]] = xor <vscale x 4 x i1> [[TMP8]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
-; SCALABLE-NEXT:    [[TMP9]] = or <vscale x 4 x i1> [[VEC_PHI]], [[NOT]]
+; SCALABLE-NEXT:    [[TMP9]] = select <vscale x 4 x i1> [[TMP8]], <vscale x 4 x i32> [[VEC_PHI]], <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
 ; SCALABLE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]]
 ; SCALABLE-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; SCALABLE-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 ; SCALABLE:       middle.block:
-; SCALABLE-NEXT:    [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP9]])
-; SCALABLE-NEXT:    [[FR:%.*]] = freeze i1 [[TMP13]]
-; SCALABLE-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[FR]], i32 1, i32 2
+; SCALABLE-NEXT:    [[RDX_SELECT_CMP:%.*]] = icmp ne <vscale x 4 x i32> [[TMP9]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; SCALABLE-NEXT:    [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[RDX_SELECT_CMP]])
+; SCALABLE-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[TMP13]], i32 1, i32 2
 ;
 entry:
   br label %for.body
@@ -391,7 +401,7 @@ define i32 @pred_select_const_i32_from_icmp(ptr noalias nocapture readonly %src1
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PREDPHI:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PREDPHI:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC1:%.*]], i64 [[TMP0]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
@@ -401,16 +411,16 @@ define i32 @pred_select_const_i32_from_icmp(ptr noalias nocapture readonly %src1
 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i32, ptr [[TMP5]], i32 0
 ; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP6]], i32 4, <4 x i1> [[TMP4]], <4 x i32> poison)
 ; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq <4 x i32> [[WIDE_MASKED_LOAD]], <i32 2, i32 2, i32 2, i32 2>
-; CHECK-NEXT:    [[TMP9:%.*]] = or <4 x i1> [[VEC_PHI]], [[TMP8]]
+; CHECK-NEXT:    [[TMP9:%.*]] = select <4 x i1> [[TMP8]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>, <4 x i32> [[VEC_PHI]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = xor <4 x i1> [[TMP4]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT:    [[PREDPHI]] = select <4 x i1> [[TMP4]], <4 x i1> [[TMP9]], <4 x i1> [[VEC_PHI]]
+; CHECK-NEXT:    [[PREDPHI]] = select <4 x i1> [[TMP4]], <4 x i32> [[TMP9]], <4 x i32> [[VEC_PHI]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP12:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[PREDPHI]])
-; CHECK-NEXT:    [[FR:%.*]] = freeze i1 [[TMP12]]
-; CHECK-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[FR]], i32 1, i32 0
+; CHECK-NEXT:    [[RDX_SELECT_CMP:%.*]] = icmp ne <4 x i32> [[PREDPHI]], zeroinitializer
+; CHECK-NEXT:    [[TMP12:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[RDX_SELECT_CMP]])
+; CHECK-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[TMP12]], i32 1, i32 0
 ;
 ; SCALABLE-LABEL: @pred_select_const_i32_from_icmp
 ; SCALABLE:       vector.ph:
@@ -423,7 +433,7 @@ define i32 @pred_select_const_i32_from_icmp(ptr noalias nocapture readonly %src1
 ; SCALABLE-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; SCALABLE:       vector.body:
 ; SCALABLE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; SCALABLE-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PREDPHI:%.*]], [[VECTOR_BODY]] ]
+; SCALABLE-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PREDPHI:%.*]], [[VECTOR_BODY]] ]
 ; SCALABLE-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
 ; SCALABLE-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[SRC1:%.*]], i64 [[TMP4]]
 ; SCALABLE-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0
@@ -433,16 +443,16 @@ define i32 @pred_select_const_i32_from_icmp(ptr noalias nocapture readonly %src1
 ; SCALABLE-NEXT:    [[TMP10:%.*]] = getelementptr i32, ptr [[TMP9]], i32 0
 ; SCALABLE-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP10]], i32 4, <vscale x 4 x i1> [[TMP8]], <vscale x 4 x i32> poison)
 ; SCALABLE-NEXT:    [[TMP12:%.*]] = icmp eq <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; SCALABLE-NEXT:    [[TMP13:%.*]] = or <vscale x 4 x i1> [[VEC_PHI]], [[TMP12]]
+; SCALABLE-NEXT:    [[TMP13:%.*]] = select <vscale x 4 x i1> [[TMP12]], <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i32> [[VEC_PHI]]
 ; SCALABLE-NEXT:    [[TMP14:%.*]] = xor <vscale x 4 x i1> [[TMP8]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
-; SCALABLE-NEXT:    [[PREDPHI]] = select <vscale x 4 x i1> [[TMP8]], <vscale x 4 x i1> [[TMP13]], <vscale x 4 x i1> [[VEC_PHI]]
+; SCALABLE-NEXT:    [[PREDPHI]] = select <vscale x 4 x i1> [[TMP8]], <vscale x 4 x i32> [[TMP13]], <vscale x 4 x i32> [[VEC_PHI]]
 ; SCALABLE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP16]]
 ; SCALABLE-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; SCALABLE-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
 ; SCALABLE:       middle.block:
-; SCALABLE-NEXT:    [[TMP18:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[PREDPHI]])
-; SCALABLE-NEXT:    [[FR:%.*]] = freeze i1 [[TMP18]]
-; SCALABLE-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[FR]], i32 1, i32 0
+; SCALABLE-NEXT:    [[RDX_SELECT_CMP:%.*]] = icmp ne <vscale x 4 x i32> [[PREDPHI]], zeroinitializer
+; SCALABLE-NEXT:    [[TMP18:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[RDX_SELECT_CMP]])
+; SCALABLE-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[TMP18]], i32 1, i32 0
 ;
 entry:
   br label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll b/llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll
index c81ca46039f892..c55e732c901475 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll
@@ -25,7 +25,7 @@ define void @fp_iv_loop1(ptr noalias nocapture %A, i32 %N) #0 {
 ; AUTO_VEC-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY:%.*]], label [[VECTOR_PH:%.*]]
 ; AUTO_VEC:       vector.ph:
 ; AUTO_VEC-NEXT:    [[N_VEC:%.*]] = and i64 [[ZEXT]], 2147483616
-; AUTO_VEC-NEXT:    [[DOTCAST:%.*]] = uitofp i64 [[N_VEC]] to float
+; AUTO_VEC-NEXT:    [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float
 ; AUTO_VEC-NEXT:    [[TMP0:%.*]] = fmul fast float [[DOTCAST]], 5.000000e-01
 ; AUTO_VEC-NEXT:    [[IND_END:%.*]] = fadd fast float [[TMP0]], 1.000000e+00
 ; AUTO_VEC-NEXT:    br label [[VECTOR_BODY:%.*]]
@@ -201,7 +201,7 @@ define double @external_use_with_fast_math(ptr %a, i64 %n) {
 ; AUTO_VEC-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY:%.*]], label [[VECTOR_PH:%.*]]
 ; AUTO_VEC:       vector.ph:
 ; AUTO_VEC-NEXT:    [[N_VEC:%.*]] = and i64 [[SMAX]], 9223372036854775792
-; AUTO_VEC-NEXT:    [[DOTCAST:%.*]] = uitofp i64 [[N_VEC]] to double
+; AUTO_VEC-NEXT:    [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to double
 ; AUTO_VEC-NEXT:    [[TMP0:%.*]] = fmul fast double [[DOTCAST]], 3.000000e+00
 ; AUTO_VEC-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; AUTO_VEC:       vector.body:
@@ -366,7 +366,7 @@ define void @fadd_reassoc_FMF(ptr nocapture %p, i32 %N) {
 ; AUTO_VEC-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY:%.*]], label [[VECTOR_PH:%.*]]
 ; AUTO_VEC:       vector.ph:
 ; AUTO_VEC-NEXT:    [[N_VEC:%.*]] = and i64 [[TMP0]], 4294967264
-; AUTO_VEC-NEXT:    [[DOTCAST:%.*]] = uitofp i64 [[N_VEC]] to float
+; AUTO_VEC-NEXT:    [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float
 ; AUTO_VEC-NEXT:    [[TMP1:%.*]] = fmul reassoc float [[DOTCAST]], 4.200000e+01
 ; AUTO_VEC-NEXT:    [[IND_END:%.*]] = fadd reassoc float [[TMP1]], 1.000000e+00
 ; AUTO_VEC-NEXT:    br label [[VECTOR_BODY:%.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr81872.ll b/llvm/test/Transforms/LoopVectorize/X86/pr81872.ll
index 14acb6f57aa0c9..3f38abc75a5837 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/pr81872.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/pr81872.ll
@@ -29,7 +29,7 @@ define void @test(ptr noundef align 8 dereferenceable_or_null(16) %arr) #0 {
 ; CHECK-NEXT:    [[TMP2:%.*]] = and <4 x i64> [[VEC_IND]], <i64 1, i64 1, i64 1, i64 1>
 ; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq <4 x i64> [[TMP2]], zeroinitializer
 ; CHECK-NEXT:    [[TMP4:%.*]] = select <4 x i1> [[TMP1]], <4 x i1> [[TMP3]], <4 x i1> zeroinitializer
-; CHECK-NEXT:    [[TMP5:%.*]] = or i64 [[TMP0]], 1
+; CHECK-NEXT:    [[TMP5:%.*]] = add i64 [[TMP0]], 1
 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i64, ptr [[ARR]], i64 [[TMP5]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i64, ptr [[TMP6]], i32 0
 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i64, ptr [[TMP7]], i32 -3
diff --git a/llvm/test/Transforms/LoopVectorize/float-induction.ll b/llvm/test/Transforms/LoopVectorize/float-induction.ll
index 4dba183a44b94b..caea114e3d4487 100644
--- a/llvm/test/Transforms/LoopVectorize/float-induction.ll
+++ b/llvm/test/Transforms/LoopVectorize/float-induction.ll
@@ -29,7 +29,7 @@ define void @fp_iv_loop1_fast_FMF(float %init, ptr noalias nocapture %A, i32 %N)
 ; VEC4_INTERL1-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; VEC4_INTERL1:       vector.ph:
 ; VEC4_INTERL1-NEXT:    [[N_VEC:%.*]] = and i64 [[TMP0]], 2147483644
-; VEC4_INTERL1-NEXT:    [[DOTCAST:%.*]] = uitofp i64 [[N_VEC]] to float
+; VEC4_INTERL1-NEXT:    [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float
 ; VEC4_INTERL1-NEXT:    [[TMP1:%.*]] = fmul fast float [[FPINC]], [[DOTCAST]]
 ; VEC4_INTERL1-NEXT:    [[IND_END:%.*]] = fsub fast float [[INIT:%.*]], [[TMP1]]
 ; VEC4_INTERL1-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[INIT]], i64 0
@@ -84,7 +84,7 @@ define void @fp_iv_loop1_fast_FMF(float %init, ptr noalias nocapture %A, i32 %N)
 ; VEC4_INTERL2-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; VEC4_INTERL2:       vector.ph:
 ; VEC4_INTERL2-NEXT:    [[N_VEC:%.*]] = and i64 [[TMP0]], 2147483640
-; VEC4_INTERL2-NEXT:    [[DOTCAST:%.*]] = uitofp i64 [[N_VEC]] to float
+; VEC4_INTERL2-NEXT:    [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float
 ; VEC4_INTERL2-NEXT:    [[TMP1:%.*]] = fmul fast float [[FPINC]], [[DOTCAST]]
 ; VEC4_INTERL2-NEXT:    [[IND_END:%.*]] = fsub fast float [[INIT:%.*]], [[TMP1]]
 ; VEC4_INTERL2-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[INIT]], i64 0
@@ -142,7 +142,7 @@ define void @fp_iv_loop1_fast_FMF(float %init, ptr noalias nocapture %A, i32 %N)
 ; VEC1_INTERL2-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; VEC1_INTERL2:       vector.ph:
 ; VEC1_INTERL2-NEXT:    [[N_VEC:%.*]] = and i64 [[TMP0]], 2147483646
-; VEC1_INTERL2-NEXT:    [[DOTCAST:%.*]] = uitofp i64 [[N_VEC]] to float
+; VEC1_INTERL2-NEXT:    [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float
 ; VEC1_INTERL2-NEXT:    [[TMP1:%.*]] = fmul fast float [[FPINC]], [[DOTCAST]]
 ; VEC1_INTERL2-NEXT:    [[IND_END:%.*]] = fsub fast float [[INIT:%.*]], [[TMP1]]
 ; VEC1_INTERL2-NEXT:    br label [[VECTOR_BODY:%.*]]
@@ -193,7 +193,7 @@ define void @fp_iv_loop1_fast_FMF(float %init, ptr noalias nocapture %A, i32 %N)
 ; VEC2_INTERL1_PRED_STORE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY:%.*]], label [[VECTOR_PH:%.*]]
 ; VEC2_INTERL1_PRED_STORE:       vector.ph:
 ; VEC2_INTERL1_PRED_STORE-NEXT:    [[N_VEC:%.*]] = and i64 [[TMP0]], 2147483646
-; VEC2_INTERL1_PRED_STORE-NEXT:    [[DOTCAST:%.*]] = uitofp i64 [[N_VEC]] to float
+; VEC2_INTERL1_PRED_STORE-NEXT:    [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float
 ; VEC2_INTERL1_PRED_STORE-NEXT:    [[TMP1:%.*]] = fmul fast float [[FPINC]], [[DOTCAST]]
 ; VEC2_INTERL1_PRED_STORE-NEXT:    [[IND_END:%.*]] = fsub fast float [[INIT:%.*]], [[TMP1]]
 ; VEC2_INTERL1_PRED_STORE-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <2 x float> poison, float [[INIT]], i64 0
@@ -276,7 +276,7 @@ define void @fp_iv_loop1_reassoc_FMF(float %init, ptr noalias nocapture %A, i32
 ; VEC4_INTERL1-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; VEC4_INTERL1:       vector.ph:
 ; VEC4_INTERL1-NEXT:    [[N_VEC:%.*]] = and i64 [[TMP0]], 2147483644
-; VEC4_INTERL1-NEXT:    [[DOTCAST:%.*]] = uitofp i64 [[N_VEC]] to float
+; VEC4_INTERL1-NEXT:    [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float
 ; VEC4_INTERL1-NEXT:    [[TMP1:%.*]] = fmul reassoc float [[FPINC]], [[DOTCAST]]
 ; VEC4_INTERL1-NEXT:    [[IND_END:%.*]] = fsub reassoc float [[INIT:%.*]], [[TMP1]]
 ; VEC4_INTERL1-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[INIT]], i64 0
@@ -331,7 +331,7 @@ define void @fp_iv_loop1_reassoc_FMF(float %init, ptr noalias nocapture %A, i32
 ; VEC4_INTERL2-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; VEC4_INTERL2:       vector.ph:
 ; VEC4_INTERL2-NEXT:    [[N_VEC:%.*]] = and i64 [[TMP0]], 2147483640
-; VEC4_INTERL2-NEXT:    [[DOTCAST:%.*]] = uitofp i64 [[N_VEC]] to float
+; VEC4_INTERL2-NEXT:    [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float
 ; VEC4_INTERL2-NEXT:    [[TMP1:%.*]] = fmul reassoc float [[FPINC]], [[DOTCAST]]
 ; VEC4_INTERL2-NEXT:    [[IND_END:%.*]] = fsub reassoc float [[INIT:%.*]], [[TMP1]]
 ; VEC4_INTERL2-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[INIT]], i64 0
@@ -389,7 +389,7 @@ define void @fp_iv_loop1_reassoc_FMF(float %init, ptr noalias nocapture %A, i32
 ; VEC1_INTERL2-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; VEC1_INTERL2:       vector.ph:
 ; VEC1_INTERL2-NEXT:    [[N_VEC:%.*]] = and i64 [[TMP0]], 2147483646
-; VEC1_INTERL2-NEXT:    [[DOTCAST:%.*]] = uitofp i64 [[N_VEC]] to float
+; VEC1_INTERL2-NEXT:    [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float
 ; VEC1_INTERL2-NEXT:    [[TMP1:%.*]] = fmul reassoc float [[FPINC]], [[DOTCAST]]
 ; VEC1_INTERL2-NEXT:    [[IND_END:%.*]] = fsub reassoc float [[INIT:%.*]], [[TMP1]]
 ; VEC1_INTERL2-NEXT:    br label [[VECTOR_BODY:%.*]]
@@ -442,7 +442,7 @@ define void @fp_iv_loop1_reassoc_FMF(float %init, ptr noalias nocapture %A, i32
 ; VEC2_INTERL1_PRED_STORE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY:%.*]], label [[VECTOR_PH:%.*]]
 ; VEC2_INTERL1_PRED_STORE:       vector.ph:
 ; VEC2_INTERL1_PRED_STORE-NEXT:    [[N_VEC:%.*]] = and i64 [[TMP0]], 2147483646
-; VEC2_INTERL1_PRED_STORE-NEXT:    [[DOTCAST:%.*]] = uitofp i64 [[N_VEC]] to float
+; VEC2_INTERL1_PRED_STORE-NEXT:    [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float
 ; VEC2_INTERL1_PRED_STORE-NEXT:    [[TMP1:%.*]] = fmul reassoc float [[FPINC]], [[DOTCAST]]
 ; VEC2_INTERL1_PRED_STORE-NEXT:    [[IND_END:%.*]] = fsub reassoc float [[INIT:%.*]], [[TMP1]]
 ; VEC2_INTERL1_PRED_STORE-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <2 x float> poison, float [[INIT]], i64 0
@@ -526,7 +526,7 @@ define void @fp_iv_loop2(float %init, ptr noalias nocapture %A, i32 %N) #0 {
 ; VEC4_INTERL1-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; VEC4_INTERL1:       vector.ph:
 ; VEC4_INTERL1-NEXT:    [[N_VEC:%.*]] = and i64 [[TMP0]], 2147483644
-; VEC4_INTERL1-NEXT:    [[DOTCAST:%.*]] = uitofp i64 [[N_VEC]] to float
+; VEC4_INTERL1-NEXT:    [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float
 ; VEC4_INTERL1-NEXT:    [[TMP1:%.*]] = fmul fast float [[DOTCAST]], 5.000000e-01
 ; VEC4_INTERL1-NEXT:    [[IND_END:%.*]] = fadd fast float [[TMP1]], [[INIT:%.*]]
 ; VEC4_INTERL1-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[INIT]], i64 0
@@ -574,7 +574,7 @@ define void @fp_iv_loop2(float %init, ptr noalias nocapture %A, i32 %N) #0 {
 ; VEC4_INTERL2-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; VEC4_INTERL2:       vector.ph:
 ; VEC4_INTERL2-NEXT:    [[N_VEC:%.*]] = and i64 [[TMP0]], 2147483640
-; VEC4_INTERL2-NEXT:    [[DOTCAST:%.*]] = uitofp i64 [[N_VEC]] to float
+; VEC4_INTERL2-NEXT:    [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float
 ; VEC4_INTERL2-NEXT:    [[TMP1:%.*]] = fmul fast float [[DOTCAST]], 5.000000e-01
 ; VEC4_INTERL2-NEXT:    [[IND_END:%.*]] = fadd fast float [[TMP1]], [[INIT:%.*]]
 ; VEC4_INTERL2-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[INIT]], i64 0
@@ -625,7 +625,7 @@ define void @fp_iv_loop2(float %init, ptr noalias nocapture %A, i32 %N) #0 {
 ; VEC1_INTERL2-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; VEC1_INTERL2:       vector.ph:
 ; VEC1_INTERL2-NEXT:    [[N_VEC:%.*]] = and i64 [[TMP0]], 2147483646
-; VEC1_INTERL2-NEXT:    [[DOTCAST:%.*]] = uitofp i64 [[N_VEC]] to float
+; VEC1_INTERL2-NEXT:    [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float
 ; VEC1_INTERL2-NEXT:    [[TMP1:%.*]] = fmul fast float [[DOTCAST]], 5.000000e-01
 ; VEC1_INTERL2-NEXT:    [[IND_END:%.*]] = fadd fast float [[TMP1]], [[INIT:%.*]]
 ; VEC1_INTERL2-NEXT:    br label [[VECTOR_BODY:%.*]]
@@ -675,7 +675,7 @@ define void @fp_iv_loop2(float %init, ptr noalias nocapture %A, i32 %N) #0 {
 ; VEC2_INTERL1_PRED_STORE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY:%.*]], label [[VECTOR_PH:%.*]]
 ; VEC2_INTERL1_PRED_STORE:       vector.ph:
 ; VEC2_INTERL1_PRED_STORE-NEXT:    [[N_VEC:%.*]] = and i64 [[TMP0]], 2147483646
-; VEC2_INTERL1_PRED_STORE-NEXT:    [[DOTCAST:%.*]] = uitofp i64 [[N_VEC]] to float
+; VEC2_INTERL1_PRED_STORE-NEXT:    [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float
 ; VEC2_INTERL1_PRED_STORE-NEXT:    [[TMP1:%.*]] = fmul fast float [[DOTCAST]], 5.000000e-01
 ; VEC2_INTERL1_PRED_STORE-NEXT:    [[IND_END:%.*]] = fadd fast float [[TMP1]], [[INIT:%.*]]
 ; VEC2_INTERL1_PRED_STORE-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <2 x float> poison, float [[INIT]], i64 0
@@ -758,10 +758,10 @@ define void @fp_iv_loop3(float %init, ptr noalias nocapture %A, ptr noalias noca
 ; VEC4_INTERL1-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; VEC4_INTERL1:       vector.ph:
 ; VEC4_INTERL1-NEXT:    [[N_VEC:%.*]] = and i64 [[TMP1]], 2147483644
-; VEC4_INTERL1-NEXT:    [[DOTCAST:%.*]] = uitofp i64 [[N_VEC]] to float
+; VEC4_INTERL1-NEXT:    [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float
 ; VEC4_INTERL1-NEXT:    [[TMP2:%.*]] = fmul fast float [[DOTCAST]], -5.000000e-01
 ; VEC4_INTERL1-NEXT:    [[IND_END:%.*]] = fadd fast float [[TMP2]], 0x3FB99999A0000000
-; VEC4_INTERL1-NEXT:    [[DOTCAST2:%.*]] = uitofp i64 [[N_VEC]] to float
+; VEC4_INTERL1-NEXT:    [[DOTCAST2:%.*]] = sitofp i64 [[N_VEC]] to float
 ; VEC4_INTERL1-NEXT:    [[TMP3:%.*]] = fmul fast float [[TMP0]], [[DOTCAST2]]
 ; VEC4_INTERL1-NEXT:    [[IND_END3:%.*]] = fadd fast float [[TMP3]], [[INIT:%.*]]
 ; VEC4_INTERL1-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[INIT]], i64 0
@@ -835,10 +835,10 @@ define void @fp_iv_loop3(float %init, ptr noalias nocapture %A, ptr noalias noca
 ; VEC4_INTERL2-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; VEC4_INTERL2:       vector.ph:
 ; VEC4_INTERL2-NEXT:    [[N_VEC:%.*]] = and i64 [[TMP1]], 2147483640
-; VEC4_INTERL2-NEXT:    [[DOTCAST:%.*]] = uitofp i64 [[N_VEC]] to float
+; VEC4_INTERL2-NEXT:    [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float
 ; VEC4_INTERL2-NEXT:    [[TMP2:%.*]] = fmul fast float [[DOTCAST]], -5.000000e-01
 ; VEC4_INTERL2-NEXT:    [[IND_END:%.*]] = fadd fast float [[TMP2]], 0x3FB99999A0000000
-; VEC4_INTERL2-NEXT:    [[DOTCAST2:%.*]] = uitofp i64 [[N_VEC]] to float
+; VEC4_INTERL2-NEXT:    [[DOTCAST2:%.*]] = sitofp i64 [[N_VEC]] to float
 ; VEC4_INTERL2-NEXT:    [[TMP3:%.*]] = fmul fast float [[TMP0]], [[DOTCAST2]]
 ; VEC4_INTERL2-NEXT:    [[IND_END3:%.*]] = fadd fast float [[TMP3]], [[INIT:%.*]]
 ; VEC4_INTERL2-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[INIT]], i64 0
@@ -922,10 +922,10 @@ define void @fp_iv_loop3(float %init, ptr noalias nocapture %A, ptr noalias noca
 ; VEC1_INTERL2-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; VEC1_INTERL2:       vector.ph:
 ; VEC1_INTERL2-NEXT:    [[N_VEC:%.*]] = and i64 [[TMP1]], 2147483646
-; VEC1_INTERL2-NEXT:    [[DOTCAST:%.*]] = uitofp i64 [[N_VEC]] to float
+; VEC1_INTERL2-NEXT:    [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float
 ; VEC1_INTERL2-NEXT:    [[TMP2:%.*]] = fmul fast float [[DOTCAST]], -5.000000e-01
 ; VEC1_INTERL2-NEXT:    [[IND_END:%.*]] = fadd fast float [[TMP2]], 0x3FB99999A0000000
-; VEC1_INTERL2-NEXT:    [[DOTCAST2:%.*]] = uitofp i64 [[N_VEC]] to float
+; VEC1_INTERL2-NEXT:    [[DOTCAST2:%.*]] = sitofp i64 [[N_VEC]] to float
 ; VEC1_INTERL2-NEXT:    [[TMP3:%.*]] = fmul fast float [[TMP0]], [[DOTCAST2]]
 ; VEC1_INTERL2-NEXT:    [[IND_END3:%.*]] = fadd fast float [[TMP3]], [[INIT:%.*]]
 ; VEC1_INTERL2-NEXT:    br label [[VECTOR_BODY:%.*]]
@@ -1000,10 +1000,10 @@ define void @fp_iv_loop3(float %init, ptr noalias nocapture %A, ptr noalias noca
 ; VEC2_INTERL1_PRED_STORE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY:%.*]], label [[VECTOR_PH:%.*]]
 ; VEC2_INTERL1_PRED_STORE:       vector.ph:
 ; VEC2_INTERL1_PRED_STORE-NEXT:    [[N_VEC:%.*]] = and i64 [[TMP1]], 2147483646
-; VEC2_INTERL1_PRED_STORE-NEXT:    [[DOTCAST:%.*]] = uitofp i64 [[N_VEC]] to float
+; VEC2_INTERL1_PRED_STORE-NEXT:    [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float
 ; VEC2_INTERL1_PRED_STORE-NEXT:    [[TMP2:%.*]] = fmul fast float [[DOTCAST]], -5.000000e-01
 ; VEC2_INTERL1_PRED_STORE-NEXT:    [[IND_END:%.*]] = fadd fast float [[TMP2]], 0x3FB99999A0000000
-; VEC2_INTERL1_PRED_STORE-NEXT:    [[DOTCAST2:%.*]] = uitofp i64 [[N_VEC]] to float
+; VEC2_INTERL1_PRED_STORE-NEXT:    [[DOTCAST2:%.*]] = sitofp i64 [[N_VEC]] to float
 ; VEC2_INTERL1_PRED_STORE-NEXT:    [[TMP3:%.*]] = fmul fast float [[TMP0]], [[DOTCAST2]]
 ; VEC2_INTERL1_PRED_STORE-NEXT:    [[IND_END3:%.*]] = fadd fast float [[TMP3]], [[INIT:%.*]]
 ; VEC2_INTERL1_PRED_STORE-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <2 x float> poison, float [[INIT]], i64 0
@@ -1113,7 +1113,7 @@ define void @fp_iv_loop4(ptr noalias nocapture %A, i32 %N) {
 ; VEC4_INTERL1-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; VEC4_INTERL1:       vector.ph:
 ; VEC4_INTERL1-NEXT:    [[N_VEC:%.*]] = and i64 [[TMP0]], 2147483644
-; VEC4_INTERL1-NEXT:    [[DOTCAST:%.*]] = uitofp i64 [[N_VEC]] to float
+; VEC4_INTERL1-NEXT:    [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float
 ; VEC4_INTERL1-NEXT:    [[TMP1:%.*]] = fmul fast float [[DOTCAST]], 5.000000e-01
 ; VEC4_INTERL1-NEXT:    [[IND_END:%.*]] = fadd fast float [[TMP1]], 1.000000e+00
 ; VEC4_INTERL1-NEXT:    br label [[VECTOR_BODY:%.*]]
@@ -1158,7 +1158,7 @@ define void @fp_iv_loop4(ptr noalias nocapture %A, i32 %N) {
 ; VEC4_INTERL2-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; VEC4_INTERL2:       vector.ph:
 ; VEC4_INTERL2-NEXT:    [[N_VEC:%.*]] = and i64 [[TMP0]], 2147483640
-; VEC4_INTERL2-NEXT:    [[DOTCAST:%.*]] = uitofp i64 [[N_VEC]] to float
+; VEC4_INTERL2-NEXT:    [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float
 ; VEC4_INTERL2-NEXT:    [[TMP1:%.*]] = fmul fast float [[DOTCAST]], 5.000000e-01
 ; VEC4_INTERL2-NEXT:    [[IND_END:%.*]] = fadd fast float [[TMP1]], 1.000000e+00
 ; VEC4_INTERL2-NEXT:    br label [[VECTOR_BODY:%.*]]
@@ -1206,7 +1206,7 @@ define void @fp_iv_loop4(ptr noalias nocapture %A, i32 %N) {
 ; VEC1_INTERL2-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; VEC1_INTERL2:       vector.ph:
 ; VEC1_INTERL2-NEXT:    [[N_VEC:%.*]] = and i64 [[TMP0]], 2147483646
-; VEC1_INTERL2-NEXT:    [[DOTCAST:%.*]] = uitofp i64 [[N_VEC]] to float
+; VEC1_INTERL2-NEXT:    [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float
 ; VEC1_INTERL2-NEXT:    [[TMP1:%.*]] = fmul fast float [[DOTCAST]], 5.000000e-01
 ; VEC1_INTERL2-NEXT:    [[IND_END:%.*]] = fadd fast float [[TMP1]], 1.000000e+00
 ; VEC1_INTERL2-NEXT:    br label [[VECTOR_BODY:%.*]]
@@ -1256,7 +1256,7 @@ define void @fp_iv_loop4(ptr noalias nocapture %A, i32 %N) {
 ; VEC2_INTERL1_PRED_STORE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY:%.*]], label [[VECTOR_PH:%.*]]
 ; VEC2_INTERL1_PRED_STORE:       vector.ph:
 ; VEC2_INTERL1_PRED_STORE-NEXT:    [[N_VEC:%.*]] = and i64 [[TMP0]], 2147483646
-; VEC2_INTERL1_PRED_STORE-NEXT:    [[DOTCAST:%.*]] = uitofp i64 [[N_VEC]] to float
+; VEC2_INTERL1_PRED_STORE-NEXT:    [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float
 ; VEC2_INTERL1_PRED_STORE-NEXT:    [[TMP1:%.*]] = fmul fast float [[DOTCAST]], 5.000000e-01
 ; VEC2_INTERL1_PRED_STORE-NEXT:    [[IND_END:%.*]] = fadd fast float [[TMP1]], 1.000000e+00
 ; VEC2_INTERL1_PRED_STORE-NEXT:    br label [[VECTOR_BODY:%.*]]
@@ -1319,7 +1319,7 @@ define void @non_primary_iv_float_scalar(ptr %A, i64 %N) {
 ; VEC4_INTERL1-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; VEC4_INTERL1:       vector.ph:
 ; VEC4_INTERL1-NEXT:    [[N_VEC:%.*]] = and i64 [[SMAX]], 9223372036854775804
-; VEC4_INTERL1-NEXT:    [[DOTCAST:%.*]] = uitofp i64 [[N_VEC]] to float
+; VEC4_INTERL1-NEXT:    [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float
 ; VEC4_INTERL1-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; VEC4_INTERL1:       vector.body:
 ; VEC4_INTERL1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE8:%.*]] ]
@@ -1396,7 +1396,7 @@ define void @non_primary_iv_float_scalar(ptr %A, i64 %N) {
 ; VEC4_INTERL2-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; VEC4_INTERL2:       vector.ph:
 ; VEC4_INTERL2-NEXT:    [[N_VEC:%.*]] = and i64 [[SMAX]], 9223372036854775800
-; VEC4_INTERL2-NEXT:    [[DOTCAST:%.*]] = uitofp i64 [[N_VEC]] to float
+; VEC4_INTERL2-NEXT:    [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float
 ; VEC4_INTERL2-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; VEC4_INTERL2:       vector.body:
 ; VEC4_INTERL2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE17:%.*]] ]
@@ -1512,7 +1512,7 @@ define void @non_primary_iv_float_scalar(ptr %A, i64 %N) {
 ; VEC1_INTERL2-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; VEC1_INTERL2:       vector.ph:
 ; VEC1_INTERL2-NEXT:    [[N_VEC:%.*]] = and i64 [[SMAX]], 9223372036854775806
-; VEC1_INTERL2-NEXT:    [[DOTCAST:%.*]] = uitofp i64 [[N_VEC]] to float
+; VEC1_INTERL2-NEXT:    [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float
 ; VEC1_INTERL2-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; VEC1_INTERL2:       vector.body:
 ; VEC1_INTERL2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE4:%.*]] ]
@@ -1570,7 +1570,7 @@ define void @non_primary_iv_float_scalar(ptr %A, i64 %N) {
 ; VEC2_INTERL1_PRED_STORE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY:%.*]], label [[VECTOR_PH:%.*]]
 ; VEC2_INTERL1_PRED_STORE:       vector.ph:
 ; VEC2_INTERL1_PRED_STORE-NEXT:    [[N_VEC:%.*]] = and i64 [[SMAX]], 9223372036854775806
-; VEC2_INTERL1_PRED_STORE-NEXT:    [[DOTCAST:%.*]] = uitofp i64 [[N_VEC]] to float
+; VEC2_INTERL1_PRED_STORE-NEXT:    [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float
 ; VEC2_INTERL1_PRED_STORE-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; VEC2_INTERL1_PRED_STORE:       vector.body:
 ; VEC2_INTERL1_PRED_STORE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE4:%.*]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/reduction-inloop-cond.ll b/llvm/test/Transforms/LoopVectorize/reduction-inloop-cond.ll
index 3ba57821bc31b0..873f6364f82811 100644
--- a/llvm/test/Transforms/LoopVectorize/reduction-inloop-cond.ll
+++ b/llvm/test/Transforms/LoopVectorize/reduction-inloop-cond.ll
@@ -850,8 +850,8 @@ define float @cond_cond(ptr noalias %src1, ptr noalias %src2, ptr noalias %cond,
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE6]]
 ; CHECK:       pred.load.continue6:
 ; CHECK-NEXT:    [[TMP24:%.*]] = phi <4 x float> [ [[TMP19]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP23]], [[PRED_LOAD_IF5]] ]
-; CHECK-NEXT:    [[TMP25:%.*]] = select <4 x i1> [[TMP4]], <4 x float> [[TMP24]], <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>
-; CHECK-NEXT:    [[PREDPHI:%.*]] = fadd fast <4 x float> [[VEC_PHI]], [[TMP25]]
+; CHECK-NEXT:    [[TMP25:%.*]] = fadd fast <4 x float> [[TMP24]], [[VEC_PHI]]
+; CHECK-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[TMP4]], <4 x float> [[TMP25]], <4 x float> [[VEC_PHI]]
 ; CHECK-NEXT:    [[TMP26:%.*]] = fcmp fast oeq <4 x float> [[WIDE_LOAD]], <float 7.000000e+00, float 7.000000e+00, float 7.000000e+00, float 7.000000e+00>
 ; CHECK-NEXT:    [[TMP27:%.*]] = extractelement <4 x i1> [[TMP26]], i64 0
 ; CHECK-NEXT:    br i1 [[TMP27]], label [[PRED_LOAD_IF7:%.*]], label [[PRED_LOAD_CONTINUE8:%.*]]
@@ -889,8 +889,8 @@ define float @cond_cond(ptr noalias %src1, ptr noalias %src2, ptr noalias %cond,
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE14]]
 ; CHECK:       pred.load.continue14:
 ; CHECK-NEXT:    [[TMP46:%.*]] = phi <4 x float> [ [[TMP41]], [[PRED_LOAD_CONTINUE12]] ], [ [[TMP45]], [[PRED_LOAD_IF13]] ]
-; CHECK-NEXT:    [[TMP47:%.*]] = select <4 x i1> [[TMP26]], <4 x float> [[TMP46]], <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>
-; CHECK-NEXT:    [[PREDPHI15]] = fadd fast <4 x float> [[PREDPHI]], [[TMP47]]
+; CHECK-NEXT:    [[TMP47:%.*]] = fadd fast <4 x float> [[TMP46]], [[PREDPHI]]
+; CHECK-NEXT:    [[PREDPHI15]] = select <4 x i1> [[TMP26]], <4 x float> [[TMP47]], <4 x float> [[PREDPHI]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP48:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP48]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/reduction-inloop-pred.ll b/llvm/test/Transforms/LoopVectorize/reduction-inloop-pred.ll
index b1c5ccbead64e5..e79f0983a63785 100644
--- a/llvm/test/Transforms/LoopVectorize/reduction-inloop-pred.ll
+++ b/llvm/test/Transforms/LoopVectorize/reduction-inloop-pred.ll
@@ -65,7 +65,7 @@ define i32 @reduction_sum_single(ptr noalias nocapture %A) {
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    br label [[DOTLR_PH:%.*]]
 ; CHECK:       .lr.ph:
-; CHECK-NEXT:    br i1 poison, label [[DOT_CRIT_EDGE]], label [[DOTLR_PH]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK-NEXT:    br i1 poison, label [[DOT_CRIT_EDGE]], label [[DOTLR_PH]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK:       ._crit_edge:
 ; CHECK-NEXT:    [[SUM_0_LCSSA:%.*]] = phi i32 [ poison, [[DOTLR_PH]] ], [ [[TMP26]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    ret i32 [[SUM_0_LCSSA]]
@@ -1173,7 +1173,7 @@ define i32 @reduction_min(ptr nocapture %A, ptr nocapture %B) {
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE6:%.*]] ]
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE6]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi i32 [ 1000, [[VECTOR_PH]] ], [ [[TMP26:%.*]], [[PRED_LOAD_CONTINUE6]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi i32 [ 1000, [[VECTOR_PH]] ], [ [[RDX_MINMAX:%.*]], [[PRED_LOAD_CONTINUE6]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = icmp ult <4 x i64> [[VEC_IND]], <i64 257, i64 257, i64 257, i64 257>
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i1> [[TMP0]], i64 0
 ; CHECK-NEXT:    br i1 [[TMP1]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
@@ -1216,11 +1216,11 @@ define i32 @reduction_min(ptr nocapture %A, ptr nocapture %B) {
 ; CHECK-NEXT:    [[TMP23:%.*]] = phi <4 x i32> [ [[TMP17]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP22]], [[PRED_LOAD_IF5]] ]
 ; CHECK-NEXT:    [[TMP24:%.*]] = select <4 x i1> [[TMP0]], <4 x i32> [[TMP23]], <4 x i32> <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>
 ; CHECK-NEXT:    [[TMP25:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> [[TMP24]])
-; CHECK-NEXT:    [[TMP26]] = call i32 @llvm.smin.i32(i32 [[TMP25]], i32 [[VEC_PHI]])
+; CHECK-NEXT:    [[RDX_MINMAX]] = call i32 @llvm.smin.i32(i32 [[TMP25]], i32 [[VEC_PHI]])
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
-; CHECK-NEXT:    [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], 260
-; CHECK-NEXT:    br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
+; CHECK-NEXT:    [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], 260
+; CHECK-NEXT:    br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -1228,7 +1228,7 @@ define i32 @reduction_min(ptr nocapture %A, ptr nocapture %B) {
 ; CHECK:       for.body:
 ; CHECK-NEXT:    br i1 poison, label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]]
 ; CHECK:       for.end:
-; CHECK-NEXT:    [[RESULT_0_LCSSA:%.*]] = phi i32 [ poison, [[FOR_BODY]] ], [ [[TMP26]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[RESULT_0_LCSSA:%.*]] = phi i32 [ poison, [[FOR_BODY]] ], [ [[RDX_MINMAX]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    ret i32 [[RESULT_0_LCSSA]]
 ;
 entry:
@@ -1260,7 +1260,7 @@ define i32 @reduction_max(ptr nocapture %A, ptr nocapture %B) {
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE6:%.*]] ]
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE6]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi i32 [ 1000, [[VECTOR_PH]] ], [ [[TMP26:%.*]], [[PRED_LOAD_CONTINUE6]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi i32 [ 1000, [[VECTOR_PH]] ], [ [[RDX_MINMAX:%.*]], [[PRED_LOAD_CONTINUE6]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = icmp ult <4 x i64> [[VEC_IND]], <i64 257, i64 257, i64 257, i64 257>
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i1> [[TMP0]], i64 0
 ; CHECK-NEXT:    br i1 [[TMP1]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
@@ -1303,11 +1303,11 @@ define i32 @reduction_max(ptr nocapture %A, ptr nocapture %B) {
 ; CHECK-NEXT:    [[TMP23:%.*]] = phi <4 x i32> [ [[TMP17]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP22]], [[PRED_LOAD_IF5]] ]
 ; CHECK-NEXT:    [[TMP24:%.*]] = select <4 x i1> [[TMP0]], <4 x i32> [[TMP23]], <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP25:%.*]] = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> [[TMP24]])
-; CHECK-NEXT:    [[TMP26]] = call i32 @llvm.umax.i32(i32 [[TMP25]], i32 [[VEC_PHI]])
+; CHECK-NEXT:    [[RDX_MINMAX]] = call i32 @llvm.umax.i32(i32 [[TMP25]], i32 [[VEC_PHI]])
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
-; CHECK-NEXT:    [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], 260
-; CHECK-NEXT:    br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
+; CHECK-NEXT:    [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], 260
+; CHECK-NEXT:    br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -1315,7 +1315,7 @@ define i32 @reduction_max(ptr nocapture %A, ptr nocapture %B) {
 ; CHECK:       for.body:
 ; CHECK-NEXT:    br i1 poison, label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP27:![0-9]+]]
 ; CHECK:       for.end:
-; CHECK-NEXT:    [[RESULT_0_LCSSA:%.*]] = phi i32 [ poison, [[FOR_BODY]] ], [ [[TMP26]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[RESULT_0_LCSSA:%.*]] = phi i32 [ poison, [[FOR_BODY]] ], [ [[RDX_MINMAX]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    ret i32 [[RESULT_0_LCSSA]]
 ;
 entry:
@@ -1351,25 +1351,25 @@ define float @reduction_conditional(ptr %A, ptr %B, ptr %C, float %S) {
 ; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x float> [ [[TMP0]], [[VECTOR_PH]] ], [ [[PREDPHI3:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[INDEX]]
-; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x float>, ptr [[TMP3]], align 4
-; CHECK-NEXT:    [[TMP5:%.*]] = fcmp ogt <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]]
-; CHECK-NEXT:    [[TMP6:%.*]] = fcmp ule <4 x float> [[WIDE_LOAD1]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
-; CHECK-NEXT:    [[TMP8:%.*]] = and <4 x i1> [[TMP5]], [[TMP6]]
-; CHECK-NEXT:    [[TMP7:%.*]] = fcmp ogt <4 x float> [[WIDE_LOAD]], <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
-; CHECK-NEXT:    [[TMP9:%.*]] = and <4 x i1> [[TMP8]], [[TMP7]]
-; CHECK-NEXT:    [[TMP10:%.*]] = xor <4 x i1> [[TMP7]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT:    [[TMP11:%.*]] = and <4 x i1> [[TMP8]], [[TMP10]]
-; CHECK-NEXT:    [[TMP12:%.*]] = xor <4 x i1> [[TMP5]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT:    [[PREDPHI_V:%.*]] = select <4 x i1> [[TMP9]], <4 x float> [[WIDE_LOAD1]], <4 x float> [[WIDE_LOAD]]
-; CHECK-NEXT:    [[TMP13:%.*]] = select <4 x i1> [[TMP12]], <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i1> [[TMP11]]
-; CHECK-NEXT:    [[PREDPHI2:%.*]] = select <4 x i1> [[TMP13]], <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, <4 x float> [[PREDPHI_V]]
-; CHECK-NEXT:    [[PREDPHI3]] = fadd fast <4 x float> [[VEC_PHI]], [[PREDPHI2]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = fcmp ogt <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = fcmp ule <4 x float> [[WIDE_LOAD1]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+; CHECK-NEXT:    [[TMP5:%.*]] = and <4 x i1> [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = fcmp ogt <4 x float> [[WIDE_LOAD]], <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
+; CHECK-NEXT:    [[TMP7:%.*]] = and <4 x i1> [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = xor <4 x i1> [[TMP6]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT:    [[TMP9:%.*]] = and <4 x i1> [[TMP5]], [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = xor <4 x i1> [[TMP3]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT:    [[PREDPHI_V:%.*]] = select <4 x i1> [[TMP7]], <4 x float> [[WIDE_LOAD1]], <4 x float> [[WIDE_LOAD]]
+; CHECK-NEXT:    [[PREDPHI:%.*]] = fadd fast <4 x float> [[VEC_PHI]], [[PREDPHI_V]]
+; CHECK-NEXT:    [[TMP11:%.*]] = select <4 x i1> [[TMP10]], <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i1> [[TMP9]]
+; CHECK-NEXT:    [[PREDPHI3]] = select <4 x i1> [[TMP11]], <4 x float> [[VEC_PHI]], <4 x float> [[PREDPHI]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128
-; CHECK-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128
+; CHECK-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP15:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[PREDPHI3]])
+; CHECK-NEXT:    [[TMP13:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[PREDPHI3]])
 ; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
@@ -1386,7 +1386,7 @@ define float @reduction_conditional(ptr %A, ptr %B, ptr %C, float %S) {
 ; CHECK:       for.inc:
 ; CHECK-NEXT:    br i1 poison, label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP29:![0-9]+]]
 ; CHECK:       for.end:
-; CHECK-NEXT:    [[SUM_1_LCSSA:%.*]] = phi float [ poison, [[FOR_INC]] ], [ [[TMP15]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[SUM_1_LCSSA:%.*]] = phi float [ poison, [[FOR_INC]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    ret float [[SUM_1_LCSSA]]
 ;
 entry:
diff --git a/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll b/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll
index d85241167d0cd8..204d021f06571b 100644
--- a/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll
+++ b/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll
@@ -690,16 +690,16 @@ define float @reduction_conditional(ptr %A, ptr %B, ptr %C, float %S) {
 ; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
 ; CHECK-NEXT:    [[TMP3:%.*]] = fcmp ogt <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = fcmp ule <4 x float> [[WIDE_LOAD1]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
-; CHECK-NEXT:    [[TMP6:%.*]] = and <4 x i1> [[TMP3]], [[TMP4]]
-; CHECK-NEXT:    [[TMP5:%.*]] = fcmp ogt <4 x float> [[WIDE_LOAD]], <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
-; CHECK-NEXT:    [[TMP7:%.*]] = and <4 x i1> [[TMP6]], [[TMP5]]
-; CHECK-NEXT:    [[TMP8:%.*]] = xor <4 x i1> [[TMP5]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT:    [[TMP9:%.*]] = and <4 x i1> [[TMP6]], [[TMP8]]
+; CHECK-NEXT:    [[TMP5:%.*]] = and <4 x i1> [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = fcmp ogt <4 x float> [[WIDE_LOAD]], <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
+; CHECK-NEXT:    [[TMP7:%.*]] = and <4 x i1> [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = xor <4 x i1> [[TMP6]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT:    [[TMP9:%.*]] = and <4 x i1> [[TMP5]], [[TMP8]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = xor <4 x i1> [[TMP3]], <i1 true, i1 true, i1 true, i1 true>
 ; CHECK-NEXT:    [[PREDPHI_V:%.*]] = select <4 x i1> [[TMP7]], <4 x float> [[WIDE_LOAD1]], <4 x float> [[WIDE_LOAD]]
+; CHECK-NEXT:    [[PREDPHI:%.*]] = fadd fast <4 x float> [[VEC_PHI]], [[PREDPHI_V]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = select <4 x i1> [[TMP10]], <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i1> [[TMP9]]
-; CHECK-NEXT:    [[PREDPHI2:%.*]] = select <4 x i1> [[TMP11]], <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, <4 x float> [[PREDPHI_V]]
-; CHECK-NEXT:    [[PREDPHI3]] = fadd fast <4 x float> [[VEC_PHI]], [[PREDPHI2]]
+; CHECK-NEXT:    [[PREDPHI3]] = select <4 x i1> [[TMP11]], <4 x float> [[VEC_PHI]], <4 x float> [[PREDPHI]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128
 ; CHECK-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
@@ -1354,9 +1354,8 @@ define i32 @predicated_or_dominates_reduction(ptr %b) {
 ; CHECK:       pred.load.continue6:
 ; CHECK-NEXT:    [[TMP43:%.*]] = phi <4 x i32> [ [[TMP37]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP42]], [[PRED_LOAD_IF5]] ]
 ; CHECK-NEXT:    [[TMP44:%.*]] = icmp ne <4 x i32> [[TMP43]], zeroinitializer
-; CHECK-NEXT:    [[TMP45:%.*]] = select <4 x i1> [[TMP19]], <4 x i1> [[TMP44]], <4 x i1> zeroinitializer
 ; CHECK-NEXT:    [[TMP46:%.*]] = xor <4 x i1> [[TMP19]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT:    [[TMP47:%.*]] = or <4 x i1> [[TMP45]], [[TMP46]]
+; CHECK-NEXT:    [[TMP47:%.*]] = select <4 x i1> [[TMP46]], <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i1> [[TMP44]]
 ; CHECK-NEXT:    [[TMP48:%.*]] = bitcast <4 x i1> [[TMP47]] to i4
 ; CHECK-NEXT:    [[TMP49:%.*]] = call i4 @llvm.ctpop.i4(i4 [[TMP48]]), !range [[RNG42:![0-9]+]]
 ; CHECK-NEXT:    [[TMP50:%.*]] = zext nneg i4 [[TMP49]] to i32
diff --git a/llvm/test/Transforms/LoopVectorize/reduction.ll b/llvm/test/Transforms/LoopVectorize/reduction.ll
index ba82bac6fad266..a47a38510eeeb3 100644
--- a/llvm/test/Transforms/LoopVectorize/reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/reduction.ll
@@ -761,16 +761,16 @@ define float @reduction_conditional(ptr %A, ptr %B, ptr %C, float %S) {
 ; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
 ; CHECK-NEXT:    [[TMP3:%.*]] = fcmp ogt <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = fcmp ule <4 x float> [[WIDE_LOAD1]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
-; CHECK-NEXT:    [[TMP6:%.*]] = and <4 x i1> [[TMP3]], [[TMP4]]
-; CHECK-NEXT:    [[TMP5:%.*]] = fcmp ogt <4 x float> [[WIDE_LOAD]], <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
-; CHECK-NEXT:    [[TMP7:%.*]] = and <4 x i1> [[TMP6]], [[TMP5]]
-; CHECK-NEXT:    [[TMP8:%.*]] = xor <4 x i1> [[TMP5]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT:    [[TMP9:%.*]] = and <4 x i1> [[TMP6]], [[TMP8]]
+; CHECK-NEXT:    [[TMP5:%.*]] = and <4 x i1> [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = fcmp ogt <4 x float> [[WIDE_LOAD]], <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
+; CHECK-NEXT:    [[TMP7:%.*]] = and <4 x i1> [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = xor <4 x i1> [[TMP6]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT:    [[TMP9:%.*]] = and <4 x i1> [[TMP5]], [[TMP8]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = xor <4 x i1> [[TMP3]], <i1 true, i1 true, i1 true, i1 true>
 ; CHECK-NEXT:    [[PREDPHI_V:%.*]] = select <4 x i1> [[TMP7]], <4 x float> [[WIDE_LOAD1]], <4 x float> [[WIDE_LOAD]]
+; CHECK-NEXT:    [[PREDPHI:%.*]] = fadd fast <4 x float> [[VEC_PHI]], [[PREDPHI_V]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = select <4 x i1> [[TMP10]], <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i1> [[TMP9]]
-; CHECK-NEXT:    [[PREDPHI2:%.*]] = select <4 x i1> [[TMP11]], <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, <4 x float> [[PREDPHI_V]]
-; CHECK-NEXT:    [[PREDPHI3]] = fadd fast <4 x float> [[VEC_PHI]], [[PREDPHI2]]
+; CHECK-NEXT:    [[PREDPHI3]] = select <4 x i1> [[TMP11]], <4 x float> [[VEC_PHI]], <4 x float> [[PREDPHI]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128
 ; CHECK-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/select-cmp-predicated.ll b/llvm/test/Transforms/LoopVectorize/select-cmp-predicated.ll
index cedc769b811fc6..e8c75635f2c69f 100644
--- a/llvm/test/Transforms/LoopVectorize/select-cmp-predicated.ll
+++ b/llvm/test/Transforms/LoopVectorize/select-cmp-predicated.ll
@@ -4,7 +4,7 @@
 define i32 @pred_select_const_i32_from_icmp(ptr noalias nocapture readonly %src1, ptr noalias nocapture readonly %src2, i64 %n) {
 ; CHECK-VF2IC1-LABEL: @pred_select_const_i32_from_icmp(
 ; CHECK-VF2IC1:       vector.body:
-; CHECK-VF2IC1:         [[VEC_PHI:%.*]] = phi <2 x i1> [ zeroinitializer, %vector.ph ], [ [[PREDPHI:%.*]], %pred.load.continue2 ]
+; CHECK-VF2IC1:         [[VEC_PHI:%.*]] = phi <2 x i32> [ zeroinitializer, %vector.ph ], [ [[PREDPHI:%.*]], %pred.load.continue2 ]
 ; CHECK-VF2IC1:         [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr {{%.*}}, align 4
 ; CHECK-VF2IC1-NEXT:    [[TMP4:%.*]] = icmp sgt <2 x i32> [[WIDE_LOAD]], <i32 35, i32 35>
 ; CHECK-VF2IC1-NEXT:    [[TMP5:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0
@@ -26,14 +26,14 @@ define i32 @pred_select_const_i32_from_icmp(ptr noalias nocapture readonly %src1
 ; CHECK-VF2IC1:       pred.load.continue2:
 ; CHECK-VF2IC1-NEXT:    [[TMP15:%.*]] = phi <2 x i32> [ [[TMP9]], %pred.load.continue ], [ [[TMP14]], %pred.load.if1 ]
 ; CHECK-VF2IC1-NEXT:    [[TMP16:%.*]] = icmp eq <2 x i32> [[TMP15]], <i32 2, i32 2>
-; CHECK-VF2IC1-NEXT:    [[TMP17:%.*]] = or <2 x i1> [[VEC_PHI]], [[TMP16]]
+; CHECK-VF2IC1-NEXT:    [[TMP17:%.*]] = select <2 x i1> [[TMP16]], <2 x i32> <i32 1, i32 1>, <2 x i32> [[VEC_PHI]]
 ; CHECK-VF2IC1-NEXT:    [[TMP18:%.*]] = xor <2 x i1> [[TMP4]], <i1 true, i1 true>
-; CHECK-VF2IC1-NEXT:    [[PREDPHI]] = select <2 x i1> [[TMP4]], <2 x i1> [[TMP17]], <2 x i1> [[VEC_PHI]]
+; CHECK-VF2IC1-NEXT:    [[PREDPHI]] = select <2 x i1> [[TMP4]], <2 x i32> [[TMP17]], <2 x i32> [[VEC_PHI]]
 ; CHECK-VF2IC1:         br i1 {{%.*}}, label %middle.block, label %vector.body
 ; CHECK-VF2IC1:       middle.block:
-; CHECK-VF2IC1-NEXT:    [[TMP20:%.*]] = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> [[PREDPHI]])
-; CHECK-VF2IC1-NEXT:    [[FR_TMP20:%.*]] = freeze i1 [[TMP20]]
-; CHECK-VF2IC1-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[FR_TMP20]], i32 1, i32 0
+; CHECK-VF2IC1-NEXT:    [[RDX_SELECT_CMP:%.*]] = icmp ne <2 x i32> [[PREDPHI]], zeroinitializer
+; CHECK-VF2IC1-NEXT:    [[TMP20:%.*]] = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> [[RDX_SELECT_CMP]])
+; CHECK-VF2IC1-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[TMP20]], i32 1, i32 0
 ; CHECK-VF2IC1:       scalar.ph:
 ; CHECK-VF2IC1:         [[BC_RESUME_VAL:%.*]] = phi i64 [ {{%.*}}, %middle.block ], [ 0, %entry ]
 ; CHECK-VF2IC1-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, %entry ], [ [[RDX_SELECT]], %middle.block ]
@@ -56,8 +56,8 @@ define i32 @pred_select_const_i32_from_icmp(ptr noalias nocapture readonly %src1
 ;
 ; CHECK-VF1IC2-LABEL: @pred_select_const_i32_from_icmp(
 ; CHECK-VF1IC2:       vector.body:
-; CHECK-VF1IC2:         [[VEC_PHI:%.*]] = phi i1 [ false, %vector.ph ], [ [[PREDPHI:%.*]], %pred.load.continue3 ]
-; CHECK-VF1IC2-NEXT:    [[VEC_PHI2:%.*]] = phi i1 [ false, %vector.ph ], [ [[PREDPHI5:%.*]], %pred.load.continue3 ]
+; CHECK-VF1IC2:         [[VEC_PHI:%.*]] = phi i32 [ 0, %vector.ph ], [ [[PREDPHI:%.*]], %pred.load.continue3 ]
+; CHECK-VF1IC2-NEXT:    [[VEC_PHI2:%.*]] = phi i32 [ 0, %vector.ph ], [ [[PREDPHI5:%.*]], %pred.load.continue3 ]
 ; CHECK-VF1IC2:         [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[SRC1:%.*]], i64 {{%.*}}
 ; CHECK-VF1IC2-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC1]], i64 {{%.*}}
 ; CHECK-VF1IC2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[TMP0]], align 4
@@ -80,17 +80,16 @@ define i32 @pred_select_const_i32_from_icmp(ptr noalias nocapture readonly %src1
 ; CHECK-VF1IC2-NEXT:    [[TMP11:%.*]] = phi i32 [ poison, %pred.load.continue ], [ [[TMP10]], %pred.load.if2 ]
 ; CHECK-VF1IC2-NEXT:    [[TMP12:%.*]] = icmp eq i32 [[TMP8]], 2
 ; CHECK-VF1IC2-NEXT:    [[TMP13:%.*]] = icmp eq i32 [[TMP11]], 2
-; CHECK-VF1IC2-NEXT:    [[TMP14:%.*]] = or i1 [[VEC_PHI]], [[TMP12]]
-; CHECK-VF1IC2-NEXT:    [[TMP15:%.*]] = or i1 [[VEC_PHI2]], [[TMP13]]
+; CHECK-VF1IC2-NEXT:    [[TMP14:%.*]] = select i1 [[TMP12]], i32 1, i32 [[VEC_PHI]]
+; CHECK-VF1IC2-NEXT:    [[TMP15:%.*]] = select i1 [[TMP13]], i32 1, i32 [[VEC_PHI2]]
 ; CHECK-VF1IC2-NEXT:    [[TMP16:%.*]] = xor i1 [[TMP4]], true
 ; CHECK-VF1IC2-NEXT:    [[TMP17:%.*]] = xor i1 [[TMP5]], true
-; CHECK-VF1IC2-NEXT:    [[PREDPHI]] = select i1 [[TMP4]], i1 [[TMP14]], i1 [[VEC_PHI]]
-; CHECK-VF1IC2-NEXT:    [[PREDPHI5]] = select i1 [[TMP5]], i1 [[TMP15]], i1 [[VEC_PHI2]]
+; CHECK-VF1IC2-NEXT:    [[PREDPHI]] = select i1 [[TMP4]], i32 [[TMP14]], i32 [[VEC_PHI]]
+; CHECK-VF1IC2-NEXT:    [[PREDPHI5]] = select i1 [[TMP5]], i32 [[TMP15]], i32 [[VEC_PHI2]]
 ; CHECK-VF1IC2:         br i1 {{%.*}}, label %middle.block, label %vector.body
 ; CHECK-VF1IC2:       middle.block:
-; CHECK-VF1IC2-NEXT:    [[OR:%.*]] = or i1 [[PREDPHI5]], [[PREDPHI]]
-; CHECK-VF1IC2-NEXT:    [[FR_OR:%.*]] = freeze i1 [[OR]]
-; CHECK-VF1IC2-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[FR_OR]], i32 1, i32 0
+; CHECK-VF1IC2-NEXT:    [[RDX_SELECT_CMP:%.*]] = icmp ne i32 [[PREDPHI]], 0
+; CHECK-VF1IC2-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i32 [[PREDPHI]], i32 [[PREDPHI5]]
 ; CHECK-VF1IC2:         br i1 {{%.*}}, label %for.end.loopexit, label %scalar.ph
 ; CHECK-VF1IC2:       scalar.ph:
 ; CHECK-VF1IC2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ {{%.*}}, %middle.block ], [ 0, %entry ]
diff --git a/llvm/test/Transforms/LoopVectorize/select-cmp.ll b/llvm/test/Transforms/LoopVectorize/select-cmp.ll
index 993b56a05207be..c9f2aaef6d5c8e 100644
--- a/llvm/test/Transforms/LoopVectorize/select-cmp.ll
+++ b/llvm/test/Transforms/LoopVectorize/select-cmp.ll
@@ -5,47 +5,45 @@
 define i32 @select_const_i32_from_icmp(ptr nocapture readonly %v, i64 %n) {
 ; CHECK-LABEL: @select_const_i32_from_icmp
 ; CHECK-VF4IC1:      vector.body:
-; CHECK-VF4IC1:        [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ]
+; CHECK-VF4IC1:        [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 3, i32 3, i32 3, i32 3>, %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ]
 ; CHECK-VF4IC1:        [[VEC_LOAD:%.*]] = load <4 x i32>
 ; CHECK-VF4IC1-NEXT:   [[VEC_ICMP:%.*]] = icmp eq <4 x i32> [[VEC_LOAD]], <i32 3, i32 3, i32 3, i32 3>
-; CHECK-VF4IC1-NEXT:   [[NOT:%.*]] = xor <4 x i1> [[VEC_ICMP]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-VF4IC1-NEXT:   [[VEC_SEL]] = or <4 x i1> [[VEC_PHI]], [[NOT]]
+; CHECK-VF4IC1-NEXT:   [[VEC_SEL]] = select <4 x i1> [[VEC_ICMP]], <4 x i32> [[VEC_PHI]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
 ; CHECK-VF4IC1:      middle.block:
-; CHECK-VF4IC1-NEXT:   [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[VEC_SEL]])
-; CHECK-VF4IC1-NEXT:   [[FR_OR_RDX:%.*]] = freeze i1 [[OR_RDX]]
-; CHECK-VF4IC1-NEXT:   {{.*}} = select i1 [[FR_OR_RDX]], i32 7, i32 3
+; CHECK-VF4IC1-NEXT:   [[FIN_ICMP:%.*]] = icmp ne <4 x i32> [[VEC_SEL]], <i32 3, i32 3, i32 3, i32 3>
+; CHECK-VF4IC1-NEXT:   [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[FIN_ICMP]])
+; CHECK-VF4IC1-NEXT:   {{.*}} = select i1 [[OR_RDX]], i32 7, i32 3
 
 ; CHECK-VF4IC4:      vector.body:
-; CHECK-VF4IC4:        [[VEC_PHI1:%.*]] = phi <4 x i1> [ zeroinitializer, %vector.ph ], [ [[VEC_SEL1:%.*]], %vector.body ]
-; CHECK-VF4IC4-NEXT:   [[VEC_PHI2:%.*]] = phi <4 x i1> [ zeroinitializer, %vector.ph ], [ [[VEC_SEL2:%.*]], %vector.body ]
-; CHECK-VF4IC4-NEXT:   [[VEC_PHI3:%.*]] = phi <4 x i1> [ zeroinitializer, %vector.ph ], [ [[VEC_SEL3:%.*]], %vector.body ]
-; CHECK-VF4IC4-NEXT:   [[VEC_PHI4:%.*]] = phi <4 x i1> [ zeroinitializer, %vector.ph ], [ [[VEC_SEL4:%.*]], %vector.body ]
+; CHECK-VF4IC4:        [[VEC_PHI1:%.*]] = phi <4 x i32> [ <i32 3, i32 3, i32 3, i32 3>, %vector.ph ], [ [[VEC_SEL1:%.*]], %vector.body ]
+; CHECK-VF4IC4-NEXT:   [[VEC_PHI2:%.*]] = phi <4 x i32> [ <i32 3, i32 3, i32 3, i32 3>, %vector.ph ], [ [[VEC_SEL2:%.*]], %vector.body ]
+; CHECK-VF4IC4-NEXT:   [[VEC_PHI3:%.*]] = phi <4 x i32> [ <i32 3, i32 3, i32 3, i32 3>, %vector.ph ], [ [[VEC_SEL3:%.*]], %vector.body ]
+; CHECK-VF4IC4-NEXT:   [[VEC_PHI4:%.*]] = phi <4 x i32> [ <i32 3, i32 3, i32 3, i32 3>, %vector.ph ], [ [[VEC_SEL4:%.*]], %vector.body ]
 ; CHECK-VF4IC4:        [[VEC_ICMP1:%.*]] = icmp eq <4 x i32> {{.*}}, <i32 3, i32 3, i32 3, i32 3>
 ; CHECK-VF4IC4-NEXT:   [[VEC_ICMP2:%.*]] = icmp eq <4 x i32> {{.*}}, <i32 3, i32 3, i32 3, i32 3>
 ; CHECK-VF4IC4-NEXT:   [[VEC_ICMP3:%.*]] = icmp eq <4 x i32> {{.*}}, <i32 3, i32 3, i32 3, i32 3>
 ; CHECK-VF4IC4-NEXT:   [[VEC_ICMP4:%.*]] = icmp eq <4 x i32> {{.*}}, <i32 3, i32 3, i32 3, i32 3>
-; CHECK-VF4IC4-NEXT:   [[NOT1:%.*]] = xor <4 x i1> [[VEC_ICMP1]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-VF4IC4-NEXT:   [[NOT2:%.*]] = xor <4 x i1> [[VEC_ICMP2]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-VF4IC4-NEXT:   [[NOT3:%.*]] = xor <4 x i1> [[VEC_ICMP3]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-VF4IC4-NEXT:   [[NOT4:%.*]] = xor <4 x i1> [[VEC_ICMP4]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-VF4IC4-NEXT:   [[VEC_SEL1:%.*]] = or <4 x i1> [[VEC_PHI1]], [[NOT1]]
-; CHECK-VF4IC4-NEXT:   [[VEC_SEL2:%.*]] = or <4 x i1> [[VEC_PHI2]], [[NOT2]]
-; CHECK-VF4IC4-NEXT:   [[VEC_SEL3:%.*]] = or <4 x i1> [[VEC_PHI3]], [[NOT3]]
-; CHECK-VF4IC4-NEXT:   [[VEC_SEL4:%.*]] = or <4 x i1> [[VEC_PHI4]], [[NOT4]]
+; CHECK-VF4IC4-NEXT:   [[VEC_SEL1:%.*]] = select <4 x i1> [[VEC_ICMP1]], <4 x i32> [[VEC_PHI1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+; CHECK-VF4IC4-NEXT:   [[VEC_SEL2:%.*]] = select <4 x i1> [[VEC_ICMP2]], <4 x i32> [[VEC_PHI2]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+; CHECK-VF4IC4-NEXT:   [[VEC_SEL3:%.*]] = select <4 x i1> [[VEC_ICMP3]], <4 x i32> [[VEC_PHI3]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+; CHECK-VF4IC4-NEXT:   [[VEC_SEL4:%.*]] = select <4 x i1> [[VEC_ICMP4]], <4 x i32> [[VEC_PHI4]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
 ; CHECK-VF4IC4:      middle.block:
-; CHECK-VF4IC4-NEXT:   [[VEC_SEL5:%.*]] = or <4 x i1>  [[VEC_SEL2]], [[VEC_SEL1]]
-; CHECK-VF4IC4-NEXT:   [[VEC_SEL6:%.*]] = or <4 x i1> [[VEC_SEL3]], [[VEC_SEL5]]
-; CHECK-VF4IC4-NEXT:   [[VEC_SEL7:%.*]] = or <4 x i1> [[VEC_SEL4]], [[VEC_SEL6]]
-; CHECK-VF4IC4-NEXT:   [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[VEC_SEL7]])
-; CHECK-VF4IC4-NEXT:   [[FR_OR_RDX:%.*]] = freeze i1 [[OR_RDX]]
-; CHECK-VF4IC4-NEXT:   {{.*}} = select i1 [[FR_OR_RDX]], i32 7, i32 3
+; CHECK-VF4IC4-NEXT:   [[VEC_ICMP5:%.*]] = icmp ne <4 x i32> [[VEC_SEL1]], <i32 3, i32 3, i32 3, i32 3>
+; CHECK-VF4IC4-NEXT:   [[VEC_SEL5:%.*]] = select <4 x i1> [[VEC_ICMP5]], <4 x i32> [[VEC_SEL1]], <4 x i32> [[VEC_SEL2]]
+; CHECK-VF4IC4-NEXT:   [[VEC_ICMP6:%.*]] = icmp ne <4 x i32> [[VEC_SEL5]], <i32 3, i32 3, i32 3, i32 3>
+; CHECK-VF4IC4-NEXT:   [[VEC_SEL6:%.*]] = select <4 x i1> [[VEC_ICMP6]], <4 x i32> [[VEC_SEL5]], <4 x i32> [[VEC_SEL3]]
+; CHECK-VF4IC4-NEXT:   [[VEC_ICMP7:%.*]] = icmp ne <4 x i32> [[VEC_SEL6]], <i32 3, i32 3, i32 3, i32 3>
+; CHECK-VF4IC4-NEXT:   [[VEC_SEL_FIN:%.*]] = select <4 x i1> [[VEC_ICMP7]], <4 x i32> [[VEC_SEL6]], <4 x i32> [[VEC_SEL4]]
+; CHECK-VF4IC4-NEXT:   [[FIN_ICMP:%.*]] = icmp ne <4 x i32> [[VEC_SEL_FIN]], <i32 3, i32 3, i32 3, i32 3>
+; CHECK-VF4IC4-NEXT:   [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[FIN_ICMP]])
+; CHECK-VF4IC4-NEXT:   {{.*}} = select i1 [[OR_RDX]], i32 7, i32 3
 
 
 ; CHECK-VF1IC4:      vector.body:
-; CHECK-VF1IC4:        [[VEC_PHI1:%.*]] = phi i1 [ false, %vector.ph ], [ [[VEC_SEL1:%.*]], %vector.body ]
-; CHECK-VF1IC4-NEXT:   [[VEC_PHI2:%.*]] = phi i1 [ false, %vector.ph ], [ [[VEC_SEL2:%.*]], %vector.body ]
-; CHECK-VF1IC4-NEXT:   [[VEC_PHI3:%.*]] = phi i1 [ false, %vector.ph ], [ [[VEC_SEL3:%.*]], %vector.body ]
-; CHECK-VF1IC4-NEXT:   [[VEC_PHI4:%.*]] = phi i1 [ false, %vector.ph ], [ [[VEC_SEL4:%.*]], %vector.body ]
+; CHECK-VF1IC4:        [[VEC_PHI1:%.*]] = phi i32 [ 3, %vector.ph ], [ [[VEC_SEL1:%.*]], %vector.body ]
+; CHECK-VF1IC4-NEXT:   [[VEC_PHI2:%.*]] = phi i32 [ 3, %vector.ph ], [ [[VEC_SEL2:%.*]], %vector.body ]
+; CHECK-VF1IC4-NEXT:   [[VEC_PHI3:%.*]] = phi i32 [ 3, %vector.ph ], [ [[VEC_SEL3:%.*]], %vector.body ]
+; CHECK-VF1IC4-NEXT:   [[VEC_PHI4:%.*]] = phi i32 [ 3, %vector.ph ], [ [[VEC_SEL4:%.*]], %vector.body ]
 ; CHECK-VF1IC4:        [[VEC_LOAD1:%.*]] = load i32
 ; CHECK-VF1IC4-NEXT:   [[VEC_LOAD2:%.*]] = load i32
 ; CHECK-VF1IC4-NEXT:   [[VEC_LOAD3:%.*]] = load i32
@@ -54,20 +52,17 @@ define i32 @select_const_i32_from_icmp(ptr nocapture readonly %v, i64 %n) {
 ; CHECK-VF1IC4-NEXT:   [[VEC_ICMP2:%.*]] = icmp eq i32 [[VEC_LOAD2]], 3
 ; CHECK-VF1IC4-NEXT:   [[VEC_ICMP3:%.*]] = icmp eq i32 [[VEC_LOAD3]], 3
 ; CHECK-VF1IC4-NEXT:   [[VEC_ICMP4:%.*]] = icmp eq i32 [[VEC_LOAD4]], 3
-; CHECK-VF1IC4-NEXT:   [[NOT1:%.*]] = xor i1 [[VEC_ICMP1]], true
-; CHECK-VF1IC4-NEXT:   [[NOT2:%.*]] = xor i1 [[VEC_ICMP2]], true
-; CHECK-VF1IC4-NEXT:   [[NOT3:%.*]] = xor i1 [[VEC_ICMP3]], true
-; CHECK-VF1IC4-NEXT:   [[NOT4:%.*]] = xor i1 [[VEC_ICMP4]], true
-; CHECK-VF1IC4-NEXT:   [[VEC_SEL1:%.*]] = or i1 [[VEC_PHI1]], [[NOT1]]
-; CHECK-VF1IC4-NEXT:   [[VEC_SEL2:%.*]] = or i1 [[VEC_PHI2]], [[NOT2]]
-; CHECK-VF1IC4-NEXT:   [[VEC_SEL3:%.*]] = or i1 [[VEC_PHI3]], [[NOT3]]
-; CHECK-VF1IC4-NEXT:   [[VEC_SEL4:%.*]] = or i1 [[VEC_PHI4]], [[NOT4]]
+; CHECK-VF1IC4-NEXT:   [[VEC_SEL1]] = select i1 [[VEC_ICMP1]], i32 [[VEC_PHI1]], i32 7
+; CHECK-VF1IC4-NEXT:   [[VEC_SEL2]] = select i1 [[VEC_ICMP2]], i32 [[VEC_PHI2]], i32 7
+; CHECK-VF1IC4-NEXT:   [[VEC_SEL3]] = select i1 [[VEC_ICMP3]], i32 [[VEC_PHI3]], i32 7
+; CHECK-VF1IC4-NEXT:   [[VEC_SEL4]] = select i1 [[VEC_ICMP4]], i32 [[VEC_PHI4]], i32 7
 ; CHECK-VF1IC4:      middle.block:
-; CHECK-VF1IC4-NEXT:   [[VEC_SEL5:%.*]] = or i1 [[VEC_SEL2]], [[VEC_SEL1]]
-; CHECK-VF1IC4-NEXT:   [[VEC_SEL6:%.*]] = or i1 [[VEC_SEL3]], [[VEC_SEL5]]
-; CHECK-VF1IC4-NEXT:   [[OR_RDX:%.*]] = or i1  [[VEC_SEL4]], [[VEC_SEL6]]
-; CHECK-VF1IC4-NEXT:   [[FR_OR_RDX:%.*]] = freeze i1 [[OR_RDX]]
-; CHECK-VF1IC4-NEXT:   {{.*}} = select i1 [[FR_OR_RDX]], i32 7, i32 3
+; CHECK-VF1IC4-NEXT:   [[VEC_ICMP4:%.*]] = icmp ne i32 [[VEC_SEL1]], 3
+; CHECK-VF1IC4-NEXT:   [[VEC_SEL5:%.*]] = select i1 [[VEC_ICMP4]], i32 [[VEC_SEL1]], i32 [[VEC_SEL2]]
+; CHECK-VF1IC4-NEXT:   [[VEC_ICMP5:%.*]] = icmp ne i32 [[VEC_SEL5]], 3
+; CHECK-VF1IC4-NEXT:   [[VEC_SEL6:%.*]] = select i1 [[VEC_ICMP5]], i32 [[VEC_SEL5]], i32 [[VEC_SEL3]]
+; CHECK-VF1IC4-NEXT:   [[VEC_ICMP6:%.*]] = icmp ne i32 [[VEC_SEL6]], 3
+; CHECK-VF1IC4-NEXT:   {{.*}} = select i1 [[VEC_ICMP6]], i32 [[VEC_SEL6]], i32 [[VEC_SEL4]]
 
 entry:
   br label %for.body
@@ -91,14 +86,14 @@ exit:                                     ; preds = %for.body
 define i32 @select_const_i32_from_icmp2(ptr nocapture readonly %v, i64 %n) {
 ; CHECK-LABEL: @select_const_i32_from_icmp2
 ; CHECK-VF4IC1:      vector.body:
-; CHECK-VF4IC1:        [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ]
+; CHECK-VF4IC1:        [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 3, i32 3, i32 3, i32 3>, %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ]
 ; CHECK-VF4IC1:        [[VEC_LOAD:%.*]] = load <4 x i32>
 ; CHECK-VF4IC1-NEXT:   [[VEC_ICMP:%.*]] = icmp eq <4 x i32> [[VEC_LOAD]], <i32 3, i32 3, i32 3, i32 3>
-; CHECK-VF4IC1-NEXT:   [[VEC_SEL]] = or <4 x i1> [[VEC_PHI]], [[VEC_ICMP]]
+; CHECK-VF4IC1-NEXT:   [[VEC_SEL]] = select <4 x i1> [[VEC_ICMP]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>, <4 x i32> [[VEC_PHI]]
 ; CHECK-VF4IC1:      middle.block:
-; CHECK-VF4IC1-NEXT:   [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[VEC_SEL]])
-; CHECK-VF4IC1-NEXT:   [[FR_OR_RDX:%.*]] = freeze i1 [[OR_RDX]]
-; CHECK-VF4IC1-NEXT:   {{.*}} = select i1 [[FR_OR_RDX]], i32 7, i32 3
+; CHECK-VF4IC1-NEXT:   [[FIN_ICMP:%.*]] = icmp ne <4 x i32> [[VEC_SEL]], <i32 3, i32 3, i32 3, i32 3>
+; CHECK-VF4IC1-NEXT:   [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[FIN_ICMP]])
+; CHECK-VF4IC1-NEXT:   {{.*}} = select i1 [[OR_RDX]], i32 7, i32 3
 
 entry:
   br label %for.body
@@ -122,18 +117,21 @@ exit:                                     ; preds = %for.body
 define i32 @select_i32_from_icmp(ptr nocapture readonly %v, i32 %a, i32 %b, i64 %n) {
 ; CHECK-LABEL: @select_i32_from_icmp
 ; CHECK-VF4IC1:      vector.ph:
-; CHECK-VF4IC1-NOT:    shufflevector <4 x i32>
-; CHECK-VF4IC1-NOT:    shufflevector <4 x i32>
+; CHECK-VF4IC1:        [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 %a, i64 0
+; CHECK-VF4IC1-NEXT:   [[SPLAT_OF_A:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-VF4IC1-NEXT:   [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 %b, i64 0
+; CHECK-VF4IC1-NEXT:   [[SPLAT_OF_B:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; CHECK-VF4IC1:      vector.body:
-; CHECK-VF4IC1:        [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ]
+; CHECK-VF4IC1:        [[VEC_PHI:%.*]] = phi <4 x i32> [ [[SPLAT_OF_A]], %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ]
 ; CHECK-VF4IC1:        [[VEC_LOAD:%.*]] = load <4 x i32>
 ; CHECK-VF4IC1-NEXT:   [[VEC_ICMP:%.*]] = icmp eq <4 x i32> [[VEC_LOAD]], <i32 3, i32 3, i32 3, i32 3>
-; CHECK-VF4IC1-NEXT:   [[NOT:%.*]] = xor <4 x i1> [[VEC_ICMP]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-VF4IC1-NEXT:   [[VEC_SEL]] = or <4 x i1> [[VEC_PHI]], [[NOT]]
+; CHECK-VF4IC1-NEXT:   [[VEC_SEL]] = select <4 x i1> [[VEC_ICMP]], <4 x i32> [[VEC_PHI]], <4 x i32> [[SPLAT_OF_B]]
 ; CHECK-VF4IC1:      middle.block:
-; CHECK-VF4IC1-NEXT:   [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[VEC_SEL]])
-; CHECK-VF4IC1-NEXT:   [[FR_OR_RDX:%.*]] = freeze i1 [[OR_RDX]]
-; CHECK-VF4IC1-NEXT:   {{.*}} = select i1 [[FR_OR_RDX]], i32 %b, i32 %a
+; CHECK-VF4IC1-NEXT:   [[FIN_INS:%.*]] = insertelement <4 x i32> poison, i32 %a, i64 0
+; CHECK-VF4IC1-NEXT:   [[FIN_SPLAT:%.*]] = shufflevector <4 x i32> [[FIN_INS]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-VF4IC1-NEXT:   [[FIN_CMP:%.*]] = icmp ne <4 x i32> [[VEC_SEL]], [[FIN_SPLAT]]
+; CHECK-VF4IC1-NEXT:   [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[FIN_CMP]])
+; CHECK-VF4IC1-NEXT:   {{.*}} = select i1 [[OR_RDX]], i32 %b, i32 %a
 entry:
   br label %for.body
 
@@ -156,15 +154,14 @@ exit:                                     ; preds = %for.body
 define i32 @select_const_i32_from_fcmp_fast(ptr nocapture readonly %v, i64 %n) {
 ; CHECK-LABEL: @select_const_i32_from_fcmp_fast
 ; CHECK-VF4IC1:      vector.body:
-; CHECK-VF4IC1:        [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ]
+; CHECK-VF4IC1:        [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 2, i32 2, i32 2, i32 2>, %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ]
 ; CHECK-VF4IC1:        [[VEC_LOAD:%.*]] = load <4 x float>
 ; CHECK-VF4IC1-NEXT:   [[VEC_FCMP:%.*]] = fcmp fast ueq <4 x float> [[VEC_LOAD]], <float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00>
-; CHECK-VF4IC1-NEXT:   [[NOT:%.*]] = xor <4 x i1> [[VEC_FCMP]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-VF4IC1-NEXT:   [[VEC_SEL]] = or <4 x i1> [[VEC_PHI]], [[NOT]]
+; CHECK-VF4IC1-NEXT:   [[VEC_SEL]] = select <4 x i1> [[VEC_FCMP]], <4 x i32> [[VEC_PHI]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
 ; CHECK-VF4IC1:      middle.block:
-; CHECK-VF4IC1-NEXT:   [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[VEC_SEL]])
-; CHECK-VF4IC1-NEXT:   [[FR_OR_RDX:%.*]] = freeze i1 [[OR_RDX]]
-; CHECK-VF4IC1-NEXT:   {{.*}} = select i1 [[FR_OR_RDX]], i32 1, i32 2
+; CHECK-VF4IC1-NEXT:   [[FIN_ICMP:%.*]] = icmp ne <4 x i32> [[VEC_SEL]], <i32 2, i32 2, i32 2, i32 2>
+; CHECK-VF4IC1-NEXT:   [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[FIN_ICMP]])
+; CHECK-VF4IC1-NEXT:   {{.*}} = select i1 [[OR_RDX]], i32 1, i32 2
 entry:
   br label %for.body
 
@@ -187,15 +184,14 @@ exit:                                     ; preds = %for.body
 define i32 @select_const_i32_from_fcmp(ptr nocapture readonly %v, i64 %n) {
 ; CHECK-LABEL: @select_const_i32_from_fcmp
 ; CHECK-VF4IC1:      vector.body:
-; CHECK-VF4IC1:        [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ]
+; CHECK-VF4IC1:        [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 2, i32 2, i32 2, i32 2>, %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ]
 ; CHECK-VF4IC1:        [[VEC_LOAD:%.*]] = load <4 x float>
 ; CHECK-VF4IC1-NEXT:   [[VEC_FCMP:%.*]] = fcmp ueq <4 x float> [[VEC_LOAD]], <float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00>
-; CHECK-VF4IC1-NEXT:   [[NOT:%.*]] = xor <4 x i1> [[VEC_FCMP]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-VF4IC1-NEXT:   [[VEC_SEL]] = or <4 x i1> [[VEC_PHI]], [[NOT]]
+; CHECK-VF4IC1-NEXT:   [[VEC_SEL]] = select <4 x i1> [[VEC_FCMP]], <4 x i32> [[VEC_PHI]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
 ; CHECK-VF4IC1:      middle.block:
-; CHECK-VF4IC1-NEXT:   [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[VEC_SEL]])
-; CHECK-VF4IC1-NEXT:   [[FR_OR_RDX:%.*]] = freeze i1 [[OR_RDX]]
-; CHECK-VF4IC1-NEXT:   {{.*}} = select i1 [[FR_OR_RDX]], i32 1, i32 2
+; CHECK-VF4IC1-NEXT:   [[FIN_ICMP:%.*]] = icmp ne <4 x i32> [[VEC_SEL]], <i32 2, i32 2, i32 2, i32 2>
+; CHECK-VF4IC1-NEXT:   [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[FIN_ICMP]])
+; CHECK-VF4IC1-NEXT:   {{.*}} = select i1 [[OR_RDX]], i32 1, i32 2
 entry:
   br label %for.body
 
@@ -220,16 +216,18 @@ define i32 @select_i32_from_icmp_same_inputs(i32 %a, i32 %b, i64 %n) {
 ; CHECK-VF4IC1:      vector.ph:
 ; CHECK-VF4IC1:        [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 %a, i64 0
 ; CHECK-VF4IC1-NEXT:   [[SPLAT_OF_A:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-VF4IC1-NOT:   [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 %b, i64 0
+; CHECK-VF4IC1-NEXT:   [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 %b, i64 0
+; CHECK-VF4IC1-NEXT:   [[SPLAT_OF_B:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; CHECK-VF4IC1:      vector.body:
-; CHECK-VF4IC1:        [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ]
-; CHECK-VF4IC1:        [[VEC_ICMP:%.*]] = icmp eq <4 x i32> [[SPLAT_OF_A]], <i32 3, i32 3, i32 3, i32 3>
-; CHECK-VF4IC1-NEXT:   [[NOT:%.*]] = xor <4 x i1> [[VEC_ICMP]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-VF4IC1-NEXT:   [[VEC_SEL]] = or <4 x i1> [[VEC_PHI]], [[NOT]]
+; CHECK-VF4IC1:        [[VEC_PHI:%.*]] = phi <4 x i32> [ [[SPLAT_OF_A]], %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ]
+; CHECK-VF4IC1:        [[VEC_ICMP:%.*]] = icmp eq <4 x i32> [[VEC_PHI]], <i32 3, i32 3, i32 3, i32 3>
+; CHECK-VF4IC1-NEXT:   [[VEC_SEL]] = select <4 x i1> [[VEC_ICMP]], <4 x i32> [[VEC_PHI]], <4 x i32> [[SPLAT_OF_B]]
 ; CHECK-VF4IC1:      middle.block:
-; CHECK-VF4IC1-NEXT:   [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[VEC_SEL]])
-; CHECK-VF4IC1-NEXT:   [[FR_OR_RDX:%.*]] = freeze i1 [[OR_RDX]]
-; CHECK-VF4IC1-NEXT:   {{.*}} = select i1 [[FR_OR_RDX]], i32 %b, i32 %a
+; CHECK-VF4IC1-NEXT:   [[FIN_INS:%.*]] = insertelement <4 x i32> poison, i32 %a, i64 0
+; CHECK-VF4IC1-NEXT:   [[FIN_SPLAT:%.*]] = shufflevector <4 x i32> [[FIN_INS]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-VF4IC1-NEXT:   [[FIN_CMP:%.*]] = icmp ne <4 x i32> [[VEC_SEL]], [[FIN_SPLAT]]
+; CHECK-VF4IC1-NEXT:   [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[FIN_CMP]])
+; CHECK-VF4IC1-NEXT:   {{.*}} = select i1 [[OR_RDX]], i32 %b, i32 %a
 entry:
   br label %for.body
 
diff --git a/llvm/test/Transforms/LoopVectorize/select-reduction-start-value-may-be-undef-or-poison.ll b/llvm/test/Transforms/LoopVectorize/select-reduction-start-value-may-be-undef-or-poison.ll
index 55e61158a79c61..16ab45415b5cc5 100644
--- a/llvm/test/Transforms/LoopVectorize/select-reduction-start-value-may-be-undef-or-poison.ll
+++ b/llvm/test/Transforms/LoopVectorize/select-reduction-start-value-may-be-undef-or-poison.ll
@@ -8,25 +8,26 @@ define i64 @pr62565_incoming_value_known_undef(i64 %a, ptr %src) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[A]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <2 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <2 x i64> [ undef, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i32 1, [[INDEX]]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[OFFSET_IDX]], 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP0]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP2]], align 4
 ; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq <2 x i32> [[WIDE_LOAD]], <i32 1, i32 1>
-; CHECK-NEXT:    [[NOT:%*]] = xor <2 x i1> [[TMP3]], <i1 true, i1 true>
-; CHECK-NEXT:    [[TMP4]] = or <2 x i1> [[VEC_PHI]], [[NOT]]
+; CHECK-NEXT:    [[TMP4]] = select <2 x i1> [[TMP3]], <2 x i64> [[VEC_PHI]], <2 x i64> [[BROADCAST_SPLAT]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
 ; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 32
 ; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> [[TMP4]])
-; CHECK-NEXT:    [[FR_TMP6:%.*]] = freeze i1 [[TMP6]]
-; CHECK-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[FR_TMP6]], i64 [[A]], i64 undef
+; CHECK-NEXT:    [[RDX_SELECT_CMP:%.*]] = icmp ne <2 x i64> [[TMP4]], undef
+; CHECK-NEXT:    [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> [[RDX_SELECT_CMP]])
+; CHECK-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[TMP6]], i64 [[A]], i64 undef
 ; CHECK-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 33, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ]
@@ -71,25 +72,26 @@ define i64 @pr62565_incoming_value_known_poison(i64 %a, ptr %src) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[A]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <2 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <2 x i64> [ poison, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i32 1, [[INDEX]]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[OFFSET_IDX]], 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP0]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP2]], align 4
 ; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq <2 x i32> [[WIDE_LOAD]], <i32 1, i32 1>
-; CHECK-NEXT:    [[NOT:%.*]] = xor <2 x i1> [[TMP3]], <i1 true, i1 true>
-; CHECK-NEXT:    [[TMP4]] = or <2 x i1> [[VEC_PHI]], [[NOT]]
+; CHECK-NEXT:    [[TMP4]] = select <2 x i1> [[TMP3]], <2 x i64> [[VEC_PHI]], <2 x i64> [[BROADCAST_SPLAT]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
 ; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 32
 ; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> [[TMP4]])
-; CHECK-NEXT:    [[FR_TMP6:%.*]] = freeze i1 [[TMP6]]
-; CHECK-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[FR_TMP6]], i64 [[A]], i64 poison
+; CHECK-NEXT:    [[RDX_SELECT_CMP:%.*]] = icmp ne <2 x i64> [[TMP4]], poison
+; CHECK-NEXT:    [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> [[RDX_SELECT_CMP]])
+; CHECK-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[TMP6]], i64 [[A]], i64 poison
 ; CHECK-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 33, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ]
@@ -134,25 +136,30 @@ define i64 @pr62565_incoming_value_may_be_poison(i64 %a, ptr %src, i64 %start) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
+; CHECK-NEXT:    [[MINMAX_IDENT_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[START]], i64 0
+; CHECK-NEXT:    [[MINMAX_IDENT_SPLAT:%.*]] = shufflevector <2 x i64> [[MINMAX_IDENT_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[A]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <2 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <2 x i64> [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i32 1, [[INDEX]]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[OFFSET_IDX]], 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP0]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP2]], align 4
 ; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq <2 x i32> [[WIDE_LOAD]], <i32 1, i32 1>
-; CHECK-NEXT:    [[NOT:%.*]] = xor <2 x i1> [[TMP3]], <i1 true, i1 true>
-; CHECK-NEXT:    [[TMP4]] = or <2 x i1> [[VEC_PHI]], [[NOT]]
+; CHECK-NEXT:    [[TMP4]] = select <2 x i1> [[TMP3]], <2 x i64> [[VEC_PHI]], <2 x i64> [[BROADCAST_SPLAT]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
 ; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 32
 ; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> [[TMP4]])
-; CHECK-NEXT:    [[FR_TMP6:%.*]] = freeze i1 [[TMP6]]
-; CHECK-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[FR_TMP6]], i64 [[A]], i64 [[START]]
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[START]], i64 0
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <2 x i64> [[DOTSPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[RDX_SELECT_CMP:%.*]] = icmp ne <2 x i64> [[TMP4]], [[DOTSPLAT]]
+; CHECK-NEXT:    [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> [[RDX_SELECT_CMP]])
+; CHECK-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[TMP6]], i64 [[A]], i64 [[START]]
 ; CHECK-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 33, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ]
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-lifetime-ends.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-lifetime-ends.ll
index ef8665b7969097..bdd0c6f728ae78 100644
--- a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-lifetime-ends.ll
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-lifetime-ends.ll
@@ -6,15 +6,11 @@ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
 ; Tests to make sure no loads are introduced after a lifetime.end by multiply
 ; fusion.
 
-; FIXME: Currently the tests are mis-compiled, with loads being introduced after
-;       llvm.lifetime.end calls.
-
 define void @lifetime_for_first_arg_before_multiply(ptr noalias %B, ptr noalias %C) {
 ; CHECK-LABEL: @lifetime_for_first_arg_before_multiply(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[A:%.*]] = alloca <4 x double>, align 32
 ; CHECK-NEXT:    call void @init(ptr [[A]])
-; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 -1, ptr [[A]])
 ; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr double, ptr [[A]], i64 0
 ; CHECK-NEXT:    [[COL_LOAD:%.*]] = load <2 x double>, ptr [[TMP0]], align 8
 ; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr double, ptr [[TMP0]], i64 2
@@ -77,6 +73,7 @@ define void @lifetime_for_first_arg_before_multiply(ptr noalias %B, ptr noalias
 ; CHECK-NEXT:    store <2 x double> [[TMP13]], ptr [[TMP26]], align 8
 ; CHECK-NEXT:    [[VEC_GEP28:%.*]] = getelementptr double, ptr [[TMP26]], i64 2
 ; CHECK-NEXT:    store <2 x double> [[TMP25]], ptr [[VEC_GEP28]], align 8
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 -1, ptr [[A]])
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -95,7 +92,6 @@ define void @lifetime_for_second_arg_before_multiply(ptr noalias %A, ptr noalias
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[B:%.*]] = alloca <4 x double>, align 32
 ; CHECK-NEXT:    call void @init(ptr [[B]])
-; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 -1, ptr [[B]])
 ; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr double, ptr [[A:%.*]], i64 0
 ; CHECK-NEXT:    [[COL_LOAD:%.*]] = load <2 x double>, ptr [[TMP0]], align 8
 ; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr double, ptr [[TMP0]], i64 2
@@ -158,6 +154,7 @@ define void @lifetime_for_second_arg_before_multiply(ptr noalias %A, ptr noalias
 ; CHECK-NEXT:    store <2 x double> [[TMP13]], ptr [[TMP26]], align 8
 ; CHECK-NEXT:    [[VEC_GEP28:%.*]] = getelementptr double, ptr [[TMP26]], i64 2
 ; CHECK-NEXT:    store <2 x double> [[TMP25]], ptr [[VEC_GEP28]], align 8
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 -1, ptr [[B]])
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -177,7 +174,6 @@ define void @lifetime_for_first_arg_before_multiply_load_from_offset(ptr noalias
 ; CHECK-NEXT:    [[A:%.*]] = alloca <8 x double>, align 64
 ; CHECK-NEXT:    call void @init(ptr [[A]])
 ; CHECK-NEXT:    [[GEP_8:%.*]] = getelementptr i8, ptr [[A]], i64 8
-; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 -1, ptr [[A]])
 ; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr double, ptr [[GEP_8]], i64 0
 ; CHECK-NEXT:    [[COL_LOAD:%.*]] = load <2 x double>, ptr [[TMP0]], align 8
 ; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr double, ptr [[TMP0]], i64 2
@@ -240,6 +236,7 @@ define void @lifetime_for_first_arg_before_multiply_load_from_offset(ptr noalias
 ; CHECK-NEXT:    store <2 x double> [[TMP13]], ptr [[TMP26]], align 8
 ; CHECK-NEXT:    [[VEC_GEP28:%.*]] = getelementptr double, ptr [[TMP26]], i64 2
 ; CHECK-NEXT:    store <2 x double> [[TMP25]], ptr [[VEC_GEP28]], align 8
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 -1, ptr [[A]])
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -261,7 +258,6 @@ define void @lifetime_for_first_arg_before_multiply_lifetime_does_not_dominate(p
 ; CHECK-NEXT:    call void @init(ptr [[A]])
 ; CHECK-NEXT:    br i1 [[C:%.*]], label [[THEN:%.*]], label [[EXIT:%.*]]
 ; CHECK:       then:
-; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 -1, ptr [[A]])
 ; CHECK-NEXT:    br label [[EXIT]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr double, ptr [[A]], i64 0
@@ -352,7 +348,6 @@ define void @lifetime_for_second_arg_before_multiply_lifetime_does_not_dominate(
 ; CHECK-NEXT:    call void @init(ptr [[B]])
 ; CHECK-NEXT:    br i1 [[C:%.*]], label [[THEN:%.*]], label [[EXIT:%.*]]
 ; CHECK:       then:
-; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 -1, ptr [[B]])
 ; CHECK-NEXT:    br label [[EXIT]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr double, ptr [[A:%.*]], i64 0
@@ -441,10 +436,9 @@ define void @lifetime_for_ptr_first_arg_before_multiply(ptr noalias %A, ptr noal
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br i1 [[C:%.*]], label [[THEN:%.*]], label [[EXIT:%.*]]
 ; CHECK:       then:
-; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 -1, ptr [[A:%.*]])
 ; CHECK-NEXT:    br label [[EXIT]]
 ; CHECK:       exit:
-; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr double, ptr [[A]], i64 0
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr double, ptr [[A:%.*]], i64 0
 ; CHECK-NEXT:    [[COL_LOAD:%.*]] = load <2 x double>, ptr [[TMP0]], align 8
 ; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr double, ptr [[TMP0]], i64 2
 ; CHECK-NEXT:    [[COL_LOAD1:%.*]] = load <2 x double>, ptr [[VEC_GEP]], align 8
@@ -528,15 +522,104 @@ define void @lifetime_for_both_ptr_args_before_multiply(ptr noalias %A, ptr noal
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br i1 [[C:%.*]], label [[THEN:%.*]], label [[EXIT:%.*]]
 ; CHECK:       then:
-; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 -1, ptr [[B:%.*]])
-; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 -1, ptr [[A:%.*]])
 ; CHECK-NEXT:    br label [[EXIT]]
 ; CHECK:       exit:
-; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr double, ptr [[A]], i64 0
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr double, ptr [[A:%.*]], i64 0
 ; CHECK-NEXT:    [[COL_LOAD:%.*]] = load <2 x double>, ptr [[TMP0]], align 8
 ; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr double, ptr [[TMP0]], i64 2
 ; CHECK-NEXT:    [[COL_LOAD1:%.*]] = load <2 x double>, ptr [[VEC_GEP]], align 8
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr double, ptr [[B]], i64 0
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr double, ptr [[B:%.*]], i64 0
+; CHECK-NEXT:    [[COL_LOAD2:%.*]] = load <2 x double>, ptr [[TMP1]], align 8
+; CHECK-NEXT:    [[VEC_GEP3:%.*]] = getelementptr double, ptr [[TMP1]], i64 2
+; CHECK-NEXT:    [[COL_LOAD4:%.*]] = load <2 x double>, ptr [[VEC_GEP3]], align 8
+; CHECK-NEXT:    [[BLOCK:%.*]] = shufflevector <2 x double> [[COL_LOAD]], <2 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[COL_LOAD2]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT:%.*]] = insertelement <1 x double> poison, double [[TMP2]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLAT:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP3:%.*]] = fmul contract <1 x double> [[BLOCK]], [[SPLAT_SPLAT]]
+; CHECK-NEXT:    [[BLOCK5:%.*]] = shufflevector <2 x double> [[COL_LOAD1]], <2 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x double> [[COL_LOAD2]], i64 1
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT6:%.*]] = insertelement <1 x double> poison, double [[TMP4]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLAT7:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT6]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK5]], <1 x double> [[SPLAT_SPLAT7]], <1 x double> [[TMP3]])
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <1 x double> [[TMP5]], <1 x double> poison, <2 x i32> <i32 0, i32 poison>
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x double> zeroinitializer, <2 x double> [[TMP6]], <2 x i32> <i32 2, i32 1>
+; CHECK-NEXT:    [[BLOCK8:%.*]] = shufflevector <2 x double> [[TMP7]], <2 x double> poison, <1 x i32> <i32 1>
+; CHECK-NEXT:    [[BLOCK9:%.*]] = shufflevector <2 x double> [[COL_LOAD]], <2 x double> poison, <1 x i32> <i32 1>
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x double> [[COL_LOAD2]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT10:%.*]] = insertelement <1 x double> poison, double [[TMP8]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLAT11:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT10]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = fmul contract <1 x double> [[BLOCK9]], [[SPLAT_SPLAT11]]
+; CHECK-NEXT:    [[BLOCK12:%.*]] = shufflevector <2 x double> [[COL_LOAD1]], <2 x double> poison, <1 x i32> <i32 1>
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x double> [[COL_LOAD2]], i64 1
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT13:%.*]] = insertelement <1 x double> poison, double [[TMP10]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLAT14:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT13]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP11:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK12]], <1 x double> [[SPLAT_SPLAT14]], <1 x double> [[TMP9]])
+; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <1 x double> [[TMP11]], <1 x double> poison, <2 x i32> <i32 0, i32 poison>
+; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <2 x double> [[TMP7]], <2 x double> [[TMP12]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[BLOCK15:%.*]] = shufflevector <2 x double> [[COL_LOAD]], <2 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <2 x double> [[COL_LOAD4]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT16:%.*]] = insertelement <1 x double> poison, double [[TMP14]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLAT17:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT16]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP15:%.*]] = fmul contract <1 x double> [[BLOCK15]], [[SPLAT_SPLAT17]]
+; CHECK-NEXT:    [[BLOCK18:%.*]] = shufflevector <2 x double> [[COL_LOAD1]], <2 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <2 x double> [[COL_LOAD4]], i64 1
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT19:%.*]] = insertelement <1 x double> poison, double [[TMP16]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLAT20:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT19]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP17:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK18]], <1 x double> [[SPLAT_SPLAT20]], <1 x double> [[TMP15]])
+; CHECK-NEXT:    [[TMP18:%.*]] = shufflevector <1 x double> [[TMP17]], <1 x double> poison, <2 x i32> <i32 0, i32 poison>
+; CHECK-NEXT:    [[TMP19:%.*]] = shufflevector <2 x double> zeroinitializer, <2 x double> [[TMP18]], <2 x i32> <i32 2, i32 1>
+; CHECK-NEXT:    [[BLOCK21:%.*]] = shufflevector <2 x double> [[TMP19]], <2 x double> poison, <1 x i32> <i32 1>
+; CHECK-NEXT:    [[BLOCK22:%.*]] = shufflevector <2 x double> [[COL_LOAD]], <2 x double> poison, <1 x i32> <i32 1>
+; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <2 x double> [[COL_LOAD4]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT23:%.*]] = insertelement <1 x double> poison, double [[TMP20]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLAT24:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT23]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP21:%.*]] = fmul contract <1 x double> [[BLOCK22]], [[SPLAT_SPLAT24]]
+; CHECK-NEXT:    [[BLOCK25:%.*]] = shufflevector <2 x double> [[COL_LOAD1]], <2 x double> poison, <1 x i32> <i32 1>
+; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <2 x double> [[COL_LOAD4]], i64 1
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT26:%.*]] = insertelement <1 x double> poison, double [[TMP22]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLAT27:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT26]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP23:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK25]], <1 x double> [[SPLAT_SPLAT27]], <1 x double> [[TMP21]])
+; CHECK-NEXT:    [[TMP24:%.*]] = shufflevector <1 x double> [[TMP23]], <1 x double> poison, <2 x i32> <i32 0, i32 poison>
+; CHECK-NEXT:    [[TMP25:%.*]] = shufflevector <2 x double> [[TMP19]], <2 x double> [[TMP24]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr double, ptr [[C1:%.*]], i64 0
+; CHECK-NEXT:    store <2 x double> [[TMP13]], ptr [[TMP26]], align 8
+; CHECK-NEXT:    [[VEC_GEP28:%.*]] = getelementptr double, ptr [[TMP26]], i64 2
+; CHECK-NEXT:    store <2 x double> [[TMP25]], ptr [[VEC_GEP28]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %a = load <4 x double>, ptr %A, align 8
+  %b = load <4 x double>, ptr %B, align 8
+  br i1 %c.0, label %then, label %exit
+
+then:
+  call void @llvm.lifetime.end(i64 -1, ptr %B)
+  call void @llvm.lifetime.end(i64 -1, ptr %A)
+  br label %exit
+
+exit:
+  %m = call <4 x double> @llvm.matrix.multiply(<4 x double> %a, <4 x double> %b, i32 2, i32 2, i32 2)
+  store <4 x double> %m, ptr %C, align 8
+  ret void
+}
+
+define void @multiple_unrelated_lifetimes(ptr noalias %A, ptr noalias %B, ptr noalias %C, i1 %c.0) {
+; CHECK-LABEL: @multiple_unrelated_lifetimes(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ALLOC_1:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[ALLOC_2:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    br i1 [[C:%.*]], label [[THEN:%.*]], label [[EXIT:%.*]]
+; CHECK:       then:
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 -1, ptr [[ALLOC_1]])
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 -1, ptr [[ALLOC_2]])
+; CHECK-NEXT:    br label [[EXIT]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr double, ptr [[A:%.*]], i64 0
+; CHECK-NEXT:    [[COL_LOAD:%.*]] = load <2 x double>, ptr [[TMP0]], align 8
+; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr double, ptr [[TMP0]], i64 2
+; CHECK-NEXT:    [[COL_LOAD1:%.*]] = load <2 x double>, ptr [[VEC_GEP]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr double, ptr [[B:%.*]], i64 0
 ; CHECK-NEXT:    [[COL_LOAD2:%.*]] = load <2 x double>, ptr [[TMP1]], align 8
 ; CHECK-NEXT:    [[VEC_GEP3:%.*]] = getelementptr double, ptr [[TMP1]], i64 2
 ; CHECK-NEXT:    [[COL_LOAD4:%.*]] = load <2 x double>, ptr [[VEC_GEP3]], align 8
@@ -597,13 +680,17 @@ define void @lifetime_for_both_ptr_args_before_multiply(ptr noalias %A, ptr noal
 ; CHECK-NEXT:    ret void
 ;
 entry:
+  %alloc.1 = alloca i32
+  %alloc.2 = alloca i32
   %a = load <4 x double>, ptr %A, align 8
   %b = load <4 x double>, ptr %B, align 8
   br i1 %c.0, label %then, label %exit
 
 then:
   call void @llvm.lifetime.end(i64 -1, ptr %B)
+  call void @llvm.lifetime.end(i64 -1, ptr %alloc.1)
   call void @llvm.lifetime.end(i64 -1, ptr %A)
+  call void @llvm.lifetime.end(i64 -1, ptr %alloc.2)
   br label %exit
 
 exit:
@@ -618,7 +705,6 @@ define void @lifetime_for_ptr_select_before_multiply(ptr noalias %A, ptr noalias
 ; CHECK-NEXT:    [[P:%.*]] = select i1 [[C_0:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]
 ; CHECK-NEXT:    br i1 [[C_1:%.*]], label [[THEN:%.*]], label [[EXIT:%.*]]
 ; CHECK:       then:
-; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 -1, ptr [[P]])
 ; CHECK-NEXT:    br label [[EXIT]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr double, ptr [[P]], i64 0
@@ -701,6 +787,374 @@ exit:
   ret void
 }
 
+define void @lifetimes_for_args_in_different_blocks(ptr noalias %B, ptr noalias %C, i1 %c.0) {
+; CHECK-LABEL: @lifetimes_for_args_in_different_blocks(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A:%.*]] = alloca <4 x double>, align 32
+; CHECK-NEXT:    call void @init(ptr [[A]])
+; CHECK-NEXT:    br i1 [[C:%.*]], label [[THEN:%.*]], label [[EXIT:%.*]]
+; CHECK:       then:
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr double, ptr [[A]], i64 0
+; CHECK-NEXT:    [[COL_LOAD:%.*]] = load <2 x double>, ptr [[TMP0]], align 8
+; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr double, ptr [[TMP0]], i64 2
+; CHECK-NEXT:    [[COL_LOAD1:%.*]] = load <2 x double>, ptr [[VEC_GEP]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr double, ptr [[B:%.*]], i64 0
+; CHECK-NEXT:    [[COL_LOAD2:%.*]] = load <2 x double>, ptr [[TMP1]], align 8
+; CHECK-NEXT:    [[VEC_GEP3:%.*]] = getelementptr double, ptr [[TMP1]], i64 2
+; CHECK-NEXT:    [[COL_LOAD4:%.*]] = load <2 x double>, ptr [[VEC_GEP3]], align 8
+; CHECK-NEXT:    [[BLOCK:%.*]] = shufflevector <2 x double> [[COL_LOAD]], <2 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[COL_LOAD2]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT:%.*]] = insertelement <1 x double> poison, double [[TMP2]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLAT:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP3:%.*]] = fmul contract <1 x double> [[BLOCK]], [[SPLAT_SPLAT]]
+; CHECK-NEXT:    [[BLOCK5:%.*]] = shufflevector <2 x double> [[COL_LOAD1]], <2 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x double> [[COL_LOAD2]], i64 1
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT6:%.*]] = insertelement <1 x double> poison, double [[TMP4]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLAT7:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT6]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK5]], <1 x double> [[SPLAT_SPLAT7]], <1 x double> [[TMP3]])
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <1 x double> [[TMP5]], <1 x double> poison, <2 x i32> <i32 0, i32 poison>
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x double> zeroinitializer, <2 x double> [[TMP6]], <2 x i32> <i32 2, i32 1>
+; CHECK-NEXT:    [[BLOCK8:%.*]] = shufflevector <2 x double> [[TMP7]], <2 x double> poison, <1 x i32> <i32 1>
+; CHECK-NEXT:    [[BLOCK9:%.*]] = shufflevector <2 x double> [[COL_LOAD]], <2 x double> poison, <1 x i32> <i32 1>
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x double> [[COL_LOAD2]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT10:%.*]] = insertelement <1 x double> poison, double [[TMP8]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLAT11:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT10]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = fmul contract <1 x double> [[BLOCK9]], [[SPLAT_SPLAT11]]
+; CHECK-NEXT:    [[BLOCK12:%.*]] = shufflevector <2 x double> [[COL_LOAD1]], <2 x double> poison, <1 x i32> <i32 1>
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x double> [[COL_LOAD2]], i64 1
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT13:%.*]] = insertelement <1 x double> poison, double [[TMP10]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLAT14:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT13]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP11:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK12]], <1 x double> [[SPLAT_SPLAT14]], <1 x double> [[TMP9]])
+; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <1 x double> [[TMP11]], <1 x double> poison, <2 x i32> <i32 0, i32 poison>
+; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <2 x double> [[TMP7]], <2 x double> [[TMP12]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[BLOCK15:%.*]] = shufflevector <2 x double> [[COL_LOAD]], <2 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <2 x double> [[COL_LOAD4]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT16:%.*]] = insertelement <1 x double> poison, double [[TMP14]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLAT17:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT16]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP15:%.*]] = fmul contract <1 x double> [[BLOCK15]], [[SPLAT_SPLAT17]]
+; CHECK-NEXT:    [[BLOCK18:%.*]] = shufflevector <2 x double> [[COL_LOAD1]], <2 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <2 x double> [[COL_LOAD4]], i64 1
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT19:%.*]] = insertelement <1 x double> poison, double [[TMP16]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLAT20:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT19]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP17:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK18]], <1 x double> [[SPLAT_SPLAT20]], <1 x double> [[TMP15]])
+; CHECK-NEXT:    [[TMP18:%.*]] = shufflevector <1 x double> [[TMP17]], <1 x double> poison, <2 x i32> <i32 0, i32 poison>
+; CHECK-NEXT:    [[TMP19:%.*]] = shufflevector <2 x double> zeroinitializer, <2 x double> [[TMP18]], <2 x i32> <i32 2, i32 1>
+; CHECK-NEXT:    [[BLOCK21:%.*]] = shufflevector <2 x double> [[TMP19]], <2 x double> poison, <1 x i32> <i32 1>
+; CHECK-NEXT:    [[BLOCK22:%.*]] = shufflevector <2 x double> [[COL_LOAD]], <2 x double> poison, <1 x i32> <i32 1>
+; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <2 x double> [[COL_LOAD4]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT23:%.*]] = insertelement <1 x double> poison, double [[TMP20]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLAT24:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT23]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP21:%.*]] = fmul contract <1 x double> [[BLOCK22]], [[SPLAT_SPLAT24]]
+; CHECK-NEXT:    [[BLOCK25:%.*]] = shufflevector <2 x double> [[COL_LOAD1]], <2 x double> poison, <1 x i32> <i32 1>
+; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <2 x double> [[COL_LOAD4]], i64 1
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT26:%.*]] = insertelement <1 x double> poison, double [[TMP22]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLAT27:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT26]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP23:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK25]], <1 x double> [[SPLAT_SPLAT27]], <1 x double> [[TMP21]])
+; CHECK-NEXT:    [[TMP24:%.*]] = shufflevector <1 x double> [[TMP23]], <1 x double> poison, <2 x i32> <i32 0, i32 poison>
+; CHECK-NEXT:    [[TMP25:%.*]] = shufflevector <2 x double> [[TMP19]], <2 x double> [[TMP24]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr double, ptr [[C1:%.*]], i64 0
+; CHECK-NEXT:    store <2 x double> [[TMP13]], ptr [[TMP26]], align 8
+; CHECK-NEXT:    [[VEC_GEP28:%.*]] = getelementptr double, ptr [[TMP26]], i64 2
+; CHECK-NEXT:    store <2 x double> [[TMP25]], ptr [[VEC_GEP28]], align 8
+; CHECK-NEXT:    br label [[EXIT]]
+; CHECK:       exit:
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 -1, ptr [[A]])
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 -1, ptr [[B]])
+; CHECK-NEXT:    ret void
+;
+entry:
+  %A = alloca <4 x double>
+  call void @init(ptr %A)
+  br i1 %c.0, label %then, label %exit
+
+then:
+  %a = load <4 x double>, ptr %A, align 8
+  %b = load <4 x double>, ptr %B, align 8
+  %m = call <4 x double> @llvm.matrix.multiply(<4 x double> %a, <4 x double> %b, i32 2, i32 2, i32 2)
+  store <4 x double> %m, ptr %C, align 8
+  br label %exit
+
+exit:
+  call void @llvm.lifetime.end(i64 -1, ptr %A)
+  call void @llvm.lifetime.end(i64 -1, ptr %B)
+  ret void
+}
+
+define void @lifetimes_for_args_in_different_blocks2(ptr noalias %B, ptr noalias %C, i1 %c.0) {
+; CHECK-LABEL: @lifetimes_for_args_in_different_blocks2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A:%.*]] = alloca <4 x double>, align 32
+; CHECK-NEXT:    call void @init(ptr [[A]])
+; CHECK-NEXT:    br i1 [[C:%.*]], label [[THEN:%.*]], label [[EXIT:%.*]]
+; CHECK:       then:
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 -1, ptr [[A]])
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 -1, ptr [[B:%.*]])
+; CHECK-NEXT:    br label [[EXIT]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr double, ptr [[A]], i64 0
+; CHECK-NEXT:    [[COL_LOAD:%.*]] = load <2 x double>, ptr [[TMP0]], align 8
+; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr double, ptr [[TMP0]], i64 2
+; CHECK-NEXT:    [[COL_LOAD1:%.*]] = load <2 x double>, ptr [[VEC_GEP]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr double, ptr [[B]], i64 0
+; CHECK-NEXT:    [[COL_LOAD2:%.*]] = load <2 x double>, ptr [[TMP1]], align 8
+; CHECK-NEXT:    [[VEC_GEP3:%.*]] = getelementptr double, ptr [[TMP1]], i64 2
+; CHECK-NEXT:    [[COL_LOAD4:%.*]] = load <2 x double>, ptr [[VEC_GEP3]], align 8
+; CHECK-NEXT:    [[BLOCK:%.*]] = shufflevector <2 x double> [[COL_LOAD]], <2 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[COL_LOAD2]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT:%.*]] = insertelement <1 x double> poison, double [[TMP2]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLAT:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP3:%.*]] = fmul contract <1 x double> [[BLOCK]], [[SPLAT_SPLAT]]
+; CHECK-NEXT:    [[BLOCK5:%.*]] = shufflevector <2 x double> [[COL_LOAD1]], <2 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x double> [[COL_LOAD2]], i64 1
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT6:%.*]] = insertelement <1 x double> poison, double [[TMP4]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLAT7:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT6]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK5]], <1 x double> [[SPLAT_SPLAT7]], <1 x double> [[TMP3]])
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <1 x double> [[TMP5]], <1 x double> poison, <2 x i32> <i32 0, i32 poison>
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x double> zeroinitializer, <2 x double> [[TMP6]], <2 x i32> <i32 2, i32 1>
+; CHECK-NEXT:    [[BLOCK8:%.*]] = shufflevector <2 x double> [[TMP7]], <2 x double> poison, <1 x i32> <i32 1>
+; CHECK-NEXT:    [[BLOCK9:%.*]] = shufflevector <2 x double> [[COL_LOAD]], <2 x double> poison, <1 x i32> <i32 1>
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x double> [[COL_LOAD2]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT10:%.*]] = insertelement <1 x double> poison, double [[TMP8]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLAT11:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT10]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = fmul contract <1 x double> [[BLOCK9]], [[SPLAT_SPLAT11]]
+; CHECK-NEXT:    [[BLOCK12:%.*]] = shufflevector <2 x double> [[COL_LOAD1]], <2 x double> poison, <1 x i32> <i32 1>
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x double> [[COL_LOAD2]], i64 1
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT13:%.*]] = insertelement <1 x double> poison, double [[TMP10]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLAT14:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT13]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP11:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK12]], <1 x double> [[SPLAT_SPLAT14]], <1 x double> [[TMP9]])
+; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <1 x double> [[TMP11]], <1 x double> poison, <2 x i32> <i32 0, i32 poison>
+; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <2 x double> [[TMP7]], <2 x double> [[TMP12]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[BLOCK15:%.*]] = shufflevector <2 x double> [[COL_LOAD]], <2 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <2 x double> [[COL_LOAD4]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT16:%.*]] = insertelement <1 x double> poison, double [[TMP14]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLAT17:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT16]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP15:%.*]] = fmul contract <1 x double> [[BLOCK15]], [[SPLAT_SPLAT17]]
+; CHECK-NEXT:    [[BLOCK18:%.*]] = shufflevector <2 x double> [[COL_LOAD1]], <2 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <2 x double> [[COL_LOAD4]], i64 1
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT19:%.*]] = insertelement <1 x double> poison, double [[TMP16]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLAT20:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT19]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP17:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK18]], <1 x double> [[SPLAT_SPLAT20]], <1 x double> [[TMP15]])
+; CHECK-NEXT:    [[TMP18:%.*]] = shufflevector <1 x double> [[TMP17]], <1 x double> poison, <2 x i32> <i32 0, i32 poison>
+; CHECK-NEXT:    [[TMP19:%.*]] = shufflevector <2 x double> zeroinitializer, <2 x double> [[TMP18]], <2 x i32> <i32 2, i32 1>
+; CHECK-NEXT:    [[BLOCK21:%.*]] = shufflevector <2 x double> [[TMP19]], <2 x double> poison, <1 x i32> <i32 1>
+; CHECK-NEXT:    [[BLOCK22:%.*]] = shufflevector <2 x double> [[COL_LOAD]], <2 x double> poison, <1 x i32> <i32 1>
+; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <2 x double> [[COL_LOAD4]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT23:%.*]] = insertelement <1 x double> poison, double [[TMP20]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLAT24:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT23]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP21:%.*]] = fmul contract <1 x double> [[BLOCK22]], [[SPLAT_SPLAT24]]
+; CHECK-NEXT:    [[BLOCK25:%.*]] = shufflevector <2 x double> [[COL_LOAD1]], <2 x double> poison, <1 x i32> <i32 1>
+; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <2 x double> [[COL_LOAD4]], i64 1
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT26:%.*]] = insertelement <1 x double> poison, double [[TMP22]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLAT27:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT26]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP23:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK25]], <1 x double> [[SPLAT_SPLAT27]], <1 x double> [[TMP21]])
+; CHECK-NEXT:    [[TMP24:%.*]] = shufflevector <1 x double> [[TMP23]], <1 x double> poison, <2 x i32> <i32 0, i32 poison>
+; CHECK-NEXT:    [[TMP25:%.*]] = shufflevector <2 x double> [[TMP19]], <2 x double> [[TMP24]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr double, ptr [[C1:%.*]], i64 0
+; CHECK-NEXT:    store <2 x double> [[TMP13]], ptr [[TMP26]], align 8
+; CHECK-NEXT:    [[VEC_GEP28:%.*]] = getelementptr double, ptr [[TMP26]], i64 2
+; CHECK-NEXT:    store <2 x double> [[TMP25]], ptr [[VEC_GEP28]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %A = alloca <4 x double>
+  call void @init(ptr %A)
+  br i1 %c.0, label %then, label %exit
+
+then:
+  call void @llvm.lifetime.end(i64 -1, ptr %A)
+  call void @llvm.lifetime.end(i64 -1, ptr %B)
+  br label %exit
+
+exit:
+  %a = load <4 x double>, ptr %A, align 8
+  %b = load <4 x double>, ptr %B, align 8
+  %m = call <4 x double> @llvm.matrix.multiply(<4 x double> %a, <4 x double> %b, i32 2, i32 2, i32 2)
+  store <4 x double> %m, ptr %C, align 8
+  ret void
+}
+
+define void @lifetimes_for_args_load0_in_different_block(ptr noalias %B, ptr noalias %C, i1 %c.0) {
+; CHECK-LABEL: @lifetimes_for_args_load0_in_different_block(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A:%.*]] = alloca <4 x double>, align 32
+; CHECK-NEXT:    call void @init(ptr [[A]])
+; CHECK-NEXT:    br i1 [[C:%.*]], label [[THEN:%.*]], label [[EXIT:%.*]]
+; CHECK:       then:
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr double, ptr [[A]], i64 0
+; CHECK-NEXT:    [[COL_LOAD:%.*]] = load <2 x double>, ptr [[TMP0]], align 8
+; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr double, ptr [[TMP0]], i64 2
+; CHECK-NEXT:    [[COL_LOAD1:%.*]] = load <2 x double>, ptr [[VEC_GEP]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr double, ptr [[B:%.*]], i64 0
+; CHECK-NEXT:    [[COL_LOAD2:%.*]] = load <2 x double>, ptr [[TMP1]], align 8
+; CHECK-NEXT:    [[VEC_GEP3:%.*]] = getelementptr double, ptr [[TMP1]], i64 2
+; CHECK-NEXT:    [[COL_LOAD4:%.*]] = load <2 x double>, ptr [[VEC_GEP3]], align 8
+; CHECK-NEXT:    [[BLOCK:%.*]] = shufflevector <2 x double> [[COL_LOAD]], <2 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[COL_LOAD2]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT:%.*]] = insertelement <1 x double> poison, double [[TMP2]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLAT:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP3:%.*]] = fmul contract <1 x double> [[BLOCK]], [[SPLAT_SPLAT]]
+; CHECK-NEXT:    [[BLOCK5:%.*]] = shufflevector <2 x double> [[COL_LOAD1]], <2 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x double> [[COL_LOAD2]], i64 1
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT6:%.*]] = insertelement <1 x double> poison, double [[TMP4]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLAT7:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT6]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK5]], <1 x double> [[SPLAT_SPLAT7]], <1 x double> [[TMP3]])
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <1 x double> [[TMP5]], <1 x double> poison, <2 x i32> <i32 0, i32 poison>
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x double> zeroinitializer, <2 x double> [[TMP6]], <2 x i32> <i32 2, i32 1>
+; CHECK-NEXT:    [[BLOCK8:%.*]] = shufflevector <2 x double> [[TMP7]], <2 x double> poison, <1 x i32> <i32 1>
+; CHECK-NEXT:    [[BLOCK9:%.*]] = shufflevector <2 x double> [[COL_LOAD]], <2 x double> poison, <1 x i32> <i32 1>
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x double> [[COL_LOAD2]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT10:%.*]] = insertelement <1 x double> poison, double [[TMP8]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLAT11:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT10]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = fmul contract <1 x double> [[BLOCK9]], [[SPLAT_SPLAT11]]
+; CHECK-NEXT:    [[BLOCK12:%.*]] = shufflevector <2 x double> [[COL_LOAD1]], <2 x double> poison, <1 x i32> <i32 1>
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x double> [[COL_LOAD2]], i64 1
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT13:%.*]] = insertelement <1 x double> poison, double [[TMP10]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLAT14:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT13]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP11:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK12]], <1 x double> [[SPLAT_SPLAT14]], <1 x double> [[TMP9]])
+; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <1 x double> [[TMP11]], <1 x double> poison, <2 x i32> <i32 0, i32 poison>
+; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <2 x double> [[TMP7]], <2 x double> [[TMP12]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[BLOCK15:%.*]] = shufflevector <2 x double> [[COL_LOAD]], <2 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <2 x double> [[COL_LOAD4]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT16:%.*]] = insertelement <1 x double> poison, double [[TMP14]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLAT17:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT16]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP15:%.*]] = fmul contract <1 x double> [[BLOCK15]], [[SPLAT_SPLAT17]]
+; CHECK-NEXT:    [[BLOCK18:%.*]] = shufflevector <2 x double> [[COL_LOAD1]], <2 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <2 x double> [[COL_LOAD4]], i64 1
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT19:%.*]] = insertelement <1 x double> poison, double [[TMP16]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLAT20:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT19]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP17:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK18]], <1 x double> [[SPLAT_SPLAT20]], <1 x double> [[TMP15]])
+; CHECK-NEXT:    [[TMP18:%.*]] = shufflevector <1 x double> [[TMP17]], <1 x double> poison, <2 x i32> <i32 0, i32 poison>
+; CHECK-NEXT:    [[TMP19:%.*]] = shufflevector <2 x double> zeroinitializer, <2 x double> [[TMP18]], <2 x i32> <i32 2, i32 1>
+; CHECK-NEXT:    [[BLOCK21:%.*]] = shufflevector <2 x double> [[TMP19]], <2 x double> poison, <1 x i32> <i32 1>
+; CHECK-NEXT:    [[BLOCK22:%.*]] = shufflevector <2 x double> [[COL_LOAD]], <2 x double> poison, <1 x i32> <i32 1>
+; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <2 x double> [[COL_LOAD4]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT23:%.*]] = insertelement <1 x double> poison, double [[TMP20]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLAT24:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT23]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP21:%.*]] = fmul contract <1 x double> [[BLOCK22]], [[SPLAT_SPLAT24]]
+; CHECK-NEXT:    [[BLOCK25:%.*]] = shufflevector <2 x double> [[COL_LOAD1]], <2 x double> poison, <1 x i32> <i32 1>
+; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <2 x double> [[COL_LOAD4]], i64 1
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT26:%.*]] = insertelement <1 x double> poison, double [[TMP22]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLAT27:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT26]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP23:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK25]], <1 x double> [[SPLAT_SPLAT27]], <1 x double> [[TMP21]])
+; CHECK-NEXT:    [[TMP24:%.*]] = shufflevector <1 x double> [[TMP23]], <1 x double> poison, <2 x i32> <i32 0, i32 poison>
+; CHECK-NEXT:    [[TMP25:%.*]] = shufflevector <2 x double> [[TMP19]], <2 x double> [[TMP24]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr double, ptr [[C1:%.*]], i64 0
+; CHECK-NEXT:    store <2 x double> [[TMP13]], ptr [[TMP26]], align 8
+; CHECK-NEXT:    [[VEC_GEP28:%.*]] = getelementptr double, ptr [[TMP26]], i64 2
+; CHECK-NEXT:    store <2 x double> [[TMP25]], ptr [[VEC_GEP28]], align 8
+; CHECK-NEXT:    br label [[EXIT]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %A = alloca <4 x double>
+  call void @init(ptr %A)
+  %a = load <4 x double>, ptr %A, align 8
+  call void @llvm.lifetime.end(i64 -1, ptr %A)
+  br i1 %c.0, label %then, label %exit
+
+then:
+  %b = load <4 x double>, ptr %B, align 8
+  %m = call <4 x double> @llvm.matrix.multiply(<4 x double> %a, <4 x double> %b, i32 2, i32 2, i32 2)
+  store <4 x double> %m, ptr %C, align 8
+  br label %exit
+
+exit:
+  call void @llvm.lifetime.end(i64 -1, ptr %B)
+  ret void
+}
+
+define void @lifetimes_for_args_load1_in_different_block(ptr noalias %B, ptr noalias %C, i1 %c.0) {
+; CHECK-LABEL: @lifetimes_for_args_load1_in_different_block(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A:%.*]] = alloca <4 x double>, align 32
+; CHECK-NEXT:    call void @init(ptr [[A]])
+; CHECK-NEXT:    br i1 [[C:%.*]], label [[THEN:%.*]], label [[EXIT:%.*]]
+; CHECK:       then:
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr double, ptr [[A]], i64 0
+; CHECK-NEXT:    [[COL_LOAD:%.*]] = load <2 x double>, ptr [[TMP0]], align 8
+; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr double, ptr [[TMP0]], i64 2
+; CHECK-NEXT:    [[COL_LOAD1:%.*]] = load <2 x double>, ptr [[VEC_GEP]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr double, ptr [[B:%.*]], i64 0
+; CHECK-NEXT:    [[COL_LOAD2:%.*]] = load <2 x double>, ptr [[TMP1]], align 8
+; CHECK-NEXT:    [[VEC_GEP3:%.*]] = getelementptr double, ptr [[TMP1]], i64 2
+; CHECK-NEXT:    [[COL_LOAD4:%.*]] = load <2 x double>, ptr [[VEC_GEP3]], align 8
+; CHECK-NEXT:    [[BLOCK:%.*]] = shufflevector <2 x double> [[COL_LOAD]], <2 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[COL_LOAD2]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT:%.*]] = insertelement <1 x double> poison, double [[TMP2]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLAT:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP3:%.*]] = fmul contract <1 x double> [[BLOCK]], [[SPLAT_SPLAT]]
+; CHECK-NEXT:    [[BLOCK5:%.*]] = shufflevector <2 x double> [[COL_LOAD1]], <2 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x double> [[COL_LOAD2]], i64 1
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT6:%.*]] = insertelement <1 x double> poison, double [[TMP4]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLAT7:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT6]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK5]], <1 x double> [[SPLAT_SPLAT7]], <1 x double> [[TMP3]])
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <1 x double> [[TMP5]], <1 x double> poison, <2 x i32> <i32 0, i32 poison>
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x double> zeroinitializer, <2 x double> [[TMP6]], <2 x i32> <i32 2, i32 1>
+; CHECK-NEXT:    [[BLOCK8:%.*]] = shufflevector <2 x double> [[TMP7]], <2 x double> poison, <1 x i32> <i32 1>
+; CHECK-NEXT:    [[BLOCK9:%.*]] = shufflevector <2 x double> [[COL_LOAD]], <2 x double> poison, <1 x i32> <i32 1>
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x double> [[COL_LOAD2]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT10:%.*]] = insertelement <1 x double> poison, double [[TMP8]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLAT11:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT10]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = fmul contract <1 x double> [[BLOCK9]], [[SPLAT_SPLAT11]]
+; CHECK-NEXT:    [[BLOCK12:%.*]] = shufflevector <2 x double> [[COL_LOAD1]], <2 x double> poison, <1 x i32> <i32 1>
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x double> [[COL_LOAD2]], i64 1
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT13:%.*]] = insertelement <1 x double> poison, double [[TMP10]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLAT14:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT13]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP11:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK12]], <1 x double> [[SPLAT_SPLAT14]], <1 x double> [[TMP9]])
+; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <1 x double> [[TMP11]], <1 x double> poison, <2 x i32> <i32 0, i32 poison>
+; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <2 x double> [[TMP7]], <2 x double> [[TMP12]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[BLOCK15:%.*]] = shufflevector <2 x double> [[COL_LOAD]], <2 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <2 x double> [[COL_LOAD4]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT16:%.*]] = insertelement <1 x double> poison, double [[TMP14]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLAT17:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT16]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP15:%.*]] = fmul contract <1 x double> [[BLOCK15]], [[SPLAT_SPLAT17]]
+; CHECK-NEXT:    [[BLOCK18:%.*]] = shufflevector <2 x double> [[COL_LOAD1]], <2 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <2 x double> [[COL_LOAD4]], i64 1
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT19:%.*]] = insertelement <1 x double> poison, double [[TMP16]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLAT20:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT19]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP17:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK18]], <1 x double> [[SPLAT_SPLAT20]], <1 x double> [[TMP15]])
+; CHECK-NEXT:    [[TMP18:%.*]] = shufflevector <1 x double> [[TMP17]], <1 x double> poison, <2 x i32> <i32 0, i32 poison>
+; CHECK-NEXT:    [[TMP19:%.*]] = shufflevector <2 x double> zeroinitializer, <2 x double> [[TMP18]], <2 x i32> <i32 2, i32 1>
+; CHECK-NEXT:    [[BLOCK21:%.*]] = shufflevector <2 x double> [[TMP19]], <2 x double> poison, <1 x i32> <i32 1>
+; CHECK-NEXT:    [[BLOCK22:%.*]] = shufflevector <2 x double> [[COL_LOAD]], <2 x double> poison, <1 x i32> <i32 1>
+; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <2 x double> [[COL_LOAD4]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT23:%.*]] = insertelement <1 x double> poison, double [[TMP20]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLAT24:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT23]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP21:%.*]] = fmul contract <1 x double> [[BLOCK22]], [[SPLAT_SPLAT24]]
+; CHECK-NEXT:    [[BLOCK25:%.*]] = shufflevector <2 x double> [[COL_LOAD1]], <2 x double> poison, <1 x i32> <i32 1>
+; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <2 x double> [[COL_LOAD4]], i64 1
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT26:%.*]] = insertelement <1 x double> poison, double [[TMP22]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLAT27:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT26]], <1 x double> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP23:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK25]], <1 x double> [[SPLAT_SPLAT27]], <1 x double> [[TMP21]])
+; CHECK-NEXT:    [[TMP24:%.*]] = shufflevector <1 x double> [[TMP23]], <1 x double> poison, <2 x i32> <i32 0, i32 poison>
+; CHECK-NEXT:    [[TMP25:%.*]] = shufflevector <2 x double> [[TMP19]], <2 x double> [[TMP24]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr double, ptr [[C1:%.*]], i64 0
+; CHECK-NEXT:    store <2 x double> [[TMP13]], ptr [[TMP26]], align 8
+; CHECK-NEXT:    [[VEC_GEP28:%.*]] = getelementptr double, ptr [[TMP26]], i64 2
+; CHECK-NEXT:    store <2 x double> [[TMP25]], ptr [[VEC_GEP28]], align 8
+; CHECK-NEXT:    br label [[EXIT]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %A = alloca <4 x double>
+  call void @init(ptr %A)
+  %b = load <4 x double>, ptr %B, align 8
+  call void @llvm.lifetime.end(i64 -1, ptr %B)
+  br i1 %c.0, label %then, label %exit
+
+then:
+  %a = load <4 x double>, ptr %A, align 8
+  %m = call <4 x double> @llvm.matrix.multiply(<4 x double> %a, <4 x double> %b, i32 2, i32 2, i32 2)
+  store <4 x double> %m, ptr %C, align 8
+  br label %exit
+
+exit:
+  call void @llvm.lifetime.end(i64 -1, ptr %A)
+  ret void
+}
+
 declare void @init(ptr)
 declare void @llvm.lifetime.end(i64, ptr)
 
diff --git a/llvm/test/Transforms/MergeFunc/constexpr.ll b/llvm/test/Transforms/MergeFunc/constexpr.ll
index 9fb78060174226..3946fd6a066ff5 100644
--- a/llvm/test/Transforms/MergeFunc/constexpr.ll
+++ b/llvm/test/Transforms/MergeFunc/constexpr.ll
@@ -66,9 +66,9 @@ define i64 @f8() unnamed_addr {
 
 define ptr @f10() unnamed_addr {
 ; CHECK-LABEL: define ptr @f10() unnamed_addr {
-; CHECK-NEXT:    ret ptr getelementptr ([4 x i32], ptr @g1, i64 0, inrange i64 1)
+; CHECK-NEXT:    ret ptr getelementptr inrange(0, 4) ([4 x i32], ptr @g1, i64 0, i64 1)
 ;
-  ret ptr getelementptr ([4 x i32], ptr @g1, i64 0, inrange i64 1)
+  ret ptr getelementptr inrange(0, 4) ([4 x i32], ptr @g1, i64 0, i64 1)
 }
 
 define ptr @f11() unnamed_addr {
@@ -80,7 +80,7 @@ define ptr @f11() unnamed_addr {
 
 define ptr @f12() unnamed_addr {
 ; CHECK-LABEL: define ptr @f12() unnamed_addr {
-; CHECK-NEXT:    ret ptr getelementptr ([4 x i32], ptr @g1, inrange i64 0, i64 1)
+; CHECK-NEXT:    ret ptr getelementptr inrange(-4, 12) ([4 x i32], ptr @g1, i64 0, i64 1)
 ;
-  ret ptr getelementptr ([4 x i32], ptr @g1, inrange i64 0, i64 1)
+  ret ptr getelementptr inrange(-4, 12) ([4 x i32], ptr @g1, i64 0, i64 1)
 }
diff --git a/llvm/test/Transforms/NewGVN/2007-07-25-DominatedLoop.ll b/llvm/test/Transforms/NewGVN/2007-07-25-DominatedLoop.ll
index 978f061ac4fd87..6f0ef197338c28 100644
--- a/llvm/test/Transforms/NewGVN/2007-07-25-DominatedLoop.ll
+++ b/llvm/test/Transforms/NewGVN/2007-07-25-DominatedLoop.ll
@@ -1,86 +1,87 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt < %s -passes=newgvn | llvm-dis
 
-	%struct.PerlInterpreter = type { i8 }
+  %struct.PerlInterpreter = type { i8 }
 @PL_sv_count = external global i32		; <ptr> [#uses=2]
 
 define void @perl_destruct(ptr %sv_interp) {
 entry:
-	br i1 false, label %cond_next25, label %cond_true16
+  br i1 false, label %cond_next25, label %cond_true16
 
 cond_true16:		; preds = %entry
-	ret void
+  ret void
 
 cond_next25:		; preds = %entry
-	br i1 false, label %cond_next33, label %cond_true32
+  br i1 false, label %cond_next33, label %cond_true32
 
 cond_true32:		; preds = %cond_next25
-	ret void
+  ret void
 
 cond_next33:		; preds = %cond_next25
-	br i1 false, label %cond_next61, label %cond_true.i46
+  br i1 false, label %cond_next61, label %cond_true.i46
 
 cond_true.i46:		; preds = %cond_next33
-	ret void
+  ret void
 
 cond_next61:		; preds = %cond_next33
-	br i1 false, label %cond_next69, label %cond_true66
+  br i1 false, label %cond_next69, label %cond_true66
 
 cond_true66:		; preds = %cond_next61
-	ret void
+  ret void
 
 cond_next69:		; preds = %cond_next61
-	br i1 false, label %Perl_safefree.exit52, label %cond_true.i50
+  br i1 false, label %Perl_safefree.exit52, label %cond_true.i50
 
 cond_true.i50:		; preds = %cond_next69
-	ret void
+  ret void
 
 Perl_safefree.exit52:		; preds = %cond_next69
-	br i1 false, label %cond_next80, label %cond_true77
+  br i1 false, label %cond_next80, label %cond_true77
 
 cond_true77:		; preds = %Perl_safefree.exit52
-	ret void
+  ret void
 
 cond_next80:		; preds = %Perl_safefree.exit52
-	br i1 false, label %Perl_safefree.exit56, label %cond_true.i54
+  br i1 false, label %Perl_safefree.exit56, label %cond_true.i54
 
 cond_true.i54:		; preds = %cond_next80
-	ret void
+  ret void
 
 Perl_safefree.exit56:		; preds = %cond_next80
-	br i1 false, label %Perl_safefree.exit60, label %cond_true.i58
+  br i1 false, label %Perl_safefree.exit60, label %cond_true.i58
 
 cond_true.i58:		; preds = %Perl_safefree.exit56
-	ret void
+  ret void
 
 Perl_safefree.exit60:		; preds = %Perl_safefree.exit56
-	br i1 false, label %Perl_safefree.exit64, label %cond_true.i62
+  br i1 false, label %Perl_safefree.exit64, label %cond_true.i62
 
 cond_true.i62:		; preds = %Perl_safefree.exit60
-	ret void
+  ret void
 
 Perl_safefree.exit64:		; preds = %Perl_safefree.exit60
-	br i1 false, label %Perl_safefree.exit68, label %cond_true.i66
+  br i1 false, label %Perl_safefree.exit68, label %cond_true.i66
 
 cond_true.i66:		; preds = %Perl_safefree.exit64
-	ret void
+  ret void
 
 Perl_safefree.exit68:		; preds = %Perl_safefree.exit64
-	br i1 false, label %cond_next150, label %cond_true23.i
+  br i1 false, label %cond_next150, label %cond_true23.i
 
 cond_true23.i:		; preds = %Perl_safefree.exit68
-	ret void
+  ret void
 
 cond_next150:		; preds = %Perl_safefree.exit68
-	%tmp16092 = load i32, ptr @PL_sv_count, align 4		; <i32> [#uses=0]
-	br label %cond_next165
+  %tmp16092 = load i32, ptr @PL_sv_count, align 4		; <i32> [#uses=0]
+  br label %cond_next165
 
 bb157:		; preds = %cond_next165
-	%tmp158 = load i32, ptr @PL_sv_count, align 4		; <i32> [#uses=0]
-	br label %cond_next165
+  %tmp158 = load i32, ptr @PL_sv_count, align 4		; <i32> [#uses=0]
+  br label %cond_next165
 
 cond_next165:		; preds = %bb157, %cond_next150
-	br i1 false, label %bb171, label %bb157
+  br i1 false, label %bb171, label %bb157
 
 bb171:		; preds = %cond_next165
-	ret void
+  ret void
 }
diff --git a/llvm/test/Transforms/NewGVN/2007-07-25-InfiniteLoop.ll b/llvm/test/Transforms/NewGVN/2007-07-25-InfiniteLoop.ll
index abb6fbe5030f71..5202a2bc6466b9 100644
--- a/llvm/test/Transforms/NewGVN/2007-07-25-InfiniteLoop.ll
+++ b/llvm/test/Transforms/NewGVN/2007-07-25-InfiniteLoop.ll
@@ -1,15 +1,22 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt < %s -passes=newgvn -S | FileCheck %s
 
-	%struct.INT2 = type { i32, i32 }
+  %struct.INT2 = type { i32, i32 }
 @blkshifts = external global ptr		; <ptr> [#uses=2]
 
 define i32 @xcompact() {
+; CHECK-LABEL: define i32 @xcompact() {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    store ptr null, ptr @blkshifts, align 4
+; CHECK-NEXT:    br label [[BB:%.*]]
+; CHECK:       bb:
+; CHECK-NEXT:    br label [[BB]]
+;
 entry:
-	store ptr null, ptr @blkshifts, align 4
-	br label %bb
+  store ptr null, ptr @blkshifts, align 4
+  br label %bb
 
 bb:		; preds = %bb, %entry
-	%tmp10 = load ptr, ptr @blkshifts, align 4		; <ptr> [#uses=0]
-; CHECK-NOT:  %tmp10
-	br label %bb
+  %tmp10 = load ptr, ptr @blkshifts, align 4		; <ptr> [#uses=0]
+  br label %bb
 }
diff --git a/llvm/test/Transforms/NewGVN/2007-07-25-Loop.ll b/llvm/test/Transforms/NewGVN/2007-07-25-Loop.ll
index 336f390459b94c..2ee599c11d4199 100644
--- a/llvm/test/Transforms/NewGVN/2007-07-25-Loop.ll
+++ b/llvm/test/Transforms/NewGVN/2007-07-25-Loop.ll
@@ -1,15 +1,16 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt < %s -passes=newgvn | llvm-dis
 
-	%struct.s_segment_inf = type { float, i32, i16, i16, float, float, i32, float, float }
+  %struct.s_segment_inf = type { float, i32, i16, i16, float, float, i32, float, float }
 
 define void @print_arch(ptr %arch_file, i32 %route_type, i64 %det_routing_arch.0.0, i64 %det_routing_arch.0.1, i64 %det_routing_arch.0.2, i64 %det_routing_arch.0.3, i64 %det_routing_arch.0.4, ptr %segment_inf, i64 %timing_inf.0.0, i64 %timing_inf.0.1, i64 %timing_inf.0.2, i64 %timing_inf.0.3, i64 %timing_inf.0.4, i32 %timing_inf.1) {
 entry:
-	br i1 false, label %bb278, label %bb344
+  br i1 false, label %bb278, label %bb344
 
 bb278:		; preds = %bb278, %entry
-	br i1 false, label %bb278, label %bb344
+  br i1 false, label %bb278, label %bb344
 
 bb344:		; preds = %bb278, %entry
-	%tmp38758 = load i16, ptr null, align 2		; <i16> [#uses=0]
-	ret void
+  %tmp38758 = load i16, ptr null, align 2		; <i16> [#uses=0]
+  ret void
 }
diff --git a/llvm/test/Transforms/NewGVN/2007-07-25-NestedLoop.ll b/llvm/test/Transforms/NewGVN/2007-07-25-NestedLoop.ll
index c46f2b7630aca9..e7461c2f32bf88 100644
--- a/llvm/test/Transforms/NewGVN/2007-07-25-NestedLoop.ll
+++ b/llvm/test/Transforms/NewGVN/2007-07-25-NestedLoop.ll
@@ -1,38 +1,39 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt < %s -passes=newgvn | llvm-dis
 
-	%struct.TypHeader = type { i32, ptr, [3 x i8], i8 }
+  %struct.TypHeader = type { i32, ptr, [3 x i8], i8 }
 
 define ptr @LtRec(ptr %hdL, ptr %hdR) {
 entry:
-	br i1 false, label %bb556.preheader, label %bb534.preheader
+  br i1 false, label %bb556.preheader, label %bb534.preheader
 
 bb534.preheader:		; preds = %entry
-	ret ptr null
+  ret ptr null
 
 bb556.preheader:		; preds = %entry
-	%tmp56119 = getelementptr %struct.TypHeader, ptr %hdR, i32 0, i32 0		; <ptr> [#uses=1]
-	%tmp56220 = load i32, ptr %tmp56119		; <i32> [#uses=0]
-	br i1 false, label %bb.nph23, label %bb675.preheader
+  %tmp56119 = getelementptr %struct.TypHeader, ptr %hdR, i32 0, i32 0		; <ptr> [#uses=1]
+  %tmp56220 = load i32, ptr %tmp56119		; <i32> [#uses=0]
+  br i1 false, label %bb.nph23, label %bb675.preheader
 
 bb.nph23:		; preds = %bb556.preheader
-	ret ptr null
+  ret ptr null
 
 bb656:		; preds = %bb675.outer, %bb656
-	%tmp678 = load i32, ptr %tmp677		; <i32> [#uses=0]
-	br i1 false, label %bb684, label %bb656
+  %tmp678 = load i32, ptr %tmp677		; <i32> [#uses=0]
+  br i1 false, label %bb684, label %bb656
 
 bb684:		; preds = %bb675.outer, %bb656
-	br i1 false, label %bb924.preheader, label %bb675.outer
+  br i1 false, label %bb924.preheader, label %bb675.outer
 
 bb675.outer:		; preds = %bb675.preheader, %bb684
-	%tmp67812 = load i32, ptr %tmp67711		; <i32> [#uses=0]
-	br i1 false, label %bb684, label %bb656
+  %tmp67812 = load i32, ptr %tmp67711		; <i32> [#uses=0]
+  br i1 false, label %bb684, label %bb656
 
 bb675.preheader:		; preds = %bb556.preheader
-	%tmp67711 = getelementptr %struct.TypHeader, ptr %hdR, i32 0, i32 0		; <ptr> [#uses=1]
-	%tmp677 = getelementptr %struct.TypHeader, ptr %hdR, i32 0, i32 0		; <ptr> [#uses=1]
-	br label %bb675.outer
+  %tmp67711 = getelementptr %struct.TypHeader, ptr %hdR, i32 0, i32 0		; <ptr> [#uses=1]
+  %tmp677 = getelementptr %struct.TypHeader, ptr %hdR, i32 0, i32 0		; <ptr> [#uses=1]
+  br label %bb675.outer
 
 bb924.preheader:		; preds = %bb684
-	ret ptr null
+  ret ptr null
 }
diff --git a/llvm/test/Transforms/NewGVN/2007-07-25-SinglePredecessor.ll b/llvm/test/Transforms/NewGVN/2007-07-25-SinglePredecessor.ll
index 0b0597f44ee7ea..6fafce30475698 100644
--- a/llvm/test/Transforms/NewGVN/2007-07-25-SinglePredecessor.ll
+++ b/llvm/test/Transforms/NewGVN/2007-07-25-SinglePredecessor.ll
@@ -1,29 +1,30 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt < %s -passes=newgvn | llvm-dis
 
-	%struct.ggBRDF = type { ptr }
-	%struct.ggBox3 = type { %struct.ggPoint3, %struct.ggPoint3 }
-	%struct.ggMaterialRecord = type { %struct.ggPoint2, %struct.ggBox3, %struct.ggBox3, %struct.ggSpectrum, %struct.ggSpectrum, %struct.ggSpectrum, ptr, i32, i32, i32, i32 }
-	%struct.ggONB3 = type { %struct.ggPoint3, %struct.ggPoint3, %struct.ggPoint3 }
-	%struct.ggPoint2 = type { [2 x double] }
-	%struct.ggPoint3 = type { [3 x double] }
-	%struct.ggSpectrum = type { [8 x float] }
-	%struct.mrViewingHitRecord = type { double, %struct.ggPoint3, %struct.ggONB3, %struct.ggPoint2, double, %struct.ggSpectrum, %struct.ggSpectrum, i32, i32, i32, i32 }
-	%struct.mrXEllipticalCylinder = type { %struct.ggBRDF, float, float, float, float, float, float }
+  %struct.ggBRDF = type { ptr }
+  %struct.ggBox3 = type { %struct.ggPoint3, %struct.ggPoint3 }
+  %struct.ggMaterialRecord = type { %struct.ggPoint2, %struct.ggBox3, %struct.ggBox3, %struct.ggSpectrum, %struct.ggSpectrum, %struct.ggSpectrum, ptr, i32, i32, i32, i32 }
+  %struct.ggONB3 = type { %struct.ggPoint3, %struct.ggPoint3, %struct.ggPoint3 }
+  %struct.ggPoint2 = type { [2 x double] }
+  %struct.ggPoint3 = type { [3 x double] }
+  %struct.ggSpectrum = type { [8 x float] }
+  %struct.mrViewingHitRecord = type { double, %struct.ggPoint3, %struct.ggONB3, %struct.ggPoint2, double, %struct.ggSpectrum, %struct.ggSpectrum, i32, i32, i32, i32 }
+  %struct.mrXEllipticalCylinder = type { %struct.ggBRDF, float, float, float, float, float, float }
 
 define i32 @_ZNK21mrZEllipticalCylinder10viewingHitERK6ggRay3dddR18mrViewingHitRecordR16ggMaterialRecord(ptr %this, ptr %ray, double %unnamed_arg, double %tmin, double %tmax, ptr %VHR, ptr %unnamed_arg2) {
 entry:
-	%tmp80.i = getelementptr %struct.mrViewingHitRecord, ptr %VHR, i32 0, i32 1, i32 0, i32 0		; <ptr> [#uses=1]
-	store double 0.000000e+00, ptr %tmp80.i
-	br i1 false, label %return, label %cond_next.i
+  %tmp80.i = getelementptr %struct.mrViewingHitRecord, ptr %VHR, i32 0, i32 1, i32 0, i32 0		; <ptr> [#uses=1]
+  store double 0.000000e+00, ptr %tmp80.i
+  br i1 false, label %return, label %cond_next.i
 
 cond_next.i:		; preds = %entry
-	br i1 false, label %return, label %cond_true
+  br i1 false, label %return, label %cond_true
 
 cond_true:		; preds = %cond_next.i
-	%tmp3.i8 = getelementptr %struct.mrViewingHitRecord, ptr %VHR, i32 0, i32 1, i32 0, i32 0		; <ptr> [#uses=1]
-	%tmp46 = load double, ptr %tmp3.i8		; <double> [#uses=0]
-	ret i32 1
+  %tmp3.i8 = getelementptr %struct.mrViewingHitRecord, ptr %VHR, i32 0, i32 1, i32 0, i32 0		; <ptr> [#uses=1]
+  %tmp46 = load double, ptr %tmp3.i8		; <double> [#uses=0]
+  ret i32 1
 
 return:		; preds = %cond_next.i, %entry
-	ret i32 0
+  ret i32 0
 }
diff --git a/llvm/test/Transforms/NewGVN/2007-07-26-NonRedundant.ll b/llvm/test/Transforms/NewGVN/2007-07-26-NonRedundant.ll
index 8d3bfcd1367bf5..a64901e71b05a5 100644
--- a/llvm/test/Transforms/NewGVN/2007-07-26-NonRedundant.ll
+++ b/llvm/test/Transforms/NewGVN/2007-07-26-NonRedundant.ll
@@ -1,16 +1,17 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt < %s -passes=newgvn | llvm-dis
 
 @bsLive = external global i32		; <ptr> [#uses=2]
 
 define i32 @bsR(i32 %n) {
 entry:
-	br i1 false, label %cond_next, label %bb19
+  br i1 false, label %cond_next, label %bb19
 
 cond_next:		; preds = %entry
-	store i32 0, ptr @bsLive, align 4
-	br label %bb19
+  store i32 0, ptr @bsLive, align 4
+  br label %bb19
 
 bb19:		; preds = %cond_next, %entry
-	%tmp29 = load i32, ptr @bsLive, align 4		; <i32> [#uses=0]
-	ret i32 0
+  %tmp29 = load i32, ptr @bsLive, align 4		; <i32> [#uses=0]
+  ret i32 0
 }
diff --git a/llvm/test/Transforms/NewGVN/2007-07-26-PhiErasure.ll b/llvm/test/Transforms/NewGVN/2007-07-26-PhiErasure.ll
index 22d64324c2352e..46f9b84a7e3783 100644
--- a/llvm/test/Transforms/NewGVN/2007-07-26-PhiErasure.ll
+++ b/llvm/test/Transforms/NewGVN/2007-07-26-PhiErasure.ll
@@ -20,7 +20,7 @@ define i32 @reload(ptr %first, i32 %global, ptr %dumpfile) {
 ; CHECK:       cond_next2943:
 ; CHECK-NEXT:    br i1 false, label [[BB2982_PREHEADER:%.*]], label [[BB2928]]
 ; CHECK:       bb2982.preheader:
-; CHECK-NEXT:    store i8 poison, ptr null
+; CHECK-NEXT:    store i8 poison, ptr null, align 1
 ; CHECK-NEXT:    ret i32 poison
 ;
 cond_next2835.1:		; preds = %cond_next2861
diff --git a/llvm/test/Transforms/NewGVN/2007-07-30-PredIDom.ll b/llvm/test/Transforms/NewGVN/2007-07-30-PredIDom.ll
index 59da31c5e33e05..c70846054caf5a 100644
--- a/llvm/test/Transforms/NewGVN/2007-07-30-PredIDom.ll
+++ b/llvm/test/Transforms/NewGVN/2007-07-30-PredIDom.ll
@@ -1,274 +1,275 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt < %s -passes=newgvn | llvm-dis
 
-	%"struct.Block::$_16" = type { i32 }
-	%struct.Exp = type { ptr, i32, i32, i32, ptr, ptr, %"struct.Exp::$_10", %"struct.Block::$_16", %"struct.Exp::$_12" }
-	%"struct.Exp::$_10" = type { ptr }
-	%"struct.Exp::$_12" = type { ptr }
-	%struct.Exp_ = type { i32, i32, i32, i32, ptr }
-	%struct.Id = type { ptr, i32, i32, i32, %"struct.Id::$_13" }
-	%"struct.Id::$_13" = type { double }
+  %"struct.Block::$_16" = type { i32 }
+  %struct.Exp = type { ptr, i32, i32, i32, ptr, ptr, %"struct.Exp::$_10", %"struct.Block::$_16", %"struct.Exp::$_12" }
+  %"struct.Exp::$_10" = type { ptr }
+  %"struct.Exp::$_12" = type { ptr }
+  %struct.Exp_ = type { i32, i32, i32, i32, ptr }
+  %struct.Id = type { ptr, i32, i32, i32, %"struct.Id::$_13" }
+  %"struct.Id::$_13" = type { double }
 
 define ptr @_ZN3Exp8toStringEj(ptr %this, i32 %nextpc) {
 entry:
-	switch i32 0, label %bb970 [
-		 i32 1, label %bb
-		 i32 2, label %bb39
-		 i32 3, label %bb195
-		 i32 4, label %bb270
-		 i32 5, label %bb418
-		 i32 6, label %bb633
-		 i32 7, label %bb810
-		 i32 8, label %bb882
-		 i32 9, label %bb925
-	]
+  switch i32 0, label %bb970 [
+  i32 1, label %bb
+  i32 2, label %bb39
+  i32 3, label %bb195
+  i32 4, label %bb270
+  i32 5, label %bb418
+  i32 6, label %bb633
+  i32 7, label %bb810
+  i32 8, label %bb882
+  i32 9, label %bb925
+  ]
 
 bb:		; preds = %entry
-	store ptr null, ptr null
-	br label %return
+  store ptr null, ptr null
+  br label %return
 
 bb39:		; preds = %entry
-	br i1 false, label %cond_true, label %cond_false132
+  br i1 false, label %cond_true, label %cond_false132
 
 cond_true:		; preds = %bb39
-	br i1 false, label %cond_true73, label %cond_false
+  br i1 false, label %cond_true73, label %cond_false
 
 cond_true73:		; preds = %cond_true
-	br i1 false, label %cond_true108, label %cond_next
+  br i1 false, label %cond_true108, label %cond_next
 
 cond_true108:		; preds = %cond_true73
-	br label %cond_next
+  br label %cond_next
 
 cond_next:		; preds = %cond_true108, %cond_true73
-	br label %cond_next131
+  br label %cond_next131
 
 cond_false:		; preds = %cond_true
-	br label %cond_next131
+  br label %cond_next131
 
 cond_next131:		; preds = %cond_false, %cond_next
-	br label %cond_next141
+  br label %cond_next141
 
 cond_false132:		; preds = %bb39
-	br label %cond_next141
+  br label %cond_next141
 
 cond_next141:		; preds = %cond_false132, %cond_next131
-	br i1 false, label %cond_true169, label %cond_false175
+  br i1 false, label %cond_true169, label %cond_false175
 
 cond_true169:		; preds = %cond_next141
-	br label %cond_next181
+  br label %cond_next181
 
 cond_false175:		; preds = %cond_next141
-	br label %cond_next181
+  br label %cond_next181
 
 cond_next181:		; preds = %cond_false175, %cond_true169
-	br i1 false, label %cond_true189, label %cond_next191
+  br i1 false, label %cond_true189, label %cond_next191
 
 cond_true189:		; preds = %cond_next181
-	br label %cond_next191
+  br label %cond_next191
 
 cond_next191:		; preds = %cond_true189, %cond_next181
-	store ptr null, ptr null
-	br label %return
+  store ptr null, ptr null
+  br label %return
 
 bb195:		; preds = %entry
-	br i1 false, label %cond_true248, label %cond_false250
+  br i1 false, label %cond_true248, label %cond_false250
 
 cond_true248:		; preds = %bb195
-	br label %cond_next252
+  br label %cond_next252
 
 cond_false250:		; preds = %bb195
-	br label %cond_next252
+  br label %cond_next252
 
 cond_next252:		; preds = %cond_false250, %cond_true248
-	br i1 false, label %cond_true265, label %cond_next267
+  br i1 false, label %cond_true265, label %cond_next267
 
 cond_true265:		; preds = %cond_next252
-	br label %cond_next267
+  br label %cond_next267
 
 cond_next267:		; preds = %cond_true265, %cond_next252
-	store ptr null, ptr null
-	br label %return
+  store ptr null, ptr null
+  br label %return
 
 bb270:		; preds = %entry
-	br i1 false, label %cond_true338, label %cond_false340
+  br i1 false, label %cond_true338, label %cond_false340
 
 cond_true338:		; preds = %bb270
-	br label %cond_next342
+  br label %cond_next342
 
 cond_false340:		; preds = %bb270
-	br label %cond_next342
+  br label %cond_next342
 
 cond_next342:		; preds = %cond_false340, %cond_true338
-	br i1 false, label %cond_true362, label %cond_false364
+  br i1 false, label %cond_true362, label %cond_false364
 
 cond_true362:		; preds = %cond_next342
-	br label %cond_next366
+  br label %cond_next366
 
 cond_false364:		; preds = %cond_next342
-	br label %cond_next366
+  br label %cond_next366
 
 cond_next366:		; preds = %cond_false364, %cond_true362
-	br i1 false, label %cond_true393, label %cond_next395
+  br i1 false, label %cond_true393, label %cond_next395
 
 cond_true393:		; preds = %cond_next366
-	br label %cond_next395
+  br label %cond_next395
 
 cond_next395:		; preds = %cond_true393, %cond_next366
-	br i1 false, label %cond_true406, label %cond_next408
+  br i1 false, label %cond_true406, label %cond_next408
 
 cond_true406:		; preds = %cond_next395
-	br label %cond_next408
+  br label %cond_next408
 
 cond_next408:		; preds = %cond_true406, %cond_next395
-	br i1 false, label %cond_true413, label %cond_next415
+  br i1 false, label %cond_true413, label %cond_next415
 
 cond_true413:		; preds = %cond_next408
-	br label %cond_next415
+  br label %cond_next415
 
 cond_next415:		; preds = %cond_true413, %cond_next408
-	store ptr null, ptr null
-	br label %return
+  store ptr null, ptr null
+  br label %return
 
 bb418:		; preds = %entry
-	br i1 false, label %cond_true512, label %cond_false514
+  br i1 false, label %cond_true512, label %cond_false514
 
 cond_true512:		; preds = %bb418
-	br label %cond_next516
+  br label %cond_next516
 
 cond_false514:		; preds = %bb418
-	br label %cond_next516
+  br label %cond_next516
 
 cond_next516:		; preds = %cond_false514, %cond_true512
-	br i1 false, label %cond_true536, label %cond_false538
+  br i1 false, label %cond_true536, label %cond_false538
 
 cond_true536:		; preds = %cond_next516
-	br label %cond_next540
+  br label %cond_next540
 
 cond_false538:		; preds = %cond_next516
-	br label %cond_next540
+  br label %cond_next540
 
 cond_next540:		; preds = %cond_false538, %cond_true536
-	br i1 false, label %cond_true560, label %cond_false562
+  br i1 false, label %cond_true560, label %cond_false562
 
 cond_true560:		; preds = %cond_next540
-	br label %cond_next564
+  br label %cond_next564
 
 cond_false562:		; preds = %cond_next540
-	br label %cond_next564
+  br label %cond_next564
 
 cond_next564:		; preds = %cond_false562, %cond_true560
-	br i1 false, label %cond_true597, label %cond_next599
+  br i1 false, label %cond_true597, label %cond_next599
 
 cond_true597:		; preds = %cond_next564
-	br label %cond_next599
+  br label %cond_next599
 
 cond_next599:		; preds = %cond_true597, %cond_next564
-	br i1 false, label %cond_true614, label %cond_next616
+  br i1 false, label %cond_true614, label %cond_next616
 
 cond_true614:		; preds = %cond_next599
-	br label %cond_next616
+  br label %cond_next616
 
 cond_next616:		; preds = %cond_true614, %cond_next599
-	br i1 false, label %cond_true621, label %cond_next623
+  br i1 false, label %cond_true621, label %cond_next623
 
 cond_true621:		; preds = %cond_next616
-	br label %cond_next623
+  br label %cond_next623
 
 cond_next623:		; preds = %cond_true621, %cond_next616
-	br i1 false, label %cond_true628, label %cond_next630
+  br i1 false, label %cond_true628, label %cond_next630
 
 cond_true628:		; preds = %cond_next623
-	br label %cond_next630
+  br label %cond_next630
 
 cond_next630:		; preds = %cond_true628, %cond_next623
-	store ptr null, ptr null
-	br label %return
+  store ptr null, ptr null
+  br label %return
 
 bb633:		; preds = %entry
-	br i1 false, label %cond_true667, label %cond_next669
+  br i1 false, label %cond_true667, label %cond_next669
 
 cond_true667:		; preds = %bb633
-	br label %cond_next669
+  br label %cond_next669
 
 cond_next669:		; preds = %cond_true667, %bb633
-	br i1 false, label %cond_true678, label %cond_next791
+  br i1 false, label %cond_true678, label %cond_next791
 
 cond_true678:		; preds = %cond_next669
-	br label %bb735
+  br label %bb735
 
 bb679:		; preds = %bb735
-	br i1 false, label %cond_true729, label %cond_next731
+  br i1 false, label %cond_true729, label %cond_next731
 
 cond_true729:		; preds = %bb679
-	br label %cond_next731
+  br label %cond_next731
 
 cond_next731:		; preds = %cond_true729, %bb679
-	br label %bb735
+  br label %bb735
 
 bb735:		; preds = %cond_next731, %cond_true678
-	br i1 false, label %bb679, label %bb743
+  br i1 false, label %bb679, label %bb743
 
 bb743:		; preds = %bb735
-	br i1 false, label %cond_true788, label %cond_next790
+  br i1 false, label %cond_true788, label %cond_next790
 
 cond_true788:		; preds = %bb743
-	br label %cond_next790
+  br label %cond_next790
 
 cond_next790:		; preds = %cond_true788, %bb743
-	br label %cond_next791
+  br label %cond_next791
 
 cond_next791:		; preds = %cond_next790, %cond_next669
-	br i1 false, label %cond_true805, label %cond_next807
+  br i1 false, label %cond_true805, label %cond_next807
 
 cond_true805:		; preds = %cond_next791
-	br label %cond_next807
+  br label %cond_next807
 
 cond_next807:		; preds = %cond_true805, %cond_next791
-	store ptr null, ptr null
-	br label %return
+  store ptr null, ptr null
+  br label %return
 
 bb810:		; preds = %entry
-	br i1 false, label %cond_true870, label %cond_next872
+  br i1 false, label %cond_true870, label %cond_next872
 
 cond_true870:		; preds = %bb810
-	br label %cond_next872
+  br label %cond_next872
 
 cond_next872:		; preds = %cond_true870, %bb810
-	br i1 false, label %cond_true877, label %cond_next879
+  br i1 false, label %cond_true877, label %cond_next879
 
 cond_true877:		; preds = %cond_next872
-	br label %cond_next879
+  br label %cond_next879
 
 cond_next879:		; preds = %cond_true877, %cond_next872
-	store ptr null, ptr null
-	br label %return
+  store ptr null, ptr null
+  br label %return
 
 bb882:		; preds = %entry
-	br i1 false, label %cond_true920, label %cond_next922
+  br i1 false, label %cond_true920, label %cond_next922
 
 cond_true920:		; preds = %bb882
-	br label %cond_next922
+  br label %cond_next922
 
 cond_next922:		; preds = %cond_true920, %bb882
-	store ptr null, ptr null
-	br label %return
+  store ptr null, ptr null
+  br label %return
 
 bb925:		; preds = %entry
-	br i1 false, label %cond_true965, label %cond_next967
+  br i1 false, label %cond_true965, label %cond_next967
 
 cond_true965:		; preds = %bb925
-	br label %cond_next967
+  br label %cond_next967
 
 cond_next967:		; preds = %cond_true965, %bb925
-	store ptr null, ptr null
-	br label %return
+  store ptr null, ptr null
+  br label %return
 
 bb970:		; preds = %entry
-	unreachable
-		; No predecessors!
-	store ptr null, ptr null
-	br label %return
+  unreachable
+  ; No predecessors!
+  store ptr null, ptr null
+  br label %return
 
 return:		; preds = %0, %cond_next967, %cond_next922, %cond_next879, %cond_next807, %cond_next630, %cond_next415, %cond_next267, %cond_next191, %bb
-	%retval980 = load ptr, ptr null		; <ptr> [#uses=1]
-	ret ptr %retval980
+  %retval980 = load ptr, ptr null		; <ptr> [#uses=1]
+  ret ptr %retval980
 }
diff --git a/llvm/test/Transforms/NewGVN/2007-07-31-RedundantPhi.ll b/llvm/test/Transforms/NewGVN/2007-07-31-RedundantPhi.ll
index 934fffc986f815..5411bcc7d406ad 100644
--- a/llvm/test/Transforms/NewGVN/2007-07-31-RedundantPhi.ll
+++ b/llvm/test/Transforms/NewGVN/2007-07-31-RedundantPhi.ll
@@ -1,23 +1,37 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt < %s -passes=newgvn -S | FileCheck %s
 
 @img_width = external global i16		; <ptr> [#uses=2]
 
 define i32 @smpUMHEXBipredIntegerPelBlockMotionSearch(ptr %cur_pic, i16 signext  %ref, i32 %list, i32 %pic_pix_x, i32 %pic_pix_y, i32 %blocktype, i16 signext  %pred_mv_x1, i16 signext  %pred_mv_y1, i16 signext  %pred_mv_x2, i16 signext  %pred_mv_y2, ptr %mv_x, ptr %mv_y, ptr %s_mv_x, ptr %s_mv_y, i32 %search_range, i32 %min_mcost, i32 %lambda_factor) {
+; CHECK-LABEL: define i32 @smpUMHEXBipredIntegerPelBlockMotionSearch(
+; CHECK-SAME: ptr [[CUR_PIC:%.*]], i16 signext [[REF:%.*]], i32 [[LIST:%.*]], i32 [[PIC_PIX_X:%.*]], i32 [[PIC_PIX_Y:%.*]], i32 [[BLOCKTYPE:%.*]], i16 signext [[PRED_MV_X1:%.*]], i16 signext [[PRED_MV_Y1:%.*]], i16 signext [[PRED_MV_X2:%.*]], i16 signext [[PRED_MV_Y2:%.*]], ptr [[MV_X:%.*]], ptr [[MV_Y:%.*]], ptr [[S_MV_X:%.*]], ptr [[S_MV_Y:%.*]], i32 [[SEARCH_RANGE:%.*]], i32 [[MIN_MCOST:%.*]], i32 [[LAMBDA_FACTOR:%.*]]) {
+; CHECK-NEXT:  cond_next143:
+; CHECK-NEXT:    store i16 0, ptr @img_width, align 2
+; CHECK-NEXT:    br i1 false, label [[COND_NEXT449:%.*]], label [[COND_FALSE434:%.*]]
+; CHECK:       cond_false434:
+; CHECK-NEXT:    br label [[COND_NEXT449]]
+; CHECK:       cond_next449:
+; CHECK-NEXT:    br i1 false, label [[COND_NEXT698:%.*]], label [[COND_FALSE470:%.*]]
+; CHECK:       cond_false470:
+; CHECK-NEXT:    br label [[COND_NEXT698]]
+; CHECK:       cond_next698:
+; CHECK-NEXT:    ret i32 0
+;
 cond_next143:		; preds = %entry
-	store i16 0, ptr @img_width, align 2
-	br i1 false, label %cond_next449, label %cond_false434
+  store i16 0, ptr @img_width, align 2
+  br i1 false, label %cond_next449, label %cond_false434
 
 cond_false434:		; preds = %cond_true415
-	br label %cond_next449
+  br label %cond_next449
 
 cond_next449:		; preds = %cond_false434, %cond_true415
-	br i1 false, label %cond_next698, label %cond_false470
+  br i1 false, label %cond_next698, label %cond_false470
 
 cond_false470:		; preds = %cond_next449
-	br label %cond_next698
+  br label %cond_next698
 
 cond_next698:		; preds = %cond_true492
-	%tmp701 = load i16, ptr @img_width, align 2		; <i16> [#uses=0]
-; CHECK-NOT: %tmp701 =
-	ret i32 0
+  %tmp701 = load i16, ptr @img_width, align 2		; <i16> [#uses=0]
+  ret i32 0
 }
diff --git a/llvm/test/Transforms/NewGVN/2008-02-13-NewPHI.ll b/llvm/test/Transforms/NewGVN/2008-02-13-NewPHI.ll
index b2440fbced41f4..49d6de7d7a0f69 100644
--- a/llvm/test/Transforms/NewGVN/2008-02-13-NewPHI.ll
+++ b/llvm/test/Transforms/NewGVN/2008-02-13-NewPHI.ll
@@ -1,22 +1,23 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt < %s -passes=newgvn
 ; PR2032
 
 define i32 @sscal(i32 %n, double %sa1, ptr %sx, i32 %incx) {
 entry:
-	%sx_addr = alloca ptr		; <ptr> [#uses=3]
-	store ptr %sx, ptr %sx_addr, align 4
-	br label %bb33
+  %sx_addr = alloca ptr		; <ptr> [#uses=3]
+  store ptr %sx, ptr %sx_addr, align 4
+  br label %bb33
 
 bb:		; preds = %bb33
-	%tmp27 = load ptr, ptr %sx_addr, align 4		; <ptr> [#uses=1]
-	store float 0.000000e+00, ptr %tmp27, align 4
-	store ptr null, ptr %sx_addr, align 4
-	br label %bb33
+  %tmp27 = load ptr, ptr %sx_addr, align 4		; <ptr> [#uses=1]
+  store float 0.000000e+00, ptr %tmp27, align 4
+  store ptr null, ptr %sx_addr, align 4
+  br label %bb33
 
 bb33:		; preds = %bb, %entry
-	br i1 false, label %bb, label %return
+  br i1 false, label %bb, label %return
 
 return:		; preds = %bb33
-	%retval59 = load i32, ptr null, align 4		; <i32> [#uses=1]
-	ret i32 %retval59
+  %retval59 = load i32, ptr null, align 4		; <i32> [#uses=1]
+  ret i32 %retval59
 }
diff --git a/llvm/test/Transforms/NewGVN/2008-07-02-Unreachable.ll b/llvm/test/Transforms/NewGVN/2008-07-02-Unreachable.ll
index 0c1891e46d2675..cf591d7e2a79a0 100644
--- a/llvm/test/Transforms/NewGVN/2008-07-02-Unreachable.ll
+++ b/llvm/test/Transforms/NewGVN/2008-07-02-Unreachable.ll
@@ -1,36 +1,62 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt < %s -passes=newgvn -S | FileCheck %s
 ; PR2503
 
 @g_3 = external global i8		; <ptr> [#uses=2]
 
 define i8 @func_1(i32 %x, i32 %y) nounwind  {
+; CHECK-LABEL: define i8 @func_1(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A:%.*]] = alloca i8, align 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[X]], [[Y]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[IFELSE:%.*]], label [[IFTHEN:%.*]]
+; CHECK:       ifthen:
+; CHECK-NEXT:    br label [[IFEND:%.*]]
+; CHECK:       ifelse:
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr @g_3, align 1
+; CHECK-NEXT:    store i8 [[TMP3]], ptr [[A]], align 1
+; CHECK-NEXT:    br label [[AFTERFOR:%.*]]
+; CHECK:       forcond:
+; CHECK-NEXT:    store i8 poison, ptr null, align 1
+; CHECK-NEXT:    br i1 false, label [[AFTERFOR]], label [[FORBODY:%.*]]
+; CHECK:       forbody:
+; CHECK-NEXT:    store i8 poison, ptr null, align 1
+; CHECK-NEXT:    br label [[FORINC:%.*]]
+; CHECK:       forinc:
+; CHECK-NEXT:    store i8 poison, ptr null, align 1
+; CHECK-NEXT:    br label [[FORCOND:%.*]]
+; CHECK:       afterfor:
+; CHECK-NEXT:    ret i8 [[TMP3]]
+; CHECK:       ifend:
+; CHECK-NEXT:    ret i8 0
+;
 entry:
   %A = alloca i8
-    %cmp = icmp eq i32 %x, %y
-	br i1 %cmp, label %ifelse, label %ifthen
+  %cmp = icmp eq i32 %x, %y
+  br i1 %cmp, label %ifelse, label %ifthen
 
 ifthen:		; preds = %entry
-	br label %ifend
+  br label %ifend
 
 ifelse:		; preds = %entry
-	%tmp3 = load i8, ptr @g_3		; <i8> [#uses=0]
-        store i8 %tmp3, ptr %A
-	br label %afterfor
+  %tmp3 = load i8, ptr @g_3		; <i8> [#uses=0]
+  store i8 %tmp3, ptr %A
+  br label %afterfor
 
 forcond:		; preds = %forinc
-	br i1 false, label %afterfor, label %forbody
+  br i1 false, label %afterfor, label %forbody
 
 forbody:		; preds = %forcond
-	br label %forinc
+  br label %forinc
 
 forinc:		; preds = %forbody
-	br label %forcond
+  br label %forcond
 
 afterfor:		; preds = %forcond, %forcond.thread
-	%tmp10 = load i8, ptr @g_3		; <i8> [#uses=0]
-	ret i8 %tmp10
-; CHECK: ret i8 %tmp3
+  %tmp10 = load i8, ptr @g_3		; <i8> [#uses=0]
+  ret i8 %tmp10
 
 ifend:		; preds = %afterfor, %ifthen
-	ret i8 0
+  ret i8 0
 }
diff --git a/llvm/test/Transforms/NewGVN/2008-12-09-SelfRemove.ll b/llvm/test/Transforms/NewGVN/2008-12-09-SelfRemove.ll
index a2e252d065d2fb..95cb229f17f586 100644
--- a/llvm/test/Transforms/NewGVN/2008-12-09-SelfRemove.ll
+++ b/llvm/test/Transforms/NewGVN/2008-12-09-SelfRemove.ll
@@ -1,38 +1,41 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt < %s -passes=newgvn -S | FileCheck %s
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
 target triple = "i386-apple-darwin9.5"
-	%struct.anon = type { ptr, i32 }
-	%struct.d_print_info = type { i32, ptr, i32, i32, ptr, ptr, i32 }
-	%struct.d_print_mod = type { ptr, ptr, i32, ptr }
-	%struct.d_print_template = type { ptr, ptr }
-	%struct.demangle_component = type { i32, { %struct.anon } }
+  %struct.anon = type { ptr, i32 }
+  %struct.d_print_info = type { i32, ptr, i32, i32, ptr, ptr, i32 }
+  %struct.d_print_mod = type { ptr, ptr, i32, ptr }
+  %struct.d_print_template = type { ptr, ptr }
+  %struct.demangle_component = type { i32, { %struct.anon } }
 
 define void @d_print_mod_list(ptr %dpi, ptr %mods, i32 %suffix) nounwind {
+; CHECK-LABEL: define void @d_print_mod_list(
+; CHECK-SAME: ptr [[DPI:%.*]], ptr [[MODS:%.*]], i32 [[SUFFIX:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr [[STRUCT_D_PRINT_INFO:%.*]], ptr [[DPI]], i32 0, i32 1
+; CHECK-NEXT:    br i1 false, label [[RETURN:%.*]], label [[BB:%.*]]
+; CHECK:       bb:
+; CHECK-NEXT:    br label [[BB21:%.*]]
+; CHECK:       bb21:
+; CHECK-NEXT:    br label [[BB21]]
+; CHECK:       return:
+; CHECK-NEXT:    store i8 poison, ptr null, align 1
+; CHECK-NEXT:    ret void
+;
 entry:
-	%0 = getelementptr %struct.d_print_info, ptr %dpi, i32 0, i32 1		; <ptr> [#uses=1]
-	br i1 false, label %return, label %bb
+  %0 = getelementptr %struct.d_print_info, ptr %dpi, i32 0, i32 1		; <ptr> [#uses=1]
+  br i1 false, label %return, label %bb
 
 bb:		; preds = %entry
-	%1 = load ptr, ptr %0, align 4		; <ptr> [#uses=0]
-	%2 = getelementptr %struct.d_print_info, ptr %dpi, i32 0, i32 1		; <ptr> [#uses=0]
-	br label %bb21
+  %1 = load ptr, ptr %0, align 4		; <ptr> [#uses=0]
+  %2 = getelementptr %struct.d_print_info, ptr %dpi, i32 0, i32 1		; <ptr> [#uses=0]
+  br label %bb21
 
 bb21:		; preds = %bb21, %bb
-	br label %bb21
+  br label %bb21
 
 return:		; preds = %entry
-	ret void
+  ret void
 }
 
-; CHECK: define void @d_print_mod_list(ptr %dpi, ptr %mods, i32 %suffix) #0 {
-; CHECK: entry:
-; CHECK:   %0 = getelementptr %struct.d_print_info, ptr %dpi, i32 0, i32 1
-; CHECK:   br i1 false, label %return, label %bb
-; CHECK: bb:
-; CHECK:   br label %bb21
-; CHECK: bb21:
-; CHECK:   br label %bb21
-; CHECK: return:
-; CHECK:   ret void
-; CHECK: }
diff --git a/llvm/test/Transforms/NewGVN/2008-12-12-RLE-Crash.ll b/llvm/test/Transforms/NewGVN/2008-12-12-RLE-Crash.ll
index bb51f72752a257..3df35a15932837 100644
--- a/llvm/test/Transforms/NewGVN/2008-12-12-RLE-Crash.ll
+++ b/llvm/test/Transforms/NewGVN/2008-12-12-RLE-Crash.ll
@@ -1,35 +1,36 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt < %s -passes=newgvn | llvm-dis
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
 target triple = "i386-apple-darwin7"
 
 define i32 @main(i32 %argc, ptr %argv) nounwind {
 entry:
-	br label %bb84
+  br label %bb84
 
 bb41:		; preds = %bb82
-	%tmp = load i8, ptr %opt.0, align 1		; <i8> [#uses=0]
-	%tmp1 = getelementptr i8, ptr %opt.0, i32 1		; <ptr> [#uses=2]
-	switch i32 0, label %bb81 [
-		i32 102, label %bb82
-		i32 110, label %bb79
-		i32 118, label %bb80
-	]
+  %tmp = load i8, ptr %opt.0, align 1		; <i8> [#uses=0]
+  %tmp1 = getelementptr i8, ptr %opt.0, i32 1		; <ptr> [#uses=2]
+  switch i32 0, label %bb81 [
+  i32 102, label %bb82
+  i32 110, label %bb79
+  i32 118, label %bb80
+  ]
 
 bb79:		; preds = %bb41
-	br label %bb82
+  br label %bb82
 
 bb80:		; preds = %bb41
-	ret i32 0
+  ret i32 0
 
 bb81:		; preds = %bb41
-	ret i32 1
+  ret i32 1
 
 bb82:		; preds = %bb84, %bb79, %bb41
-	%opt.0 = phi ptr [ %tmp3, %bb84 ], [ %tmp1, %bb79 ], [ %tmp1, %bb41 ]		; <ptr> [#uses=3]
-	%tmp2 = load i8, ptr %opt.0, align 1		; <i8> [#uses=0]
-	br i1 false, label %bb84, label %bb41
+  %opt.0 = phi ptr [ %tmp3, %bb84 ], [ %tmp1, %bb79 ], [ %tmp1, %bb41 ]		; <ptr> [#uses=3]
+  %tmp2 = load i8, ptr %opt.0, align 1		; <i8> [#uses=0]
+  br i1 false, label %bb84, label %bb41
 
 bb84:		; preds = %bb82, %entry
-	%tmp3 = getelementptr i8, ptr null, i32 1		; <ptr> [#uses=1]
-	br label %bb82
+  %tmp3 = getelementptr i8, ptr null, i32 1		; <ptr> [#uses=1]
+  br label %bb82
 }
diff --git a/llvm/test/Transforms/NewGVN/2008-12-14-rle-reanalyze.ll b/llvm/test/Transforms/NewGVN/2008-12-14-rle-reanalyze.ll
index 38d1240fd6a29c..821355664a7f09 100644
--- a/llvm/test/Transforms/NewGVN/2008-12-14-rle-reanalyze.ll
+++ b/llvm/test/Transforms/NewGVN/2008-12-14-rle-reanalyze.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt < %s -passes=newgvn | llvm-dis
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
 target triple = "i386-apple-darwin7"
@@ -5,14 +6,14 @@ target triple = "i386-apple-darwin7"
 
 define i32 @Quiesce(i32 %alpha, i32 %beta, i32 %wtm, i32 %ply) nounwind {
 entry:
-	br label %bb22
+  br label %bb22
 
 bb22:		; preds = %bb23, %bb22, %entry
-	br i1 false, label %bb23, label %bb22
+  br i1 false, label %bb23, label %bb22
 
 bb23:		; preds = %bb23, %bb22
-	%sortv.233 = phi ptr [ @sort_value, %bb22 ], [ %sortv.2, %bb23 ]		; <ptr> [#uses=1]
-	%0 = load i32, ptr %sortv.233, align 4		; <i32> [#uses=0]
-	%sortv.2 = getelementptr [256 x i32], ptr @sort_value, i32 0, i32 0		; <ptr> [#uses=1]
-	br i1 false, label %bb23, label %bb22
+  %sortv.233 = phi ptr [ @sort_value, %bb22 ], [ %sortv.2, %bb23 ]		; <ptr> [#uses=1]
+  %0 = load i32, ptr %sortv.233, align 4		; <i32> [#uses=0]
+  %sortv.2 = getelementptr [256 x i32], ptr @sort_value, i32 0, i32 0		; <ptr> [#uses=1]
+  br i1 false, label %bb23, label %bb22
 }
diff --git a/llvm/test/Transforms/NewGVN/2008-12-15-CacheVisited.ll b/llvm/test/Transforms/NewGVN/2008-12-15-CacheVisited.ll
index 0b1761da0c33a1..ff9ecce88b458d 100644
--- a/llvm/test/Transforms/NewGVN/2008-12-15-CacheVisited.ll
+++ b/llvm/test/Transforms/NewGVN/2008-12-15-CacheVisited.ll
@@ -1,28 +1,29 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt < %s -passes=newgvn | llvm-dis
 ; Cached results must be added to and verified against the visited sets.
 ; PR3217
 
 define fastcc void @gen_field_die(ptr %decl) nounwind {
 entry:
-	br i1 false, label %bb203, label %bb202
+  br i1 false, label %bb203, label %bb202
 
 bb202:		; preds = %entry
-	unreachable
+  unreachable
 
 bb203:		; preds = %entry
-	%tmp = getelementptr i32, ptr %decl, i32 1		; <ptr> [#uses=1]
-	%tmp1 = load i32, ptr %tmp, align 4		; <i32> [#uses=0]
-	br i1 false, label %bb207, label %bb204
+  %tmp = getelementptr i32, ptr %decl, i32 1		; <ptr> [#uses=1]
+  %tmp1 = load i32, ptr %tmp, align 4		; <i32> [#uses=0]
+  br i1 false, label %bb207, label %bb204
 
 bb204:		; preds = %bb203
-	%tmp2 = getelementptr i32, ptr %decl, i32 1		; <ptr> [#uses=1]
-	br label %bb208
+  %tmp2 = getelementptr i32, ptr %decl, i32 1		; <ptr> [#uses=1]
+  br label %bb208
 
 bb207:		; preds = %bb203
-	br label %bb208
+  br label %bb208
 
 bb208:		; preds = %bb207, %bb204
-	%iftmp.1374.0.in = phi ptr [ null, %bb207 ], [ %tmp2, %bb204 ]		; <ptr> [#uses=1]
-	%iftmp.1374.0 = load i32, ptr %iftmp.1374.0.in		; <i32> [#uses=0]
-	unreachable
+  %iftmp.1374.0.in = phi ptr [ null, %bb207 ], [ %tmp2, %bb204 ]		; <ptr> [#uses=1]
+  %iftmp.1374.0 = load i32, ptr %iftmp.1374.0.in		; <i32> [#uses=0]
+  unreachable
 }
diff --git a/llvm/test/Transforms/NewGVN/2009-01-21-SortInvalidation.ll b/llvm/test/Transforms/NewGVN/2009-01-21-SortInvalidation.ll
index 8631cbd82f5d76..1f0a32a8745849 100644
--- a/llvm/test/Transforms/NewGVN/2009-01-21-SortInvalidation.ll
+++ b/llvm/test/Transforms/NewGVN/2009-01-21-SortInvalidation.ll
@@ -1,55 +1,56 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt < %s -passes=newgvn | llvm-dis
 ; PR3358
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
 target triple = "x86_64-unknown-linux-gnu"
-	%struct.re_pattern_buffer = type { ptr, i64, i64, i64, ptr, ptr, i64, i8 }
-	%struct.re_registers = type { i32, ptr, ptr }
+  %struct.re_pattern_buffer = type { ptr, i64, i64, i64, ptr, ptr, i64, i8 }
+  %struct.re_registers = type { i32, ptr, ptr }
 
 define fastcc i32 @byte_re_match_2_internal(ptr nocapture %bufp, ptr %string1, i32 %size1, ptr %string2, i32 %size2, i32 %pos, ptr %regs, i32 %stop) nounwind {
 entry:
-	br label %bb159
+  br label %bb159
 
 succeed_label:		; preds = %bb159
-	ret i32 0
+  ret i32 0
 
 bb159:		; preds = %bb664, %bb554, %bb159, %bb159, %bb159, %entry
-	%d.0 = phi ptr [ null, %entry ], [ %d.0, %bb159 ], [ %d.0, %bb554 ], [ %d.0, %bb159 ], [ %d.0, %bb159 ], [ %d.12, %bb664 ]		; <ptr> [#uses=5]
-	switch i32 0, label %bb661 [
-		i32 0, label %bb159
-		i32 1, label %succeed_label
-		i32 13, label %bb159
-		i32 14, label %bb159
-		i32 16, label %bb411
-		i32 24, label %bb622
-		i32 28, label %bb543
-	]
+  %d.0 = phi ptr [ null, %entry ], [ %d.0, %bb159 ], [ %d.0, %bb554 ], [ %d.0, %bb159 ], [ %d.0, %bb159 ], [ %d.12, %bb664 ]		; <ptr> [#uses=5]
+  switch i32 0, label %bb661 [
+  i32 0, label %bb159
+  i32 1, label %succeed_label
+  i32 13, label %bb159
+  i32 14, label %bb159
+  i32 16, label %bb411
+  i32 24, label %bb622
+  i32 28, label %bb543
+  ]
 
 bb411:		; preds = %bb411, %bb159
-	br label %bb411
+  br label %bb411
 
 bb543:		; preds = %bb159
-	br i1 false, label %bb549, label %bb550
+  br i1 false, label %bb549, label %bb550
 
 bb549:		; preds = %bb543
-	br label %bb554
+  br label %bb554
 
 bb550:		; preds = %bb543
-	br i1 false, label %bb554, label %bb552
+  br i1 false, label %bb554, label %bb552
 
 bb552:		; preds = %bb550
-	%0 = load i8, ptr %d.0, align 8		; <i8> [#uses=0]
-	br label %bb554
+  %0 = load i8, ptr %d.0, align 8		; <i8> [#uses=0]
+  br label %bb554
 
 bb554:		; preds = %bb552, %bb550, %bb549
-	br i1 false, label %bb159, label %bb661
+  br i1 false, label %bb159, label %bb661
 
 bb622:		; preds = %bb622, %bb159
-	br label %bb622
+  br label %bb622
 
 bb661:		; preds = %bb554, %bb159
-	%d.12 = select i1 false, ptr null, ptr null		; <ptr> [#uses=1]
-	br label %bb664
+  %d.12 = select i1 false, ptr null, ptr null		; <ptr> [#uses=1]
+  br label %bb664
 
 bb664:		; preds = %bb664, %bb661
-	br i1 false, label %bb159, label %bb664
+  br i1 false, label %bb159, label %bb664
 }
diff --git a/llvm/test/Transforms/NewGVN/2009-01-22-SortInvalidation.ll b/llvm/test/Transforms/NewGVN/2009-01-22-SortInvalidation.ll
index d8871700df6d8b..25f73dc8aa19d2 100644
--- a/llvm/test/Transforms/NewGVN/2009-01-22-SortInvalidation.ll
+++ b/llvm/test/Transforms/NewGVN/2009-01-22-SortInvalidation.ll
@@ -1,100 +1,101 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt < %s -passes=newgvn | llvm-dis
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
 target triple = "i386-apple-darwin7"
-	%struct..4sPragmaType = type { ptr, i32 }
-	%struct.AggInfo = type { i8, i8, i32, ptr, i32, ptr, i32, i32, i32, ptr, i32, i32 }
-	%struct.AggInfo_col = type { ptr, i32, i32, i32, i32, ptr }
-	%struct.AggInfo_func = type { ptr, ptr, i32, i32 }
-	%struct.AuxData = type { ptr, ptr }
-	%struct.Bitvec = type { i32, i32, i32, { [125 x i32] } }
-	%struct.BtCursor = type { ptr, ptr, ptr, ptr, ptr, ptr, i32, ptr, i32, %struct.CellInfo, i8, i8, ptr, i64, i32, i8, ptr }
-	%struct.BtLock = type { ptr, i32, i8, ptr }
-	%struct.BtShared = type { ptr, ptr, ptr, ptr, i8, i8, i8, i8, i8, i8, i8, i8, i32, i16, i16, i32, i32, i32, i32, i8, i32, ptr, ptr, ptr, %struct.BusyHandler, i32, ptr, ptr, ptr }
-	%struct.Btree = type { ptr, ptr, i8, i8, i8, i32, ptr, ptr }
-	%struct.BtreeMutexArray = type { i32, [11 x ptr] }
-	%struct.BusyHandler = type { ptr, ptr, i32 }
-	%struct.CellInfo = type { ptr, i64, i32, i32, i16, i16, i16, i16 }
-	%struct.CollSeq = type { ptr, i8, i8, ptr, ptr, ptr }
-	%struct.Column = type { ptr, ptr, ptr, ptr, i8, i8, i8, i8 }
-	%struct.Context = type { i64, i32, %struct.Fifo }
-	%struct.CountCtx = type { i64 }
-	%struct.Cursor = type { ptr, i32, i64, i64, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i64, ptr, i32, ptr, i64, ptr, ptr, i32, i64, ptr, ptr, i32, i32, ptr, ptr, ptr }
-	%struct.Db = type { ptr, ptr, i8, i8, ptr, ptr, ptr }
-	%struct.Expr = type { i8, i8, i16, ptr, ptr, ptr, ptr, %struct..4sPragmaType, %struct..4sPragmaType, i32, i32, ptr, i32, i32, ptr, ptr, i32 }
-	%struct.ExprList = type { i32, i32, i32, ptr }
-	%struct.ExprList_item = type { ptr, ptr, i8, i8, i8 }
-	%struct.FKey = type { ptr, ptr, ptr, ptr, i32, ptr, i8, i8, i8, i8 }
-	%struct.Fifo = type { i32, ptr, ptr }
-	%struct.FifoPage = type { i32, i32, i32, ptr, [1 x i64] }
-	%struct.FuncDef = type { i16, i8, i8, i8, ptr, ptr, ptr, ptr, ptr, [1 x i8] }
-	%struct.Hash = type { i8, i8, i32, i32, ptr, ptr }
-	%struct.HashElem = type { ptr, ptr, ptr, ptr, i32 }
-	%struct.IdList = type { ptr, i32, i32 }
-	%struct.Index = type { ptr, i32, ptr, ptr, ptr, i32, i8, i8, ptr, ptr, ptr, ptr, ptr }
-	%struct.KeyInfo = type { ptr, i8, i8, i8, i32, ptr, [1 x ptr] }
-	%struct.Mem = type { %struct.CountCtx, double, ptr, ptr, i32, i16, i8, i8, ptr }
-	%struct.MemPage = type { i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i16, i16, i16, i16, i16, i16, [5 x %struct._OvflCell], ptr, ptr, ptr, i32, ptr }
-	%struct.Module = type { ptr, ptr, ptr, ptr }
-	%struct.Op = type { i8, i8, i8, i8, i32, i32, i32, { i32 } }
-	%struct.Pager = type { ptr, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, %struct.PagerLruList, ptr, ptr, ptr, i64, i64, i64, i64, i64, i32, ptr, ptr, i32, ptr, ptr, [16 x i8] }
-	%struct.PagerLruLink = type { ptr, ptr }
-	%struct.PagerLruList = type { ptr, ptr, ptr }
-	%struct.Parse = type { ptr, i32, ptr, ptr, i8, i8, i8, i8, i8, i8, i8, [8 x i32], i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, [12 x i32], i32, ptr, i32, i32, i32, i32, i32, ptr, i8, %struct..4sPragmaType, %struct..4sPragmaType, %struct..4sPragmaType, ptr, ptr, ptr, ptr, ptr, ptr, %struct..4sPragmaType, i8, ptr, i32 }
-	%struct.PgHdr = type { ptr, i32, ptr, ptr, %struct.PagerLruLink, ptr, i8, i8, i8, i8, i8, i16, ptr, ptr, ptr }
-	%struct.Schema = type { i32, %struct.Hash, %struct.Hash, %struct.Hash, %struct.Hash, ptr, i8, i8, i16, i32, ptr }
-	%struct.Select = type { ptr, i8, i8, i8, i8, i8, i8, i8, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, i32, i32, [3 x i32] }
-	%struct.SrcList = type { i16, i16, [1 x %struct.SrcList_item] }
-	%struct.SrcList_item = type { ptr, ptr, ptr, ptr, ptr, i8, i8, i32, ptr, ptr, i64 }
-	%struct.Table = type { ptr, i32, ptr, i32, ptr, i32, ptr, i32, ptr, ptr, ptr, ptr, i32, i8, i8, i8, i8, i8, i8, i8, ptr, ptr, i32, ptr, ptr }
-	%struct.TableLock = type { i32, i32, i8, ptr }
-	%struct.Trigger = type { ptr, ptr, i8, i8, ptr, ptr, %struct..4sPragmaType, ptr, ptr, ptr, ptr }
-	%struct.TriggerStack = type { ptr, i32, i32, i32, i32, i32, i32, ptr, ptr }
-	%struct.TriggerStep = type { i32, i32, ptr, ptr, %struct..4sPragmaType, ptr, ptr, ptr, ptr, ptr }
-	%struct.Vdbe = type { ptr, ptr, ptr, i32, i32, ptr, i32, i32, ptr, ptr, ptr, i32, ptr, i32, ptr, ptr, i32, i32, i32, ptr, i32, i32, %struct.Fifo, i32, i32, ptr, i32, i32, i32, i32, i32, [25 x i32], i32, i32, ptr, ptr, ptr, i8, i8, i8, i8, i8, i8, i32, i64, i32, %struct.BtreeMutexArray, i32, ptr, i32 }
-	%struct.VdbeFunc = type { ptr, i32, [1 x %struct.AuxData] }
-	%struct._OvflCell = type { ptr, i16 }
-	%struct._ht = type { i32, ptr }
-	%struct.anon = type { double }
-	%struct.sColMap = type { i32, ptr }
-	%struct.sqlite3 = type { ptr, i32, ptr, i32, i32, i32, i32, i8, i8, i8, i8, i32, ptr, i64, i64, i32, i32, i32, ptr, %struct.sqlite3InitInfo, i32, ptr, ptr, i32, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, %struct.anon, ptr, ptr, ptr, ptr, i32, %struct.Hash, ptr, ptr, i32, %struct.Hash, %struct.Hash, %struct.BusyHandler, i32, [2 x %struct.Db], i8 }
-	%struct.sqlite3InitInfo = type { i32, i32, i8 }
-	%struct.sqlite3_context = type { ptr, ptr, %struct.Mem, ptr, i32, ptr }
-	%struct.sqlite3_file = type { ptr }
-	%struct.sqlite3_index_constraint = type { i32, i8, i8, i32 }
-	%struct.sqlite3_index_constraint_usage = type { i32, i8 }
-	%struct.sqlite3_index_info = type { i32, ptr, i32, ptr, ptr, i32, ptr, i32, i32, double }
-	%struct.sqlite3_io_methods = type { i32, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr }
-	%struct.sqlite3_module = type { i32, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr }
-	%struct.sqlite3_mutex = type opaque
-	%struct.sqlite3_vfs = type { i32, i32, i32, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr }
-	%struct.sqlite3_vtab = type { ptr, i32, ptr }
-	%struct.sqlite3_vtab_cursor = type { ptr }
+  %struct..4sPragmaType = type { ptr, i32 }
+  %struct.AggInfo = type { i8, i8, i32, ptr, i32, ptr, i32, i32, i32, ptr, i32, i32 }
+  %struct.AggInfo_col = type { ptr, i32, i32, i32, i32, ptr }
+  %struct.AggInfo_func = type { ptr, ptr, i32, i32 }
+  %struct.AuxData = type { ptr, ptr }
+  %struct.Bitvec = type { i32, i32, i32, { [125 x i32] } }
+  %struct.BtCursor = type { ptr, ptr, ptr, ptr, ptr, ptr, i32, ptr, i32, %struct.CellInfo, i8, i8, ptr, i64, i32, i8, ptr }
+  %struct.BtLock = type { ptr, i32, i8, ptr }
+  %struct.BtShared = type { ptr, ptr, ptr, ptr, i8, i8, i8, i8, i8, i8, i8, i8, i32, i16, i16, i32, i32, i32, i32, i8, i32, ptr, ptr, ptr, %struct.BusyHandler, i32, ptr, ptr, ptr }
+  %struct.Btree = type { ptr, ptr, i8, i8, i8, i32, ptr, ptr }
+  %struct.BtreeMutexArray = type { i32, [11 x ptr] }
+  %struct.BusyHandler = type { ptr, ptr, i32 }
+  %struct.CellInfo = type { ptr, i64, i32, i32, i16, i16, i16, i16 }
+  %struct.CollSeq = type { ptr, i8, i8, ptr, ptr, ptr }
+  %struct.Column = type { ptr, ptr, ptr, ptr, i8, i8, i8, i8 }
+  %struct.Context = type { i64, i32, %struct.Fifo }
+  %struct.CountCtx = type { i64 }
+  %struct.Cursor = type { ptr, i32, i64, i64, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i64, ptr, i32, ptr, i64, ptr, ptr, i32, i64, ptr, ptr, i32, i32, ptr, ptr, ptr }
+  %struct.Db = type { ptr, ptr, i8, i8, ptr, ptr, ptr }
+  %struct.Expr = type { i8, i8, i16, ptr, ptr, ptr, ptr, %struct..4sPragmaType, %struct..4sPragmaType, i32, i32, ptr, i32, i32, ptr, ptr, i32 }
+  %struct.ExprList = type { i32, i32, i32, ptr }
+  %struct.ExprList_item = type { ptr, ptr, i8, i8, i8 }
+  %struct.FKey = type { ptr, ptr, ptr, ptr, i32, ptr, i8, i8, i8, i8 }
+  %struct.Fifo = type { i32, ptr, ptr }
+  %struct.FifoPage = type { i32, i32, i32, ptr, [1 x i64] }
+  %struct.FuncDef = type { i16, i8, i8, i8, ptr, ptr, ptr, ptr, ptr, [1 x i8] }
+  %struct.Hash = type { i8, i8, i32, i32, ptr, ptr }
+  %struct.HashElem = type { ptr, ptr, ptr, ptr, i32 }
+  %struct.IdList = type { ptr, i32, i32 }
+  %struct.Index = type { ptr, i32, ptr, ptr, ptr, i32, i8, i8, ptr, ptr, ptr, ptr, ptr }
+  %struct.KeyInfo = type { ptr, i8, i8, i8, i32, ptr, [1 x ptr] }
+  %struct.Mem = type { %struct.CountCtx, double, ptr, ptr, i32, i16, i8, i8, ptr }
+  %struct.MemPage = type { i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i16, i16, i16, i16, i16, i16, [5 x %struct._OvflCell], ptr, ptr, ptr, i32, ptr }
+  %struct.Module = type { ptr, ptr, ptr, ptr }
+  %struct.Op = type { i8, i8, i8, i8, i32, i32, i32, { i32 } }
+  %struct.Pager = type { ptr, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, %struct.PagerLruList, ptr, ptr, ptr, i64, i64, i64, i64, i64, i32, ptr, ptr, i32, ptr, ptr, [16 x i8] }
+  %struct.PagerLruLink = type { ptr, ptr }
+  %struct.PagerLruList = type { ptr, ptr, ptr }
+  %struct.Parse = type { ptr, i32, ptr, ptr, i8, i8, i8, i8, i8, i8, i8, [8 x i32], i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, [12 x i32], i32, ptr, i32, i32, i32, i32, i32, ptr, i8, %struct..4sPragmaType, %struct..4sPragmaType, %struct..4sPragmaType, ptr, ptr, ptr, ptr, ptr, ptr, %struct..4sPragmaType, i8, ptr, i32 }
+  %struct.PgHdr = type { ptr, i32, ptr, ptr, %struct.PagerLruLink, ptr, i8, i8, i8, i8, i8, i16, ptr, ptr, ptr }
+  %struct.Schema = type { i32, %struct.Hash, %struct.Hash, %struct.Hash, %struct.Hash, ptr, i8, i8, i16, i32, ptr }
+  %struct.Select = type { ptr, i8, i8, i8, i8, i8, i8, i8, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, i32, i32, [3 x i32] }
+  %struct.SrcList = type { i16, i16, [1 x %struct.SrcList_item] }
+  %struct.SrcList_item = type { ptr, ptr, ptr, ptr, ptr, i8, i8, i32, ptr, ptr, i64 }
+  %struct.Table = type { ptr, i32, ptr, i32, ptr, i32, ptr, i32, ptr, ptr, ptr, ptr, i32, i8, i8, i8, i8, i8, i8, i8, ptr, ptr, i32, ptr, ptr }
+  %struct.TableLock = type { i32, i32, i8, ptr }
+  %struct.Trigger = type { ptr, ptr, i8, i8, ptr, ptr, %struct..4sPragmaType, ptr, ptr, ptr, ptr }
+  %struct.TriggerStack = type { ptr, i32, i32, i32, i32, i32, i32, ptr, ptr }
+  %struct.TriggerStep = type { i32, i32, ptr, ptr, %struct..4sPragmaType, ptr, ptr, ptr, ptr, ptr }
+  %struct.Vdbe = type { ptr, ptr, ptr, i32, i32, ptr, i32, i32, ptr, ptr, ptr, i32, ptr, i32, ptr, ptr, i32, i32, i32, ptr, i32, i32, %struct.Fifo, i32, i32, ptr, i32, i32, i32, i32, i32, [25 x i32], i32, i32, ptr, ptr, ptr, i8, i8, i8, i8, i8, i8, i32, i64, i32, %struct.BtreeMutexArray, i32, ptr, i32 }
+  %struct.VdbeFunc = type { ptr, i32, [1 x %struct.AuxData] }
+  %struct._OvflCell = type { ptr, i16 }
+  %struct._ht = type { i32, ptr }
+  %struct.anon = type { double }
+  %struct.sColMap = type { i32, ptr }
+  %struct.sqlite3 = type { ptr, i32, ptr, i32, i32, i32, i32, i8, i8, i8, i8, i32, ptr, i64, i64, i32, i32, i32, ptr, %struct.sqlite3InitInfo, i32, ptr, ptr, i32, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, %struct.anon, ptr, ptr, ptr, ptr, i32, %struct.Hash, ptr, ptr, i32, %struct.Hash, %struct.Hash, %struct.BusyHandler, i32, [2 x %struct.Db], i8 }
+  %struct.sqlite3InitInfo = type { i32, i32, i8 }
+  %struct.sqlite3_context = type { ptr, ptr, %struct.Mem, ptr, i32, ptr }
+  %struct.sqlite3_file = type { ptr }
+  %struct.sqlite3_index_constraint = type { i32, i8, i8, i32 }
+  %struct.sqlite3_index_constraint_usage = type { i32, i8 }
+  %struct.sqlite3_index_info = type { i32, ptr, i32, ptr, ptr, i32, ptr, i32, i32, double }
+  %struct.sqlite3_io_methods = type { i32, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr }
+  %struct.sqlite3_module = type { i32, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr }
+  %struct.sqlite3_mutex = type opaque
+  %struct.sqlite3_vfs = type { i32, i32, i32, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr }
+  %struct.sqlite3_vtab = type { ptr, i32, ptr }
+  %struct.sqlite3_vtab_cursor = type { ptr }
 
 define fastcc void @sqlite3Insert(ptr %pParse, ptr %pTabList, ptr %pList, ptr %pSelect, ptr %pColumn, i32 %onError) nounwind {
 entry:
-	br i1 false, label %bb54, label %bb69.loopexit
+  br i1 false, label %bb54, label %bb69.loopexit
 
 bb54:		; preds = %entry
-	br label %bb69.loopexit
+  br label %bb69.loopexit
 
 bb59:		; preds = %bb63.preheader
-	%0 = load ptr, ptr %3, align 4		; <ptr> [#uses=0]
-	br label %bb65
+  %0 = load ptr, ptr %3, align 4		; <ptr> [#uses=0]
+  br label %bb65
 
 bb65:		; preds = %bb63.preheader, %bb59
-	%1 = load ptr, ptr %4, align 4		; <ptr> [#uses=0]
-	br i1 false, label %bb67, label %bb63.preheader
+  %1 = load ptr, ptr %4, align 4		; <ptr> [#uses=0]
+  br i1 false, label %bb67, label %bb63.preheader
 
 bb67:		; preds = %bb65
-	%2 = getelementptr %struct.IdList, ptr %pColumn, i32 0, i32 0		; <ptr> [#uses=0]
-	unreachable
+  %2 = getelementptr %struct.IdList, ptr %pColumn, i32 0, i32 0		; <ptr> [#uses=0]
+  unreachable
 
 bb69.loopexit:		; preds = %bb54, %entry
-	%3 = getelementptr %struct.IdList, ptr %pColumn, i32 0, i32 0		; <ptr> [#uses=1]
-	%4 = getelementptr %struct.IdList, ptr %pColumn, i32 0, i32 0		; <ptr> [#uses=1]
-	br label %bb63.preheader
+  %3 = getelementptr %struct.IdList, ptr %pColumn, i32 0, i32 0		; <ptr> [#uses=1]
+  %4 = getelementptr %struct.IdList, ptr %pColumn, i32 0, i32 0		; <ptr> [#uses=1]
+  br label %bb63.preheader
 
 bb63.preheader:		; preds = %bb69.loopexit, %bb65
-	br i1 false, label %bb59, label %bb65
+  br i1 false, label %bb59, label %bb65
 }
diff --git a/llvm/test/Transforms/NewGVN/2009-03-10-PREOnVoid.ll b/llvm/test/Transforms/NewGVN/2009-03-10-PREOnVoid.ll
index 6aa79e0433d486..83177e5498fe83 100644
--- a/llvm/test/Transforms/NewGVN/2009-03-10-PREOnVoid.ll
+++ b/llvm/test/Transforms/NewGVN/2009-03-10-PREOnVoid.ll
@@ -1,21 +1,22 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt < %s -passes=newgvn -disable-output
 ; PR3775
 
 ; ModuleID = 'bugpoint-reduced-simplified.bc'
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32"
 target triple = "i386-pc-linux-gnu"
-	%llvm.dbg.anchor.type = type { i32, i32 }
-	%"struct.__gnu_cxx::hash<ptr>" = type <{ i8 }>
-	%struct.__sched_param = type { i32 }
-	%struct._pthread_descr_struct = type opaque
-	%struct.pthread_attr_t = type { i32, i32, %struct.__sched_param, i32, i32, i32, i32, ptr, i32 }
-	%struct.pthread_mutex_t = type { i32, i32, ptr, i32, %llvm.dbg.anchor.type }
-	%"struct.std::_Rb_tree<ptr,std::pair<ptr const, std::vector<ShadowInfo, std::allocator<ShadowInfo> > >,std::_Select1st<std::pair<ptr const, std::vector<ShadowInfo, std::allocator<ShadowInfo> > > >,std::less<ptr>,std::allocator<std::pair<ptr const, std::vector<ShadowInfo, std::allocator<ShadowInfo> > > > >" = type { %"struct.std::_Rb_tree<ptr,std::pair<ptr const, std::vector<ShadowInfo, std::allocator<ShadowInfo> > >,std::_Select1st<std::pair<ptr const, std::vector<ShadowInfo, std::allocator<ShadowInfo> > > >,std::less<ptr>,std::allocator<std::pair<ptr const, std::vector<ShadowInfo, std::allocator<ShadowInfo> > > > >::_Rb_tree_impl<std::less<ptr>,false>" }
-	%"struct.std::_Rb_tree<ptr,std::pair<ptr const, std::vector<ShadowInfo, std::allocator<ShadowInfo> > >,std::_Select1st<std::pair<ptr const, std::vector<ShadowInfo, std::allocator<ShadowInfo> > > >,std::less<ptr>,std::allocator<std::pair<ptr const, std::vector<ShadowInfo, std::allocator<ShadowInfo> > > > >::_Rb_tree_impl<std::less<ptr>,false>" = type { %"struct.__gnu_cxx::hash<ptr>", %"struct.std::_Rb_tree_node_base", i32 }
-	%"struct.std::_Rb_tree_iterator<std::pair<ptr const, std::vector<ShadowInfo, std::allocator<ShadowInfo> > > >" = type { ptr }
-	%"struct.std::_Rb_tree_node_base" = type { i32, ptr, ptr, ptr }
-	%"struct.std::pair<std::_Rb_tree_iterator<std::pair<ptr const, std::vector<ShadowInfo, std::allocator<ShadowInfo> > > >,bool>" = type { %"struct.std::_Rb_tree_iterator<std::pair<ptr const, std::vector<ShadowInfo, std::allocator<ShadowInfo> > > >", i8 }
-	%"struct.std::pair<ptr const,ptr>" = type { ptr, ptr }
+  %llvm.dbg.anchor.type = type { i32, i32 }
+  %"struct.__gnu_cxx::hash<ptr>" = type <{ i8 }>
+  %struct.__sched_param = type { i32 }
+  %struct._pthread_descr_struct = type opaque
+  %struct.pthread_attr_t = type { i32, i32, %struct.__sched_param, i32, i32, i32, i32, ptr, i32 }
+  %struct.pthread_mutex_t = type { i32, i32, ptr, i32, %llvm.dbg.anchor.type }
+  %"struct.std::_Rb_tree<ptr,std::pair<ptr const, std::vector<ShadowInfo, std::allocator<ShadowInfo> > >,std::_Select1st<std::pair<ptr const, std::vector<ShadowInfo, std::allocator<ShadowInfo> > > >,std::less<ptr>,std::allocator<std::pair<ptr const, std::vector<ShadowInfo, std::allocator<ShadowInfo> > > > >" = type { %"struct.std::_Rb_tree<ptr,std::pair<ptr const, std::vector<ShadowInfo, std::allocator<ShadowInfo> > >,std::_Select1st<std::pair<ptr const, std::vector<ShadowInfo, std::allocator<ShadowInfo> > > >,std::less<ptr>,std::allocator<std::pair<ptr const, std::vector<ShadowInfo, std::allocator<ShadowInfo> > > > >::_Rb_tree_impl<std::less<ptr>,false>" }
+  %"struct.std::_Rb_tree<ptr,std::pair<ptr const, std::vector<ShadowInfo, std::allocator<ShadowInfo> > >,std::_Select1st<std::pair<ptr const, std::vector<ShadowInfo, std::allocator<ShadowInfo> > > >,std::less<ptr>,std::allocator<std::pair<ptr const, std::vector<ShadowInfo, std::allocator<ShadowInfo> > > > >::_Rb_tree_impl<std::less<ptr>,false>" = type { %"struct.__gnu_cxx::hash<ptr>", %"struct.std::_Rb_tree_node_base", i32 }
+  %"struct.std::_Rb_tree_iterator<std::pair<ptr const, std::vector<ShadowInfo, std::allocator<ShadowInfo> > > >" = type { ptr }
+  %"struct.std::_Rb_tree_node_base" = type { i32, ptr, ptr, ptr }
+  %"struct.std::pair<std::_Rb_tree_iterator<std::pair<ptr const, std::vector<ShadowInfo, std::allocator<ShadowInfo> > > >,bool>" = type { %"struct.std::_Rb_tree_iterator<std::pair<ptr const, std::vector<ShadowInfo, std::allocator<ShadowInfo> > > >", i8 }
+  %"struct.std::pair<ptr const,ptr>" = type { ptr, ptr }
 
 @_ZL20__gthrw_pthread_oncePiPFvvE = weak alias i32 (ptr, ptr), ptr @pthread_once		; <ptr> [#uses=0]
 @_ZL27__gthrw_pthread_getspecificj = weak alias ptr (i32), ptr @pthread_getspecific		; <ptr> [#uses=0]
@@ -36,75 +37,75 @@ declare fastcc void @_ZNSt10_Select1stISt4pairIKPvS1_EEC1Ev() nounwind readnone
 
 define fastcc void @_ZNSt8_Rb_treeIPvSt4pairIKS0_S0_ESt10_Select1stIS3_ESt4lessIS0_ESaIS3_EE16_M_insert_uniqueERKS3_(ptr noalias nocapture sret(%"struct.std::pair<std::_Rb_tree_iterator<std::pair<ptr const, std::vector<ShadowInfo, std::allocator<ShadowInfo> > > >,bool>") %agg.result, ptr %this, ptr %__v) nounwind {
 entry:
-	br i1 false, label %bb7, label %bb
+  br i1 false, label %bb7, label %bb
 
 bb:		; preds = %bb, %entry
-	br i1 false, label %bb5, label %bb
+  br i1 false, label %bb5, label %bb
 
 bb5:		; preds = %bb
-	call fastcc void @_ZNSt10_Select1stISt4pairIKPvS1_EEC1Ev() nounwind
-	br i1 false, label %bb11, label %bb7
+  call fastcc void @_ZNSt10_Select1stISt4pairIKPvS1_EEC1Ev() nounwind
+  br i1 false, label %bb11, label %bb7
 
 bb7:		; preds = %bb5, %entry
-	br label %bb11
+  br label %bb11
 
 bb11:		; preds = %bb7, %bb5
-	call fastcc void @_ZNSt10_Select1stISt4pairIKPvS1_EEC1Ev() nounwind
-	unreachable
+  call fastcc void @_ZNSt10_Select1stISt4pairIKPvS1_EEC1Ev() nounwind
+  unreachable
 }
 
 define i32 @pthread_once(ptr, ptr) {
-       ret i32 0
+  ret i32 0
 }
 
 define ptr @pthread_getspecific(i32) {
-       ret ptr null
+  ret ptr null
 }
 
 define i32 @pthread_setspecific(i32, ptr) {
-        ret i32 0
+  ret i32 0
 }
 
 define i32 @pthread_create(ptr, ptr, ptr, ptr) {
-       ret i32 0
+  ret i32 0
 }
 
 define i32 @pthread_cancel(i32) {
-      ret i32 0
+  ret i32 0
 }
 
 define i32 @pthread_mutex_lock(ptr) {
-       ret i32 0
+  ret i32 0
 }
 
 define i32 @pthread_mutex_trylock(ptr) {
-       ret i32 0
+  ret i32 0
 }
 
 define i32 @pthread_mutex_unlock(ptr) {
-       ret i32 0
+  ret i32 0
 }
 
 define i32 @pthread_mutex_init(ptr, ptr) {
-        ret i32 0
+  ret i32 0
 }
 
 define i32 @pthread_key_create(ptr, ptr) {
-       ret i32 0
+  ret i32 0
 }
 
 define i32 @pthread_key_delete(i32) {
-        ret i32 0
+  ret i32 0
 }
 
 define i32 @pthread_mutexattr_init(ptr) {
-        ret i32 0
+  ret i32 0
 }
 
 define i32 @pthread_mutexattr_settype(ptr, i32) {
-        ret i32 0
+  ret i32 0
 }
 
 define i32 @pthread_mutexattr_destroy(ptr) {
-       ret i32 0
+  ret i32 0
 }
diff --git a/llvm/test/Transforms/NewGVN/2009-07-13-MemDepSortFail.ll b/llvm/test/Transforms/NewGVN/2009-07-13-MemDepSortFail.ll
index 24ad1852780cdb..9694ae47e977c5 100644
--- a/llvm/test/Transforms/NewGVN/2009-07-13-MemDepSortFail.ll
+++ b/llvm/test/Transforms/NewGVN/2009-07-13-MemDepSortFail.ll
@@ -1,67 +1,68 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt < %s -passes=newgvn | llvm-dis
 ; PR4256
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32"
 target triple = "i386-pc-linux-gnu"
-	%llvm.dbg.anchor.type = type { i32, i32 }
-	%struct.cset = type { ptr, i8, i8, i32, ptr }
-	%struct.lmat = type { ptr, i32, ptr, ptr, ptr, ptr, ptr, ptr, i32, ptr, ptr, ptr, ptr, ptr }
-	%struct.re_guts = type { ptr, ptr, ptr, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, ptr, ptr, i32, i32, i32, i32, [1 x i8] }
+  %llvm.dbg.anchor.type = type { i32, i32 }
+  %struct.cset = type { ptr, i8, i8, i32, ptr }
+  %struct.lmat = type { ptr, i32, ptr, ptr, ptr, ptr, ptr, ptr, i32, ptr, ptr, ptr, ptr, ptr }
+  %struct.re_guts = type { ptr, ptr, ptr, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, ptr, ptr, i32, i32, i32, i32, [1 x i8] }
 
 define ptr @lbackref(ptr %m, ptr %start, ptr %stop, i32 %startst, i32 %stopst, i32 %lev, i32 %rec) nounwind {
 entry:
-	br label %bb63
+  br label %bb63
 
 bb:		; preds = %bb63
-	switch i32 0, label %bb62 [
-		i32 268435456, label %bb2
-		i32 805306368, label %bb9
-		i32 -1610612736, label %bb51
-	]
+  switch i32 0, label %bb62 [
+  i32 268435456, label %bb2
+  i32 805306368, label %bb9
+  i32 -1610612736, label %bb51
+  ]
 
 bb2:		; preds = %bb
-	br label %bb62
+  br label %bb62
 
 bb9:		; preds = %bb
-	%0 = load i8, ptr %sp.1, align 1		; <i8> [#uses=0]
-	br label %bb62
+  %0 = load i8, ptr %sp.1, align 1		; <i8> [#uses=0]
+  br label %bb62
 
 bb51:		; preds = %bb
-	%1 = load i8, ptr %sp.1, align 1		; <i8> [#uses=0]
-	ret ptr null
+  %1 = load i8, ptr %sp.1, align 1		; <i8> [#uses=0]
+  ret ptr null
 
 bb62:		; preds = %bb9, %bb2, %bb
-	br label %bb63
+  br label %bb63
 
 bb63:		; preds = %bb84, %bb69, %bb62, %entry
-	%sp.1 = phi ptr [ null, %bb62 ], [ %sp.1.lcssa, %bb84 ], [ %start, %entry ], [ %sp.1.lcssa, %bb69 ]		; <ptr> [#uses=3]
-	br i1 false, label %bb, label %bb65
+  %sp.1 = phi ptr [ null, %bb62 ], [ %sp.1.lcssa, %bb84 ], [ %start, %entry ], [ %sp.1.lcssa, %bb69 ]		; <ptr> [#uses=3]
+  br i1 false, label %bb, label %bb65
 
 bb65:		; preds = %bb63
-	%sp.1.lcssa = phi ptr [ %sp.1, %bb63 ]		; <ptr> [#uses=4]
-	br i1 false, label %bb66, label %bb69
+  %sp.1.lcssa = phi ptr [ %sp.1, %bb63 ]		; <ptr> [#uses=4]
+  br i1 false, label %bb66, label %bb69
 
 bb66:		; preds = %bb65
-	ret ptr null
+  ret ptr null
 
 bb69:		; preds = %bb65
-	switch i32 0, label %bb108.loopexit2.loopexit.loopexit [
-		i32 1342177280, label %bb63
-		i32 1476395008, label %bb84
-		i32 1879048192, label %bb104
-		i32 2013265920, label %bb93
-	]
+  switch i32 0, label %bb108.loopexit2.loopexit.loopexit [
+  i32 1342177280, label %bb63
+  i32 1476395008, label %bb84
+  i32 1879048192, label %bb104
+  i32 2013265920, label %bb93
+  ]
 
 bb84:		; preds = %bb69
-	%2 = tail call ptr @lbackref(ptr %m, ptr %sp.1.lcssa, ptr %stop, i32 0, i32 %stopst, i32 0, i32 0) nounwind		; <ptr> [#uses=0]
-	br label %bb63
+  %2 = tail call ptr @lbackref(ptr %m, ptr %sp.1.lcssa, ptr %stop, i32 0, i32 %stopst, i32 0, i32 0) nounwind		; <ptr> [#uses=0]
+  br label %bb63
 
 bb93:		; preds = %bb69
-	ret ptr null
+  ret ptr null
 
 bb104:		; preds = %bb69
-	%sp.1.lcssa.lcssa33 = phi ptr [ %sp.1.lcssa, %bb69 ]		; <ptr> [#uses=0]
-	unreachable
+  %sp.1.lcssa.lcssa33 = phi ptr [ %sp.1.lcssa, %bb69 ]		; <ptr> [#uses=0]
+  unreachable
 
 bb108.loopexit2.loopexit.loopexit:		; preds = %bb69
-	ret ptr null
+  ret ptr null
 }
diff --git a/llvm/test/Transforms/NewGVN/2009-11-12-MemDepMallocBitCast.ll b/llvm/test/Transforms/NewGVN/2009-11-12-MemDepMallocBitCast.ll
index 3eda7ca1b812c8..c49f651437cbb5 100644
--- a/llvm/test/Transforms/NewGVN/2009-11-12-MemDepMallocBitCast.ll
+++ b/llvm/test/Transforms/NewGVN/2009-11-12-MemDepMallocBitCast.ll
@@ -1,14 +1,19 @@
-; Test to make sure malloc's bitcast does not block detection of a store 
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; Test to make sure malloc's bitcast does not block detection of a store
 ; to aliased memory; GVN should not optimize away the load in this program.
 ; RUN: opt < %s -passes=newgvn -S | FileCheck %s
 
 define i64 @test() {
+; CHECK-LABEL: define i64 @test() {
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call ptr @malloc(i64 mul (i64 ptrtoint (ptr getelementptr (i64, ptr null, i64 1) to i64), i64 4))
+; CHECK-NEXT:    store i8 42, ptr [[TMP1]], align 1
+; CHECK-NEXT:    [[Y:%.*]] = load i64, ptr [[TMP1]], align 4
+; CHECK-NEXT:    ret i64 [[Y]]
+;
   %1 = tail call ptr @malloc(i64 mul (i64 4, i64 ptrtoint (ptr getelementptr (i64, ptr null, i64 1) to i64))) ; <ptr> [#uses=2]
   store i8 42, ptr %1
   %Y = load i64, ptr %1                               ; <i64> [#uses=1]
   ret i64 %Y
-; CHECK: %Y = load i64, ptr %1
-; CHECK: ret i64 %Y
 }
 
 declare noalias ptr @malloc(i64)
diff --git a/llvm/test/Transforms/NewGVN/2010-03-31-RedundantPHIs.ll b/llvm/test/Transforms/NewGVN/2010-03-31-RedundantPHIs.ll
index 321f3cff89a1e0..c6fc7b99cdf8df 100644
--- a/llvm/test/Transforms/NewGVN/2010-03-31-RedundantPHIs.ll
+++ b/llvm/test/Transforms/NewGVN/2010-03-31-RedundantPHIs.ll
@@ -1,9 +1,36 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt < %s -passes=newgvn -S | FileCheck %s
 
 ; CHECK-NOT: load
 ; CHECK-NOT: phi
 
 define ptr @cat(ptr %s1, ...) nounwind {
+; CHECK-LABEL: define ptr @cat(
+; CHECK-SAME: ptr [[S1:%.*]], ...) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 undef, label [[BB:%.*]], label [[BB3:%.*]]
+; CHECK:       bb:
+; CHECK-NEXT:    unreachable
+; CHECK:       bb3:
+; CHECK-NEXT:    store ptr undef, ptr undef, align 4
+; CHECK-NEXT:    br i1 undef, label [[BB5:%.*]], label [[BB6:%.*]]
+; CHECK:       bb5:
+; CHECK-NEXT:    unreachable
+; CHECK:       bb6:
+; CHECK-NEXT:    br label [[BB12:%.*]]
+; CHECK:       bb8:
+; CHECK-NEXT:    br i1 undef, label [[BB9:%.*]], label [[BB10:%.*]]
+; CHECK:       bb9:
+; CHECK-NEXT:    br label [[BB11:%.*]]
+; CHECK:       bb10:
+; CHECK-NEXT:    br label [[BB11]]
+; CHECK:       bb11:
+; CHECK-NEXT:    br label [[BB12]]
+; CHECK:       bb12:
+; CHECK-NEXT:    br i1 undef, label [[BB8:%.*]], label [[BB13:%.*]]
+; CHECK:       bb13:
+; CHECK-NEXT:    ret ptr undef
+;
 entry:
   br i1 undef, label %bb, label %bb3
 
diff --git a/llvm/test/Transforms/NewGVN/2010-05-08-OneBit.ll b/llvm/test/Transforms/NewGVN/2010-05-08-OneBit.ll
index 0d2d45a5e0d9ff..0a121ffab761f4 100644
--- a/llvm/test/Transforms/NewGVN/2010-05-08-OneBit.ll
+++ b/llvm/test/Transforms/NewGVN/2010-05-08-OneBit.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt < %s -passes=newgvn
 ; PR7052
 
@@ -12,7 +13,7 @@ entry:
 
 l117.i.i:                                         ; preds = %entry
   invoke fastcc void @foo()
-          to label %.noexc5 unwind label %landing_pad
+  to label %.noexc5 unwind label %landing_pad
 
 .noexc5:                                          ; preds = %l117.i.i
   unreachable
@@ -22,7 +23,7 @@ k121.i.i:                                         ; preds = %entry
 
 l129.i.i:                                         ; preds = %k121.i.i
   invoke fastcc void @foo()
-          to label %.noexc7 unwind label %landing_pad
+  to label %.noexc7 unwind label %landing_pad
 
 .noexc7:                                          ; preds = %l129.i.i
   unreachable
@@ -34,7 +35,7 @@ k133.i.i:                                         ; preds = %k121.i.i
 
 l147.i.i:                                         ; preds = %k133.i.i
   invoke fastcc void @foo()
-          to label %.noexc10 unwind label %landing_pad
+  to label %.noexc10 unwind label %landing_pad
 
 .noexc10:                                         ; preds = %l147.i.i
   unreachable
@@ -44,10 +45,10 @@ k151.i.i:                                         ; preds = %k133.i.i
 
 landing_pad:                                      ; preds = %l147.i.i, %l129.i.i, %l117.i.i
   %exn = landingpad {ptr, i32}
-            cleanup
+  cleanup
   switch i32 undef, label %fin [
-    i32 1, label %catch1
-    i32 2, label %catch
+  i32 1, label %catch1
+  i32 2, label %catch
   ]
 
 fin:                                              ; preds = %landing_pad
diff --git a/llvm/test/Transforms/NewGVN/2010-11-13-Simplify.ll b/llvm/test/Transforms/NewGVN/2010-11-13-Simplify.ll
index b06570b0ebf8dd..3d12783ee53cc7 100644
--- a/llvm/test/Transforms/NewGVN/2010-11-13-Simplify.ll
+++ b/llvm/test/Transforms/NewGVN/2010-11-13-Simplify.ll
@@ -1,9 +1,14 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt < %s -passes=newgvn -S | FileCheck %s
 
 declare i32 @foo(i32) readnone
 
 define i1 @bar() {
-; CHECK-LABEL: @bar(
+; CHECK-LABEL: define i1 @bar() {
+; CHECK-NEXT:    [[A:%.*]] = call i32 @foo(i32 0) #[[ATTR0:[0-9]+]]
+; CHECK-NEXT:    [[X:%.*]] = call i32 @foo(i32 [[A]]) #[[ATTR0]]
+; CHECK-NEXT:    ret i1 true
+;
   %a = call i32 @foo (i32 0) readnone
   %b = call i32 @foo (i32 0) readnone
   %c = and i32 %a, %b
@@ -11,5 +16,4 @@ define i1 @bar() {
   %y = call i32 @foo (i32 %c) readnone
   %z = icmp eq i32 %x, %y
   ret i1 %z
-; CHECK: ret i1 true
-} 
+}
diff --git a/llvm/test/Transforms/NewGVN/2011-04-27-phioperands.ll b/llvm/test/Transforms/NewGVN/2011-04-27-phioperands.ll
index 3e8a5d84405eaf..c039422be84ed1 100644
--- a/llvm/test/Transforms/NewGVN/2011-04-27-phioperands.ll
+++ b/llvm/test/Transforms/NewGVN/2011-04-27-phioperands.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt -passes=newgvn -disable-output < %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-f128:128:128-n8:16:32:64"
@@ -64,10 +65,10 @@ doemit.exit76.i:
 
 "<bb 64>.i":
   switch i32 undef, label %"<bb 5>" [
-    i32 42, label %"<L54>.i"
-    i32 43, label %"<L55>.i"
-    i32 63, label %"<L56>.i"
-    i32 123, label %"<bb 5>.i258.i"
+  i32 42, label %"<L54>.i"
+  i32 43, label %"<L55>.i"
+  i32 63, label %"<L56>.i"
+  i32 123, label %"<bb 5>.i258.i"
   ]
 
 "<L54>.i":
@@ -93,14 +94,14 @@ doemit.exit127.i:
 
 "<bb 5>":
   switch i32 undef, label %"<L39>.i" [
-    i32 36, label %"<L19>.i"
-    i32 94, label %"<L18>.i"
-    i32 124, label %"<L98>.i"
-    i32 42, label %"<L99>.i"
-    i32 43, label %"<L99>.i"
-    i32 46, label %"<L24>.i"
-    i32 63, label %"<L99>.i"
-    i32 91, label %"<L28>.i"
-    i32 92, label %"<L29>.i"
+  i32 36, label %"<L19>.i"
+  i32 94, label %"<L18>.i"
+  i32 124, label %"<L98>.i"
+  i32 42, label %"<L99>.i"
+  i32 43, label %"<L99>.i"
+  i32 46, label %"<L24>.i"
+  i32 63, label %"<L99>.i"
+  i32 91, label %"<L28>.i"
+  i32 92, label %"<L29>.i"
   ]
 }
diff --git a/llvm/test/Transforms/NewGVN/2011-07-07-MatchIntrinsicExtract.ll b/llvm/test/Transforms/NewGVN/2011-07-07-MatchIntrinsicExtract.ll
index c547e8fac5e583..444385d81ab416 100644
--- a/llvm/test/Transforms/NewGVN/2011-07-07-MatchIntrinsicExtract.ll
+++ b/llvm/test/Transforms/NewGVN/2011-07-07-MatchIntrinsicExtract.ll
@@ -1,9 +1,22 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt < %s -passes=newgvn -S | FileCheck %s
 ;
 
 %0 = type { i64, i1 }
 
 define i64 @test1(i64 %a, i64 %b) nounwind ssp {
+; CHECK-LABEL: define i64 @test1(
+; CHECK-SAME: i64 [[A:%.*]], i64 [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 [[A]], i64 [[B]])
+; CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { i64, i1 } [[TMP0]], 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertvalue [[TMP0]] poison, i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { i64, i1 } [[TMP0]], 1
+; CHECK-NEXT:    [[TMP4:%.*]] = insertvalue [[TMP0]] [[TMP2]], i1 [[TMP3]], 1
+; CHECK-NEXT:    [[UADD_0:%.*]] = extractvalue [[TMP0]] [[TMP4]], 0
+; CHECK-NEXT:    [[ADD2:%.*]] = add i64 [[TMP1]], [[UADD_0]]
+; CHECK-NEXT:    ret i64 [[ADD2]]
+;
 entry:
   %uadd = tail call %0 @llvm.uadd.with.overflow.i64(i64 %a, i64 %b)
   %uadd.0 = extractvalue %0 %uadd, 0
@@ -12,11 +25,20 @@ entry:
   ret i64 %add2
 }
 
-; CHECK-LABEL: @test1(
-; CHECK-NOT: add1
-; CHECK: ret
 
 define i64 @test2(i64 %a, i64 %b) nounwind ssp {
+; CHECK-LABEL: define i64 @test2(
+; CHECK-SAME: i64 [[A:%.*]], i64 [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 [[A]], i64 [[B]])
+; CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { i64, i1 } [[TMP0]], 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertvalue [[TMP0]] poison, i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { i64, i1 } [[TMP0]], 1
+; CHECK-NEXT:    [[TMP4:%.*]] = insertvalue [[TMP0]] [[TMP2]], i1 [[TMP3]], 1
+; CHECK-NEXT:    [[USUB_0:%.*]] = extractvalue [[TMP0]] [[TMP4]], 0
+; CHECK-NEXT:    [[ADD2:%.*]] = add i64 [[TMP1]], [[USUB_0]]
+; CHECK-NEXT:    ret i64 [[ADD2]]
+;
 entry:
   %usub = tail call %0 @llvm.usub.with.overflow.i64(i64 %a, i64 %b)
   %usub.0 = extractvalue %0 %usub, 0
@@ -25,11 +47,20 @@ entry:
   ret i64 %add2
 }
 
-; CHECK-LABEL: @test2(
-; CHECK-NOT: sub1
-; CHECK: ret
 
 define i64 @test3(i64 %a, i64 %b) nounwind ssp {
+; CHECK-LABEL: define i64 @test3(
+; CHECK-SAME: i64 [[A:%.*]], i64 [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 [[A]], i64 [[B]])
+; CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { i64, i1 } [[TMP0]], 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertvalue [[TMP0]] poison, i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { i64, i1 } [[TMP0]], 1
+; CHECK-NEXT:    [[TMP4:%.*]] = insertvalue [[TMP0]] [[TMP2]], i1 [[TMP3]], 1
+; CHECK-NEXT:    [[UMUL_0:%.*]] = extractvalue [[TMP0]] [[TMP4]], 0
+; CHECK-NEXT:    [[ADD2:%.*]] = add i64 [[TMP1]], [[UMUL_0]]
+; CHECK-NEXT:    ret i64 [[ADD2]]
+;
 entry:
   %umul = tail call %0 @llvm.umul.with.overflow.i64(i64 %a, i64 %b)
   %umul.0 = extractvalue %0 %umul, 0
@@ -38,11 +69,20 @@ entry:
   ret i64 %add2
 }
 
-; CHECK-LABEL: @test3(
-; CHECK-NOT: mul1
-; CHECK: ret
 
 define i64 @test4(i64 %a, i64 %b) nounwind ssp {
+; CHECK-LABEL: define i64 @test4(
+; CHECK-SAME: i64 [[A:%.*]], i64 [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 [[A]], i64 [[B]])
+; CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { i64, i1 } [[TMP0]], 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertvalue [[TMP0]] poison, i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { i64, i1 } [[TMP0]], 1
+; CHECK-NEXT:    [[TMP4:%.*]] = insertvalue [[TMP0]] [[TMP2]], i1 [[TMP3]], 1
+; CHECK-NEXT:    [[SADD_0:%.*]] = extractvalue [[TMP0]] [[TMP4]], 0
+; CHECK-NEXT:    [[ADD2:%.*]] = add i64 [[TMP1]], [[SADD_0]]
+; CHECK-NEXT:    ret i64 [[ADD2]]
+;
 entry:
   %sadd = tail call %0 @llvm.sadd.with.overflow.i64(i64 %a, i64 %b)
   %sadd.0 = extractvalue %0 %sadd, 0
@@ -51,11 +91,20 @@ entry:
   ret i64 %add2
 }
 
-; CHECK-LABEL: @test4(
-; CHECK-NOT: add1
-; CHECK: ret
 
 define i64 @test5(i64 %a, i64 %b) nounwind ssp {
+; CHECK-LABEL: define i64 @test5(
+; CHECK-SAME: i64 [[A:%.*]], i64 [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 [[A]], i64 [[B]])
+; CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { i64, i1 } [[TMP0]], 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertvalue [[TMP0]] poison, i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { i64, i1 } [[TMP0]], 1
+; CHECK-NEXT:    [[TMP4:%.*]] = insertvalue [[TMP0]] [[TMP2]], i1 [[TMP3]], 1
+; CHECK-NEXT:    [[SSUB_0:%.*]] = extractvalue [[TMP0]] [[TMP4]], 0
+; CHECK-NEXT:    [[ADD2:%.*]] = add i64 [[TMP1]], [[SSUB_0]]
+; CHECK-NEXT:    ret i64 [[ADD2]]
+;
 entry:
   %ssub = tail call %0 @llvm.ssub.with.overflow.i64(i64 %a, i64 %b)
   %ssub.0 = extractvalue %0 %ssub, 0
@@ -64,11 +113,20 @@ entry:
   ret i64 %add2
 }
 
-; CHECK-LABEL: @test5(
-; CHECK-NOT: sub1
-; CHECK: ret
 
 define i64 @test6(i64 %a, i64 %b) nounwind ssp {
+; CHECK-LABEL: define i64 @test6(
+; CHECK-SAME: i64 [[A:%.*]], i64 [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 [[A]], i64 [[B]])
+; CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { i64, i1 } [[TMP0]], 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertvalue [[TMP0]] poison, i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { i64, i1 } [[TMP0]], 1
+; CHECK-NEXT:    [[TMP4:%.*]] = insertvalue [[TMP0]] [[TMP2]], i1 [[TMP3]], 1
+; CHECK-NEXT:    [[SMUL_0:%.*]] = extractvalue [[TMP0]] [[TMP4]], 0
+; CHECK-NEXT:    [[ADD2:%.*]] = add i64 [[TMP1]], [[SMUL_0]]
+; CHECK-NEXT:    ret i64 [[ADD2]]
+;
 entry:
   %smul = tail call %0 @llvm.smul.with.overflow.i64(i64 %a, i64 %b)
   %smul.0 = extractvalue %0 %smul, 0
@@ -77,9 +135,6 @@ entry:
   ret i64 %add2
 }
 
-; CHECK-LABEL: @test6(
-; CHECK-NOT: mul1
-; CHECK: ret
 
 declare void @exit(i32) noreturn
 declare %0 @llvm.uadd.with.overflow.i64(i64, i64) nounwind readnone
diff --git a/llvm/test/Transforms/NewGVN/2011-09-07-TypeIdFor.ll b/llvm/test/Transforms/NewGVN/2011-09-07-TypeIdFor.ll
index 46e3c2830920ce..675e7da26a1059 100644
--- a/llvm/test/Transforms/NewGVN/2011-09-07-TypeIdFor.ll
+++ b/llvm/test/Transforms/NewGVN/2011-09-07-TypeIdFor.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt < %s -passes=newgvn -S | FileCheck %s
 %struct.__fundamental_type_info_pseudo = type { %struct.__type_info_pseudo }
 %struct.__type_info_pseudo = type { ptr, ptr }
@@ -18,26 +19,70 @@ declare void @__cxa_end_catch()
 declare i32 @__gxx_personality_v0(i32, i64, ptr, ptr)
 
 define void @_Z3foov() uwtable personality ptr @__gxx_personality_v0 {
+; CHECK-LABEL: define void @_Z3foov(
+; CHECK-SAME: ) #[[ATTR2:[0-9]+]] personality ptr @__gxx_personality_v0 {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    invoke void @_Z4barv()
+; CHECK-NEXT:            to label [[RETURN:%.*]] unwind label [[LPAD:%.*]]
+; CHECK:       lpad:
+; CHECK-NEXT:    [[TMP0:%.*]] = landingpad { ptr, i32 }
+; CHECK-NEXT:            catch ptr @_ZTIi
+; CHECK-NEXT:            catch ptr @_ZTIb
+; CHECK-NEXT:            catch ptr @_ZTIi
+; CHECK-NEXT:            catch ptr @_ZTIb
+; CHECK-NEXT:    [[EXC_PTR2_I:%.*]] = extractvalue { ptr, i32 } [[TMP0]], 0
+; CHECK-NEXT:    [[FILTER3_I:%.*]] = extractvalue { ptr, i32 } [[TMP0]], 1
+; CHECK-NEXT:    [[TYPEID_I:%.*]] = tail call i32 @llvm.eh.typeid.for(ptr @_ZTIi)
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i32 [[FILTER3_I]], [[TYPEID_I]]
+; CHECK-NEXT:    br i1 [[TMP1]], label [[PPAD:%.*]], label [[NEXT:%.*]]
+; CHECK:       next:
+; CHECK-NEXT:    [[TYPEID1_I:%.*]] = tail call i32 @llvm.eh.typeid.for(ptr @_ZTIb)
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i32 [[FILTER3_I]], [[TYPEID1_I]]
+; CHECK-NEXT:    br i1 [[TMP2]], label [[PPAD2:%.*]], label [[NEXT2:%.*]]
+; CHECK:       ppad:
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call ptr @__cxa_begin_catch(ptr [[EXC_PTR2_I]]) #[[ATTR1:[0-9]+]]
+; CHECK-NEXT:    tail call void @__cxa_end_catch() #[[ATTR1]]
+; CHECK-NEXT:    br label [[RETURN]]
+; CHECK:       ppad2:
+; CHECK-NEXT:    [[D_2073_5_I:%.*]] = tail call ptr @__cxa_begin_catch(ptr [[EXC_PTR2_I]]) #[[ATTR1]]
+; CHECK-NEXT:    tail call void @__cxa_end_catch() #[[ATTR1]]
+; CHECK-NEXT:    br label [[RETURN]]
+; CHECK:       next2:
+; CHECK-NEXT:    call void @_Z7cleanupv()
+; CHECK-NEXT:    br i1 [[TMP1]], label [[PPAD3:%.*]], label [[NEXT3:%.*]]
+; CHECK:       next3:
+; CHECK-NEXT:    br i1 [[TMP2]], label [[PPAD4:%.*]], label [[UNWIND:%.*]]
+; CHECK:       unwind:
+; CHECK-NEXT:    resume { ptr, i32 } [[TMP0]]
+; CHECK:       ppad3:
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call ptr @__cxa_begin_catch(ptr [[EXC_PTR2_I]]) #[[ATTR1]]
+; CHECK-NEXT:    tail call void @__cxa_end_catch() #[[ATTR1]]
+; CHECK-NEXT:    br label [[RETURN]]
+; CHECK:       ppad4:
+; CHECK-NEXT:    [[D_2080_5:%.*]] = tail call ptr @__cxa_begin_catch(ptr [[EXC_PTR2_I]]) #[[ATTR1]]
+; CHECK-NEXT:    tail call void @__cxa_end_catch() #[[ATTR1]]
+; CHECK-NEXT:    br label [[RETURN]]
+; CHECK:       return:
+; CHECK-NEXT:    ret void
+;
 entry:
   invoke void @_Z4barv()
-          to label %return unwind label %lpad
+  to label %return unwind label %lpad
 
 lpad:                                             ; preds = %entry
   %0 = landingpad { ptr, i32 }
-          catch ptr @_ZTIi
-          catch ptr @_ZTIb
-          catch ptr @_ZTIi
-          catch ptr @_ZTIb
+  catch ptr @_ZTIi
+  catch ptr @_ZTIb
+  catch ptr @_ZTIi
+  catch ptr @_ZTIb
   %exc_ptr2.i = extractvalue { ptr, i32 } %0, 0
   %filter3.i = extractvalue { ptr, i32 } %0, 1
   %typeid.i = tail call i32 @llvm.eh.typeid.for(ptr @_ZTIi)
-; CHECK: call i32 @llvm.eh.typeid.for
   %1 = icmp eq i32 %filter3.i, %typeid.i
   br i1 %1, label %ppad, label %next
 
 next:                                             ; preds = %lpad
   %typeid1.i = tail call i32 @llvm.eh.typeid.for(ptr @_ZTIb)
-; CHECK: call i32 @llvm.eh.typeid.for
   %2 = icmp eq i32 %filter3.i, %typeid1.i
   br i1 %2, label %ppad2, label %next2
 
@@ -54,7 +99,6 @@ ppad2:                                            ; preds = %next
 next2:                                            ; preds = %next
   call void @_Z7cleanupv()
   %typeid = tail call i32 @llvm.eh.typeid.for(ptr @_ZTIi)
-; CHECK-NOT: call i32 @llvm.eh.typeid.for
   %4 = icmp eq i32 %filter3.i, %typeid
   br i1 %4, label %ppad3, label %next3
 
diff --git a/llvm/test/Transforms/NewGVN/2012-05-22-PreCrash.ll b/llvm/test/Transforms/NewGVN/2012-05-22-PreCrash.ll
index 787a3ba70325e6..1357f2b809f748 100644
--- a/llvm/test/Transforms/NewGVN/2012-05-22-PreCrash.ll
+++ b/llvm/test/Transforms/NewGVN/2012-05-22-PreCrash.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt < %s -passes=newgvn
 ; PR12858
 
diff --git a/llvm/test/Transforms/NewGVN/2016-08-30-MaskedScatterGather-xfail-inseltpoison.ll b/llvm/test/Transforms/NewGVN/2016-08-30-MaskedScatterGather-xfail-inseltpoison.ll
index 2fb275d7aaf4de..7b3f33bd4dbea7 100644
--- a/llvm/test/Transforms/NewGVN/2016-08-30-MaskedScatterGather-xfail-inseltpoison.ll
+++ b/llvm/test/Transforms/NewGVN/2016-08-30-MaskedScatterGather-xfail-inseltpoison.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; XFAIL: *
 ; RUN: opt < %s -passes=newgvn -S | FileCheck %s
 
@@ -14,6 +15,25 @@ declare <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr>, i32, <2 x i1>, <2 x
 ; CHECK: llvm.masked.scatter
 ; CHECK: llvm.masked.gather
 define spir_kernel void @test(<2 x ptr> %in1, <2 x ptr> %in2, ptr %out) {
+; CHECK-LABEL: define spir_kernel void @test(
+; CHECK-SAME: <2 x ptr> [[IN1:%.*]], <2 x ptr> [[IN2:%.*]], ptr [[OUT:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP_0:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[TMP_1:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[TMP_I:%.*]] = insertelement <2 x ptr> poison, ptr [[TMP_0]], i32 0
+; CHECK-NEXT:    [[TMP:%.*]] = insertelement <2 x ptr> [[TMP_I]], ptr [[TMP_1]], i32 1
+; CHECK-NEXT:    [[IN1_V:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> [[IN1]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i32> undef)
+; CHECK-NEXT:    [[IN2_V:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> [[IN2]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i32> undef)
+; CHECK-NEXT:    call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> [[IN1_V]], <2 x ptr> [[TMP]], i32 1, <2 x i1> <i1 true, i1 true>)
+; CHECK-NEXT:    call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> [[IN2_V]], <2 x ptr> [[TMP]], i32 1, <2 x i1> <i1 true, i1 true>)
+; CHECK-NEXT:    [[TMP_V_1:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> [[TMP]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i32> undef)
+; CHECK-NEXT:    [[TMP_V_1_0:%.*]] = extractelement <2 x i32> [[TMP_V_1]], i32 0
+; CHECK-NEXT:    [[TMP_V_1_1:%.*]] = extractelement <2 x i32> [[TMP_V_1]], i32 1
+; CHECK-NEXT:    store i32 [[TMP_V_1_0]], ptr [[OUT]], align 4
+; CHECK-NEXT:    [[OUT_1:%.*]] = getelementptr i32, ptr [[OUT]], i32 1
+; CHECK-NEXT:    store i32 [[TMP_V_1_1]], ptr [[OUT_1]], align 4
+; CHECK-NEXT:    ret void
+;
 entry:
   ; Just some temporary storage
   %tmp.0 = alloca i32
diff --git a/llvm/test/Transforms/NewGVN/MemdepMiscompile.ll b/llvm/test/Transforms/NewGVN/MemdepMiscompile.ll
index f2d1827d87b0d0..a3f1f4d8182fb3 100644
--- a/llvm/test/Transforms/NewGVN/MemdepMiscompile.ll
+++ b/llvm/test/Transforms/NewGVN/MemdepMiscompile.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt < %s -passes=newgvn -S | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 target triple = "x86_64-apple-macosx10.7.0"
@@ -7,14 +8,38 @@ target triple = "x86_64-apple-macosx10.7.0"
 ; Make sure we do not replace load %shouldExit in while.cond.backedge
 ; with a phi node where the value from while.body is 0.
 define i32 @test() nounwind ssp {
+; CHECK-LABEL: define i32 @test(
+; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[SHOULDEXIT:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[TASKSIDLE:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    store i32 0, ptr [[SHOULDEXIT]], align 4
+; CHECK-NEXT:    store i32 0, ptr [[TASKSIDLE]], align 4
+; CHECK-NEXT:    call void @CTestInitialize(ptr [[TASKSIDLE]]) #[[ATTR1:[0-9]+]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[SHOULDEXIT]], align 4
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i32 [[TMP0]], 0
+; CHECK-NEXT:    br i1 [[CMP1]], label [[WHILE_BODY_LR_PH:%.*]], label [[WHILE_END:%.*]]
+; CHECK:       while.body.lr.ph:
+; CHECK-NEXT:    br label [[WHILE_BODY:%.*]]
+; CHECK:       while.body:
+; CHECK-NEXT:    call void @RunInMode(i32 100) #[[ATTR1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[TASKSIDLE]], align 4
+; CHECK-NEXT:    [[TOBOOL:%.*]] = icmp eq i32 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[TOBOOL]], label [[WHILE_COND_BACKEDGE:%.*]], label [[IF_THEN:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    store i32 0, ptr [[TASKSIDLE]], align 4
+; CHECK-NEXT:    call void @TimerCreate(ptr [[SHOULDEXIT]]) #[[ATTR1]]
+; CHECK-NEXT:    br label [[WHILE_COND_BACKEDGE]]
+; CHECK:       while.cond.backedge:
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[SHOULDEXIT]], align 4
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP2]], 0
+; CHECK-NEXT:    br i1 [[CMP]], label [[WHILE_BODY]], label [[WHILE_COND_WHILE_END_CRIT_EDGE:%.*]]
+; CHECK:       while.cond.while.end_crit_edge:
+; CHECK-NEXT:    br label [[WHILE_END]]
+; CHECK:       while.end:
+; CHECK-NEXT:    ret i32 0
+;
 entry:
-; CHECK: test()
-; CHECK: while.body:
-; CHECK: call void @RunInMode
-; CHECK: br i1 %tobool, label %while.cond.backedge, label %if.then
-; CHECK: while.cond.backedge:
-; CHECK: load i32, ptr %shouldExit
-; CHECK: br i1 %cmp, label %while.body
   %shouldExit = alloca i32, align 4
   %tasksIdle = alloca i32, align 4
   store i32 0, ptr %shouldExit, align 4
diff --git a/llvm/test/Transforms/NewGVN/addrspacecast.ll b/llvm/test/Transforms/NewGVN/addrspacecast.ll
index fea8a2f3f85869..394db28425d98b 100644
--- a/llvm/test/Transforms/NewGVN/addrspacecast.ll
+++ b/llvm/test/Transforms/NewGVN/addrspacecast.ll
@@ -7,7 +7,7 @@ define ptr addrspace(1) @addrspacecast(ptr %ptr) {
 ; CHECK-NEXT:    [[Z1:%.*]] = addrspacecast ptr [[PTR:%.*]] to ptr addrspace(1)
 ; CHECK-NEXT:    br label [[BLOCK2:%.*]]
 ; CHECK:       block2:
-; CHECK-NEXT:    store ptr addrspace(1) [[Z1]], ptr undef
+; CHECK-NEXT:    store ptr addrspace(1) [[Z1]], ptr undef, align 8
 ; CHECK-NEXT:    ret ptr addrspace(1) [[Z1]]
 ;
 block1:
@@ -29,7 +29,7 @@ define ptr addrspace(1) @addrspacecast_different_result_types(ptr %ptr) {
 ; CHECK-NEXT:    br label [[BLOCK2:%.*]]
 ; CHECK:       block2:
 ; CHECK-NEXT:    [[Z2:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(1)
-; CHECK-NEXT:    store ptr addrspace(2) [[Z1]], ptr undef
+; CHECK-NEXT:    store ptr addrspace(2) [[Z1]], ptr undef, align 8
 ; CHECK-NEXT:    ret ptr addrspace(1) [[Z2]]
 ;
 block1:
@@ -48,7 +48,7 @@ define ptr addrspace(1) @addrspacecast_simplify(ptr addrspace(1) %ptr) {
 ; CHECK-NEXT:    [[CAST0:%.*]] = addrspacecast ptr addrspace(1) [[PTR:%.*]] to ptr
 ; CHECK-NEXT:    br label [[BLOCK2:%.*]]
 ; CHECK:       block2:
-; CHECK-NEXT:    store ptr addrspace(1) [[PTR]], ptr undef
+; CHECK-NEXT:    store ptr addrspace(1) [[PTR]], ptr undef, align 8
 ; CHECK-NEXT:    ret ptr addrspace(1) [[PTR]]
 ;
 block1:
@@ -70,7 +70,7 @@ define ptr addrspace(1) @addrspacecast_constant() {
 ; CHECK-NEXT:    store ptr undef, ptr @h, align 4
 ; CHECK-NEXT:    br label [[BLOCK2:%.*]]
 ; CHECK:       block2:
-; CHECK-NEXT:    store ptr addrspace(1) undef, ptr undef
+; CHECK-NEXT:    store ptr addrspace(1) undef, ptr undef, align 8
 ; CHECK-NEXT:    ret ptr addrspace(1) undef
 ;
 block1:
@@ -88,11 +88,11 @@ block2:
 define ptr addrspace(1) @addrspacecast_leader(ptr %arg.ptr) {
 ; CHECK-LABEL: @addrspacecast_leader(
 ; CHECK-NEXT:  block1:
-; CHECK-NEXT:    [[LOAD0:%.*]] = load ptr, ptr [[ARG_PTR:%.*]]
+; CHECK-NEXT:    [[LOAD0:%.*]] = load ptr, ptr [[ARG_PTR:%.*]], align 8
 ; CHECK-NEXT:    [[Z1:%.*]] = addrspacecast ptr [[LOAD0]] to ptr addrspace(1)
 ; CHECK-NEXT:    br label [[BLOCK2:%.*]]
 ; CHECK:       block2:
-; CHECK-NEXT:    store ptr addrspace(1) [[Z1]], ptr undef
+; CHECK-NEXT:    store ptr addrspace(1) [[Z1]], ptr undef, align 8
 ; CHECK-NEXT:    ret ptr addrspace(1) [[Z1]]
 ;
 block1:
diff --git a/llvm/test/Transforms/NewGVN/basic-cyclic-opt.ll b/llvm/test/Transforms/NewGVN/basic-cyclic-opt.ll
index baef8b51fbc15e..53190466963a19 100644
--- a/llvm/test/Transforms/NewGVN/basic-cyclic-opt.ll
+++ b/llvm/test/Transforms/NewGVN/basic-cyclic-opt.ll
@@ -245,7 +245,7 @@ bb23:                                             ; preds = %bb4
 define i8 @irreducible_memoryphi(ptr noalias %arg, ptr noalias %arg2) {
 ; CHECK-LABEL: @irreducible_memoryphi(
 ; CHECK-NEXT:  bb:
-; CHECK-NEXT:    store i8 0, ptr [[ARG:%.*]]
+; CHECK-NEXT:    store i8 0, ptr [[ARG:%.*]], align 1
 ; CHECK-NEXT:    br i1 undef, label [[BB2:%.*]], label [[BB1:%.*]]
 ; CHECK:       bb1:
 ; CHECK-NEXT:    br label [[BB2]]
diff --git a/llvm/test/Transforms/NewGVN/basic-undef-test.ll b/llvm/test/Transforms/NewGVN/basic-undef-test.ll
index 5b731fc5f9fa07..148c0227adf269 100644
--- a/llvm/test/Transforms/NewGVN/basic-undef-test.ll
+++ b/llvm/test/Transforms/NewGVN/basic-undef-test.ll
@@ -1,15 +1,21 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt -passes=newgvn -S < %s | FileCheck %s
 ; ModuleID = 'test3.ll'
 target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
 
 define i32 @main(ptr %foo)  {
+; CHECK-LABEL: define i32 @main(
+; CHECK-SAME: ptr [[FOO:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[FOO]], align 4
+; CHECK-NEXT:    store i32 5, ptr undef, align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[TMP0]], [[TMP0]]
+; CHECK-NEXT:    ret i32 [[TMP1]]
+;
 entry:
-; CHECK: load i32, ptr %foo, align 4
   %0 = load i32, ptr %foo, align 4
   store i32 5, ptr undef, align 4
-; CHECK-NOT: load i32, ptr %foo, align 4
   %1 = load i32, ptr %foo, align 4
-; CHECK: add i32 %0, %0
   %2 = add i32 %0, %1
   ret i32 %2
 }
diff --git a/llvm/test/Transforms/NewGVN/br-identical.ll b/llvm/test/Transforms/NewGVN/br-identical.ll
index c99838525aa2d7..23f43b01f443b6 100644
--- a/llvm/test/Transforms/NewGVN/br-identical.ll
+++ b/llvm/test/Transforms/NewGVN/br-identical.ll
@@ -1,8 +1,32 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt -passes=newgvn -S -o - %s | FileCheck %s
 
 ; If a branch has two identical successors, we cannot declare either dead.
 
 define void @widget(i1 %p) {
+; CHECK-LABEL: define void @widget(
+; CHECK-SAME: i1 [[P:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[BB2:%.*]]
+; CHECK:       bb2:
+; CHECK-NEXT:    [[T1:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[T2:%.*]], [[BB7:%.*]] ]
+; CHECK-NEXT:    [[T2]] = add i64 [[T1]], 1
+; CHECK-NEXT:    [[T3:%.*]] = icmp ult i64 0, [[T2]]
+; CHECK-NEXT:    br i1 [[T3]], label [[BB3:%.*]], label [[BB4:%.*]]
+; CHECK:       bb3:
+; CHECK-NEXT:    [[T4:%.*]] = call i64 @f()
+; CHECK-NEXT:    br label [[BB4]]
+; CHECK:       bb4:
+; CHECK-NEXT:    br i1 [[P]], label [[BB5:%.*]], label [[BB6:%.*]]
+; CHECK:       bb5:
+; CHECK-NEXT:    br i1 true, label [[BB7]], label [[BB7]]
+; CHECK:       bb6:
+; CHECK-NEXT:    br i1 true, label [[BB7]], label [[BB7]]
+; CHECK:       bb7:
+; CHECK-NEXT:    br i1 [[P]], label [[BB2]], label [[BB8:%.*]]
+; CHECK:       bb8:
+; CHECK-NEXT:    ret void
+;
 entry:
   br label %bb2
 
@@ -17,7 +41,6 @@ bb3:
   br label %bb4
 
 bb4:
-  ; CHECK-NOT: phi {{.*}} undef
   %foo = phi i64 [ %t4, %bb3 ], [ 0, %bb2 ]
   br i1 %p, label %bb5, label %bb6
 
diff --git a/llvm/test/Transforms/NewGVN/calloc-load-removal.ll b/llvm/test/Transforms/NewGVN/calloc-load-removal.ll
index a8a1e66d97d95f..608f739e175fe3 100644
--- a/llvm/test/Transforms/NewGVN/calloc-load-removal.ll
+++ b/llvm/test/Transforms/NewGVN/calloc-load-removal.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt -S -passes=newgvn < %s | FileCheck %s
 ; Check that loads from calloc are recognized as being zero.
 
@@ -5,14 +6,15 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
 
 ; Function Attrs: nounwind uwtable
 define i32 @test1() {
+; CHECK-LABEL: define i32 @test1() {
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call noalias ptr @calloc(i64 1, i64 4)
+; CHECK-NEXT:    ret i32 0
+;
   %1 = tail call noalias ptr @calloc(i64 1, i64 4)
   ; This load is trivially constant zero
   %2 = load i32, ptr %1, align 4
   ret i32 %2
 
-; CHECK-LABEL: @test1(
-; CHECK-NOT: %2 = load i32, ptr %1, align 4
-; CHECK: ret i32 0
 }
 
 declare noalias ptr @calloc(i64, i64) mustprogress nofree nounwind willreturn allockind("alloc,zeroed") allocsize(0,1) "alloc-family"="malloc"
diff --git a/llvm/test/Transforms/NewGVN/calls-readonly.ll b/llvm/test/Transforms/NewGVN/calls-readonly.ll
index 68d74c1aeda7fd..49f5d3aa685404 100644
--- a/llvm/test/Transforms/NewGVN/calls-readonly.ll
+++ b/llvm/test/Transforms/NewGVN/calls-readonly.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt < %s -passes=newgvn -S | FileCheck %s
 ; Should delete the second call to strlen even though the intervening strchr call exists.
 
@@ -5,6 +6,22 @@ target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f3
 target triple = "i386-apple-darwin7"
 
 define ptr @test(ptr %P, ptr %Q, i32 %x, i32 %y) nounwind readonly {
+; CHECK-LABEL: define ptr @test(
+; CHECK-SAME: ptr [[P:%.*]], ptr [[Q:%.*]], i32 [[X:%.*]], i32 [[Y:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = tail call i32 @strlen(ptr [[P]])
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i32 [[TMP0]], 0
+; CHECK-NEXT:    br i1 [[TMP1]], label [[BB:%.*]], label [[BB1:%.*]]
+; CHECK:       bb:
+; CHECK-NEXT:    [[TMP2:%.*]] = sdiv i32 [[X]], [[Y]]
+; CHECK-NEXT:    br label [[BB1]]
+; CHECK:       bb1:
+; CHECK-NEXT:    [[X_ADDR_0:%.*]] = phi i32 [ [[TMP2]], [[BB]] ], [ [[X]], [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call ptr @strchr(ptr [[Q]], i32 97)
+; CHECK-NEXT:    [[TMP4:%.*]] = add i32 [[X_ADDR_0]], [[TMP0]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[TMP3]], i32 [[X_ADDR_0]]
+; CHECK-NEXT:    ret ptr [[TMP5]]
+;
 entry:
   %0 = tail call i32 @strlen(ptr %P)              ; <i32> [#uses=2]
   %1 = icmp eq i32 %0, 0                          ; <i1> [#uses=1]
@@ -24,21 +41,6 @@ bb1:                                              ; preds = %bb, %entry
   ret ptr %6
 }
 
-; CHECK: define ptr @test(ptr %P, ptr %Q, i32 %x, i32 %y) #0 {
-; CHECK: entry:
-; CHECK-NEXT:   %0 = tail call i32 @strlen(ptr %P)
-; CHECK-NEXT:   %1 = icmp eq i32 %0, 0
-; CHECK-NEXT:   br i1 %1, label %bb, label %bb1
-; CHECK: bb:
-; CHECK-NEXT:   %2 = sdiv i32 %x, %y
-; CHECK-NEXT:   br label %bb1
-; CHECK: bb1:
-; CHECK-NEXT:   %x_addr.0 = phi i32 [ %2, %bb ], [ %x, %entry ]
-; CHECK-NEXT:   %3 = tail call ptr @strchr(ptr %Q, i32 97)
-; CHECK-NEXT:   %4 = add i32 %x_addr.0, %0
-; CHECK-NEXT:   %5 = getelementptr i8, ptr %3, i32 %x_addr.0
-; CHECK-NEXT:   ret ptr %5
-; CHECK: }
 
 declare i32 @strlen(ptr) nounwind readonly
 
diff --git a/llvm/test/Transforms/NewGVN/completeness.ll b/llvm/test/Transforms/NewGVN/completeness.ll
index d968c785ceff0f..4841e2e958b281 100644
--- a/llvm/test/Transforms/NewGVN/completeness.ll
+++ b/llvm/test/Transforms/NewGVN/completeness.ll
@@ -6,9 +6,12 @@ define i32 @test1(i32, ptr) {
 ; CHECK-LABEL: @test1(
 ; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP0:%.*]], 0
 ; CHECK-NEXT:    br i1 [[TMP3]], label [[TMP4:%.*]], label [[TMP5:%.*]]
-; CHECK:         br label [[TMP6:%.*]]
-; CHECK:         br label [[TMP6]]
-; CHECK:         [[PHIOFOPS:%.*]] = phi i32 [ 105, [[TMP5]] ], [ 75, [[TMP4]] ]
+; CHECK:       4:
+; CHECK-NEXT:    br label [[TMP6:%.*]]
+; CHECK:       5:
+; CHECK-NEXT:    br label [[TMP6]]
+; CHECK:       6:
+; CHECK-NEXT:    [[PHIOFOPS:%.*]] = phi i32 [ 105, [[TMP5]] ], [ 75, [[TMP4]] ]
 ; CHECK-NEXT:    [[DOT0:%.*]] = phi i32 [ 5, [[TMP4]] ], [ 7, [[TMP5]] ]
 ; CHECK-NEXT:    ret i32 [[PHIOFOPS]]
 ;
@@ -31,9 +34,12 @@ define i32 @test1b(i32, ptr) {
 ; CHECK-LABEL: @test1b(
 ; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP0:%.*]], 0
 ; CHECK-NEXT:    br i1 [[TMP3]], label [[TMP4:%.*]], label [[TMP5:%.*]]
-; CHECK:         br label [[TMP6:%.*]]
-; CHECK:         br label [[TMP6]]
-; CHECK:         [[PHIOFOPS1:%.*]] = phi i32 [ 105, [[TMP5]] ], [ 75, [[TMP4]] ]
+; CHECK:       4:
+; CHECK-NEXT:    br label [[TMP6:%.*]]
+; CHECK:       5:
+; CHECK-NEXT:    br label [[TMP6]]
+; CHECK:       6:
+; CHECK-NEXT:    [[PHIOFOPS1:%.*]] = phi i32 [ 105, [[TMP5]] ], [ 75, [[TMP4]] ]
 ; CHECK-NEXT:    [[PHIOFOPS:%.*]] = phi i32 [ 1575, [[TMP5]] ], [ 1125, [[TMP4]] ]
 ; CHECK-NEXT:    [[DOT0:%.*]] = phi i32 [ 5, [[TMP4]] ], [ 7, [[TMP5]] ]
 ; CHECK-NEXT:    ret i32 [[PHIOFOPS]]
@@ -58,9 +64,12 @@ define i32 @test2(i32) {
 ; CHECK-LABEL: @test2(
 ; CHECK-NEXT:    [[TMP2:%.*]] = icmp ne i32 [[TMP0:%.*]], 0
 ; CHECK-NEXT:    br i1 [[TMP2]], label [[TMP3:%.*]], label [[TMP4:%.*]]
-; CHECK:         br label [[TMP5:%.*]]
-; CHECK:         br label [[TMP5]]
-; CHECK:         [[DOT01:%.*]] = phi i32 [ 3, [[TMP3]] ], [ 2, [[TMP4]] ]
+; CHECK:       3:
+; CHECK-NEXT:    br label [[TMP5:%.*]]
+; CHECK:       4:
+; CHECK-NEXT:    br label [[TMP5]]
+; CHECK:       5:
+; CHECK-NEXT:    [[DOT01:%.*]] = phi i32 [ 3, [[TMP3]] ], [ 2, [[TMP4]] ]
 ; CHECK-NEXT:    [[DOT0:%.*]] = phi i32 [ 2, [[TMP3]] ], [ 3, [[TMP4]] ]
 ; CHECK-NEXT:    ret i32 5
 ;
@@ -158,9 +167,12 @@ define i32 @test4(i32, ptr, ptr noalias, ptr noalias) {
 ; CHECK-NEXT:    store i32 7, ptr [[TMP3:%.*]], align 4
 ; CHECK-NEXT:    [[TMP5:%.*]] = icmp ne i32 [[TMP0:%.*]], 0
 ; CHECK-NEXT:    br i1 [[TMP5]], label [[TMP6:%.*]], label [[TMP7:%.*]]
-; CHECK:         br label [[TMP8:%.*]]
-; CHECK:         br label [[TMP8]]
-; CHECK:         [[DOT01:%.*]] = phi i32 [ 5, [[TMP6]] ], [ 7, [[TMP7]] ]
+; CHECK:       6:
+; CHECK-NEXT:    br label [[TMP8:%.*]]
+; CHECK:       7:
+; CHECK-NEXT:    br label [[TMP8]]
+; CHECK:       8:
+; CHECK-NEXT:    [[DOT01:%.*]] = phi i32 [ 5, [[TMP6]] ], [ 7, [[TMP7]] ]
 ; CHECK-NEXT:    [[DOT0:%.*]] = phi ptr [ [[TMP2]], [[TMP6]] ], [ [[TMP3]], [[TMP7]] ]
 ; CHECK-NEXT:    [[TMP9:%.*]] = load i32, ptr [[DOT0]], align 4
 ; CHECK-NEXT:    [[TMP10:%.*]] = mul nsw i32 [[TMP9]], 15
@@ -287,19 +299,19 @@ bb28:                                             ; preds = %bb27, %bb
 define i8 @test6(ptr %addr) {
 ; CHECK-LABEL: @test6(
 ; CHECK-NEXT:  entry-block:
-; CHECK-NEXT:    br label %main-loop
+; CHECK-NEXT:    br label [[MAIN_LOOP:%.*]]
 ; CHECK:       main-loop:
-; CHECK-NEXT:    [[PHIOFOPS1:%.*]] = phi i1 [ true, %entry-block ], [ false, [[CORE:%.*]] ]
-; CHECK-NEXT:    [[PHIOFOPS:%.*]] = phi i1 [ false, %entry-block ], [ true, [[CORE]] ]
-; CHECK-NEXT:    [[PHI:%.*]] = phi i8 [ 0, %entry-block ], [ 1, [[CORE]] ]
-; CHECK-NEXT:    store volatile i8 0, ptr [[ADDR:%.*]]
-; CHECK-NEXT:    br i1 [[PHIOFOPS1]], label %busy-wait-phi-0, label [[EXIT:%.*]]
+; CHECK-NEXT:    [[PHIOFOPS1:%.*]] = phi i1 [ true, [[ENTRY_BLOCK:%.*]] ], [ false, [[CORE:%.*]] ]
+; CHECK-NEXT:    [[PHIOFOPS:%.*]] = phi i1 [ false, [[ENTRY_BLOCK]] ], [ true, [[CORE]] ]
+; CHECK-NEXT:    [[PHI:%.*]] = phi i8 [ 0, [[ENTRY_BLOCK]] ], [ 1, [[CORE]] ]
+; CHECK-NEXT:    store volatile i8 0, ptr [[ADDR:%.*]], align 1
+; CHECK-NEXT:    br i1 [[PHIOFOPS1]], label [[BUSY_WAIT_PHI_0:%.*]], label [[EXIT:%.*]]
 ; CHECK:       busy-wait-phi-0:
-; CHECK-NEXT:    [[LOAD:%.*]] = load volatile i8, ptr [[ADDR]]
+; CHECK-NEXT:    [[LOAD:%.*]] = load volatile i8, ptr [[ADDR]], align 1
 ; CHECK-NEXT:    [[ICMP:%.*]] = icmp eq i8 [[LOAD]], 0
-; CHECK-NEXT:    br i1 [[ICMP]], label %busy-wait-phi-0, label [[CORE]]
+; CHECK-NEXT:    br i1 [[ICMP]], label [[BUSY_WAIT_PHI_0]], label [[CORE]]
 ; CHECK:       core:
-; CHECK-NEXT:    br i1 [[PHIOFOPS]], label [[TRAP:%.*]], label %main-loop
+; CHECK-NEXT:    br i1 [[PHIOFOPS]], label [[TRAP:%.*]], label [[MAIN_LOOP]]
 ; CHECK:       trap:
 ; CHECK-NEXT:    ret i8 1
 ; CHECK:       exit:
@@ -507,13 +519,13 @@ declare ptr @wombat()
 define void @test12(ptr %p) {
 ; CHECK-LABEL: @test12(
 ; CHECK-NEXT:  bb:
-; CHECK-NEXT:    [[TMP:%.*]] = load i32, ptr %p
+; CHECK-NEXT:    [[TMP:%.*]] = load i32, ptr [[P:%.*]], align 4
 ; CHECK-NEXT:    [[TMP1:%.*]] = icmp sgt i32 [[TMP]], 0
 ; CHECK-NEXT:    br i1 [[TMP1]], label [[BB2:%.*]], label [[BB8:%.*]]
 ; CHECK:       bb2:
 ; CHECK-NEXT:    br label [[BB3:%.*]]
 ; CHECK:       bb3:
-; CHECK-NEXT:    br i1 true, label [[BB6:%.*]], label [[BB7]]
+; CHECK-NEXT:    br i1 true, label [[BB6:%.*]], label [[BB7:%.*]]
 ; CHECK:       bb6:
 ; CHECK-NEXT:    br label [[BB7]]
 ; CHECK:       bb7:
@@ -551,7 +563,7 @@ define void @test13() {
 ; CHECK-NEXT:  bb:
 ; CHECK-NEXT:    br label [[BB1:%.*]]
 ; CHECK:       bb1:
-; CHECK-NEXT:    [[TMP:%.*]] = load i8, ptr null
+; CHECK-NEXT:    [[TMP:%.*]] = load i8, ptr null, align 1
 ; CHECK-NEXT:    br label [[BB3:%.*]]
 ; CHECK:       bb3:
 ; CHECK-NEXT:    [[PHIOFOPS:%.*]] = phi i8 [ [[TMP]], [[BB1]] ], [ [[TMP10:%.*]], [[BB3]] ]
@@ -560,7 +572,7 @@ define void @test13() {
 ; CHECK-NEXT:    [[TMP6]] = getelementptr i8, ptr [[TMP4]], i64 1
 ; CHECK-NEXT:    [[TMP8:%.*]] = sext i8 [[PHIOFOPS]] to i32
 ; CHECK-NEXT:    [[TMP9]] = mul i32 [[TMP5]], [[TMP8]]
-; CHECK-NEXT:    [[TMP10]] = load i8, ptr [[TMP6]]
+; CHECK-NEXT:    [[TMP10]] = load i8, ptr [[TMP6]], align 1
 ; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i8 [[TMP10]], 0
 ; CHECK-NEXT:    br i1 [[TMP11]], label [[BB12:%.*]], label [[BB3]]
 ; CHECK:       bb12:
diff --git a/llvm/test/Transforms/NewGVN/cond_br.ll b/llvm/test/Transforms/NewGVN/cond_br.ll
index 3dbeb394c7cfa7..930e5b30c08884 100644
--- a/llvm/test/Transforms/NewGVN/cond_br.ll
+++ b/llvm/test/Transforms/NewGVN/cond_br.ll
@@ -1,12 +1,23 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt -passes=newgvn -S < %s | FileCheck %s
 @y = external global i32
 @z = external global i32
 
 ; Function Attrs: nounwind ssp uwtable
 define void @foo(i32 %x) {
-; CHECK: @foo(i32 %x)
-; CHECK: %.pre = load i32, ptr @y
-; CHECK: call void @bar(i32 %.pre)
+; CHECK-LABEL: define void @foo(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT:    [[DOTPRE:%.*]] = load i32, ptr @y, align 4
+; CHECK-NEXT:    br i1 false, label [[IF_THEN:%.*]], label [[ENTRY_IF_END_CRIT_EDGE:%.*]]
+; CHECK:       entry.if.end_crit_edge:
+; CHECK-NEXT:    br label [[IF_END:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    store i8 poison, ptr null, align 1
+; CHECK-NEXT:    br label [[IF_END]]
+; CHECK:       if.end:
+; CHECK-NEXT:    tail call void @bar(i32 [[DOTPRE]])
+; CHECK-NEXT:    ret void
+;
 
   %t = sub i32 %x, %x
   %.pre = load i32, ptr @y, align 4
@@ -28,9 +39,21 @@ if.end:                                           ; preds = %entry.if.end_crit_e
 }
 
 define void @foo2(i32 %x) {
-; CHECK: @foo2(i32 %x)
-; CHECK: %.pre = load i32, ptr @y
-; CHECK: tail call void @bar(i32 %.pre)
+; CHECK-LABEL: define void @foo2(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[DOTPRE:%.*]] = load i32, ptr @y, align 4
+; CHECK-NEXT:    br i1 false, label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    store i8 poison, ptr null, align 1
+; CHECK-NEXT:    br label [[IF_END:%.*]]
+; CHECK:       if.else:
+; CHECK-NEXT:    store i32 1, ptr @z, align 4
+; CHECK-NEXT:    br label [[IF_END]]
+; CHECK:       if.end:
+; CHECK-NEXT:    tail call void @bar(i32 [[DOTPRE]])
+; CHECK-NEXT:    ret void
+;
 entry:
   %t = sub i32 %x, %x
   %.pre = load i32, ptr @y, align 4
diff --git a/llvm/test/Transforms/NewGVN/condprop.ll b/llvm/test/Transforms/NewGVN/condprop.ll
index e685dfedab1095..d97fd380c730b2 100644
--- a/llvm/test/Transforms/NewGVN/condprop.ll
+++ b/llvm/test/Transforms/NewGVN/condprop.ll
@@ -134,11 +134,11 @@ define void @test4(i1 %b, i32 %x) {
 ; CHECK-NEXT:    br i1 [[B:%.*]], label [[SW:%.*]], label [[CASE3:%.*]]
 ; CHECK:       sw:
 ; CHECK-NEXT:    switch i32 [[X:%.*]], label [[DEFAULT:%.*]] [
-; CHECK-NEXT:    i32 0, label [[CASE0:%.*]]
-; CHECK-NEXT:    i32 1, label [[CASE1:%.*]]
-; CHECK-NEXT:    i32 2, label [[CASE0]]
-; CHECK-NEXT:    i32 3, label [[CASE3]]
-; CHECK-NEXT:    i32 4, label [[DEFAULT]]
+; CHECK-NEXT:      i32 0, label [[CASE0:%.*]]
+; CHECK-NEXT:      i32 1, label [[CASE1:%.*]]
+; CHECK-NEXT:      i32 2, label [[CASE0]]
+; CHECK-NEXT:      i32 3, label [[CASE3]]
+; CHECK-NEXT:      i32 4, label [[DEFAULT]]
 ; CHECK-NEXT:    ]
 ; CHECK:       default:
 ; CHECK-NEXT:    call void @bar(i32 [[X]])
diff --git a/llvm/test/Transforms/NewGVN/crash-no-aa.ll b/llvm/test/Transforms/NewGVN/crash-no-aa.ll
index 55e2bcb00fb758..30f2e379a37603 100644
--- a/llvm/test/Transforms/NewGVN/crash-no-aa.ll
+++ b/llvm/test/Transforms/NewGVN/crash-no-aa.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt -disable-basic-aa -passes=newgvn -S < %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
diff --git a/llvm/test/Transforms/NewGVN/crash-usecounts.ll b/llvm/test/Transforms/NewGVN/crash-usecounts.ll
index 5527beabbf3421..5cae740fd484c8 100644
--- a/llvm/test/Transforms/NewGVN/crash-usecounts.ll
+++ b/llvm/test/Transforms/NewGVN/crash-usecounts.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt -passes=newgvn -disable-output < %s
 
 define void @test(i1 %arg, i1 %arg1) {
diff --git a/llvm/test/Transforms/NewGVN/crash.ll b/llvm/test/Transforms/NewGVN/crash.ll
index c886bd384eee2f..26eaa766a05434 100644
--- a/llvm/test/Transforms/NewGVN/crash.ll
+++ b/llvm/test/Transforms/NewGVN/crash.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt -passes=newgvn -disable-output < %s
 
 ; PR5631
@@ -106,7 +107,7 @@ if.then21.i:
   ret ptr undef
 
 do.body36.i:
-  %ivar38.i = load i64, ptr @g 
+  %ivar38.i = load i64, ptr @g
   %add.ptr39.sum.i = add i64 %ivar38.i, 8
   %tmp40.i = getelementptr inbounds i8, ptr %tmp18.i, i64 %add.ptr39.sum.i
   %tmp41.i = load i64, ptr %tmp40.i
@@ -132,14 +133,14 @@ declare i32 @foo2()
 define i32 @test4() {
 entry:
   ret i32 0
-  
+
 dead:
   %P2 = getelementptr i32, ptr %P2, i32 52
   %Q2 = getelementptr i32, ptr %Q2, i32 52
   store i32 4, ptr %P2
   %A = load i32, ptr %Q2
   br i1 true, label %dead, label %dead2
-  
+
 dead2:
   ret i32 %A
 }
diff --git a/llvm/test/Transforms/NewGVN/cyclic-phi-handling.ll b/llvm/test/Transforms/NewGVN/cyclic-phi-handling.ll
index 4a2f0b972c9fe1..dc150799c849c6 100644
--- a/llvm/test/Transforms/NewGVN/cyclic-phi-handling.ll
+++ b/llvm/test/Transforms/NewGVN/cyclic-phi-handling.ll
@@ -5,15 +5,15 @@ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
 define void @foo(i32 %arg, i32 %arg1, ptr %arg2) {
 ; CHECK-LABEL: @foo(
 ; CHECK-NEXT:  bb:
-; CHECK-NEXT:    br label %bb3
+; CHECK-NEXT:    br label [[BB3:%.*]]
 ; CHECK:       bb3:
-; CHECK-NEXT:    [[TMP:%.*]] = phi i32 [ %arg1, %bb ], [ [[TMP:%.*]]4, %bb7 ]
-; CHECK-NEXT:    [[TMP4:%.*]] = phi i32 [ %arg, %bb ], [ [[TMP]], %bb7 ]
-; CHECK-NEXT:    [[TMP5:%.*]] = call i32 %arg2(i32 [[TMP4]], i32 [[TMP]])
+; CHECK-NEXT:    [[TMP:%.*]] = phi i32 [ [[ARG1:%.*]], [[BB:%.*]] ], [ [[TMP4:%.*]], [[BB7:%.*]] ]
+; CHECK-NEXT:    [[TMP4]] = phi i32 [ [[ARG:%.*]], [[BB]] ], [ [[TMP]], [[BB7]] ]
+; CHECK-NEXT:    [[TMP5:%.*]] = call i32 [[ARG2:%.*]](i32 [[TMP4]], i32 [[TMP]])
 ; CHECK-NEXT:    [[TMP6:%.*]] = icmp ne i32 [[TMP5]], 0
-; CHECK-NEXT:    br i1 [[TMP6]], label %bb7, label %bb8
+; CHECK-NEXT:    br i1 [[TMP6]], label [[BB7]], label [[BB8:%.*]]
 ; CHECK:       bb7:
-; CHECK-NEXT:    br label %bb3
+; CHECK-NEXT:    br label [[BB3]]
 ; CHECK:       bb8:
 ; CHECK-NEXT:    ret void
 ;
diff --git a/llvm/test/Transforms/NewGVN/dbg-redundant-load.ll b/llvm/test/Transforms/NewGVN/dbg-redundant-load.ll
index 01d95aebdf2d41..cd2eca0de6d140 100644
--- a/llvm/test/Transforms/NewGVN/dbg-redundant-load.ll
+++ b/llvm/test/Transforms/NewGVN/dbg-redundant-load.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt -passes=newgvn -S < %s | FileCheck %s
 
 ; Check that the redundant load from %if.then is removed.
@@ -6,15 +7,22 @@
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
-; CHECK: @test_redundant_load(
-; CHECK-LABEL: entry:
-; CHECK-NEXT: load i32, ptr %Y, align 4, !dbg ![[LOC:[0-9]+]]
-; CHECK-LABEL: if.then:
-; CHECK-NOT: load
-; CHECK-LABEL: if.end:
-; CHECK: ![[LOC]] = !DILocation(line: 3, scope: !{{.*}})
 
 define i32 @test_redundant_load(i32 %X, ptr %Y) !dbg !6 {
+; CHECK-LABEL: define i32 @test_redundant_load(
+; CHECK-SAME: i32 [[X:%.*]], ptr [[Y:%.*]]) !dbg [[DBG6:![0-9]+]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[Y]], align 4, !dbg [[DBG8:![0-9]+]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[X]], -1, !dbg [[DBG9:![0-9]+]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_END:%.*]], !dbg [[DBG9]]
+; CHECK:       if.then:
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP0]], [[TMP0]], !dbg [[DBG10:![0-9]+]]
+; CHECK-NEXT:    call void @foo(), !dbg [[DBG11:![0-9]+]]
+; CHECK-NEXT:    br label [[IF_END]], !dbg [[DBG12:![0-9]+]]
+; CHECK:       if.end:
+; CHECK-NEXT:    [[RESULT_0:%.*]] = phi i32 [ [[ADD]], [[IF_THEN]] ], [ [[TMP0]], [[ENTRY:%.*]] ]
+; CHECK-NEXT:    ret i32 [[RESULT_0]], !dbg [[DBG13:![0-9]+]]
+;
 entry:
   %0 = load i32, ptr %Y, align 4, !dbg !8
   %cmp = icmp sgt i32 %X, -1, !dbg !9
@@ -50,3 +58,16 @@ declare void @foo()
 !11 = !DILocation(line: 7, scope: !6)
 !12 = !DILocation(line: 8, scope: !6)
 !13 = !DILocation(line: 10, scope: !6)
+;.
+; CHECK: [[META0:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: [[META1:![0-9]+]], isOptimized: false, runtimeVersion: 0, emissionKind: LineTablesOnly, enums: [[META2:![0-9]+]])
+; CHECK: [[META1]] = !DIFile(filename: "test.cpp", directory: "")
+; CHECK: [[META2]] = !{}
+; CHECK: [[DBG6]] = distinct !DISubprogram(name: "test_redundant_load", scope: [[META1]], file: [[META1]], line: 2, type: [[META7:![0-9]+]], scopeLine: 2, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: [[META0]], retainedNodes: [[META2]])
+; CHECK: [[META7]] = !DISubroutineType(types: [[META2]])
+; CHECK: [[DBG8]] = !DILocation(line: 3, scope: [[DBG6]])
+; CHECK: [[DBG9]] = !DILocation(line: 5, scope: [[DBG6]])
+; CHECK: [[DBG10]] = !DILocation(line: 6, scope: [[DBG6]])
+; CHECK: [[DBG11]] = !DILocation(line: 7, scope: [[DBG6]])
+; CHECK: [[DBG12]] = !DILocation(line: 8, scope: [[DBG6]])
+; CHECK: [[DBG13]] = !DILocation(line: 10, scope: [[DBG6]])
+;.
diff --git a/llvm/test/Transforms/NewGVN/edge.ll b/llvm/test/Transforms/NewGVN/edge.ll
index 8699c85c9ed737..143e52cd139c5a 100644
--- a/llvm/test/Transforms/NewGVN/edge.ll
+++ b/llvm/test/Transforms/NewGVN/edge.ll
@@ -1,7 +1,17 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt -passes=newgvn -S < %s | FileCheck %s
 
 define i32 @f1(i32 %x) {
-  ; CHECK-LABEL: define i32 @f1(
+; CHECK-LABEL: define i32 @f1(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT:  bb0:
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[X]], 0
+; CHECK-NEXT:    br i1 [[CMP]], label [[BB2:%.*]], label [[BB1:%.*]]
+; CHECK:       bb1:
+; CHECK-NEXT:    br label [[BB2]]
+; CHECK:       bb2:
+; CHECK-NEXT:    ret i32 [[X]]
+;
 bb0:
   %cmp = icmp eq i32 %x, 0
   br i1 %cmp, label %bb2, label %bb1
@@ -11,12 +21,19 @@ bb2:
   %cond = phi i32 [ %x, %bb0 ], [ 0, %bb1 ]
   %foo = add i32 %cond, %x
   ret i32 %foo
-  ; CHECK: bb2:
-  ; CHECK: ret i32 %x
 }
 
 define i32 @f2(i32 %x) {
-  ; CHECK-LABEL: define i32 @f2(
+; CHECK-LABEL: define i32 @f2(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT:  bb0:
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ne i32 [[X]], 0
+; CHECK-NEXT:    br i1 [[CMP]], label [[BB1:%.*]], label [[BB2:%.*]]
+; CHECK:       bb1:
+; CHECK-NEXT:    br label [[BB2]]
+; CHECK:       bb2:
+; CHECK-NEXT:    ret i32 [[X]]
+;
 bb0:
   %cmp = icmp ne i32 %x, 0
   br i1 %cmp, label %bb1, label %bb2
@@ -26,12 +43,20 @@ bb2:
   %cond = phi i32 [ %x, %bb0 ], [ 0, %bb1 ]
   %foo = add i32 %cond, %x
   ret i32 %foo
-  ; CHECK: bb2:
-  ; CHECK: ret i32 %x
 }
 
 define i32 @f3(i32 %x) {
-  ; CHECK-LABEL: define i32 @f3(
+; CHECK-LABEL: define i32 @f3(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT:  bb0:
+; CHECK-NEXT:    switch i32 [[X]], label [[BB1:%.*]] [
+; CHECK-NEXT:      i32 0, label [[BB2:%.*]]
+; CHECK-NEXT:    ]
+; CHECK:       bb1:
+; CHECK-NEXT:    br label [[BB2]]
+; CHECK:       bb2:
+; CHECK-NEXT:    ret i32 [[X]]
+;
 bb0:
   switch i32 %x, label %bb1 [ i32 0, label %bb2]
 bb1:
@@ -40,13 +65,21 @@ bb2:
   %cond = phi i32 [ %x, %bb0 ], [ 0, %bb1 ]
   %foo = add i32 %cond, %x
   ret i32 %foo
-  ; CHECK: bb2:
-  ; CHECK: ret i32 %x
 }
 
 declare void @g(i1)
 define void @f4(ptr %x)  {
 ; CHECK-LABEL: define void @f4(
+; CHECK-SAME: ptr [[X:%.*]]) {
+; CHECK-NEXT:  bb0:
+; CHECK-NEXT:    [[Y:%.*]] = icmp eq ptr null, [[X]]
+; CHECK-NEXT:    br i1 [[Y]], label [[BB2:%.*]], label [[BB1:%.*]]
+; CHECK:       bb1:
+; CHECK-NEXT:    br label [[BB2]]
+; CHECK:       bb2:
+; CHECK-NEXT:    call void @g(i1 [[Y]])
+; CHECK-NEXT:    ret void
+;
 bb0:
   %y = icmp eq ptr null, %x
   br i1 %y, label %bb2, label %bb1
@@ -55,11 +88,22 @@ bb1:
 bb2:
   %zed = icmp eq ptr null, %x
   call void @g(i1 %zed)
-; CHECK: call void @g(i1 %y)
   ret void
 }
 
 define double @fcmp_oeq_not_zero(double %x, double %y) {
+; CHECK-LABEL: define double @fcmp_oeq_not_zero(
+; CHECK-SAME: double [[X:%.*]], double [[Y:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp oeq double [[Y]], 2.000000e+00
+; CHECK-NEXT:    br i1 [[CMP]], label [[IF:%.*]], label [[RETURN:%.*]]
+; CHECK:       if:
+; CHECK-NEXT:    [[DIV:%.*]] = fdiv double [[X]], 2.000000e+00
+; CHECK-NEXT:    br label [[RETURN]]
+; CHECK:       return:
+; CHECK-NEXT:    [[RETVAL:%.*]] = phi double [ [[DIV]], [[IF]] ], [ [[X]], [[ENTRY:%.*]] ]
+; CHECK-NEXT:    ret double [[RETVAL]]
+;
 entry:
   %cmp = fcmp oeq double %y, 2.0
   br i1 %cmp, label %if, label %return
@@ -72,11 +116,21 @@ return:
   %retval = phi double [ %div, %if ], [ %x, %entry ]
   ret double %retval
 
-; CHECK-LABEL: define double @fcmp_oeq_not_zero(
-; CHECK: %div = fdiv double %x, 2.0
 }
 
 define double @fcmp_une_not_zero(double %x, double %y) {
+; CHECK-LABEL: define double @fcmp_une_not_zero(
+; CHECK-SAME: double [[X:%.*]], double [[Y:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp une double [[Y]], 2.000000e+00
+; CHECK-NEXT:    br i1 [[CMP]], label [[RETURN:%.*]], label [[ELSE:%.*]]
+; CHECK:       else:
+; CHECK-NEXT:    [[DIV:%.*]] = fdiv double [[X]], 2.000000e+00
+; CHECK-NEXT:    br label [[RETURN]]
+; CHECK:       return:
+; CHECK-NEXT:    [[RETVAL:%.*]] = phi double [ [[DIV]], [[ELSE]] ], [ [[X]], [[ENTRY:%.*]] ]
+; CHECK-NEXT:    ret double [[RETVAL]]
+;
 entry:
   %cmp = fcmp une double %y, 2.0
   br i1 %cmp, label %return, label %else
@@ -89,14 +143,24 @@ return:
   %retval = phi double [ %div, %else ], [ %x, %entry ]
   ret double %retval
 
-; CHECK-LABEL: define double @fcmp_une_not_zero(
-; CHECK: %div = fdiv double %x, 2.0
 }
 
-; PR22376 - We can't propagate zero constants because -0.0 
+; PR22376 - We can't propagate zero constants because -0.0
 ; compares equal to 0.0. If %y is -0.0 in this test case,
 ; we would produce the wrong sign on the infinity return value.
 define double @fcmp_oeq_zero(double %x, double %y) {
+; CHECK-LABEL: define double @fcmp_oeq_zero(
+; CHECK-SAME: double [[X:%.*]], double [[Y:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp oeq double [[Y]], 0.000000e+00
+; CHECK-NEXT:    br i1 [[CMP]], label [[IF:%.*]], label [[RETURN:%.*]]
+; CHECK:       if:
+; CHECK-NEXT:    [[DIV:%.*]] = fdiv double [[X]], [[Y]]
+; CHECK-NEXT:    br label [[RETURN]]
+; CHECK:       return:
+; CHECK-NEXT:    [[RETVAL:%.*]] = phi double [ [[DIV]], [[IF]] ], [ [[X]], [[ENTRY:%.*]] ]
+; CHECK-NEXT:    ret double [[RETVAL]]
+;
 entry:
   %cmp = fcmp oeq double %y, 0.0
   br i1 %cmp, label %if, label %return
@@ -109,11 +173,21 @@ return:
   %retval = phi double [ %div, %if ], [ %x, %entry ]
   ret double %retval
 
-; CHECK-LABEL: define double @fcmp_oeq_zero(
-; CHECK: %div = fdiv double %x, %y
 }
 
 define double @fcmp_une_zero(double %x, double %y) {
+; CHECK-LABEL: define double @fcmp_une_zero(
+; CHECK-SAME: double [[X:%.*]], double [[Y:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp une double [[Y]], -0.000000e+00
+; CHECK-NEXT:    br i1 [[CMP]], label [[RETURN:%.*]], label [[ELSE:%.*]]
+; CHECK:       else:
+; CHECK-NEXT:    [[DIV:%.*]] = fdiv double [[X]], [[Y]]
+; CHECK-NEXT:    br label [[RETURN]]
+; CHECK:       return:
+; CHECK-NEXT:    [[RETVAL:%.*]] = phi double [ [[DIV]], [[ELSE]] ], [ [[X]], [[ENTRY:%.*]] ]
+; CHECK-NEXT:    ret double [[RETVAL]]
+;
 entry:
   %cmp = fcmp une double %y, -0.0
   br i1 %cmp, label %return, label %else
@@ -126,45 +200,65 @@ return:
   %retval = phi double [ %div, %else ], [ %x, %entry ]
   ret double %retval
 
-; CHECK-LABEL: define double @fcmp_une_zero(
-; CHECK: %div = fdiv double %x, %y
 }
 
 ; We also cannot propagate a value if it's not a constant.
 ; This is because the value could be 0.0 or -0.0.
 
 define double @fcmp_oeq_maybe_zero(double %x, double %y, double %z1, double %z2) {
+; CHECK-LABEL: define double @fcmp_oeq_maybe_zero(
+; CHECK-SAME: double [[X:%.*]], double [[Y:%.*]], double [[Z1:%.*]], double [[Z2:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[Z:%.*]] = fadd double [[Z1]], [[Z2]]
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp oeq double [[Y]], [[Z]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[IF:%.*]], label [[RETURN:%.*]]
+; CHECK:       if:
+; CHECK-NEXT:    [[DIV:%.*]] = fdiv double [[X]], [[Z]]
+; CHECK-NEXT:    br label [[RETURN]]
+; CHECK:       return:
+; CHECK-NEXT:    [[RETVAL:%.*]] = phi double [ [[DIV]], [[IF]] ], [ [[X]], [[ENTRY:%.*]] ]
+; CHECK-NEXT:    ret double [[RETVAL]]
+;
 entry:
- %z = fadd double %z1, %z2
- %cmp = fcmp oeq double %y, %z
- br i1 %cmp, label %if, label %return
+  %z = fadd double %z1, %z2
+  %cmp = fcmp oeq double %y, %z
+  br i1 %cmp, label %if, label %return
 
 if:
- %div = fdiv double %x, %z
- br label %return
+  %div = fdiv double %x, %z
+  br label %return
 
 return:
- %retval = phi double [ %div, %if ], [ %x, %entry ]
- ret double %retval
+  %retval = phi double [ %div, %if ], [ %x, %entry ]
+  ret double %retval
 
-; CHECK-LABEL: define double @fcmp_oeq_maybe_zero(
-; CHECK: %div = fdiv double %x, %z
 }
 
 define double @fcmp_une_maybe_zero(double %x, double %y, double %z1, double %z2) {
+; CHECK-LABEL: define double @fcmp_une_maybe_zero(
+; CHECK-SAME: double [[X:%.*]], double [[Y:%.*]], double [[Z1:%.*]], double [[Z2:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[Z:%.*]] = fadd double [[Z1]], [[Z2]]
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp une double [[Y]], [[Z]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[RETURN:%.*]], label [[ELSE:%.*]]
+; CHECK:       else:
+; CHECK-NEXT:    [[DIV:%.*]] = fdiv double [[X]], [[Z]]
+; CHECK-NEXT:    br label [[RETURN]]
+; CHECK:       return:
+; CHECK-NEXT:    [[RETVAL:%.*]] = phi double [ [[DIV]], [[ELSE]] ], [ [[X]], [[ENTRY:%.*]] ]
+; CHECK-NEXT:    ret double [[RETVAL]]
+;
 entry:
- %z = fadd double %z1, %z2
- %cmp = fcmp une double %y, %z
- br i1 %cmp, label %return, label %else
+  %z = fadd double %z1, %z2
+  %cmp = fcmp une double %y, %z
+  br i1 %cmp, label %return, label %else
 
 else:
- %div = fdiv double %x, %z
- br label %return
+  %div = fdiv double %x, %z
+  br label %return
 
 return:
- %retval = phi double [ %div, %else ], [ %x, %entry ]
- ret double %retval
+  %retval = phi double [ %div, %else ], [ %x, %entry ]
+  ret double %retval
 
-; CHECK-LABEL: define double @fcmp_une_maybe_zero(
-; CHECK: %div = fdiv double %x, %z
 }
diff --git a/llvm/test/Transforms/NewGVN/eliminate-callsite-inline.ll b/llvm/test/Transforms/NewGVN/eliminate-callsite-inline.ll
index 748485c03c18ab..6cf543840cfe91 100644
--- a/llvm/test/Transforms/NewGVN/eliminate-callsite-inline.ll
+++ b/llvm/test/Transforms/NewGVN/eliminate-callsite-inline.ll
@@ -1,15 +1,22 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt -passes=inline,newgvn -S < %s | FileCheck %s
 
-; CHECK-LABEL: @f2()
-; CHECK-NEXT:    ret void
 
 define void @f2() {
+; CHECK-LABEL: define void @f2() {
+; CHECK-NEXT:    ret void
+;
   call void @f1()
   call void @f1()
   ret void
 }
 
 define internal void @f1() #1 {
+; CHECK-LABEL: define internal void @f1(
+; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    ret void
+;
 entry:
   ret void
 }
diff --git a/llvm/test/Transforms/NewGVN/equivalent-phi.ll b/llvm/test/Transforms/NewGVN/equivalent-phi.ll
index 925795d49af17c..ba4fc14fa2feb4 100644
--- a/llvm/test/Transforms/NewGVN/equivalent-phi.ll
+++ b/llvm/test/Transforms/NewGVN/equivalent-phi.ll
@@ -11,22 +11,22 @@ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
 define i32 @bar(i32 %arg, i32 %arg1, i32 %arg2) #0 {
 ; CHECK-LABEL: @bar(
 ; CHECK-NEXT:  bb:
-; CHECK-NEXT:    br label %bb3
+; CHECK-NEXT:    br label [[BB3:%.*]]
 ; CHECK:       bb3:
-; CHECK-NEXT:    [[TMP:%.*]] = phi i32 [ %arg, %bb ], [ [[TMP:%.*]]15, %bb17 ]
-; CHECK-NEXT:    [[TMP4:%.*]] = phi i32 [ %arg2, %bb ], [ [[TMP18:%.*]], %bb17 ]
-; CHECK-NEXT:    [[TMP6:%.*]] = phi i32 [ 0, %bb ], [ [[TMP14:%.*]], %bb17 ]
+; CHECK-NEXT:    [[TMP:%.*]] = phi i32 [ [[ARG:%.*]], [[BB:%.*]] ], [ [[TMP15:%.*]], [[BB17:%.*]] ]
+; CHECK-NEXT:    [[TMP4:%.*]] = phi i32 [ [[ARG2:%.*]], [[BB]] ], [ [[TMP18:%.*]], [[BB17]] ]
+; CHECK-NEXT:    [[TMP6:%.*]] = phi i32 [ 0, [[BB]] ], [ [[TMP14:%.*]], [[BB17]] ]
 ; CHECK-NEXT:    [[TMP7:%.*]] = sext i32 [[TMP]] to i64
 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [1024 x i32], ptr @global, i64 0, i64 [[TMP7]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = load i32, ptr [[TMP8]], align 4
 ; CHECK-NEXT:    [[TMP10:%.*]] = add nsw i32 [[TMP6]], [[TMP9]]
 ; CHECK-NEXT:    [[TMP14]] = add nsw i32 [[TMP10]], [[TMP9]]
-; CHECK-NEXT:    [[TMP15:%.*]] = add nsw i32 [[TMP]], %arg1
-; CHECK-NEXT:    br label %bb17
+; CHECK-NEXT:    [[TMP15]] = add nsw i32 [[TMP]], [[ARG1:%.*]]
+; CHECK-NEXT:    br label [[BB17]]
 ; CHECK:       bb17:
 ; CHECK-NEXT:    [[TMP18]] = add i32 [[TMP4]], -1
 ; CHECK-NEXT:    [[TMP19:%.*]] = icmp ne i32 [[TMP4]], 0
-; CHECK-NEXT:    br i1 [[TMP19]], label %bb3, label %bb20
+; CHECK-NEXT:    br i1 [[TMP19]], label [[BB3]], label [[BB20:%.*]]
 ; CHECK:       bb20:
 ; CHECK-NEXT:    ret i32 [[TMP14]]
 ;
diff --git a/llvm/test/Transforms/NewGVN/fold-const-expr.ll b/llvm/test/Transforms/NewGVN/fold-const-expr.ll
index 2821791469494a..54020b88d65a80 100644
--- a/llvm/test/Transforms/NewGVN/fold-const-expr.ll
+++ b/llvm/test/Transforms/NewGVN/fold-const-expr.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; GVN failed to do constant expression folding and expanded
 ; them unfolded in many places, producing exponentially large const
 ; expressions. As a result, the compilation never fisished.
@@ -6,6 +7,16 @@
 ; RUN: opt -passes=newgvn -S < %s | FileCheck %s
 %2 = type { i32, i32, i32, i32, i32 }
 define i32 @_Z16vector3util_mainv(i32 %x, i32 %y)  {
+; CHECK-LABEL: define i32 @_Z16vector3util_mainv(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = alloca [[TMP0:%.*]], align 4
+; CHECK-NEXT:    [[TMP114:%.*]] = getelementptr inbounds [[TMP0]], ptr [[TMP1]], i64 0, i32 1
+; CHECK-NEXT:    store <4 x i32> <i32 234567891, i32 345678912, i32 456789123, i32 0>, ptr [[TMP114]], align 4
+; CHECK-NEXT:    store i32 310393545, ptr [[TMP114]], align 4
+; CHECK-NEXT:    store i32 -383584258, ptr [[TMP114]], align 4
+; CHECK-NEXT:    store i32 -57163022, ptr [[TMP114]], align 4
+; CHECK-NEXT:    ret i32 0
+;
   %tmp1 = alloca %2, align 4
   %tmp114 = getelementptr inbounds %2, ptr %tmp1, i64 0, i32 1
   store <4 x i32> <i32 234567891, i32 345678912, i32 456789123, i32 0>, ptr %tmp114, align 4
@@ -36,7 +47,6 @@ define i32 @_Z16vector3util_mainv(i32 %x, i32 %y)  {
   %tmp1739 = shl i32 %tmp1738, 22
   %tmp1740 = xor i32 %tmp1739, %tmp1738
   store i32 %tmp1740, ptr %tmp1683, align 4
-; CHECK: store i32 310393545, ptr %tmp114, align 4
   %tmp1756 = getelementptr inbounds %2, ptr %tmp1, i64 0, i32 1
   %tmp1761 = load i32, ptr %tmp1756, align 4
   %tmp1766 = shl i32 %tmp1761, 5
@@ -64,7 +74,6 @@ define i32 @_Z16vector3util_mainv(i32 %x, i32 %y)  {
   %tmp1812 = shl i32 %tmp1811, 22
   %tmp1813 = xor i32 %tmp1812, %tmp1811
   store i32 %tmp1813, ptr %tmp1756, align 4
-; CHECK: store i32 -383584258, ptr %tmp114, align 4
   %tmp2645 = getelementptr inbounds %2, ptr %tmp1, i64 0, i32 1
   %tmp2650 = load i32, ptr %tmp2645, align 4
   %tmp2655 = shl i32 %tmp2650, 5
@@ -92,6 +101,5 @@ define i32 @_Z16vector3util_mainv(i32 %x, i32 %y)  {
   %tmp2701 = shl i32 %tmp2700, 22
   %tmp2702 = xor i32 %tmp2701, %tmp2700
   store i32 %tmp2702, ptr %tmp2645, align 4
-; CHECK: store i32 -57163022, ptr %tmp114, align 4
   ret i32 0
 }
diff --git a/llvm/test/Transforms/NewGVN/fpmath.ll b/llvm/test/Transforms/NewGVN/fpmath.ll
index e8cec8af3e0212..d936c01bbe7887 100644
--- a/llvm/test/Transforms/NewGVN/fpmath.ll
+++ b/llvm/test/Transforms/NewGVN/fpmath.ll
@@ -1,10 +1,13 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt -passes=newgvn -S < %s | FileCheck %s
 
 define double @test1(double %x, double %y) {
-; CHECK: @test1(double %x, double %y)
-; CHECK: %add1 = fadd double %x, %y
-; CHECK-NOT: fpmath
-; CHECK: %foo = fadd double %add1, %add1
+; CHECK-LABEL: define double @test1(
+; CHECK-SAME: double [[X:%.*]], double [[Y:%.*]]) {
+; CHECK-NEXT:    [[ADD1:%.*]] = fadd double [[X]], [[Y]]
+; CHECK-NEXT:    [[FOO:%.*]] = fadd double [[ADD1]], [[ADD1]]
+; CHECK-NEXT:    ret double [[FOO]]
+;
   %add1 = fadd double %x, %y, !fpmath !0
   %add2 = fadd double %x, %y
   %foo = fadd double %add1, %add2
@@ -12,9 +15,12 @@ define double @test1(double %x, double %y) {
 }
 
 define double @test2(double %x, double %y) {
-; CHECK: @test2(double %x, double %y)
-; CHECK: %add1 = fadd double %x, %y, !fpmath ![[MD0:[0-9]+]]
-; CHECK: %foo = fadd double %add1, %add1
+; CHECK-LABEL: define double @test2(
+; CHECK-SAME: double [[X:%.*]], double [[Y:%.*]]) {
+; CHECK-NEXT:    [[ADD1:%.*]] = fadd double [[X]], [[Y]], !fpmath [[META0:![0-9]+]]
+; CHECK-NEXT:    [[FOO:%.*]] = fadd double [[ADD1]], [[ADD1]]
+; CHECK-NEXT:    ret double [[FOO]]
+;
   %add1 = fadd double %x, %y, !fpmath !0
   %add2 = fadd double %x, %y, !fpmath !0
   %foo = fadd double %add1, %add2
@@ -22,9 +28,12 @@ define double @test2(double %x, double %y) {
 }
 
 define double @test3(double %x, double %y) {
-; CHECK: @test3(double %x, double %y)
-; CHECK: %add1 = fadd double %x, %y, !fpmath ![[MD1:[0-9]+]]
-; CHECK: %foo = fadd double %add1, %add1
+; CHECK-LABEL: define double @test3(
+; CHECK-SAME: double [[X:%.*]], double [[Y:%.*]]) {
+; CHECK-NEXT:    [[ADD1:%.*]] = fadd double [[X]], [[Y]], !fpmath [[META1:![0-9]+]]
+; CHECK-NEXT:    [[FOO:%.*]] = fadd double [[ADD1]], [[ADD1]]
+; CHECK-NEXT:    ret double [[FOO]]
+;
   %add1 = fadd double %x, %y, !fpmath !1
   %add2 = fadd double %x, %y, !fpmath !0
   %foo = fadd double %add1, %add2
@@ -32,9 +41,12 @@ define double @test3(double %x, double %y) {
 }
 
 define double @test4(double %x, double %y) {
-; CHECK: @test4(double %x, double %y)
-; CHECK: %add1 = fadd double %x, %y, !fpmath ![[MD1]]
-; CHECK: %foo = fadd double %add1, %add1
+; CHECK-LABEL: define double @test4(
+; CHECK-SAME: double [[X:%.*]], double [[Y:%.*]]) {
+; CHECK-NEXT:    [[ADD1:%.*]] = fadd double [[X]], [[Y]], !fpmath [[META1]]
+; CHECK-NEXT:    [[FOO:%.*]] = fadd double [[ADD1]], [[ADD1]]
+; CHECK-NEXT:    ret double [[FOO]]
+;
   %add1 = fadd double %x, %y, !fpmath !0
   %add2 = fadd double %x, %y, !fpmath !1
   %foo = fadd double %add1, %add2
@@ -42,17 +54,22 @@ define double @test4(double %x, double %y) {
 }
 
 define double @test5(double %x) {
-; CHECK: @test5(double %x)
-; CHECK: %neg1 = fneg double %x, !fpmath ![[MD1]]
-; CHECK: %foo = fadd double %neg1, %neg1
+; CHECK-LABEL: define double @test5(
+; CHECK-SAME: double [[X:%.*]]) {
+; CHECK-NEXT:    [[NEG1:%.*]] = fneg double [[X]], !fpmath [[META1]]
+; CHECK-NEXT:    [[FOO:%.*]] = fadd double [[NEG1]], [[NEG1]]
+; CHECK-NEXT:    ret double [[FOO]]
+;
   %neg1 = fneg double %x, !fpmath !0
   %neg2 = fneg double %x, !fpmath !1
   %foo = fadd double %neg1, %neg2
   ret double %foo
 }
 
-; CHECK: ![[MD0]] = !{float 5.000000e+00}
-; CHECK: ![[MD1]] = !{float 2.500000e+00}
 
 !0 = !{ float 5.0 }
 !1 = !{ float 2.5 }
+;.
+; CHECK: [[META0]] = !{float 5.000000e+00}
+; CHECK: [[META1]] = !{float 2.500000e+00}
+;.
diff --git a/llvm/test/Transforms/NewGVN/funclet.ll b/llvm/test/Transforms/NewGVN/funclet.ll
index 3df3f940ec2d79..8c1cbd6b08fd7f 100644
--- a/llvm/test/Transforms/NewGVN/funclet.ll
+++ b/llvm/test/Transforms/NewGVN/funclet.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt -passes=newgvn -S < %s | FileCheck %s
 target datalayout = "e-m:x-p:32:32-i64:64-f80:32-n8:16:32-a:0:32-S32"
 target triple = "i686-pc-windows-msvc"
@@ -8,13 +9,35 @@ target triple = "i686-pc-windows-msvc"
 @"_TI1?AUA@@" = external constant %eh.ThrowInfo
 
 define i8 @f() personality ptr @__CxxFrameHandler3 {
+; CHECK-LABEL: define i8 @f() personality ptr @__CxxFrameHandler3 {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[B:%.*]] = alloca i8, align 1
+; CHECK-NEXT:    [[C:%.*]] = alloca i8, align 1
+; CHECK-NEXT:    store i8 42, ptr [[B]], align 1
+; CHECK-NEXT:    store i8 13, ptr [[C]], align 1
+; CHECK-NEXT:    invoke void @_CxxThrowException(ptr [[B]], ptr nonnull @"_TI1?AUA@@")
+; CHECK-NEXT:            to label [[UNREACHABLE:%.*]] unwind label [[CATCH_DISPATCH:%.*]]
+; CHECK:       catch.dispatch:
+; CHECK-NEXT:    [[CS1:%.*]] = catchswitch within none [label %catch] unwind to caller
+; CHECK:       catch:
+; CHECK-NEXT:    [[CATCHPAD:%.*]] = catchpad within [[CS1]] [ptr null, i32 64, ptr null]
+; CHECK-NEXT:    store i8 5, ptr [[B]], align 1
+; CHECK-NEXT:    catchret from [[CATCHPAD]] to label [[TRY_CONT:%.*]]
+; CHECK:       try.cont:
+; CHECK-NEXT:    [[LOAD_B:%.*]] = load i8, ptr [[B]], align 1
+; CHECK-NEXT:    [[LOAD_C:%.*]] = load i8, ptr [[C]], align 1
+; CHECK-NEXT:    [[ADD:%.*]] = add i8 [[LOAD_B]], [[LOAD_C]]
+; CHECK-NEXT:    ret i8 [[ADD]]
+; CHECK:       unreachable:
+; CHECK-NEXT:    unreachable
+;
 entry:
   %b = alloca i8
   %c = alloca i8
   store i8 42, ptr %b
   store i8 13, ptr %c
   invoke void @_CxxThrowException(ptr %b, ptr nonnull @"_TI1?AUA@@")
-          to label %unreachable unwind label %catch.dispatch
+  to label %unreachable unwind label %catch.dispatch
 
 catch.dispatch:                                   ; preds = %entry
   %cs1 = catchswitch within none [label %catch] unwind to caller
@@ -33,11 +56,6 @@ try.cont:                                         ; preds = %catch
 unreachable:                                      ; preds = %entry
   unreachable
 }
-; CHECK-LABEL: define i8 @f(
-; CHECK:       %[[load_b:.*]] = load i8, ptr %b
-; CHECK-NEXT:  %[[load_c:.*]] = load i8, ptr %c
-; CHECK-NEXT:  %[[add:.*]] = add i8 %[[load_b]], %[[load_c]]
-; CHECK-NEXT:  ret i8 %[[add]]
 
 declare i32 @__CxxFrameHandler3(...)
 
diff --git a/llvm/test/Transforms/NewGVN/int_sideeffect.ll b/llvm/test/Transforms/NewGVN/int_sideeffect.ll
index f715d022ba010a..a2c54bd38e93c5 100644
--- a/llvm/test/Transforms/NewGVN/int_sideeffect.ll
+++ b/llvm/test/Transforms/NewGVN/int_sideeffect.ll
@@ -1,27 +1,36 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt -S < %s -passes=newgvn | FileCheck %s
 
 declare void @llvm.sideeffect()
 
 ; Store-to-load forwarding across a @llvm.sideeffect.
 
-; CHECK-LABEL: s2l
-; CHECK-NOT: load
 define float @s2l(ptr %p) {
-    store float 0.0, ptr %p
-    call void @llvm.sideeffect()
-    %t = load float, ptr %p
-    ret float %t
+; CHECK-LABEL: define float @s2l(
+; CHECK-SAME: ptr [[P:%.*]]) {
+; CHECK-NEXT:    store float 0.000000e+00, ptr [[P]], align 4
+; CHECK-NEXT:    call void @llvm.sideeffect()
+; CHECK-NEXT:    ret float 0.000000e+00
+;
+  store float 0.0, ptr %p
+  call void @llvm.sideeffect()
+  %t = load float, ptr %p
+  ret float %t
 }
 
 ; Redundant load elimination across a @llvm.sideeffect.
 
-; CHECK-LABEL: rle
-; CHECK: load
-; CHECK-NOT: load
 define float @rle(ptr %p) {
-    %r = load float, ptr %p
-    call void @llvm.sideeffect()
-    %s = load float, ptr %p
-    %t = fadd float %r, %s
-    ret float %t
+; CHECK-LABEL: define float @rle(
+; CHECK-SAME: ptr [[P:%.*]]) {
+; CHECK-NEXT:    [[R:%.*]] = load float, ptr [[P]], align 4
+; CHECK-NEXT:    call void @llvm.sideeffect()
+; CHECK-NEXT:    [[T:%.*]] = fadd float [[R]], [[R]]
+; CHECK-NEXT:    ret float [[T]]
+;
+  %r = load float, ptr %p
+  call void @llvm.sideeffect()
+  %s = load float, ptr %p
+  %t = fadd float %r, %s
+  ret float %t
 }
diff --git a/llvm/test/Transforms/NewGVN/invariant.group.ll b/llvm/test/Transforms/NewGVN/invariant.group.ll
index 81e733f84ddb11..7c14059c88c677 100644
--- a/llvm/test/Transforms/NewGVN/invariant.group.ll
+++ b/llvm/test/Transforms/NewGVN/invariant.group.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt < %s -passes=newgvn -S | FileCheck %s
 
 %struct.A = type { ptr }
@@ -6,86 +7,131 @@
 
 @unknownPtr = external global i8
 
-; CHECK-LABEL: define i8 @simple() {
 define i8 @simple() {
+; CHECK-LABEL: define i8 @simple() {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[PTR:%.*]] = alloca i8, align 1
+; CHECK-NEXT:    store i8 42, ptr [[PTR]], align 1, !invariant.group [[META0:![0-9]+]]
+; CHECK-NEXT:    call void @foo(ptr [[PTR]])
+; CHECK-NEXT:    ret i8 42
+;
 entry:
-    %ptr = alloca i8
-    store i8 42, ptr %ptr, !invariant.group !0
-    call void @foo(ptr %ptr)
-
-    %a = load i8, ptr %ptr, !invariant.group !0
-    %b = load i8, ptr %ptr, !invariant.group !0
-    %c = load i8, ptr %ptr, !invariant.group !0
-; CHECK: ret i8 42
-    ret i8 %a
+  %ptr = alloca i8
+  store i8 42, ptr %ptr, !invariant.group !0
+  call void @foo(ptr %ptr)
+
+  %a = load i8, ptr %ptr, !invariant.group !0
+  %b = load i8, ptr %ptr, !invariant.group !0
+  %c = load i8, ptr %ptr, !invariant.group !0
+  ret i8 %a
 }
 
-; CHECK-LABEL: define i8 @optimizable1() {
 define i8 @optimizable1() {
+; CHECK-LABEL: define i8 @optimizable1() {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[PTR:%.*]] = alloca i8, align 1
+; CHECK-NEXT:    store i8 42, ptr [[PTR]], align 1, !invariant.group [[META0]]
+; CHECK-NEXT:    [[PTR2:%.*]] = call ptr @llvm.launder.invariant.group.p0(ptr [[PTR]])
+; CHECK-NEXT:    call void @foo(ptr [[PTR2]])
+; CHECK-NEXT:    ret i8 42
+;
 entry:
-    %ptr = alloca i8
-    store i8 42, ptr %ptr, !invariant.group !0
-    %ptr2 = call ptr @llvm.launder.invariant.group.p0(ptr %ptr)
-    %a = load i8, ptr %ptr, !invariant.group !0
-    
-    call void @foo(ptr %ptr2); call to use %ptr2
-; CHECK: ret i8 42
-    ret i8 %a
+  %ptr = alloca i8
+  store i8 42, ptr %ptr, !invariant.group !0
+  %ptr2 = call ptr @llvm.launder.invariant.group.p0(ptr %ptr)
+  %a = load i8, ptr %ptr, !invariant.group !0
+
+  call void @foo(ptr %ptr2); call to use %ptr2
+  ret i8 %a
 }
 
-; CHECK-LABEL: define i8 @optimizable2() {
 define i8 @optimizable2() {
+; CHECK-LABEL: define i8 @optimizable2() {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[PTR:%.*]] = alloca i8, align 1
+; CHECK-NEXT:    store i8 42, ptr [[PTR]], align 1, !invariant.group [[META0]]
+; CHECK-NEXT:    call void @foo(ptr [[PTR]])
+; CHECK-NEXT:    store i8 13, ptr [[PTR]], align 1
+; CHECK-NEXT:    call void @bar(i8 13)
+; CHECK-NEXT:    call void @foo(ptr [[PTR]])
+; CHECK-NEXT:    ret i8 42
+;
 entry:
-    %ptr = alloca i8
-    store i8 42, ptr %ptr, !invariant.group !0
-    call void @foo(ptr %ptr)
-    
-    store i8 13, ptr %ptr ; can't use this store with invariant.group
-    %a = load i8, ptr %ptr 
-    call void @bar(i8 %a) ; call to use %a
-    
-    call void @foo(ptr %ptr)
-    %b = load i8, ptr %ptr, !invariant.group !0
-    
-; CHECK: ret i8 42
-    ret i8 %b
+  %ptr = alloca i8
+  store i8 42, ptr %ptr, !invariant.group !0
+  call void @foo(ptr %ptr)
+
+  store i8 13, ptr %ptr ; can't use this store with invariant.group
+  %a = load i8, ptr %ptr
+  call void @bar(i8 %a) ; call to use %a
+
+  call void @foo(ptr %ptr)
+  %b = load i8, ptr %ptr, !invariant.group !0
+
+  ret i8 %b
 }
 
-; CHECK-LABEL: define i1 @proveEqualityForStrip(
 define i1 @proveEqualityForStrip(ptr %a) {
+; CHECK-LABEL: define i1 @proveEqualityForStrip(
+; CHECK-SAME: ptr [[A:%.*]]) {
+; CHECK-NEXT:    ret i1 true
+;
   %b1 = call ptr @llvm.strip.invariant.group.p0(ptr %a)
-; CHECK-NOT: llvm.strip.invariant.group
   %b2 = call ptr @llvm.strip.invariant.group.p0(ptr %a)
   %r = icmp eq ptr %b1, %b2
-; CHECK: ret i1 true
   ret i1 %r
 }
 
-; CHECK-LABEL: define i8 @unoptimizable1() {
 define i8 @unoptimizable1() {
+; CHECK-LABEL: define i8 @unoptimizable1() {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[PTR:%.*]] = alloca i8, align 1
+; CHECK-NEXT:    store i8 42, ptr [[PTR]], align 1
+; CHECK-NEXT:    call void @foo(ptr [[PTR]])
+; CHECK-NEXT:    [[A:%.*]] = load i8, ptr [[PTR]], align 1, !invariant.group [[META0]]
+; CHECK-NEXT:    ret i8 [[A]]
+;
 entry:
-    %ptr = alloca i8
-    store i8 42, ptr %ptr
-    call void @foo(ptr %ptr)
-    %a = load i8, ptr %ptr, !invariant.group !0
-; CHECK: ret i8 %a
-    ret i8 %a
+  %ptr = alloca i8
+  store i8 42, ptr %ptr
+  call void @foo(ptr %ptr)
+  %a = load i8, ptr %ptr, !invariant.group !0
+  ret i8 %a
 }
 
 ; NewGVN doesn't support assumes.
-; CHECK-LABEL: define void @indirectLoads() {
 define void @indirectLoads() {
+; CHECK-LABEL: define void @indirectLoads() {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A:%.*]] = alloca ptr, align 8
+; CHECK-NEXT:    [[CALL:%.*]] = call ptr @getPointer(ptr null)
+; CHECK-NEXT:    call void @_ZN1AC1Ev(ptr [[CALL]])
+; CHECK-NEXT:    [[VTABLE:%.*]] = load ptr, ptr [[CALL]], align 8, !invariant.group [[META0]]
+; CHECK-NEXT:    [[CMP_VTABLES:%.*]] = icmp eq ptr [[VTABLE]], getelementptr inbounds ([3 x ptr], ptr @_ZTV1A, i64 0, i64 2)
+; CHECK-NEXT:    call void @llvm.assume(i1 [[CMP_VTABLES]])
+; CHECK-NEXT:    store ptr [[CALL]], ptr [[A]], align 8
+; CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[VTABLE]], align 8
+; CHECK-NEXT:    call void [[TMP0]](ptr [[CALL]])
+; CHECK-NEXT:    [[VTABLE2:%.*]] = load ptr, ptr [[CALL]], align 8, !invariant.group [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[VTABLE2]], align 8
+; CHECK-NEXT:    call void [[TMP1]](ptr [[CALL]])
+; CHECK-NEXT:    [[VTABLE4:%.*]] = load ptr, ptr [[CALL]], align 8, !invariant.group [[META0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[VTABLE4]], align 8
+; CHECK-NEXT:    call void [[TMP2]](ptr [[CALL]])
+; CHECK-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[VTABLE]], align 8
+; CHECK-NEXT:    call void [[TMP3]](ptr [[CALL]])
+; CHECK-NEXT:    ret void
+;
 entry:
   %a = alloca ptr, align 8
-  
-  %call = call ptr @getPointer(ptr null) 
+
+  %call = call ptr @getPointer(ptr null)
   call void @_ZN1AC1Ev(ptr %call)
-  
-; CHECK: %vtable = load {{.*}} !invariant.group
+
   %vtable = load ptr, ptr %call, align 8, !invariant.group !0
   %cmp.vtables = icmp eq ptr %vtable, getelementptr inbounds ([3 x ptr], ptr @_ZTV1A, i64 0, i64 2)
   call void @llvm.assume(i1 %cmp.vtables)
-  
+
   store ptr %call, ptr %a, align 8
   %0 = load ptr, ptr %a, align 8
 
@@ -98,36 +144,45 @@ entry:
 ; FIXME: call void @_ZN1A3fooEv(
   %vtable2 = load ptr, ptr %2, align 8, !invariant.group !0
   %3 = load ptr, ptr %vtable2, align 8
-  
+
   call void %3(ptr %2)
   %4 = load ptr, ptr %a, align 8
-  
+
   %vtable4 = load ptr, ptr %4, align 8, !invariant.group !0
   %5 = load ptr, ptr %vtable4, align 8
 ; FIXME: call void @_ZN1A3fooEv(
   call void %5(ptr %4)
- 
+
   %vtable5 = load ptr, ptr %call, align 8, !invariant.group !0
   %6 = load ptr, ptr %vtable5, align 8
 ; FIXME: call void @_ZN1A3fooEv(
   call void %6(ptr %4)
-  
+
   ret void
 }
 
 ; NewGVN won't CSE loads with different pointee types.
-; CHECK-LABEL: define void @combiningBitCastWithLoad() {
 define void @combiningBitCastWithLoad() {
+; CHECK-LABEL: define void @combiningBitCastWithLoad() {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A:%.*]] = alloca ptr, align 8
+; CHECK-NEXT:    [[CALL:%.*]] = call ptr @getPointer(ptr null)
+; CHECK-NEXT:    call void @_ZN1AC1Ev(ptr [[CALL]])
+; CHECK-NEXT:    [[VTABLE:%.*]] = load ptr, ptr [[CALL]], align 8, !invariant.group [[META0]]
+; CHECK-NEXT:    store ptr [[CALL]], ptr [[A]], align 8
+; CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[VTABLE]], align 8
+; CHECK-NEXT:    call void [[TMP0]](ptr [[CALL]])
+; CHECK-NEXT:    ret void
+;
 entry:
   %a = alloca ptr, align 8
-  
-  %call = call ptr @getPointer(ptr null) 
+
+  %call = call ptr @getPointer(ptr null)
   call void @_ZN1AC1Ev(ptr %call)
-  
-; CHECK: %vtable = load {{.*}} !invariant.group
+
   %vtable = load ptr, ptr %call, align 8, !invariant.group !0
   %cmp.vtables = icmp eq ptr %vtable, getelementptr inbounds ([3 x ptr], ptr @_ZTV1A, i64 0, i64 2)
-  
+
   store ptr %call, ptr %a, align 8
 ; FIXME-NOT: !invariant.group
   %0 = load ptr, ptr %a, align 8
@@ -139,173 +194,245 @@ entry:
   ret void
 }
 
-; CHECK-LABEL:define void @loadCombine() {
 define void @loadCombine() {
+; CHECK-LABEL: define void @loadCombine() {
+; CHECK-NEXT:  enter:
+; CHECK-NEXT:    [[PTR:%.*]] = alloca i8, align 1
+; CHECK-NEXT:    store i8 42, ptr [[PTR]], align 1
+; CHECK-NEXT:    call void @foo(ptr [[PTR]])
+; CHECK-NEXT:    [[A:%.*]] = load i8, ptr [[PTR]], align 1, !invariant.group [[META0]]
+; CHECK-NEXT:    call void @bar(i8 [[A]])
+; CHECK-NEXT:    call void @bar(i8 [[A]])
+; CHECK-NEXT:    ret void
+;
 enter:
   %ptr = alloca i8
   store i8 42, ptr %ptr
   call void @foo(ptr %ptr)
-; CHECK: %[[A:.*]] = load i8, ptr %ptr, align 1, !invariant.group
   %a = load i8, ptr %ptr, !invariant.group !0
-; CHECK-NOT: load
   %b = load i8, ptr %ptr, !invariant.group !0
-; CHECK: call void @bar(i8 %[[A]])
   call void @bar(i8 %a)
-; CHECK: call void @bar(i8 %[[A]])
   call void @bar(i8 %b)
   ret void
 }
 
-; CHECK-LABEL: define void @loadCombine1() {
 define void @loadCombine1() {
+; CHECK-LABEL: define void @loadCombine1() {
+; CHECK-NEXT:  enter:
+; CHECK-NEXT:    [[PTR:%.*]] = alloca i8, align 1
+; CHECK-NEXT:    store i8 42, ptr [[PTR]], align 1
+; CHECK-NEXT:    call void @foo(ptr [[PTR]])
+; CHECK-NEXT:    [[C:%.*]] = load i8, ptr [[PTR]], align 1, !invariant.group [[META0]]
+; CHECK-NEXT:    call void @bar(i8 [[C]])
+; CHECK-NEXT:    call void @bar(i8 [[C]])
+; CHECK-NEXT:    ret void
+;
 enter:
   %ptr = alloca i8
   store i8 42, ptr %ptr
   call void @foo(ptr %ptr)
-; CHECK: %[[D:.*]] = load i8, ptr %ptr, align 1, !invariant.group
   %c = load i8, ptr %ptr
-; CHECK-NOT: load
   %d = load i8, ptr %ptr, !invariant.group !0
-; CHECK: call void @bar(i8 %[[D]])
   call void @bar(i8 %c)
-; CHECK: call void @bar(i8 %[[D]])
   call void @bar(i8 %d)
   ret void
 }
 
-; CHECK-LABEL: define void @loadCombine2() {    
 define void @loadCombine2() {
+; CHECK-LABEL: define void @loadCombine2() {
+; CHECK-NEXT:  enter:
+; CHECK-NEXT:    [[PTR:%.*]] = alloca i8, align 1
+; CHECK-NEXT:    store i8 42, ptr [[PTR]], align 1
+; CHECK-NEXT:    call void @foo(ptr [[PTR]])
+; CHECK-NEXT:    [[E:%.*]] = load i8, ptr [[PTR]], align 1, !invariant.group [[META0]]
+; CHECK-NEXT:    call void @bar(i8 [[E]])
+; CHECK-NEXT:    call void @bar(i8 [[E]])
+; CHECK-NEXT:    ret void
+;
 enter:
   %ptr = alloca i8
   store i8 42, ptr %ptr
   call void @foo(ptr %ptr)
-; CHECK: %[[E:.*]] = load i8, ptr %ptr, align 1, !invariant.group
   %e = load i8, ptr %ptr, !invariant.group !0
-; CHECK-NOT: load
   %f = load i8, ptr %ptr
-; CHECK: call void @bar(i8 %[[E]])
   call void @bar(i8 %e)
-; CHECK: call void @bar(i8 %[[E]])
   call void @bar(i8 %f)
   ret void
 }
 
-; CHECK-LABEL: define void @loadCombine3() {
 define void @loadCombine3() {
+; CHECK-LABEL: define void @loadCombine3() {
+; CHECK-NEXT:  enter:
+; CHECK-NEXT:    [[PTR:%.*]] = alloca i8, align 1
+; CHECK-NEXT:    store i8 42, ptr [[PTR]], align 1
+; CHECK-NEXT:    call void @foo(ptr [[PTR]])
+; CHECK-NEXT:    [[E:%.*]] = load i8, ptr [[PTR]], align 1, !invariant.group [[META0]]
+; CHECK-NEXT:    call void @bar(i8 [[E]])
+; CHECK-NEXT:    call void @bar(i8 [[E]])
+; CHECK-NEXT:    ret void
+;
 enter:
   %ptr = alloca i8
   store i8 42, ptr %ptr
   call void @foo(ptr %ptr)
-; CHECK: %[[E:.*]] = load i8, ptr %ptr, align 1, !invariant.group
   %e = load i8, ptr %ptr, !invariant.group !0
-; CHECK-NOT: load
   %f = load i8, ptr %ptr, !invariant.group !0
-; CHECK: call void @bar(i8 %[[E]])
   call void @bar(i8 %e)
-; CHECK: call void @bar(i8 %[[E]])
   call void @bar(i8 %f)
   ret void
 }
 
-; CHECK-LABEL: define i8 @unoptimizable2() {
 define i8 @unoptimizable2() {
+; CHECK-LABEL: define i8 @unoptimizable2() {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[PTR:%.*]] = alloca i8, align 1
+; CHECK-NEXT:    store i8 42, ptr [[PTR]], align 1
+; CHECK-NEXT:    call void @foo(ptr [[PTR]])
+; CHECK-NEXT:    [[A:%.*]] = load i8, ptr [[PTR]], align 1
+; CHECK-NEXT:    call void @foo(ptr [[PTR]])
+; CHECK-NEXT:    ret i8 [[A]]
+;
 entry:
-    %ptr = alloca i8
-    store i8 42, ptr %ptr
-    call void @foo(ptr %ptr)
-    %a = load i8, ptr %ptr
-    call void @foo(ptr %ptr)
-    %b = load i8, ptr %ptr, !invariant.group !0
-    
-; CHECK: ret i8 %a
-    ret i8 %a
+  %ptr = alloca i8
+  store i8 42, ptr %ptr
+  call void @foo(ptr %ptr)
+  %a = load i8, ptr %ptr
+  call void @foo(ptr %ptr)
+  %b = load i8, ptr %ptr, !invariant.group !0
+
+  ret i8 %a
 }
 
-; CHECK-LABEL: define i8 @unoptimizable3() {
 define i8 @unoptimizable3() {
+; CHECK-LABEL: define i8 @unoptimizable3() {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[PTR:%.*]] = alloca i8, align 1
+; CHECK-NEXT:    store i8 42, ptr [[PTR]], align 1, !invariant.group [[META0]]
+; CHECK-NEXT:    [[PTR2:%.*]] = call ptr @getPointer(ptr [[PTR]])
+; CHECK-NEXT:    [[A:%.*]] = load i8, ptr [[PTR2]], align 1, !invariant.group [[META0]]
+; CHECK-NEXT:    ret i8 [[A]]
+;
 entry:
-    %ptr = alloca i8
-    store i8 42, ptr %ptr, !invariant.group !0
-    %ptr2 = call ptr @getPointer(ptr %ptr)
-    %a = load i8, ptr %ptr2, !invariant.group !0
-    
-; CHECK: ret i8 %a
-    ret i8 %a
+  %ptr = alloca i8
+  store i8 42, ptr %ptr, !invariant.group !0
+  %ptr2 = call ptr @getPointer(ptr %ptr)
+  %a = load i8, ptr %ptr2, !invariant.group !0
+
+  ret i8 %a
 }
 
 ; NewGVN cares about the launder for some reason.
-; CHECK-LABEL: define i8 @optimizable4() {
 define i8 @optimizable4() {
+; CHECK-LABEL: define i8 @optimizable4() {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[PTR:%.*]] = alloca i8, align 1
+; CHECK-NEXT:    store i8 42, ptr [[PTR]], align 1
+; CHECK-NEXT:    [[PTR2:%.*]] = call ptr @llvm.launder.invariant.group.p0(ptr [[PTR]])
+; CHECK-NEXT:    [[A:%.*]] = load i8, ptr [[PTR2]], align 1
+; CHECK-NEXT:    ret i8 [[A]]
+;
 entry:
-    %ptr = alloca i8
-    store i8 42, ptr %ptr
-    %ptr2 = call ptr @llvm.launder.invariant.group.p0(ptr %ptr)
+  %ptr = alloca i8
+  store i8 42, ptr %ptr
+  %ptr2 = call ptr @llvm.launder.invariant.group.p0(ptr %ptr)
 ; FIXME-NOT: load
-    %a = load i8, ptr %ptr2
-    
+  %a = load i8, ptr %ptr2
+
 ; FIXME: ret i8 42
-    ret i8 %a
+  ret i8 %a
 }
 
-; CHECK-LABEL: define i8 @volatile1() {
 define i8 @volatile1() {
+; CHECK-LABEL: define i8 @volatile1() {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[PTR:%.*]] = alloca i8, align 1
+; CHECK-NEXT:    store i8 42, ptr [[PTR]], align 1, !invariant.group [[META0]]
+; CHECK-NEXT:    call void @foo(ptr [[PTR]])
+; CHECK-NEXT:    [[B:%.*]] = load volatile i8, ptr [[PTR]], align 1
+; CHECK-NEXT:    call void @bar(i8 [[B]])
+; CHECK-NEXT:    [[C:%.*]] = load volatile i8, ptr [[PTR]], align 1, !invariant.group [[META0]]
+; CHECK-NEXT:    call void @bar(i8 [[C]])
+; CHECK-NEXT:    ret i8 42
+;
 entry:
-    %ptr = alloca i8
-    store i8 42, ptr %ptr, !invariant.group !0
-    call void @foo(ptr %ptr)
-    %a = load i8, ptr %ptr, !invariant.group !0
-    %b = load volatile i8, ptr %ptr
-; CHECK: call void @bar(i8 %b)
-    call void @bar(i8 %b)
-
-    %c = load volatile i8, ptr %ptr, !invariant.group !0
+  %ptr = alloca i8
+  store i8 42, ptr %ptr, !invariant.group !0
+  call void @foo(ptr %ptr)
+  %a = load i8, ptr %ptr, !invariant.group !0
+  %b = load volatile i8, ptr %ptr
+  call void @bar(i8 %b)
+
+  %c = load volatile i8, ptr %ptr, !invariant.group !0
 ; We might be able to optimize this, but nobody cares
-; CHECK: call void @bar(i8 %c)
-    call void @bar(i8 %c)
-; CHECK: ret i8 42
-    ret i8 %a
+  call void @bar(i8 %c)
+  ret i8 %a
 }
 
-; CHECK-LABEL: define i8 @volatile2() {
 define i8 @volatile2() {
+; CHECK-LABEL: define i8 @volatile2() {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[PTR:%.*]] = alloca i8, align 1
+; CHECK-NEXT:    store i8 42, ptr [[PTR]], align 1, !invariant.group [[META0]]
+; CHECK-NEXT:    call void @foo(ptr [[PTR]])
+; CHECK-NEXT:    [[B:%.*]] = load volatile i8, ptr [[PTR]], align 1
+; CHECK-NEXT:    call void @bar(i8 [[B]])
+; CHECK-NEXT:    [[C:%.*]] = load volatile i8, ptr [[PTR]], align 1, !invariant.group [[META0]]
+; CHECK-NEXT:    call void @bar(i8 [[C]])
+; CHECK-NEXT:    ret i8 42
+;
 entry:
-    %ptr = alloca i8
-    store i8 42, ptr %ptr, !invariant.group !0
-    call void @foo(ptr %ptr)
-    %a = load i8, ptr %ptr, !invariant.group !0
-    %b = load volatile i8, ptr %ptr
-; CHECK: call void @bar(i8 %b)
-    call void @bar(i8 %b)
-
-    %c = load volatile i8, ptr %ptr, !invariant.group !0
+  %ptr = alloca i8
+  store i8 42, ptr %ptr, !invariant.group !0
+  call void @foo(ptr %ptr)
+  %a = load i8, ptr %ptr, !invariant.group !0
+  %b = load volatile i8, ptr %ptr
+  call void @bar(i8 %b)
+
+  %c = load volatile i8, ptr %ptr, !invariant.group !0
 ; We might be able to optimize this, but nobody cares
-; CHECK: call void @bar(i8 %c)
-    call void @bar(i8 %c)
-; CHECK: ret i8 42
-    ret i8 %a
+  call void @bar(i8 %c)
+  ret i8 %a
 }
 
-; CHECK-LABEL: define void @fun() {
 define void @fun() {
+; CHECK-LABEL: define void @fun() {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[PTR:%.*]] = alloca i8, align 1
+; CHECK-NEXT:    store i8 42, ptr [[PTR]], align 1, !invariant.group [[META0]]
+; CHECK-NEXT:    call void @foo(ptr [[PTR]])
+; CHECK-NEXT:    call void @bar(i8 42)
+; CHECK-NEXT:    ret void
+;
 entry:
-    %ptr = alloca i8
-    store i8 42, ptr %ptr, !invariant.group !0
-    call void @foo(ptr %ptr)
+  %ptr = alloca i8
+  store i8 42, ptr %ptr, !invariant.group !0
+  call void @foo(ptr %ptr)
 
-    %a = load i8, ptr %ptr, !invariant.group !0 ; Can assume that value under %ptr didn't change
-; CHECK: call void @bar(i8 42)
-    call void @bar(i8 %a)
+  %a = load i8, ptr %ptr, !invariant.group !0 ; Can assume that value under %ptr didn't change
+  call void @bar(i8 %a)
 
-    ret void
+  ret void
 }
 
 ; FIXME: NewGVN doesn't run instsimplify on a load from a vtable definition?
 ; This test checks if invariant.group understands gep with zeros
-; CHECK-LABEL: define void @testGEP0() {
 define void @testGEP0() {
+; CHECK-LABEL: define void @testGEP0() {
+; CHECK-NEXT:    [[A:%.*]] = alloca [[STRUCT_A:%.*]], align 8
+; CHECK-NEXT:    store ptr getelementptr inbounds ([3 x ptr], ptr @_ZTV1A, i64 0, i64 2), ptr [[A]], align 8, !invariant.group [[META0]]
+; CHECK-NEXT:    call void @_ZN1A3fooEv(ptr nonnull dereferenceable(8) [[A]])
+; CHECK-NEXT:    [[TMP1:%.*]] = load i8, ptr @unknownPtr, align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i8 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[TMP2]], label [[_Z1GR1A_EXIT:%.*]], label [[TMP3:%.*]]
+; CHECK:       3:
+; CHECK-NEXT:    [[TMP4:%.*]] = load ptr, ptr getelementptr inbounds ([3 x ptr], ptr @_ZTV1A, i64 0, i64 2), align 8
+; CHECK-NEXT:    call void [[TMP4]](ptr nonnull [[A]])
+; CHECK-NEXT:    br label [[_Z1GR1A_EXIT]]
+; CHECK:       _Z1gR1A.exit:
+; CHECK-NEXT:    ret void
+;
   %a = alloca %struct.A, align 8
   store ptr getelementptr inbounds ([3 x ptr], ptr @_ZTV1A, i64 0, i64 2), ptr %a, align 8, !invariant.group !0
-; CHECK: call void @_ZN1A3fooEv(ptr nonnull dereferenceable(8) %a)
   call void @_ZN1A3fooEv(ptr nonnull dereferenceable(8) %a) ; This call may change vptr
   %1 = load i8, ptr @unknownPtr, align 4
   %2 = icmp eq i8 %1, 0
@@ -325,54 +452,93 @@ _Z1gR1A.exit:                                     ; preds = %0, %3
 ; Check if no optimizations are performed with global pointers.
 ; FIXME: we could do the optimizations if we would check if dependency comes
 ; from the same function.
-; CHECK-LABEL: define void @testGlobal() {
 define void @testGlobal() {
-; CHECK:  %a = load i8, ptr @unknownPtr, align 1, !invariant.group !0
-   %a = load i8, ptr @unknownPtr, !invariant.group !0
-   call void @foo2(ptr @unknownPtr, i8 %a)
-; CHECK:  %1 = load i8, ptr @unknownPtr, align 1, !invariant.group !0
-   %1 = load i8, ptr @unknownPtr, !invariant.group !0
-   call void @bar(i8 %1)
-
-   call void @fooBit(ptr @unknownPtr, i1 1)
+; CHECK-LABEL: define void @testGlobal() {
+; CHECK-NEXT:    [[A:%.*]] = load i8, ptr @unknownPtr, align 1, !invariant.group [[META0]]
+; CHECK-NEXT:    call void @foo2(ptr @unknownPtr, i8 [[A]])
+; CHECK-NEXT:    [[TMP1:%.*]] = load i8, ptr @unknownPtr, align 1, !invariant.group [[META0]]
+; CHECK-NEXT:    call void @bar(i8 [[TMP1]])
+; CHECK-NEXT:    call void @fooBit(ptr @unknownPtr, i1 true)
+; CHECK-NEXT:    [[TMP2:%.*]] = load i1, ptr @unknownPtr, align 1, !invariant.group [[META0]]
+; CHECK-NEXT:    call void @fooBit(ptr @unknownPtr, i1 [[TMP2]])
+; CHECK-NEXT:    [[TMP3:%.*]] = load i1, ptr @unknownPtr, align 1, !invariant.group [[META0]]
+; CHECK-NEXT:    call void @fooBit(ptr @unknownPtr, i1 [[TMP3]])
+; CHECK-NEXT:    ret void
+;
+  %a = load i8, ptr @unknownPtr, !invariant.group !0
+  call void @foo2(ptr @unknownPtr, i8 %a)
+  %1 = load i8, ptr @unknownPtr, !invariant.group !0
+  call void @bar(i8 %1)
+
+  call void @fooBit(ptr @unknownPtr, i1 1)
 ; Adding regex because of canonicalization of bitcasts
-; CHECK: %2 = load i1, ptr {{.*}}, !invariant.group !0
-   %2 = load i1, ptr @unknownPtr, !invariant.group !0
-   call void @fooBit(ptr @unknownPtr, i1 %2)
-; CHECK:  %3 = load i1, ptr {{.*}}, !invariant.group !0
-   %3 = load i1, ptr @unknownPtr, !invariant.group !0
-   call void @fooBit(ptr @unknownPtr, i1 %3)
-   ret void
+  %2 = load i1, ptr @unknownPtr, !invariant.group !0
+  call void @fooBit(ptr @unknownPtr, i1 %2)
+  %3 = load i1, ptr @unknownPtr, !invariant.group !0
+  call void @fooBit(ptr @unknownPtr, i1 %3)
+  ret void
 }
 
 ; Might be similar to above where NewGVN doesn't handle loads of different types from the same location.
 ; Not super important anyway.
-; CHECK-LABEL: define void @testTrunc() {
 define void @testTrunc() {
-   %a = alloca i8
-   call void @foo(ptr %a)
-; CHECK:  %b = load i8, ptr %a, align 1, !invariant.group !0
-   %b = load i8, ptr %a, !invariant.group !0
-   call void @foo2(ptr %a, i8 %b)
-
-   %1 = load i8, ptr %a, !invariant.group !0
-; CHECK: call void @bar(i8 %b)
-   call void @bar(i8 %1)
-
-   call void @fooBit(ptr %a, i1 1)
+; CHECK-LABEL: define void @testTrunc() {
+; CHECK-NEXT:    [[A:%.*]] = alloca i8, align 1
+; CHECK-NEXT:    call void @foo(ptr [[A]])
+; CHECK-NEXT:    [[B:%.*]] = load i8, ptr [[A]], align 1, !invariant.group [[META0]]
+; CHECK-NEXT:    call void @foo2(ptr [[A]], i8 [[B]])
+; CHECK-NEXT:    call void @bar(i8 [[B]])
+; CHECK-NEXT:    call void @fooBit(ptr [[A]], i1 true)
+; CHECK-NEXT:    [[TMP1:%.*]] = load i1, ptr [[A]], align 1, !invariant.group [[META0]]
+; CHECK-NEXT:    call void @fooBit(ptr [[A]], i1 [[TMP1]])
+; CHECK-NEXT:    call void @fooBit(ptr [[A]], i1 [[TMP1]])
+; CHECK-NEXT:    ret void
+;
+  %a = alloca i8
+  call void @foo(ptr %a)
+  %b = load i8, ptr %a, !invariant.group !0
+  call void @foo2(ptr %a, i8 %b)
+
+  %1 = load i8, ptr %a, !invariant.group !0
+  call void @bar(i8 %1)
+
+  call void @fooBit(ptr %a, i1 1)
 ; FIXME: %1 = trunc i8 %b to i1
-   %2 = load i1, ptr %a, !invariant.group !0
+  %2 = load i1, ptr %a, !invariant.group !0
 ; FIXME-NEXT: call void @fooBit(ptr %a, i1 %1)
-   call void @fooBit(ptr %a, i1 %2)
-   %3 = load i1, ptr %a, !invariant.group !0
+  call void @fooBit(ptr %a, i1 %2)
+  %3 = load i1, ptr %a, !invariant.group !0
 ; FIXME-NEXT: call void @fooBit(ptr %a, i1 %1)
-   call void @fooBit(ptr %a, i1 %3)
-   ret void
+  call void @fooBit(ptr %a, i1 %3)
+  ret void
 }
 
 ; See comment in @testGEP0 on what NewGVN is lacking.
-; CHECK-LABEL: define void @handling_loops()
 define void @handling_loops() {
+; CHECK-LABEL: define void @handling_loops() {
+; CHECK-NEXT:    [[A:%.*]] = alloca [[STRUCT_A:%.*]], align 8
+; CHECK-NEXT:    store ptr getelementptr inbounds ([3 x ptr], ptr @_ZTV1A, i64 0, i64 2), ptr [[A]], align 8, !invariant.group [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load i8, ptr @unknownPtr, align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp sgt i8 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[TMP2]], label [[DOTLR_PH_I:%.*]], label [[_Z2G2R1A_EXIT:%.*]]
+; CHECK:       .lr.ph.i:
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp sgt i8 [[TMP1]], 1
+; CHECK-NEXT:    br i1 [[TMP3]], label [[DOT_CRIT_EDGE_PREHEADER:%.*]], label [[_Z2G2R1A_EXIT]]
+; CHECK:       ._crit_edge.preheader:
+; CHECK-NEXT:    br label [[DOT_CRIT_EDGE:%.*]]
+; CHECK:       ._crit_edge:
+; CHECK-NEXT:    [[TMP4:%.*]] = phi i8 [ [[TMP6:%.*]], [[DOT_CRIT_EDGE]] ], [ 1, [[DOT_CRIT_EDGE_PREHEADER]] ]
+; CHECK-NEXT:    [[TMP5:%.*]] = load ptr, ptr getelementptr inbounds ([3 x ptr], ptr @_ZTV1A, i64 0, i64 2), align 8
+; CHECK-NEXT:    call void [[TMP5]](ptr nonnull [[A]])
+; CHECK-NEXT:    [[TMP6]] = add nuw nsw i8 [[TMP4]], 1
+; CHECK-NEXT:    [[TMP7:%.*]] = load i8, ptr @unknownPtr, align 4
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp slt i8 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    br i1 [[TMP8]], label [[DOT_CRIT_EDGE]], label [[_Z2G2R1A_EXIT_LOOPEXIT:%.*]]
+; CHECK:       _Z2g2R1A.exit.loopexit:
+; CHECK-NEXT:    br label [[_Z2G2R1A_EXIT]]
+; CHECK:       _Z2g2R1A.exit:
+; CHECK-NEXT:    ret void
+;
   %a = alloca %struct.A, align 8
   store ptr getelementptr inbounds ([3 x ptr], ptr @_ZTV1A, i64 0, i64 2), ptr %a, align 8, !invariant.group !0
   %1 = load i8, ptr @unknownPtr, align 4
@@ -424,3 +590,6 @@ declare void @llvm.assume(i1 %cmp.vtables) #0
 
 attributes #0 = { nounwind }
 !0 = !{}
+;.
+; CHECK: [[META0]] = !{}
+;.
diff --git a/llvm/test/Transforms/NewGVN/invariant.start.ll b/llvm/test/Transforms/NewGVN/invariant.start.ll
index 100b79fd3bff28..9bf1c5530006bd 100644
--- a/llvm/test/Transforms/NewGVN/invariant.start.ll
+++ b/llvm/test/Transforms/NewGVN/invariant.start.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; Test to make sure llvm.invariant.start calls are not treated as clobbers.
 ; RUN: opt < %s -passes=newgvn -S | FileCheck %s
 
@@ -7,10 +8,12 @@ declare void @llvm.invariant.end.p0(ptr, i64, ptr nocapture) nounwind
 
 ; We forward store to the load across the invariant.start intrinsic
 define i8 @forward_store() {
-; CHECK-LABEL: @forward_store
-; CHECK: call ptr @llvm.invariant.start.p0(i64 1, ptr %a)
-; CHECK-NOT: load
-; CHECK: ret i8 0
+; CHECK-LABEL: define i8 @forward_store() {
+; CHECK-NEXT:    [[A:%.*]] = alloca i8, align 1
+; CHECK-NEXT:    store i8 0, ptr [[A]], align 1
+; CHECK-NEXT:    [[I:%.*]] = call ptr @llvm.invariant.start.p0(i64 1, ptr [[A]])
+; CHECK-NEXT:    ret i8 0
+;
   %a = alloca i8
   store i8 0, ptr %a
   %i = call ptr @llvm.invariant.start.p0(i64 1, ptr %a)
@@ -23,10 +26,18 @@ declare i8 @dummy(ptr nocapture) nounwind readonly
 ; We forward store to the load in the non-local analysis case,
 ; i.e. invariant.start is in another basic block.
 define i8 @forward_store_nonlocal(i1 %cond) {
-; CHECK-LABEL: forward_store_nonlocal
-; CHECK: call ptr @llvm.invariant.start.p0(i64 1, ptr %a)
-; CHECK: ret i8 0
-; CHECK: ret i8 %val
+; CHECK-LABEL: define i8 @forward_store_nonlocal(
+; CHECK-SAME: i1 [[COND:%.*]]) {
+; CHECK-NEXT:    [[A:%.*]] = alloca i8, align 1
+; CHECK-NEXT:    store i8 0, ptr [[A]], align 1
+; CHECK-NEXT:    [[I:%.*]] = call ptr @llvm.invariant.start.p0(i64 1, ptr [[A]])
+; CHECK-NEXT:    br i1 [[COND]], label [[LOADBLOCK:%.*]], label [[EXIT:%.*]]
+; CHECK:       loadblock:
+; CHECK-NEXT:    ret i8 0
+; CHECK:       exit:
+; CHECK-NEXT:    [[VAL:%.*]] = call i8 @dummy(ptr [[A]])
+; CHECK-NEXT:    ret i8 [[VAL]]
+;
   %a = alloca i8
   store i8 0, ptr %a
   %i = call ptr @llvm.invariant.start.p0(i64 1, ptr %a)
@@ -43,12 +54,14 @@ exit:
 
 ; We should not value forward %foo to the invariant.end corresponding to %bar.
 define i8 @forward_store1() {
-; CHECK-LABEL: forward_store1
-; CHECK: %foo = call ptr @llvm.invariant.start.p0
-; CHECK-NOT: load
-; CHECK: %bar = call ptr @llvm.invariant.start.p0
-; CHECK: call void @llvm.invariant.end.p0(ptr %bar, i64 1, ptr %a)
-; CHECK: ret i8 0
+; CHECK-LABEL: define i8 @forward_store1() {
+; CHECK-NEXT:    [[A:%.*]] = alloca i8, align 1
+; CHECK-NEXT:    store i8 0, ptr [[A]], align 1
+; CHECK-NEXT:    [[FOO:%.*]] = call ptr @llvm.invariant.start.p0(i64 1, ptr [[A]])
+; CHECK-NEXT:    [[BAR:%.*]] = call ptr @llvm.invariant.start.p0(i64 1, ptr [[A]])
+; CHECK-NEXT:    call void @llvm.invariant.end.p0(ptr [[BAR]], i64 1, ptr [[A]])
+; CHECK-NEXT:    ret i8 0
+;
   %a = alloca i8
   store i8 0, ptr %a
   %foo = call ptr @llvm.invariant.start.p0(i64 1, ptr %a)
diff --git a/llvm/test/Transforms/NewGVN/lifetime-simple.ll b/llvm/test/Transforms/NewGVN/lifetime-simple.ll
index 5d31101218ee67..55e46111fc9c93 100644
--- a/llvm/test/Transforms/NewGVN/lifetime-simple.ll
+++ b/llvm/test/Transforms/NewGVN/lifetime-simple.ll
@@ -1,12 +1,19 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt < %s -passes=newgvn -S | FileCheck %s
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
 target triple = "i386-apple-darwin7"
 
 define i8 @test(ptr %P) nounwind {
-; CHECK: lifetime.start
-; CHECK-NOT: load
-; CHECK: lifetime.end
+; CHECK-LABEL: define i8 @test(
+; CHECK-SAME: ptr [[P:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[P]])
+; CHECK-NEXT:    store i8 1, ptr [[P]], align 1
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[P]])
+; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[P]], align 1
+; CHECK-NEXT:    ret i8 [[TMP0]]
+;
 entry:
   call void @llvm.lifetime.start.p0(i64 32, ptr %P)
   %0 = load i8, ptr %P
diff --git a/llvm/test/Transforms/NewGVN/load-constant-mem.ll b/llvm/test/Transforms/NewGVN/load-constant-mem.ll
index 06439c59f9d2d7..ae91147237fade 100644
--- a/llvm/test/Transforms/NewGVN/load-constant-mem.ll
+++ b/llvm/test/Transforms/NewGVN/load-constant-mem.ll
@@ -7,14 +7,14 @@ define i32 @test(ptr %p, i32 %i) nounwind {
 ; CHECK-LABEL: @test(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[P:%.*]] = getelementptr [4 x i32], ptr @G, i32 0, i32 [[I:%.*]]
-; CHECK-NEXT:    store i8 4, ptr [[P:%.*]]
+; CHECK-NEXT:    store i8 4, ptr [[P1:%.*]], align 1
 ; CHECK-NEXT:    ret i32 0
 ;
 entry:
-  %P = getelementptr [4 x i32], ptr @G, i32 0, i32 %i
-  %A = load i32, ptr %P
+  %p.i = getelementptr [4 x i32], ptr @G, i32 0, i32 %i
+  %A = load i32, ptr %p.i
   store i8 4, ptr %p
-  %B = load i32, ptr %P
+  %B = load i32, ptr %p.i
   %C = sub i32 %A, %B
   ret i32 %C
 }
diff --git a/llvm/test/Transforms/NewGVN/load-from-unreachable-predecessor.ll b/llvm/test/Transforms/NewGVN/load-from-unreachable-predecessor.ll
index 74cb70066eb1dc..3ca9b9efbe4c26 100644
--- a/llvm/test/Transforms/NewGVN/load-from-unreachable-predecessor.ll
+++ b/llvm/test/Transforms/NewGVN/load-from-unreachable-predecessor.ll
@@ -1,12 +1,22 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt -passes=newgvn -S < %s | FileCheck %s
 
 ; Check that an unreachable predecessor to a PHI node doesn't cause a crash.
 ; PR21625.
 
 define i32 @f(ptr %f) {
-; CHECK: bb0:
+; CHECK-LABEL: define i32 @f(
+; CHECK-SAME: ptr [[F:%.*]]) {
+; CHECK-NEXT:  bb0:
+; CHECK-NEXT:    br label [[BB2:%.*]]
+; CHECK:       bb1:
+; CHECK-NEXT:    store i8 poison, ptr null, align 1
+; CHECK-NEXT:    br i1 false, label [[BB1:%.*]], label [[BB2]]
+; CHECK:       bb2:
+; CHECK-NEXT:    [[STOREMERGE:%.*]] = load i32, ptr null, align 4
+; CHECK-NEXT:    ret i32 [[STOREMERGE]]
+;
 ; Load should be removed, since it's ignored.
-; CHECK-NEXT: br label
 bb0:
   %bar = load ptr, ptr %f
   br label %bb2
diff --git a/llvm/test/Transforms/NewGVN/loadforward.ll b/llvm/test/Transforms/NewGVN/loadforward.ll
index d8a9022dc66f6b..85ceafd433f45a 100644
--- a/llvm/test/Transforms/NewGVN/loadforward.ll
+++ b/llvm/test/Transforms/NewGVN/loadforward.ll
@@ -9,8 +9,8 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 ;; Test that we forward the first store to the second load
 define i16 @bazinga() {
 ; CHECK-LABEL: @bazinga(
-; CHECK-NEXT:    [[_TMP10:%.*]] = load i16, ptr getelementptr inbounds (%rec11, ptr @str, i64 0, i32 1)
-; CHECK-NEXT:    store i16 [[_TMP10]], ptr @str
+; CHECK-NEXT:    [[_TMP10:%.*]] = load i16, ptr getelementptr inbounds ([[REC11:%.*]], ptr @str, i64 0, i32 1), align 2
+; CHECK-NEXT:    store i16 [[_TMP10]], ptr @str, align 2
 ; CHECK-NEXT:    [[_TMP15:%.*]] = icmp eq i16 [[_TMP10]], 3
 ; CHECK-NEXT:    [[_TMP16:%.*]] = select i1 [[_TMP15]], i16 1, i16 0
 ; CHECK-NEXT:    br label [[BB1:%.*]]
diff --git a/llvm/test/Transforms/NewGVN/malloc-load-removal.ll b/llvm/test/Transforms/NewGVN/malloc-load-removal.ll
index b487acfe93e476..3e11d9221c9dd2 100644
--- a/llvm/test/Transforms/NewGVN/malloc-load-removal.ll
+++ b/llvm/test/Transforms/NewGVN/malloc-load-removal.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt -S -passes=newgvn < %s | FileCheck %s
 ; PR13694
 
@@ -7,6 +8,17 @@ target triple = "x86_64-apple-macosx10.8.0"
 declare ptr @malloc(i64) nounwind allockind("alloc,uninitialized") allocsize(0) "alloc-family"="malloc"
 
 define noalias ptr @test1() nounwind uwtable ssp {
+; CHECK-LABEL: define noalias ptr @test1(
+; CHECK-SAME: ) #[[ATTR1:[0-9]+]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CALL:%.*]] = tail call ptr @malloc(i64 100) #[[ATTR2:[0-9]+]]
+; CHECK-NEXT:    br i1 undef, label [[IF_END:%.*]], label [[IF_THEN:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    store i8 0, ptr [[CALL]], align 1
+; CHECK-NEXT:    br label [[IF_END]]
+; CHECK:       if.end:
+; CHECK-NEXT:    ret ptr [[CALL]]
+;
 entry:
   %call = tail call ptr @malloc(i64 100) nounwind
   %0 = load i8, ptr %call, align 1
@@ -20,14 +32,22 @@ if.then:                                          ; preds = %entry
 if.end:                                           ; preds = %if.then, %entry
   ret ptr %call
 
-; CHECK-LABEL: @test1(
-; CHECK-NOT: load
-; CHECK-NOT: icmp
 }
 
 declare ptr @_Znwm(i64) nounwind
 
 define noalias ptr @test2() nounwind uwtable ssp {
+; CHECK-LABEL: define noalias ptr @test2(
+; CHECK-SAME: ) #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CALL:%.*]] = tail call ptr @_Znwm(i64 100) #[[ATTR2]]
+; CHECK-NEXT:    br i1 undef, label [[IF_END:%.*]], label [[IF_THEN:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    store i8 0, ptr [[CALL]], align 1
+; CHECK-NEXT:    br label [[IF_END]]
+; CHECK:       if.end:
+; CHECK-NEXT:    ret ptr [[CALL]]
+;
 entry:
   %call = tail call ptr @_Znwm(i64 100) nounwind
   %0 = load i8, ptr %call, align 1
@@ -41,14 +61,22 @@ if.then:                                          ; preds = %entry
 if.end:                                           ; preds = %if.then, %entry
   ret ptr %call
 
-; CHECK-LABEL: @test2(
-; CHECK-NOT: load
-; CHECK-NOT: icmp
 }
 
 declare ptr @aligned_alloc(i64 allocalign, i64) nounwind allockind("alloc,uninitialized,aligned") allocsize(1) "alloc-family"="malloc"
 
 define noalias ptr @test3() nounwind uwtable ssp {
+; CHECK-LABEL: define noalias ptr @test3(
+; CHECK-SAME: ) #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CALL:%.*]] = tail call ptr @aligned_alloc(i64 256, i64 32) #[[ATTR2]]
+; CHECK-NEXT:    br i1 undef, label [[IF_END:%.*]], label [[IF_THEN:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    store i8 0, ptr [[CALL]], align 1
+; CHECK-NEXT:    br label [[IF_END]]
+; CHECK:       if.end:
+; CHECK-NEXT:    ret ptr [[CALL]]
+;
 entry:
   %call = tail call ptr @aligned_alloc(i64 256, i64 32) nounwind
   %0 = load i8, ptr %call, align 32
@@ -62,7 +90,4 @@ if.then:                                          ; preds = %entry
 if.end:                                           ; preds = %if.then, %entry
   ret ptr %call
 
-; CHECK-LABEL: @test3(
-; CHECK-NOT: load
-; CHECK-NOT: icmp
 }
diff --git a/llvm/test/Transforms/NewGVN/memory-handling.ll b/llvm/test/Transforms/NewGVN/memory-handling.ll
index b1e1c4f2de3e1d..c72ff749ba410d 100644
--- a/llvm/test/Transforms/NewGVN/memory-handling.ll
+++ b/llvm/test/Transforms/NewGVN/memory-handling.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ;; This test is really dependent on propagating a lot of memory info around, but in the end, not
 ;; screwing up a single add.
 ; RUN: opt < %s -passes=newgvn -S | FileCheck %s
@@ -20,6 +21,121 @@ declare ptr @__ctype_b_loc() local_unnamed_addr #1
 
 ; Function Attrs: nounwind uwtable
 define void @BuildMask(ptr nocapture readonly) local_unnamed_addr #0 {
+; CHECK-LABEL: define void @BuildMask(
+; CHECK-SAME: ptr nocapture readonly [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    tail call void @llvm.memset.p0.i64(ptr align 16 @alPhrase, i8 0, i64 416, i1 false)
+; CHECK-NEXT:    tail call void @llvm.memset.p0.i64(ptr align 16 @aqMainMask, i8 0, i64 16, i1 false)
+; CHECK-NEXT:    tail call void @llvm.memset.p0.i64(ptr align 16 @aqMainSign, i8 0, i64 16, i1 false)
+; CHECK-NEXT:    br label [[DOTSINK_SPLIT:%.*]]
+; CHECK:       .sink.split:
+; CHECK-NEXT:    [[DOT0:%.*]] = phi ptr [ [[TMP0]], [[TMP1:%.*]] ], [ [[TMP3:%.*]], [[TMP14:%.*]] ]
+; CHECK-NEXT:    [[DOTSINK:%.*]] = phi i32 [ 0, [[TMP1]] ], [ [[TMP22:%.*]], [[TMP14]] ]
+; CHECK-NEXT:    store i32 [[DOTSINK]], ptr @cchPhraseLength, align 4, !tbaa [[TBAA1:![0-9]+]]
+; CHECK-NEXT:    br label [[TMP2:%.*]]
+; CHECK:       2:
+; CHECK-NEXT:    [[DOT1:%.*]] = phi ptr [ [[DOT0]], [[DOTSINK_SPLIT]] ], [ [[TMP3]], [[TMP6:%.*]] ]
+; CHECK-NEXT:    [[TMP3]] = getelementptr inbounds i8, ptr [[DOT1]], i64 1
+; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr [[DOT1]], align 1
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i8 [[TMP4]], 0
+; CHECK-NEXT:    br i1 [[TMP5]], label [[DOTPREHEADER_PREHEADER:%.*]], label [[TMP6]]
+; CHECK:       .preheader.preheader:
+; CHECK-NEXT:    br label [[DOTPREHEADER:%.*]]
+; CHECK:       6:
+; CHECK-NEXT:    [[TMP7:%.*]] = tail call ptr @__ctype_b_loc() #[[ATTR4:[0-9]+]]
+; CHECK-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[TMP7]], align 8, !tbaa [[TBAA5:![0-9]+]]
+; CHECK-NEXT:    [[TMP9:%.*]] = sext i8 [[TMP4]] to i64
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i16, ptr [[TMP8]], i64 [[TMP9]]
+; CHECK-NEXT:    [[TMP11:%.*]] = load i16, ptr [[TMP10]], align 2, !tbaa [[TBAA7:![0-9]+]]
+; CHECK-NEXT:    [[TMP12:%.*]] = and i16 [[TMP11]], 1024
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq i16 [[TMP12]], 0
+; CHECK-NEXT:    br i1 [[TMP13]], label [[TMP2]], label [[TMP14]]
+; CHECK:       14:
+; CHECK-NEXT:    [[TMP15:%.*]] = sext i8 [[TMP4]] to i32
+; CHECK-NEXT:    [[TMP16:%.*]] = tail call i32 @tolower(i32 [[TMP15]]) #[[ATTR5:[0-9]+]]
+; CHECK-NEXT:    [[TMP17:%.*]] = add nsw i32 [[TMP16]], -97
+; CHECK-NEXT:    [[TMP18:%.*]] = sext i32 [[TMP17]] to i64
+; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [26 x %struct.Letter], ptr @alPhrase, i64 0, i64 [[TMP18]], i32 0
+; CHECK-NEXT:    [[TMP20:%.*]] = load i32, ptr [[TMP19]], align 16, !tbaa [[TBAA9:![0-9]+]]
+; CHECK-NEXT:    [[TMP21:%.*]] = add i32 [[TMP20]], 1
+; CHECK-NEXT:    store i32 [[TMP21]], ptr [[TMP19]], align 16, !tbaa [[TBAA9]]
+; CHECK-NEXT:    [[TMP22]] = add nsw i32 [[DOTSINK]], 1
+; CHECK-NEXT:    br label [[DOTSINK_SPLIT]]
+; CHECK:       .preheader:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[DOTPREHEADER_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[TMP57:%.*]] ]
+; CHECK-NEXT:    [[DOT04961:%.*]] = phi i32 [ [[DOT2:%.*]], [[TMP57]] ], [ 0, [[DOTPREHEADER_PREHEADER]] ]
+; CHECK-NEXT:    [[DOT05160:%.*]] = phi i32 [ [[DOT253:%.*]], [[TMP57]] ], [ 0, [[DOTPREHEADER_PREHEADER]] ]
+; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [26 x %struct.Letter], ptr @alPhrase, i64 0, i64 [[INDVARS_IV]], i32 0
+; CHECK-NEXT:    [[TMP24:%.*]] = load i32, ptr [[TMP23]], align 16, !tbaa [[TBAA9]]
+; CHECK-NEXT:    [[TMP25:%.*]] = icmp eq i32 [[TMP24]], 0
+; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [26 x i32], ptr @auGlobalFrequency, i64 0, i64 [[INDVARS_IV]]
+; CHECK-NEXT:    br i1 [[TMP25]], label [[TMP27:%.*]], label [[TMP28:%.*]]
+; CHECK:       27:
+; CHECK-NEXT:    store i32 -1, ptr [[TMP26]], align 4, !tbaa [[TBAA1]]
+; CHECK-NEXT:    br label [[TMP57]]
+; CHECK:       28:
+; CHECK-NEXT:    store i32 0, ptr [[TMP26]], align 4, !tbaa [[TBAA1]]
+; CHECK-NEXT:    [[TMP29:%.*]] = zext i32 [[TMP24]] to i64
+; CHECK-NEXT:    br i1 false, label [[DOT_CRIT_EDGE:%.*]], label [[DOTLR_PH_PREHEADER:%.*]]
+; CHECK:       .lr.ph.preheader:
+; CHECK-NEXT:    br label [[DOTLR_PH:%.*]]
+; CHECK:       .lr.ph:
+; CHECK-NEXT:    [[DOT04658:%.*]] = phi i64 [ [[TMP31:%.*]], [[DOTLR_PH]] ], [ 1, [[DOTLR_PH_PREHEADER]] ]
+; CHECK-NEXT:    [[DOT04857:%.*]] = phi i32 [ [[TMP30:%.*]], [[DOTLR_PH]] ], [ 1, [[DOTLR_PH_PREHEADER]] ]
+; CHECK-NEXT:    [[TMP30]] = add nuw nsw i32 [[DOT04857]], 1
+; CHECK-NEXT:    [[TMP31]] = shl i64 [[DOT04658]], 1
+; CHECK-NEXT:    [[TMP32:%.*]] = icmp ult i64 [[TMP29]], [[TMP31]]
+; CHECK-NEXT:    br i1 [[TMP32]], label [[DOT_CRIT_EDGE_LOOPEXIT:%.*]], label [[DOTLR_PH]]
+; CHECK:       ._crit_edge.loopexit:
+; CHECK-NEXT:    br label [[DOT_CRIT_EDGE]]
+; CHECK:       ._crit_edge:
+; CHECK-NEXT:    [[DOT048_LCSSA:%.*]] = phi i32 [ poison, [[TMP28]] ], [ [[TMP30]], [[DOT_CRIT_EDGE_LOOPEXIT]] ]
+; CHECK-NEXT:    [[DOT046_LCSSA:%.*]] = phi i64 [ poison, [[TMP28]] ], [ [[TMP31]], [[DOT_CRIT_EDGE_LOOPEXIT]] ]
+; CHECK-NEXT:    [[TMP33:%.*]] = add nsw i32 [[DOT048_LCSSA]], [[DOT04961]]
+; CHECK-NEXT:    [[TMP34:%.*]] = icmp ugt i32 [[TMP33]], 64
+; CHECK-NEXT:    br i1 [[TMP34]], label [[TMP35:%.*]], label [[TMP39:%.*]]
+; CHECK:       35:
+; CHECK-NEXT:    [[TMP36:%.*]] = add i32 [[DOT05160]], 1
+; CHECK-NEXT:    [[TMP37:%.*]] = icmp ugt i32 [[TMP36]], 1
+; CHECK-NEXT:    br i1 [[TMP37]], label [[TMP38:%.*]], label [[TMP39]]
+; CHECK:       38:
+; CHECK-NEXT:    tail call void @Fatal(ptr @.str.7, i32 0)
+; CHECK-NEXT:    br label [[TMP39]]
+; CHECK:       39:
+; CHECK-NEXT:    [[DOT152:%.*]] = phi i32 [ [[DOT05160]], [[DOT_CRIT_EDGE]] ], [ [[TMP36]], [[TMP38]] ], [ [[TMP36]], [[TMP35]] ]
+; CHECK-NEXT:    [[DOT150:%.*]] = phi i32 [ [[DOT04961]], [[DOT_CRIT_EDGE]] ], [ 0, [[TMP38]] ], [ 0, [[TMP35]] ]
+; CHECK-NEXT:    [[TMP40:%.*]] = add i64 [[DOT046_LCSSA]], 4294967295
+; CHECK-NEXT:    [[TMP41:%.*]] = trunc i64 [[TMP40]] to i32
+; CHECK-NEXT:    [[TMP42:%.*]] = getelementptr inbounds [26 x %struct.Letter], ptr @alPhrase, i64 0, i64 [[INDVARS_IV]], i32 2
+; CHECK-NEXT:    store i32 [[TMP41]], ptr [[TMP42]], align 8, !tbaa [[TBAA11:![0-9]+]]
+; CHECK-NEXT:    [[TMP43:%.*]] = zext i32 [[DOT150]] to i64
+; CHECK-NEXT:    [[DOT046_:%.*]] = shl i64 [[DOT046_LCSSA]], [[TMP43]]
+; CHECK-NEXT:    [[TMP44:%.*]] = zext i32 [[DOT152]] to i64
+; CHECK-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [2 x i64], ptr @aqMainSign, i64 0, i64 [[TMP44]]
+; CHECK-NEXT:    [[TMP46:%.*]] = load i64, ptr [[TMP45]], align 8, !tbaa [[TBAA12:![0-9]+]]
+; CHECK-NEXT:    [[TMP47:%.*]] = or i64 [[TMP46]], [[DOT046_]]
+; CHECK-NEXT:    store i64 [[TMP47]], ptr [[TMP45]], align 8, !tbaa [[TBAA12]]
+; CHECK-NEXT:    [[TMP48:%.*]] = load i32, ptr [[TMP23]], align 16, !tbaa [[TBAA9]]
+; CHECK-NEXT:    [[TMP49:%.*]] = zext i32 [[TMP48]] to i64
+; CHECK-NEXT:    [[TMP50:%.*]] = shl i64 [[TMP49]], [[TMP43]]
+; CHECK-NEXT:    [[TMP51:%.*]] = getelementptr inbounds [2 x i64], ptr @aqMainMask, i64 0, i64 [[TMP44]]
+; CHECK-NEXT:    [[TMP52:%.*]] = load i64, ptr [[TMP51]], align 8, !tbaa [[TBAA12]]
+; CHECK-NEXT:    [[TMP53:%.*]] = or i64 [[TMP50]], [[TMP52]]
+; CHECK-NEXT:    store i64 [[TMP53]], ptr [[TMP51]], align 8, !tbaa [[TBAA12]]
+; CHECK-NEXT:    [[TMP54:%.*]] = getelementptr inbounds [26 x %struct.Letter], ptr @alPhrase, i64 0, i64 [[INDVARS_IV]], i32 1
+; CHECK-NEXT:    store i32 [[DOT150]], ptr [[TMP54]], align 4, !tbaa [[TBAA14:![0-9]+]]
+; CHECK-NEXT:    [[TMP55:%.*]] = getelementptr inbounds [26 x %struct.Letter], ptr @alPhrase, i64 0, i64 [[INDVARS_IV]], i32 3
+; CHECK-NEXT:    store i32 [[DOT152]], ptr [[TMP55]], align 4, !tbaa [[TBAA15:![0-9]+]]
+; CHECK-NEXT:    [[TMP56:%.*]] = add nsw i32 [[DOT150]], [[DOT048_LCSSA]]
+; CHECK-NEXT:    br label [[TMP57]]
+; CHECK:       57:
+; CHECK-NEXT:    [[DOT253]] = phi i32 [ [[DOT05160]], [[TMP27]] ], [ [[DOT152]], [[TMP39]] ]
+; CHECK-NEXT:    [[DOT2]] = phi i32 [ [[DOT04961]], [[TMP27]] ], [ [[TMP56]], [[TMP39]] ]
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT]], 26
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[DOTPREHEADER]], label [[TMP58:%.*]]
+; CHECK:       58:
+; CHECK-NEXT:    ret void
+;
   tail call void @llvm.memset.p0.i64(ptr align 16 @alPhrase, i8 0, i64 416, i1 false)
   tail call void @llvm.memset.p0.i64(ptr align 16 @aqMainMask, i8 0, i64 16, i1 false)
   tail call void @llvm.memset.p0.i64(ptr align 16 @aqMainSign, i8 0, i64 16, i1 false)
@@ -113,7 +229,6 @@ define void @BuildMask(ptr nocapture readonly) local_unnamed_addr #0 {
 ; If we screw up the revisitation of the users of store of %sink above
 ; we will end up propagating and simplifying this to 1 in the final output
 ; because we keep an optimistic assumption we should not.
-; CHECK:  add i32 %.05160, 1
   %37 = add i32 %.05160, 1
   %38 = icmp ugt i32 %37, 1
   br i1 %38, label %39, label %40
@@ -193,3 +308,20 @@ attributes #5 = { nounwind readonly }
 !14 = !{!"long", !3, i64 0}
 !15 = !{!11, !2, i64 4}
 !16 = !{!11, !2, i64 12}
+;.
+; CHECK: [[TBAA1]] = !{[[META2:![0-9]+]], [[META2]], i64 0}
+; CHECK: [[META2]] = !{!"int", [[META3:![0-9]+]], i64 0}
+; CHECK: [[META3]] = !{!"omnipotent char", [[META4:![0-9]+]], i64 0}
+; CHECK: [[META4]] = !{!"Simple C/C++ TBAA"}
+; CHECK: [[TBAA5]] = !{[[META6:![0-9]+]], [[META6]], i64 0}
+; CHECK: [[META6]] = !{!"any pointer", [[META3]], i64 0}
+; CHECK: [[TBAA7]] = !{[[META8:![0-9]+]], [[META8]], i64 0}
+; CHECK: [[META8]] = !{!"short", [[META3]], i64 0}
+; CHECK: [[TBAA9]] = !{[[META10:![0-9]+]], [[META2]], i64 0}
+; CHECK: [[META10]] = !{!"", [[META2]], i64 0, [[META2]], i64 4, [[META2]], i64 8, [[META2]], i64 12}
+; CHECK: [[TBAA11]] = !{[[META10]], [[META2]], i64 8}
+; CHECK: [[TBAA12]] = !{[[META13:![0-9]+]], [[META13]], i64 0}
+; CHECK: [[META13]] = !{!"long", [[META3]], i64 0}
+; CHECK: [[TBAA14]] = !{[[META10]], [[META2]], i64 4}
+; CHECK: [[TBAA15]] = !{[[META10]], [[META2]], i64 12}
+;.
diff --git a/llvm/test/Transforms/NewGVN/metadata-nonnull.ll b/llvm/test/Transforms/NewGVN/metadata-nonnull.ll
index cd0922f901af4c..5de4c581c051c3 100644
--- a/llvm/test/Transforms/NewGVN/metadata-nonnull.ll
+++ b/llvm/test/Transforms/NewGVN/metadata-nonnull.ll
@@ -150,7 +150,7 @@ define ptr @test7(ptr %v0) {
 ; CHECK-LABEL: define ptr @test7
 ; CHECK-SAME: (ptr [[V0:%.*]]) {
 ; CHECK-NEXT:  top:
-; CHECK-NEXT:    [[V1:%.*]] = load ptr, ptr [[V0]], align 8, !nonnull !0
+; CHECK-NEXT:    [[V1:%.*]] = load ptr, ptr [[V0]], align 8, !nonnull [[META0:![0-9]+]]
 ; CHECK-NEXT:    call void @use2(ptr [[V1]])
 ; CHECK-NEXT:    br i1 undef, label [[BB1:%.*]], label [[BB2:%.*]]
 ; CHECK:       bb1:
diff --git a/llvm/test/Transforms/NewGVN/metadata-simplify.ll b/llvm/test/Transforms/NewGVN/metadata-simplify.ll
index a84c581165c3ae..e981e3702d5601 100644
--- a/llvm/test/Transforms/NewGVN/metadata-simplify.ll
+++ b/llvm/test/Transforms/NewGVN/metadata-simplify.ll
@@ -9,11 +9,11 @@ define i1 @test1(ptr %arg, i1 %arg2) {
 ; CHECK-LABEL: @test1(
 ; CHECK-NEXT:    br i1 [[ARG2:%.*]], label [[BB1:%.*]], label [[BB2:%.*]]
 ; CHECK:       bb1:
-; CHECK-NEXT:    [[LOAD1:%.*]] = load ptr, ptr [[ARG:%.*]], !nonnull !0
+; CHECK-NEXT:    [[LOAD1:%.*]] = load ptr, ptr [[ARG:%.*]], align 8, !nonnull [[META0:![0-9]+]]
 ; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq ptr [[LOAD1]], null
 ; CHECK-NEXT:    ret i1 [[CMP1]]
 ; CHECK:       bb2:
-; CHECK-NEXT:    [[LOAD2:%.*]] = load ptr, ptr [[ARG]]
+; CHECK-NEXT:    [[LOAD2:%.*]] = load ptr, ptr [[ARG]], align 8
 ; CHECK-NEXT:    [[CMP2:%.*]] = icmp eq ptr [[LOAD2]], null
 ; CHECK-NEXT:    ret i1 [[CMP2]]
 ;
@@ -34,11 +34,11 @@ define i1 @test2(ptr %arg, i1 %arg2) {
 ; CHECK-LABEL: @test2(
 ; CHECK-NEXT:    br i1 [[ARG2:%.*]], label [[BB1:%.*]], label [[BB2:%.*]]
 ; CHECK:       bb1:
-; CHECK-NEXT:    [[LOAD1:%.*]] = load ptr, ptr [[ARG:%.*]]
+; CHECK-NEXT:    [[LOAD1:%.*]] = load ptr, ptr [[ARG:%.*]], align 8
 ; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq ptr [[LOAD1]], null
 ; CHECK-NEXT:    ret i1 [[CMP1]]
 ; CHECK:       bb2:
-; CHECK-NEXT:    [[LOAD2:%.*]] = load ptr, ptr [[ARG]], !nonnull !0
+; CHECK-NEXT:    [[LOAD2:%.*]] = load ptr, ptr [[ARG]], align 8, !nonnull [[META0]]
 ; CHECK-NEXT:    [[CMP2:%.*]] = icmp eq ptr [[LOAD2]], null
 ; CHECK-NEXT:    ret i1 [[CMP2]]
 ;
@@ -60,11 +60,11 @@ define i1 @test3(ptr %ptr, i1 %arg2) {
 ; CHECK-LABEL: @test3(
 ; CHECK-NEXT:    br i1 [[ARG2:%.*]], label [[BB1:%.*]], label [[BB2:%.*]]
 ; CHECK:       bb1:
-; CHECK-NEXT:    [[LOAD1:%.*]] = load i32, ptr [[PTR:%.*]], !range !1
+; CHECK-NEXT:    [[LOAD1:%.*]] = load i32, ptr [[PTR:%.*]], align 4, !range [[RNG1:![0-9]+]]
 ; CHECK-NEXT:    [[CMP1:%.*]] = icmp ne i32 [[LOAD1]], 999
 ; CHECK-NEXT:    ret i1 [[CMP1]]
 ; CHECK:       bb2:
-; CHECK-NEXT:    [[LOAD2:%.*]] = load i32, ptr [[PTR]]
+; CHECK-NEXT:    [[LOAD2:%.*]] = load i32, ptr [[PTR]], align 4
 ; CHECK-NEXT:    [[CMP2:%.*]] = icmp ne i32 [[LOAD2]], 999
 ; CHECK-NEXT:    ret i1 [[CMP2]]
 ;
@@ -85,11 +85,11 @@ define i1 @test4(ptr %ptr, i1 %arg2) {
 ; CHECK-LABEL: @test4(
 ; CHECK-NEXT:    br i1 [[ARG2:%.*]], label [[BB1:%.*]], label [[BB2:%.*]]
 ; CHECK:       bb1:
-; CHECK-NEXT:    [[LOAD1:%.*]] = load i32, ptr [[PTR:%.*]]
+; CHECK-NEXT:    [[LOAD1:%.*]] = load i32, ptr [[PTR:%.*]], align 4
 ; CHECK-NEXT:    [[CMP1:%.*]] = icmp ne i32 [[LOAD1]], 999
 ; CHECK-NEXT:    ret i1 [[CMP1]]
 ; CHECK:       bb2:
-; CHECK-NEXT:    [[LOAD2:%.*]] = load i32, ptr [[PTR]], !range !1
+; CHECK-NEXT:    [[LOAD2:%.*]] = load i32, ptr [[PTR]], align 4, !range [[RNG1]]
 ; CHECK-NEXT:    [[CMP2:%.*]] = icmp ne i32 [[LOAD2]], 999
 ; CHECK-NEXT:    ret i1 [[CMP2]]
 ;
@@ -110,11 +110,11 @@ define i1 @test5(ptr %ptr, i1 %arg2) {
 ; CHECK-LABEL: @test5(
 ; CHECK-NEXT:    br i1 [[ARG2:%.*]], label [[BB1:%.*]], label [[BB2:%.*]]
 ; CHECK:       bb1:
-; CHECK-NEXT:    [[LOAD1:%.*]] = load i32, ptr [[PTR:%.*]], !range !1
+; CHECK-NEXT:    [[LOAD1:%.*]] = load i32, ptr [[PTR:%.*]], align 4, !range [[RNG1]]
 ; CHECK-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[LOAD1]], 999
 ; CHECK-NEXT:    ret i1 [[CMP1]]
 ; CHECK:       bb2:
-; CHECK-NEXT:    [[LOAD2:%.*]] = load i32, ptr [[PTR]]
+; CHECK-NEXT:    [[LOAD2:%.*]] = load i32, ptr [[PTR]], align 4
 ; CHECK-NEXT:    [[CMP2:%.*]] = icmp slt i32 [[LOAD2]], 999
 ; CHECK-NEXT:    ret i1 [[CMP2]]
 ;
@@ -135,11 +135,11 @@ define i1 @test6(ptr %ptr, i1 %arg2) {
 ; CHECK-LABEL: @test6(
 ; CHECK-NEXT:    br i1 [[ARG2:%.*]], label [[BB1:%.*]], label [[BB2:%.*]]
 ; CHECK:       bb1:
-; CHECK-NEXT:    [[LOAD1:%.*]] = load i32, ptr [[PTR:%.*]]
+; CHECK-NEXT:    [[LOAD1:%.*]] = load i32, ptr [[PTR:%.*]], align 4
 ; CHECK-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[LOAD1]], 999
 ; CHECK-NEXT:    ret i1 [[CMP1]]
 ; CHECK:       bb2:
-; CHECK-NEXT:    [[LOAD2:%.*]] = load i32, ptr [[PTR]], !range !1
+; CHECK-NEXT:    [[LOAD2:%.*]] = load i32, ptr [[PTR]], align 4, !range [[RNG1]]
 ; CHECK-NEXT:    [[CMP2:%.*]] = icmp slt i32 [[LOAD2]], 999
 ; CHECK-NEXT:    ret i1 [[CMP2]]
 ;
diff --git a/llvm/test/Transforms/NewGVN/noalias.ll b/llvm/test/Transforms/NewGVN/noalias.ll
index 5f0c5dd8a3aeaf..2cb5c196a7abf7 100644
--- a/llvm/test/Transforms/NewGVN/noalias.ll
+++ b/llvm/test/Transforms/NewGVN/noalias.ll
@@ -1,10 +1,13 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt -passes=newgvn -S < %s | FileCheck %s
 
 define i32 @test1(ptr %p, ptr %q) {
-; CHECK-LABEL: @test1(ptr %p, ptr %q)
-; CHECK: load i32, ptr %p
-; CHECK-NOT: noalias
-; CHECK: %c = add i32 %a, %a
+; CHECK-LABEL: define i32 @test1(
+; CHECK-SAME: ptr [[P:%.*]], ptr [[Q:%.*]]) {
+; CHECK-NEXT:    [[A:%.*]] = load i32, ptr [[P]], align 4
+; CHECK-NEXT:    [[C:%.*]] = add i32 [[A]], [[A]]
+; CHECK-NEXT:    ret i32 [[C]]
+;
   %a = load i32, ptr %p, !noalias !3
   %b = load i32, ptr %p
   %c = add i32 %a, %b
@@ -12,9 +15,12 @@ define i32 @test1(ptr %p, ptr %q) {
 }
 
 define i32 @test2(ptr %p, ptr %q) {
-; CHECK-LABEL: @test2(ptr %p, ptr %q)
-; CHECK: load i32, ptr %p, align 4, !alias.scope ![[SCOPE1:[0-9]+]]
-; CHECK: %c = add i32 %a, %a
+; CHECK-LABEL: define i32 @test2(
+; CHECK-SAME: ptr [[P:%.*]], ptr [[Q:%.*]]) {
+; CHECK-NEXT:    [[A:%.*]] = load i32, ptr [[P]], align 4, !alias.scope [[META0:![0-9]+]]
+; CHECK-NEXT:    [[C:%.*]] = add i32 [[A]], [[A]]
+; CHECK-NEXT:    ret i32 [[C]]
+;
   %a = load i32, ptr %p, !alias.scope !3
   %b = load i32, ptr %p, !alias.scope !3
   %c = add i32 %a, %b
@@ -22,17 +28,18 @@ define i32 @test2(ptr %p, ptr %q) {
 }
 
 define i32 @test3(ptr %p, ptr %q) {
-; CHECK-LABEL: @test3(ptr %p, ptr %q)
-; CHECK: load i32, ptr %p, align 4, !alias.scope ![[SCOPE2:[0-9]+]]
-; CHECK: %c = add i32 %a, %a
+; CHECK-LABEL: define i32 @test3(
+; CHECK-SAME: ptr [[P:%.*]], ptr [[Q:%.*]]) {
+; CHECK-NEXT:    [[A:%.*]] = load i32, ptr [[P]], align 4, !alias.scope [[META3:![0-9]+]]
+; CHECK-NEXT:    [[C:%.*]] = add i32 [[A]], [[A]]
+; CHECK-NEXT:    ret i32 [[C]]
+;
   %a = load i32, ptr %p, !alias.scope !4
   %b = load i32, ptr %p, !alias.scope !5
   %c = add i32 %a, %b
   ret i32 %c
 }
 
-; CHECK:   ![[SCOPE1]] = !{!{{[0-9]+}}}
-; CHECK:   ![[SCOPE2]] = !{!{{[0-9]+}}, !{{[0-9]+}}}
 declare i32 @foo(ptr) readonly
 
 !0 = distinct !{!0, !2, !"callee0: %a"}
@@ -42,3 +49,10 @@ declare i32 @foo(ptr) readonly
 !3 = !{!0}
 !4 = !{!1}
 !5 = !{!0, !1}
+;.
+; CHECK: [[META0]] = !{[[META1:![0-9]+]]}
+; CHECK: [[META1]] = distinct !{[[META1]], [[META2:![0-9]+]], !"callee0: %a"}
+; CHECK: [[META2]] = distinct !{[[META2]], !"callee0"}
+; CHECK: [[META3]] = !{[[META4:![0-9]+]], [[META1]]}
+; CHECK: [[META4]] = distinct !{[[META4]], [[META2]], !"callee0: %b"}
+;.
diff --git a/llvm/test/Transforms/NewGVN/nomemlocation.ll b/llvm/test/Transforms/NewGVN/nomemlocation.ll
index 0f716b30801b25..332e6c6304c315 100644
--- a/llvm/test/Transforms/NewGVN/nomemlocation.ll
+++ b/llvm/test/Transforms/NewGVN/nomemlocation.ll
@@ -1,8 +1,22 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt < %s -S -p='newgvn' | FileCheck %s
 ; MemorySSA should be able to handle a clobber query with an empty MemoryLocation.
 
-; CHECK: @userread
 define ptr @userread(ptr %p) {
+; CHECK-LABEL: define ptr @userread(
+; CHECK-SAME: ptr [[P:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[POS:%.*]] = phi i64 [ 1, [[ENTRY:%.*]] ], [ [[DIFF:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[POS]]
+; CHECK-NEXT:    [[LD:%.*]] = load ptr, ptr [[GEP]], align 8
+; CHECK-NEXT:    [[READVAL:%.*]] = call i64 @fread(ptr noundef nonnull [[GEP]], i64 noundef 1, i64 noundef [[POS]], ptr noundef [[LD]])
+; CHECK-NEXT:    [[READVALISPOS:%.*]] = icmp eq i64 [[READVAL]], [[POS]]
+; CHECK-NEXT:    call void @llvm.assume(i1 [[READVALISPOS]])
+; CHECK-NEXT:    [[DIFF]] = sub i64 0, [[POS]]
+; CHECK-NEXT:    br label [[LOOP]]
+;
 entry:
   br label %loop
 
diff --git a/llvm/test/Transforms/NewGVN/non-integral-pointers.ll b/llvm/test/Transforms/NewGVN/non-integral-pointers.ll
index 6119577d3b2422..1e3da6e295d16b 100644
--- a/llvm/test/Transforms/NewGVN/non-integral-pointers.ll
+++ b/llvm/test/Transforms/NewGVN/non-integral-pointers.ll
@@ -1,37 +1,55 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt -passes=newgvn -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128-ni:4"
 target triple = "x86_64-unknown-linux-gnu"
 
 define void @f0(i1 %alwaysFalse, i64 %val, ptr %loc) {
-; CHECK-LABEL: @f0(
-; CHECK-NOT: inttoptr
-; CHECK-NOT: ptrtoint
- entry:
+; CHECK-LABEL: define void @f0(
+; CHECK-SAME: i1 [[ALWAYSFALSE:%.*]], i64 [[VAL:%.*]], ptr [[LOC:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    store i64 [[VAL]], ptr [[LOC]], align 8
+; CHECK-NEXT:    br i1 [[ALWAYSFALSE]], label [[NEVERTAKEN:%.*]], label [[ALWAYSTAKEN:%.*]]
+; CHECK:       neverTaken:
+; CHECK-NEXT:    [[PTR:%.*]] = load ptr addrspace(4), ptr [[LOC]], align 8
+; CHECK-NEXT:    store i8 5, ptr addrspace(4) [[PTR]], align 1
+; CHECK-NEXT:    ret void
+; CHECK:       alwaysTaken:
+; CHECK-NEXT:    ret void
+;
+  entry:
   store i64 %val, ptr %loc
   br i1 %alwaysFalse, label %neverTaken, label %alwaysTaken
 
- neverTaken:
+  neverTaken:
   %ptr = load ptr addrspace(4), ptr %loc
   store i8 5, ptr addrspace(4) %ptr
   ret void
 
- alwaysTaken:
+  alwaysTaken:
   ret void
 }
 
 define i64 @f1(i1 %alwaysFalse, ptr addrspace(4) %val, ptr %loc) {
-; CHECK-LABEL: @f1(
-; CHECK-NOT: inttoptr
-; CHECK-NOT: ptrtoint
- entry:
+; CHECK-LABEL: define i64 @f1(
+; CHECK-SAME: i1 [[ALWAYSFALSE:%.*]], ptr addrspace(4) [[VAL:%.*]], ptr [[LOC:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    store ptr addrspace(4) [[VAL]], ptr [[LOC]], align 8
+; CHECK-NEXT:    br i1 [[ALWAYSFALSE]], label [[NEVERTAKEN:%.*]], label [[ALWAYSTAKEN:%.*]]
+; CHECK:       neverTaken:
+; CHECK-NEXT:    [[INT:%.*]] = load i64, ptr [[LOC]], align 8
+; CHECK-NEXT:    ret i64 [[INT]]
+; CHECK:       alwaysTaken:
+; CHECK-NEXT:    ret i64 42
+;
+  entry:
   store ptr addrspace(4) %val, ptr %loc
   br i1 %alwaysFalse, label %neverTaken, label %alwaysTaken
 
- neverTaken:
+  neverTaken:
   %int = load i64, ptr %loc
   ret i64 %int
 
- alwaysTaken:
+  alwaysTaken:
   ret i64 42
 }
diff --git a/llvm/test/Transforms/NewGVN/null-aliases-nothing.ll b/llvm/test/Transforms/NewGVN/null-aliases-nothing.ll
index 666621119d4cb4..25295fac7400d9 100644
--- a/llvm/test/Transforms/NewGVN/null-aliases-nothing.ll
+++ b/llvm/test/Transforms/NewGVN/null-aliases-nothing.ll
@@ -1,19 +1,25 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt < %s -passes=newgvn -S | FileCheck %s
 
 %t = type { i32 }
 declare void @test1f(ptr)
 
 define void @test1(ptr noalias %stuff ) {
-    %before = load i32, ptr %stuff
+; CHECK-LABEL: define void @test1(
+; CHECK-SAME: ptr noalias [[STUFF:%.*]]) {
+; CHECK-NEXT:    [[BEFORE:%.*]] = load i32, ptr [[STUFF]], align 4
+; CHECK-NEXT:    call void @test1f(ptr null)
+; CHECK-NEXT:    [[SUM:%.*]] = add i32 [[BEFORE]], [[BEFORE]]
+; CHECK-NEXT:    store i32 [[SUM]], ptr [[STUFF]], align 4
+; CHECK-NEXT:    ret void
+;
+  %before = load i32, ptr %stuff
 
-    call void @test1f(ptr null)
+  call void @test1f(ptr null)
 
-    %after = load i32, ptr %stuff ; <--- This should be a dead load
-    %sum = add i32 %before, %after
+  %after = load i32, ptr %stuff ; <--- This should be a dead load
+  %sum = add i32 %before, %after
 
-    store i32 %sum, ptr %stuff
-    ret void
-; CHECK: load
-; CHECK-NOT: load
-; CHECK: ret void
+  store i32 %sum, ptr %stuff
+  ret void
 }
diff --git a/llvm/test/Transforms/NewGVN/phi-edge-handling.ll b/llvm/test/Transforms/NewGVN/phi-edge-handling.ll
index 8dfac7942210ce..0e871b6b261a43 100644
--- a/llvm/test/Transforms/NewGVN/phi-edge-handling.ll
+++ b/llvm/test/Transforms/NewGVN/phi-edge-handling.ll
@@ -9,8 +9,8 @@ define i16 @hoge() {
 ; CHECK-LABEL: @hoge(
 ; CHECK-NEXT:  bb:
 ; CHECK-NEXT:    switch i8 undef, label [[BB7:%.*]] [
-; CHECK-NEXT:    i8 0, label [[BB1:%.*]]
-; CHECK-NEXT:    i8 12, label [[BB2:%.*]]
+; CHECK-NEXT:      i8 0, label [[BB1:%.*]]
+; CHECK-NEXT:      i8 12, label [[BB2:%.*]]
 ; CHECK-NEXT:    ]
 ; CHECK:       bb1:
 ; CHECK-NEXT:    br label [[BB6:%.*]]
diff --git a/llvm/test/Transforms/NewGVN/phi-of-ops-simplified-to-existing-value-then-changes-again.ll b/llvm/test/Transforms/NewGVN/phi-of-ops-simplified-to-existing-value-then-changes-again.ll
index ac07148c75521f..573a4c0bd0bcb1 100644
--- a/llvm/test/Transforms/NewGVN/phi-of-ops-simplified-to-existing-value-then-changes-again.ll
+++ b/llvm/test/Transforms/NewGVN/phi-of-ops-simplified-to-existing-value-then-changes-again.ll
@@ -88,8 +88,8 @@ define void @pr42422(i1 %c.1, i1 %c.2) {
 ; CHECK:       bb16:
 ; CHECK-NEXT:    [[TMP17:%.*]] = phi i32 [ poison, [[BB15]] ], [ 1, [[BB14]] ], [ 9, [[BB7]] ]
 ; CHECK-NEXT:    switch i32 [[TMP17]], label [[BB19]] [
-; CHECK-NEXT:    i32 0, label [[BB6]]
-; CHECK-NEXT:    i32 9, label [[BB18:%.*]]
+; CHECK-NEXT:      i32 0, label [[BB6]]
+; CHECK-NEXT:      i32 9, label [[BB18:%.*]]
 ; CHECK-NEXT:    ]
 ; CHECK:       bb18:
 ; CHECK-NEXT:    br label [[BB19]]
diff --git a/llvm/test/Transforms/NewGVN/phi-translate-partial-alias.ll b/llvm/test/Transforms/NewGVN/phi-translate-partial-alias.ll
index cb24b050a599e3..7dd41904970c07 100644
--- a/llvm/test/Transforms/NewGVN/phi-translate-partial-alias.ll
+++ b/llvm/test/Transforms/NewGVN/phi-translate-partial-alias.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt -passes=newgvn -S < %s | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-f128:128:128-n8:16:32:64"
@@ -6,12 +7,19 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
 ; not actually redundant around the loop backedge, despite appearances
 ; if phi-translation is ignored.
 
-; CHECK: define void @test0(ptr %begin)
-; CHECK: loop:
-; CHECK:   %l0 = load i8, ptr %phi
-; CHECK:   call void @bar(i8 %l0)
-; CHECK:   %l1 = load i8, ptr %phi
 define void @test0(ptr %begin) {
+; CHECK-LABEL: define void @test0(
+; CHECK-SAME: ptr [[BEGIN:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[PHI:%.*]] = phi ptr [ [[BEGIN]], [[ENTRY:%.*]] ], [ [[NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[L0:%.*]] = load i8, ptr [[PHI]], align 1
+; CHECK-NEXT:    call void @bar(i8 [[L0]])
+; CHECK-NEXT:    [[L1:%.*]] = load i8, ptr [[PHI]], align 1
+; CHECK-NEXT:    [[NEXT]] = getelementptr inbounds i8, ptr [[PHI]], i8 [[L1]]
+; CHECK-NEXT:    br label [[LOOP]]
+;
 entry:
   br label %loop
 
diff --git a/llvm/test/Transforms/NewGVN/pr17732.ll b/llvm/test/Transforms/NewGVN/pr17732.ll
index 427543d3aae7a7..e66547cb9e1933 100644
--- a/llvm/test/Transforms/NewGVN/pr17732.ll
+++ b/llvm/test/Transforms/NewGVN/pr17732.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt -passes=newgvn -S -o - < %s | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
@@ -12,6 +13,12 @@ target triple = "x86_64-unknown-linux-gnu"
 @vector_with_zeroinit = common global %struct.with_vector zeroinitializer, align 4
 
 define i32 @main() {
+; CHECK-LABEL: define i32 @main() {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    tail call void @llvm.memcpy.p0.p0.i64(ptr align 4 @array_with_zeroinit, ptr align 4 @main.obj_with_array, i64 12, i1 false)
+; CHECK-NEXT:    tail call void @llvm.memcpy.p0.p0.i64(ptr align 4 @vector_with_zeroinit, ptr align 4 @main.obj_with_vector, i64 12, i1 false)
+; CHECK-NEXT:    ret i32 1
+;
 entry:
   tail call void @llvm.memcpy.p0.p0.i64(ptr align 4 @array_with_zeroinit, ptr align 4 @main.obj_with_array, i64 12, i1 false)
   %0 = load i8, ptr getelementptr inbounds (%struct.with_array, ptr @array_with_zeroinit, i64 0, i32 2), align 4
@@ -22,8 +29,6 @@ entry:
   %conv1 = sext i8 %1 to i32
   %and = and i32 %conv0, %conv1
   ret i32 %and
-; CHECK-LABEL: define i32 @main(
-; CHECK: ret i32 1
 }
 
 declare void @llvm.memcpy.p0.p0.i64(ptr nocapture, ptr nocapture readonly, i64, i1)
diff --git a/llvm/test/Transforms/NewGVN/pr17852.ll b/llvm/test/Transforms/NewGVN/pr17852.ll
index 5858982ce1cd85..bffde124164448 100644
--- a/llvm/test/Transforms/NewGVN/pr17852.ll
+++ b/llvm/test/Transforms/NewGVN/pr17852.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt < %s -passes=newgvn
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 %struct.S0 = type { [2 x i8], [2 x i8], [4 x i8], [2 x i8], i32, i32, i32, i32 }
diff --git a/llvm/test/Transforms/NewGVN/pr24397.ll b/llvm/test/Transforms/NewGVN/pr24397.ll
index f3f112ae1fe319..d9981443522e7f 100644
--- a/llvm/test/Transforms/NewGVN/pr24397.ll
+++ b/llvm/test/Transforms/NewGVN/pr24397.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt -passes=newgvn -disable-output < %s
 
 target triple = "x86_64-unknown-linux-gnu"
diff --git a/llvm/test/Transforms/NewGVN/pr24426.ll b/llvm/test/Transforms/NewGVN/pr24426.ll
index e8a88f5b267ab6..8f0974aa0bd356 100644
--- a/llvm/test/Transforms/NewGVN/pr24426.ll
+++ b/llvm/test/Transforms/NewGVN/pr24426.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt < %s -passes=memcpyopt,mldst-motion,newgvn -S | FileCheck %s
 
 declare void @check(i8)
@@ -5,11 +6,17 @@ declare void @check(i8)
 declare void @write(ptr %res)
 
 define void @test1() {
+; CHECK-LABEL: define void @test1() {
+; CHECK-NEXT:    [[TMP1:%.*]] = alloca [10 x i8], align 1
+; CHECK-NEXT:    call void @write(ptr [[TMP1]])
+; CHECK-NEXT:    [[TMP2:%.*]] = load i8, ptr [[TMP1]], align 1
+; CHECK-NEXT:    call void @check(i8 [[TMP2]])
+; CHECK-NEXT:    ret void
+;
   %1 = alloca [10 x i8]
   call void @write(ptr %1)
   %2 = load i8, ptr %1
 
-; CHECK-NOT: undef
   call void @check(i8 %2)
 
   ret void
diff --git a/llvm/test/Transforms/NewGVN/pr25440.ll b/llvm/test/Transforms/NewGVN/pr25440.ll
index b3ebf447afc2e5..9d9c4cda342581 100644
--- a/llvm/test/Transforms/NewGVN/pr25440.ll
+++ b/llvm/test/Transforms/NewGVN/pr25440.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ;RUN: opt -passes=newgvn -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n8:16:32-S64"
@@ -10,19 +11,51 @@ target triple = "thumbv7--linux-gnueabi"
 
 ; Function Attrs: nounwind
 define fastcc void @foo(ptr nocapture readonly %x) {
-;CHECK-LABEL: foo
+; CHECK-LABEL: define fastcc void @foo(
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[BB0:%.*]]
+; CHECK:       bb0:
+; CHECK-NEXT:    [[X_TR:%.*]] = phi ptr [ [[X]], [[ENTRY:%.*]] ], [ null, [[LAND_LHS_TRUE:%.*]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[X_TR]], align 4
+; CHECK-NEXT:    [[CONV:%.*]] = zext i16 [[TMP0]] to i32
+; CHECK-NEXT:    switch i32 [[CONV]], label [[IF_END_50:%.*]] [
+; CHECK-NEXT:      i32 43, label [[CLEANUP:%.*]]
+; CHECK-NEXT:      i32 52, label [[IF_THEN_5:%.*]]
+; CHECK-NEXT:    ]
+; CHECK:       if.then.5:
+; CHECK-NEXT:    br i1 undef, label [[LAND_LHS_TRUE]], label [[IF_THEN_26:%.*]]
+; CHECK:       land.lhs.true:
+; CHECK-NEXT:    br i1 undef, label [[CLEANUP]], label [[BB0]]
+; CHECK:       if.then.26:
+; CHECK-NEXT:    br i1 undef, label [[COND_END:%.*]], label [[COND_FALSE:%.*]]
+; CHECK:       cond.false:
+; CHECK-NEXT:    [[MODE:%.*]] = getelementptr inbounds [[STRUCT_A:%.*]], ptr [[X_TR]], i32 0, i32 1
+; CHECK-NEXT:    [[BF_LOAD:%.*]] = load i16, ptr [[MODE]], align 2
+; CHECK-NEXT:    br label [[COND_END]]
+; CHECK:       cond.end:
+; CHECK-NEXT:    br i1 undef, label [[IF_THEN_44:%.*]], label [[CLEANUP]]
+; CHECK:       if.then.44:
+; CHECK-NEXT:    unreachable
+; CHECK:       if.end.50:
+; CHECK-NEXT:    [[ARRAYIDX52:%.*]] = getelementptr inbounds [0 x i32], ptr @length, i32 0, i32 [[CONV]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX52]], align 4
+; CHECK-NEXT:    br i1 undef, label [[FOR_BODY_57:%.*]], label [[CLEANUP]]
+; CHECK:       for.body.57:
+; CHECK-NEXT:    unreachable
+; CHECK:       cleanup:
+; CHECK-NEXT:    ret void
+;
 entry:
   br label %bb0
 
 bb0:                                      ; preds = %land.lhs.true, %entry
-;CHECK: bb0:
   %x.tr = phi ptr [ %x, %entry ], [ null, %land.lhs.true ]
   %0 = load i16, ptr %x.tr, align 4
-; CHECK: load i16, ptr
   %conv = zext i16 %0 to i32
   switch i32 %conv, label %if.end.50 [
-    i32 43, label %cleanup
-    i32 52, label %if.then.5
+  i32 43, label %cleanup
+  i32 52, label %if.then.5
   ]
 
 if.then.5:                                        ; preds = %bb0
@@ -36,8 +69,6 @@ if.then.26:                                       ; preds = %if.then.5
   br i1 undef, label %cond.end, label %cond.false
 
 cond.false:                                       ; preds = %if.then.26
-; CHECK: cond.false:
-; CHECK: load i16
   %mode = getelementptr inbounds %struct.a, ptr %x.tr.lcssa163, i32 0, i32 1
   %bf.load = load i16, ptr %mode, align 2
   %bf.shl = shl i16 %bf.load, 8
@@ -50,7 +81,6 @@ if.then.44:                                       ; preds = %cond.end
   unreachable
 
 if.end.50:                                        ; preds = %bb0
-;%CHECK: if.end.50:
   %conv.lcssa = phi i32 [ %conv, %bb0 ]
   %arrayidx52 = getelementptr inbounds [0 x i32], ptr @length, i32 0, i32 %conv.lcssa
   %1 = load i32, ptr %arrayidx52, align 4
@@ -68,7 +98,34 @@ cleanup:                                          ; preds = %if.end.50, %cond.en
 @dfg_text = external global ptr, align 4
 
 define void @dfg_lex() {
-;CHECK-LABEL: dfg_lex
+; CHECK-LABEL: define void @dfg_lex() {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[WHILE_BODYTHREAD_PRE_SPLIT:%.*]]
+; CHECK:       while.bodythread-pre-split:
+; CHECK-NEXT:    br i1 undef, label [[IF_THEN_14:%.*]], label [[IF_END_15:%.*]]
+; CHECK:       if.then.14:
+; CHECK-NEXT:    [[V1:%.*]] = load i32, ptr @dfg_text, align 4
+; CHECK-NEXT:    br label [[IF_END_15]]
+; CHECK:       if.end.15:
+; CHECK-NEXT:    [[V2:%.*]] = load ptr, ptr @yy_c_buf_p, align 4
+; CHECK-NEXT:    br label [[WHILE_COND_16:%.*]]
+; CHECK:       while.cond.16:
+; CHECK-NEXT:    br i1 undef, label [[WHILE_COND_16]], label [[WHILE_END:%.*]]
+; CHECK:       while.end:
+; CHECK-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds i8, ptr [[V2]], i32 undef
+; CHECK-NEXT:    store ptr [[ADD_PTR]], ptr @dfg_text, align 4
+; CHECK-NEXT:    [[SUB_PTR_RHS_CAST25:%.*]] = ptrtoint ptr [[ADD_PTR]] to i32
+; CHECK-NEXT:    switch i32 undef, label [[SW_DEFAULT:%.*]] [
+; CHECK-NEXT:      i32 65, label [[WHILE_BODYTHREAD_PRE_SPLIT]]
+; CHECK-NEXT:      i32 3, label [[RETURN:%.*]]
+; CHECK-NEXT:      i32 57, label [[WHILE_BODYTHREAD_PRE_SPLIT]]
+; CHECK-NEXT:      i32 60, label [[IF_THEN_14]]
+; CHECK-NEXT:    ]
+; CHECK:       sw.default:
+; CHECK-NEXT:    unreachable
+; CHECK:       return:
+; CHECK-NEXT:    ret void
+;
 entry:
   br label %while.bodythread-pre-split
 
@@ -93,10 +150,10 @@ while.end:                                        ; preds = %while.cond.16
   %sub.ptr.rhs.cast25 = ptrtoint ptr %add.ptr to i32
   %sub.ptr.sub26 = sub i32 0, %sub.ptr.rhs.cast25
   switch i32 undef, label %sw.default [
-    i32 65, label %while.bodythread-pre-split
-    i32 3, label %return
-    i32 57, label %while.bodythread-pre-split
-    i32 60, label %if.then.14
+  i32 65, label %while.bodythread-pre-split
+  i32 3, label %return
+  i32 57, label %while.bodythread-pre-split
+  i32 60, label %if.then.14
   ]
 
 sw.default:                                       ; preds = %while.end
diff --git a/llvm/test/Transforms/NewGVN/pr28562.ll b/llvm/test/Transforms/NewGVN/pr28562.ll
index a62fdd31da7210..320224c24b5bc0 100644
--- a/llvm/test/Transforms/NewGVN/pr28562.ll
+++ b/llvm/test/Transforms/NewGVN/pr28562.ll
@@ -1,9 +1,12 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt -S -passes=newgvn < %s | FileCheck %s
 define ptr @test1(ptr %a) {
+; CHECK-LABEL: define ptr @test1(
+; CHECK-SAME: ptr [[A:%.*]]) {
+; CHECK-NEXT:    [[X2:%.*]] = getelementptr i32, ptr [[A]], i32 10
+; CHECK-NEXT:    ret ptr [[X2]]
+;
   %x1 = getelementptr inbounds i32, ptr %a, i32 10
   %x2 = getelementptr i32, ptr %a, i32 10
   ret ptr %x2
-; CHECK-LABEL: @test1(
-; CHECK: %[[x:.*]] = getelementptr i32, ptr %a, i32 10
-; CHECK: ret ptr %[[x]]
 }
diff --git a/llvm/test/Transforms/NewGVN/pr31472.ll b/llvm/test/Transforms/NewGVN/pr31472.ll
index 8bb9a14e813653..8eeb61400f8141 100644
--- a/llvm/test/Transforms/NewGVN/pr31472.ll
+++ b/llvm/test/Transforms/NewGVN/pr31472.ll
@@ -9,11 +9,13 @@ target triple = "x86_64-apple-macosx10.12.0"
 define i32 @main() personality ptr @__gxx_personality_v0{
 ; CHECK-LABEL: @main(
 ; CHECK-NEXT:    [[TMP1:%.*]] = invoke i32 @foo()
-; CHECK-NEXT:    to label %good unwind label %bad
+; CHECK-NEXT:            to label [[GOOD:%.*]] unwind label [[BAD:%.*]]
 ; CHECK:       good:
 ; CHECK-NEXT:    ret i32 5
 ; CHECK:       bad:
-; CHECK-NEXT:    [[TMP2:%.*]] = landingpad { ptr, i32
+; CHECK-NEXT:    [[TMP2:%.*]] = landingpad { ptr, i32 }
+; CHECK-NEXT:            cleanup
+; CHECK-NEXT:    resume { ptr, i32 } [[TMP2]]
 ;
   %1 = invoke i32 @foo()
   to label %good unwind label %bad
diff --git a/llvm/test/Transforms/NewGVN/pr31483.ll b/llvm/test/Transforms/NewGVN/pr31483.ll
index fe957dec72cf12..0e7461c2612b9f 100644
--- a/llvm/test/Transforms/NewGVN/pr31483.ll
+++ b/llvm/test/Transforms/NewGVN/pr31483.ll
@@ -10,20 +10,20 @@ define signext i32 @ham(ptr %arg, ptr %arg1) #0 {
 ; CHECK-LABEL: @ham(
 ; CHECK-NEXT:  bb:
 ; CHECK-NEXT:    [[TMP:%.*]] = alloca ptr, align 8
-; CHECK-NEXT:    store ptr %arg1, ptr [[TMP]], align 8
-; CHECK-NEXT:    br label %bb2
+; CHECK-NEXT:    store ptr [[ARG1:%.*]], ptr [[TMP]], align 8
+; CHECK-NEXT:    br label [[BB2:%.*]]
 ; CHECK:       bb2:
-; CHECK-NEXT:    [[TMP3:%.*]] = phi ptr [ %arg, %bb ], [ %tmp7, %bb22 ]
+; CHECK-NEXT:    [[TMP3:%.*]] = phi ptr [ [[ARG:%.*]], [[BB:%.*]] ], [ [[TMP7:%.*]], [[BB22:%.*]] ]
 ; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr [[TMP3]], align 1
 ; CHECK-NEXT:    [[TMP5:%.*]] = icmp ne i8 [[TMP4]], 0
-; CHECK-NEXT:    br i1 [[TMP5]], label %bb6, label %bb23
+; CHECK-NEXT:    br i1 [[TMP5]], label [[BB6:%.*]], label [[BB23:%.*]]
 ; CHECK:       bb6:
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 1
+; CHECK-NEXT:    [[TMP7]] = getelementptr inbounds i8, ptr [[TMP3]], i32 1
 ; CHECK-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP4]] to i32
-; CHECK-NEXT:    switch i32 [[TMP9]], label %bb22 [
-; CHECK-NEXT:    i32 115, label %bb10
-; CHECK-NEXT:    i32 105, label %bb16
-; CHECK-NEXT:    i32 99, label %bb16
+; CHECK-NEXT:    switch i32 [[TMP9]], label [[BB22]] [
+; CHECK-NEXT:      i32 115, label [[BB10:%.*]]
+; CHECK-NEXT:      i32 105, label [[BB16:%.*]]
+; CHECK-NEXT:      i32 99, label [[BB16]]
 ; CHECK-NEXT:    ]
 ; CHECK:       bb10:
 ; CHECK-NEXT:    [[TMP11:%.*]] = load ptr, ptr [[TMP]], align 8
@@ -31,15 +31,15 @@ define signext i32 @ham(ptr %arg, ptr %arg1) #0 {
 ; CHECK-NEXT:    store ptr [[TMP12]], ptr [[TMP]], align 8
 ; CHECK-NEXT:    [[TMP14:%.*]] = load ptr, ptr [[TMP11]], align 8
 ; CHECK-NEXT:    [[TMP15:%.*]] = call signext i32 (ptr, ...) @zot(ptr @global, ptr [[TMP14]])
-; CHECK-NEXT:    br label %bb22
+; CHECK-NEXT:    br label [[BB22]]
 ; CHECK:       bb16:
 ; CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[TMP]], align 8
 ; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[TMP17]], i64 8
 ; CHECK-NEXT:    store ptr [[TMP18]], ptr [[TMP]], align 8
 ; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP17]], i64 4
-; CHECK-NEXT:    br label %bb22
+; CHECK-NEXT:    br label [[BB22]]
 ; CHECK:       bb22:
-; CHECK-NEXT:    br label %bb2
+; CHECK-NEXT:    br label [[BB2]]
 ; CHECK:       bb23:
 ; CHECK-NEXT:    call void @llvm.va_end(ptr [[TMP]])
 ; CHECK-NEXT:    ret i32 undef
diff --git a/llvm/test/Transforms/NewGVN/pr31491.ll b/llvm/test/Transforms/NewGVN/pr31491.ll
index 5f6b371333bbf0..f27f13ed24e08a 100644
--- a/llvm/test/Transforms/NewGVN/pr31491.ll
+++ b/llvm/test/Transforms/NewGVN/pr31491.ll
@@ -7,13 +7,13 @@ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
 define internal i32 @pr31491() {
 ; CHECK-LABEL: @pr31491(
 ; CHECK-NEXT:  bb5:
-; CHECK-NEXT:    br label %bb7
+; CHECK-NEXT:    br label [[BB7:%.*]]
 ; CHECK:       bb7:
-; CHECK-NEXT:    [[TMP:%.*]] = phi ptr [ [[TMP:%.*]]11, %bb10 ], [ undef, %bb5 ]
-; CHECK-NEXT:    br label %bb10
+; CHECK-NEXT:    [[TMP:%.*]] = phi ptr [ [[TMP11:%.*]], [[BB10:%.*]] ], [ undef, [[BB5:%.*]] ]
+; CHECK-NEXT:    br label [[BB10]]
 ; CHECK:       bb10:
-; CHECK-NEXT:    [[TMP11:%.*]] = tail call ptr @patatino(ptr [[TMP]])
-; CHECK-NEXT:    br label %bb7
+; CHECK-NEXT:    [[TMP11]] = tail call ptr @patatino(ptr [[TMP]])
+; CHECK-NEXT:    br label [[BB7]]
 ;
 bb5:
   br label %bb7
diff --git a/llvm/test/Transforms/NewGVN/pr31501.ll b/llvm/test/Transforms/NewGVN/pr31501.ll
index 55195fd6ada764..18bfcd1b9ca09c 100644
--- a/llvm/test/Transforms/NewGVN/pr31501.ll
+++ b/llvm/test/Transforms/NewGVN/pr31501.ll
@@ -52,30 +52,30 @@ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
 define weak_odr hidden ptr @quux(ptr %arg, ptr %arg1) local_unnamed_addr #0 align 2 {
 ; CHECK-LABEL: @quux(
 ; CHECK-NEXT:  bb:
-; CHECK-NEXT:    [[TMP:%.*]] = getelementptr inbounds %struct.barney, ptr %arg, i64 0, i32 3, i32 0, i32 0, i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[TMP]], align 8, !tbaa !2
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds %struct.barney, ptr %arg, i64 0, i32 3, i32 0, i32 0, i32 0, i32 0, i32 1
-; CHECK-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[TMP4]], align 8, !tbaa !7
+; CHECK-NEXT:    [[TMP:%.*]] = getelementptr inbounds [[STRUCT_BARNEY:%.*]], ptr [[ARG:%.*]], i64 0, i32 3, i32 0, i32 0, i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[TMP]], align 8, !tbaa [[TBAA2:![0-9]+]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_BARNEY]], ptr [[ARG]], i64 0, i32 3, i32 0, i32 0, i32 0, i32 0, i32 1
+; CHECK-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[TMP4]], align 8, !tbaa [[TBAA7:![0-9]+]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq ptr [[TMP3]], [[TMP6]]
-; CHECK-NEXT:    br i1 [[TMP7]], label %bb21, label %bb8
+; CHECK-NEXT:    br i1 [[TMP7]], label [[BB21:%.*]], label [[BB8:%.*]]
 ; CHECK:       bb8:
-; CHECK-NEXT:    br label %bb11
+; CHECK-NEXT:    br label [[BB11:%.*]]
 ; CHECK:       bb9:
 ; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq ptr [[TMP18:%.*]], [[TMP6]]
-; CHECK-NEXT:    br i1 [[TMP10]], label %bb19, label %bb11
+; CHECK-NEXT:    br i1 [[TMP10]], label [[BB19:%.*]], label [[BB11]]
 ; CHECK:       bb11:
-; CHECK-NEXT:    [[TMP12:%.*]] = phi ptr [ [[TMP17:%.*]], %bb9 ], [ undef, %bb8 ]
-; CHECK-NEXT:    [[TMP13:%.*]] = phi ptr [ [[TMP18]], %bb9 ], [ [[TMP3]], %bb8 ]
-; CHECK-NEXT:    [[TMP15:%.*]] = load ptr, ptr [[TMP13]], align 8, !tbaa !8
-; CHECK-NEXT:    [[TMP16:%.*]] = icmp eq ptr [[TMP15]], %arg1
+; CHECK-NEXT:    [[TMP12:%.*]] = phi ptr [ [[TMP17:%.*]], [[BB9:%.*]] ], [ undef, [[BB8]] ]
+; CHECK-NEXT:    [[TMP13:%.*]] = phi ptr [ [[TMP18]], [[BB9]] ], [ [[TMP3]], [[BB8]] ]
+; CHECK-NEXT:    [[TMP15:%.*]] = load ptr, ptr [[TMP13]], align 8, !tbaa [[TBAA8:![0-9]+]]
+; CHECK-NEXT:    [[TMP16:%.*]] = icmp eq ptr [[TMP15]], [[ARG1:%.*]]
 ; CHECK-NEXT:    [[TMP17]] = select i1 [[TMP16]], ptr [[TMP13]], ptr [[TMP12]]
-; CHECK-NEXT:    [[TMP18]] = getelementptr inbounds %struct.foo, ptr [[TMP13]], i64 1
-; CHECK-NEXT:    br i1 [[TMP16]], label %bb19, label %bb9
+; CHECK-NEXT:    [[TMP18]] = getelementptr inbounds [[STRUCT_FOO:%.*]], ptr [[TMP13]], i64 1
+; CHECK-NEXT:    br i1 [[TMP16]], label [[BB19]], label [[BB9]]
 ; CHECK:       bb19:
-; CHECK-NEXT:    [[TMP20:%.*]] = phi ptr [ null, %bb9 ], [ [[TMP17]], %bb11 ]
-; CHECK-NEXT:    br label %bb21
+; CHECK-NEXT:    [[TMP20:%.*]] = phi ptr [ null, [[BB9]] ], [ [[TMP17]], [[BB11]] ]
+; CHECK-NEXT:    br label [[BB21]]
 ; CHECK:       bb21:
-; CHECK-NEXT:    [[TMP22:%.*]] = phi ptr [ null, %bb ], [ [[TMP20]], %bb19 ]
+; CHECK-NEXT:    [[TMP22:%.*]] = phi ptr [ null, [[BB:%.*]] ], [ [[TMP20]], [[BB19]] ]
 ; CHECK-NEXT:    ret ptr [[TMP22]]
 ;
 bb:
diff --git a/llvm/test/Transforms/NewGVN/pr31573.ll b/llvm/test/Transforms/NewGVN/pr31573.ll
index 2382c487c19af0..7835e9d45a6713 100644
--- a/llvm/test/Transforms/NewGVN/pr31573.ll
+++ b/llvm/test/Transforms/NewGVN/pr31573.ll
@@ -10,7 +10,7 @@ define void @patatino(ptr %blah) {
 ; CHECK:       while.cond:
 ; CHECK-NEXT:    [[MEH:%.*]] = phi ptr [ [[BLAH:%.*]], [[ENTRY:%.*]] ], [ null, [[WHILE_BODY:%.*]] ]
 ; CHECK-NEXT:    switch i32 undef, label [[WHILE_BODY]] [
-; CHECK-NEXT:    i32 666, label [[WHILE_END:%.*]]
+; CHECK-NEXT:      i32 666, label [[WHILE_END:%.*]]
 ; CHECK-NEXT:    ]
 ; CHECK:       while.body:
 ; CHECK-NEXT:    br label [[WHILE_COND]]
diff --git a/llvm/test/Transforms/NewGVN/pr31594.ll b/llvm/test/Transforms/NewGVN/pr31594.ll
index 47294d5640e468..d1a02d62934dee 100644
--- a/llvm/test/Transforms/NewGVN/pr31594.ll
+++ b/llvm/test/Transforms/NewGVN/pr31594.ll
@@ -10,8 +10,8 @@ define i1 @patatino(ptr %blah, i32 %choice) {
 ; CHECK:       while.cond:
 ; CHECK-NEXT:    [[FOO:%.*]] = phi ptr [ [[BLAH:%.*]], [[ENTRY:%.*]] ], [ null, [[WHILE_BODY:%.*]] ]
 ; CHECK-NEXT:    switch i32 [[CHOICE:%.*]], label [[WHILE_BODY]] [
-; CHECK-NEXT:    i32 -1, label [[WHILE_END:%.*]]
-; CHECK-NEXT:    i32 40, label [[LAND_END:%.*]]
+; CHECK-NEXT:      i32 -1, label [[WHILE_END:%.*]]
+; CHECK-NEXT:      i32 40, label [[LAND_END:%.*]]
 ; CHECK-NEXT:    ]
 ; CHECK:       land.end:
 ; CHECK-NEXT:    br label [[WHILE_END]]
@@ -66,7 +66,7 @@ define void @foo(ptr %arg) {
 ; CHECK:       bb2:
 ; CHECK-NEXT:    br label [[BB1]]
 ; CHECK:       bb3:
-; CHECK-NEXT:    store i8 0, ptr [[TMP]], align 1, !g !0
+; CHECK-NEXT:    store i8 0, ptr [[TMP]], align 1, !g [[META0:![0-9]+]]
 ; CHECK-NEXT:    br label [[BB4:%.*]]
 ; CHECK:       bb4:
 ; CHECK-NEXT:    br label [[BB6:%.*]]
@@ -74,13 +74,13 @@ define void @foo(ptr %arg) {
 ; CHECK-NEXT:    br i1 undef, label [[BB9:%.*]], label [[BB7:%.*]]
 ; CHECK:       bb7:
 ; CHECK-NEXT:    switch i8 0, label [[BB6]] [
-; CHECK-NEXT:    i8 6, label [[BB8:%.*]]
+; CHECK-NEXT:      i8 6, label [[BB8:%.*]]
 ; CHECK-NEXT:    ]
 ; CHECK:       bb8:
 ; CHECK-NEXT:    store i8 poison, ptr null, align 1
 ; CHECK-NEXT:    br label [[BB4]]
 ; CHECK:       bb9:
-; CHECK-NEXT:    store i8 0, ptr [[ARG]], align 1, !g !0
+; CHECK-NEXT:    store i8 0, ptr [[ARG]], align 1, !g [[META0]]
 ; CHECK-NEXT:    unreachable
 ;
 bb:
diff --git a/llvm/test/Transforms/NewGVN/pr31613.ll b/llvm/test/Transforms/NewGVN/pr31613.ll
index 943cdbc113dc40..0bcf86a5711189 100644
--- a/llvm/test/Transforms/NewGVN/pr31613.ll
+++ b/llvm/test/Transforms/NewGVN/pr31613.ll
@@ -74,7 +74,7 @@ declare void @c.d.p(i64, ptr)
 define void @e(i32 %a0, i32 %a1, ptr %p2) {
 ; CHECK-LABEL: @e(
 ; CHECK-NEXT:    [[F:%.*]] = alloca i32, align 4
-; CHECK-NEXT:    store i32 [[A0:%.*]], ptr [[F]], align 4, !g !0
+; CHECK-NEXT:    store i32 [[A0:%.*]], ptr [[F]], align 4, !g [[META0:![0-9]+]]
 ; CHECK-NEXT:    br label [[H:%.*]]
 ; CHECK:       h:
 ; CHECK-NEXT:    call void @c.d.p(i64 8, ptr undef)
@@ -88,10 +88,10 @@ define void @e(i32 %a0, i32 %a1, ptr %p2) {
 ; CHECK-NEXT:    br label [[R]]
 ; CHECK:       r:
 ; CHECK-NEXT:    switch i32 undef, label [[N:%.*]] [
-; CHECK-NEXT:    i32 0, label [[S:%.*]]
+; CHECK-NEXT:      i32 0, label [[S:%.*]]
 ; CHECK-NEXT:    ]
 ; CHECK:       s:
-; CHECK-NEXT:    store i32 [[A1:%.*]], ptr [[F]], align 4, !g !0
+; CHECK-NEXT:    store i32 [[A1:%.*]], ptr [[F]], align 4, !g [[META0]]
 ; CHECK-NEXT:    br label [[H]]
 ; CHECK:       n:
 ; CHECK-NEXT:    ret void
diff --git a/llvm/test/Transforms/NewGVN/pr31682.ll b/llvm/test/Transforms/NewGVN/pr31682.ll
index 00a1bf2b84505a..3d8c9e28635c66 100644
--- a/llvm/test/Transforms/NewGVN/pr31682.ll
+++ b/llvm/test/Transforms/NewGVN/pr31682.ll
@@ -9,7 +9,7 @@ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
 define void @bar() {
 ; CHECK-LABEL: @bar(
 ; CHECK-NEXT:  bb:
-; CHECK-NEXT:    [[TMP:%.*]] = load ptr, ptr @global
+; CHECK-NEXT:    [[TMP:%.*]] = load ptr, ptr @global, align 8
 ; CHECK-NEXT:    br label [[BB2:%.*]]
 ; CHECK:       bb2:
 ; CHECK-NEXT:    br i1 undef, label [[BB2]], label [[BB7:%.*]]
diff --git a/llvm/test/Transforms/NewGVN/pr31758.ll b/llvm/test/Transforms/NewGVN/pr31758.ll
index 43188430e34c22..274d605802a6bc 100644
--- a/llvm/test/Transforms/NewGVN/pr31758.ll
+++ b/llvm/test/Transforms/NewGVN/pr31758.ll
@@ -12,7 +12,7 @@ define void @tinkywinky() {
 ; CHECK:       bb90:
 ; CHECK-NEXT:    br label [[BB90]]
 ; CHECK:       bb138:
-; CHECK-NEXT:    store i8 poison, ptr null
+; CHECK-NEXT:    store i8 poison, ptr null, align 1
 ; CHECK-NEXT:    br label [[BB138:%.*]]
 ;
 bb:
diff --git a/llvm/test/Transforms/NewGVN/pr32607.ll b/llvm/test/Transforms/NewGVN/pr32607.ll
index 4770724dc2baa8..7460ab5ec27e04 100644
--- a/llvm/test/Transforms/NewGVN/pr32607.ll
+++ b/llvm/test/Transforms/NewGVN/pr32607.ll
@@ -7,7 +7,7 @@ define hidden void @foo() {
 ; CHECK:       if:
 ; CHECK-NEXT:    br i1 false, label [[L50:%.*]], label [[IF]]
 ; CHECK:       L50:
-; CHECK-NEXT:    store i8 poison, ptr null
+; CHECK-NEXT:    store i8 poison, ptr null, align 1
 ; CHECK-NEXT:    ret void
 ;
 top:
diff --git a/llvm/test/Transforms/NewGVN/pr32836.ll b/llvm/test/Transforms/NewGVN/pr32836.ll
index 5488655b3683d3..00f3fb07b20259 100644
--- a/llvm/test/Transforms/NewGVN/pr32836.ll
+++ b/llvm/test/Transforms/NewGVN/pr32836.ll
@@ -5,19 +5,19 @@
 @b = external global %struct.anon
 define void @tinkywinky(i1 %patatino) {
 ; CHECK-LABEL: @tinkywinky(
-; CHECK-NEXT:    store i32 8, ptr null
+; CHECK-NEXT:    store i32 8, ptr null, align 4
 ; CHECK-NEXT:    br i1 [[PATATINO:%.*]], label [[IF_END:%.*]], label [[IF_THEN:%.*]]
 ; CHECK:       if.then:
 ; CHECK-NEXT:    br label [[L:%.*]]
 ; CHECK:       L:
 ; CHECK-NEXT:    br label [[IF_END]]
 ; CHECK:       if.end:
-; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr null
-; CHECK-NEXT:    [[BF_LOAD1:%.*]] = load i32, ptr @b
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr null, align 4
+; CHECK-NEXT:    [[BF_LOAD1:%.*]] = load i32, ptr @b, align 4
 ; CHECK-NEXT:    [[BF_VALUE:%.*]] = and i32 [[TMP1]], 536870911
 ; CHECK-NEXT:    [[BF_CLEAR:%.*]] = and i32 [[BF_LOAD1]], -536870912
 ; CHECK-NEXT:    [[BF_SET:%.*]] = or i32 [[BF_CLEAR]], [[BF_VALUE]]
-; CHECK-NEXT:    store i32 [[BF_SET]], ptr @b
+; CHECK-NEXT:    store i32 [[BF_SET]], ptr @b, align 4
 ; CHECK-NEXT:    br label [[LOR_END:%.*]]
 ; CHECK:       lor.end:
 ; CHECK-NEXT:    br label [[L]]
diff --git a/llvm/test/Transforms/NewGVN/pr32838.ll b/llvm/test/Transforms/NewGVN/pr32838.ll
index 5ba68fa15c24dc..87c93d9312e67c 100644
--- a/llvm/test/Transforms/NewGVN/pr32838.ll
+++ b/llvm/test/Transforms/NewGVN/pr32838.ll
@@ -10,7 +10,7 @@ define void @fn1(i64 noundef %arg) {
 ; CHECK:       if.then:
 ; CHECK-NEXT:    br i1 false, label [[FIRSTPHIBLOCK:%.*]], label [[TEMP:%.*]]
 ; CHECK:       firstphiblock:
-; CHECK-NEXT:    br i1 undef, label %for.cond17thread-pre-split, label [[SECONDPHIBLOCK:%.*]]
+; CHECK-NEXT:    br i1 undef, label [[FOR_COND17THREAD_PRE_SPLIT:%.*]], label [[SECONDPHIBLOCK:%.*]]
 ; CHECK:       secondphiblock:
 ; CHECK-NEXT:    [[SECONDPHI:%.*]] = phi i64 [ [[THIRDPHI:%.*]], [[THIRDPHIBLOCK:%.*]] ], [ undef, [[FIRSTPHIBLOCK]] ]
 ; CHECK-NEXT:    br i1 undef, label [[FIRSTPHIBLOCK]], label [[THIRDPHIBLOCK]]
@@ -55,7 +55,7 @@ define void @fn2(i64 noundef %arg) {
 ; CHECK-NEXT:    br i1 false, label [[FIRSTPHIBLOCK:%.*]], label [[TEMP:%.*]]
 ; CHECK:       firstphiblock:
 ; CHECK-NEXT:    [[FIRSTPHI:%.*]] = phi i64 [ poison, [[IF_THEN]] ], [ [[SECONDPHI:%.*]], [[SECONDPHIBLOCK:%.*]] ]
-; CHECK-NEXT:    br i1 undef, label %for.cond17thread-pre-split, label [[SECONDPHIBLOCK]]
+; CHECK-NEXT:    br i1 undef, label [[FOR_COND17THREAD_PRE_SPLIT:%.*]], label [[SECONDPHIBLOCK]]
 ; CHECK:       secondphiblock:
 ; CHECK-NEXT:    [[SECONDPHI]] = phi i64 [ [[THIRDPHI:%.*]], [[THIRDPHIBLOCK:%.*]] ], [ [[FIRSTPHI]], [[FIRSTPHIBLOCK]] ]
 ; CHECK-NEXT:    br i1 undef, label [[FIRSTPHIBLOCK]], label [[THIRDPHIBLOCK]]
@@ -65,7 +65,7 @@ define void @fn2(i64 noundef %arg) {
 ; CHECK:       for.cond17thread-pre-split:
 ; CHECK-NEXT:    br label [[COND_TRUE]]
 ; CHECK:       cond.true:
-; CHECK-NEXT:    [[FOURTHPHI:%.*]] = phi i64 [ [[ARG:%.*]], [[ENTRY:%.*]] ], [ [[FIRSTPHI]], %for.cond17thread-pre-split ]
+; CHECK-NEXT:    [[FOURTHPHI:%.*]] = phi i64 [ [[ARG:%.*]], [[ENTRY:%.*]] ], [ [[FIRSTPHI]], [[FOR_COND17THREAD_PRE_SPLIT]] ]
 ; CHECK-NEXT:    [[DIV]] = sdiv i64 [[FOURTHPHI]], 4
 ; CHECK-NEXT:    br label [[THIRDPHIBLOCK]]
 ; CHECK:       temp:
@@ -105,7 +105,7 @@ define void @fn3() {
 ; CHECK-NEXT:    [[F_0:%.*]] = phi ptr [ @b, [[ENTRY:%.*]] ], [ @a, [[L1_LOOPEXIT:%.*]] ]
 ; CHECK-NEXT:    br label [[FOR_COND:%.*]]
 ; CHECK:       for.cond.loopexit:
-; CHECK-NEXT:    store i8 poison, ptr null
+; CHECK-NEXT:    store i8 poison, ptr null, align 1
 ; CHECK-NEXT:    br label [[FOR_COND]]
 ; CHECK:       for.cond:
 ; CHECK-NEXT:    br i1 undef, label [[FOR_END14:%.*]], label [[FOR_COND1_PREHEADER:%.*]]
diff --git a/llvm/test/Transforms/NewGVN/pr32845.ll b/llvm/test/Transforms/NewGVN/pr32845.ll
index d1182a627c59c3..29b81b8e1c66a4 100644
--- a/llvm/test/Transforms/NewGVN/pr32845.ll
+++ b/llvm/test/Transforms/NewGVN/pr32845.ll
@@ -13,7 +13,7 @@ define void @tinkywinky() {
 ; CHECK-NEXT:    [[F_0:%.*]] = phi ptr [ @b, [[ENTRY:%.*]] ], [ @a, [[L1_LOOPEXIT:%.*]] ]
 ; CHECK-NEXT:    br label [[FOR_COND:%.*]]
 ; CHECK:       for.cond.loopexit:
-; CHECK-NEXT:    store i8 poison, ptr null
+; CHECK-NEXT:    store i8 poison, ptr null, align 1
 ; CHECK-NEXT:    br label [[FOR_COND]]
 ; CHECK:       for.cond:
 ; CHECK-NEXT:    br i1 undef, label [[FOR_END14:%.*]], label [[FOR_COND1_PREHEADER:%.*]]
diff --git a/llvm/test/Transforms/NewGVN/pr32852.ll b/llvm/test/Transforms/NewGVN/pr32852.ll
index 4fd5cf1a95fa72..ad5badd82fdf00 100644
--- a/llvm/test/Transforms/NewGVN/pr32852.ll
+++ b/llvm/test/Transforms/NewGVN/pr32852.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; Make sure GVN doesn't incorrectly think the branch terminating
 ; bb2 has a constant condition.
 ; RUN: opt -S -passes=newgvn %s | FileCheck %s
@@ -6,13 +7,26 @@
 @patatino = private unnamed_addr constant [3 x i8] c"0\0A\00"
 
 define void @tinkywinky() {
+; CHECK-LABEL: define void @tinkywinky() {
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[TMP:%.*]] = load i32, ptr @a, align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp sge i32 [[TMP]], 0
+; CHECK-NEXT:    br i1 [[TMP1]], label [[BB2:%.*]], label [[BB7:%.*]]
+; CHECK:       bb2:
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp sgt i32 [[TMP]], 0
+; CHECK-NEXT:    br i1 [[TMP4]], label [[BB5:%.*]], label [[BB7]]
+; CHECK:       bb5:
+; CHECK-NEXT:    [[TMP6:%.*]] = call i32 (ptr, ...) @printf(ptr @patatino)
+; CHECK-NEXT:    br label [[BB7]]
+; CHECK:       bb7:
+; CHECK-NEXT:    ret void
+;
 bb:
   %tmp = load i32, ptr @a
   %tmp1 = icmp sge i32 %tmp, 0
   br i1 %tmp1, label %bb2, label %bb7
 bb2:
   %tmp4 = icmp sgt i32 %tmp, 0
-; CHECK: br i1 %tmp4, label %bb5, label %bb7
   br i1 %tmp4, label %bb5, label %bb7
 bb5:
   %tmp6 = call i32 (ptr, ...) @printf(ptr @patatino)
diff --git a/llvm/test/Transforms/NewGVN/pr32897.ll b/llvm/test/Transforms/NewGVN/pr32897.ll
index 35a3b0040bcaac..881c3a8e3ef538 100644
--- a/llvm/test/Transforms/NewGVN/pr32897.ll
+++ b/llvm/test/Transforms/NewGVN/pr32897.ll
@@ -6,7 +6,7 @@ define void @tinkywinky(ptr %b) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br label [[BODY:%.*]]
 ; CHECK:       body:
-; CHECK-NEXT:    store i64 undef, ptr [[B:%.*]]
+; CHECK-NEXT:    store i64 undef, ptr [[B:%.*]], align 4
 ; CHECK-NEXT:    br i1 undef, label [[BODY]], label [[END:%.*]]
 ; CHECK:       end:
 ; CHECK-NEXT:    br label [[BODY]]
diff --git a/llvm/test/Transforms/NewGVN/pr32934.ll b/llvm/test/Transforms/NewGVN/pr32934.ll
index fa725f8888e7e3..c8218c209face4 100644
--- a/llvm/test/Transforms/NewGVN/pr32934.ll
+++ b/llvm/test/Transforms/NewGVN/pr32934.ll
@@ -1,39 +1,41 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt -S -passes=newgvn %s | FileCheck %s
 
-; CHECK: define void @tinkywinky() {
-; CHECK-NEXT: entry:
-; CHECK-NEXT:   %d = alloca i32, align 4
-; CHECK-NEXT:   store i32 0, ptr null, align 4
-; CHECK-NEXT:   br label %for.cond
-; CHECK: for.cond:                                         ; preds = %if.end, %entry
-; CHECK-NEXT:   %0 = load i32, ptr null, align 4
-; CHECK-NEXT:   %cmp = icmp slt i32 %0, 1
-; CHECK-NEXT:   br i1 %cmp, label %for.body, label %while.cond
-; CHECK: for.body:                                         ; preds = %for.cond
-; CHECK-NEXT:   %1 = load i32, ptr @a, align 4
-; CHECK-NEXT:   store i32 %1, ptr %d, align 4
-; CHECK-NEXT:   br label %L
-; CHECK: L:                                                ; preds = %if.then, %for.body
-; CHECK-NEXT:   %tobool = icmp ne i32 %1, 0
-; CHECK-NEXT:   br i1 %tobool, label %if.then, label %if.end
-; CHECK: if.then:                                          ; preds = %L
-; CHECK-NEXT:   call void (ptr, ...) @printf(ptr @patatino)
-; CHECK-NEXT:   br label %L
-; CHECK: if.end:                                           ; preds = %L
-; CHECK-NEXT:   br label %for.cond
-; CHECK: while.cond:                                       ; preds = %while.body, %for.cond
-; CHECK-NEXT:   br i1 undef, label %while.body, label %while.end
-; CHECK: while.body:                                       ; preds = %while.cond
-; CHECK-NEXT:   call void (ptr, ...) @printf(ptr @patatino)
-; CHECK-NEXT:   br label %while.cond
-; CHECK: while.end:
-; CHECK-NEXT:   %2 = load i32, ptr @a, align 4
-; CHECK-NEXT:   store i32 %2, ptr undef, align 4
-; CHECK-NEXT:   ret void
 
 @a = external global i32, align 4
 @patatino = external unnamed_addr constant [2 x i8], align 1
 define void @tinkywinky() {
+; CHECK-LABEL: define void @tinkywinky() {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[D:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    store i32 0, ptr null, align 4
+; CHECK-NEXT:    br label [[FOR_COND:%.*]]
+; CHECK:       for.cond:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr null, align 4
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[TMP0]], 1
+; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[WHILE_COND:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr @a, align 4
+; CHECK-NEXT:    store i32 [[TMP1]], ptr [[D]], align 4
+; CHECK-NEXT:    br label [[L:%.*]]
+; CHECK:       L:
+; CHECK-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[TOBOOL]], label [[IF_THEN:%.*]], label [[IF_END:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    call void (ptr, ...) @printf(ptr @patatino)
+; CHECK-NEXT:    br label [[L]]
+; CHECK:       if.end:
+; CHECK-NEXT:    br label [[FOR_COND]]
+; CHECK:       while.cond:
+; CHECK-NEXT:    br i1 undef, label [[WHILE_BODY:%.*]], label [[WHILE_END:%.*]]
+; CHECK:       while.body:
+; CHECK-NEXT:    call void (ptr, ...) @printf(ptr @patatino)
+; CHECK-NEXT:    br label [[WHILE_COND]]
+; CHECK:       while.end:
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr @a, align 4
+; CHECK-NEXT:    store i32 [[TMP2]], ptr undef, align 4
+; CHECK-NEXT:    ret void
+;
 entry:
   %d = alloca i32, align 4
   store i32 0, ptr null, align 4
diff --git a/llvm/test/Transforms/NewGVN/pr32945.ll b/llvm/test/Transforms/NewGVN/pr32945.ll
index ebf3813c65bd5a..7aabe4df059520 100644
--- a/llvm/test/Transforms/NewGVN/pr32945.ll
+++ b/llvm/test/Transforms/NewGVN/pr32945.ll
@@ -1,9 +1,28 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt -S -passes=newgvn %s | FileCheck %s
-; CHECK-NOT: call i32 @llvm.ssa.copy
 
 @d = external global i32
 @e = external global i32
 define void @tinkywinky() {
+; CHECK-LABEL: define void @tinkywinky() {
+; CHECK-NEXT:    br i1 true, label [[LOR_LHS_FALSE:%.*]], label [[COND_TRUE:%.*]]
+; CHECK:       lor.lhs.false:
+; CHECK-NEXT:    [[TMP:%.*]] = load i32, ptr @d, align 4
+; CHECK-NEXT:    [[PATATINO:%.*]] = load i32, ptr null, align 4
+; CHECK-NEXT:    [[OR:%.*]] = or i32 [[TMP]], [[PATATINO]]
+; CHECK-NEXT:    store i32 [[OR]], ptr @d, align 4
+; CHECK-NEXT:    br label [[COND_TRUE]]
+; CHECK:       cond.true:
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr @e, align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr @d, align 4
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[COND_TRUE6:%.*]], label [[COND_FALSE:%.*]]
+; CHECK:       cond.true6:
+; CHECK-NEXT:    [[CMP7:%.*]] = icmp slt i32 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[CMP7]], label [[COND_FALSE]], label [[COND_FALSE]]
+; CHECK:       cond.false:
+; CHECK-NEXT:    ret void
+;
   br i1 true, label %lor.lhs.false, label %cond.true
 lor.lhs.false:
   %tmp = load i32, ptr @d, align 4
diff --git a/llvm/test/Transforms/NewGVN/pr32952.ll b/llvm/test/Transforms/NewGVN/pr32952.ll
index 5157bb27e6c59b..49e4843294f574 100644
--- a/llvm/test/Transforms/NewGVN/pr32952.ll
+++ b/llvm/test/Transforms/NewGVN/pr32952.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; PR32952: Don't erroneously consider congruent two phi nodes which
 ; have the same arguments but different incoming edges.
 ; RUN: opt -passes=newgvn -S %s | FileCheck %s
@@ -6,6 +7,31 @@
 @.str = private unnamed_addr constant [4 x i8] c"%d\0A\00", align 1
 
 define i32 @tinkywinky() {
+; CHECK-LABEL: define i32 @tinkywinky() {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr @a, align 2
+; CHECK-NEXT:    [[CONV:%.*]] = sext i16 [[TMP0]] to i32
+; CHECK-NEXT:    [[NEG:%.*]] = xor i32 [[CONV]], -1
+; CHECK-NEXT:    [[CONV1:%.*]] = trunc i32 [[NEG]] to i16
+; CHECK-NEXT:    [[CONV3:%.*]] = zext i16 [[CONV1]] to i32
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CONV]], [[CONV3]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[TINKY:%.*]], label [[WINKY:%.*]]
+; CHECK:       tinky:
+; CHECK-NEXT:    store i16 2, ptr @a, align 2
+; CHECK-NEXT:    br label [[PATATINO:%.*]]
+; CHECK:       winky:
+; CHECK-NEXT:    br label [[PATATINO]]
+; CHECK:       patatino:
+; CHECK-NEXT:    [[MEH:%.*]] = phi i16 [ [[TMP0]], [[WINKY]] ], [ [[CONV1]], [[TINKY]] ]
+; CHECK-NEXT:    [[BANANA:%.*]] = phi i16 [ [[TMP0]], [[TINKY]] ], [ [[CONV1]], [[WINKY]] ]
+; CHECK-NEXT:    br label [[END:%.*]]
+; CHECK:       end:
+; CHECK-NEXT:    [[PROMOTED:%.*]] = zext i16 [[BANANA]] to i32
+; CHECK-NEXT:    [[OTHER:%.*]] = zext i16 [[MEH]] to i32
+; CHECK-NEXT:    [[FIRST:%.*]] = tail call i32 (ptr, ...) @printf(ptr @.str, i32 [[PROMOTED]])
+; CHECK-NEXT:    [[SECOND:%.*]] = tail call i32 (ptr, ...) @printf(ptr @.str, i32 [[OTHER]])
+; CHECK-NEXT:    ret i32 0
+;
 entry:
   %0 = load i16, ptr @a, align 2
   %conv = sext i16 %0 to i32
@@ -23,15 +49,11 @@ winky:
   br label %patatino
 
 patatino:
-; CHECK: %meh = phi i16 [ %0, %winky ], [ %conv1, %tinky ]
-; CHECK: %banana = phi i16 [ %0, %tinky ], [ %conv1, %winky ]
   %meh = phi i16 [ %0, %winky ], [ %conv1, %tinky ]
   %banana = phi i16 [ %0, %tinky ], [ %conv1, %winky ]
   br label %end
 
 end:
-; CHECK: %promoted = zext i16 %banana to i32
-; CHECK: %other = zext i16 %meh to i32
   %promoted = zext i16 %banana to i32
   %other = zext i16 %meh to i32
   %first = tail call i32 (ptr, ...) @printf(ptr @.str, i32 %promoted)
diff --git a/llvm/test/Transforms/NewGVN/pr33014.ll b/llvm/test/Transforms/NewGVN/pr33014.ll
index f6e919770d6c62..04f9df2704a6c7 100644
--- a/llvm/test/Transforms/NewGVN/pr33014.ll
+++ b/llvm/test/Transforms/NewGVN/pr33014.ll
@@ -1,33 +1,34 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; Make sure we don't end up in an infinite recursion in singleReachablePHIPath().
 ; RUN: opt < %s -passes=newgvn -S | FileCheck %s
 
 @c = external global i64, align 8
 
-; CHECK-LABEL: define void @tinkywinky() {
-; CHECK: entry:
-; CHECK-NEXT:   br i1 undef, label %l2, label %if.then
-; CHECK: if.then:                                          ; preds = %entry
-; CHECK-NEXT:   br label %for.body
-; CHECK: ph:                                               ; preds = %back, %ontrue
-; CHECK-NEXT:   br label %for.body
-; CHECK: for.body:                                         ; preds = %ph, %if.then
-; CHECK-NEXT:   br i1 undef, label %ontrue, label %onfalse
-; CHECK: onfalse:                                          ; preds = %for.body
-; CHECK-NEXT:   %patatino = load i64, ptr @c
-; CHECK-NEXT:   ret void
-; CHECK: ontrue:                                           ; preds = %for.body
-; CHECK-NEXT:   %dipsy = load i64, ptr @c
-; CHECK-NEXT:   br label %ph
-; CHECK: back:                                             ; preds = %l2
-; CHECK-NEXT:   store i8 poison, ptr null
-; CHECK-NEXT:   br label %ph
-; CHECK: end:                                              ; preds = %l2
-; CHECK-NEXT:   ret void
-; CHECK: l2:                                               ; preds = %entry
-; CHECK-NEXT:   br i1 false, label %back, label %end
-; CHECK-NEXT: }
 
 define void @tinkywinky() {
+; CHECK-LABEL: define void @tinkywinky() {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 undef, label [[L2:%.*]], label [[IF_THEN:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       ph:
+; CHECK-NEXT:    br label [[FOR_BODY]]
+; CHECK:       for.body:
+; CHECK-NEXT:    br i1 undef, label [[ONTRUE:%.*]], label [[ONFALSE:%.*]]
+; CHECK:       onfalse:
+; CHECK-NEXT:    [[PATATINO:%.*]] = load i64, ptr @c, align 4
+; CHECK-NEXT:    ret void
+; CHECK:       ontrue:
+; CHECK-NEXT:    [[DIPSY:%.*]] = load i64, ptr @c, align 4
+; CHECK-NEXT:    br label [[PH:%.*]]
+; CHECK:       back:
+; CHECK-NEXT:    store i8 poison, ptr null, align 1
+; CHECK-NEXT:    br label [[PH]]
+; CHECK:       end:
+; CHECK-NEXT:    ret void
+; CHECK:       l2:
+; CHECK-NEXT:    br i1 false, label [[BACK:%.*]], label [[END:%.*]]
+;
 entry:
   br i1 undef, label %l2, label %if.then
 if.then:
diff --git a/llvm/test/Transforms/NewGVN/pr33086.ll b/llvm/test/Transforms/NewGVN/pr33086.ll
index 54802cdbd92585..ab6c00dd1777ff 100644
--- a/llvm/test/Transforms/NewGVN/pr33086.ll
+++ b/llvm/test/Transforms/NewGVN/pr33086.ll
@@ -1,31 +1,32 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt -passes=newgvn -S %s | FileCheck %s
 ; REQUIRES: asserts
 
-; CHECK-LABEL: define void @tinkywinky() {
-; CHECK: entry:
-; CHECK-NEXT:   br i1 undef, label %for.cond18, label %for.cond.preheader
-; CHECK: for.cond.preheader:
-; CHECK-NEXT:   br label %for.cond2thread-pre-split
-; CHECK: for.cond2thread-pre-split:
-; CHECK-NEXT:   %conv24 = phi i32 [ 0, %for.cond.preheader ], [ %conv, %for.inc.split ]
-; CHECK-NEXT:   br label %for.inc.split
-; CHECK: for.inc.split:
-; CHECK-NEXT:   %add = shl nsw i32 %conv24, 16
-; CHECK-NEXT:   %sext23 = add i32 %add, 65536
-; CHECK-NEXT:   %conv = ashr exact i32 %sext23, 16
-; CHECK-NEXT:   %cmp = icmp slt i32 %sext23, 3604480
-; CHECK-NEXT:   br i1 %cmp, label %for.cond2thread-pre-split, label %l1.loopexit
-; CHECK: l1.loopexit:
-; CHECK-NEXT:   br label %l1
-; CHECK: l1:
-; CHECK-NEXT:   %0 = load i16, ptr null, align 2
-; CHECK-NEXT:   %g.0.g.0..pr = load i16, ptr null, align 2
-; CHECK-NEXT:   ret void
-; CHECK: for.cond18:
-; CHECK-NEXT:   br label %l1
-; CHECK-NEXT: }
 
 define void @tinkywinky() {
+; CHECK-LABEL: define void @tinkywinky() {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 undef, label [[FOR_COND18:%.*]], label [[FOR_COND_PREHEADER:%.*]]
+; CHECK:       for.cond.preheader:
+; CHECK-NEXT:    br label [[FOR_COND2THREAD_PRE_SPLIT:%.*]]
+; CHECK:       for.cond2thread-pre-split:
+; CHECK-NEXT:    [[CONV24:%.*]] = phi i32 [ 0, [[FOR_COND_PREHEADER]] ], [ [[CONV:%.*]], [[FOR_INC_SPLIT:%.*]] ]
+; CHECK-NEXT:    br label [[FOR_INC_SPLIT]]
+; CHECK:       for.inc.split:
+; CHECK-NEXT:    [[ADD:%.*]] = shl nsw i32 [[CONV24]], 16
+; CHECK-NEXT:    [[SEXT23:%.*]] = add i32 [[ADD]], 65536
+; CHECK-NEXT:    [[CONV]] = ashr exact i32 [[SEXT23]], 16
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[SEXT23]], 3604480
+; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_COND2THREAD_PRE_SPLIT]], label [[L1_LOOPEXIT:%.*]]
+; CHECK:       l1.loopexit:
+; CHECK-NEXT:    br label [[L1:%.*]]
+; CHECK:       l1:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr null, align 2
+; CHECK-NEXT:    [[G_0_G_0__PR:%.*]] = load i16, ptr null, align 2
+; CHECK-NEXT:    ret void
+; CHECK:       for.cond18:
+; CHECK-NEXT:    br label [[L1]]
+;
 entry:
   br i1 undef, label %for.cond18, label %for.cond.preheader
 
diff --git a/llvm/test/Transforms/NewGVN/pr33116.ll b/llvm/test/Transforms/NewGVN/pr33116.ll
index f5ef3ae7575ee5..6609ef9e72dac6 100644
--- a/llvm/test/Transforms/NewGVN/pr33116.ll
+++ b/llvm/test/Transforms/NewGVN/pr33116.ll
@@ -13,7 +13,7 @@ define void @b() {
 ; CHECK:       c:
 ; CHECK-NEXT:    br i1 undef, label [[IF_G:%.*]], label [[IF_E]]
 ; CHECK:       if.g:
-; CHECK-NEXT:    store i32 undef, ptr @a
+; CHECK-NEXT:    store i32 undef, ptr @a, align 4
 ; CHECK-NEXT:    br label [[WHILE_D]]
 ; CHECK:       if.e:
 ; CHECK-NEXT:    br label [[F]]
diff --git a/llvm/test/Transforms/NewGVN/pr33187.ll b/llvm/test/Transforms/NewGVN/pr33187.ll
index e5c3da2fb05a9e..37668bba3d5b60 100644
--- a/llvm/test/Transforms/NewGVN/pr33187.ll
+++ b/llvm/test/Transforms/NewGVN/pr33187.ll
@@ -30,7 +30,7 @@ define void @fn1() local_unnamed_addr #0 {
 ; CHECK:       while.body12:
 ; CHECK-NEXT:    br i1 undef, label [[IF_END18]], label [[L]]
 ; CHECK:       L.loopexit:
-; CHECK-NEXT:    store i8 poison, ptr null
+; CHECK-NEXT:    store i8 poison, ptr null, align 1
 ; CHECK-NEXT:    br label [[L]]
 ; CHECK:       L:
 ; CHECK-NEXT:    [[H_125]] = phi i32 [ [[H_127]], [[WHILE_BODY12]] ], [ poison, [[L_LOOPEXIT]] ]
@@ -114,13 +114,13 @@ attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="fals
 define void @a() {
 ; CHECK-LABEL: @a(
 ; CHECK-NEXT:  b:
-; CHECK-NEXT:    store ptr null, ptr null
+; CHECK-NEXT:    store ptr null, ptr null, align 8
 ; CHECK-NEXT:    br label [[D:%.*]]
 ; CHECK:       d:
 ; CHECK-NEXT:    [[I:%.*]] = phi ptr [ null, [[B:%.*]] ], [ [[E:%.*]], [[F:%.*]] ]
 ; CHECK-NEXT:    br i1 undef, label [[F]], label [[G:%.*]]
 ; CHECK:       g:
-; CHECK-NEXT:    store ptr [[I]], ptr null
+; CHECK-NEXT:    store ptr [[I]], ptr null, align 8
 ; CHECK-NEXT:    unreachable
 ; CHECK:       f:
 ; CHECK-NEXT:    [[E]] = getelementptr i8, ptr [[I]], i64 1
diff --git a/llvm/test/Transforms/NewGVN/pr33196.ll b/llvm/test/Transforms/NewGVN/pr33196.ll
index c312d5ec979304..c04b895fa47e96 100644
--- a/llvm/test/Transforms/NewGVN/pr33196.ll
+++ b/llvm/test/Transforms/NewGVN/pr33196.ll
@@ -1,33 +1,6 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt -S -passes=newgvn %s | FileCheck %s
 
-; CHECK: define i32 @main() {
-; CHECK-NEXT: entry:
-; CHECK-NEXT:   %tmp = load i32, ptr @d, align 4
-; CHECK-NEXT:   %tmp1 = load i32, ptr @c, align 4
-; CHECK-NEXT:   %tobool = icmp eq i32 %tmp1, -1
-; CHECK-NEXT:   br i1 %tobool, label %if.end, label %if.then
-; CHECK: if.then:
-; CHECK-NEXT:   br label %L
-; CHECK: L:
-; CHECK-NEXT:   %e.0 = phi i32 [ 0, %if.then ], [ %e.1, %if.then4 ]
-; CHECK-NEXT:   br label %if.end
-; CHECK: if.end:
-; CHECK-NEXT:   %e.1 = phi i32 [ %e.0, %L ], [ %tmp, %entry ]
-; CHECK-NEXT:   store i32 %e.1, ptr @a, align 4
-; CHECK-NEXT:   %tmp2 = load i32, ptr @b, align 4
-; CHECK-NEXT:   store i32 0, ptr @b, align 4
-; CHECK-NEXT:   %sext = shl i32 %tmp2, 16
-; CHECK-NEXT:   %conv1 = ashr exact i32 %sext, 16
-; CHECK-NEXT:   %add = add nsw i32 %conv1, %tmp1
-; CHECK-NEXT:   %add2 = add nsw i32 %add, %e.1
-; CHECK-NEXT:   store i32 %add2, ptr @a, align 4
-; CHECK-NEXT:   %tobool3 = icmp eq i32 %add2, 0
-; CHECK-NEXT:   br i1 %tobool3, label %if.end5, label %if.then4
-; CHECK: if.then4:
-; CHECK-NEXT:   br label %L
-; CHECK: if.end5:
-; CHECK-NEXT:   ret i32 0
-; CHECK-NEXT: }
 
 @d = global i32 1, align 4
 @c = common global i32 0, align 4
@@ -35,6 +8,34 @@
 @b = common global i32 0, align 4
 
 define i32 @main() {
+; CHECK-LABEL: define i32 @main() {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP:%.*]] = load i32, ptr @d, align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr @c, align 4
+; CHECK-NEXT:    [[TOBOOL:%.*]] = icmp eq i32 [[TMP1]], -1
+; CHECK-NEXT:    br i1 [[TOBOOL]], label [[IF_END:%.*]], label [[IF_THEN:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    br label [[L:%.*]]
+; CHECK:       L:
+; CHECK-NEXT:    [[E_0:%.*]] = phi i32 [ 0, [[IF_THEN]] ], [ [[E_1:%.*]], [[IF_THEN4:%.*]] ]
+; CHECK-NEXT:    br label [[IF_END]]
+; CHECK:       if.end:
+; CHECK-NEXT:    [[E_1]] = phi i32 [ [[E_0]], [[L]] ], [ [[TMP]], [[ENTRY:%.*]] ]
+; CHECK-NEXT:    store i32 [[E_1]], ptr @a, align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr @b, align 4
+; CHECK-NEXT:    store i32 0, ptr @b, align 4
+; CHECK-NEXT:    [[SEXT:%.*]] = shl i32 [[TMP2]], 16
+; CHECK-NEXT:    [[CONV1:%.*]] = ashr exact i32 [[SEXT]], 16
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[CONV1]], [[TMP1]]
+; CHECK-NEXT:    [[ADD2:%.*]] = add nsw i32 [[ADD]], [[E_1]]
+; CHECK-NEXT:    store i32 [[ADD2]], ptr @a, align 4
+; CHECK-NEXT:    [[TOBOOL3:%.*]] = icmp eq i32 [[ADD2]], 0
+; CHECK-NEXT:    br i1 [[TOBOOL3]], label [[IF_END5:%.*]], label [[IF_THEN4]]
+; CHECK:       if.then4:
+; CHECK-NEXT:    br label [[L]]
+; CHECK:       if.end5:
+; CHECK-NEXT:    ret i32 0
+;
 entry:
   %tmp = load i32, ptr @d, align 4
   %tmp1 = load i32, ptr @c, align 4
diff --git a/llvm/test/Transforms/NewGVN/pr33204.ll b/llvm/test/Transforms/NewGVN/pr33204.ll
index 99c48241a75c68..482e35e7fdb11f 100644
--- a/llvm/test/Transforms/NewGVN/pr33204.ll
+++ b/llvm/test/Transforms/NewGVN/pr33204.ll
@@ -20,10 +20,10 @@ define void @hoge(i32 %arg) {
 ; CHECK-NEXT:    [[TMP:%.*]] = phi i32 [ 0, [[BB1:%.*]] ], [ [[ARG:%.*]], [[BB:%.*]] ]
 ; CHECK-NEXT:    br label [[BB6:%.*]]
 ; CHECK:       bb3:
-; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr @global, align 4, !h !0
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr @global, align 4, !h [[META0:![0-9]+]]
 ; CHECK-NEXT:    unreachable
 ; CHECK:       bb6:
-; CHECK-NEXT:    store i32 [[TMP]], ptr @global.1, align 4, !h !0
+; CHECK-NEXT:    store i32 [[TMP]], ptr @global.1, align 4, !h [[META0]]
 ; CHECK-NEXT:    br i1 undef, label [[BB7:%.*]], label [[BB1]]
 ; CHECK:       bb7:
 ; CHECK-NEXT:    br i1 undef, label [[BB10:%.*]], label [[BB8:%.*]]
@@ -33,7 +33,7 @@ define void @hoge(i32 %arg) {
 ; CHECK-NEXT:    store i8 poison, ptr null, align 1
 ; CHECK-NEXT:    br label [[BB3]]
 ; CHECK:       bb10:
-; CHECK-NEXT:    store i32 0, ptr @global, align 4, !h !0
+; CHECK-NEXT:    store i32 0, ptr @global, align 4, !h [[META0]]
 ; CHECK-NEXT:    br label [[BB7]]
 ;
 bb:
diff --git a/llvm/test/Transforms/NewGVN/pr33305.ll b/llvm/test/Transforms/NewGVN/pr33305.ll
index f87cf08e2abf73..3a19f610defcd7 100644
--- a/llvm/test/Transforms/NewGVN/pr33305.ll
+++ b/llvm/test/Transforms/NewGVN/pr33305.ll
@@ -19,14 +19,14 @@ target triple = "x86_64-apple-macosx10.12.0"
 define i32 @main() local_unnamed_addr #0 {
 ; CHECK-LABEL: @main(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[DOTPR_I:%.*]] = load i32, ptr @c, align 4, !tbaa !3
+; CHECK-NEXT:    [[DOTPR_I:%.*]] = load i32, ptr @c, align 4, !tbaa [[TBAA3:![0-9]+]]
 ; CHECK-NEXT:    [[CMP13_I:%.*]] = icmp slt i32 [[DOTPR_I]], 1
 ; CHECK-NEXT:    br i1 [[CMP13_I]], label [[FOR_COND1_PREHEADER_LR_PH_I:%.*]], label [[ENTRY_FOR_END9_I_CRIT_EDGE:%.*]]
 ; CHECK:       entry.for.end9.i_crit_edge:
-; CHECK-NEXT:    [[DOTPRE:%.*]] = load i32, ptr @h, align 4, !tbaa !3
+; CHECK-NEXT:    [[DOTPRE:%.*]] = load i32, ptr @h, align 4, !tbaa [[TBAA3]]
 ; CHECK-NEXT:    br label [[FOR_END9_I:%.*]]
 ; CHECK:       for.cond1.preheader.lr.ph.i:
-; CHECK-NEXT:    [[G_PROMOTED14_I:%.*]] = load i32, ptr @g, align 4, !tbaa !3
+; CHECK-NEXT:    [[G_PROMOTED14_I:%.*]] = load i32, ptr @g, align 4, !tbaa [[TBAA3]]
 ; CHECK-NEXT:    br label [[FOR_COND1_PREHEADER_I:%.*]]
 ; CHECK:       for.cond1.preheader.i:
 ; CHECK-NEXT:    [[INC816_I:%.*]] = phi i32 [ [[DOTPR_I]], [[FOR_COND1_PREHEADER_LR_PH_I]] ], [ [[INC8_I:%.*]], [[FOR_INC7_I:%.*]] ]
@@ -42,9 +42,9 @@ define i32 @main() local_unnamed_addr #0 {
 ; CHECK:       lor.rhs.i:
 ; CHECK-NEXT:    [[LNOT_I:%.*]] = xor i1 [[TOBOOL_I]], true
 ; CHECK-NEXT:    [[LNOT_EXT_I:%.*]] = zext i1 [[LNOT_I]] to i32
-; CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr @e, align 4, !tbaa !3
+; CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr @e, align 4, !tbaa [[TBAA3]]
 ; CHECK-NEXT:    [[XOR_I:%.*]] = xor i32 [[TMP3]], [[LNOT_EXT_I]]
-; CHECK-NEXT:    store i32 [[XOR_I]], ptr @e, align 4, !tbaa !3
+; CHECK-NEXT:    store i32 [[XOR_I]], ptr @e, align 4, !tbaa [[TBAA3]]
 ; CHECK-NEXT:    br label [[LOR_END_I]]
 ; CHECK:       lor.end.i:
 ; CHECK-NEXT:    [[INC_I]] = add nuw nsw i32 [[INC12_I]], 1
@@ -55,28 +55,28 @@ define i32 @main() local_unnamed_addr #0 {
 ; CHECK-NEXT:    [[CMP_I:%.*]] = icmp slt i32 [[INC816_I]], 0
 ; CHECK-NEXT:    br i1 [[CMP_I]], label [[FOR_COND1_PREHEADER_I]], label [[FOR_COND_FOR_END9_CRIT_EDGE_I:%.*]]
 ; CHECK:       for.cond.for.end9_crit_edge.i:
-; CHECK-NEXT:    store i32 0, ptr @g, align 4, !tbaa !3
-; CHECK-NEXT:    store i32 2, ptr @h, align 4, !tbaa !3
-; CHECK-NEXT:    store i32 [[INC8_I]], ptr @c, align 4, !tbaa !3
+; CHECK-NEXT:    store i32 0, ptr @g, align 4, !tbaa [[TBAA3]]
+; CHECK-NEXT:    store i32 2, ptr @h, align 4, !tbaa [[TBAA3]]
+; CHECK-NEXT:    store i32 [[INC8_I]], ptr @c, align 4, !tbaa [[TBAA3]]
 ; CHECK-NEXT:    br label [[FOR_END9_I]]
 ; CHECK:       for.end9.i:
 ; CHECK-NEXT:    [[TMP4:%.*]] = phi i32 [ [[DOTPRE]], [[ENTRY_FOR_END9_I_CRIT_EDGE]] ], [ 2, [[FOR_COND_FOR_END9_CRIT_EDGE_I]] ]
-; CHECK-NEXT:    [[TMP5:%.*]] = load ptr, ptr @b, align 8, !tbaa !7
-; CHECK-NEXT:    store i32 [[TMP4]], ptr [[TMP5]], align 4, !tbaa !3
-; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr @e, align 4, !tbaa !3
+; CHECK-NEXT:    [[TMP5:%.*]] = load ptr, ptr @b, align 8, !tbaa [[TBAA7:![0-9]+]]
+; CHECK-NEXT:    store i32 [[TMP4]], ptr [[TMP5]], align 4, !tbaa [[TBAA3]]
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr @e, align 4, !tbaa [[TBAA3]]
 ; CHECK-NEXT:    [[CMP10_I:%.*]] = icmp slt i32 [[TMP6]], -1
 ; CHECK-NEXT:    br i1 [[CMP10_I]], label [[IF_THEN_I:%.*]], label [[FN1_EXIT:%.*]]
 ; CHECK:       if.then.i:
-; CHECK-NEXT:    [[TMP7:%.*]] = load i32, ptr @f, align 4, !tbaa !3
-; CHECK-NEXT:    store i32 [[TMP7]], ptr [[TMP5]], align 4, !tbaa !3
+; CHECK-NEXT:    [[TMP7:%.*]] = load i32, ptr @f, align 4, !tbaa [[TBAA3]]
+; CHECK-NEXT:    store i32 [[TMP7]], ptr [[TMP5]], align 4, !tbaa [[TBAA3]]
 ; CHECK-NEXT:    br label [[FN1_EXIT]]
 ; CHECK:       fn1.exit:
-; CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr @a, align 4, !tbaa !3
+; CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr @a, align 4, !tbaa [[TBAA3]]
 ; CHECK-NEXT:    [[TOBOOL:%.*]] = icmp eq i32 [[TMP8]], 0
 ; CHECK-NEXT:    br i1 [[TOBOOL]], label [[IF_END:%.*]], label [[IF_THEN:%.*]]
 ; CHECK:       if.then:
 ; CHECK-NEXT:    [[PUTS2:%.*]] = tail call i32 @puts(ptr @str.2)
-; CHECK-NEXT:    tail call void @abort()
+; CHECK-NEXT:    tail call void @abort() #[[ATTR3:[0-9]+]]
 ; CHECK-NEXT:    unreachable
 ; CHECK:       if.end:
 ; CHECK-NEXT:    [[PUTS:%.*]] = tail call i32 @puts(ptr @str)
diff --git a/llvm/test/Transforms/NewGVN/pr33367.ll b/llvm/test/Transforms/NewGVN/pr33367.ll
index dc5d190291e1cc..597caa2b34ef28 100644
--- a/llvm/test/Transforms/NewGVN/pr33367.ll
+++ b/llvm/test/Transforms/NewGVN/pr33367.ll
@@ -11,19 +11,19 @@ define %MNR_struct @f000316011717_2(ptr %pDS, ptr %pCG) #2 {
 ; CHECK-NEXT:  Entry:
 ; CHECK-NEXT:    [[RESTART:%.*]] = alloca [[MNR_STRUCT:%.*]], align 8
 ; CHECK-NEXT:    [[PCARRY:%.*]] = getelementptr [[DS_STRUCT:%.*]], ptr [[PDS:%.*]], i32 0, i32 1
-; CHECK-NEXT:    [[BASE:%.*]] = load ptr, ptr [[PDS]], align 8, !tbaa !14
+; CHECK-NEXT:    [[BASE:%.*]] = load ptr, ptr [[PDS]], align 8, !tbaa [[TBAA14:![0-9]+]]
 ; CHECK-NEXT:    [[ABSADDR:%.*]] = getelementptr i64, ptr [[BASE]], i64 9
-; CHECK-NEXT:    [[EXTARGET:%.*]] = load i64, ptr [[ABSADDR]], align 8, !tbaa !4
+; CHECK-NEXT:    [[EXTARGET:%.*]] = load i64, ptr [[ABSADDR]], align 8, !tbaa [[TBAA4:![0-9]+]]
 ; CHECK-NEXT:    [[TEMPLATE:%.*]] = icmp eq i64 [[EXTARGET]], 8593987412
 ; CHECK-NEXT:    br i1 [[TEMPLATE]], label %"BB3.000316011731#1", label [[BB2_000316011731_5:%.*]]
 ; CHECK:       "BB3.000316011731#1":
 ; CHECK-NEXT:    [[PBASE8:%.*]] = getelementptr [32 x ptr], ptr [[PDS]], i64 0, i64 29
-; CHECK-NEXT:    [[BASE9:%.*]] = load ptr, ptr [[PBASE8]], align 8, !tbaa !14
+; CHECK-NEXT:    [[BASE9:%.*]] = load ptr, ptr [[PBASE8]], align 8, !tbaa [[TBAA14]]
 ; CHECK-NEXT:    [[ABSADDR1:%.*]] = getelementptr i64, ptr [[BASE9]], i64 7
-; CHECK-NEXT:    [[RMEM:%.*]] = load i64, ptr [[ABSADDR1]], align 8, !tbaa !4
+; CHECK-NEXT:    [[RMEM:%.*]] = load i64, ptr [[ABSADDR1]], align 8, !tbaa [[TBAA4]]
 ; CHECK-NEXT:    [[PWT:%.*]] = getelementptr [[DS_STRUCT]], ptr [[PDS]], i32 0, i32 2
 ; CHECK-NEXT:    [[PWTE:%.*]] = getelementptr [32 x i16], ptr [[PWT]], i64 0, i64 8593987412
-; CHECK-NEXT:    [[SHIFTS:%.*]] = load i16, ptr [[PWTE]], align 2, !tbaa !18, !invariant.load !20
+; CHECK-NEXT:    [[SHIFTS:%.*]] = load i16, ptr [[PWTE]], align 2, !tbaa [[TBAA18:![0-9]+]], !invariant.load [[META20:![0-9]+]]
 ; CHECK-NEXT:    [[SLOWJ:%.*]] = icmp eq i16 [[SHIFTS]], 0
 ; CHECK-NEXT:    br i1 [[SLOWJ]], label [[BB2_000316011731_5]], label %"BB3.000316011731#1.1"
 ; CHECK:       BB2.000316011731.5:
@@ -34,22 +34,22 @@ define %MNR_struct @f000316011717_2(ptr %pDS, ptr %pCG) #2 {
 ; CHECK-NEXT:    [[SHIFTS1:%.*]] = zext i16 [[SHIFTS]] to i64
 ; CHECK-NEXT:    [[VAL:%.*]] = call i64 @llvm.x86.bmi.bextr.64(i64 [[RMEM]], i64 [[SHIFTS1]])
 ; CHECK-NEXT:    [[PREG:%.*]] = getelementptr [64 x i64], ptr [[PCG:%.*]], i64 0, i64 12
-; CHECK-NEXT:    store i64 [[VAL]], ptr [[PREG]], align 32, !tbaa !10
+; CHECK-NEXT:    store i64 [[VAL]], ptr [[PREG]], align 32, !tbaa [[TBAA10:![0-9]+]]
 ; CHECK-NEXT:    [[PREG2:%.*]] = getelementptr [64 x i64], ptr [[PCG]], i64 0, i64 14
-; CHECK-NEXT:    [[REG:%.*]] = load i64, ptr [[PREG2]], align 16, !tbaa !12
-; CHECK-NEXT:    [[BASE2:%.*]] = load ptr, ptr [[PBASE8]], align 8, !tbaa !14
+; CHECK-NEXT:    [[REG:%.*]] = load i64, ptr [[PREG2]], align 16, !tbaa [[TBAA12:![0-9]+]]
+; CHECK-NEXT:    [[BASE2:%.*]] = load ptr, ptr [[PBASE8]], align 8, !tbaa [[TBAA14]]
 ; CHECK-NEXT:    [[ABSADDR2:%.*]] = getelementptr i64, ptr [[BASE2]], i64 [[REG]]
-; CHECK-NEXT:    [[RMEM2:%.*]] = load i64, ptr [[ABSADDR2]], align 8, !tbaa !1
+; CHECK-NEXT:    [[RMEM2:%.*]] = load i64, ptr [[ABSADDR2]], align 8, !tbaa [[TBAA1:![0-9]+]]
 ; CHECK-NEXT:    [[PREG7:%.*]] = getelementptr [64 x i64], ptr [[PCG]], i64 0, i64 9
-; CHECK-NEXT:    store i64 [[RMEM2]], ptr [[PREG7]], align 8, !tbaa !8
+; CHECK-NEXT:    store i64 [[RMEM2]], ptr [[PREG7]], align 8, !tbaa [[TBAA8:![0-9]+]]
 ; CHECK-NEXT:    [[ADD2C279:%.*]] = add i64 [[RMEM2]], [[VAL]]
 ; CHECK-NEXT:    [[CCHK:%.*]] = icmp sge i64 [[ADD2C279]], 0
 ; CHECK-NEXT:    [[CFL:%.*]] = zext i1 [[CCHK]] to i8
-; CHECK-NEXT:    store i8 [[CFL]], ptr [[PCARRY]], align 1, !tbaa !16
+; CHECK-NEXT:    store i8 [[CFL]], ptr [[PCARRY]], align 1, !tbaa [[TBAA16:![0-9]+]]
 ; CHECK-NEXT:    br label [[EXIT]]
 ; CHECK:       Exit:
 ; CHECK-NEXT:    [[RESTART378:%.*]] = load [[MNR_STRUCT]], ptr [[RESTART]], align 8
-; CHECK-NEXT:    ret [[MNR_STRUCT]] %restart378
+; CHECK-NEXT:    ret [[MNR_STRUCT]] [[RESTART378]]
 ;
 Entry:
   %restart = alloca %MNR_struct
diff --git a/llvm/test/Transforms/NewGVN/pr34452.ll b/llvm/test/Transforms/NewGVN/pr34452.ll
index f5d5fda343037c..9e65349a1b47be 100644
--- a/llvm/test/Transforms/NewGVN/pr34452.ll
+++ b/llvm/test/Transforms/NewGVN/pr34452.ll
@@ -9,7 +9,7 @@ source_filename = "bugpoint-output-09f7a24.bc"
 define void @sgrep() local_unnamed_addr #0 {
 ; CHECK-LABEL: @sgrep(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr @WHOLELINE, align 4, !tbaa !1
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr @WHOLELINE, align 4, !tbaa [[TBAA1:![0-9]+]]
 ; CHECK-NEXT:    [[TOBOOL:%.*]] = icmp eq i32 [[TMP0]], 0
 ; CHECK-NEXT:    [[DOT:%.*]] = select i1 [[TOBOOL]], i32 2048, i32 2047
 ; CHECK-NEXT:    br label [[WHILE_BODY_US:%.*]]
diff --git a/llvm/test/Transforms/NewGVN/pr42422-phi-of-ops.ll b/llvm/test/Transforms/NewGVN/pr42422-phi-of-ops.ll
index cbdf209d1ce50c..1312f9f4f02967 100644
--- a/llvm/test/Transforms/NewGVN/pr42422-phi-of-ops.ll
+++ b/llvm/test/Transforms/NewGVN/pr42422-phi-of-ops.ll
@@ -40,8 +40,8 @@ define void @d() {
 ; CHECK:       cleanup:
 ; CHECK-NEXT:    [[CLEANUP_DEST:%.*]] = phi i32 [ poison, [[IF_END12]] ], [ 1, [[IF_THEN11]] ], [ 9, [[IF_THEN]] ]
 ; CHECK-NEXT:    switch i32 [[CLEANUP_DEST]], label [[CLEANUP14]] [
-; CHECK-NEXT:    i32 0, label [[FOR_COND4]]
-; CHECK-NEXT:    i32 9, label [[FOR_END13:%.*]]
+; CHECK-NEXT:      i32 0, label [[FOR_COND4]]
+; CHECK-NEXT:      i32 9, label [[FOR_END13:%.*]]
 ; CHECK-NEXT:    ]
 ; CHECK:       for.end13:
 ; CHECK-NEXT:    br label [[CLEANUP14]]
diff --git a/llvm/test/Transforms/NewGVN/pr43441.ll b/llvm/test/Transforms/NewGVN/pr43441.ll
index 5c4a9c3bc7b5d1..a5f711dbd69e5f 100644
--- a/llvm/test/Transforms/NewGVN/pr43441.ll
+++ b/llvm/test/Transforms/NewGVN/pr43441.ll
@@ -1,15 +1,37 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt -passes=newgvn -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
-; CHECK-LABEL: @print_long_format()
 define dso_local void @print_long_format() #0 {
+; CHECK-LABEL: define dso_local void @print_long_format(
+; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    switch i32 undef, label [[SW_DEFAULT:%.*]] [
+; CHECK-NEXT:      i32 1, label [[SW_BB:%.*]]
+; CHECK-NEXT:      i32 0, label [[SW_BB19:%.*]]
+; CHECK-NEXT:      i32 2, label [[SW_BB23:%.*]]
+; CHECK-NEXT:    ]
+; CHECK:       sw.bb:
+; CHECK-NEXT:    unreachable
+; CHECK:       sw.bb19:
+; CHECK-NEXT:    br i1 undef, label [[IF_THEN37:%.*]], label [[IF_END50:%.*]]
+; CHECK:       sw.bb23:
+; CHECK-NEXT:    unreachable
+; CHECK:       sw.default:
+; CHECK-NEXT:    unreachable
+; CHECK:       if.then37:
+; CHECK-NEXT:    unreachable
+; CHECK:       if.end50:
+; CHECK-NEXT:    [[CALL180:%.*]] = call i32 @timespec_cmp() #[[ATTR2:[0-9]+]]
+; CHECK-NEXT:    ret void
+;
 entry:
   switch i32 undef, label %sw.default [
-    i32 1, label %sw.bb
-    i32 0, label %sw.bb19
-    i32 2, label %sw.bb23
+  i32 1, label %sw.bb
+  i32 0, label %sw.bb19
+  i32 2, label %sw.bb23
   ]
 
 sw.bb:                                            ; preds = %entry
diff --git a/llvm/test/Transforms/NewGVN/pre-compare.ll b/llvm/test/Transforms/NewGVN/pre-compare.ll
index 9fd20fcc76ed69..8e4e5f813e89a2 100644
--- a/llvm/test/Transforms/NewGVN/pre-compare.ll
+++ b/llvm/test/Transforms/NewGVN/pre-compare.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt -passes=newgvn -S < %s | FileCheck %s
 
 ; C source:
@@ -37,6 +38,28 @@
 @.str3 = private unnamed_addr constant [12 x i8] c"step 2: %d\0A\00", align 1
 
 define void @f(i32 %x) noreturn nounwind uwtable ssp {
+; CHECK-LABEL: define void @f(
+; CHECK-SAME: i32 [[X:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[X]], 1
+; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_COND_PREHEADER:%.*]], label [[IF_THEN:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i32 [[X]], 2
+; CHECK-NEXT:    [[COND:%.*]] = select i1 [[CMP1]], ptr @.str, ptr @.str1
+; CHECK-NEXT:    [[CALL:%.*]] = tail call i32 @puts(ptr [[COND]]) #[[ATTR1:[0-9]+]]
+; CHECK-NEXT:    br label [[FOR_COND_PREHEADER]]
+; CHECK:       for.cond.preheader:
+; CHECK-NEXT:    [[CMP3:%.*]] = icmp eq i32 [[X]], 2
+; CHECK-NEXT:    br label [[FOR_COND:%.*]]
+; CHECK:       for.cond:
+; CHECK-NEXT:    [[CALL2:%.*]] = tail call i32 @puts(ptr @.str2) #[[ATTR1]]
+; CHECK-NEXT:    br i1 [[CMP3]], label [[FOR_COND_BACKEDGE:%.*]], label [[IF_END5:%.*]]
+; CHECK:       if.end5:
+; CHECK-NEXT:    [[CALL6:%.*]] = tail call i32 (ptr, ...) @printf(ptr @.str3, i32 [[X]]) #[[ATTR1]]
+; CHECK-NEXT:    br label [[FOR_COND_BACKEDGE]]
+; CHECK:       for.cond.backedge:
+; CHECK-NEXT:    br label [[FOR_COND]]
+;
 entry:
   %cmp = icmp eq i32 %x, 1
   br i1 %cmp, label %for.cond.preheader, label %if.then
diff --git a/llvm/test/Transforms/NewGVN/preserve-metadata-for-predicate-replacements.ll b/llvm/test/Transforms/NewGVN/preserve-metadata-for-predicate-replacements.ll
index 1ca24af84e48aa..a63ca131b5c0dd 100644
--- a/llvm/test/Transforms/NewGVN/preserve-metadata-for-predicate-replacements.ll
+++ b/llvm/test/Transforms/NewGVN/preserve-metadata-for-predicate-replacements.ll
@@ -9,7 +9,7 @@ declare void @use(i32)
 define i32 @test(ptr %p1, ptr %p2, i1 %c) {
 ; CHECK-LABEL: @test(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[LV:%.*]] = load i32, ptr [[P1:%.*]], align 8, !tbaa !0
+; CHECK-NEXT:    [[LV:%.*]] = load i32, ptr [[P1:%.*]], align 8, !tbaa [[TBAA0:![0-9]+]]
 ; CHECK-NEXT:    [[CMP_1:%.*]] = icmp slt i32 [[LV]], 1
 ; CHECK-NEXT:    br i1 [[CMP_1]], label [[EXIT:%.*]], label [[IF_FALSE:%.*]]
 ; CHECK:       if.false:
diff --git a/llvm/test/Transforms/NewGVN/readattrs.ll b/llvm/test/Transforms/NewGVN/readattrs.ll
index 049a2fc911687b..544fe45be24976 100644
--- a/llvm/test/Transforms/NewGVN/readattrs.ll
+++ b/llvm/test/Transforms/NewGVN/readattrs.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt -passes=newgvn -S -o - < %s | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
@@ -6,12 +7,15 @@ target triple = "x86_64-unknown-linux-gnu"
 declare void @use(ptr readonly nocapture)
 
 define i8 @test() {
+; CHECK-LABEL: define i8 @test() {
+; CHECK-NEXT:    [[A:%.*]] = alloca i8, align 1
+; CHECK-NEXT:    store i8 1, ptr [[A]], align 1
+; CHECK-NEXT:    call void @use(ptr [[A]])
+; CHECK-NEXT:    ret i8 1
+;
   %a = alloca i8
   store i8 1, ptr %a
   call void @use(ptr %a)
   %b = load i8, ptr %a
   ret i8 %b
-; CHECK-LABEL: define i8 @test(
-; CHECK: call void @use(ptr %a)
-; CHECK-NEXT: ret i8 1
 }
diff --git a/llvm/test/Transforms/NewGVN/rle-nonlocal.ll b/llvm/test/Transforms/NewGVN/rle-nonlocal.ll
index c2fb39100da45f..efdfd1f6dc7b5b 100644
--- a/llvm/test/Transforms/NewGVN/rle-nonlocal.ll
+++ b/llvm/test/Transforms/NewGVN/rle-nonlocal.ll
@@ -7,14 +7,14 @@ define i32 @main(ptr %p, i32 %x, i32 %y) {
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[X:%.*]], [[Y:%.*]]
 ; CHECK-NEXT:    br i1 [[CMP]], label [[BLOCK2:%.*]], label [[BLOCK3:%.*]]
 ; CHECK:       block2:
-; CHECK-NEXT:    [[A:%.*]] = load ptr, ptr [[P:%.*]]
+; CHECK-NEXT:    [[A:%.*]] = load ptr, ptr [[P:%.*]], align 8
 ; CHECK-NEXT:    br label [[BLOCK4:%.*]]
 ; CHECK:       block3:
-; CHECK-NEXT:    [[B:%.*]] = load ptr, ptr [[P]]
+; CHECK-NEXT:    [[B:%.*]] = load ptr, ptr [[P]], align 8
 ; CHECK-NEXT:    br label [[BLOCK4]]
 ; CHECK:       block4:
 ; CHECK-NEXT:    [[EXISTINGPHI:%.*]] = phi ptr [ [[A]], [[BLOCK2]] ], [ [[B]], [[BLOCK3]] ]
-; CHECK-NEXT:    [[C:%.*]] = load i32, ptr [[EXISTINGPHI]]
+; CHECK-NEXT:    [[C:%.*]] = load i32, ptr [[EXISTINGPHI]], align 4
 ; CHECK-NEXT:    [[E:%.*]] = add i32 [[C]], [[C]]
 ; CHECK-NEXT:    ret i32 [[E]]
 ;
diff --git a/llvm/test/Transforms/NewGVN/rle.ll b/llvm/test/Transforms/NewGVN/rle.ll
index 1cfdc83f40da05..950c492d873652 100644
--- a/llvm/test/Transforms/NewGVN/rle.ll
+++ b/llvm/test/Transforms/NewGVN/rle.ll
@@ -1,15 +1,19 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt < %s -data-layout="e-p:32:32:32-p1:16:16:16-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-n8:16:32" -passes=newgvn,dce -S | FileCheck %s
 ; RUN: opt < %s -data-layout="E-p:32:32:32-p1:16:16:16-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-n32" -passes=newgvn,dce -S | FileCheck %s
 ; memset -> i16 forwarding.
 define signext i16 @memset_to_i16_local(ptr %A) nounwind ssp {
+; CHECK-LABEL: define signext i16 @memset_to_i16_local(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    tail call void @llvm.memset.p0.i64(ptr [[A]], i8 1, i64 200, i1 false)
+; CHECK-NEXT:    ret i16 257
+;
 entry:
   tail call void @llvm.memset.p0.i64(ptr %A, i8 1, i64 200, i1 false)
   %arrayidx = getelementptr inbounds i16, ptr %A, i64 42
   %tmp2 = load i16, ptr %arrayidx
   ret i16 %tmp2
-; CHECK-LABEL: @memset_to_i16_local(
-; CHECK-NOT: load
-; CHECK: ret i16 257
 }
 
 @GCst = constant {i32, float, i32 } { i32 42, float 14., i32 97 }
@@ -17,37 +21,48 @@ entry:
 
 ; memset -> float forwarding.
 define float @memcpy_to_float_local(ptr %A) nounwind ssp {
+; CHECK-LABEL: define float @memcpy_to_float_local(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    tail call void @llvm.memcpy.p0.p0.i64(ptr [[A]], ptr @GCst, i64 12, i1 false)
+; CHECK-NEXT:    ret float 1.400000e+01
+;
 entry:
   tail call void @llvm.memcpy.p0.p0.i64(ptr %A, ptr @GCst, i64 12, i1 false)
   %arrayidx = getelementptr inbounds float, ptr %A, i64 1 ; <ptr> [#uses=1]
   %tmp2 = load float, ptr %arrayidx                   ; <float> [#uses=1]
   ret float %tmp2
-; CHECK-LABEL: @memcpy_to_float_local(
-; CHECK-NOT: load
-; CHECK: ret float 1.400000e+01
 }
 ; memcpy from address space 1
 define float @memcpy_to_float_local_as1(ptr %A) nounwind ssp {
+; CHECK-LABEL: define float @memcpy_to_float_local_as1(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    tail call void @llvm.memcpy.p0.p1.i64(ptr [[A]], ptr addrspace(1) @GCst_as1, i64 12, i1 false)
+; CHECK-NEXT:    ret float 1.400000e+01
+;
 entry:
   tail call void @llvm.memcpy.p0.p1.i64(ptr %A, ptr addrspace(1) @GCst_as1, i64 12, i1 false)
   %arrayidx = getelementptr inbounds float, ptr %A, i64 1 ; <ptr> [#uses=1]
   %tmp2 = load float, ptr %arrayidx                   ; <float> [#uses=1]
   ret float %tmp2
-; CHECK-LABEL: @memcpy_to_float_local_as1(
-; CHECK-NOT: load
-; CHECK: ret float 1.400000e+01
 }
 
 ; PR6642
 define i32 @memset_to_load() nounwind readnone {
+; CHECK-LABEL: define i32 @memset_to_load(
+; CHECK-SAME: ) #[[ATTR1:[0-9]+]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[X:%.*]] = alloca [256 x i32], align 4
+; CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 4 [[X]], i8 0, i64 1024, i1 false)
+; CHECK-NEXT:    ret i32 0
+;
 entry:
   %x = alloca [256 x i32], align 4                ; <ptr> [#uses=2]
   call void @llvm.memset.p0.i64(ptr align 4 %x, i8 0, i64 1024, i1 false)
   %arraydecay = getelementptr inbounds [256 x i32], ptr %x, i32 0, i32 0 ; <ptr>
   %tmp1 = load i32, ptr %arraydecay                   ; <i32> [#uses=1]
   ret i32 %tmp1
-; CHECK-LABEL: @memset_to_load(
-; CHECK: ret i32 0
 }
 declare void @llvm.memset.p0.i64(ptr nocapture, i8, i64, i1) nounwind
 
diff --git a/llvm/test/Transforms/NewGVN/simp-to-self.ll b/llvm/test/Transforms/NewGVN/simp-to-self.ll
index fb8a01962959bc..f9a0ec257259a1 100644
--- a/llvm/test/Transforms/NewGVN/simp-to-self.ll
+++ b/llvm/test/Transforms/NewGVN/simp-to-self.ll
@@ -1,13 +1,22 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt -S < %s -passes=newgvn | FileCheck %s
 
-; CHECK-LABEL: for.cond:
-; CHECK-NEXT:    %lv = load i32, ptr @a
-; CHECK-NEXT:    %bf.clear = and i32 %lv, -131072
-; CHECK-NEXT:    %bf.set = or i32 1, %bf.clear
-; CHECK-NEXT:    br i1 %bc, label %for.cond, label %exit
 @a = external global i64
 
 define void @fn1(i1 %bc) {
+; CHECK-LABEL: define void @fn1(
+; CHECK-SAME: i1 [[BC:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[FOR_COND:%.*]]
+; CHECK:       for.cond:
+; CHECK-NEXT:    [[LV:%.*]] = load i32, ptr @a, align 4
+; CHECK-NEXT:    [[BF_CLEAR:%.*]] = and i32 [[LV]], -131072
+; CHECK-NEXT:    [[BF_SET:%.*]] = or i32 1, [[BF_CLEAR]]
+; CHECK-NEXT:    br i1 [[BC]], label [[FOR_COND]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    store i32 [[BF_SET]], ptr @a, align 4
+; CHECK-NEXT:    ret void
+;
 entry:
   br label %for.cond
 
diff --git a/llvm/test/Transforms/NewGVN/stale-loop-info.ll b/llvm/test/Transforms/NewGVN/stale-loop-info.ll
index 8870824d38ad6d..7abe80b005ecf2 100644
--- a/llvm/test/Transforms/NewGVN/stale-loop-info.ll
+++ b/llvm/test/Transforms/NewGVN/stale-loop-info.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt -passes='require<loops>,newgvn' -S < %s | FileCheck %s
 
 ; This used to fail with ASAN enabled and if for some reason LoopInfo remained
@@ -14,6 +15,29 @@ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
 declare void @snork.1(ptr) local_unnamed_addr #0
 
 define hidden zeroext i1 @eggs(ptr %arg, i1 %arg2) unnamed_addr align 2 {
+; CHECK-LABEL: define hidden zeroext i1 @eggs(
+; CHECK-SAME: ptr [[ARG:%.*]], i1 [[ARG2:%.*]]) unnamed_addr align 2 {
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    br i1 [[ARG2]], label [[BB14:%.*]], label [[BB3:%.*]]
+; CHECK:       bb3:
+; CHECK-NEXT:    [[TMP:%.*]] = getelementptr inbounds [[STRUCT_WIBBLE_1028:%.*]], ptr [[ARG]], i64 0, i32 2, i32 0, i32 0, i64 0
+; CHECK-NEXT:    br label [[BB6:%.*]]
+; CHECK:       bb6:
+; CHECK-NEXT:    br label [[BB7:%.*]]
+; CHECK:       bb7:
+; CHECK-NEXT:    br i1 undef, label [[BB11:%.*]], label [[BB8:%.*]]
+; CHECK:       bb8:
+; CHECK-NEXT:    [[TMP9:%.*]] = load ptr, ptr [[TMP]], align 8
+; CHECK-NEXT:    br label [[BB12:%.*]]
+; CHECK:       bb11:
+; CHECK-NEXT:    br label [[BB12]]
+; CHECK:       bb12:
+; CHECK-NEXT:    [[TMP13:%.*]] = phi ptr [ [[TMP]], [[BB11]] ], [ [[TMP9]], [[BB8]] ]
+; CHECK-NEXT:    call void @snork.1(ptr [[TMP13]]) #[[ATTR1:[0-9]+]]
+; CHECK-NEXT:    br label [[BB6]]
+; CHECK:       bb14:
+; CHECK-NEXT:    ret i1 false
+;
 bb:
   br i1 %arg2, label %bb14, label %bb3
 
@@ -29,7 +53,6 @@ bb7:                                              ; preds = %bb6
 
 bb8:                                              ; preds = %bb7
   %tmp9 = load ptr, ptr %tmp, align 8
-; CHECK: %tmp9 = load ptr, ptr %tmp, align 8
   br label %bb12
 
 bb11:                                             ; preds = %bb7
diff --git a/llvm/test/Transforms/NewGVN/tbaa.ll b/llvm/test/Transforms/NewGVN/tbaa.ll
index e6d66dca0decff..335e782acc8bcd 100644
--- a/llvm/test/Transforms/NewGVN/tbaa.ll
+++ b/llvm/test/Transforms/NewGVN/tbaa.ll
@@ -1,10 +1,13 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt -passes=newgvn -S < %s | FileCheck %s
 
 define i32 @test1(ptr %p, ptr %q) {
-; CHECK-LABEL: @test1(ptr %p, ptr %q)
-; CHECK: call i32 @foo(ptr %p)
-; CHECK-NOT: tbaa
-; CHECK: %c = add i32 %a, %a
+; CHECK-LABEL: define i32 @test1(
+; CHECK-SAME: ptr [[P:%.*]], ptr [[Q:%.*]]) {
+; CHECK-NEXT:    [[A:%.*]] = call i32 @foo(ptr [[P]])
+; CHECK-NEXT:    [[C:%.*]] = add i32 [[A]], [[A]]
+; CHECK-NEXT:    ret i32 [[C]]
+;
   %a = call i32 @foo(ptr %p), !tbaa !0
   %b = call i32 @foo(ptr %p)
   %c = add i32 %a, %b
@@ -12,9 +15,12 @@ define i32 @test1(ptr %p, ptr %q) {
 }
 
 define i32 @test2(ptr %p, ptr %q) {
-; CHECK-LABEL: @test2(ptr %p, ptr %q)
-; CHECK: call i32 @foo(ptr %p), !tbaa [[TAGC:!.*]]
-; CHECK: %c = add i32 %a, %a
+; CHECK-LABEL: define i32 @test2(
+; CHECK-SAME: ptr [[P:%.*]], ptr [[Q:%.*]]) {
+; CHECK-NEXT:    [[A:%.*]] = call i32 @foo(ptr [[P]]), !tbaa [[TBAA0:![0-9]+]]
+; CHECK-NEXT:    [[C:%.*]] = add i32 [[A]], [[A]]
+; CHECK-NEXT:    ret i32 [[C]]
+;
   %a = call i32 @foo(ptr %p), !tbaa !0
   %b = call i32 @foo(ptr %p), !tbaa !0
   %c = add i32 %a, %b
@@ -22,9 +28,12 @@ define i32 @test2(ptr %p, ptr %q) {
 }
 
 define i32 @test3(ptr %p, ptr %q) {
-; CHECK-LABEL: @test3(ptr %p, ptr %q)
-; CHECK: call i32 @foo(ptr %p), !tbaa [[TAGB:!.*]]
-; CHECK: %c = add i32 %a, %a
+; CHECK-LABEL: define i32 @test3(
+; CHECK-SAME: ptr [[P:%.*]], ptr [[Q:%.*]]) {
+; CHECK-NEXT:    [[A:%.*]] = call i32 @foo(ptr [[P]]), !tbaa [[TBAA4:![0-9]+]]
+; CHECK-NEXT:    [[C:%.*]] = add i32 [[A]], [[A]]
+; CHECK-NEXT:    ret i32 [[C]]
+;
   %a = call i32 @foo(ptr %p), !tbaa !3
   %b = call i32 @foo(ptr %p), !tbaa !3
   %c = add i32 %a, %b
@@ -32,9 +41,12 @@ define i32 @test3(ptr %p, ptr %q) {
 }
 
 define i32 @test4(ptr %p, ptr %q) {
-; CHECK-LABEL: @test4(ptr %p, ptr %q)
-; CHECK: call i32 @foo(ptr %p), !tbaa [[TAGA:!.*]]
-; CHECK: %c = add i32 %a, %a
+; CHECK-LABEL: define i32 @test4(
+; CHECK-SAME: ptr [[P:%.*]], ptr [[Q:%.*]]) {
+; CHECK-NEXT:    [[A:%.*]] = call i32 @foo(ptr [[P]]), !tbaa [[TBAA6:![0-9]+]]
+; CHECK-NEXT:    [[C:%.*]] = add i32 [[A]], [[A]]
+; CHECK-NEXT:    ret i32 [[C]]
+;
   %a = call i32 @foo(ptr %p), !tbaa !1
   %b = call i32 @foo(ptr %p), !tbaa !0
   %c = add i32 %a, %b
@@ -42,9 +54,12 @@ define i32 @test4(ptr %p, ptr %q) {
 }
 
 define i32 @test5(ptr %p, ptr %q) {
-; CHECK-LABEL: @test5(ptr %p, ptr %q)
-; CHECK: call i32 @foo(ptr %p), !tbaa [[TAGA]]
-; CHECK: %c = add i32 %a, %a
+; CHECK-LABEL: define i32 @test5(
+; CHECK-SAME: ptr [[P:%.*]], ptr [[Q:%.*]]) {
+; CHECK-NEXT:    [[A:%.*]] = call i32 @foo(ptr [[P]]), !tbaa [[TBAA6]]
+; CHECK-NEXT:    [[C:%.*]] = add i32 [[A]], [[A]]
+; CHECK-NEXT:    ret i32 [[C]]
+;
   %a = call i32 @foo(ptr %p), !tbaa !0
   %b = call i32 @foo(ptr %p), !tbaa !1
   %c = add i32 %a, %b
@@ -52,9 +67,12 @@ define i32 @test5(ptr %p, ptr %q) {
 }
 
 define i32 @test6(ptr %p, ptr %q) {
-; CHECK-LABEL: @test6(ptr %p, ptr %q)
-; CHECK: call i32 @foo(ptr %p), !tbaa [[TAGA]]
-; CHECK: %c = add i32 %a, %a
+; CHECK-LABEL: define i32 @test6(
+; CHECK-SAME: ptr [[P:%.*]], ptr [[Q:%.*]]) {
+; CHECK-NEXT:    [[A:%.*]] = call i32 @foo(ptr [[P]]), !tbaa [[TBAA6]]
+; CHECK-NEXT:    [[C:%.*]] = add i32 [[A]], [[A]]
+; CHECK-NEXT:    ret i32 [[C]]
+;
   %a = call i32 @foo(ptr %p), !tbaa !0
   %b = call i32 @foo(ptr %p), !tbaa !3
   %c = add i32 %a, %b
@@ -62,10 +80,12 @@ define i32 @test6(ptr %p, ptr %q) {
 }
 
 define i32 @test7(ptr %p, ptr %q) {
-; CHECK-LABEL: @test7(ptr %p, ptr %q)
-; CHECK: call i32 @foo(ptr %p)
-; CHECK-NOT: tbaa
-; CHECK: %c = add i32 %a, %a
+; CHECK-LABEL: define i32 @test7(
+; CHECK-SAME: ptr [[P:%.*]], ptr [[Q:%.*]]) {
+; CHECK-NEXT:    [[A:%.*]] = call i32 @foo(ptr [[P]])
+; CHECK-NEXT:    [[C:%.*]] = add i32 [[A]], [[A]]
+; CHECK-NEXT:    ret i32 [[C]]
+;
   %a = call i32 @foo(ptr %p), !tbaa !4
   %b = call i32 @foo(ptr %p), !tbaa !3
   %c = add i32 %a, %b
@@ -73,9 +93,11 @@ define i32 @test7(ptr %p, ptr %q) {
 }
 
 define i32 @test8(ptr %p, ptr %q) {
-; CHECK-LABEL: @test8
-; CHECK-NEXT: store i32 15, ptr %p
-; CHECK-NEXT: ret i32 0
+; CHECK-LABEL: define i32 @test8(
+; CHECK-SAME: ptr [[P:%.*]], ptr [[Q:%.*]]) {
+; CHECK-NEXT:    store i32 15, ptr [[P]], align 4
+; CHECK-NEXT:    ret i32 0
+;
 ; Since we know the location is invariant, we can forward the
 ; load across the potentially aliasing store.
 
@@ -87,9 +109,11 @@ define i32 @test8(ptr %p, ptr %q) {
 }
 
 define i32 @test9(ptr %p, ptr %q) {
-; CHECK-LABEL: @test9
-; CHECK-NEXT: call void @clobber()
-; CHECK-NEXT: ret i32 0
+; CHECK-LABEL: define i32 @test9(
+; CHECK-SAME: ptr [[P:%.*]], ptr [[Q:%.*]]) {
+; CHECK-NEXT:    call void @clobber()
+; CHECK-NEXT:    ret i32 0
+;
 ; Since we know the location is invariant, we can forward the
 ; load across the potentially aliasing store (within the call).
 
@@ -103,9 +127,12 @@ define i32 @test9(ptr %p, ptr %q) {
 define i32 @test10(ptr %p, ptr %q) {
 ; If one access encloses the other, then the merged access is the enclosed one
 ; and not just the common final access type.
-; CHECK-LABEL: @test10
-; CHECK: call i32 @foo(ptr %p), !tbaa [[TAG_X_i:!.*]]
-; CHECK: %c = add i32 %a, %a
+; CHECK-LABEL: define i32 @test10(
+; CHECK-SAME: ptr [[P:%.*]], ptr [[Q:%.*]]) {
+; CHECK-NEXT:    [[A:%.*]] = call i32 @foo(ptr [[P]]), !tbaa [[TBAA7:![0-9]+]]
+; CHECK-NEXT:    [[C:%.*]] = add i32 [[A]], [[A]]
+; CHECK-NEXT:    ret i32 [[C]]
+;
   %a = call i32 @foo(ptr %p), !tbaa !15  ; TAG_X_i
   %b = call i32 @foo(ptr %p), !tbaa !19  ; TAG_Y_x_i
   %c = add i32 %a, %b
@@ -115,12 +142,6 @@ define i32 @test10(ptr %p, ptr %q) {
 declare void @clobber()
 declare i32 @foo(ptr) readonly
 
-; CHECK-DAG: [[TAGC]] = !{[[TYPEC:!.*]], [[TYPEC]], i64 0}
-; CHECK-DAG: [[TYPEC]] = !{!"C", [[TYPEA:!.*]]}
-; CHECK-DAG: [[TYPEA]] = !{!"A", !{{.*}}}
-; CHECK-DAG: [[TAGB]] = !{[[TYPEB:!.*]], [[TYPEB]], i64 0}
-; CHECK-DAG: [[TYPEB]] = !{!"B", [[TYPEA]]}
-; CHECK-DAG: [[TAGA]] = !{[[TYPEA]], [[TYPEA]], i64 0}
 !0 = !{!5, !5, i64 0}
 !1 = !{!6, !6, i64 0}
 !2 = !{!"tbaa root"}
@@ -132,9 +153,6 @@ declare i32 @foo(ptr) readonly
 !8 = !{!"another root"}
 !11 = !{!"scalar type", !8}
 
-; CHECK-DAG: [[TAG_X_i]] = !{[[TYPE_X:!.*]], [[TYPE_int:!.*]], i64 0}
-; CHECK-DAG: [[TYPE_X:!.*]] = !{!"struct X", [[TYPE_int]], i64 0}
-; CHECK-DAG: [[TYPE_int]] = !{!"int", {{!.*}}, i64 0}
 !15 = !{!16, !17, i64 0}            ; TAG_X_i
 !16 = !{!"struct X", !17, i64 0}    ; struct X { int i; };
 !17 = !{!"int", !18, i64 0}
@@ -146,3 +164,16 @@ declare i32 @foo(ptr) readonly
 ; A TBAA structure who's only point is to have a constant location.
 !9 = !{!"yet another root"}
 !10 = !{!"node", !9, i64 1}
+;.
+; CHECK: [[TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0}
+; CHECK: [[META1]] = !{!"C", [[META2:![0-9]+]]}
+; CHECK: [[META2]] = !{!"A", [[META3:![0-9]+]]}
+; CHECK: [[META3]] = !{!"tbaa root"}
+; CHECK: [[TBAA4]] = !{[[META5:![0-9]+]], [[META5]], i64 0}
+; CHECK: [[META5]] = !{!"B", [[META2]]}
+; CHECK: [[TBAA6]] = !{[[META2]], [[META2]], i64 0}
+; CHECK: [[TBAA7]] = !{[[META8:![0-9]+]], [[META9:![0-9]+]], i64 0}
+; CHECK: [[META8]] = !{!"struct X", [[META9]], i64 0}
+; CHECK: [[META9]] = !{!"int", [[META10:![0-9]+]], i64 0}
+; CHECK: [[META10]] = !{!"char", [[META3]], i64 0}
+;.
diff --git a/llvm/test/Transforms/NewGVN/unreachable_block_infinite_loop.ll b/llvm/test/Transforms/NewGVN/unreachable_block_infinite_loop.ll
index 7fbf50680a32e9..70e5e1a138da7e 100644
--- a/llvm/test/Transforms/NewGVN/unreachable_block_infinite_loop.ll
+++ b/llvm/test/Transforms/NewGVN/unreachable_block_infinite_loop.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt -passes=newgvn -disable-output < %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
@@ -5,11 +6,11 @@ target triple = "x86_64-apple-darwin10.0"
 
 define i32 @test2() nounwind ssp {
 entry:
-    ret i32 0
+  ret i32 0
 
 unreachable_block:
-    %a = add i32 %a, 1
-    ret i32 %a
+  %a = add i32 %a, 1
+  ret i32 %a
 }
 
 define i32 @pr23096_test0() {
diff --git a/llvm/test/Transforms/NewGVN/verify-memoryphi.ll b/llvm/test/Transforms/NewGVN/verify-memoryphi.ll
index b863d23cffdc3f..2a1fcf35157f46 100644
--- a/llvm/test/Transforms/NewGVN/verify-memoryphi.ll
+++ b/llvm/test/Transforms/NewGVN/verify-memoryphi.ll
@@ -1,21 +1,22 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; Skip dead MemoryPhis when performing memory congruency verification
 ; in NewGVN.
 ; RUN: opt -S -passes=newgvn %s | FileCheck %s
 ; REQUIRES: asserts
 
-; CHECK: define void @tinkywinky() {
-; CHECK-NEXT: entry:
-; CHECK-NEXT:   br i1 false, label %body, label %end
-; CHECK:      body:
-; CHECK-NEXT:   store i8 poison, ptr null
-; CHECK-NEXT:   br label %end
-; CHECK:      end:
-; CHECK-NEXT:   ret void
-; CHECK-NEXT: }
 
 declare void @llvm.lifetime.start.p0(i64, ptr nocapture)
 
 define void @tinkywinky() {
+; CHECK-LABEL: define void @tinkywinky() {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 false, label [[BODY:%.*]], label [[END:%.*]]
+; CHECK:       body:
+; CHECK-NEXT:    store i8 poison, ptr null, align 1
+; CHECK-NEXT:    br label [[END]]
+; CHECK:       end:
+; CHECK-NEXT:    ret void
+;
 entry:
   call void @llvm.lifetime.start.p0(i64 4, ptr undef)
   br i1 false, label %body, label %end
diff --git a/llvm/test/Transforms/NewGVN/volatile-nonvolatile.ll b/llvm/test/Transforms/NewGVN/volatile-nonvolatile.ll
index 2febea7e948196..d6daff99591f9a 100644
--- a/llvm/test/Transforms/NewGVN/volatile-nonvolatile.ll
+++ b/llvm/test/Transforms/NewGVN/volatile-nonvolatile.ll
@@ -1,13 +1,19 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt -passes=newgvn -S < %s | FileCheck %s
 
 %struct.t = type { ptr }
 
 ; The loaded address and the location of the address itself are not aliased,
 ; so the second reload is not necessary. Check that it can be eliminated.
-; CHECK-LABEL: test1
-; CHECK: load
-; CHECK-NOT: load
 define void @test1(ptr nocapture readonly %p, i32 %v) #0 {
+; CHECK-LABEL: define void @test1(
+; CHECK-SAME: ptr nocapture readonly [[P:%.*]], i32 [[V:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[P]], align 4, !tbaa [[TBAA0:![0-9]+]]
+; CHECK-NEXT:    store volatile i32 [[V]], ptr [[TMP0]], align 4, !tbaa [[TBAA5:![0-9]+]]
+; CHECK-NEXT:    store volatile i32 [[V]], ptr [[TMP0]], align 4, !tbaa [[TBAA5]]
+; CHECK-NEXT:    ret void
+;
 entry:
   %0 = load ptr, ptr %p, align 4, !tbaa !1
   store volatile i32 %v, ptr %0, align 4, !tbaa !6
@@ -18,11 +24,16 @@ entry:
 
 ; The store via the loaded address may overwrite the address itself.
 ; Make sure that both loads remain.
-; CHECK-LABEL: test2
-; CHECK: load
-; CHECK: store
-; CHECK: load
 define void @test2(ptr nocapture readonly %p, i32 %v) #0 {
+; CHECK-LABEL: define void @test2(
+; CHECK-SAME: ptr nocapture readonly [[P:%.*]], i32 [[V:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[P]], align 4, !tbaa [[TBAA0]]
+; CHECK-NEXT:    store volatile i32 [[V]], ptr [[TMP0]], align 4, !tbaa [[TBAA0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[P]], align 4, !tbaa [[TBAA0]]
+; CHECK-NEXT:    store volatile i32 [[V]], ptr [[TMP1]], align 4, !tbaa [[TBAA0]]
+; CHECK-NEXT:    ret void
+;
 entry:
   %0 = load ptr, ptr %p, align 4, !tbaa !1
   store volatile i32 %v, ptr %0, align 4, !tbaa !1
@@ -33,11 +44,16 @@ entry:
 
 ; The loads are ordered and non-monotonic. Although they are not aliased to
 ; the stores, make sure both are preserved.
-; CHECK-LABEL: test3
-; CHECK: load
-; CHECK: store
-; CHECK: load
 define void @test3(ptr nocapture readonly %p, i32 %v) #0 {
+; CHECK-LABEL: define void @test3(
+; CHECK-SAME: ptr nocapture readonly [[P:%.*]], i32 [[V:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load atomic ptr, ptr [[P]] acquire, align 4, !tbaa [[TBAA0]]
+; CHECK-NEXT:    store volatile i32 [[V]], ptr [[TMP0]], align 4, !tbaa [[TBAA5]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load atomic ptr, ptr [[P]] acquire, align 4, !tbaa [[TBAA0]]
+; CHECK-NEXT:    store volatile i32 [[V]], ptr [[TMP1]], align 4, !tbaa [[TBAA5]]
+; CHECK-NEXT:    ret void
+;
 entry:
   %0 = load atomic ptr, ptr %p acquire, align 4, !tbaa !1
   store volatile i32 %v, ptr %0, align 4, !tbaa !6
@@ -56,3 +72,12 @@ attributes #0 = { norecurse nounwind }
 !6 = !{!7, !7, i64 0}
 !7 = !{!"int", !4, i64 0}
 
+;.
+; CHECK: [[TBAA0]] = !{[[META1:![0-9]+]], [[META2:![0-9]+]], i64 0}
+; CHECK: [[META1]] = !{!"", [[META2]], i64 0}
+; CHECK: [[META2]] = !{!"any pointer", [[META3:![0-9]+]], i64 0}
+; CHECK: [[META3]] = !{!"omnipotent char", [[META4:![0-9]+]], i64 0}
+; CHECK: [[META4]] = !{!"Simple C/C++ TBAA"}
+; CHECK: [[TBAA5]] = !{[[META6:![0-9]+]], [[META6]], i64 0}
+; CHECK: [[META6]] = !{!"int", [[META3]], i64 0}
+;.
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/hoisting-sinking-required-for-vectorization.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/hoisting-sinking-required-for-vectorization.ll
index 33cfd0aa8574e0..ad100c399c08ed 100644
--- a/llvm/test/Transforms/PhaseOrdering/AArch64/hoisting-sinking-required-for-vectorization.ll
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/hoisting-sinking-required-for-vectorization.ll
@@ -164,10 +164,10 @@ define void @loop2(ptr %A, ptr %B, ptr %C, float %x) {
 ; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[TMP8]], i64 16
 ; CHECK-NEXT:    [[WIDE_LOAD10:%.*]] = load <4 x float>, ptr [[TMP8]], align 4, !alias.scope [[META9:![0-9]+]], !noalias [[META11:![0-9]+]]
 ; CHECK-NEXT:    [[WIDE_LOAD11:%.*]] = load <4 x float>, ptr [[TMP9]], align 4, !alias.scope [[META9]], !noalias [[META11]]
-; CHECK-NEXT:    [[TMP10:%.*]] = select <4 x i1> [[TMP2]], <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, <4 x float> [[WIDE_LOAD10]]
-; CHECK-NEXT:    [[PREDPHI:%.*]] = fadd <4 x float> [[TMP6]], [[TMP10]]
-; CHECK-NEXT:    [[TMP11:%.*]] = select <4 x i1> [[TMP3]], <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, <4 x float> [[WIDE_LOAD11]]
-; CHECK-NEXT:    [[PREDPHI12:%.*]] = fadd <4 x float> [[TMP7]], [[TMP11]]
+; CHECK-NEXT:    [[TMP10:%.*]] = fadd <4 x float> [[TMP6]], [[WIDE_LOAD10]]
+; CHECK-NEXT:    [[TMP11:%.*]] = fadd <4 x float> [[TMP7]], [[WIDE_LOAD11]]
+; CHECK-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[TMP2]], <4 x float> [[TMP6]], <4 x float> [[TMP10]]
+; CHECK-NEXT:    [[PREDPHI12:%.*]] = select <4 x i1> [[TMP3]], <4 x float> [[TMP7]], <4 x float> [[TMP11]]
 ; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i64 16
 ; CHECK-NEXT:    store <4 x float> [[PREDPHI]], ptr [[TMP8]], align 4, !alias.scope [[META9]], !noalias [[META11]]
 ; CHECK-NEXT:    store <4 x float> [[PREDPHI12]], ptr [[TMP12]], align 4, !alias.scope [[META9]], !noalias [[META11]]
diff --git a/llvm/test/Transforms/SCCP/add-nuw-nsw-flags.ll b/llvm/test/Transforms/SCCP/add-nuw-nsw-flags.ll
index b8f5d5dba0c4b2..05d9acd1919629 100644
--- a/llvm/test/Transforms/SCCP/add-nuw-nsw-flags.ll
+++ b/llvm/test/Transforms/SCCP/add-nuw-nsw-flags.ll
@@ -240,3 +240,32 @@ then:
 else:
   ret i16 0
 }
+
+define i1 @test_add_nuw_sub(i32 %a) {
+; CHECK-LABEL: @test_add_nuw_sub(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ADD:%.*]] = add nuw i32 [[A:%.*]], 10000
+; CHECK-NEXT:    [[SUB:%.*]] = add i32 [[ADD]], -5000
+; CHECK-NEXT:    ret i1 false
+;
+entry:
+  %add = add nuw i32 %a, 10000
+  %sub = add i32 %add, -5000
+  %cond = icmp ult i32 %sub, 5000
+  ret i1 %cond
+}
+
+define i1 @test_add_nsw_sub(i32 %a) {
+; CHECK-LABEL: @test_add_nsw_sub(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[A:%.*]], 10000
+; CHECK-NEXT:    [[SUB:%.*]] = add nsw i32 [[ADD]], -5000
+; CHECK-NEXT:    [[COND:%.*]] = icmp ult i32 [[SUB]], 5000
+; CHECK-NEXT:    ret i1 [[COND]]
+;
+entry:
+  %add = add nsw i32 %a, 10000
+  %sub = add i32 %add, -5000
+  %cond = icmp ult i32 %sub, 5000
+  ret i1 %cond
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll
index cef791633655a8..5e3fd156666f5f 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll
@@ -17,12 +17,13 @@ define void @test1(<4 x i16> %a, <4 x i16> %b, ptr %p) {
 ; CHECK-NEXT:    [[GEP0:%.*]] = getelementptr inbounds i64, ptr [[P:%.*]], i64 [[S0]]
 ; CHECK-NEXT:    [[LOAD0:%.*]] = load i64, ptr [[GEP0]], align 4
 ; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <4 x i32> [[SUB0]], <4 x i32> poison, <2 x i32> <i32 1, i32 2>
-; CHECK-NEXT:    [[TMP1:%.*]] = sext <2 x i32> [[TMP0]] to <2 x i64>
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[TMP0]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = sext i32 [[TMP1]] to i64
 ; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[TMP2]]
 ; CHECK-NEXT:    [[LOAD1:%.*]] = load i64, ptr [[GEP1]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1
-; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i32> [[TMP0]], i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = sext i32 [[TMP3]] to i64
+; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[TMP4]]
 ; CHECK-NEXT:    [[LOAD2:%.*]] = load i64, ptr [[GEP2]], align 4
 ; CHECK-NEXT:    [[E3:%.*]] = extractelement <4 x i32> [[SUB0]], i32 3
 ; CHECK-NEXT:    [[S3:%.*]] = sext i32 [[E3]] to i64
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-buildvector-with-minbitwidth-user.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-buildvector-with-minbitwidth-user.ll
index 705e425d3e445c..6907724729759f 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-buildvector-with-minbitwidth-user.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-buildvector-with-minbitwidth-user.ll
@@ -6,9 +6,10 @@ define void @h() {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr i8, ptr null, i64 16
 ; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <8 x i32> <i32 undef, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>, i32 0, i32 0
-; CHECK-NEXT:    [[TMP1:%.*]] = or <8 x i32> zeroinitializer, [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = or <8 x i32> [[TMP1]], zeroinitializer
-; CHECK-NEXT:    [[TMP3:%.*]] = trunc <8 x i32> [[TMP2]] to <8 x i16>
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc <8 x i32> [[TMP0]] to <8 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = or <8 x i1> zeroinitializer, [[TMP1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = or <8 x i1> [[TMP2]], zeroinitializer
+; CHECK-NEXT:    [[TMP3:%.*]] = zext <8 x i1> [[TMP4]] to <8 x i16>
 ; CHECK-NEXT:    store <8 x i16> [[TMP3]], ptr [[ARRAYIDX2]], align 2
 ; CHECK-NEXT:    ret void
 ;
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-with-minbith-user.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-with-minbith-user.ll
index 9566c00dd63006..d51ef0bce3a4e0 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-with-minbith-user.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-with-minbith-user.ll
@@ -5,11 +5,12 @@ define void @h() {
 ; CHECK-LABEL: define void @h() {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr i8, ptr null, i64 16
-; CHECK-NEXT:    [[TMP0:%.*]] = sub <8 x i32> zeroinitializer, zeroinitializer
-; CHECK-NEXT:    [[TMP1:%.*]] = add <8 x i32> zeroinitializer, zeroinitializer
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT:    [[TMP3:%.*]] = or <8 x i32> [[TMP2]], zeroinitializer
-; CHECK-NEXT:    [[TMP4:%.*]] = trunc <8 x i32> [[TMP3]] to <8 x i16>
+; CHECK-NEXT:    [[TMP0:%.*]] = trunc <8 x i32> zeroinitializer to <8 x i1>
+; CHECK-NEXT:    [[TMP1:%.*]] = sub <8 x i1> [[TMP0]], zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = add <8 x i1> [[TMP0]], zeroinitializer
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[TMP5:%.*]] = or <8 x i1> [[TMP3]], zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = zext <8 x i1> [[TMP5]] to <8 x i16>
 ; CHECK-NEXT:    store <8 x i16> [[TMP4]], ptr [[ARRAYIDX2]], align 2
 ; CHECK-NEXT:    ret void
 ;
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr2.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr2.ll
index 47485e514ec2fc..1cce52060c479f 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr2.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr2.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ;test_i16_extend NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -mtriple=aarch64--linux-gnu -passes=slp-vectorizer,dce,instcombine -slp-threshold=-7 -pass-remarks-output=%t < %s | FileCheck %s
+; RUN: opt -S -mtriple=aarch64--linux-gnu -passes=slp-vectorizer,dce,instcombine -slp-threshold=-5 -pass-remarks-output=%t < %s | FileCheck %s
 ; RUN: cat %t | FileCheck -check-prefix=YAML %s
-; RUN: opt -S -mtriple=aarch64--linux-gnu -passes='slp-vectorizer,dce,instcombine' -slp-threshold=-7 -pass-remarks-output=%t < %s | FileCheck %s
+; RUN: opt -S -mtriple=aarch64--linux-gnu -passes='slp-vectorizer,dce,instcombine' -slp-threshold=-5 -pass-remarks-output=%t < %s | FileCheck %s
 ; RUN: cat %t | FileCheck -check-prefix=YAML %s
 
 
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll
index 1986b51ec94828..7c5f9847db1f41 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll
@@ -228,7 +228,7 @@ for.end:                                          ; preds = %for.end.loopexit, %
 ; YAML-NEXT: Function:        test_unrolled_select
 ; YAML-NEXT: Args:
 ; YAML-NEXT:   - String:          'Vectorized horizontal reduction with cost '
-; YAML-NEXT:   - Cost:            '-36'
+; YAML-NEXT:   - Cost:            '-41'
 ; YAML-NEXT:   - String:          ' and with tree size '
 ; YAML-NEXT:   - TreeSize:        '10'
 
@@ -246,15 +246,17 @@ define i32 @test_unrolled_select(ptr noalias nocapture readonly %blk1, ptr noali
 ; CHECK-NEXT:    [[P2_045:%.*]] = phi ptr [ [[BLK2:%.*]], [[FOR_BODY_LR_PH]] ], [ [[ADD_PTR88:%.*]], [[IF_END_86]] ]
 ; CHECK-NEXT:    [[P1_044:%.*]] = phi ptr [ [[BLK1:%.*]], [[FOR_BODY_LR_PH]] ], [ [[ADD_PTR:%.*]], [[IF_END_86]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i8>, ptr [[P1_044]], align 1
-; CHECK-NEXT:    [[TMP1:%.*]] = zext <8 x i8> [[TMP0]] to <8 x i32>
+; CHECK-NEXT:    [[TMP1:%.*]] = zext <8 x i8> [[TMP0]] to <8 x i16>
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, ptr [[P2_045]], align 1
-; CHECK-NEXT:    [[TMP3:%.*]] = zext <8 x i8> [[TMP2]] to <8 x i32>
-; CHECK-NEXT:    [[TMP4:%.*]] = sub nsw <8 x i32> [[TMP1]], [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp slt <8 x i32> [[TMP4]], zeroinitializer
-; CHECK-NEXT:    [[TMP6:%.*]] = sub nsw <8 x i32> zeroinitializer, [[TMP4]]
-; CHECK-NEXT:    [[TMP7:%.*]] = select <8 x i1> [[TMP5]], <8 x i32> [[TMP6]], <8 x i32> [[TMP4]]
-; CHECK-NEXT:    [[TMP8:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP7]])
-; CHECK-NEXT:    [[OP_RDX]] = add i32 [[TMP8]], [[S_047]]
+; CHECK-NEXT:    [[TMP3:%.*]] = zext <8 x i8> [[TMP2]] to <8 x i16>
+; CHECK-NEXT:    [[TMP4:%.*]] = sub <8 x i16> [[TMP1]], [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = sext <8 x i16> [[TMP4]] to <8 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp slt <8 x i32> [[TMP5]], zeroinitializer
+; CHECK-NEXT:    [[TMP7:%.*]] = sub <8 x i16> zeroinitializer, [[TMP4]]
+; CHECK-NEXT:    [[TMP8:%.*]] = select <8 x i1> [[TMP6]], <8 x i16> [[TMP7]], <8 x i16> [[TMP4]]
+; CHECK-NEXT:    [[TMP9:%.*]] = sext <8 x i16> [[TMP8]] to <8 x i32>
+; CHECK-NEXT:    [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP9]])
+; CHECK-NEXT:    [[OP_RDX]] = add i32 [[TMP10]], [[S_047]]
 ; CHECK-NEXT:    [[CMP83:%.*]] = icmp slt i32 [[OP_RDX]], [[LIM:%.*]]
 ; CHECK-NEXT:    br i1 [[CMP83]], label [[IF_END_86]], label [[FOR_END_LOOPEXIT:%.*]]
 ; CHECK:       if.end.86:
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/reduce-add-i64.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/reduce-add-i64.ll
index d67fdc1cd6aa0e..a7a7f642ced538 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/reduce-add-i64.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/reduce-add-i64.ll
@@ -28,21 +28,11 @@ entry:
 define i64 @red_zext_ld_4xi64(ptr %ptr) {
 ; CHECK-LABEL: @red_zext_ld_4xi64(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[LD0:%.*]] = load i8, ptr [[PTR:%.*]], align 1
-; CHECK-NEXT:    [[ZEXT:%.*]] = zext i8 [[LD0]] to i64
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 1
-; CHECK-NEXT:    [[LD1:%.*]] = load i8, ptr [[GEP]], align 1
-; CHECK-NEXT:    [[ZEXT_1:%.*]] = zext i8 [[LD1]] to i64
-; CHECK-NEXT:    [[ADD_1:%.*]] = add nuw nsw i64 [[ZEXT]], [[ZEXT_1]]
-; CHECK-NEXT:    [[GEP_1:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 2
-; CHECK-NEXT:    [[LD2:%.*]] = load i8, ptr [[GEP_1]], align 1
-; CHECK-NEXT:    [[ZEXT_2:%.*]] = zext i8 [[LD2]] to i64
-; CHECK-NEXT:    [[ADD_2:%.*]] = add nuw nsw i64 [[ADD_1]], [[ZEXT_2]]
-; CHECK-NEXT:    [[GEP_2:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 3
-; CHECK-NEXT:    [[LD3:%.*]] = load i8, ptr [[GEP_2]], align 1
-; CHECK-NEXT:    [[ZEXT_3:%.*]] = zext i8 [[LD3]] to i64
-; CHECK-NEXT:    [[ADD_3:%.*]] = add nuw nsw i64 [[ADD_2]], [[ZEXT_3]]
-; CHECK-NEXT:    ret i64 [[ADD_3]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i8>, ptr [[PTR:%.*]], align 1
+; CHECK-NEXT:    [[TMP1:%.*]] = zext <4 x i8> [[TMP0]] to <4 x i16>
+; CHECK-NEXT:    [[TMP2:%.*]] = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> [[TMP1]])
+; CHECK-NEXT:    [[TMP3:%.*]] = zext i16 [[TMP2]] to i64
+; CHECK-NEXT:    ret i64 [[TMP3]]
 ;
 entry:
   %ld0 = load i8, ptr %ptr
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/slp-frem.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/slp-frem.ll
new file mode 100644
index 00000000000000..a38f4bdc4640e9
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/slp-frem.ll
@@ -0,0 +1,55 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt < %s -S -mtriple=aarch64 -vector-library=ArmPL -passes=slp-vectorizer | FileCheck %s
+
+@a = common global ptr null, align 8
+
+define void @frem_v2double() {
+; CHECK-LABEL: define void @frem_v2double() {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr @a, align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr @a, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = frem <2 x double> [[TMP0]], [[TMP1]]
+; CHECK-NEXT:    store <2 x double> [[TMP2]], ptr @a, align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %a0 = load double, ptr getelementptr inbounds (double, ptr @a, i64 0), align 8
+  %a1 = load double, ptr getelementptr inbounds (double, ptr @a, i64 1), align 8
+  %b0 = load double, ptr getelementptr inbounds (double, ptr @a, i64 0), align 8
+  %b1 = load double, ptr getelementptr inbounds (double, ptr @a, i64 1), align 8
+  %r0 = frem double %a0, %b0
+  %r1 = frem double %a1, %b1
+  store double %r0, ptr getelementptr inbounds (double, ptr @a, i64 0), align 8
+  store double %r1, ptr getelementptr inbounds (double, ptr @a, i64 1), align 8
+  ret void
+}
+
+define void @frem_v4float() {
+; CHECK-LABEL: define void @frem_v4float() {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr @a, align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr @a, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = frem <4 x float> [[TMP0]], [[TMP1]]
+; CHECK-NEXT:    store <4 x float> [[TMP2]], ptr @a, align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %a0 = load float, ptr getelementptr inbounds (float, ptr @a, i64 0), align 8
+  %a1 = load float, ptr getelementptr inbounds (float, ptr @a, i64 1), align 8
+  %a2 = load float, ptr getelementptr inbounds (float, ptr @a, i64 2), align 8
+  %a3 = load float, ptr getelementptr inbounds (float, ptr @a, i64 3), align 8
+  %b0 = load float, ptr getelementptr inbounds (float, ptr @a, i64 0), align 8
+  %b1 = load float, ptr getelementptr inbounds (float, ptr @a, i64 1), align 8
+  %b2 = load float, ptr getelementptr inbounds (float, ptr @a, i64 2), align 8
+  %b3 = load float, ptr getelementptr inbounds (float, ptr @a, i64 3), align 8
+  %r0 = frem float %a0, %b0
+  %r1 = frem float %a1, %b1
+  %r2 = frem float %a2, %b2
+  %r3 = frem float %a3, %b3
+  store float %r0, ptr getelementptr inbounds (float, ptr @a, i64 0), align 8
+  store float %r1, ptr getelementptr inbounds (float, ptr @a, i64 1), align 8
+  store float %r2, ptr getelementptr inbounds (float, ptr @a, i64 2), align 8
+  store float %r3, ptr getelementptr inbounds (float, ptr @a, i64 3), align 8
+  ret void
+}
+
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll
index ed73f7b134465a..d87bdfe2689916 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll
@@ -6,7 +6,7 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; CHECK-SAME: ptr [[PIX1:%.*]], ptr [[PIX2:%.*]], i64 [[IDX_EXT:%.*]], i64 [[IDX_EXT63:%.*]], ptr [[ADD_PTR:%.*]], ptr [[ADD_PTR64:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[PIX1]], align 1
-; CHECK-NEXT:    [[CONV:%.*]] = zext i8 [[TMP0]] to i32
+; CHECK-NEXT:    [[CONV1:%.*]] = zext i8 [[TMP0]] to i32
 ; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x ptr> poison, ptr [[PIX1]], i32 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x ptr> [[TMP1]], <2 x ptr> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, <2 x ptr> [[TMP2]], <2 x i64> <i64 4, i64 6>
@@ -37,10 +37,10 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; CHECK-NEXT:    [[ARRAYIDX3_2:%.*]] = getelementptr i8, ptr [[ADD_PTR_1]], i64 4
 ; CHECK-NEXT:    [[ARRAYIDX5_2:%.*]] = getelementptr i8, ptr [[ADD_PTR64_1]], i64 4
 ; CHECK-NEXT:    [[TMP15:%.*]] = load <2 x i8>, ptr [[ADD_PTR_1]], align 1
-; CHECK-NEXT:    [[TMP16:%.*]] = zext <2 x i8> [[TMP15]] to <2 x i32>
+; CHECK-NEXT:    [[TMP101:%.*]] = zext <2 x i8> [[TMP15]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP17:%.*]] = load <2 x i8>, ptr [[ADD_PTR64_1]], align 1
 ; CHECK-NEXT:    [[TMP18:%.*]] = zext <2 x i8> [[TMP17]] to <2 x i32>
-; CHECK-NEXT:    [[TMP19:%.*]] = sub <2 x i32> [[TMP16]], [[TMP18]]
+; CHECK-NEXT:    [[TMP19:%.*]] = sub <2 x i32> [[TMP101]], [[TMP18]]
 ; CHECK-NEXT:    [[TMP20:%.*]] = load <2 x i8>, ptr [[ARRAYIDX3_2]], align 1
 ; CHECK-NEXT:    [[TMP21:%.*]] = zext <2 x i8> [[TMP20]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP22:%.*]] = load <2 x i8>, ptr [[ARRAYIDX5_2]], align 1
@@ -70,9 +70,9 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; CHECK-NEXT:    [[SUB45_2:%.*]] = sub i32 [[TMP39]], [[TMP40]]
 ; CHECK-NEXT:    [[TMP41:%.*]] = extractelement <2 x i32> [[TMP38]], i32 0
 ; CHECK-NEXT:    [[TMP42:%.*]] = extractelement <2 x i32> [[TMP38]], i32 1
-; CHECK-NEXT:    [[ADD46_2:%.*]] = add i32 [[TMP42]], [[TMP41]]
+; CHECK-NEXT:    [[CONV:%.*]] = add i32 [[TMP42]], [[TMP41]]
 ; CHECK-NEXT:    [[SUB47_2:%.*]] = sub i32 [[TMP41]], [[TMP42]]
-; CHECK-NEXT:    [[ADD48_2:%.*]] = add i32 [[ADD46_2]], [[ADD44_2]]
+; CHECK-NEXT:    [[ADD48_2:%.*]] = add i32 [[CONV]], [[ADD44_2]]
 ; CHECK-NEXT:    [[TMP43:%.*]] = load i8, ptr null, align 1
 ; CHECK-NEXT:    [[ARRAYIDX20_3:%.*]] = getelementptr i8, ptr null, i64 2
 ; CHECK-NEXT:    [[ARRAYIDX22_3:%.*]] = getelementptr i8, ptr null, i64 2
@@ -104,32 +104,32 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; CHECK-NEXT:    [[TMP69:%.*]] = sub <2 x i32> [[TMP66]], [[TMP68]]
 ; CHECK-NEXT:    [[TMP70:%.*]] = shl <2 x i32> [[TMP69]], <i32 16, i32 16>
 ; CHECK-NEXT:    [[TMP71:%.*]] = add <2 x i32> [[TMP70]], [[TMP63]]
-; CHECK-NEXT:    [[TMP72:%.*]] = add <2 x i32> [[TMP71]], [[TMP58]]
+; CHECK-NEXT:    [[TMP16:%.*]] = add <2 x i32> [[TMP71]], [[TMP58]]
 ; CHECK-NEXT:    [[TMP73:%.*]] = sub <2 x i32> [[TMP58]], [[TMP71]]
-; CHECK-NEXT:    [[TMP74:%.*]] = extractelement <2 x i32> [[TMP72]], i32 0
-; CHECK-NEXT:    [[TMP75:%.*]] = extractelement <2 x i32> [[TMP72]], i32 1
+; CHECK-NEXT:    [[TMP74:%.*]] = extractelement <2 x i32> [[TMP16]], i32 0
+; CHECK-NEXT:    [[TMP75:%.*]] = extractelement <2 x i32> [[TMP16]], i32 1
 ; CHECK-NEXT:    [[ADD48_3:%.*]] = add i32 [[TMP74]], [[TMP75]]
 ; CHECK-NEXT:    [[ADD94:%.*]] = add i32 [[ADD48_3]], [[ADD48_2]]
 ; CHECK-NEXT:    [[SUB102:%.*]] = sub i32 [[ADD48_2]], [[ADD48_3]]
-; CHECK-NEXT:    [[TMP76:%.*]] = extractelement <2 x i32> [[TMP47]], i32 1
-; CHECK-NEXT:    [[SHR_I:%.*]] = lshr i32 [[TMP76]], 15
-; CHECK-NEXT:    [[AND_I:%.*]] = and i32 [[SHR_I]], 65537
-; CHECK-NEXT:    [[MUL_I:%.*]] = mul i32 [[AND_I]], 65535
-; CHECK-NEXT:    [[SHR_I49:%.*]] = lshr i32 [[ADD46_2]], 15
-; CHECK-NEXT:    [[AND_I50:%.*]] = and i32 [[SHR_I49]], 65537
-; CHECK-NEXT:    [[MUL_I51:%.*]] = mul i32 [[AND_I50]], 65535
-; CHECK-NEXT:    [[TMP77:%.*]] = extractelement <2 x i32> [[TMP16]], i32 0
-; CHECK-NEXT:    [[SHR_I49_1:%.*]] = lshr i32 [[TMP77]], 15
-; CHECK-NEXT:    [[AND_I50_1:%.*]] = and i32 [[SHR_I49_1]], 65537
-; CHECK-NEXT:    [[MUL_I51_1:%.*]] = mul i32 [[AND_I50_1]], 65535
-; CHECK-NEXT:    [[SHR_I49_2:%.*]] = lshr i32 [[CONV_1]], 15
+; CHECK-NEXT:    [[TMP79:%.*]] = extractelement <2 x i32> [[TMP47]], i32 1
+; CHECK-NEXT:    [[SHR_I49_2:%.*]] = lshr i32 [[TMP79]], 15
 ; CHECK-NEXT:    [[AND_I50_2:%.*]] = and i32 [[SHR_I49_2]], 65537
 ; CHECK-NEXT:    [[MUL_I51_2:%.*]] = mul i32 [[AND_I50_2]], 65535
 ; CHECK-NEXT:    [[SHR_I49_3:%.*]] = lshr i32 [[CONV]], 15
 ; CHECK-NEXT:    [[AND_I50_3:%.*]] = and i32 [[SHR_I49_3]], 65537
 ; CHECK-NEXT:    [[MUL_I51_3:%.*]] = mul i32 [[AND_I50_3]], 65535
+; CHECK-NEXT:    [[TMP107:%.*]] = extractelement <2 x i32> [[TMP101]], i32 0
+; CHECK-NEXT:    [[SHR_I49_1:%.*]] = lshr i32 [[TMP107]], 15
+; CHECK-NEXT:    [[AND_I50_1:%.*]] = and i32 [[SHR_I49_1]], 65537
+; CHECK-NEXT:    [[MUL_I51_1:%.*]] = mul i32 [[AND_I50_1]], 65535
+; CHECK-NEXT:    [[SHR_I49_4:%.*]] = lshr i32 [[CONV_1]], 15
+; CHECK-NEXT:    [[AND_I50_4:%.*]] = and i32 [[SHR_I49_4]], 65537
+; CHECK-NEXT:    [[MUL_I51_4:%.*]] = mul i32 [[AND_I50_4]], 65535
+; CHECK-NEXT:    [[SHR_I49_5:%.*]] = lshr i32 [[CONV1]], 15
+; CHECK-NEXT:    [[AND_I50_5:%.*]] = and i32 [[SHR_I49_5]], 65537
+; CHECK-NEXT:    [[MUL_I51_5:%.*]] = mul i32 [[AND_I50_5]], 65535
 ; CHECK-NEXT:    [[TMP78:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8]], align 1
-; CHECK-NEXT:    [[TMP79:%.*]] = zext <2 x i8> [[TMP78]] to <2 x i32>
+; CHECK-NEXT:    [[TMP102:%.*]] = zext <2 x i8> [[TMP78]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP80:%.*]] = insertelement <2 x ptr> [[TMP5]], ptr [[ARRAYIDX22]], i32 1
 ; CHECK-NEXT:    [[TMP81:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP80]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
 ; CHECK-NEXT:    [[TMP82:%.*]] = zext <2 x i8> [[TMP81]] to <2 x i32>
@@ -147,20 +147,20 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; CHECK-NEXT:    [[TMP94:%.*]] = zext <2 x i8> [[TMP93]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP95:%.*]] = sub <2 x i32> [[TMP92]], [[TMP94]]
 ; CHECK-NEXT:    [[TMP96:%.*]] = shl <2 x i32> [[TMP95]], <i32 16, i32 16>
-; CHECK-NEXT:    [[TMP97:%.*]] = insertelement <2 x i32> [[TMP79]], i32 [[CONV33]], i32 1
+; CHECK-NEXT:    [[TMP97:%.*]] = insertelement <2 x i32> [[TMP102]], i32 [[CONV33]], i32 1
 ; CHECK-NEXT:    [[TMP98:%.*]] = sub <2 x i32> [[TMP97]], [[TMP90]]
-; CHECK-NEXT:    [[TMP99:%.*]] = add <2 x i32> [[TMP96]], [[TMP98]]
-; CHECK-NEXT:    [[TMP100:%.*]] = insertelement <2 x i32> [[TMP79]], i32 [[CONV]], i32 0
-; CHECK-NEXT:    [[TMP101:%.*]] = sub <2 x i32> [[TMP100]], [[TMP82]]
-; CHECK-NEXT:    [[TMP102:%.*]] = add <2 x i32> [[TMP88]], [[TMP101]]
-; CHECK-NEXT:    [[TMP103:%.*]] = shufflevector <2 x i32> [[TMP99]], <2 x i32> [[TMP102]], <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT:    [[TMP104:%.*]] = add <2 x i32> [[TMP99]], [[TMP102]]
-; CHECK-NEXT:    [[TMP105:%.*]] = sub <2 x i32> [[TMP102]], [[TMP99]]
-; CHECK-NEXT:    [[TMP106:%.*]] = extractelement <2 x i32> [[TMP104]], i32 0
-; CHECK-NEXT:    [[TMP107:%.*]] = extractelement <2 x i32> [[TMP104]], i32 1
-; CHECK-NEXT:    [[ADD48:%.*]] = add i32 [[TMP107]], [[TMP106]]
+; CHECK-NEXT:    [[TMP104:%.*]] = add <2 x i32> [[TMP96]], [[TMP98]]
+; CHECK-NEXT:    [[TMP100:%.*]] = insertelement <2 x i32> [[TMP102]], i32 [[CONV1]], i32 0
+; CHECK-NEXT:    [[TMP103:%.*]] = sub <2 x i32> [[TMP100]], [[TMP82]]
+; CHECK-NEXT:    [[TMP200:%.*]] = add <2 x i32> [[TMP88]], [[TMP103]]
+; CHECK-NEXT:    [[TMP128:%.*]] = shufflevector <2 x i32> [[TMP104]], <2 x i32> [[TMP200]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP165:%.*]] = add <2 x i32> [[TMP104]], [[TMP200]]
+; CHECK-NEXT:    [[TMP105:%.*]] = sub <2 x i32> [[TMP200]], [[TMP104]]
+; CHECK-NEXT:    [[TMP238:%.*]] = extractelement <2 x i32> [[TMP165]], i32 0
+; CHECK-NEXT:    [[TMP143:%.*]] = extractelement <2 x i32> [[TMP165]], i32 1
+; CHECK-NEXT:    [[ADD48:%.*]] = add i32 [[TMP143]], [[TMP238]]
 ; CHECK-NEXT:    [[TMP108:%.*]] = extractelement <2 x i32> [[TMP105]], i32 1
-; CHECK-NEXT:    [[SHR_I59:%.*]] = lshr i32 [[TMP107]], 15
+; CHECK-NEXT:    [[SHR_I59:%.*]] = lshr i32 [[TMP143]], 15
 ; CHECK-NEXT:    [[AND_I60:%.*]] = and i32 [[SHR_I59]], 65537
 ; CHECK-NEXT:    [[MUL_I61:%.*]] = mul i32 [[AND_I60]], 65535
 ; CHECK-NEXT:    [[SHR_I59_1:%.*]] = lshr i32 [[TMP108]], 15
@@ -185,7 +185,7 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; CHECK-NEXT:    [[TMP125:%.*]] = shl <2 x i32> [[TMP124]], <i32 16, i32 16>
 ; CHECK-NEXT:    [[TMP126:%.*]] = getelementptr i8, <2 x ptr> [[TMP120]], <2 x i64> <i64 1, i64 3>
 ; CHECK-NEXT:    [[TMP127:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP126]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
-; CHECK-NEXT:    [[TMP128:%.*]] = zext <2 x i8> [[TMP127]] to <2 x i32>
+; CHECK-NEXT:    [[TMP153:%.*]] = zext <2 x i8> [[TMP127]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP129:%.*]] = getelementptr i8, <2 x ptr> [[TMP115]], <2 x i64> <i64 5, i64 7>
 ; CHECK-NEXT:    [[TMP130:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP129]], i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> poison)
 ; CHECK-NEXT:    [[TMP131:%.*]] = zext <2 x i8> [[TMP130]] to <2 x i32>
@@ -195,15 +195,15 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; CHECK-NEXT:    [[TMP135:%.*]] = sub <2 x i32> [[TMP131]], [[TMP134]]
 ; CHECK-NEXT:    [[TMP136:%.*]] = shl <2 x i32> [[TMP135]], <i32 16, i32 16>
 ; CHECK-NEXT:    [[TMP137:%.*]] = insertelement <2 x i32> [[TMP110]], i32 [[CONV33_1]], i32 1
-; CHECK-NEXT:    [[TMP138:%.*]] = sub <2 x i32> [[TMP137]], [[TMP128]]
+; CHECK-NEXT:    [[TMP138:%.*]] = sub <2 x i32> [[TMP137]], [[TMP153]]
 ; CHECK-NEXT:    [[TMP139:%.*]] = add <2 x i32> [[TMP136]], [[TMP138]]
 ; CHECK-NEXT:    [[TMP140:%.*]] = insertelement <2 x i32> [[TMP110]], i32 [[CONV_1]], i32 0
 ; CHECK-NEXT:    [[TMP141:%.*]] = sub <2 x i32> [[TMP140]], [[TMP113]]
 ; CHECK-NEXT:    [[TMP142:%.*]] = add <2 x i32> [[TMP125]], [[TMP141]]
-; CHECK-NEXT:    [[TMP143:%.*]] = add <2 x i32> [[TMP139]], [[TMP142]]
+; CHECK-NEXT:    [[TMP257:%.*]] = add <2 x i32> [[TMP139]], [[TMP142]]
 ; CHECK-NEXT:    [[TMP144:%.*]] = sub <2 x i32> [[TMP142]], [[TMP139]]
-; CHECK-NEXT:    [[TMP145:%.*]] = extractelement <2 x i32> [[TMP143]], i32 0
-; CHECK-NEXT:    [[TMP146:%.*]] = extractelement <2 x i32> [[TMP143]], i32 1
+; CHECK-NEXT:    [[TMP145:%.*]] = extractelement <2 x i32> [[TMP257]], i32 0
+; CHECK-NEXT:    [[TMP146:%.*]] = extractelement <2 x i32> [[TMP257]], i32 1
 ; CHECK-NEXT:    [[ADD48_1:%.*]] = add i32 [[TMP146]], [[TMP145]]
 ; CHECK-NEXT:    [[SHR_I54:%.*]] = lshr i32 [[TMP146]], 15
 ; CHECK-NEXT:    [[AND_I55:%.*]] = and i32 [[SHR_I54]], 65537
@@ -217,51 +217,51 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; CHECK-NEXT:    [[SUB104:%.*]] = sub i32 [[ADD78]], [[ADD94]]
 ; CHECK-NEXT:    [[ADD105:%.*]] = add i32 [[SUB102]], [[SUB86]]
 ; CHECK-NEXT:    [[SUB106:%.*]] = sub i32 [[SUB86]], [[SUB102]]
-; CHECK-NEXT:    [[ADD_I:%.*]] = add i32 [[MUL_I]], [[ADD103]]
-; CHECK-NEXT:    [[XOR_I:%.*]] = xor i32 [[ADD_I]], [[TMP76]]
-; CHECK-NEXT:    [[ADD_I52:%.*]] = add i32 [[MUL_I51]], [[ADD105]]
-; CHECK-NEXT:    [[XOR_I53:%.*]] = xor i32 [[ADD_I52]], [[ADD46_2]]
+; CHECK-NEXT:    [[ADD_I:%.*]] = add i32 [[MUL_I51_2]], [[ADD103]]
+; CHECK-NEXT:    [[XOR_I:%.*]] = xor i32 [[ADD_I]], [[TMP79]]
+; CHECK-NEXT:    [[ADD_I52:%.*]] = add i32 [[MUL_I51_3]], [[ADD105]]
+; CHECK-NEXT:    [[XOR_I53:%.*]] = xor i32 [[ADD_I52]], [[CONV]]
 ; CHECK-NEXT:    [[ADD_I57:%.*]] = add i32 [[MUL_I56]], [[SUB104]]
 ; CHECK-NEXT:    [[XOR_I58:%.*]] = xor i32 [[ADD_I57]], [[TMP146]]
 ; CHECK-NEXT:    [[ADD_I62:%.*]] = add i32 [[MUL_I61]], [[SUB106]]
-; CHECK-NEXT:    [[XOR_I63:%.*]] = xor i32 [[ADD_I62]], [[TMP107]]
+; CHECK-NEXT:    [[XOR_I63:%.*]] = xor i32 [[ADD_I62]], [[TMP143]]
 ; CHECK-NEXT:    [[ADD110:%.*]] = add i32 [[XOR_I53]], [[XOR_I]]
 ; CHECK-NEXT:    [[ADD112:%.*]] = add i32 [[ADD110]], [[XOR_I58]]
 ; CHECK-NEXT:    [[ADD113:%.*]] = add i32 [[ADD112]], [[XOR_I63]]
 ; CHECK-NEXT:    [[TMP150:%.*]] = shufflevector <2 x i32> [[TMP105]], <2 x i32> poison, <2 x i32> <i32 1, i32 0>
 ; CHECK-NEXT:    [[TMP151:%.*]] = insertelement <2 x i32> [[TMP150]], i32 [[SUB47_2]], i32 1
 ; CHECK-NEXT:    [[TMP152:%.*]] = insertelement <2 x i32> [[TMP105]], i32 [[SUB45_2]], i32 1
-; CHECK-NEXT:    [[TMP153:%.*]] = add <2 x i32> [[TMP151]], [[TMP152]]
+; CHECK-NEXT:    [[TMP163:%.*]] = add <2 x i32> [[TMP151]], [[TMP152]]
 ; CHECK-NEXT:    [[TMP154:%.*]] = shufflevector <2 x i32> [[TMP144]], <2 x i32> [[TMP73]], <2 x i32> <i32 1, i32 2>
 ; CHECK-NEXT:    [[TMP155:%.*]] = shufflevector <2 x i32> [[TMP144]], <2 x i32> [[TMP73]], <2 x i32> <i32 0, i32 3>
 ; CHECK-NEXT:    [[TMP156:%.*]] = add <2 x i32> [[TMP154]], [[TMP155]]
-; CHECK-NEXT:    [[TMP157:%.*]] = extractelement <2 x i32> [[TMP153]], i32 1
+; CHECK-NEXT:    [[TMP157:%.*]] = extractelement <2 x i32> [[TMP163]], i32 1
 ; CHECK-NEXT:    [[TMP158:%.*]] = extractelement <2 x i32> [[TMP156]], i32 1
-; CHECK-NEXT:    [[TMP159:%.*]] = shufflevector <2 x i32> [[TMP156]], <2 x i32> [[TMP153]], <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT:    [[ADD94_1:%.*]] = add i32 [[TMP158]], [[TMP157]]
-; CHECK-NEXT:    [[TMP160:%.*]] = extractelement <2 x i32> [[TMP153]], i32 0
+; CHECK-NEXT:    [[TMP159:%.*]] = shufflevector <2 x i32> [[TMP156]], <2 x i32> [[TMP163]], <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[ADD78_2:%.*]] = add i32 [[TMP158]], [[TMP157]]
+; CHECK-NEXT:    [[TMP160:%.*]] = extractelement <2 x i32> [[TMP163]], i32 0
 ; CHECK-NEXT:    [[TMP161:%.*]] = extractelement <2 x i32> [[TMP156]], i32 0
-; CHECK-NEXT:    [[TMP162:%.*]] = shufflevector <2 x i32> [[TMP156]], <2 x i32> [[TMP153]], <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT:    [[ADD78_1:%.*]] = add i32 [[TMP161]], [[TMP160]]
-; CHECK-NEXT:    [[TMP163:%.*]] = sub <2 x i32> [[TMP153]], [[TMP156]]
-; CHECK-NEXT:    [[TMP164:%.*]] = extractelement <2 x i32> [[TMP163]], i32 0
-; CHECK-NEXT:    [[TMP165:%.*]] = extractelement <2 x i32> [[TMP163]], i32 1
-; CHECK-NEXT:    [[ADD105_1:%.*]] = add i32 [[TMP165]], [[TMP164]]
-; CHECK-NEXT:    [[SUB106_1:%.*]] = sub i32 [[TMP164]], [[TMP165]]
+; CHECK-NEXT:    [[TMP162:%.*]] = shufflevector <2 x i32> [[TMP156]], <2 x i32> [[TMP163]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[ADD94_1:%.*]] = add i32 [[TMP161]], [[TMP160]]
+; CHECK-NEXT:    [[TMP164:%.*]] = sub <2 x i32> [[TMP163]], [[TMP156]]
+; CHECK-NEXT:    [[TMP173:%.*]] = extractelement <2 x i32> [[TMP164]], i32 0
+; CHECK-NEXT:    [[TMP174:%.*]] = extractelement <2 x i32> [[TMP164]], i32 1
+; CHECK-NEXT:    [[ADD105_1:%.*]] = add i32 [[TMP174]], [[TMP173]]
+; CHECK-NEXT:    [[SUB106_1:%.*]] = sub i32 [[TMP173]], [[TMP174]]
 ; CHECK-NEXT:    [[ADD_I52_1:%.*]] = add i32 [[MUL_I51_1]], [[ADD105_1]]
-; CHECK-NEXT:    [[XOR_I53_1:%.*]] = xor i32 [[ADD_I52_1]], [[TMP77]]
-; CHECK-NEXT:    [[TMP166:%.*]] = shufflevector <2 x i32> [[TMP16]], <2 x i32> [[TMP144]], <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[XOR_I53_1:%.*]] = xor i32 [[ADD_I52_1]], [[TMP107]]
+; CHECK-NEXT:    [[TMP166:%.*]] = shufflevector <2 x i32> [[TMP101]], <2 x i32> [[TMP144]], <2 x i32> <i32 1, i32 3>
 ; CHECK-NEXT:    [[TMP167:%.*]] = lshr <2 x i32> [[TMP166]], <i32 15, i32 15>
 ; CHECK-NEXT:    [[TMP168:%.*]] = and <2 x i32> [[TMP167]], <i32 65537, i32 65537>
 ; CHECK-NEXT:    [[TMP169:%.*]] = mul <2 x i32> [[TMP168]], <i32 65535, i32 65535>
-; CHECK-NEXT:    [[TMP170:%.*]] = insertelement <2 x i32> poison, i32 [[ADD78_1]], i32 0
-; CHECK-NEXT:    [[TMP171:%.*]] = shufflevector <2 x i32> [[TMP170]], <2 x i32> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP172:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_1]], i32 0
-; CHECK-NEXT:    [[TMP173:%.*]] = shufflevector <2 x i32> [[TMP172]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP174:%.*]] = add <2 x i32> [[TMP171]], [[TMP173]]
-; CHECK-NEXT:    [[TMP175:%.*]] = sub <2 x i32> [[TMP171]], [[TMP173]]
-; CHECK-NEXT:    [[TMP176:%.*]] = shufflevector <2 x i32> [[TMP174]], <2 x i32> [[TMP175]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    [[TMP177:%.*]] = add <2 x i32> [[TMP169]], [[TMP176]]
+; CHECK-NEXT:    [[TMP171:%.*]] = shufflevector <2 x i32> [[TMP172]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP208:%.*]] = insertelement <2 x i32> poison, i32 [[ADD78_2]], i32 0
+; CHECK-NEXT:    [[TMP209:%.*]] = shufflevector <2 x i32> [[TMP208]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP282:%.*]] = add <2 x i32> [[TMP171]], [[TMP209]]
+; CHECK-NEXT:    [[TMP211:%.*]] = sub <2 x i32> [[TMP171]], [[TMP209]]
+; CHECK-NEXT:    [[TMP283:%.*]] = shufflevector <2 x i32> [[TMP282]], <2 x i32> [[TMP211]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP177:%.*]] = add <2 x i32> [[TMP169]], [[TMP283]]
 ; CHECK-NEXT:    [[TMP178:%.*]] = xor <2 x i32> [[TMP177]], [[TMP166]]
 ; CHECK-NEXT:    [[ADD_I62_1:%.*]] = add i32 [[MUL_I61_1]], [[SUB106_1]]
 ; CHECK-NEXT:    [[XOR_I63_1:%.*]] = xor i32 [[ADD_I62_1]], [[TMP108]]
@@ -271,90 +271,90 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; CHECK-NEXT:    [[TMP180:%.*]] = extractelement <2 x i32> [[TMP178]], i32 1
 ; CHECK-NEXT:    [[ADD112_1:%.*]] = add i32 [[ADD110_1]], [[TMP180]]
 ; CHECK-NEXT:    [[ADD113_1:%.*]] = add i32 [[ADD112_1]], [[XOR_I63_1]]
-; CHECK-NEXT:    [[TMP181:%.*]] = shufflevector <2 x i32> [[TMP104]], <2 x i32> poison, <2 x i32> <i32 poison, i32 0>
+; CHECK-NEXT:    [[TMP181:%.*]] = shufflevector <2 x i32> [[TMP165]], <2 x i32> poison, <2 x i32> <i32 poison, i32 0>
 ; CHECK-NEXT:    [[TMP182:%.*]] = insertelement <2 x i32> [[TMP181]], i32 [[ADD44_2]], i32 0
-; CHECK-NEXT:    [[TMP183:%.*]] = insertelement <2 x i32> [[TMP104]], i32 [[ADD46_2]], i32 0
+; CHECK-NEXT:    [[TMP183:%.*]] = insertelement <2 x i32> [[TMP165]], i32 [[CONV]], i32 0
 ; CHECK-NEXT:    [[TMP184:%.*]] = sub <2 x i32> [[TMP182]], [[TMP183]]
-; CHECK-NEXT:    [[TMP185:%.*]] = shufflevector <2 x i32> [[TMP72]], <2 x i32> [[TMP143]], <2 x i32> <i32 1, i32 2>
-; CHECK-NEXT:    [[TMP186:%.*]] = shufflevector <2 x i32> [[TMP72]], <2 x i32> [[TMP143]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP185:%.*]] = shufflevector <2 x i32> [[TMP16]], <2 x i32> [[TMP257]], <2 x i32> <i32 1, i32 2>
+; CHECK-NEXT:    [[TMP186:%.*]] = shufflevector <2 x i32> [[TMP16]], <2 x i32> [[TMP257]], <2 x i32> <i32 0, i32 3>
 ; CHECK-NEXT:    [[TMP187:%.*]] = sub <2 x i32> [[TMP185]], [[TMP186]]
 ; CHECK-NEXT:    [[TMP188:%.*]] = extractelement <2 x i32> [[TMP184]], i32 0
 ; CHECK-NEXT:    [[TMP189:%.*]] = extractelement <2 x i32> [[TMP187]], i32 0
 ; CHECK-NEXT:    [[TMP190:%.*]] = shufflevector <2 x i32> [[TMP187]], <2 x i32> [[TMP184]], <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT:    [[ADD94_2:%.*]] = add i32 [[TMP189]], [[TMP188]]
+; CHECK-NEXT:    [[ADD94_4:%.*]] = add i32 [[TMP189]], [[TMP188]]
 ; CHECK-NEXT:    [[TMP191:%.*]] = extractelement <2 x i32> [[TMP184]], i32 1
 ; CHECK-NEXT:    [[TMP192:%.*]] = extractelement <2 x i32> [[TMP187]], i32 1
 ; CHECK-NEXT:    [[TMP193:%.*]] = shufflevector <2 x i32> [[TMP187]], <2 x i32> [[TMP184]], <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT:    [[ADD78_2:%.*]] = add i32 [[TMP192]], [[TMP191]]
+; CHECK-NEXT:    [[ADD94_2:%.*]] = add i32 [[TMP192]], [[TMP191]]
 ; CHECK-NEXT:    [[TMP194:%.*]] = sub <2 x i32> [[TMP184]], [[TMP187]]
-; CHECK-NEXT:    [[TMP195:%.*]] = insertelement <2 x i32> poison, i32 [[ADD78_2]], i32 0
-; CHECK-NEXT:    [[TMP196:%.*]] = shufflevector <2 x i32> [[TMP195]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP197:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_2]], i32 0
+; CHECK-NEXT:    [[TMP244:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_2]], i32 0
+; CHECK-NEXT:    [[TMP245:%.*]] = shufflevector <2 x i32> [[TMP244]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP197:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_4]], i32 0
 ; CHECK-NEXT:    [[TMP198:%.*]] = shufflevector <2 x i32> [[TMP197]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP199:%.*]] = add <2 x i32> [[TMP196]], [[TMP198]]
-; CHECK-NEXT:    [[TMP200:%.*]] = sub <2 x i32> [[TMP196]], [[TMP198]]
-; CHECK-NEXT:    [[TMP201:%.*]] = shufflevector <2 x i32> [[TMP199]], <2 x i32> [[TMP200]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    [[TMP202:%.*]] = extractelement <2 x i32> [[TMP194]], i32 0
+; CHECK-NEXT:    [[TMP246:%.*]] = add <2 x i32> [[TMP245]], [[TMP198]]
+; CHECK-NEXT:    [[TMP247:%.*]] = sub <2 x i32> [[TMP245]], [[TMP198]]
+; CHECK-NEXT:    [[TMP248:%.*]] = shufflevector <2 x i32> [[TMP246]], <2 x i32> [[TMP247]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP215:%.*]] = extractelement <2 x i32> [[TMP194]], i32 0
 ; CHECK-NEXT:    [[TMP203:%.*]] = extractelement <2 x i32> [[TMP194]], i32 1
-; CHECK-NEXT:    [[ADD105_2:%.*]] = add i32 [[TMP202]], [[TMP203]]
-; CHECK-NEXT:    [[SUB106_2:%.*]] = sub i32 [[TMP203]], [[TMP202]]
-; CHECK-NEXT:    [[ADD_I52_2:%.*]] = add i32 [[MUL_I51_2]], [[ADD105_2]]
+; CHECK-NEXT:    [[ADD105_2:%.*]] = add i32 [[TMP215]], [[TMP203]]
+; CHECK-NEXT:    [[SUB106_2:%.*]] = sub i32 [[TMP203]], [[TMP215]]
+; CHECK-NEXT:    [[ADD_I52_2:%.*]] = add i32 [[MUL_I51_4]], [[ADD105_2]]
 ; CHECK-NEXT:    [[XOR_I53_2:%.*]] = xor i32 [[ADD_I52_2]], [[CONV_1]]
-; CHECK-NEXT:    [[TMP204:%.*]] = add <2 x i32> [[TMP149]], [[TMP201]]
-; CHECK-NEXT:    [[TMP205:%.*]] = xor <2 x i32> [[TMP204]], [[TMP110]]
-; CHECK-NEXT:    [[SHR_I59_2:%.*]] = lshr i32 [[TMP106]], 15
+; CHECK-NEXT:    [[TMP266:%.*]] = add <2 x i32> [[TMP149]], [[TMP248]]
+; CHECK-NEXT:    [[TMP267:%.*]] = xor <2 x i32> [[TMP266]], [[TMP110]]
+; CHECK-NEXT:    [[SHR_I59_2:%.*]] = lshr i32 [[TMP238]], 15
 ; CHECK-NEXT:    [[AND_I60_2:%.*]] = and i32 [[SHR_I59_2]], 65537
 ; CHECK-NEXT:    [[MUL_I61_2:%.*]] = mul i32 [[AND_I60_2]], 65535
 ; CHECK-NEXT:    [[ADD_I62_2:%.*]] = add i32 [[MUL_I61_2]], [[SUB106_2]]
-; CHECK-NEXT:    [[XOR_I63_2:%.*]] = xor i32 [[ADD_I62_2]], [[TMP106]]
+; CHECK-NEXT:    [[XOR_I63_2:%.*]] = xor i32 [[ADD_I62_2]], [[TMP238]]
 ; CHECK-NEXT:    [[ADD108_2:%.*]] = add i32 [[XOR_I53_2]], [[ADD113_1]]
-; CHECK-NEXT:    [[TMP206:%.*]] = extractelement <2 x i32> [[TMP205]], i32 0
+; CHECK-NEXT:    [[TMP206:%.*]] = extractelement <2 x i32> [[TMP267]], i32 0
 ; CHECK-NEXT:    [[ADD110_2:%.*]] = add i32 [[ADD108_2]], [[TMP206]]
-; CHECK-NEXT:    [[TMP207:%.*]] = extractelement <2 x i32> [[TMP205]], i32 1
+; CHECK-NEXT:    [[TMP207:%.*]] = extractelement <2 x i32> [[TMP267]], i32 1
 ; CHECK-NEXT:    [[ADD112_2:%.*]] = add i32 [[ADD110_2]], [[TMP207]]
 ; CHECK-NEXT:    [[ADD113_2:%.*]] = add i32 [[ADD112_2]], [[XOR_I63_2]]
-; CHECK-NEXT:    [[TMP208:%.*]] = insertelement <2 x i32> [[TMP150]], i32 [[SUB45_2]], i32 0
-; CHECK-NEXT:    [[TMP209:%.*]] = insertelement <2 x i32> [[TMP105]], i32 [[SUB47_2]], i32 0
-; CHECK-NEXT:    [[TMP210:%.*]] = sub <2 x i32> [[TMP208]], [[TMP209]]
-; CHECK-NEXT:    [[TMP211:%.*]] = shufflevector <2 x i32> [[TMP73]], <2 x i32> [[TMP144]], <2 x i32> <i32 1, i32 2>
+; CHECK-NEXT:    [[TMP221:%.*]] = insertelement <2 x i32> [[TMP150]], i32 [[SUB45_2]], i32 0
+; CHECK-NEXT:    [[TMP222:%.*]] = insertelement <2 x i32> [[TMP105]], i32 [[SUB47_2]], i32 0
+; CHECK-NEXT:    [[TMP210:%.*]] = sub <2 x i32> [[TMP221]], [[TMP222]]
+; CHECK-NEXT:    [[TMP225:%.*]] = shufflevector <2 x i32> [[TMP73]], <2 x i32> [[TMP144]], <2 x i32> <i32 1, i32 2>
 ; CHECK-NEXT:    [[TMP212:%.*]] = shufflevector <2 x i32> [[TMP73]], <2 x i32> [[TMP144]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    [[TMP213:%.*]] = sub <2 x i32> [[TMP211]], [[TMP212]]
+; CHECK-NEXT:    [[TMP226:%.*]] = sub <2 x i32> [[TMP225]], [[TMP212]]
 ; CHECK-NEXT:    [[TMP214:%.*]] = extractelement <2 x i32> [[TMP210]], i32 0
-; CHECK-NEXT:    [[TMP215:%.*]] = extractelement <2 x i32> [[TMP213]], i32 0
-; CHECK-NEXT:    [[TMP216:%.*]] = shufflevector <2 x i32> [[TMP213]], <2 x i32> [[TMP210]], <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT:    [[ADD94_3:%.*]] = add i32 [[TMP215]], [[TMP214]]
+; CHECK-NEXT:    [[TMP227:%.*]] = extractelement <2 x i32> [[TMP226]], i32 0
+; CHECK-NEXT:    [[TMP216:%.*]] = shufflevector <2 x i32> [[TMP226]], <2 x i32> [[TMP210]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[ADD94_3:%.*]] = add i32 [[TMP227]], [[TMP214]]
 ; CHECK-NEXT:    [[TMP217:%.*]] = extractelement <2 x i32> [[TMP210]], i32 1
-; CHECK-NEXT:    [[TMP218:%.*]] = extractelement <2 x i32> [[TMP213]], i32 1
-; CHECK-NEXT:    [[TMP219:%.*]] = shufflevector <2 x i32> [[TMP213]], <2 x i32> [[TMP210]], <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT:    [[ADD78_3:%.*]] = add i32 [[TMP218]], [[TMP217]]
-; CHECK-NEXT:    [[TMP220:%.*]] = sub <2 x i32> [[TMP210]], [[TMP213]]
-; CHECK-NEXT:    [[TMP221:%.*]] = insertelement <2 x i32> poison, i32 [[ADD78_3]], i32 0
-; CHECK-NEXT:    [[TMP222:%.*]] = shufflevector <2 x i32> [[TMP221]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP218:%.*]] = extractelement <2 x i32> [[TMP226]], i32 1
+; CHECK-NEXT:    [[TMP219:%.*]] = shufflevector <2 x i32> [[TMP226]], <2 x i32> [[TMP210]], <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[SUB59:%.*]] = add i32 [[TMP218]], [[TMP217]]
+; CHECK-NEXT:    [[TMP220:%.*]] = sub <2 x i32> [[TMP210]], [[TMP226]]
+; CHECK-NEXT:    [[TMP274:%.*]] = insertelement <2 x i32> poison, i32 [[SUB59]], i32 0
+; CHECK-NEXT:    [[TMP275:%.*]] = shufflevector <2 x i32> [[TMP274]], <2 x i32> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP223:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_3]], i32 0
 ; CHECK-NEXT:    [[TMP224:%.*]] = shufflevector <2 x i32> [[TMP223]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP225:%.*]] = add <2 x i32> [[TMP222]], [[TMP224]]
-; CHECK-NEXT:    [[TMP226:%.*]] = sub <2 x i32> [[TMP222]], [[TMP224]]
-; CHECK-NEXT:    [[TMP227:%.*]] = shufflevector <2 x i32> [[TMP225]], <2 x i32> [[TMP226]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP276:%.*]] = add <2 x i32> [[TMP275]], [[TMP224]]
+; CHECK-NEXT:    [[TMP277:%.*]] = sub <2 x i32> [[TMP275]], [[TMP224]]
+; CHECK-NEXT:    [[TMP278:%.*]] = shufflevector <2 x i32> [[TMP276]], <2 x i32> [[TMP277]], <2 x i32> <i32 0, i32 3>
 ; CHECK-NEXT:    [[TMP228:%.*]] = extractelement <2 x i32> [[TMP220]], i32 0
 ; CHECK-NEXT:    [[TMP229:%.*]] = extractelement <2 x i32> [[TMP220]], i32 1
 ; CHECK-NEXT:    [[ADD105_3:%.*]] = add i32 [[TMP228]], [[TMP229]]
 ; CHECK-NEXT:    [[SUB106_3:%.*]] = sub i32 [[TMP229]], [[TMP228]]
-; CHECK-NEXT:    [[ADD_I52_3:%.*]] = add i32 [[MUL_I51_3]], [[ADD105_3]]
-; CHECK-NEXT:    [[XOR_I53_3:%.*]] = xor i32 [[ADD_I52_3]], [[CONV]]
-; CHECK-NEXT:    [[TMP230:%.*]] = lshr <2 x i32> [[TMP79]], <i32 15, i32 15>
+; CHECK-NEXT:    [[ADD_I52_3:%.*]] = add i32 [[MUL_I51_5]], [[ADD105_3]]
+; CHECK-NEXT:    [[XOR_I53_3:%.*]] = xor i32 [[ADD_I52_3]], [[CONV1]]
+; CHECK-NEXT:    [[TMP230:%.*]] = lshr <2 x i32> [[TMP102]], <i32 15, i32 15>
 ; CHECK-NEXT:    [[TMP231:%.*]] = and <2 x i32> [[TMP230]], <i32 65537, i32 65537>
 ; CHECK-NEXT:    [[TMP232:%.*]] = mul <2 x i32> [[TMP231]], <i32 65535, i32 65535>
-; CHECK-NEXT:    [[TMP233:%.*]] = add <2 x i32> [[TMP232]], [[TMP227]]
-; CHECK-NEXT:    [[TMP234:%.*]] = xor <2 x i32> [[TMP233]], [[TMP79]]
+; CHECK-NEXT:    [[TMP286:%.*]] = add <2 x i32> [[TMP232]], [[TMP278]]
+; CHECK-NEXT:    [[TMP287:%.*]] = xor <2 x i32> [[TMP286]], [[TMP102]]
 ; CHECK-NEXT:    [[SHR_I59_3:%.*]] = lshr i32 [[CONV33]], 15
 ; CHECK-NEXT:    [[AND_I60_3:%.*]] = and i32 [[SHR_I59_3]], 65537
 ; CHECK-NEXT:    [[MUL_I61_3:%.*]] = mul i32 [[AND_I60_3]], 65535
 ; CHECK-NEXT:    [[ADD_I62_3:%.*]] = add i32 [[MUL_I61_3]], [[SUB106_3]]
 ; CHECK-NEXT:    [[XOR_I63_3:%.*]] = xor i32 [[ADD_I62_3]], [[CONV33]]
 ; CHECK-NEXT:    [[ADD108_3:%.*]] = add i32 [[XOR_I53_3]], [[ADD113_2]]
-; CHECK-NEXT:    [[TMP235:%.*]] = extractelement <2 x i32> [[TMP234]], i32 0
+; CHECK-NEXT:    [[TMP235:%.*]] = extractelement <2 x i32> [[TMP287]], i32 0
 ; CHECK-NEXT:    [[ADD110_3:%.*]] = add i32 [[ADD108_3]], [[TMP235]]
-; CHECK-NEXT:    [[TMP236:%.*]] = extractelement <2 x i32> [[TMP234]], i32 1
+; CHECK-NEXT:    [[TMP236:%.*]] = extractelement <2 x i32> [[TMP287]], i32 1
 ; CHECK-NEXT:    [[ADD112_3:%.*]] = add i32 [[ADD110_3]], [[TMP236]]
 ; CHECK-NEXT:    [[ADD113_3:%.*]] = add i32 [[ADD112_3]], [[XOR_I63_3]]
 ; CHECK-NEXT:    ret i32 [[ADD113_3]]
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/reductions.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/reductions.ll
index 000e7a56df3778..500f10659f04cb 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/reductions.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/reductions.ll
@@ -802,9 +802,10 @@ define i64 @red_zext_ld_4xi64(ptr %ptr) {
 ; CHECK-LABEL: @red_zext_ld_4xi64(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i8>, ptr [[PTR:%.*]], align 1
-; CHECK-NEXT:    [[TMP1:%.*]] = zext <4 x i8> [[TMP0]] to <4 x i64>
-; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP1]])
-; CHECK-NEXT:    ret i64 [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = zext <4 x i8> [[TMP0]] to <4 x i16>
+; CHECK-NEXT:    [[TMP2:%.*]] = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> [[TMP1]])
+; CHECK-NEXT:    [[TMP3:%.*]] = zext i16 [[TMP2]] to i64
+; CHECK-NEXT:    ret i64 [[TMP3]]
 ;
 entry:
   %ld0 = load i8, ptr %ptr
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/PR35777.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR35777.ll
index 4565d4928ba4ad..05511f843a68fa 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/PR35777.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/PR35777.ll
@@ -15,11 +15,12 @@ define { i64, i64 } @patatino(double %arg) {
 ; CHECK-NEXT:    [[TMP6:%.*]] = load <2 x double>, ptr getelementptr inbounds ([6 x double], ptr @global, i64 0, i64 4), align 16
 ; CHECK-NEXT:    [[TMP7:%.*]] = fadd <2 x double> [[TMP6]], [[TMP5]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = fptosi <2 x double> [[TMP7]] to <2 x i32>
-; CHECK-NEXT:    [[TMP9:%.*]] = sext <2 x i32> [[TMP8]] to <2 x i64>
-; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x i64> [[TMP9]], i32 0
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x i32> [[TMP8]], i32 0
+; CHECK-NEXT:    [[TMP10:%.*]] = sext i32 [[TMP9]] to i64
 ; CHECK-NEXT:    [[T16:%.*]] = insertvalue { i64, i64 } undef, i64 [[TMP10]], 0
-; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <2 x i64> [[TMP9]], i32 1
-; CHECK-NEXT:    [[T17:%.*]] = insertvalue { i64, i64 } [[T16]], i64 [[TMP11]], 1
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <2 x i32> [[TMP8]], i32 1
+; CHECK-NEXT:    [[TMP12:%.*]] = sext i32 [[TMP11]] to i64
+; CHECK-NEXT:    [[T17:%.*]] = insertvalue { i64, i64 } [[T16]], i64 [[TMP12]], 1
 ; CHECK-NEXT:    ret { i64, i64 } [[T17]]
 ;
 bb:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_netbsd_decompress.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_netbsd_decompress.ll
index 12b227cbeb3d3c..a153c3ceeaf5dc 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/crash_netbsd_decompress.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_netbsd_decompress.ll
@@ -25,8 +25,8 @@ define i32 @fn1() {
 ; CHECK-NEXT:    store i32 [[AND]], ptr @a, align 4
 ; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> <i32 poison, i32 0>, <2 x i32> <i32 0, i32 3>
 ; CHECK-NEXT:    switch i32 [[AND]], label [[IF_END:%.*]] [
-; CHECK-NEXT:    i32 7, label [[SAVE_STATE_AND_RETURN]]
-; CHECK-NEXT:    i32 0, label [[SAVE_STATE_AND_RETURN]]
+; CHECK-NEXT:      i32 7, label [[SAVE_STATE_AND_RETURN]]
+; CHECK-NEXT:      i32 0, label [[SAVE_STATE_AND_RETURN]]
 ; CHECK-NEXT:    ]
 ; CHECK:       if.end:
 ; CHECK-NEXT:    br label [[SAVE_STATE_AND_RETURN]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/int-bitcast-minbitwidth.ll b/llvm/test/Transforms/SLPVectorizer/X86/int-bitcast-minbitwidth.ll
index a0af8e36b36c79..5ee80160765387 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/int-bitcast-minbitwidth.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/int-bitcast-minbitwidth.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
-; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu -slp-threshold=-3 < %s | FileCheck %s
+; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu -slp-threshold=-6 < %s | FileCheck %s
 
 define void @t(i64 %v) {
 ; CHECK-LABEL: define void @t(
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-icmp-to-trunc.ll b/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-icmp-to-trunc.ll
new file mode 100644
index 00000000000000..fc28d7ab4ee746
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-icmp-to-trunc.ll
@@ -0,0 +1,75 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux -mcpu=cascadelake < %s | FileCheck %s
+
+define i1 @test(ptr noalias %0, i64 %1, ptr noalias %p, ptr %p1) {
+; CHECK-LABEL: define i1 @test(
+; CHECK-SAME: ptr noalias [[TMP0:%.*]], i64 [[TMP1:%.*]], ptr noalias [[P:%.*]], ptr [[P1:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  newFuncRoot:
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 16
+; CHECK-NEXT:    [[BF_LOAD_I1336:%.*]] = load i24, ptr [[TMP2]], align 16
+; CHECK-NEXT:    [[AND_I_I_I1342:%.*]] = and i64 [[TMP1]], -16
+; CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[AND_I_I_I1342]] to ptr
+; CHECK-NEXT:    store ptr [[TMP3]], ptr [[P]], align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[TMP3]], align 16
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i64 16
+; CHECK-NEXT:    [[BF_LOAD_I1345:%.*]] = load i24, ptr [[TMP5]], align 16
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i24> poison, i24 [[BF_LOAD_I1336]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x i24> [[TMP6]], i24 [[BF_LOAD_I1345]], i32 1
+; CHECK-NEXT:    [[TMP8:%.*]] = and <2 x i24> [[TMP7]], <i24 255, i24 255>
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq <2 x i24> [[TMP8]], <i24 24, i24 24>
+; CHECK-NEXT:    [[TMP10:%.*]] = select <2 x i1> [[TMP9]], <2 x i24> <i24 23, i24 23>, <2 x i24> [[TMP8]]
+; CHECK-NEXT:    [[TMP23:%.*]] = trunc <2 x i24> [[TMP10]] to <2 x i8>
+; CHECK-NEXT:    [[TMP11:%.*]] = zext <2 x i8> [[TMP23]] to <2 x i32>
+; CHECK-NEXT:    [[TMP12:%.*]] = and <2 x i32> [[TMP11]], <i32 254, i32 254>
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq <2 x i32> [[TMP12]], <i32 4, i32 4>
+; CHECK-NEXT:    [[TMP25:%.*]] = select <2 x i1> [[TMP13]], <2 x i8> <i8 2, i8 2>, <2 x i8> [[TMP23]]
+; CHECK-NEXT:    [[TMP14:%.*]] = zext <2 x i8> [[TMP25]] to <2 x i32>
+; CHECK-NEXT:    [[TMP15:%.*]] = icmp eq <2 x i32> [[TMP14]], <i32 32, i32 32>
+; CHECK-NEXT:    [[TMP18:%.*]] = select <2 x i1> [[TMP15]], <2 x i8> <i8 31, i8 31>, <2 x i8> [[TMP25]]
+; CHECK-NEXT:    [[TMP16:%.*]] = zext <2 x i8> [[TMP18]] to <2 x i32>
+; CHECK-NEXT:    [[TMP17:%.*]] = icmp eq <2 x i32> [[TMP16]], <i32 54, i32 54>
+; CHECK-NEXT:    [[TMP21:%.*]] = select <2 x i1> [[TMP17]], <2 x i8> <i8 53, i8 53>, <2 x i8> [[TMP18]]
+; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <2 x i8> [[TMP21]], i32 0
+; CHECK-NEXT:    [[TMP19:%.*]] = zext i8 [[TMP22]] to i32
+; CHECK-NEXT:    store i32 [[TMP19]], ptr [[P1]], align 4
+; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <2 x i8> [[TMP21]], i32 1
+; CHECK-NEXT:    [[TMP20:%.*]] = zext i8 [[TMP24]] to i32
+; CHECK-NEXT:    [[CMP210_NOT:%.*]] = icmp eq i32 [[TMP19]], [[TMP20]]
+; CHECK-NEXT:    ret i1 [[CMP210_NOT]]
+;
+newFuncRoot:
+  %2 = getelementptr inbounds i8, ptr %0, i64 16
+  %bf.load.i1336 = load i24, ptr %2, align 16
+  %bf.clear.i1337 = and i24 %bf.load.i1336, 255
+  %and.i.i.i1342 = and i64 %1, -16
+  %3 = inttoptr i64 %and.i.i.i1342 to ptr
+  store ptr %3, ptr %p, align 8
+  %4 = load ptr, ptr %3, align 16
+  %5 = getelementptr inbounds i8, ptr %4, i64 16
+  %bf.load.i1345 = load i24, ptr %5, align 16
+  %bf.clear.i1346 = and i24 %bf.load.i1345, 255
+  %cmp182 = icmp eq i24 %bf.clear.i1337, 24
+  %narrow = select i1 %cmp182, i24 23, i24 %bf.clear.i1337
+  %s = zext nneg i24 %narrow to i32
+  %cmp185 = icmp eq i24 %bf.clear.i1346, 24
+  %narrow1790 = select i1 %cmp185, i24 23, i24 %bf.clear.i1346
+  %s1139 = zext nneg i24 %narrow1790 to i32
+  %6 = and i32 %s, 254
+  %or.cond1132 = icmp eq i32 %6, 4
+  %s1142 = select i1 %or.cond1132, i32 2, i32 %s
+  %7 = and i32 %s1139, 254
+  %or.cond1133 = icmp eq i32 %7, 4
+  %s1140 = select i1 %or.cond1133, i32 2, i32 %s1139
+  %cmp198 = icmp eq i32 %s1142, 32
+  %s1134 = select i1 %cmp198, i32 31, i32 %s1142
+  %cmp201 = icmp eq i32 %s1140, 32
+  %s1143 = select i1 %cmp201, i32 31, i32 %s1140
+  %cmp204 = icmp eq i32 %s1134, 54
+  %s1135 = select i1 %cmp204, i32 53, i32 %s1134
+  store i32 %s1135, ptr %p1, align 4
+  %cmp207 = icmp eq i32 %s1143, 54
+  %s1141 = select i1 %cmp207, i32 53, i32 %s1143
+  %cmp210.not = icmp eq i32 %s1135, %s1141
+  ret i1 %cmp210.not
+}
+
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-multiuse-with-insertelement.ll b/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-multiuse-with-insertelement.ll
index 6e512fcbb73924..6051638562b59b 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-multiuse-with-insertelement.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-multiuse-with-insertelement.ll
@@ -6,18 +6,17 @@ define void @test(i8 %0) {
 ; CHECK-SAME: i8 [[TMP0:%.*]]) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i8> <i8 0, i8 poison>, i8 [[TMP0]], i32 1
-; CHECK-NEXT:    [[TMP2:%.*]] = sext <2 x i8> [[TMP1]] to <2 x i16>
-; CHECK-NEXT:    [[TMP3:%.*]] = sext <2 x i16> [[TMP2]] to <2 x i32>
-; CHECK-NEXT:    [[TMP4:%.*]] = mul <2 x i16> [[TMP2]], zeroinitializer
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i16> [[TMP4]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP5]] to i32
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i16> [[TMP4]], i32 1
-; CHECK-NEXT:    [[TMP8:%.*]] = zext i16 [[TMP7]] to i32
-; CHECK-NEXT:    [[ADD:%.*]] = or i32 [[TMP6]], [[TMP8]]
+; CHECK-NEXT:    [[TMP2:%.*]] = sext <2 x i8> [[TMP1]] to <2 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = mul <2 x i8> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i8> [[TMP3]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = zext i8 [[TMP4]] to i32
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i8> [[TMP3]], i32 1
+; CHECK-NEXT:    [[TMP7:%.*]] = zext i8 [[TMP6]] to i32
+; CHECK-NEXT:    [[ADD:%.*]] = or i32 [[TMP5]], [[TMP7]]
 ; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[ADD]], 1
 ; CHECK-NEXT:    [[CONV9:%.*]] = trunc i32 [[SHR]] to i8
 ; CHECK-NEXT:    store i8 [[CONV9]], ptr null, align 1
-; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    ret void
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-transformed-operand.ll b/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-transformed-operand.ll
index 2c834616becc0d..4acd63078b82ef 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-transformed-operand.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-transformed-operand.ll
@@ -6,15 +6,20 @@ define void @test(i64 %d.promoted.i) {
 ; CHECK-SAME: i64 [[D_PROMOTED_I:%.*]]) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[AND_1_I:%.*]] = and i64 0, [[D_PROMOTED_I]]
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <8 x i64> <i64 0, i64 poison, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0>, i64 [[AND_1_I]], i32 1
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc <8 x i64> [[TMP0]] to <8 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = mul <8 x i1> [[TMP1]], zeroinitializer
 ; CHECK-NEXT:    [[AND_1_I_1:%.*]] = and i64 0, 0
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <16 x i64> <i64 0, i64 poison, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 poison, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0>, i64 [[AND_1_I_1]], i32 1
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <16 x i64> [[TMP0]], i64 [[AND_1_I]], i32 9
-; CHECK-NEXT:    [[TMP2:%.*]] = trunc <16 x i64> [[TMP1]] to <16 x i1>
-; CHECK-NEXT:    [[TMP3:%.*]] = mul <16 x i1> [[TMP2]], zeroinitializer
-; CHECK-NEXT:    [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> [[TMP3]])
-; CHECK-NEXT:    [[TMP5:%.*]] = zext i1 [[TMP4]] to i32
-; CHECK-NEXT:    [[TMP6:%.*]] = and i32 [[TMP5]], 0
-; CHECK-NEXT:    store i32 [[TMP6]], ptr null, align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <8 x i64> <i64 0, i64 poison, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0>, i64 [[AND_1_I_1]], i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = trunc <8 x i64> [[TMP3]] to <8 x i1>
+; CHECK-NEXT:    [[TMP5:%.*]] = mul <8 x i1> [[TMP4]], zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> [[TMP5]])
+; CHECK-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP6]] to i32
+; CHECK-NEXT:    [[TMP8:%.*]] = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> [[TMP2]])
+; CHECK-NEXT:    [[TMP9:%.*]] = zext i1 [[TMP8]] to i32
+; CHECK-NEXT:    [[OP_RDX:%.*]] = or i32 [[TMP7]], [[TMP9]]
+; CHECK-NEXT:    [[TMP10:%.*]] = and i32 [[OP_RDX]], 0
+; CHECK-NEXT:    store i32 [[TMP10]], ptr null, align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-user-not-min.ll b/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-user-not-min.ll
new file mode 100644
index 00000000000000..50b19d01ad58f1
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-user-not-min.ll
@@ -0,0 +1,49 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s
+
+define void @test(ptr %block, ptr noalias %pixels, i1 %b) {
+; CHECK-LABEL: define void @test(
+; CHECK-SAME: ptr [[BLOCK:%.*]], ptr noalias [[PIXELS:%.*]], i1 [[B:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x i1> <i1 true, i1 poison, i1 false, i1 false>, i1 [[B]], i32 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i16>, ptr [[BLOCK]], align 2
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ult <4 x i16> [[TMP2]], zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = trunc <4 x i16> [[TMP2]] to <4 x i8>
+; CHECK-NEXT:    [[TMP1:%.*]] = sext <4 x i1> [[TMP0]] to <4 x i8>
+; CHECK-NEXT:    [[TMP5:%.*]] = select <4 x i1> [[TMP3]], <4 x i8> [[TMP4]], <4 x i8> [[TMP1]]
+; CHECK-NEXT:    store <4 x i8> [[TMP5]], ptr [[PIXELS]], align 1
+; CHECK-NEXT:    ret void
+;
+entry:
+  %0 = load i16, ptr %block, align 2
+  %tobool.not.i78 = icmp ult i16 %0, 0
+  %conv.i80 = sext i1 true to i8
+  %conv1.i81 = trunc i16 %0 to i8
+  %retval.0.i82 = select i1 %tobool.not.i78, i8 %conv1.i81, i8 %conv.i80
+  store i8 %retval.0.i82, ptr %pixels, align 1
+  %arrayidx2 = getelementptr i8, ptr %block, i64 2
+  %1 = load i16, ptr %arrayidx2, align 2
+  %tobool.not.i73 = icmp ult i16 %1, 0
+  %conv.i75 = sext i1 %b to i8
+  %conv1.i76 = trunc i16 %1 to i8
+  %retval.0.i77 = select i1 %tobool.not.i73, i8 %conv1.i76, i8 %conv.i75
+  %arrayidx5 = getelementptr i8, ptr %pixels, i64 1
+  store i8 %retval.0.i77, ptr %arrayidx5, align 1
+  %arrayidx6 = getelementptr i8, ptr %block, i64 4
+  %2 = load i16, ptr %arrayidx6, align 2
+  %tobool.not.i68 = icmp ult i16 %2, 0
+  %conv.i70 = sext i1 false to i8
+  %conv1.i71 = trunc i16 %2 to i8
+  %retval.0.i72 = select i1 %tobool.not.i68, i8 %conv1.i71, i8 %conv.i70
+  %arrayidx9 = getelementptr i8, ptr %pixels, i64 2
+  store i8 %retval.0.i72, ptr %arrayidx9, align 1
+  %arrayidx10 = getelementptr i8, ptr %block, i64 6
+  %3 = load i16, ptr %arrayidx10, align 2
+  %tobool.not.i63 = icmp ult i16 %3, 0
+  %conv.i65 = sext i1 false to i8
+  %conv1.i66 = trunc i16 %3 to i8
+  %retval.0.i67 = select i1 %tobool.not.i63, i8 %conv1.i66, i8 %conv.i65
+  %arrayidx13 = getelementptr i8, ptr %pixels, i64 3
+  store i8 %retval.0.i67, ptr %arrayidx13, align 1
+  ret void
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/minimum-sizes.ll b/llvm/test/Transforms/SLPVectorizer/X86/minimum-sizes.ll
index 651631de2c35ad..a316415dcc6b52 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/minimum-sizes.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/minimum-sizes.ll
@@ -17,12 +17,15 @@ target triple = "x86_64-unknown-linux-gnu"
 define i8 @PR31243_zext(i8 %v0, i8 %v1, i8 %v2, i8 %v3, ptr %ptr) {
 ; SSE-LABEL: @PR31243_zext(
 ; SSE-NEXT:  entry:
-; SSE-NEXT:    [[TMP0:%.*]] = or i8 [[V0:%.*]], 1
-; SSE-NEXT:    [[TMP1:%.*]] = or i8 [[V1:%.*]], 1
-; SSE-NEXT:    [[TMP2:%.*]] = zext i8 [[TMP0]] to i64
-; SSE-NEXT:    [[T4:%.*]] = getelementptr inbounds i8, ptr [[PTR:%.*]], i64 [[TMP2]]
-; SSE-NEXT:    [[TMP3:%.*]] = zext i8 [[TMP1]] to i64
-; SSE-NEXT:    [[T5:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 [[TMP3]]
+; SSE-NEXT:    [[TMP0:%.*]] = insertelement <2 x i8> poison, i8 [[V0:%.*]], i64 0
+; SSE-NEXT:    [[TMP1:%.*]] = insertelement <2 x i8> [[TMP0]], i8 [[V1:%.*]], i64 1
+; SSE-NEXT:    [[TMP2:%.*]] = or <2 x i8> [[TMP1]], <i8 1, i8 1>
+; SSE-NEXT:    [[TMP3:%.*]] = extractelement <2 x i8> [[TMP2]], i64 0
+; SSE-NEXT:    [[TMP4:%.*]] = zext i8 [[TMP3]] to i64
+; SSE-NEXT:    [[T4:%.*]] = getelementptr inbounds i8, ptr [[PTR:%.*]], i64 [[TMP4]]
+; SSE-NEXT:    [[TMP5:%.*]] = extractelement <2 x i8> [[TMP2]], i64 1
+; SSE-NEXT:    [[TMP6:%.*]] = zext i8 [[TMP5]] to i64
+; SSE-NEXT:    [[T5:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 [[TMP6]]
 ; SSE-NEXT:    [[T6:%.*]] = load i8, ptr [[T4]], align 1
 ; SSE-NEXT:    [[T7:%.*]] = load i8, ptr [[T5]], align 1
 ; SSE-NEXT:    [[T8:%.*]] = add i8 [[T6]], [[T7]]
@@ -73,12 +76,15 @@ entry:
 define i8 @PR31243_sext(i8 %v0, i8 %v1, i8 %v2, i8 %v3, ptr %ptr) {
 ; SSE-LABEL: @PR31243_sext(
 ; SSE-NEXT:  entry:
-; SSE-NEXT:    [[TMP0:%.*]] = or i8 [[V0:%.*]], 1
-; SSE-NEXT:    [[TMP1:%.*]] = or i8 [[V1:%.*]], 1
-; SSE-NEXT:    [[TMP2:%.*]] = sext i8 [[TMP0]] to i64
-; SSE-NEXT:    [[T4:%.*]] = getelementptr inbounds i8, ptr [[PTR:%.*]], i64 [[TMP2]]
-; SSE-NEXT:    [[TMP3:%.*]] = sext i8 [[TMP1]] to i64
-; SSE-NEXT:    [[T5:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 [[TMP3]]
+; SSE-NEXT:    [[TMP0:%.*]] = insertelement <2 x i8> poison, i8 [[V0:%.*]], i64 0
+; SSE-NEXT:    [[TMP1:%.*]] = insertelement <2 x i8> [[TMP0]], i8 [[V1:%.*]], i64 1
+; SSE-NEXT:    [[TMP2:%.*]] = or <2 x i8> [[TMP1]], <i8 1, i8 1>
+; SSE-NEXT:    [[TMP3:%.*]] = extractelement <2 x i8> [[TMP2]], i64 0
+; SSE-NEXT:    [[TMP4:%.*]] = sext i8 [[TMP3]] to i64
+; SSE-NEXT:    [[T4:%.*]] = getelementptr inbounds i8, ptr [[PTR:%.*]], i64 [[TMP4]]
+; SSE-NEXT:    [[TMP5:%.*]] = extractelement <2 x i8> [[TMP2]], i64 1
+; SSE-NEXT:    [[TMP6:%.*]] = sext i8 [[TMP5]] to i64
+; SSE-NEXT:    [[T5:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 [[TMP6]]
 ; SSE-NEXT:    [[T6:%.*]] = load i8, ptr [[T4]], align 1
 ; SSE-NEXT:    [[T7:%.*]] = load i8, ptr [[T5]], align 1
 ; SSE-NEXT:    [[T8:%.*]] = add i8 [[T6]], [[T7]]
@@ -89,13 +95,12 @@ define i8 @PR31243_sext(i8 %v0, i8 %v1, i8 %v2, i8 %v3, ptr %ptr) {
 ; AVX-NEXT:    [[TMP0:%.*]] = insertelement <2 x i8> poison, i8 [[V0:%.*]], i64 0
 ; AVX-NEXT:    [[TMP1:%.*]] = insertelement <2 x i8> [[TMP0]], i8 [[V1:%.*]], i64 1
 ; AVX-NEXT:    [[TMP2:%.*]] = or <2 x i8> [[TMP1]], <i8 1, i8 1>
-; AVX-NEXT:    [[TMP3:%.*]] = sext <2 x i8> [[TMP2]] to <2 x i16>
-; AVX-NEXT:    [[TMP4:%.*]] = extractelement <2 x i16> [[TMP3]], i64 0
-; AVX-NEXT:    [[TMP5:%.*]] = sext i16 [[TMP4]] to i64
-; AVX-NEXT:    [[T4:%.*]] = getelementptr inbounds i8, ptr [[PTR:%.*]], i64 [[TMP5]]
-; AVX-NEXT:    [[TMP6:%.*]] = extractelement <2 x i16> [[TMP3]], i64 1
-; AVX-NEXT:    [[TMP7:%.*]] = sext i16 [[TMP6]] to i64
-; AVX-NEXT:    [[T5:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 [[TMP7]]
+; AVX-NEXT:    [[TMP3:%.*]] = extractelement <2 x i8> [[TMP2]], i64 0
+; AVX-NEXT:    [[TMP4:%.*]] = sext i8 [[TMP3]] to i64
+; AVX-NEXT:    [[T4:%.*]] = getelementptr inbounds i8, ptr [[PTR:%.*]], i64 [[TMP4]]
+; AVX-NEXT:    [[TMP5:%.*]] = extractelement <2 x i8> [[TMP2]], i64 1
+; AVX-NEXT:    [[TMP6:%.*]] = sext i8 [[TMP5]] to i64
+; AVX-NEXT:    [[T5:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 [[TMP6]]
 ; AVX-NEXT:    [[T6:%.*]] = load i8, ptr [[T4]], align 1
 ; AVX-NEXT:    [[T7:%.*]] = load i8, ptr [[T5]], align 1
 ; AVX-NEXT:    [[T8:%.*]] = add i8 [[T6]], [[T7]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/phi-undef-input.ll b/llvm/test/Transforms/SLPVectorizer/X86/phi-undef-input.ll
index 88f75c37846efc..3cc32c1fc7b28e 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/phi-undef-input.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/phi-undef-input.ll
@@ -15,8 +15,8 @@ define i32 @phi3UndefInput(i1 %cond, i8 %arg0, i8 %arg1, i8 %arg2, i8 %arg3) {
 ; CHECK-NEXT:    br label [[BB3]]
 ; CHECK:       bb3:
 ; CHECK-NEXT:    [[TMP4:%.*]] = phi <4 x i8> [ [[TMP3]], [[BB2]] ], [ <i8 0, i8 undef, i8 undef, i8 undef>, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[TMP5:%.*]] = zext <4 x i8> [[TMP4]] to <4 x i32>
-; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP5]])
+; CHECK-NEXT:    [[TMP5:%.*]] = call i8 @llvm.vector.reduce.or.v4i8(<4 x i8> [[TMP4]])
+; CHECK-NEXT:    [[TMP6:%.*]] = zext i8 [[TMP5]] to i32
 ; CHECK-NEXT:    ret i32 [[TMP6]]
 ;
 entry:
@@ -52,8 +52,8 @@ define i32 @phi2UndefInput(i1 %cond, i8 %arg0, i8 %arg1, i8 %arg2, i8 %arg3) {
 ; CHECK-NEXT:    br label [[BB3]]
 ; CHECK:       bb3:
 ; CHECK-NEXT:    [[TMP4:%.*]] = phi <4 x i8> [ [[TMP3]], [[BB2]] ], [ <i8 0, i8 0, i8 undef, i8 undef>, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[TMP5:%.*]] = zext <4 x i8> [[TMP4]] to <4 x i32>
-; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP5]])
+; CHECK-NEXT:    [[TMP5:%.*]] = call i8 @llvm.vector.reduce.or.v4i8(<4 x i8> [[TMP4]])
+; CHECK-NEXT:    [[TMP6:%.*]] = zext i8 [[TMP5]] to i32
 ; CHECK-NEXT:    ret i32 [[TMP6]]
 ;
 entry:
@@ -89,8 +89,8 @@ define i32 @phi1UndefInput(i1 %cond, i8 %arg0, i8 %arg1, i8 %arg2, i8 %arg3) {
 ; CHECK-NEXT:    br label [[BB3]]
 ; CHECK:       bb3:
 ; CHECK-NEXT:    [[TMP4:%.*]] = phi <4 x i8> [ [[TMP3]], [[BB2]] ], [ <i8 0, i8 0, i8 0, i8 undef>, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[TMP5:%.*]] = zext <4 x i8> [[TMP4]] to <4 x i32>
-; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP5]])
+; CHECK-NEXT:    [[TMP5:%.*]] = call i8 @llvm.vector.reduce.or.v4i8(<4 x i8> [[TMP4]])
+; CHECK-NEXT:    [[TMP6:%.*]] = zext i8 [[TMP5]] to i32
 ; CHECK-NEXT:    ret i32 [[TMP6]]
 ;
 entry:
@@ -127,8 +127,8 @@ define i32 @phi1Undef1PoisonInput(i1 %cond, i8 %arg0, i8 %arg1, i8 %arg2, i8 %ar
 ; CHECK-NEXT:    br label [[BB3]]
 ; CHECK:       bb3:
 ; CHECK-NEXT:    [[TMP4:%.*]] = phi <4 x i8> [ [[TMP3]], [[BB2]] ], [ <i8 0, i8 0, i8 poison, i8 undef>, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[TMP5:%.*]] = zext <4 x i8> [[TMP4]] to <4 x i32>
-; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP5]])
+; CHECK-NEXT:    [[TMP5:%.*]] = call i8 @llvm.vector.reduce.or.v4i8(<4 x i8> [[TMP4]])
+; CHECK-NEXT:    [[TMP6:%.*]] = zext i8 [[TMP5]] to i32
 ; CHECK-NEXT:    ret i32 [[TMP6]]
 ;
 entry:
@@ -165,8 +165,8 @@ define i32 @phi1Undef2PoisonInputs(i1 %cond, i8 %arg0, i8 %arg1, i8 %arg2, i8 %a
 ; CHECK-NEXT:    br label [[BB3]]
 ; CHECK:       bb3:
 ; CHECK-NEXT:    [[TMP4:%.*]] = phi <4 x i8> [ [[TMP3]], [[BB2]] ], [ <i8 0, i8 poison, i8 poison, i8 undef>, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[TMP5:%.*]] = zext <4 x i8> [[TMP4]] to <4 x i32>
-; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP5]])
+; CHECK-NEXT:    [[TMP5:%.*]] = call i8 @llvm.vector.reduce.or.v4i8(<4 x i8> [[TMP4]])
+; CHECK-NEXT:    [[TMP6:%.*]] = zext i8 [[TMP5]] to i32
 ; CHECK-NEXT:    ret i32 [[TMP6]]
 ;
 entry:
@@ -202,8 +202,8 @@ define i32 @phi1Undef1PoisonGapInput(i1 %cond, i8 %arg0, i8 %arg1, i8 %arg2, i8
 ; CHECK-NEXT:    br label [[BB3]]
 ; CHECK:       bb3:
 ; CHECK-NEXT:    [[TMP4:%.*]] = phi <4 x i8> [ [[TMP3]], [[BB2]] ], [ <i8 0, i8 0, i8 poison, i8 undef>, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[TMP5:%.*]] = zext <4 x i8> [[TMP4]] to <4 x i32>
-; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP5]])
+; CHECK-NEXT:    [[TMP5:%.*]] = call i8 @llvm.vector.reduce.or.v4i8(<4 x i8> [[TMP4]])
+; CHECK-NEXT:    [[TMP6:%.*]] = zext i8 [[TMP5]] to i32
 ; CHECK-NEXT:    ret i32 [[TMP6]]
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder-possible-strided-node.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder-possible-strided-node.ll
index 6f5d3d3785e0c8..f6db831ed97b19 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reorder-possible-strided-node.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder-possible-strided-node.ll
@@ -108,3 +108,111 @@ entry:
   store i32 %conv27, ptr getelementptr inbounds ([4 x i32], ptr null, i64 8, i64 3), align 4
   ret void
 }
+
+define void @test_div() {
+; CHECK-LABEL: define void @test_div(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ARRAYIDX22:%.*]] = getelementptr i32, ptr null, i64 60
+; CHECK-NEXT:    [[TMP0:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> getelementptr (i32, <4 x ptr> zeroinitializer, <4 x i64> <i64 1, i64 33, i64 7, i64 0>), i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> poison)
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX22]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[TMP3:%.*]] = mul <4 x i32> [[TMP2]], [[TMP0]]
+; CHECK-NEXT:    [[TMP4:%.*]] = zext <4 x i32> [[TMP3]] to <4 x i64>
+; CHECK-NEXT:    [[TMP5:%.*]] = udiv <4 x i64> [[TMP4]], <i64 1, i64 2, i64 1, i64 2>
+; CHECK-NEXT:    [[TMP6:%.*]] = trunc <4 x i64> [[TMP5]] to <4 x i32>
+; CHECK-NEXT:    store <4 x i32> [[TMP6]], ptr getelementptr inbounds ([4 x i32], ptr null, i64 8, i64 0), align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  %arrayidx1 = getelementptr i32, ptr null, i64 1
+  %0 = load i32, ptr %arrayidx1, align 4
+  %arrayidx2 = getelementptr i32, ptr null, i64 63
+  %1 = load i32, ptr %arrayidx2, align 4
+  %mul = mul i32 %1, %0
+  %conv = zext i32 %mul to i64
+  %shr = udiv i64 %conv, 1
+  %conv3 = trunc i64 %shr to i32
+  store i32 %conv3, ptr getelementptr inbounds ([4 x i32], ptr null, i64 8, i64 0), align 16
+  %arrayidx5 = getelementptr i32, ptr null, i64 33
+  %2 = load i32, ptr %arrayidx5, align 4
+  %arrayidx6 = getelementptr i32, ptr null, i64 62
+  %3 = load i32, ptr %arrayidx6, align 4
+  %mul7 = mul i32 %3, %2
+  %conv8 = zext i32 %mul7 to i64
+  %shr10 = udiv i64 %conv8, 2
+  %conv11 = trunc i64 %shr10 to i32
+  store i32 %conv11, ptr getelementptr inbounds ([4 x i32], ptr null, i64 8, i64 1), align 4
+  %arrayidx13 = getelementptr i32, ptr null, i64 7
+  %4 = load i32, ptr %arrayidx13, align 4
+  %arrayidx14 = getelementptr i32, ptr null, i64 61
+  %5 = load i32, ptr %arrayidx14, align 4
+  %mul15 = mul i32 %5, %4
+  %conv16 = zext i32 %mul15 to i64
+  %shr18 = udiv i64 %conv16, 1
+  %conv19 = trunc i64 %shr18 to i32
+  store i32 %conv19, ptr getelementptr inbounds ([4 x i32], ptr null, i64 8, i64 2), align 8
+  %6 = load i32, ptr null, align 4
+  %arrayidx22 = getelementptr i32, ptr null, i64 60
+  %7 = load i32, ptr %arrayidx22, align 4
+  %mul23 = mul i32 %7, %6
+  %conv24 = zext i32 %mul23 to i64
+  %shr26 = udiv i64 %conv24, 2
+  %conv27 = trunc i64 %shr26 to i32
+  store i32 %conv27, ptr getelementptr inbounds ([4 x i32], ptr null, i64 8, i64 3), align 4
+  ret void
+}
+
+define void @test_rem() {
+; CHECK-LABEL: define void @test_rem(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ARRAYIDX22:%.*]] = getelementptr i32, ptr null, i64 60
+; CHECK-NEXT:    [[TMP0:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> getelementptr (i32, <4 x ptr> zeroinitializer, <4 x i64> <i64 1, i64 33, i64 7, i64 0>), i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> poison)
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX22]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[TMP3:%.*]] = mul <4 x i32> [[TMP2]], [[TMP0]]
+; CHECK-NEXT:    [[TMP4:%.*]] = zext <4 x i32> [[TMP3]] to <4 x i64>
+; CHECK-NEXT:    [[TMP5:%.*]] = urem <4 x i64> [[TMP4]], <i64 1, i64 2, i64 1, i64 1>
+; CHECK-NEXT:    [[TMP6:%.*]] = trunc <4 x i64> [[TMP5]] to <4 x i32>
+; CHECK-NEXT:    store <4 x i32> [[TMP6]], ptr getelementptr inbounds ([4 x i32], ptr null, i64 8, i64 0), align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  %arrayidx1 = getelementptr i32, ptr null, i64 1
+  %0 = load i32, ptr %arrayidx1, align 4
+  %arrayidx2 = getelementptr i32, ptr null, i64 63
+  %1 = load i32, ptr %arrayidx2, align 4
+  %mul = mul i32 %1, %0
+  %conv = zext i32 %mul to i64
+  %shr = urem i64 %conv, 1
+  %conv3 = trunc i64 %shr to i32
+  store i32 %conv3, ptr getelementptr inbounds ([4 x i32], ptr null, i64 8, i64 0), align 16
+  %arrayidx5 = getelementptr i32, ptr null, i64 33
+  %2 = load i32, ptr %arrayidx5, align 4
+  %arrayidx6 = getelementptr i32, ptr null, i64 62
+  %3 = load i32, ptr %arrayidx6, align 4
+  %mul7 = mul i32 %3, %2
+  %conv8 = zext i32 %mul7 to i64
+  %shr10 = urem i64 %conv8, 2
+  %conv11 = trunc i64 %shr10 to i32
+  store i32 %conv11, ptr getelementptr inbounds ([4 x i32], ptr null, i64 8, i64 1), align 4
+  %arrayidx13 = getelementptr i32, ptr null, i64 7
+  %4 = load i32, ptr %arrayidx13, align 4
+  %arrayidx14 = getelementptr i32, ptr null, i64 61
+  %5 = load i32, ptr %arrayidx14, align 4
+  %mul15 = mul i32 %5, %4
+  %conv16 = zext i32 %mul15 to i64
+  %shr18 = urem i64 %conv16, 1
+  %conv19 = trunc i64 %shr18 to i32
+  store i32 %conv19, ptr getelementptr inbounds ([4 x i32], ptr null, i64 8, i64 2), align 8
+  %6 = load i32, ptr null, align 4
+  %arrayidx22 = getelementptr i32, ptr null, i64 60
+  %7 = load i32, ptr %arrayidx22, align 4
+  %mul23 = mul i32 %7, %6
+  %conv24 = zext i32 %mul23 to i64
+  %shr26 = urem i64 %conv24, 1
+  %conv27 = trunc i64 %shr26 to i32
+  store i32 %conv27, ptr getelementptr inbounds ([4 x i32], ptr null, i64 8, i64 3), align 4
+  ret void
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder_phi.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder_phi.ll
index 65229d8acd889e..f0e734d8c5aefe 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reorder_phi.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder_phi.ll
@@ -10,29 +10,29 @@ define  void @foo (ptr %A, ptr %B, ptr %Result) {
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[TMP1:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[TMP18:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[TMP2:%.*]] = phi <2 x float> [ zeroinitializer, [[ENTRY]] ], [ [[TMP17:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = phi <2 x float> [ zeroinitializer, [[ENTRY]] ], [ [[TMP20:%.*]], [[LOOP]] ]
 ; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_COMPLEX:%.*]], ptr [[A:%.*]], i64 [[TMP1]], i32 0
 ; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_COMPLEX]], ptr [[B:%.*]], i64 [[TMP1]], i32 0
 ; CHECK-NEXT:    [[TMP5:%.*]] = load float, ptr [[TMP4]], align 4
 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_COMPLEX]], ptr [[B]], i64 [[TMP1]], i32 1
 ; CHECK-NEXT:    [[TMP7:%.*]] = load float, ptr [[TMP6]], align 4
-; CHECK-NEXT:    [[TMP9:%.*]] = load <2 x float>, ptr [[TMP3]], align 4
-; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <2 x float> poison, float [[TMP5]], i32 0
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP10]], <2 x float> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP11:%.*]] = fmul <2 x float> [[TMP9]], [[SHUFFLE]]
+; CHECK-NEXT:    [[TMP8:%.*]] = load <2 x float>, ptr [[TMP3]], align 4
+; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <2 x float> poison, float [[TMP5]], i32 0
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP11:%.*]] = fmul <2 x float> [[TMP8]], [[TMP10]]
 ; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <2 x float> poison, float [[TMP7]], i32 0
-; CHECK-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <2 x float> [[TMP12]], <2 x float> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP13:%.*]] = fmul <2 x float> [[TMP9]], [[SHUFFLE1]]
-; CHECK-NEXT:    [[SHUFFLE2:%.*]] = shufflevector <2 x float> [[TMP13]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT:    [[TMP14:%.*]] = fsub <2 x float> [[TMP11]], [[SHUFFLE2]]
-; CHECK-NEXT:    [[TMP15:%.*]] = fadd <2 x float> [[TMP11]], [[SHUFFLE2]]
-; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <2 x float> [[TMP14]], <2 x float> [[TMP15]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    [[TMP17]] = fadd <2 x float> [[TMP2]], [[TMP16]]
+; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <2 x float> [[TMP12]], <2 x float> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP14:%.*]] = fmul <2 x float> [[TMP8]], [[TMP13]]
+; CHECK-NEXT:    [[TMP15:%.*]] = shufflevector <2 x float> [[TMP14]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:    [[TMP16:%.*]] = fsub <2 x float> [[TMP11]], [[TMP15]]
+; CHECK-NEXT:    [[TMP17:%.*]] = fadd <2 x float> [[TMP11]], [[TMP15]]
+; CHECK-NEXT:    [[TMP21:%.*]] = shufflevector <2 x float> [[TMP16]], <2 x float> [[TMP17]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP20]] = fadd <2 x float> [[TMP2]], [[TMP21]]
 ; CHECK-NEXT:    [[TMP18]] = add nuw nsw i64 [[TMP1]], 1
 ; CHECK-NEXT:    [[TMP19:%.*]] = icmp eq i64 [[TMP18]], [[TMP0]]
 ; CHECK-NEXT:    br i1 [[TMP19]], label [[EXIT:%.*]], label [[LOOP]]
 ; CHECK:       exit:
-; CHECK-NEXT:    store <2 x float> [[TMP17]], ptr [[RESULT:%.*]], align 4
+; CHECK-NEXT:    store <2 x float> [[TMP20]], ptr [[RESULT:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/resched.ll b/llvm/test/Transforms/SLPVectorizer/X86/resched.ll
index 78c6d9516a3dec..b7237cbb02bb32 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/resched.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/resched.ll
@@ -11,26 +11,26 @@ define fastcc void @_ZN12_GLOBAL__N_127PolynomialMultiplyRecognize9recognizeEv()
 ; CHECK:       if.then22.i:
 ; CHECK-NEXT:    [[SUB_I:%.*]] = add nsw i32 undef, -1
 ; CHECK-NEXT:    [[CONV31_I:%.*]] = and i32 undef, [[SUB_I]]
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[CONV31_I]], i32 0
-; CHECK-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP2:%.*]] = lshr <4 x i32> [[SHUFFLE1]], <i32 1, i32 2, i32 3, i32 4>
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[CONV31_I]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = lshr <4 x i32> [[TMP1]], <i32 1, i32 2, i32 3, i32 4>
 ; CHECK-NEXT:    [[SHR_4_I_I:%.*]] = lshr i32 [[CONV31_I]], 5
 ; CHECK-NEXT:    [[SHR_5_I_I:%.*]] = lshr i32 [[CONV31_I]], 6
 ; CHECK-NEXT:    [[SHR_6_I_I:%.*]] = lshr i32 [[CONV31_I]], 7
 ; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <8 x i32> poison, i32 [[CONV31_I]], i32 0
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP4:%.*]] = lshr <8 x i32> [[SHUFFLE]], <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <16 x i32> poison, i32 [[SUB_I]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <16 x i32> [[TMP5]], <16 x i32> [[TMP6]], <16 x i32> <i32 0, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <16 x i32> [[TMP7]], i32 [[SHR_4_I_I]], i32 5
-; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <16 x i32> [[TMP8]], i32 [[SHR_5_I_I]], i32 6
-; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <16 x i32> [[TMP9]], i32 [[SHR_6_I_I]], i32 7
-; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <16 x i32> [[TMP10]], <16 x i32> [[TMP11]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
-; CHECK-NEXT:    [[TMP13:%.*]] = trunc <16 x i32> [[TMP12]] to <16 x i8>
-; CHECK-NEXT:    [[TMP14:%.*]] = and <16 x i8> [[TMP13]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
-; CHECK-NEXT:    store <16 x i8> [[TMP14]], ptr undef, align 1
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = lshr <8 x i32> [[TMP4]], <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <16 x i32> poison, i32 [[SUB_I]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <16 x i32> [[TMP6]], <16 x i32> [[TMP7]], <16 x i32> <i32 0, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <16 x i32> [[TMP8]], i32 [[SHR_4_I_I]], i32 5
+; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <16 x i32> [[TMP9]], i32 [[SHR_5_I_I]], i32 6
+; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <16 x i32> [[TMP10]], i32 [[SHR_6_I_I]], i32 7
+; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <16 x i32> [[TMP11]], <16 x i32> [[TMP12]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; CHECK-NEXT:    [[TMP14:%.*]] = trunc <16 x i32> [[TMP13]] to <16 x i8>
+; CHECK-NEXT:    [[TMP15:%.*]] = and <16 x i8> [[TMP14]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+; CHECK-NEXT:    store <16 x i8> [[TMP15]], ptr undef, align 1
 ; CHECK-NEXT:    unreachable
 ; CHECK:       if.end50.i:
 ; CHECK-NEXT:    ret void
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reused-reductions-with-minbitwidth.ll b/llvm/test/Transforms/SLPVectorizer/X86/reused-reductions-with-minbitwidth.ll
index 5d22b5a4873be3..1d1fcec2a7aeba 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reused-reductions-with-minbitwidth.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reused-reductions-with-minbitwidth.ll
@@ -7,12 +7,10 @@ define i1 @test(i1 %cmp5.not.31) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x i1> <i1 poison, i1 false, i1 false, i1 false>, i1 [[CMP5_NOT_31]], i32 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = select <4 x i1> [[TMP0]], <4 x i32> zeroinitializer, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP2:%.*]] = trunc <4 x i32> [[TMP1]] to <4 x i1>
-; CHECK-NEXT:    [[TMP3:%.*]] = zext <4 x i1> [[TMP2]] to <4 x i32>
-; CHECK-NEXT:    [[TMP4:%.*]] = mul <4 x i32> [[TMP3]], <i32 2, i32 1, i32 1, i32 1>
-; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP4]])
-; CHECK-NEXT:    [[TMP6:%.*]] = and i32 [[TMP5]], 0
-; CHECK-NEXT:    [[CMP_NOT_I_I:%.*]] = icmp eq i32 [[TMP6]], 0
+; CHECK-NEXT:    [[TMP2:%.*]] = mul <4 x i32> [[TMP1]], <i32 2, i32 1, i32 1, i32 1>
+; CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP2]])
+; CHECK-NEXT:    [[TMP4:%.*]] = and i32 [[TMP3]], 0
+; CHECK-NEXT:    [[CMP_NOT_I_I:%.*]] = icmp eq i32 [[TMP4]], 0
 ; CHECK-NEXT:    ret i1 [[CMP_NOT_I_I]]
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/same-scalar-in-same-phi-extract.ll b/llvm/test/Transforms/SLPVectorizer/X86/same-scalar-in-same-phi-extract.ll
index 35f2f9e052e749..f1be11d0d0fc51 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/same-scalar-in-same-phi-extract.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/same-scalar-in-same-phi-extract.ll
@@ -23,10 +23,11 @@ define void @test(i32 %arg) {
 ; CHECK-NEXT:    ]
 ; CHECK:       bb4:
 ; CHECK-NEXT:    [[TMP3:%.*]] = phi <2 x i32> [ [[TMP0]], [[BB2]] ]
-; CHECK-NEXT:    [[TMP4:%.*]] = zext <2 x i32> [[TMP3]] to <2 x i64>
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[TMP3]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = zext i32 [[TMP4]] to i64
 ; CHECK-NEXT:    [[GETELEMENTPTR:%.*]] = getelementptr i32, ptr null, i64 [[TMP5]]
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i32> [[TMP3]], i32 1
+; CHECK-NEXT:    [[TMP6:%.*]] = zext i32 [[TMP7]] to i64
 ; CHECK-NEXT:    [[GETELEMENTPTR6:%.*]] = getelementptr i32, ptr null, i64 [[TMP6]]
 ; CHECK-NEXT:    ret void
 ; CHECK:       bb7:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/store-insertelement-minbitwidth.ll b/llvm/test/Transforms/SLPVectorizer/X86/store-insertelement-minbitwidth.ll
index c1dd90d0e9a7bb..2f6868d8dfd628 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/store-insertelement-minbitwidth.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/store-insertelement-minbitwidth.ll
@@ -8,17 +8,18 @@
 ; YAML-NEXT:  Function:        stores
 ; YAML-NEXT:  Args:
 ; YAML-NEXT:    - String:          'Stores SLP vectorized with cost '
-; YAML-NEXT:    - Cost:            '-3'
+; YAML-NEXT:    - Cost:            '-7'
 ; YAML-NEXT:    - String:          ' and with tree size '
 ; YAML-NEXT:    - TreeSize:        '6'
 define void @stores(ptr noalias %in, ptr noalias %inn, ptr noalias %out) {
 ; CHECK-LABEL: @stores(
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i8>, ptr [[IN:%.*]], align 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i8>, ptr [[INN:%.*]], align 1
-; CHECK-NEXT:    [[TMP3:%.*]] = zext <4 x i8> [[TMP1]] to <4 x i64>
-; CHECK-NEXT:    [[TMP4:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i64>
-; CHECK-NEXT:    [[TMP5:%.*]] = add <4 x i64> [[TMP3]], [[TMP4]]
-; CHECK-NEXT:    store <4 x i64> [[TMP5]], ptr [[OUT:%.*]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = zext <4 x i8> [[TMP1]] to <4 x i16>
+; CHECK-NEXT:    [[TMP4:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i16>
+; CHECK-NEXT:    [[TMP5:%.*]] = add <4 x i16> [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = zext <4 x i16> [[TMP5]] to <4 x i64>
+; CHECK-NEXT:    store <4 x i64> [[TMP6]], ptr [[OUT:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ;
   %load.1 = load i8, ptr %in, align 1
@@ -63,17 +64,18 @@ define void @stores(ptr noalias %in, ptr noalias %inn, ptr noalias %out) {
 ; YAML-NEXT:  Function:        insertelems
 ; YAML-NEXT:  Args:
 ; YAML-NEXT:    - String:          'SLP vectorized with cost '
-; YAML-NEXT:    - Cost:            '-5'
+; YAML-NEXT:    - Cost:            '-9'
 ; YAML-NEXT:    - String:          ' and with tree size '
 ; YAML-NEXT:    - TreeSize:        '6'
 define <4 x i64> @insertelems(ptr noalias %in, ptr noalias %inn) {
 ; CHECK-LABEL: @insertelems(
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i8>, ptr [[IN:%.*]], align 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i8>, ptr [[INN:%.*]], align 1
-; CHECK-NEXT:    [[TMP3:%.*]] = zext <4 x i8> [[TMP1]] to <4 x i64>
-; CHECK-NEXT:    [[TMP4:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i64>
-; CHECK-NEXT:    [[TMP5:%.*]] = add <4 x i64> [[TMP3]], [[TMP4]]
-; CHECK-NEXT:    ret <4 x i64> [[TMP5]]
+; CHECK-NEXT:    [[TMP3:%.*]] = zext <4 x i8> [[TMP1]] to <4 x i16>
+; CHECK-NEXT:    [[TMP4:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i16>
+; CHECK-NEXT:    [[TMP5:%.*]] = add <4 x i16> [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = zext <4 x i16> [[TMP5]] to <4 x i64>
+; CHECK-NEXT:    ret <4 x i64> [[TMP6]]
 ;
   %load.1 = load i8, ptr %in, align 1
   %gep.1 = getelementptr inbounds i8, ptr %in, i64 1
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll b/llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll
index 22cd408cd6dc7f..e8395fe69ea6bf 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll
@@ -179,11 +179,11 @@ define void @addsub0(ptr noalias %dst, ptr noalias %src) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[INCDEC_PTR]], align 4
 ; CHECK-NEXT:    [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 2
 ; CHECK-NEXT:    store i32 [[TMP1]], ptr [[INCDEC_PTR1]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i32>, ptr [[INCDEC_PTR2]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = add nsw <2 x i32> [[TMP3]], <i32 -2, i32 -3>
-; CHECK-NEXT:    [[TMP5:%.*]] = sub nsw <2 x i32> [[TMP3]], <i32 -2, i32 -3>
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    store <2 x i32> [[TMP6]], ptr [[INCDEC_PTR3]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr [[INCDEC_PTR2]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = add nsw <2 x i32> [[TMP2]], <i32 -2, i32 -3>
+; CHECK-NEXT:    [[TMP4:%.*]] = sub nsw <2 x i32> [[TMP2]], <i32 -2, i32 -3>
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    store <2 x i32> [[TMP5]], ptr [[INCDEC_PTR3]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -212,11 +212,11 @@ define void @addsub1(ptr noalias %dst, ptr noalias %src) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 2
 ; CHECK-NEXT:    [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 2
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr [[SRC]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = add nsw <2 x i32> [[TMP1]], <i32 -1, i32 -1>
-; CHECK-NEXT:    [[TMP3:%.*]] = sub nsw <2 x i32> [[TMP1]], <i32 -1, i32 -1>
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    store <2 x i32> [[TMP4]], ptr [[DST]], align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x i32>, ptr [[SRC]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = add nsw <2 x i32> [[TMP0]], <i32 -1, i32 -1>
+; CHECK-NEXT:    [[TMP2:%.*]] = sub nsw <2 x i32> [[TMP0]], <i32 -1, i32 -1>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    store <2 x i32> [[TMP3]], ptr [[DST]], align 4
 ; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 3
 ; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr [[INCDEC_PTR2]], align 4
 ; CHECK-NEXT:    [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 3
@@ -531,11 +531,11 @@ define void @addsub0f(ptr noalias %dst, ptr noalias %src) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[INCDEC_PTR]], align 4
 ; CHECK-NEXT:    [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 2
 ; CHECK-NEXT:    store float [[TMP1]], ptr [[INCDEC_PTR1]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x float>, ptr [[INCDEC_PTR2]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = fadd fast <2 x float> [[TMP3]], <float -2.000000e+00, float -3.000000e+00>
-; CHECK-NEXT:    [[TMP5:%.*]] = fsub fast <2 x float> [[TMP3]], <float -2.000000e+00, float -3.000000e+00>
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    store <2 x float> [[TMP6]], ptr [[INCDEC_PTR3]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x float>, ptr [[INCDEC_PTR2]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd fast <2 x float> [[TMP2]], <float -2.000000e+00, float -3.000000e+00>
+; CHECK-NEXT:    [[TMP4:%.*]] = fsub fast <2 x float> [[TMP2]], <float -2.000000e+00, float -3.000000e+00>
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP4]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    store <2 x float> [[TMP5]], ptr [[INCDEC_PTR3]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -564,11 +564,11 @@ define void @addsub1f(ptr noalias %dst, ptr noalias %src) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 2
 ; CHECK-NEXT:    [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 2
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr [[SRC]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = fadd fast <2 x float> [[TMP1]], <float -1.000000e+00, float -1.000000e+00>
-; CHECK-NEXT:    [[TMP3:%.*]] = fsub fast <2 x float> [[TMP1]], <float -1.000000e+00, float -1.000000e+00>
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> [[TMP3]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    store <2 x float> [[TMP4]], ptr [[DST]], align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x float>, ptr [[SRC]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd fast <2 x float> [[TMP0]], <float -1.000000e+00, float -1.000000e+00>
+; CHECK-NEXT:    [[TMP2:%.*]] = fsub fast <2 x float> [[TMP0]], <float -1.000000e+00, float -1.000000e+00>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP2]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    store <2 x float> [[TMP3]], ptr [[DST]], align 4
 ; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 3
 ; CHECK-NEXT:    [[TMP6:%.*]] = load float, ptr [[INCDEC_PTR2]], align 4
 ; CHECK-NEXT:    [[INCDEC_PTR6:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 3
diff --git a/llvm/test/Transforms/SLPVectorizer/alt-cmp-vectorize.ll b/llvm/test/Transforms/SLPVectorizer/alt-cmp-vectorize.ll
index 061fbdb45a13bc..ff6f0bdd3db8f2 100644
--- a/llvm/test/Transforms/SLPVectorizer/alt-cmp-vectorize.ll
+++ b/llvm/test/Transforms/SLPVectorizer/alt-cmp-vectorize.ll
@@ -10,8 +10,8 @@ define i32 @alt_cmp(i16 %call46) {
 ; CHECK-NEXT:    [[TMP2:%.*]] = icmp ult <4 x i16> [[TMP0]], [[TMP1]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = icmp ugt <4 x i16> [[TMP0]], [[TMP1]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i1> [[TMP2]], <4 x i1> [[TMP3]], <4 x i32> <i32 0, i32 5, i32 2, i32 3>
-; CHECK-NEXT:    [[TMP5:%.*]] = zext <4 x i1> [[TMP4]] to <4 x i16>
-; CHECK-NEXT:    [[TMP6:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP5]])
+; CHECK-NEXT:    [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP4]])
+; CHECK-NEXT:    [[TMP6:%.*]] = zext i1 [[TMP5]] to i16
 ; CHECK-NEXT:    [[OP_RDX:%.*]] = or i16 [[TMP6]], 0
 ; CHECK-NEXT:    [[EXT:%.*]] = zext i16 [[OP_RDX]] to i32
 ; CHECK-NEXT:    ret i32 [[EXT]]
diff --git a/llvm/test/Transforms/SampleProfile/Inputs/csspgo-import-list-preinliner.prof b/llvm/test/Transforms/SampleProfile/Inputs/csspgo-import-list-preinliner.prof
new file mode 100644
index 00000000000000..7388d574be5e8b
--- /dev/null
+++ b/llvm/test/Transforms/SampleProfile/Inputs/csspgo-import-list-preinliner.prof
@@ -0,0 +1,14 @@
+main:8001:0
+ 1: 0
+ 2: 2000
+ 3: 2000
+ 4: 0
+ 5: 2000
+ 6: 2000
+ 7: 0
+ 8: 0
+ 9: bar:1
+  1: 1
+  !CFGChecksum: 4294967295
+  !Attributes: 2
+ !CFGChecksum: 563088156202820
diff --git a/llvm/test/Transforms/SampleProfile/csspgo-import-list-preinliner.ll b/llvm/test/Transforms/SampleProfile/csspgo-import-list-preinliner.ll
new file mode 100644
index 00000000000000..9e342f2b99da33
--- /dev/null
+++ b/llvm/test/Transforms/SampleProfile/csspgo-import-list-preinliner.ll
@@ -0,0 +1,50 @@
+; RUN: opt < %s -passes='thinlto-pre-link<O2>' -pgo-kind=pgo-sample-use-pipeline -sample-profile-file=%S/Inputs/csspgo-import-list-preinliner.prof -S -profile-summary-cutoff-hot=100000 -sample-profile-use-preinliner=0 | FileCheck %s --check-prefix=DISABLE-PREINLINE
+; RUN: opt < %s -passes='thinlto-pre-link<O2>' -pgo-kind=pgo-sample-use-pipeline -sample-profile-file=%S/Inputs/csspgo-import-list-preinliner.prof -S -profile-summary-cutoff-hot=100000 | FileCheck %s
+
+; The GUID of bar is -2012135647395072713
+
+; DISABLE-PREINLINE-NOT: -2012135647395072713
+; CHECK: [[#]] = !{!"function_entry_count", i64 1, i64 -2012135647395072713}
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define i32 @main() #0 {
+entry:
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.cond, %entry
+  call void @llvm.pseudoprobe(i64 0, i64 0, i32 0, i64 0)
+  %call2 = call i32 @bar(), !dbg !9
+  br label %for.cond
+}
+
+declare i32 @bar()
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: readwrite)
+declare void @llvm.pseudoprobe(i64, i64, i32, i64) #1
+
+attributes #0 = { "use-sample-profile" }
+attributes #1 = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: readwrite) }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!7}
+!llvm.pseudo_probe_desc = !{!8}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C11, file: !1, producer: "clang version 19.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, globals: !2, splitDebugInlining: false, nameTableKind: None)
+!1 = !DIFile(filename: "test.c", directory: "/home/", checksumkind: CSK_MD5, checksum: "1bff37d8b3f7858b0bc29ab4efdf9422")
+!2 = !{!3}
+!3 = !DIGlobalVariableExpression(var: !4, expr: !DIExpression())
+!4 = distinct !DIGlobalVariable(name: "x", scope: !0, file: !1, line: 2, type: !5, isLocal: false, isDefinition: true)
+!5 = !DIDerivedType(tag: DW_TAG_volatile_type, baseType: !6)
+!6 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!7 = !{i32 2, !"Debug Info Version", i32 3}
+!8 = !{i64 -2624081020897602054, i64 563108639284859, !"main"}
+!9 = !DILocation(line: 11, column: 10, scope: !10)
+!10 = !DILexicalBlockFile(scope: !11, file: !1, discriminator: 186646615)
+!11 = distinct !DILexicalBlock(scope: !12, file: !1, line: 8, column: 40)
+!12 = distinct !DILexicalBlock(scope: !13, file: !1, line: 8, column: 3)
+!13 = distinct !DILexicalBlock(scope: !14, file: !1, line: 8, column: 3)
+!14 = distinct !DISubprogram(name: "main", scope: !1, file: !1, line: 6, type: !15, scopeLine: 7, flags: DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !16)
+!15 = distinct !DISubroutineType(types: !16)
+!16 = !{}
diff --git a/llvm/test/Transforms/ThinLTOBitcodeWriter/pr33536.ll b/llvm/test/Transforms/ThinLTOBitcodeWriter/pr33536.ll
index 8e78921c556d58..a51c1d24956a11 100644
--- a/llvm/test/Transforms/ThinLTOBitcodeWriter/pr33536.ll
+++ b/llvm/test/Transforms/ThinLTOBitcodeWriter/pr33536.ll
@@ -12,7 +12,7 @@ target triple = "x86_64-unknown-linux-gnu"
 
 ; M0: @global = local_unnamed_addr global
 ; M1-NOT: @global
-@global = local_unnamed_addr global %struct.hoge { %struct.widget { ptr getelementptr inbounds ({ [3 x ptr] }, ptr @global.1, i32 0, inrange i32 0, i32 2) } }, align 8
+@global = local_unnamed_addr global %struct.hoge { %struct.widget { ptr getelementptr inbounds inrange(-16, 8) ({ [3 x ptr] }, ptr @global.1, i32 0, i32 0, i32 2) } }, align 8
 
 ; M0: @global.1 = external unnamed_addr constant
 ; M1: @global.1 = linkonce_odr unnamed_addr constant
diff --git a/llvm/test/Verifier/intrinsic-cmp.ll b/llvm/test/Verifier/intrinsic-cmp.ll
new file mode 100644
index 00000000000000..2224a5c5eba385
--- /dev/null
+++ b/llvm/test/Verifier/intrinsic-cmp.ll
@@ -0,0 +1,22 @@
+; RUN: not opt -S -passes=verify 2>&1 < %s | FileCheck %s
+
+define void @matching_vector_lens(<4 x i32> %arg1, <4 x i32> %arg2) {
+  ; CHECK: return type and arguments must have the same number of elements
+  %res = call <8 x i32> @llvm.scmp.v8i32.v4i32(<4 x i32> %arg1, <4 x i32> %arg2)
+  ret void
+}
+
+define void @result_len_is_at_least_2bits_wide(i32 %arg1, i32 %arg2) {
+  ; CHECK: result type must be at least 2 bits wide
+  %res2 = call i1 @llvm.scmp.i1.i32(i32 %arg1, i32 %arg2)
+  ret void
+}
+
+define void @both_args_are_vecs_or_neither(<4 x i32> %arg1, i32 %arg2) {
+  ; CHECK: ucmp/scmp argument and result types must both be either vector or scalar types
+  %res3 = call i2 @llvm.scmp.i2.v4i32(<4 x i32> %arg1, <4 x i32> %arg1)
+  ; CHECK: ucmp/scmp argument and result types must both be either vector or scalar types
+  %res4 = call <4 x i32> @llvm.scmp.v4i32.i32(i32 %arg2, i32 %arg2)
+  ret void
+}
+
diff --git a/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/generated_funcs.ll.generated.expected b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/generated_funcs.ll.generated.expected
index 02d8870c61365c..775649cee0e764 100644
--- a/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/generated_funcs.ll.generated.expected
+++ b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/generated_funcs.ll.generated.expected
@@ -41,7 +41,7 @@ declare void @_Z10sideeffectv()
 ; CHECK-NEXT:    br i1 [[TMP2]], label [[CODEREPL:%.*]], label [[EXIT:%.*]]
 ; CHECK:       codeRepl:
 ; CHECK-NEXT:    call void @foo.cold.1() #[[ATTR2:[0-9]+]]
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    unreachable
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;
@@ -52,7 +52,7 @@ declare void @_Z10sideeffectv()
 ; CHECK-NEXT:    br i1 [[TMP2]], label [[CODEREPL:%.*]], label [[EXIT:%.*]]
 ; CHECK:       codeRepl:
 ; CHECK-NEXT:    call void @bar.cold.1() #[[ATTR2]]
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    unreachable
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;
diff --git a/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/generated_funcs.ll.generated.globals.expected b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/generated_funcs.ll.generated.globals.expected
index 05e57774c260e8..a8086ae2157096 100644
--- a/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/generated_funcs.ll.generated.globals.expected
+++ b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/generated_funcs.ll.generated.globals.expected
@@ -44,7 +44,7 @@ declare void @_Z10sideeffectv()
 ; CHECK-NEXT:    br i1 [[TMP2]], label [[CODEREPL:%.*]], label [[EXIT:%.*]]
 ; CHECK:       codeRepl:
 ; CHECK-NEXT:    call void @foo.cold.1() #[[ATTR2:[0-9]+]]
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    unreachable
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;
@@ -55,7 +55,7 @@ declare void @_Z10sideeffectv()
 ; CHECK-NEXT:    br i1 [[TMP2]], label [[CODEREPL:%.*]], label [[EXIT:%.*]]
 ; CHECK:       codeRepl:
 ; CHECK-NEXT:    call void @bar.cold.1() #[[ATTR2]]
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    unreachable
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;
diff --git a/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/generated_funcs.ll.nogenerated.expected b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/generated_funcs.ll.nogenerated.expected
index 36bcbe32e03634..57de350dec15e8 100644
--- a/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/generated_funcs.ll.nogenerated.expected
+++ b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/generated_funcs.ll.nogenerated.expected
@@ -12,7 +12,7 @@ define void @foo(i32) {
 ; CHECK-NEXT:    br i1 [[TMP2]], label [[CODEREPL:%.*]], label [[EXIT:%.*]]
 ; CHECK:       codeRepl:
 ; CHECK-NEXT:    call void @foo.cold.1() #[[ATTR2:[0-9]+]]
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    unreachable
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;
@@ -36,7 +36,7 @@ define void @bar(i32) {
 ; CHECK-NEXT:    br i1 [[TMP2]], label [[CODEREPL:%.*]], label [[EXIT:%.*]]
 ; CHECK:       codeRepl:
 ; CHECK-NEXT:    call void @bar.cold.1() #[[ATTR2]]
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    unreachable
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;
diff --git a/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/generated_funcs.ll.nogenerated.globals.expected b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/generated_funcs.ll.nogenerated.globals.expected
index db7c692a7def98..696d5c6edee6f3 100644
--- a/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/generated_funcs.ll.nogenerated.globals.expected
+++ b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/generated_funcs.ll.nogenerated.globals.expected
@@ -15,7 +15,7 @@ define void @foo(i32) {
 ; CHECK-NEXT:    br i1 [[TMP2]], label [[CODEREPL:%.*]], label [[EXIT:%.*]]
 ; CHECK:       codeRepl:
 ; CHECK-NEXT:    call void @foo.cold.1() #[[ATTR2:[0-9]+]]
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    unreachable
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;
@@ -39,7 +39,7 @@ define void @bar(i32) {
 ; CHECK-NEXT:    br i1 [[TMP2]], label [[CODEREPL:%.*]], label [[EXIT:%.*]]
 ; CHECK:       codeRepl:
 ; CHECK-NEXT:    call void @bar.cold.1() #[[ATTR2]]
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    unreachable
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;
diff --git a/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/generated_funcs_prefix_reuse.ll.generated.expected b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/generated_funcs_prefix_reuse.ll.generated.expected
index 10399951e7fbcd..5275870b4088d7 100644
--- a/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/generated_funcs_prefix_reuse.ll.generated.expected
+++ b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/generated_funcs_prefix_reuse.ll.generated.expected
@@ -42,7 +42,7 @@ declare void @_Z10sideeffectv()
 ; REUSE-NEXT:    br i1 [[TMP2]], label [[CODEREPL:%.*]], label [[EXIT:%.*]]
 ; REUSE:       codeRepl:
 ; REUSE-NEXT:    call void @foo.cold.1() #[[ATTR2:[0-9]+]]
-; REUSE-NEXT:    ret void
+; REUSE-NEXT:    unreachable
 ; REUSE:       exit:
 ; REUSE-NEXT:    ret void
 ;
@@ -53,7 +53,7 @@ declare void @_Z10sideeffectv()
 ; REUSE-NEXT:    br i1 [[TMP2]], label [[CODEREPL:%.*]], label [[EXIT:%.*]]
 ; REUSE:       codeRepl:
 ; REUSE-NEXT:    call void @bar.cold.1() #[[ATTR2]]
-; REUSE-NEXT:    ret void
+; REUSE-NEXT:    unreachable
 ; REUSE:       exit:
 ; REUSE-NEXT:    ret void
 ;
diff --git a/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/generated_funcs_prefix_reuse.ll.generated.globals.expected b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/generated_funcs_prefix_reuse.ll.generated.globals.expected
index 0001790e66ab37..712ccb25ae8577 100644
--- a/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/generated_funcs_prefix_reuse.ll.generated.globals.expected
+++ b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/generated_funcs_prefix_reuse.ll.generated.globals.expected
@@ -45,7 +45,7 @@ declare void @_Z10sideeffectv()
 ; REUSE-NEXT:    br i1 [[TMP2]], label [[CODEREPL:%.*]], label [[EXIT:%.*]]
 ; REUSE:       codeRepl:
 ; REUSE-NEXT:    call void @foo.cold.1() #[[ATTR2:[0-9]+]]
-; REUSE-NEXT:    ret void
+; REUSE-NEXT:    unreachable
 ; REUSE:       exit:
 ; REUSE-NEXT:    ret void
 ;
@@ -56,7 +56,7 @@ declare void @_Z10sideeffectv()
 ; REUSE-NEXT:    br i1 [[TMP2]], label [[CODEREPL:%.*]], label [[EXIT:%.*]]
 ; REUSE:       codeRepl:
 ; REUSE-NEXT:    call void @bar.cold.1() #[[ATTR2]]
-; REUSE-NEXT:    ret void
+; REUSE-NEXT:    unreachable
 ; REUSE:       exit:
 ; REUSE-NEXT:    ret void
 ;
diff --git a/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/generated_funcs_prefix_reuse.ll.nogenerated.expected b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/generated_funcs_prefix_reuse.ll.nogenerated.expected
index e05a57d06413b9..b5b12b770522bf 100644
--- a/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/generated_funcs_prefix_reuse.ll.nogenerated.expected
+++ b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/generated_funcs_prefix_reuse.ll.nogenerated.expected
@@ -13,7 +13,7 @@ define void @foo(i32) {
 ; REUSE-NEXT:    br i1 [[TMP2]], label [[CODEREPL:%.*]], label [[EXIT:%.*]]
 ; REUSE:       codeRepl:
 ; REUSE-NEXT:    call void @foo.cold.1() #[[ATTR2:[0-9]+]]
-; REUSE-NEXT:    ret void
+; REUSE-NEXT:    unreachable
 ; REUSE:       exit:
 ; REUSE-NEXT:    ret void
 ;
@@ -37,7 +37,7 @@ define void @bar(i32) {
 ; REUSE-NEXT:    br i1 [[TMP2]], label [[CODEREPL:%.*]], label [[EXIT:%.*]]
 ; REUSE:       codeRepl:
 ; REUSE-NEXT:    call void @bar.cold.1() #[[ATTR2]]
-; REUSE-NEXT:    ret void
+; REUSE-NEXT:    unreachable
 ; REUSE:       exit:
 ; REUSE-NEXT:    ret void
 ;
diff --git a/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/generated_funcs_prefix_reuse.ll.nogenerated.globals.expected b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/generated_funcs_prefix_reuse.ll.nogenerated.globals.expected
index 17be222f15c125..7e2b991cd6fccb 100644
--- a/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/generated_funcs_prefix_reuse.ll.nogenerated.globals.expected
+++ b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/generated_funcs_prefix_reuse.ll.nogenerated.globals.expected
@@ -16,7 +16,7 @@ define void @foo(i32) {
 ; REUSE-NEXT:    br i1 [[TMP2]], label [[CODEREPL:%.*]], label [[EXIT:%.*]]
 ; REUSE:       codeRepl:
 ; REUSE-NEXT:    call void @foo.cold.1() #[[ATTR2:[0-9]+]]
-; REUSE-NEXT:    ret void
+; REUSE-NEXT:    unreachable
 ; REUSE:       exit:
 ; REUSE-NEXT:    ret void
 ;
@@ -40,7 +40,7 @@ define void @bar(i32) {
 ; REUSE-NEXT:    br i1 [[TMP2]], label [[CODEREPL:%.*]], label [[EXIT:%.*]]
 ; REUSE:       codeRepl:
 ; REUSE-NEXT:    call void @bar.cold.1() #[[ATTR2]]
-; REUSE-NEXT:    ret void
+; REUSE-NEXT:    unreachable
 ; REUSE:       exit:
 ; REUSE-NEXT:    ret void
 ;
diff --git a/llvm/test/tools/dsymutil/ARM/obfuscated.test b/llvm/test/tools/dsymutil/ARM/obfuscated.test
deleted file mode 100644
index 3443b8e634692b..00000000000000
--- a/llvm/test/tools/dsymutil/ARM/obfuscated.test
+++ /dev/null
@@ -1,200 +0,0 @@
-REQUIRES: system-darwin
-
-RUN: dsymutil --symbol-map %p/../Inputs/obfuscated.map %p/../Inputs/obfuscated.arm64 -f -o - \
-RUN:     | llvm-dwarfdump -v - \
-RUN:     | FileCheck %s
-
-RUN: dsymutil --accelerator=Pub --symbol-map %p/../Inputs/obfuscated.map %p/../Inputs/obfuscated.arm64 -f -o - \
-RUN:     | llvm-dwarfdump -v - \
-RUN:     | FileCheck --check-prefix=PUB %s
-
-RUN: dsymutil --symbol-map %p/../Inputs/obfuscated.map %p/../Inputs/obfuscated.arm64 -f -o - \
-RUN:     | llvm-dwarfdump -v - \
-RUN:     | FileCheck --check-prefix=NOHIDDEN %s
-
-RUN: dsymutil --symbol-map %p/../Inputs/obfuscated.2.map %p/../Inputs/obfuscated.2.arm64 -f -o - \
-RUN:     | llvm-dwarfdump -v - \
-RUN:     | FileCheck --check-prefix=NOHIDDEN %s
-
-// Run with plist and make sure dsymutil finds it.
-RUN: mkdir -p %t.dSYM/Contents/Resources/DWARF/
-RUN: mkdir -p %t.mapdir
-RUN: cp %p/../Inputs/obfuscated.arm64 %t.dSYM/Contents/Resources/DWARF/
-RUN: cp %p/../Inputs/E828A486-8433-3A5E-B6DB-A6294D28133D.plist %t.dSYM/Contents/Resources/
-RUN: cp %p/../Inputs/obfuscated.map %t.mapdir/506AA50A-6B26-3B37-86D2-DC6EBD57B720.bcsymbolmap
-RUN: dsymutil --symbol-map %t.mapdir %t.dSYM 2>&1 | FileCheck --check-prefix=OBFUSCATING %s
-
-// Run without plist and make sure dsymutil doesn't crash.
-RUN: rm %t.dSYM/Contents/Resources/E828A486-8433-3A5E-B6DB-A6294D28133D.plist
-RUN: dsymutil --symbol-map %t.mapdir %t.dSYM 2>&1 | FileCheck --check-prefix=NOTOBFUSCATING %s
-
-// ----------------------------------------
-// Repeat the same steps for --linker parallel.
-RUN: dsymutil --linker parallel --symbol-map %p/../Inputs/obfuscated.map %p/../Inputs/obfuscated.arm64 -f -o - \
-RUN:     | llvm-dwarfdump -v - \
-RUN:     | FileCheck %s
-
-RUN: dsymutil --linker parallel --accelerator=Pub --symbol-map %p/../Inputs/obfuscated.map %p/../Inputs/obfuscated.arm64 -f -o - \
-RUN:     | llvm-dwarfdump -v - \
-RUN:     | FileCheck --check-prefix=PUB %s
-
-RUN: dsymutil --linker parallel --symbol-map %p/../Inputs/obfuscated.map %p/../Inputs/obfuscated.arm64 -f -o - \
-RUN:     | llvm-dwarfdump -v - \
-RUN:     | FileCheck --check-prefix=NOHIDDEN %s
-
-RUN: dsymutil --linker parallel --symbol-map %p/../Inputs/obfuscated.2.map %p/../Inputs/obfuscated.2.arm64 -f -o - \
-RUN:     | llvm-dwarfdump -v - \
-RUN:     | FileCheck --check-prefix=NOHIDDEN %s
-
-// Run with plist and make sure dsymutil finds it.
-RUN: mkdir -p %t.dSYM/Contents/Resources/DWARF/
-RUN: mkdir -p %t.mapdir
-RUN: cp %p/../Inputs/obfuscated.arm64 %t.dSYM/Contents/Resources/DWARF/
-RUN: cp %p/../Inputs/E828A486-8433-3A5E-B6DB-A6294D28133D.plist %t.dSYM/Contents/Resources/
-RUN: cp %p/../Inputs/obfuscated.map %t.mapdir/506AA50A-6B26-3B37-86D2-DC6EBD57B720.bcsymbolmap
-RUN: dsymutil --linker parallel --symbol-map %t.mapdir %t.dSYM 2>&1 | FileCheck --check-prefix=OBFUSCATING %s
-
-// Run without plist and make sure dsymutil doesn't crash.
-RUN: rm %t.dSYM/Contents/Resources/E828A486-8433-3A5E-B6DB-A6294D28133D.plist
-RUN: dsymutil --linker parallel --symbol-map %t.mapdir %t.dSYM 2>&1 | FileCheck --check-prefix=NOTOBFUSCATING %s
-
-OBFUSCATING-NOT: not unobfuscating
-
-NOTOBFUSCATING: not unobfuscating
-
-NOHIDDEN-NOT: __hidden#
-
-CHECK: .debug_info contents:
-
-CHECK: DW_TAG_compile_unit [1] *
-CHECK:    DW_AT_producer [DW_FORM_strp]    ( {{.*}} "Apple LLVM version 7.0.0 (clang-700.2.38.2)")
-CHECK:    DW_AT_name [DW_FORM_strp]        ( {{.*}} "main.c")
-CHECK:    DW_AT_comp_dir [DW_FORM_strp]    ( {{.*}} "/Users/steven/dev/alpena/tests/src")
-CHECK:    DW_TAG_subprogram [2]
-CHECK:      DW_AT_name [DW_FORM_strp]      ( {{.*}} "main")
-
-CHECK:  DW_TAG_compile_unit [1] *
-CHECK:    DW_AT_producer [DW_FORM_strp]    ( {{.*}} "Apple LLVM version 7.0.0 (clang-700.2.38.2)")
-CHECK:    DW_AT_name [DW_FORM_strp]        ( {{.*}} "one.c")
-CHECK:    DW_AT_comp_dir [DW_FORM_strp]    ( {{.*}} "/Users/steven/dev/alpena/tests/src")
-CHECK:    DW_TAG_subprogram [2]
-CHECK:      DW_AT_name [DW_FORM_strp]      ( {{.*}} "one")
-
-CHECK:  DW_TAG_compile_unit [1] *
-CHECK:    DW_AT_producer [DW_FORM_strp]    ( {{.*}} "Apple LLVM version 7.0.0 (clang-700.2.38.2)")
-CHECK:    DW_AT_name [DW_FORM_strp]        ( {{.*}} "two.c")
-CHECK:    DW_AT_comp_dir [DW_FORM_strp]    ( {{.*}} "/Users/steven/dev/alpena/tests/src")
-CHECK:    DW_TAG_subprogram [2]
-CHECK:      DW_AT_name [DW_FORM_strp]      ( {{.*}} "two")
-
-CHECK:  DW_TAG_compile_unit [1] *
-CHECK:    DW_AT_producer [DW_FORM_strp]    ( {{.*}} "Apple LLVM version 7.0.0 (clang-700.2.38.2)")
-CHECK:    DW_AT_name [DW_FORM_strp]        ( {{.*}} "three.c")
-CHECK:    DW_AT_comp_dir [DW_FORM_strp]    ( {{.*}} "/Users/steven/dev/alpena/tests/src")
-CHECK:    DW_TAG_subprogram [2]
-CHECK:      DW_AT_name [DW_FORM_strp]      ( {{.*}} "three")
-
-CHECK:  DW_TAG_compile_unit [1] *
-CHECK:    DW_AT_producer [DW_FORM_strp]    ( {{.*}} "Apple LLVM version 7.0.0 (clang-700.2.38.2)")
-CHECK:    DW_AT_name [DW_FORM_strp]        ( {{.*}} "four.c")
-CHECK:    DW_AT_stmt_list [DW_FORM_data4]  (0x0000011e)
-CHECK:    DW_AT_comp_dir [DW_FORM_strp]    ( {{.*}} "/Users/steven/dev/alpena/tests/src")
-CHECK:    DW_TAG_subprogram [2]
-CHECK:      DW_AT_name [DW_FORM_strp]      ( {{.*}} "four")
-
-CHECK:  DW_TAG_compile_unit [1] *
-CHECK:    DW_AT_producer [DW_FORM_strp]    ( {{.*}} "Apple LLVM version 7.0.0 (clang-700.2.38.2)")
-CHECK:    DW_AT_name [DW_FORM_strp]        ( {{.*}} "five.c")
-CHECK:    DW_AT_comp_dir [DW_FORM_strp]    ( {{.*}} "/Users/steven/dev/alpena/tests/src")
-CHECK:    DW_TAG_subprogram [2]
-CHECK:      DW_AT_name [DW_FORM_strp]      ( {{.*}} "five")
-
-CHECK:  DW_TAG_compile_unit [1] *
-CHECK:    DW_AT_producer [DW_FORM_strp]    ( {{.*}} "Apple LLVM version 7.0.0 (clang-700.2.38.2)")
-CHECK:    DW_AT_name [DW_FORM_strp]        ( {{.*}} "six.c")
-CHECK:    DW_AT_comp_dir [DW_FORM_strp]    ( {{.*}} "/Users/steven/dev/alpena/tests/src")
-CHECK:    DW_TAG_subprogram [2]
-CHECK:      DW_AT_name [DW_FORM_strp]      ( {{.*}} "six")
-
-CHECK: .debug_line contents:
-CHECK: file_names[  1]:
-CHECK:            name: "main.c"
-CHECK:       dir_index: 0
-CHECK:        mod_time: 0x00000000
-CHECK: file_names[  1]:
-CHECK:            name: "one.c"
-CHECK:       dir_index: 0
-CHECK:        mod_time: 0x00000000
-CHECK:          length: 0x00000000
-CHECK: file_names[  1]:
-CHECK:            name: "two.c"
-CHECK:       dir_index: 0
-CHECK:        mod_time: 0x00000000
-CHECK:          length: 0x00000000
-CHECK: file_names[  1]:
-CHECK:            name: "three.c"
-CHECK:       dir_index: 0
-CHECK:        mod_time: 0x00000000
-CHECK:          length: 0x00000000
-CHECK: file_names[  1]:
-CHECK:            name: "four.c"
-CHECK:       dir_index: 0
-CHECK:        mod_time: 0x00000000
-CHECK:          length: 0x00000000
-CHECK: file_names[  1]:
-CHECK:            name: "five.c"
-CHECK:       dir_index: 0
-CHECK:        mod_time: 0x00000000
-CHECK:          length: 0x00000000
-CHECK: file_names[  1]:
-CHECK:            name: "six.c"
-CHECK:       dir_index: 0
-CHECK:        mod_time: 0x00000000
-CHECK:          length: 0x00000000
-
-PUB: .debug_pubnames contents:
-PUB: length = 0x00000017, format = DWARF32, version = 0x0002, unit_offset = 0x00000000, unit_size = 0x00000044
-PUB: 0x0000002e "main"
-PUB: length = 0x00000016, format = DWARF32, version = 0x0002, unit_offset = 0x00000044, unit_size = 0x00000044
-PUB: 0x0000002e "one"
-PUB: length = 0x00000016, format = DWARF32, version = 0x0002, unit_offset = 0x00000088, unit_size = 0x00000044
-PUB: 0x0000002e "two"
-PUB: length = 0x00000018, format = DWARF32, version = 0x0002, unit_offset = 0x000000cc, unit_size = 0x00000044
-PUB: 0x0000002e "three"
-PUB: length = 0x00000017, format = DWARF32, version = 0x0002, unit_offset = 0x00000110, unit_size = 0x00000044
-PUB: 0x0000002e "four"
-PUB: length = 0x00000017, format = DWARF32, version = 0x0002, unit_offset = 0x00000154, unit_size = 0x00000044
-PUB: 0x0000002e "five"
-PUB: length = 0x00000016, format = DWARF32, version = 0x0002, unit_offset = 0x00000198, unit_size = 0x00000044
-PUB: 0x0000002e "six"
-
-CHECK: .apple_names contents:
-
-CHECK: String: 0x00000091 "five"
-CHECK-NEXT: Data 0 [
-CHECK-NEXT:   Atom[0]: 0x00000182
-CHECK-NEXT: ]
-CHECK: String: 0x0000009c "six"
-CHECK-NEXT: Data 0 [
-CHECK-NEXT:   Atom[0]: 0x000001c6
-CHECK-NEXT: ]
-CHECK: String: 0x00000078 "three"
-CHECK-NEXT: Data 0 [
-CHECK-NEXT:   Atom[0]: 0x000000fa
-CHECK-NEXT: ]
-CHECK: String: 0x0000006c "two"
-CHECK-NEXT: Data 0 [
-CHECK-NEXT:   Atom[0]: 0x000000b6
-CHECK-NEXT: ]
-CHECK: String: 0x00000057 "main"
-CHECK-NEXT: Data 0 [
-CHECK-NEXT:   Atom[0]: 0x0000002e
-CHECK-NEXT: ]
-CHECK: String: 0x00000085 "four"
-CHECK-NEXT: Data 0 [
-CHECK-NEXT:   Atom[0]: 0x0000013e
-CHECK-NEXT: ]
-CHECK: String: 0x00000062 "one"
-CHECK-NEXT: Data 0 [
-CHECK-NEXT:   Atom[0]: 0x00000072
-CHECK-NEXT: ]
diff --git a/llvm/test/tools/dsymutil/Inputs/obfuscated.2.arm64 b/llvm/test/tools/dsymutil/Inputs/obfuscated.2.arm64
deleted file mode 100644
index b40e023eb4916c..00000000000000
Binary files a/llvm/test/tools/dsymutil/Inputs/obfuscated.2.arm64 and /dev/null differ
diff --git a/llvm/test/tools/dsymutil/Inputs/obfuscated.2.map b/llvm/test/tools/dsymutil/Inputs/obfuscated.2.map
deleted file mode 100644
index 6efca5960311d6..00000000000000
--- a/llvm/test/tools/dsymutil/Inputs/obfuscated.2.map
+++ /dev/null
@@ -1,22 +0,0 @@
-BCSymbolMap Version: 2.0
-_two
-_three
-_four
-_five
-_six
-LLVM version 3.9.0 (ssh://git@stash.sd.apple.com/devtools/clang.git c74ae34bd917b77f9c848bd599dfde2813fb509f)
-main
-main.c
-/Volumes/Data/dev/BitcodeBuildTests/unit
-one
-one.c
-two
-two.c
-three
-three.c
-four
-four.c
-five
-five.c
-six
-six.c
diff --git a/llvm/test/tools/dsymutil/Inputs/obfuscated.arm64 b/llvm/test/tools/dsymutil/Inputs/obfuscated.arm64
deleted file mode 100644
index 8395798be17ff1..00000000000000
Binary files a/llvm/test/tools/dsymutil/Inputs/obfuscated.arm64 and /dev/null differ
diff --git a/llvm/test/tools/dsymutil/Inputs/obfuscated.map b/llvm/test/tools/dsymutil/Inputs/obfuscated.map
deleted file mode 100644
index 30fed8bf9b5a57..00000000000000
--- a/llvm/test/tools/dsymutil/Inputs/obfuscated.map
+++ /dev/null
@@ -1,17 +0,0 @@
-one
-two
-three
-four
-five
-six
-.str
-Apple LLVM version 7.0.0 (clang-700.2.38.2)
-main
-main.c
-/Users/steven/dev/alpena/tests/src
-one.c
-two.c
-three.c
-four.c
-five.c
-six.c
diff --git a/llvm/test/tools/dsymutil/cmdline.test b/llvm/test/tools/dsymutil/cmdline.test
index 36cf3f542695ca..814252b6e23067 100644
--- a/llvm/test/tools/dsymutil/cmdline.test
+++ b/llvm/test/tools/dsymutil/cmdline.test
@@ -28,7 +28,6 @@ CHECK: -remarks-output-format <format>
 CHECK: -remarks-prepend-path <path>
 CHECK: -reproducer <mode>
 CHECK: -statistics
-CHECK: -symbol-map
 CHECK: -symtab
 CHECK: {{-S}}
 CHECK: -toolchain
diff --git a/llvm/test/tools/llvm-debuginfo-analyzer/DWARF/dw-at-specification.test b/llvm/test/tools/llvm-debuginfo-analyzer/DWARF/dw-at-specification.test
index 9c9118ea49f8ab..b1b35e4f4fe3ed 100644
--- a/llvm/test/tools/llvm-debuginfo-analyzer/DWARF/dw-at-specification.test
+++ b/llvm/test/tools/llvm-debuginfo-analyzer/DWARF/dw-at-specification.test
@@ -10,7 +10,7 @@
 
 ; The above test compiled with clang++ produces both a DW_AT_type and
 ; DW_AT_specification on the definition die for S::Arr, which previously caused
-; an assert in the LVELFReader:
+; an assert in the LVDWARFReader:
 ; $ clang++ -g -c dw-at-specification.cpp -o dw-at-specification.o
 
 ; RUN: llvm-debuginfo-analyzer --attribute=level,format,producer \
diff --git a/llvm/test/tools/llvm-debuginfo-analyzer/DWARF/pr-57040-ignored-DW_FORM_implicit_const.test b/llvm/test/tools/llvm-debuginfo-analyzer/DWARF/pr-57040-ignored-DW_FORM_implicit_const.test
index 7ee9f3118d0aa9..9df058c5598052 100644
--- a/llvm/test/tools/llvm-debuginfo-analyzer/DWARF/pr-57040-ignored-DW_FORM_implicit_const.test
+++ b/llvm/test/tools/llvm-debuginfo-analyzer/DWARF/pr-57040-ignored-DW_FORM_implicit_const.test
@@ -14,7 +14,7 @@
 ;         DW_AT_decl_file DW_FORM_implicit_const  1
 ;         DW_AT_decl_line DW_FORM_data1
 
-; Attributes with DW_FORM_implicit_const being ignored by the ELFReader,
+; Attributes with DW_FORM_implicit_const being ignored by the DWARFReader,
 ; causing {Parameter} and {TypeAlias} to omit line numbers.
 
 ; test.cpp
diff --git a/llvm/test/tools/llvm-mca/ARM/cortex-a57-basic-instructions.s b/llvm/test/tools/llvm-mca/ARM/cortex-a57-basic-instructions.s
index 2257e453e0a812..d686293c9b4309 100644
--- a/llvm/test/tools/llvm-mca/ARM/cortex-a57-basic-instructions.s
+++ b/llvm/test/tools/llvm-mca/ARM/cortex-a57-basic-instructions.s
@@ -1023,7 +1023,7 @@
 # CHECK-NEXT:  2      2     1.00                        blxne	r2
 # CHECK-NEXT:  2      1     1.00                  U     blx	#32424576
 # CHECK-NEXT:  2      1     1.00                  U     blx	#16212288
-# CHECK-NEXT:  1      1     1.00                  U     bx	r2
+# CHECK-NEXT:  1      1     1.00                        bx	r2
 # CHECK-NEXT:  1      1     1.00                  U     bxne	r2
 # CHECK-NEXT:  1      1     1.00                  U     bxj	r2
 # CHECK-NEXT:  1      1     1.00                  U     bxjne	r2
diff --git a/llvm/test/tools/llvm-mca/RISCV/SiFive7/gpr-bypass.s b/llvm/test/tools/llvm-mca/RISCV/SiFive7/gpr-bypass.s
index 892a5d14e8f3a5..03f7de2fe9a4c2 100644
--- a/llvm/test/tools/llvm-mca/RISCV/SiFive7/gpr-bypass.s
+++ b/llvm/test/tools/llvm-mca/RISCV/SiFive7/gpr-bypass.s
@@ -180,10 +180,10 @@ jr a0
 # CHECK-NEXT:  1      3     0.50                        sext.b	a0, a0
 # CHECK-NEXT:  1      3     0.50                        sext.h	a0, a0
 # CHECK-NEXT:  1      3     0.50                        zext.h	a0, a0
-# CHECK-NEXT:  1      3     0.50                        min	a0, a0, a0
-# CHECK-NEXT:  1      3     0.50                        minu	a0, a0, a0
-# CHECK-NEXT:  1      3     0.50                        max	a0, a0, a0
-# CHECK-NEXT:  1      3     0.50                        maxu	a0, a0, a0
+# CHECK-NEXT:  1      3     1.00                        min	a0, a0, a0
+# CHECK-NEXT:  1      3     1.00                        minu	a0, a0, a0
+# CHECK-NEXT:  1      3     1.00                        max	a0, a0, a0
+# CHECK-NEXT:  1      3     1.00                        maxu	a0, a0, a0
 # CHECK-NEXT:  1      3     1.00                        rol	a0, a0, a0
 # CHECK-NEXT:  1      3     1.00                        ror	a0, a0, a0
 # CHECK-NEXT:  1      3     1.00                        rori	a0, a0, 1
@@ -225,7 +225,7 @@ jr a0
 
 # CHECK:      Resource pressure per iteration:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]
-# CHECK-NEXT:  -      -     39.00  52.00   -      -      -      -
+# CHECK-NEXT:  -      -     37.00  54.00   -      -      -      -
 
 # CHECK:      Resource pressure by instruction:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    Instructions:
@@ -289,9 +289,9 @@ jr a0
 # CHECK-NEXT:  -      -      -     1.00    -      -      -      -     sext.h	a0, a0
 # CHECK-NEXT:  -      -     1.00    -      -      -      -      -     zext.h	a0, a0
 # CHECK-NEXT:  -      -      -     1.00    -      -      -      -     min	a0, a0, a0
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     minu	a0, a0, a0
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     minu	a0, a0, a0
 # CHECK-NEXT:  -      -      -     1.00    -      -      -      -     max	a0, a0, a0
-# CHECK-NEXT:  -      -     1.00    -      -      -      -      -     maxu	a0, a0, a0
+# CHECK-NEXT:  -      -      -     1.00    -      -      -      -     maxu	a0, a0, a0
 # CHECK-NEXT:  -      -      -     1.00    -      -      -      -     rol	a0, a0, a0
 # CHECK-NEXT:  -      -      -     1.00    -      -      -      -     ror	a0, a0, a0
 # CHECK-NEXT:  -      -      -     1.00    -      -      -      -     rori	a0, a0, 1
diff --git a/llvm/test/tools/llvm-objdump/MachO/AArch64/Inputs/rel-method-lists-arm64.dylib b/llvm/test/tools/llvm-objdump/MachO/AArch64/Inputs/rel-method-lists-arm64.dylib
new file mode 100755
index 00000000000000..051e28f33d7494
Binary files /dev/null and b/llvm/test/tools/llvm-objdump/MachO/AArch64/Inputs/rel-method-lists-arm64.dylib differ
diff --git a/llvm/test/tools/llvm-objdump/MachO/AArch64/Inputs/rel-method-lists-arm64_32.dylib b/llvm/test/tools/llvm-objdump/MachO/AArch64/Inputs/rel-method-lists-arm64_32.dylib
new file mode 100755
index 00000000000000..d3a339057abc34
Binary files /dev/null and b/llvm/test/tools/llvm-objdump/MachO/AArch64/Inputs/rel-method-lists-arm64_32.dylib differ
diff --git a/llvm/test/tools/llvm-objdump/MachO/AArch64/macho-relative-method-lists.test b/llvm/test/tools/llvm-objdump/MachO/AArch64/macho-relative-method-lists.test
new file mode 100644
index 00000000000000..b1b96a41a32939
--- /dev/null
+++ b/llvm/test/tools/llvm-objdump/MachO/AArch64/macho-relative-method-lists.test
@@ -0,0 +1,86 @@
+RUN: llvm-objdump --macho --objc-meta-data    %p/Inputs/rel-method-lists-arm64_32.dylib | FileCheck %s --check-prefix=CHK32
+RUN: llvm-otool -ov                           %p/Inputs/rel-method-lists-arm64_32.dylib | FileCheck %s --check-prefix=CHK32
+
+RUN: llvm-objdump --macho --objc-meta-data    %p/Inputs/rel-method-lists-arm64.dylib    | FileCheck %s --check-prefix=CHK64
+RUN: llvm-otool -ov                           %p/Inputs/rel-method-lists-arm64.dylib    | FileCheck %s --check-prefix=CHK64
+
+CHK32:                 baseMethods 0x660 (struct method_list_t *)
+CHK32-NEXT:                 entsize 12 (relative)
+CHK32-NEXT:                   count 3
+CHK32-NEXT:                    name 0x144 (0x{{[0-9a-f]*}}) instance_method_00
+CHK32-NEXT:                   types 0x91 (0x{{[0-9a-f]*}}) v8@0:4
+CHK32-NEXT:                     imp 0xffffff18 (0x{{[0-9a-f]*}}) -[MyClass instance_method_00]
+CHK32-NEXT:                    name 0x13c (0x{{[0-9a-f]*}}) instance_method_01
+CHK32-NEXT:                   types 0x85 (0x{{[0-9a-f]*}}) v8@0:4
+CHK32-NEXT:                     imp 0xffffff28 (0x{{[0-9a-f]*}}) -[MyClass instance_method_01]
+CHK32-NEXT:                    name 0x134 (0x{{[0-9a-f]*}}) instance_method_02
+CHK32-NEXT:                   types 0x79 (0x{{[0-9a-f]*}}) v8@0:4
+CHK32-NEXT:                     imp 0xffffff38 (0x{{[0-9a-f]*}}) -[MyClass instance_method_02]
+
+CHK32:                 baseMethods 0x630 (struct method_list_t *)
+CHK32-NEXT:                 entsize 12 (relative)
+CHK32-NEXT:                   count 3
+CHK32-NEXT:                    name 0x180 (0x{{[0-9a-f]*}}) class_method_00
+CHK32-NEXT:                   types 0xc1 (0x{{[0-9a-f]*}}) v8@0:4
+CHK32-NEXT:                     imp 0xffffff9c (0x{{[0-9a-f]*}}) +[MyClass class_method_00]
+CHK32-NEXT:                    name 0x178 (0x{{[0-9a-f]*}}) class_method_01
+CHK32-NEXT:                   types 0xb5 (0x{{[0-9a-f]*}}) v8@0:4
+CHK32-NEXT:                     imp 0xffffffac (0x{{[0-9a-f]*}}) +[MyClass class_method_01]
+CHK32-NEXT:                    name 0x170 (0x{{[0-9a-f]*}}) class_method_02
+CHK32-NEXT:                   types 0xa9 (0x{{[0-9a-f]*}}) v8@0:4
+CHK32-NEXT:                     imp 0xffffffbc (0x{{[0-9a-f]*}}) +[MyClass class_method_02]
+
+CHK64:                  baseMethods 0x6e0 (struct method_list_t *)
+CHK64-NEXT:                  entsize 12 (relative)
+CHK64-NEXT:                    count 3
+CHK64-NEXT:                     name 0x188 (0x{{[0-9a-f]*}}) instance_method_00
+CHK64-NEXT:                    types 0x91 (0x{{[0-9a-f]*}}) v16@0:8
+CHK64-NEXT:                      imp 0xffffffa8 (0x{{[0-9a-f]*}}) -[MyClass instance_method_00]
+CHK64-NEXT:                     name 0x184 (0x{{[0-9a-f]*}}) instance_method_01
+CHK64-NEXT:                    types 0x85 (0x{{[0-9a-f]*}}) v16@0:8
+CHK64-NEXT:                      imp 0xffffffa0 (0x{{[0-9a-f]*}}) -[MyClass instance_method_01]
+CHK64-NEXT:                     name 0x180 (0x{{[0-9a-f]*}}) instance_method_02
+CHK64-NEXT:                    types 0x79 (0x{{[0-9a-f]*}}) v16@0:8
+CHK64-NEXT:                      imp 0xffffff98 (0x{{[0-9a-f]*}}) -[MyClass instance_method_02]
+
+CHK64:                  baseMethods 0x6b0 (struct method_list_t *)
+CHK64-NEXT:                  entsize 12 (relative)
+CHK64-NEXT:                    count 3
+CHK64-NEXT:                     name 0x1d0 (0x{{[0-9a-f]*}}) class_method_00
+CHK64-NEXT:                    types 0xc1 (0x{{[0-9a-f]*}}) v16@0:8
+CHK64-NEXT:                      imp 0xffffffe4 (0x{{[0-9a-f]*}}) +[MyClass class_method_00]
+CHK64-NEXT:                     name 0x1cc (0x{{[0-9a-f]*}}) class_method_01
+CHK64-NEXT:                    types 0xb5 (0x{{[0-9a-f]*}}) v16@0:8
+CHK64-NEXT:                      imp 0xffffffdc (0x{{[0-9a-f]*}}) +[MyClass class_method_01]
+CHK64-NEXT:                     name 0x1c8 (0x{{[0-9a-f]*}}) class_method_02
+CHK64-NEXT:                    types 0xa9 (0x{{[0-9a-f]*}}) v16@0:8
+CHK64-NEXT:                      imp 0xffffffd4 (0x{{[0-9a-f]*}}) +[MyClass class_method_02]
+
+######## Generate rel-method-lists-arm64.dylib ########
+// clang -c main.mm -o main.o -target arm64-apple-macos -arch arm64
+// ld64.ld64 -dylib -demangle -dynamic main.o -o rel-method-lists-arm64.dylib -syslibroot MacOSX14.2.sdk -segalign 0x10 -objc_relative_method_lists
+
+######## Generate rel-method-lists-arm64_32.dylib ########
+// clang -c main.mm -o main.o -target arm64_32-apple-watchos -arch arm64_32
+// ld64.ld64 -dylib -demangle -dynamic main.o -o rel-method-lists-arm64_32.dylib -syslibroot WatchOS.sdk -segalign 0x10 -objc_relative_method_lists
+
+// ~~~~~~~~~~~~~~~~~~~~~~~~~ main.mm ~~~~~~~~~~~~~~~~~~~~~~~~~
+__attribute__((objc_root_class))
+@interface MyClass
+- (void)instance_method_00;
+- (void)instance_method_01;
+- (void)instance_method_02;
++ (void)class_method_00;
++ (void)class_method_01;
++ (void)class_method_02;
+@end
+@implementation MyClass
+- (void)instance_method_00 {}
+- (void)instance_method_01 {}
+- (void)instance_method_02 {}
++ (void)class_method_00 {}
++ (void)class_method_01 {}
++ (void)class_method_02 {}
+@end
+void *_objc_empty_cache;
+void *_objc_empty_vtable;
diff --git a/llvm/test/tools/llvm-profgen/Inputs/coff-profile.exe b/llvm/test/tools/llvm-profgen/Inputs/coff-profile.exe
new file mode 100644
index 00000000000000..309476a2437f04
Binary files /dev/null and b/llvm/test/tools/llvm-profgen/Inputs/coff-profile.exe differ
diff --git a/llvm/test/tools/llvm-profgen/Inputs/coff-profile.perfscript b/llvm/test/tools/llvm-profgen/Inputs/coff-profile.perfscript
new file mode 100644
index 00000000000000..96eb878c31667c
--- /dev/null
+++ b/llvm/test/tools/llvm-profgen/Inputs/coff-profile.perfscript
@@ -0,0 +1,13 @@
+PERF_RECORD_MMAP2 5752/0: [0x7ff70a1b0000(0x640000) @ 0x1000 00:00 0 0]: r-xp c:\Users\haohaiwe\Desktop\coff-profile.exe
+ 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 
+ 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/-/X/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/-/X/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/-/X/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 
+ 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/-/X/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 
+ 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/-/X/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/-/X/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/-/X/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 
+ 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/-/X/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/-/X/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/-/X/A/0 
+ 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 
+ 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 
+ 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/-/X/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/-/X/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/-/X/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/-/X/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/-/X/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 
+ 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/-/X/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 
+ 0x7ff70a1b1415/0x7ff70a1b13b0/P/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/M/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/P/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/P/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/M/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/P/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/P/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/M/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/P/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/M/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/M/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/P/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/M/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/M/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/P/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/P/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/M/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/P/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/M/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/M/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/P/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/M/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/M/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/P/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/P/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/M/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/P/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/M/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/M/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/P/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/P/X/A/0 0x7ff70a1b1415/0x7ff70a1b13b0/M/X/A/0 
+ 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/-/X/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/-/X/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 
+ 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 0x7ff70a1b1482/0x7ff70a1b1430/P/-/A/0 
diff --git a/llvm/test/tools/llvm-profgen/coff-profile.test b/llvm/test/tools/llvm-profgen/coff-profile.test
new file mode 100644
index 00000000000000..5578f731cac830
--- /dev/null
+++ b/llvm/test/tools/llvm-profgen/coff-profile.test
@@ -0,0 +1,79 @@
+; RUN: llvm-profgen --format=text --perfscript=%S/Inputs/coff-profile.perfscript --binary=%S/Inputs/coff-profile.exe --output=%t
+; RUN: FileCheck %s --input-file %t
+
+CHECK:      main:31837:0
+CHECK-NEXT:  0: 0
+CHECK-NEXT:  3.1: 0
+CHECK-NEXT:  3.2: 0
+CHECK-NEXT:  8: 0
+CHECK-NEXT:  65501: 0
+CHECK-NEXT:  1: ??$init@HG@MyNameSpace2@@YAXHPEAG@Z:0
+CHECK-NEXT:   1: 0
+CHECK-NEXT:   1.1: 0
+CHECK-NEXT:   1.2: 0
+CHECK-NEXT:   2: 0
+CHECK-NEXT:   65514: 0
+CHECK-NEXT:  4: ?work1@?$MyClass@GH@MyNameSpace1@@QEAAXQEAGH@Z:3193
+CHECK-NEXT:   0: ?work@?$MyClass@GH@MyNameSpace1@@AEAAXQEAGHH@Z:3193
+CHECK-NEXT:    1.1: 31
+CHECK-NEXT:    1.2: 31
+CHECK-NEXT:    2: 31
+CHECK-NEXT:    3: 31
+CHECK-NEXT:    65530: 0
+CHECK-NEXT:  5: ?work2@?$MyClass@GH@MyNameSpace1@@QEAAXQEAGH@Z:28644
+CHECK-NEXT:   0: ?work@?$MyClass@GH@MyNameSpace1@@AEAAXQEAGHH@Z:28644
+CHECK-NEXT:    1.1: 341
+CHECK-NEXT:    1.2: 341
+CHECK-NEXT:    2: 341
+CHECK-NEXT:    3: 341
+CHECK-NEXT:    65530: 0
+CHECK-NEXT:  7: ?print@MyNameSpace2@@YAXPEAGH@Z:0
+CHECK-NEXT:   1: 0
+
+; Original code
+; clang-cl.exe -O2 -gdwarf -gline-tables-only coff-profile.cpp -fuse-ld=lld -Xclang -fdebug-info-for-profiling -link -debug:dwarf
+
+#include <stdio.h>
+
+namespace MyNameSpace1 {
+
+template <typename T1, typename T2> class MyClass {
+  void work(T1 map[], T2 n, T2 m) {
+    for (int i = 1; i < n; i++) {
+      map[i] = map[i - 1] * map[i - 1];
+      map[i] += (i * map[i - 1]) / m + i % m;
+    }
+  }
+
+public:
+  void work1(T1 map[], T2 n) { work(map, n, 7); }
+  void work2(T1 map[], T2 n) { work(map, n, 3); }
+};
+
+} // namespace MyNameSpace1
+
+namespace MyNameSpace2 {
+
+template <typename T1, typename T2> void init(T1 c, T2 *p) {
+  for (int i = 0; i < c * 1000000; i++) {
+    p[i] = i / 3 + (i * i) % 3;
+  }
+}
+
+void print(unsigned short *p, int i) {
+  printf("%d %d %d\n", p[i * i * 100], p[i * i * 100 + 1], p[i * i * 100 + 2]);
+}
+
+} // namespace MyNameSpace2
+
+unsigned short M[3000000];
+int main(int argc, char *argv[]) {
+  MyNameSpace2::init(argc, M);
+  MyNameSpace1::MyClass<unsigned short, int> Obj;
+  for (int i = 0; i <= argc * 10; i++) {
+    Obj.work1(&M[argc], argc * 100000);
+    Obj.work2(&M[argc * argc], argc * 1000000);
+  }
+  MyNameSpace2::print(M, argc);
+  return 0;
+}
diff --git a/llvm/test/tools/llvm-readobj/ELF/decompress-zlib-unsupported.test b/llvm/test/tools/llvm-readobj/ELF/decompress-zlib-unsupported.test
index f4c73de7ca6c9d..083c296b8cc137 100644
--- a/llvm/test/tools/llvm-readobj/ELF/decompress-zlib-unsupported.test
+++ b/llvm/test/tools/llvm-readobj/ELF/decompress-zlib-unsupported.test
@@ -10,6 +10,7 @@
 # CHECK-NEXT: [    18] x.c.
 # CHECK-NEXT: [    1e] .
 # CHECK-NEXT: [    20] .
+# CHECK-EMPTY:
 # CHECK-NEXT: Hex dump of section '.b':
 # CHECK-NEXT: warning: '[[FILE]]': LLVM was not built with LLVM_ENABLE_ZLIB or did not find zlib at build time
 # CHECK-NEXT: 0x00000000 01000000 00000000 01000000 00000000 ................
diff --git a/llvm/test/tools/llvm-readobj/ELF/decompress-zlib.test b/llvm/test/tools/llvm-readobj/ELF/decompress-zlib.test
index ea7a8854eb1a0c..c1d12a6d560ec8 100644
--- a/llvm/test/tools/llvm-readobj/ELF/decompress-zlib.test
+++ b/llvm/test/tools/llvm-readobj/ELF/decompress-zlib.test
@@ -28,6 +28,7 @@
 # COMPRESSED:      String dump of section '.not_null_terminated':
 # COMPRESSED-NEXT: [     0] no
 # COMPRESSED-NEXT: [     3] null
+# COMPRESSED-EMPTY:
 # COMPRESSED-NEXT: Hex dump of section '.strings':
 # COMPRESSED-NEXT: 0x00000000 01000000 00000000 16000000 00000000 ................
 # COMPRESSED-NEXT: 0x00000010 00000000 00000000 789ccb48 2d4a6548 ........x..H-JeH
@@ -39,6 +40,7 @@
 # INVALID:      String dump of section '.invalid1':
 # INVALID-NEXT: warning: '[[FILE]]': corrupted compressed section header
 # INVALID-NEXT: [     0] .
+# INVALID-EMPTY:
 # INVALID-NEXT: Hex dump of section '.invalid2':
 # INVALID-NEXT: warning: '[[FILE]]': zlib error: Z_DATA_ERROR
 # INVALID-NEXT: 0x00000000 01000000 00000000 16000000 00000000 ................
diff --git a/llvm/test/tools/llvm-readobj/ELF/decompress-zstd-unsupported.test b/llvm/test/tools/llvm-readobj/ELF/decompress-zstd-unsupported.test
index 65da952687f526..98c7cb002769ae 100644
--- a/llvm/test/tools/llvm-readobj/ELF/decompress-zstd-unsupported.test
+++ b/llvm/test/tools/llvm-readobj/ELF/decompress-zstd-unsupported.test
@@ -9,6 +9,7 @@
 # CHECK-NEXT: [    10] .
 # CHECK-NEXT: [    18] (./. ..
 # CHECK-NEXT: [    21] .
+# CHECK-EMPTY:
 # CHECK-NEXT: Hex dump of section '.b':
 # CHECK-NEXT: warning: '[[FILE]]': LLVM was not built with LLVM_ENABLE_ZSTD or did not find zstd at build time
 # CHECK-NEXT: 0x00000000 02000000 00000000 01000000 00000000 ................
diff --git a/llvm/test/tools/llvm-readobj/ELF/hex-dump-multi.s b/llvm/test/tools/llvm-readobj/ELF/hex-dump-multi.s
index 33ef534e81e138..942bfc4b7fb026 100644
--- a/llvm/test/tools/llvm-readobj/ELF/hex-dump-multi.s
+++ b/llvm/test/tools/llvm-readobj/ELF/hex-dump-multi.s
@@ -1,10 +1,12 @@
 # REQUIRES: x86-registered-target
 
 # RUN: llvm-mc -filetype=obj -triple x86_64 %s -o %t.o
-# RUN: llvm-readobj -x .a -x .b %t.o | FileCheck %s
+# RUN: llvm-readobj -x .a -x .b %t.o | FileCheck %s --check-prefixes=HEADER,CHECK
 # RUN: llvm-readelf -x .a -x .b %t.o | FileCheck %s
 
-# CHECK:      Hex dump of section '.a':
+# HEADER:     LoadName:
+# CHECK:      {{^$}}
+# CHECK-NEXT: Hex dump of section '.a':
 # CHECK-NEXT: 0x00000000 00
 # CHECK-EMPTY:
 # CHECK-NEXT: Hex dump of section '.b':
diff --git a/llvm/test/tools/llvm-readobj/ELF/hex-dump.test b/llvm/test/tools/llvm-readobj/ELF/hex-dump.test
index 7829944806f34f..71212dee5e07a7 100644
--- a/llvm/test/tools/llvm-readobj/ELF/hex-dump.test
+++ b/llvm/test/tools/llvm-readobj/ELF/hex-dump.test
@@ -46,7 +46,8 @@ FileHeader:
 # RUN: llvm-readelf --hex-dump=.sec %t2.out1 | \
 # RUN:   FileCheck %s --match-full-lines --strict-whitespace --check-prefix=SPACES1
 
-#      SPACES1:Hex dump of section '.sec':
+#      SPACES1:{{^$}}
+# SPACES1-NEXT:Hex dump of section '.sec':
 # SPACES1-NEXT:0x00000000 00000000 00000000 00000000 00000000 ................
 # SPACES1-NEXT:0x00000010 0000                                ..
 
@@ -55,7 +56,8 @@ FileHeader:
 # RUN: llvm-readelf --hex-dump=.sec %t2.out2 | \
 # RUN:   FileCheck %s --match-full-lines --strict-whitespace --check-prefix=SPACES2
 
-#      SPACES2:Hex dump of section '.sec':
+#      SPACES2:{{^$}}
+# SPACES2-NEXT:Hex dump of section '.sec':
 # SPACES2-NEXT:0x00000000 00000000 00000000 00000000 00000000 ................
 # SPACES2-NEXT:0x00000010 00000000 00000000 00000000 0000     ..............
 
@@ -64,7 +66,8 @@ FileHeader:
 # RUN: llvm-readelf --hex-dump=.sec %t2.out3 | \
 # RUN:   FileCheck %s --match-full-lines --strict-whitespace --check-prefix=SPACES3
 
-#      SPACES3:Hex dump of section '.sec':
+#      SPACES3:{{^$}}
+# SPACES3-NEXT:Hex dump of section '.sec':
 # SPACES3-NEXT:0x00000000 00000000 00000000 00000000 00000000 ................
 # SPACES3-NEXT:0x00000010 00000000 00000000 00000000          ............
 
diff --git a/llvm/test/tools/llvm-readobj/ELF/machine-specific-section-types.test b/llvm/test/tools/llvm-readobj/ELF/machine-specific-section-types.test
index f9524383e80b6d..f65793c5bdc62d 100644
--- a/llvm/test/tools/llvm-readobj/ELF/machine-specific-section-types.test
+++ b/llvm/test/tools/llvm-readobj/ELF/machine-specific-section-types.test
@@ -17,6 +17,10 @@
 # RUN: llvm-readobj --section-headers %t-aarch64.o | FileCheck %s --check-prefix=AARCH64-LLVM
 # RUN: llvm-readelf --section-headers %t-aarch64.o | FileCheck %s --check-prefix=AARCH64-GNU
 
+# RUN: yaml2obj %s --docnum=5 -o %t-hexagon.o
+# RUN: llvm-readobj --section-headers %t-hexagon.o | FileCheck %s --check-prefix=HEXAGON-LLVM
+# RUN: llvm-readelf --section-headers %t-hexagon.o | FileCheck %s --check-prefix=HEXAGON-GNU
+
 # ARM-LLVM: Name: exidx
 # ARM-LLVM: Type: SHT_ARM_EXIDX
 # ARM-LLVM: Name: preemptmap
@@ -64,6 +68,13 @@
 # AARCH64-GNU: .memtag.globals.dynamic AARCH64_MEMTAG_GLOBALS_DYNAMIC
 # AARCH64-GNU: .memtag.globals.static  AARCH64_MEMTAG_GLOBALS_STATIC
 
+# HEXAGON-LLVM: Name: hexagon_ordered
+# HEXAGON-LLVM: Type: SHT_HEX_ORDERED
+# HEXAGON-LLVM: Name: .hexagon.attributes
+# HEXAGON-LLVM: Type: SHT_HEXAGON_ATTRIBUTES
+
+# HEXAGON-GNU: hexagon_ordered HEX_ORDERED
+
 --- !ELF
 FileHeader:
   Class:   ELFCLASS64
@@ -122,3 +133,15 @@ Sections:
     Type:  SHT_AARCH64_MEMTAG_GLOBALS_DYNAMIC
   - Name:  .memtag.globals.static
     Type:  SHT_AARCH64_MEMTAG_GLOBALS_STATIC
+
+--- !ELF
+FileHeader:
+  Class:   ELFCLASS32
+  Data:    ELFDATA2LSB
+  Type:    ET_REL
+  Machine: EM_HEXAGON
+Sections:
+  - Name: hexagon_ordered
+    Type: SHT_HEX_ORDERED
+  - Name: .hexagon.attributes
+    Type: SHT_HEXAGON_ATTRIBUTES
diff --git a/llvm/test/tools/llvm-readobj/ELF/string-dump-multi.s b/llvm/test/tools/llvm-readobj/ELF/string-dump-multi.s
index 29d7ef011005e5..36a115bd259a96 100644
--- a/llvm/test/tools/llvm-readobj/ELF/string-dump-multi.s
+++ b/llvm/test/tools/llvm-readobj/ELF/string-dump-multi.s
@@ -1,10 +1,12 @@
 # REQUIRES: x86-registered-target
 
 # RUN: llvm-mc -filetype=obj -triple x86_64 %s -o %t.o
-# RUN: llvm-readobj -p .a -p .b %t.o | FileCheck %s
+# RUN: llvm-readobj -p .a -p .b %t.o | FileCheck %s --check-prefixes=HEADER,CHECK
 # RUN: llvm-readelf -p .a -p .b %t.o | FileCheck %s
 
-# CHECK:      String dump of section '.a':
+# HEADER:     LoadName:
+# CHECK:      {{^$}}
+# CHECK-NEXT: String dump of section '.a':
 # CHECK-NEXT: [     0] 0
 # CHECK-EMPTY:
 # CHECK-NEXT: String dump of section '.b':
diff --git a/llvm/test/tools/llvm-readobj/ELF/string-dump.test b/llvm/test/tools/llvm-readobj/ELF/string-dump.test
index c06b274d25288d..1d7a177b32de12 100644
--- a/llvm/test/tools/llvm-readobj/ELF/string-dump.test
+++ b/llvm/test/tools/llvm-readobj/ELF/string-dump.test
@@ -3,7 +3,7 @@
 
 # RUN: llvm-readobj --string-dump=.strings \
 # RUN:   --string-dump=.not_null_terminated %t > %t.readobj.out
-# RUN: FileCheck %s --input-file=%t.readobj.out
+# RUN: FileCheck %s --input-file=%t.readobj.out --check-prefixes=HEADER,CHECK
 
 # Also test the different ways --string-dump can be specified, i.e. as a short
 # flag (-p), with different prefix modes (-p .foo, -p=.foo, -p.foo), and with
@@ -23,7 +23,9 @@
 # RUN: llvm-readelf -hp1 -p2 %t | cmp %t.readelf.out -
 # RUN: llvm-readelf -hp 1 -p.not_null_terminated %t | cmp %t.readelf.out -
 
-# CHECK:      String dump of section '.strings':
+# HEADER:     LoadName:
+# CHECK:      {{^$}}
+# CHECK-NEXT: String dump of section '.strings':
 # CHECK-NEXT: [ 0] here
 # CHECK-NEXT: [ 5] are
 # CHECK-NEXT: [ 9] some
diff --git a/llvm/test/tools/llvm-reduce/remove-dp-values.ll b/llvm/test/tools/llvm-reduce/remove-dp-values.ll
index 40ff9f32e960d0..d137b279f4ea01 100644
--- a/llvm/test/tools/llvm-reduce/remove-dp-values.ll
+++ b/llvm/test/tools/llvm-reduce/remove-dp-values.ll
@@ -1,7 +1,7 @@
 ; RUN: llvm-reduce --abort-on-invalid-reduction --test FileCheck --test-arg --check-prefixes=CHECK-INTERESTINGNESS --test-arg %s --test-arg --input-file %s -o %t --try-experimental-debuginfo-iterators
 ; RUN: FileCheck --check-prefixes=CHECK-FINAL --input-file=%t %s --implicit-check-not=dbg.value
 
-; Test that we can, in RemoveDIs mode / DPValues mode (where variable location
+; Test that we can, in RemoveDIs mode / DbgVariableRecords mode (where variable location
 ; information isn't an instruction), remove one variable location assignment
 ; but not another.
 
diff --git a/llvm/tools/dsymutil/CMakeLists.txt b/llvm/tools/dsymutil/CMakeLists.txt
index 53f882e90b4e92..efe28bda68ebf1 100644
--- a/llvm/tools/dsymutil/CMakeLists.txt
+++ b/llvm/tools/dsymutil/CMakeLists.txt
@@ -32,7 +32,6 @@ add_llvm_tool(dsymutil
   MachOUtils.cpp
   Reproducer.cpp
   RelocationMap.cpp
-  SymbolMap.cpp
 
   DEPENDS
   intrinsics_gen
diff --git a/llvm/tools/dsymutil/DwarfLinkerForBinary.cpp b/llvm/tools/dsymutil/DwarfLinkerForBinary.cpp
index 5ae5ecd556adb7..677dfc44c54a40 100644
--- a/llvm/tools/dsymutil/DwarfLinkerForBinary.cpp
+++ b/llvm/tools/dsymutil/DwarfLinkerForBinary.cpp
@@ -634,25 +634,19 @@ bool DwarfLinkerForBinary::linkImpl(
 
   DebugMap DebugMap(Map.getTriple(), Map.getBinaryPath());
 
-  std::function<StringRef(StringRef)> TranslationLambda = [&](StringRef Input) {
-    assert(Options.Translator);
-    return Options.Translator(Input);
-  };
-
   std::unique_ptr<Linker> GeneralLinker = Linker::createLinker(
       [&](const Twine &Error, StringRef Context, const DWARFDie *DIE) {
         reportError(Error, Context, DIE);
       },
       [&](const Twine &Warning, StringRef Context, const DWARFDie *DIE) {
         reportWarning(Warning, Context, DIE);
-      },
-      Options.Translator ? TranslationLambda : nullptr);
+      });
 
   std::unique_ptr<classic::DwarfStreamer> Streamer;
   if (!Options.NoOutput) {
     if (Expected<std::unique_ptr<classic::DwarfStreamer>> StreamerOrErr =
             classic::DwarfStreamer::createStreamer(
-                Map.getTriple(), ObjectType, OutFile, Options.Translator,
+                Map.getTriple(), ObjectType, OutFile,
                 [&](const Twine &Warning, StringRef Context,
                     const DWARFDie *DIE) {
                   reportWarning(Warning, Context, DIE);
@@ -866,8 +860,8 @@ bool DwarfLinkerForBinary::linkImpl(
   if (Map.getTriple().isOSDarwin() && !Map.getBinaryPath().empty() &&
       ObjectType == Linker::OutputFileType::Object)
     return MachOUtils::generateDsymCompanion(
-        Options.VFS, Map, Options.Translator,
-        *Streamer->getAsmPrinter().OutStreamer, OutFile, RelocationsToApply);
+        Options.VFS, Map, *Streamer->getAsmPrinter().OutStreamer, OutFile,
+        RelocationsToApply);
 
   Streamer->finish();
   return true;
diff --git a/llvm/tools/dsymutil/LinkUtils.h b/llvm/tools/dsymutil/LinkUtils.h
index fd9d985097d6e2..6aa0b847eebd67 100644
--- a/llvm/tools/dsymutil/LinkUtils.h
+++ b/llvm/tools/dsymutil/LinkUtils.h
@@ -9,8 +9,6 @@
 #ifndef LLVM_TOOLS_DSYMUTIL_LINKOPTIONS_H
 #define LLVM_TOOLS_DSYMUTIL_LINKOPTIONS_H
 
-#include "SymbolMap.h"
-
 #include "llvm/ADT/Twine.h"
 #include "llvm/Remarks/RemarkFormat.h"
 #include "llvm/Support/VirtualFileSystem.h"
@@ -87,9 +85,6 @@ struct LinkOptions {
   /// The Resources directory in the .dSYM bundle.
   std::optional<std::string> ResourceDir;
 
-  /// Symbol map translator.
-  SymbolMapTranslator Translator;
-
   /// Virtual File System.
   llvm::IntrusiveRefCntPtr<llvm::vfs::FileSystem> VFS =
       vfs::getRealFileSystem();
diff --git a/llvm/tools/dsymutil/MachOUtils.cpp b/llvm/tools/dsymutil/MachOUtils.cpp
index 3efc1aff823744..8e144d640ed01f 100644
--- a/llvm/tools/dsymutil/MachOUtils.cpp
+++ b/llvm/tools/dsymutil/MachOUtils.cpp
@@ -373,7 +373,7 @@ static unsigned segmentLoadCommandSize(bool Is64Bit, unsigned NumSections) {
 // \a OutFile and it must be using a MachObjectWriter object to do so.
 bool generateDsymCompanion(
     llvm::IntrusiveRefCntPtr<llvm::vfs::FileSystem> VFS, const DebugMap &DM,
-    SymbolMapTranslator &Translator, MCStreamer &MS, raw_fd_ostream &OutFile,
+    MCStreamer &MS, raw_fd_ostream &OutFile,
     const std::vector<MachOUtils::DwarfRelocationApplicationInfo>
         &RelocationsToApply) {
   auto &ObjectStreamer = static_cast<MCObjectStreamer &>(MS);
@@ -509,12 +509,9 @@ bool generateDsymCompanion(
   }
 
   SmallString<0> NewSymtab;
-  std::function<StringRef(StringRef)> TranslationLambda =
-      Translator ? [&](StringRef Input) { return Translator(Input); }
-                 : static_cast<std::function<StringRef(StringRef)>>(nullptr);
   // Legacy dsymutil puts an empty string at the start of the line table.
   // thus we set NonRelocatableStringpool(,PutEmptyString=true)
-  NonRelocatableStringpool NewStrings(TranslationLambda, true);
+  NonRelocatableStringpool NewStrings(true);
   unsigned NListSize = Is64Bit ? sizeof(MachO::nlist_64) : sizeof(MachO::nlist);
   unsigned NumSyms = 0;
   uint64_t NewStringsSize = 0;
diff --git a/llvm/tools/dsymutil/MachOUtils.h b/llvm/tools/dsymutil/MachOUtils.h
index 059d9fdd788a66..0229647b00f85c 100644
--- a/llvm/tools/dsymutil/MachOUtils.h
+++ b/llvm/tools/dsymutil/MachOUtils.h
@@ -8,8 +8,6 @@
 #ifndef LLVM_TOOLS_DSYMUTIL_MACHOUTILS_H
 #define LLVM_TOOLS_DSYMUTIL_MACHOUTILS_H
 
-#include "SymbolMap.h"
-
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/VirtualFileSystem.h"
@@ -59,7 +57,7 @@ bool generateUniversalBinary(SmallVectorImpl<ArchAndFile> &ArchFiles,
                              StringRef SDKPath, bool Fat64 = false);
 bool generateDsymCompanion(
     llvm::IntrusiveRefCntPtr<llvm::vfs::FileSystem> VFS, const DebugMap &DM,
-    SymbolMapTranslator &Translator, MCStreamer &MS, raw_fd_ostream &OutFile,
+    MCStreamer &MS, raw_fd_ostream &OutFile,
     const std::vector<MachOUtils::DwarfRelocationApplicationInfo>
         &RelocationsToApply);
 
diff --git a/llvm/tools/dsymutil/Options.td b/llvm/tools/dsymutil/Options.td
index a4e4c6c4cdb9c0..d8cec0cb2c410d 100644
--- a/llvm/tools/dsymutil/Options.td
+++ b/llvm/tools/dsymutil/Options.td
@@ -128,12 +128,6 @@ def object_prefix_map: Separate<["--", "-"], "object-prefix-map">,
     Group<grp_general>;
 def: Joined<["--", "-"], "object-prefix-map=">, Alias<object_prefix_map>;
 
-def symbolmap: Separate<["--", "-"], "symbol-map">,
-  MetaVarName<"<bcsymbolmap>">,
-  HelpText<"Updates the existing dSYMs inplace using symbol map specified.">,
-  Group<grp_general>;
-def: Joined<["--", "-"], "symbol-map=">, Alias<symbolmap>;
-
 def arch: Separate<["--", "-"], "arch">,
   MetaVarName<"<arch>">,
   HelpText<"Link DWARF debug information only for specified CPU architecture"
diff --git a/llvm/tools/dsymutil/SymbolMap.cpp b/llvm/tools/dsymutil/SymbolMap.cpp
deleted file mode 100644
index c55362e5f77563..00000000000000
--- a/llvm/tools/dsymutil/SymbolMap.cpp
+++ /dev/null
@@ -1,157 +0,0 @@
-//===- tools/dsymutil/SymbolMap.cpp ---------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "SymbolMap.h"
-#include "DebugMap.h"
-#include "MachOUtils.h"
-
-#include "llvm/Support/FileSystem.h"
-#include "llvm/Support/Path.h"
-#include "llvm/Support/WithColor.h"
-
-#ifdef __APPLE__
-#include <CoreFoundation/CoreFoundation.h>
-#include <uuid/uuid.h>
-#endif
-
-namespace llvm {
-namespace dsymutil {
-
-StringRef SymbolMapTranslator::operator()(StringRef Input) {
-  if (!Input.starts_with("__hidden#") && !Input.starts_with("___hidden#"))
-    return Input;
-
-  StringRef Line = Input.drop_front(sizeof("__hidden#") - 1);
-  bool MightNeedUnderscore = Line.consume_front("#");
-
-  std::size_t LineNumber = std::numeric_limits<std::size_t>::max();
-  Line.split('_').first.getAsInteger(10, LineNumber);
-  if (LineNumber >= UnobfuscatedStrings.size()) {
-    WithColor::warning() << "reference to a unexisting unobfuscated string "
-                         << Input << ": symbol map mismatch?\n"
-                         << Line << '\n';
-    return Input;
-  }
-
-  const std::string &Translation = UnobfuscatedStrings[LineNumber];
-  if (!MightNeedUnderscore || !MangleNames)
-    return Translation;
-
-  // Objective-C symbols for the MachO symbol table start with a \1. Please see
-  // `MangleContext::mangleObjCMethodName` in clang.
-  if (Translation[0] == 1)
-    return StringRef(Translation).drop_front();
-
-  // We need permanent storage for the string we are about to create. Just
-  // append it to the vector containing translations. This should only happen
-  // during MachO symbol table translation, thus there should be no risk on
-  // exponential growth.
-  UnobfuscatedStrings.emplace_back("_" + Translation);
-  return UnobfuscatedStrings.back();
-}
-
-SymbolMapTranslator SymbolMapLoader::Load(StringRef InputFile,
-                                          const DebugMap &Map) const {
-  if (SymbolMap.empty())
-    return {};
-
-  std::string SymbolMapPath = SymbolMap;
-
-#if __APPLE__
-  // Look through the UUID Map.
-  if (sys::fs::is_directory(SymbolMapPath) && !Map.getUUID().empty()) {
-    uuid_string_t UUIDString;
-    uuid_unparse_upper((const uint8_t *)Map.getUUID().data(), UUIDString);
-
-    SmallString<256> PlistPath(
-        sys::path::parent_path(sys::path::parent_path(InputFile)));
-    sys::path::append(PlistPath, StringRef(UUIDString).str() + ".plist");
-
-    CFStringRef plistFile = CFStringCreateWithCString(
-        kCFAllocatorDefault, PlistPath.c_str(), kCFStringEncodingUTF8);
-    CFURLRef fileURL = CFURLCreateWithFileSystemPath(
-        kCFAllocatorDefault, plistFile, kCFURLPOSIXPathStyle, false);
-    CFReadStreamRef resourceData =
-        CFReadStreamCreateWithFile(kCFAllocatorDefault, fileURL);
-    if (resourceData) {
-      CFReadStreamOpen(resourceData);
-      CFDictionaryRef plist = (CFDictionaryRef)CFPropertyListCreateWithStream(
-          kCFAllocatorDefault, resourceData, 0, kCFPropertyListImmutable,
-          nullptr, nullptr);
-
-      if (plist) {
-        if (CFDictionaryContainsKey(plist, CFSTR("DBGOriginalUUID"))) {
-          CFStringRef OldUUID = (CFStringRef)CFDictionaryGetValue(
-              plist, CFSTR("DBGOriginalUUID"));
-
-          StringRef UUID(CFStringGetCStringPtr(OldUUID, kCFStringEncodingUTF8));
-          SmallString<256> BCSymbolMapPath(SymbolMapPath);
-          sys::path::append(BCSymbolMapPath, UUID.str() + ".bcsymbolmap");
-          SymbolMapPath = std::string(BCSymbolMapPath);
-        }
-        CFRelease(plist);
-      }
-      CFReadStreamClose(resourceData);
-      CFRelease(resourceData);
-    }
-    CFRelease(fileURL);
-    CFRelease(plistFile);
-  }
-#endif
-
-  if (sys::fs::is_directory(SymbolMapPath)) {
-    SymbolMapPath += (Twine("/") + sys::path::filename(InputFile) + "-" +
-                      MachOUtils::getArchName(Map.getTriple().getArchName()) +
-                      ".bcsymbolmap")
-                         .str();
-  }
-
-  auto ErrOrMemBuffer = MemoryBuffer::getFile(SymbolMapPath);
-  if (auto EC = ErrOrMemBuffer.getError()) {
-    WithColor::warning() << SymbolMapPath << ": " << EC.message()
-                         << ": not unobfuscating.\n";
-    return {};
-  }
-
-  std::vector<std::string> UnobfuscatedStrings;
-  auto &MemBuf = **ErrOrMemBuffer;
-  StringRef Data(MemBuf.getBufferStart(),
-                 MemBuf.getBufferEnd() - MemBuf.getBufferStart());
-  StringRef LHS;
-  std::tie(LHS, Data) = Data.split('\n');
-  bool MangleNames = false;
-
-  // Check version string first.
-  if (!LHS.starts_with("BCSymbolMap Version:")) {
-    // Version string not present, warns but try to parse it.
-    WithColor::warning() << SymbolMapPath
-                         << " is missing version string: assuming 1.0.\n";
-    UnobfuscatedStrings.emplace_back(LHS);
-  } else if (LHS.equals("BCSymbolMap Version: 1.0")) {
-    MangleNames = true;
-  } else if (LHS.equals("BCSymbolMap Version: 2.0")) {
-    MangleNames = false;
-  } else {
-    StringRef VersionNum;
-    std::tie(LHS, VersionNum) = LHS.split(':');
-    WithColor::warning() << SymbolMapPath
-                         << " has unsupported symbol map version" << VersionNum
-                         << ": not unobfuscating.\n";
-    return {};
-  }
-
-  while (!Data.empty()) {
-    std::tie(LHS, Data) = Data.split('\n');
-    UnobfuscatedStrings.emplace_back(LHS);
-  }
-
-  return SymbolMapTranslator(std::move(UnobfuscatedStrings), MangleNames);
-}
-
-} // namespace dsymutil
-} // namespace llvm
diff --git a/llvm/tools/dsymutil/SymbolMap.h b/llvm/tools/dsymutil/SymbolMap.h
deleted file mode 100644
index 977de31a5a17ac..00000000000000
--- a/llvm/tools/dsymutil/SymbolMap.h
+++ /dev/null
@@ -1,53 +0,0 @@
-//=- tools/dsymutil/SymbolMap.h -----------------------------------*- C++ -*-=//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_TOOLS_DSYMUTIL_SYMBOLMAP_H
-#define LLVM_TOOLS_DSYMUTIL_SYMBOLMAP_H
-
-#include "llvm/ADT/StringRef.h"
-
-#include <string>
-#include <vector>
-
-namespace llvm {
-namespace dsymutil {
-class DebugMap;
-
-/// Callable class to unobfuscate strings based on a BCSymbolMap.
-class SymbolMapTranslator {
-public:
-  SymbolMapTranslator() : MangleNames(false) {}
-
-  SymbolMapTranslator(std::vector<std::string> UnobfuscatedStrings,
-                      bool MangleNames)
-      : UnobfuscatedStrings(std::move(UnobfuscatedStrings)),
-        MangleNames(MangleNames) {}
-
-  StringRef operator()(StringRef Input);
-
-  operator bool() const { return !UnobfuscatedStrings.empty(); }
-
-private:
-  std::vector<std::string> UnobfuscatedStrings;
-  bool MangleNames;
-};
-
-/// Class to initialize SymbolMapTranslators from a BCSymbolMap.
-class SymbolMapLoader {
-public:
-  SymbolMapLoader(std::string SymbolMap) : SymbolMap(std::move(SymbolMap)) {}
-
-  SymbolMapTranslator Load(StringRef InputFile, const DebugMap &Map) const;
-
-private:
-  const std::string SymbolMap;
-};
-} // namespace dsymutil
-} // namespace llvm
-
-#endif // LLVM_TOOLS_DSYMUTIL_SYMBOLMAP_H
diff --git a/llvm/tools/dsymutil/dsymutil.cpp b/llvm/tools/dsymutil/dsymutil.cpp
index 25e281c415e75a..bc968b6387b653 100644
--- a/llvm/tools/dsymutil/dsymutil.cpp
+++ b/llvm/tools/dsymutil/dsymutil.cpp
@@ -108,7 +108,6 @@ struct DsymutilOptions {
   bool Flat = false;
   bool InputIsYAMLDebugMap = false;
   bool ForceKeepFunctionForStatic = false;
-  std::string SymbolMap;
   std::string OutputFile;
   std::string Toolchain;
   std::string ReproducerPath;
@@ -341,12 +340,6 @@ static Expected<DsymutilOptions> getOptions(opt::InputArgList &Args) {
     return DWARFLinkerType.takeError();
   }
 
-  if (opt::Arg *SymbolMap = Args.getLastArg(OPT_symbolmap))
-    Options.SymbolMap = SymbolMap->getValue();
-
-  if (Args.hasArg(OPT_symbolmap))
-    Options.LinkOpts.Update = true;
-
   if (Expected<std::vector<std::string>> InputFiles =
           getInputs(Args, Options.LinkOpts.Update)) {
     Options.InputFiles = std::move(*InputFiles);
@@ -560,8 +553,7 @@ getOutputFileName(StringRef InputFile, const DsymutilOptions &Options) {
     return OutputLocation(Options.OutputFile);
 
   // When updating, do in place replacement.
-  if (Options.OutputFile.empty() &&
-      (Options.LinkOpts.Update || !Options.SymbolMap.empty()))
+  if (Options.OutputFile.empty() && Options.LinkOpts.Update)
     return OutputLocation(std::string(InputFile));
 
   // When dumping the debug map, just return an empty output location. This
@@ -668,8 +660,6 @@ int dsymutil_main(int argc, char **argv, const llvm::ToolContext &) {
       return EXIT_FAILURE;
     }
 
-  SymbolMapLoader SymMapLoader(Options.SymbolMap);
-
   for (auto &InputFile : Options.InputFiles) {
     // Dump the symbol table for each input file and requested arch
     if (Options.DumpStab) {
@@ -760,9 +750,6 @@ int dsymutil_main(int argc, char **argv, const llvm::ToolContext &) {
         if (Options.DumpDebugMap)
           continue;
 
-        if (!Options.SymbolMap.empty())
-          Options.LinkOpts.Translator = SymMapLoader.Load(InputFile, *Map);
-
         if (Map->begin() == Map->end()) {
           std::lock_guard<std::mutex> Guard(ErrorHandlerMutex);
           WithColor::warning()
diff --git a/llvm/tools/llvm-as/llvm-as.cpp b/llvm/tools/llvm-as/llvm-as.cpp
index 1c869e1739319e..fd852563838f37 100644
--- a/llvm/tools/llvm-as/llvm-as.cpp
+++ b/llvm/tools/llvm-as/llvm-as.cpp
@@ -67,6 +67,7 @@ static cl::opt<std::string> ClDataLayout("data-layout",
                                          cl::desc("data layout string to use"),
                                          cl::value_desc("layout-string"),
                                          cl::init(""), cl::cat(AsCat));
+extern bool WriteNewDbgInfoFormatToBitcode;
 
 static void WriteOutputFile(const Module *M, const ModuleSummaryIndex *Index) {
   // Infer the output filename if needed.
@@ -139,6 +140,12 @@ int main(int argc, char **argv) {
     Err.print(argv[0], errs());
     return 1;
   }
+
+  // Convert to new debug format if requested.
+  assert(!M->IsNewDbgInfoFormat && "Unexpectedly in new debug mode");
+  if (UseNewDbgInfoFormat && WriteNewDbgInfoFormatToBitcode)
+    M->convertToNewDbgValues();
+
   std::unique_ptr<ModuleSummaryIndex> Index = std::move(ModuleAndIndex.Index);
 
   if (!DisableVerify) {
diff --git a/llvm/tools/llvm-c-test/debuginfo.c b/llvm/tools/llvm-c-test/debuginfo.c
index a3e41be12e95d4..78ccaf12a380b5 100644
--- a/llvm/tools/llvm-c-test/debuginfo.c
+++ b/llvm/tools/llvm-c-test/debuginfo.c
@@ -29,9 +29,10 @@ declare_objc_class(LLVMDIBuilderRef DIB, LLVMMetadataRef File) {
   return Decl;
 }
 
-int llvm_test_dibuilder(void) {
+int llvm_test_dibuilder(bool NewDebugInfoFormat) {
   const char *Filename = "debuginfo.c";
   LLVMModuleRef M = LLVMModuleCreateWithName(Filename);
+  LLVMSetIsNewDbgInfoFormat(M, NewDebugInfoFormat);
   LLVMDIBuilderRef DIB = LLVMCreateDIBuilder(M);
 
   LLVMMetadataRef File = LLVMDIBuilderCreateFile(DIB, Filename,
@@ -135,21 +136,38 @@ int llvm_test_dibuilder(void) {
   LLVMMetadataRef FooParamVar1 =
     LLVMDIBuilderCreateParameterVariable(DIB, FunctionMetadata, "a", 1, 1, File,
                                          42, Int64Ty, true, 0);
-  LLVMDIBuilderInsertDeclareAtEnd(DIB, LLVMConstInt(LLVMInt64Type(), 0, false),
-                                  FooParamVar1, FooParamExpression,
-                                  FooParamLocation, FooEntryBlock);
+  if (LLVMIsNewDbgInfoFormat(M))
+    LLVMDIBuilderInsertDeclareRecordAtEnd(
+        DIB, LLVMConstInt(LLVMInt64Type(), 0, false), FooParamVar1,
+        FooParamExpression, FooParamLocation, FooEntryBlock);
+  else
+    LLVMDIBuilderInsertDeclareAtEnd(
+        DIB, LLVMConstInt(LLVMInt64Type(), 0, false), FooParamVar1,
+        FooParamExpression, FooParamLocation, FooEntryBlock);
   LLVMMetadataRef FooParamVar2 =
     LLVMDIBuilderCreateParameterVariable(DIB, FunctionMetadata, "b", 1, 2, File,
                                          42, Int64Ty, true, 0);
-  LLVMDIBuilderInsertDeclareAtEnd(DIB, LLVMConstInt(LLVMInt64Type(), 0, false),
-                                  FooParamVar2, FooParamExpression,
-                                  FooParamLocation, FooEntryBlock);
+
+  if (LLVMIsNewDbgInfoFormat(M))
+    LLVMDIBuilderInsertDeclareRecordAtEnd(
+        DIB, LLVMConstInt(LLVMInt64Type(), 0, false), FooParamVar2,
+        FooParamExpression, FooParamLocation, FooEntryBlock);
+  else
+    LLVMDIBuilderInsertDeclareAtEnd(
+        DIB, LLVMConstInt(LLVMInt64Type(), 0, false), FooParamVar2,
+        FooParamExpression, FooParamLocation, FooEntryBlock);
+
   LLVMMetadataRef FooParamVar3 =
     LLVMDIBuilderCreateParameterVariable(DIB, FunctionMetadata, "c", 1, 3, File,
                                          42, VectorTy, true, 0);
-  LLVMDIBuilderInsertDeclareAtEnd(DIB, LLVMConstInt(LLVMInt64Type(), 0, false),
-                                  FooParamVar3, FooParamExpression,
-                                  FooParamLocation, FooEntryBlock);
+  if (LLVMIsNewDbgInfoFormat(M))
+    LLVMDIBuilderInsertDeclareRecordAtEnd(
+        DIB, LLVMConstInt(LLVMInt64Type(), 0, false), FooParamVar3,
+        FooParamExpression, FooParamLocation, FooEntryBlock);
+  else
+    LLVMDIBuilderInsertDeclareAtEnd(
+        DIB, LLVMConstInt(LLVMInt64Type(), 0, false), FooParamVar3,
+        FooParamExpression, FooParamLocation, FooEntryBlock);
 
   LLVMSetSubprogram(FooFunction, FunctionMetadata);
 
@@ -166,9 +184,12 @@ int llvm_test_dibuilder(void) {
   LLVMValueRef FooVal1 = LLVMConstInt(LLVMInt64Type(), 0, false);
   LLVMMetadataRef FooVarValueExpr =
     LLVMDIBuilderCreateConstantValueExpression(DIB, 0);
-
-  LLVMDIBuilderInsertDbgValueAtEnd(DIB, FooVal1, FooVar1, FooVarValueExpr,
-                                   FooVarsLocation, FooVarBlock);
+  if (LLVMIsNewDbgInfoFormat(M))
+    LLVMDIBuilderInsertDbgValueRecordAtEnd(
+        DIB, FooVal1, FooVar1, FooVarValueExpr, FooVarsLocation, FooVarBlock);
+  else
+    LLVMDIBuilderInsertDbgValueIntrinsicAtEnd(
+        DIB, FooVal1, FooVar1, FooVarValueExpr, FooVarsLocation, FooVarBlock);
 
   LLVMMetadataRef MacroFile =
       LLVMDIBuilderCreateTempMacroFile(DIB, NULL, 0, File);
diff --git a/llvm/tools/llvm-c-test/echo.cpp b/llvm/tools/llvm-c-test/echo.cpp
index bc708e2d472edd..347863638849ce 100644
--- a/llvm/tools/llvm-c-test/echo.cpp
+++ b/llvm/tools/llvm-c-test/echo.cpp
@@ -1397,6 +1397,14 @@ static void clone_symbols(LLVMModuleRef Src, LLVMModuleRef M) {
     }
     LLVMDisposeValueMetadataEntries(AllMetadata);
 
+    // Copy any prefix data that may be on the function
+    if (LLVMHasPrefixData(Cur))
+      LLVMSetPrefixData(Fun, clone_constant(LLVMGetPrefixData(Cur), M));
+
+    // Copy any prologue data that may be on the function
+    if (LLVMHasPrologueData(Cur))
+      LLVMSetPrologueData(Fun, clone_constant(LLVMGetPrologueData(Cur), M));
+
     FunCloner FC(Cur, Fun);
     FC.CloneBBs(Cur);
 
diff --git a/llvm/tools/llvm-c-test/llvm-c-test.h b/llvm/tools/llvm-c-test/llvm-c-test.h
index 00566660257e07..c50d3cce86748d 100644
--- a/llvm/tools/llvm-c-test/llvm-c-test.h
+++ b/llvm/tools/llvm-c-test/llvm-c-test.h
@@ -36,7 +36,7 @@ int llvm_calc(void);
 int llvm_disassemble(void);
 
 // debuginfo.c
-int llvm_test_dibuilder(void);
+int llvm_test_dibuilder(bool NewDebugInfoFormat);
 int llvm_get_di_tag(void);
 int llvm_di_type_get_name(void);
 
diff --git a/llvm/tools/llvm-c-test/main.c b/llvm/tools/llvm-c-test/main.c
index badbe4b13b6ba5..c4748d342fba14 100644
--- a/llvm/tools/llvm-c-test/main.c
+++ b/llvm/tools/llvm-c-test/main.c
@@ -110,7 +110,7 @@ int main(int argc, char **argv) {
   } else if (argc == 2 && !strcmp(argv[1], "--test-diagnostic-handler")) {
     return llvm_test_diagnostic_handler();
   } else if (argc == 2 && !strcmp(argv[1], "--test-dibuilder")) {
-    return llvm_test_dibuilder();
+    return llvm_test_dibuilder(false) && llvm_test_dibuilder(true);
   } else {
     print_usage();
   }
diff --git a/llvm/tools/llvm-debuginfo-analyzer/README.txt b/llvm/tools/llvm-debuginfo-analyzer/README.txt
index e6c20db7cd7121..ce7569d2722452 100644
--- a/llvm/tools/llvm-debuginfo-analyzer/README.txt
+++ b/llvm/tools/llvm-debuginfo-analyzer/README.txt
@@ -31,11 +31,11 @@ use LIT tests to validate the logical readers.
 
 Convert the unitests:
   llvm-project/llvm/unittests/DebugInfo/LogicalView/CodeViewReaderTest.cpp
-  llvm-project/llvm/unittests/DebugInfo/LogicalView/ELFReaderTest.cpp
+  llvm-project/llvm/unittests/DebugInfo/LogicalView/DWARFReaderTest.cpp
 
 into LIT tests:
   llvm-project/llvm/test/DebugInfo/LogicalView/CodeViewReader.test
-  llvm-project/llvm/test/DebugInfo/LogicalView/ELFReader.test
+  llvm-project/llvm/test/DebugInfo/LogicalView/DWARFReader.test
 
 //===----------------------------------------------------------------------===//
 // Eliminate calls to 'getInputFileDirectory()' in the unit tests.
@@ -210,9 +210,6 @@ The following DWARF debug location operands are not supported:
 //===----------------------------------------------------------------------===//
 // Add support for additional binary formats.
 //===----------------------------------------------------------------------===//
-- WebAssembly (Wasm).
-  https://github.com/llvm/llvm-project/issues/57040#issuecomment-1211336680
-
 - Extended COFF (XCOFF)
 
 //===----------------------------------------------------------------------===//
diff --git a/llvm/tools/llvm-dis/llvm-dis.cpp b/llvm/tools/llvm-dis/llvm-dis.cpp
index 06fc669390bf1d..8e443318dd7d2c 100644
--- a/llvm/tools/llvm-dis/llvm-dis.cpp
+++ b/llvm/tools/llvm-dis/llvm-dis.cpp
@@ -80,6 +80,8 @@ static cl::opt<bool> PrintThinLTOIndexOnly(
     cl::desc("Only read thinlto index and print the index as LLVM assembly."),
     cl::init(false), cl::Hidden, cl::cat(DisCategory));
 
+extern cl::opt<bool> WriteNewDbgInfoFormat;
+
 namespace {
 
 static void printDebugLoc(const DebugLoc &DL, formatted_raw_ostream &OS) {
@@ -249,8 +251,14 @@ int main(int argc, char **argv) {
 
       // All that llvm-dis does is write the assembly to a file.
       if (!DontPrint) {
-        if (M)
+        if (M) {
+          bool ChangeDbgFormat = M->IsNewDbgInfoFormat != WriteNewDbgInfoFormat;
+          if (ChangeDbgFormat)
+            M->setIsNewDbgInfoFormat(WriteNewDbgInfoFormat);
           M->print(Out->os(), Annotator.get(), PreserveAssemblyUseListOrder);
+          if (ChangeDbgFormat)
+            M->setIsNewDbgInfoFormat(!WriteNewDbgInfoFormat);
+        }
         if (Index)
           Index->print(Out->os());
       }
diff --git a/llvm/tools/llvm-dwarfutil/DebugInfoLinker.cpp b/llvm/tools/llvm-dwarfutil/DebugInfoLinker.cpp
index bd17f3c4a6595e..285dcf75ecd9d3 100644
--- a/llvm/tools/llvm-dwarfutil/DebugInfoLinker.cpp
+++ b/llvm/tools/llvm-dwarfutil/DebugInfoLinker.cpp
@@ -338,9 +338,9 @@ Error linkDebugInfoImpl(object::ObjectFile &File, const Options &Options,
   Triple TargetTriple = File.makeTriple();
   std::unique_ptr<classic::DwarfStreamer> Streamer;
   if (Expected<std::unique_ptr<classic::DwarfStreamer>> StreamerOrErr =
-          classic::DwarfStreamer::createStreamer(
-              TargetTriple, Linker::OutputFileType::Object, OutStream, nullptr,
-              ReportWarn))
+          classic::DwarfStreamer::createStreamer(TargetTriple,
+                                                 Linker::OutputFileType::Object,
+                                                 OutStream, ReportWarn))
     Streamer = std::move(*StreamerOrErr);
   else
     return StreamerOrErr.takeError();
diff --git a/llvm/tools/llvm-objdump/MachODump.cpp b/llvm/tools/llvm-objdump/MachODump.cpp
index 0e6935c0ac5895..5e0d69a68d69b4 100644
--- a/llvm/tools/llvm-objdump/MachODump.cpp
+++ b/llvm/tools/llvm-objdump/MachODump.cpp
@@ -3661,6 +3661,10 @@ struct class_ro32_t {
 #define RO_ROOT (1 << 1)
 #define RO_HAS_CXX_STRUCTORS (1 << 2)
 
+/* Values for method_list{64,32}_t->entsize */
+#define ML_HAS_RELATIVE_PTRS (1 << 31)
+#define ML_ENTSIZE_MASK 0xFFFF
+
 struct method_list64_t {
   uint32_t entsize;
   uint32_t count;
@@ -3685,6 +3689,12 @@ struct method32_t {
   uint32_t imp;   /* IMP (32-bit pointer) */
 };
 
+struct method_relative_t {
+  int32_t name;  /* SEL (32-bit relative) */
+  int32_t types; /* const char * (32-bit relative) */
+  int32_t imp;   /* IMP (32-bit relative) */
+};
+
 struct protocol_list64_t {
   uint64_t count; /* uintptr_t (a 64-bit value) */
   /* struct protocol64_t * list[0];  These pointers follow inline */
@@ -3986,6 +3996,12 @@ inline void swapStruct(struct method32_t &m) {
   sys::swapByteOrder(m.imp);
 }
 
+inline void swapStruct(struct method_relative_t &m) {
+  sys::swapByteOrder(m.name);
+  sys::swapByteOrder(m.types);
+  sys::swapByteOrder(m.imp);
+}
+
 inline void swapStruct(struct protocol_list64_t &pl) {
   sys::swapByteOrder(pl.count);
 }
@@ -4440,6 +4456,88 @@ static void print_layout_map32(uint32_t p, struct DisassembleInfo *info) {
   print_layout_map(layout_map, left);
 }
 
+static void print_relative_method_list(uint32_t structSizeAndFlags,
+                                       uint32_t structCount, uint64_t p,
+                                       struct DisassembleInfo *info,
+                                       const char *indent,
+                                       uint32_t pointerBits) {
+  struct method_relative_t m;
+  const char *r, *name;
+  uint32_t offset, xoffset, left, i;
+  SectionRef S, xS;
+
+  assert(((structSizeAndFlags & ML_HAS_RELATIVE_PTRS) != 0) &&
+         "expected structSizeAndFlags to have ML_HAS_RELATIVE_PTRS flag");
+
+  outs() << indent << "\t\t   entsize "
+         << (structSizeAndFlags & ML_ENTSIZE_MASK) << " (relative) \n";
+  outs() << indent << "\t\t     count " << structCount << "\n";
+
+  for (i = 0; i < structCount; i++) {
+    r = get_pointer_64(p, offset, left, S, info);
+    memset(&m, '\0', sizeof(struct method_relative_t));
+    if (left < sizeof(struct method_relative_t)) {
+      memcpy(&m, r, left);
+      outs() << indent << "   (method_t extends past the end of the section)\n";
+    } else
+      memcpy(&m, r, sizeof(struct method_relative_t));
+    if (info->O->isLittleEndian() != sys::IsLittleEndianHost)
+      swapStruct(m);
+
+    outs() << indent << "\t\t      name " << format("0x%" PRIx32, m.name);
+    uint64_t relNameRefVA = p + offsetof(struct method_relative_t, name);
+    uint64_t absNameRefVA = relNameRefVA + m.name;
+    outs() << " (" << format("0x%" PRIx32, absNameRefVA) << ")";
+
+    // since this is a relative list, absNameRefVA is the address of the
+    // __objc_selrefs entry, so a pointer, not the actual name
+    const char *nameRefPtr =
+        get_pointer_64(absNameRefVA, xoffset, left, xS, info);
+    if (nameRefPtr) {
+      uint32_t pointerSize = pointerBits / CHAR_BIT;
+      if (left < pointerSize)
+        outs() << indent << " (nameRefPtr extends past the end of the section)";
+      else {
+        if (pointerSize == 64) {
+          uint64_t nameOff_64 = *reinterpret_cast<const uint64_t *>(nameRefPtr);
+          if (info->O->isLittleEndian() != sys::IsLittleEndianHost)
+            sys::swapByteOrder(nameOff_64);
+          name = get_pointer_64(nameOff_64, xoffset, left, xS, info);
+        } else {
+          uint32_t nameOff_32 = *reinterpret_cast<const uint32_t *>(nameRefPtr);
+          if (info->O->isLittleEndian() != sys::IsLittleEndianHost)
+            sys::swapByteOrder(nameOff_32);
+          name = get_pointer_32(nameOff_32, xoffset, left, xS, info);
+        }
+        if (name != nullptr)
+          outs() << format(" %.*s", left, name);
+      }
+    }
+    outs() << "\n";
+
+    outs() << indent << "\t\t     types " << format("0x%" PRIx32, m.types);
+    uint64_t relTypesVA = p + offsetof(struct method_relative_t, types);
+    uint64_t absTypesVA = relTypesVA + m.types;
+    outs() << " (" << format("0x%" PRIx32, absTypesVA) << ")";
+    name = get_pointer_32(absTypesVA, xoffset, left, xS, info);
+    if (name != nullptr)
+      outs() << format(" %.*s", left, name);
+    outs() << "\n";
+
+    outs() << indent << "\t\t       imp " << format("0x%" PRIx32, m.imp);
+    uint64_t relImpVA = p + offsetof(struct method_relative_t, imp);
+    uint64_t absImpVA = relImpVA + m.imp;
+    outs() << " (" << format("0x%" PRIx32, absImpVA) << ")";
+    name = GuessSymbolName(absImpVA, info->AddrMap);
+    if (name != nullptr)
+      outs() << " " << name;
+    outs() << "\n";
+
+    p += sizeof(struct method_relative_t);
+    offset += sizeof(struct method_relative_t);
+  }
+}
+
 static void print_method_list64_t(uint64_t p, struct DisassembleInfo *info,
                                   const char *indent) {
   struct method_list64_t ml;
@@ -4461,10 +4559,17 @@ static void print_method_list64_t(uint64_t p, struct DisassembleInfo *info,
     memcpy(&ml, r, sizeof(struct method_list64_t));
   if (info->O->isLittleEndian() != sys::IsLittleEndianHost)
     swapStruct(ml);
+  p += sizeof(struct method_list64_t);
+
+  if ((ml.entsize & ML_HAS_RELATIVE_PTRS) != 0) {
+    print_relative_method_list(ml.entsize, ml.count, p, info, indent,
+                               /*pointerBits=*/64);
+    return;
+  }
+
   outs() << indent << "\t\t   entsize " << ml.entsize << "\n";
   outs() << indent << "\t\t     count " << ml.count << "\n";
 
-  p += sizeof(struct method_list64_t);
   offset += sizeof(struct method_list64_t);
   for (i = 0; i < ml.count; i++) {
     r = get_pointer_64(p, offset, left, S, info);
@@ -4552,10 +4657,17 @@ static void print_method_list32_t(uint64_t p, struct DisassembleInfo *info,
     memcpy(&ml, r, sizeof(struct method_list32_t));
   if (info->O->isLittleEndian() != sys::IsLittleEndianHost)
     swapStruct(ml);
+  p += sizeof(struct method_list32_t);
+
+  if ((ml.entsize & ML_HAS_RELATIVE_PTRS) != 0) {
+    print_relative_method_list(ml.entsize, ml.count, p, info, indent,
+                               /*pointerBits=*/32);
+    return;
+  }
+
   outs() << indent << "\t\t   entsize " << ml.entsize << "\n";
   outs() << indent << "\t\t     count " << ml.count << "\n";
 
-  p += sizeof(struct method_list32_t);
   offset += sizeof(struct method_list32_t);
   for (i = 0; i < ml.count; i++) {
     r = get_pointer_32(p, offset, left, S, info);
diff --git a/llvm/tools/llvm-profgen/PerfReader.cpp b/llvm/tools/llvm-profgen/PerfReader.cpp
index c6fcf7e1196ec8..878147642aa6e7 100644
--- a/llvm/tools/llvm-profgen/PerfReader.cpp
+++ b/llvm/tools/llvm-profgen/PerfReader.cpp
@@ -408,9 +408,22 @@ PerfScriptReader::convertPerfDataToTrace(ProfiledBinary *Binary,
           PerfContent::UnknownContent};
 }
 
+static StringRef filename(StringRef Path, bool UseBackSlash) {
+  llvm::sys::path::Style PathStyle =
+      UseBackSlash ? llvm::sys::path::Style::windows_backslash
+                   : llvm::sys::path::Style::native;
+  StringRef FileName = llvm::sys::path::filename(Path, PathStyle);
+
+  // In case this file use \r\n as newline.
+  if (UseBackSlash && FileName.back() == '\r')
+    return FileName.drop_back();
+
+  return FileName;
+}
+
 void PerfScriptReader::updateBinaryAddress(const MMapEvent &Event) {
   // Drop the event which doesn't belong to user-provided binary
-  StringRef BinaryName = llvm::sys::path::filename(Event.BinaryPath);
+  StringRef BinaryName = filename(Event.BinaryPath, Binary->isCOFF());
   if (Binary->getName() != BinaryName)
     return;
 
@@ -975,7 +988,7 @@ bool PerfScriptReader::extractMMap2EventForBinary(ProfiledBinary *Binary,
            << format("0x%" PRIx64 ":", MMap.Address) << " \n";
   }
 
-  StringRef BinaryName = llvm::sys::path::filename(MMap.BinaryPath);
+  StringRef BinaryName = filename(MMap.BinaryPath, Binary->isCOFF());
   return Binary->getName() == BinaryName;
 }
 
diff --git a/llvm/tools/llvm-profgen/ProfiledBinary.cpp b/llvm/tools/llvm-profgen/ProfiledBinary.cpp
index f62228627b8f1e..1baf35820f97fa 100644
--- a/llvm/tools/llvm-profgen/ProfiledBinary.cpp
+++ b/llvm/tools/llvm-profgen/ProfiledBinary.cpp
@@ -14,6 +14,7 @@
 #include "llvm/Demangle/Demangle.h"
 #include "llvm/IR/DebugInfoMetadata.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Object/COFF.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/Format.h"
@@ -211,10 +212,11 @@ void ProfiledBinary::load() {
   OwningBinary<Binary> OBinary = unwrapOrError(createBinary(Path), Path);
   Binary &ExeBinary = *OBinary.getBinary();
 
-  auto *Obj = dyn_cast<ELFObjectFileBase>(&ExeBinary);
-  if (!Obj)
-    exitWithError("not a valid Elf image", Path);
+  IsCOFF = isa<COFFObjectFile>(&ExeBinary);
+  if (!isa<ELFObjectFileBase>(&ExeBinary) && !IsCOFF)
+    exitWithError("not a valid ELF/COFF image", Path);
 
+  auto *Obj = cast<ObjectFile>(&ExeBinary);
   TheTriple = Obj->makeTriple();
 
   LLVM_DEBUG(dbgs() << "Loading " << Path << "\n");
@@ -236,13 +238,14 @@ void ProfiledBinary::load() {
   DisassembleFunctionSet.insert(DisassembleFunctions.begin(),
                                 DisassembleFunctions.end());
 
-  checkPseudoProbe(Obj);
+  if (auto *ELFObj = dyn_cast<ELFObjectFileBase>(Obj)) {
+    checkPseudoProbe(ELFObj);
+    if (UsePseudoProbes)
+      populateElfSymbolAddressList(ELFObj);
 
-  if (UsePseudoProbes)
-    populateElfSymbolAddressList(Obj);
-
-  if (ShowDisassemblyOnly)
-    decodePseudoProbe(Obj);
+    if (ShowDisassemblyOnly)
+      decodePseudoProbe(ELFObj);
+  }
 
   // Disassemble the text sections.
   disassemble(Obj);
@@ -335,18 +338,35 @@ void ProfiledBinary::setPreferredTextSegmentAddresses(const ELFFile<ELFT> &Obj,
     exitWithError("no executable segment found", FileName);
 }
 
-void ProfiledBinary::setPreferredTextSegmentAddresses(
-    const ELFObjectFileBase *Obj) {
+void ProfiledBinary::setPreferredTextSegmentAddresses(const COFFObjectFile *Obj,
+                                                      StringRef FileName) {
+  uint64_t ImageBase = Obj->getImageBase();
+  if (!ImageBase)
+    exitWithError("Not a COFF image", FileName);
+
+  PreferredTextSegmentAddresses.push_back(ImageBase);
+  FirstLoadableAddress = ImageBase;
+
+  for (SectionRef Section : Obj->sections()) {
+    const coff_section *Sec = Obj->getCOFFSection(Section);
+    if (Sec->Characteristics & COFF::IMAGE_SCN_CNT_CODE)
+      TextSegmentOffsets.push_back(Sec->VirtualAddress);
+  }
+}
+
+void ProfiledBinary::setPreferredTextSegmentAddresses(const ObjectFile *Obj) {
   if (const auto *ELFObj = dyn_cast<ELF32LEObjectFile>(Obj))
     setPreferredTextSegmentAddresses(ELFObj->getELFFile(), Obj->getFileName());
   else if (const auto *ELFObj = dyn_cast<ELF32BEObjectFile>(Obj))
     setPreferredTextSegmentAddresses(ELFObj->getELFFile(), Obj->getFileName());
   else if (const auto *ELFObj = dyn_cast<ELF64LEObjectFile>(Obj))
     setPreferredTextSegmentAddresses(ELFObj->getELFFile(), Obj->getFileName());
-  else if (const auto *ELFObj = cast<ELF64BEObjectFile>(Obj))
+  else if (const auto *ELFObj = dyn_cast<ELF64BEObjectFile>(Obj))
     setPreferredTextSegmentAddresses(ELFObj->getELFFile(), Obj->getFileName());
+  else if (const auto *COFFObj = dyn_cast<COFFObjectFile>(Obj))
+    setPreferredTextSegmentAddresses(COFFObj, Obj->getFileName());
   else
-    llvm_unreachable("invalid ELF object format");
+    llvm_unreachable("invalid object format");
 }
 
 void ProfiledBinary::checkPseudoProbe(const ELFObjectFileBase *Obj) {
@@ -442,7 +462,7 @@ void ProfiledBinary::decodePseudoProbe(const ELFObjectFileBase *Obj) {
 void ProfiledBinary::decodePseudoProbe() {
   OwningBinary<Binary> OBinary = unwrapOrError(createBinary(Path), Path);
   Binary &ExeBinary = *OBinary.getBinary();
-  auto *Obj = dyn_cast<ELFObjectFileBase>(&ExeBinary);
+  auto *Obj = cast<ELFObjectFileBase>(&ExeBinary);
   decodePseudoProbe(Obj);
 }
 
@@ -593,7 +613,7 @@ bool ProfiledBinary::dissassembleSymbol(std::size_t SI, ArrayRef<uint8_t> Bytes,
   return true;
 }
 
-void ProfiledBinary::setUpDisassembler(const ELFObjectFileBase *Obj) {
+void ProfiledBinary::setUpDisassembler(const ObjectFile *Obj) {
   const Target *TheTarget = getTarget(Obj);
   std::string TripleName = TheTriple.getTriple();
   StringRef FileName = Obj->getFileName();
@@ -635,7 +655,7 @@ void ProfiledBinary::setUpDisassembler(const ELFObjectFileBase *Obj) {
   IPrinter->setPrintBranchImmAsAddress(true);
 }
 
-void ProfiledBinary::disassemble(const ELFObjectFileBase *Obj) {
+void ProfiledBinary::disassemble(const ObjectFile *Obj) {
   // Set up disassembler and related components.
   setUpDisassembler(Obj);
 
@@ -687,7 +707,7 @@ void ProfiledBinary::disassemble(const ELFObjectFileBase *Obj) {
              << "]:\n\n";
     }
 
-    if (SectionName == ".plt")
+    if (isa<ELFObjectFileBase>(Obj) && SectionName == ".plt")
       continue;
 
     // Get the section data.
@@ -722,8 +742,7 @@ void ProfiledBinary::disassemble(const ELFObjectFileBase *Obj) {
 }
 
 void ProfiledBinary::checkUseFSDiscriminator(
-    const ELFObjectFileBase *Obj,
-    std::map<SectionRef, SectionSymbolsTy> &AllSymbols) {
+    const ObjectFile *Obj, std::map<SectionRef, SectionSymbolsTy> &AllSymbols) {
   const char *FSDiscriminatorVar = "__llvm_fs_discriminator__";
   for (section_iterator SI = Obj->section_begin(), SE = Obj->section_end();
        SI != SE; ++SI) {
diff --git a/llvm/tools/llvm-profgen/ProfiledBinary.h b/llvm/tools/llvm-profgen/ProfiledBinary.h
index 0fd12f5acd6b48..5d2088ad7691c4 100644
--- a/llvm/tools/llvm-profgen/ProfiledBinary.h
+++ b/llvm/tools/llvm-profgen/ProfiledBinary.h
@@ -297,22 +297,26 @@ class ProfiledBinary {
   // Use to avoid redundant warning.
   bool MissingMMapWarned = false;
 
-  void setPreferredTextSegmentAddresses(const ELFObjectFileBase *O);
+  bool IsCOFF = false;
+
+  void setPreferredTextSegmentAddresses(const ObjectFile *O);
 
   template <class ELFT>
   void setPreferredTextSegmentAddresses(const ELFFile<ELFT> &Obj,
                                         StringRef FileName);
+  void setPreferredTextSegmentAddresses(const COFFObjectFile *Obj,
+                                        StringRef FileName);
 
   void checkPseudoProbe(const ELFObjectFileBase *Obj);
 
   void decodePseudoProbe(const ELFObjectFileBase *Obj);
 
   void
-  checkUseFSDiscriminator(const ELFObjectFileBase *Obj,
+  checkUseFSDiscriminator(const ObjectFile *Obj,
                           std::map<SectionRef, SectionSymbolsTy> &AllSymbols);
 
   // Set up disassembler and related components.
-  void setUpDisassembler(const ELFObjectFileBase *Obj);
+  void setUpDisassembler(const ObjectFile *Obj);
   symbolize::LLVMSymbolizer::Options getSymbolizerOpts() const;
 
   // Load debug info of subprograms from DWARF section.
@@ -333,7 +337,7 @@ class ProfiledBinary {
   void warnNoFuncEntry();
 
   /// Dissassemble the text section and build various address maps.
-  void disassemble(const ELFObjectFileBase *O);
+  void disassemble(const ObjectFile *O);
 
   /// Helper function to dissassemble the symbol and extract info for unwinding
   bool dissassembleSymbol(std::size_t SI, ArrayRef<uint8_t> Bytes,
@@ -362,6 +366,8 @@ class ProfiledBinary {
   uint64_t getBaseAddress() const { return BaseAddress; }
   void setBaseAddress(uint64_t Address) { BaseAddress = Address; }
 
+  bool isCOFF() const { return IsCOFF; }
+
   // Canonicalize to use preferred load address as base address.
   uint64_t canonicalizeVirtualAddress(uint64_t Address) {
     return Address - BaseAddress + getPreferredBaseAddress();
diff --git a/llvm/tools/llvm-readobj/ELFDumper.cpp b/llvm/tools/llvm-readobj/ELFDumper.cpp
index e78732353cc877..d1c05f437042de 100644
--- a/llvm/tools/llvm-readobj/ELFDumper.cpp
+++ b/llvm/tools/llvm-readobj/ELFDumper.cpp
@@ -49,6 +49,7 @@
 #include "llvm/Support/Format.h"
 #include "llvm/Support/FormatVariadic.h"
 #include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/HexagonAttributeParser.h"
 #include "llvm/Support/LEB128.h"
 #include "llvm/Support/MSP430AttributeParser.h"
 #include "llvm/Support/MSP430Attributes.h"
@@ -2824,6 +2825,11 @@ template <typename ELFT> void ELFDumper<ELFT>::printLoadName() {
 
 template <class ELFT> void ELFDumper<ELFT>::printArchSpecificInfo() {
   switch (Obj.getHeader().e_machine) {
+  case EM_HEXAGON:
+    printAttributes(ELF::SHT_HEXAGON_ATTRIBUTES,
+                    std::make_unique<HexagonAttributeParser>(&W),
+                    llvm::endianness::little);
+    break;
   case EM_ARM:
     if (Obj.isLE())
       printAttributes(ELF::SHT_ARM_ATTRIBUTES,
diff --git a/llvm/tools/llvm-readobj/ObjDumper.cpp b/llvm/tools/llvm-readobj/ObjDumper.cpp
index 0d3fea71aafd42..0980d2ad3a852b 100644
--- a/llvm/tools/llvm-readobj/ObjDumper.cpp
+++ b/llvm/tools/llvm-readobj/ObjDumper.cpp
@@ -160,15 +160,10 @@ void ObjDumper::printSectionsAsString(const object::ObjectFile &Obj,
                                       ArrayRef<std::string> Sections,
                                       bool Decompress) {
   SmallString<0> Out;
-  bool First = true;
   for (object::SectionRef Section :
        getSectionRefsByNameOrIndex(Obj, Sections)) {
     StringRef SectionName = unwrapOrError(Obj.getFileName(), Section.getName());
-
-    if (!First)
-      W.startLine() << '\n';
-    First = false;
-    W.startLine() << "String dump of section '" << SectionName << "':\n";
+    W.startLine() << "\nString dump of section '" << SectionName << "':\n";
 
     StringRef SectionContent =
         unwrapOrError(Obj.getFileName(), Section.getContents());
@@ -182,15 +177,10 @@ void ObjDumper::printSectionsAsHex(const object::ObjectFile &Obj,
                                    ArrayRef<std::string> Sections,
                                    bool Decompress) {
   SmallString<0> Out;
-  bool First = true;
   for (object::SectionRef Section :
        getSectionRefsByNameOrIndex(Obj, Sections)) {
     StringRef SectionName = unwrapOrError(Obj.getFileName(), Section.getName());
-
-    if (!First)
-      W.startLine() << '\n';
-    First = false;
-    W.startLine() << "Hex dump of section '" << SectionName << "':\n";
+    W.startLine() << "\nHex dump of section '" << SectionName << "':\n";
 
     StringRef SectionContent =
         unwrapOrError(Obj.getFileName(), Section.getContents());
diff --git a/llvm/tools/llvm-reduce/deltas/ReduceDbgRecords.cpp b/llvm/tools/llvm-reduce/deltas/ReduceDbgRecords.cpp
index 2f3d4cac9fa015..25de659109c9f0 100644
--- a/llvm/tools/llvm-reduce/deltas/ReduceDbgRecords.cpp
+++ b/llvm/tools/llvm-reduce/deltas/ReduceDbgRecords.cpp
@@ -7,13 +7,13 @@
 //===----------------------------------------------------------------------===//
 //
 // This file implements a function which calls the Generic Delta pass in order
-// to reduce uninteresting DPValues from defined functions.
+// to reduce uninteresting DbgVariableRecords from defined functions.
 //
-// DPValues store variable-location debug-info and are attached to instructions.
-// This information used to be represented by intrinsics such as dbg.value, and
-// would naturally get reduced by llvm-reduce like any other instruction. As
-// DPValues get stored elsewhere, they need to be enumerated and eliminated like
-// any other data structure in LLVM.
+// DbgVariableRecords store variable-location debug-info and are attached to
+// instructions. This information used to be represented by intrinsics such as
+// dbg.value, and would naturally get reduced by llvm-reduce like any other
+// instruction. As DbgVariableRecords get stored elsewhere, they need to be
+// enumerated and eliminated like any other data structure in LLVM.
 //
 //===----------------------------------------------------------------------===//
 
diff --git a/llvm/tools/llvm-reduce/deltas/ReduceDbgRecords.h b/llvm/tools/llvm-reduce/deltas/ReduceDbgRecords.h
index 6a8f62155ec3e2..07a1e04fceaeec 100644
--- a/llvm/tools/llvm-reduce/deltas/ReduceDbgRecords.h
+++ b/llvm/tools/llvm-reduce/deltas/ReduceDbgRecords.h
@@ -7,7 +7,7 @@
 //===----------------------------------------------------------------------===//
 //
 // This file implements a function which calls the Generic Delta pass in order
-// to reduce uninteresting DPValues from defined functions.
+// to reduce uninteresting DbgVariableRecords from defined functions.
 //
 //===----------------------------------------------------------------------===//
 
diff --git a/llvm/tools/llvm-shlib/CMakeLists.txt b/llvm/tools/llvm-shlib/CMakeLists.txt
index eba1672faee7fa..b20ac318e768db 100644
--- a/llvm/tools/llvm-shlib/CMakeLists.txt
+++ b/llvm/tools/llvm-shlib/CMakeLists.txt
@@ -33,10 +33,13 @@ if(LLVM_BUILD_LLVM_DYLIB)
   if (LLVM_LINK_LLVM_DYLIB)
     set(INSTALL_WITH_TOOLCHAIN INSTALL_WITH_TOOLCHAIN)
   endif()
-  add_llvm_library(LLVM SHARED DISABLE_LLVM_LINK_LLVM_DYLIB OUTPUT_NAME LLVM ${INSTALL_WITH_TOOLCHAIN} ${SOURCES})
-  # Add symlink for backwards compatibility with old library name
-  get_target_property(LLVM_DYLIB_SOVERSION LLVM SOVERSION)
-  llvm_install_library_symlink(LLVM-${LLVM_VERSION_MAJOR}${LLVM_VERSION_SUFFIX} LLVM SHARED COMPONENT LLVM SOVERSION ${LLVM_DYLIB_SOVERSION})
+  if (WIN32)
+    add_llvm_library(LLVM SHARED DISABLE_LLVM_LINK_LLVM_DYLIB SONAME ${INSTALL_WITH_TOOLCHAIN} ${SOURCES})
+  else()
+    add_llvm_library(LLVM SHARED DISABLE_LLVM_LINK_LLVM_DYLIB OUTPUT_NAME LLVM ${INSTALL_WITH_TOOLCHAIN} ${SOURCES})
+    # Add symlink for backwards compatibility with old library name
+    llvm_install_library_symlink(LLVM-${LLVM_VERSION_MAJOR}${LLVM_VERSION_SUFFIX} $<TARGET_FILE_NAME:LLVM> SHARED FULL_DEST COMPONENT LLVM)
+  endif()
 
   list(REMOVE_DUPLICATES LIB_NAMES)
   if("${CMAKE_SYSTEM_NAME}" STREQUAL "Darwin")
diff --git a/llvm/tools/verify-uselistorder/verify-uselistorder.cpp b/llvm/tools/verify-uselistorder/verify-uselistorder.cpp
index 9afe6817fefb9f..d929ae09958a1f 100644
--- a/llvm/tools/verify-uselistorder/verify-uselistorder.cpp
+++ b/llvm/tools/verify-uselistorder/verify-uselistorder.cpp
@@ -166,6 +166,11 @@ std::unique_ptr<Module> TempFile::readBitcode(LLVMContext &Context) const {
                           "verify-uselistorder: error: ");
     return nullptr;
   }
+
+  // verify-uselistoder currently only supports old-style debug info mode.
+  // FIXME: Update mapping code for RemoveDIs.
+  assert(!ModuleOr.get()->IsNewDbgInfoFormat &&
+         "Unexpectedly in new debug info mode");
   return std::move(ModuleOr.get());
 }
 
@@ -175,6 +180,9 @@ std::unique_ptr<Module> TempFile::readAssembly(LLVMContext &Context) const {
   std::unique_ptr<Module> M = parseAssemblyFile(Filename, Err, Context);
   if (!M.get())
     Err.print("verify-uselistorder", errs());
+  // verify-uselistoder currently only supports old-style debug info mode.
+  // FIXME: Update mapping code for RemoveDIs.
+  assert(!M->IsNewDbgInfoFormat && "Unexpectedly in new debug info mode");
   return M;
 }
 
@@ -541,6 +549,9 @@ int main(int argc, char **argv) {
 
   // Load the input module...
   std::unique_ptr<Module> M = parseIRFile(InputFilename, Err, Context);
+  // verify-uselistoder currently only supports old-style debug info mode.
+  // FIXME: Update mapping code for RemoveDIs.
+  assert(!M->IsNewDbgInfoFormat && "Unexpectedly in new debug info mode");
 
   if (!M.get()) {
     Err.print(argv[0], errs());
diff --git a/llvm/unittests/ADT/APIntTest.cpp b/llvm/unittests/ADT/APIntTest.cpp
index 400313a6234677..83cbb02e0f58b3 100644
--- a/llvm/unittests/ADT/APIntTest.cpp
+++ b/llvm/unittests/ADT/APIntTest.cpp
@@ -15,6 +15,7 @@
 #include "gtest/gtest.h"
 #include <array>
 #include <climits>
+#include <limits>
 #include <optional>
 
 using namespace llvm;
@@ -2918,7 +2919,7 @@ TEST(APIntTest, Average) {
   APInt A100(32, 100);
   APInt A101(32, 101);
   APInt A200(32, 200, false);
-  APInt ApUMax(32, UINT_MAX, false);
+  APInt ApUMax = APInt::getMaxValue(32);
 
   EXPECT_EQ(APInt(32, 150), APIntOps::avgFloorU(A100, A200));
   EXPECT_EQ(APIntOps::RoundingUDiv(A100 + A200, A2, APInt::Rounding::DOWN),
@@ -2945,8 +2946,8 @@ TEST(APIntTest, Average) {
   APInt Am100(32, -100);
   APInt Am101(32, -101);
   APInt Am200(32, -200);
-  APInt AmSMin(32, INT_MIN);
-  APInt ApSMax(32, INT_MAX);
+  APInt AmSMin = APInt::getSignedMinValue(32);
+  APInt ApSMax = APInt::getSignedMaxValue(32);
 
   EXPECT_EQ(APInt(32, +150), APIntOps::avgFloorS(Ap100, Ap200));
   EXPECT_EQ(APIntOps::RoundingSDiv(Ap100 + Ap200, A2, APInt::Rounding::DOWN),
@@ -3048,6 +3049,69 @@ TEST(APIntTest, smul_ov) {
       }
 }
 
+TEST(APIntTest, sfloordiv_ov) {
+  // int16 test overflow
+  {
+    using IntTy = int16_t;
+    APInt divisor(8 * sizeof(IntTy), std::numeric_limits<IntTy>::lowest(),
+                  true);
+    APInt dividend(8 * sizeof(IntTy), IntTy(-1), true);
+    bool Overflow = false;
+    (void)divisor.sfloordiv_ov(dividend, Overflow);
+    EXPECT_TRUE(Overflow);
+  }
+  // int32 test overflow
+  {
+    using IntTy = int32_t;
+    APInt divisor(8 * sizeof(IntTy), std::numeric_limits<IntTy>::lowest(),
+                  true);
+    APInt dividend(8 * sizeof(IntTy), IntTy(-1), true);
+    bool Overflow = false;
+    (void)divisor.sfloordiv_ov(dividend, Overflow);
+    EXPECT_TRUE(Overflow);
+  }
+  // int64 test overflow
+  {
+    using IntTy = int64_t;
+    APInt divisor(8 * sizeof(IntTy), std::numeric_limits<IntTy>::lowest(),
+                  true);
+    APInt dividend(8 * sizeof(IntTy), IntTy(-1), true);
+    bool Overflow = false;
+    (void)divisor.sfloordiv_ov(dividend, Overflow);
+    EXPECT_TRUE(Overflow);
+  }
+  // test all of int8
+  {
+    bool Overflow = false;
+    for (int i = -128; i < 128; ++i) {
+      for (int j = -128; j < 128; ++j) {
+        if (j == 0)
+          continue;
+
+        int8_t a = static_cast<int8_t>(i);
+        int8_t b = static_cast<int8_t>(j);
+
+        APInt divisor(8, a, true);
+        APInt dividend(8, b, true);
+        APInt quotient = divisor.sfloordiv_ov(dividend, Overflow);
+
+        if (i == -128 && j == -1) {
+          EXPECT_TRUE(Overflow);
+          continue;
+        }
+
+        if (((i >= 0 && j > 0) || (i <= 0 && j < 0)) ||
+            (i % j == 0)) // if quotient >= 0 and remain == 0 floordiv
+                          // equivalent to div
+          EXPECT_EQ(quotient.getSExtValue(), a / b);
+        else
+          EXPECT_EQ(quotient.getSExtValue(), a / b - 1);
+        EXPECT_FALSE(Overflow);
+      }
+    }
+  }
+}
+
 TEST(APIntTest, SolveQuadraticEquationWrap) {
   // Verify that "Solution" is the first non-negative integer that solves
   // Ax^2 + Bx + C = "0 or overflow", i.e. that it is a correct solution
diff --git a/llvm/unittests/CodeGen/LowLevelTypeTest.cpp b/llvm/unittests/CodeGen/LowLevelTypeTest.cpp
index cb34802a5de271..b60d82b529fa3f 100644
--- a/llvm/unittests/CodeGen/LowLevelTypeTest.cpp
+++ b/llvm/unittests/CodeGen/LowLevelTypeTest.cpp
@@ -18,6 +18,24 @@ using namespace llvm;
 
 namespace {
 
+TEST(LowLevelTypeTest, Token) {
+  LLVMContext C;
+  DataLayout DL("");
+
+  const LLT TTy = LLT::token();
+
+  // Test kind.
+  EXPECT_TRUE(TTy.isValid());
+  EXPECT_TRUE(TTy.isScalar());
+  EXPECT_TRUE(TTy.isToken());
+
+  EXPECT_FALSE(TTy.isPointer());
+  EXPECT_FALSE(TTy.isVector());
+
+  const LLT STy = LLT::scalar(0);
+  EXPECT_EQ(STy, TTy);
+}
+
 TEST(LowLevelTypeTest, Scalar) {
   LLVMContext C;
   DataLayout DL("");
@@ -32,6 +50,8 @@ TEST(LowLevelTypeTest, Scalar) {
     ASSERT_FALSE(Ty.isPointer());
     ASSERT_FALSE(Ty.isVector());
 
+    EXPECT_TRUE(S != 0 || Ty.isToken());
+
     // Test sizes.
     EXPECT_EQ(S, Ty.getSizeInBits());
     EXPECT_EQ(S, Ty.getScalarSizeInBits());
@@ -77,6 +97,7 @@ TEST(LowLevelTypeTest, Vector) {
 
       ASSERT_FALSE(VTy.isScalar());
       ASSERT_FALSE(VTy.isPointer());
+      ASSERT_FALSE(VTy.isToken());
 
       // Test sizes.
       EXPECT_EQ(S, VTy.getScalarSizeInBits());
@@ -300,6 +321,7 @@ TEST(LowLevelTypeTest, Invalid) {
   ASSERT_FALSE(Ty.isScalar());
   ASSERT_FALSE(Ty.isPointer());
   ASSERT_FALSE(Ty.isVector());
+  ASSERT_FALSE(Ty.isToken());
 }
 
 TEST(LowLevelTypeTest, Divide) {
diff --git a/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp b/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp
index 180d4306a470f7..1967a62bbf9d11 100644
--- a/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp
+++ b/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp
@@ -187,10 +187,20 @@ TEST_F(SelectionDAGPatternMatchTest, matchUnaryOp) {
   SDValue SExt = DAG->getNode(ISD::SIGN_EXTEND, DL, Int64VT, Op0);
   SDValue Trunc = DAG->getNode(ISD::TRUNCATE, DL, Int32VT, Op1);
 
+  SDValue Sub = DAG->getNode(ISD::SUB, DL, Int32VT, Trunc, Op0);
+  SDValue Neg = DAG->getNegative(Op0, DL, Int32VT);
+  SDValue Not = DAG->getNOT(DL, Op0, Int32VT);
+
   using namespace SDPatternMatch;
   EXPECT_TRUE(sd_match(ZExt, m_UnaryOp(ISD::ZERO_EXTEND, m_Value())));
   EXPECT_TRUE(sd_match(SExt, m_SExt(m_Value())));
   EXPECT_TRUE(sd_match(Trunc, m_Trunc(m_Specific(Op1))));
+
+  EXPECT_TRUE(sd_match(Neg, m_Neg(m_Value())));
+  EXPECT_TRUE(sd_match(Not, m_Not(m_Value())));
+  EXPECT_FALSE(sd_match(ZExt, m_Neg(m_Value())));
+  EXPECT_FALSE(sd_match(Sub, m_Neg(m_Value())));
+  EXPECT_FALSE(sd_match(Neg, m_Not(m_Value())));
 }
 
 TEST_F(SelectionDAGPatternMatchTest, matchConstants) {
diff --git a/llvm/unittests/DebugInfo/LogicalView/CMakeLists.txt b/llvm/unittests/DebugInfo/LogicalView/CMakeLists.txt
index 1b93d77a20bd69..1116edb212b057 100644
--- a/llvm/unittests/DebugInfo/LogicalView/CMakeLists.txt
+++ b/llvm/unittests/DebugInfo/LogicalView/CMakeLists.txt
@@ -12,7 +12,7 @@ add_llvm_unittest_with_input_files(DebugInfoLogicalViewTests
   CodeViewReaderTest.cpp
   CommandLineOptionsTest.cpp
   CompareElementsTest.cpp
-  ELFReaderTest.cpp
+  DWARFReaderTest.cpp
   SelectElementsTest.cpp
   LocationRangesTest.cpp
   LogicalElementsTest.cpp
diff --git a/llvm/unittests/DebugInfo/LogicalView/ELFReaderTest.cpp b/llvm/unittests/DebugInfo/LogicalView/DWARFReaderTest.cpp
similarity index 99%
rename from llvm/unittests/DebugInfo/LogicalView/ELFReaderTest.cpp
rename to llvm/unittests/DebugInfo/LogicalView/DWARFReaderTest.cpp
index bd278befbc642f..a1bd5dee356566 100644
--- a/llvm/unittests/DebugInfo/LogicalView/ELFReaderTest.cpp
+++ b/llvm/unittests/DebugInfo/LogicalView/DWARFReaderTest.cpp
@@ -1,4 +1,4 @@
-//===- llvm/unittest/DebugInfo/LogicalView/ELFReaderTest.cpp --------------===//
+//===- llvm/unittest/DebugInfo/LogicalView/DWARFReaderTest.cpp ------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -323,7 +323,7 @@ void compareElements(SmallString<128> &InputsDir) {
   checkElementComparison(Reference.get(), Target.get());
 }
 
-TEST(LogicalViewTest, ELFReader) {
+TEST(LogicalViewTest, DWARFReader) {
   // Initialize targets and assembly printers/parsers.
   llvm::InitializeAllTargetInfos();
   llvm::InitializeAllTargetMCs();
diff --git a/llvm/unittests/ExecutionEngine/Orc/CMakeLists.txt b/llvm/unittests/ExecutionEngine/Orc/CMakeLists.txt
index f102ba59e37542..8a6a26bba63c26 100644
--- a/llvm/unittests/ExecutionEngine/Orc/CMakeLists.txt
+++ b/llvm/unittests/ExecutionEngine/Orc/CMakeLists.txt
@@ -26,6 +26,7 @@ add_llvm_unittest(OrcJITTests
   JITTargetMachineBuilderTest.cpp
   LazyCallThroughAndReexportsTest.cpp
   LookupAndRecordAddrsTest.cpp
+  MachOPlatformTest.cpp
   MapperJITLinkMemoryManagerTest.cpp
   MemoryMapperTest.cpp
   ObjectFormatsTest.cpp
diff --git a/llvm/unittests/ExecutionEngine/Orc/MachOPlatformTest.cpp b/llvm/unittests/ExecutionEngine/Orc/MachOPlatformTest.cpp
new file mode 100644
index 00000000000000..bf6c1042252d76
--- /dev/null
+++ b/llvm/unittests/ExecutionEngine/Orc/MachOPlatformTest.cpp
@@ -0,0 +1,56 @@
+//===---------- MachOPlatformTest.cpp - MachPlatform API Tests ------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ExecutionEngine/Orc/MachOPlatform.h"
+#include "llvm/BinaryFormat/MachO.h"
+#include "gtest/gtest.h"
+
+using namespace llvm;
+using namespace llvm::orc;
+
+TEST(MachOPlatformTests, BuildVersionOptsFromTriple) {
+
+  auto darwinOS = MachOPlatform::HeaderOptions::BuildVersionOpts::fromTriple(
+      Triple("arm64-apple-darwin"), 0, 0);
+  EXPECT_FALSE(darwinOS);
+
+  auto macOS = MachOPlatform::HeaderOptions::BuildVersionOpts::fromTriple(
+      Triple("arm64-apple-macosx"), 0, 0);
+  EXPECT_TRUE(macOS);
+  EXPECT_EQ(macOS->Platform, MachO::PLATFORM_MACOS);
+
+  auto iOS = MachOPlatform::HeaderOptions::BuildVersionOpts::fromTriple(
+      Triple("arm64-apple-ios"), 0, 0);
+  EXPECT_TRUE(iOS);
+  EXPECT_EQ(iOS->Platform, MachO::PLATFORM_IOS);
+
+  auto iOSSim = MachOPlatform::HeaderOptions::BuildVersionOpts::fromTriple(
+      Triple("arm64-apple-ios-simulator"), 0, 0);
+  EXPECT_TRUE(iOSSim);
+  EXPECT_EQ(iOSSim->Platform, MachO::PLATFORM_IOSSIMULATOR);
+
+  auto tvOS = MachOPlatform::HeaderOptions::BuildVersionOpts::fromTriple(
+      Triple("arm64-apple-tvos"), 0, 0);
+  EXPECT_TRUE(tvOS);
+  EXPECT_EQ(tvOS->Platform, MachO::PLATFORM_TVOS);
+
+  auto tvOSSim = MachOPlatform::HeaderOptions::BuildVersionOpts::fromTriple(
+      Triple("arm64-apple-tvos-simulator"), 0, 0);
+  EXPECT_TRUE(tvOSSim);
+  EXPECT_EQ(tvOSSim->Platform, MachO::PLATFORM_TVOSSIMULATOR);
+
+  auto watchOS = MachOPlatform::HeaderOptions::BuildVersionOpts::fromTriple(
+      Triple("arm64-apple-watchos"), 0, 0);
+  EXPECT_TRUE(watchOS);
+  EXPECT_EQ(watchOS->Platform, MachO::PLATFORM_WATCHOS);
+
+  auto watchOSSim = MachOPlatform::HeaderOptions::BuildVersionOpts::fromTriple(
+      Triple("arm64-apple-watchos-simulator"), 0, 0);
+  EXPECT_TRUE(watchOSSim);
+  EXPECT_EQ(watchOSSim->Platform, MachO::PLATFORM_WATCHOSSIMULATOR);
+}
diff --git a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
index fdbe8df783b117..5c415cadcd686c 100644
--- a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
+++ b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
@@ -5856,6 +5856,23 @@ TEST_F(OpenMPIRBuilderTest, TargetDataRegion) {
   EXPECT_TRUE(TargetDataCall->getOperand(2)->getType()->isIntegerTy(32));
   EXPECT_TRUE(TargetDataCall->getOperand(8)->getType()->isPointerTy());
 
+  // Check that BodyGenCB is still made when IsTargetDevice is set to true.
+  OMPBuilder.Config.setIsTargetDevice(true);
+  bool CheckDevicePassBodyGen = false;
+  auto BodyTargetCB = [&](InsertPointTy CodeGenIP, BodyGenTy BodyGenType) {
+    CheckDevicePassBodyGen = true;
+    Builder.restoreIP(CodeGenIP);
+    CallInst *TargetDataCall =
+        dyn_cast<CallInst>(BB->back().getPrevNode()->getPrevNode());
+    // Make sure no begin_mapper call is present for device pass.
+    EXPECT_EQ(TargetDataCall, nullptr);
+    return Builder.saveIP();
+  };
+  Builder.restoreIP(OMPBuilder.createTargetData(
+      Loc, AllocaIP, Builder.saveIP(), Builder.getInt64(DeviceID),
+      /* IfCond= */ nullptr, Info, GenMapInfoCB, nullptr, BodyTargetCB));
+  EXPECT_TRUE(CheckDevicePassBodyGen);
+
   Builder.CreateRetVoid();
   EXPECT_FALSE(verifyModule(*M, &errs()));
 }
diff --git a/llvm/unittests/IR/BasicBlockDbgInfoTest.cpp b/llvm/unittests/IR/BasicBlockDbgInfoTest.cpp
index bfc64cb84143d3..92658b7b6895f5 100644
--- a/llvm/unittests/IR/BasicBlockDbgInfoTest.cpp
+++ b/llvm/unittests/IR/BasicBlockDbgInfoTest.cpp
@@ -41,7 +41,7 @@ namespace {
 // position that it already resides at. This is fine -- but gets complicated
 // with dbg.value intrinsics. By moving an instruction, we can end up changing
 // nothing but the location of debug-info intrinsics. That has to be modelled
-// by DPValues, the dbg.value replacement.
+// by DbgVariableRecords, the dbg.value replacement.
 TEST(BasicBlockDbgInfoTest, InsertAfterSelf) {
   LLVMContext C;
   UseNewDbgInfoFormat = true;
@@ -90,10 +90,10 @@ TEST(BasicBlockDbgInfoTest, InsertAfterSelf) {
   //    %b = add
   //    %c = add
   //    dbg.value
-  // Check that this is replicated by DPValues.
+  // Check that this is replicated by DbgVariableRecords.
   Inst2->moveAfter(Inst1);
 
-  // Inst1 should only have one DPValue on it.
+  // Inst1 should only have one DbgVariableRecord on it.
   EXPECT_TRUE(Inst1->hasDbgRecords());
   auto Range1 = Inst1->getDbgRecordRange();
   EXPECT_EQ(std::distance(Range1.begin(), Range1.end()), 1u);
@@ -158,44 +158,44 @@ TEST(BasicBlockDbgInfoTest, MarkerOperations) {
   EXPECT_EQ(BB.getNextMarker(Instr1), Marker2);
   EXPECT_EQ(BB.getNextMarker(Instr2), EndMarker); // Is nullptr.
 
-  // There should be two DPValues,
+  // There should be two DbgVariableRecords,
   EXPECT_EQ(Marker1->StoredDbgRecords.size(), 1u);
   EXPECT_EQ(Marker2->StoredDbgRecords.size(), 1u);
 
   // Unlink them and try to re-insert them through the basic block.
-  DbgRecord *DPV1 = &*Marker1->StoredDbgRecords.begin();
-  DbgRecord *DPV2 = &*Marker2->StoredDbgRecords.begin();
-  DPV1->removeFromParent();
-  DPV2->removeFromParent();
+  DbgRecord *DVR1 = &*Marker1->StoredDbgRecords.begin();
+  DbgRecord *DVR2 = &*Marker2->StoredDbgRecords.begin();
+  DVR1->removeFromParent();
+  DVR2->removeFromParent();
   EXPECT_TRUE(Marker1->StoredDbgRecords.empty());
   EXPECT_TRUE(Marker2->StoredDbgRecords.empty());
 
   // This should appear in Marker1.
-  BB.insertDbgRecordBefore(DPV1, BB.begin());
+  BB.insertDbgRecordBefore(DVR1, BB.begin());
   EXPECT_EQ(Marker1->StoredDbgRecords.size(), 1u);
-  EXPECT_EQ(DPV1, &*Marker1->StoredDbgRecords.begin());
+  EXPECT_EQ(DVR1, &*Marker1->StoredDbgRecords.begin());
 
   // This should attach to Marker2.
-  BB.insertDbgRecordAfter(DPV2, &*BB.begin());
+  BB.insertDbgRecordAfter(DVR2, &*BB.begin());
   EXPECT_EQ(Marker2->StoredDbgRecords.size(), 1u);
-  EXPECT_EQ(DPV2, &*Marker2->StoredDbgRecords.begin());
+  EXPECT_EQ(DVR2, &*Marker2->StoredDbgRecords.begin());
 
-  // Now, how about removing instructions? That should cause any DPValues to
-  // "fall down".
+  // Now, how about removing instructions? That should cause any
+  // DbgVariableRecords to "fall down".
   Instr1->removeFromParent();
   Marker1 = nullptr;
-  // DPValues should now be in Marker2.
+  // DbgVariableRecords should now be in Marker2.
   EXPECT_EQ(BB.size(), 1u);
   EXPECT_EQ(Marker2->StoredDbgRecords.size(), 2u);
   // They should also be in the correct order.
-  SmallVector<DbgRecord *, 2> DPVs;
-  for (DbgRecord &DPV : Marker2->getDbgRecordRange())
-    DPVs.push_back(&DPV);
-  EXPECT_EQ(DPVs[0], DPV1);
-  EXPECT_EQ(DPVs[1], DPV2);
-
-  // If we remove the end instruction, the DPValues should fall down into
-  // the trailing marker.
+  SmallVector<DbgRecord *, 2> DVRs;
+  for (DbgRecord &DVR : Marker2->getDbgRecordRange())
+    DVRs.push_back(&DVR);
+  EXPECT_EQ(DVRs[0], DVR1);
+  EXPECT_EQ(DVRs[1], DVR2);
+
+  // If we remove the end instruction, the DbgVariableRecords should fall down
+  // into the trailing marker.
   EXPECT_EQ(BB.getTrailingDbgRecords(), nullptr);
   Instr2->removeFromParent();
   EXPECT_TRUE(BB.empty());
@@ -204,35 +204,35 @@ TEST(BasicBlockDbgInfoTest, MarkerOperations) {
   EXPECT_EQ(EndMarker->StoredDbgRecords.size(), 2u);
   // Again, these should arrive in the correct order.
 
-  DPVs.clear();
-  for (DbgRecord &DPV : EndMarker->getDbgRecordRange())
-    DPVs.push_back(&DPV);
-  EXPECT_EQ(DPVs[0], DPV1);
-  EXPECT_EQ(DPVs[1], DPV2);
+  DVRs.clear();
+  for (DbgRecord &DVR : EndMarker->getDbgRecordRange())
+    DVRs.push_back(&DVR);
+  EXPECT_EQ(DVRs[0], DVR1);
+  EXPECT_EQ(DVRs[1], DVR2);
 
   // Inserting a normal instruction at the beginning: shouldn't dislodge the
-  // DPValues. It's intended to not go at the start.
+  // DbgVariableRecords. It's intended to not go at the start.
   Instr1->insertBefore(BB, BB.begin());
   EXPECT_EQ(EndMarker->StoredDbgRecords.size(), 2u);
   Instr1->removeFromParent();
 
-  // Inserting at end(): should dislodge the DPValues, if they were dbg.values
-  // then they would sit "above" the new instruction.
+  // Inserting at end(): should dislodge the DbgVariableRecords, if they were
+  // dbg.values then they would sit "above" the new instruction.
   Instr1->insertBefore(BB, BB.end());
   EXPECT_EQ(Instr1->DbgMarker->StoredDbgRecords.size(), 2u);
   // We should de-allocate the trailing marker when something is inserted
   // at end().
   EXPECT_EQ(BB.getTrailingDbgRecords(), nullptr);
 
-  // Remove Instr1: now the DPValues will fall down again,
+  // Remove Instr1: now the DbgVariableRecords will fall down again,
   Instr1->removeFromParent();
   EndMarker = BB.getTrailingDbgRecords();
   EXPECT_EQ(EndMarker->StoredDbgRecords.size(), 2u);
 
   // Inserting a terminator, however it's intended, should dislodge the
-  // trailing DPValues, as it's the clear intention of the caller that this be
-  // the final instr in the block, and DPValues aren't allowed to live off the
-  // end forever.
+  // trailing DbgVariableRecords, as it's the clear intention of the caller that
+  // this be the final instr in the block, and DbgVariableRecords aren't allowed
+  // to live off the end forever.
   Instr2->insertBefore(BB, BB.begin());
   EXPECT_EQ(Instr2->DbgMarker->StoredDbgRecords.size(), 2u);
   EXPECT_EQ(BB.getTrailingDbgRecords(), nullptr);
@@ -300,29 +300,30 @@ TEST(BasicBlockDbgInfoTest, HeadBitOperations) {
   ASSERT_TRUE(CInst->DbgMarker);
   EXPECT_FALSE(CInst->DbgMarker->StoredDbgRecords.empty());
 
-  // If we move "c" to the start of the block, just normally, then the DPValues
-  // should fall down to "d".
+  // If we move "c" to the start of the block, just normally, then the
+  // DbgVariableRecords should fall down to "d".
   CInst->moveBefore(BB, BeginIt2);
   EXPECT_TRUE(!CInst->DbgMarker || CInst->DbgMarker->StoredDbgRecords.empty());
   ASSERT_TRUE(DInst->DbgMarker);
   EXPECT_FALSE(DInst->DbgMarker->StoredDbgRecords.empty());
 
   // Wheras if we move D to the start of the block with moveBeforePreserving,
-  // the DPValues should move with it.
+  // the DbgVariableRecords should move with it.
   DInst->moveBeforePreserving(BB, BB.begin());
   EXPECT_FALSE(DInst->DbgMarker->StoredDbgRecords.empty());
   EXPECT_EQ(&*BB.begin(), DInst);
 
-  // Similarly, moveAfterPreserving "D" to "C" should move DPValues with "D".
+  // Similarly, moveAfterPreserving "D" to "C" should move DbgVariableRecords
+  // with "D".
   DInst->moveAfterPreserving(CInst);
   EXPECT_FALSE(DInst->DbgMarker->StoredDbgRecords.empty());
 
   // (move back to the start...)
   DInst->moveBeforePreserving(BB, BB.begin());
 
-  // Current order of insts: "D -> C -> B -> Ret". DPValues on "D".
+  // Current order of insts: "D -> C -> B -> Ret". DbgVariableRecords on "D".
   // If we move "C" to the beginning of the block, it should go before the
-  // DPValues. They'll stay on "D".
+  // DbgVariableRecords. They'll stay on "D".
   CInst->moveBefore(BB, BB.begin());
   EXPECT_TRUE(!CInst->DbgMarker || CInst->DbgMarker->StoredDbgRecords.empty());
   EXPECT_FALSE(DInst->DbgMarker->StoredDbgRecords.empty());
@@ -333,14 +334,14 @@ TEST(BasicBlockDbgInfoTest, HeadBitOperations) {
   CInst->moveBefore(BInst);
   EXPECT_EQ(&*BB.begin(), DInst);
 
-  // Current order of insts: "D -> C -> B -> Ret". DPValues on "D".
+  // Current order of insts: "D -> C -> B -> Ret". DbgVariableRecords on "D".
   // Now move CInst to the position of DInst, but using getIterator instead of
   // BasicBlock::begin. This signals that we want the "C" instruction to be
-  // immediately before "D", with any DPValues on "D" now moving to "C".
-  // It's the equivalent of moving an instruction to the position between a
+  // immediately before "D", with any DbgVariableRecords on "D" now moving to
+  // "C". It's the equivalent of moving an instruction to the position between a
   // run of dbg.values and the next instruction.
   CInst->moveBefore(BB, DInst->getIterator());
-  // CInst gains the DPValues.
+  // CInst gains the DbgVariableRecords.
   EXPECT_TRUE(!DInst->DbgMarker || DInst->DbgMarker->StoredDbgRecords.empty());
   EXPECT_FALSE(CInst->DbgMarker->StoredDbgRecords.empty());
   EXPECT_EQ(&*BB.begin(), CInst);
@@ -378,8 +379,8 @@ TEST(BasicBlockDbgInfoTest, InstrDbgAccess) {
     !11 = !DILocation(line: 1, column: 1, scope: !6)
 )");
 
-  // Check that DPValues can be accessed from Instructions without digging
-  // into the depths of DPMarkers.
+  // Check that DbgVariableRecords can be accessed from Instructions without
+  // digging into the depths of DPMarkers.
   BasicBlock &BB = M->getFunction("f")->getEntryBlock();
   // Convert the module to "new" form debug-info.
   M->convertToNewDbgValues();
@@ -391,25 +392,25 @@ TEST(BasicBlockDbgInfoTest, InstrDbgAccess) {
   ASSERT_FALSE(BInst->DbgMarker);
   ASSERT_TRUE(CInst->DbgMarker);
   ASSERT_EQ(CInst->DbgMarker->StoredDbgRecords.size(), 1u);
-  DbgRecord *DPV1 = &*CInst->DbgMarker->StoredDbgRecords.begin();
-  ASSERT_TRUE(DPV1);
+  DbgRecord *DVR1 = &*CInst->DbgMarker->StoredDbgRecords.begin();
+  ASSERT_TRUE(DVR1);
   EXPECT_FALSE(BInst->hasDbgRecords());
 
-  // Clone DPValues from one inst to another. Other arguments to clone are
-  // tested in DPMarker test.
+  // Clone DbgVariableRecords from one inst to another. Other arguments to clone
+  // are tested in DPMarker test.
   auto Range1 = BInst->cloneDebugInfoFrom(CInst);
   EXPECT_EQ(BInst->DbgMarker->StoredDbgRecords.size(), 1u);
-  DbgRecord *DPV2 = &*BInst->DbgMarker->StoredDbgRecords.begin();
+  DbgRecord *DVR2 = &*BInst->DbgMarker->StoredDbgRecords.begin();
   EXPECT_EQ(std::distance(Range1.begin(), Range1.end()), 1u);
-  EXPECT_EQ(&*Range1.begin(), DPV2);
-  EXPECT_NE(DPV1, DPV2);
+  EXPECT_EQ(&*Range1.begin(), DVR2);
+  EXPECT_NE(DVR1, DVR2);
 
   // We should be able to get a range over exactly the same information.
   auto Range2 = BInst->getDbgRecordRange();
   EXPECT_EQ(Range1.begin(), Range2.begin());
   EXPECT_EQ(Range1.end(), Range2.end());
 
-  // We should be able to query if there are DPValues,
+  // We should be able to query if there are DbgVariableRecords,
   EXPECT_TRUE(BInst->hasDbgRecords());
   EXPECT_TRUE(CInst->hasDbgRecords());
   EXPECT_FALSE(DInst->hasDbgRecords());
@@ -419,8 +420,8 @@ TEST(BasicBlockDbgInfoTest, InstrDbgAccess) {
   EXPECT_FALSE(BInst->hasDbgRecords());
   EXPECT_EQ(BInst->DbgMarker->StoredDbgRecords.size(), 0u);
 
-  // And we should be able to drop individual DPValues.
-  CInst->dropOneDbgRecord(DPV1);
+  // And we should be able to drop individual DbgVariableRecords.
+  CInst->dropOneDbgRecord(DVR1);
   EXPECT_FALSE(CInst->hasDbgRecords());
   EXPECT_EQ(CInst->DbgMarker->StoredDbgRecords.size(), 0u);
 
@@ -514,7 +515,7 @@ class DbgSpliceTest : public ::testing::Test {
   BasicBlock *BBEntry, *BBExit;
   BasicBlock::iterator Dest, First, Last;
   Instruction *BInst, *Branch, *CInst;
-  DPValue *DPVA, *DPVB, *DPVConst;
+  DbgVariableRecord *DVRA, *DVRB, *DVRConst;
 
   void SetUp() override {
     UseNewDbgInfoFormat = true;
@@ -531,18 +532,21 @@ class DbgSpliceTest : public ::testing::Test {
     Branch = &*Last;
     CInst = &*Dest;
 
-    DPVA = cast<DPValue>(&*BInst->DbgMarker->StoredDbgRecords.begin());
-    DPVB = cast<DPValue>(&*Branch->DbgMarker->StoredDbgRecords.begin());
-    DPVConst = cast<DPValue>(&*CInst->DbgMarker->StoredDbgRecords.begin());
+    DVRA =
+        cast<DbgVariableRecord>(&*BInst->DbgMarker->StoredDbgRecords.begin());
+    DVRB =
+        cast<DbgVariableRecord>(&*Branch->DbgMarker->StoredDbgRecords.begin());
+    DVRConst =
+        cast<DbgVariableRecord>(&*CInst->DbgMarker->StoredDbgRecords.begin());
   }
 
   void TearDown() override { UseNewDbgInfoFormat = false; }
 
-  bool InstContainsDPValue(Instruction *I, DPValue *DPV) {
+  bool InstContainsDbgVariableRecord(Instruction *I, DbgVariableRecord *DVR) {
     for (DbgRecord &D : I->getDbgRecordRange()) {
-      if (&D == DPV) {
+      if (&D == DVR) {
         // Confirm too that the links between the records are correct.
-        EXPECT_EQ(DPV->Marker, I->DbgMarker);
+        EXPECT_EQ(DVR->Marker, I->DbgMarker);
         EXPECT_EQ(I->DbgMarker->MarkedInstr, I);
         return true;
       }
@@ -550,7 +554,8 @@ class DbgSpliceTest : public ::testing::Test {
     return false;
   }
 
-  bool CheckDPVOrder(Instruction *I, SmallVector<DPValue *> CheckVals) {
+  bool CheckDVROrder(Instruction *I,
+                     SmallVector<DbgVariableRecord *> CheckVals) {
     SmallVector<DbgRecord *> Vals;
     for (DbgRecord &D : I->getDbgRecordRange())
       Vals.push_back(&D);
@@ -578,15 +583,15 @@ TEST_F(DbgSpliceTest, DbgSpliceTest0) {
   /*
         define i16 @f(i16 %a) !dbg !6 {
 BBEntry entry:
-DPVA      call void @llvm.dbg.value(metadata i16 %a, metadata !9, metadata !DIExpression()), !dbg !11
-First     %b = add i16 %a, 1, !dbg !11
-DPVB      call void @llvm.dbg.value(metadata i16 %b, metadata !9, metadata !DIExpression()), !dbg !11
-Last      br label %exit, !dbg !11
+DVRA      call void @llvm.dbg.value(metadata i16 %a, metadata !9, metadata
+!DIExpression()), !dbg !11 First     %b = add i16 %a, 1, !dbg !11 DVRB      call
+void @llvm.dbg.value(metadata i16 %b, metadata !9, metadata !DIExpression()),
+!dbg !11 Last      br label %exit, !dbg !11
 
 BBExit  exit:
-DPVConst  call void @llvm.dbg.value(metadata i16 0, metadata !9, metadata !DIExpression()), !dbg !11
-Dest      %c = add i16 %b, 1, !dbg !11
-          ret i16 0, !dbg !11
+DVRConst  call void @llvm.dbg.value(metadata i16 0, metadata !9, metadata
+!DIExpression()), !dbg !11 Dest      %c = add i16 %b, 1, !dbg !11 ret i16 0,
+!dbg !11
         }
 
     Splice from First, not including leading dbg.value, to Last, including the
@@ -595,15 +600,14 @@ Dest      %c = add i16 %b, 1, !dbg !11
 
         define i16 @f(i16 %a) !dbg !6 {
 BBEntry entry:
-DPVA      call void @llvm.dbg.value(metadata i16 %a, metadata !9, metadata !DIExpression()), !dbg !11
-Last      br label %exit, !dbg !11
+DVRA      call void @llvm.dbg.value(metadata i16 %a, metadata !9, metadata
+!DIExpression()), !dbg !11 Last      br label %exit, !dbg !11
 
 BBExit  exit:
-DPVConst  call void @llvm.dbg.value(metadata i16 0, metadata !9, metadata !DIExpression()), !dbg !11
-First     %b = add i16 %a, 1, !dbg !11
-DPVB      call void @llvm.dbg.value(metadata i16 %b, metadata !9, metadata !DIExpression()), !dbg !11
-Dest      %c = add i16 %b, 1, !dbg !11
-          ret i16 0, !dbg !11
+DVRConst  call void @llvm.dbg.value(metadata i16 0, metadata !9, metadata
+!DIExpression()), !dbg !11 First     %b = add i16 %a, 1, !dbg !11 DVRB      call
+void @llvm.dbg.value(metadata i16 %b, metadata !9, metadata !DIExpression()),
+!dbg !11 Dest      %c = add i16 %b, 1, !dbg !11 ret i16 0, !dbg !11
         }
 
 
@@ -613,14 +617,14 @@ Dest      %c = add i16 %b, 1, !dbg !11
   EXPECT_EQ(CInst->getParent(), BBExit);
   EXPECT_EQ(Branch->getParent(), BBEntry);
 
-  // DPVB: should be on Dest, in exit block.
-  EXPECT_TRUE(InstContainsDPValue(CInst, DPVB));
+  // DVRB: should be on Dest, in exit block.
+  EXPECT_TRUE(InstContainsDbgVariableRecord(CInst, DVRB));
 
-  // DPVA, should have "fallen" onto the branch, remained in entry block.
-  EXPECT_TRUE(InstContainsDPValue(Branch, DPVA));
+  // DVRA, should have "fallen" onto the branch, remained in entry block.
+  EXPECT_TRUE(InstContainsDbgVariableRecord(Branch, DVRA));
 
-  // DPVConst should be on the moved %b instruction.
-  EXPECT_TRUE(InstContainsDPValue(BInst, DPVConst));
+  // DVRConst should be on the moved %b instruction.
+  EXPECT_TRUE(InstContainsDbgVariableRecord(BInst, DVRConst));
 }
 
 TEST_F(DbgSpliceTest, DbgSpliceTest1) {
@@ -631,15 +635,15 @@ TEST_F(DbgSpliceTest, DbgSpliceTest1) {
   /*
         define i16 @f(i16 %a) !dbg !6 {
 BBEntry entry:
-DPVA      call void @llvm.dbg.value(metadata i16 %a, metadata !9, metadata !DIExpression()), !dbg !11
-First     %b = add i16 %a, 1, !dbg !11
-DPVB      call void @llvm.dbg.value(metadata i16 %b, metadata !9, metadata !DIExpression()), !dbg !11
-Last      br label %exit, !dbg !11
+DVRA      call void @llvm.dbg.value(metadata i16 %a, metadata !9, metadata
+!DIExpression()), !dbg !11 First     %b = add i16 %a, 1, !dbg !11 DVRB      call
+void @llvm.dbg.value(metadata i16 %b, metadata !9, metadata !DIExpression()),
+!dbg !11 Last      br label %exit, !dbg !11
 
 BBExit  exit:
-DPVConst  call void @llvm.dbg.value(metadata i16 0, metadata !9, metadata !DIExpression()), !dbg !11
-Dest      %c = add i16 %b, 1, !dbg !11
-          ret i16 0, !dbg !11
+DVRConst  call void @llvm.dbg.value(metadata i16 0, metadata !9, metadata
+!DIExpression()), !dbg !11 Dest      %c = add i16 %b, 1, !dbg !11 ret i16 0,
+!dbg !11
         }
 
     Splice from First, not including leading dbg.value, to Last, including the
@@ -648,15 +652,15 @@ Dest      %c = add i16 %b, 1, !dbg !11
 
         define i16 @f(i16 %a) !dbg !6 {
 BBEntry entry:
-DPVA      call void @llvm.dbg.value(metadata i16 %a, metadata !9, metadata !DIExpression()), !dbg !11
-Last      br label %exit, !dbg !11
+DVRA      call void @llvm.dbg.value(metadata i16 %a, metadata !9, metadata
+!DIExpression()), !dbg !11 Last      br label %exit, !dbg !11
 
 BBExit  exit:
 First     %b = add i16 %a, 1, !dbg !11
-DPVB      call void @llvm.dbg.value(metadata i16 %b, metadata !9, metadata !DIExpression()), !dbg !11
-DPVConst  call void @llvm.dbg.value(metadata i16 0, metadata !9, metadata !DIExpression()), !dbg !11
-Dest      %c = add i16 %b, 1, !dbg !11
-          ret i16 0, !dbg !11
+DVRB      call void @llvm.dbg.value(metadata i16 %b, metadata !9, metadata
+!DIExpression()), !dbg !11 DVRConst  call void @llvm.dbg.value(metadata i16 0,
+metadata !9, metadata !DIExpression()), !dbg !11 Dest      %c = add i16 %b, 1,
+!dbg !11 ret i16 0, !dbg !11
         }
 
 
@@ -666,17 +670,17 @@ Dest      %c = add i16 %b, 1, !dbg !11
   EXPECT_EQ(CInst->getParent(), BBExit);
   EXPECT_EQ(Branch->getParent(), BBEntry);
 
-  // DPVB: should be on CInst, in exit block.
-  EXPECT_TRUE(InstContainsDPValue(CInst, DPVB));
+  // DVRB: should be on CInst, in exit block.
+  EXPECT_TRUE(InstContainsDbgVariableRecord(CInst, DVRB));
 
-  // DPVA, should have "fallen" onto the branch, remained in entry block.
-  EXPECT_TRUE(InstContainsDPValue(Branch, DPVA));
+  // DVRA, should have "fallen" onto the branch, remained in entry block.
+  EXPECT_TRUE(InstContainsDbgVariableRecord(Branch, DVRA));
 
-  // DPVConst should be behind / after the moved instructions, remain on CInst.
-  EXPECT_TRUE(InstContainsDPValue(CInst, DPVConst));
+  // DVRConst should be behind / after the moved instructions, remain on CInst.
+  EXPECT_TRUE(InstContainsDbgVariableRecord(CInst, DVRConst));
 
-  // Order of DPVB and DPVConst should be thus:
-  EXPECT_TRUE(CheckDPVOrder(CInst, {DPVB, DPVConst}));
+  // Order of DVRB and DVRConst should be thus:
+  EXPECT_TRUE(CheckDVROrder(CInst, {DVRB, DVRConst}));
 }
 
 TEST_F(DbgSpliceTest, DbgSpliceTest2) {
@@ -687,15 +691,15 @@ TEST_F(DbgSpliceTest, DbgSpliceTest2) {
   /*
         define i16 @f(i16 %a) !dbg !6 {
 BBEntry entry:
-DPVA      call void @llvm.dbg.value(metadata i16 %a, metadata !9, metadata !DIExpression()), !dbg !11
-First     %b = add i16 %a, 1, !dbg !11
-DPVB      call void @llvm.dbg.value(metadata i16 %b, metadata !9, metadata !DIExpression()), !dbg !11
-Last      br label %exit, !dbg !11
+DVRA      call void @llvm.dbg.value(metadata i16 %a, metadata !9, metadata
+!DIExpression()), !dbg !11 First     %b = add i16 %a, 1, !dbg !11 DVRB      call
+void @llvm.dbg.value(metadata i16 %b, metadata !9, metadata !DIExpression()),
+!dbg !11 Last      br label %exit, !dbg !11
 
 BBExit  exit:
-DPVConst  call void @llvm.dbg.value(metadata i16 0, metadata !9, metadata !DIExpression()), !dbg !11
-Dest      %c = add i16 %b, 1, !dbg !11
-          ret i16 0, !dbg !11
+DVRConst  call void @llvm.dbg.value(metadata i16 0, metadata !9, metadata
+!DIExpression()), !dbg !11 Dest      %c = add i16 %b, 1, !dbg !11 ret i16 0,
+!dbg !11
         }
 
     Splice from head of First, which includes the leading dbg.value, to Last,
@@ -707,12 +711,12 @@ BBEntry entry:
 Last      br label %exit, !dbg !11
 
 BBExit  exit:
-DPVConst  call void @llvm.dbg.value(metadata i16 0, metadata !9, metadata !DIExpression()), !dbg !11
-DPVA      call void @llvm.dbg.value(metadata i16 %a, metadata !9, metadata !DIExpression()), !dbg !11
-First     %b = add i16 %a, 1, !dbg !11
-DPVB      call void @llvm.dbg.value(metadata i16 %b, metadata !9, metadata !DIExpression()), !dbg !11
-Dest      %c = add i16 %b, 1, !dbg !11
-          ret i16 0, !dbg !11
+DVRConst  call void @llvm.dbg.value(metadata i16 0, metadata !9, metadata
+!DIExpression()), !dbg !11 DVRA      call void @llvm.dbg.value(metadata i16 %a,
+metadata !9, metadata !DIExpression()), !dbg !11 First     %b = add i16 %a, 1,
+!dbg !11 DVRB      call void @llvm.dbg.value(metadata i16 %b, metadata !9,
+metadata !DIExpression()), !dbg !11 Dest      %c = add i16 %b, 1, !dbg !11 ret
+i16 0, !dbg !11
         }
 
 
@@ -721,18 +725,18 @@ Dest      %c = add i16 %b, 1, !dbg !11
   EXPECT_EQ(BInst->getParent(), BBExit);
   EXPECT_EQ(CInst->getParent(), BBExit);
 
-  // DPVB: should be on CInst, in exit block.
-  EXPECT_TRUE(InstContainsDPValue(CInst, DPVB));
+  // DVRB: should be on CInst, in exit block.
+  EXPECT_TRUE(InstContainsDbgVariableRecord(CInst, DVRB));
 
-  // DPVA, should have transferred with the spliced instructions, remains on
+  // DVRA, should have transferred with the spliced instructions, remains on
   // the "b" inst.
-  EXPECT_TRUE(InstContainsDPValue(BInst, DPVA));
+  EXPECT_TRUE(InstContainsDbgVariableRecord(BInst, DVRA));
 
-  // DPVConst should be ahead of the moved instructions, ahead of BInst.
-  EXPECT_TRUE(InstContainsDPValue(BInst, DPVConst));
+  // DVRConst should be ahead of the moved instructions, ahead of BInst.
+  EXPECT_TRUE(InstContainsDbgVariableRecord(BInst, DVRConst));
 
-  // Order of DPVA and DPVConst should be thus:
-  EXPECT_TRUE(CheckDPVOrder(BInst, {DPVConst, DPVA}));
+  // Order of DVRA and DVRConst should be thus:
+  EXPECT_TRUE(CheckDVROrder(BInst, {DVRConst, DVRA}));
 }
 
 TEST_F(DbgSpliceTest, DbgSpliceTest3) {
@@ -743,15 +747,15 @@ TEST_F(DbgSpliceTest, DbgSpliceTest3) {
   /*
         define i16 @f(i16 %a) !dbg !6 {
 BBEntry entry:
-DPVA      call void @llvm.dbg.value(metadata i16 %a, metadata !9, metadata !DIExpression()), !dbg !11
-First     %b = add i16 %a, 1, !dbg !11
-DPVB      call void @llvm.dbg.value(metadata i16 %b, metadata !9, metadata !DIExpression()), !dbg !11
-Last      br label %exit, !dbg !11
+DVRA      call void @llvm.dbg.value(metadata i16 %a, metadata !9, metadata
+!DIExpression()), !dbg !11 First     %b = add i16 %a, 1, !dbg !11 DVRB      call
+void @llvm.dbg.value(metadata i16 %b, metadata !9, metadata !DIExpression()),
+!dbg !11 Last      br label %exit, !dbg !11
 
 BBExit  exit:
-DPVConst  call void @llvm.dbg.value(metadata i16 0, metadata !9, metadata !DIExpression()), !dbg !11
-Dest      %c = add i16 %b, 1, !dbg !11
-          ret i16 0, !dbg !11
+DVRConst  call void @llvm.dbg.value(metadata i16 0, metadata !9, metadata
+!DIExpression()), !dbg !11 Dest      %c = add i16 %b, 1, !dbg !11 ret i16 0,
+!dbg !11
         }
 
     Splice from head of First, which includes the leading dbg.value, to Last,
@@ -763,12 +767,12 @@ BBEntry entry:
 Last      br label %exit, !dbg !11
 
 BBExit  exit:
-DPVA      call void @llvm.dbg.value(metadata i16 %a, metadata !9, metadata !DIExpression()), !dbg !11
-First     %b = add i16 %a, 1, !dbg !11
-DPVB      call void @llvm.dbg.value(metadata i16 %b, metadata !9, metadata !DIExpression()), !dbg !11
-DPVConst  call void @llvm.dbg.value(metadata i16 0, metadata !9, metadata !DIExpression()), !dbg !11
-Dest      %c = add i16 %b, 1, !dbg !11
-          ret i16 0, !dbg !11
+DVRA      call void @llvm.dbg.value(metadata i16 %a, metadata !9, metadata
+!DIExpression()), !dbg !11 First     %b = add i16 %a, 1, !dbg !11 DVRB      call
+void @llvm.dbg.value(metadata i16 %b, metadata !9, metadata !DIExpression()),
+!dbg !11 DVRConst  call void @llvm.dbg.value(metadata i16 0, metadata !9,
+metadata !DIExpression()), !dbg !11 Dest      %c = add i16 %b, 1, !dbg !11 ret
+i16 0, !dbg !11
         }
 
   */
@@ -776,18 +780,18 @@ Dest      %c = add i16 %b, 1, !dbg !11
   EXPECT_EQ(BInst->getParent(), BBExit);
   EXPECT_EQ(CInst->getParent(), BBExit);
 
-  // DPVB: should be on CInst, in exit block.
-  EXPECT_TRUE(InstContainsDPValue(CInst, DPVB));
+  // DVRB: should be on CInst, in exit block.
+  EXPECT_TRUE(InstContainsDbgVariableRecord(CInst, DVRB));
 
-  // DPVA, should have transferred with the spliced instructions, remains on
+  // DVRA, should have transferred with the spliced instructions, remains on
   // the "b" inst.
-  EXPECT_TRUE(InstContainsDPValue(BInst, DPVA));
+  EXPECT_TRUE(InstContainsDbgVariableRecord(BInst, DVRA));
 
-  // DPVConst should be behind the moved instructions, ahead of CInst.
-  EXPECT_TRUE(InstContainsDPValue(CInst, DPVConst));
+  // DVRConst should be behind the moved instructions, ahead of CInst.
+  EXPECT_TRUE(InstContainsDbgVariableRecord(CInst, DVRConst));
 
-  // Order of DPVB and DPVConst should be thus:
-  EXPECT_TRUE(CheckDPVOrder(CInst, {DPVB, DPVConst}));
+  // Order of DVRB and DVRConst should be thus:
+  EXPECT_TRUE(CheckDVROrder(CInst, {DVRB, DVRConst}));
 }
 
 TEST_F(DbgSpliceTest, DbgSpliceTest4) {
@@ -798,15 +802,15 @@ TEST_F(DbgSpliceTest, DbgSpliceTest4) {
   /*
         define i16 @f(i16 %a) !dbg !6 {
 BBEntry entry:
-DPVA      call void @llvm.dbg.value(metadata i16 %a, metadata !9, metadata !DIExpression()), !dbg !11
-First     %b = add i16 %a, 1, !dbg !11
-DPVB      call void @llvm.dbg.value(metadata i16 %b, metadata !9, metadata !DIExpression()), !dbg !11
-Last      br label %exit, !dbg !11
+DVRA      call void @llvm.dbg.value(metadata i16 %a, metadata !9, metadata
+!DIExpression()), !dbg !11 First     %b = add i16 %a, 1, !dbg !11 DVRB      call
+void @llvm.dbg.value(metadata i16 %b, metadata !9, metadata !DIExpression()),
+!dbg !11 Last      br label %exit, !dbg !11
 
 BBExit  exit:
-DPVConst  call void @llvm.dbg.value(metadata i16 0, metadata !9, metadata !DIExpression()), !dbg !11
-Dest      %c = add i16 %b, 1, !dbg !11
-          ret i16 0, !dbg !11
+DVRConst  call void @llvm.dbg.value(metadata i16 0, metadata !9, metadata
+!DIExpression()), !dbg !11 Dest      %c = add i16 %b, 1, !dbg !11 ret i16 0,
+!dbg !11
         }
 
     Splice from First, not including the leading dbg.value, to Last, but NOT
@@ -815,15 +819,15 @@ Dest      %c = add i16 %b, 1, !dbg !11
 
         define i16 @f(i16 %a) !dbg !6 {
 BBEntry entry:
-DPVA      call void @llvm.dbg.value(metadata i16 %a, metadata !9, metadata !DIExpression()), !dbg !11
-DPVB      call void @llvm.dbg.value(metadata i16 %b, metadata !9, metadata !DIExpression()), !dbg !11
-Last      br label %exit, !dbg !11
+DVRA      call void @llvm.dbg.value(metadata i16 %a, metadata !9, metadata
+!DIExpression()), !dbg !11 DVRB      call void @llvm.dbg.value(metadata i16 %b,
+metadata !9, metadata !DIExpression()), !dbg !11 Last      br label %exit, !dbg
+!11
 
 BBExit  exit:
-DPVConst  call void @llvm.dbg.value(metadata i16 0, metadata !9, metadata !DIExpression()), !dbg !11
-First     %b = add i16 %a, 1, !dbg !11
-Dest      %c = add i16 %b, 1, !dbg !11
-          ret i16 0, !dbg !11
+DVRConst  call void @llvm.dbg.value(metadata i16 0, metadata !9, metadata
+!DIExpression()), !dbg !11 First     %b = add i16 %a, 1, !dbg !11 Dest      %c =
+add i16 %b, 1, !dbg !11 ret i16 0, !dbg !11
         }
 
   */
@@ -832,17 +836,17 @@ Dest      %c = add i16 %b, 1, !dbg !11
   EXPECT_EQ(CInst->getParent(), BBExit);
   EXPECT_EQ(Branch->getParent(), BBEntry);
 
-  // DPVB: should be on Branch as before, remain in entry block.
-  EXPECT_TRUE(InstContainsDPValue(Branch, DPVB));
+  // DVRB: should be on Branch as before, remain in entry block.
+  EXPECT_TRUE(InstContainsDbgVariableRecord(Branch, DVRB));
 
-  // DPVA, should have remained in entry block, falls onto Branch inst.
-  EXPECT_TRUE(InstContainsDPValue(Branch, DPVA));
+  // DVRA, should have remained in entry block, falls onto Branch inst.
+  EXPECT_TRUE(InstContainsDbgVariableRecord(Branch, DVRA));
 
-  // DPVConst should be ahead of the moved instructions, BInst.
-  EXPECT_TRUE(InstContainsDPValue(BInst, DPVConst));
+  // DVRConst should be ahead of the moved instructions, BInst.
+  EXPECT_TRUE(InstContainsDbgVariableRecord(BInst, DVRConst));
 
-  // Order of DPVA and DPVA should be thus:
-  EXPECT_TRUE(CheckDPVOrder(Branch, {DPVA, DPVB}));
+  // Order of DVRA and DVRA should be thus:
+  EXPECT_TRUE(CheckDVROrder(Branch, {DVRA, DVRB}));
 }
 
 TEST_F(DbgSpliceTest, DbgSpliceTest5) {
@@ -853,15 +857,15 @@ TEST_F(DbgSpliceTest, DbgSpliceTest5) {
   /*
         define i16 @f(i16 %a) !dbg !6 {
 BBEntry entry:
-DPVA      call void @llvm.dbg.value(metadata i16 %a, metadata !9, metadata !DIExpression()), !dbg !11
-First     %b = add i16 %a, 1, !dbg !11
-DPVB      call void @llvm.dbg.value(metadata i16 %b, metadata !9, metadata !DIExpression()), !dbg !11
-Last      br label %exit, !dbg !11
+DVRA      call void @llvm.dbg.value(metadata i16 %a, metadata !9, metadata
+!DIExpression()), !dbg !11 First     %b = add i16 %a, 1, !dbg !11 DVRB      call
+void @llvm.dbg.value(metadata i16 %b, metadata !9, metadata !DIExpression()),
+!dbg !11 Last      br label %exit, !dbg !11
 
 BBExit  exit:
-DPVConst  call void @llvm.dbg.value(metadata i16 0, metadata !9, metadata !DIExpression()), !dbg !11
-Dest      %c = add i16 %b, 1, !dbg !11
-          ret i16 0, !dbg !11
+DVRConst  call void @llvm.dbg.value(metadata i16 0, metadata !9, metadata
+!DIExpression()), !dbg !11 Dest      %c = add i16 %b, 1, !dbg !11 ret i16 0,
+!dbg !11
         }
 
     Splice from First, not including the leading dbg.value, to Last, but NOT
@@ -870,15 +874,16 @@ Dest      %c = add i16 %b, 1, !dbg !11
 
         define i16 @f(i16 %a) !dbg !6 {
 BBEntry entry:
-DPVA      call void @llvm.dbg.value(metadata i16 %a, metadata !9, metadata !DIExpression()), !dbg !11
-DPVB      call void @llvm.dbg.value(metadata i16 %b, metadata !9, metadata !DIExpression()), !dbg !11
-Last      br label %exit, !dbg !11
+DVRA      call void @llvm.dbg.value(metadata i16 %a, metadata !9, metadata
+!DIExpression()), !dbg !11 DVRB      call void @llvm.dbg.value(metadata i16 %b,
+metadata !9, metadata !DIExpression()), !dbg !11 Last      br label %exit, !dbg
+!11
 
 BBExit  exit:
 First     %b = add i16 %a, 1, !dbg !11
-DPVConst  call void @llvm.dbg.value(metadata i16 0, metadata !9, metadata !DIExpression()), !dbg !11
-Dest      %c = add i16 %b, 1, !dbg !11
-          ret i16 0, !dbg !11
+DVRConst  call void @llvm.dbg.value(metadata i16 0, metadata !9, metadata
+!DIExpression()), !dbg !11 Dest      %c = add i16 %b, 1, !dbg !11 ret i16 0,
+!dbg !11
         }
 
   */
@@ -887,17 +892,17 @@ Dest      %c = add i16 %b, 1, !dbg !11
   EXPECT_EQ(CInst->getParent(), BBExit);
   EXPECT_EQ(Branch->getParent(), BBEntry);
 
-  // DPVB: should be on Branch as before, remain in entry block.
-  EXPECT_TRUE(InstContainsDPValue(Branch, DPVB));
+  // DVRB: should be on Branch as before, remain in entry block.
+  EXPECT_TRUE(InstContainsDbgVariableRecord(Branch, DVRB));
 
-  // DPVA, should have remained in entry block, falls onto Branch inst.
-  EXPECT_TRUE(InstContainsDPValue(Branch, DPVA));
+  // DVRA, should have remained in entry block, falls onto Branch inst.
+  EXPECT_TRUE(InstContainsDbgVariableRecord(Branch, DVRA));
 
-  // DPVConst should be behind of the moved instructions, on CInst.
-  EXPECT_TRUE(InstContainsDPValue(CInst, DPVConst));
+  // DVRConst should be behind of the moved instructions, on CInst.
+  EXPECT_TRUE(InstContainsDbgVariableRecord(CInst, DVRConst));
 
-  // Order of DPVA and DPVB should be thus:
-  EXPECT_TRUE(CheckDPVOrder(Branch, {DPVA, DPVB}));
+  // Order of DVRA and DVRB should be thus:
+  EXPECT_TRUE(CheckDVROrder(Branch, {DVRA, DVRB}));
 }
 
 TEST_F(DbgSpliceTest, DbgSpliceTest6) {
@@ -908,15 +913,15 @@ TEST_F(DbgSpliceTest, DbgSpliceTest6) {
   /*
         define i16 @f(i16 %a) !dbg !6 {
 BBEntry entry:
-DPVA      call void @llvm.dbg.value(metadata i16 %a, metadata !9, metadata !DIExpression()), !dbg !11
-First     %b = add i16 %a, 1, !dbg !11
-DPVB      call void @llvm.dbg.value(metadata i16 %b, metadata !9, metadata !DIExpression()), !dbg !11
-Last      br label %exit, !dbg !11
+DVRA      call void @llvm.dbg.value(metadata i16 %a, metadata !9, metadata
+!DIExpression()), !dbg !11 First     %b = add i16 %a, 1, !dbg !11 DVRB      call
+void @llvm.dbg.value(metadata i16 %b, metadata !9, metadata !DIExpression()),
+!dbg !11 Last      br label %exit, !dbg !11
 
 BBExit  exit:
-DPVConst  call void @llvm.dbg.value(metadata i16 0, metadata !9, metadata !DIExpression()), !dbg !11
-Dest      %c = add i16 %b, 1, !dbg !11
-          ret i16 0, !dbg !11
+DVRConst  call void @llvm.dbg.value(metadata i16 0, metadata !9, metadata
+!DIExpression()), !dbg !11 Dest      %c = add i16 %b, 1, !dbg !11 ret i16 0,
+!dbg !11
         }
 
     Splice from First, including the leading dbg.value, to Last, but NOT
@@ -925,15 +930,14 @@ Dest      %c = add i16 %b, 1, !dbg !11
 
         define i16 @f(i16 %a) !dbg !6 {
 BBEntry entry:
-DPVB      call void @llvm.dbg.value(metadata i16 %b, metadata !9, metadata !DIExpression()), !dbg !11
-Last      br label %exit, !dbg !11
+DVRB      call void @llvm.dbg.value(metadata i16 %b, metadata !9, metadata
+!DIExpression()), !dbg !11 Last      br label %exit, !dbg !11
 
 BBExit  exit:
-DPVConst  call void @llvm.dbg.value(metadata i16 0, metadata !9, metadata !DIExpression()), !dbg !11
-DPVA      call void @llvm.dbg.value(metadata i16 %a, metadata !9, metadata !DIExpression()), !dbg !11
-First     %b = add i16 %a, 1, !dbg !11
-Dest      %c = add i16 %b, 1, !dbg !11
-          ret i16 0, !dbg !11
+DVRConst  call void @llvm.dbg.value(metadata i16 0, metadata !9, metadata
+!DIExpression()), !dbg !11 DVRA      call void @llvm.dbg.value(metadata i16 %a,
+metadata !9, metadata !DIExpression()), !dbg !11 First     %b = add i16 %a, 1,
+!dbg !11 Dest      %c = add i16 %b, 1, !dbg !11 ret i16 0, !dbg !11
         }
 
   */
@@ -942,17 +946,17 @@ Dest      %c = add i16 %b, 1, !dbg !11
   EXPECT_EQ(CInst->getParent(), BBExit);
   EXPECT_EQ(Branch->getParent(), BBEntry);
 
-  // DPVB: should be on Branch as before, remain in entry block.
-  EXPECT_TRUE(InstContainsDPValue(Branch, DPVB));
+  // DVRB: should be on Branch as before, remain in entry block.
+  EXPECT_TRUE(InstContainsDbgVariableRecord(Branch, DVRB));
 
-  // DPVA, should have transferred to BBExit, on B inst.
-  EXPECT_TRUE(InstContainsDPValue(BInst, DPVA));
+  // DVRA, should have transferred to BBExit, on B inst.
+  EXPECT_TRUE(InstContainsDbgVariableRecord(BInst, DVRA));
 
-  // DPVConst should be ahead of the moved instructions, on BInst.
-  EXPECT_TRUE(InstContainsDPValue(BInst, DPVConst));
+  // DVRConst should be ahead of the moved instructions, on BInst.
+  EXPECT_TRUE(InstContainsDbgVariableRecord(BInst, DVRConst));
 
-  // Order of DPVA and DPVConst should be thus:
-  EXPECT_TRUE(CheckDPVOrder(BInst, {DPVConst, DPVA}));
+  // Order of DVRA and DVRConst should be thus:
+  EXPECT_TRUE(CheckDVROrder(BInst, {DVRConst, DVRA}));
 }
 
 TEST_F(DbgSpliceTest, DbgSpliceTest7) {
@@ -963,15 +967,15 @@ TEST_F(DbgSpliceTest, DbgSpliceTest7) {
   /*
         define i16 @f(i16 %a) !dbg !6 {
 BBEntry entry:
-DPVA      call void @llvm.dbg.value(metadata i16 %a, metadata !9, metadata !DIExpression()), !dbg !11
-First     %b = add i16 %a, 1, !dbg !11
-DPVB      call void @llvm.dbg.value(metadata i16 %b, metadata !9, metadata !DIExpression()), !dbg !11
-Last      br label %exit, !dbg !11
+DVRA      call void @llvm.dbg.value(metadata i16 %a, metadata !9, metadata
+!DIExpression()), !dbg !11 First     %b = add i16 %a, 1, !dbg !11 DVRB      call
+void @llvm.dbg.value(metadata i16 %b, metadata !9, metadata !DIExpression()),
+!dbg !11 Last      br label %exit, !dbg !11
 
 BBExit  exit:
-DPVConst  call void @llvm.dbg.value(metadata i16 0, metadata !9, metadata !DIExpression()), !dbg !11
-Dest      %c = add i16 %b, 1, !dbg !11
-          ret i16 0, !dbg !11
+DVRConst  call void @llvm.dbg.value(metadata i16 0, metadata !9, metadata
+!DIExpression()), !dbg !11 Dest      %c = add i16 %b, 1, !dbg !11 ret i16 0,
+!dbg !11
         }
 
     Splice from First, including the leading dbg.value, to Last, but NOT
@@ -980,15 +984,14 @@ Dest      %c = add i16 %b, 1, !dbg !11
 
         define i16 @f(i16 %a) !dbg !6 {
 BBEntry entry:
-DPVB      call void @llvm.dbg.value(metadata i16 %b, metadata !9, metadata !DIExpression()), !dbg !11
-Last      br label %exit, !dbg !11
+DVRB      call void @llvm.dbg.value(metadata i16 %b, metadata !9, metadata
+!DIExpression()), !dbg !11 Last      br label %exit, !dbg !11
 
 BBExit  exit:
-DPVA      call void @llvm.dbg.value(metadata i16 %a, metadata !9, metadata !DIExpression()), !dbg !11
-First     %b = add i16 %a, 1, !dbg !11
-DPVConst  call void @llvm.dbg.value(metadata i16 0, metadata !9, metadata !DIExpression()), !dbg !11
-Dest      %c = add i16 %b, 1, !dbg !11
-          ret i16 0, !dbg !11
+DVRA      call void @llvm.dbg.value(metadata i16 %a, metadata !9, metadata
+!DIExpression()), !dbg !11 First     %b = add i16 %a, 1, !dbg !11 DVRConst  call
+void @llvm.dbg.value(metadata i16 0, metadata !9, metadata !DIExpression()),
+!dbg !11 Dest      %c = add i16 %b, 1, !dbg !11 ret i16 0, !dbg !11
         }
 
   */
@@ -997,21 +1000,21 @@ Dest      %c = add i16 %b, 1, !dbg !11
   EXPECT_EQ(CInst->getParent(), BBExit);
   EXPECT_EQ(Branch->getParent(), BBEntry);
 
-  // DPVB: should be on Branch as before, remain in entry block.
-  EXPECT_TRUE(InstContainsDPValue(Branch, DPVB));
+  // DVRB: should be on Branch as before, remain in entry block.
+  EXPECT_TRUE(InstContainsDbgVariableRecord(Branch, DVRB));
 
-  // DPVA, should have transferred to BBExit, on B inst.
-  EXPECT_TRUE(InstContainsDPValue(BInst, DPVA));
+  // DVRA, should have transferred to BBExit, on B inst.
+  EXPECT_TRUE(InstContainsDbgVariableRecord(BInst, DVRA));
 
-  // DPVConst should be after of the moved instructions, on CInst.
-  EXPECT_TRUE(InstContainsDPValue(CInst, DPVConst));
+  // DVRConst should be after of the moved instructions, on CInst.
+  EXPECT_TRUE(InstContainsDbgVariableRecord(CInst, DVRConst));
 }
 
 // But wait, there's more! What if you splice a range that is empty, but
 // implicitly contains debug-info? In the dbg.value design for debug-info,
-// this would be an explicit range, but in DPValue debug-info, it isn't.
-// Check that if we try to do that, with differing head-bit values, that
-// DPValues are transferred.
+// this would be an explicit range, but in DbgVariableRecord debug-info, it
+// isn't. Check that if we try to do that, with differing head-bit values, that
+// DbgVariableRecords are transferred.
 // Test with empty transfers to Dest, with head bit set and not set.
 
 TEST_F(DbgSpliceTest, DbgSpliceEmpty0) {
@@ -1021,31 +1024,31 @@ TEST_F(DbgSpliceTest, DbgSpliceEmpty0) {
   /*
         define i16 @f(i16 %a) !dbg !6 {
 BBEntry entry:
-DPVA      call void @llvm.dbg.value(metadata i16 %a, metadata !9, metadata !DIExpression()), !dbg !11
-First     %b = add i16 %a, 1, !dbg !11
-DPVB      call void @llvm.dbg.value(metadata i16 %b, metadata !9, metadata !DIExpression()), !dbg !11
-Last      br label %exit, !dbg !11
+DVRA      call void @llvm.dbg.value(metadata i16 %a, metadata !9, metadata
+!DIExpression()), !dbg !11 First     %b = add i16 %a, 1, !dbg !11 DVRB      call
+void @llvm.dbg.value(metadata i16 %b, metadata !9, metadata !DIExpression()),
+!dbg !11 Last      br label %exit, !dbg !11
 
 BBExit  exit:
-DPVConst  call void @llvm.dbg.value(metadata i16 0, metadata !9, metadata !DIExpression()), !dbg !11
-Dest      %c = add i16 %b, 1, !dbg !11
-          ret i16 0, !dbg !11
+DVRConst  call void @llvm.dbg.value(metadata i16 0, metadata !9, metadata
+!DIExpression()), !dbg !11 Dest      %c = add i16 %b, 1, !dbg !11 ret i16 0,
+!dbg !11
         }
 
     Splice from BBEntry.getFirstInsertionPt to First -- this implicitly is a
-    splice of DPVA, but the iterators are pointing at the same instruction. The
+    splice of DVRA, but the iterators are pointing at the same instruction. The
     only difference is the setting of the head bit. Becomes;
 
         define i16 @f(i16 %a) !dbg !6 {
 First     %b = add i16 %a, 1, !dbg !11
-DPVB      call void @llvm.dbg.value(metadata i16 %b, metadata !9, metadata !DIExpression()), !dbg !11
-Last      br label %exit, !dbg !11
+DVRB      call void @llvm.dbg.value(metadata i16 %b, metadata !9, metadata
+!DIExpression()), !dbg !11 Last      br label %exit, !dbg !11
 
 BBExit  exit:
-DPVConst  call void @llvm.dbg.value(metadata i16 0, metadata !9, metadata !DIExpression()), !dbg !11
-DPVA      call void @llvm.dbg.value(metadata i16 %a, metadata !9, metadata !DIExpression()), !dbg !11
-Dest      %c = add i16 %b, 1, !dbg !11
-          ret i16 0, !dbg !11
+DVRConst  call void @llvm.dbg.value(metadata i16 0, metadata !9, metadata
+!DIExpression()), !dbg !11 DVRA      call void @llvm.dbg.value(metadata i16 %a,
+metadata !9, metadata !DIExpression()), !dbg !11 Dest      %c = add i16 %b, 1,
+!dbg !11 ret i16 0, !dbg !11
         }
 
   */
@@ -1054,17 +1057,17 @@ Dest      %c = add i16 %b, 1, !dbg !11
   EXPECT_EQ(CInst->getParent(), BBExit);
   EXPECT_EQ(Branch->getParent(), BBEntry);
 
-  // DPVB: should be on Branch as before, remain in entry block.
-  EXPECT_TRUE(InstContainsDPValue(Branch, DPVB));
+  // DVRB: should be on Branch as before, remain in entry block.
+  EXPECT_TRUE(InstContainsDbgVariableRecord(Branch, DVRB));
 
-  // DPVA, should have transferred to BBExit, on C inst.
-  EXPECT_TRUE(InstContainsDPValue(CInst, DPVA));
+  // DVRA, should have transferred to BBExit, on C inst.
+  EXPECT_TRUE(InstContainsDbgVariableRecord(CInst, DVRA));
 
-  // DPVConst should be ahead of the moved DPValue, on CInst.
-  EXPECT_TRUE(InstContainsDPValue(CInst, DPVConst));
+  // DVRConst should be ahead of the moved DbgVariableRecord, on CInst.
+  EXPECT_TRUE(InstContainsDbgVariableRecord(CInst, DVRConst));
 
-  // Order of DPVA and DPVConst should be thus:
-  EXPECT_TRUE(CheckDPVOrder(CInst, {DPVConst, DPVA}));
+  // Order of DVRA and DVRConst should be thus:
+  EXPECT_TRUE(CheckDVROrder(CInst, {DVRConst, DVRA}));
 }
 
 TEST_F(DbgSpliceTest, DbgSpliceEmpty1) {
@@ -1074,32 +1077,32 @@ TEST_F(DbgSpliceTest, DbgSpliceEmpty1) {
   /*
         define i16 @f(i16 %a) !dbg !6 {
 BBEntry entry:
-DPVA      call void @llvm.dbg.value(metadata i16 %a, metadata !9, metadata !DIExpression()), !dbg !11
-First     %b = add i16 %a, 1, !dbg !11
-DPVB      call void @llvm.dbg.value(metadata i16 %b, metadata !9, metadata !DIExpression()), !dbg !11
-Last      br label %exit, !dbg !11
+DVRA      call void @llvm.dbg.value(metadata i16 %a, metadata !9, metadata
+!DIExpression()), !dbg !11 First     %b = add i16 %a, 1, !dbg !11 DVRB      call
+void @llvm.dbg.value(metadata i16 %b, metadata !9, metadata !DIExpression()),
+!dbg !11 Last      br label %exit, !dbg !11
 
 BBExit  exit:
-DPVConst  call void @llvm.dbg.value(metadata i16 0, metadata !9, metadata !DIExpression()), !dbg !11
-Dest      %c = add i16 %b, 1, !dbg !11
-          ret i16 0, !dbg !11
+DVRConst  call void @llvm.dbg.value(metadata i16 0, metadata !9, metadata
+!DIExpression()), !dbg !11 Dest      %c = add i16 %b, 1, !dbg !11 ret i16 0,
+!dbg !11
         }
 
     Splice from BBEntry.getFirstInsertionPt to First -- this implicitly is a
-    splice of DPVA, but the iterators are pointing at the same instruction. The
+    splice of DVRA, but the iterators are pointing at the same instruction. The
     only difference is the setting of the head bit. Insert at head of Dest,
-    i.e. before DPVConst. Becomes;
+    i.e. before DVRConst. Becomes;
 
         define i16 @f(i16 %a) !dbg !6 {
 First     %b = add i16 %a, 1, !dbg !11
-DPVB      call void @llvm.dbg.value(metadata i16 %b, metadata !9, metadata !DIExpression()), !dbg !11
-Last      br label %exit, !dbg !11
+DVRB      call void @llvm.dbg.value(metadata i16 %b, metadata !9, metadata
+!DIExpression()), !dbg !11 Last      br label %exit, !dbg !11
 
 BBExit  exit:
-DPVA      call void @llvm.dbg.value(metadata i16 %a, metadata !9, metadata !DIExpression()), !dbg !11
-DPVConst  call void @llvm.dbg.value(metadata i16 0, metadata !9, metadata !DIExpression()), !dbg !11
-Dest      %c = add i16 %b, 1, !dbg !11
-          ret i16 0, !dbg !11
+DVRA      call void @llvm.dbg.value(metadata i16 %a, metadata !9, metadata
+!DIExpression()), !dbg !11 DVRConst  call void @llvm.dbg.value(metadata i16 0,
+metadata !9, metadata !DIExpression()), !dbg !11 Dest      %c = add i16 %b, 1,
+!dbg !11 ret i16 0, !dbg !11
         }
 
   */
@@ -1108,21 +1111,21 @@ Dest      %c = add i16 %b, 1, !dbg !11
   EXPECT_EQ(CInst->getParent(), BBExit);
   EXPECT_EQ(Branch->getParent(), BBEntry);
 
-  // DPVB: should be on Branch as before, remain in entry block.
-  EXPECT_TRUE(InstContainsDPValue(Branch, DPVB));
+  // DVRB: should be on Branch as before, remain in entry block.
+  EXPECT_TRUE(InstContainsDbgVariableRecord(Branch, DVRB));
 
-  // DPVA, should have transferred to BBExit, on C inst.
-  EXPECT_TRUE(InstContainsDPValue(CInst, DPVA));
+  // DVRA, should have transferred to BBExit, on C inst.
+  EXPECT_TRUE(InstContainsDbgVariableRecord(CInst, DVRA));
 
-  // DPVConst should be ahead of the moved DPValue, on CInst.
-  EXPECT_TRUE(InstContainsDPValue(CInst, DPVConst));
+  // DVRConst should be ahead of the moved DbgVariableRecord, on CInst.
+  EXPECT_TRUE(InstContainsDbgVariableRecord(CInst, DVRConst));
 
-  // Order of DPVA and DPVConst should be thus:
-  EXPECT_TRUE(CheckDPVOrder(CInst, {DPVA, DPVConst}));
+  // Order of DVRA and DVRConst should be thus:
+  EXPECT_TRUE(CheckDVROrder(CInst, {DVRA, DVRConst}));
 }
 
-// If we splice new instructions into a block with trailing DPValues, then
-// the trailing DPValues should get flushed back out.
+// If we splice new instructions into a block with trailing DbgVariableRecords,
+// then the trailing DbgVariableRecords should get flushed back out.
 TEST(BasicBlockDbgInfoTest, DbgSpliceTrailing) {
   LLVMContext C;
   UseNewDbgInfoFormat = true;
@@ -1159,7 +1162,7 @@ TEST(BasicBlockDbgInfoTest, DbgSpliceTrailing) {
   BasicBlock &Exit = *Entry.getNextNode();
   M->convertToNewDbgValues();
 
-  // Begin by forcing entry block to have dangling DPValue.
+  // Begin by forcing entry block to have dangling DbgVariableRecord.
   Entry.getTerminator()->eraseFromParent();
   ASSERT_NE(Entry.getTrailingDbgRecords(), nullptr);
   EXPECT_TRUE(Entry.empty());
@@ -1167,8 +1170,8 @@ TEST(BasicBlockDbgInfoTest, DbgSpliceTrailing) {
   // Now transfer the entire contents of the exit block into the entry.
   Entry.splice(Entry.end(), &Exit, Exit.begin(), Exit.end());
 
-  // The trailing DPValue should have been placed at the front of what's been
-  // spliced in.
+  // The trailing DbgVariableRecord should have been placed at the front of
+  // what's been spliced in.
   Instruction *BInst = &*Entry.begin();
   ASSERT_TRUE(BInst->DbgMarker);
   EXPECT_EQ(BInst->DbgMarker->StoredDbgRecords.size(), 1u);
@@ -1176,10 +1179,10 @@ TEST(BasicBlockDbgInfoTest, DbgSpliceTrailing) {
   UseNewDbgInfoFormat = false;
 }
 
-// When we remove instructions from the program, adjacent DPValues coalesce
-// together into one DPMarker. In "old" dbg.value mode you could re-insert
-// the removed instruction back into the middle of a sequence of dbg.values.
-// Test that this can be replicated correctly by DPValues
+// When we remove instructions from the program, adjacent DbgVariableRecords
+// coalesce together into one DPMarker. In "old" dbg.value mode you could
+// re-insert the removed instruction back into the middle of a sequence of
+// dbg.values. Test that this can be replicated correctly by DbgVariableRecords
 TEST(BasicBlockDbgInfoTest, RemoveInstAndReinsert) {
   LLVMContext C;
   UseNewDbgInfoFormat = true;
@@ -1221,7 +1224,7 @@ TEST(BasicBlockDbgInfoTest, RemoveInstAndReinsert) {
   Instruction *RetInst = AddInst->getNextNode();
   ASSERT_TRUE(isa<ReturnInst>(RetInst));
 
-  // add and sub should both have one DPValue on add and ret.
+  // add and sub should both have one DbgVariableRecord on add and ret.
   EXPECT_FALSE(SubInst->hasDbgRecords());
   EXPECT_TRUE(AddInst->hasDbgRecords());
   EXPECT_TRUE(RetInst->hasDbgRecords());
@@ -1232,20 +1235,21 @@ TEST(BasicBlockDbgInfoTest, RemoveInstAndReinsert) {
 
   // The Supported (TM) code sequence for removing then reinserting insts
   // after another instruction:
-  std::optional<DPValue::self_iterator> Pos =
+  std::optional<DbgVariableRecord::self_iterator> Pos =
       AddInst->getDbgReinsertionPosition();
   AddInst->removeFromParent();
 
   // We should have a re-insertion position.
   ASSERT_TRUE(Pos);
-  // Both DPValues should now be attached to the ret inst.
+  // Both DbgVariableRecords should now be attached to the ret inst.
   auto R3 = RetInst->getDbgRecordRange();
   EXPECT_EQ(std::distance(R3.begin(), R3.end()), 2u);
 
   // Re-insert and re-insert.
   AddInst->insertAfter(SubInst);
   Entry.reinsertInstInDbgRecords(AddInst, Pos);
-  // We should be back into a position of having one DPValue on add and ret.
+  // We should be back into a position of having one DbgVariableRecord on add
+  // and ret.
   EXPECT_FALSE(SubInst->hasDbgRecords());
   EXPECT_TRUE(AddInst->hasDbgRecords());
   EXPECT_TRUE(RetInst->hasDbgRecords());
@@ -1257,9 +1261,9 @@ TEST(BasicBlockDbgInfoTest, RemoveInstAndReinsert) {
   UseNewDbgInfoFormat = false;
 }
 
-// Test instruction removal and re-insertion, this time with one DPValue that
-// should hop up one instruction.
-TEST(BasicBlockDbgInfoTest, RemoveInstAndReinsertForOneDPValue) {
+// Test instruction removal and re-insertion, this time with one
+// DbgVariableRecord that should hop up one instruction.
+TEST(BasicBlockDbgInfoTest, RemoveInstAndReinsertForOneDbgVariableRecord) {
   LLVMContext C;
   UseNewDbgInfoFormat = true;
 
@@ -1299,7 +1303,7 @@ TEST(BasicBlockDbgInfoTest, RemoveInstAndReinsertForOneDPValue) {
   Instruction *RetInst = AddInst->getNextNode();
   ASSERT_TRUE(isa<ReturnInst>(RetInst));
 
-  // There should be one DPValue.
+  // There should be one DbgVariableRecord.
   EXPECT_FALSE(SubInst->hasDbgRecords());
   EXPECT_TRUE(AddInst->hasDbgRecords());
   EXPECT_FALSE(RetInst->hasDbgRecords());
@@ -1307,13 +1311,13 @@ TEST(BasicBlockDbgInfoTest, RemoveInstAndReinsertForOneDPValue) {
   EXPECT_EQ(std::distance(R1.begin(), R1.end()), 1u);
 
   // The Supported (TM) code sequence for removing then reinserting insts:
-  std::optional<DPValue::self_iterator> Pos =
+  std::optional<DbgVariableRecord::self_iterator> Pos =
       AddInst->getDbgReinsertionPosition();
   AddInst->removeFromParent();
 
-  // No re-insertion position as there were no DPValues on the ret.
+  // No re-insertion position as there were no DbgVariableRecords on the ret.
   ASSERT_FALSE(Pos);
-  // The single DPValue should now be attached to the ret inst.
+  // The single DbgVariableRecord should now be attached to the ret inst.
   EXPECT_TRUE(RetInst->hasDbgRecords());
   auto R2 = RetInst->getDbgRecordRange();
   EXPECT_EQ(std::distance(R2.begin(), R2.end()), 1u);
@@ -1321,7 +1325,8 @@ TEST(BasicBlockDbgInfoTest, RemoveInstAndReinsertForOneDPValue) {
   // Re-insert and re-insert.
   AddInst->insertAfter(SubInst);
   Entry.reinsertInstInDbgRecords(AddInst, Pos);
-  // We should be back into a position of having one DPValue on the AddInst.
+  // We should be back into a position of having one DbgVariableRecord on the
+  // AddInst.
   EXPECT_FALSE(SubInst->hasDbgRecords());
   EXPECT_TRUE(AddInst->hasDbgRecords());
   EXPECT_FALSE(RetInst->hasDbgRecords());
@@ -1374,7 +1379,7 @@ TEST(BasicBlockDbgInfoTest, DbgSpliceToEmpty1) {
   BasicBlock &Exit = *Entry.getNextNode();
   M->convertToNewDbgValues();
 
-  // Begin by forcing entry block to have dangling DPValue.
+  // Begin by forcing entry block to have dangling DbgVariableRecord.
   Entry.getTerminator()->eraseFromParent();
   ASSERT_NE(Entry.getTrailingDbgRecords(), nullptr);
   EXPECT_TRUE(Entry.empty());
@@ -1388,16 +1393,16 @@ TEST(BasicBlockDbgInfoTest, DbgSpliceToEmpty1) {
   Instruction *BInst = &*Entry.begin();
   ASSERT_TRUE(BInst->hasDbgRecords());
   EXPECT_EQ(BInst->DbgMarker->StoredDbgRecords.size(), 2u);
-  SmallVector<DPValue *, 2> DPValues;
-  for (DbgRecord &DPV : BInst->getDbgRecordRange())
-    DPValues.push_back(cast<DPValue>(&DPV));
+  SmallVector<DbgVariableRecord *, 2> DbgVariableRecords;
+  for (DbgRecord &DVR : BInst->getDbgRecordRange())
+    DbgVariableRecords.push_back(cast<DbgVariableRecord>(&DVR));
 
-  EXPECT_EQ(DPValues[0]->getVariableLocationOp(0), F.getArg(0));
-  Value *SecondDPVValue = DPValues[1]->getVariableLocationOp(0);
-  ASSERT_TRUE(isa<ConstantInt>(SecondDPVValue));
-  EXPECT_EQ(cast<ConstantInt>(SecondDPVValue)->getZExtValue(), 0ull);
+  EXPECT_EQ(DbgVariableRecords[0]->getVariableLocationOp(0), F.getArg(0));
+  Value *SecondDVRValue = DbgVariableRecords[1]->getVariableLocationOp(0);
+  ASSERT_TRUE(isa<ConstantInt>(SecondDVRValue));
+  EXPECT_EQ(cast<ConstantInt>(SecondDVRValue)->getZExtValue(), 0ull);
 
-  // No trailing DPValues in the entry block now.
+  // No trailing DbgVariableRecords in the entry block now.
   EXPECT_EQ(Entry.getTrailingDbgRecords(), nullptr);
 
   UseNewDbgInfoFormat = false;
@@ -1444,31 +1449,31 @@ TEST(BasicBlockDbgInfoTest, DbgSpliceToEmpty2) {
   BasicBlock &Exit = *Entry.getNextNode();
   M->convertToNewDbgValues();
 
-  // Begin by forcing entry block to have dangling DPValue.
+  // Begin by forcing entry block to have dangling DbgVariableRecord.
   Entry.getTerminator()->eraseFromParent();
   ASSERT_NE(Entry.getTrailingDbgRecords(), nullptr);
   EXPECT_TRUE(Entry.empty());
 
   // Now transfer into the entry block -- fetching the first instruction with
   // begin and then calling getIterator clears the "head" bit, meaning that the
-  // range to move will not include any leading DPValues.
+  // range to move will not include any leading DbgVariableRecords.
   Entry.splice(Entry.end(), &Exit, Exit.begin()->getIterator(), Exit.end());
 
   // We should now have one dbg.values on the first instruction, %a.
   Instruction *BInst = &*Entry.begin();
   ASSERT_TRUE(BInst->hasDbgRecords());
   EXPECT_EQ(BInst->DbgMarker->StoredDbgRecords.size(), 1u);
-  SmallVector<DPValue *, 2> DPValues;
-  for (DbgRecord &DPV : BInst->getDbgRecordRange())
-    DPValues.push_back(cast<DPValue>(&DPV));
+  SmallVector<DbgVariableRecord *, 2> DbgVariableRecords;
+  for (DbgRecord &DVR : BInst->getDbgRecordRange())
+    DbgVariableRecords.push_back(cast<DbgVariableRecord>(&DVR));
 
-  EXPECT_EQ(DPValues[0]->getVariableLocationOp(0), F.getArg(0));
-  // No trailing DPValues in the entry block now.
+  EXPECT_EQ(DbgVariableRecords[0]->getVariableLocationOp(0), F.getArg(0));
+  // No trailing DbgVariableRecords in the entry block now.
   EXPECT_EQ(Entry.getTrailingDbgRecords(), nullptr);
 
   // We should have nothing left in the exit block...
   EXPECT_TRUE(Exit.empty());
-  // ... except for some dangling DPValues.
+  // ... except for some dangling DbgVariableRecords.
   EXPECT_NE(Exit.getTrailingDbgRecords(), nullptr);
   EXPECT_FALSE(Exit.getTrailingDbgRecords()->empty());
   Exit.getTrailingDbgRecords()->eraseFromParent();
diff --git a/llvm/unittests/IR/DebugInfoTest.cpp b/llvm/unittests/IR/DebugInfoTest.cpp
index 55944b8949888b..3672f2bccfecd7 100644
--- a/llvm/unittests/IR/DebugInfoTest.cpp
+++ b/llvm/unittests/IR/DebugInfoTest.cpp
@@ -234,8 +234,8 @@ TEST(DbgVariableIntrinsic, EmptyMDIsKillLocation) {
   EXPECT_TRUE(DbgDeclare->isKillLocation());
 }
 
-// Duplicate of above test, but in DPValue representation.
-TEST(MetadataTest, DeleteInstUsedByDPValue) {
+// Duplicate of above test, but in DbgVariableRecord representation.
+TEST(MetadataTest, DeleteInstUsedByDbgVariableRecord) {
   LLVMContext C;
   std::unique_ptr<Module> M = parseIR(C, R"(
     define i16 @f(i16 %a) !dbg !6 {
@@ -267,26 +267,26 @@ TEST(MetadataTest, DeleteInstUsedByDPValue) {
   Instruction &I = *M->getFunction("f")->getEntryBlock().getFirstNonPHI();
   M->convertToNewDbgValues();
 
-  // Find the DPValues using %b.
+  // Find the DbgVariableRecords using %b.
   SmallVector<DbgValueInst *, 2> DVIs;
-  SmallVector<DPValue *, 2> DPVs;
-  findDbgValues(DVIs, &I, &DPVs);
-  ASSERT_EQ(DPVs.size(), 2u);
+  SmallVector<DbgVariableRecord *, 2> DVRs;
+  findDbgValues(DVIs, &I, &DVRs);
+  ASSERT_EQ(DVRs.size(), 2u);
 
-  // Delete %b. The DPValue should now point to undef.
+  // Delete %b. The DbgVariableRecord should now point to undef.
   I.eraseFromParent();
-  EXPECT_EQ(DPVs[0]->getNumVariableLocationOps(), 1u);
-  EXPECT_TRUE(isa<UndefValue>(DPVs[0]->getVariableLocationOp(0)));
-  EXPECT_TRUE(DPVs[0]->isKillLocation());
-  EXPECT_EQ(DPVs[1]->getNumVariableLocationOps(), 2u);
-  EXPECT_TRUE(isa<UndefValue>(DPVs[1]->getVariableLocationOp(1)));
-  EXPECT_TRUE(DPVs[1]->isKillLocation());
+  EXPECT_EQ(DVRs[0]->getNumVariableLocationOps(), 1u);
+  EXPECT_TRUE(isa<UndefValue>(DVRs[0]->getVariableLocationOp(0)));
+  EXPECT_TRUE(DVRs[0]->isKillLocation());
+  EXPECT_EQ(DVRs[1]->getNumVariableLocationOps(), 2u);
+  EXPECT_TRUE(isa<UndefValue>(DVRs[1]->getVariableLocationOp(1)));
+  EXPECT_TRUE(DVRs[1]->isKillLocation());
   UseNewDbgInfoFormat = OldDbgValueMode;
 }
 
 // Ensure that the order of dbg.value intrinsics returned by findDbgValues, and
-// their corresponding DPValue representation, are consistent.
-TEST(MetadataTest, OrderingOfDPValues) {
+// their corresponding DbgVariableRecord representation, are consistent.
+TEST(MetadataTest, OrderingOfDbgVariableRecords) {
   LLVMContext C;
   std::unique_ptr<Module> M = parseIR(C, R"(
     define i16 @f(i16 %a) !dbg !6 {
@@ -319,10 +319,10 @@ TEST(MetadataTest, OrderingOfDPValues) {
   Instruction &I = *M->getFunction("f")->getEntryBlock().getFirstNonPHI();
 
   SmallVector<DbgValueInst *, 2> DVIs;
-  SmallVector<DPValue *, 2> DPVs;
-  findDbgValues(DVIs, &I, &DPVs);
+  SmallVector<DbgVariableRecord *, 2> DVRs;
+  findDbgValues(DVIs, &I, &DVRs);
   ASSERT_EQ(DVIs.size(), 2u);
-  ASSERT_EQ(DPVs.size(), 0u);
+  ASSERT_EQ(DVRs.size(), 0u);
 
   // The correct order of dbg.values is given by their use-list, which becomes
   // the reverse order of creation. Thus the dbg.values should come out as
@@ -332,17 +332,17 @@ TEST(MetadataTest, OrderingOfDPValues) {
   DILocalVariable *Var1 = DVIs[1]->getVariable();
   EXPECT_TRUE(Var1->getName() == "foo");
 
-  // Now try again, but in DPValue form.
+  // Now try again, but in DbgVariableRecord form.
   DVIs.clear();
 
   M->convertToNewDbgValues();
-  findDbgValues(DVIs, &I, &DPVs);
+  findDbgValues(DVIs, &I, &DVRs);
   ASSERT_EQ(DVIs.size(), 0u);
-  ASSERT_EQ(DPVs.size(), 2u);
+  ASSERT_EQ(DVRs.size(), 2u);
 
-  Var0 = DPVs[0]->getVariable();
+  Var0 = DVRs[0]->getVariable();
   EXPECT_TRUE(Var0->getName() == "bar");
-  Var1 = DPVs[1]->getVariable();
+  Var1 = DVRs[1]->getVariable();
   EXPECT_TRUE(Var1->getName() == "foo");
 
   M->convertFromNewDbgValues();
@@ -861,9 +861,9 @@ TEST(AssignmentTrackingTest, InstrMethods) {
   }
 }
 
-// Test some very straight-forward operations on DPValues -- these are
+// Test some very straight-forward operations on DbgVariableRecords -- these are
 // dbg.values that have been converted to a non-instruction format.
-TEST(MetadataTest, ConvertDbgToDPValue) {
+TEST(MetadataTest, ConvertDbgToDbgVariableRecord) {
   LLVMContext C;
   std::unique_ptr<Module> M = parseIR(C, R"(
     define i16 @f(i16 %a) !dbg !6 {
@@ -900,7 +900,7 @@ TEST(MetadataTest, ConvertDbgToDPValue) {
   const DIExpression *Expr = nullptr;
   const DILocation *Loc = nullptr;
   const Metadata *MLoc = nullptr;
-  DPValue *DPV1 = nullptr;
+  DbgVariableRecord *DVR1 = nullptr;
   {
     DbgValueInst *DPI = dyn_cast<DbgValueInst>(&I);
     ASSERT_TRUE(DPI);
@@ -909,17 +909,18 @@ TEST(MetadataTest, ConvertDbgToDPValue) {
     Loc = DPI->getDebugLoc().get();
     MLoc = DPI->getRawLocation();
 
-    // Test the creation of a DPValue and it's conversion back to a dbg.value.
-    DPV1 = new DPValue(DPI);
-    EXPECT_EQ(DPV1->getVariable(), Var);
-    EXPECT_EQ(DPV1->getExpression(), Expr);
-    EXPECT_EQ(DPV1->getDebugLoc().get(), Loc);
-    EXPECT_EQ(DPV1->getRawLocation(), MLoc);
+    // Test the creation of a DbgVariableRecord and it's conversion back to a
+    // dbg.value.
+    DVR1 = new DbgVariableRecord(DPI);
+    EXPECT_EQ(DVR1->getVariable(), Var);
+    EXPECT_EQ(DVR1->getExpression(), Expr);
+    EXPECT_EQ(DVR1->getDebugLoc().get(), Loc);
+    EXPECT_EQ(DVR1->getRawLocation(), MLoc);
 
     // Erase dbg.value,
     DPI->eraseFromParent();
-    // Re-create from DPV1, inserting at front.
-    DPV1->createDebugIntrinsic(&*M,
+    // Re-create from DVR1, inserting at front.
+    DVR1->createDebugIntrinsic(&*M,
                                &M->getFunction("f")->getEntryBlock().front());
 
     Instruction *NewDPI = &M->getFunction("f")->getEntryBlock().front();
@@ -931,17 +932,17 @@ TEST(MetadataTest, ConvertDbgToDPValue) {
     EXPECT_EQ(DPI2->getRawLocation(), MLoc);
   }
 
-  // Fetch the second dbg.value, convert it to a DPValue,
+  // Fetch the second dbg.value, convert it to a DbgVariableRecord,
   BasicBlock::iterator It = M->getFunction("f")->getEntryBlock().begin();
   It = std::next(std::next(It));
   DbgValueInst *DPI3 = dyn_cast<DbgValueInst>(It);
   ASSERT_TRUE(DPI3);
-  DPValue *DPV2 = new DPValue(DPI3);
+  DbgVariableRecord *DVR2 = new DbgVariableRecord(DPI3);
 
   // These dbg.values are supposed to refer to different values.
-  EXPECT_NE(DPV1->getRawLocation(), DPV2->getRawLocation());
+  EXPECT_NE(DVR1->getRawLocation(), DVR2->getRawLocation());
 
-  // Try manipulating DPValues and markers in the exit block.
+  // Try manipulating DbgVariableRecords and markers in the exit block.
   BasicBlock *ExitBlock = &*std::next(M->getFunction("f")->getEntryBlock().getIterator());
   Instruction *FirstInst = &ExitBlock->front();
   Instruction *RetInst = &*std::next(FirstInst->getIterator());
@@ -951,45 +952,48 @@ TEST(MetadataTest, ConvertDbgToDPValue) {
   ExitBlock->createMarker(FirstInst);
   ExitBlock->createMarker(RetInst);
 
-  // Insert DbgRecords into markers, order should come out DPV2, DPV1.
-  FirstInst->DbgMarker->insertDbgRecord(DPV1, false);
-  FirstInst->DbgMarker->insertDbgRecord(DPV2, true);
+  // Insert DbgRecords into markers, order should come out DVR2, DVR1.
+  FirstInst->DbgMarker->insertDbgRecord(DVR1, false);
+  FirstInst->DbgMarker->insertDbgRecord(DVR2, true);
   unsigned int ItCount = 0;
   for (DbgRecord &Item : FirstInst->DbgMarker->getDbgRecordRange()) {
-    EXPECT_TRUE((&Item == DPV2 && ItCount == 0) ||
-              (&Item == DPV1 && ItCount == 1));
+    EXPECT_TRUE((&Item == DVR2 && ItCount == 0) ||
+                (&Item == DVR1 && ItCount == 1));
     EXPECT_EQ(Item.getMarker(), FirstInst->DbgMarker);
     ++ItCount;
   }
 
-  // Clone them onto the second marker -- should allocate new DPVs.
+  // Clone them onto the second marker -- should allocate new DVRs.
   RetInst->DbgMarker->cloneDebugInfoFrom(FirstInst->DbgMarker, std::nullopt, false);
   EXPECT_EQ(RetInst->DbgMarker->StoredDbgRecords.size(), 2u);
   ItCount = 0;
   // Check these things store the same information; but that they're not the same
   // objects.
-  for (DPValue &Item : filterDbgVars(RetInst->DbgMarker->getDbgRecordRange())) {
-    EXPECT_TRUE((Item.getRawLocation() == DPV2->getRawLocation() && ItCount == 0) ||
-                (Item.getRawLocation() == DPV1->getRawLocation() && ItCount == 1));
+  for (DbgVariableRecord &Item :
+       filterDbgVars(RetInst->DbgMarker->getDbgRecordRange())) {
+    EXPECT_TRUE(
+        (Item.getRawLocation() == DVR2->getRawLocation() && ItCount == 0) ||
+        (Item.getRawLocation() == DVR1->getRawLocation() && ItCount == 1));
 
     EXPECT_EQ(Item.getMarker(), RetInst->DbgMarker);
-    EXPECT_NE(&Item, DPV1);
-    EXPECT_NE(&Item, DPV2);
+    EXPECT_NE(&Item, DVR1);
+    EXPECT_NE(&Item, DVR2);
     ++ItCount;
   }
 
   RetInst->DbgMarker->dropDbgRecords();
   EXPECT_EQ(RetInst->DbgMarker->StoredDbgRecords.size(), 0u);
 
-  // Try cloning one single DPValue.
+  // Try cloning one single DbgVariableRecord.
   auto DIIt = std::next(FirstInst->DbgMarker->getDbgRecordRange().begin());
   RetInst->DbgMarker->cloneDebugInfoFrom(FirstInst->DbgMarker, DIIt, false);
   EXPECT_EQ(RetInst->DbgMarker->StoredDbgRecords.size(), 1u);
-  // The second DPValue should have been cloned; it should have the same values
-  // as DPV1.
-  EXPECT_EQ(cast<DPValue>(RetInst->DbgMarker->StoredDbgRecords.begin())
-                ->getRawLocation(),
-            DPV1->getRawLocation());
+  // The second DbgVariableRecord should have been cloned; it should have the
+  // same values as DVR1.
+  EXPECT_EQ(
+      cast<DbgVariableRecord>(RetInst->DbgMarker->StoredDbgRecords.begin())
+          ->getRawLocation(),
+      DVR1->getRawLocation());
   // We should be able to drop individual DbgRecords.
   RetInst->DbgMarker->dropOneDbgRecord(
       &*RetInst->DbgMarker->StoredDbgRecords.begin());
@@ -998,19 +1002,19 @@ TEST(MetadataTest, ConvertDbgToDPValue) {
   // to is disappearing so it needs to be transferred into "this" marker.
   RetInst->DbgMarker->absorbDebugValues(*FirstInst->DbgMarker, true);
   EXPECT_EQ(RetInst->DbgMarker->StoredDbgRecords.size(), 2u);
-  // Should be the DPV1 and DPV2 objects.
+  // Should be the DVR1 and DVR2 objects.
   ItCount = 0;
   for (DbgRecord &Item : RetInst->DbgMarker->getDbgRecordRange()) {
-    EXPECT_TRUE((&Item == DPV2 && ItCount == 0) ||
-              (&Item == DPV1 && ItCount == 1));
+    EXPECT_TRUE((&Item == DVR2 && ItCount == 0) ||
+                (&Item == DVR1 && ItCount == 1));
     EXPECT_EQ(Item.getMarker(), RetInst->DbgMarker);
     ++ItCount;
   }
 
-  // Finally -- there are two DPValues left over. If we remove evrything in the
-  // basic block, then they should sink down into the "TrailingDbgRecords"
-  // container for dangling debug-info. Future facilities will restore them
-  // back when a terminator is inserted.
+  // Finally -- there are two DbgVariableRecords left over. If we remove
+  // evrything in the basic block, then they should sink down into the
+  // "TrailingDbgRecords" container for dangling debug-info. Future facilities
+  // will restore them back when a terminator is inserted.
   FirstInst->DbgMarker->removeMarker();
   FirstInst->eraseFromParent();
   RetInst->DbgMarker->removeMarker();
@@ -1019,24 +1023,24 @@ TEST(MetadataTest, ConvertDbgToDPValue) {
   DPMarker *EndMarker = ExitBlock->getTrailingDbgRecords();
   ASSERT_NE(EndMarker, nullptr);
   EXPECT_EQ(EndMarker->StoredDbgRecords.size(), 2u);
-  // Test again that it's those two DPValues, DPV1 and DPV2.
+  // Test again that it's those two DbgVariableRecords, DVR1 and DVR2.
   ItCount = 0;
   for (DbgRecord &Item : EndMarker->getDbgRecordRange()) {
-    EXPECT_TRUE((&Item == DPV2 && ItCount == 0) ||
-              (&Item == DPV1 && ItCount == 1));
+    EXPECT_TRUE((&Item == DVR2 && ItCount == 0) ||
+                (&Item == DVR1 && ItCount == 1));
     EXPECT_EQ(Item.getMarker(), EndMarker);
     ++ItCount;
   }
 
-  // Cleanup the trailing DPValue records and marker.
+  // Cleanup the trailing DbgVariableRecord records and marker.
   EndMarker->eraseFromParent();
 
-  // The record of those trailing DPValues would dangle and cause an assertion
-  // failure if it lived until the end of the LLVMContext.
+  // The record of those trailing DbgVariableRecords would dangle and cause an
+  // assertion failure if it lived until the end of the LLVMContext.
   ExitBlock->deleteTrailingDbgRecords();
 }
 
-TEST(MetadataTest, DPValueConversionRoutines) {
+TEST(MetadataTest, DbgVariableRecordConversionRoutines) {
   LLVMContext C;
 
   // For the purpose of this test, set and un-set the command line option
@@ -1073,14 +1077,14 @@ TEST(MetadataTest, DPValueConversionRoutines) {
 )");
 
   // Check that the conversion routines and utilities between dbg.value
-  // debug-info format and DPValues works.
+  // debug-info format and DbgVariableRecords works.
   Function *F = M->getFunction("f");
   BasicBlock *BB1 = &F->getEntryBlock();
   // First instruction should be a dbg.value.
   EXPECT_TRUE(isa<DbgValueInst>(BB1->front()));
   EXPECT_FALSE(BB1->IsNewDbgInfoFormat);
-  // Validating the block for DPValues / DPMarkers shouldn't fail -- there's
-  // no data stored right now.
+  // Validating the block for DbgVariableRecords / DPMarkers shouldn't fail --
+  // there's no data stored right now.
   bool BrokenDebugInfo = false;
   bool Error = verifyModule(*M, &errs(), &BrokenDebugInfo);
   EXPECT_FALSE(Error);
@@ -1104,7 +1108,7 @@ TEST(MetadataTest, DPValueConversionRoutines) {
       EXPECT_FALSE(isa<DbgValueInst>(I));
 
   // There should be a DPMarker on each of the two instructions in the entry
-  // block, each containing one DPValue.
+  // block, each containing one DbgVariableRecord.
   EXPECT_EQ(BB1->size(), 2u);
   Instruction *FirstInst = &BB1->front();
   Instruction *SecondInst = FirstInst->getNextNode();
@@ -1115,21 +1119,21 @@ TEST(MetadataTest, DPValueConversionRoutines) {
   EXPECT_EQ(SecondInst, SecondInst->DbgMarker->MarkedInstr);
 
   EXPECT_EQ(FirstInst->DbgMarker->StoredDbgRecords.size(), 1u);
-  DPValue *DPV1 =
-      cast<DPValue>(&*FirstInst->DbgMarker->getDbgRecordRange().begin());
-  EXPECT_EQ(DPV1->getMarker(), FirstInst->DbgMarker);
+  DbgVariableRecord *DVR1 = cast<DbgVariableRecord>(
+      &*FirstInst->DbgMarker->getDbgRecordRange().begin());
+  EXPECT_EQ(DVR1->getMarker(), FirstInst->DbgMarker);
   // Should point at %a, an argument.
-  EXPECT_TRUE(isa<Argument>(DPV1->getVariableLocationOp(0)));
+  EXPECT_TRUE(isa<Argument>(DVR1->getVariableLocationOp(0)));
 
   EXPECT_EQ(SecondInst->DbgMarker->StoredDbgRecords.size(), 1u);
-  DPValue *DPV2 =
-      cast<DPValue>(&*SecondInst->DbgMarker->getDbgRecordRange().begin());
-  EXPECT_EQ(DPV2->getMarker(), SecondInst->DbgMarker);
+  DbgVariableRecord *DVR2 = cast<DbgVariableRecord>(
+      &*SecondInst->DbgMarker->getDbgRecordRange().begin());
+  EXPECT_EQ(DVR2->getMarker(), SecondInst->DbgMarker);
   // Should point at FirstInst.
-  EXPECT_EQ(DPV2->getVariableLocationOp(0), FirstInst);
+  EXPECT_EQ(DVR2->getVariableLocationOp(0), FirstInst);
 
-  // There should be no DPValues / DPMarkers in the second block, but it should
-  // be marked as being in the new format.
+  // There should be no DbgVariableRecords / DPMarkers in the second block, but
+  // it should be marked as being in the new format.
   BasicBlock *BB2 = BB1->getNextNode();
   EXPECT_TRUE(BB2->IsNewDbgInfoFormat);
   for (auto &Inst : *BB2)
@@ -1143,17 +1147,17 @@ TEST(MetadataTest, DPValueConversionRoutines) {
   // But if we were to break something, it should be able to fire. Don't attempt
   // to comprehensively test the validator, it's a smoke-test rather than a
   // "proper" verification pass.
-  DPV1->setMarker(nullptr);
+  DVR1->setMarker(nullptr);
   // A marker pointing the wrong way should be an error.
   Error = verifyModule(*M, &errs(), &BrokenDebugInfo);
   EXPECT_FALSE(Error);
   EXPECT_TRUE(BrokenDebugInfo);
-  DPV1->setMarker(FirstInst->DbgMarker);
+  DVR1->setMarker(FirstInst->DbgMarker);
 
-  DILocalVariable *DLV1 = DPV1->getVariable();
-  DIExpression *Expr1 = DPV1->getExpression();
-  DILocalVariable *DLV2 = DPV2->getVariable();
-  DIExpression *Expr2 = DPV2->getExpression();
+  DILocalVariable *DLV1 = DVR1->getVariable();
+  DIExpression *Expr1 = DVR1->getExpression();
+  DILocalVariable *DLV2 = DVR2->getVariable();
+  DIExpression *Expr2 = DVR2->getExpression();
 
   // Convert everything back to the "old" format and ensure it's right.
   M->convertFromNewDbgValues();
diff --git a/llvm/unittests/IR/IRBuilderTest.cpp b/llvm/unittests/IR/IRBuilderTest.cpp
index 139e8832c97b91..ec305982123046 100644
--- a/llvm/unittests/IR/IRBuilderTest.cpp
+++ b/llvm/unittests/IR/IRBuilderTest.cpp
@@ -57,7 +57,7 @@ TEST_F(IRBuilderTest, Intrinsics) {
   IRBuilder<> Builder(BB);
   Value *V;
   Instruction *I;
-  CallInst *Call;
+  Value *Result;
   IntrinsicInst *II;
 
   V = Builder.CreateLoad(GV->getValueType(), GV);
@@ -65,78 +65,80 @@ TEST_F(IRBuilderTest, Intrinsics) {
   I->setHasNoInfs(true);
   I->setHasNoNaNs(false);
 
-  Call = Builder.CreateMinNum(V, V);
-  II = cast<IntrinsicInst>(Call);
+  Result = Builder.CreateMinNum(V, V);
+  II = cast<IntrinsicInst>(Result);
   EXPECT_EQ(II->getIntrinsicID(), Intrinsic::minnum);
 
-  Call = Builder.CreateMaxNum(V, V);
-  II = cast<IntrinsicInst>(Call);
+  Result = Builder.CreateMaxNum(V, V);
+  II = cast<IntrinsicInst>(Result);
   EXPECT_EQ(II->getIntrinsicID(), Intrinsic::maxnum);
 
-  Call = Builder.CreateMinimum(V, V);
-  II = cast<IntrinsicInst>(Call);
+  Result = Builder.CreateMinimum(V, V);
+  II = cast<IntrinsicInst>(Result);
   EXPECT_EQ(II->getIntrinsicID(), Intrinsic::minimum);
 
-  Call = Builder.CreateMaximum(V, V);
-  II = cast<IntrinsicInst>(Call);
+  Result = Builder.CreateMaximum(V, V);
+  II = cast<IntrinsicInst>(Result);
   EXPECT_EQ(II->getIntrinsicID(), Intrinsic::maximum);
 
-  Call = Builder.CreateIntrinsic(Intrinsic::readcyclecounter, {}, {});
-  II = cast<IntrinsicInst>(Call);
+  Result = Builder.CreateIntrinsic(Intrinsic::readcyclecounter, {}, {});
+  II = cast<IntrinsicInst>(Result);
   EXPECT_EQ(II->getIntrinsicID(), Intrinsic::readcyclecounter);
 
-  Call = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, V);
-  II = cast<IntrinsicInst>(Call);
+  Result = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, V);
+  II = cast<IntrinsicInst>(Result);
   EXPECT_EQ(II->getIntrinsicID(), Intrinsic::fabs);
   EXPECT_FALSE(II->hasNoInfs());
   EXPECT_FALSE(II->hasNoNaNs());
 
-  Call = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, V, I);
-  II = cast<IntrinsicInst>(Call);
+  Result = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, V, I);
+  II = cast<IntrinsicInst>(Result);
   EXPECT_EQ(II->getIntrinsicID(), Intrinsic::fabs);
   EXPECT_TRUE(II->hasNoInfs());
   EXPECT_FALSE(II->hasNoNaNs());
 
-  Call = Builder.CreateBinaryIntrinsic(Intrinsic::pow, V, V);
-  II = cast<IntrinsicInst>(Call);
+  Result = Builder.CreateBinaryIntrinsic(Intrinsic::pow, V, V);
+  II = cast<IntrinsicInst>(Result);
   EXPECT_EQ(II->getIntrinsicID(), Intrinsic::pow);
   EXPECT_FALSE(II->hasNoInfs());
   EXPECT_FALSE(II->hasNoNaNs());
 
-  Call = Builder.CreateBinaryIntrinsic(Intrinsic::pow, V, V, I);
-  II = cast<IntrinsicInst>(Call);
+  Result = Builder.CreateBinaryIntrinsic(Intrinsic::pow, V, V, I);
+  II = cast<IntrinsicInst>(Result);
   EXPECT_EQ(II->getIntrinsicID(), Intrinsic::pow);
   EXPECT_TRUE(II->hasNoInfs());
   EXPECT_FALSE(II->hasNoNaNs());
 
-  Call = Builder.CreateIntrinsic(Intrinsic::fma, {V->getType()}, {V, V, V});
-  II = cast<IntrinsicInst>(Call);
+  Result = Builder.CreateIntrinsic(Intrinsic::fma, {V->getType()}, {V, V, V});
+  II = cast<IntrinsicInst>(Result);
   EXPECT_EQ(II->getIntrinsicID(), Intrinsic::fma);
   EXPECT_FALSE(II->hasNoInfs());
   EXPECT_FALSE(II->hasNoNaNs());
 
-  Call = Builder.CreateIntrinsic(Intrinsic::fma, {V->getType()}, {V, V, V}, I);
-  II = cast<IntrinsicInst>(Call);
+  Result =
+      Builder.CreateIntrinsic(Intrinsic::fma, {V->getType()}, {V, V, V}, I);
+  II = cast<IntrinsicInst>(Result);
   EXPECT_EQ(II->getIntrinsicID(), Intrinsic::fma);
   EXPECT_TRUE(II->hasNoInfs());
   EXPECT_FALSE(II->hasNoNaNs());
 
-  Call = Builder.CreateIntrinsic(Intrinsic::fma, {V->getType()}, {V, V, V}, I);
-  II = cast<IntrinsicInst>(Call);
+  Result =
+      Builder.CreateIntrinsic(Intrinsic::fma, {V->getType()}, {V, V, V}, I);
+  II = cast<IntrinsicInst>(Result);
   EXPECT_EQ(II->getIntrinsicID(), Intrinsic::fma);
   EXPECT_TRUE(II->hasNoInfs());
   EXPECT_FALSE(II->hasNoNaNs());
 
-  Call = Builder.CreateUnaryIntrinsic(Intrinsic::roundeven, V);
-  II = cast<IntrinsicInst>(Call);
+  Result = Builder.CreateUnaryIntrinsic(Intrinsic::roundeven, V);
+  II = cast<IntrinsicInst>(Result);
   EXPECT_EQ(II->getIntrinsicID(), Intrinsic::roundeven);
   EXPECT_FALSE(II->hasNoInfs());
   EXPECT_FALSE(II->hasNoNaNs());
 
-  Call = Builder.CreateIntrinsic(
+  Result = Builder.CreateIntrinsic(
       Intrinsic::set_rounding, {},
       {Builder.getInt32(static_cast<uint32_t>(RoundingMode::TowardZero))});
-  II = cast<IntrinsicInst>(Call);
+  II = cast<IntrinsicInst>(Result);
   EXPECT_EQ(II->getIntrinsicID(), Intrinsic::set_rounding);
 }
 
@@ -942,7 +944,7 @@ TEST_F(IRBuilderTest, DIBuilder) {
         DIB.createAutoVariable(BarSP, "X", File, 2, IntType, true);
     DILocalVariable *VarY =
         DIB.createAutoVariable(BarSP, "Y", File, 2, IntType, true);
-    { /* dbg.value | DPValue::Value */
+    { /* dbg.value | DbgVariableRecord::Value */
       ExpectOrder(DIB.insertDbgValueIntrinsic(I, VarX, DIB.createExpression(),
                                               VarLoc, I),
                   I->getIterator());
@@ -953,7 +955,7 @@ TEST_F(IRBuilderTest, DIBuilder) {
       ExpectOrder(VarXValue, I->getIterator());
       EXPECT_EQ(BB->getTrailingDbgRecords(), nullptr);
     }
-    { /* dbg.declare | DPValue::Declare */
+    { /* dbg.declare | DbgVariableRecord::Declare */
       ExpectOrder(DIB.insertDeclare(I, VarY, DIB.createExpression(), VarLoc, I),
                   I->getIterator());
       // Check inserting at end of the block works as with labels.
@@ -963,7 +965,7 @@ TEST_F(IRBuilderTest, DIBuilder) {
       ExpectOrder(VarYDeclare, I->getIterator());
       EXPECT_EQ(BB->getTrailingDbgRecords(), nullptr);
     }
-    { /* dbg.assign | DPValue::Assign */
+    { /* dbg.assign | DbgVariableRecord::Assign */
       I = Builder.CreateAlloca(Builder.getInt32Ty());
       I->setMetadata(LLVMContext::MD_DIAssignID, DIAssignID::getDistinct(Ctx));
       // DbgAssign interface is slightly different - it always inserts after the
diff --git a/llvm/unittests/IR/MetadataTest.cpp b/llvm/unittests/IR/MetadataTest.cpp
index 767dd1a59d2b9e..4c2e5f77a54038 100644
--- a/llvm/unittests/IR/MetadataTest.cpp
+++ b/llvm/unittests/IR/MetadataTest.cpp
@@ -106,7 +106,7 @@ class MetadataTest : public testing::Test {
   DIType *getDerivedType() {
     return DIDerivedType::getDistinct(
         Context, dwarf::DW_TAG_pointer_type, "", nullptr, 0, nullptr,
-        getBasicType("basictype"), 1, 2, 0, std::nullopt, DINode::FlagZero);
+        getBasicType("basictype"), 1, 2, 0, std::nullopt, {}, DINode::FlagZero);
   }
   Constant *getConstant() {
     return ConstantInt::get(Type::getInt32Ty(Context), Counter++);
@@ -461,7 +461,7 @@ TEST_F(MDNodeTest, PrintTree) {
     auto *StructTy = cast<DICompositeType>(getCompositeType());
     DIType *PointerTy = DIDerivedType::getDistinct(
         Context, dwarf::DW_TAG_pointer_type, "", nullptr, 0, nullptr, StructTy,
-        1, 2, 0, std::nullopt, DINode::FlagZero);
+        1, 2, 0, std::nullopt, {}, DINode::FlagZero);
     StructTy->replaceElements(MDTuple::get(Context, PointerTy));
 
     auto *Var = DILocalVariable::get(Context, Scope, "foo", File,
@@ -1864,13 +1864,17 @@ TEST_F(DIDerivedTypeTest, get) {
   DIType *BaseType = getBasicType("basic");
   MDTuple *ExtraData = getTuple();
   unsigned DWARFAddressSpace = 8;
+  DIDerivedType::PtrAuthData PtrAuthData(1, false, 1234, true, true);
+  DIDerivedType::PtrAuthData PtrAuthData2(1, false, 1234, true, false);
   DINode::DIFlags Flags5 = static_cast<DINode::DIFlags>(5);
   DINode::DIFlags Flags4 = static_cast<DINode::DIFlags>(4);
 
-  auto *N =
-      DIDerivedType::get(Context, dwarf::DW_TAG_pointer_type, "something", File,
-                         1, Scope, BaseType, 2, 3, 4, DWARFAddressSpace, Flags5,
-                         ExtraData);
+  auto *N = DIDerivedType::get(
+      Context, dwarf::DW_TAG_pointer_type, "something", File, 1, Scope,
+      BaseType, 2, 3, 4, DWARFAddressSpace, std::nullopt, Flags5, ExtraData);
+  auto *N1 = DIDerivedType::get(Context, dwarf::DW_TAG_LLVM_ptrauth_type, "",
+                                File, 1, Scope, N, 2, 3, 4, DWARFAddressSpace,
+                                PtrAuthData, Flags5, ExtraData);
   EXPECT_EQ(dwarf::DW_TAG_pointer_type, N->getTag());
   EXPECT_EQ("something", N->getName());
   EXPECT_EQ(File, N->getFile());
@@ -1881,53 +1885,73 @@ TEST_F(DIDerivedTypeTest, get) {
   EXPECT_EQ(3u, N->getAlignInBits());
   EXPECT_EQ(4u, N->getOffsetInBits());
   EXPECT_EQ(DWARFAddressSpace, *N->getDWARFAddressSpace());
+  EXPECT_EQ(std::nullopt, N->getPtrAuthData());
+  EXPECT_EQ(PtrAuthData, N1->getPtrAuthData());
+  EXPECT_NE(PtrAuthData2, N1->getPtrAuthData());
   EXPECT_EQ(5u, N->getFlags());
   EXPECT_EQ(ExtraData, N->getExtraData());
   EXPECT_EQ(N, DIDerivedType::get(Context, dwarf::DW_TAG_pointer_type,
                                   "something", File, 1, Scope, BaseType, 2, 3,
-                                  4, DWARFAddressSpace, Flags5, ExtraData));
+                                  4, DWARFAddressSpace, std::nullopt, Flags5,
+                                  ExtraData));
 
   EXPECT_NE(N, DIDerivedType::get(Context, dwarf::DW_TAG_reference_type,
                                   "something", File, 1, Scope, BaseType, 2, 3,
-                                  4, DWARFAddressSpace, Flags5, ExtraData));
+                                  4, DWARFAddressSpace, std::nullopt, Flags5,
+                                  ExtraData));
   EXPECT_NE(N, DIDerivedType::get(Context, dwarf::DW_TAG_pointer_type, "else",
-                                  File, 1, Scope, BaseType, 2, 3,
-                                  4, DWARFAddressSpace, Flags5, ExtraData));
+                                  File, 1, Scope, BaseType, 2, 3, 4,
+                                  DWARFAddressSpace, std::nullopt, Flags5,
+                                  ExtraData));
   EXPECT_NE(N, DIDerivedType::get(Context, dwarf::DW_TAG_pointer_type,
                                   "something", getFile(), 1, Scope, BaseType, 2,
-                                  3, 4, DWARFAddressSpace, Flags5, ExtraData));
+                                  3, 4, DWARFAddressSpace, std::nullopt, Flags5,
+                                  ExtraData));
   EXPECT_NE(N, DIDerivedType::get(Context, dwarf::DW_TAG_pointer_type,
                                   "something", File, 2, Scope, BaseType, 2, 3,
-                                  4, DWARFAddressSpace, Flags5, ExtraData));
+                                  4, DWARFAddressSpace, std::nullopt, Flags5,
+                                  ExtraData));
   EXPECT_NE(N, DIDerivedType::get(Context, dwarf::DW_TAG_pointer_type,
                                   "something", File, 1, getSubprogram(),
-                                  BaseType, 2, 3, 4, DWARFAddressSpace, Flags5,
-                                  ExtraData));
+                                  BaseType, 2, 3, 4, DWARFAddressSpace,
+                                  std::nullopt, Flags5, ExtraData));
   EXPECT_NE(N, DIDerivedType::get(
                    Context, dwarf::DW_TAG_pointer_type, "something", File, 1,
                    Scope, getBasicType("basic2"), 2, 3, 4, DWARFAddressSpace,
-                   Flags5, ExtraData));
+                   std::nullopt, Flags5, ExtraData));
   EXPECT_NE(N, DIDerivedType::get(Context, dwarf::DW_TAG_pointer_type,
                                   "something", File, 1, Scope, BaseType, 3, 3,
-                                  4, DWARFAddressSpace, Flags5, ExtraData));
+                                  4, DWARFAddressSpace, std::nullopt, Flags5,
+                                  ExtraData));
   EXPECT_NE(N, DIDerivedType::get(Context, dwarf::DW_TAG_pointer_type,
                                   "something", File, 1, Scope, BaseType, 2, 2,
-                                  4, DWARFAddressSpace, Flags5, ExtraData));
+                                  4, DWARFAddressSpace, std::nullopt, Flags5,
+                                  ExtraData));
   EXPECT_NE(N, DIDerivedType::get(Context, dwarf::DW_TAG_pointer_type,
                                   "something", File, 1, Scope, BaseType, 2, 3,
-                                  5, DWARFAddressSpace, Flags5, ExtraData));
+                                  5, DWARFAddressSpace, std::nullopt, Flags5,
+                                  ExtraData));
   EXPECT_NE(N, DIDerivedType::get(Context, dwarf::DW_TAG_pointer_type,
                                   "something", File, 1, Scope, BaseType, 2, 3,
-                                  4, DWARFAddressSpace + 1, Flags5, ExtraData));
+                                  4, DWARFAddressSpace + 1, std::nullopt,
+                                  Flags5, ExtraData));
+  EXPECT_NE(N1,
+            DIDerivedType::get(Context, dwarf::DW_TAG_LLVM_ptrauth_type, "",
+                               File, 1, Scope, N, 2, 3, 4, DWARFAddressSpace,
+                               std::nullopt, Flags5, ExtraData));
   EXPECT_NE(N, DIDerivedType::get(Context, dwarf::DW_TAG_pointer_type,
                                   "something", File, 1, Scope, BaseType, 2, 3,
-                                  4, DWARFAddressSpace, Flags4, ExtraData));
+                                  4, DWARFAddressSpace, std::nullopt, Flags4,
+                                  ExtraData));
   EXPECT_NE(N, DIDerivedType::get(Context, dwarf::DW_TAG_pointer_type,
                                   "something", File, 1, Scope, BaseType, 2, 3,
-                                  4, DWARFAddressSpace, Flags5, getTuple()));
+                                  4, DWARFAddressSpace, std::nullopt, Flags5,
+                                  getTuple()));
 
   TempDIDerivedType Temp = N->clone();
   EXPECT_EQ(N, MDNode::replaceWithUniqued(std::move(Temp)));
+  TempDIDerivedType Temp1 = N1->clone();
+  EXPECT_EQ(N1, MDNode::replaceWithUniqued(std::move(Temp1)));
 }
 
 TEST_F(DIDerivedTypeTest, getWithLargeValues) {
@@ -1937,14 +1961,23 @@ TEST_F(DIDerivedTypeTest, getWithLargeValues) {
   MDTuple *ExtraData = getTuple();
   DINode::DIFlags Flags = static_cast<DINode::DIFlags>(5);
 
-  auto *N = DIDerivedType::get(
-      Context, dwarf::DW_TAG_pointer_type, "something", File, 1, Scope,
-      BaseType, UINT64_MAX, UINT32_MAX - 1, UINT64_MAX - 2, UINT32_MAX - 3,
-      Flags, ExtraData);
+  auto *N = DIDerivedType::get(Context, dwarf::DW_TAG_pointer_type, "something",
+                               File, 1, Scope, BaseType, UINT64_MAX,
+                               UINT32_MAX - 1, UINT64_MAX - 2, UINT32_MAX - 3,
+                               std::nullopt, Flags, ExtraData);
   EXPECT_EQ(UINT64_MAX, N->getSizeInBits());
   EXPECT_EQ(UINT32_MAX - 1, N->getAlignInBits());
   EXPECT_EQ(UINT64_MAX - 2, N->getOffsetInBits());
   EXPECT_EQ(UINT32_MAX - 3, *N->getDWARFAddressSpace());
+
+  auto *N1 = DIDerivedType::get(
+      Context, dwarf::DW_TAG_LLVM_ptrauth_type, "", File, 1, Scope, N,
+      UINT64_MAX, UINT32_MAX - 1, UINT64_MAX - 2, UINT32_MAX - 3,
+      DIDerivedType::PtrAuthData(7, true, 0xffff, true, false), Flags,
+      ExtraData);
+  EXPECT_EQ(7U, N1->getPtrAuthData()->key());
+  EXPECT_EQ(true, N1->getPtrAuthData()->isAddressDiscriminated());
+  EXPECT_EQ(0xffffU, N1->getPtrAuthData()->extraDiscriminator());
 }
 
 typedef MetadataTest DICompositeTypeTest;
@@ -3560,6 +3593,27 @@ TEST_F(DIExpressionTest, foldConstant) {
 #undef EXPECT_FOLD_CONST
 }
 
+TEST_F(DIExpressionTest, appendToStackAssert) {
+  DIExpression *Expr = DIExpression::get(Context, {});
+
+  // Verify that the DW_OP_LLVM_convert operands, which have the same values as
+  // DW_OP_stack_value and DW_OP_LLVM_fragment, do not get interpreted as such
+  // operations. This previously triggered an assert.
+  uint64_t FromSize = dwarf::DW_OP_stack_value;
+  uint64_t ToSize = dwarf::DW_OP_LLVM_fragment;
+  uint64_t Ops[] = {
+      dwarf::DW_OP_LLVM_convert, FromSize, dwarf::DW_ATE_signed,
+      dwarf::DW_OP_LLVM_convert, ToSize,   dwarf::DW_ATE_signed,
+  };
+  Expr = DIExpression::appendToStack(Expr, Ops);
+
+  uint64_t Expected[] = {
+      dwarf::DW_OP_LLVM_convert, FromSize, dwarf::DW_ATE_signed,
+      dwarf::DW_OP_LLVM_convert, ToSize,   dwarf::DW_ATE_signed,
+      dwarf::DW_OP_stack_value};
+  EXPECT_EQ(Expr->getElements(), ArrayRef<uint64_t>(Expected));
+}
+
 typedef MetadataTest DIObjCPropertyTest;
 
 TEST_F(DIObjCPropertyTest, get) {
@@ -4268,7 +4322,7 @@ TEST_F(MDTupleAllocationTest, Tracking2) {
 #if defined(GTEST_HAS_DEATH_TEST) && !defined(NDEBUG) && !defined(GTEST_HAS_SEH)
 typedef MetadataTest MDTupleAllocationDeathTest;
 TEST_F(MDTupleAllocationDeathTest, ResizeRejected) {
-  MDTuple *A = MDTuple::get(Context, None);
+  MDTuple *A = MDTuple::get(Context, std::nullopt);
   auto *Value1 = getConstantAsMetadata();
   EXPECT_DEATH(A->push_back(Value1),
                "Resizing is not supported for uniqued nodes");
diff --git a/llvm/unittests/IR/ValueTest.cpp b/llvm/unittests/IR/ValueTest.cpp
index 97c8fea3c6dbf4..246c2fc7fe4063 100644
--- a/llvm/unittests/IR/ValueTest.cpp
+++ b/llvm/unittests/IR/ValueTest.cpp
@@ -317,9 +317,9 @@ TEST(ValueTest, replaceUsesOutsideBlock) {
   ASSERT_TRUE(Ret->getOperand(0) == cast<Value>(B));
 }
 
-TEST(ValueTest, replaceUsesOutsideBlockDPValue) {
+TEST(ValueTest, replaceUsesOutsideBlockDbgVariableRecord) {
   // Check that Value::replaceUsesOutsideBlock(New, BB) replaces uses outside
-  // BB, including DPValues.
+  // BB, including DbgVariableRecords.
   const auto *IR = R"(
     define i32 @f() !dbg !6 {
     entry:
@@ -379,14 +379,16 @@ TEST(ValueTest, replaceUsesOutsideBlockDPValue) {
   EXPECT_TRUE(Branch->hasDbgRecords());
   EXPECT_TRUE(Ret->hasDbgRecords());
 
-  DPValue *DPV1 = cast<DPValue>(&*Branch->getDbgRecordRange().begin());
-  DPValue *DPV2 = cast<DPValue>(&*Ret->getDbgRecordRange().begin());
+  DbgVariableRecord *DVR1 =
+      cast<DbgVariableRecord>(&*Branch->getDbgRecordRange().begin());
+  DbgVariableRecord *DVR2 =
+      cast<DbgVariableRecord>(&*Ret->getDbgRecordRange().begin());
 
   A->replaceUsesOutsideBlock(B, Entry);
   // These users are in Entry so shouldn't be changed.
-  EXPECT_TRUE(DPV1->getVariableLocationOp(0) == cast<Value>(A));
+  EXPECT_TRUE(DVR1->getVariableLocationOp(0) == cast<Value>(A));
   // These users are outside Entry so should be changed.
-  EXPECT_TRUE(DPV2->getVariableLocationOp(0) == cast<Value>(B));
+  EXPECT_TRUE(DVR2->getVariableLocationOp(0) == cast<Value>(B));
   UseNewDbgInfoFormat = OldDbgValueMode;
 }
 
diff --git a/llvm/unittests/Support/KnownBitsTest.cpp b/llvm/unittests/Support/KnownBitsTest.cpp
index b867b6ca637d52..48de0889fdf332 100644
--- a/llvm/unittests/Support/KnownBitsTest.cpp
+++ b/llvm/unittests/Support/KnownBitsTest.cpp
@@ -412,9 +412,7 @@ TEST(KnownBitsTest, BinaryExhaustive) {
       },
       checkCorrectnessOnlyBinary);
   testBinaryOpExhaustive(
-      [](const KnownBits &Known1, const KnownBits &Known2) {
-        return KnownBits::urem(Known1, Known2);
-      },
+      KnownBits::urem,
       [](const APInt &N1, const APInt &N2) -> std::optional<APInt> {
         if (N2.isZero())
           return std::nullopt;
@@ -422,9 +420,7 @@ TEST(KnownBitsTest, BinaryExhaustive) {
       },
       checkCorrectnessOnlyBinary);
   testBinaryOpExhaustive(
-      [](const KnownBits &Known1, const KnownBits &Known2) {
-        return KnownBits::srem(Known1, Known2);
-      },
+      KnownBits::srem,
       [](const APInt &N1, const APInt &N2) -> std::optional<APInt> {
         if (N2.isZero())
           return std::nullopt;
@@ -556,18 +552,14 @@ TEST(KnownBitsTest, BinaryExhaustive) {
       [](const APInt &N1, const APInt &N2) { return N1 * N2; },
       checkCorrectnessOnlyBinary);
   testBinaryOpExhaustive(
-      [](const KnownBits &Known1, const KnownBits &Known2) {
-        return KnownBits::mulhs(Known1, Known2);
-      },
+      KnownBits::mulhs,
       [](const APInt &N1, const APInt &N2) {
         unsigned Bits = N1.getBitWidth();
         return (N1.sext(2 * Bits) * N2.sext(2 * Bits)).extractBits(Bits, Bits);
       },
       checkCorrectnessOnlyBinary);
   testBinaryOpExhaustive(
-      [](const KnownBits &Known1, const KnownBits &Known2) {
-        return KnownBits::mulhu(Known1, Known2);
-      },
+      KnownBits::mulhu,
       [](const APInt &N1, const APInt &N2) {
         unsigned Bits = N1.getBitWidth();
         return (N1.zext(2 * Bits) * N2.zext(2 * Bits)).extractBits(Bits, Bits);
diff --git a/llvm/unittests/Support/LEB128Test.cpp b/llvm/unittests/Support/LEB128Test.cpp
index 21523e5f7a08c7..08b8c5573ce637 100644
--- a/llvm/unittests/Support/LEB128Test.cpp
+++ b/llvm/unittests/Support/LEB128Test.cpp
@@ -242,6 +242,21 @@ TEST(LEB128Test, DecodeInvalidSLEB128) {
 #undef EXPECT_INVALID_SLEB128
 }
 
+TEST(LEB128Test, DecodeAndInc) {
+#define EXPECT_LEB128(FUN, VALUE, SIZE)                                        \
+  do {                                                                         \
+    const uint8_t *V = reinterpret_cast<const uint8_t *>(VALUE), *P = V;       \
+    auto Expected = FUN(P), Actual = FUN##AndInc(P);                           \
+    EXPECT_EQ(Actual, Expected);                                               \
+    EXPECT_EQ(P - V, SIZE);                                                    \
+  } while (0)
+  EXPECT_LEB128(decodeULEB128, "\x7f", 1);
+  EXPECT_LEB128(decodeULEB128, "\x80\x01", 2);
+  EXPECT_LEB128(decodeSLEB128, "\x7f", 1);
+  EXPECT_LEB128(decodeSLEB128, "\x80\x01", 2);
+#undef EXPECT_LEB128
+}
+
 TEST(LEB128Test, SLEB128Size) {
   // Positive Value Testing Plan:
   // (1) 128 ^ n - 1 ........ need (n+1) bytes
diff --git a/llvm/unittests/Support/TimeProfilerTest.cpp b/llvm/unittests/Support/TimeProfilerTest.cpp
index 6be716bae6b3fb..bb820ec99a393e 100644
--- a/llvm/unittests/Support/TimeProfilerTest.cpp
+++ b/llvm/unittests/Support/TimeProfilerTest.cpp
@@ -54,6 +54,17 @@ TEST(TimeProfiler, Begin_End_Smoke) {
   ASSERT_TRUE(json.find(R"("detail":"detail")") != std::string::npos);
 }
 
+TEST(TimeProfiler, Async_Begin_End_Smoke) {
+  setupProfiler();
+
+  auto *Profiler = timeTraceAsyncProfilerBegin("event", "detail");
+  timeTraceProfilerEnd(Profiler);
+
+  std::string json = teardownProfiler();
+  ASSERT_TRUE(json.find(R"("name":"event")") != std::string::npos);
+  ASSERT_TRUE(json.find(R"("detail":"detail")") != std::string::npos);
+}
+
 TEST(TimeProfiler, Begin_End_Disabled) {
   // Nothing should be observable here. The test is really just making sure
   // we've not got a stray nullptr deref.
diff --git a/llvm/unittests/Support/VirtualFileSystemTest.cpp b/llvm/unittests/Support/VirtualFileSystemTest.cpp
index d4abbb4345873c..695b09343257f1 100644
--- a/llvm/unittests/Support/VirtualFileSystemTest.cpp
+++ b/llvm/unittests/Support/VirtualFileSystemTest.cpp
@@ -7,9 +7,11 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Support/VirtualFileSystem.h"
+#include "llvm/ADT/IntrusiveRefCntPtr.h"
 #include "llvm/ADT/ScopeExit.h"
 #include "llvm/Config/llvm-config.h"
 #include "llvm/Support/Errc.h"
+#include "llvm/Support/FileSystem.h"
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/Path.h"
 #include "llvm/Support/SourceMgr.h"
@@ -3330,3 +3332,66 @@ TEST(RedirectingFileSystemTest, Used) {
   EXPECT_TRUE(Redirecting1->hasBeenUsed());
   EXPECT_FALSE(Redirecting2->hasBeenUsed());
 }
+
+// Check that paths looked up in the external filesystem are unmodified, except
+// potentially to add the working directory. We cannot canonicalize away ..
+// in the presence of symlinks in the external filesystem.
+TEST(RedirectingFileSystemTest, ExternalPaths) {
+  struct InterceptorFS : llvm::vfs::ProxyFileSystem {
+    std::vector<std::string> SeenPaths;
+
+    InterceptorFS(IntrusiveRefCntPtr<FileSystem> UnderlyingFS)
+        : ProxyFileSystem(UnderlyingFS) {}
+
+    llvm::ErrorOr<llvm::vfs::Status> status(const Twine &Path) override {
+      SeenPaths.push_back(Path.str());
+      return ProxyFileSystem::status(Path);
+    }
+
+    llvm::ErrorOr<std::unique_ptr<llvm::vfs::File>>
+    openFileForRead(const Twine &Path) override {
+      SeenPaths.push_back(Path.str());
+      return ProxyFileSystem::openFileForRead(Path);
+    }
+
+    std::error_code isLocal(const Twine &Path, bool &Result) override {
+      SeenPaths.push_back(Path.str());
+      return ProxyFileSystem::isLocal(Path, Result);
+    }
+
+    vfs::directory_iterator dir_begin(const Twine &Dir,
+                                      std::error_code &EC) override {
+      SeenPaths.push_back(Dir.str());
+      return ProxyFileSystem::dir_begin(Dir, EC);
+    }
+  };
+
+  std::error_code EC;
+  auto BaseFS = makeIntrusiveRefCnt<DummyFileSystem>();
+  BaseFS->setCurrentWorkingDirectory("/cwd");
+  auto CheckFS = makeIntrusiveRefCnt<InterceptorFS>(BaseFS);
+  auto FS = vfs::RedirectingFileSystem::create({}, /*UseExternalNames=*/false,
+                                               *CheckFS);
+
+  FS->status("/a/../b");
+  FS->openFileForRead("c");
+  FS->exists("./d");
+  bool IsLocal = false;
+  FS->isLocal("/e/./../f", IsLocal);
+  FS->dir_begin(".././g", EC);
+
+  std::vector<std::string> Expected{"/a/../b", "/cwd/c", "/cwd/./d",
+                                    "/e/./../f", "/cwd/.././g"};
+
+  EXPECT_EQ(CheckFS->SeenPaths, Expected);
+
+  CheckFS->SeenPaths.clear();
+  FS->setRedirection(vfs::RedirectingFileSystem::RedirectKind::Fallback);
+  FS->status("/a/../b");
+  FS->openFileForRead("c");
+  FS->exists("./d");
+  FS->isLocal("/e/./../f", IsLocal);
+  FS->dir_begin(".././g", EC);
+
+  EXPECT_EQ(CheckFS->SeenPaths, Expected);
+}
diff --git a/llvm/unittests/Target/AArch64/AddressingModes.cpp b/llvm/unittests/Target/AArch64/AddressingModes.cpp
index 284ea7ae9233ed..0af18d886791a1 100644
--- a/llvm/unittests/Target/AArch64/AddressingModes.cpp
+++ b/llvm/unittests/Target/AArch64/AddressingModes.cpp
@@ -13,11 +13,13 @@ using namespace llvm;
 namespace {
 
 struct AddrMode : public TargetLowering::AddrMode {
-  constexpr AddrMode(GlobalValue *GV, int64_t Offs, bool HasBase, int64_t S) {
+  constexpr AddrMode(GlobalValue *GV, int64_t Offs, bool HasBase, int64_t S,
+                     int64_t SOffs = 0) {
     BaseGV = GV;
     BaseOffs = Offs;
     HasBaseReg = HasBase;
     Scale = S;
+    ScalableOffset = SOffs;
   }
 };
 struct TestCase {
@@ -153,6 +155,45 @@ const std::initializer_list<TestCase> Tests = {
     {{nullptr, 4096 + 1, true, 0}, 8, false},
 
 };
+
+struct SVETestCase {
+  AddrMode AM;
+  unsigned TypeBits;
+  unsigned NumElts;
+  bool Result;
+};
+
+const std::initializer_list<SVETestCase> SVETests = {
+    // {BaseGV, BaseOffs, HasBaseReg, Scale, SOffs}, EltBits, Count, Result
+    // Test immediate range -- [-8,7] vector's worth.
+    // <vscale x 16 x i8>, increment by one vector
+    {{nullptr, 0, true, 0, 16}, 8, 16, true},
+    // <vscale x 4 x i32>, increment by eight vectors
+    {{nullptr, 0, true, 0, 128}, 32, 4, false},
+    // <vscale x 8 x i16>, increment by seven vectors
+    {{nullptr, 0, true, 0, 112}, 16, 8, true},
+    // <vscale x 2 x i64>, decrement by eight vectors
+    {{nullptr, 0, true, 0, -128}, 64, 2, true},
+    // <vscale x 16 x i8>, decrement by nine vectors
+    {{nullptr, 0, true, 0, -144}, 8, 16, false},
+
+    // Half the size of a vector register, but allowable with extending
+    // loads and truncating stores
+    // <vscale x 8 x i8>, increment by three vectors
+    {{nullptr, 0, true, 0, 24}, 8, 8, true},
+
+    // Test invalid types or offsets
+    // <vscale x 5 x i32>, increment by one vector (base size > 16B)
+    {{nullptr, 0, true, 0, 20}, 32, 5, false},
+    // <vscale x 8 x i16>, increment by half a vector
+    {{nullptr, 0, true, 0, 8}, 16, 8, false},
+    // <vscale x 3 x i8>, increment by 3 vectors (non-power-of-two)
+    {{nullptr, 0, true, 0, 9}, 8, 3, false},
+
+    // Scalable and fixed offsets
+    // <vscale x 16 x i8>, increment by 32 then decrement by vscale x 16
+    {{nullptr, 32, true, 0, -16}, 8, 16, false},
+};
 } // namespace
 
 TEST(AddressingModes, AddressingModes) {
@@ -179,4 +220,11 @@ TEST(AddressingModes, AddressingModes) {
     Type *Typ = Type::getIntNTy(Ctx, Test.TypeBits);
     ASSERT_EQ(TLI->isLegalAddressingMode(DL, Test.AM, Typ, 0), Test.Result);
   }
+
+  for (const auto &SVETest : SVETests) {
+    Type *Ty = VectorType::get(Type::getIntNTy(Ctx, SVETest.TypeBits),
+                               ElementCount::getScalable(SVETest.NumElts));
+    ASSERT_EQ(TLI->isLegalAddressingMode(DL, SVETest.AM, Ty, 0),
+              SVETest.Result);
+  }
 }
diff --git a/llvm/unittests/Target/AArch64/CMakeLists.txt b/llvm/unittests/Target/AArch64/CMakeLists.txt
index dacd919ba1e33b..64ab991ac479a4 100644
--- a/llvm/unittests/Target/AArch64/CMakeLists.txt
+++ b/llvm/unittests/Target/AArch64/CMakeLists.txt
@@ -29,6 +29,7 @@ add_llvm_target_unittest(AArch64Tests
   MatrixRegisterAliasing.cpp
   SMEAttributesTest.cpp
   AArch64SVESchedPseudoTest.cpp
+  Immediates.cpp
   )
 
 set_property(TARGET AArch64Tests PROPERTY FOLDER "Tests/UnitTests/TargetTests")
diff --git a/llvm/unittests/Target/AArch64/Immediates.cpp b/llvm/unittests/Target/AArch64/Immediates.cpp
new file mode 100644
index 00000000000000..a4551b083b4407
--- /dev/null
+++ b/llvm/unittests/Target/AArch64/Immediates.cpp
@@ -0,0 +1,106 @@
+#include "AArch64Subtarget.h"
+#include "AArch64TargetMachine.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/TargetSelect.h"
+
+#include "gtest/gtest.h"
+#include <initializer_list>
+#include <memory>
+
+using namespace llvm;
+
+namespace {
+
+struct TestCase {
+  int64_t Imm;
+  bool Result;
+};
+
+const std::initializer_list<TestCase> Tests = {
+    // ScalableImm, Result
+    // No change, easily 'supported'
+    {0, true},
+
+    // addvl increments by whole registers, range [-32,31]
+    // +(16 * vscale), one register's worth
+    {16, true},
+    // -(32 * 16 * vscale)
+    {-512, true},
+    // -(33 * 16 * vscale)
+    {-528, false},
+    // +(31 * 16 * vscale)
+    {496, true},
+    // +(32 * 16 * vscale)
+    {512, false},
+
+    // inc[h|w|d] increments by the number of 16/32/64bit elements in a
+    // register. mult_imm is in the range [1,16]
+    // +(mult_imm * num_elts * vscale)
+    // +(1 * 8 * vscale), 16 bit
+    {8, true},
+    // +(15 * 8 * vscale), 16 bit
+    {120, true},
+    // +(1 * 4 * vscale), 32 bit
+    {4, true},
+    // +(7 * 4 * vscale), 32 bit
+    {28, true},
+    // +(1 * 2 * vscale), 64 bit
+    {2, true},
+    // +(13 * 2 * vscale), 64 bit
+    {26, true},
+    // +(17 * 8 * vscale), 16 bit, out of range.
+    {136, false},
+    // +(19 * 2 * vscale), 64 bit, out of range.
+    {38, false},
+    // +(21 * 4 * vscale), 32 bit, out of range.
+    {84, false},
+
+    // dec[h|w|d] -- Same as above, but negative.
+    // -(mult_imm * num_elts * vscale)
+    // -(1 * 8 * vscale), 16 bit
+    {-8, true},
+    // -(15 * 8 * vscale), 16 bit
+    {-120, true},
+    // -(1 * 4 * vscale), 32 bit
+    {-4, true},
+    // -(7 * 4 * vscale), 32 bit
+    {-28, true},
+    // -(1 * 2 * vscale), 64 bit
+    {-2, true},
+    // -(13 * 2 * vscale), 64 bit
+    {-26, true},
+    // -(17 * 8 * vscale), 16 bit, out of range.
+    {-136, false},
+    // -(19 * 2 * vscale), 64 bit, out of range.
+    {-38, false},
+    // -(21 * 4 * vscale), 32 bit, out of range.
+    {-84, false},
+
+    // Invalid; not divisible by the above powers of 2.
+    {5, false},
+};
+} // namespace
+
+TEST(Immediates, Immediates) {
+  LLVMInitializeAArch64TargetInfo();
+  LLVMInitializeAArch64Target();
+  LLVMInitializeAArch64TargetMC();
+
+  std::string Error;
+  auto TT = Triple::normalize("aarch64");
+  const Target *T = TargetRegistry::lookupTarget(TT, Error);
+
+  std::unique_ptr<TargetMachine> TM(T->createTargetMachine(
+      TT, "generic", "+sve2", TargetOptions(), std::nullopt, std::nullopt,
+      CodeGenOptLevel::Default));
+  AArch64Subtarget ST(TM->getTargetTriple(), TM->getTargetCPU(),
+                      TM->getTargetCPU(), TM->getTargetFeatureString(), *TM,
+                      true);
+
+  auto *TLI = ST.getTargetLowering();
+
+  for (const auto &Test : Tests) {
+    ASSERT_EQ(TLI->isLegalAddScalableImmediate(Test.Imm), Test.Result);
+  }
+}
diff --git a/llvm/unittests/TargetParser/TargetParserTest.cpp b/llvm/unittests/TargetParser/TargetParserTest.cpp
index 3773f59a3c5af9..a7d0b1687a7f91 100644
--- a/llvm/unittests/TargetParser/TargetParserTest.cpp
+++ b/llvm/unittests/TargetParser/TargetParserTest.cpp
@@ -1140,6 +1140,22 @@ INSTANTIATE_TEST_SUITE_P(
                  AArch64::AEK_PERFMON,     AArch64::AEK_PREDRES,
                  AArch64::AEK_JSCVT,       AArch64::AEK_FCMA}),
             "9.2-A"),
+        ARMCPUTestParams<AArch64::ExtensionBitset>(
+            "cortex-a520ae", "armv9.2-a", "crypto-neon-fp-armv8",
+            AArch64::ExtensionBitset(
+                {AArch64::AEK_BF16,        AArch64::AEK_I8MM,
+                 AArch64::AEK_SVE,         AArch64::AEK_SVE2,
+                 AArch64::AEK_FP16,        AArch64::AEK_DOTPROD,
+                 AArch64::AEK_LSE,         AArch64::AEK_RDM,
+                 AArch64::AEK_SIMD,        AArch64::AEK_RCPC,
+                 AArch64::AEK_RAS,         AArch64::AEK_CRC,
+                 AArch64::AEK_FP,          AArch64::AEK_SB,
+                 AArch64::AEK_SSBS,        AArch64::AEK_MTE,
+                 AArch64::AEK_FP16FML,     AArch64::AEK_PAUTH,
+                 AArch64::AEK_SVE2BITPERM, AArch64::AEK_FLAGM,
+                 AArch64::AEK_PERFMON,     AArch64::AEK_PREDRES,
+                 AArch64::AEK_JSCVT,       AArch64::AEK_FCMA}),
+            "9.2-A"),
         ARMCPUTestParams<AArch64::ExtensionBitset>(
             "cortex-a57", "armv8-a", "crypto-neon-fp-armv8",
             AArch64::ExtensionBitset({AArch64::AEK_CRC, AArch64::AEK_AES,
@@ -1283,6 +1299,23 @@ INSTANTIATE_TEST_SUITE_P(
                  AArch64::AEK_PROFILE,     AArch64::AEK_JSCVT,
                  AArch64::AEK_FCMA}),
             "9.2-A"),
+        ARMCPUTestParams<AArch64::ExtensionBitset>(
+            "cortex-a720ae", "armv9.2-a", "crypto-neon-fp-armv8",
+            AArch64::ExtensionBitset(
+                {AArch64::AEK_BF16,        AArch64::AEK_I8MM,
+                 AArch64::AEK_SVE,         AArch64::AEK_SVE2,
+                 AArch64::AEK_FP16,        AArch64::AEK_DOTPROD,
+                 AArch64::AEK_LSE,         AArch64::AEK_RDM,
+                 AArch64::AEK_SIMD,        AArch64::AEK_RCPC,
+                 AArch64::AEK_RAS,         AArch64::AEK_CRC,
+                 AArch64::AEK_FP,          AArch64::AEK_SB,
+                 AArch64::AEK_SSBS,        AArch64::AEK_MTE,
+                 AArch64::AEK_FP16FML,     AArch64::AEK_PAUTH,
+                 AArch64::AEK_SVE2BITPERM, AArch64::AEK_FLAGM,
+                 AArch64::AEK_PERFMON,     AArch64::AEK_PREDRES,
+                 AArch64::AEK_PROFILE,     AArch64::AEK_JSCVT,
+                 AArch64::AEK_FCMA}),
+            "9.2-A"),
         ARMCPUTestParams<AArch64::ExtensionBitset>(
             "neoverse-v1", "armv8.4-a", "crypto-neon-fp-armv8",
             AArch64::ExtensionBitset(
@@ -1717,7 +1750,7 @@ INSTANTIATE_TEST_SUITE_P(
     ARMCPUTestParams<AArch64::ExtensionBitset>::PrintToStringParamName);
 
 // Note: number of CPUs includes aliases.
-static constexpr unsigned NumAArch64CPUArchs = 70;
+static constexpr unsigned NumAArch64CPUArchs = 72;
 
 TEST(TargetParserTest, testAArch64CPUArchList) {
   SmallVector<StringRef, NumAArch64CPUArchs> List;
diff --git a/llvm/unittests/Transforms/Utils/LocalTest.cpp b/llvm/unittests/Transforms/Utils/LocalTest.cpp
index 87a2a2ae470051..4258f218794f84 100644
--- a/llvm/unittests/Transforms/Utils/LocalTest.cpp
+++ b/llvm/unittests/Transforms/Utils/LocalTest.cpp
@@ -1279,11 +1279,11 @@ TEST(Local, ExpressionForConstant) {
   EXPECT_EQ(Expr, nullptr);
 }
 
-TEST(Local, ReplaceDPValue) {
+TEST(Local, ReplaceDbgVariableRecord) {
   LLVMContext C;
 
-  // Test that RAUW also replaces the operands of DPValue objects, i.e.
-  // non-instruction stored debugging information.
+  // Test that RAUW also replaces the operands of DbgVariableRecord objects,
+  // i.e. non-instruction stored debugging information.
   std::unique_ptr<Module> M = parseIR(C,
                                       R"(
       declare void @llvm.dbg.value(metadata, metadata, metadata)
@@ -1323,24 +1323,23 @@ TEST(Local, ReplaceDPValue) {
   It = std::next(It);
   Instruction *RetInst = &*It;
 
-  // Convert DVI into a DPValue.
+  // Convert DVI into a DbgVariableRecord.
   RetInst->DbgMarker = new DPMarker();
   RetInst->DbgMarker->MarkedInstr = RetInst;
-  DPValue *DPV = new DPValue(DVI);
-  RetInst->DbgMarker->insertDbgRecord(DPV, false);
+  DbgVariableRecord *DVR = new DbgVariableRecord(DVI);
+  RetInst->DbgMarker->insertDbgRecord(DVR, false);
   // ... and erase the dbg.value.
   DVI->eraseFromParent();
 
-  // DPV should originally refer to %bar,
-  EXPECT_EQ(DPV->getVariableLocationOp(0), BarInst);
+  // DVR should originally refer to %bar,
+  EXPECT_EQ(DVR->getVariableLocationOp(0), BarInst);
 
   // Now try to replace the computation of %bar with %foo -- this should cause
-  // the DPValue's to have it's operand updated beneath it.
+  // the DbgVariableRecord's to have it's operand updated beneath it.
   BarInst->replaceAllUsesWith(FooInst);
-  // Check DPV now points at %foo.
-  EXPECT_EQ(DPV->getVariableLocationOp(0), FooInst);
+  // Check DVR now points at %foo.
+  EXPECT_EQ(DVR->getVariableLocationOp(0), FooInst);
 
   // Teardown.
   RetInst->DbgMarker->eraseFromParent();
 }
-
diff --git a/llvm/utils/TableGen/AsmMatcherEmitter.cpp b/llvm/utils/TableGen/AsmMatcherEmitter.cpp
index febd96086df27b..5df7990d8fc2c8 100644
--- a/llvm/utils/TableGen/AsmMatcherEmitter.cpp
+++ b/llvm/utils/TableGen/AsmMatcherEmitter.cpp
@@ -375,6 +375,10 @@ class AsmVariantInfo {
   int AsmVariantNo;
 };
 
+bool getPreferSmallerInstructions(CodeGenTarget const &Target) {
+  return Target.getAsmParser()->getValueAsBit("PreferSmallerInstructions");
+}
+
 /// MatchableInfo - Helper class for storing the necessary information for an
 /// instruction or alias which is capable of being matched.
 struct MatchableInfo {
@@ -502,6 +506,9 @@ struct MatchableInfo {
   /// matchable came from.
   Record *const TheDef;
 
+  // ResInstSize - The size of the resulting instruction for this matchable.
+  unsigned ResInstSize;
+
   /// DefRec - This is the definition that it came from.
   PointerUnion<const CodeGenInstruction *, const CodeGenInstAlias *> DefRec;
 
@@ -543,10 +550,12 @@ struct MatchableInfo {
 
   MatchableInfo(const CodeGenInstruction &CGI)
       : AsmVariantID(0), AsmString(CGI.AsmString), TheDef(CGI.TheDef),
-        DefRec(&CGI), UseInstAsmMatchConverter(true) {}
+        ResInstSize(TheDef->getValueAsInt("Size")), DefRec(&CGI),
+        UseInstAsmMatchConverter(true) {}
 
   MatchableInfo(std::unique_ptr<const CodeGenInstAlias> Alias)
       : AsmVariantID(0), AsmString(Alias->AsmString), TheDef(Alias->TheDef),
+        ResInstSize(Alias->ResultInst->TheDef->getValueAsInt("Size")),
         DefRec(Alias.release()), UseInstAsmMatchConverter(TheDef->getValueAsBit(
                                      "UseInstAsmMatchConverter")) {}
 
@@ -555,9 +564,9 @@ struct MatchableInfo {
   // where it was copied while being in an owning state.
   MatchableInfo(const MatchableInfo &RHS)
       : AsmVariantID(RHS.AsmVariantID), AsmString(RHS.AsmString),
-        TheDef(RHS.TheDef), DefRec(RHS.DefRec), ResOperands(RHS.ResOperands),
-        Mnemonic(RHS.Mnemonic), AsmOperands(RHS.AsmOperands),
-        RequiredFeatures(RHS.RequiredFeatures),
+        TheDef(RHS.TheDef), ResInstSize(RHS.ResInstSize), DefRec(RHS.DefRec),
+        ResOperands(RHS.ResOperands), Mnemonic(RHS.Mnemonic),
+        AsmOperands(RHS.AsmOperands), RequiredFeatures(RHS.RequiredFeatures),
         ConversionFnKind(RHS.ConversionFnKind),
         HasDeprecation(RHS.HasDeprecation),
         UseInstAsmMatchConverter(RHS.UseInstAsmMatchConverter) {
@@ -608,12 +617,18 @@ struct MatchableInfo {
   void buildInstructionResultOperands();
   void buildAliasResultOperands(bool AliasConstraintsAreChecked);
 
-  /// operator< - Compare two matchables.
-  bool operator<(const MatchableInfo &RHS) const {
+  /// shouldBeMatchedBefore - Compare two matchables for ordering.
+  bool shouldBeMatchedBefore(const MatchableInfo &RHS,
+                             bool PreferSmallerInstructions) const {
     // The primary comparator is the instruction mnemonic.
     if (int Cmp = Mnemonic.compare_insensitive(RHS.Mnemonic))
       return Cmp == -1;
 
+    // (Optionally) Order by the resultant instuctions size.
+    // eg. for ARM thumb instructions smaller encodings should be preferred.
+    if (PreferSmallerInstructions && ResInstSize != RHS.ResInstSize)
+      return ResInstSize < RHS.ResInstSize;
+
     if (AsmOperands.size() != RHS.AsmOperands.size())
       return AsmOperands.size() < RHS.AsmOperands.size();
 
@@ -652,7 +667,8 @@ struct MatchableInfo {
   /// couldMatchAmbiguouslyWith - Check whether this matchable could
   /// ambiguously match the same set of operands as \p RHS (without being a
   /// strictly superior match).
-  bool couldMatchAmbiguouslyWith(const MatchableInfo &RHS) const {
+  bool couldMatchAmbiguouslyWith(const MatchableInfo &RHS,
+                                 bool PreferSmallerInstructions) const {
     // The primary comparator is the instruction mnemonic.
     if (Mnemonic != RHS.Mnemonic)
       return false;
@@ -661,6 +677,10 @@ struct MatchableInfo {
     if (AsmVariantID != RHS.AsmVariantID)
       return false;
 
+    // The size of instruction is unambiguous.
+    if (PreferSmallerInstructions && ResInstSize != RHS.ResInstSize)
+      return false;
+
     // The number of operands is unambiguous.
     if (AsmOperands.size() != RHS.AsmOperands.size())
       return false;
@@ -1986,16 +2006,16 @@ emitConvertFuncs(CodeGenTarget &Target, StringRef ClassName,
   }
   CvtOS << "  assert(Kind < CVT_NUM_SIGNATURES && \"Invalid signature!\");\n";
   CvtOS << "  const uint8_t *Converter = ConversionTable[Kind];\n";
-  CvtOS << "  unsigned OpIdx;\n";
   CvtOS << "  Inst.setOpcode(Opcode);\n";
   CvtOS << "  for (const uint8_t *p = Converter; *p; p += 2) {\n";
   if (HasOptionalOperands) {
     // When optional operands are involved, formal and actual operand indices
     // may differ. Map the former to the latter by subtracting the number of
     // absent optional operands.
-    CvtOS << "    OpIdx = *(p + 1) - DefaultsOffset[*(p + 1)];\n";
+    // FIXME: This is not an operand index in the CVT_Tied case
+    CvtOS << "    unsigned OpIdx = *(p + 1) - DefaultsOffset[*(p + 1)];\n";
   } else {
-    CvtOS << "    OpIdx = *(p + 1);\n";
+    CvtOS << "    unsigned OpIdx = *(p + 1);\n";
   }
   CvtOS << "    switch (*p) {\n";
   CvtOS << "    default: llvm_unreachable(\"invalid conversion entry!\");\n";
@@ -2004,11 +2024,11 @@ emitConvertFuncs(CodeGenTarget &Target, StringRef ClassName,
         << " &>(*Operands[OpIdx]).addRegOperands(Inst, 1);\n";
   CvtOS << "      break;\n";
   CvtOS << "    case CVT_Tied: {\n";
-  CvtOS << "      assert(OpIdx < (size_t)(std::end(TiedAsmOperandTable) -\n";
+  CvtOS << "      assert(*(p + 1) < (size_t)(std::end(TiedAsmOperandTable) -\n";
   CvtOS
       << "                              std::begin(TiedAsmOperandTable)) &&\n";
   CvtOS << "             \"Tied operand not found\");\n";
-  CvtOS << "      unsigned TiedResOpnd = TiedAsmOperandTable[OpIdx][0];\n";
+  CvtOS << "      unsigned TiedResOpnd = TiedAsmOperandTable[*(p + 1)][0];\n";
   CvtOS << "      if (TiedResOpnd != (uint8_t)-1)\n";
   CvtOS << "        Inst.addOperand(Inst.getOperand(TiedResOpnd));\n";
   CvtOS << "      break;\n";
@@ -3221,20 +3241,23 @@ void AsmMatcherEmitter::run(raw_ostream &OS) {
   AsmMatcherInfo Info(AsmParser, Target, Records);
   Info.buildInfo();
 
+  bool PreferSmallerInstructions = getPreferSmallerInstructions(Target);
   // Sort the instruction table using the partial order on classes. We use
   // stable_sort to ensure that ambiguous instructions are still
   // deterministically ordered.
   llvm::stable_sort(
       Info.Matchables,
-      [](const std::unique_ptr<MatchableInfo> &a,
-         const std::unique_ptr<MatchableInfo> &b) { return *a < *b; });
+      [PreferSmallerInstructions](const std::unique_ptr<MatchableInfo> &A,
+                                  const std::unique_ptr<MatchableInfo> &B) {
+        return A->shouldBeMatchedBefore(*B, PreferSmallerInstructions);
+      });
 
 #ifdef EXPENSIVE_CHECKS
   // Verify that the table is sorted and operator < works transitively.
   for (auto I = Info.Matchables.begin(), E = Info.Matchables.end(); I != E;
        ++I) {
     for (auto J = I; J != E; ++J) {
-      assert(!(**J < **I));
+      assert(!(*J)->shouldBeMatchedBefore(**I, PreferSmallerInstructions));
     }
   }
 #endif
@@ -3253,7 +3276,7 @@ void AsmMatcherEmitter::run(raw_ostream &OS) {
         const MatchableInfo &A = **I;
         const MatchableInfo &B = **J;
 
-        if (A.couldMatchAmbiguouslyWith(B)) {
+        if (A.couldMatchAmbiguouslyWith(B, PreferSmallerInstructions)) {
           errs() << "warning: ambiguous matchables:\n";
           A.dump();
           errs() << "\nis incomparable with:\n";
@@ -3729,6 +3752,9 @@ void AsmMatcherEmitter::run(raw_ostream &OS) {
     OS << "        } else {\n";
     OS << "          DEBUG_WITH_TYPE(\"asm-matcher\", dbgs() << \"but formal "
           "operand not required\\n\");\n";
+    OS << "          if (isSubclass(Formal, OptionalMatchClass)) {\n";
+    OS << "            OptionalOperandsMask.set(FormalIdx);\n";
+    OS << "          }\n";
     OS << "        }\n";
     OS << "        continue;\n";
   } else {
diff --git a/llvm/utils/TableGen/DecoderEmitter.cpp b/llvm/utils/TableGen/DecoderEmitter.cpp
index 628bff520a12e9..732f34ed04c577 100644
--- a/llvm/utils/TableGen/DecoderEmitter.cpp
+++ b/llvm/utils/TableGen/DecoderEmitter.cpp
@@ -2283,10 +2283,8 @@ static DecodeStatus decodeInstruction(const uint8_t DecodeTable[], MCInst &MI,
     }
     case MCD::OPC_CheckField: {
       // Decode the start value.
-      unsigned Len;
-      unsigned Start = decodeULEB128(++Ptr, &Len);
-      Ptr += Len;
-      Len = *Ptr;)";
+      unsigned Start = decodeULEB128AndInc(++Ptr);
+      unsigned Len = *Ptr;)";
   if (IsVarLenInst)
     OS << "\n      makeUp(insn, Start + Len);";
   OS << R"(
@@ -2311,10 +2309,8 @@ static DecodeStatus decodeInstruction(const uint8_t DecodeTable[], MCInst &MI,
       break;
     }
     case MCD::OPC_CheckPredicate: {
-      unsigned Len;
       // Decode the Predicate Index value.
-      unsigned PIdx = decodeULEB128(++Ptr, &Len);
-      Ptr += Len;
+      unsigned PIdx = decodeULEB128AndInc(++Ptr);
       // NumToSkip is a plain 24-bit integer.
       unsigned NumToSkip = *Ptr++;
       NumToSkip |= (*Ptr++) << 8;
@@ -2330,18 +2326,15 @@ static DecodeStatus decodeInstruction(const uint8_t DecodeTable[], MCInst &MI,
       break;
     }
     case MCD::OPC_Decode: {
-      unsigned Len;
       // Decode the Opcode value.
-      unsigned Opc = decodeULEB128(++Ptr, &Len);
-      Ptr += Len;
-      unsigned DecodeIdx = decodeULEB128(Ptr, &Len);
-      Ptr += Len;
+      unsigned Opc = decodeULEB128AndInc(++Ptr);
+      unsigned DecodeIdx = decodeULEB128AndInc(Ptr);
 
       MI.clear();
       MI.setOpcode(Opc);
       bool DecodeComplete;)";
   if (IsVarLenInst) {
-    OS << "\n      Len = InstrLenTable[Opc];\n"
+    OS << "\n      unsigned Len = InstrLenTable[Opc];\n"
        << "      makeUp(insn, Len);";
   }
   OS << R"(
@@ -2354,12 +2347,9 @@ static DecodeStatus decodeInstruction(const uint8_t DecodeTable[], MCInst &MI,
       return S;
     }
     case MCD::OPC_TryDecode: {
-      unsigned Len;
       // Decode the Opcode value.
-      unsigned Opc = decodeULEB128(++Ptr, &Len);
-      Ptr += Len;
-      unsigned DecodeIdx = decodeULEB128(Ptr, &Len);
-      Ptr += Len;
+      unsigned Opc = decodeULEB128AndInc(++Ptr);
+      unsigned DecodeIdx = decodeULEB128AndInc(Ptr);
       // NumToSkip is a plain 24-bit integer.
       unsigned NumToSkip = *Ptr++;
       NumToSkip |= (*Ptr++) << 8;
@@ -2391,11 +2381,8 @@ static DecodeStatus decodeInstruction(const uint8_t DecodeTable[], MCInst &MI,
     }
     case MCD::OPC_SoftFail: {
       // Decode the mask values.
-      unsigned Len;
-      uint64_t PositiveMask = decodeULEB128(++Ptr, &Len);
-      Ptr += Len;
-      uint64_t NegativeMask = decodeULEB128(Ptr, &Len);
-      Ptr += Len;
+      uint64_t PositiveMask = decodeULEB128AndInc(++Ptr);
+      uint64_t NegativeMask = decodeULEB128AndInc(Ptr);
       bool Fail = (insn & PositiveMask) != 0 || (~insn & NegativeMask) != 0;
       if (Fail)
         S = MCDisassembler::SoftFail;
diff --git a/llvm/utils/TableGen/MacroFusionPredicatorEmitter.cpp b/llvm/utils/TableGen/MacroFusionPredicatorEmitter.cpp
index 7f494e532b1f44..91c3b0b4359cf0 100644
--- a/llvm/utils/TableGen/MacroFusionPredicatorEmitter.cpp
+++ b/llvm/utils/TableGen/MacroFusionPredicatorEmitter.cpp
@@ -40,12 +40,10 @@
 
 #include "CodeGenTarget.h"
 #include "PredicateExpander.h"
-#include "llvm/ADT/SmallVector.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/TableGen/Error.h"
 #include "llvm/TableGen/Record.h"
 #include "llvm/TableGen/TableGenBackend.h"
-#include <set>
 #include <vector>
 
 using namespace llvm;
@@ -61,14 +59,14 @@ class MacroFusionPredicatorEmitter {
                            raw_ostream &OS);
   void emitMacroFusionImpl(std::vector<Record *> Fusions, PredicateExpander &PE,
                            raw_ostream &OS);
-  void emitPredicates(std::vector<Record *> &FirstPredicate,
+  void emitPredicates(std::vector<Record *> &FirstPredicate, bool IsCommutable,
                       PredicateExpander &PE, raw_ostream &OS);
-  void emitFirstPredicate(Record *SecondPredicate, PredicateExpander &PE,
-                          raw_ostream &OS);
-  void emitSecondPredicate(Record *SecondPredicate, PredicateExpander &PE,
-                           raw_ostream &OS);
-  void emitBothPredicate(Record *Predicates, PredicateExpander &PE,
-                         raw_ostream &OS);
+  void emitFirstPredicate(Record *SecondPredicate, bool IsCommutable,
+                          PredicateExpander &PE, raw_ostream &OS);
+  void emitSecondPredicate(Record *SecondPredicate, bool IsCommutable,
+                           PredicateExpander &PE, raw_ostream &OS);
+  void emitBothPredicate(Record *Predicates, bool IsCommutable,
+                         PredicateExpander &PE, raw_ostream &OS);
 
 public:
   MacroFusionPredicatorEmitter(RecordKeeper &R) : Records(R), Target(R) {}
@@ -103,6 +101,7 @@ void MacroFusionPredicatorEmitter::emitMacroFusionImpl(
   for (Record *Fusion : Fusions) {
     std::vector<Record *> Predicates =
         Fusion->getValueAsListOfDefs("Predicates");
+    bool IsCommutable = Fusion->getValueAsBit("IsCommutable");
 
     OS << "bool is" << Fusion->getName() << "(\n";
     OS.indent(4) << "const TargetInstrInfo &TII,\n";
@@ -111,7 +110,7 @@ void MacroFusionPredicatorEmitter::emitMacroFusionImpl(
     OS.indent(4) << "const MachineInstr &SecondMI) {\n";
     OS.indent(2) << "auto &MRI = SecondMI.getMF()->getRegInfo();\n";
 
-    emitPredicates(Predicates, PE, OS);
+    emitPredicates(Predicates, IsCommutable, PE, OS);
 
     OS.indent(2) << "return true;\n";
     OS << "}\n";
@@ -122,15 +121,16 @@ void MacroFusionPredicatorEmitter::emitMacroFusionImpl(
 }
 
 void MacroFusionPredicatorEmitter::emitPredicates(
-    std::vector<Record *> &Predicates, PredicateExpander &PE, raw_ostream &OS) {
+    std::vector<Record *> &Predicates, bool IsCommutable, PredicateExpander &PE,
+    raw_ostream &OS) {
   for (Record *Predicate : Predicates) {
     Record *Target = Predicate->getValueAsDef("Target");
     if (Target->getName() == "first_fusion_target")
-      emitFirstPredicate(Predicate, PE, OS);
+      emitFirstPredicate(Predicate, IsCommutable, PE, OS);
     else if (Target->getName() == "second_fusion_target")
-      emitSecondPredicate(Predicate, PE, OS);
+      emitSecondPredicate(Predicate, IsCommutable, PE, OS);
     else if (Target->getName() == "both_fusion_target")
-      emitBothPredicate(Predicate, PE, OS);
+      emitBothPredicate(Predicate, IsCommutable, PE, OS);
     else
       PrintFatalError(Target->getLoc(),
                       "Unsupported 'FusionTarget': " + Target->getName());
@@ -138,6 +138,7 @@ void MacroFusionPredicatorEmitter::emitPredicates(
 }
 
 void MacroFusionPredicatorEmitter::emitFirstPredicate(Record *Predicate,
+                                                      bool IsCommutable,
                                                       PredicateExpander &PE,
                                                       raw_ostream &OS) {
   if (Predicate->isSubClassOf("WildcardPred")) {
@@ -170,6 +171,7 @@ void MacroFusionPredicatorEmitter::emitFirstPredicate(Record *Predicate,
 }
 
 void MacroFusionPredicatorEmitter::emitSecondPredicate(Record *Predicate,
+                                                       bool IsCommutable,
                                                        PredicateExpander &PE,
                                                        raw_ostream &OS) {
   if (Predicate->isSubClassOf("FusionPredicateWithMCInstPredicate")) {
@@ -182,6 +184,36 @@ void MacroFusionPredicatorEmitter::emitSecondPredicate(Record *Predicate,
     OS << ")\n";
     OS.indent(4) << "  return false;\n";
     OS.indent(2) << "}\n";
+  } else if (Predicate->isSubClassOf("SameReg")) {
+    int FirstOpIdx = Predicate->getValueAsInt("FirstOpIdx");
+    int SecondOpIdx = Predicate->getValueAsInt("SecondOpIdx");
+
+    OS.indent(2) << "if (!SecondMI.getOperand(" << FirstOpIdx
+                 << ").getReg().isVirtual()) {\n";
+    OS.indent(4) << "if (SecondMI.getOperand(" << FirstOpIdx
+                 << ").getReg() != SecondMI.getOperand(" << SecondOpIdx
+                 << ").getReg())";
+
+    if (IsCommutable) {
+      OS << " {\n";
+      OS.indent(6) << "if (!SecondMI.getDesc().isCommutable())\n";
+      OS.indent(6) << "  return false;\n";
+
+      OS.indent(6)
+          << "unsigned SrcOpIdx1 = " << SecondOpIdx
+          << ", SrcOpIdx2 = TargetInstrInfo::CommuteAnyOperandIndex;\n";
+      OS.indent(6)
+          << "if (TII.findCommutedOpIndices(SecondMI, SrcOpIdx1, SrcOpIdx2))\n";
+      OS.indent(6)
+          << "  if (SecondMI.getOperand(" << FirstOpIdx
+          << ").getReg() != SecondMI.getOperand(SrcOpIdx2).getReg())\n";
+      OS.indent(6) << "    return false;\n";
+      OS.indent(4) << "}\n";
+    } else {
+      OS << "\n";
+      OS.indent(4) << "  return false;\n";
+    }
+    OS.indent(2) << "}\n";
   } else {
     PrintFatalError(Predicate->getLoc(),
                     "Unsupported predicate for second instruction: " +
@@ -190,13 +222,14 @@ void MacroFusionPredicatorEmitter::emitSecondPredicate(Record *Predicate,
 }
 
 void MacroFusionPredicatorEmitter::emitBothPredicate(Record *Predicate,
+                                                     bool IsCommutable,
                                                      PredicateExpander &PE,
                                                      raw_ostream &OS) {
   if (Predicate->isSubClassOf("FusionPredicateWithCode"))
     OS << Predicate->getValueAsString("Predicate");
   else if (Predicate->isSubClassOf("BothFusionPredicateWithMCInstPredicate")) {
-    emitFirstPredicate(Predicate, PE, OS);
-    emitSecondPredicate(Predicate, PE, OS);
+    emitFirstPredicate(Predicate, IsCommutable, PE, OS);
+    emitSecondPredicate(Predicate, IsCommutable, PE, OS);
   } else if (Predicate->isSubClassOf("TieReg")) {
     int FirstOpIdx = Predicate->getValueAsInt("FirstOpIdx");
     int SecondOpIdx = Predicate->getValueAsInt("SecondOpIdx");
@@ -206,8 +239,28 @@ void MacroFusionPredicatorEmitter::emitBothPredicate(Record *Predicate,
                  << ").isReg() &&\n";
     OS.indent(2) << "      FirstMI->getOperand(" << FirstOpIdx
                  << ").getReg() == SecondMI.getOperand(" << SecondOpIdx
-                 << ").getReg()))\n";
-    OS.indent(2) << "  return false;\n";
+                 << ").getReg()))";
+
+    if (IsCommutable) {
+      OS << " {\n";
+      OS.indent(4) << "if (!SecondMI.getDesc().isCommutable())\n";
+      OS.indent(4) << "  return false;\n";
+
+      OS.indent(4)
+          << "unsigned SrcOpIdx1 = " << SecondOpIdx
+          << ", SrcOpIdx2 = TargetInstrInfo::CommuteAnyOperandIndex;\n";
+      OS.indent(4)
+          << "if (TII.findCommutedOpIndices(SecondMI, SrcOpIdx1, SrcOpIdx2))\n";
+      OS.indent(4)
+          << "  if (FirstMI->getOperand(" << FirstOpIdx
+          << ").getReg() != SecondMI.getOperand(SrcOpIdx2).getReg())\n";
+      OS.indent(4) << "    return false;\n";
+      OS.indent(2) << "}";
+    } else {
+      OS << "\n";
+      OS.indent(2) << "  return false;";
+    }
+    OS << "\n";
   } else
     PrintFatalError(Predicate->getLoc(),
                     "Unsupported predicate for both instruction: " +
diff --git a/llvm/utils/TableGen/X86ManualFoldTables.def b/llvm/utils/TableGen/X86ManualFoldTables.def
index 8e6cb4a7bd8798..2ebd92883c0719 100644
--- a/llvm/utils/TableGen/X86ManualFoldTables.def
+++ b/llvm/utils/TableGen/X86ManualFoldTables.def
@@ -227,6 +227,13 @@ NOFOLD(MMX_MOVQ64rr_REV)
 NOFOLD(INSERTPSrr)
 NOFOLD(VINSERTPSZrr)
 NOFOLD(VINSERTPSrr)
+// Memory faults are suppressed for CFCMOV with memory operand.
+NOFOLD(CFCMOV16rr_REV)
+NOFOLD(CFCMOV32rr_REV)
+NOFOLD(CFCMOV64rr_REV)
+NOFOLD(CFCMOV16rr_ND)
+NOFOLD(CFCMOV32rr_ND)
+NOFOLD(CFCMOV64rr_ND)
 #undef NOFOLD
 
 #ifndef ENTRY
diff --git a/llvm/utils/TableGen/X86RecognizableInstr.cpp b/llvm/utils/TableGen/X86RecognizableInstr.cpp
index 39006a40daeb6a..a2bc037b690c6f 100644
--- a/llvm/utils/TableGen/X86RecognizableInstr.cpp
+++ b/llvm/utils/TableGen/X86RecognizableInstr.cpp
@@ -540,6 +540,13 @@ void RecognizableInstr::emitInstructionSpecifier() {
     HANDLE_OPERAND(relocation)
     HANDLE_OPERAND(opcodeModifier)
     break;
+  case X86Local::MRMDestRegCC:
+    assert(numPhysicalOperands == 3 &&
+           "Unexpected number of operands for MRMDestRegCC");
+    HANDLE_OPERAND(rmRegister)
+    HANDLE_OPERAND(roRegister)
+    HANDLE_OPERAND(opcodeModifier)
+    break;
   case X86Local::MRMDestReg:
     // Operand 1 is a register operand in the R/M field.
     // - In AVX512 there may be a mask operand here -
@@ -566,6 +573,13 @@ void RecognizableInstr::emitInstructionSpecifier() {
     HANDLE_OPTIONAL(immediate)
     HANDLE_OPTIONAL(immediate)
     break;
+  case X86Local::MRMDestMemCC:
+    assert(numPhysicalOperands == 3 &&
+           "Unexpected number of operands for MRMDestMemCC");
+    HANDLE_OPERAND(memory)
+    HANDLE_OPERAND(roRegister)
+    HANDLE_OPERAND(opcodeModifier)
+    break;
   case X86Local::MRMDestMem4VOp3CC:
     // Operand 1 is a register operand in the Reg/Opcode field.
     // Operand 2 is a register operand in the R/M field.
@@ -650,8 +664,10 @@ void RecognizableInstr::emitInstructionSpecifier() {
     HANDLE_OPTIONAL(immediate)
     break;
   case X86Local::MRMSrcRegCC:
-    assert(numPhysicalOperands == 3 &&
+    assert(numPhysicalOperands >= 3 && numPhysicalOperands <= 4 &&
            "Unexpected number of operands for MRMSrcRegCC");
+    if (IsND)
+      HANDLE_OPERAND(vvvvRegister)
     HANDLE_OPERAND(roRegister)
     HANDLE_OPERAND(rmRegister)
     HANDLE_OPERAND(opcodeModifier)
@@ -700,8 +716,10 @@ void RecognizableInstr::emitInstructionSpecifier() {
     HANDLE_OPTIONAL(immediate)
     break;
   case X86Local::MRMSrcMemCC:
-    assert(numPhysicalOperands == 3 &&
+    assert(numPhysicalOperands >= 3 && numPhysicalOperands <= 4 &&
            "Unexpected number of operands for MRMSrcMemCC");
+    if (IsND)
+      HANDLE_OPERAND(vvvvRegister)
     HANDLE_OPERAND(roRegister)
     HANDLE_OPERAND(memory)
     HANDLE_OPERAND(opcodeModifier)
@@ -871,6 +889,7 @@ void RecognizableInstr::emitDecodePath(DisassemblerTables &tables) const {
     filter = std::make_unique<DumbFilter>();
     break;
   case X86Local::MRMDestReg:
+  case X86Local::MRMDestRegCC:
   case X86Local::MRMSrcReg:
   case X86Local::MRMSrcReg4VOp3:
   case X86Local::MRMSrcRegOp4:
@@ -880,6 +899,7 @@ void RecognizableInstr::emitDecodePath(DisassemblerTables &tables) const {
     filter = std::make_unique<ModFilter>(true);
     break;
   case X86Local::MRMDestMem:
+  case X86Local::MRMDestMemCC:
   case X86Local::MRMDestMem4VOp3CC:
   case X86Local::MRMDestMemFSIB:
   case X86Local::MRMSrcMem:
@@ -950,6 +970,7 @@ void RecognizableInstr::emitDecodePath(DisassemblerTables &tables) const {
   if (Form == X86Local::AddRegFrm || Form == X86Local::MRMSrcRegCC ||
       Form == X86Local::MRMSrcMemCC || Form == X86Local::MRMXrCC ||
       Form == X86Local::MRMXmCC || Form == X86Local::AddCCFrm ||
+      Form == X86Local::MRMDestRegCC || Form == X86Local::MRMDestMemCC ||
       Form == X86Local::MRMDestMem4VOp3CC) {
     uint8_t Count = Form == X86Local::AddRegFrm ? 8 : 16;
     assert(((opcodeToSet % Count) == 0) && "ADDREG_FRM opcode not aligned");
diff --git a/llvm/utils/TableGen/X86RecognizableInstr.h b/llvm/utils/TableGen/X86RecognizableInstr.h
index ad319b3f28b723..68af68fb5aa033 100644
--- a/llvm/utils/TableGen/X86RecognizableInstr.h
+++ b/llvm/utils/TableGen/X86RecognizableInstr.h
@@ -106,6 +106,8 @@ enum {
   RawFrmImm16 = 8,
   AddCCFrm = 9,
   PrefixByte = 10,
+  MRMDestRegCC = 18,
+  MRMDestMemCC = 19,
   MRMDestMem4VOp3CC = 20,
   MRMr0 = 21,
   MRMSrcMemFSIB = 22,
diff --git a/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/bugprone/BUILD.gn b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/bugprone/BUILD.gn
index 3b41f9b4007f34..cd90c7752e1642 100644
--- a/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/bugprone/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/bugprone/BUILD.gn
@@ -81,6 +81,7 @@ static_library("bugprone") {
     "SuspiciousReallocUsageCheck.cpp",
     "SuspiciousSemicolonCheck.cpp",
     "SuspiciousStringCompareCheck.cpp",
+    "SuspiciousStringviewDataUsageCheck.cpp",
     "SwappedArgumentsCheck.cpp",
     "SwitchMissingDefaultCaseCheck.cpp",
     "TerminatingContinueCheck.cpp",
diff --git a/llvm/utils/gn/secondary/clang/include/clang/Basic/BUILD.gn b/llvm/utils/gn/secondary/clang/include/clang/Basic/BUILD.gn
index d484ff9cddd91c..b18b109dcff56d 100644
--- a/llvm/utils/gn/secondary/clang/include/clang/Basic/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang/include/clang/Basic/BUILD.gn
@@ -25,6 +25,7 @@ diag_groups = [
   "CrossTU",
   "Driver",
   "Frontend",
+  "InstallAPI",
   "Lex",
   "Parse",
   "Refactoring",
diff --git a/llvm/utils/gn/secondary/clang/lib/Analysis/FlowSensitive/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/Analysis/FlowSensitive/BUILD.gn
index e69567c2d9c652..04f20211b3c710 100644
--- a/llvm/utils/gn/secondary/clang/lib/Analysis/FlowSensitive/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang/lib/Analysis/FlowSensitive/BUILD.gn
@@ -23,8 +23,8 @@ static_library("FlowSensitive") {
     target_gen_dir,
   ]
   sources = [
+    "AdornedCFG.cpp",
     "Arena.cpp",
-    "ControlFlowContext.cpp",
     "DataflowAnalysisContext.cpp",
     "DataflowEnvironment.cpp",
     "DebugSupport.cpp",
diff --git a/llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn
index 7c00aaffdc1b91..25fcdc4d001512 100644
--- a/llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn
@@ -205,6 +205,7 @@ copy("Headers") {
     "ia32intrin.h",
     "immintrin.h",
     "intrin.h",
+    "intrin0.h",
     "inttypes.h",
     "invpcidintrin.h",
     "iso646.h",
@@ -256,6 +257,7 @@ copy("Headers") {
     "ppc_wrappers/xmmintrin.h",
     "prfchiintrin.h",
     "prfchwintrin.h",
+    "ptrauth.h",
     "ptwriteintrin.h",
     "raointintrin.h",
     "rdpruintrin.h",
@@ -309,6 +311,7 @@ copy("Headers") {
     "xsaveoptintrin.h",
     "xsavesintrin.h",
     "xtestintrin.h",
+    "yvals_core.h",
   ]
   outputs = [ "$clang_resource_dir/include/{{source_target_relative}}" ]
 }
diff --git a/llvm/utils/gn/secondary/clang/lib/InstallAPI/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/InstallAPI/BUILD.gn
index 5e533bf23ec475..2f43a915b2a664 100644
--- a/llvm/utils/gn/secondary/clang/lib/InstallAPI/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang/lib/InstallAPI/BUILD.gn
@@ -3,10 +3,12 @@ static_library("InstallAPI") {
   configs += [ "//llvm/utils/gn/build:clang_code" ]
   deps = [
     "//clang/lib/AST",
+    "//llvm/lib/Demangle",
     "//llvm/lib/Support",
     "//llvm/lib/TextAPI",
   ]
   sources = [
+    "DylibVerifier.cpp",
     "FileList.cpp",
     "Frontend.cpp",
     "HeaderFile.cpp",
diff --git a/llvm/utils/gn/secondary/clang/tools/clang-installapi/BUILD.gn b/llvm/utils/gn/secondary/clang/tools/clang-installapi/BUILD.gn
index 4f6895181f552a..89f0107686f91d 100644
--- a/llvm/utils/gn/secondary/clang/tools/clang-installapi/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang/tools/clang-installapi/BUILD.gn
@@ -1,8 +1,15 @@
+import("//llvm/utils/TableGen/tablegen.gni")
 import("//llvm/utils/gn/build/driver_executable.gni")
 
+tablegen("InstallAPIOpts") {
+  visibility = [ ":clang-installapi" ]
+  args = [ "-gen-opt-parser-defs" ]
+}
+
 driver_executable("clang-installapi") {
   configs += [ "//llvm/utils/gn/build:clang_code" ]
   deps = [
+    ":InstallAPIOpts",
     "//clang/lib/Driver",
     "//clang/lib/Frontend",
     "//clang/lib/InstallAPI",
@@ -10,6 +17,7 @@ driver_executable("clang-installapi") {
     "//llvm/lib/Support",
     "//llvm/lib/TargetParser",
     "//llvm/lib/TextAPI",
+    "//llvm/lib/TextAPI/BinaryReader",
   ]
   sources = [
     "ClangInstallAPI.cpp",
diff --git a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn
index f3d7b1bceb4d51..3c8b80526773a3 100644
--- a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn
+++ b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn
@@ -824,10 +824,7 @@ if (current_toolchain == default_toolchain) {
       "__type_traits/is_constant_evaluated.h",
       "__type_traits/is_constructible.h",
       "__type_traits/is_convertible.h",
-      "__type_traits/is_copy_assignable.h",
-      "__type_traits/is_copy_constructible.h",
       "__type_traits/is_core_convertible.h",
-      "__type_traits/is_default_constructible.h",
       "__type_traits/is_destructible.h",
       "__type_traits/is_empty.h",
       "__type_traits/is_enum.h",
@@ -843,17 +840,10 @@ if (current_toolchain == default_toolchain) {
       "__type_traits/is_member_function_pointer.h",
       "__type_traits/is_member_object_pointer.h",
       "__type_traits/is_member_pointer.h",
-      "__type_traits/is_move_assignable.h",
-      "__type_traits/is_move_constructible.h",
       "__type_traits/is_nothrow_assignable.h",
       "__type_traits/is_nothrow_constructible.h",
       "__type_traits/is_nothrow_convertible.h",
-      "__type_traits/is_nothrow_copy_assignable.h",
-      "__type_traits/is_nothrow_copy_constructible.h",
-      "__type_traits/is_nothrow_default_constructible.h",
       "__type_traits/is_nothrow_destructible.h",
-      "__type_traits/is_nothrow_move_assignable.h",
-      "__type_traits/is_nothrow_move_constructible.h",
       "__type_traits/is_null_pointer.h",
       "__type_traits/is_object.h",
       "__type_traits/is_pod.h",
@@ -874,14 +864,9 @@ if (current_toolchain == default_toolchain) {
       "__type_traits/is_trivial.h",
       "__type_traits/is_trivially_assignable.h",
       "__type_traits/is_trivially_constructible.h",
-      "__type_traits/is_trivially_copy_assignable.h",
-      "__type_traits/is_trivially_copy_constructible.h",
       "__type_traits/is_trivially_copyable.h",
-      "__type_traits/is_trivially_default_constructible.h",
       "__type_traits/is_trivially_destructible.h",
       "__type_traits/is_trivially_lexicographically_comparable.h",
-      "__type_traits/is_trivially_move_assignable.h",
-      "__type_traits/is_trivially_move_constructible.h",
       "__type_traits/is_trivially_relocatable.h",
       "__type_traits/is_unbounded_array.h",
       "__type_traits/is_union.h",
diff --git a/llvm/utils/gn/secondary/lldb/source/Host/BUILD.gn b/llvm/utils/gn/secondary/lldb/source/Host/BUILD.gn
index 39c3efaa040a54..7c9edfb016d7aa 100644
--- a/llvm/utils/gn/secondary/lldb/source/Host/BUILD.gn
+++ b/llvm/utils/gn/secondary/lldb/source/Host/BUILD.gn
@@ -16,6 +16,7 @@ static_library("Host") {
   ]
   public_deps = [ "//llvm/utils/gn/build/libs/xml" ]
   sources = [
+    "common/Alarm.cpp",
     "common/File.cpp",
     "common/FileAction.cpp",
     "common/FileCache.cpp",
diff --git a/llvm/utils/gn/secondary/lldb/source/Plugins/Language/CPlusPlus/BUILD.gn b/llvm/utils/gn/secondary/lldb/source/Plugins/Language/CPlusPlus/BUILD.gn
index 6c667b280aa765..9d57f2f0877e3e 100644
--- a/llvm/utils/gn/secondary/lldb/source/Plugins/Language/CPlusPlus/BUILD.gn
+++ b/llvm/utils/gn/secondary/lldb/source/Plugins/Language/CPlusPlus/BUILD.gn
@@ -34,6 +34,7 @@ static_library("CPlusPlus") {
     "LibCxxMap.cpp",
     "LibCxxQueue.cpp",
     "LibCxxRangesRefView.cpp",
+    "LibCxxSliceArray.cpp",
     "LibCxxSpan.cpp",
     "LibCxxTuple.cpp",
     "LibCxxUnorderedMap.cpp",
diff --git a/llvm/utils/gn/secondary/llvm/lib/DebugInfo/LogicalView/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/DebugInfo/LogicalView/BUILD.gn
index 763fdf21a1c81e..12b9540cc6d98e 100644
--- a/llvm/utils/gn/secondary/llvm/lib/DebugInfo/LogicalView/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/DebugInfo/LogicalView/BUILD.gn
@@ -29,6 +29,6 @@ static_library("LogicalView") {
     "Readers/LVBinaryReader.cpp",
     "Readers/LVCodeViewReader.cpp",
     "Readers/LVCodeViewVisitor.cpp",
-    "Readers/LVELFReader.cpp",
+    "Readers/LVDWARFReader.cpp",
   ]
 }
diff --git a/llvm/utils/gn/secondary/llvm/lib/Support/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Support/BUILD.gn
index 04ae43a13d5b09..ba0f6d8c0f8cff 100644
--- a/llvm/utils/gn/secondary/llvm/lib/Support/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/Support/BUILD.gn
@@ -90,6 +90,8 @@ static_library("Support") {
     "GlobPattern.cpp",
     "GraphWriter.cpp",
     "Hashing.cpp",
+    "HexagonAttributeParser.cpp",
+    "HexagonAttributes.cpp",
     "InitLLVM.cpp",
     "InstructionCost.cpp",
     "IntEqClasses.cpp",
diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/MCTargetDesc/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/MCTargetDesc/BUILD.gn
index 3d8e3e66d26db1..12d875cf40c98b 100644
--- a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/MCTargetDesc/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/MCTargetDesc/BUILD.gn
@@ -103,6 +103,7 @@ static_library("MCTargetDesc") {
     "AMDGPUInstPrinter.cpp",
     "AMDGPUMCAsmInfo.cpp",
     "AMDGPUMCCodeEmitter.cpp",
+    "AMDGPUMCExpr.cpp",
     "AMDGPUMCTargetDesc.cpp",
     "AMDGPUTargetStreamer.cpp",
     "R600InstPrinter.cpp",
diff --git a/llvm/utils/gn/secondary/llvm/tools/dsymutil/BUILD.gn b/llvm/utils/gn/secondary/llvm/tools/dsymutil/BUILD.gn
index d22d0433656e51..5710e07b07cf7b 100644
--- a/llvm/utils/gn/secondary/llvm/tools/dsymutil/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/tools/dsymutil/BUILD.gn
@@ -31,7 +31,6 @@ driver_executable("dsymutil") {
     "MachOUtils.cpp",
     "RelocationMap.cpp",
     "Reproducer.cpp",
-    "SymbolMap.cpp",
     "dsymutil.cpp",
   ]
   if (host_os == "mac") {
diff --git a/llvm/utils/gn/secondary/llvm/unittests/DebugInfo/LogicalView/BUILD.gn b/llvm/utils/gn/secondary/llvm/unittests/DebugInfo/LogicalView/BUILD.gn
index 9cbddd8af9a8f8..d87bac64e7b349 100644
--- a/llvm/utils/gn/secondary/llvm/unittests/DebugInfo/LogicalView/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/unittests/DebugInfo/LogicalView/BUILD.gn
@@ -14,7 +14,7 @@ unittest("DebugInfoLogicalViewTests") {
     "CodeViewReaderTest.cpp",
     "CommandLineOptionsTest.cpp",
     "CompareElementsTest.cpp",
-    "ELFReaderTest.cpp",
+    "DWARFReaderTest.cpp",
     "LocationRangesTest.cpp",
     "LogicalElementsTest.cpp",
     "SelectElementsTest.cpp",
diff --git a/llvm/utils/gn/secondary/llvm/unittests/ExecutionEngine/Orc/BUILD.gn b/llvm/utils/gn/secondary/llvm/unittests/ExecutionEngine/Orc/BUILD.gn
index be1efff15f5aa0..3e05b9b1f8c3f7 100644
--- a/llvm/utils/gn/secondary/llvm/unittests/ExecutionEngine/Orc/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/unittests/ExecutionEngine/Orc/BUILD.gn
@@ -24,6 +24,7 @@ unittest("OrcJITTests") {
     "JITTargetMachineBuilderTest.cpp",
     "LazyCallThroughAndReexportsTest.cpp",
     "LookupAndRecordAddrsTest.cpp",
+    "MachOPlatformTest.cpp",
     "MapperJITLinkMemoryManagerTest.cpp",
     "MemoryMapperTest.cpp",
     "ObjectFormatsTest.cpp",
diff --git a/llvm/utils/gn/secondary/llvm/unittests/Target/AArch64/BUILD.gn b/llvm/utils/gn/secondary/llvm/unittests/Target/AArch64/BUILD.gn
index 8fde227c451796..cddec7aee57454 100644
--- a/llvm/utils/gn/secondary/llvm/unittests/Target/AArch64/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/unittests/Target/AArch64/BUILD.gn
@@ -20,6 +20,7 @@ unittest("AArch64Tests") {
     "AArch64SVESchedPseudoTest.cpp",
     "AddressingModes.cpp",
     "DecomposeStackOffsetTest.cpp",
+    "Immediates.cpp",
     "InstSizes.cpp",
     "MatrixRegisterAliasing.cpp",
     "SMEAttributesTest.cpp",
diff --git a/llvm/utils/release/export.sh b/llvm/utils/release/export.sh
index 9fd906c49ea6e5..66bef82586a348 100755
--- a/llvm/utils/release/export.sh
+++ b/llvm/utils/release/export.sh
@@ -84,7 +84,7 @@ export_sources() {
     # Determine the release by fetching the version from LLVM's CMakeLists.txt
     # in the specified git ref.
     if [ -n "$snapshot" ]; then
-        release=$(git -C $llvm_src_dir show $snapshot:llvm/CMakeLists.txt | grep -ioP 'set\(\s*LLVM_VERSION_(MAJOR|MINOR|PATCH)\s\K[0-9]+' | paste -sd '.')
+        release=$(git -C $llvm_src_dir show $snapshot:cmake/Modules/LLVMVersion.cmake | grep -ioP 'set\(\s*LLVM_VERSION_(MAJOR|MINOR|PATCH)\s\K[0-9]+' | paste -sd '.')
     fi
     
     tag="llvmorg-$release"
diff --git a/mlir/include/mlir-c/Dialect/LLVM.h b/mlir/include/mlir-c/Dialect/LLVM.h
index d823afb659c8db..b3d7a788ccbba3 100644
--- a/mlir/include/mlir-c/Dialect/LLVM.h
+++ b/mlir/include/mlir-c/Dialect/LLVM.h
@@ -229,10 +229,10 @@ MLIR_CAPI_EXPORTED MlirAttribute mlirLLVMDIBasicTypeAttrGet(
 
 /// Creates a LLVM DICompositeType attribute.
 MLIR_CAPI_EXPORTED MlirAttribute mlirLLVMDICompositeTypeAttrGet(
-    MlirContext ctx, unsigned int tag, MlirAttribute name, MlirAttribute file,
-    uint32_t line, MlirAttribute scope, MlirAttribute baseType, int64_t flags,
-    uint64_t sizeInBits, uint64_t alignInBits, intptr_t nElements,
-    MlirAttribute const *elements);
+    MlirContext ctx, unsigned int tag, MlirAttribute recId, MlirAttribute name,
+    MlirAttribute file, uint32_t line, MlirAttribute scope,
+    MlirAttribute baseType, int64_t flags, uint64_t sizeInBits,
+    uint64_t alignInBits, intptr_t nElements, MlirAttribute const *elements);
 
 /// Creates a LLVM DIDerivedType attribute.
 MLIR_CAPI_EXPORTED MlirAttribute mlirLLVMDIDerivedTypeAttrGet(
diff --git a/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h b/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h
index 36c2e691247318..d688022b6dccd8 100644
--- a/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h
+++ b/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h
@@ -142,11 +142,21 @@ class OpFilter {
   /// This function adds an ALLOW entry.
   void allowDialect(StringRef dialectNamespace) {
     Entry::FilterFn filterFn = [=](Operation *op) {
-      return op->getDialect()->getNamespace() == dialectNamespace;
+      return op->getName().getDialectNamespace() == dialectNamespace;
     };
     entries.push_back(Entry{filterFn, Entry::FilterType::ALLOW});
   }
 
+  /// Deny the given dialect.
+  ///
+  /// This function adds a DENY entry.
+  void denyDialect(StringRef dialectNamespace) {
+    Entry::FilterFn filterFn = [=](Operation *op) {
+      return op->getDialect()->getNamespace() == dialectNamespace;
+    };
+    entries.push_back(Entry{filterFn, Entry::FilterType::DENY});
+  }
+
   /// Allow the given ops.
   ///
   /// This function adds one or multiple ALLOW entries.
diff --git a/mlir/include/mlir/Dialect/LLVMIR/CMakeLists.txt b/mlir/include/mlir/Dialect/LLVMIR/CMakeLists.txt
index 862abf00d03450..759de745440c21 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/CMakeLists.txt
+++ b/mlir/include/mlir/Dialect/LLVMIR/CMakeLists.txt
@@ -29,6 +29,8 @@ add_mlir_doc(LLVMIntrinsicOps LLVMIntrinsicOps Dialects/ -gen-op-doc)
 set(LLVM_TARGET_DEFINITIONS LLVMInterfaces.td)
 mlir_tablegen(LLVMInterfaces.h.inc -gen-op-interface-decls)
 mlir_tablegen(LLVMInterfaces.cpp.inc -gen-op-interface-defs)
+mlir_tablegen(LLVMAttrInterfaces.h.inc -gen-attr-interface-decls)
+mlir_tablegen(LLVMAttrInterfaces.cpp.inc -gen-attr-interface-defs)
 mlir_tablegen(LLVMTypeInterfaces.h.inc -gen-type-interface-decls)
 mlir_tablegen(LLVMTypeInterfaces.cpp.inc -gen-type-interface-defs)
 add_public_tablegen_target(MLIRLLVMInterfacesIncGen)
diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMAttrDefs.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMAttrDefs.td
index a831b076fb864f..d4d56ae0f762bc 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/LLVMAttrDefs.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMAttrDefs.td
@@ -10,6 +10,7 @@
 #define LLVMIR_ATTRDEFS
 
 include "mlir/Dialect/LLVMIR/LLVMDialect.td"
+include "mlir/Dialect/LLVMIR/LLVMInterfaces.td"
 include "mlir/IR/AttrTypeBase.td"
 include "mlir/IR/CommonAttrConstraints.td"
 
@@ -238,7 +239,7 @@ def LoopAnnotationAttr : LLVM_Attr<"LoopAnnotation", "loop_annotation"> {
 //===----------------------------------------------------------------------===//
 
 class LLVM_DIParameter<string summary, string default, string parseName,
-                       string printName = parseName>
+                       string errorCase, string printName = parseName>
     : AttrOrTypeParameter<"unsigned", "debug info " # summary> {
   let parser = [{ [&]() -> FailureOr<unsigned> {
     SMLoc tagLoc = $_parser.getCurrentLocation();
@@ -246,33 +247,35 @@ class LLVM_DIParameter<string summary, string default, string parseName,
     if ($_parser.parseKeyword(&name))
       return failure();
 
-    if (unsigned tag = llvm::dwarf::get}] # parseName # [{(name))
-      return tag;
-    return $_parser.emitError(tagLoc)
-      << "invalid debug info }] # summary # [{ name: " << name;
+    unsigned tag = llvm::dwarf::get}] # parseName # [{(name);
+    if (tag == }] # errorCase # [{)
+      return $_parser.emitError(tagLoc)
+        << "invalid debug info }] # summary # [{ name: " << name;
+    return tag;
   }() }];
   let printer = "$_printer << llvm::dwarf::" # printName # "String($_self)";
   let defaultValue = default;
 }
 
 def LLVM_DICallingConventionParameter : LLVM_DIParameter<
-  "calling convention", /*default=*/"0", "CallingConvention", "Convention"
+  "calling convention", /*default=*/"0", "CallingConvention", /*errorCase=*/"0",
+  "Convention"
 >;
 
 def LLVM_DIEncodingParameter : LLVM_DIParameter<
-  "encoding", /*default=*/"0", "AttributeEncoding"
+  "encoding", /*default=*/"0", "AttributeEncoding", /*errorCase=*/"0"
 >;
 
 def LLVM_DILanguageParameter : LLVM_DIParameter<
-  "language", /*default=*/"", "Language"
+  "language", /*default=*/"", "Language", /*errorCase=*/"0"
 >;
 
 def LLVM_DITagParameter : LLVM_DIParameter<
-  "tag", /*default=*/"", "Tag"
+  "tag", /*default=*/"", "Tag", /*errorCase=*/"llvm::dwarf::DW_TAG_invalid"
 >;
 
 def LLVM_DIOperationEncodingParameter : LLVM_DIParameter<
-  "operation encoding", /*default=*/"", "OperationEncoding"
+  "operation encoding", /*default=*/"", "OperationEncoding", /*errorCase=*/"0"
 >;
 
 //===----------------------------------------------------------------------===//
@@ -357,9 +360,11 @@ def LLVM_DICompileUnitAttr : LLVM_Attr<"DICompileUnit", "di_compile_unit",
 //===----------------------------------------------------------------------===//
 
 def LLVM_DICompositeTypeAttr : LLVM_Attr<"DICompositeType", "di_composite_type",
-                                         /*traits=*/[], "DITypeAttr"> {
+                                         [LLVM_DIRecursiveTypeAttrInterface],
+                                         "DITypeAttr"> {
   let parameters = (ins
     LLVM_DITagParameter:$tag,
+    OptionalParameter<"DistinctAttr">:$recId,
     OptionalParameter<"StringAttr">:$name,
     OptionalParameter<"DIFileAttr">:$file,
     OptionalParameter<"uint32_t">:$line,
@@ -371,6 +376,21 @@ def LLVM_DICompositeTypeAttr : LLVM_Attr<"DICompositeType", "di_composite_type",
     OptionalArrayRefParameter<"DINodeAttr">:$elements
   );
   let assemblyFormat = "`<` struct(params) `>`";
+  let extraClassDeclaration = [{
+    /// Requirements of DIRecursiveTypeAttrInterface.
+    /// @{
+
+    /// Get whether this attr describes a recursive self reference.
+    bool isRecSelf() { return getTag() == 0; }
+
+    /// Get a copy of this type attr but with the recursive ID set to `recId`.
+    DIRecursiveTypeAttrInterface withRecId(DistinctAttr recId);
+
+    /// Build a rec-self instance using the provided `recId`.
+    static DIRecursiveTypeAttrInterface getRecSelf(DistinctAttr recId);
+
+    /// @}
+  }];
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMAttrs.h b/mlir/include/mlir/Dialect/LLVMIR/LLVMAttrs.h
index c370bfa2b733d6..091a817caf3d6e 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/LLVMAttrs.h
+++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMAttrs.h
@@ -84,6 +84,8 @@ using linkage::Linkage;
 } // namespace LLVM
 } // namespace mlir
 
+#include "mlir/Dialect/LLVMIR/LLVMAttrInterfaces.h.inc"
+
 #define GET_ATTRDEF_CLASSES
 #include "mlir/Dialect/LLVMIR/LLVMOpsAttrDefs.h.inc"
 
diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMInterfaces.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMInterfaces.td
index 3b2a132a881e4e..e7a1da8ee560ef 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/LLVMInterfaces.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMInterfaces.td
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This file defines op and type interfaces for the LLVM dialect in MLIR.
+// This file defines interfaces for the LLVM dialect in MLIR.
 //
 //===----------------------------------------------------------------------===//
 
@@ -319,4 +319,58 @@ def LLVM_PointerElementTypeInterface
   ];
 }
 
+//===----------------------------------------------------------------------===//
+// LLVM dialect attr interfaces.
+//===----------------------------------------------------------------------===//
+
+def LLVM_DIRecursiveTypeAttrInterface
+  : AttrInterface<"DIRecursiveTypeAttrInterface"> {
+  let description = [{
+    This attribute represents a DITypeAttr that is recursive. Only DITypeAttrs
+    that translate to LLVM DITypes that support mutation should implement this
+    interface.
+
+    There are two modes for conforming attributes:
+
+    1. "rec-decl":
+      - This attr is a recursive declaration identified by a recId.
+
+    2. "rec-self":
+      - This attr is considered a recursive self reference.
+      - This attr itself is a placeholder type that should be conceptually
+        replaced with the closest parent attr of the same type with the same
+        recId.
+
+    For example, to represent a linked list struct:
+
+      #rec_self = di_composite_type<recId = 0>
+      #ptr = di_derived_type<baseType: #rec_self, ...>
+      #field = di_derived_type<name = "next", baseType: #ptr, ...>
+      #rec = di_composite_type<recId = 0, name = "Node", elements: #field, ...>
+      #var = di_local_variable<type = #rec, ...>
+
+    Note that a rec-self without an outer rec-decl with the same recId is
+    conceptually the same as an "unbound" variable. The context needs to provide
+    meaning to the rec-self.
+  }];
+  let cppNamespace = "::mlir::LLVM";
+  let methods = [
+    InterfaceMethod<[{
+      Get whether this attr describes a recursive self reference.
+    }], "bool", "isRecSelf", (ins)>,
+    InterfaceMethod<[{
+      Get the recursive ID used for matching "rec-decl" with "rec-self".
+      If this attr instance is not recursive, return a null attribute.
+    }], "DistinctAttr", "getRecId", (ins)>,
+    InterfaceMethod<[{
+      Get a copy of this type attr but with the recursive ID set to `recId`.
+    }], "DIRecursiveTypeAttrInterface", "withRecId",
+    (ins "DistinctAttr":$recId)>,
+    StaticInterfaceMethod<[{
+      Build a rec-self instance using the provided `recId`.
+    }], "DIRecursiveTypeAttrInterface", "getRecSelf",
+    (ins "DistinctAttr":$recId)>
+  ];
+}
+
 #endif // LLVMIR_INTERFACES
diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
index 8ec8e16f75c94b..728e92c9dc8dcf 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
@@ -409,6 +409,33 @@ def NVVM_BarrierOp : NVVM_Op<"barrier", [AttrSizedOperandSegments]> {
   let assemblyFormat = "(`id` `=` $barrierId^)? (`number_of_threads` `=` $numberOfThreads^)? attr-dict";
 }
 
+def NVVM_BarrierArriveOp : NVVM_PTXBuilder_Op<"barrier.arrive"> 
+{
+  let arguments = (ins Optional<I32>:$barrierId, I32:$numberOfThreads);
+
+  let description = [{
+    Thread that executes this op announces their arrival at the barrier with 
+    given id and continue their execution.
+
+    The default barrier id is 0 that is similar to `nvvm.barrier` Op. When 
+    `barrierId` is not present, the default barrier id is used. 
+
+    [For more information, see PTX ISA]
+    (https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-bar)
+  }];
+  
+  let assemblyFormat = "(`id` `=` $barrierId^)? `number_of_threads` `=` $numberOfThreads attr-dict";
+
+  let extraClassDefinition = [{
+    std::string $cppClass::getPtx() {
+      std::string ptx = "bar.arrive ";
+      if (getBarrierId()) { ptx += "%0, %1'"; } 
+      else { ptx += "0, %0;"; }
+      return ptx;
+    }
+  }];
+}
+
 def NVVM_ClusterArriveOp : NVVM_Op<"cluster.arrive"> {
   let arguments = (ins OptionalAttr<UnitAttr>:$aligned);
 
diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
index abb38a3df8068c..1dabf5d7979b70 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
@@ -162,10 +162,18 @@ def ROCDL_BallotOp :
   ROCDL_Op<"ballot">,
   Results<(outs LLVM_Type:$res)>,
   Arguments<(ins I1:$pred)> {
+  let summary = "Vote across thread group";
+
+  let description = [{
+      Ballot provides a bit mask containing the 1-bit predicate value from each lane. 
+      The nth bit of the result contains the 1 bit contributed by the nth warp lane.
+  }];
+
   string llvmBuilder = [{
       $res = createIntrinsicCall(builder,
-            llvm::Intrinsic::amdgcn_ballot, {$pred}, {llvm::Type::getInt32Ty(moduleTranslation.getLLVMContext())});
+            llvm::Intrinsic::amdgcn_ballot, {$pred}, {$_resultType});
   }];
+
   let assemblyFormat = "$pred attr-dict `:` type($res)";
 }
 
diff --git a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
index c64ecb79c5ca51..feb3b3f03cf538 100644
--- a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
+++ b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
@@ -460,7 +460,8 @@ LogicalResult promoteSubviewsPrecondition(Operation *op,
 LogicalResult vectorizeOpPrecondition(Operation *op,
                                       ArrayRef<int64_t> inputVectorSizes = {},
                                       ArrayRef<bool> inputScalableVecDims = {},
-                                      bool vectorizeNDExtract = false);
+                                      bool vectorizeNDExtract = false,
+                                      bool flatten1DDepthwiseConv = false);
 
 //===----------------------------------------------------------------------===//
 // Transformations exposed as functional-style API calls.
diff --git a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVLogicalOps.td b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVLogicalOps.td
index e48a56f0625d3f..3ee239d6e1e3ec 100644
--- a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVLogicalOps.td
+++ b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVLogicalOps.td
@@ -800,6 +800,8 @@ def SPIRV_SelectOp : SPIRV_Op<"Select",
   // These ops require dynamic availability specification based on operand and
   // result types.
   bit autogenAvailability = 0;
+
+  let hasFolder = 1;
 }
 
 // -----
diff --git a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorOps.td b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorOps.td
index 0498576fcffc51..29cf8c32447ecf 100644
--- a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorOps.td
+++ b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorOps.td
@@ -1419,7 +1419,7 @@ def SparseTensor_ForeachOp : SparseTensor_Op<"foreach",
 }
 
 //===----------------------------------------------------------------------===//
-// Sparse Tensor Debugging Operations.
+// Sparse Tensor Debugging and Test-Only Operations.
 //===----------------------------------------------------------------------===//
 
 def SparseTensor_PrintOp : SparseTensor_Op<"print">,
@@ -1440,4 +1440,25 @@ def SparseTensor_PrintOp : SparseTensor_Op<"print">,
   let assemblyFormat = "$tensor attr-dict `:` type($tensor)";
 }
 
+def SparseTensor_HasRuntimeLibraryOp
+    : SparseTensor_Op<"has_runtime_library", []>, Results<(outs I1:$result)> {
+  string summary = "Indicates whether running in runtime/codegen mode";
+  string description = [{
+    Returns a boolean value that indicates whether the sparsifier runs in
+    runtime library mode or not. For testing only! This operation is useful
+    for writing test cases that require different code depending on
+    runtime/codegen mode.
+
+    Example:
+
+    ```mlir
+    %has_runtime = sparse_tensor.has_runtime_library
+    scf.if %has_runtime {
+      ...
+    }
+    ```
+  }];
+  let assemblyFormat = "attr-dict";
+}
+
 #endif // SPARSETENSOR_OPS
diff --git a/mlir/include/mlir/Dialect/Vector/Utils/VectorUtils.h b/mlir/include/mlir/Dialect/Vector/Utils/VectorUtils.h
index f6b03a0f2c8007..3ce16ef361f37d 100644
--- a/mlir/include/mlir/Dialect/Vector/Utils/VectorUtils.h
+++ b/mlir/include/mlir/Dialect/Vector/Utils/VectorUtils.h
@@ -9,12 +9,15 @@
 #ifndef MLIR_DIALECT_VECTOR_UTILS_VECTORUTILS_H_
 #define MLIR_DIALECT_VECTOR_UTILS_VECTORUTILS_H_
 
+#include "mlir/Dialect/MemRef/IR/MemRef.h"
+#include "mlir/Dialect/Tensor/IR/Tensor.h"
 #include "mlir/Dialect/Utils/IndexingUtils.h"
 #include "mlir/Dialect/Vector/IR/VectorOps.h"
 #include "mlir/IR/BuiltinAttributes.h"
 #include "mlir/Support/LLVM.h"
 
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/TypeSwitch.h"
 
 namespace mlir {
 
@@ -98,6 +101,17 @@ bool isContiguousSlice(MemRefType memrefType, VectorType vectorType);
 std::optional<StaticTileOffsetRange>
 createUnrollIterator(VectorType vType, int64_t targetRank = 1);
 
+/// A wrapper for getMixedSizes for vector.transfer_read and
+/// vector.transfer_write Ops (for source and destination, respectively).
+///
+/// Tensor and MemRef types implement their own, very similar version of
+/// getMixedSizes. This method will call the appropriate version (depending on
+/// `hasTensorSemantics`). It will also automatically extract the operand for
+/// which to call it on (source for "read" and destination for "write" ops).
+SmallVector<OpFoldResult> getMixedSizesXfer(bool hasTensorSemantics,
+                                            Operation *xfer,
+                                            RewriterBase &rewriter);
+
 } // namespace vector
 
 /// Constructs a permutation map of invariant memref indices to vector
diff --git a/mlir/include/mlir/ExecutionEngine/SparseTensor/File.h b/mlir/include/mlir/ExecutionEngine/SparseTensor/File.h
index f927b82628b1a6..714e664dd0f4eb 100644
--- a/mlir/include/mlir/ExecutionEngine/SparseTensor/File.h
+++ b/mlir/include/mlir/ExecutionEngine/SparseTensor/File.h
@@ -206,7 +206,7 @@ class SparseTensorReader final {
     auto *lvlCOO = readCOO<V>(map, lvlSizes);
     auto *tensor = SparseTensorStorage<P, I, V>::newFromCOO(
         dimRank, getDimSizes(), lvlRank, lvlSizes, lvlTypes, dim2lvl, lvl2dim,
-        *lvlCOO);
+        lvlCOO);
     delete lvlCOO;
     return tensor;
   }
diff --git a/mlir/include/mlir/ExecutionEngine/SparseTensor/Storage.h b/mlir/include/mlir/ExecutionEngine/SparseTensor/Storage.h
index 468782ebef4e34..773957a8b51162 100644
--- a/mlir/include/mlir/ExecutionEngine/SparseTensor/Storage.h
+++ b/mlir/include/mlir/ExecutionEngine/SparseTensor/Storage.h
@@ -197,37 +197,22 @@ class SparseTensorStorage final : public SparseTensorStorageBase {
                       const uint64_t *lvl2dim)
       : SparseTensorStorageBase(dimRank, dimSizes, lvlRank, lvlSizes, lvlTypes,
                                 dim2lvl, lvl2dim),
-        positions(lvlRank), coordinates(lvlRank), lvlCursor(lvlRank), coo() {}
+        positions(lvlRank), coordinates(lvlRank), lvlCursor(lvlRank) {}
 
 public:
   /// Constructs a sparse tensor with the given encoding, and allocates
-  /// overhead storage according to some simple heuristics. When the
-  /// `bool` argument is true and `lvlTypes` are all dense, then this
-  /// ctor will also initialize the values array with zeros. That
-  /// argument should be true when an empty tensor is intended; whereas
-  /// it should usually be false when the ctor will be followed up by
-  /// some other form of initialization.
+  /// overhead storage according to some simple heuristics. When lvlCOO
+  /// is set, the sparse tensor initializes with the contents from that
+  /// data structure. Otherwise, an empty sparse tensor results.
   SparseTensorStorage(uint64_t dimRank, const uint64_t *dimSizes,
                       uint64_t lvlRank, const uint64_t *lvlSizes,
                       const LevelType *lvlTypes, const uint64_t *dim2lvl,
-                      const uint64_t *lvl2dim, SparseTensorCOO<V> *lvlCOO,
-                      bool initializeValuesIfAllDense);
+                      const uint64_t *lvl2dim, SparseTensorCOO<V> *lvlCOO);
 
   /// Constructs a sparse tensor with the given encoding, and initializes
-  /// the contents from the COO. This ctor performs the same heuristic
-  /// overhead-storage allocation as the ctor above.
-  SparseTensorStorage(uint64_t dimRank, const uint64_t *dimSizes,
-                      uint64_t lvlRank, const uint64_t *lvlSizes,
-                      const LevelType *lvlTypes, const uint64_t *dim2lvl,
-                      const uint64_t *lvl2dim, SparseTensorCOO<V> &lvlCOO);
-
-  /// Constructs a sparse tensor with the given encoding, and initializes
-  /// the contents from the level buffers. This ctor allocates exactly
-  /// the required amount of overhead storage, not using any heuristics.
-  /// It assumes that the data provided by `lvlBufs` can be directly used to
-  /// interpret the result sparse tensor and performs *NO* integrity test on the
-  /// input data. It also assume that the trailing COO coordinate buffer is
-  /// passed in as a single AoS memory.
+  /// the contents from the level buffers. The constructor assumes that the
+  /// data provided by `lvlBufs` can be directly used to interpret the result
+  /// sparse tensor and performs no integrity test on the input data.
   SparseTensorStorage(uint64_t dimRank, const uint64_t *dimSizes,
                       uint64_t lvlRank, const uint64_t *lvlSizes,
                       const LevelType *lvlTypes, const uint64_t *dim2lvl,
@@ -244,16 +229,14 @@ class SparseTensorStorage final : public SparseTensorStorageBase {
   newFromCOO(uint64_t dimRank, const uint64_t *dimSizes, uint64_t lvlRank,
              const uint64_t *lvlSizes, const LevelType *lvlTypes,
              const uint64_t *dim2lvl, const uint64_t *lvl2dim,
-             SparseTensorCOO<V> &lvlCOO);
+             SparseTensorCOO<V> *lvlCOO);
 
-  /// Allocates a new sparse tensor and initialize it with the data stored level
-  /// buffers directly.
+  /// Allocates a new sparse tensor and initialize it from the given buffers.
   static SparseTensorStorage<P, C, V> *
-  packFromLvlBuffers(uint64_t dimRank, const uint64_t *dimSizes,
-                     uint64_t lvlRank, const uint64_t *lvlSizes,
-                     const LevelType *lvlTypes, const uint64_t *dim2lvl,
-                     const uint64_t *lvl2dim, uint64_t srcRank,
-                     const intptr_t *buffers);
+  newFromBuffers(uint64_t dimRank, const uint64_t *dimSizes, uint64_t lvlRank,
+                 const uint64_t *lvlSizes, const LevelType *lvlTypes,
+                 const uint64_t *dim2lvl, const uint64_t *lvl2dim,
+                 uint64_t srcRank, const intptr_t *buffers);
 
   ~SparseTensorStorage() final = default;
 
@@ -337,16 +320,6 @@ class SparseTensorStorage final : public SparseTensorStorageBase {
     }
   }
 
-  /// Allocates a new COO object and initializes it with the contents.
-  /// Callers must make sure to delete the COO when they're done with it.
-  SparseTensorCOO<V> *toCOO() {
-    std::vector<uint64_t> dimCoords(getDimRank());
-    coo = new SparseTensorCOO<V>(getDimSizes(), values.size());
-    toCOO(0, 0, dimCoords);
-    assert(coo->getElements().size() == values.size());
-    return coo;
-  }
-
   /// Sort the unordered tensor in place, the method assumes that it is
   /// an unordered COO tensor.
   void sortInPlace() {
@@ -556,58 +529,10 @@ class SparseTensorStorage final : public SparseTensorStorageBase {
     return -1u;
   }
 
-  // Performs forall on level entries and inserts into dim COO.
-  void toCOO(uint64_t parentPos, uint64_t l, std::vector<uint64_t> &dimCoords) {
-    if (l == getLvlRank()) {
-      map.pushbackward(lvlCursor.data(), dimCoords.data());
-      assert(coo);
-      assert(parentPos < values.size());
-      coo->add(dimCoords, values[parentPos]);
-      return;
-    }
-    if (isCompressedLvl(l)) {
-      const std::vector<P> &positionsL = positions[l];
-      assert(parentPos + 1 < positionsL.size());
-      const uint64_t pstart = static_cast<uint64_t>(positionsL[parentPos]);
-      const uint64_t pstop = static_cast<uint64_t>(positionsL[parentPos + 1]);
-      const std::vector<C> &coordinatesL = coordinates[l];
-      assert(pstop <= coordinatesL.size());
-      for (uint64_t pos = pstart; pos < pstop; pos++) {
-        lvlCursor[l] = static_cast<uint64_t>(coordinatesL[pos]);
-        toCOO(pos, l + 1, dimCoords);
-      }
-    } else if (isLooseCompressedLvl(l)) {
-      const std::vector<P> &positionsL = positions[l];
-      assert(2 * parentPos + 1 < positionsL.size());
-      const uint64_t pstart = static_cast<uint64_t>(positionsL[2 * parentPos]);
-      const uint64_t pstop =
-          static_cast<uint64_t>(positionsL[2 * parentPos + 1]);
-      const std::vector<C> &coordinatesL = coordinates[l];
-      assert(pstop <= coordinatesL.size());
-      for (uint64_t pos = pstart; pos < pstop; pos++) {
-        lvlCursor[l] = static_cast<uint64_t>(coordinatesL[pos]);
-        toCOO(pos, l + 1, dimCoords);
-      }
-    } else if (isSingletonLvl(l) || isNOutOfMLvl(l)) {
-      assert(parentPos < coordinates[l].size());
-      lvlCursor[l] = static_cast<uint64_t>(coordinates[l][parentPos]);
-      toCOO(parentPos, l + 1, dimCoords);
-    } else { // Dense level.
-      assert(isDenseLvl(l));
-      const uint64_t sz = getLvlSizes()[l];
-      const uint64_t pstart = parentPos * sz;
-      for (uint64_t c = 0; c < sz; c++) {
-        lvlCursor[l] = c;
-        toCOO(pstart + c, l + 1, dimCoords);
-      }
-    }
-  }
-
   std::vector<std::vector<P>> positions;
   std::vector<std::vector<C>> coordinates;
   std::vector<V> values;
   std::vector<uint64_t> lvlCursor;
-  SparseTensorCOO<V> *coo;
 };
 
 //===----------------------------------------------------------------------===//
@@ -621,9 +546,9 @@ SparseTensorStorage<P, C, V> *SparseTensorStorage<P, C, V>::newEmpty(
     uint64_t dimRank, const uint64_t *dimSizes, uint64_t lvlRank,
     const uint64_t *lvlSizes, const LevelType *lvlTypes,
     const uint64_t *dim2lvl, const uint64_t *lvl2dim) {
+  SparseTensorCOO<V> *noLvlCOO = nullptr;
   return new SparseTensorStorage<P, C, V>(dimRank, dimSizes, lvlRank, lvlSizes,
-                                          lvlTypes, dim2lvl, lvl2dim, nullptr,
-                                          true);
+                                          lvlTypes, dim2lvl, lvl2dim, noLvlCOO);
 }
 
 template <typename P, typename C, typename V>
@@ -631,13 +556,14 @@ SparseTensorStorage<P, C, V> *SparseTensorStorage<P, C, V>::newFromCOO(
     uint64_t dimRank, const uint64_t *dimSizes, uint64_t lvlRank,
     const uint64_t *lvlSizes, const LevelType *lvlTypes,
     const uint64_t *dim2lvl, const uint64_t *lvl2dim,
-    SparseTensorCOO<V> &lvlCOO) {
+    SparseTensorCOO<V> *lvlCOO) {
+  assert(lvlCOO);
   return new SparseTensorStorage<P, C, V>(dimRank, dimSizes, lvlRank, lvlSizes,
                                           lvlTypes, dim2lvl, lvl2dim, lvlCOO);
 }
 
 template <typename P, typename C, typename V>
-SparseTensorStorage<P, C, V> *SparseTensorStorage<P, C, V>::packFromLvlBuffers(
+SparseTensorStorage<P, C, V> *SparseTensorStorage<P, C, V>::newFromBuffers(
     uint64_t dimRank, const uint64_t *dimSizes, uint64_t lvlRank,
     const uint64_t *lvlSizes, const LevelType *lvlTypes,
     const uint64_t *dim2lvl, const uint64_t *lvl2dim, uint64_t srcRank,
@@ -657,11 +583,9 @@ SparseTensorStorage<P, C, V>::SparseTensorStorage(
     uint64_t dimRank, const uint64_t *dimSizes, uint64_t lvlRank,
     const uint64_t *lvlSizes, const LevelType *lvlTypes,
     const uint64_t *dim2lvl, const uint64_t *lvl2dim,
-    SparseTensorCOO<V> *lvlCOO, bool initializeValuesIfAllDense)
+    SparseTensorCOO<V> *lvlCOO)
     : SparseTensorStorage(dimRank, dimSizes, lvlRank, lvlSizes, lvlTypes,
                           dim2lvl, lvl2dim) {
-  assert(!lvlCOO || lvlRank == lvlCOO->getRank());
-  coo = lvlCOO;
   // Provide hints on capacity of positions and coordinates.
   // TODO: needs much fine-tuning based on actual sparsity; currently
   // we reserve position/coordinate space based on all previous dense
@@ -692,27 +616,20 @@ SparseTensorStorage<P, C, V>::SparseTensorStorage(
       sz = detail::checkedMul(sz, lvlSizes[l]);
     }
   }
-  if (allDense && initializeValuesIfAllDense)
+  if (lvlCOO) {
+    /* New from COO: ensure it is sorted. */
+    assert(lvlCOO->getRank() == lvlRank);
+    lvlCOO->sort();
+    // Now actually insert the `elements`.
+    const auto &elements = lvlCOO->getElements();
+    const uint64_t nse = elements.size();
+    assert(values.size() == 0);
+    values.reserve(nse);
+    fromCOO(elements, 0, nse, 0);
+  } else if (allDense) {
+    /* New empty (all dense) */
     values.resize(sz, 0);
-}
-
-template <typename P, typename C, typename V>
-SparseTensorStorage<P, C, V>::SparseTensorStorage( // NOLINT
-    uint64_t dimRank, const uint64_t *dimSizes, uint64_t lvlRank,
-    const uint64_t *lvlSizes, const LevelType *lvlTypes,
-    const uint64_t *dim2lvl, const uint64_t *lvl2dim,
-    SparseTensorCOO<V> &lvlCOO)
-    : SparseTensorStorage(dimRank, dimSizes, lvlRank, lvlSizes, lvlTypes,
-                          dim2lvl, lvl2dim, nullptr, false) {
-  // Ensure lvlCOO is sorted.
-  assert(lvlRank == lvlCOO.getRank());
-  lvlCOO.sort();
-  // Now actually insert the `elements`.
-  const auto &elements = lvlCOO.getElements();
-  const uint64_t nse = elements.size();
-  assert(values.size() == 0);
-  values.reserve(nse);
-  fromCOO(elements, 0, nse, 0);
+  }
 }
 
 template <typename P, typename C, typename V>
diff --git a/mlir/include/mlir/Transforms/Mem2Reg.h b/mlir/include/mlir/Transforms/Mem2Reg.h
index 89244feb21754e..d145f7ed437582 100644
--- a/mlir/include/mlir/Transforms/Mem2Reg.h
+++ b/mlir/include/mlir/Transforms/Mem2Reg.h
@@ -9,8 +9,6 @@
 #ifndef MLIR_TRANSFORMS_MEM2REG_H
 #define MLIR_TRANSFORMS_MEM2REG_H
 
-#include "mlir/IR/Dominance.h"
-#include "mlir/IR/OpDefinition.h"
 #include "mlir/IR/PatternMatch.h"
 #include "mlir/Interfaces/MemorySlotInterfaces.h"
 #include "llvm/ADT/Statistic.h"
@@ -25,24 +23,6 @@ struct Mem2RegStatistics {
   llvm::Statistic *newBlockArgumentAmount = nullptr;
 };
 
-/// Pattern applying mem2reg to the regions of the operations on which it
-/// matches.
-class Mem2RegPattern
-    : public OpInterfaceRewritePattern<PromotableAllocationOpInterface> {
-public:
-  using OpInterfaceRewritePattern::OpInterfaceRewritePattern;
-
-  Mem2RegPattern(MLIRContext *context, Mem2RegStatistics statistics = {},
-                 PatternBenefit benefit = 1)
-      : OpInterfaceRewritePattern(context, benefit), statistics(statistics) {}
-
-  LogicalResult matchAndRewrite(PromotableAllocationOpInterface allocator,
-                                PatternRewriter &rewriter) const override;
-
-private:
-  Mem2RegStatistics statistics;
-};
-
 /// Attempts to promote the memory slots of the provided allocators. Succeeds if
 /// at least one memory slot was promoted.
 LogicalResult
diff --git a/mlir/include/mlir/Transforms/SROA.h b/mlir/include/mlir/Transforms/SROA.h
index 0b3e72400c2873..1af1fe930723f1 100644
--- a/mlir/include/mlir/Transforms/SROA.h
+++ b/mlir/include/mlir/Transforms/SROA.h
@@ -9,11 +9,9 @@
 #ifndef MLIR_TRANSFORMS_SROA_H
 #define MLIR_TRANSFORMS_SROA_H
 
-#include "mlir/IR/PatternMatch.h"
 #include "mlir/Interfaces/MemorySlotInterfaces.h"
 #include "mlir/Support/LogicalResult.h"
 #include "llvm/ADT/Statistic.h"
-#include <variant>
 
 namespace mlir {
 
@@ -29,24 +27,6 @@ struct SROAStatistics {
   llvm::Statistic *maxSubelementAmount = nullptr;
 };
 
-/// Pattern applying SROA to the regions of the operations on which it
-/// matches.
-class SROAPattern
-    : public OpInterfaceRewritePattern<DestructurableAllocationOpInterface> {
-public:
-  using OpInterfaceRewritePattern::OpInterfaceRewritePattern;
-
-  SROAPattern(MLIRContext *context, SROAStatistics statistics = {},
-              PatternBenefit benefit = 1)
-      : OpInterfaceRewritePattern(context, benefit), statistics(statistics) {}
-
-  LogicalResult matchAndRewrite(DestructurableAllocationOpInterface allocator,
-                                PatternRewriter &rewriter) const override;
-
-private:
-  SROAStatistics statistics;
-};
-
 /// Attempts to destructure the slots of destructurable allocators. Returns
 /// failure if no slot was destructured.
 LogicalResult tryToDestructureMemorySlots(
diff --git a/mlir/lib/CAPI/Dialect/LLVM.cpp b/mlir/lib/CAPI/Dialect/LLVM.cpp
index 2d938ce5f4834c..d0fd5ceecfff5f 100644
--- a/mlir/lib/CAPI/Dialect/LLVM.cpp
+++ b/mlir/lib/CAPI/Dialect/LLVM.cpp
@@ -152,18 +152,18 @@ MlirAttribute mlirLLVMDIBasicTypeAttrGet(MlirContext ctx, unsigned int tag,
 }
 
 MlirAttribute mlirLLVMDICompositeTypeAttrGet(
-    MlirContext ctx, unsigned int tag, MlirAttribute name, MlirAttribute file,
-    uint32_t line, MlirAttribute scope, MlirAttribute baseType, int64_t flags,
-    uint64_t sizeInBits, uint64_t alignInBits, intptr_t nElements,
-    MlirAttribute const *elements) {
+    MlirContext ctx, unsigned int tag, MlirAttribute recId, MlirAttribute name,
+    MlirAttribute file, uint32_t line, MlirAttribute scope,
+    MlirAttribute baseType, int64_t flags, uint64_t sizeInBits,
+    uint64_t alignInBits, intptr_t nElements, MlirAttribute const *elements) {
   SmallVector<Attribute> elementsStorage;
   elementsStorage.reserve(nElements);
 
   return wrap(DICompositeTypeAttr::get(
-      unwrap(ctx), tag, cast<StringAttr>(unwrap(name)),
-      cast<DIFileAttr>(unwrap(file)), line, cast<DIScopeAttr>(unwrap(scope)),
-      cast<DITypeAttr>(unwrap(baseType)), DIFlags(flags), sizeInBits,
-      alignInBits,
+      unwrap(ctx), tag, cast<DistinctAttr>(unwrap(recId)),
+      cast<StringAttr>(unwrap(name)), cast<DIFileAttr>(unwrap(file)), line,
+      cast<DIScopeAttr>(unwrap(scope)), cast<DITypeAttr>(unwrap(baseType)),
+      DIFlags(flags), sizeInBits, alignInBits,
       llvm::map_to_vector(unwrapList(nElements, elements, elementsStorage),
                           [](Attribute a) { return a.cast<DINodeAttr>(); })));
 }
diff --git a/mlir/lib/Conversion/LinalgToStandard/CMakeLists.txt b/mlir/lib/Conversion/LinalgToStandard/CMakeLists.txt
index 0c8816b65d194e..cbe85789b29a37 100644
--- a/mlir/lib/Conversion/LinalgToStandard/CMakeLists.txt
+++ b/mlir/lib/Conversion/LinalgToStandard/CMakeLists.txt
@@ -14,6 +14,7 @@ add_mlir_conversion_library(MLIRLinalgToStandard
   MLIRFuncDialect
   MLIRIR
   MLIRLinalgDialect
+  MLIRLinalgTransforms
   MLIRMemRefDialect
   MLIRPass
   MLIRSCFDialect
diff --git a/mlir/lib/Conversion/MemRefToLLVM/CMakeLists.txt b/mlir/lib/Conversion/MemRefToLLVM/CMakeLists.txt
index 1400618c93e850..f0d95f5ada290e 100644
--- a/mlir/lib/Conversion/MemRefToLLVM/CMakeLists.txt
+++ b/mlir/lib/Conversion/MemRefToLLVM/CMakeLists.txt
@@ -14,6 +14,7 @@ add_mlir_conversion_library(MLIRMemRefToLLVM
   LINK_LIBS PUBLIC
   MLIRAnalysis
   MLIRDataLayoutInterfaces
+  MLIRFuncDialect
   MLIRLLVMCommonConversion
   MLIRMemRefDialect
   MLIRMemRefUtils
diff --git a/mlir/lib/Dialect/Affine/Analysis/AffineAnalysis.cpp b/mlir/lib/Dialect/Affine/Analysis/AffineAnalysis.cpp
index 1ba0bc8b6bfbe5..61244921bc38ac 100644
--- a/mlir/lib/Dialect/Affine/Analysis/AffineAnalysis.cpp
+++ b/mlir/lib/Dialect/Affine/Analysis/AffineAnalysis.cpp
@@ -17,7 +17,6 @@
 #include "mlir/Dialect/Affine/Analysis/Utils.h"
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/Affine/IR/AffineValueMap.h"
-#include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/IR/AffineExprVisitor.h"
 #include "mlir/IR/BuiltinOps.h"
 #include "mlir/IR/IntegerSet.h"
diff --git a/mlir/lib/Dialect/Affine/IR/AffineOps.cpp b/mlir/lib/Dialect/Affine/IR/AffineOps.cpp
index 8a070d25639870..c591e5056480ca 100644
--- a/mlir/lib/Dialect/Affine/IR/AffineOps.cpp
+++ b/mlir/lib/Dialect/Affine/IR/AffineOps.cpp
@@ -17,6 +17,7 @@
 #include "mlir/IR/OpDefinition.h"
 #include "mlir/IR/PatternMatch.h"
 #include "mlir/Interfaces/ShapedOpInterfaces.h"
+#include "mlir/Interfaces/ValueBoundsOpInterface.h"
 #include "mlir/Support/MathExtras.h"
 #include "mlir/Transforms/InliningUtils.h"
 #include "llvm/ADT/ScopeExit.h"
@@ -220,6 +221,8 @@ void AffineDialect::initialize() {
 #include "mlir/Dialect/Affine/IR/AffineOps.cpp.inc"
                 >();
   addInterfaces<AffineInlinerInterface>();
+  declarePromisedInterfaces<ValueBoundsOpInterface, AffineApplyOp, AffineMaxOp,
+                            AffineMinOp>();
 }
 
 /// Materialize a single constant operation from a given attribute value with
diff --git a/mlir/lib/Dialect/Arith/IR/ArithDialect.cpp b/mlir/lib/Dialect/Arith/IR/ArithDialect.cpp
index 745c5706a838ce..6a593185ccedce 100644
--- a/mlir/lib/Dialect/Arith/IR/ArithDialect.cpp
+++ b/mlir/lib/Dialect/Arith/IR/ArithDialect.cpp
@@ -8,9 +8,12 @@
 
 #include "mlir/Conversion/ConvertToLLVM/ToLLVMInterface.h"
 #include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/Bufferization/IR/BufferDeallocationOpInterface.h"
+#include "mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h"
 #include "mlir/Dialect/UB/IR/UBOps.h"
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/DialectImplementation.h"
+#include "mlir/Interfaces/ValueBoundsOpInterface.h"
 #include "mlir/Transforms/InliningUtils.h"
 #include "llvm/ADT/TypeSwitch.h"
 
@@ -46,6 +49,12 @@ void arith::ArithDialect::initialize() {
       >();
   addInterfaces<ArithInlinerInterface>();
   declarePromisedInterface<ArithDialect, ConvertToLLVMPatternInterface>();
+  declarePromisedInterface<SelectOp,
+                           bufferization::BufferDeallocationOpInterface>();
+  declarePromisedInterfaces<bufferization::BufferizableOpInterface, ConstantOp,
+                            IndexCastOp, SelectOp>();
+  declarePromisedInterfaces<ValueBoundsOpInterface, AddIOp, ConstantOp, SubIOp,
+                            MulIOp>();
 }
 
 /// Materialize an integer or floating point constant.
diff --git a/mlir/lib/Dialect/ArmNeon/Transforms/LowerContractionToSMMLAPattern.cpp b/mlir/lib/Dialect/ArmNeon/Transforms/LowerContractionToSMMLAPattern.cpp
index 47c84708f3c38b..1f48d27aa27b17 100644
--- a/mlir/lib/Dialect/ArmNeon/Transforms/LowerContractionToSMMLAPattern.cpp
+++ b/mlir/lib/Dialect/ArmNeon/Transforms/LowerContractionToSMMLAPattern.cpp
@@ -16,7 +16,9 @@
 #include "mlir/Dialect/ArmNeon/Transforms.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
+#include "mlir/Dialect/Utils/IndexingUtils.h"
 #include "mlir/Dialect/Vector/IR/VectorOps.h"
+#include "mlir/IR/AffineMap.h"
 #include "mlir/IR/PatternMatch.h"
 #include "mlir/Support/LogicalResult.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
@@ -36,8 +38,10 @@ static Type matchContainerType(Type element, Type container) {
   return element;
 }
 
-/// Lowering from a single vector::contractOp directly to the arm neon smmla
-/// intrinsic. The shapes of the contract and intrinsic must match.
+/// Lowering from a vector::contractOp arm neon smmla intrinsic. This will tile
+/// any vector.contract into multiple smmla instructions with unrolling so long
+/// as [2,2,8] is a divisor of its shape. If no unrolling is necessary, a single
+/// smmla instruction is emitted.
 class LowerContractionToSMMLAPattern
     : public OpRewritePattern<vector::ContractionOp> {
 public:
@@ -45,10 +49,6 @@ class LowerContractionToSMMLAPattern
   LogicalResult matchAndRewrite(vector::ContractionOp op,
                                 PatternRewriter &rewriter) const override {
     Location loc = op.getLoc();
-    Value lhs = op.getLhs();
-    Value rhs = op.getRhs();
-    Value res = op.getAcc();
-
     // Check index maps that represent M N K in contract.
     auto indexingMaps = op.getIndexingMapsArray();
     if (llvm::any_of(indexingMaps, [](mlir::AffineMap affineMap) {
@@ -57,7 +57,6 @@ class LowerContractionToSMMLAPattern
         })) {
       return failure();
     }
-
     // Check iterator types for contract.
     auto iteratorTypes = op.getIteratorTypesArray();
     if (iteratorTypes.size() != 3 ||
@@ -66,22 +65,24 @@ class LowerContractionToSMMLAPattern
         iteratorTypes[2] != vector::IteratorType::reduction) {
       return failure();
     }
-
-    // Check the tile size by mapping the dimensions of the contract.
+    // Infer tile sizes from operands; Note: RHS is not transposed.
     mlir::VectorType lhsType = op.getLhsType();
     mlir::VectorType rhsType = op.getRhsType();
     auto dimM = lhsType.getDimSize(0);
     auto dimN = rhsType.getDimSize(0);
     auto dimK = lhsType.getDimSize(1);
-    if (rhsType.getDimSize(1) != dimK || dimM != 2 || dimN != 2 || dimK != 8) {
+
+    // Unrolling patterns can handle any [2, 2, 8] shaped multiple of inputs for
+    // tiling.
+    if (dimM % 2 != 0 || dimN % 2 != 0 || dimK % 8 != 0) {
       return failure();
     }
 
     // Check two extsi inputs Rhs Lhs for contract.
     arith::ExtSIOp origLhsExtOp =
-        dyn_cast_or_null<arith::ExtSIOp>(lhs.getDefiningOp());
+        dyn_cast_or_null<arith::ExtSIOp>(op.getLhs().getDefiningOp());
     arith::ExtSIOp origRhsExtOp =
-        dyn_cast_or_null<arith::ExtSIOp>(rhs.getDefiningOp());
+        dyn_cast_or_null<arith::ExtSIOp>(op.getRhs().getDefiningOp());
     if (!origLhsExtOp || !origRhsExtOp) {
       return failure();
     }
@@ -113,26 +114,73 @@ class LowerContractionToSMMLAPattern
       return failure();
     }
 
-    // Collapse to 1D vectors required by smmla intrinsic
-    auto collapsedInputType = VectorType::get(
-        {16}, extsiLhs.getType().cast<ShapedType>().getElementType());
-    auto collapsedOutputType =
-        VectorType::get({4}, res.getType().cast<ShapedType>().getElementType());
-    auto collapsedLhs = rewriter.createOrFold<vector::ShapeCastOp>(
-        extsiLhs.getLoc(), collapsedInputType, extsiLhs);
-    auto collapsedRhs = rewriter.createOrFold<vector::ShapeCastOp>(
-        extsiRhs.getLoc(), collapsedInputType, extsiRhs);
-    auto collapsedRes = rewriter.createOrFold<vector::ShapeCastOp>(
-        res.getLoc(), collapsedOutputType, res);
-
-    // Replace the contract with a neon op
-    auto smmlaOp = rewriter.createOrFold<arm_neon::SmmlaOp>(
-        op.getLoc(), collapsedRes.getType(), collapsedRes, collapsedLhs,
-        collapsedRhs);
-
-    // Reshape output back to 2D
-    rewriter.replaceOpWithNewOp<vector::ShapeCastOp>(op, op.getResultType(),
-                                                     smmlaOp);
+    // Initial accumulator for the final result. This is the un-tiled result if
+    // tiling is done.
+    Value result = rewriter.create<arith::ConstantOp>(
+        loc, op.getResultType(), rewriter.getZeroAttr(op.getResultType()));
+
+    SmallVector<int64_t> unrolledSize = *op.getShapeForUnroll();
+    SmallVector<int64_t> smmlaShape{2, 2, 8};
+    SmallVector<int64_t> loopOrder{0, 1, 2};
+    for (SmallVector<int64_t> offsets :
+         StaticTileOffsetRange(unrolledSize, smmlaShape, loopOrder)) {
+
+      // Helper to compute the new shape of each operand and extract the slice.
+      auto extractOperand = [&](Value operand, AffineMap permutationMap,
+                                ArrayRef<int64_t> operandOffsets) {
+        SmallVector<int64_t> operandShape =
+            applyPermutationMap(permutationMap, ArrayRef<int64_t>(smmlaShape));
+        SmallVector<int64_t> operandStrides(operandOffsets.size(), 1);
+        return rewriter.createOrFold<vector::ExtractStridedSliceOp>(
+            loc, operand, operandOffsets, operandShape, operandStrides);
+      };
+
+      // Extract tiled lhs, rhs, and acc
+      AffineMap lhsPermutationMap = op.getIndexingMapsArray()[0];
+      SmallVector<int64_t> lhsOffsets =
+          applyPermutationMap(lhsPermutationMap, ArrayRef<int64_t>(offsets));
+      Value tiledLhs = extractOperand(extsiLhs, lhsPermutationMap, lhsOffsets);
+      AffineMap rhsPermutationMap = op.getIndexingMapsArray()[1];
+      SmallVector<int64_t> rhsOffsets =
+          applyPermutationMap(rhsPermutationMap, ArrayRef<int64_t>(offsets));
+      Value tiledRhs = extractOperand(extsiRhs, rhsPermutationMap, rhsOffsets);
+      AffineMap accPermutationMap = op.getIndexingMapsArray()[2];
+      SmallVector<int64_t> accOffsets =
+          applyPermutationMap(accPermutationMap, ArrayRef<int64_t>(offsets));
+      Value tiledAcc =
+          extractOperand(op.getAcc(), accPermutationMap, accOffsets);
+
+      // Collapse tiled operands to 1D vectors required by smmla intrinsic
+      auto collapsedInputType = VectorType::get(
+          tiledLhs.getType().cast<ShapedType>().getNumElements(),
+          tiledLhs.getType().cast<ShapedType>().getElementType());
+      auto collapsedOutputType = VectorType::get(
+          {4}, tiledAcc.getType().cast<ShapedType>().getElementType());
+      auto collapsedLhs = rewriter.createOrFold<vector::ShapeCastOp>(
+          tiledLhs.getLoc(), collapsedInputType, tiledLhs);
+      auto collapsedRhs = rewriter.createOrFold<vector::ShapeCastOp>(
+          tiledRhs.getLoc(), collapsedInputType, tiledRhs);
+      auto collapsedRes = rewriter.createOrFold<vector::ShapeCastOp>(
+          tiledAcc.getLoc(), collapsedOutputType, tiledAcc);
+
+      // Insert contract op
+      auto smmlaOp = rewriter.createOrFold<arm_neon::SmmlaOp>(
+          op.getLoc(), collapsedRes.getType(), collapsedRes, collapsedLhs,
+          collapsedRhs);
+
+      // Reshape output back to 2D
+      Value tiledRes = rewriter.createOrFold<vector::ShapeCastOp>(
+          smmlaOp.getLoc(), tiledAcc.getType(), smmlaOp);
+
+      // Insert the tiled result back into the non tiled result of the
+      // contract op.
+      SmallVector<int64_t> strides(
+          tiledRes.getType().cast<ShapedType>().getRank(), 1);
+      result = rewriter.createOrFold<vector::InsertStridedSliceOp>(
+          loc, tiledRes, result, accOffsets, strides);
+    }
+
+    rewriter.replaceOp(op, result);
     return success();
   }
 };
diff --git a/mlir/lib/Dialect/Bufferization/IR/BufferizableOpInterface.cpp b/mlir/lib/Dialect/Bufferization/IR/BufferizableOpInterface.cpp
index 8f0f6d1fcc8490..55c9299c58effd 100644
--- a/mlir/lib/Dialect/Bufferization/IR/BufferizableOpInterface.cpp
+++ b/mlir/lib/Dialect/Bufferization/IR/BufferizableOpInterface.cpp
@@ -344,11 +344,11 @@ bool BufferizationOptions::isOpAllowed(Operation *op) const {
 
 BufferizableOpInterface
 BufferizationOptions::dynCastBufferizableOp(Operation *op) const {
+  if (!isOpAllowed(op))
+    return nullptr;
   auto bufferizableOp = dyn_cast<BufferizableOpInterface>(op);
   if (!bufferizableOp)
     return nullptr;
-  if (!isOpAllowed(op))
-    return nullptr;
   return bufferizableOp;
 }
 
diff --git a/mlir/lib/Dialect/Bufferization/IR/BufferizationOps.cpp b/mlir/lib/Dialect/Bufferization/IR/BufferizationOps.cpp
index 88b61c4df15b84..9f1295222c3525 100644
--- a/mlir/lib/Dialect/Bufferization/IR/BufferizationOps.cpp
+++ b/mlir/lib/Dialect/Bufferization/IR/BufferizationOps.cpp
@@ -257,16 +257,6 @@ LogicalResult AllocTensorOp::verify() {
            << getType().getNumDynamicDims() << " dynamic sizes";
   if (getCopy() && getCopy().getType() != getType())
     return emitError("expected that `copy` and return type match");
-
-  // For sparse tensor allocation, we require that none of its
-  // uses escapes the function boundary directly.
-  if (sparse_tensor::getSparseTensorEncoding(getType())) {
-    for (auto &use : getOperation()->getUses())
-      if (isa<func::ReturnOp, func::CallOp, func::CallIndirectOp>(
-              use.getOwner()))
-        return emitError("sparse tensor allocation should not escape function");
-  }
-
   return success();
 }
 
diff --git a/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp b/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp
index f0b1c9d0c1630a..5d8c1749272b1b 100644
--- a/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp
+++ b/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp
@@ -454,7 +454,7 @@ LogicalResult bufferization::bufferizeOp(Operation *op,
   // canonicalize away (or canonicalize to more precise layouts).
   SmallVector<Operation *> worklist;
   op->walk<WalkOrder::PostOrder>([&](Operation *op) {
-    if (hasTensorSemantics(op))
+    if (options.isOpAllowed(op) && hasTensorSemantics(op))
       worklist.push_back(op);
   });
 
@@ -473,8 +473,6 @@ LogicalResult bufferization::bufferizeOp(Operation *op,
     auto bufferizableOp = options.dynCastBufferizableOp(nextOp);
     if (!bufferizableOp)
       continue;
-    if (!options.isOpAllowed(nextOp))
-      continue;
     // Skip ops that no longer have tensor semantics.
     if (!hasTensorSemantics(nextOp))
       continue;
diff --git a/mlir/lib/Dialect/ControlFlow/IR/ControlFlowOps.cpp b/mlir/lib/Dialect/ControlFlow/IR/ControlFlowOps.cpp
index d242d75bd51fa7..c6b02b9703e75f 100644
--- a/mlir/lib/Dialect/ControlFlow/IR/ControlFlowOps.cpp
+++ b/mlir/lib/Dialect/ControlFlow/IR/ControlFlowOps.cpp
@@ -10,6 +10,8 @@
 
 #include "mlir/Conversion/ConvertToLLVM/ToLLVMInterface.h"
 #include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/Bufferization/IR/BufferDeallocationOpInterface.h"
+#include "mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h"
 #include "mlir/IR/AffineExpr.h"
 #include "mlir/IR/AffineMap.h"
 #include "mlir/IR/Builders.h"
@@ -69,6 +71,10 @@ void ControlFlowDialect::initialize() {
       >();
   addInterfaces<ControlFlowInlinerInterface>();
   declarePromisedInterface<ControlFlowDialect, ConvertToLLVMPatternInterface>();
+  declarePromisedInterfaces<bufferization::BufferizableOpInterface, BranchOp,
+                            CondBranchOp>();
+  declarePromisedInterface<CondBranchOp,
+                           bufferization::BufferDeallocationOpInterface>();
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Dialect/Func/IR/FuncOps.cpp b/mlir/lib/Dialect/Func/IR/FuncOps.cpp
index d18ec279e85c04..ed2ecfe9d0fb51 100644
--- a/mlir/lib/Dialect/Func/IR/FuncOps.cpp
+++ b/mlir/lib/Dialect/Func/IR/FuncOps.cpp
@@ -9,6 +9,7 @@
 #include "mlir/Dialect/Func/IR/FuncOps.h"
 
 #include "mlir/Conversion/ConvertToLLVM/ToLLVMInterface.h"
+#include "mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h"
 #include "mlir/IR/BuiltinOps.h"
 #include "mlir/IR/BuiltinTypes.h"
 #include "mlir/IR/IRMapping.h"
@@ -43,6 +44,8 @@ void FuncDialect::initialize() {
       >();
   declarePromisedInterface<FuncDialect, DialectInlinerInterface>();
   declarePromisedInterface<FuncDialect, ConvertToLLVMPatternInterface>();
+  declarePromisedInterfaces<bufferization::BufferizableOpInterface, CallOp,
+                            FuncOp, ReturnOp>();
 }
 
 /// Materialize a single constant operation from a given attribute value with
diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
index 33ce5c159db4f9..a02eca8b11790c 100644
--- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
+++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
@@ -13,6 +13,7 @@
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
 
 #include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/Bufferization/IR/BufferDeallocationOpInterface.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
 #include "mlir/IR/Attributes.h"
 #include "mlir/IR/Builders.h"
@@ -215,6 +216,8 @@ void GPUDialect::initialize() {
 #include "mlir/Dialect/GPU/IR/GPUOpsAttributes.cpp.inc"
       >();
   addInterfaces<GPUInlinerInterface>();
+  declarePromisedInterface<TerminatorOp,
+                           bufferization::BufferDeallocationOpInterface>();
 }
 
 static std::string getSparseHandleKeyword(SparseHandleKind kind) {
diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMAttrs.cpp b/mlir/lib/Dialect/LLVMIR/IR/LLVMAttrs.cpp
index 645a45dd96befb..a44e83313c9c1f 100644
--- a/mlir/lib/Dialect/LLVMIR/IR/LLVMAttrs.cpp
+++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMAttrs.cpp
@@ -35,6 +35,7 @@ static LogicalResult parseExpressionArg(AsmParser &parser, uint64_t opcode,
 static void printExpressionArg(AsmPrinter &printer, uint64_t opcode,
                                ArrayRef<uint64_t> args);
 
+#include "mlir/Dialect/LLVMIR/LLVMAttrInterfaces.cpp.inc"
 #include "mlir/Dialect/LLVMIR/LLVMOpsEnums.cpp.inc"
 #define GET_ATTRDEF_CLASSES
 #include "mlir/Dialect/LLVMIR/LLVMOpsAttrDefs.cpp.inc"
@@ -185,6 +186,24 @@ void printExpressionArg(AsmPrinter &printer, uint64_t opcode,
   });
 }
 
+//===----------------------------------------------------------------------===//
+// DICompositeTypeAttr
+//===----------------------------------------------------------------------===//
+
+DIRecursiveTypeAttrInterface
+DICompositeTypeAttr::withRecId(DistinctAttr recId) {
+  return DICompositeTypeAttr::get(getContext(), getTag(), recId, getName(),
+                                  getFile(), getLine(), getScope(),
+                                  getBaseType(), getFlags(), getSizeInBits(),
+                                  getAlignInBits(), getElements());
+}
+
+DIRecursiveTypeAttrInterface
+DICompositeTypeAttr::getRecSelf(DistinctAttr recId) {
+  return DICompositeTypeAttr::get(recId.getContext(), 0, recId, {}, {}, 0, {},
+                                  {}, DIFlags(), 0, 0, {});
+}
+
 //===----------------------------------------------------------------------===//
 // TargetFeaturesAttr
 //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMMemorySlot.cpp b/mlir/lib/Dialect/LLVMIR/IR/LLVMMemorySlot.cpp
index bf24194d03ddb2..81766b307d3d78 100644
--- a/mlir/lib/Dialect/LLVMIR/IR/LLVMMemorySlot.cpp
+++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMMemorySlot.cpp
@@ -344,6 +344,10 @@ LogicalResult LLVM::GEPOp::ensureOnlySafeAccesses(
     return failure();
   if (!isFirstIndexZero(*this))
     return failure();
+  // Dynamic indices can be out-of-bounds (even negative), so an access with
+  // dynamic indices can never be considered safe.
+  if (!getDynamicIndices().empty())
+    return failure();
   Type reachedType = getResultPtrElementType();
   if (!reachedType)
     return failure();
@@ -362,6 +366,10 @@ bool LLVM::GEPOp::canRewire(const DestructurableMemorySlot &slot,
     return false;
   if (!isFirstIndexZero(*this))
     return false;
+  // Dynamic indices can be out-of-bounds (even negative), so an access with
+  // dynamic indices can never be properly rewired.
+  if (!getDynamicIndices().empty())
+    return false;
   Type reachedType = getResultPtrElementType();
   if (!reachedType || getIndices().size() < 2)
     return false;
diff --git a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
index 4780ec09b81b9b..9e8407451a0855 100644
--- a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
+++ b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
@@ -214,8 +214,7 @@ void MmaOp::print(OpAsmPrinter &p) {
   p.printOptionalAttrDict(this->getOperation()->getAttrs(), ignoreAttrNames);
 
   // Print the types of the operands and result.
-  p << " : "
-    << "(";
+  p << " : " << "(";
   llvm::interleaveComma(SmallVector<Type, 3>{frags[0].regs[0].getType(),
                                              frags[1].regs[0].getType(),
                                              frags[2].regs[0].getType()},
@@ -956,9 +955,7 @@ std::string NVVM::WgmmaMmaAsyncOp::getPtx() {
   ss << "},";
   // Need to map read/write registers correctly.
   regCnt = (regCnt * 2);
-  ss << " $" << (regCnt) << ","
-     << " $" << (regCnt + 1) << ","
-     << " p";
+  ss << " $" << (regCnt) << "," << " $" << (regCnt + 1) << "," << " p";
   if (getTypeD() != WGMMATypes::s32) {
     ss << ", $" << (regCnt + 3) << ",  $" << (regCnt + 4);
   }
diff --git a/mlir/lib/Dialect/Linalg/IR/LinalgDialect.cpp b/mlir/lib/Dialect/Linalg/IR/LinalgDialect.cpp
index 027058d4de6328..a6936fde43709d 100644
--- a/mlir/lib/Dialect/Linalg/IR/LinalgDialect.cpp
+++ b/mlir/lib/Dialect/Linalg/IR/LinalgDialect.cpp
@@ -21,7 +21,10 @@
 #include "mlir/IR/BuiltinTypes.h"
 #include "mlir/IR/Dialect.h"
 #include "mlir/IR/DialectImplementation.h"
+#include "mlir/Interfaces/DestinationStyleOpInterface.h"
 #include "mlir/Interfaces/FunctionInterfaces.h"
+#include "mlir/Interfaces/SubsetOpInterface.h"
+#include "mlir/Interfaces/ValueBoundsOpInterface.h"
 #include "mlir/Parser/Parser.h"
 #include "mlir/Support/LLVM.h"
 #include "mlir/Transforms/InliningUtils.h"
@@ -123,6 +126,23 @@ void mlir::linalg::LinalgDialect::initialize() {
   declarePromisedInterface<GenericOp, mesh::ShardingInterface>();
   declarePromisedInterfaces<mesh::ShardingInterface,
 #define GET_OP_LIST
+#include "mlir/Dialect/Linalg/IR/LinalgStructuredOps.cpp.inc"
+                            >();
+  declarePromisedInterface<CopyOp, SubsetOpInterface>();
+  declarePromisedInterface<CopyOp, SubsetInsertionOpInterface>();
+  declarePromisedInterface<IndexOp, ValueBoundsOpInterface>();
+  declarePromisedInterface<linalg::GenericOp, TilingInterface>();
+  declarePromisedInterface<linalg::GenericOp, PartialReductionOpInterface>();
+  declarePromisedInterfaces<TilingInterface,
+#define GET_OP_LIST
+#include "mlir/Dialect/Linalg/IR/LinalgStructuredOps.cpp.inc"
+                            >();
+  declarePromisedInterfaces<PartialReductionOpInterface,
+#define GET_OP_LIST
+#include "mlir/Dialect/Linalg/IR/LinalgStructuredOps.cpp.inc"
+                            >();
+  declarePromisedInterfaces<bufferization::BufferizableOpInterface,
+#define GET_OP_LIST
 #include "mlir/Dialect/Linalg/IR/LinalgStructuredOps.cpp.inc"
                             >();
 }
diff --git a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
index 1e703dacfd0c75..c74ab1e6448bec 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
@@ -24,6 +24,7 @@
 #include "mlir/Dialect/Utils/StructuredOpsUtils.h"
 #include "mlir/Dialect/Vector/IR/VectorOps.h"
 #include "mlir/Dialect/Vector/Interfaces/MaskableOpInterface.h"
+#include "mlir/Dialect/Vector/Utils/VectorUtils.h"
 #include "mlir/IR/AffineExpr.h"
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/BuiltinTypeInterfaces.h"
@@ -54,6 +55,8 @@ using namespace mlir::linalg;
 /// Try to vectorize `convOp` as a convolution.
 static FailureOr<Operation *>
 vectorizeConvolution(RewriterBase &rewriter, LinalgOp convOp,
+                     ArrayRef<int64_t> inputVecSizes = {},
+                     ArrayRef<bool> inputVecScalableFlags = {},
                      bool flatten1DDepthwiseConv = false);
 
 /// Return the unique instance of OpType in `block` if it is indeed unique.
@@ -1712,7 +1715,40 @@ static LogicalResult reductionPreconditions(LinalgOp op) {
   return success();
 }
 
-static LogicalResult vectorizeDynamicLinalgOpPrecondition(linalg::LinalgOp op) {
+static LogicalResult
+vectorizeDynamicConvOpPrecondition(linalg::LinalgOp conv,
+                                   bool flatten1DDepthwiseConv) {
+  if (flatten1DDepthwiseConv) {
+    LDBG("Vectorization of flattened convs with dynamic shapes is not "
+         "supported\n");
+    return failure();
+  }
+
+  if (!isa<linalg::DepthwiseConv1DNwcWcOp>(conv)) {
+    LDBG("Not a 1D depth-wise WC conv, dynamic shapes are not supported\n");
+    return failure();
+  }
+
+  // Support dynamic shapes in 1D depthwise convolution, but only in the
+  // _channel_ dimension.
+  Value lhs = conv.getDpsInputOperand(0)->get();
+  ArrayRef<int64_t> lhsShape = cast<ShapedType>(lhs.getType()).getShape();
+  auto shapeWithoutCh = lhsShape.drop_back(1);
+  if (ShapedType::isDynamicShape(shapeWithoutCh)) {
+    LDBG("Dynamically-shaped op vectorization precondition failed: only "
+         "channel dim can be dynamic\n");
+    return failure();
+  }
+
+  return success();
+}
+
+static LogicalResult
+vectorizeDynamicLinalgOpPrecondition(linalg::LinalgOp op,
+                                     bool flatten1DDepthwiseConv) {
+  if (isa<ConvolutionOpInterface>(op.getOperation()))
+    return vectorizeDynamicConvOpPrecondition(op, flatten1DDepthwiseConv);
+
   // TODO: Masking only supports dynamic element-wise ops, linalg.generic ops,
   // linalg.copy ops and ops that implement ContractionOpInterface for now.
   if (!isElementwise(op) &&
@@ -1778,10 +1814,9 @@ vectorizeUnPackOpPrecondition(tensor::UnPackOp unpackOp,
   return success();
 }
 
-static LogicalResult
-vectorizeLinalgOpPrecondition(LinalgOp linalgOp,
-                              ArrayRef<int64_t> inputVectorSizes,
-                              bool vectorizeNDExtract) {
+static LogicalResult vectorizeLinalgOpPrecondition(
+    LinalgOp linalgOp, ArrayRef<int64_t> inputVectorSizes,
+    bool vectorizeNDExtract, bool flatten1DDepthwiseConv) {
   // tensor with dimension of 0 cannot be vectorized.
   if (llvm::is_contained(linalgOp.getStaticShape(), 0))
     return failure();
@@ -1791,8 +1826,8 @@ vectorizeLinalgOpPrecondition(LinalgOp linalgOp,
                                       inputVectorSizes)))
     return failure();
 
-  if (linalgOp.hasDynamicShape() &&
-      failed(vectorizeDynamicLinalgOpPrecondition(linalgOp))) {
+  if (linalgOp.hasDynamicShape() && failed(vectorizeDynamicLinalgOpPrecondition(
+                                        linalgOp, flatten1DDepthwiseConv))) {
     LDBG("Dynamically-shaped op failed vectorization pre-conditions\n");
     return failure();
   }
@@ -1911,14 +1946,17 @@ vectorizeScalableVectorPrecondition(Operation *op,
   if (!isScalable)
     return success();
 
-  // Only element-wise ops supported in the presence of scalable dims.
+  // Only element-wise and 1d depthwise conv ops supported in the presence of
+  // scalable dims.
   auto linalgOp = dyn_cast<LinalgOp>(op);
-  return success(linalgOp && isElementwise(linalgOp));
+  return success(linalgOp && (isElementwise(linalgOp) ||
+                              isa<linalg::DepthwiseConv1DNwcWcOp>(op)));
 }
 
 LogicalResult mlir::linalg::vectorizeOpPrecondition(
     Operation *op, ArrayRef<int64_t> inputVectorSizes,
-    ArrayRef<bool> inputScalableVecDims, bool vectorizeNDExtract) {
+    ArrayRef<bool> inputScalableVecDims, bool vectorizeNDExtract,
+    bool flatten1DDepthwiseConv) {
   if (failed(vectorizeScalableVectorPrecondition(op, inputVectorSizes,
                                                  inputScalableVecDims)))
     return failure();
@@ -1926,7 +1964,8 @@ LogicalResult mlir::linalg::vectorizeOpPrecondition(
   return TypeSwitch<Operation *, LogicalResult>(op)
       .Case<linalg::LinalgOp>([&](auto linalgOp) {
         return vectorizeLinalgOpPrecondition(linalgOp, inputVectorSizes,
-                                             vectorizeNDExtract);
+                                             vectorizeNDExtract,
+                                             flatten1DDepthwiseConv);
       })
       .Case<tensor::PadOp>([&](auto padOp) {
         return vectorizePadOpPrecondition(padOp, inputVectorSizes);
@@ -1975,7 +2014,8 @@ LogicalResult mlir::linalg::vectorize(RewriterBase &rewriter, Operation *op,
   LLVM_DEBUG(llvm::dbgs() << "\n");
 
   if (failed(vectorizeOpPrecondition(op, inputVectorSizes, inputScalableVecDims,
-                                     vectorizeNDExtract))) {
+                                     vectorizeNDExtract,
+                                     flatten1DDepthwiseConv))) {
     LDBG("Vectorization pre-conditions failed\n");
     return failure();
   }
@@ -1999,7 +2039,8 @@ LogicalResult mlir::linalg::vectorize(RewriterBase &rewriter, Operation *op,
             // inference.
             if (isa<ConvolutionOpInterface>(linalgOp.getOperation())) {
               FailureOr<Operation *> convOr = vectorizeConvolution(
-                  rewriter, linalgOp, flatten1DDepthwiseConv);
+                  rewriter, linalgOp, inputVectorSizes, inputScalableVecDims,
+                  flatten1DDepthwiseConv);
               if (succeeded(convOr)) {
                 llvm::append_range(results, (*convOr)->getResults());
                 return success();
@@ -3131,13 +3172,30 @@ struct Conv1DGenerator
   /// kw is always unrolled.
   /// TODO: w (resp. kw) is unrolled when the strideW ( resp. dilationW) is
   /// > 1.
-  FailureOr<Operation *> depthwiseConv(bool flatten) {
+  FailureOr<Operation *> depthwiseConv(uint64_t channelDimVecSize,
+                                       bool channelDimScalableFlag,
+                                       bool flatten) {
     if (!valid)
       return rewriter.notifyMatchFailure(op, "unvectorizable depthwise conv");
 
+    bool scalableChDim = false;
+    bool useMasking = false;
     int64_t nSize, wSize, cSize, kwSize;
     // kernel{kw, c}
     bindShapeDims(rhsShapedType, kwSize, cSize);
+    if (ShapedType::isDynamic(cSize)) {
+      assert(channelDimVecSize != 0 && "Channel dim vec size must be > 0");
+      cSize = channelDimVecSize;
+      // Scalable vectors are only used when both conditions are met:
+      //  1. channel dim is dynamic
+      //  2. channelDimScalableFlag is set
+      scalableChDim = channelDimScalableFlag;
+      useMasking = true;
+    }
+
+    assert(!(useMasking && flatten) &&
+           "Unsupported flattened conv with dynamic shapes");
+
     // out{n, w, c}
     bindShapeDims(resShapedType, nSize, wSize);
 
@@ -3158,20 +3216,51 @@ struct Conv1DGenerator
          //   (i.e. 16 convolved with 3 (@stride 1 dilation 1) -> 14)
          ((wSize - 1) * strideW + 1) + ((kwSize - 1) * dilationW + 1) - 1,
          cSize},
-        lhsEltType);
-    VectorType rhsType = VectorType::get({kwSize, cSize}, rhsEltType);
-    VectorType resType = VectorType::get({nSize, wSize, cSize}, resEltType);
+        lhsEltType, /*scalableDims=*/{false, false, scalableChDim});
+    VectorType rhsType =
+        VectorType::get({kwSize, cSize}, rhsEltType,
+                        /*scalableDims=*/{false, scalableChDim});
+    VectorType resType =
+        VectorType::get({nSize, wSize, cSize}, resEltType,
+                        /*scalableDims=*/{false, false, scalableChDim});
+
+    // Masks the input xfer Op along the channel dim, iff the corresponding
+    // scalable flag is set.
+    auto maybeMaskXferOp = [&](ArrayRef<int64_t> maskShape,
+                               ArrayRef<bool> scalableDims,
+                               Operation *opToMask) {
+      if (!useMasking)
+        return opToMask;
+      auto maskType =
+          VectorType::get(maskShape, rewriter.getI1Type(), scalableDims);
+
+      SmallVector<OpFoldResult> mixedDims = vector::getMixedSizesXfer(
+          cast<LinalgOp>(op).hasPureTensorSemantics(), opToMask, rewriter);
+
+      Value maskOp =
+          rewriter.create<vector::CreateMaskOp>(loc, maskType, mixedDims);
+
+      return mlir::vector::maskOperation(rewriter, opToMask, maskOp);
+    };
 
     // Read lhs slice of size {n, w * strideW + kw * dilationW, c} @ [0, 0,
     // 0].
     Value lhs = rewriter.create<vector::TransferReadOp>(
         loc, lhsType, lhsShaped, ValueRange{zero, zero, zero});
+    auto maybeMaskedLhs = maybeMaskXferOp(
+        lhsType.getShape(), lhsType.getScalableDims(), lhs.getDefiningOp());
+
     // Read rhs slice of size {kw, c} @ [0, 0].
     Value rhs = rewriter.create<vector::TransferReadOp>(loc, rhsType, rhsShaped,
                                                         ValueRange{zero, zero});
+    auto maybeMaskedRhs = maybeMaskXferOp(
+        rhsType.getShape(), rhsType.getScalableDims(), rhs.getDefiningOp());
+
     // Read res slice of size {n, w, c} @ [0, 0, 0].
     Value res = rewriter.create<vector::TransferReadOp>(
         loc, resType, resShaped, ValueRange{zero, zero, zero});
+    auto maybeMaskedRes = maybeMaskXferOp(
+        resType.getShape(), resType.getScalableDims(), res.getDefiningOp());
 
     //===------------------------------------------------------------------===//
     // Begin vector-only rewrite part
@@ -3186,7 +3275,7 @@ struct Conv1DGenerator
     for (int64_t kw = 0; kw < kwSize; ++kw) {
       for (int64_t w = 0; w < wSize; w += wSizeStep) {
         lhsVals.push_back(rewriter.create<vector::ExtractStridedSliceOp>(
-            loc, lhs,
+            loc, maybeMaskedLhs->getResult(0),
             /*offsets=*/ArrayRef<int64_t>{0, w * strideW + kw * dilationW, 0},
             inOutSliceSizes, inOutStrides));
       }
@@ -3194,12 +3283,13 @@ struct Conv1DGenerator
     // Extract rhs slice of size {c} @ [kw].
     for (int64_t kw = 0; kw < kwSize; ++kw) {
       rhsVals.push_back(rewriter.create<vector::ExtractOp>(
-          loc, rhs, /*offsets=*/ArrayRef<int64_t>{kw}));
+          loc, maybeMaskedRhs->getResult(0),
+          /*offsets=*/ArrayRef<int64_t>{kw}));
     }
     // Extract res slice: {n, wSizeStep, c} @ [0, w, 0].
     for (int64_t w = 0; w < wSize; w += wSizeStep) {
       resVals.push_back(rewriter.create<vector::ExtractStridedSliceOp>(
-          loc, res,
+          loc, maybeMaskedRes->getResult(0),
           /*offsets=*/ArrayRef<int64_t>{0, w, 0}, inOutSliceSizes,
           inOutStrides));
     }
@@ -3208,10 +3298,15 @@ struct Conv1DGenerator
       return kw * (wSize / wSizeStep) + w;
     };
 
+    // Note - the scalable flags are ignored as flattening combined with
+    // scalable vectorization is not supported.
     auto inOutFlattenSliceSizes =
         SmallVector<int64_t>{nSize, wSizeStep * cSize};
-    auto lhsCastType = VectorType::get(inOutFlattenSliceSizes, lhsEltType);
-    auto resCastType = VectorType::get(inOutFlattenSliceSizes, resEltType);
+    auto lhsTypeAfterFlattening =
+        VectorType::get(inOutFlattenSliceSizes, lhsEltType);
+    auto resTypeAfterFlattening =
+        VectorType::get(inOutFlattenSliceSizes, resEltType);
+
     // Compute contraction: O{n, w, c} += I{n, sw * w + dw * kw, c} * F{c}
     for (int64_t kw = 0; kw < kwSize; ++kw) {
       for (int64_t w = 0; w < wSize; w += wSizeStep) {
@@ -3221,9 +3316,9 @@ struct Conv1DGenerator
           // Flatten the input and output vectors (collapse the channel
           // dimension)
           lhsVal = rewriter.create<vector::ShapeCastOp>(
-              loc, lhsCastType, lhsVals[linearIndex(kw, w)]);
-          resVal = rewriter.create<vector::ShapeCastOp>(loc, resCastType,
-                                                        resVals[w]);
+              loc, lhsTypeAfterFlattening, lhsVals[linearIndex(kw, w)]);
+          resVal = rewriter.create<vector::ShapeCastOp>(
+              loc, resTypeAfterFlattening, resVals[w]);
         }
         resVals[w] = depthwiseConv1dSliceAsMulAcc(rewriter, loc, lhsVal,
                                                   rhsVals[kw], resVal, flatten);
@@ -3248,8 +3343,8 @@ struct Conv1DGenerator
     // Write back res slice: {n, wSizeStep, c} @ [0, w, 0].
     // This does not depend on kw.
     for (int64_t w = 0; w < wSize; w += wSizeStep) {
-      res = rewriter.create<vector::InsertStridedSliceOp>(
-          loc, resVals[w], res,
+      maybeMaskedRes = rewriter.create<vector::InsertStridedSliceOp>(
+          loc, resVals[w], maybeMaskedRes->getResult(0),
           /*offsets=*/ArrayRef<int64_t>{0, w, 0},
           /*strides=*/ArrayRef<int64_t>{1, 1, 1});
     }
@@ -3258,10 +3353,11 @@ struct Conv1DGenerator
     //===------------------------------------------------------------------===//
 
     // Write back res slice of size {n, w, c} @ [0, 0, 0].
-    return rewriter
-        .create<vector::TransferWriteOp>(loc, res, resShaped,
-                                         ValueRange{zero, zero, zero})
-        .getOperation();
+    Operation *resOut = rewriter.create<vector::TransferWriteOp>(
+        loc, maybeMaskedRes->getResult(0), resShaped,
+        ValueRange{zero, zero, zero});
+    return maybeMaskXferOp(resType.getShape(), resType.getScalableDims(),
+                           resOut);
   }
 
   /// Lower:
@@ -3278,6 +3374,10 @@ struct Conv1DGenerator
     lhs = promote(rewriter, loc, lhs, resTy);
 
     if (flatten) {
+      // NOTE: This following logic won't work for scalable vectors. For this
+      // reason, "flattening" is not supported when shapes are dynamic (this
+      // should be captured by one of the pre-conditions).
+
       // There are two options for handling the filter:
       //  * shape_cast(broadcast(filter))
       //  * broadcast(shuffle(filter))
@@ -3399,7 +3499,9 @@ struct Conv1DGenerator
 
   /// Entry point that transposes into the common form:
   ///   {{n, strideW * w + dilationW * kw, c}, {kw, c}, {n, w, c}}
-  FailureOr<Operation *> generateDilatedConv(bool flatten = false) {
+  FailureOr<Operation *> generateDilatedConv(uint64_t vecChDimSize = 0,
+                                             bool vecChDimScalableFlag = false,
+                                             bool flatten = false) {
     AffineExpr n, w, c, kw;
     bindDims(ctx, n, w, c, kw);
     if (!iters({Par(), Par(), Par(), Red()}))
@@ -3410,7 +3512,7 @@ struct Conv1DGenerator
     if (layout({/*lhsIndex*/ {n, strideW * w + dilationW * kw, c},
                 /*rhsIndex*/ {kw, c},
                 /*resIndex*/ {n, w, c}}))
-      return depthwiseConv(flatten);
+      return depthwiseConv(vecChDimSize, vecChDimScalableFlag, flatten);
 
     return rewriter.notifyMatchFailure(op, "not a depthwise::Nwc layout");
   }
@@ -3475,9 +3577,9 @@ struct Conv1DGenerator
 
 /// Helper function to vectorize a LinalgOp with convolution semantics.
 // TODO: extend the generic vectorization to support windows and drop this.
-static FailureOr<Operation *>
-vectorizeConvolution(RewriterBase &rewriter, LinalgOp op,
-                     bool flatten1DDepthwiseConv) {
+static FailureOr<Operation *> vectorizeConvolution(
+    RewriterBase &rewriter, LinalgOp op, ArrayRef<int64_t> inputVecSizes,
+    ArrayRef<bool> inputScalableVecDims, bool flatten1DDepthwiseConv) {
   // The ConvolutionOpInterface gives us guarantees of existence for
   // strides/dilations. However, we do not need to rely on those, we can
   // simply use them if present, otherwise use the default and let the generic
@@ -3502,7 +3604,28 @@ vectorizeConvolution(RewriterBase &rewriter, LinalgOp op,
   res = e.generateNcwPooling();
   if (succeeded(res))
     return res;
-  return e.generateDilatedConv(flatten1DDepthwiseConv);
+
+  // Only depthwise 1D NWC convs are left - these can be vectorized using masks
+  // and scalable vectors. Note that ATM the only dim that can be dynamic (i.e.
+  // masked/scalable) is the channel dim (i.e. the trailing dim).
+  uint64_t vecChDimSize = ShapedType::kDynamic;
+  bool vecChDimScalableFlag = false;
+  if (!inputVecSizes.empty()) {
+    // Only use the input vector size corresponding to the channel dim. Other
+    // vector dims will be inferred from the Ops.
+    assert((isa<linalg::DepthwiseConv1DNwcWcOp>(*op) ||
+            isa<linalg::DepthwiseConv1DNcwCwOp>(*op)) &&
+           "Not a 1D depthwise conv!");
+    size_t chDimIdx =
+        TypeSwitch<Operation *, size_t>(op)
+            .Case<linalg::DepthwiseConv1DNwcWcOp>([](auto conv) { return 2; })
+            .Case<linalg::DepthwiseConv1DNcwCwOp>([](auto conv) { return 1; });
+
+    vecChDimSize = inputVecSizes[chDimIdx];
+    vecChDimScalableFlag = inputScalableVecDims[chDimIdx];
+  }
+  return e.generateDilatedConv(vecChDimSize, vecChDimScalableFlag,
+                               flatten1DDepthwiseConv);
 }
 
 struct VectorizeConvolution : public OpInterfaceRewritePattern<LinalgOp> {
diff --git a/mlir/lib/Dialect/MLProgram/IR/CMakeLists.txt b/mlir/lib/Dialect/MLProgram/IR/CMakeLists.txt
index 90c9c9aa787503..725bb5fd9da9ed 100644
--- a/mlir/lib/Dialect/MLProgram/IR/CMakeLists.txt
+++ b/mlir/lib/Dialect/MLProgram/IR/CMakeLists.txt
@@ -15,5 +15,6 @@ add_mlir_dialect_library(MLIRMLProgramDialect
   MLIRControlFlowInterfaces
   MLIRFunctionInterfaces
   MLIRInferTypeOpInterface
+  MLIRTransforms
   MLIRIR
   )
diff --git a/mlir/lib/Dialect/MLProgram/IR/MLProgramDialect.cpp b/mlir/lib/Dialect/MLProgram/IR/MLProgramDialect.cpp
index 1a8fe208d4099e..bda1032ed98847 100644
--- a/mlir/lib/Dialect/MLProgram/IR/MLProgramDialect.cpp
+++ b/mlir/lib/Dialect/MLProgram/IR/MLProgramDialect.cpp
@@ -8,6 +8,7 @@
 
 #include "mlir/Dialect/MLProgram/IR/MLProgram.h"
 #include "mlir/IR/DialectImplementation.h"
+#include "mlir/Transforms/InliningUtils.h"
 #include "llvm/ADT/TypeSwitch.h"
 
 using namespace mlir;
@@ -24,6 +25,18 @@ using namespace mlir::ml_program;
 #include "mlir/Dialect/MLProgram/IR/MLProgramTypes.cpp.inc"
 
 namespace {
+
+struct MLProgramInlinerInterface : public DialectInlinerInterface {
+  using DialectInlinerInterface::DialectInlinerInterface;
+
+  bool isLegalToInline(Operation *, Region *, bool,
+                       IRMapping &) const override {
+    // We have no specific opinion on whether ops defined in this dialect should
+    // be inlined.
+    return true;
+  }
+};
+
 struct MLProgramOpAsmDialectInterface : public OpAsmDialectInterface {
   using OpAsmDialectInterface::OpAsmDialectInterface;
 
@@ -53,5 +66,5 @@ void ml_program::MLProgramDialect::initialize() {
 #include "mlir/Dialect/MLProgram/IR/MLProgramOps.cpp.inc"
       >();
 
-  addInterfaces<MLProgramOpAsmDialectInterface>();
+  addInterfaces<MLProgramInlinerInterface, MLProgramOpAsmDialectInterface>();
 }
diff --git a/mlir/lib/Dialect/Math/Transforms/ExpandPatterns.cpp b/mlir/lib/Dialect/Math/Transforms/ExpandPatterns.cpp
index 73258991b24b71..b31f9b09fc47da 100644
--- a/mlir/lib/Dialect/Math/Transforms/ExpandPatterns.cpp
+++ b/mlir/lib/Dialect/Math/Transforms/ExpandPatterns.cpp
@@ -91,34 +91,42 @@ static LogicalResult convertCoshOp(math::CoshOp op, PatternRewriter &rewriter) {
 }
 
 /// Expands tanh op into
-///   1) 1-exp^{-2x} / 1+exp^{-2x}, if x => 0
-///   2) exp^{2x}-1 / exp^{2x}+1  , if x < 0
+/// 1-exp^{-2x} / 1+exp^{-2x}
+/// To avoid overflow we exploit the reflection symmetry `tanh(-x) = -tanh(x)`.
+/// We compute a "signs" value which is -1 if input is negative and +1 if input
+/// is positive.  Then multiply the input by this value, guaranteeing that the
+/// result is positive, which also guarantees `exp^{-2x * sign(x)}` is in (0,
+/// 1]. Expand the computation on the input `x * sign(x)`, then multiply the
+/// result by `sign(x)` to retain sign of the real result.
 static LogicalResult convertTanhOp(math::TanhOp op, PatternRewriter &rewriter) {
   auto floatType = op.getOperand().getType();
   Location loc = op.getLoc();
+  Value zero = createFloatConst(loc, floatType, 0.0, rewriter);
   Value one = createFloatConst(loc, floatType, 1.0, rewriter);
-  Value two = createFloatConst(loc, floatType, 2.0, rewriter);
-  Value doubledX = rewriter.create<arith::MulFOp>(loc, op.getOperand(), two);
-
-  // Case 1: tanh(x) = 1-exp^{-2x} / 1+exp^{-2x}
-  Value negDoubledX = rewriter.create<arith::NegFOp>(loc, doubledX);
+  Value negTwo = createFloatConst(loc, floatType, -2.0, rewriter);
+
+  // Compute sign(x) = cast<float_type>(x < 0) * (-2) + 1
+  Value isNegative = rewriter.create<arith::CmpFOp>(
+      loc, arith::CmpFPredicate::OLT, op.getOperand(), zero);
+  Value isNegativeFloat =
+      rewriter.create<arith::UIToFPOp>(loc, floatType, isNegative);
+  Value isNegativeTimesNegTwo =
+      rewriter.create<arith::MulFOp>(loc, isNegativeFloat, negTwo);
+  Value sign = rewriter.create<arith::AddFOp>(loc, isNegativeTimesNegTwo, one);
+
+  // Normalize input to positive value: y = sign(x) * x
+  Value positiveX = rewriter.create<arith::MulFOp>(loc, sign, op.getOperand());
+
+  // Decompose on normalized input
+  Value negDoubledX = rewriter.create<arith::MulFOp>(loc, negTwo, positiveX);
   Value exp2x = rewriter.create<math::ExpOp>(loc, negDoubledX);
   Value dividend = rewriter.create<arith::SubFOp>(loc, one, exp2x);
   Value divisor = rewriter.create<arith::AddFOp>(loc, one, exp2x);
   Value positiveRes = rewriter.create<arith::DivFOp>(loc, dividend, divisor);
 
-  // Case 2: tanh(x) = exp^{2x}-1 / exp^{2x}+1
-  exp2x = rewriter.create<math::ExpOp>(loc, doubledX);
-  dividend = rewriter.create<arith::SubFOp>(loc, exp2x, one);
-  divisor = rewriter.create<arith::AddFOp>(loc, exp2x, one);
-  Value negativeRes = rewriter.create<arith::DivFOp>(loc, dividend, divisor);
+  // Multiply result by sign(x) to retain signs from negative inputs
+  rewriter.replaceOpWithNewOp<arith::MulFOp>(op, sign, positiveRes);
 
-  // tanh(x) = x >= 0 ? positiveRes : negativeRes
-  Value zero = createFloatConst(loc, floatType, 0.0, rewriter);
-  Value cmpRes = rewriter.create<arith::CmpFOp>(loc, arith::CmpFPredicate::OGE,
-                                                op.getOperand(), zero);
-  rewriter.replaceOpWithNewOp<arith::SelectOp>(op, cmpRes, positiveRes,
-                                               negativeRes);
   return success();
 }
 
diff --git a/mlir/lib/Dialect/Math/Transforms/PolynomialApproximation.cpp b/mlir/lib/Dialect/Math/Transforms/PolynomialApproximation.cpp
index 962cb28b7c2ab9..428c1c37c4e8b5 100644
--- a/mlir/lib/Dialect/Math/Transforms/PolynomialApproximation.cpp
+++ b/mlir/lib/Dialect/Math/Transforms/PolynomialApproximation.cpp
@@ -39,14 +39,24 @@ using namespace mlir;
 using namespace mlir::math;
 using namespace mlir::vector;
 
+// Helper to encapsulate a vector's shape (including scalable dims).
+struct VectorShape {
+  ArrayRef<int64_t> sizes;
+  ArrayRef<bool> scalableFlags;
+
+  bool empty() const { return sizes.empty(); }
+};
+
 // Returns vector shape if the type is a vector. Returns an empty shape if it is
 // not a vector.
-static ArrayRef<int64_t> vectorShape(Type type) {
+static VectorShape vectorShape(Type type) {
   auto vectorType = dyn_cast<VectorType>(type);
-  return vectorType ? vectorType.getShape() : ArrayRef<int64_t>();
+  return vectorType
+             ? VectorShape{vectorType.getShape(), vectorType.getScalableDims()}
+             : VectorShape{};
 }
 
-static ArrayRef<int64_t> vectorShape(Value value) {
+static VectorShape vectorShape(Value value) {
   return vectorShape(value.getType());
 }
 
@@ -55,14 +65,16 @@ static ArrayRef<int64_t> vectorShape(Value value) {
 //----------------------------------------------------------------------------//
 
 // Broadcasts scalar type into vector type (iff shape is non-scalar).
-static Type broadcast(Type type, ArrayRef<int64_t> shape) {
+static Type broadcast(Type type, VectorShape shape) {
   assert(!isa<VectorType>(type) && "must be scalar type");
-  return !shape.empty() ? VectorType::get(shape, type) : type;
+  return !shape.empty()
+             ? VectorType::get(shape.sizes, type, shape.scalableFlags)
+             : type;
 }
 
 // Broadcasts scalar value into vector (iff shape is non-scalar).
 static Value broadcast(ImplicitLocOpBuilder &builder, Value value,
-                       ArrayRef<int64_t> shape) {
+                       VectorShape shape) {
   assert(!isa<VectorType>(value.getType()) && "must be scalar value");
   auto type = broadcast(value.getType(), shape);
   return !shape.empty() ? builder.create<BroadcastOp>(type, value) : value;
@@ -215,7 +227,7 @@ static Value clamp(ImplicitLocOpBuilder &builder, Value value, Value lowerBound,
 static std::pair<Value, Value> frexp(ImplicitLocOpBuilder &builder, Value arg,
                                      bool isPositive = false) {
   assert(getElementTypeOrSelf(arg).isF32() && "arg must be f32 type");
-  ArrayRef<int64_t> shape = vectorShape(arg);
+  VectorShape shape = vectorShape(arg);
 
   auto bcast = [&](Value value) -> Value {
     return broadcast(builder, value, shape);
@@ -255,7 +267,7 @@ static std::pair<Value, Value> frexp(ImplicitLocOpBuilder &builder, Value arg,
 // Computes exp2 for an i32 argument.
 static Value exp2I32(ImplicitLocOpBuilder &builder, Value arg) {
   assert(getElementTypeOrSelf(arg).isInteger(32) && "arg must be i32 type");
-  ArrayRef<int64_t> shape = vectorShape(arg);
+  VectorShape shape = vectorShape(arg);
 
   auto bcast = [&](Value value) -> Value {
     return broadcast(builder, value, shape);
@@ -281,7 +293,7 @@ Value makePolynomialCalculation(ImplicitLocOpBuilder &builder,
   Type elementType = getElementTypeOrSelf(x);
   assert((elementType.isF32() || elementType.isF16()) &&
          "x must be f32 or f16 type");
-  ArrayRef<int64_t> shape = vectorShape(x);
+  VectorShape shape = vectorShape(x);
 
   if (coeffs.empty())
     return broadcast(builder, floatCst(builder, 0.0f, elementType), shape);
@@ -379,7 +391,7 @@ AtanApproximation::matchAndRewrite(math::AtanOp op,
   if (!getElementTypeOrSelf(operand).isF32())
     return rewriter.notifyMatchFailure(op, "unsupported operand type");
 
-  ArrayRef<int64_t> shape = vectorShape(op.getOperand());
+  VectorShape shape = vectorShape(op.getOperand());
 
   ImplicitLocOpBuilder builder(op->getLoc(), rewriter);
   Value abs = builder.create<math::AbsFOp>(operand);
@@ -478,7 +490,7 @@ Atan2Approximation::matchAndRewrite(math::Atan2Op op,
     return rewriter.notifyMatchFailure(op, "unsupported operand type");
 
   ImplicitLocOpBuilder builder(op->getLoc(), rewriter);
-  ArrayRef<int64_t> shape = vectorShape(op.getResult());
+  VectorShape shape = vectorShape(op.getResult());
 
   // Compute atan in the valid range.
   auto div = builder.create<arith::DivFOp>(y, x);
@@ -544,7 +556,7 @@ TanhApproximation::matchAndRewrite(math::TanhOp op,
   if (!getElementTypeOrSelf(op.getOperand()).isF32())
     return rewriter.notifyMatchFailure(op, "unsupported operand type");
 
-  ArrayRef<int64_t> shape = vectorShape(op.getOperand());
+  VectorShape shape = vectorShape(op.getOperand());
 
   ImplicitLocOpBuilder builder(op->getLoc(), rewriter);
   auto bcast = [&](Value value) -> Value {
@@ -632,7 +644,7 @@ LogApproximationBase<Op>::logMatchAndRewrite(Op op, PatternRewriter &rewriter,
   if (!getElementTypeOrSelf(op.getOperand()).isF32())
     return rewriter.notifyMatchFailure(op, "unsupported operand type");
 
-  ArrayRef<int64_t> shape = vectorShape(op.getOperand());
+  VectorShape shape = vectorShape(op.getOperand());
 
   ImplicitLocOpBuilder builder(op->getLoc(), rewriter);
   auto bcast = [&](Value value) -> Value {
@@ -779,7 +791,7 @@ Log1pApproximation::matchAndRewrite(math::Log1pOp op,
   if (!getElementTypeOrSelf(op.getOperand()).isF32())
     return rewriter.notifyMatchFailure(op, "unsupported operand type");
 
-  ArrayRef<int64_t> shape = vectorShape(op.getOperand());
+  VectorShape shape = vectorShape(op.getOperand());
 
   ImplicitLocOpBuilder builder(op->getLoc(), rewriter);
   auto bcast = [&](Value value) -> Value {
@@ -829,7 +841,7 @@ ErfPolynomialApproximation::matchAndRewrite(math::ErfOp op,
   if (!(elementType.isF32() || elementType.isF16()))
     return rewriter.notifyMatchFailure(op,
                                        "only f32 and f16 type is supported.");
-  ArrayRef<int64_t> shape = vectorShape(operand);
+  VectorShape shape = vectorShape(operand);
 
   ImplicitLocOpBuilder builder(op->getLoc(), rewriter);
   auto bcast = [&](Value value) -> Value {
@@ -938,9 +950,8 @@ ErfPolynomialApproximation::matchAndRewrite(math::ErfOp op,
 
 namespace {
 
-Value clampWithNormals(ImplicitLocOpBuilder &builder,
-                       const llvm::ArrayRef<int64_t> shape, Value value,
-                       float lowerBound, float upperBound) {
+Value clampWithNormals(ImplicitLocOpBuilder &builder, const VectorShape shape,
+                       Value value, float lowerBound, float upperBound) {
   assert(!std::isnan(lowerBound));
   assert(!std::isnan(upperBound));
 
@@ -1131,7 +1142,7 @@ ExpM1Approximation::matchAndRewrite(math::ExpM1Op op,
   if (!getElementTypeOrSelf(op.getOperand()).isF32())
     return rewriter.notifyMatchFailure(op, "unsupported operand type");
 
-  ArrayRef<int64_t> shape = vectorShape(op.getOperand());
+  VectorShape shape = vectorShape(op.getOperand());
 
   ImplicitLocOpBuilder builder(op->getLoc(), rewriter);
   auto bcast = [&](Value value) -> Value {
@@ -1201,7 +1212,7 @@ LogicalResult SinAndCosApproximation<isSine, OpTy>::matchAndRewrite(
   if (!getElementTypeOrSelf(op.getOperand()).isF32())
     return rewriter.notifyMatchFailure(op, "unsupported operand type");
 
-  ArrayRef<int64_t> shape = vectorShape(op.getOperand());
+  VectorShape shape = vectorShape(op.getOperand());
 
   ImplicitLocOpBuilder builder(op->getLoc(), rewriter);
   auto bcast = [&](Value value) -> Value {
@@ -1328,7 +1339,7 @@ CbrtApproximation::matchAndRewrite(math::CbrtOp op,
     return rewriter.notifyMatchFailure(op, "unsupported operand type");
 
   ImplicitLocOpBuilder b(op->getLoc(), rewriter);
-  ArrayRef<int64_t> shape = vectorShape(operand);
+  VectorShape shape = vectorShape(operand);
 
   Type floatTy = getElementTypeOrSelf(operand.getType());
   Type intTy = b.getIntegerType(floatTy.getIntOrFloatBitWidth());
@@ -1417,10 +1428,10 @@ RsqrtApproximation::matchAndRewrite(math::RsqrtOp op,
   if (!getElementTypeOrSelf(op.getOperand()).isF32())
     return rewriter.notifyMatchFailure(op, "unsupported operand type");
 
-  ArrayRef<int64_t> shape = vectorShape(op.getOperand());
+  VectorShape shape = vectorShape(op.getOperand());
 
   // Only support already-vectorized rsqrt's.
-  if (shape.empty() || shape.back() % 8 != 0)
+  if (shape.empty() || shape.sizes.back() % 8 != 0)
     return rewriter.notifyMatchFailure(op, "unsupported operand type");
 
   ImplicitLocOpBuilder builder(op->getLoc(), rewriter);
diff --git a/mlir/lib/Dialect/MemRef/IR/MemRefDialect.cpp b/mlir/lib/Dialect/MemRef/IR/MemRefDialect.cpp
index d71669a274b8fc..41082a85a485f2 100644
--- a/mlir/lib/Dialect/MemRef/IR/MemRefDialect.cpp
+++ b/mlir/lib/Dialect/MemRef/IR/MemRefDialect.cpp
@@ -8,8 +8,13 @@
 
 #include "mlir/Conversion/ConvertToLLVM/ToLLVMInterface.h"
 #include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/Bufferization/IR/AllocationOpInterface.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
+#include "mlir/IR/BuiltinTypes.h"
+#include "mlir/Interfaces/MemorySlotInterfaces.h"
+#include "mlir/Interfaces/RuntimeVerifiableOpInterface.h"
 #include "mlir/Interfaces/SideEffectInterfaces.h"
+#include "mlir/Interfaces/ValueBoundsOpInterface.h"
 #include "mlir/Transforms/InliningUtils.h"
 #include <optional>
 
@@ -43,6 +48,13 @@ void mlir::memref::MemRefDialect::initialize() {
       >();
   addInterfaces<MemRefInlinerInterface>();
   declarePromisedInterface<MemRefDialect, ConvertToLLVMPatternInterface>();
+  declarePromisedInterfaces<bufferization::AllocationOpInterface, AllocOp,
+                            AllocaOp, ReallocOp>();
+  declarePromisedInterfaces<RuntimeVerifiableOpInterface, CastOp, ExpandShapeOp,
+                            LoadOp, ReinterpretCastOp, StoreOp, SubViewOp>();
+  declarePromisedInterfaces<ValueBoundsOpInterface, AllocOp, AllocaOp, CastOp,
+                            DimOp, GetGlobalOp, RankOp, SubViewOp>();
+  declarePromisedInterface<MemRefType, DestructurableTypeInterface>();
 }
 
 /// Finds the unique dealloc operation (if one exists) for `allocValue`.
diff --git a/mlir/lib/Dialect/SCF/IR/SCF.cpp b/mlir/lib/Dialect/SCF/IR/SCF.cpp
index 233e702dbb2292..ddb9676eb4f628 100644
--- a/mlir/lib/Dialect/SCF/IR/SCF.cpp
+++ b/mlir/lib/Dialect/SCF/IR/SCF.cpp
@@ -9,6 +9,8 @@
 #include "mlir/Dialect/SCF/IR/SCF.h"
 #include "mlir/Dialect/Arith/IR/Arith.h"
 #include "mlir/Dialect/Arith/Utils/Utils.h"
+#include "mlir/Dialect/Bufferization/IR/BufferDeallocationOpInterface.h"
+#include "mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h"
 #include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
 #include "mlir/Dialect/SCF/IR/DeviceMappingInterface.h"
@@ -18,6 +20,7 @@
 #include "mlir/IR/Matchers.h"
 #include "mlir/IR/PatternMatch.h"
 #include "mlir/Interfaces/FunctionInterfaces.h"
+#include "mlir/Interfaces/ValueBoundsOpInterface.h"
 #include "mlir/Support/MathExtras.h"
 #include "mlir/Transforms/InliningUtils.h"
 #include "llvm/ADT/MapVector.h"
@@ -71,6 +74,12 @@ void SCFDialect::initialize() {
 #include "mlir/Dialect/SCF/IR/SCFOps.cpp.inc"
       >();
   addInterfaces<SCFInlinerInterface>();
+  declarePromisedInterfaces<bufferization::BufferDeallocationOpInterface,
+                            InParallelOp, ReduceReturnOp>();
+  declarePromisedInterfaces<bufferization::BufferizableOpInterface, ConditionOp,
+                            ExecuteRegionOp, ForOp, IfOp, IndexSwitchOp,
+                            ForallOp, InParallelOp, WhileOp, YieldOp>();
+  declarePromisedInterface<ForOp, ValueBoundsOpInterface>();
 }
 
 /// Default callback for IfOp builders. Inserts a yield without arguments.
diff --git a/mlir/lib/Dialect/SPIRV/IR/CMakeLists.txt b/mlir/lib/Dialect/SPIRV/IR/CMakeLists.txt
index 2b5cedafae1e85..b185264211474f 100644
--- a/mlir/lib/Dialect/SPIRV/IR/CMakeLists.txt
+++ b/mlir/lib/Dialect/SPIRV/IR/CMakeLists.txt
@@ -27,6 +27,7 @@ add_mlir_dialect_library(MLIRSPIRVDialect
   ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/SPIRV
 
   DEPENDS
+  MLIRGPUDialect
   MLIRSPIRVAttributeIncGen
   MLIRSPIRVAttrUtilsGen
   MLIRSPIRVAvailabilityIncGen
diff --git a/mlir/lib/Dialect/SPIRV/IR/SPIRVCanonicalization.cpp b/mlir/lib/Dialect/SPIRV/IR/SPIRVCanonicalization.cpp
index 4c62289a1e9458..ff4bace9a4d882 100644
--- a/mlir/lib/Dialect/SPIRV/IR/SPIRVCanonicalization.cpp
+++ b/mlir/lib/Dialect/SPIRV/IR/SPIRVCanonicalization.cpp
@@ -797,6 +797,49 @@ OpFoldResult spirv::LogicalOrOp::fold(FoldAdaptor adaptor) {
   return Attribute();
 }
 
+//===----------------------------------------------------------------------===//
+// spirv.SelectOp
+//===----------------------------------------------------------------------===//
+
+OpFoldResult spirv::SelectOp::fold(FoldAdaptor adaptor) {
+  // spirv.Select _ x x -> x
+  Value trueVals = getTrueValue();
+  Value falseVals = getFalseValue();
+  if (trueVals == falseVals)
+    return trueVals;
+
+  ArrayRef<Attribute> operands = adaptor.getOperands();
+
+  // spirv.Select true  x y -> x
+  // spirv.Select false x y -> y
+  if (auto boolAttr = getScalarOrSplatBoolAttr(operands[0]))
+    return *boolAttr ? trueVals : falseVals;
+
+  // Check that all the operands are constant
+  if (!operands[0] || !operands[1] || !operands[2])
+    return Attribute();
+
+  // Note: getScalarOrSplatBoolAttr will always return a boolAttr if we are in
+  // the scalar case. Hence, we are only required to consider the case of
+  // DenseElementsAttr in foldSelectOp.
+  auto condAttrs = dyn_cast<DenseElementsAttr>(operands[0]);
+  auto trueAttrs = dyn_cast<DenseElementsAttr>(operands[1]);
+  auto falseAttrs = dyn_cast<DenseElementsAttr>(operands[2]);
+  if (!condAttrs || !trueAttrs || !falseAttrs)
+    return Attribute();
+
+  auto elementResults = llvm::to_vector<4>(trueAttrs.getValues<Attribute>());
+  auto iters = llvm::zip_equal(elementResults, condAttrs.getValues<BoolAttr>(),
+                               falseAttrs.getValues<Attribute>());
+  for (auto [result, cond, falseRes] : iters) {
+    if (!cond.getValue())
+      result = falseRes;
+  }
+
+  auto resultType = trueAttrs.getType();
+  return DenseElementsAttr::get(cast<ShapedType>(resultType), elementResults);
+}
+
 //===----------------------------------------------------------------------===//
 // spirv.IEqualOp
 //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Dialect/Shape/IR/Shape.cpp b/mlir/lib/Dialect/Shape/IR/Shape.cpp
index d9ee39a4e8dd32..f5a3717f815de5 100644
--- a/mlir/lib/Dialect/Shape/IR/Shape.cpp
+++ b/mlir/lib/Dialect/Shape/IR/Shape.cpp
@@ -11,6 +11,7 @@
 #include "mlir/Dialect/Shape/IR/Shape.h"
 
 #include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h"
 #include "mlir/Dialect/CommonFolders.h"
 #include "mlir/Dialect/Tensor/IR/Tensor.h"
 #include "mlir/Dialect/Traits.h"
@@ -143,6 +144,8 @@ void ShapeDialect::initialize() {
   // still evolving it makes it simple to start with an unregistered ops and
   // try different variants before actually defining the op.
   allowUnknownOperations();
+  declarePromisedInterfaces<bufferization::BufferizableOpInterface, AssumingOp,
+                            AssumingYieldOp>();
 }
 
 Operation *ShapeDialect::materializeConstant(OpBuilder &builder,
diff --git a/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp b/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp
index 7750efdd9add0f..6da51bb6b9cacf 100644
--- a/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp
+++ b/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp
@@ -16,6 +16,7 @@
 #include "mlir/Dialect/SparseTensor/IR/SparseTensorType.h"
 
 #include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h"
 #include "mlir/Dialect/Utils/StaticValueUtils.h"
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/DialectImplementation.h"
@@ -1956,6 +1957,10 @@ void SparseTensorDialect::initialize() {
 #define GET_OP_LIST
 #include "mlir/Dialect/SparseTensor/IR/SparseTensorOps.cpp.inc"
       >();
+  declarePromisedInterfaces<
+      bufferization::BufferizableOpInterface, ConcatenateOp, ConvertOp, LoadOp,
+      NewOp, NumberOfEntriesOp, AssembleOp, DisassembleOp,
+      ToCoordinatesBufferOp, ToCoordinatesOp, ToPositionsOp, ToValuesOp>();
 }
 
 #define GET_OP_CLASSES
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/BufferizableOpInterfaceImpl.cpp
index a942a721e218fe..7734d1d258453c 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/BufferizableOpInterfaceImpl.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/BufferizableOpInterfaceImpl.cpp
@@ -187,6 +187,35 @@ struct DisassembleOpInterface
   }
 };
 
+struct ForeachOpInterface : public SparseBufferizableOpInterfaceExternalModel<
+                                ForeachOpInterface, sparse_tensor::ForeachOp> {
+  bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand,
+                              const AnalysisState &state) const {
+    return true;
+  }
+
+  bool bufferizesToMemoryWrite(Operation *op, OpOperand &opOperand,
+                               const AnalysisState &state) const {
+    return false;
+  }
+
+  AliasingValueList getAliasingValues(Operation *op, OpOperand &opOperand,
+                                      const AnalysisState &state) const {
+    return {};
+  }
+
+  LogicalResult verifyAnalysis(Operation *op,
+                               const AnalysisState &state) const {
+    // A more complex analysis (similar to scf.for) is needed if the op returns
+    // a tensor. That tensor would have to be bufferized (not implemented yet).
+    for (OpResult result : op->getResults()) {
+      if (isa<TensorType>(result.getType()))
+        return op->emitOpError("tensor results are not supported yet");
+    }
+    return success();
+  }
+};
+
 struct NumberOfEntriesOpInterface
     : public SparseBufferizableOpInterfaceExternalModel<
           NumberOfEntriesOpInterface, sparse_tensor::NumberOfEntriesOp> {
@@ -307,6 +336,7 @@ void mlir::sparse_tensor::registerBufferizableOpInterfaceExternalModels(
         NumberOfEntriesOpInterface>(*ctx);
     sparse_tensor::AssembleOp::attachInterface<AssembleOpInterface>(*ctx);
     sparse_tensor::DisassembleOp::attachInterface<DisassembleOpInterface>(*ctx);
+    sparse_tensor::ForeachOp::attachInterface<ForeachOpInterface>(*ctx);
     sparse_tensor::ToCoordinatesBufferOp::attachInterface<
         ToCoordinatesBufferOpInterface>(*ctx);
     sparse_tensor::ToCoordinatesOp::attachInterface<ToCoordinatesOpInterface>(
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseReinterpretMap.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseReinterpretMap.cpp
index f93b59de29e57b..14ea07f0b54b82 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseReinterpretMap.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseReinterpretMap.cpp
@@ -573,6 +573,12 @@ struct GenericOpScheduler : public OpRewritePattern<linalg::GenericOp> {
       rewriter.modifyOpInPlace(linalgOp, [&]() {
         linalgOp->setOperand(t->getOperandNumber(), dst);
       });
+
+      // Release the transposed form afterwards.
+      // TODO: CSE when used in more than one following op?
+      rewriter.setInsertionPointAfter(linalgOp);
+      rewriter.create<bufferization::DeallocTensorOp>(dst.getLoc(), dst);
+
       return success();
     }
     // Cannot be resolved with a single conversion.
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorCodegen.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorCodegen.cpp
index 7ff2fc25328a6c..5679f277e14866 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorCodegen.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorCodegen.cpp
@@ -1561,6 +1561,19 @@ struct SparseNewConverter : public OpConversionPattern<NewOp> {
   }
 };
 
+struct SparseHasRuntimeLibraryConverter
+    : public OpConversionPattern<HasRuntimeLibraryOp> {
+  using OpConversionPattern::OpConversionPattern;
+  LogicalResult
+  matchAndRewrite(HasRuntimeLibraryOp op, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    auto i1Type = rewriter.getI1Type();
+    rewriter.replaceOpWithNewOp<arith::ConstantOp>(
+        op, i1Type, rewriter.getIntegerAttr(i1Type, 0));
+    return success();
+  }
+};
+
 } // namespace
 
 //===----------------------------------------------------------------------===//
@@ -1572,21 +1585,21 @@ struct SparseNewConverter : public OpConversionPattern<NewOp> {
 void mlir::populateSparseTensorCodegenPatterns(
     TypeConverter &typeConverter, RewritePatternSet &patterns,
     bool createSparseDeallocs, bool enableBufferInitialization) {
-  patterns.add<SparseAssembleOpConverter, SparseDisassembleOpConverter,
-               SparseReturnConverter, SparseCallConverter, SparseLvlOpConverter,
-               SparseCastConverter, SparseExtractSliceConverter,
-               SparseTensorLoadConverter, SparseExpandConverter,
-               SparseCompressConverter, SparseInsertConverter,
-               SparseReorderCOOConverter, SparseReMapConverter,
-               SparseSliceGetterOpConverter<ToSliceOffsetOp,
-                                            StorageSpecifierKind::DimOffset>,
-               SparseSliceGetterOpConverter<ToSliceStrideOp,
-                                            StorageSpecifierKind::DimStride>,
-               SparseToPositionsConverter, SparseToCoordinatesConverter,
-               SparseToCoordinatesBufferConverter, SparseToValuesConverter,
-               SparseConvertConverter, SparseNewConverter,
-               SparseNumberOfEntriesConverter>(typeConverter,
-                                               patterns.getContext());
+  patterns.add<
+      SparseAssembleOpConverter, SparseDisassembleOpConverter,
+      SparseReturnConverter, SparseCallConverter, SparseLvlOpConverter,
+      SparseCastConverter, SparseExtractSliceConverter,
+      SparseTensorLoadConverter, SparseExpandConverter, SparseCompressConverter,
+      SparseInsertConverter, SparseReorderCOOConverter, SparseReMapConverter,
+      SparseSliceGetterOpConverter<ToSliceOffsetOp,
+                                   StorageSpecifierKind::DimOffset>,
+      SparseSliceGetterOpConverter<ToSliceStrideOp,
+                                   StorageSpecifierKind::DimStride>,
+      SparseToPositionsConverter, SparseToCoordinatesConverter,
+      SparseToCoordinatesBufferConverter, SparseToValuesConverter,
+      SparseConvertConverter, SparseNewConverter,
+      SparseNumberOfEntriesConverter, SparseHasRuntimeLibraryConverter>(
+      typeConverter, patterns.getContext());
   patterns.add<SparseTensorDeallocConverter>(
       typeConverter, patterns.getContext(), createSparseDeallocs);
   patterns.add<SparseTensorAllocConverter, SparseTensorEmptyConverter>(
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorConversion.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorConversion.cpp
index 0937c10f257283..92c98b34af6027 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorConversion.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorConversion.cpp
@@ -840,6 +840,19 @@ class SparseTensorDisassembleConverter
   }
 };
 
+struct SparseHasRuntimeLibraryConverter
+    : public OpConversionPattern<HasRuntimeLibraryOp> {
+  using OpConversionPattern::OpConversionPattern;
+  LogicalResult
+  matchAndRewrite(HasRuntimeLibraryOp op, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    auto i1Type = rewriter.getI1Type();
+    rewriter.replaceOpWithNewOp<arith::ConstantOp>(
+        op, i1Type, rewriter.getIntegerAttr(i1Type, 1));
+    return success();
+  }
+};
+
 } // namespace
 
 //===----------------------------------------------------------------------===//
@@ -868,6 +881,7 @@ void mlir::populateSparseTensorConversionPatterns(TypeConverter &typeConverter,
            SparseTensorToValuesConverter, SparseNumberOfEntriesConverter,
            SparseTensorLoadConverter, SparseTensorInsertConverter,
            SparseTensorExpandConverter, SparseTensorCompressConverter,
-           SparseTensorAssembleConverter, SparseTensorDisassembleConverter>(
-          typeConverter, patterns.getContext());
+           SparseTensorAssembleConverter, SparseTensorDisassembleConverter,
+           SparseHasRuntimeLibraryConverter>(typeConverter,
+                                             patterns.getContext());
 }
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/StageSparseOperations.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/StageSparseOperations.cpp
index 5b4395cc31a46b..c370d104e09858 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/StageSparseOperations.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/StageSparseOperations.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "mlir/Dialect/Bufferization/IR/Bufferization.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/Dialect/SparseTensor/IR/SparseTensor.h"
 #include "mlir/Dialect/SparseTensor/IR/SparseTensorType.h"
 #include "mlir/Dialect/SparseTensor/Transforms/Passes.h"
@@ -16,6 +17,37 @@ using namespace mlir::sparse_tensor;
 
 namespace {
 
+struct GuardSparseAlloc
+    : public OpRewritePattern<bufferization::AllocTensorOp> {
+  using OpRewritePattern<bufferization::AllocTensorOp>::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(bufferization::AllocTensorOp op,
+                                PatternRewriter &rewriter) const override {
+    // Only rewrite sparse allocations.
+    if (!getSparseTensorEncoding(op.getResult().getType()))
+      return failure();
+
+    // Only rewrite sparse allocations that escape the method
+    // without any chance of a finalizing operation in between.
+    // Here we assume that sparse tensor setup never crosses
+    // method boundaries. The current rewriting only repairs
+    // the most obvious allocate-call/return cases.
+    if (!llvm::all_of(op->getUses(), [](OpOperand &use) {
+          return isa<func::ReturnOp, func::CallOp, func::CallIndirectOp>(
+              use.getOwner());
+        }))
+      return failure();
+
+    // Guard escaping empty sparse tensor allocations with a finalizing
+    // operation that leaves the underlying storage in a proper state
+    // before the tensor escapes across the method boundary.
+    rewriter.setInsertionPointAfter(op);
+    auto load = rewriter.create<LoadOp>(op.getLoc(), op.getResult(), true);
+    rewriter.replaceAllUsesExcept(op, load, load);
+    return success();
+  }
+};
+
 template <typename StageWithSortOp>
 struct StageUnorderedSparseOps : public OpRewritePattern<StageWithSortOp> {
   using OpRewritePattern<StageWithSortOp>::OpRewritePattern;
@@ -37,6 +69,6 @@ struct StageUnorderedSparseOps : public OpRewritePattern<StageWithSortOp> {
 } // namespace
 
 void mlir::populateStageSparseOperationsPatterns(RewritePatternSet &patterns) {
-  patterns.add<StageUnorderedSparseOps<ConvertOp>,
+  patterns.add<GuardSparseAlloc, StageUnorderedSparseOps<ConvertOp>,
                StageUnorderedSparseOps<ConcatenateOp>>(patterns.getContext());
 }
diff --git a/mlir/lib/Dialect/Tensor/IR/TensorDialect.cpp b/mlir/lib/Dialect/Tensor/IR/TensorDialect.cpp
index 62032ff301bece..5ca9510408c301 100644
--- a/mlir/lib/Dialect/Tensor/IR/TensorDialect.cpp
+++ b/mlir/lib/Dialect/Tensor/IR/TensorDialect.cpp
@@ -8,8 +8,11 @@
 
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h"
 #include "mlir/Dialect/Complex/IR/Complex.h"
 #include "mlir/Dialect/Tensor/IR/Tensor.h"
+#include "mlir/Dialect/Transform/IR/TransformInterfaces.h"
+#include "mlir/Interfaces/SubsetOpInterface.h"
 #include "mlir/Transforms/InliningUtils.h"
 
 using namespace mlir;
@@ -45,4 +48,22 @@ void TensorDialect::initialize() {
 #include "mlir/Dialect/Tensor/IR/TensorOps.cpp.inc"
       >();
   addInterfaces<TensorInlinerInterface>();
+  declarePromisedInterfaces<
+      bufferization::BufferizableOpInterface, CastOp, CollapseShapeOp, DimOp,
+      EmptyOp, ExpandShapeOp, ExtractSliceOp, ExtractOp, FromElementsOp,
+      GenerateOp, InsertOp, InsertSliceOp, PadOp, ParallelInsertSliceOp, RankOp,
+      ReshapeOp, SplatOp>();
+  declarePromisedInterfaces<transform::FindPayloadReplacementOpInterface,
+                            CollapseShapeOp, ExpandShapeOp, ExtractSliceOp,
+                            InsertSliceOp, ReshapeOp>();
+  declarePromisedInterfaces<ReifyRankedShapedTypeOpInterface, ExpandShapeOp,
+                            CollapseShapeOp, PadOp>();
+  declarePromisedInterfaces<SubsetOpInterface, ExtractSliceOp, InsertSliceOp,
+                            ParallelInsertSliceOp>();
+  declarePromisedInterfaces<SubsetInsertionOpInterface, InsertSliceOp,
+                            ParallelInsertSliceOp>();
+  declarePromisedInterface<ExtractSliceOp, SubsetExtractionOpInterface>();
+  declarePromisedInterfaces<TilingInterface, PadOp, PackOp, UnPackOp>();
+  declarePromisedInterfaces<ValueBoundsOpInterface, CastOp, DimOp, EmptyOp,
+                            ExtractSliceOp, PadOp, RankOp>();
 }
diff --git a/mlir/lib/Dialect/Tosa/CMakeLists.txt b/mlir/lib/Dialect/Tosa/CMakeLists.txt
index ba5343dcd7ac6c..1911405c63cd58 100644
--- a/mlir/lib/Dialect/Tosa/CMakeLists.txt
+++ b/mlir/lib/Dialect/Tosa/CMakeLists.txt
@@ -12,6 +12,7 @@ add_mlir_dialect_library(MLIRTosaDialect
   MLIRTosaDialectBytecodeIncGen
   MLIRTosaOpsIncGen
   MLIRTosaInterfacesIncGen
+  MLIRShardingInterfaceIncGen
 
   LINK_LIBS PUBLIC
   MLIRIR
diff --git a/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp b/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp
index e29dd9abe72b16..59458db23bb4ca 100644
--- a/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp
+++ b/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp
@@ -13,6 +13,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "mlir/Dialect/Tosa/IR/TosaOps.h"
+#include "mlir/Dialect/Mesh/Interfaces/ShardingInterface.h"
 #include "mlir/Dialect/Quant/QuantOps.h"
 #include "mlir/Dialect/Tensor/IR/Tensor.h"
 #include "mlir/Dialect/Tosa/Utils/QuantUtils.h"
@@ -136,6 +137,14 @@ void TosaDialect::initialize() {
 #include "mlir/Dialect/Tosa/IR/TosaAttributes.cpp.inc"
       >();
   addInterfaces<TosaDialectBytecodeInterface, TosaInlinerInterface>();
+  declarePromisedInterfaces<
+      mesh::ShardingInterface, ClampOp, SigmoidOp, TanhOp, AddOp,
+      ArithmeticRightShiftOp, BitwiseAndOp, BitwiseOrOp, BitwiseXorOp, DivOp,
+      LogicalAndOp, LogicalLeftShiftOp, LogicalRightShiftOp, LogicalOrOp,
+      LogicalXorOp, MaximumOp, MinimumOp, MulOp, PowOp, SubOp, AbsOp,
+      BitwiseNotOp, CeilOp, ClzOp, ExpOp, FloorOp, LogOp, LogicalNotOp,
+      NegateOp, ReciprocalOp, RsqrtOp, SelectOp, EqualOp, GreaterOp,
+      GreaterEqualOp, MatMulOp>();
 }
 
 Operation *TosaDialect::materializeConstant(OpBuilder &builder, Attribute value,
diff --git a/mlir/lib/Dialect/Transform/IR/TransformInterfaces.cpp b/mlir/lib/Dialect/Transform/IR/TransformInterfaces.cpp
index 71a9d61198e3fb..fe2eea535ffdcf 100644
--- a/mlir/lib/Dialect/Transform/IR/TransformInterfaces.cpp
+++ b/mlir/lib/Dialect/Transform/IR/TransformInterfaces.cpp
@@ -1278,14 +1278,11 @@ void transform::TrackingListener::notifyMatchFailure(
 }
 
 void transform::TrackingListener::notifyOperationErased(Operation *op) {
-  // TODO: Walk can be removed when D144193 has landed.
-  op->walk([&](Operation *op) {
-    // Remove mappings for result values.
-    for (OpResult value : op->getResults())
-      (void)replacePayloadValue(value, nullptr);
-    // Remove mapping for op.
-    (void)replacePayloadOp(op, nullptr);
-  });
+  // Remove mappings for result values.
+  for (OpResult value : op->getResults())
+    (void)replacePayloadValue(value, nullptr);
+  // Remove mapping for op.
+  (void)replacePayloadOp(op, nullptr);
 }
 
 void transform::TrackingListener::notifyOperationReplaced(
diff --git a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp
index 75f6220ad8f3fa..35296824246eb6 100644
--- a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp
+++ b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp
@@ -16,6 +16,7 @@
 #include "mlir/Dialect/Affine/IR/ValueBoundsOpInterfaceImpl.h"
 #include "mlir/Dialect/Arith/IR/Arith.h"
 #include "mlir/Dialect/Arith/Utils/Utils.h"
+#include "mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
 #include "mlir/Dialect/Tensor/IR/Tensor.h"
 #include "mlir/Dialect/Utils/IndexingUtils.h"
@@ -31,6 +32,7 @@
 #include "mlir/IR/OpImplementation.h"
 #include "mlir/IR/PatternMatch.h"
 #include "mlir/IR/TypeUtilities.h"
+#include "mlir/Interfaces/SubsetOpInterface.h"
 #include "mlir/Interfaces/ValueBoundsOpInterface.h"
 #include "mlir/Support/LLVM.h"
 #include "mlir/Transforms/InliningUtils.h"
@@ -374,6 +376,14 @@ void VectorDialect::initialize() {
       >();
 
   addInterfaces<VectorInlinerInterface>();
+
+  declarePromisedInterfaces<bufferization::BufferizableOpInterface,
+                            TransferReadOp, TransferWriteOp, GatherOp, MaskOp,
+                            YieldOp>();
+  declarePromisedInterfaces<SubsetOpInterface, TransferReadOp,
+                            TransferWriteOp>();
+  declarePromisedInterface<TransferReadOp, SubsetExtractionOpInterface>();
+  declarePromisedInterface<TransferWriteOp, SubsetInsertionOpInterface>();
 }
 
 /// Materialize a single constant operation from a given attribute value with
@@ -6054,7 +6064,7 @@ LogicalResult MaskOp::fold(FoldAdaptor adaptor,
   maskableOp->dropAllUses();
   maskableOp->moveBefore(getOperation());
 
-  results.push_back(maskableOp->getResult(0));
+  llvm::append_range(results, maskableOp->getResults());
   return success();
 }
 
diff --git a/mlir/lib/Dialect/Vector/Utils/VectorUtils.cpp b/mlir/lib/Dialect/Vector/Utils/VectorUtils.cpp
index d613672608c3ad..63ed0947cf6ce2 100644
--- a/mlir/lib/Dialect/Vector/Utils/VectorUtils.cpp
+++ b/mlir/lib/Dialect/Vector/Utils/VectorUtils.cpp
@@ -300,3 +300,20 @@ vector::createUnrollIterator(VectorType vType, int64_t targetRank) {
   shapeToUnroll = shapeToUnroll.slice(0, firstScalableDim);
   return StaticTileOffsetRange(shapeToUnroll, /*unrollStep=*/1);
 }
+
+SmallVector<OpFoldResult> vector::getMixedSizesXfer(bool hasTensorSemantics,
+                                                    Operation *xfer,
+                                                    RewriterBase &rewriter) {
+  auto loc = xfer->getLoc();
+
+  Value base = TypeSwitch<Operation *, Value>(xfer)
+                   .Case<vector::TransferReadOp>(
+                       [&](auto readOp) { return readOp.getSource(); })
+                   .Case<vector::TransferWriteOp>(
+                       [&](auto writeOp) { return writeOp.getOperand(1); });
+
+  SmallVector<OpFoldResult> mixedSourceDims =
+      hasTensorSemantics ? tensor::getMixedSizes(rewriter, loc, base)
+                         : memref::getMixedSizes(rewriter, loc, base);
+  return mixedSourceDims;
+}
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index 0e89ac4df6ef28..b356c397fb8369 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -12,6 +12,7 @@
 
 namespace mlir {
 namespace xegpu {
+
 // this file is for position occupation,
 // we will add functions in following PRs.
 
diff --git a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
index b9a3429e37b885..09dc30365e37c0 100644
--- a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
+++ b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
@@ -28,6 +28,7 @@
 #endif // MLIR_ENABLE_CUDA_CUSPARSE
 
 #ifdef _WIN32
+#include <malloc.h>
 #define MLIR_CUDA_WRAPPERS_EXPORT __declspec(dllexport)
 #else
 #define MLIR_CUDA_WRAPPERS_EXPORT __attribute__((visibility("default")))
@@ -287,7 +288,11 @@ extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
 mgpuMemHostRegisterMemRef(int64_t rank, StridedMemRefType<char, 1> *descriptor,
                           int64_t elementSizeBytes) {
   // Only densely packed tensors are currently supported.
+#ifdef _WIN32
+  int64_t *denseStrides = (int64_t *)_alloca(rank * sizeof(int64_t));
+#else
   int64_t *denseStrides = (int64_t *)alloca(rank * sizeof(int64_t));
+#endif // _WIN32
   int64_t *sizes = descriptor->sizes;
   for (int64_t i = rank - 1, runningStride = 1; i >= 0; i--) {
     denseStrides[i] = runningStride;
@@ -423,24 +428,27 @@ extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuTensorMapEncodeTiled(
               elementStrides[4], interleave, swizzle, l2Promotion, oobFill);
 }
 
-namespace {
-
-template <int rank>
-void mgpuGetMemRefDataAndShape(void *raw_descriptor, char **addr,
-                               uint64_t *globalDim) {
+template <int Rank>
+void mgpuGetMemRefDataAndShape(void *rawDescriptor, char **addr,
+                               uint64_t *globalDim, uint64_t *globalStrides,
+                               const CUtensorMapDataType tensorDataType) {
   auto descriptor =
-      reinterpret_cast<StridedMemRefType<char, rank> *>(raw_descriptor);
+      reinterpret_cast<StridedMemRefType<char, Rank> *>(rawDescriptor);
   *addr = descriptor->data;
-  for (int i = 0; i < rank; ++i) {
-    globalDim[i] = static_cast<uint64_t>(descriptor->sizes[rank - i - 1]);
+  for (int i = 0; i < Rank; ++i) {
+    globalDim[i] = static_cast<uint64_t>(descriptor->sizes[Rank - i - 1]);
+  }
+  static constexpr int elementSizeInBytes[] = {1, 2, 4, 4, 8, 8, 2,
+                                               4, 8, 2, 4, 4, 4};
+  for (int i = 0; i < Rank - 1; ++i) {
+    globalStrides[i] = static_cast<uint64_t>(
+        descriptor->strides[Rank - i - 2] * elementSizeInBytes[tensorDataType]);
   }
 }
 
-} // namespace
-
 extern "C" MLIR_CUDA_WRAPPERS_EXPORT void *mgpuTensorMapEncodeTiledMemref(
     int64_t tensorRank,                       // Dimensionality of tensor
-    void *ranked_descriptor,                  // Ranked MemRef descriptor
+    void *rankedDescriptor,                   // Ranked MemRef descriptor
     const CUtensorMapDataType tensorDataType, // Stride size (in bytes)
     CUtensorMapInterleave interleave,         // Type of interleaved layout
     CUtensorMapSwizzle swizzle,               // Bank swizzling pattern
@@ -457,38 +465,36 @@ extern "C" MLIR_CUDA_WRAPPERS_EXPORT void *mgpuTensorMapEncodeTiledMemref(
   char *globalAddress = nullptr;
   switch (tensorRank) {
   case 1:
-    mgpuGetMemRefDataAndShape<1>(ranked_descriptor, &globalAddress, globalDim);
+    mgpuGetMemRefDataAndShape<1>(rankedDescriptor, &globalAddress, globalDim,
+                                 globalStrides, tensorDataType);
     break;
   case 2:
-    mgpuGetMemRefDataAndShape<2>(ranked_descriptor, &globalAddress, globalDim);
+    mgpuGetMemRefDataAndShape<2>(rankedDescriptor, &globalAddress, globalDim,
+                                 globalStrides, tensorDataType);
     break;
   case 3:
-    mgpuGetMemRefDataAndShape<3>(ranked_descriptor, &globalAddress, globalDim);
+    mgpuGetMemRefDataAndShape<3>(rankedDescriptor, &globalAddress, globalDim,
+                                 globalStrides, tensorDataType);
     break;
   case 4:
-    mgpuGetMemRefDataAndShape<4>(ranked_descriptor, &globalAddress, globalDim);
+    mgpuGetMemRefDataAndShape<4>(rankedDescriptor, &globalAddress, globalDim,
+                                 globalStrides, tensorDataType);
     break;
   case 5:
-    mgpuGetMemRefDataAndShape<5>(ranked_descriptor, &globalAddress, globalDim);
+    mgpuGetMemRefDataAndShape<5>(rankedDescriptor, &globalAddress, globalDim,
+                                 globalStrides, tensorDataType);
     break;
   default:
     fprintf(
         stderr,
         "'mgpuTensorMapEncodeTiledMemref' failed with 'rank is too high'\n");
-    return NULL;
+    return nullptr;
   }
 
-  static const int elementSizeInBytes[] = {1, 2, 4, 4, 8, 8, 2,
-                                           4, 8, 2, 4, 4, 4};
   for (int64_t r = 0; r < tensorRank; ++r) {
-    elementStrides[r] = uint32_t(1);
     boxDim[r] = static_cast<uint32_t>(inputBoxDims[tensorRank - r - 1]);
   }
 
-  globalStrides[0] = globalDim[0] * elementSizeInBytes[tensorDataType];
-  for (int r = 1; r < tensorRank - 1; r++)
-    globalStrides[r] = globalStrides[r - 1] * globalDim[r];
-
   ScopedContext scopedContext;
   mgpuTensorMapEncodeTiled(&tensorMap, tensorDataType, tensorRank32,
                            globalAddress, globalDim, globalStrides, boxDim,
diff --git a/mlir/lib/ExecutionEngine/Float16bits.cpp b/mlir/lib/ExecutionEngine/Float16bits.cpp
index 841610e3c161dc..e5b4f18dd644b8 100644
--- a/mlir/lib/ExecutionEngine/Float16bits.cpp
+++ b/mlir/lib/ExecutionEngine/Float16bits.cpp
@@ -165,7 +165,7 @@ bool operator==(const bf16 &f1, const bf16 &f2) { return f1.bits == f2.bits; }
 #endif
 #endif
 
-#if defined(__x86_64__)
+#if defined(__x86_64__) || defined(_M_X64)
 // On x86 bfloat16 is passed in SSE registers. Since both float and __bf16
 // are passed in the same register we can use the wider type and careful casting
 // to conform to x86_64 psABI. This only works with the assumption that we're
diff --git a/mlir/lib/ExecutionEngine/SparseTensorRuntime.cpp b/mlir/lib/ExecutionEngine/SparseTensorRuntime.cpp
index 731abcbbf1f39e..8835056099d234 100644
--- a/mlir/lib/ExecutionEngine/SparseTensorRuntime.cpp
+++ b/mlir/lib/ExecutionEngine/SparseTensorRuntime.cpp
@@ -127,7 +127,7 @@ extern "C" {
     case Action::kPack: {                                                      \
       assert(ptr && "Received nullptr for SparseTensorStorage object");        \
       intptr_t *buffers = static_cast<intptr_t *>(ptr);                        \
-      return SparseTensorStorage<P, C, V>::packFromLvlBuffers(                 \
+      return SparseTensorStorage<P, C, V>::newFromBuffers(                     \
           dimRank, dimSizes, lvlRank, lvlSizes, lvlTypes, dim2lvl, lvl2dim,    \
           dimRank, buffers);                                                   \
     }                                                                          \
diff --git a/mlir/lib/IR/AffineExpr.cpp b/mlir/lib/IR/AffineExpr.cpp
index 56a2d850ad3b67..94562d0f15a24a 100644
--- a/mlir/lib/IR/AffineExpr.cpp
+++ b/mlir/lib/IR/AffineExpr.cpp
@@ -774,7 +774,8 @@ static AffineExpr simplifyMul(AffineExpr lhs, AffineExpr rhs) {
     return getAffineConstantExpr(lhsConst.getValue() * rhsConst.getValue(),
                                  lhs.getContext());
 
-  assert(lhs.isSymbolicOrConstant() || rhs.isSymbolicOrConstant());
+  if (!lhs.isSymbolicOrConstant() && !rhs.isSymbolicOrConstant())
+    return nullptr;
 
   // Canonicalize the mul expression so that the constant/symbolic term is the
   // RHS. If both the lhs and rhs are symbolic, swap them if the lhs is a
diff --git a/mlir/lib/Interfaces/ViewLikeInterface.cpp b/mlir/lib/Interfaces/ViewLikeInterface.cpp
index 7c5905010eb413..6d1ff03756ace9 100644
--- a/mlir/lib/Interfaces/ViewLikeInterface.cpp
+++ b/mlir/lib/Interfaces/ViewLikeInterface.cpp
@@ -27,7 +27,7 @@ LogicalResult mlir::verifyListOfOperandsOrIntegers(Operation *op,
     return op->emitError("expected ") << numElements << " " << name
                                       << " values, got " << staticVals.size();
   unsigned expectedNumDynamicEntries =
-      llvm::count_if(staticVals, [&](int64_t staticVal) {
+      llvm::count_if(staticVals, [](int64_t staticVal) {
         return ShapedType::isDynamic(staticVal);
       });
   if (values.size() != expectedNumDynamicEntries)
diff --git a/mlir/lib/Target/LLVMIR/DebugImporter.cpp b/mlir/lib/Target/LLVMIR/DebugImporter.cpp
index c631617f973544..5ba90bba18b197 100644
--- a/mlir/lib/Target/LLVMIR/DebugImporter.cpp
+++ b/mlir/lib/Target/LLVMIR/DebugImporter.cpp
@@ -13,6 +13,7 @@
 #include "mlir/IR/Location.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/ScopeExit.h"
+#include "llvm/ADT/TypeSwitch.h"
 #include "llvm/BinaryFormat/Dwarf.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DebugInfoMetadata.h"
@@ -51,10 +52,9 @@ DICompileUnitAttr DebugImporter::translateImpl(llvm::DICompileUnit *node) {
   std::optional<DIEmissionKind> emissionKind =
       symbolizeDIEmissionKind(node->getEmissionKind());
   return DICompileUnitAttr::get(
-      context, DistinctAttr::create(UnitAttr::get(context)),
-      node->getSourceLanguage(), translate(node->getFile()),
-      getStringAttrOrNull(node->getRawProducer()), node->isOptimized(),
-      emissionKind.value());
+      context, getOrCreateDistinctID(node), node->getSourceLanguage(),
+      translate(node->getFile()), getStringAttrOrNull(node->getRawProducer()),
+      node->isOptimized(), emissionKind.value());
 }
 
 DICompositeTypeAttr DebugImporter::translateImpl(llvm::DICompositeType *node) {
@@ -64,11 +64,7 @@ DICompositeTypeAttr DebugImporter::translateImpl(llvm::DICompositeType *node) {
     assert(element && "expected a non-null element type");
     elements.push_back(translate(element));
   }
-  // Drop the elements parameter if a cyclic dependency is detected. We
-  // currently cannot model these cycles and thus drop the parameter if
-  // required. A cyclic dependency is detected if one of the element nodes
-  // translates to a nullptr since the node is already on the translation stack.
-  // TODO: Support debug metadata with cyclic dependencies.
+  // Drop the elements parameter if any of the elements are invalid.
   if (llvm::is_contained(elements, nullptr))
     elements.clear();
   DITypeAttr baseType = translate(node->getBaseType());
@@ -77,14 +73,15 @@ DICompositeTypeAttr DebugImporter::translateImpl(llvm::DICompositeType *node) {
   if (node->getTag() == llvm::dwarf::DW_TAG_array_type && !baseType)
     return nullptr;
   return DICompositeTypeAttr::get(
-      context, node->getTag(), getStringAttrOrNull(node->getRawName()),
-      translate(node->getFile()), node->getLine(), translate(node->getScope()),
-      baseType, flags.value_or(DIFlags::Zero), node->getSizeInBits(),
+      context, node->getTag(), /*recId=*/{},
+      getStringAttrOrNull(node->getRawName()), translate(node->getFile()),
+      node->getLine(), translate(node->getScope()), baseType,
+      flags.value_or(DIFlags::Zero), node->getSizeInBits(),
       node->getAlignInBits(), elements);
 }
 
 DIDerivedTypeAttr DebugImporter::translateImpl(llvm::DIDerivedType *node) {
-  // Return nullptr if the base type is a cyclic dependency.
+  // Return nullptr if the base type is invalid.
   DITypeAttr baseType = translate(node->getBaseType());
   if (node->getBaseType() && !baseType)
     return nullptr;
@@ -179,10 +176,10 @@ DISubprogramAttr DebugImporter::translateImpl(llvm::DISubprogram *node) {
   // Only definitions require a distinct identifier.
   mlir::DistinctAttr id;
   if (node->isDistinct())
-    id = DistinctAttr::create(UnitAttr::get(context));
+    id = getOrCreateDistinctID(node);
   std::optional<DISubprogramFlags> subprogramFlags =
       symbolizeDISubprogramFlags(node->getSubprogram()->getSPFlags());
-  // Return nullptr if the scope or type is a cyclic dependency.
+  // Return nullptr if the scope or type is invalid.
   DIScopeAttr scope = translate(node->getScope());
   if (node->getScope() && !scope)
     return nullptr;
@@ -229,7 +226,7 @@ DebugImporter::translateImpl(llvm::DISubroutineType *node) {
     }
     types.push_back(translate(type));
   }
-  // Return nullptr if any of the types is a cyclic dependency.
+  // Return nullptr if any of the types is invalid.
   if (llvm::is_contained(types, nullptr))
     return nullptr;
   return DISubroutineTypeAttr::get(context, node->getCC(), types);
@@ -247,12 +244,42 @@ DINodeAttr DebugImporter::translate(llvm::DINode *node) {
   if (DINodeAttr attr = nodeToAttr.lookup(node))
     return attr;
 
-  // Return nullptr if a cyclic dependency is detected since the same node is
-  // being traversed twice. This check avoids infinite recursion if the debug
-  // metadata contains cycles.
-  if (!translationStack.insert(node))
-    return nullptr;
-  auto guard = llvm::make_scope_exit([&]() { translationStack.pop_back(); });
+  // If the node type is capable of being recursive, check if it's seen before.
+  auto recSelfCtor = getRecSelfConstructor(node);
+  if (recSelfCtor) {
+    // If a cyclic dependency is detected since the same node is being traversed
+    // twice, emit a recursive self type, and mark the duplicate node on the
+    // translationStack so it can emit a recursive decl type.
+    auto [iter, inserted] = translationStack.try_emplace(node, nullptr);
+    if (!inserted) {
+      // The original node may have already been assigned a recursive ID from
+      // a different self-reference. Use that if possible.
+      DistinctAttr recId = iter->second;
+      if (!recId) {
+        recId = DistinctAttr::create(UnitAttr::get(context));
+        iter->second = recId;
+      }
+      unboundRecursiveSelfRefs.back().insert(recId);
+      return cast<DINodeAttr>(recSelfCtor(recId));
+    }
+  }
+
+  unboundRecursiveSelfRefs.emplace_back();
+
+  auto guard = llvm::make_scope_exit([&]() {
+    if (recSelfCtor)
+      translationStack.pop_back();
+
+    // Copy unboundRecursiveSelfRefs down to the previous level.
+    if (unboundRecursiveSelfRefs.size() == 1)
+      assert(unboundRecursiveSelfRefs.back().empty() &&
+             "internal error: unbound recursive self reference at top level.");
+    else
+      unboundRecursiveSelfRefs[unboundRecursiveSelfRefs.size() - 2].insert(
+          unboundRecursiveSelfRefs.back().begin(),
+          unboundRecursiveSelfRefs.back().end());
+    unboundRecursiveSelfRefs.pop_back();
+  });
 
   // Convert the debug metadata if possible.
   auto translateNode = [this](llvm::DINode *node) -> DINodeAttr {
@@ -289,7 +316,19 @@ DINodeAttr DebugImporter::translate(llvm::DINode *node) {
     return nullptr;
   };
   if (DINodeAttr attr = translateNode(node)) {
-    nodeToAttr.insert({node, attr});
+    // If this node was marked as recursive, set its recId.
+    if (auto recType = dyn_cast<DIRecursiveTypeAttrInterface>(attr)) {
+      if (DistinctAttr recId = translationStack.lookup(node)) {
+        attr = cast<DINodeAttr>(recType.withRecId(recId));
+        // Remove the unbound recursive ID from the set of unbound self
+        // references in the translation stack.
+        unboundRecursiveSelfRefs.back().erase(recId);
+      }
+    }
+
+    // Only cache fully self-contained nodes.
+    if (unboundRecursiveSelfRefs.back().empty())
+      nodeToAttr.try_emplace(node, attr);
     return attr;
   }
   return nullptr;
@@ -346,3 +385,20 @@ StringAttr DebugImporter::getStringAttrOrNull(llvm::MDString *stringNode) {
     return StringAttr();
   return StringAttr::get(context, stringNode->getString());
 }
+
+DistinctAttr DebugImporter::getOrCreateDistinctID(llvm::DINode *node) {
+  DistinctAttr &id = nodeToDistinctAttr[node];
+  if (!id)
+    id = DistinctAttr::create(UnitAttr::get(context));
+  return id;
+}
+
+function_ref<DIRecursiveTypeAttrInterface(DistinctAttr)>
+DebugImporter::getRecSelfConstructor(llvm::DINode *node) {
+  using CtorType = function_ref<DIRecursiveTypeAttrInterface(DistinctAttr)>;
+  return TypeSwitch<llvm::DINode *, CtorType>(node)
+      .Case([&](llvm::DICompositeType *concreteNode) {
+        return CtorType(DICompositeTypeAttr::getRecSelf);
+      })
+      .Default(CtorType());
+}
diff --git a/mlir/lib/Target/LLVMIR/DebugImporter.h b/mlir/lib/Target/LLVMIR/DebugImporter.h
index 7d4a371284b68b..bcf628fc4234fb 100644
--- a/mlir/lib/Target/LLVMIR/DebugImporter.h
+++ b/mlir/lib/Target/LLVMIR/DebugImporter.h
@@ -82,12 +82,28 @@ class DebugImporter {
   /// null attribute otherwise.
   StringAttr getStringAttrOrNull(llvm::MDString *stringNode);
 
+  /// Get the DistinctAttr used to represent `node` if one was already created
+  /// for it, or create a new one if not.
+  DistinctAttr getOrCreateDistinctID(llvm::DINode *node);
+
+  /// Get the `getRecSelf` constructor for the translated type of `node` if its
+  /// translated DITypeAttr supports recursion. Otherwise, returns nullptr.
+  function_ref<DIRecursiveTypeAttrInterface(DistinctAttr)>
+  getRecSelfConstructor(llvm::DINode *node);
+
   /// A mapping between LLVM debug metadata and the corresponding attribute.
   DenseMap<llvm::DINode *, DINodeAttr> nodeToAttr;
+  /// A mapping between distinct LLVM debug metadata nodes and the corresponding
+  /// distinct id attribute.
+  DenseMap<llvm::DINode *, DistinctAttr> nodeToDistinctAttr;
 
   /// A stack that stores the metadata nodes that are being traversed. The stack
   /// is used to detect cyclic dependencies during the metadata translation.
-  SetVector<llvm::DINode *> translationStack;
+  /// A node is pushed with a null value. If it is ever seen twice, it is given
+  /// a recursive id attribute, indicating that it is a recursive node.
+  llvm::MapVector<llvm::DINode *, DistinctAttr> translationStack;
+  /// All the unbound recursive self references in the translation stack.
+  SmallVector<DenseSet<DistinctAttr>> unboundRecursiveSelfRefs;
 
   MLIRContext *context;
   ModuleOp mlirModule;
diff --git a/mlir/lib/Target/LLVMIR/DebugTranslation.cpp b/mlir/lib/Target/LLVMIR/DebugTranslation.cpp
index 16918aab549788..126a671a58e808 100644
--- a/mlir/lib/Target/LLVMIR/DebugTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/DebugTranslation.cpp
@@ -116,12 +116,20 @@ static DINodeT *getDistinctOrUnique(bool isDistinct, Ts &&...args) {
   return DINodeT::get(std::forward<Ts>(args)...);
 }
 
+llvm::TempDICompositeType
+DebugTranslation::translateTemporaryImpl(DICompositeTypeAttr attr) {
+  return llvm::DICompositeType::getTemporary(
+      llvmCtx, attr.getTag(), getMDStringOrNull(attr.getName()), nullptr,
+      attr.getLine(), nullptr, nullptr, attr.getSizeInBits(),
+      attr.getAlignInBits(),
+      /*OffsetInBits=*/0,
+      /*Flags=*/static_cast<llvm::DINode::DIFlags>(attr.getFlags()),
+      /*Elements=*/nullptr, /*RuntimeLang=*/0,
+      /*VTableHolder=*/nullptr);
+}
+
 llvm::DICompositeType *
 DebugTranslation::translateImpl(DICompositeTypeAttr attr) {
-  SmallVector<llvm::Metadata *> elements;
-  for (auto member : attr.getElements())
-    elements.push_back(translate(member));
-
   // TODO: Use distinct attributes to model this, once they have landed.
   // Depending on the tag, composite types must be distinct.
   bool isDistinct = false;
@@ -133,6 +141,10 @@ DebugTranslation::translateImpl(DICompositeTypeAttr attr) {
     isDistinct = true;
   }
 
+  SmallVector<llvm::Metadata *> elements;
+  for (DINodeAttr member : attr.getElements())
+    elements.push_back(translate(member));
+
   return getDistinctOrUnique<llvm::DICompositeType>(
       isDistinct, llvmCtx, attr.getTag(), getMDStringOrNull(attr.getName()),
       translate(attr.getFile()), attr.getLine(), translate(attr.getScope()),
@@ -150,7 +162,8 @@ llvm::DIDerivedType *DebugTranslation::translateImpl(DIDerivedTypeAttr attr) {
       /*File=*/nullptr, /*Line=*/0,
       /*Scope=*/nullptr, translate(attr.getBaseType()), attr.getSizeInBits(),
       attr.getAlignInBits(), attr.getOffsetInBits(),
-      /*DWARFAddressSpace=*/std::nullopt, /*Flags=*/llvm::DINode::FlagZero);
+      /*DWARFAddressSpace=*/std::nullopt, /*PtrAuthData=*/std::nullopt,
+      /*Flags=*/llvm::DINode::FlagZero);
 }
 
 llvm::DIFile *DebugTranslation::translateImpl(DIFileAttr attr) {
@@ -200,22 +213,76 @@ DebugTranslation::translateImpl(DIGlobalVariableAttr attr) {
       attr.getIsDefined(), nullptr, nullptr, attr.getAlignInBits(), nullptr);
 }
 
+llvm::DIType *
+DebugTranslation::translateRecursive(DIRecursiveTypeAttrInterface attr) {
+  DistinctAttr recursiveId = attr.getRecId();
+  if (attr.isRecSelf()) {
+    auto *iter = recursiveTypeMap.find(recursiveId);
+    assert(iter != recursiveTypeMap.end() && "unbound DI recursive self type");
+    return iter->second;
+  }
+
+  auto setRecursivePlaceholder = [&](llvm::DIType *placeholder) {
+    [[maybe_unused]] auto [iter, inserted] =
+        recursiveTypeMap.try_emplace(recursiveId, placeholder);
+    (void)iter;
+    (void)inserted;
+    assert(inserted && "illegal reuse of recursive id");
+  };
+
+  llvm::DIType *result =
+      TypeSwitch<DIRecursiveTypeAttrInterface, llvm::DIType *>(attr)
+          .Case<DICompositeTypeAttr>([&](auto attr) {
+            auto temporary = translateTemporaryImpl(attr);
+            setRecursivePlaceholder(temporary.get());
+            // Must call `translateImpl` directly instead of `translate` to
+            // avoid handling the recursive interface again.
+            auto *concrete = translateImpl(attr);
+            temporary->replaceAllUsesWith(concrete);
+            return concrete;
+          });
+
+  assert(recursiveTypeMap.back().first == recursiveId &&
+         "internal inconsistency: unexpected recursive translation stack");
+  recursiveTypeMap.pop_back();
+
+  return result;
+}
+
 llvm::DIScope *DebugTranslation::translateImpl(DIScopeAttr attr) {
   return cast<llvm::DIScope>(translate(DINodeAttr(attr)));
 }
 
 llvm::DISubprogram *DebugTranslation::translateImpl(DISubprogramAttr attr) {
+  if (auto iter = distinctAttrToNode.find(attr.getId());
+      iter != distinctAttrToNode.end())
+    return cast<llvm::DISubprogram>(iter->second);
+
+  llvm::DIScope *scope = translate(attr.getScope());
+  llvm::DIFile *file = translate(attr.getFile());
+  llvm::DIType *type = translate(attr.getType());
+  llvm::DICompileUnit *compileUnit = translate(attr.getCompileUnit());
+
+  // Check again after recursive calls in case this distinct node recurses back
+  // to itself.
+  if (auto iter = distinctAttrToNode.find(attr.getId());
+      iter != distinctAttrToNode.end())
+    return cast<llvm::DISubprogram>(iter->second);
+
   bool isDefinition = static_cast<bool>(attr.getSubprogramFlags() &
                                         LLVM::DISubprogramFlags::Definition);
-  return getDistinctOrUnique<llvm::DISubprogram>(
-      isDefinition, llvmCtx, translate(attr.getScope()),
-      getMDStringOrNull(attr.getName()),
-      getMDStringOrNull(attr.getLinkageName()), translate(attr.getFile()),
-      attr.getLine(), translate(attr.getType()), attr.getScopeLine(),
+  llvm::DISubprogram *node = getDistinctOrUnique<llvm::DISubprogram>(
+      isDefinition, llvmCtx, scope, getMDStringOrNull(attr.getName()),
+      getMDStringOrNull(attr.getLinkageName()), file, attr.getLine(), type,
+      attr.getScopeLine(),
       /*ContainingType=*/nullptr, /*VirtualIndex=*/0,
       /*ThisAdjustment=*/0, llvm::DINode::FlagZero,
       static_cast<llvm::DISubprogram::DISPFlags>(attr.getSubprogramFlags()),
-      translate(attr.getCompileUnit()));
+      compileUnit);
+
+  if (attr.getId())
+    distinctAttrToNode.try_emplace(attr.getId(), node);
+  return node;
 }
 
 llvm::DIModule *DebugTranslation::translateImpl(DIModuleAttr attr) {
@@ -268,15 +335,23 @@ llvm::DINode *DebugTranslation::translate(DINodeAttr attr) {
   if (llvm::DINode *node = attrToNode.lookup(attr))
     return node;
 
-  llvm::DINode *node =
-      TypeSwitch<DINodeAttr, llvm::DINode *>(attr)
-          .Case<DIBasicTypeAttr, DICompileUnitAttr, DICompositeTypeAttr,
-                DIDerivedTypeAttr, DIFileAttr, DIGlobalVariableAttr,
-                DILabelAttr, DILexicalBlockAttr, DILexicalBlockFileAttr,
-                DILocalVariableAttr, DIModuleAttr, DINamespaceAttr,
-                DINullTypeAttr, DISubprogramAttr, DISubrangeAttr,
-                DISubroutineTypeAttr>(
-              [&](auto attr) { return translateImpl(attr); });
+  llvm::DINode *node = nullptr;
+  // Recursive types go through a dedicated handler. All other types are
+  // dispatched directly to their specific handlers.
+  if (auto recTypeAttr = dyn_cast<DIRecursiveTypeAttrInterface>(attr))
+    if (recTypeAttr.getRecId())
+      node = translateRecursive(recTypeAttr);
+
+  if (!node)
+    node = TypeSwitch<DINodeAttr, llvm::DINode *>(attr)
+               .Case<DIBasicTypeAttr, DICompileUnitAttr, DICompositeTypeAttr,
+                     DIDerivedTypeAttr, DIFileAttr, DIGlobalVariableAttr,
+                     DILabelAttr, DILexicalBlockAttr, DILexicalBlockFileAttr,
+                     DILocalVariableAttr, DIModuleAttr, DINamespaceAttr,
+                     DINullTypeAttr, DISubprogramAttr, DISubrangeAttr,
+                     DISubroutineTypeAttr>(
+                   [&](auto attr) { return translateImpl(attr); });
+
   attrToNode.insert({attr, node});
   return node;
 }
diff --git a/mlir/lib/Target/LLVMIR/DebugTranslation.h b/mlir/lib/Target/LLVMIR/DebugTranslation.h
index 627c6846844983..c7a5228cbf5e92 100644
--- a/mlir/lib/Target/LLVMIR/DebugTranslation.h
+++ b/mlir/lib/Target/LLVMIR/DebugTranslation.h
@@ -88,6 +88,17 @@ class DebugTranslation {
   llvm::DISubroutineType *translateImpl(DISubroutineTypeAttr attr);
   llvm::DIType *translateImpl(DITypeAttr attr);
 
+  /// Attributes that support self recursion need to implement an additional
+  /// method to hook into `translateRecursive`.
+  /// - `<temp llvm type> translateTemporaryImpl(<mlir type>)`:
+  ///   Create a temporary translation of the DI attr without recursively
+  ///   translating any nested DI attrs.
+  llvm::DIType *translateRecursive(DIRecursiveTypeAttrInterface attr);
+
+  /// Translate the given attribute to a temporary llvm debug metadata of the
+  /// corresponding type.
+  llvm::TempDICompositeType translateTemporaryImpl(DICompositeTypeAttr attr);
+
   /// Constructs a string metadata node from the string attribute. Returns
   /// nullptr if `stringAttr` is null or contains and empty string.
   llvm::MDString *getMDStringOrNull(StringAttr stringAttr);
@@ -102,6 +113,14 @@ class DebugTranslation {
   /// metadata.
   DenseMap<Attribute, llvm::DINode *> attrToNode;
 
+  /// A mapping between recursive ID and the translated DIType.
+  llvm::MapVector<DistinctAttr, llvm::DIType *> recursiveTypeMap;
+
+  /// A mapping between a distinct ID and the translated LLVM metadata node.
+  /// This helps identify attrs that should translate into the same LLVM debug
+  /// node.
+  DenseMap<DistinctAttr, llvm::DINode *> distinctAttrToNode;
+
   /// A mapping between filename and llvm debug file.
   /// TODO: Change this to DenseMap<Identifier, ...> when we can
   /// access the Identifier filename in FileLineColLoc.
diff --git a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp
index c00628a420a000..995544238e4a3c 100644
--- a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp
@@ -716,6 +716,8 @@ void ModuleTranslation::forgetMapping(Region &region) {
           branchMapping.erase(&op);
         if (isa<LLVM::GlobalOp>(op))
           globalsMapping.erase(&op);
+        if (isa<LLVM::CallOp>(op))
+          callMapping.erase(&op);
         llvm::append_range(
             toProcess,
             llvm::map_range(op.getRegions(), [](Region &r) { return &r; }));
diff --git a/mlir/lib/Tools/mlir-opt/MlirOptMain.cpp b/mlir/lib/Tools/mlir-opt/MlirOptMain.cpp
index da775f1b3051b3..44c5e9826f3b79 100644
--- a/mlir/lib/Tools/mlir-opt/MlirOptMain.cpp
+++ b/mlir/lib/Tools/mlir-opt/MlirOptMain.cpp
@@ -31,6 +31,7 @@
 #include "mlir/Pass/Pass.h"
 #include "mlir/Pass/PassManager.h"
 #include "mlir/Support/FileUtilities.h"
+#include "mlir/Support/LogicalResult.h"
 #include "mlir/Support/Timing.h"
 #include "mlir/Support/ToolUtilities.h"
 #include "mlir/Tools/ParseUtilities.h"
@@ -127,7 +128,7 @@ struct MlirOptMainConfigCLOptions : public MlirOptMainConfig {
         cl::desc("Print the list of registered dialects and exit"),
         cl::location(showDialectsFlag), cl::init(false));
 
-    static cl::opt<std::string, /*ExternalStorage=*/true> splitInputFile(
+    static cl::opt<std::string, /*ExternalStorage=*/true> splitInputFile{
         "split-input-file", llvm::cl::ValueOptional,
         cl::callback([&](const std::string &str) {
           // Implicit value: use default marker if flag was used without value.
@@ -136,7 +137,7 @@ struct MlirOptMainConfigCLOptions : public MlirOptMainConfig {
         }),
         cl::desc("Split the input file into chunks using the given or "
                  "default marker and process each chunk independently"),
-        cl::location(splitInputFileFlag), cl::init(""));
+        cl::location(splitInputFileFlag), cl::init("")};
 
     static cl::opt<std::string, /*ExternalStorage=*/true> outputSplitMarker(
         "output-split-marker",
@@ -513,15 +514,19 @@ mlir::registerAndParseCLIOptions(int argc, char **argv,
   return std::make_pair(inputFilename.getValue(), outputFilename.getValue());
 }
 
+static LogicalResult printRegisteredDialects(DialectRegistry &registry) {
+  llvm::outs() << "Available Dialects: ";
+  interleave(registry.getDialectNames(), llvm::outs(), ",");
+  llvm::outs() << "\n";
+  return success();
+}
+
 LogicalResult mlir::MlirOptMain(llvm::raw_ostream &outputStream,
                                 std::unique_ptr<llvm::MemoryBuffer> buffer,
                                 DialectRegistry &registry,
                                 const MlirOptMainConfig &config) {
-  if (config.shouldShowDialects()) {
-    llvm::outs() << "Available Dialects: ";
-    interleave(registry.getDialectNames(), llvm::outs(), ",");
-    llvm::outs() << "\n";
-  }
+  if (config.shouldShowDialects())
+    return printRegisteredDialects(registry);
 
   // The split-input-file mode is a very specific mode that slices the file
   // up into small pieces and checks each independently.
@@ -556,6 +561,9 @@ LogicalResult mlir::MlirOptMain(int argc, char **argv,
 
   MlirOptMainConfig config = MlirOptMainConfig::createFromCLOptions();
 
+  if (config.shouldShowDialects())
+    return printRegisteredDialects(registry);
+
   // When reading from stdin and the input is a tty, it is often a user mistake
   // and the process "appears to be stuck". Print a message to let the user know
   // about it!
diff --git a/mlir/lib/Tools/mlir-translate/MlirTranslateMain.cpp b/mlir/lib/Tools/mlir-translate/MlirTranslateMain.cpp
index 1aaf8adb50a7a5..bd9928950ecc76 100644
--- a/mlir/lib/Tools/mlir-translate/MlirTranslateMain.cpp
+++ b/mlir/lib/Tools/mlir-translate/MlirTranslateMain.cpp
@@ -62,7 +62,7 @@ LogicalResult mlir::mlirTranslateMain(int argc, char **argv,
       llvm::cl::desc("Allow operation with no registered dialects (discouraged: testing only!)"),
       llvm::cl::init(false));
 
-  static llvm::cl::opt<std::string> inputSplitMarker(
+  static llvm::cl::opt<std::string> inputSplitMarker{
       "split-input-file", llvm::cl::ValueOptional,
       llvm::cl::callback([&](const std::string &str) {
         // Implicit value: use default marker if flag was used without value.
@@ -71,7 +71,7 @@ LogicalResult mlir::mlirTranslateMain(int argc, char **argv,
       }),
       llvm::cl::desc("Split the input file into chunks using the given or "
                      "default marker and process each chunk independently"),
-      llvm::cl::init(""));
+      llvm::cl::init("")};
 
   static llvm::cl::opt<bool> verifyDiagnostics(
       "verify-diagnostics",
diff --git a/mlir/lib/Transforms/Mem2Reg.cpp b/mlir/lib/Transforms/Mem2Reg.cpp
index f3a973d9994083..84ac69b4514b4f 100644
--- a/mlir/lib/Transforms/Mem2Reg.cpp
+++ b/mlir/lib/Transforms/Mem2Reg.cpp
@@ -14,10 +14,8 @@
 #include "mlir/IR/Value.h"
 #include "mlir/Interfaces/ControlFlowInterfaces.h"
 #include "mlir/Interfaces/MemorySlotInterfaces.h"
-#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
 #include "mlir/Transforms/Passes.h"
 #include "mlir/Transforms/RegionUtils.h"
-#include "llvm/ADT/PostOrderIterator.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/GenericIteratedDominanceFrontier.h"
@@ -635,13 +633,6 @@ LogicalResult mlir::tryToPromoteMemorySlots(
   return success(promotedAny);
 }
 
-LogicalResult
-Mem2RegPattern::matchAndRewrite(PromotableAllocationOpInterface allocator,
-                                PatternRewriter &rewriter) const {
-  hasBoundedRewriteRecursion();
-  return tryToPromoteMemorySlots({allocator}, rewriter, statistics);
-}
-
 namespace {
 
 struct Mem2Reg : impl::Mem2RegBase<Mem2Reg> {
@@ -650,17 +641,36 @@ struct Mem2Reg : impl::Mem2RegBase<Mem2Reg> {
   void runOnOperation() override {
     Operation *scopeOp = getOperation();
 
-    Mem2RegStatistics statictics{&promotedAmount, &newBlockArgumentAmount};
+    Mem2RegStatistics statistics{&promotedAmount, &newBlockArgumentAmount};
+
+    bool changed = false;
+
+    for (Region &region : scopeOp->getRegions()) {
+      if (region.getBlocks().empty())
+        continue;
 
-    GreedyRewriteConfig config;
-    config.enableRegionSimplification = enableRegionSimplification;
+      OpBuilder builder(&region.front(), region.front().begin());
+      IRRewriter rewriter(builder);
 
-    RewritePatternSet rewritePatterns(&getContext());
-    rewritePatterns.add<Mem2RegPattern>(&getContext(), statictics);
-    FrozenRewritePatternSet frozen(std::move(rewritePatterns));
+      // Promoting a slot can allow for further promotion of other slots,
+      // promotion is tried until no promotion succeeds.
+      while (true) {
+        SmallVector<PromotableAllocationOpInterface> allocators;
+        // Build a list of allocators to attempt to promote the slots of.
+        region.walk([&](PromotableAllocationOpInterface allocator) {
+          allocators.emplace_back(allocator);
+        });
 
-    if (failed(applyPatternsAndFoldGreedily(scopeOp, frozen, config)))
-      signalPassFailure();
+        // Attempt promoting until no promotion succeeds.
+        if (failed(tryToPromoteMemorySlots(allocators, rewriter, statistics)))
+          break;
+
+        changed = true;
+        getAnalysisManager().invalidate({});
+      }
+    }
+    if (!changed)
+      markAllAnalysesPreserved();
   }
 };
 
diff --git a/mlir/lib/Transforms/SROA.cpp b/mlir/lib/Transforms/SROA.cpp
index 3ceda51e1c894c..6111489bdebefd 100644
--- a/mlir/lib/Transforms/SROA.cpp
+++ b/mlir/lib/Transforms/SROA.cpp
@@ -9,7 +9,6 @@
 #include "mlir/Transforms/SROA.h"
 #include "mlir/Analysis/SliceAnalysis.h"
 #include "mlir/Interfaces/MemorySlotInterfaces.h"
-#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
 #include "mlir/Transforms/Passes.h"
 
 namespace mlir {
@@ -205,13 +204,6 @@ LogicalResult mlir::tryToDestructureMemorySlots(
   return success(destructuredAny);
 }
 
-LogicalResult
-SROAPattern::matchAndRewrite(DestructurableAllocationOpInterface allocator,
-                             PatternRewriter &rewriter) const {
-  hasBoundedRewriteRecursion();
-  return tryToDestructureMemorySlots({allocator}, rewriter, statistics);
-}
-
 namespace {
 
 struct SROA : public impl::SROABase<SROA> {
@@ -223,12 +215,35 @@ struct SROA : public impl::SROABase<SROA> {
     SROAStatistics statistics{&destructuredAmount, &slotsWithMemoryBenefit,
                               &maxSubelementAmount};
 
-    RewritePatternSet rewritePatterns(&getContext());
-    rewritePatterns.add<SROAPattern>(&getContext(), statistics);
-    FrozenRewritePatternSet frozen(std::move(rewritePatterns));
+    bool changed = false;
+
+    for (Region &region : scopeOp->getRegions()) {
+      if (region.getBlocks().empty())
+        continue;
 
-    if (failed(applyPatternsAndFoldGreedily(scopeOp, frozen)))
-      signalPassFailure();
+      OpBuilder builder(&region.front(), region.front().begin());
+      IRRewriter rewriter(builder);
+
+      // Destructuring a slot can allow for further destructuring of other
+      // slots, destructuring is tried until no destructuring succeeds.
+      while (true) {
+        SmallVector<DestructurableAllocationOpInterface> allocators;
+        // Build a list of allocators to attempt to destructure the slots of.
+        // TODO: Update list on the fly to avoid repeated visiting of the same
+        // allocators.
+        region.walk([&](DestructurableAllocationOpInterface allocator) {
+          allocators.emplace_back(allocator);
+        });
+
+        if (failed(
+                tryToDestructureMemorySlots(allocators, rewriter, statistics)))
+          break;
+
+        changed = true;
+      }
+    }
+    if (!changed)
+      markAllAnalysesPreserved();
   }
 };
 
diff --git a/mlir/test/CAPI/llvm.c b/mlir/test/CAPI/llvm.c
index 2fd98b29f487c8..48b887ee4a954c 100644
--- a/mlir/test/CAPI/llvm.c
+++ b/mlir/test/CAPI/llvm.c
@@ -301,7 +301,7 @@ static void testDebugInfoAttributes(MlirContext ctx) {
 
   // CHECK: #llvm.di_composite_type<{{.*}}>
   mlirAttributeDump(mlirLLVMDICompositeTypeAttrGet(
-      ctx, 0, foo, file, 1, compile_unit, di_type, 0, 64, 8, 1, &di_type));
+      ctx, 0, id, foo, file, 1, compile_unit, di_type, 0, 64, 8, 1, &di_type));
 
   MlirAttribute subroutine_type =
       mlirLLVMDISubroutineTypeAttrGet(ctx, 0x0, 1, &di_type);
diff --git a/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir b/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir
index 0ac7331e1f6987..8920bf86d89b1d 100644
--- a/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir
+++ b/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir
@@ -680,3 +680,15 @@ func.func @fence_proxy() {
   nvvm.fence.proxy { kind = #nvvm.proxy_kind<async.shared>, space = #nvvm.shared_space<cluster>}
   func.return
 }
+
+// -----
+
+// CHECK-LABEL: @llvm_nvvm_barrier_arrive
+// CHECK-SAME: (%[[barId:.*]]: i32, %[[numberOfThreads:.*]]: i32)
+llvm.func @llvm_nvvm_barrier_arrive(%barID : i32, %numberOfThreads : i32) {
+  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "bar.arrive 0, $0;", "r" %[[numberOfThreads]] : (i32) -> ()
+  nvvm.barrier.arrive number_of_threads = %numberOfThreads
+  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "bar.arrive $0, $1'", "r,r" %[[barId]], %[[numberOfThreads]] : (i32, i32) -> ()
+  nvvm.barrier.arrive id = %barID number_of_threads = %numberOfThreads
+  llvm.return
+}
diff --git a/mlir/test/Conversion/SPIRVToLLVM/misc-ops-to-llvm.mlir b/mlir/test/Conversion/SPIRVToLLVM/misc-ops-to-llvm.mlir
index 9fe1e532dfc77c..31da59dcdc7260 100644
--- a/mlir/test/Conversion/SPIRVToLLVM/misc-ops-to-llvm.mlir
+++ b/mlir/test/Conversion/SPIRVToLLVM/misc-ops-to-llvm.mlir
@@ -43,18 +43,18 @@ spirv.func @composite_insert_vector(%arg0: vector<3xf32>, %arg1: f32) "None" {
 //===----------------------------------------------------------------------===//
 
 // CHECK-LABEL: @select_scalar
-spirv.func @select_scalar(%arg0: i1, %arg1: vector<3xi32>, %arg2: f32) "None" {
+spirv.func @select_scalar(%arg0: i1, %arg1: vector<3xi32>, %arg2: vector<3xi32>, %arg3: f32, %arg4: f32) "None" {
   // CHECK: llvm.select %{{.*}}, %{{.*}}, %{{.*}} : i1, vector<3xi32>
-  %0 = spirv.Select %arg0, %arg1, %arg1 : i1, vector<3xi32>
+  %0 = spirv.Select %arg0, %arg1, %arg2 : i1, vector<3xi32>
   // CHECK: llvm.select %{{.*}}, %{{.*}}, %{{.*}} : i1, f32
-  %1 = spirv.Select %arg0, %arg2, %arg2 : i1, f32
+  %1 = spirv.Select %arg0, %arg3, %arg4 : i1, f32
   spirv.Return
 }
 
 // CHECK-LABEL: @select_vector
-spirv.func @select_vector(%arg0: vector<2xi1>, %arg1: vector<2xi32>) "None" {
+spirv.func @select_vector(%arg0: vector<2xi1>, %arg1: vector<2xi32>, %arg2: vector<2xi32>) "None" {
   // CHECK: llvm.select %{{.*}}, %{{.*}}, %{{.*}} : vector<2xi1>, vector<2xi32>
-  %0 = spirv.Select %arg0, %arg1, %arg1 : vector<2xi1>, vector<2xi32>
+  %0 = spirv.Select %arg0, %arg1, %arg2 : vector<2xi1>, vector<2xi32>
   spirv.Return
 }
 
diff --git a/mlir/test/Dialect/ArmNeon/lower-to-arm-neon.mlir b/mlir/test/Dialect/ArmNeon/lower-to-arm-neon.mlir
index cba7b00ba77a82..e2be87453bf6f2 100644
--- a/mlir/test/Dialect/ArmNeon/lower-to-arm-neon.mlir
+++ b/mlir/test/Dialect/ArmNeon/lower-to-arm-neon.mlir
@@ -40,3 +40,97 @@ func.func @test_lower_vector_arm_neon_without_extsi(%lhs: vector<2x8xi32>, %rhs:
   %res = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %lhs, %rhs, %acc : vector<2x8xi32>, vector<2x8xi32> into vector<2x2xi32>
   return %res : vector<2x2xi32>
 }
+
+// -----
+
+// CHECK-LABEL: test_lower_vector_arm_neon_unroll
+// CHECK-SAME: %[[VAL_0:.*]]: vector<4x8xi8>, %[[VAL_1:.*]]: vector<4x8xi8>, %[[VAL_2:.*]]: vector<4x4xi32>
+// CHECK-DAG:  %[[VAL_3:.*]] = arith.constant dense<0> : vector<4x4xi32>
+// CHECK-DAG:  %[[VAL_4:.*]] = vector.extract_strided_slice %[[VAL_0]] {offsets = [0, 0], sizes = [2, 8], strides = [1, 1]} : vector<4x8xi8> to vector<2x8xi8>
+// CHECK-DAG:  %[[VAL_5:.*]] = vector.extract_strided_slice %[[VAL_1]] {offsets = [0, 0], sizes = [2, 8], strides = [1, 1]} : vector<4x8xi8> to vector<2x8xi8>
+// CHECK-DAG:  %[[VAL_6:.*]] = vector.extract_strided_slice %[[VAL_2]] {offsets = [0, 0], sizes = [2, 2], strides = [1, 1]} : vector<4x4xi32> to vector<2x2xi32>
+// CHECK-DAG:  %[[VAL_7:.*]] = vector.shape_cast %[[VAL_4]] : vector<2x8xi8> to vector<16xi8>
+// CHECK-DAG:  %[[VAL_8:.*]] = vector.shape_cast %[[VAL_5]] : vector<2x8xi8> to vector<16xi8>
+// CHECK-DAG:  %[[VAL_9:.*]] = vector.shape_cast %[[VAL_6]] : vector<2x2xi32> to vector<4xi32>
+// CHECK-DAG:  %[[VAL_10:.*]] = arm_neon.intr.smmla %[[VAL_9]], %[[VAL_7]], %[[VAL_8]] : vector<16xi8> to vector<4xi32>
+// CHECK-DAG:  %[[VAL_11:.*]] = vector.shape_cast %[[VAL_10]] : vector<4xi32> to vector<2x2xi32>
+// CHECK-DAG:  %[[VAL_12:.*]] = vector.insert_strided_slice %[[VAL_11]], %[[VAL_3]] {offsets = [0, 0], strides = [1, 1]} : vector<2x2xi32> into vector<4x4xi32>
+// CHECK-DAG:  %[[VAL_13:.*]] = vector.extract_strided_slice %[[VAL_0]] {offsets = [0, 0], sizes = [2, 8], strides = [1, 1]} : vector<4x8xi8> to vector<2x8xi8>
+// CHECK-DAG:  %[[VAL_14:.*]] = vector.extract_strided_slice %[[VAL_1]] {offsets = [2, 0], sizes = [2, 8], strides = [1, 1]} : vector<4x8xi8> to vector<2x8xi8>
+// CHECK-DAG:  %[[VAL_15:.*]] = vector.extract_strided_slice %[[VAL_2]] {offsets = [0, 2], sizes = [2, 2], strides = [1, 1]} : vector<4x4xi32> to vector<2x2xi32>
+// CHECK-DAG:  %[[VAL_16:.*]] = vector.shape_cast %[[VAL_13]] : vector<2x8xi8> to vector<16xi8>
+// CHECK-DAG:  %[[VAL_17:.*]] = vector.shape_cast %[[VAL_14]] : vector<2x8xi8> to vector<16xi8>
+// CHECK-DAG:  %[[VAL_18:.*]] = vector.shape_cast %[[VAL_15]] : vector<2x2xi32> to vector<4xi32>
+// CHECK-DAG:  %[[VAL_19:.*]] = arm_neon.intr.smmla %[[VAL_18]], %[[VAL_16]], %[[VAL_17]] : vector<16xi8> to vector<4xi32>
+// CHECK-DAG:  %[[VAL_20:.*]] = vector.shape_cast %[[VAL_19]] : vector<4xi32> to vector<2x2xi32>
+// CHECK-DAG:  %[[VAL_21:.*]] = vector.insert_strided_slice %[[VAL_20]], %[[VAL_12]] {offsets = [0, 2], strides = [1, 1]} : vector<2x2xi32> into vector<4x4xi32>
+// CHECK-DAG:  %[[VAL_22:.*]] = vector.extract_strided_slice %[[VAL_0]] {offsets = [2, 0], sizes = [2, 8], strides = [1, 1]} : vector<4x8xi8> to vector<2x8xi8>
+// CHECK-DAG:  %[[VAL_23:.*]] = vector.extract_strided_slice %[[VAL_1]] {offsets = [0, 0], sizes = [2, 8], strides = [1, 1]} : vector<4x8xi8> to vector<2x8xi8>
+// CHECK-DAG:  %[[VAL_24:.*]] = vector.extract_strided_slice %[[VAL_2]] {offsets = [2, 0], sizes = [2, 2], strides = [1, 1]} : vector<4x4xi32> to vector<2x2xi32>
+// CHECK-DAG:  %[[VAL_25:.*]] = vector.shape_cast %[[VAL_22]] : vector<2x8xi8> to vector<16xi8>
+// CHECK-DAG:  %[[VAL_26:.*]] = vector.shape_cast %[[VAL_23]] : vector<2x8xi8> to vector<16xi8>
+// CHECK-DAG:  %[[VAL_27:.*]] = vector.shape_cast %[[VAL_24]] : vector<2x2xi32> to vector<4xi32>
+// CHECK-DAG:  %[[VAL_28:.*]] = arm_neon.intr.smmla %[[VAL_27]], %[[VAL_25]], %[[VAL_26]] : vector<16xi8> to vector<4xi32>
+// CHECK-DAG:  %[[VAL_29:.*]] = vector.shape_cast %[[VAL_28]] : vector<4xi32> to vector<2x2xi32>
+// CHECK-DAG:  %[[VAL_30:.*]] = vector.insert_strided_slice %[[VAL_29]], %[[VAL_21]] {offsets = [2, 0], strides = [1, 1]} : vector<2x2xi32> into vector<4x4xi32>
+// CHECK-DAG:  %[[VAL_31:.*]] = vector.extract_strided_slice %[[VAL_0]] {offsets = [2, 0], sizes = [2, 8], strides = [1, 1]} : vector<4x8xi8> to vector<2x8xi8>
+// CHECK-DAG:  %[[VAL_32:.*]] = vector.extract_strided_slice %[[VAL_1]] {offsets = [2, 0], sizes = [2, 8], strides = [1, 1]} : vector<4x8xi8> to vector<2x8xi8>
+// CHECK-DAG:  %[[VAL_33:.*]] = vector.extract_strided_slice %[[VAL_2]] {offsets = [2, 2], sizes = [2, 2], strides = [1, 1]} : vector<4x4xi32> to vector<2x2xi32>
+// CHECK-DAG:  %[[VAL_34:.*]] = vector.shape_cast %[[VAL_31]] : vector<2x8xi8> to vector<16xi8>
+// CHECK-DAG:  %[[VAL_35:.*]] = vector.shape_cast %[[VAL_32]] : vector<2x8xi8> to vector<16xi8>
+// CHECK-DAG:  %[[VAL_36:.*]] = vector.shape_cast %[[VAL_33]] : vector<2x2xi32> to vector<4xi32>
+// CHECK-DAG:  %[[VAL_37:.*]] = arm_neon.intr.smmla %[[VAL_36]], %[[VAL_34]], %[[VAL_35]] : vector<16xi8> to vector<4xi32>
+// CHECK-DAG:  %[[VAL_38:.*]] = vector.shape_cast %[[VAL_37]] : vector<4xi32> to vector<2x2xi32>
+// CHECK-DAG:  %[[VAL_39:.*]] = vector.insert_strided_slice %[[VAL_38]], %[[VAL_30]] {offsets = [2, 2], strides = [1, 1]} : vector<2x2xi32> into vector<4x4xi32>
+// CHECK-DAG:  return %[[VAL_39]] : vector<4x4xi32>
+// CHECK-DAG:  }
+func.func @test_lower_vector_arm_neon_unroll(%lhs: vector<4x8xi8>, %rhs: vector<4x8xi8>, %acc : vector<4x4xi32>) -> vector<4x4xi32> {
+  %lhs_extsi = arith.extsi %lhs : vector<4x8xi8> to vector<4x8xi32>
+  %rhs_extsi = arith.extsi %rhs : vector<4x8xi8> to vector<4x8xi32>
+  %res = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %lhs_extsi, %rhs_extsi, %acc : vector<4x8xi32>, vector<4x8xi32> into vector<4x4xi32>
+  return %res : vector<4x4xi32>
+}
+
+// -----
+
+// CHECK-LABEL:   func.func @test_lower_vector_arm_neon_mixed_unroll(
+// CHECK-SAME:                                                       %[[VAL_0:.*]]: vector<4x8xi8>,
+// CHECK-SAME:                                                       %[[VAL_1:.*]]: vector<2x8xi4>,
+// CHECK-SAME:                                                       %[[VAL_2:.*]]: vector<4x2xi32>) -> vector<4x2xi32> {
+// CHECK-DAG:  %[[VAL_3:.*]] = arith.constant dense<0> : vector<4x2xi32>
+// CHECK-DAG:  %[[VAL_4:.*]] = arith.extsi %[[VAL_1]] : vector<2x8xi4> to vector<2x8xi8>
+// CHECK-DAG:  %[[VAL_5:.*]] = vector.extract_strided_slice %[[VAL_0]] {offsets = [0, 0], sizes = [2, 8], strides = [1, 1]} : vector<4x8xi8> to vector<2x8xi8>
+// CHECK-DAG:  %[[VAL_6:.*]] = vector.extract_strided_slice %[[VAL_2]] {offsets = [0, 0], sizes = [2, 2], strides = [1, 1]} : vector<4x2xi32> to vector<2x2xi32>
+// CHECK-DAG:  %[[VAL_7:.*]] = vector.shape_cast %[[VAL_5]] : vector<2x8xi8> to vector<16xi8>
+// CHECK-DAG:  %[[VAL_8:.*]] = vector.shape_cast %[[VAL_4]] : vector<2x8xi8> to vector<16xi8>
+// CHECK-DAG:  %[[VAL_9:.*]] = vector.shape_cast %[[VAL_6]] : vector<2x2xi32> to vector<4xi32>
+// CHECK-DAG:  %[[VAL_10:.*]] = arm_neon.intr.smmla %[[VAL_9]], %[[VAL_7]], %[[VAL_8]] : vector<16xi8> to vector<4xi32>
+// CHECK-DAG:  %[[VAL_11:.*]] = vector.shape_cast %[[VAL_10]] : vector<4xi32> to vector<2x2xi32>
+// CHECK-DAG:  %[[VAL_12:.*]] = vector.insert_strided_slice %[[VAL_11]], %[[VAL_3]] {offsets = [0, 0], strides = [1, 1]} : vector<2x2xi32> into vector<4x2xi32>
+// CHECK-DAG:  %[[VAL_13:.*]] = vector.extract_strided_slice %[[VAL_0]] {offsets = [2, 0], sizes = [2, 8], strides = [1, 1]} : vector<4x8xi8> to vector<2x8xi8>
+// CHECK-DAG:  %[[VAL_14:.*]] = vector.extract_strided_slice %[[VAL_2]] {offsets = [2, 0], sizes = [2, 2], strides = [1, 1]} : vector<4x2xi32> to vector<2x2xi32>
+// CHECK-DAG:  %[[VAL_15:.*]] = vector.shape_cast %[[VAL_13]] : vector<2x8xi8> to vector<16xi8>
+// CHECK-DAG:  %[[VAL_16:.*]] = vector.shape_cast %[[VAL_4]] : vector<2x8xi8> to vector<16xi8>
+// CHECK-DAG:  %[[VAL_17:.*]] = vector.shape_cast %[[VAL_14]] : vector<2x2xi32> to vector<4xi32>
+// CHECK-DAG:  %[[VAL_18:.*]] = arm_neon.intr.smmla %[[VAL_17]], %[[VAL_15]], %[[VAL_16]] : vector<16xi8> to vector<4xi32>
+// CHECK-DAG:  %[[VAL_19:.*]] = vector.shape_cast %[[VAL_18]] : vector<4xi32> to vector<2x2xi32>
+// CHECK-DAG:  %[[VAL_20:.*]] = vector.insert_strided_slice %[[VAL_19]], %[[VAL_12]] {offsets = [2, 0], strides = [1, 1]} : vector<2x2xi32> into vector<4x2xi32>
+// CHECK-DAG:  return %[[VAL_20]] : vector<4x2xi32>
+// CHECK-DAG:  }
+func.func @test_lower_vector_arm_neon_mixed_unroll(%lhs: vector<4x8xi8>, %rhs: vector<2x8xi4>, %acc : vector<4x2xi32>) -> vector<4x2xi32> {
+  %lhs_extsi = arith.extsi %lhs : vector<4x8xi8> to vector<4x8xi32>
+  %rhs_extsi = arith.extsi %rhs : vector<2x8xi4> to vector<2x8xi32>
+  %res = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %lhs_extsi, %rhs_extsi, %acc : vector<4x8xi32>, vector<2x8xi32> into vector<4x2xi32>
+  return %res : vector<4x2xi32>
+}
+
+// -----
+
+// CHECK-LABEL:   func.func @test_lower_vector_arm_neon_unroll_incompatible_shape(
+// CHECK-DAG:  %[[result:.*]] = vector.contract
+func.func @test_lower_vector_arm_neon_unroll_incompatible_shape(%lhs: vector<4x12xi8>, %rhs: vector<4x12xi8>, %acc : vector<4x4xi32>) -> vector<4x4xi32> {
+  %lhs_extsi = arith.extsi %lhs : vector<4x12xi8> to vector<4x12xi32>
+  %rhs_extsi = arith.extsi %rhs : vector<4x12xi8> to vector<4x12xi32>
+  %res = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %lhs_extsi, %rhs_extsi, %acc : vector<4x12xi32>, vector<4x12xi32> into vector<4x4xi32>
+  return %res : vector<4x4xi32>
+}
diff --git a/mlir/test/Dialect/Bufferization/invalid.mlir b/mlir/test/Dialect/Bufferization/invalid.mlir
index 83f8ef78615432..4ebdb0a8f0490e 100644
--- a/mlir/test/Dialect/Bufferization/invalid.mlir
+++ b/mlir/test/Dialect/Bufferization/invalid.mlir
@@ -26,29 +26,6 @@ func.func @alloc_tensor_copy_and_dims(%t: tensor<?xf32>, %sz: index) {
 
 // -----
 
-#DCSR = #sparse_tensor.encoding<{ map = (d0, d1) -> (d0 : compressed, d1 : compressed) }>
-
-func.func @sparse_alloc_direct_return() -> tensor<20x40xf32, #DCSR> {
-  // expected-error @+1{{sparse tensor allocation should not escape function}}
-  %0 = bufferization.alloc_tensor() : tensor<20x40xf32, #DCSR>
-  return %0 : tensor<20x40xf32, #DCSR>
-}
-
-// -----
-
-#DCSR = #sparse_tensor.encoding<{ map = (d0, d1) -> (d0 : compressed, d1 : compressed) }>
-
-func.func private @foo(tensor<20x40xf32, #DCSR>) -> ()
-
-func.func @sparse_alloc_call() {
-  // expected-error @+1{{sparse tensor allocation should not escape function}}
-  %0 = bufferization.alloc_tensor() : tensor<20x40xf32, #DCSR>
-  call @foo(%0) : (tensor<20x40xf32, #DCSR>) -> ()
-  return
-}
-
-// -----
-
 // expected-error @+1{{invalid value for 'bufferization.access'}}
 func.func private @invalid_buffer_access_type(tensor<*xf32> {bufferization.access = "foo"})
 
diff --git a/mlir/test/Dialect/LLVMIR/mem2reg-intrinsics.mlir b/mlir/test/Dialect/LLVMIR/mem2reg-intrinsics.mlir
index ce6338fb348837..4fc80a87f20df5 100644
--- a/mlir/test/Dialect/LLVMIR/mem2reg-intrinsics.mlir
+++ b/mlir/test/Dialect/LLVMIR/mem2reg-intrinsics.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s --pass-pipeline="builtin.module(llvm.func(mem2reg{region-simplify=false}))" --split-input-file | FileCheck %s
+// RUN: mlir-opt %s --pass-pipeline="builtin.module(llvm.func(mem2reg))" --split-input-file | FileCheck %s
 
 // CHECK-LABEL: llvm.func @basic_memset
 // CHECK-SAME: (%[[MEMSET_VALUE:.*]]: i8)
@@ -6,13 +6,13 @@ llvm.func @basic_memset(%memset_value: i8) -> i32 {
   %0 = llvm.mlir.constant(1 : i32) : i32
   %1 = llvm.alloca %0 x i32 {alignment = 4 : i64} : (i32) -> !llvm.ptr
   %memset_len = llvm.mlir.constant(4 : i32) : i32
-  // CHECK-DAG: %[[C8:.*]] = llvm.mlir.constant(8 : i32) : i32
-  // CHECK-DAG: %[[C16:.*]] = llvm.mlir.constant(16 : i32) : i32
   "llvm.intr.memset"(%1, %memset_value, %memset_len) <{isVolatile = false}> : (!llvm.ptr, i8, i32) -> ()
   // CHECK-NOT: "llvm.intr.memset"
   // CHECK: %[[VALUE_8:.*]] = llvm.zext %[[MEMSET_VALUE]] : i8 to i32
+  // CHECK: %[[C8:.*]] = llvm.mlir.constant(8 : i32) : i32
   // CHECK: %[[SHIFTED_8:.*]] = llvm.shl %[[VALUE_8]], %[[C8]]
   // CHECK: %[[VALUE_16:.*]] = llvm.or %[[VALUE_8]], %[[SHIFTED_8]]
+  // CHECK: %[[C16:.*]] = llvm.mlir.constant(16 : i32) : i32
   // CHECK: %[[SHIFTED_16:.*]] = llvm.shl %[[VALUE_16]], %[[C16]]
   // CHECK: %[[VALUE_32:.*]] = llvm.or %[[VALUE_16]], %[[SHIFTED_16]]
   // CHECK-NOT: "llvm.intr.memset"
@@ -31,7 +31,14 @@ llvm.func @basic_memset_constant() -> i32 {
   %memset_len = llvm.mlir.constant(4 : i32) : i32
   "llvm.intr.memset"(%1, %memset_value, %memset_len) <{isVolatile = false}> : (!llvm.ptr, i8, i32) -> ()
   %2 = llvm.load %1 {alignment = 4 : i64} : !llvm.ptr -> i32
-  // CHECK: %[[RES:.*]] = llvm.mlir.constant(707406378 : i32) : i32
+  // CHECK: %[[C42:.*]] = llvm.mlir.constant(42 : i8) : i8
+  // CHECK: %[[VALUE_42:.*]] = llvm.zext %[[C42]] : i8 to i32
+  // CHECK: %[[C8:.*]] = llvm.mlir.constant(8 : i32) : i32
+  // CHECK: %[[SHIFTED_42:.*]] = llvm.shl %[[VALUE_42]], %[[C8]]  : i32
+  // CHECK: %[[OR0:.*]] = llvm.or %[[VALUE_42]], %[[SHIFTED_42]]  : i32
+  // CHECK: %[[C16:.*]] = llvm.mlir.constant(16 : i32) : i32
+  // CHECK: %[[SHIFTED:.*]] = llvm.shl %[[OR0]], %[[C16]]  : i32
+  // CHECK: %[[RES:..*]] = llvm.or %[[OR0]], %[[SHIFTED]]  : i32
   // CHECK: llvm.return %[[RES]] : i32
   llvm.return %2 : i32
 }
@@ -44,16 +51,16 @@ llvm.func @exotic_target_memset(%memset_value: i8) -> i40 {
   %0 = llvm.mlir.constant(1 : i32) : i32
   %1 = llvm.alloca %0 x i40 {alignment = 4 : i64} : (i32) -> !llvm.ptr
   %memset_len = llvm.mlir.constant(5 : i32) : i32
-  // CHECK-DAG: %[[C8:.*]] = llvm.mlir.constant(8 : i40) : i40
-  // CHECK-DAG: %[[C16:.*]] = llvm.mlir.constant(16 : i40) : i40
-  // CHECK-DAG: %[[C32:.*]] = llvm.mlir.constant(32 : i40) : i40
   "llvm.intr.memset"(%1, %memset_value, %memset_len) <{isVolatile = false}> : (!llvm.ptr, i8, i32) -> ()
   // CHECK-NOT: "llvm.intr.memset"
   // CHECK: %[[VALUE_8:.*]] = llvm.zext %[[MEMSET_VALUE]] : i8 to i40
+  // CHECK: %[[C8:.*]] = llvm.mlir.constant(8 : i40) : i40
   // CHECK: %[[SHIFTED_8:.*]] = llvm.shl %[[VALUE_8]], %[[C8]]
   // CHECK: %[[VALUE_16:.*]] = llvm.or %[[VALUE_8]], %[[SHIFTED_8]]
+  // CHECK: %[[C16:.*]] = llvm.mlir.constant(16 : i40) : i40
   // CHECK: %[[SHIFTED_16:.*]] = llvm.shl %[[VALUE_16]], %[[C16]]
   // CHECK: %[[VALUE_32:.*]] = llvm.or %[[VALUE_16]], %[[SHIFTED_16]]
+  // CHECK: %[[C32:.*]] = llvm.mlir.constant(32 : i40) : i40
   // CHECK: %[[SHIFTED_COMPL:.*]] = llvm.shl %[[VALUE_32]], %[[C32]]
   // CHECK: %[[VALUE_COMPL:.*]] = llvm.or %[[VALUE_32]], %[[SHIFTED_COMPL]]
   // CHECK-NOT: "llvm.intr.memset"
@@ -64,21 +71,6 @@ llvm.func @exotic_target_memset(%memset_value: i8) -> i40 {
 
 // -----
 
-// CHECK-LABEL: llvm.func @exotic_target_memset_constant
-llvm.func @exotic_target_memset_constant() -> i40 {
-  %0 = llvm.mlir.constant(1 : i32) : i32
-  %1 = llvm.alloca %0 x i40 {alignment = 4 : i64} : (i32) -> !llvm.ptr
-  %memset_value = llvm.mlir.constant(42 : i8) : i8
-  %memset_len = llvm.mlir.constant(5 : i32) : i32
-  "llvm.intr.memset"(%1, %memset_value, %memset_len) <{isVolatile = false}> : (!llvm.ptr, i8, i32) -> ()
-  %2 = llvm.load %1 {alignment = 4 : i64} : !llvm.ptr -> i40
-  // CHECK: %[[RES:.*]] = llvm.mlir.constant(181096032810 : i40) : i40
-  // CHECK: llvm.return %[[RES]] : i40
-  llvm.return %2 : i40
-}
-
-// -----
-
 // CHECK-LABEL: llvm.func @no_volatile_memset
 llvm.func @no_volatile_memset() -> i32 {
   // CHECK-DAG: %[[ALLOCA_LEN:.*]] = llvm.mlir.constant(1 : i32) : i32
@@ -195,7 +187,7 @@ llvm.func @basic_memcpy_dest(%destination: !llvm.ptr) -> i32 {
 // CHECK-LABEL: llvm.func @double_memcpy
 llvm.func @double_memcpy() -> i32 {
   %0 = llvm.mlir.constant(1 : i32) : i32
-  // CHECK-NEXT: %[[DATA:.*]] = llvm.mlir.constant(42 : i32) : i32
+  // CHECK: %[[DATA:.*]] = llvm.mlir.constant(42 : i32) : i32
   %data = llvm.mlir.constant(42 : i32) : i32
   %is_volatile = llvm.mlir.constant(false) : i1
   %memcpy_len = llvm.mlir.constant(4 : i32) : i32
@@ -206,7 +198,7 @@ llvm.func @double_memcpy() -> i32 {
   "llvm.intr.memcpy"(%2, %1, %memcpy_len) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i32) -> ()
 
   %res = llvm.load %2 : !llvm.ptr -> i32
-  // CHECK-NEXT: llvm.return %[[DATA]] : i32
+  // CHECK: llvm.return %[[DATA]] : i32
   llvm.return %res : i32
 }
 
diff --git a/mlir/test/Dialect/LLVMIR/nvvm.mlir b/mlir/test/Dialect/LLVMIR/nvvm.mlir
index f35393c5e95748..de2904d15b647b 100644
--- a/mlir/test/Dialect/LLVMIR/nvvm.mlir
+++ b/mlir/test/Dialect/LLVMIR/nvvm.mlir
@@ -55,6 +55,16 @@ llvm.func @llvm_nvvm_barrier(%barId : i32, %numberOfThreads : i32) {
   llvm.return
 }
 
+// CHECK-LABEL: @llvm_nvvm_barrier_arrive
+// CHECK-SAME: (%[[barId:.*]]: i32, %[[numberOfThreads:.*]]: i32)
+llvm.func @llvm_nvvm_barrier_arrive(%barId : i32, %numberOfThreads : i32) {
+  // CHECK: nvvm.barrier.arrive number_of_threads = %[[numberOfThreads]]
+  nvvm.barrier.arrive number_of_threads = %numberOfThreads
+  // CHECK: nvvm.barrier.arrive id = %[[barId]] number_of_threads = %[[numberOfThreads]]
+  nvvm.barrier.arrive id = %barId number_of_threads = %numberOfThreads
+  llvm.return
+}
+
 // CHECK-LABEL: @llvm_nvvm_cluster_arrive
 func.func @llvm_nvvm_cluster_arrive() {
   // CHECK: nvvm.cluster.arrive
diff --git a/mlir/test/Dialect/LLVMIR/sroa-intrinsics.mlir b/mlir/test/Dialect/LLVMIR/sroa-intrinsics.mlir
index c2e3458134ba4b..ba73025814cc05 100644
--- a/mlir/test/Dialect/LLVMIR/sroa-intrinsics.mlir
+++ b/mlir/test/Dialect/LLVMIR/sroa-intrinsics.mlir
@@ -146,12 +146,10 @@ llvm.func @invalid_indirect_memset() -> i32 {
 
 // CHECK-LABEL: llvm.func @memset_double_use
 llvm.func @memset_double_use() -> i32 {
-  // CHECK-DAG: %[[ALLOCA_LEN:.*]] = llvm.mlir.constant(1 : i32) : i32
-  // CHECK-DAG: %[[ALLOCA_INT:.*]] = llvm.alloca %[[ALLOCA_LEN]] x i32
-  // CHECK-DAG: %[[ALLOCA_FLOAT:.*]] = llvm.alloca %[[ALLOCA_LEN]] x f32
-  // CHECK-DAG: %[[MEMSET_VALUE:.*]] = llvm.mlir.constant(42 : i8) : i8
-  // After SROA, only one i32 will be actually used, so only 4 bytes will be set.
-  // CHECK-DAG: %[[MEMSET_LEN:.*]] = llvm.mlir.constant(4 : i32) : i32
+  // CHECK: %[[ALLOCA_LEN:.*]] = llvm.mlir.constant(1 : i32) : i32
+  // CHECK: %[[ALLOCA_FLOAT:.*]] = llvm.alloca %[[ALLOCA_LEN]] x f32
+  // CHECK: %[[ALLOCA_INT:.*]] = llvm.alloca %[[ALLOCA_LEN]] x i32
+  // CHECK: %[[MEMSET_VALUE:.*]] = llvm.mlir.constant(42 : i8) : i8
   %0 = llvm.mlir.constant(1 : i32) : i32
   %1 = llvm.alloca %0 x !llvm.struct<"foo", (i32, f32)> {alignment = 8 : i64} : (i32) -> !llvm.ptr
   %memset_value = llvm.mlir.constant(42 : i8) : i8
@@ -159,8 +157,11 @@ llvm.func @memset_double_use() -> i32 {
   %memset_len = llvm.mlir.constant(8 : i32) : i32
   // We expect two generated memset, one for each field.
   // CHECK-NOT: "llvm.intr.memset"
-  // CHECK-DAG: "llvm.intr.memset"(%[[ALLOCA_INT]], %[[MEMSET_VALUE]], %[[MEMSET_LEN]]) <{isVolatile = false}>
-  // CHECK-DAG: "llvm.intr.memset"(%[[ALLOCA_FLOAT]], %[[MEMSET_VALUE]], %[[MEMSET_LEN]]) <{isVolatile = false}>
+  // After SROA, only one i32 will be actually used, so only 4 bytes will be set.
+  // CHECK: %[[MEMSET_LEN:.*]] = llvm.mlir.constant(4 : i32) : i32
+  // CHECK: "llvm.intr.memset"(%[[ALLOCA_INT]], %[[MEMSET_VALUE]], %[[MEMSET_LEN]]) <{isVolatile = false}>
+  // CHECK: %[[MEMSET_LEN:.*]] = llvm.mlir.constant(4 : i32) : i32
+  // CHECK: "llvm.intr.memset"(%[[ALLOCA_FLOAT]], %[[MEMSET_VALUE]], %[[MEMSET_LEN]]) <{isVolatile = false}>
   // CHECK-NOT: "llvm.intr.memset"
   "llvm.intr.memset"(%1, %memset_value, %memset_len) <{isVolatile = false}> : (!llvm.ptr, i8, i32) -> ()
   %2 = llvm.getelementptr %1[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<"foo", (i32, f32)>
@@ -208,13 +209,10 @@ llvm.func @memset_considers_alignment() -> i32 {
 
 // CHECK-LABEL: llvm.func @memset_considers_packing
 llvm.func @memset_considers_packing() -> i32 {
-  // CHECK-DAG: %[[ALLOCA_LEN:.*]] = llvm.mlir.constant(1 : i32) : i32
-  // CHECK-DAG: %[[ALLOCA_INT:.*]] = llvm.alloca %[[ALLOCA_LEN]] x i32
-  // CHECK-DAG: %[[ALLOCA_FLOAT:.*]] = llvm.alloca %[[ALLOCA_LEN]] x f32
-  // CHECK-DAG: %[[MEMSET_VALUE:.*]] = llvm.mlir.constant(42 : i8) : i8
-  // After SROA, only 32-bit values will be actually used, so only 4 bytes will be set.
-  // CHECK-DAG: %[[MEMSET_LEN_WHOLE:.*]] = llvm.mlir.constant(4 : i32) : i32
-  // CHECK-DAG: %[[MEMSET_LEN_PARTIAL:.*]] = llvm.mlir.constant(3 : i32) : i32
+  // CHECK: %[[ALLOCA_LEN:.*]] = llvm.mlir.constant(1 : i32) : i32
+  // CHECK: %[[ALLOCA_FLOAT:.*]] = llvm.alloca %[[ALLOCA_LEN]] x f32
+  // CHECK: %[[ALLOCA_INT:.*]] = llvm.alloca %[[ALLOCA_LEN]] x i32
+  // CHECK: %[[MEMSET_VALUE:.*]] = llvm.mlir.constant(42 : i8) : i8
   %0 = llvm.mlir.constant(1 : i32) : i32
   %1 = llvm.alloca %0 x !llvm.struct<"foo", packed (i8, i32, f32)> {alignment = 8 : i64} : (i32) -> !llvm.ptr
   %memset_value = llvm.mlir.constant(42 : i8) : i8
@@ -222,7 +220,10 @@ llvm.func @memset_considers_packing() -> i32 {
   %memset_len = llvm.mlir.constant(8 : i32) : i32
   // Now all fields are touched by the memset.
   // CHECK-NOT: "llvm.intr.memset"
+  // After SROA, only 32-bit values will be actually used, so only 4 bytes will be set.
+  // CHECK: %[[MEMSET_LEN_WHOLE:.*]] = llvm.mlir.constant(4 : i32) : i32
   // CHECK: "llvm.intr.memset"(%[[ALLOCA_INT]], %[[MEMSET_VALUE]], %[[MEMSET_LEN_WHOLE]]) <{isVolatile = false}>
+  // CHECK: %[[MEMSET_LEN_PARTIAL:.*]] = llvm.mlir.constant(3 : i32) : i32
   // CHECK: "llvm.intr.memset"(%[[ALLOCA_FLOAT]], %[[MEMSET_VALUE]], %[[MEMSET_LEN_PARTIAL]]) <{isVolatile = false}>
   // CHECK-NOT: "llvm.intr.memset"
   "llvm.intr.memset"(%1, %memset_value, %memset_len) <{isVolatile = false}> : (!llvm.ptr, i8, i32) -> ()
@@ -241,14 +242,14 @@ llvm.func @memset_considers_packing() -> i32 {
 // CHECK-LABEL: llvm.func @memcpy_dest
 // CHECK-SAME: (%[[OTHER_ARRAY:.*]]: !llvm.ptr)
 llvm.func @memcpy_dest(%other_array: !llvm.ptr) -> i32 {
-  // CHECK-DAG: %[[ALLOCA_LEN:.*]] = llvm.mlir.constant(1 : i32) : i32
-  // CHECK-DAG: %[[ALLOCA:.*]] = llvm.alloca %[[ALLOCA_LEN]] x i32
-  // After SROA, only one i32 will be actually used, so only 4 bytes will be set.
-  // CHECK-DAG: %[[MEMCPY_LEN:.*]] = llvm.mlir.constant(4 : i32) : i32
+  // CHECK: %[[ALLOCA_LEN:.*]] = llvm.mlir.constant(1 : i32) : i32
+  // CHECK: %[[ALLOCA:.*]] = llvm.alloca %[[ALLOCA_LEN]] x i32
   %0 = llvm.mlir.constant(1 : i32) : i32
   %1 = llvm.alloca %0 x !llvm.array<10 x i32> : (i32) -> !llvm.ptr
   %memcpy_len = llvm.mlir.constant(40 : i32) : i32
   // CHECK: %[[SLOT_IN_OTHER:.*]] = llvm.getelementptr %[[OTHER_ARRAY]][0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<10 x i32>
+  // After SROA, only one i32 will be actually used, so only 4 bytes will be set.
+  // CHECK: %[[MEMCPY_LEN:.*]] = llvm.mlir.constant(4 : i32) : i32
   // CHECK: "llvm.intr.memcpy"(%[[ALLOCA]], %[[SLOT_IN_OTHER]], %[[MEMCPY_LEN]]) <{isVolatile = false}>
   "llvm.intr.memcpy"(%1, %other_array, %memcpy_len) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i32) -> ()
   %2 = llvm.getelementptr %1[0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<10 x i32>
@@ -261,9 +262,8 @@ llvm.func @memcpy_dest(%other_array: !llvm.ptr) -> i32 {
 // CHECK-LABEL: llvm.func @memcpy_src
 // CHECK-SAME: (%[[OTHER_ARRAY:.*]]: !llvm.ptr)
 llvm.func @memcpy_src(%other_array: !llvm.ptr) -> i32 {
-  // CHECK-DAG: %[[ALLOCA_LEN:.*]] = llvm.mlir.constant(1 : i32) : i32
+  // CHECK: %[[ALLOCA_LEN:.*]] = llvm.mlir.constant(1 : i32) : i32
   // After SROA, only one i32 will be actually used, so only 4 bytes will be set.
-  // CHECK-DAG: %[[MEMCPY_LEN:.*]] = llvm.mlir.constant(4 : i32) : i32
   // CHECK-COUNT-4: = llvm.alloca %[[ALLOCA_LEN]] x i32
   %0 = llvm.mlir.constant(1 : i32) : i32
   %1 = llvm.alloca %0 x !llvm.array<4 x i32> : (i32) -> !llvm.ptr
@@ -271,14 +271,18 @@ llvm.func @memcpy_src(%other_array: !llvm.ptr) -> i32 {
   // Unfortunately because of FileCheck limitations it is not possible to check which slot gets read from.
   // We can only check that the amount of operations and allocated slots is correct, which should be sufficient
   // as unused slots are not generated.
-  // CHECK-DAG: %[[SLOT_IN_OTHER:.*]] = llvm.getelementptr %[[OTHER_ARRAY]][0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<4 x i32>
-  // CHECK-DAG: "llvm.intr.memcpy"(%[[SLOT_IN_OTHER]], %{{.*}}, %[[MEMCPY_LEN]]) <{isVolatile = false}>
-  // CHECK-DAG: %[[SLOT_IN_OTHER:.*]] = llvm.getelementptr %[[OTHER_ARRAY]][0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<4 x i32>
-  // CHECK-DAG: "llvm.intr.memcpy"(%[[SLOT_IN_OTHER]], %{{.*}}, %[[MEMCPY_LEN]]) <{isVolatile = false}>
-  // CHECK-DAG: %[[SLOT_IN_OTHER:.*]] = llvm.getelementptr %[[OTHER_ARRAY]][0, 2] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<4 x i32>
-  // CHECK-DAG: "llvm.intr.memcpy"(%[[SLOT_IN_OTHER]], %{{.*}}, %[[MEMCPY_LEN]]) <{isVolatile = false}>
-  // CHECK-DAG: %[[SLOT_IN_OTHER:.*]] = llvm.getelementptr %[[OTHER_ARRAY]][0, 3] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<4 x i32>
-  // CHECK-DAG: "llvm.intr.memcpy"(%[[SLOT_IN_OTHER]], %{{.*}}, %[[MEMCPY_LEN]]) <{isVolatile = false}>
+  // CHECK: %[[SLOT_IN_OTHER:.*]] = llvm.getelementptr %[[OTHER_ARRAY]][0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<4 x i32>
+  // CHECK: %[[MEMCPY_LEN:.*]] = llvm.mlir.constant(4 : i32) : i32
+  // CHECK: "llvm.intr.memcpy"(%[[SLOT_IN_OTHER]], %{{.*}}, %[[MEMCPY_LEN]]) <{isVolatile = false}>
+  // CHECK: %[[SLOT_IN_OTHER:.*]] = llvm.getelementptr %[[OTHER_ARRAY]][0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<4 x i32>
+  // CHECK: %[[MEMCPY_LEN:.*]] = llvm.mlir.constant(4 : i32) : i32
+  // CHECK: "llvm.intr.memcpy"(%[[SLOT_IN_OTHER]], %{{.*}}, %[[MEMCPY_LEN]]) <{isVolatile = false}>
+  // CHECK: %[[SLOT_IN_OTHER:.*]] = llvm.getelementptr %[[OTHER_ARRAY]][0, 2] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<4 x i32>
+  // CHECK: %[[MEMCPY_LEN:.*]] = llvm.mlir.constant(4 : i32) : i32
+  // CHECK: "llvm.intr.memcpy"(%[[SLOT_IN_OTHER]], %{{.*}}, %[[MEMCPY_LEN]]) <{isVolatile = false}>
+  // CHECK: %[[SLOT_IN_OTHER:.*]] = llvm.getelementptr %[[OTHER_ARRAY]][0, 3] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<4 x i32>
+  // CHECK: %[[MEMCPY_LEN:.*]] = llvm.mlir.constant(4 : i32) : i32
+  // CHECK: "llvm.intr.memcpy"(%[[SLOT_IN_OTHER]], %{{.*}}, %[[MEMCPY_LEN]]) <{isVolatile = false}>
   "llvm.intr.memcpy"(%other_array, %1, %memcpy_len) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i32) -> ()
   %2 = llvm.getelementptr %1[0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<4 x i32>
   %3 = llvm.load %2 : !llvm.ptr -> i32
@@ -289,14 +293,19 @@ llvm.func @memcpy_src(%other_array: !llvm.ptr) -> i32 {
 
 // CHECK-LABEL: llvm.func @memcpy_double
 llvm.func @memcpy_double() -> i32 {
-  // CHECK-DAG: %[[ALLOCA_LEN:.*]] = llvm.mlir.constant(1 : i32) : i32
-  // CHECK-DAG: %[[MEMCPY_LEN:.*]] = llvm.mlir.constant(4 : i32) : i32
+  // CHECK: %[[ALLOCA_LEN:.*]] = llvm.mlir.constant(1 : i32) : i32
   %0 = llvm.mlir.constant(1 : i32) : i32
-  // CHECK-COUNT-2: = llvm.alloca %[[ALLOCA_LEN]] x i32
+  // CHECK: = llvm.alloca %[[ALLOCA_LEN]] x i32
+  // TODO: This should also disappear as a GEP with all zero indices should be
+  // ignored.
+  // CHECK: = llvm.alloca %[[ALLOCA_LEN]] x !llvm.array<1 x i32>
   %1 = llvm.alloca %0 x !llvm.array<1 x i32> : (i32) -> !llvm.ptr
   %2 = llvm.alloca %0 x !llvm.array<1 x i32> : (i32) -> !llvm.ptr
+  // Match the dead constant, to avoid collision with the newly created one.
+  // CHECK: llvm.mlir.constant
   %memcpy_len = llvm.mlir.constant(4 : i32) : i32
   // CHECK-NOT: "llvm.intr.memcpy"
+  // CHECK: %[[MEMCPY_LEN:.*]] = llvm.mlir.constant(4 : i32) : i32
   // CHECK: "llvm.intr.memcpy"(%{{.*}}, %{{.*}}, %[[MEMCPY_LEN]]) <{isVolatile = false}>
   // CHECK-NOT: "llvm.intr.memcpy"
   "llvm.intr.memcpy"(%1, %2, %memcpy_len) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i32) -> ()
@@ -346,14 +355,14 @@ llvm.func @memcpy_no_volatile(%other_array: !llvm.ptr) -> i32 {
 // CHECK-LABEL: llvm.func @memmove_dest
 // CHECK-SAME: (%[[OTHER_ARRAY:.*]]: !llvm.ptr)
 llvm.func @memmove_dest(%other_array: !llvm.ptr) -> i32 {
-  // CHECK-DAG: %[[ALLOCA_LEN:.*]] = llvm.mlir.constant(1 : i32) : i32
-  // CHECK-DAG: %[[ALLOCA:.*]] = llvm.alloca %[[ALLOCA_LEN]] x i32
-  // After SROA, only one i32 will be actually used, so only 4 bytes will be set.
-  // CHECK-DAG: %[[MEMMOVE_LEN:.*]] = llvm.mlir.constant(4 : i32) : i32
+  // CHECK: %[[ALLOCA_LEN:.*]] = llvm.mlir.constant(1 : i32) : i32
+  // CHECK: %[[ALLOCA:.*]] = llvm.alloca %[[ALLOCA_LEN]] x i32
   %0 = llvm.mlir.constant(1 : i32) : i32
   %1 = llvm.alloca %0 x !llvm.array<10 x i32> : (i32) -> !llvm.ptr
   %memmove_len = llvm.mlir.constant(40 : i32) : i32
   // CHECK: %[[SLOT_IN_OTHER:.*]] = llvm.getelementptr %[[OTHER_ARRAY]][0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<10 x i32>
+  // After SROA, only one i32 will be actually used, so only 4 bytes will be set.
+  // CHECK: %[[MEMMOVE_LEN:.*]] = llvm.mlir.constant(4 : i32) : i32
   // CHECK: "llvm.intr.memmove"(%[[ALLOCA]], %[[SLOT_IN_OTHER]], %[[MEMMOVE_LEN]]) <{isVolatile = false}>
   "llvm.intr.memmove"(%1, %other_array, %memmove_len) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i32) -> ()
   %2 = llvm.getelementptr %1[0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<10 x i32>
@@ -366,9 +375,7 @@ llvm.func @memmove_dest(%other_array: !llvm.ptr) -> i32 {
 // CHECK-LABEL: llvm.func @memmove_src
 // CHECK-SAME: (%[[OTHER_ARRAY:.*]]: !llvm.ptr)
 llvm.func @memmove_src(%other_array: !llvm.ptr) -> i32 {
-  // CHECK-DAG: %[[ALLOCA_LEN:.*]] = llvm.mlir.constant(1 : i32) : i32
-  // After SROA, only one i32 will be actually used, so only 4 bytes will be set.
-  // CHECK-DAG: %[[MEMMOVE_LEN:.*]] = llvm.mlir.constant(4 : i32) : i32
+  // CHECK: %[[ALLOCA_LEN:.*]] = llvm.mlir.constant(1 : i32) : i32
   // CHECK-COUNT-4: = llvm.alloca %[[ALLOCA_LEN]] x i32
   %0 = llvm.mlir.constant(1 : i32) : i32
   %1 = llvm.alloca %0 x !llvm.array<4 x i32> : (i32) -> !llvm.ptr
@@ -376,14 +383,18 @@ llvm.func @memmove_src(%other_array: !llvm.ptr) -> i32 {
   // Unfortunately because of FileCheck limitations it is not possible to check which slot gets read from.
   // We can only check that the amount of operations and allocated slots is correct, which should be sufficient
   // as unused slots are not generated.
-  // CHECK-DAG: %[[SLOT_IN_OTHER:.*]] = llvm.getelementptr %[[OTHER_ARRAY]][0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<4 x i32>
-  // CHECK-DAG: "llvm.intr.memmove"(%[[SLOT_IN_OTHER]], %{{.*}}, %[[MEMMOVE_LEN]]) <{isVolatile = false}>
-  // CHECK-DAG: %[[SLOT_IN_OTHER:.*]] = llvm.getelementptr %[[OTHER_ARRAY]][0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<4 x i32>
-  // CHECK-DAG: "llvm.intr.memmove"(%[[SLOT_IN_OTHER]], %{{.*}}, %[[MEMMOVE_LEN]]) <{isVolatile = false}>
-  // CHECK-DAG: %[[SLOT_IN_OTHER:.*]] = llvm.getelementptr %[[OTHER_ARRAY]][0, 2] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<4 x i32>
-  // CHECK-DAG: "llvm.intr.memmove"(%[[SLOT_IN_OTHER]], %{{.*}}, %[[MEMMOVE_LEN]]) <{isVolatile = false}>
-  // CHECK-DAG: %[[SLOT_IN_OTHER:.*]] = llvm.getelementptr %[[OTHER_ARRAY]][0, 3] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<4 x i32>
-  // CHECK-DAG: "llvm.intr.memmove"(%[[SLOT_IN_OTHER]], %{{.*}}, %[[MEMMOVE_LEN]]) <{isVolatile = false}>
+  // CHECK: %[[SLOT_IN_OTHER:.*]] = llvm.getelementptr %[[OTHER_ARRAY]][0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<4 x i32>
+  // CHECK: %[[MEMMOVE_LEN:.*]] = llvm.mlir.constant(4 : i32) : i32
+  // CHECK: "llvm.intr.memmove"(%[[SLOT_IN_OTHER]], %{{.*}}, %[[MEMMOVE_LEN]]) <{isVolatile = false}>
+  // CHECK: %[[SLOT_IN_OTHER:.*]] = llvm.getelementptr %[[OTHER_ARRAY]][0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<4 x i32>
+  // CHECK: %[[MEMMOVE_LEN:.*]] = llvm.mlir.constant(4 : i32) : i32
+  // CHECK: "llvm.intr.memmove"(%[[SLOT_IN_OTHER]], %{{.*}}, %[[MEMMOVE_LEN]]) <{isVolatile = false}>
+  // CHECK: %[[SLOT_IN_OTHER:.*]] = llvm.getelementptr %[[OTHER_ARRAY]][0, 2] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<4 x i32>
+  // CHECK: %[[MEMMOVE_LEN:.*]] = llvm.mlir.constant(4 : i32) : i32
+  // CHECK: "llvm.intr.memmove"(%[[SLOT_IN_OTHER]], %{{.*}}, %[[MEMMOVE_LEN]]) <{isVolatile = false}>
+  // CHECK: %[[SLOT_IN_OTHER:.*]] = llvm.getelementptr %[[OTHER_ARRAY]][0, 3] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<4 x i32>
+  // CHECK: %[[MEMMOVE_LEN:.*]] = llvm.mlir.constant(4 : i32) : i32
+  // CHECK: "llvm.intr.memmove"(%[[SLOT_IN_OTHER]], %{{.*}}, %[[MEMMOVE_LEN]]) <{isVolatile = false}>
   "llvm.intr.memmove"(%other_array, %1, %memmove_len) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i32) -> ()
   %2 = llvm.getelementptr %1[0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<4 x i32>
   %3 = llvm.load %2 : !llvm.ptr -> i32
diff --git a/mlir/test/Dialect/LLVMIR/sroa.mlir b/mlir/test/Dialect/LLVMIR/sroa.mlir
index f56ea53ac029e7..02d25f27f978a6 100644
--- a/mlir/test/Dialect/LLVMIR/sroa.mlir
+++ b/mlir/test/Dialect/LLVMIR/sroa.mlir
@@ -197,3 +197,21 @@ llvm.func @no_dynamic_indexing(%arg: i32) -> i32 {
   // CHECK: llvm.return %[[RES]] : i32
   llvm.return %3 : i32
 }
+
+// -----
+
+// CHECK-LABEL: llvm.func @no_nested_dynamic_indexing
+// CHECK-SAME: (%[[ARG:.*]]: i32)
+llvm.func @no_nested_dynamic_indexing(%arg: i32) -> i32 {
+  // CHECK: %[[SIZE:.*]] = llvm.mlir.constant(1 : i32)
+  %0 = llvm.mlir.constant(1 : i32) : i32
+  // CHECK: %[[ALLOCA:.*]] = llvm.alloca %[[SIZE]] x !llvm.struct<(array<10 x i32>, i32)> {alignment = 8 : i64} : (i32) -> !llvm.ptr
+  %1 = llvm.alloca %0 x !llvm.struct<(array<10 x i32>, i32)> {alignment = 8 : i64} : (i32) -> !llvm.ptr
+  // CHECK-NOT: = llvm.alloca
+  // CHECK: %[[GEP:.*]] = llvm.getelementptr %[[ALLOCA]][0, 0, %[[ARG]]]
+  %2 = llvm.getelementptr %1[0, 0, %arg] : (!llvm.ptr, i32) -> !llvm.ptr, !llvm.struct<(array<10 x i32>, i32)>
+  // CHECK: %[[RES:.*]] = llvm.load %[[GEP]]
+  %3 = llvm.load %2 : !llvm.ptr -> i32
+  // CHECK: llvm.return %[[RES]] : i32
+  llvm.return %3 : i32
+}
diff --git a/mlir/test/Dialect/Linalg/vectorization-unsupported.mlir b/mlir/test/Dialect/Linalg/vectorization-unsupported.mlir
index a1a52397386c97..9127eac5da9510 100644
--- a/mlir/test/Dialect/Linalg/vectorization-unsupported.mlir
+++ b/mlir/test/Dialect/Linalg/vectorization-unsupported.mlir
@@ -19,10 +19,49 @@ module attributes {transform.with_named_sequence} {
 
 // -----
 
-func.func @depthwise_conv1d_nwc_wc_dyn_ch_dim(%input: memref<3x5x?xf32>, %filter: memref<2x?xf32>, %output: memref<3x2x?xf32>) {
+// Masked vectorisation of 1D depthwise CW convs is not yet supported
+
+func.func @depthwise_conv1d_ncw_cw(%input: memref<3x?x4xf32>, %filter: memref<?x1xf32>, %output: memref<3x?x4xf32>) {
+  // expected-error @+1 {{Attempted to vectorize, but failed}}
+  linalg.depthwise_conv_1d_ncw_cw
+    {dilations = dense<2> : tensor<1xi64>, strides = dense<1> : tensor<1xi64>}
+    ins(%input, %filter : memref<3x?x4xf32>, memref<?x1xf32>)
+    outs(%output : memref<3x?x4xf32>)
+  return
+}
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["linalg.depthwise_conv_1d_ncw_cw"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    transform.structured.vectorize %0 vector_sizes [3, 4, 5, 1] : !transform.any_op
+    transform.yield
+  }
+}
+
+// -----
+
+func.func @depthwise_conv1d_nwc_wc_dyn_w_dim(%input: memref<3x?x4xf32>, %filter: memref<?x4xf32>, %output: memref<3x?x4xf32>) {
   // expected-error @+1 {{Attempted to vectorize, but failed}}
   linalg.depthwise_conv_1d_nwc_wc
     {dilations = dense<2> : tensor<1xi64>, strides = dense<1> : tensor<1xi64>}
+    ins(%input, %filter : memref<3x?x4xf32>, memref<?x4xf32>)
+    outs(%output : memref<3x?x4xf32>)
+  return
+}
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["linalg.depthwise_conv_1d_nwc_wc"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    transform.structured.vectorize %0 vector_sizes [3, 2, 4, 2] : !transform.any_op
+    transform.yield
+  }
+}
+
+// -----
+
+func.func @depthwise_conv1d_nwc_wc_dyn_ch_dim(%input: memref<3x5x?xf32>, %filter: memref<2x?xf32>, %output: memref<3x2x?xf32>) {
+  // expected-error @+1 {{Attempted to vectorize, but failed}}
+  linalg.depthwise_conv_1d_nwc_wc
     ins(%input, %filter : memref<3x5x?xf32>, memref<2x?xf32>)
     outs(%output : memref<3x2x?xf32>)
   return
@@ -41,7 +80,6 @@ module attributes {transform.with_named_sequence} {
 func.func @depthwise_conv1d_nwc_wc_dyn_w_dim(%input: memref<3x?x3xf32>, %filter: memref<2x3xf32>, %output: memref<3x?x3xf32>) {
   // expected-error @+1 {{Attempted to vectorize, but failed}}
   linalg.depthwise_conv_1d_nwc_wc
-    {dilations = dense<2> : tensor<1xi64>, strides = dense<1> : tensor<1xi64>}
     ins(%input, %filter : memref<3x?x3xf32>, memref<2x3xf32>)
     outs(%output : memref<3x?x3xf32>)
   return
diff --git a/mlir/test/Dialect/Linalg/vectorize-conv-masked-and-scalable.mlir b/mlir/test/Dialect/Linalg/vectorize-conv-masked-and-scalable.mlir
new file mode 100644
index 00000000000000..84b556d80887c0
--- /dev/null
+++ b/mlir/test/Dialect/Linalg/vectorize-conv-masked-and-scalable.mlir
@@ -0,0 +1,185 @@
+// RUN: mlir-opt -split-input-file -transform-interpreter -cse %s | FileCheck %s
+
+func.func @depthwise_conv1d_nwc_wc_1x8x3xi8_tensor(%input: tensor<1x8x?xi8>,
+                                                   %filter: tensor<1x?xi8>,
+                                                   %output: tensor<1x8x?xi8>) -> (tensor<1x8x?xi8>) {
+  %res = linalg.depthwise_conv_1d_nwc_wc
+    {dilations = dense<1> : vector<1xi64>,
+    strides = dense<1> : vector<1xi64>}
+    ins(%input, %filter : tensor<1x8x?xi8>, tensor<1x?xi8>)
+    outs(%output : tensor<1x8x?xi8>) -> tensor<1x8x?xi8>
+  return %res : tensor<1x8x?xi8>
+}
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["linalg.depthwise_conv_1d_nwc_wc"]} in %arg0 : (!transform.any_op) -> !transform.any_op
+    transform.structured.vectorize %0 vector_sizes [1, 8, 4, 1] : !transform.any_op
+    transform.yield
+  }
+}
+
+// CHECK-LABEL:   func.func @depthwise_conv1d_nwc_wc_1x8x3xi8_tensor(
+// CHECK-SAME:      %[[INPUT:.*]]: tensor<1x8x?xi8>,
+// CHECK-SAME:      %[[FILTER:.*]]: tensor<1x?xi8>,
+// CHECK-SAME:      %[[OUTPUT:.*]]: tensor<1x8x?xi8>) -> tensor<1x8x?xi8> {
+
+// CHECK:           %[[C1:.*]] = arith.constant 1 : index
+// CHECK:           %[[C0:.*]] = arith.constant 0 : index
+// CHECK:           %[[PAD:.*]] = arith.constant 0 : i8
+
+/// Create a mask for the input tensor
+// CHECK:           %[[C2:.*]] = arith.constant 2 : index
+// CHECK:           %[[CH_DIM_IN:.*]] = tensor.dim %[[INPUT]], %[[C2]] : tensor<1x8x?xi8>
+// CHECK:           %[[C8:.*]] = arith.constant 8 : index
+// CHECK:           %[[MASK_IN:.*]] = vector.create_mask %[[C1]], %[[C8]], %[[CH_DIM_IN]] : vector<1x8x4xi1>
+/// Read the input tensor
+// CHECK:           %[[VEC_IN:.*]] = vector.mask %[[MASK_IN]] { vector.transfer_read %[[INPUT]]{{\[}}%[[C0]], %[[C0]], %[[C0]]], %[[PAD]] : tensor<1x8x?xi8>, vector<1x8x4xi8> } : vector<1x8x4xi1> -> vector<1x8x4xi8>
+
+/// Create a mask for the filter tensor
+// CHECK:           %[[CH_DIM_FLT:.*]] = tensor.dim %[[FILTER]], %[[C1]] : tensor<1x?xi8>
+// CHECK:           %[[MASK_FLT:.*]] = vector.create_mask %[[C1]], %[[CH_DIM_FLT]] : vector<1x4xi1>
+/// Read the filter tensor
+// CHECK:           %[[VEC_FLT:.*]] = vector.mask %[[MASK_FLT]] { vector.transfer_read %[[FILTER]]{{\[}}%[[C0]], %[[C0]]], %[[PAD]] : tensor<1x?xi8>, vector<1x4xi8> } : vector<1x4xi1> -> vector<1x4xi8>
+
+/// Create a mask for the output tensor
+// CHECK:           %[[CH_DIM_OUT:.*]] = tensor.dim %[[OUTPUT]], %[[C2]] : tensor<1x8x?xi8>
+// CHECK:           %[[MASK_OUT:.*]] = vector.create_mask %[[C1]], %[[C8]], %[[CH_DIM_OUT]] : vector<1x8x4xi1>
+// CHECK:           %[[VEC_OUT:.*]] = vector.mask %[[MASK_OUT]] { vector.transfer_read %[[OUTPUT]]{{\[}}%[[C0]], %[[C0]], %[[C0]]], %[[PAD]] : tensor<1x8x?xi8>, vector<1x8x4xi8> } : vector<1x8x4xi1> -> vector<1x8x4xi8>
+
+/// Convolution
+// CHECK:           %[[IN_1:.*]] = vector.extract_strided_slice %[[VEC_IN]] {offsets = [0, 0, 0], sizes = [1, 8, 4], strides = [1, 1, 1]} : vector<1x8x4xi8> to vector<1x8x4xi8>
+// CHECK:           %[[FLT_1:.*]] = vector.extract %[[VEC_FLT]][0] : vector<4xi8> from vector<1x4xi8>
+// CHECK:           %[[OUT_1:.*]] = vector.extract_strided_slice %[[VEC_OUT]] {offsets = [0, 0, 0], sizes = [1, 8, 4], strides = [1, 1, 1]} : vector<1x8x4xi8> to vector<1x8x4xi8>
+// CHECK:           %[[FLT_1_B:.*]] = vector.broadcast %[[FLT_1]] : vector<4xi8> to vector<1x8x4xi8>
+// CHECK:           %[[MULI:.*]] = arith.muli %[[IN_1]], %[[FLT_1_B]] : vector<1x8x4xi8>
+// CHECK:           %[[ADDI:.*]] = arith.addi %[[MULI]], %[[OUT_1]] : vector<1x8x4xi8>
+// CHECK:           %[[OUT_INS:.*]] = vector.insert_strided_slice %[[ADDI]], %[[VEC_OUT]] {offsets = [0, 0, 0], strides = [1, 1, 1]} : vector<1x8x4xi8> into vector<1x8x4xi8>
+// CHECK:           %[[OUT:.*]] = vector.mask %[[MASK_OUT]] { vector.transfer_write %[[OUT_INS]], %[[OUTPUT]]{{\[}}%[[C0]], %[[C0]], %[[C0]]] : vector<1x8x4xi8>, tensor<1x8x?xi8> } : vector<1x8x4xi1> -> tensor<1x8x?xi8>
+// CHECK:           return %[[OUT]] : tensor<1x8x?xi8>
+
+// -----
+
+func.func @depthwise_conv1d_nwc_wc_1x8x3xi8_tensor_scalable(
+      %input: tensor<1x8x?xi8>,
+      %filter: tensor<1x?xi8>,
+      %output: tensor<1x8x?xi8>) -> (tensor<1x8x?xi8>) {
+  %res = linalg.depthwise_conv_1d_nwc_wc
+    {dilations = dense<1> : vector<1xi64>,
+    strides = dense<1> : vector<1xi64>}
+    ins(%input, %filter : tensor<1x8x?xi8>, tensor<1x?xi8>)
+    outs(%output : tensor<1x8x?xi8>) -> tensor<1x8x?xi8>
+  return %res : tensor<1x8x?xi8>
+}
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["linalg.depthwise_conv_1d_nwc_wc"]} in %arg0 : (!transform.any_op) -> !transform.any_op
+    transform.structured.vectorize %0 vector_sizes [1, 8, [4], 1] : !transform.any_op
+    transform.yield
+  }
+}
+
+// CHECK-LABEL:   func.func @depthwise_conv1d_nwc_wc_1x8x3xi8_tensor_scalable(
+// CHECK-SAME:      %[[INPUT:.*]]: tensor<1x8x?xi8>,
+// CHECK-SAME:      %[[FILTER:.*]]: tensor<1x?xi8>,
+// CHECK-SAME:      %[[OUTPUT:.*]]: tensor<1x8x?xi8>) -> tensor<1x8x?xi8> {
+
+// CHECK:           %[[C1:.*]] = arith.constant 1 : index
+// CHECK:           %[[C0:.*]] = arith.constant 0 : index
+// CHECK:           %[[PAD:.*]] = arith.constant 0 : i8
+
+/// Create a mask for the input tensor
+// CHECK:           %[[C2:.*]] = arith.constant 2 : index
+// CHECK:           %[[CH_DIM_IN:.*]] = tensor.dim %[[INPUT]], %[[C2]] : tensor<1x8x?xi8>
+// CHECK:           %[[C8:.*]] = arith.constant 8 : index
+// CHECK:           %[[MASK_IN:.*]] = vector.create_mask %[[C1]], %[[C8]], %[[CH_DIM_IN]] : vector<1x8x[4]xi1>
+/// Read the input tensor
+// CHECK:           %[[VEC_IN:.*]] = vector.mask %[[MASK_IN]] { vector.transfer_read %[[INPUT]]{{\[}}%[[C0]], %[[C0]], %[[C0]]], %[[PAD]] : tensor<1x8x?xi8>, vector<1x8x[4]xi8> } : vector<1x8x[4]xi1> -> vector<1x8x[4]xi8>
+
+/// Create a mask for the filter tensor
+// CHECK:           %[[CH_DIM_FLT:.*]] = tensor.dim %[[FILTER]], %[[C1]] : tensor<1x?xi8>
+// CHECK:           %[[MASK_FLT:.*]] = vector.create_mask %[[C1]], %[[CH_DIM_FLT]] : vector<1x[4]xi1>
+/// Read the filter tensor
+// CHECK:           %[[VEC_FLT:.*]] = vector.mask %[[MASK_FLT]] { vector.transfer_read %[[FILTER]]{{\[}}%[[C0]], %[[C0]]], %[[PAD]] : tensor<1x?xi8>, vector<1x[4]xi8> } : vector<1x[4]xi1> -> vector<1x[4]xi8>
+
+/// Create a mask for the output tensor
+// CHECK:           %[[CH_DIM_OUT:.*]] = tensor.dim %[[OUTPUT]], %[[C2]] : tensor<1x8x?xi8>
+// CHECK:           %[[MASK_OUT:.*]] = vector.create_mask %[[C1]], %[[C8]], %[[CH_DIM_OUT]] : vector<1x8x[4]xi1>
+/// Read the output tensor
+// CHECK:           %[[VEC_OUT:.*]] = vector.mask %[[MASK_OUT]] { vector.transfer_read %[[OUTPUT]]{{\[}}%[[C0]], %[[C0]], %[[C0]]], %[[PAD]] : tensor<1x8x?xi8>, vector<1x8x[4]xi8> } : vector<1x8x[4]xi1> -> vector<1x8x[4]xi8>
+
+/// Convolution
+// CHECK:           %[[IN_1:.*]] = vector.extract_strided_slice %[[VEC_IN]] {offsets = [0, 0, 0], sizes = [1, 8, 4], strides = [1, 1, 1]} : vector<1x8x[4]xi8> to vector<1x8x[4]xi8>
+// CHECK:           %[[FLT_1:.*]] = vector.extract %[[VEC_FLT]][0] : vector<[4]xi8> from vector<1x[4]xi8>
+// CHECK:           %[[OUT_1:.*]] = vector.extract_strided_slice %[[VEC_OUT]] {offsets = [0, 0, 0], sizes = [1, 8, 4], strides = [1, 1, 1]} : vector<1x8x[4]xi8> to vector<1x8x[4]xi8>
+// CHECK:           %[[FLT_1_B:.*]] = vector.broadcast %[[FLT_1]] : vector<[4]xi8> to vector<1x8x[4]xi8>
+// CHECK:           %[[MULI:.*]] = arith.muli %[[IN_1]], %[[FLT_1_B]] : vector<1x8x[4]xi8>
+// CHECK:           %[[ADDI:.*]] = arith.addi %[[MULI]], %[[OUT_1]] : vector<1x8x[4]xi8>
+// CHECK:           %[[OUT_INS:.*]] = vector.insert_strided_slice %[[ADDI]], %[[VEC_OUT]] {offsets = [0, 0, 0], strides = [1, 1, 1]} : vector<1x8x[4]xi8> into vector<1x8x[4]xi8>
+// CHECK:           %[[OUT:.*]] = vector.mask %[[MASK_OUT]] { vector.transfer_write %[[OUT_INS]], %[[OUTPUT]]{{\[}}%[[C0]], %[[C0]], %[[C0]]] : vector<1x8x[4]xi8>, tensor<1x8x?xi8> } : vector<1x8x[4]xi1> -> tensor<1x8x?xi8>
+// CHECK:           return %[[OUT]] : tensor<1x8x?xi8>
+
+// -----
+
+func.func @depthwise_conv1d_nwc_wc_3x5x4xf32_memref_dilation_2(
+      %input: memref<3x5x?xf32>,
+      %filter: memref<2x?xf32>,
+      %output: memref<3x2x?xf32>) {
+  linalg.depthwise_conv_1d_nwc_wc
+    {dilations = dense<2> : tensor<1xi64>, strides = dense<1> : tensor<1xi64>}
+    ins(%input, %filter : memref<3x5x?xf32>, memref<2x?xf32>)
+    outs(%output : memref<3x2x?xf32>)
+  return
+}
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["linalg.depthwise_conv_1d_nwc_wc"]} in %arg0 : (!transform.any_op) -> !transform.any_op
+    transform.structured.vectorize %0 vector_sizes [3, 2, [4], 2] : !transform.any_op
+    transform.yield
+  }
+}
+
+// CHECK-LABEL:   func.func @depthwise_conv1d_nwc_wc_3x5x4xf32_memref_dilation_2(
+// CHECK-SAME:      %[[INPUT:.*]]: memref<3x5x?xf32>,
+// CHECK-SAME:      %[[FILTER:.*]]: memref<2x?xf32>,
+// CHECK-SAME:      %[[OUTPUT:.*]]: memref<3x2x?xf32>) {
+
+// CHECK:           %[[C1:.*]] = arith.constant 1 : index
+// CHECK:           %[[C0:.*]] = arith.constant 0 : index
+// CHECK:           %[[PAD:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK:           %[[C2:.*]] = arith.constant 2 : index
+
+/// Create a mask for the input tensor
+// CHECK:           %[[CH_DIM_IN:.*]] = memref.dim %[[INPUT]], %[[C2]] : memref<3x5x?xf32>
+// CHECK:           %[[C3:.*]] = arith.constant 3 : index
+// CHECK:           %[[C5:.*]] = arith.constant 5 : index
+// CHECK:           %[[MASK_IN:.*]] = vector.create_mask %[[C3]], %[[C5]], %[[CH_DIM_IN]] : vector<3x4x[4]xi1>
+/// Read the input tensor
+// CHECK:           %[[VEC_IN:.*]] = vector.mask %[[MASK_IN]] { vector.transfer_read %[[INPUT]]{{\[}}%[[C0]], %[[C0]], %[[C0]]], %[[PAD]] : memref<3x5x?xf32>, vector<3x4x[4]xf32> } : vector<3x4x[4]xi1> -> vector<3x4x[4]xf32>
+
+/// Create a mask for the filter tensor
+// CHECK:           %[[CH_DIM_FLT:.*]] = memref.dim %[[FILTER]], %[[C1]] : memref<2x?xf32>
+// CHECK:           %[[MASK_FLT:.*]] = vector.create_mask %[[C2]], %[[CH_DIM_FLT]] : vector<2x[4]xi1>
+/// Read the filter tensor
+// CHECK:           %[[VEC_FLT:.*]] = vector.mask %[[MASK_FLT]] { vector.transfer_read %[[FILTER]]{{\[}}%[[C0]], %[[C0]]], %[[PAD]] : memref<2x?xf32>, vector<2x[4]xf32> } : vector<2x[4]xi1> -> vector<2x[4]xf32>
+
+/// Create a mask for the output tensor
+// CHECK:           %[[CH_DIM_OUT:.*]] = memref.dim %[[OUTPUT]], %[[C2]] : memref<3x2x?xf32>
+// CHECK:           %[[MASK_OUT:.*]] = vector.create_mask %[[C3]], %[[C2]], %[[CH_DIM_OUT]] : vector<3x2x[4]xi1>
+/// Read the output tensor
+// CHECK:           %[[VEC_OUT:.*]] = vector.mask %[[MASK_OUT]] { vector.transfer_read %[[OUTPUT]]{{\[}}%[[C0]], %[[C0]], %[[C0]]], %[[PAD]] : memref<3x2x?xf32>, vector<3x2x[4]xf32> } : vector<3x2x[4]xi1> -> vector<3x2x[4]xf32>
+
+/// Convolution
+// CHECK:           %[[IN_1:.*]] = vector.extract_strided_slice %[[VEC_IN]] {offsets = [0, 0, 0], sizes = [3, 2, 4], strides = [1, 1, 1]} : vector<3x4x[4]xf32> to vector<3x2x[4]xf32>
+// CHECK:           %[[IN_2:.*]] = vector.extract_strided_slice %[[VEC_IN]] {offsets = [0, 2, 0], sizes = [3, 2, 4], strides = [1, 1, 1]} : vector<3x4x[4]xf32> to vector<3x2x[4]xf32>
+// CHECK:           %[[FLT_1:.*]] = vector.extract %[[VEC_FLT]][0] : vector<[4]xf32> from vector<2x[4]xf32>
+// CHECK:           %[[FLT_2:.*]] = vector.extract %[[VEC_FLT]][1] : vector<[4]xf32> from vector<2x[4]xf32>
+// CHECK:           %[[OUT_1:.*]] = vector.extract_strided_slice %[[VEC_OUT]] {offsets = [0, 0, 0], sizes = [3, 2, 4], strides = [1, 1, 1]} : vector<3x2x[4]xf32> to vector<3x2x[4]xf32>
+// CHECK:           %[[FLT_1_B:.*]] = vector.broadcast %[[FLT_1]] : vector<[4]xf32> to vector<3x2x[4]xf32>
+// CHECK:           %[[FMA_1:.*]] = vector.fma %[[IN_1]], %[[FLT_1_B]], %[[OUT_1]] : vector<3x2x[4]xf32>
+// CHECK:           %[[FLT_2_B:.*]] = vector.broadcast %[[FLT_2]] : vector<[4]xf32> to vector<3x2x[4]xf32>
+// CHECK:           %[[FMA_2:.*]] = vector.fma %[[IN_2]], %[[FLT_2_B]], %[[FMA_1]] : vector<3x2x[4]xf32>
+// CHECK:           %[[OUT_INS:.*]] = vector.insert_strided_slice %[[FMA_2]], %[[VEC_OUT]] {offsets = [0, 0, 0], strides = [1, 1, 1]} : vector<3x2x[4]xf32> into vector<3x2x[4]xf32>
+// CHECK:           vector.mask %[[MASK_OUT]] { vector.transfer_write %[[OUT_INS]], %[[OUTPUT]]{{\[}}%[[C0]], %[[C0]], %[[C0]]] : vector<3x2x[4]xf32>, memref<3x2x?xf32> } : vector<3x2x[4]xi1>
diff --git a/mlir/test/Dialect/MLProgram/inlining.mlir b/mlir/test/Dialect/MLProgram/inlining.mlir
new file mode 100644
index 00000000000000..ac3677e1b92882
--- /dev/null
+++ b/mlir/test/Dialect/MLProgram/inlining.mlir
@@ -0,0 +1,19 @@
+// RUN: mlir-opt %s -inline | FileCheck %s
+
+// Verifies that regions with operations from the ml_program dialect can
+// be inlined.
+
+ml_program.global private @global(dense<4> : tensor<4xi32>) : tensor<4xi32>
+
+// CHECK: @inline_into
+func.func @inline_into() -> tensor<4xi32> {
+  // CHECK-NOT: @inline_from
+  // CHECK: ml_program.global_load_const
+  %0 = call @inline_from() : () -> tensor<4xi32>
+  return %0 : tensor<4xi32>
+}
+
+func.func @inline_from() -> tensor<4xi32> {
+  %0 = ml_program.global_load_const @global : tensor<4xi32>
+  return %0 : tensor<4xi32>
+}
diff --git a/mlir/test/Dialect/Math/expand-math.mlir b/mlir/test/Dialect/Math/expand-math.mlir
index c2b5353f7d8b5d..ca93bd878800d6 100644
--- a/mlir/test/Dialect/Math/expand-math.mlir
+++ b/mlir/test/Dialect/Math/expand-math.mlir
@@ -7,19 +7,18 @@ func.func @tanh(%arg: f32) -> f32 {
 }
 // CHECK-DAG: %[[ZERO:.+]] = arith.constant 0.000000e+00 : f32
 // CHECK-DAG: %[[ONE:.+]] = arith.constant 1.000000e+00 : f32
-// CHECK-DAG: %[[TWO:.+]] = arith.constant 2.000000e+00 : f32
-// CHECK: %[[DOUBLEDX:.+]] = arith.mulf %arg0, %[[TWO]] : f32
-// CHECK: %[[NEGDOUBLEDX:.+]] = arith.negf %[[DOUBLEDX]] : f32
+// CHECK-DAG: %[[TWO:.+]] = arith.constant -2.000000e+00 : f32
+// CHECK: %[[VAL0:.+]] = arith.cmpf olt, %arg0, %[[ZERO]] : f32
+// CHECK: %[[VAL1:.+]] = arith.uitofp %[[VAL0]] : i1 to f32
+// CHECK: %[[VAL2:.+]] = arith.mulf %[[VAL1]], %[[TWO]] : f32
+// CHECK: %[[SIGN:.+]] = arith.addf %[[VAL2]], %[[ONE]] : f32
+// CHECK: %[[POSX:.+]] = arith.mulf %[[SIGN]], %arg0 : f32
+// CHECK: %[[NEGDOUBLEDX:.+]] = arith.mulf %[[POSX]], %[[TWO]] : f32
 // CHECK: %[[EXP1:.+]] = math.exp %[[NEGDOUBLEDX]] : f32
 // CHECK: %[[DIVIDEND1:.+]] = arith.subf %[[ONE]], %[[EXP1]] : f32
 // CHECK: %[[DIVISOR1:.+]] = arith.addf %[[EXP1]], %[[ONE]] : f32
-// CHECK: %[[RES1:.+]] = arith.divf %[[DIVIDEND1]], %[[DIVISOR1]] : f32
-// CHECK: %[[EXP2:.+]] = math.exp %[[DOUBLEDX]] : f32
-// CHECK: %[[DIVIDEND2:.+]] = arith.subf %[[EXP2]], %[[ONE]] : f32
-// CHECK: %[[DIVISOR2:.+]] = arith.addf %[[EXP2]], %[[ONE]] : f32
-// CHECK: %[[RES2:.+]] = arith.divf %[[DIVIDEND2]], %[[DIVISOR2]] : f32
-// CHECK: %[[COND:.+]] = arith.cmpf oge, %arg0, %[[ZERO]] : f32
-// CHECK: %[[RESULT:.+]] = arith.select %[[COND]], %[[RES1]], %[[RES2]] : f32
+// CHECK: %[[POSRES:.+]] = arith.divf %[[DIVIDEND1]], %[[DIVISOR1]] : f32
+// CHECK: %[[RESULT:.+]] = arith.mulf %[[SIGN]], %[[POSRES]] : f32
 // CHECK: return %[[RESULT]]
 
 // -----
diff --git a/mlir/test/Dialect/Math/polynomial-approximation.mlir b/mlir/test/Dialect/Math/polynomial-approximation.mlir
index 834a7dc0af66d6..93ecd67f14dd3d 100644
--- a/mlir/test/Dialect/Math/polynomial-approximation.mlir
+++ b/mlir/test/Dialect/Math/polynomial-approximation.mlir
@@ -94,6 +94,20 @@ func.func @erf_vector(%arg0: vector<8xf32>) -> vector<8xf32> {
   return %0 : vector<8xf32>
 }
 
+// CHECK-LABEL:   func @erf_scalable_vector(
+// CHECK-SAME:                     %[[arg0:.*]]: vector<[8]xf32>) -> vector<[8]xf32> {
+// CHECK:           %[[zero:.*]] = arith.constant dense<0.000000e+00> : vector<[8]xf32>
+// CHECK-NOT:       erf
+// CHECK-NOT:       vector<8xf32>
+// CHECK-COUNT-20:  select {{.*}} : vector<[8]xi1>, vector<[8]xf32>
+// CHECK:           %[[res:.*]] = arith.select {{.*}} : vector<[8]xi1>, vector<[8]xf32>
+// CHECK:           return %[[res]] : vector<[8]xf32>
+// CHECK:         }
+func.func @erf_scalable_vector(%arg0: vector<[8]xf32>) -> vector<[8]xf32> {
+  %0 = math.erf %arg0 : vector<[8]xf32>
+  return %0 : vector<[8]xf32>
+}
+
 // CHECK-LABEL:   func @exp_scalar(
 // CHECK-SAME:                     %[[VAL_0:.*]]: f32) -> f32 {
 // CHECK-DAG:       %[[VAL_1:.*]] = arith.constant 5.000000e-01 : f32
@@ -151,6 +165,17 @@ func.func @exp_vector(%arg0: vector<8xf32>) -> vector<8xf32> {
   return %0 : vector<8xf32>
 }
 
+// CHECK-LABEL:   func @exp_scalable_vector
+// CHECK-NOT:      math.exp
+// CHECK-NOT:      vector<8xf32>
+// CHECK-COUNT-46: vector<[8]x{{(i32)|(f32)}}>
+// CHECK-NOT:      vector<8xf32>
+// CHECK-NOT:      math.exp
+func.func @exp_scalable_vector(%arg0: vector<[8]xf32>) -> vector<[8]xf32> {
+  %0 = math.exp %arg0 : vector<[8]xf32>
+  return %0 : vector<[8]xf32>
+}
+
 // CHECK-LABEL:   func @expm1_scalar(
 // CHECK-SAME:                       %[[X:.*]]: f32) -> f32 {
 // CHECK-DAG:       %[[VAL_1:.*]] = arith.constant 1.000000e+00 : f32
@@ -277,6 +302,22 @@ func.func @expm1_vector(%arg0: vector<8x8xf32>) -> vector<8x8xf32> {
   return %0 : vector<8x8xf32>
 }
 
+// CHECK-LABEL:   func @expm1_scalable_vector(
+// CHECK-SAME:                       %{{.*}}: vector<8x[8]xf32>) -> vector<8x[8]xf32> {
+// CHECK-NOT:       vector<8x8xf32>
+// CHECK-NOT:       exp
+// CHECK-NOT:       log
+// CHECK-NOT:       expm1
+// CHECK-COUNT-127: vector<8x[8]x{{(i32)|(f32)|(i1)}}>
+// CHECK-NOT:       vector<8x8xf32>
+// CHECK-NOT:       exp
+// CHECK-NOT:       log
+// CHECK-NOT:       expm1
+func.func @expm1_scalable_vector(%arg0: vector<8x[8]xf32>) -> vector<8x[8]xf32> {
+  %0 = math.expm1 %arg0 : vector<8x[8]xf32>
+  return %0 : vector<8x[8]xf32>
+}
+
 // CHECK-LABEL:   func @log_scalar(
 // CHECK-SAME:                             %[[X:.*]]: f32) -> f32 {
 // CHECK-DAG:       %[[VAL_1:.*]] = arith.constant 0.000000e+00 : f32
@@ -357,6 +398,18 @@ func.func @log_vector(%arg0: vector<8xf32>) -> vector<8xf32> {
   return %0 : vector<8xf32>
 }
 
+// CHECK-LABEL:   func @log_scalable_vector(
+// CHECK-SAME:                     %{{.*}}: vector<[8]xf32>) -> vector<[8]xf32> {
+// CHECK:           %[[CST_LN2:.*]] = arith.constant dense<0.693147182> : vector<[8]xf32>
+// CHECK-COUNT-5:   select {{.*}} : vector<[8]xi1>, vector<[8]xf32>
+// CHECK:           %[[VAL_71:.*]] = arith.select {{.*}} : vector<[8]xi1>, vector<[8]xf32>
+// CHECK:           return %[[VAL_71]] : vector<[8]xf32>
+// CHECK:         }
+func.func @log_scalable_vector(%arg0: vector<[8]xf32>) -> vector<[8]xf32> {
+  %0 = math.log %arg0 : vector<[8]xf32>
+  return %0 : vector<[8]xf32>
+}
+
 // CHECK-LABEL:   func @log2_scalar(
 // CHECK-SAME:                      %[[VAL_0:.*]]: f32) -> f32 {
 // CHECK:           %[[CST_LOG2E:.*]] = arith.constant 1.44269502 : f32
@@ -381,6 +434,18 @@ func.func @log2_vector(%arg0: vector<8xf32>) -> vector<8xf32> {
   return %0 : vector<8xf32>
 }
 
+// CHECK-LABEL:   func @log2_scalable_vector(
+// CHECK-SAME:                      %{{.*}}: vector<[8]xf32>) -> vector<[8]xf32> {
+// CHECK:           %[[CST_LOG2E:.*]] = arith.constant dense<1.44269502> : vector<[8]xf32>
+// CHECK-COUNT-5:   select {{.*}} : vector<[8]xi1>, vector<[8]xf32>
+// CHECK:           %[[VAL_71:.*]] = arith.select {{.*}} : vector<[8]xi1>, vector<[8]xf32>
+// CHECK:           return %[[VAL_71]] : vector<[8]xf32>
+// CHECK:         }
+func.func @log2_scalable_vector(%arg0: vector<[8]xf32>) -> vector<[8]xf32> {
+  %0 = math.log2 %arg0 : vector<[8]xf32>
+  return %0 : vector<[8]xf32>
+}
+
 // CHECK-LABEL:   func @log1p_scalar(
 // CHECK-SAME:                       %[[X:.*]]: f32) -> f32 {
 // CHECK:           %[[CST_ONE:.*]] = arith.constant 1.000000e+00 : f32
@@ -414,6 +479,17 @@ func.func @log1p_vector(%arg0: vector<8xf32>) -> vector<8xf32> {
   return %0 : vector<8xf32>
 }
 
+// CHECK-LABEL:   func @log1p_scalable_vector(
+// CHECK-SAME:                       %[[VAL_0:.*]]: vector<[8]xf32>) -> vector<[8]xf32> {
+// CHECK:           %[[CST_ONE:.*]] = arith.constant dense<1.000000e+00> : vector<[8]xf32>
+// CHECK-COUNT-6:   select {{.*}} : vector<[8]xi1>, vector<[8]xf32>
+// CHECK:           %[[VAL_79:.*]] = arith.select {{.*}} : vector<[8]xi1>, vector<[8]xf32>
+// CHECK:           return %[[VAL_79]] : vector<[8]xf32>
+// CHECK:         }
+func.func @log1p_scalable_vector(%arg0: vector<[8]xf32>) -> vector<[8]xf32> {
+  %0 = math.log1p %arg0 : vector<[8]xf32>
+  return %0 : vector<[8]xf32>
+}
 
 // CHECK-LABEL:   func @tanh_scalar(
 // CHECK-SAME:                      %[[VAL_0:.*]]: f32) -> f32 {
@@ -470,6 +546,19 @@ func.func @tanh_vector(%arg0: vector<8xf32>) -> vector<8xf32> {
   return %0 : vector<8xf32>
 }
 
+// CHECK-LABEL:   func @tanh_scalable_vector(
+// CHECK-SAME:                      %[[VAL_0:.*]]: vector<[8]xf32>) -> vector<[8]xf32> {
+// CHECK:           %[[VAL_1:.*]] = arith.constant dense<-7.99881172> : vector<[8]xf32>
+// CHECK-NOT:       tanh
+// CHECK-COUNT-2:   select {{.*}} : vector<[8]xi1>, vector<[8]xf32>
+// CHECK:           %[[VAL_33:.*]] = arith.select {{.*}} : vector<[8]xi1>, vector<[8]xf32>
+// CHECK:           return %[[VAL_33]] : vector<[8]xf32>
+// CHECK:         }
+func.func @tanh_scalable_vector(%arg0: vector<[8]xf32>) -> vector<[8]xf32> {
+  %0 = math.tanh %arg0 : vector<[8]xf32>
+  return %0 : vector<[8]xf32>
+}
+
 // We only approximate rsqrt for vectors and when the AVX2 option is enabled.
 // CHECK-LABEL:   func @rsqrt_scalar
 // AVX2-LABEL:    func @rsqrt_scalar
diff --git a/mlir/test/Dialect/SPIRV/Transforms/canonicalize.mlir b/mlir/test/Dialect/SPIRV/Transforms/canonicalize.mlir
index 1cb69891a70ed6..de21d114e9fc4f 100644
--- a/mlir/test/Dialect/SPIRV/Transforms/canonicalize.mlir
+++ b/mlir/test/Dialect/SPIRV/Transforms/canonicalize.mlir
@@ -1346,6 +1346,52 @@ func.func @convert_logical_or_true_false_vector(%arg: vector<3xi1>) -> (vector<3
 
 // -----
 
+//===----------------------------------------------------------------------===//
+// spirv.Select
+//===----------------------------------------------------------------------===//
+
+// CHECK-LABEL: @convert_select_scalar
+// CHECK-SAME: %[[ARG1:.+]]: i32, %[[ARG2:.+]]: i32
+func.func @convert_select_scalar(%arg1: i32, %arg2: i32) -> (i32, i32) {
+  %true = spirv.Constant true
+  %false = spirv.Constant false
+  %0 = spirv.Select %true, %arg1, %arg2 : i1, i32
+  %1 = spirv.Select %false, %arg1, %arg2 : i1, i32
+
+  // CHECK: return %[[ARG1]], %[[ARG2]]
+  return %0, %1 : i32, i32
+}
+
+// CHECK-LABEL: @convert_select_vector
+// CHECK-SAME: %[[ARG1:.+]]: vector<3xi32>, %[[ARG2:.+]]: vector<3xi32>
+func.func @convert_select_vector(%arg1: vector<3xi32>, %arg2: vector<3xi32>) -> (vector<3xi32>, vector<3xi32>) {
+  %true = spirv.Constant dense<true> : vector<3xi1>
+  %false = spirv.Constant dense<false> : vector<3xi1>
+  %0 = spirv.Select %true, %arg1, %arg2 : vector<3xi1>, vector<3xi32>
+  %1 = spirv.Select %false, %arg1, %arg2 : vector<3xi1>, vector<3xi32>
+
+  // CHECK: return %[[ARG1]], %[[ARG2]]
+  return %0, %1: vector<3xi32>, vector<3xi32>
+}
+
+// CHECK-LABEL: @convert_select_vector_extra
+// CHECK-SAME: %[[CONDITIONS:.+]]: vector<2xi1>, %[[ARG1:.+]]: vector<2xi32>
+func.func @convert_select_vector_extra(%conditions: vector<2xi1>, %arg1: vector<2xi32>) -> (vector<2xi32>, vector<2xi32>) {
+  %true_false = spirv.Constant dense<[true, false]> : vector<2xi1>
+  %cvec_1 = spirv.Constant dense<[42, -132]> : vector<2xi32>
+  %cvec_2 = spirv.Constant dense<[0, 42]> : vector<2xi32>
+
+  // CHECK: %[[RES:.+]] = spirv.Constant dense<42>
+  %0 = spirv.Select %true_false, %cvec_1, %cvec_2: vector<2xi1>, vector<2xi32>
+
+  %1 = spirv.Select %conditions, %arg1, %arg1 : vector<2xi1>, vector<2xi32>
+
+  // CHECK: return %[[RES]], %[[ARG1]]
+  return %0, %1: vector<2xi32>, vector<2xi32>
+}
+
+// -----
+
 //===----------------------------------------------------------------------===//
 // spirv.IEqual
 //===----------------------------------------------------------------------===//
diff --git a/mlir/test/Dialect/SparseTensor/invalid.mlir b/mlir/test/Dialect/SparseTensor/invalid.mlir
index 48f28ef390ed53..18851f29d8eaa3 100644
--- a/mlir/test/Dialect/SparseTensor/invalid.mlir
+++ b/mlir/test/Dialect/SparseTensor/invalid.mlir
@@ -868,16 +868,6 @@ func.func @sparse_sort_coo_no_perm(%arg0: index, %arg1: memref<?xindex>) -> (mem
 
 // -----
 
-#CSR = #sparse_tensor.encoding<{map = (d0, d1) -> (d0 : dense, d1 : compressed)}>
-
-func.func @sparse_alloc_escapes(%arg0: index) -> tensor<10x?xf64, #CSR> {
-  // expected-error@+1 {{sparse tensor allocation should not escape function}}
-  %0 = bufferization.alloc_tensor(%arg0) : tensor<10x?xf64, #CSR>
-  return %0: tensor<10x?xf64, #CSR>
-}
-
-// -----
-
 #UnorderedCOO = #sparse_tensor.encoding<{map = (d0, d1) -> (d0 : compressed(nonunique, nonordered), d1 : singleton(nonordered))}>
 #OrderedCOOPerm = #sparse_tensor.encoding<{map = (d0, d1) -> (d1 : compressed(nonunique), d0 : singleton)}>
 
diff --git a/mlir/test/Dialect/SparseTensor/roundtrip.mlir b/mlir/test/Dialect/SparseTensor/roundtrip.mlir
index e9e458e805ba47..a47a3d5119f96d 100644
--- a/mlir/test/Dialect/SparseTensor/roundtrip.mlir
+++ b/mlir/test/Dialect/SparseTensor/roundtrip.mlir
@@ -728,3 +728,13 @@ func.func @sparse_print(%arg0: tensor<10x10xf64, #CSR>) {
   sparse_tensor.print %arg0 : tensor<10x10xf64, #CSR>
   return
 }
+
+// -----
+
+// CHECK-LABEL:   func.func @sparse_has_runtime() -> i1
+// CHECK:           %[[H:.*]] = sparse_tensor.has_runtime_library
+// CHECK:           return %[[H]] : i1
+func.func @sparse_has_runtime() -> i1 {
+  %has_runtime = sparse_tensor.has_runtime_library
+  return %has_runtime : i1
+}
diff --git a/mlir/test/Dialect/Vector/canonicalize.mlir b/mlir/test/Dialect/Vector/canonicalize.mlir
index 4c73a6271786e6..627ac54cf145bf 100644
--- a/mlir/test/Dialect/Vector/canonicalize.mlir
+++ b/mlir/test/Dialect/Vector/canonicalize.mlir
@@ -2483,6 +2483,18 @@ func.func @all_true_vector_mask(%a : vector<3x4xf32>) -> vector<3x4xf32> {
 
 // -----
 
+// CHECK-LABEL: func @all_true_vector_mask_no_result(
+func.func @all_true_vector_mask_no_result(%a : vector<3x4xf32>, %m : memref<3x4xf32>) {
+//   CHECK-NOT:   vector.mask
+//       CHECK:   vector.transfer_write
+  %c0 = arith.constant 0 : index
+  %all_true = vector.constant_mask [3, 4] : vector<3x4xi1>
+  vector.mask %all_true { vector.transfer_write %a, %m[%c0, %c0] : vector<3x4xf32>, memref<3x4xf32> } : vector<3x4xi1>
+  return
+}
+
+// -----
+
 // CHECK-LABEL:   func.func @fold_shape_cast_with_mask(
 // CHECK-SAME:     %[[VAL_0:.*]]: tensor<1x?xf32>) -> vector<1x4xi1> {
 func.func @fold_shape_cast_with_mask(%arg0: tensor<1x?xf32>) -> vector<1x4xi1> {
diff --git a/mlir/test/Integration/Dialect/Complex/CPU/correctness.mlir b/mlir/test/Integration/Dialect/Complex/CPU/correctness.mlir
index a42ed6968d3700..441f7500538f7e 100644
--- a/mlir/test/Integration/Dialect/Complex/CPU/correctness.mlir
+++ b/mlir/test/Integration/Dialect/Complex/CPU/correctness.mlir
@@ -9,9 +9,6 @@
 // RUN:  -shared-libs=%mlir_c_runner_utils |\
 // RUN: FileCheck %s
 
-// XFAIL: target=aarch64{{.*}}
-// See: https://github.com/llvm/llvm-project/issues/58531
-
 func.func @test_unary(%input: tensor<?xcomplex<f32>>,
                       %func: (complex<f32>) -> complex<f32>) {
   %c0 = arith.constant 0 : index
@@ -189,8 +186,9 @@ func.func @entry() {
     // CHECK-NEXT:  0
     // CHECK-NEXT:  0
     (0.0, 0.0), (-1.0, 0.0),
-    // CHECK-NEXT:  -nan
-    // CHECK-NEXT:  -nan
+    // Ignoring the sign of nan as that can't be tested in platform agnostic manner. See: #58531
+    // CHECK-NEXT:  nan
+    // CHECK-NEXT:  nan
     (1.0, 1.0), (1.0, 1.0)
     // CHECK-NEXT:  0.273
     // CHECK-NEXT:  0.583
diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/mmt4d.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/mmt4d.mlir
index 8ee4e1fb48fef1..92c7039c849601 100644
--- a/mlir/test/Integration/Dialect/Linalg/CPU/mmt4d.mlir
+++ b/mlir/test/Integration/Dialect/Linalg/CPU/mmt4d.mlir
@@ -1,6 +1,6 @@
 // DEFINE: %{compile} =  mlir-opt %s \
 // DEFINE:    -transform-interpreter -test-transform-dialect-erase-schedule \
-// DEFINE:    -one-shot-bufferize -func-bufferize -cse -canonicalize -convert-vector-to-scf -test-lower-to-llvm -o %t
+// DEFINE:    -one-shot-bufferize="bufferize-function-boundaries" -buffer-deallocation-pipeline -cse -canonicalize -convert-vector-to-scf -test-lower-to-llvm -o %t
 // DEFINE: %{entry_point} = mmt4d
 // DEFINE: %{run} = mlir-cpu-runner %t -e %{entry_point} -entry-point-result=void \
 // DEFINE:    -shared-libs=%mlir_runner_utils,%mlir_c_runner_utils
diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/test-matmul-masked-vec.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/test-matmul-masked-vec.mlir
index 0378d638df61ab..a70d794506c483 100644
--- a/mlir/test/Integration/Dialect/Linalg/CPU/test-matmul-masked-vec.mlir
+++ b/mlir/test/Integration/Dialect/Linalg/CPU/test-matmul-masked-vec.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s -transform-interpreter -test-transform-dialect-erase-schedule -one-shot-bufferize -func-bufferize -lower-vector-mask --test-lower-to-llvm | \
+// RUN: mlir-opt %s -transform-interpreter -test-transform-dialect-erase-schedule -one-shot-bufferize="bufferize-function-boundaries" -buffer-deallocation-pipeline -lower-vector-mask --test-lower-to-llvm | \
 // RUN: mlir-cpu-runner -e main -entry-point-result=void --shared-libs=%mlir_c_runner_utils,%mlir_runner_utils | \
 // RUN: FileCheck %s
 
diff --git a/mlir/test/Integration/Dialect/Memref/reinterpret-cast-runtime-verification.mlir b/mlir/test/Integration/Dialect/Memref/reinterpret-cast-runtime-verification.mlir
index 37002915405478..2239ba50b66267 100644
--- a/mlir/test/Integration/Dialect/Memref/reinterpret-cast-runtime-verification.mlir
+++ b/mlir/test/Integration/Dialect/Memref/reinterpret-cast-runtime-verification.mlir
@@ -34,7 +34,8 @@ func.func @main() {
   %5 = arith.constant 5 : index
 
   %alloca_1 = memref.alloca() : memref<1xf32>
-  %alloc_4 = memref.alloc(%4) : memref<?xf32>
+  %alloca_4 = memref.alloca() : memref<4xf32>
+  %alloca_4_dyn = memref.cast %alloca_4 : memref<4xf32> to memref<?xf32>
 
   // Offset is out-of-bounds
   //      CHECK: ERROR: Runtime op verification failed
@@ -55,20 +56,20 @@ func.func @main() {
   // CHECK-NEXT: "memref.reinterpret_cast"(%{{.*}})
   // CHECK-NEXT: ^ result of reinterpret_cast is out-of-bounds of the base memref
   // CHECK-NEXT: Location: loc({{.*}})
-  func.call @reinterpret_cast_fully_dynamic(%alloc_4, %0, %5, %1) : (memref<?xf32>, index, index, index) -> ()
+  func.call @reinterpret_cast_fully_dynamic(%alloca_4_dyn, %0, %5, %1) : (memref<?xf32>, index, index, index) -> ()
 
   // Stride is out-of-bounds
   //      CHECK: ERROR: Runtime op verification failed
   // CHECK-NEXT: "memref.reinterpret_cast"(%{{.*}})
   // CHECK-NEXT: ^ result of reinterpret_cast is out-of-bounds of the base memref
   // CHECK-NEXT: Location: loc({{.*}})
-  func.call @reinterpret_cast_fully_dynamic(%alloc_4, %0, %4, %4) : (memref<?xf32>, index, index, index) -> ()
+  func.call @reinterpret_cast_fully_dynamic(%alloca_4_dyn, %0, %4, %4) : (memref<?xf32>, index, index, index) -> ()
 
   //  CHECK-NOT: ERROR: Runtime op verification failed
   func.call @reinterpret_cast(%alloca_1, %0) : (memref<1xf32>, index) -> ()
 
   //  CHECK-NOT: ERROR: Runtime op verification failed
-  func.call @reinterpret_cast_fully_dynamic(%alloc_4, %0, %4, %1) : (memref<?xf32>, index, index, index) -> ()
+  func.call @reinterpret_cast_fully_dynamic(%alloca_4_dyn, %0, %4, %1) : (memref<?xf32>, index, index, index) -> ()
 
   return
 }
diff --git a/mlir/test/Integration/Dialect/Memref/subview-runtime-verification.mlir b/mlir/test/Integration/Dialect/Memref/subview-runtime-verification.mlir
index 48987ce216f1a3..3ccf8b1be6d7bf 100644
--- a/mlir/test/Integration/Dialect/Memref/subview-runtime-verification.mlir
+++ b/mlir/test/Integration/Dialect/Memref/subview-runtime-verification.mlir
@@ -38,14 +38,15 @@ func.func @main() {
   %5 = arith.constant 5 : index
 
   %alloca = memref.alloca() : memref<1xf32>
-  %alloc = memref.alloc(%4) : memref<?x4xf32>
+  %alloca_4 = memref.alloca() : memref<4x4xf32>
+  %alloca_4_dyn = memref.cast %alloca_4 : memref<4x4xf32> to memref<?x4xf32>
 
   // Offset is out-of-bounds
   //      CHECK: ERROR: Runtime op verification failed
   // CHECK-NEXT: "memref.subview"
   // CHECK-NEXT: ^ subview is out-of-bounds of the base memref
   // CHECK-NEXT: Location: loc({{.*}})
-  func.call @subview_dynamic_rank_reduce(%alloc, %5, %5, %1) : (memref<?x4xf32>, index, index, index) -> ()
+  func.call @subview_dynamic_rank_reduce(%alloca_4_dyn, %5, %5, %1) : (memref<?x4xf32>, index, index, index) -> ()
 
   // Offset is out-of-bounds
   //      CHECK: ERROR: Runtime op verification failed
@@ -66,23 +67,23 @@ func.func @main() {
   // CHECK-NEXT: "memref.subview"
   // CHECK-NEXT: ^ subview is out-of-bounds of the base memref
   // CHECK-NEXT: Location: loc({{.*}})
-  func.call @subview_dynamic(%alloc, %0, %5, %1) : (memref<?x4xf32>, index, index, index) -> ()
+  func.call @subview_dynamic(%alloca_4_dyn, %0, %5, %1) : (memref<?x4xf32>, index, index, index) -> ()
 
   // Stride is out-of-bounds
   //      CHECK: ERROR: Runtime op verification failed
   // CHECK-NEXT: "memref.subview"
   // CHECK-NEXT: ^ subview is out-of-bounds of the base memref
   // CHECK-NEXT: Location: loc({{.*}})
-  func.call @subview_dynamic(%alloc, %0, %4, %4) : (memref<?x4xf32>, index, index, index) -> ()
+  func.call @subview_dynamic(%alloca_4_dyn, %0, %4, %4) : (memref<?x4xf32>, index, index, index) -> ()
 
   // CHECK-NOT: ERROR: Runtime op verification failed
   func.call @subview(%alloca, %0) : (memref<1xf32>, index) -> ()
 
   // CHECK-NOT: ERROR: Runtime op verification failed
-  func.call @subview_dynamic(%alloc, %0, %4, %1) : (memref<?x4xf32>, index, index, index) -> ()
+  func.call @subview_dynamic(%alloca_4_dyn, %0, %4, %1) : (memref<?x4xf32>, index, index, index) -> ()
 
   // CHECK-NOT: ERROR: Runtime op verification failed
-  func.call @subview_dynamic_rank_reduce(%alloc, %0, %1, %0) : (memref<?x4xf32>, index, index, index) -> ()
+  func.call @subview_dynamic_rank_reduce(%alloca_4_dyn, %0, %1, %0) : (memref<?x4xf32>, index, index, index) -> ()
 
 
   return
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/dual_sparse_conv_2d.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/dual_sparse_conv_2d.mlir
index 350b5b41dafc00..c645ca6567209c 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/dual_sparse_conv_2d.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/dual_sparse_conv_2d.mlir
@@ -222,6 +222,7 @@ module {
     bufferization.dealloc_tensor %sparse_filter_CD : tensor<3x3xi32, #CDR>
     bufferization.dealloc_tensor %sparse_filter_CSC : tensor<3x3xi32, #CSC>
 
+    bufferization.dealloc_tensor %0 : tensor<6x6xi32>
     bufferization.dealloc_tensor %2 : tensor<6x6xi32, #DCSR>
     bufferization.dealloc_tensor %3 : tensor<6x6xi32, #CSR>
     bufferization.dealloc_tensor %4 : tensor<6x6xi32, #CDR>
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/reshape_dot.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/reshape_dot.mlir
index ebf9f4392d859b..f7975e0738fa81 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/reshape_dot.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/reshape_dot.mlir
@@ -35,8 +35,8 @@
 #COO_3D = #sparse_tensor.encoding<{ map = (d0, d1, d2) -> (d0 : compressed(nonunique), d1 : singleton(nonunique), d2 : singleton), posWidth = 32, crdWidth = 32 }>
 
 module {
-  func.func private @printMemref3dF32(%ptr : tensor<?x?x?xf32>) attributes { llvm.emit_c_interface }
-  func.func private @printMemref2dF32(%ptr : tensor<?x?xf32>) attributes { llvm.emit_c_interface }
+  func.func private @printMemref3dF32(%ptr : tensor<?x?x?xf32> {bufferization.access = "read"}) attributes { llvm.emit_c_interface }
+  func.func private @printMemref2dF32(%ptr : tensor<?x?xf32> {bufferization.access = "read"}) attributes { llvm.emit_c_interface }
 
   func.func @test_sparse_rhs(%arg0: tensor<5x6xf32>, %arg1: tensor<6x2x3xf32, #COO_3D>) -> tensor<?x?x?xf32> {
     %collapsed = tensor.collapse_shape %arg1 [[0], [1, 2]] : tensor<6x2x3xf32, #COO_3D> into tensor<6x6xf32, #COO_2D>
@@ -46,6 +46,11 @@ module {
     %2 = linalg.matmul ins(%arg0, %collapsed : tensor<5x6xf32>, tensor<6x6xf32, #COO_2D>) outs(%1 : tensor<5x6xf32>) -> tensor<5x6xf32>
     %expanded = tensor.expand_shape %2 [[0], [1, 2]] : tensor<5x6xf32> into tensor<5x2x3xf32>
     %ret1 = tensor.cast %expanded : tensor<5x2x3xf32> to tensor<?x?x?xf32>
+
+    // Note: tensor.collapse_shape is a metadata-only operation on dense tensors
+    // but requires reallocation on sparse tensors.
+    bufferization.dealloc_tensor %collapsed : tensor<6x6xf32, #COO_2D>
+
     return %ret1 : tensor<?x?x?xf32>
   }
 
@@ -57,6 +62,11 @@ module {
     %2 = linalg.matmul ins(%arg0, %collapsed : tensor<5x6xf32, #COO_2D>, tensor<6x6xf32, #COO_2D>) outs(%1 : tensor<5x6xf32>) -> tensor<5x6xf32>
     %expanded = tensor.expand_shape %2 [[0], [1, 2]] : tensor<5x6xf32> into tensor<5x2x3xf32>
     %ret1 = tensor.cast %expanded : tensor<5x2x3xf32> to tensor<?x?x?xf32>
+
+    // Note: tensor.collapse_shape is a metadata-only operation on dense tensors
+    // but requires reallocation on sparse tensors.
+    bufferization.dealloc_tensor %collapsed : tensor<6x6xf32, #COO_2D>
+
     return %ret1 : tensor<?x?x?xf32>
   }
 
@@ -80,6 +90,11 @@ module {
     %2 = linalg.matmul ins(%arg0, %collapsed : tensor<5x6xf32, #COO_2D>, tensor<6x6xf32, #COO_2D>) outs(%1 : tensor<5x6xf32>) -> tensor<5x6xf32>
     %expanded = tensor.expand_shape %2 [[0], [1, 2]] : tensor<5x6xf32> into tensor<5x2x3xf32>
     %ret1 = tensor.cast %expanded : tensor<5x2x3xf32> to tensor<?x?x?xf32>
+
+    // Note: tensor.collapse_shape is a metadata-only operation on dense tensors
+    // but requires reallocation on sparse tensors.
+    bufferization.dealloc_tensor %collapsed : tensor<6x6xf32, #COO_2D>
+
     return %ret1 : tensor<?x?x?xf32>
   }
 
@@ -192,6 +207,7 @@ module {
     bufferization.dealloc_tensor %so1 : tensor<?x?x?xf32>
     bufferization.dealloc_tensor %so2 : tensor<?x?x?xf32>
     bufferization.dealloc_tensor %so3 : tensor<?x?x?xf32>
+
     return
   }
 }
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_block_matmul.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_block_matmul.mlir
index 464de9c8a2c3a6..efef01155cc784 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_block_matmul.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_block_matmul.mlir
@@ -161,6 +161,14 @@ module {
     call @dump_dense_f64(%s24)  : (tensor<4x4xf64>) -> ()
     call @dump_dense_f64(%scsr) : (tensor<4x4xf64>) -> ()
 
+    bufferization.dealloc_tensor %a : tensor<4x8xf64, #BSR>
+    bufferization.dealloc_tensor %b : tensor<4x8xf64, #NV_24>
+    bufferization.dealloc_tensor %c : tensor<4x8xf64, #CSR>
+    bufferization.dealloc_tensor %d : tensor<4x4xf64>
+    bufferization.dealloc_tensor %s : tensor<4x4xf64>
+    bufferization.dealloc_tensor %s24 : tensor<4x4xf64>
+    bufferization.dealloc_tensor %scsr : tensor<4x4xf64>
+
     return
   }
 }
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_2d.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_2d.mlir
index f8fb8fdf53e356..55d4caeb7eb30e 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_2d.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_2d.mlir
@@ -273,11 +273,14 @@ module {
     bufferization.dealloc_tensor %sparse_input_CSC : tensor<8x8xi32, #CSC>
     bufferization.dealloc_tensor %sparse_input_CD : tensor<8x8xi32, #CDR>
 
+    bufferization.dealloc_tensor %0 : tensor<6x6xi32>
     bufferization.dealloc_tensor %1 : tensor<6x6xi32, #DCSR>
     bufferization.dealloc_tensor %2 : tensor<6x6xi32, #DCSR>
     bufferization.dealloc_tensor %3 : tensor<6x6xi32, #CSR>
     bufferization.dealloc_tensor %4 : tensor<6x6xi32, #CDR>
     bufferization.dealloc_tensor %5 : tensor<6x6xi32, #CSC>
+    bufferization.dealloc_tensor %6 : tensor<6x6xi32>
+
     return
   }
 }
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conversion_sparse2dense.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conversion_sparse2dense.mlir
index 9b05f9bf3a29c2..e145c4542a7bfc 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conversion_sparse2dense.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conversion_sparse2dense.mlir
@@ -233,6 +233,19 @@ module {
     // bufferization.dealloc_tensor %s2pp4 : tensor<2x?x?xf64, #Tensor4>
     // bufferization.dealloc_tensor %s2pp5 : tensor<2x?x?xf64, #Tensor5>
     // bufferization.dealloc_tensor %s2pp6 : tensor<2x?x?xf64, #Tensor6>
+
+    bufferization.dealloc_tensor %d2341 : tensor<2x3x4xf64>
+    bufferization.dealloc_tensor %d2342 : tensor<2x3x4xf64>
+    bufferization.dealloc_tensor %d2343 : tensor<2x3x4xf64>
+    bufferization.dealloc_tensor %d2344 : tensor<2x3x4xf64>
+    bufferization.dealloc_tensor %d2345 : tensor<2x3x4xf64>
+    bufferization.dealloc_tensor %d2346 : tensor<2x3x4xf64>
+    bufferization.dealloc_tensor %dp344 : tensor<?x3x4xf64>
+    bufferization.dealloc_tensor %d2p45 : tensor<2x?x4xf64>
+    bufferization.dealloc_tensor %d23p6 : tensor<2x3x?xf64>
+    bufferization.dealloc_tensor %dp3p4 : tensor<?x3x?xf64>
+    bufferization.dealloc_tensor %dpp45 : tensor<?x?x4xf64>
+
     return
   }
 }
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conversion_sparse2sparse.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conversion_sparse2sparse.mlir
index 0f9dfb9da7204f..12f8e34500d818 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conversion_sparse2sparse.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conversion_sparse2sparse.mlir
@@ -114,12 +114,14 @@ module {
     call @dump(%d31) : (tensor<2x3x4xf64>) -> ()
 
     //
-    // Release sparse tensors.
+    // Release the resources.
     //
     bufferization.dealloc_tensor %t13 : tensor<2x3x4xf64, #Tensor3>
     bufferization.dealloc_tensor %t31 : tensor<2x3x4xf64, #Tensor1>
     bufferization.dealloc_tensor %s1 : tensor<2x3x4xf64, #Tensor1>
     bufferization.dealloc_tensor %s3 : tensor<2x3x4xf64, #Tensor3>
+    bufferization.dealloc_tensor %d13 : tensor<2x3x4xf64>
+    bufferization.dealloc_tensor %d31 : tensor<2x3x4xf64>
 
     return
   }
@@ -167,12 +169,14 @@ module {
     call @dump(%d31) : (tensor<2x3x4xf64>) -> ()
 
     //
-    // Release sparse tensors.
+    // Release the resources.
     //
     bufferization.dealloc_tensor %t13 : tensor<2x3x4xf64, #SingletonTensor3>
     bufferization.dealloc_tensor %t31 : tensor<2x3x4xf64, #SingletonTensor1>
     bufferization.dealloc_tensor %s1 : tensor<2x3x4xf64, #SingletonTensor1>
     bufferization.dealloc_tensor %s3 : tensor<2x3x4xf64, #SingletonTensor3>
+    bufferization.dealloc_tensor %d13 : tensor<2x3x4xf64>
+    bufferization.dealloc_tensor %d31 : tensor<2x3x4xf64>
 
     return
   }
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_empty.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_empty.mlir
new file mode 100755
index 00000000000000..bcd71f7bd674bd
--- /dev/null
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_empty.mlir
@@ -0,0 +1,144 @@
+//--------------------------------------------------------------------------------------------------
+// WHEN CREATING A NEW TEST, PLEASE JUST COPY & PASTE WITHOUT EDITS.
+//
+// Set-up that's shared across all tests in this directory. In principle, this
+// config could be moved to lit.local.cfg. However, there are downstream users that
+//  do not use these LIT config files. Hence why this is kept inline.
+//
+// DEFINE: %{sparsifier_opts} = enable-runtime-library=true
+// DEFINE: %{sparsifier_opts_sve} = enable-arm-sve=true %{sparsifier_opts}
+// DEFINE: %{compile} = mlir-opt %s --sparsifier="%{sparsifier_opts}"
+// DEFINE: %{compile_sve} = mlir-opt %s --sparsifier="%{sparsifier_opts_sve}"
+// DEFINE: %{run_libs} = -shared-libs=%mlir_c_runner_utils,%mlir_runner_utils
+// DEFINE: %{run_opts} = -e main -entry-point-result=void
+// DEFINE: %{run} = mlir-cpu-runner %{run_opts} %{run_libs}
+// DEFINE: %{run_sve} = %mcr_aarch64_cmd --march=aarch64 --mattr="+sve" %{run_opts} %{run_libs}
+//
+// DEFINE: %{env} =
+//--------------------------------------------------------------------------------------------------
+
+// RUN: %{compile} | %{run} | FileCheck %s
+//
+// Do the same run, but now with direct IR generation.
+// REDEFINE: %{sparsifier_opts} = enable-runtime-library=false enable-buffer-initialization=true
+// RUN: %{compile} | %{run} | FileCheck %s
+//
+// Do the same run, but now with direct IR generation and vectorization.
+// REDEFINE: %{sparsifier_opts} = enable-runtime-library=false enable-buffer-initialization=true vl=2 reassociate-fp-reductions=true enable-index-optimizations=true
+// RUN: %{compile} | %{run} | FileCheck %s
+//
+// Do the same run, but now with direct IR generation and VLA vectorization.
+// RUN: %if mlir_arm_sve_tests %{ %{compile_sve} | %{run_sve} | FileCheck %s %}
+
+
+#map = affine_map<(d0) -> (d0)>
+
+#SV  = #sparse_tensor.encoding<{
+  map = (d0) -> (d0 : compressed)
+}>
+
+module {
+
+  // This directly yields an empty sparse vector.
+  func.func @empty() -> tensor<10xf32, #SV> {
+    %0 = tensor.empty() : tensor<10xf32, #SV>
+    return %0 : tensor<10xf32, #SV>
+  }
+
+  // This also directly yields an empty sparse vector.
+  func.func @empty_alloc() -> tensor<10xf32, #SV> {
+    %0 = bufferization.alloc_tensor() : tensor<10xf32, #SV>
+    return %0 : tensor<10xf32, #SV>
+  }
+
+  // This yields a hidden empty sparse vector (all zeros).
+  func.func @zeros() -> tensor<10xf32, #SV> {
+    %cst = arith.constant 0.0 : f32
+    %0 = bufferization.alloc_tensor() : tensor<10xf32, #SV>
+    %1 = linalg.generic {
+        indexing_maps = [#map],
+	iterator_types = ["parallel"]}
+      outs(%0 : tensor<10xf32, #SV>) {
+         ^bb0(%out: f32):
+            linalg.yield %cst : f32
+    } -> tensor<10xf32, #SV>
+    return %1 : tensor<10xf32, #SV>
+  }
+
+  // This yields a filled sparse vector (all ones).
+  func.func @ones() -> tensor<10xf32, #SV> {
+    %cst = arith.constant 1.0 : f32
+    %0 = bufferization.alloc_tensor() : tensor<10xf32, #SV>
+    %1 = linalg.generic {
+        indexing_maps = [#map],
+	iterator_types = ["parallel"]}
+      outs(%0 : tensor<10xf32, #SV>) {
+         ^bb0(%out: f32):
+            linalg.yield %cst : f32
+    } -> tensor<10xf32, #SV>
+    return %1 : tensor<10xf32, #SV>
+  }
+
+  //
+  // Main driver.
+  //
+  func.func @main() {
+
+    %0 = call @empty()       : () -> tensor<10xf32, #SV>
+    %1 = call @empty_alloc() : () -> tensor<10xf32, #SV>
+    %2 = call @zeros()       : () -> tensor<10xf32, #SV>
+    %3 = call @ones()        : () -> tensor<10xf32, #SV>
+
+    //
+    // Verify the output. In particular, make sure that
+    // all empty sparse vector data structures are properly
+    // finalized with a pair (0,0) for positions.
+    //
+    // CHECK:      ---- Sparse Tensor ----
+    // CHECK-NEXT: nse = 0
+    // CHECK-NEXT: dim = ( 10 )
+    // CHECK-NEXT: lvl = ( 10 )
+    // CHECK-NEXT: pos[0] : ( 0, 0,
+    // CHECK-NEXT: crd[0] : (
+    // CHECK-NEXT: values : (
+    // CHECK-NEXT: ----
+    //
+    // CHECK-NEXT: ---- Sparse Tensor ----
+    // CHECK-NEXT: nse = 0
+    // CHECK-NEXT: dim = ( 10 )
+    // CHECK-NEXT: lvl = ( 10 )
+    // CHECK-NEXT: pos[0] : ( 0, 0,
+    // CHECK-NEXT: crd[0] : (
+    // CHECK-NEXT: values : (
+    // CHECK-NEXT: ----
+    //
+    // CHECK-NEXT: ---- Sparse Tensor ----
+    // CHECK-NEXT: nse = 0
+    // CHECK-NEXT: dim = ( 10 )
+    // CHECK-NEXT: lvl = ( 10 )
+    // CHECK-NEXT: pos[0] : ( 0, 0,
+    // CHECK-NEXT: crd[0] : (
+    // CHECK-NEXT: values : (
+    // CHECK-NEXT: ----
+    //
+    // CHECK-NEXT: ---- Sparse Tensor ----
+    // CHECK-NEXT: nse = 10
+    // CHECK-NEXT: dim = ( 10 )
+    // CHECK-NEXT: lvl = ( 10 )
+    // CHECK-NEXT: pos[0] : ( 0, 10,
+    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
+    // CHECK-NEXT: values : ( 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    // CHECK-NEXT: ----
+    //
+    sparse_tensor.print %0 : tensor<10xf32, #SV>
+    sparse_tensor.print %1 : tensor<10xf32, #SV>
+    sparse_tensor.print %2 : tensor<10xf32, #SV>
+    sparse_tensor.print %3 : tensor<10xf32, #SV>
+
+    bufferization.dealloc_tensor %0 : tensor<10xf32, #SV>
+    bufferization.dealloc_tensor %1 : tensor<10xf32, #SV>
+    bufferization.dealloc_tensor %2 : tensor<10xf32, #SV>
+    bufferization.dealloc_tensor %3 : tensor<10xf32, #SV>
+    return
+  }
+}
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_pack.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_pack.mlir
index 7cde6b93d3250c..34d450c2403f61 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_pack.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_pack.mlir
@@ -279,6 +279,31 @@ module {
     %si = tensor.extract %li[] : tensor<i64>
     vector.print %si : i64
 
+    // TODO: This check is no longer needed once the codegen path uses the
+    // buffer deallocation pass. "dealloc_tensor" turn into a no-op in the
+    // codegen path.
+    %has_runtime = sparse_tensor.has_runtime_library
+    scf.if %has_runtime {
+      // sparse_tensor.assemble copies buffers when running with the runtime
+      // library. Deallocations are needed not needed when running in codgen
+      // mode.
+      bufferization.dealloc_tensor %s4 : tensor<10x10xf64, #SortedCOO>
+      bufferization.dealloc_tensor %s5 : tensor<10x10xf64, #SortedCOOI32>
+      bufferization.dealloc_tensor %csr : tensor<2x2xf64, #CSR>
+      bufferization.dealloc_tensor %bs : tensor<2x10x10xf64, #BCOO>
+    }
+
+    bufferization.dealloc_tensor %li : tensor<i64>
+    bufferization.dealloc_tensor %od : tensor<3xf64>
+    bufferization.dealloc_tensor %op : tensor<2xi32>
+    bufferization.dealloc_tensor %oi : tensor<3x2xi32>
+    bufferization.dealloc_tensor %d_csr : tensor<4xf64>
+    bufferization.dealloc_tensor %p_csr : tensor<3xi32>
+    bufferization.dealloc_tensor %i_csr : tensor<3xi32>
+    bufferization.dealloc_tensor %bod : tensor<6xf64>
+    bufferization.dealloc_tensor %bop : tensor<4xindex>
+    bufferization.dealloc_tensor %boi : tensor<6x2xindex>
+
     return
   }
 }
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_pack_d.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_pack_d.mlir
index aa1bd04fde87dc..fe8836266a4793 100755
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_pack_d.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_pack_d.mlir
@@ -140,10 +140,19 @@ module {
     sparse_tensor.print %s1 : tensor<4x3x2xf32, #BatchedCSR>
     sparse_tensor.print %s2 : tensor<4x3x2xf32, #CSRDense>
 
-    // FIXME: doing this explicitly crashes runtime
-    // bufferization.dealloc_tensor %s0 : tensor<4x3x2xf32, #CCC>
-    // bufferization.dealloc_tensor %s1 : tensor<4x3x2xf32, #BatchedCSR>
-    // bufferization.dealloc_tensor %s2 : tensor<4x3x2xf32, #CSRDense>
+    // TODO: This check is no longer needed once the codegen path uses the
+    // buffer deallocation pass. "dealloc_tensor" turn into a no-op in the
+    // codegen path.
+    %has_runtime = sparse_tensor.has_runtime_library
+    scf.if %has_runtime {
+      // sparse_tensor.assemble copies buffers when running with the runtime
+      // library. Deallocations are needed not needed when running in codgen
+      // mode.
+      bufferization.dealloc_tensor %s0 : tensor<4x3x2xf32, #CCC>
+      bufferization.dealloc_tensor %s1 : tensor<4x3x2xf32, #BatchedCSR>
+      bufferization.dealloc_tensor %s2 : tensor<4x3x2xf32, #CSRDense>
+    }
+
     return
   }
 }
diff --git a/mlir/test/Integration/Dialect/Tosa/CPU/test-fully-connected.mlir b/mlir/test/Integration/Dialect/Tosa/CPU/test-fully-connected.mlir
index bf178c826574e4..258b1b4f2fab45 100644
--- a/mlir/test/Integration/Dialect/Tosa/CPU/test-fully-connected.mlir
+++ b/mlir/test/Integration/Dialect/Tosa/CPU/test-fully-connected.mlir
@@ -1,5 +1,5 @@
 // RUN: mlir-opt %s -pass-pipeline="builtin.module(func.func(tosa-to-linalg-named,tosa-to-linalg,tosa-to-arith))" | \
-// RUN: mlir-opt -one-shot-bufferize -func-bufferize -test-lower-to-llvm | \
+// RUN: mlir-opt -one-shot-bufferize="bufferize-function-boundaries" -buffer-deallocation-pipeline -test-lower-to-llvm | \
 // RUN: mlir-cpu-runner -O3 -e main -entry-point-result=void \
 // RUN:   -shared-libs=%mlir_runner_utils \
 // RUN: | FileCheck %s
diff --git a/mlir/test/Integration/GPU/CUDA/sm90/tma_load_128x128_stride_noswizzle.mlir b/mlir/test/Integration/GPU/CUDA/sm90/tma_load_128x128_stride_noswizzle.mlir
new file mode 100644
index 00000000000000..4d415394482029
--- /dev/null
+++ b/mlir/test/Integration/GPU/CUDA/sm90/tma_load_128x128_stride_noswizzle.mlir
@@ -0,0 +1,148 @@
+// RUN: mlir-opt %s \
+// RUN:  -gpu-lower-to-nvvm-pipeline="cubin-chip=sm_90 cubin-features=+ptx80 opt-level=3" \
+// RUN:  | mlir-cpu-runner \
+// RUN:   --shared-libs=%mlir_cuda_runtime \
+// RUN:   --shared-libs=%mlir_runner_utils \
+// RUN:   --shared-libs=%mlir_c_runner_utils \
+// RUN:   --entry-point-result=void \
+// RUN:  | FileCheck %s
+
+// CHECK: Correct Results :8192
+// CHECK: Incorrect Results :0
+
+module {
+  func.func @main() {
+    %c10000000 = arith.constant 10000000 : index
+    %false = arith.constant false
+    %c32768 = arith.constant 32768 : index
+    %c31_i32 = arith.constant 31 : i32
+    %c-1_i32 = arith.constant -1 : i32
+    %c5_i32 = arith.constant 5 : i32
+    %c0_i32 = arith.constant 0 : i32
+    %c0 = arith.constant 0 : index
+    %c8 = arith.constant 8 : index
+    %c64 = arith.constant 64 : index
+    %c2 = arith.constant 2 : index
+    %c32768_i32 = arith.constant 32768 : i32
+    %c128 = arith.constant 128 : index
+    %c1 = arith.constant 1 : index
+    %0 = llvm.mlir.constant(1 : i64) : i64
+    %1 = llvm.mlir.constant(128 : i64) : i64
+    %2 = llvm.mlir.constant(0 : i64) : i64
+    %f0 = arith.constant 0.0 : f16
+    %f123 = arith.constant 1.123 : f16
+    
+    %srcMemref_host = memref.alloc() : memref<128x128xf16>
+    %dstMemref_host = memref.alloc() : memref<128x128xf16>
+    scf.for %arg0 = %c0 to %c128 step %c1 {
+      scf.for %arg1 = %c0 to %c64 step %c1 {
+        %d1 = arith.index_cast %arg0 : index to i32
+        %d2 = arith.index_cast %arg1 : index to i32
+        %d3 = arith.sitofp %d1 : i32 to f16
+        %d4 = arith.sitofp %d2 : i32 to f16
+        %d5 = arith.addf %d3, %f123 : f16
+        %d6 = arith.constant 3.12 : f16
+        %d7 = arith.mulf %d5, %d6 : f16
+        %d8 = arith.addf %d7, %d5 : f16
+        %d9 = arith.constant 0.178 : f16
+        %d10 = arith.divf %d9, %d8 : f16
+        memref.store %d10, %srcMemref_host[%arg0, %arg1] : memref<128x128xf16>
+        memref.store %f0, %dstMemref_host[%arg0, %arg1] : memref<128x128xf16>
+      }
+    }
+
+    %s1 = gpu.wait async
+    %srcMemref, %s2 = gpu.alloc async [%s1] () : memref<128x128xf16>
+    %dstMemref, %s3 = gpu.alloc async [%s2] () : memref<128x128xf16>
+    %s4 = gpu.memcpy async [%s3] %srcMemref, %srcMemref_host : memref<128x128xf16>, memref<128x128xf16>
+    %s5 = gpu.memcpy async [%s4] %dstMemref, %dstMemref_host : memref<128x128xf16>, memref<128x128xf16>
+
+    %expand_shape = memref.expand_shape %srcMemref [[0, 1], [2, 3]] : memref<128x128xf16> into memref<2x64x2x64xf16>
+    %transpose = memref.transpose %expand_shape (d0, d1, d2, d3) -> (d0, d2, d1, d3) : memref<2x64x2x64xf16> to memref<2x2x64x64xf16, strided<[8192, 64, 128, 1]>>
+    %cast = memref.cast %transpose : memref<2x2x64x64xf16, strided<[8192, 64, 128, 1]>> to memref<*xf16>
+    %24 = nvgpu.tma.create.descriptor %cast box[%c2, %c2, %c64, %c64] : memref<*xf16> -> <tensor = memref<2x2x64x64xf16, 3>, swizzle = none, l2promo = none, oob = zero, interleave = none>
+    
+    gpu.launch 
+      blocks(%arg2, %arg3, %arg4) in (%arg8 = %c1, %arg9 = %c1, %arg10 = %c1) 
+      threads(%arg5, %arg6, %arg7) in (%arg11 = %c128, %arg12 = %c1, %arg13 = %c1) 
+      dynamic_shared_memory_size %c32768_i32 
+    {
+      %26 = gpu.dynamic_shared_memory : memref<?xi8, #gpu.address_space<workgroup>>
+      %view = memref.view %26[%c0][] : memref<?xi8, #gpu.address_space<workgroup>> to memref<2x2x64x64xf16, #gpu.address_space<workgroup>>
+      %27 = nvgpu.mbarrier.create -> <memorySpace = #gpu.address_space<workgroup>>
+      %thread_id_x = gpu.thread_id  x
+      %28 = arith.index_cast %thread_id_x : index to i32
+      %29 = arith.shrui %28, %c5_i32 : i32
+      %30 = nvvm.shfl.sync  idx %c-1_i32, %29, %c0_i32, %c31_i32 : i32 -> i32
+      %31 = arith.cmpi eq, %30, %c0_i32 : i32
+      %32 = nvvm.elect.sync -> i1
+      %33 = arith.andi %31, %32 : i1
+      scf.if %33 {
+        nvgpu.mbarrier.init %27[%c0], %c1 : <memorySpace = #gpu.address_space<workgroup>>
+      }
+      %34 = nvvm.shfl.sync  idx %c-1_i32, %29, %c0_i32, %c31_i32 : i32 -> i32
+      %35 = arith.cmpi eq, %34, %c0_i32 : i32
+      %36 = nvvm.elect.sync -> i1
+      %37 = arith.andi %35, %36 : i1
+      scf.if %37 {
+        nvgpu.mbarrier.arrive.expect_tx %27[%c0], %c32768 : <memorySpace = #gpu.address_space<workgroup>>
+        nvgpu.tma.async.load %24[%c0, %c0, %c0, %c0], %27[%c0] to %view : <tensor = memref<2x2x64x64xf16, 3>, swizzle = none, l2promo = none, oob = zero, interleave = none>, <memorySpace = #gpu.address_space<workgroup>> -> memref<2x2x64x64xf16, #gpu.address_space<workgroup>>
+      }
+      nvgpu.mbarrier.try_wait.parity %27[%c0], %false, %c10000000 : <memorySpace = #gpu.address_space<workgroup>>
+      scf.for %arg14 = %c0 to %c2 step %c1 {
+        scf.for %arg15 = %c0 to %c2 step %c1 {
+          %38 = arith.muli %arg14, %c64 : index
+          %39 = arith.muli %arg15, %c64 : index
+          %subview = memref.subview %view[%arg14, %arg15, 0, 0] [1, 1, 64, 64] [1, 1, 1, 1] : memref<2x2x64x64xf16, #gpu.address_space<workgroup>> to memref<64x64xf16, strided<[64, 1], offset: ?>, #gpu.address_space<workgroup>>
+          %subview_0 = memref.subview %dstMemref[%38, %39] [64, 64] [1, 1] : memref<128x128xf16> to memref<64x64xf16, strided<[128, 1], offset: ?>>
+          %block_dim_x = gpu.block_dim  x
+          %thread_id_y = gpu.thread_id  y
+          %40 = arith.muli %thread_id_y, %block_dim_x : index
+          %41 = arith.addi %thread_id_x, %40 : index
+          %block_dim_y = gpu.block_dim  y
+          %42 = arith.muli %block_dim_x, %block_dim_y : index
+          %thread_id_z = gpu.thread_id  z
+          %43 = arith.muli %thread_id_z, %42 : index
+          %44 = arith.addi %41, %43 : index
+          %45 = arith.cmpi eq, %44, %c0 : index
+          scf.if %45 {
+            scf.for %arg16 = %c0 to %c64 step %c1 {
+              scf.for %arg17 = %c0 to %c64 step %c1 {
+                %46 = memref.load %subview[%arg16, %arg17] : memref<64x64xf16, strided<[64, 1], offset: ?>, #gpu.address_space<workgroup>>
+                memref.store %46, %subview_0[%arg16, %arg17] : memref<64x64xf16, strided<[128, 1], offset: ?>>
+              }
+            }
+          }
+          gpu.barrier
+        }
+      }
+      gpu.terminator
+    }
+
+    %s6 = gpu.memcpy async [%s5] %dstMemref_host, %dstMemref  : memref<128x128xf16>, memref<128x128xf16>
+    gpu.wait [%s6]
+
+   %errorCount, %correctCount =  scf.for %arg0 = %c0 to %c128 step %c1 iter_args(%ec1 = %c0, %cc1 = %c0) -> (index,index) {
+      %ec2, %cc2 = scf.for %arg1 = %c0 to %c64 step %c1 iter_args(%ec2 = %ec1, %cc2 = %cc1) -> (index, index) { 
+        %v1 = memref.load %dstMemref_host[%arg0, %arg1] : memref<128x128xf16>
+        %v2 = memref.load %srcMemref_host[%arg0, %arg1] : memref<128x128xf16>
+        %p = arith.cmpf one, %v1, %v2 : f16        
+        %ec3, %cc3 = scf.if %p -> (index, index) {
+          %ec3 = arith.addi %ec2, %c1 : index
+          scf.yield %ec3, %cc2 : index, index
+        } else {
+          %cc3 = arith.addi %cc2, %c1 : index
+          scf.yield %ec2, %cc3 : index, index
+        }
+      scf.yield %ec3, %cc3 : index,index
+      }
+      scf.yield %ec2, %cc2 : index,index
+    }
+    
+    vector.print str "Correct Results :"
+    vector.print %correctCount : index
+    vector.print str "Incorrect Results :"
+    vector.print %errorCount : index
+    return
+  }
+}
diff --git a/mlir/test/Target/LLVMIR/Import/debug-info.ll b/mlir/test/Target/LLVMIR/Import/debug-info.ll
index 9ef6580bcf2408..9af40d8c8d3ee6 100644
--- a/mlir/test/Target/LLVMIR/Import/debug-info.ll
+++ b/mlir/test/Target/LLVMIR/Import/debug-info.ll
@@ -296,12 +296,17 @@ define void @class_method() {
   ret void, !dbg !9
 }
 
-; Verify the elements parameter is dropped due to the cyclic dependencies.
-; CHECK: #[[COMP:.+]] = #llvm.di_composite_type<tag = DW_TAG_class_type, name = "class_name", file = #{{.*}}, line = 42, flags = "TypePassByReference|NonTrivial">
-; CHECK: #[[COMP_PTR:.+]] = #llvm.di_derived_type<tag = DW_TAG_pointer_type, baseType = #[[COMP]], sizeInBits = 64>
+; Verify the cyclic composite type is identified, even though conversion begins from the subprogram type.
+; CHECK: #[[COMP_SELF:.+]] = #llvm.di_composite_type<tag = DW_TAG_null, recId = [[REC_ID:.+]]>
+; CHECK: #[[COMP_PTR:.+]] = #llvm.di_derived_type<tag = DW_TAG_pointer_type, baseType = #[[COMP_SELF]], sizeInBits = 64>
 ; CHECK: #[[SP_TYPE:.+]] = #llvm.di_subroutine_type<types = #{{.*}}, #[[COMP_PTR]]>
-; CHECK: #[[SP:.+]] = #llvm.di_subprogram<id = distinct[{{.*}}]<>, compileUnit = #{{.*}}, scope = #[[COMP]], name = "class_method", file = #{{.*}}, subprogramFlags = Definition, type = #[[SP_TYPE]]>
-; CHECK: #[[LOC]] = loc(fused<#[[SP]]>
+; CHECK: #[[SP_INNER:.+]] = #llvm.di_subprogram<id = [[SP_ID:.+]], compileUnit = #{{.*}}, scope = #[[COMP_SELF]], name = "class_method", file = #{{.*}}, subprogramFlags = Definition, type = #[[SP_TYPE]]>
+; CHECK: #[[COMP:.+]] = #llvm.di_composite_type<tag = DW_TAG_class_type, recId = [[REC_ID]], name = "class_name", file = #{{.*}}, line = 42, flags = "TypePassByReference|NonTrivial", elements = #[[SP_INNER]]>
+
+; CHECK: #[[COMP_PTR_OUTER:.+]] = #llvm.di_derived_type<tag = DW_TAG_pointer_type, baseType = #[[COMP]], sizeInBits = 64>
+; CHECK: #[[SP_TYPE_OUTER:.+]] = #llvm.di_subroutine_type<types = #{{.*}}, #[[COMP_PTR_OUTER]]>
+; CHECK: #[[SP_OUTER:.+]] = #llvm.di_subprogram<id = [[SP_ID]], compileUnit = #{{.*}}, scope = #[[COMP]], name = "class_method", file = #{{.*}}, subprogramFlags = Definition, type = #[[SP_TYPE_OUTER]]>
+; CHECK: #[[LOC]] = loc(fused<#[[SP_OUTER]]>
 
 !llvm.dbg.cu = !{!1}
 !llvm.module.flags = !{!0}
@@ -318,15 +323,18 @@ define void @class_method() {
 
 ; // -----
 
-; Verify the elements parameter is dropped due to the cyclic dependencies.
-; CHECK: #[[$COMP:.+]] = #llvm.di_composite_type<tag = DW_TAG_class_type, name = "class_field", file = #{{.*}}, line = 42, flags = "TypePassByReference|NonTrivial">
-; CHECK: #[[$COMP_PTR:.+]] = #llvm.di_derived_type<tag = DW_TAG_pointer_type, baseType = #[[$COMP]]>
-; CHECK: #[[$VAR0:.+]] = #llvm.di_local_variable<scope = #{{.*}}, name = "class_field", file = #{{.*}}, type = #[[$COMP_PTR]]>
+; Verify the cyclic composite type is handled correctly.
+; CHECK: #[[COMP_SELF:.+]] = #llvm.di_composite_type<tag = DW_TAG_null, recId = [[REC_ID:.+]]>
+; CHECK: #[[COMP_PTR_INNER:.+]] = #llvm.di_derived_type<tag = DW_TAG_pointer_type, baseType = #[[COMP_SELF]]>
+; CHECK: #[[FIELD:.+]] = #llvm.di_derived_type<tag = DW_TAG_member, name = "call_field", baseType = #[[COMP_PTR_INNER]]>
+; CHECK: #[[COMP:.+]] = #llvm.di_composite_type<tag = DW_TAG_class_type, recId = [[REC_ID]], name = "class_field", file = #{{.*}}, line = 42, flags = "TypePassByReference|NonTrivial", elements = #[[FIELD]]>
+; CHECK: #[[COMP_PTR_OUTER:.+]] = #llvm.di_derived_type<tag = DW_TAG_pointer_type, baseType = #[[COMP]]>
+; CHECK: #[[VAR0:.+]] = #llvm.di_local_variable<scope = #{{.*}}, name = "class_field", file = #{{.*}}, type = #[[COMP_PTR_OUTER]]>
 
-; CHECK-LABEL: @class_field
+; CHECK: @class_field
 ; CHECK-SAME:  %[[ARG0:[a-zA-Z0-9]+]]
 define void @class_field(ptr %arg1) {
-  ; CHECK: llvm.intr.dbg.value #[[$VAR0]] = %[[ARG0]] : !llvm.ptr
+  ; CHECK: llvm.intr.dbg.value #[[VAR0]] = %[[ARG0]] : !llvm.ptr
   call void @llvm.dbg.value(metadata ptr %arg1, metadata !7, metadata !DIExpression()), !dbg !9
   ret void
 }
@@ -563,35 +571,6 @@ define void @func_in_module(ptr %arg) !dbg !8 {
 
 ; // -----
 
-; Verifies that array types that have an unimportable base type are removed to
-; avoid producing invalid IR.
-; CHECK: #[[DI_LOCAL_VAR:.+]] = #llvm.di_local_variable<
-; CHECK-NOT: type =
-
-; CHECK-LABEL: @array_with_cyclic_base_type
-define i32 @array_with_cyclic_base_type(ptr %0) !dbg !3 {
-  call void @llvm.dbg.value(metadata ptr %0, metadata !4, metadata !DIExpression()), !dbg !7
-  ret i32 0
-}
-
-; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
-declare void @llvm.dbg.value(metadata, metadata, metadata)
-
-
-!llvm.module.flags = !{!0}
-!llvm.dbg.cu = !{!1}
-
-!0 = !{i32 2, !"Debug Info Version", i32 3}
-!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2)
-!2 = !DIFile(filename: "debug-info.ll", directory: "/")
-!3 = distinct !DISubprogram(name: "func", scope: !2, file: !2, line: 46, scopeLine: 48, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
-!4 = !DILocalVariable(name: "op", arg: 5, scope: !3, file: !2, line: 47, type: !5)
-!5 = !DICompositeType(tag: DW_TAG_array_type, size: 42, baseType: !6)
-!6 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !5)
-!7 = !DILocation(line: 0, scope: !3)
-
-; // -----
-
 ; Verifies that import compile units respect the distinctness of the input.
 ; CHECK-LABEL: @distinct_cu_func0
 define void @distinct_cu_func0() !dbg !4 {
diff --git a/mlir/test/Target/LLVMIR/Import/import-failure.ll b/mlir/test/Target/LLVMIR/Import/import-failure.ll
index 9a4e939d106516..3f4efab70e1c02 100644
--- a/mlir/test/Target/LLVMIR/Import/import-failure.ll
+++ b/mlir/test/Target/LLVMIR/Import/import-failure.ll
@@ -85,38 +85,6 @@ define void @unsupported_argument(i64 %arg1) {
 
 ; // -----
 
-; Check that debug intrinsics that depend on cyclic metadata are dropped.
-
-declare void @llvm.dbg.value(metadata, metadata, metadata)
-
-; CHECK:      import-failure.ll
-; CHECK-SAME: warning: dropped instruction: call void @llvm.dbg.label(metadata !{{.*}})
-; CHECK:      import-failure.ll
-; CHECK-SAME: warning: dropped intrinsic: call void @llvm.dbg.value(metadata i64 %{{.*}}, metadata !3, metadata !DIExpression())
-define void @cylic_metadata(i64 %arg1) {
-  call void @llvm.dbg.value(metadata i64 %arg1, metadata !10, metadata !DIExpression()), !dbg !14
-  call void @llvm.dbg.label(metadata !13), !dbg !14
-  ret void
-}
-
-!llvm.dbg.cu = !{!1}
-!llvm.module.flags = !{!0}
-!0 = !{i32 2, !"Debug Info Version", i32 3}
-!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2)
-!2 = !DIFile(filename: "import-failure.ll", directory: "/")
-!3 = !DICompositeType(tag: DW_TAG_array_type, size: 42, baseType: !4)
-!4 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !3)
-!5 = distinct !DISubprogram(name: "class_method", scope: !2, file: !2, type: !6, spFlags: DISPFlagDefinition, unit: !1)
-!6 = !DISubroutineType(types: !7)
-!7 = !{!3}
-!10 = !DILocalVariable(scope: !5, name: "arg1", file: !2, line: 1, arg: 1, align: 64);
-!11 = !DILexicalBlock(scope: !5)
-!12 = !DILexicalBlockFile(scope: !11, discriminator: 0)
-!13 = !DILabel(scope: !12, name: "label", file: !2, line: 42)
-!14 = !DILocation(line: 1, column: 2, scope: !5)
-
-; // -----
-
 ; global_dtors with non-null data fields cannot be represented in MLIR.
 ; CHECK:      <unknown>
 ; CHECK-SAME: error: unhandled global variable: @llvm.global_dtors
diff --git a/mlir/test/Target/LLVMIR/llvmir-debug.mlir b/mlir/test/Target/LLVMIR/llvmir-debug.mlir
index cfd5239515c9c0..c042628334d4c5 100644
--- a/mlir/test/Target/LLVMIR/llvmir-debug.mlir
+++ b/mlir/test/Target/LLVMIR/llvmir-debug.mlir
@@ -342,3 +342,83 @@ llvm.func @func_line_tables() {
 llvm.func @func_debug_directives() {
   llvm.return
 } loc(fused<#di_subprogram_2>["foo2.mlir":0:0])
+
+// -----
+
+// Ensure recursive types with multiple external references work.
+
+// Common base nodes.
+#di_file = #llvm.di_file<"test.mlir" in "/">
+#di_null_type = #llvm.di_null_type
+#di_compile_unit = #llvm.di_compile_unit<id = distinct[1]<>, sourceLanguage = DW_LANG_C, file = #di_file, isOptimized = false, emissionKind = None>
+
+// Recursive type itself.
+#di_struct_self = #llvm.di_composite_type<tag = DW_TAG_null, recId = distinct[0]<>>
+#di_ptr_inner = #llvm.di_derived_type<tag = DW_TAG_pointer_type, baseType = #di_struct_self, sizeInBits = 64>
+#di_subroutine_inner = #llvm.di_subroutine_type<types = #di_null_type, #di_ptr_inner>
+#di_subprogram_inner = #llvm.di_subprogram<
+  id = distinct[2]<>,
+  compileUnit = #di_compile_unit,
+  scope = #di_struct_self,
+  name = "class_method",
+  file = #di_file,
+  subprogramFlags = Definition,
+  type = #di_subroutine_inner>
+#di_struct = #llvm.di_composite_type<
+  tag = DW_TAG_class_type,
+  recId = distinct[0]<>,
+  name = "class_name",
+  file = #di_file,
+  line = 42,
+  flags = "TypePassByReference|NonTrivial",
+  elements = #di_subprogram_inner>
+
+// Outer types referencing the entire recursive type.
+#di_ptr_outer = #llvm.di_derived_type<tag = DW_TAG_pointer_type, baseType = #di_struct, sizeInBits = 64>
+#di_subroutine_outer = #llvm.di_subroutine_type<types = #di_null_type, #di_ptr_outer>
+#di_subprogram_outer = #llvm.di_subprogram<
+  id = distinct[2]<>,
+  compileUnit = #di_compile_unit,
+  scope = #di_struct,
+  name = "class_method",
+  file = #di_file,
+  subprogramFlags = Definition,
+  type = #di_subroutine_outer>
+
+#loc3 = loc(fused<#di_subprogram_outer>["test.mlir":1:1])
+
+// CHECK: @class_method
+// CHECK: ret void, !dbg ![[LOC:.*]]
+
+// CHECK: ![[CU:.*]] = distinct !DICompileUnit(
+// CHECK: ![[SP:.*]] = distinct !DISubprogram(name: "class_method", scope: ![[STRUCT:.*]], file: !{{.*}}, type: ![[SUBROUTINE:.*]], spFlags: DISPFlagDefinition, unit: ![[CU]])
+// CHECK: ![[STRUCT]] = distinct !DICompositeType(tag: DW_TAG_class_type, name: "class_name", {{.*}}, elements: ![[ELEMS:.*]])
+// CHECK: ![[ELEMS]] = !{![[SP]]}
+// CHECK: ![[SUBROUTINE]] = !DISubroutineType(types: ![[SUBROUTINE_ELEMS:.*]])
+// CHECK: ![[SUBROUTINE_ELEMS]] = !{null, ![[PTR:.*]]}
+// CHECK: ![[PTR]] = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: ![[STRUCT]], size: 64)
+// CHECK: ![[LOC]] = !DILocation(line: 1, column: 1, scope: ![[SP]])
+
+llvm.func @class_method() {
+  llvm.return loc(#loc3)
+} loc(#loc3)
+
+// -----
+
+// Ensures composite types with a recursive scope work.
+
+#di_composite_type_self = #llvm.di_composite_type<tag = DW_TAG_null, recId = distinct[0]<>>
+#di_file = #llvm.di_file<"test.mlir" in "/">
+#di_subroutine_type = #llvm.di_subroutine_type<types = #di_composite_type_self>
+#di_subprogram = #llvm.di_subprogram<scope = #di_file, file = #di_file, subprogramFlags = Optimized, type = #di_subroutine_type>
+#di_composite_type = #llvm.di_composite_type<tag = DW_TAG_class_type, recId = distinct[0]<>, scope = #di_subprogram>
+#di_global_variable = #llvm.di_global_variable<file = #di_file, line = 1, type = #di_composite_type>
+#di_global_variable_expression = #llvm.di_global_variable_expression<var = #di_global_variable>
+
+llvm.mlir.global @global_variable() {dbg_expr = #di_global_variable_expression} : !llvm.struct<()>
+
+// CHECK: distinct !DIGlobalVariable({{.*}}type: ![[COMP:[0-9]+]],
+// CHECK: ![[COMP]] = distinct !DICompositeType({{.*}}scope: ![[SCOPE:[0-9]+]],
+// CHECK: ![[SCOPE]] = !DISubprogram({{.*}}type: ![[SUBROUTINE:[0-9]+]],
+// CHECK: ![[SUBROUTINE]] = !DISubroutineType(types: ![[SR_TYPES:[0-9]+]])
+// CHECK: ![[SR_TYPES]] = !{![[COMP]]}
diff --git a/mlir/test/Target/LLVMIR/openmp-reduction-call.mlir b/mlir/test/Target/LLVMIR/openmp-reduction-call.mlir
new file mode 100644
index 00000000000000..9aa9ac7e4b5a52
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/openmp-reduction-call.mlir
@@ -0,0 +1,37 @@
+// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s
+
+// Test that we don't crash when there is a call operation in the combiner
+
+omp.reduction.declare @add_f32 : f32
+init {
+^bb0(%arg: f32):
+  %0 = llvm.mlir.constant(0.0 : f32) : f32
+  omp.yield (%0 : f32)
+}
+combiner {
+^bb1(%arg0: f32, %arg1: f32):
+// test this call here:
+  llvm.call @test_call() : () -> ()
+  %1 = llvm.fadd %arg0, %arg1 : f32
+  omp.yield (%1 : f32)
+}
+
+llvm.func @simple_reduction(%lb : i64, %ub : i64, %step : i64) {
+  %c1 = llvm.mlir.constant(1 : i32) : i32
+  %0 = llvm.alloca %c1 x i32 : (i32) -> !llvm.ptr
+  omp.parallel reduction(@add_f32 %0 -> %prv : !llvm.ptr) {
+    %1 = llvm.mlir.constant(2.0 : f32) : f32
+    %2 = llvm.load %prv : !llvm.ptr -> f32
+    %3 = llvm.fadd %1, %2 : f32
+    llvm.store %3, %prv : f32, !llvm.ptr
+    omp.terminator
+  }
+  llvm.return
+}
+
+llvm.func @test_call() -> ()
+
+// Call to the troublesome function will have been inlined twice: once into
+// main and once into the outlined function
+// CHECK: call void @test_call()
+// CHECK: call void @test_call()
diff --git a/mlir/test/Target/LLVMIR/rocdl.mlir b/mlir/test/Target/LLVMIR/rocdl.mlir
index 93550f5c7bd5a4..ce6b56d48437a0 100644
--- a/mlir/test/Target/LLVMIR/rocdl.mlir
+++ b/mlir/test/Target/LLVMIR/rocdl.mlir
@@ -88,13 +88,20 @@ llvm.func @rocdl.bpermute(%src : i32) -> i32 {
   llvm.return %0 : i32
 }
 
-llvm.func @rocdl.ballot(%pred : i1) -> i32 {
-  // CHECK-LABEL: rocdl.ballot
+llvm.func @rocdl.ballot32(%pred : i1) -> i32 {
+  // CHECK-LABEL: rocdl.ballot32
   // CHECK: call i32 @llvm.amdgcn.ballot
   %0 = rocdl.ballot %pred : i32
   llvm.return %0 : i32
 }
 
+llvm.func @rocdl.ballot64(%pred : i1) -> i64 {
+  // CHECK-LABEL: rocdl.ballot64
+  // CHECK: call i64 @llvm.amdgcn.ballot
+  %0 = rocdl.ballot %pred : i64
+  llvm.return %0 : i64
+}
+
 llvm.func @rocdl.waitcnt() {
   // CHECK-LABEL: rocdl.waitcnt
   // CHECK-NEXT: call void @llvm.amdgcn.s.waitcnt(i32 0)
diff --git a/mlir/test/mlir-cpu-runner/test-expand-math-approx.mlir b/mlir/test/mlir-cpu-runner/test-expand-math-approx.mlir
index 541a201c94c586..e2229a392bbf76 100644
--- a/mlir/test/mlir-cpu-runner/test-expand-math-approx.mlir
+++ b/mlir/test/mlir-cpu-runner/test-expand-math-approx.mlir
@@ -683,6 +683,24 @@ func.func @cosh() {
  return
 }
 
+// -------------------------------------------------------------------------- //
+// Tanh.
+// -------------------------------------------------------------------------- //
+
+func.func @tanh_8xf32(%a : vector<8xf32>) {
+  %r = math.tanh %a : vector<8xf32>
+  vector.print %r : vector<8xf32>
+  return
+}
+
+func.func @tanh() {
+  // CHECK: -1, -0.761594, -0.291313, 0, 0.291313, 0.761594, 1, 1
+  %v3 = arith.constant dense<[0xff800000, -1.0, -0.3, 0.0, 0.3, 1.0, 10.0, 0x7f800000]> : vector<8xf32>
+  call @tanh_8xf32(%v3) : (vector<8xf32>) -> ()
+
+ return
+}
+
 func.func @main() {
   call @exp2f() : () -> ()
   call @roundf() : () -> ()
@@ -690,5 +708,6 @@ func.func @main() {
   call @roundeven() : () -> ()
   call @sinh() : () -> ()
   call @cosh() : () -> ()
+  call @tanh() : () -> ()
   return
 }
diff --git a/mlir/test/mlir-opt/split-markers.mlir b/mlir/test/mlir-opt/split-markers.mlir
index 8011f5202b3f79..665a37f3177051 100644
--- a/mlir/test/mlir-opt/split-markers.mlir
+++ b/mlir/test/mlir-opt/split-markers.mlir
@@ -6,7 +6,7 @@
 // Check that (1) custom input splitter and (2) custom output splitters work.
 // RUN: mlir-opt %s -split-input-file="// CHECK: ""----" \
 // RUN:   -output-split-marker="// ---- next split ----" \
-// RUN: | FileCheck -input-file %s -check-prefix=CHECK-SPLITTERS %s
+// RUN: | FileCheck --check-prefix=CHECK-SPLITTERS %s
 
 func.func @main() {return}
 
diff --git a/mlir/tools/mlir-pdll/mlir-pdll.cpp b/mlir/tools/mlir-pdll/mlir-pdll.cpp
index ce273227c31906..704f19a4b2f555 100644
--- a/mlir/tools/mlir-pdll/mlir-pdll.cpp
+++ b/mlir/tools/mlir-pdll/mlir-pdll.cpp
@@ -143,7 +143,7 @@ int main(int argc, char **argv) {
       llvm::cl::desc(
           "Print out the parsed ODS information from the input file"),
       llvm::cl::init(false));
-  llvm::cl::opt<std::string> inputSplitMarker(
+  llvm::cl::opt<std::string> inputSplitMarker{
       "split-input-file", llvm::cl::ValueOptional,
       llvm::cl::callback([&](const std::string &str) {
         // Implicit value: use default marker if flag was used without value.
@@ -152,7 +152,7 @@ int main(int argc, char **argv) {
       }),
       llvm::cl::desc("Split the input file into chunks using the given or "
                      "default marker and process each chunk independently"),
-      llvm::cl::init(""));
+      llvm::cl::init("")};
   llvm::cl::opt<std::string> outputSplitMarker(
       "output-split-marker",
       llvm::cl::desc("Split marker to use for merging the ouput"),
diff --git a/mlir/tools/mlir-tblgen/OpInterfacesGen.cpp b/mlir/tools/mlir-tblgen/OpInterfacesGen.cpp
index 9672a02cc08f68..2a7406f42f34b5 100644
--- a/mlir/tools/mlir-tblgen/OpInterfacesGen.cpp
+++ b/mlir/tools/mlir-tblgen/OpInterfacesGen.cpp
@@ -533,7 +533,7 @@ void InterfaceGenerator::emitInterfaceDecl(const Interface &interface) {
      << "struct " << interfaceTraitsName << " {\n";
   emitConceptDecl(interface);
   emitModelDecl(interface);
-  os << "};";
+  os << "};\n";
 
   // Emit the derived trait for the interface.
   os << "template <typename " << valueTemplate << ">\n";
diff --git a/mlir/unittests/IR/AffineExprTest.cpp b/mlir/unittests/IR/AffineExprTest.cpp
new file mode 100644
index 00000000000000..ff154eb29807c3
--- /dev/null
+++ b/mlir/unittests/IR/AffineExprTest.cpp
@@ -0,0 +1,32 @@
+//===- AffineExprTest.cpp - unit tests for affine expression API ----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/IR/AffineExpr.h"
+#include "mlir/IR/Builders.h"
+#include "gtest/gtest.h"
+
+using namespace mlir;
+
+// Test creating AffineExprs using the overloaded binary operators.
+TEST(AffineExprTest, constructFromBinaryOperators) {
+  MLIRContext ctx;
+  OpBuilder b(&ctx);
+
+  auto d0 = b.getAffineDimExpr(0);
+  auto d1 = b.getAffineDimExpr(1);
+
+  auto sum = d0 + d1;
+  auto difference = d0 - d1;
+  auto product = d0 * d1;
+  auto remainder = d0 % d1;
+
+  ASSERT_EQ(sum.getKind(), AffineExprKind::Add);
+  ASSERT_EQ(difference.getKind(), AffineExprKind::Add);
+  ASSERT_EQ(product.getKind(), AffineExprKind::Mul);
+  ASSERT_EQ(remainder.getKind(), AffineExprKind::Mod);
+}
diff --git a/mlir/unittests/IR/CMakeLists.txt b/mlir/unittests/IR/CMakeLists.txt
index e7e9c3b5651693..71f8f449756ec0 100644
--- a/mlir/unittests/IR/CMakeLists.txt
+++ b/mlir/unittests/IR/CMakeLists.txt
@@ -1,5 +1,6 @@
 add_mlir_unittest(MLIRIRTests
   AdaptorTest.cpp
+  AffineExprTest.cpp
   AffineMapTest.cpp
   AttributeTest.cpp
   DialectTest.cpp
diff --git a/openmp/docs/SupportAndFAQ.rst b/openmp/docs/SupportAndFAQ.rst
index afc532cd73eef4..9e6974dfbb13d2 100644
--- a/openmp/docs/SupportAndFAQ.rst
+++ b/openmp/docs/SupportAndFAQ.rst
@@ -279,11 +279,12 @@ Q: How to build an OpenMP offload capable compiler with an outdated host compile
 
 Enabling the OpenMP runtime will perform a two-stage build for you.
 If your host compiler is different from your system-wide compiler, you may need
-to set the CMake variable `GCC_INSTALL_PREFIX` so clang will be able to find the
-correct GCC toolchain in the second stage of the build.
+to set ``CMAKE_{C,CXX}_FLAGS`` like
+``--gcc-install-dir=/usr/lib/gcc/x86_64-linux-gnu/12`` so that clang will be
+able to find the correct GCC toolchain in the second stage of the build.
 
 For example, if your system-wide GCC installation is too old to build LLVM and
-you would like to use a newer GCC, set the CMake variable `GCC_INSTALL_PREFIX`
+you would like to use a newer GCC, set ``--gcc-install-dir=``
 to inform clang of the GCC installation you would like to use in the second stage.
 
 Q: How can I include OpenMP offloading support in my CMake project?
diff --git a/openmp/libomptarget/CMakeLists.txt b/openmp/libomptarget/CMakeLists.txt
index a74eff0c0bebf3..b382137b70ee2a 100644
--- a/openmp/libomptarget/CMakeLists.txt
+++ b/openmp/libomptarget/CMakeLists.txt
@@ -56,6 +56,8 @@ set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} x86_64-pc-linux-gnu-L
 set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} nvptx64-nvidia-cuda")
 set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} nvptx64-nvidia-cuda-LTO")
 set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} nvptx64-nvidia-cuda-JIT-LTO")
+set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} s390x-ibm-linux-gnu")
+set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} s390x-ibm-linux-gnu-LTO")
 
 # Once the plugins for the different targets are validated, they will be added to
 # the list of supported targets in the current system.
diff --git a/openmp/libomptarget/include/OpenMP/Mapping.h b/openmp/libomptarget/include/OpenMP/Mapping.h
index 4bd676fc658a7d..b9f5c165829314 100644
--- a/openmp/libomptarget/include/OpenMP/Mapping.h
+++ b/openmp/libomptarget/include/OpenMP/Mapping.h
@@ -424,7 +424,8 @@ typedef int (*TargetDataFuncPtrTy)(ident_t *, DeviceTy &, int32_t, void **,
                                    map_var_info_t *, void **, AsyncInfoTy &,
                                    bool);
 
-void dumpTargetPointerMappings(const ident_t *Loc, DeviceTy &Device);
+void dumpTargetPointerMappings(const ident_t *Loc, DeviceTy &Device,
+                               bool toStdOut = false);
 
 int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
                     void **ArgsBase, void **Args, int64_t *ArgSizes,
diff --git a/openmp/libomptarget/include/Shared/Debug.h b/openmp/libomptarget/include/Shared/Debug.h
index a39626d15386b0..7c3db8dbf119f6 100644
--- a/openmp/libomptarget/include/Shared/Debug.h
+++ b/openmp/libomptarget/include/Shared/Debug.h
@@ -136,10 +136,12 @@ inline uint32_t getDebugLevel() {
   } while (0)
 
 /// Print a generic information string used if LIBOMPTARGET_INFO=1
-#define INFO_MESSAGE(_num, ...)                                                \
+#define INFO_MESSAGE(_num, ...) INFO_MESSAGE_TO(stderr, _num, __VA_ARGS__)
+
+#define INFO_MESSAGE_TO(_stdDst, _num, ...)                                    \
   do {                                                                         \
-    fprintf(stderr, GETNAME(TARGET_NAME) " device %d info: ", (int)_num);      \
-    fprintf(stderr, __VA_ARGS__);                                              \
+    fprintf(_stdDst, GETNAME(TARGET_NAME) " device %d info: ", (int)_num);     \
+    fprintf(_stdDst, __VA_ARGS__);                                             \
   } while (0)
 
 // Debugging messages
@@ -187,4 +189,13 @@ inline uint32_t getDebugLevel() {
     }                                                                          \
   } while (false)
 
+#define DUMP_INFO(toStdOut, _flags, _id, ...)                                  \
+  do {                                                                         \
+    if (toStdOut) {                                                            \
+      INFO_MESSAGE_TO(stdout, _id, __VA_ARGS__);                               \
+    } else {                                                                   \
+      INFO(_flags, _id, __VA_ARGS__);                                          \
+    }                                                                          \
+  } while (false)
+
 #endif // OMPTARGET_SHARED_DEBUG_H
diff --git a/openmp/libomptarget/include/omptarget.h b/openmp/libomptarget/include/omptarget.h
index 8e0ccf191839da..323dee41630f2f 100644
--- a/openmp/libomptarget/include/omptarget.h
+++ b/openmp/libomptarget/include/omptarget.h
@@ -273,6 +273,7 @@ struct __tgt_target_non_contig {
 extern "C" {
 #endif
 
+void ompx_dump_mapping_tables(void);
 int omp_get_num_devices(void);
 int omp_get_device_num(void);
 int omp_get_initial_device(void);
diff --git a/openmp/libomptarget/plugins-nextgen/CMakeLists.txt b/openmp/libomptarget/plugins-nextgen/CMakeLists.txt
index 3ca02368253ef8..b6fc136e8a1798 100644
--- a/openmp/libomptarget/plugins-nextgen/CMakeLists.txt
+++ b/openmp/libomptarget/plugins-nextgen/CMakeLists.txt
@@ -12,12 +12,15 @@
 
 add_subdirectory(common)
 
-# void build_generic_elf64(string tmachine, string tmachine_name, string tmachine_libname, string elf_machine_id);
+# void build_generic_elf64(string tmachine, string tmachine_name, string tmachine_libname,
+#                          string tmachine_llvm, string tmachine_triple, string elf_machine_id);
 # - build a plugin for an ELF based generic 64-bit target based on libffi.
 # - tmachine: name of the machine processor as used in the cmake build system.
 # - tmachine_name: name of the machine to be printed with the debug messages.
 # - tmachine_libname: machine name to be appended to the plugin library name.
-macro(build_generic_elf64 tmachine tmachine_name tmachine_libname tmachine_triple elf_machine_id)
+# - tmachine_llvm: LLVM triple for the processor
+# - tmachine_triple: GNU target triple
+macro(build_generic_elf64 tmachine tmachine_name tmachine_libname tmachine_llvm tmachine_triple elf_machine_id)
 if(CMAKE_SYSTEM_PROCESSOR MATCHES "${tmachine}$")
   # Define macro to be used as prefix of the runtime messages for this target.
   add_definitions("-DTARGET_NAME=${tmachine_name}")
@@ -30,7 +33,7 @@ if(CMAKE_SYSTEM_PROCESSOR MATCHES "${tmachine}$")
   add_definitions("-DTARGET_ELF_ID=${elf_machine_id}")
 
   # Define target triple
-  add_definitions("-DLIBOMPTARGET_NEXTGEN_GENERIC_PLUGIN_TRIPLE=${tmachine}")
+  add_definitions("-DLIBOMPTARGET_NEXTGEN_GENERIC_PLUGIN_TRIPLE=${tmachine_llvm}")
 
   add_llvm_library("omptarget.rtl.${tmachine_libname}"
     SHARED
diff --git a/openmp/libomptarget/plugins-nextgen/aarch64/CMakeLists.txt b/openmp/libomptarget/plugins-nextgen/aarch64/CMakeLists.txt
index 2c2b753590e205..663ab4d60ff912 100644
--- a/openmp/libomptarget/plugins-nextgen/aarch64/CMakeLists.txt
+++ b/openmp/libomptarget/plugins-nextgen/aarch64/CMakeLists.txt
@@ -11,7 +11,7 @@
 ##===----------------------------------------------------------------------===##
 
 if(CMAKE_SYSTEM_NAME MATCHES "Linux")
-  build_generic_elf64("aarch64" "aarch64" "aarch64" "aarch64-unknown-linux-gnu" "183")
+  build_generic_elf64("aarch64" "aarch64" "aarch64" "aarch64" "aarch64-unknown-linux-gnu" "183")
 else()
  libomptarget_say("Not building aarch64 NextGen offloading plugin: machine not found in the system.")
 endif()
diff --git a/openmp/libomptarget/plugins-nextgen/common/include/GlobalHandler.h b/openmp/libomptarget/plugins-nextgen/common/include/GlobalHandler.h
index 5c767995126b77..829b4b72911935 100644
--- a/openmp/libomptarget/plugins-nextgen/common/include/GlobalHandler.h
+++ b/openmp/libomptarget/plugins-nextgen/common/include/GlobalHandler.h
@@ -104,7 +104,7 @@ class GenericGlobalHandlerTy {
   virtual ~GenericGlobalHandlerTy() {}
 
   /// Helper function for getting an ELF from a device image.
-  Expected<ELF64LEObjectFile> getELFObjectFile(DeviceImageTy &Image);
+  Expected<std::unique_ptr<ObjectFile>> getELFObjectFile(DeviceImageTy &Image);
 
   /// Returns whether the symbol named \p SymName is present in the given \p
   /// Image.
diff --git a/openmp/libomptarget/plugins-nextgen/common/include/PluginInterface.h b/openmp/libomptarget/plugins-nextgen/common/include/PluginInterface.h
index 9645ab2a672625..b7be7b645ba33e 100644
--- a/openmp/libomptarget/plugins-nextgen/common/include/PluginInterface.h
+++ b/openmp/libomptarget/plugins-nextgen/common/include/PluginInterface.h
@@ -289,7 +289,7 @@ struct GenericKernelTy {
 
   /// Return a device pointer to a new kernel launch environment.
   Expected<KernelLaunchEnvironmentTy *>
-  getKernelLaunchEnvironment(GenericDeviceTy &GenericDevice,
+  getKernelLaunchEnvironment(GenericDeviceTy &GenericDevice, uint32_t Version,
                              AsyncInfoWrapperTy &AsyncInfo) const;
 
   /// Indicate whether an execution mode is valid.
@@ -1393,7 +1393,7 @@ template <typename ResourceRef> class GenericDeviceResourceManagerTy {
 };
 
 /// A static check on whether or not we support RPC in libomptarget.
-const bool libomptargetSupportsRPC();
+bool libomptargetSupportsRPC();
 
 } // namespace plugin
 } // namespace target
diff --git a/openmp/libomptarget/plugins-nextgen/common/include/Utils/ELF.h b/openmp/libomptarget/plugins-nextgen/common/include/Utils/ELF.h
index 140a6b6b84aa12..88c83d39b68ceb 100644
--- a/openmp/libomptarget/plugins-nextgen/common/include/Utils/ELF.h
+++ b/openmp/libomptarget/plugins-nextgen/common/include/Utils/ELF.h
@@ -28,17 +28,15 @@ bool isELF(llvm::StringRef Buffer);
 llvm::Expected<bool> checkMachine(llvm::StringRef Object, uint16_t EMachine);
 
 /// Returns a pointer to the given \p Symbol inside of an ELF object.
-llvm::Expected<const void *> getSymbolAddress(
-    const llvm::object::ELFObjectFile<llvm::object::ELF64LE> &ELFObj,
-    const llvm::object::ELF64LE::Sym &Symbol);
+llvm::Expected<const void *>
+getSymbolAddress(const llvm::object::ELFSymbolRef &Symbol);
 
 /// Returns the symbol associated with the \p Name in the \p ELFObj. It will
 /// first search for the hash sections to identify symbols from the hash table.
 /// If that fails it will fall back to a linear search in the case of an
 /// executable file without a hash table.
-llvm::Expected<const typename llvm::object::ELF64LE::Sym *>
-getSymbol(const llvm::object::ELFObjectFile<llvm::object::ELF64LE> &ELFObj,
-          llvm::StringRef Name);
+llvm::Expected<std::optional<llvm::object::ELFSymbolRef>>
+getSymbol(const llvm::object::ObjectFile &ELFObj, llvm::StringRef Name);
 
 } // namespace elf
 } // namespace utils
diff --git a/openmp/libomptarget/plugins-nextgen/common/src/GlobalHandler.cpp b/openmp/libomptarget/plugins-nextgen/common/src/GlobalHandler.cpp
index d398f60c55bd13..ba0aa47f8e51c3 100644
--- a/openmp/libomptarget/plugins-nextgen/common/src/GlobalHandler.cpp
+++ b/openmp/libomptarget/plugins-nextgen/common/src/GlobalHandler.cpp
@@ -25,14 +25,12 @@ using namespace omp;
 using namespace target;
 using namespace plugin;
 
-Expected<ELF64LEObjectFile>
+Expected<std::unique_ptr<ObjectFile>>
 GenericGlobalHandlerTy::getELFObjectFile(DeviceImageTy &Image) {
   assert(utils::elf::isELF(Image.getMemoryBuffer().getBuffer()) &&
          "Input is not an ELF file");
 
-  Expected<ELF64LEObjectFile> ElfOrErr =
-      ELF64LEObjectFile::create(Image.getMemoryBuffer());
-  return ElfOrErr;
+  return ELFObjectFileBase::createELFObjectFile(Image.getMemoryBuffer());
 }
 
 Error GenericGlobalHandlerTy::moveGlobalBetweenDeviceAndHost(
@@ -91,13 +89,13 @@ bool GenericGlobalHandlerTy::isSymbolInImage(GenericDeviceTy &Device,
   }
 
   // Search the ELF symbol using the symbol name.
-  auto SymOrErr = utils::elf::getSymbol(*ELFObjOrErr, SymName);
+  auto SymOrErr = utils::elf::getSymbol(**ELFObjOrErr, SymName);
   if (!SymOrErr) {
     consumeError(SymOrErr.takeError());
     return false;
   }
 
-  return *SymOrErr;
+  return SymOrErr->has_value();
 }
 
 Error GenericGlobalHandlerTy::getGlobalMetadataFromImage(
@@ -110,17 +108,17 @@ Error GenericGlobalHandlerTy::getGlobalMetadataFromImage(
     return ELFObj.takeError();
 
   // Search the ELF symbol using the symbol name.
-  auto SymOrErr = utils::elf::getSymbol(*ELFObj, ImageGlobal.getName());
+  auto SymOrErr = utils::elf::getSymbol(**ELFObj, ImageGlobal.getName());
   if (!SymOrErr)
     return Plugin::error("Failed ELF lookup of global '%s': %s",
                          ImageGlobal.getName().data(),
                          toString(SymOrErr.takeError()).data());
 
-  if (!*SymOrErr)
+  if (!SymOrErr->has_value())
     return Plugin::error("Failed to find global symbol '%s' in the ELF image",
                          ImageGlobal.getName().data());
 
-  auto AddrOrErr = utils::elf::getSymbolAddress(*ELFObj, **SymOrErr);
+  auto AddrOrErr = utils::elf::getSymbolAddress(**SymOrErr);
   // Get the section to which the symbol belongs.
   if (!AddrOrErr)
     return Plugin::error("Failed to get ELF symbol from global '%s': %s",
@@ -129,7 +127,7 @@ Error GenericGlobalHandlerTy::getGlobalMetadataFromImage(
 
   // Setup the global symbol's address and size.
   ImageGlobal.setPtr(const_cast<void *>(*AddrOrErr));
-  ImageGlobal.setSize((*SymOrErr)->st_size);
+  ImageGlobal.setSize((*SymOrErr)->getSize());
 
   return Plugin::success();
 }
diff --git a/openmp/libomptarget/plugins-nextgen/common/src/PluginInterface.cpp b/openmp/libomptarget/plugins-nextgen/common/src/PluginInterface.cpp
index 8c8c52de0c210c..f39f913d85eec2 100644
--- a/openmp/libomptarget/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/openmp/libomptarget/plugins-nextgen/common/src/PluginInterface.cpp
@@ -468,11 +468,13 @@ Error GenericKernelTy::init(GenericDeviceTy &GenericDevice,
 
 Expected<KernelLaunchEnvironmentTy *>
 GenericKernelTy::getKernelLaunchEnvironment(
-    GenericDeviceTy &GenericDevice,
+    GenericDeviceTy &GenericDevice, uint32_t Version,
     AsyncInfoWrapperTy &AsyncInfoWrapper) const {
   // Ctor/Dtor have no arguments, replaying uses the original kernel launch
-  // environment.
-  if (isCtorOrDtor() || RecordReplay.isReplaying())
+  // environment. Older versions of the compiler do not generate a kernel
+  // launch environment.
+  if (isCtorOrDtor() || RecordReplay.isReplaying() ||
+      Version < OMP_KERNEL_ARG_MIN_VERSION_WITH_DYN_PTR)
     return nullptr;
 
   if (!KernelEnvironment.Configuration.ReductionDataSize ||
@@ -544,8 +546,8 @@ Error GenericKernelTy::launch(GenericDeviceTy &GenericDevice, void **ArgPtrs,
   llvm::SmallVector<void *, 16> Args;
   llvm::SmallVector<void *, 16> Ptrs;
 
-  auto KernelLaunchEnvOrErr =
-      getKernelLaunchEnvironment(GenericDevice, AsyncInfoWrapper);
+  auto KernelLaunchEnvOrErr = getKernelLaunchEnvironment(
+      GenericDevice, KernelArgs.Version, AsyncInfoWrapper);
   if (!KernelLaunchEnvOrErr)
     return KernelLaunchEnvOrErr.takeError();
 
@@ -586,6 +588,9 @@ void *GenericKernelTy::prepareArgs(
   uint32_t KLEOffset = !!KernelLaunchEnvironment;
   NumArgs += KLEOffset;
 
+  if (NumArgs == 0)
+    return nullptr;
+
   Args.resize(NumArgs);
   Ptrs.resize(NumArgs);
 
@@ -1560,7 +1565,7 @@ Expected<bool> GenericPluginTy::checkELFImage(StringRef Image) const {
   return isELFCompatible(Image);
 }
 
-const bool llvm::omp::target::plugin::libomptargetSupportsRPC() {
+bool llvm::omp::target::plugin::libomptargetSupportsRPC() {
 #ifdef LIBOMPTARGET_RPC_SUPPORT
   return true;
 #else
diff --git a/openmp/libomptarget/plugins-nextgen/common/src/Utils/ELF.cpp b/openmp/libomptarget/plugins-nextgen/common/src/Utils/ELF.cpp
index c84c3bad5def0a..2ae97f0f258925 100644
--- a/openmp/libomptarget/plugins-nextgen/common/src/Utils/ELF.cpp
+++ b/openmp/libomptarget/plugins-nextgen/common/src/Utils/ELF.cpp
@@ -36,18 +36,10 @@ bool utils::elf::isELF(StringRef Buffer) {
   }
 }
 
-Expected<bool> utils::elf::checkMachine(StringRef Object, uint16_t EMachine) {
-  assert(isELF(Object) && "Input is not an ELF!");
-
-  Expected<ELF64LEObjectFile> ElfOrErr =
-      ELF64LEObjectFile::create(MemoryBufferRef(Object, /*Identifier=*/""),
-                                /*InitContent=*/false);
-  if (!ElfOrErr)
-    return ElfOrErr.takeError();
-
-  const auto Header = ElfOrErr->getELFFile().getHeader();
-  if (Header.e_ident[EI_CLASS] != ELFCLASS64)
-    return createError("Only 64-bit ELF files are supported");
+template <class ELFT>
+static Expected<bool>
+checkMachineImpl(const object::ELFObjectFile<ELFT> &ELFObj, uint16_t EMachine) {
+  const auto Header = ELFObj.getELFFile().getHeader();
   if (Header.e_type != ET_EXEC && Header.e_type != ET_DYN)
     return createError("Only executable ELF files are supported");
 
@@ -71,6 +63,25 @@ Expected<bool> utils::elf::checkMachine(StringRef Object, uint16_t EMachine) {
   return Header.e_machine == EMachine;
 }
 
+Expected<bool> utils::elf::checkMachine(StringRef Object, uint16_t EMachine) {
+  assert(isELF(Object) && "Input is not an ELF!");
+
+  Expected<std::unique_ptr<ObjectFile>> ElfOrErr =
+      ObjectFile::createELFObjectFile(
+          MemoryBufferRef(Object, /*Identifier=*/""),
+          /*InitContent=*/false);
+  if (!ElfOrErr)
+    return ElfOrErr.takeError();
+
+  if (const ELF64LEObjectFile *ELFObj =
+          dyn_cast<ELF64LEObjectFile>(&**ElfOrErr))
+    return checkMachineImpl(*ELFObj, EMachine);
+  if (const ELF64BEObjectFile *ELFObj =
+          dyn_cast<ELF64BEObjectFile>(&**ElfOrErr))
+    return checkMachineImpl(*ELFObj, EMachine);
+  return createError("Only 64-bit ELF files are supported");
+}
+
 template <class ELFT>
 static Expected<const typename ELFT::Sym *>
 getSymbolFromGnuHashTable(StringRef Name, const typename ELFT::GnuHash &HashTab,
@@ -138,9 +149,10 @@ getSymbolFromSysVHashTable(StringRef Name, const typename ELFT::Hash &HashTab,
 }
 
 template <class ELFT>
-static Expected<const typename ELFT::Sym *>
-getHashTableSymbol(const ELFFile<ELFT> &Elf, const typename ELFT::Shdr &Sec,
-                   StringRef Name) {
+static Expected<std::optional<ELFSymbolRef>>
+getHashTableSymbol(const ELFObjectFile<ELFT> &ELFObj,
+                   const typename ELFT::Shdr &Sec, StringRef Name) {
+  const ELFFile<ELFT> &Elf = ELFObj.getELFFile();
   if (Sec.sh_type != ELF::SHT_HASH && Sec.sh_type != ELF::SHT_GNU_HASH)
     return createError(
         "invalid sh_type for hash table, expected SHT_HASH or SHT_GNU_HASH");
@@ -179,7 +191,12 @@ getHashTableSymbol(const ELFFile<ELFT> &Elf, const typename ELFT::Shdr &Sec,
                 sizeof(typename ELFT::Word) * HashTab->nbuckets +
                 sizeof(typename ELFT::Word) * (SymTab.size() - HashTab->symndx))
       return createError("section has invalid sh_size: " + Twine(Sec.sh_size));
-    return getSymbolFromGnuHashTable<ELFT>(Name, *HashTab, SymTab, StrTab);
+    auto Sym = getSymbolFromGnuHashTable<ELFT>(Name, *HashTab, SymTab, StrTab);
+    if (!Sym)
+      return Sym.takeError();
+    if (!*Sym)
+      return std::nullopt;
+    return ELFObj.toSymbolRef(*SymTabOrErr, *Sym - &SymTab[0]);
   }
 
   // If this is a Sys-V hash table we verify its size and search the symbol
@@ -197,16 +214,22 @@ getHashTableSymbol(const ELFFile<ELFT> &Elf, const typename ELFT::Shdr &Sec,
                           sizeof(typename ELFT::Word) * HashTab->nchain)
       return createError("section has invalid sh_size: " + Twine(Sec.sh_size));
 
-    return getSymbolFromSysVHashTable<ELFT>(Name, *HashTab, SymTab, StrTab);
+    auto Sym = getSymbolFromSysVHashTable<ELFT>(Name, *HashTab, SymTab, StrTab);
+    if (!Sym)
+      return Sym.takeError();
+    if (!*Sym)
+      return std::nullopt;
+    return ELFObj.toSymbolRef(*SymTabOrErr, *Sym - &SymTab[0]);
   }
 
-  return nullptr;
+  return std::nullopt;
 }
 
 template <class ELFT>
-static Expected<const typename ELFT::Sym *>
-getSymTableSymbol(const ELFFile<ELFT> &Elf, const typename ELFT::Shdr &Sec,
-                  StringRef Name) {
+static Expected<std::optional<ELFSymbolRef>>
+getSymTableSymbol(const ELFObjectFile<ELFT> &ELFObj,
+                  const typename ELFT::Shdr &Sec, StringRef Name) {
+  const ELFFile<ELFT> &Elf = ELFObj.getELFFile();
   if (Sec.sh_type != ELF::SHT_SYMTAB && Sec.sh_type != ELF::SHT_DYNSYM)
     return createError(
         "invalid sh_type for hash table, expected SHT_SYMTAB or SHT_DYNSYM");
@@ -226,13 +249,14 @@ getSymTableSymbol(const ELFFile<ELFT> &Elf, const typename ELFT::Shdr &Sec,
 
   for (const typename ELFT::Sym &Sym : SymTab)
     if (StrTab.drop_front(Sym.st_name).data() == Name)
-      return &Sym;
+      return ELFObj.toSymbolRef(&Sec, &Sym - &SymTab[0]);
 
-  return nullptr;
+  return std::nullopt;
 }
 
-Expected<const typename ELF64LE::Sym *>
-utils::elf::getSymbol(const ELFObjectFile<ELF64LE> &ELFObj, StringRef Name) {
+template <class ELFT>
+static Expected<std::optional<ELFSymbolRef>>
+getSymbolImpl(const ELFObjectFile<ELFT> &ELFObj, StringRef Name) {
   // First try to look up the symbol via the hash table.
   for (ELFSectionRef Sec : ELFObj.sections()) {
     if (Sec.getType() != SHT_HASH && Sec.getType() != SHT_GNU_HASH)
@@ -241,8 +265,7 @@ utils::elf::getSymbol(const ELFObjectFile<ELF64LE> &ELFObj, StringRef Name) {
     auto HashTabOrErr = ELFObj.getELFFile().getSection(Sec.getIndex());
     if (!HashTabOrErr)
       return HashTabOrErr.takeError();
-    return getHashTableSymbol<ELF64LE>(ELFObj.getELFFile(), **HashTabOrErr,
-                                       Name);
+    return getHashTableSymbol<ELFT>(ELFObj, **HashTabOrErr, Name);
   }
 
   // If this is an executable file check the entire standard symbol table.
@@ -253,16 +276,31 @@ utils::elf::getSymbol(const ELFObjectFile<ELF64LE> &ELFObj, StringRef Name) {
     auto SymTabOrErr = ELFObj.getELFFile().getSection(Sec.getIndex());
     if (!SymTabOrErr)
       return SymTabOrErr.takeError();
-    return getSymTableSymbol<ELF64LE>(ELFObj.getELFFile(), **SymTabOrErr, Name);
+    return getSymTableSymbol<ELFT>(ELFObj, **SymTabOrErr, Name);
   }
 
-  return nullptr;
+  return std::nullopt;
 }
 
-Expected<const void *> utils::elf::getSymbolAddress(
-    const object::ELFObjectFile<object::ELF64LE> &ELFObj,
-    const object::ELF64LE::Sym &Symbol) {
-  const ELFFile<ELF64LE> &ELFFile = ELFObj.getELFFile();
+Expected<std::optional<ELFSymbolRef>>
+utils::elf::getSymbol(const ObjectFile &Obj, StringRef Name) {
+  if (const ELF64LEObjectFile *ELFObj = dyn_cast<ELF64LEObjectFile>(&Obj))
+    return getSymbolImpl(*ELFObj, Name);
+  if (const ELF64BEObjectFile *ELFObj = dyn_cast<ELF64BEObjectFile>(&Obj))
+    return getSymbolImpl(*ELFObj, Name);
+  return createError("Only 64-bit ELF files are supported");
+}
+
+template <class ELFT>
+static Expected<const void *>
+getSymbolAddressImpl(const ELFObjectFile<ELFT> &ELFObj,
+                     const ELFSymbolRef &SymRef) {
+  const ELFFile<ELFT> &ELFFile = ELFObj.getELFFile();
+
+  auto SymOrErr = ELFObj.getSymbol(SymRef.getRawDataRefImpl());
+  if (!SymOrErr)
+    return SymOrErr.takeError();
+  const auto &Symbol = **SymOrErr;
 
   auto SecOrErr = ELFFile.getSection(Symbol.st_shndx);
   if (!SecOrErr)
@@ -283,3 +321,13 @@ Expected<const void *> utils::elf::getSymbolAddress(
 
   return ELFFile.base() + Offset;
 }
+
+Expected<const void *>
+utils::elf::getSymbolAddress(const ELFSymbolRef &SymRef) {
+  const ObjectFile *Obj = SymRef.getObject();
+  if (const ELF64LEObjectFile *ELFObj = dyn_cast<ELF64LEObjectFile>(Obj))
+    return getSymbolAddressImpl(*ELFObj, SymRef);
+  if (const ELF64BEObjectFile *ELFObj = dyn_cast<ELF64BEObjectFile>(Obj))
+    return getSymbolAddressImpl(*ELFObj, SymRef);
+  return createError("Only 64-bit ELF files are supported");
+}
diff --git a/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
index f85a00cd1cd530..b862bc74909257 100644
--- a/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
+++ b/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
@@ -1166,7 +1166,7 @@ struct CUDADeviceTy : public GenericDeviceTy {
 
     // Search for all symbols that contain a constructor or destructor.
     SmallVector<std::pair<StringRef, uint16_t>> Funcs;
-    for (ELFSymbolRef Sym : ELFObjOrErr->symbols()) {
+    for (ELFSymbolRef Sym : (*ELFObjOrErr)->symbols()) {
       auto NameOrErr = Sym.getName();
       if (!NameOrErr)
         return NameOrErr.takeError();
diff --git a/openmp/libomptarget/plugins-nextgen/ppc64/CMakeLists.txt b/openmp/libomptarget/plugins-nextgen/ppc64/CMakeLists.txt
index 0cccc9cb82e42c..77466c111ee0aa 100644
--- a/openmp/libomptarget/plugins-nextgen/ppc64/CMakeLists.txt
+++ b/openmp/libomptarget/plugins-nextgen/ppc64/CMakeLists.txt
@@ -11,7 +11,7 @@
 ##===----------------------------------------------------------------------===##
 
 if(CMAKE_SYSTEM_NAME MATCHES "Linux")
-  build_generic_elf64("ppc64" "PPC64" "ppc64" "powerpc64-ibm-linux-gnu" "21")
+  build_generic_elf64("ppc64" "PPC64" "ppc64" "ppc64" "powerpc64-ibm-linux-gnu" "21")
 else()
  libomptarget_say("Not building ppc64 NextGen offloading plugin: machine not found in the system.")
 endif()
diff --git a/openmp/libomptarget/plugins-nextgen/ppc64le/CMakeLists.txt b/openmp/libomptarget/plugins-nextgen/ppc64le/CMakeLists.txt
index 9461d79d145dfe..91d21627a3272f 100644
--- a/openmp/libomptarget/plugins-nextgen/ppc64le/CMakeLists.txt
+++ b/openmp/libomptarget/plugins-nextgen/ppc64le/CMakeLists.txt
@@ -11,7 +11,7 @@
 ##===----------------------------------------------------------------------===##
 
 if(CMAKE_SYSTEM_NAME MATCHES "Linux")
-  build_generic_elf64("ppc64le" "PPC64le" "ppc64" "powerpc64le-ibm-linux-gnu" "21")
+  build_generic_elf64("ppc64le" "PPC64le" "ppc64" "ppc64le" "powerpc64le-ibm-linux-gnu" "21")
 else()
  libomptarget_say("Not building ppc64le NextGen offloading plugin: machine not found in the system.")
 endif()
diff --git a/openmp/libomptarget/plugins-nextgen/s390x/CMakeLists.txt b/openmp/libomptarget/plugins-nextgen/s390x/CMakeLists.txt
index 1b12a292899980..0388a235d28998 100644
--- a/openmp/libomptarget/plugins-nextgen/s390x/CMakeLists.txt
+++ b/openmp/libomptarget/plugins-nextgen/s390x/CMakeLists.txt
@@ -11,7 +11,7 @@
 ##===----------------------------------------------------------------------===##
 
 if(CMAKE_SYSTEM_NAME MATCHES "Linux")
- build_generic_elf64("SystemZ" "S390X" "s390x" "s390x-ibm-linux-gnu" "22")
+ build_generic_elf64("s390x" "S390X" "s390x" "systemz" "s390x-ibm-linux-gnu" "22")
 else()
  libomptarget_say("Not building s390x NextGen offloading plugin: machine not found in the system.")
 endif()
diff --git a/openmp/libomptarget/plugins-nextgen/x86_64/CMakeLists.txt b/openmp/libomptarget/plugins-nextgen/x86_64/CMakeLists.txt
index 129d526a2ae784..27cf3e069a377e 100644
--- a/openmp/libomptarget/plugins-nextgen/x86_64/CMakeLists.txt
+++ b/openmp/libomptarget/plugins-nextgen/x86_64/CMakeLists.txt
@@ -11,7 +11,7 @@
 ##===----------------------------------------------------------------------===##
 
 if(CMAKE_SYSTEM_NAME MATCHES "Linux")
-  build_generic_elf64("x86_64" "x86_64" "x86_64" "x86_64-pc-linux-gnu" "62")
+  build_generic_elf64("x86_64" "x86_64" "x86_64" "x86_64" "x86_64-pc-linux-gnu" "62")
 else()
  libomptarget_say("Not building x86_64 NextGen offloading plugin: machine not found in the system.")
 endif()
diff --git a/openmp/libomptarget/src/CMakeLists.txt b/openmp/libomptarget/src/CMakeLists.txt
index 9bc3f3339583d9..d0971bd4ef079e 100644
--- a/openmp/libomptarget/src/CMakeLists.txt
+++ b/openmp/libomptarget/src/CMakeLists.txt
@@ -73,6 +73,7 @@ if (NOT LIBOMPTARGET_PLUGINS_TO_LOAD)
 	check_plugin_target(cuda)
 	check_plugin_target(aarch64)
 	check_plugin_target(amdgpu)
+	check_plugin_target(s390x)
 endif()
 
 list(TRANSFORM LIBOMPTARGET_PLUGINS_TO_LOAD PREPEND "\"libomptarget.rtl.")
diff --git a/openmp/libomptarget/src/OpenMP/API.cpp b/openmp/libomptarget/src/OpenMP/API.cpp
index 85fb08c00a9a74..c85f9868e37c25 100644
--- a/openmp/libomptarget/src/OpenMP/API.cpp
+++ b/openmp/libomptarget/src/OpenMP/API.cpp
@@ -16,6 +16,7 @@
 #include "rtl.h"
 
 #include "OpenMP/InternalTypes.h"
+#include "OpenMP/Mapping.h"
 #include "OpenMP/OMPT/Interface.h"
 #include "OpenMP/omp.h"
 #include "Shared/Profile.h"
@@ -27,6 +28,13 @@
 #include <cstring>
 #include <mutex>
 
+EXTERN void ompx_dump_mapping_tables() {
+  ident_t Loc = {0, 0, 0, 0, ";libomptarget;libomptarget;0;0;;"};
+  auto ExclusiveDevicesAccessor = PM->getExclusiveDevicesAccessor();
+  for (auto &Device : PM->devices(ExclusiveDevicesAccessor))
+    dumpTargetPointerMappings(&Loc, Device, true);
+}
+
 #ifdef OMPT_SUPPORT
 using namespace llvm::omp::target::ompt;
 #endif
diff --git a/openmp/libomptarget/src/OpenMP/Mapping.cpp b/openmp/libomptarget/src/OpenMP/Mapping.cpp
index 9c0b219b6f15f1..c6ff3aa54dd66f 100644
--- a/openmp/libomptarget/src/OpenMP/Mapping.cpp
+++ b/openmp/libomptarget/src/OpenMP/Mapping.cpp
@@ -16,28 +16,33 @@
 #include "device.h"
 
 /// Dump a table of all the host-target pointer pairs on failure
-void dumpTargetPointerMappings(const ident_t *Loc, DeviceTy &Device) {
+void dumpTargetPointerMappings(const ident_t *Loc, DeviceTy &Device,
+                               bool toStdOut) {
   MappingInfoTy::HDTTMapAccessorTy HDTTMap =
       Device.getMappingInfo().HostDataToTargetMap.getExclusiveAccessor();
-  if (HDTTMap->empty())
+  if (HDTTMap->empty()) {
+    DUMP_INFO(toStdOut, OMP_INFOTYPE_ALL, Device.DeviceID,
+              "OpenMP Host-Device pointer mappings table empty\n");
     return;
+  }
 
   SourceInfo Kernel(Loc);
-  INFO(OMP_INFOTYPE_ALL, Device.DeviceID,
-       "OpenMP Host-Device pointer mappings after block at %s:%d:%d:\n",
-       Kernel.getFilename(), Kernel.getLine(), Kernel.getColumn());
-  INFO(OMP_INFOTYPE_ALL, Device.DeviceID, "%-18s %-18s %s %s %s %s\n",
-       "Host Ptr", "Target Ptr", "Size (B)", "DynRefCount", "HoldRefCount",
-       "Declaration");
+  DUMP_INFO(toStdOut, OMP_INFOTYPE_ALL, Device.DeviceID,
+            "OpenMP Host-Device pointer mappings after block at %s:%d:%d:\n",
+            Kernel.getFilename(), Kernel.getLine(), Kernel.getColumn());
+  DUMP_INFO(toStdOut, OMP_INFOTYPE_ALL, Device.DeviceID,
+            "%-18s %-18s %s %s %s %s\n", "Host Ptr", "Target Ptr", "Size (B)",
+            "DynRefCount", "HoldRefCount", "Declaration");
   for (const auto &It : *HDTTMap) {
     HostDataToTargetTy &HDTT = *It.HDTT;
     SourceInfo Info(HDTT.HstPtrName);
-    INFO(OMP_INFOTYPE_ALL, Device.DeviceID,
-         DPxMOD " " DPxMOD " %-8" PRIuPTR " %-11s %-12s %s at %s:%d:%d\n",
-         DPxPTR(HDTT.HstPtrBegin), DPxPTR(HDTT.TgtPtrBegin),
-         HDTT.HstPtrEnd - HDTT.HstPtrBegin, HDTT.dynRefCountToStr().c_str(),
-         HDTT.holdRefCountToStr().c_str(), Info.getName(), Info.getFilename(),
-         Info.getLine(), Info.getColumn());
+    DUMP_INFO(toStdOut, OMP_INFOTYPE_ALL, Device.DeviceID,
+              DPxMOD " " DPxMOD " %-8" PRIuPTR " %-11s %-12s %s at %s:%d:%d\n",
+              DPxPTR(HDTT.HstPtrBegin), DPxPTR(HDTT.TgtPtrBegin),
+              HDTT.HstPtrEnd - HDTT.HstPtrBegin,
+              HDTT.dynRefCountToStr().c_str(), HDTT.holdRefCountToStr().c_str(),
+              Info.getName(), Info.getFilename(), Info.getLine(),
+              Info.getColumn());
   }
 }
 
@@ -511,7 +516,8 @@ static void printCopyInfoImpl(int DeviceId, bool H2D, void *SrcPtrBegin,
        "Copying data from %s to %s, %sPtr=" DPxMOD ", %sPtr=" DPxMOD
        ", Size=%" PRId64 ", Name=%s\n",
        H2D ? "host" : "device", H2D ? "device" : "host", H2D ? "Hst" : "Tgt",
-       DPxPTR(SrcPtrBegin), H2D ? "Tgt" : "Hst", DPxPTR(DstPtrBegin), Size,
+       DPxPTR(H2D ? SrcPtrBegin : DstPtrBegin), H2D ? "Tgt" : "Hst",
+       DPxPTR(H2D ? DstPtrBegin : SrcPtrBegin), Size,
        (HT && HT->HstPtrName) ? getNameFromMapping(HT->HstPtrName).c_str()
                               : "unknown");
 }
diff --git a/openmp/libomptarget/src/exports b/openmp/libomptarget/src/exports
index d5432a9eed380d..f95544ec8329c8 100644
--- a/openmp/libomptarget/src/exports
+++ b/openmp/libomptarget/src/exports
@@ -35,6 +35,7 @@ VERS1.0 {
     __tgt_push_mapper_component;
     __kmpc_push_target_tripcount;
     __kmpc_push_target_tripcount_mapper;
+    ompx_dump_mapping_tables;
     omp_get_mapped_ptr;
     omp_get_num_devices;
     omp_get_device_num;
diff --git a/openmp/libomptarget/src/interface.cpp b/openmp/libomptarget/src/interface.cpp
index 8b89bc3ff7124d..b7f547f1ec3d5c 100644
--- a/openmp/libomptarget/src/interface.cpp
+++ b/openmp/libomptarget/src/interface.cpp
@@ -21,6 +21,8 @@
 
 #include "Utils/ExponentialBackoff.h"
 
+#include "llvm/Frontend/OpenMP/OMPConstants.h"
+
 #include <cassert>
 #include <cstdint>
 #include <cstdio>
@@ -218,11 +220,19 @@ EXTERN void __tgt_target_data_update_nowait_mapper(
 static KernelArgsTy *upgradeKernelArgs(KernelArgsTy *KernelArgs,
                                        KernelArgsTy &LocalKernelArgs,
                                        int32_t NumTeams, int32_t ThreadLimit) {
-  if (KernelArgs->Version > 2)
+  if (KernelArgs->Version > OMP_KERNEL_ARG_VERSION)
     DP("Unexpected ABI version: %u\n", KernelArgs->Version);
 
-  if (KernelArgs->Version == 1) {
-    LocalKernelArgs.Version = 2;
+  uint32_t UpgradedVersion = KernelArgs->Version;
+  if (KernelArgs->Version < OMP_KERNEL_ARG_VERSION) {
+    // The upgraded version will be based on the kernel launch environment.
+    if (KernelArgs->Version < OMP_KERNEL_ARG_MIN_VERSION_WITH_DYN_PTR)
+      UpgradedVersion = OMP_KERNEL_ARG_MIN_VERSION_WITH_DYN_PTR - 1;
+    else
+      UpgradedVersion = OMP_KERNEL_ARG_VERSION;
+  }
+  if (UpgradedVersion != KernelArgs->Version) {
+    LocalKernelArgs.Version = UpgradedVersion;
     LocalKernelArgs.NumArgs = KernelArgs->NumArgs;
     LocalKernelArgs.ArgBasePtrs = KernelArgs->ArgBasePtrs;
     LocalKernelArgs.ArgPtrs = KernelArgs->ArgPtrs;
diff --git a/openmp/libomptarget/src/omptarget.cpp b/openmp/libomptarget/src/omptarget.cpp
index 3b3e17f0f311e8..5bbf3a455c72ad 100644
--- a/openmp/libomptarget/src/omptarget.cpp
+++ b/openmp/libomptarget/src/omptarget.cpp
@@ -30,6 +30,7 @@
 
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/bit.h"
+#include "llvm/Frontend/OpenMP/OMPConstants.h"
 #include "llvm/Object/ObjectFile.h"
 
 #include <cassert>
@@ -1752,7 +1753,7 @@ int target_replay(ident_t *Loc, DeviceTy &Device, void *HostPtr,
   Device.submitData(TgtPtr, DeviceMemory, DeviceMemorySize, AsyncInfo);
 
   KernelArgsTy KernelArgs = {0};
-  KernelArgs.Version = 2;
+  KernelArgs.Version = OMP_KERNEL_ARG_VERSION;
   KernelArgs.NumArgs = NumArgs;
   KernelArgs.Tripcount = LoopTripCount;
   KernelArgs.NumTeams[0] = NumTeams;
diff --git a/openmp/libomptarget/test/api/omp_dynamic_shared_memory.c b/openmp/libomptarget/test/api/omp_dynamic_shared_memory.c
index 5095a69a375c94..3fe75f24db3e67 100644
--- a/openmp/libomptarget/test/api/omp_dynamic_shared_memory.c
+++ b/openmp/libomptarget/test/api/omp_dynamic_shared_memory.c
@@ -10,6 +10,8 @@
 // UNSUPPORTED: x86_64-pc-linux-gnu-LTO
 // UNSUPPORTED: aarch64-unknown-linux-gnu
 // UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+// UNSUPPORTED: s390x-ibm-linux-gnu
+// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
 
 #include <omp.h>
 #include <stdio.h>
diff --git a/openmp/libomptarget/test/api/ompx_dump_mapping_tables.cpp b/openmp/libomptarget/test/api/ompx_dump_mapping_tables.cpp
new file mode 100644
index 00000000000000..c170c2d7387338
--- /dev/null
+++ b/openmp/libomptarget/test/api/ompx_dump_mapping_tables.cpp
@@ -0,0 +1,35 @@
+// RUN: %libomptarget-compilexx-run-and-check-generic
+
+#include <cstdio>
+#include <omp.h>
+
+#define N 10
+
+int main() {
+  int *a = new __int32_t[N];     // mapped and released from device 0
+  int *b = new __int32_t[2 * N]; // mapped to device 0
+
+  // clang-format off
+  // CHECK: Mapping tables after target enter data:
+  // CHECK-NEXT: omptarget device 0 info: OpenMP Host-Device pointer mappings after block
+  // CHECK-NEXT: omptarget device 0 info: Host Ptr Target Ptr Size (B) DynRefCount HoldRefCount Declaration
+  // CHECK-NEXT: omptarget device 0 info: {{(0x[0-9a-f]{16})}} {{(0x[0-9a-f]{16})}} {{[48]}}0
+  // CHECK-NEXT: omptarget device 0 info: {{(0x[0-9a-f]{16})}} {{(0x[0-9a-f]{16})}} {{[48]}}0
+#pragma omp target enter data device(0) map(to : a[ : N])
+#pragma omp target enter data device(0) map(to : b[ : 2*N])
+  // clang-format on
+  printf("Mapping tables after target enter data:\n");
+  ompx_dump_mapping_tables();
+
+  // clang-format off
+  // CHECK: Mapping tables after target exit data for a:
+  // CHECK-NEXT: omptarget device 0 info: OpenMP Host-Device pointer mappings after block
+  // CHECK-NEXT: omptarget device 0 info: Host Ptr Target Ptr Size (B) DynRefCount HoldRefCount Declaration
+  // CHECK-NEXT: omptarget device 0 info: {{(0x[0-9a-f]{16})}} {{(0x[0-9a-f]{16})}} 80
+#pragma omp target exit data device(0) map(release : a[ : N])
+  // clang-format on
+  printf("\nMapping tables after target exit data for a:\n");
+  ompx_dump_mapping_tables();
+
+  return 0;
+}
diff --git a/openmp/libomptarget/test/jit/empty_kernel_lvl1.c b/openmp/libomptarget/test/jit/empty_kernel_lvl1.c
index c908c8ba06509e..a0b8cd448837da 100644
--- a/openmp/libomptarget/test/jit/empty_kernel_lvl1.c
+++ b/openmp/libomptarget/test/jit/empty_kernel_lvl1.c
@@ -32,5 +32,7 @@
 // UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
 // UNSUPPORTED: x86_64-pc-linux-gnu
 // UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+// UNSUPPORTED: s390x-ibm-linux-gnu
+// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
 
 #include "empty_kernel.inc"
diff --git a/openmp/libomptarget/test/jit/empty_kernel_lvl2.c b/openmp/libomptarget/test/jit/empty_kernel_lvl2.c
index 0b88d33292c571..81a04f55ce43d6 100644
--- a/openmp/libomptarget/test/jit/empty_kernel_lvl2.c
+++ b/openmp/libomptarget/test/jit/empty_kernel_lvl2.c
@@ -92,5 +92,7 @@
 // UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
 // UNSUPPORTED: x86_64-pc-linux-gnu
 // UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+// UNSUPPORTED: s390x-ibm-linux-gnu
+// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
 
 #include "empty_kernel.inc"
diff --git a/openmp/libomptarget/test/jit/type_punning.c b/openmp/libomptarget/test/jit/type_punning.c
index 23aa69bba7b952..10e3d2cef718b8 100644
--- a/openmp/libomptarget/test/jit/type_punning.c
+++ b/openmp/libomptarget/test/jit/type_punning.c
@@ -12,6 +12,8 @@
 // UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
 // UNSUPPORTED: x86_64-pc-linux-gnu
 // UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+// UNSUPPORTED: s390x-ibm-linux-gnu
+// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
 
 // Ensure that there is only the kernel function left, not any outlined
 // parallel regions.
diff --git a/openmp/libomptarget/test/mapping/auto_zero_copy.cpp b/openmp/libomptarget/test/mapping/auto_zero_copy.cpp
index 6f9d8c2b128c94..46641200bb56e2 100644
--- a/openmp/libomptarget/test/mapping/auto_zero_copy.cpp
+++ b/openmp/libomptarget/test/mapping/auto_zero_copy.cpp
@@ -13,6 +13,8 @@
 // UNSUPPORTED: nvptx64-nvidia-cuda-LTO
 // UNSUPPORTED: x86_64-pc-linux-gnu
 // UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+// UNSUPPORTED: s390x-ibm-linux-gnu
+// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
 
 // REQUIRES: unified_shared_memory
 
diff --git a/openmp/libomptarget/test/mapping/auto_zero_copy_globals.cpp b/openmp/libomptarget/test/mapping/auto_zero_copy_globals.cpp
index 4a13d270aeebe5..55dfb2807ebc78 100644
--- a/openmp/libomptarget/test/mapping/auto_zero_copy_globals.cpp
+++ b/openmp/libomptarget/test/mapping/auto_zero_copy_globals.cpp
@@ -9,6 +9,8 @@
 // UNSUPPORTED: nvptx64-nvidia-cuda-LTO
 // UNSUPPORTED: x86_64-pc-linux-gnu
 // UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+// UNSUPPORTED: s390x-ibm-linux-gnu
+// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
 
 // REQUIRES: unified_shared_memory
 
diff --git a/openmp/libomptarget/test/offloading/barrier_fence.c b/openmp/libomptarget/test/offloading/barrier_fence.c
index 3eeaac52a2d60f..b9a8ca27965a09 100644
--- a/openmp/libomptarget/test/offloading/barrier_fence.c
+++ b/openmp/libomptarget/test/offloading/barrier_fence.c
@@ -3,14 +3,12 @@
 // RUN: %libomptarget-compileopt-generic -fopenmp-offload-mandatory -O3
 // RUN: %libomptarget-run-generic
 
-// FIXME: This test is flaky on all targets
-// UNSUPPORTED: amdgcn-amd-amdhsa
-// UNSUPPORTED: nvptx64-nvidia-cuda
-// UNSUPPORTED: nvptx64-nvidia-cuda-LTO
 // UNSUPPORTED: aarch64-unknown-linux-gnu
 // UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
 // UNSUPPORTED: x86_64-pc-linux-gnu
 // UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+// UNSUPPORTED: s390x-ibm-linux-gnu
+// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
 
 #include <omp.h>
 #include <stdio.h>
diff --git a/openmp/libomptarget/test/offloading/bug49334.cpp b/openmp/libomptarget/test/offloading/bug49334.cpp
index a22d3fe9f66535..1f19dab378810d 100644
--- a/openmp/libomptarget/test/offloading/bug49334.cpp
+++ b/openmp/libomptarget/test/offloading/bug49334.cpp
@@ -9,6 +9,8 @@
 // UNSUPPORTED: x86_64-pc-linux-gnu-LTO
 // UNSUPPORTED: aarch64-unknown-linux-gnu
 // UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+// UNSUPPORTED: s390x-ibm-linux-gnu
+// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
 // UNSUPPORTED: amdgcn-amd-amdhsa
 // UNSUPPORTED: nvptx64-nvidia-cuda
 // UNSUPPORTED: nvptx64-nvidia-cuda-LTO
diff --git a/openmp/libomptarget/test/offloading/default_thread_limit.c b/openmp/libomptarget/test/offloading/default_thread_limit.c
index d32e7df418cbbd..4da02bbb152e60 100644
--- a/openmp/libomptarget/test/offloading/default_thread_limit.c
+++ b/openmp/libomptarget/test/offloading/default_thread_limit.c
@@ -9,6 +9,8 @@
 // UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
 // UNSUPPORTED: x86_64-pc-linux-gnu
 // UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+// UNSUPPORTED: s390x-ibm-linux-gnu
+// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
 
 __attribute__((optnone)) int optnone() { return 1; }
 
diff --git a/openmp/libomptarget/test/offloading/fortran/target-nested-target-data.f90 b/openmp/libomptarget/test/offloading/fortran/target-nested-target-data.f90
new file mode 100644
index 00000000000000..a694f2464546c5
--- /dev/null
+++ b/openmp/libomptarget/test/offloading/fortran/target-nested-target-data.f90
@@ -0,0 +1,31 @@
+! Offloading test for target nested inside
+! a target data region
+! REQUIRES: flang
+! UNSUPPORTED: nvptx64-nvidia-cuda-LTO
+! UNSUPPORTED: aarch64-unknown-linux-gnu
+! UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+! UNSUPPORTED: x86_64-pc-linux-gnu
+! UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+
+! RUN: %libomptarget-compile-fortran-run-and-check-generic
+program main
+   integer :: A(10), B(10), C(10)
+
+   do I = 1, 10
+      A(I) = 1
+      B(I) = 2
+   end do
+   !$omp target data map(to: A, B) map(alloc: C)
+   !$omp target map(from: C)
+   do I = 1, 10
+      C(I) = A(I) + B(I) ! assigns 3, A:1 + B:2
+   end do
+   !$omp end target
+   !$omp target update from(C) ! updates C device -> host
+   !$omp end target data
+
+   print *, C ! should be all 3's
+
+end program
+
+! CHECK: 3 3 3 3 3 3 3 3 3 3
diff --git a/openmp/libomptarget/test/offloading/ompx_bare.c b/openmp/libomptarget/test/offloading/ompx_bare.c
index fb3810bd1df126..3dabdcd15e0d8d 100644
--- a/openmp/libomptarget/test/offloading/ompx_bare.c
+++ b/openmp/libomptarget/test/offloading/ompx_bare.c
@@ -1,10 +1,13 @@
 // RUN: %libomptarget-compile-generic
-// RUN: env LIBOMPTARGET_INFO=63 %libomptarget-run-generic 2>&1 | %fcheck-generic
+// RUN: env LIBOMPTARGET_INFO=63 %libomptarget-run-generic 2>&1 | \
+// RUN:   %fcheck-generic
 //
 // UNSUPPORTED: x86_64-pc-linux-gnu
 // UNSUPPORTED: x86_64-pc-linux-gnu-LTO
 // UNSUPPORTED: aarch64-unknown-linux-gnu
 // UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+// UNSUPPORTED: s390x-ibm-linux-gnu
+// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
 
 #include <assert.h>
 #include <ompx.h>
diff --git a/openmp/libomptarget/test/offloading/ompx_coords.c b/openmp/libomptarget/test/offloading/ompx_coords.c
index 61dad61f46405e..5e4e14b4c6daeb 100644
--- a/openmp/libomptarget/test/offloading/ompx_coords.c
+++ b/openmp/libomptarget/test/offloading/ompx_coords.c
@@ -4,6 +4,8 @@
 // UNSUPPORTED: x86_64-pc-linux-gnu-LTO
 // UNSUPPORTED: aarch64-unknown-linux-gnu
 // UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+// UNSUPPORTED: s390x-ibm-linux-gnu
+// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
 
 #include <omp.h>
 #include <ompx.h>
diff --git a/openmp/libomptarget/test/offloading/ompx_saxpy_mixed.c b/openmp/libomptarget/test/offloading/ompx_saxpy_mixed.c
index 440b694e3ac706..f479be8a484fc9 100644
--- a/openmp/libomptarget/test/offloading/ompx_saxpy_mixed.c
+++ b/openmp/libomptarget/test/offloading/ompx_saxpy_mixed.c
@@ -4,6 +4,8 @@
 // UNSUPPORTED: x86_64-pc-linux-gnu-LTO
 // UNSUPPORTED: aarch64-unknown-linux-gnu
 // UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+// UNSUPPORTED: s390x-ibm-linux-gnu
+// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
 
 #include <math.h>
 #include <omp.h>
diff --git a/openmp/libomptarget/test/offloading/parallel_target_teams_reduction.cpp b/openmp/libomptarget/test/offloading/parallel_target_teams_reduction.cpp
index 5303a9463f1551..10e1b33f1ce405 100644
--- a/openmp/libomptarget/test/offloading/parallel_target_teams_reduction.cpp
+++ b/openmp/libomptarget/test/offloading/parallel_target_teams_reduction.cpp
@@ -6,6 +6,8 @@
 // UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
 // UNSUPPORTED: x86_64-pc-linux-gnu
 // UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+// UNSUPPORTED: s390x-ibm-linux-gnu
+// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
 
 #include <iostream>
 #include <vector>
diff --git a/openmp/libomptarget/test/offloading/small_trip_count.c b/openmp/libomptarget/test/offloading/small_trip_count.c
index d8bef667607a1c..65f094f1574694 100644
--- a/openmp/libomptarget/test/offloading/small_trip_count.c
+++ b/openmp/libomptarget/test/offloading/small_trip_count.c
@@ -9,6 +9,8 @@
 // UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
 // UNSUPPORTED: x86_64-pc-linux-gnu
 // UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+// UNSUPPORTED: s390x-ibm-linux-gnu
+// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
 
 #define N 128
 
diff --git a/openmp/libomptarget/test/offloading/small_trip_count_thread_limit.cpp b/openmp/libomptarget/test/offloading/small_trip_count_thread_limit.cpp
index 9796c2dc11663b..b7ae52a62c83bb 100644
--- a/openmp/libomptarget/test/offloading/small_trip_count_thread_limit.cpp
+++ b/openmp/libomptarget/test/offloading/small_trip_count_thread_limit.cpp
@@ -7,6 +7,8 @@
 // UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
 // UNSUPPORTED: x86_64-pc-linux-gnu
 // UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+// UNSUPPORTED: s390x-ibm-linux-gnu
+// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
 
 int main(int argc, char *argv[]) {
   constexpr const int block_size = 256;
diff --git a/openmp/libomptarget/test/offloading/spmdization.c b/openmp/libomptarget/test/offloading/spmdization.c
index 2cf30ff593b991..77913bec8342f9 100644
--- a/openmp/libomptarget/test/offloading/spmdization.c
+++ b/openmp/libomptarget/test/offloading/spmdization.c
@@ -11,6 +11,8 @@
 // UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
 // UNSUPPORTED: x86_64-pc-linux-gnu
 // UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+// UNSUPPORTED: s390x-ibm-linux-gnu
+// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
 
 #include <omp.h>
 #include <stdio.h>
diff --git a/openmp/libomptarget/test/offloading/target_critical_region.cpp b/openmp/libomptarget/test/offloading/target_critical_region.cpp
index 9a741bef6c5915..495632bf76e175 100644
--- a/openmp/libomptarget/test/offloading/target_critical_region.cpp
+++ b/openmp/libomptarget/test/offloading/target_critical_region.cpp
@@ -6,6 +6,8 @@
 // UNSUPPORTED: nvptx64-nvidia-cuda-LTO
 // UNSUPPORTED: x86_64-pc-linux-gnu
 // UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+// UNSUPPORTED: s390x-ibm-linux-gnu
+// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
 // UNSUPPORTED: amdgcn-amd-amdhsa
 
 #include <omp.h>
diff --git a/openmp/libomptarget/test/offloading/thread_limit.c b/openmp/libomptarget/test/offloading/thread_limit.c
index 65275a8f279584..a8cc51b651dc96 100644
--- a/openmp/libomptarget/test/offloading/thread_limit.c
+++ b/openmp/libomptarget/test/offloading/thread_limit.c
@@ -9,6 +9,8 @@
 // UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
 // UNSUPPORTED: x86_64-pc-linux-gnu
 // UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+// UNSUPPORTED: s390x-ibm-linux-gnu
+// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
 
 int main() {
   int n = 1 << 20;
diff --git a/openmp/libomptarget/test/offloading/workshare_chunk.c b/openmp/libomptarget/test/offloading/workshare_chunk.c
new file mode 100644
index 00000000000000..a8c60c0097791a
--- /dev/null
+++ b/openmp/libomptarget/test/offloading/workshare_chunk.c
@@ -0,0 +1,254 @@
+// RUN: %libomptarget-compile-run-and-check-generic
+// RUN: %libomptarget-compileopt-run-and-check-generic
+
+// UNSUPPORTED: aarch64-unknown-linux-gnu
+// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+// UNSUPPORTED: x86_64-pc-linux-gnu
+// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+// UNSUPPORTED: s390x-ibm-linux-gnu
+// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
+
+// clang-format off
+
+#include <omp.h>
+#include <stdio.h>
+
+#define N 100
+#define BLOCK_SHIFT 8
+
+void print(int *A, int size) {
+  for (int i = 0; i < size; ++i) {
+    printf("B%dT%d ", A[i] >> BLOCK_SHIFT, A[i] % (1 << BLOCK_SHIFT));
+  }
+  printf("\n");
+}
+
+int main() {
+  int A[N];
+
+#pragma omp target parallel for map(from:A) num_threads(10) schedule(static, 2)
+  for (int i = 0; i < N; ++i) {
+     A[i] = (omp_get_team_num() << BLOCK_SHIFT) + omp_get_thread_num();
+  }
+  printf("omp target parallel for thread chunk size %d\n", 2);
+  print(A, N);
+
+#pragma omp target teams distribute map(from:A) num_teams(10) \
+        dist_schedule(static, 2)
+  for (int i = 0; i < N; ++i) {
+     A[i] = (omp_get_team_num() << BLOCK_SHIFT) + omp_get_thread_num();
+  }
+  printf("omp target teams distribute block chunk size %d\n", 2);
+  print(A, N);
+
+#pragma omp target teams distribute parallel for map(from:A) \
+        num_teams(10) dist_schedule(static, 2)
+  for (int i = 0; i < N; ++i) {
+     A[i] = (omp_get_team_num() << BLOCK_SHIFT) + omp_get_thread_num();
+  }
+  printf("omp target teams distribute parallel for block chunk size %d ", 2);
+  printf("thread chunk size default\n");
+  print(A, N);
+
+#pragma omp target teams distribute parallel for map(from:A) \
+        num_teams(10) dist_schedule(static, 2) schedule(static, 3)
+  for (int i = 0; i < N; ++i) {
+     A[i] = (omp_get_team_num() << BLOCK_SHIFT) + omp_get_thread_num();
+  }
+  printf("omp target teams distribute parallel for block chunk size %d ", 2);
+  printf("thread chunk size %d\n", 3);
+  print(A, N);
+
+#pragma omp target teams distribute parallel for map(from:A) \
+        num_teams(10) dist_schedule(static, 3) schedule(static, 2)
+  for (int i = 0; i < N; ++i) {
+     A[i] = (omp_get_team_num() << BLOCK_SHIFT) + omp_get_thread_num();
+  }
+  printf("omp target teams distribute parallel for block chunk size %d ", 3);
+  printf("thread chunk size %d\n", 2);
+  print(A, N);
+
+#pragma omp target teams distribute parallel for map(from:A) \
+        num_teams(10) dist_schedule(static, 5) schedule(static, 2)
+  for (int i = 0; i < N; ++i) {
+     A[i] = (omp_get_team_num() << BLOCK_SHIFT) + omp_get_thread_num();
+  }
+  printf("omp target teams distribute parallel for block chunk size %d ", 5);
+  printf("thread chunk size %d\n", 2);
+  print(A, N);
+
+#pragma omp target teams distribute parallel for map(from:A) num_teams(10) \
+        dist_schedule(static, 49) schedule(static, 2)
+  for (int i = 0; i < N; ++i) {
+     A[i] = (omp_get_team_num() << BLOCK_SHIFT) + omp_get_thread_num();
+  }
+  printf("omp target teams distribute parallel for block chunk size %d ", 49);
+  printf("thread chunk size %d\n", 2);
+  print(A, N);
+
+#pragma omp target teams distribute parallel for map(from:A) \
+        num_teams(10) num_threads(10) dist_schedule(static, 29)
+  for (int i = 0; i < N; ++i) {
+     A[i] = (omp_get_team_num() << BLOCK_SHIFT) + omp_get_thread_num();
+  }
+  printf("omp target teams distribute parallel for block chunk size %d ", 29);
+  printf("thread chunk size default\n");
+  print(A, N);
+
+#pragma omp target teams distribute parallel for map(from:A) \
+        num_teams(10) num_threads(10) dist_schedule(static, 101)
+  for (int i = 0; i < N; ++i) {
+     A[i] = (omp_get_team_num() << BLOCK_SHIFT) + omp_get_thread_num();
+  }
+  printf("omp target teams distribute parallel for block chunk size %d ", 101);
+  printf("thread chunk size default\n");
+  print(A, N);
+
+#pragma omp target teams distribute parallel for map(from:A) \
+        num_teams(9) num_threads(10) schedule(static, 101)
+  for (int i = 0; i < N; ++i) {
+     A[i] = (omp_get_team_num() << BLOCK_SHIFT) + omp_get_thread_num();
+  }
+  printf("omp target teams distribute parallel for default block chunk size ");
+  printf("thread chunk size %d\n", 101);
+  print(A, N);
+  return 0;
+}
+//CHECK:      omp target parallel for thread chunk size 2
+
+//CHECK-NEXT: B0T0 B0T0 B0T1 B0T1 B0T2 B0T2 B0T3 B0T3 B0T4 B0T4
+//CHECK-SAME: B0T5 B0T5 B0T6 B0T6 B0T7 B0T7 B0T8 B0T8 B0T9 B0T9
+//CHECK-SAME: B0T0 B0T0 B0T1 B0T1 B0T2 B0T2 B0T3 B0T3 B0T4 B0T4
+//CHECK-SAME: B0T5 B0T5 B0T6 B0T6 B0T7 B0T7 B0T8 B0T8 B0T9 B0T9
+//CHECK-SAME: B0T0 B0T0 B0T1 B0T1 B0T2 B0T2 B0T3 B0T3 B0T4 B0T4
+//CHECK-SAME: B0T5 B0T5 B0T6 B0T6 B0T7 B0T7 B0T8 B0T8 B0T9 B0T9
+//CHECK-SAME: B0T0 B0T0 B0T1 B0T1 B0T2 B0T2 B0T3 B0T3 B0T4 B0T4
+//CHECK-SAME: B0T5 B0T5 B0T6 B0T6 B0T7 B0T7 B0T8 B0T8 B0T9 B0T9
+//CHECK-SAME: B0T0 B0T0 B0T1 B0T1 B0T2 B0T2 B0T3 B0T3 B0T4 B0T4
+//CHECK-SAME: B0T5 B0T5 B0T6 B0T6 B0T7 B0T7 B0T8 B0T8 B0T9 B0T9
+
+//CHECK:      omp target teams distribute block chunk size 2
+
+//CHECK-NEXT: B0T0 B0T0 B1T0 B1T0 B2T0 B2T0 B3T0 B3T0 B4T0 B4T0
+//CHECK-SAME: B5T0 B5T0 B6T0 B6T0 B7T0 B7T0 B8T0 B8T0 B9T0 B9T0
+//CHECK-SAME: B0T0 B0T0 B1T0 B1T0 B2T0 B2T0 B3T0 B3T0 B4T0 B4T0
+//CHECK-SAME: B5T0 B5T0 B6T0 B6T0 B7T0 B7T0 B8T0 B8T0 B9T0 B9T0
+//CHECK-SAME: B0T0 B0T0 B1T0 B1T0 B2T0 B2T0 B3T0 B3T0 B4T0 B4T0
+//CHECK-SAME: B5T0 B5T0 B6T0 B6T0 B7T0 B7T0 B8T0 B8T0 B9T0 B9T0
+//CHECK-SAME: B0T0 B0T0 B1T0 B1T0 B2T0 B2T0 B3T0 B3T0 B4T0 B4T0
+//CHECK-SAME: B5T0 B5T0 B6T0 B6T0 B7T0 B7T0 B8T0 B8T0 B9T0 B9T0
+//CHECK-SAME: B0T0 B0T0 B1T0 B1T0 B2T0 B2T0 B3T0 B3T0 B4T0 B4T0
+//CHECK-SAME: B5T0 B5T0 B6T0 B6T0 B7T0 B7T0 B8T0 B8T0 B9T0 B9T0
+
+//CHECK:      omp target teams distribute parallel for
+//CHECK-SAME: block chunk size 2 thread chunk size default
+
+//CHECK-NEXT: B0T0 B0T1 B1T0 B1T1 B2T0 B2T1 B3T0 B3T1 B4T0 B4T1
+//CHECK-SAME: B5T0 B5T1 B6T0 B6T1 B7T0 B7T1 B8T0 B8T1 B9T0 B9T1
+//CHECK-SAME: B0T0 B0T1 B1T0 B1T1 B2T0 B2T1 B3T0 B3T1 B4T0 B4T1
+//CHECK-SAME: B5T0 B5T1 B6T0 B6T1 B7T0 B7T1 B8T0 B8T1 B9T0 B9T1
+//CHECK-SAME: B0T0 B0T1 B1T0 B1T1 B2T0 B2T1 B3T0 B3T1 B4T0 B4T1
+//CHECK-SAME: B5T0 B5T1 B6T0 B6T1 B7T0 B7T1 B8T0 B8T1 B9T0 B9T1
+
+//CHECK:      omp target teams distribute parallel for
+//CHECK-SAME  block chunk size 2 thread chunk size 3
+
+//CHECK-NEXT: B0T0 B0T0 B1T0 B1T0 B2T0 B2T0 B3T0 B3T0 B4T0 B4T0
+//CHECK-SAME: B5T0 B5T0 B6T0 B6T0 B7T0 B7T0 B8T0 B8T0 B9T0 B9T0
+//CHECK-SAME: B0T0 B0T0 B1T0 B1T0 B2T0 B2T0 B3T0 B3T0 B4T0 B4T0
+//CHECK-SAME: B5T0 B5T0 B6T0 B6T0 B7T0 B7T0 B8T0 B8T0 B9T0 B9T0
+//CHECK-SAME: B0T0 B0T0 B1T0 B1T0 B2T0 B2T0 B3T0 B3T0 B4T0 B4T0
+//CHECK-SAME: B5T0 B5T0 B6T0 B6T0 B7T0 B7T0 B8T0 B8T0 B9T0 B9T0
+//CHECK-SAME: B0T0 B0T0 B1T0 B1T0 B2T0 B2T0 B3T0 B3T0 B4T0 B4T0
+//CHECK-SAME: B5T0 B5T0 B6T0 B6T0 B7T0 B7T0 B8T0 B8T0 B9T0 B9T0
+//CHECK-SAME: B0T0 B0T0 B1T0 B1T0 B2T0 B2T0 B3T0 B3T0 B4T0 B4T0
+//CHECK-SAME: B5T0 B5T0 B6T0 B6T0 B7T0 B7T0 B8T0 B8T0 B9T0 B9T0
+
+//CHECK:      omp target teams distribute parallel for
+//CHECK-SAME: block chunk size 3 thread chunk size 2
+
+//CHECK-NEXT: B0T0 B0T0 B0T1 B1T0 B1T0 B1T1 B2T0 B2T0 B2T1
+//CHECK-SAME: B3T0 B3T0 B3T1 B4T0 B4T0 B4T1
+//CHECK-SAME: B5T0 B5T0 B5T1 B6T0 B6T0 B6T1 B7T0 B7T0 B7T1
+//CHECK-SAME: B8T0 B8T0 B8T1 B9T0 B9T0 B9T1
+//CHECK-SAME: B0T0 B0T0 B0T1 B1T0 B1T0 B1T1 B2T0 B2T0 B2T1
+//CHECK-SAME: B3T0 B3T0 B3T1 B4T0 B4T0 B4T1
+//CHECK-SAME: B5T0 B5T0 B5T1 B6T0 B6T0 B6T1 B7T0 B7T0 B7T1
+//CHECK-SAME: B8T0 B8T0 B8T1 B9T0 B9T0 B9T1
+//CHECK-SAME: B0T0 B0T0 B0T1 B1T0 B1T0 B1T1 B2T0 B2T0 B2T1
+//CHECK-SAME: B3T0 B3T0 B3T1 B4T0 B4T0 B4T1
+//CHECK-SAME: B5T0 B5T0 B5T1 B6T0 B6T0 B6T1 B7T0 B7T0 B7T1
+//CHECK-SAME: B8T0 B8T0 B8T1 B9T0 B9T0 B9T1
+//CHECK-SAME: B0T0 B0T0 B0T1 B1T0 B1T0 B1T1 B2T0 B2T0 B2T1 B3T0
+
+//CHECK:      omp target teams distribute parallel for
+//CHECK-SAME: block chunk size 5 thread chunk size 2
+
+//CHECK-NEXT: B0T0 B0T0 B0T1 B0T1 B0T2 B1T0 B1T0 B1T1 B1T1 B1T2
+//CHECK-SAME: B2T0 B2T0 B2T1 B2T1 B2T2 B3T0 B3T0 B3T1 B3T1 B3T2
+//CHECK-SAME: B4T0 B4T0 B4T1 B4T1 B4T2 B5T0 B5T0 B5T1 B5T1 B5T2
+//CHECK-SAME: B6T0 B6T0 B6T1 B6T1 B6T2 B7T0 B7T0 B7T1 B7T1 B7T2
+//CHECK-SAME: B8T0 B8T0 B8T1 B8T1 B8T2 B9T0 B9T0 B9T1 B9T1 B9T2
+//CHECK-SAME: B0T0 B0T0 B0T1 B0T1 B0T2 B1T0 B1T0 B1T1 B1T1 B1T2
+//CHECK-SAME: B2T0 B2T0 B2T1 B2T1 B2T2 B3T0 B3T0 B3T1 B3T1 B3T2
+//CHECK-SAME: B4T0 B4T0 B4T1 B4T1 B4T2 B5T0 B5T0 B5T1 B5T1 B5T2
+//CHECK-SAME: B6T0 B6T0 B6T1 B6T1 B6T2 B7T0 B7T0 B7T1 B7T1 B7T2
+//CHECK-SAME: B8T0 B8T0 B8T1 B8T1 B8T2 B9T0 B9T0 B9T1 B9T1 B9T2
+
+//CHECK:      omp target teams distribute parallel for
+//CHECK-SAME: block chunk size 49 thread chunk size 2
+
+//CHECK-NEXT: B0T0 B0T0 B0T1 B0T1 B0T2 B0T2 B0T3 B0T3 B0T4 B0T4 B0T5 B0T5
+//CHECK-SAME: B0T6 B0T6 B0T7 B0T7 B0T8 B0T8 B0T9 B0T9 B0T10 B0T10 B0T11 B0T11
+//CHECK-SAME: B0T12 B0T12 B0T13 B0T13 B0T14 B0T14 B0T15 B0T15 B0T16 B0T16
+//CHECK-SAME: B0T17 B0T17 B0T18 B0T18 B0T19 B0T19 B0T20 B0T20 B0T21 B0T21
+//CHECK-SAME: B0T22 B0T22 B0T23 B0T23 B0T24
+//CHECK-SAME: B1T0 B1T0 B1T1 B1T1 B1T2 B1T2 B1T3 B1T3 B1T4 B1T4 B1T5 B1T5
+//CHECK-SAME: B1T6 B1T6 B1T7 B1T7 B1T8 B1T8 B1T9 B1T9 B1T10 B1T10 B1T11 B1T11
+//CHECK-SAME: B1T12 B1T12 B1T13 B1T13 B1T14 B1T14 B1T15 B1T15 B1T16 B1T16
+//CHECK-SAME: B1T17 B1T17 B1T18 B1T18 B1T19 B1T19 B1T20 B1T20 B1T21 B1T21
+//CHECK-SAME: B1T22 B1T22 B1T23 B1T23 B1T24
+//CHECK-SAME: B2T0 B2T0
+
+//CHECK:      omp target teams distribute parallel for
+//CHECK-SAME: block chunk size 29 thread chunk size default
+
+//CHECK-NEXT: B0T0 B0T1 B0T2 B0T3 B0T4 B0T5 B0T6 B0T7 B0T8 B0T9
+//CHECK-SAME: B0T0 B0T1 B0T2 B0T3 B0T4 B0T5 B0T6 B0T7 B0T8 B0T9
+//CHECK-SAME: B0T0 B0T1 B0T2 B0T3 B0T4 B0T5 B0T6 B0T7 B0T8
+//CHECK-SAME: B1T0 B1T1 B1T2 B1T3 B1T4 B1T5 B1T6 B1T7 B1T8 B1T9
+//CHECK-SAME: B1T0 B1T1 B1T2 B1T3 B1T4 B1T5 B1T6 B1T7 B1T8 B1T9
+//CHECK-SAME: B1T0 B1T1 B1T2 B1T3 B1T4 B1T5 B1T6 B1T7 B1T8
+//CHECK-SAME: B2T0 B2T1 B2T2 B2T3 B2T4 B2T5 B2T6 B2T7 B2T8 B2T9
+//CHECK-SAME: B2T0 B2T1 B2T2 B2T3 B2T4 B2T5 B2T6 B2T7 B2T8 B2T9
+//CHECK-SAME: B2T0 B2T1 B2T2 B2T3 B2T4 B2T5 B2T6 B2T7 B2T8
+//CHECK-SAME: B3T0 B3T1 B3T2 B3T3 B3T4 B3T5 B3T6 B3T7 B3T8 B3T9
+//CHECK-SAME: B3T0 B3T1 B3T2
+
+//CHECK:      omp target teams distribute parallel for
+//CHECK-SAME: block chunk size 101 thread chunk size default
+
+//CHECK-NEXT: B0T0 B0T1 B0T2 B0T3 B0T4 B0T5 B0T6 B0T7 B0T8 B0T9
+//CHECK-SAME: B0T0 B0T1 B0T2 B0T3 B0T4 B0T5 B0T6 B0T7 B0T8 B0T9
+//CHECK-SAME: B0T0 B0T1 B0T2 B0T3 B0T4 B0T5 B0T6 B0T7 B0T8 B0T9
+//CHECK-SAME: B0T0 B0T1 B0T2 B0T3 B0T4 B0T5 B0T6 B0T7 B0T8 B0T9
+//CHECK-SAME: B0T0 B0T1 B0T2 B0T3 B0T4 B0T5 B0T6 B0T7 B0T8 B0T9
+//CHECK-SAME: B0T0 B0T1 B0T2 B0T3 B0T4 B0T5 B0T6 B0T7 B0T8 B0T9
+//CHECK-SAME: B0T0 B0T1 B0T2 B0T3 B0T4 B0T5 B0T6 B0T7 B0T8 B0T9
+//CHECK-SAME: B0T0 B0T1 B0T2 B0T3 B0T4 B0T5 B0T6 B0T7 B0T8 B0T9
+//CHECK-SAME: B0T0 B0T1 B0T2 B0T3 B0T4 B0T5 B0T6 B0T7 B0T8 B0T9
+//CHECK-SAME: B0T0 B0T1 B0T2 B0T3 B0T4 B0T5 B0T6 B0T7 B0T8 B0T9
+
+//CHECK:      omp target teams distribute parallel for
+//CHECK-SAME: default block chunk size thread chunk size 101
+
+//CHECK-NEXT: B0T0 B0T0 B0T0 B0T0 B0T0 B0T0 B0T0 B0T0 B0T0 B0T0
+//CHECK-SAME: B1T0 B1T0 B1T0 B1T0 B1T0 B1T0 B1T0 B1T0 B1T0 B1T0
+//CHECK-SAME: B2T0 B2T0 B2T0 B2T0 B2T0 B2T0 B2T0 B2T0 B2T0 B2T0
+//CHECK-SAME: B3T0 B3T0 B3T0 B3T0 B3T0 B3T0 B3T0 B3T0 B3T0 B3T0
+//CHECK-SAME: B4T0 B4T0 B4T0 B4T0 B4T0 B4T0 B4T0 B4T0 B4T0 B4T0
+//CHECK-SAME: B5T0 B5T0 B5T0 B5T0 B5T0 B5T0 B5T0 B5T0 B5T0 B5T0
+//CHECK-SAME: B6T0 B6T0 B6T0 B6T0 B6T0 B6T0 B6T0 B6T0 B6T0 B6T0
+//CHECK-SAME: B7T0 B7T0 B7T0 B7T0 B7T0 B7T0 B7T0 B7T0 B7T0 B7T0
+//CHECK-SAME: B8T0 B8T0 B8T0 B8T0 B8T0 B8T0 B8T0 B8T0 B8T0 B8T0
+//CHECK-SAME: B0T0 B0T0 B0T0 B0T0 B0T0 B0T0 B0T0 B0T0 B0T0 B0T0
diff --git a/openmp/libomptarget/test/ompt/target_memcpy.c b/openmp/libomptarget/test/ompt/target_memcpy.c
index 80a8d6a4b32e59..3224c5ac4c916d 100644
--- a/openmp/libomptarget/test/ompt/target_memcpy.c
+++ b/openmp/libomptarget/test/ompt/target_memcpy.c
@@ -4,6 +4,8 @@
 // UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
 // UNSUPPORTED: x86_64-pc-linux-gnu
 // UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+// UNSUPPORTED: s390x-ibm-linux-gnu
+// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
 
 /*
  * Verify that for the target OpenMP APIs, the return address is non-null and
diff --git a/openmp/libomptarget/test/ompt/target_memcpy_emi.c b/openmp/libomptarget/test/ompt/target_memcpy_emi.c
index 5347f38b87b6ff..fd7da13cb05d07 100644
--- a/openmp/libomptarget/test/ompt/target_memcpy_emi.c
+++ b/openmp/libomptarget/test/ompt/target_memcpy_emi.c
@@ -4,6 +4,8 @@
 // UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
 // UNSUPPORTED: x86_64-pc-linux-gnu
 // UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+// UNSUPPORTED: s390x-ibm-linux-gnu
+// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
 
 /*
  * Verify all three data transfer directions: H2D, D2D and D2H
diff --git a/openmp/libomptarget/test/ompt/veccopy.c b/openmp/libomptarget/test/ompt/veccopy.c
index 80e71fd8a48cb4..bc70f5a0b5b67d 100644
--- a/openmp/libomptarget/test/ompt/veccopy.c
+++ b/openmp/libomptarget/test/ompt/veccopy.c
@@ -4,6 +4,8 @@
 // UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
 // UNSUPPORTED: x86_64-pc-linux-gnu
 // UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+// UNSUPPORTED: s390x-ibm-linux-gnu
+// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
 
 /*
  * Example OpenMP program that registers non-EMI callbacks
diff --git a/openmp/libomptarget/test/ompt/veccopy_data.c b/openmp/libomptarget/test/ompt/veccopy_data.c
index cef1de316a7a14..264b484b881f10 100644
--- a/openmp/libomptarget/test/ompt/veccopy_data.c
+++ b/openmp/libomptarget/test/ompt/veccopy_data.c
@@ -4,6 +4,8 @@
 // UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
 // UNSUPPORTED: x86_64-pc-linux-gnu
 // UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+// UNSUPPORTED: s390x-ibm-linux-gnu
+// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
 
 /*
  * Example OpenMP program that registers EMI callbacks.
diff --git a/openmp/libomptarget/test/ompt/veccopy_disallow_both.c b/openmp/libomptarget/test/ompt/veccopy_disallow_both.c
index 06293e413ba45a..a011c21955bb2d 100644
--- a/openmp/libomptarget/test/ompt/veccopy_disallow_both.c
+++ b/openmp/libomptarget/test/ompt/veccopy_disallow_both.c
@@ -4,6 +4,8 @@
 // UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
 // UNSUPPORTED: x86_64-pc-linux-gnu
 // UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+// UNSUPPORTED: s390x-ibm-linux-gnu
+// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
 
 /*
  * Example OpenMP program that shows that both EMI and non-EMI
diff --git a/openmp/libomptarget/test/ompt/veccopy_emi.c b/openmp/libomptarget/test/ompt/veccopy_emi.c
index b597d7be6aff66..8718a39b4af812 100644
--- a/openmp/libomptarget/test/ompt/veccopy_emi.c
+++ b/openmp/libomptarget/test/ompt/veccopy_emi.c
@@ -4,6 +4,8 @@
 // UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
 // UNSUPPORTED: x86_64-pc-linux-gnu
 // UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+// UNSUPPORTED: s390x-ibm-linux-gnu
+// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
 
 /*
  * Example OpenMP program that registers EMI callbacks
diff --git a/openmp/libomptarget/test/ompt/veccopy_emi_map.c b/openmp/libomptarget/test/ompt/veccopy_emi_map.c
index ce6f6e30d5815f..2accba34034e64 100644
--- a/openmp/libomptarget/test/ompt/veccopy_emi_map.c
+++ b/openmp/libomptarget/test/ompt/veccopy_emi_map.c
@@ -4,6 +4,8 @@
 // UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
 // UNSUPPORTED: x86_64-pc-linux-gnu
 // UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+// UNSUPPORTED: s390x-ibm-linux-gnu
+// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
 
 /*
  * Example OpenMP program that shows that map-EMI callbacks are not supported.
diff --git a/openmp/libomptarget/test/ompt/veccopy_map.c b/openmp/libomptarget/test/ompt/veccopy_map.c
index 83f63a6e6049eb..56a0dd48f53683 100644
--- a/openmp/libomptarget/test/ompt/veccopy_map.c
+++ b/openmp/libomptarget/test/ompt/veccopy_map.c
@@ -4,6 +4,8 @@
 // UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
 // UNSUPPORTED: x86_64-pc-linux-gnu
 // UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+// UNSUPPORTED: s390x-ibm-linux-gnu
+// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
 
 /*
  * Example OpenMP program that shows that map callbacks are not supported.
diff --git a/openmp/runtime/src/include/omp.h.var b/openmp/runtime/src/include/omp.h.var
index a1488ae9d21c61..eb3ab7778606a3 100644
--- a/openmp/runtime/src/include/omp.h.var
+++ b/openmp/runtime/src/include/omp.h.var
@@ -156,6 +156,8 @@
     /* OpenMP 5.1 interop */
     typedef intptr_t omp_intptr_t;
 
+    extern void __KAI_KMPC_CONVENTION ompx_dump_mapping_tables(void);
+
     /* 0..omp_get_num_interop_properties()-1 are reserved for implementation-defined properties */
     typedef enum omp_interop_property {
         omp_ipr_fr_id = -1,
diff --git a/openmp/runtime/src/kmp.h b/openmp/runtime/src/kmp.h
index 569a1ab9b477c4..885d6636abe4a8 100644
--- a/openmp/runtime/src/kmp.h
+++ b/openmp/runtime/src/kmp.h
@@ -1402,19 +1402,9 @@ extern void __kmp_query_cpuid(kmp_cpuinfo_t *p);
 // subleaf is only needed for cache and topology discovery and can be set to
 // zero in most cases
 static inline void __kmp_x86_cpuid(int leaf, int subleaf, struct kmp_cpuid *p) {
-#if KMP_ARCH_X86 && (defined(__pic__) || defined(__PIC__))
-  // on i386 arch, the ebx reg. is used by pic, thus we need to preserve from
-  // being trashed beforehand
-  __asm__ __volatile__("mov %%ebx, %%edi\n"
-                       "cpuid\n"
-                       "xchg %%edi, %%ebx\n"
-                       : "=a"(p->eax), "=b"(p->ebx), "=c"(p->ecx), "=d"(p->edx)
-                       : "a"(leaf), "c"(subleaf));
-#else
   __asm__ __volatile__("cpuid"
                        : "=a"(p->eax), "=b"(p->ebx), "=c"(p->ecx), "=d"(p->edx)
                        : "a"(leaf), "c"(subleaf));
-#endif
 }
 // Load p into FPU control word
 static inline void __kmp_load_x87_fpu_control_word(const kmp_int16 *p) {
diff --git a/openmp/runtime/src/z_Linux_util.cpp b/openmp/runtime/src/z_Linux_util.cpp
index 3f831d6e2a8f9d..d751a417331cee 100644
--- a/openmp/runtime/src/z_Linux_util.cpp
+++ b/openmp/runtime/src/z_Linux_util.cpp
@@ -2610,6 +2610,43 @@ int __kmp_get_load_balance(int max) {
       KMP_ARCH_PPC64 || KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64 ||            \
       KMP_ARCH_ARM || KMP_ARCH_VE || KMP_ARCH_S390X || KMP_ARCH_PPC_XCOFF)
 
+// Because WebAssembly will use `call_indirect` to invoke the microtask and
+// WebAssembly indirect calls check that the called signature is a precise
+// match, we need to cast each microtask function pointer back from `void *` to
+// its original type.
+typedef void (*microtask_t0)(int *, int *);
+typedef void (*microtask_t1)(int *, int *, void *);
+typedef void (*microtask_t2)(int *, int *, void *, void *);
+typedef void (*microtask_t3)(int *, int *, void *, void *, void *);
+typedef void (*microtask_t4)(int *, int *, void *, void *, void *, void *);
+typedef void (*microtask_t5)(int *, int *, void *, void *, void *, void *,
+                             void *);
+typedef void (*microtask_t6)(int *, int *, void *, void *, void *, void *,
+                             void *, void *);
+typedef void (*microtask_t7)(int *, int *, void *, void *, void *, void *,
+                             void *, void *, void *);
+typedef void (*microtask_t8)(int *, int *, void *, void *, void *, void *,
+                             void *, void *, void *, void *);
+typedef void (*microtask_t9)(int *, int *, void *, void *, void *, void *,
+                             void *, void *, void *, void *, void *);
+typedef void (*microtask_t10)(int *, int *, void *, void *, void *, void *,
+                              void *, void *, void *, void *, void *, void *);
+typedef void (*microtask_t11)(int *, int *, void *, void *, void *, void *,
+                              void *, void *, void *, void *, void *, void *,
+                              void *);
+typedef void (*microtask_t12)(int *, int *, void *, void *, void *, void *,
+                              void *, void *, void *, void *, void *, void *,
+                              void *, void *);
+typedef void (*microtask_t13)(int *, int *, void *, void *, void *, void *,
+                              void *, void *, void *, void *, void *, void *,
+                              void *, void *, void *);
+typedef void (*microtask_t14)(int *, int *, void *, void *, void *, void *,
+                              void *, void *, void *, void *, void *, void *,
+                              void *, void *, void *, void *);
+typedef void (*microtask_t15)(int *, int *, void *, void *, void *, void *,
+                              void *, void *, void *, void *, void *, void *,
+                              void *, void *, void *, void *, void *);
+
 // we really only need the case with 1 argument, because CLANG always build
 // a struct of pointers to shared variables referenced in the outlined function
 int __kmp_invoke_microtask(microtask_t pkfn, int gtid, int tid, int argc,
@@ -2629,66 +2666,76 @@ int __kmp_invoke_microtask(microtask_t pkfn, int gtid, int tid, int argc,
     fflush(stderr);
     exit(-1);
   case 0:
-    (*pkfn)(&gtid, &tid);
+    (*(microtask_t0)pkfn)(&gtid, &tid);
     break;
   case 1:
-    (*pkfn)(&gtid, &tid, p_argv[0]);
+    (*(microtask_t1)pkfn)(&gtid, &tid, p_argv[0]);
     break;
   case 2:
-    (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1]);
+    (*(microtask_t2)pkfn)(&gtid, &tid, p_argv[0], p_argv[1]);
     break;
   case 3:
-    (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2]);
+    (*(microtask_t3)pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2]);
     break;
   case 4:
-    (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3]);
+    (*(microtask_t4)pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2],
+                          p_argv[3]);
     break;
   case 5:
-    (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4]);
+    (*(microtask_t5)pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2],
+                          p_argv[3], p_argv[4]);
     break;
   case 6:
-    (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4],
-            p_argv[5]);
+    (*(microtask_t6)pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2],
+                          p_argv[3], p_argv[4], p_argv[5]);
     break;
   case 7:
-    (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4],
-            p_argv[5], p_argv[6]);
+    (*(microtask_t7)pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2],
+                          p_argv[3], p_argv[4], p_argv[5], p_argv[6]);
     break;
   case 8:
-    (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4],
-            p_argv[5], p_argv[6], p_argv[7]);
+    (*(microtask_t8)pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2],
+                          p_argv[3], p_argv[4], p_argv[5], p_argv[6],
+                          p_argv[7]);
     break;
   case 9:
-    (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4],
-            p_argv[5], p_argv[6], p_argv[7], p_argv[8]);
+    (*(microtask_t9)pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2],
+                          p_argv[3], p_argv[4], p_argv[5], p_argv[6], p_argv[7],
+                          p_argv[8]);
     break;
   case 10:
-    (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4],
-            p_argv[5], p_argv[6], p_argv[7], p_argv[8], p_argv[9]);
+    (*(microtask_t10)pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2],
+                           p_argv[3], p_argv[4], p_argv[5], p_argv[6],
+                           p_argv[7], p_argv[8], p_argv[9]);
     break;
   case 11:
-    (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4],
-            p_argv[5], p_argv[6], p_argv[7], p_argv[8], p_argv[9], p_argv[10]);
+    (*(microtask_t11)pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2],
+                           p_argv[3], p_argv[4], p_argv[5], p_argv[6],
+                           p_argv[7], p_argv[8], p_argv[9], p_argv[10]);
     break;
   case 12:
-    (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4],
-            p_argv[5], p_argv[6], p_argv[7], p_argv[8], p_argv[9], p_argv[10],
-            p_argv[11]);
+    (*(microtask_t12)pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2],
+                           p_argv[3], p_argv[4], p_argv[5], p_argv[6],
+                           p_argv[7], p_argv[8], p_argv[9], p_argv[10],
+                           p_argv[11]);
     break;
   case 13:
-    (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4],
-            p_argv[5], p_argv[6], p_argv[7], p_argv[8], p_argv[9], p_argv[10],
-            p_argv[11], p_argv[12]);
+    (*(microtask_t13)pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2],
+                           p_argv[3], p_argv[4], p_argv[5], p_argv[6],
+                           p_argv[7], p_argv[8], p_argv[9], p_argv[10],
+                           p_argv[11], p_argv[12]);
     break;
   case 14:
-    (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4],
-            p_argv[5], p_argv[6], p_argv[7], p_argv[8], p_argv[9], p_argv[10],
-            p_argv[11], p_argv[12], p_argv[13]);
+    (*(microtask_t14)pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2],
+                           p_argv[3], p_argv[4], p_argv[5], p_argv[6],
+                           p_argv[7], p_argv[8], p_argv[9], p_argv[10],
+                           p_argv[11], p_argv[12], p_argv[13]);
     break;
   case 15:
-    (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4],
-            p_argv[5], p_argv[6], p_argv[7], p_argv[8], p_argv[9], p_argv[10],
-            p_argv[11], p_argv[12], p_argv[13], p_argv[14]);
+    (*(microtask_t15)pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2],
+                           p_argv[3], p_argv[4], p_argv[5], p_argv[6],
+                           p_argv[7], p_argv[8], p_argv[9], p_argv[10],
+                           p_argv[11], p_argv[12], p_argv[13], p_argv[14]);
     break;
   }
 
diff --git a/openmp/runtime/test/lit.cfg b/openmp/runtime/test/lit.cfg
index 4a457f4cc41f75..a3456063c10fc6 100644
--- a/openmp/runtime/test/lit.cfg
+++ b/openmp/runtime/test/lit.cfg
@@ -129,7 +129,7 @@ if config.operating_system == 'NetBSD':
 if config.operating_system == 'Darwin':
     config.available_features.add("darwin")
 
-if config.operating_system in ['Linux', 'Windows']:
+if config.operating_system in ['Windows', 'Linux', 'FreeBSD', 'NetBSD', 'DragonFly']:
     config.available_features.add('affinity')
 
 if config.operating_system in ['Linux']:
diff --git a/polly/lib/CodeGen/IslNodeBuilder.cpp b/polly/lib/CodeGen/IslNodeBuilder.cpp
index a226cc2a1b2506..8b2207ecbf362c 100644
--- a/polly/lib/CodeGen/IslNodeBuilder.cpp
+++ b/polly/lib/CodeGen/IslNodeBuilder.cpp
@@ -1212,7 +1212,7 @@ bool IslNodeBuilder::preloadInvariantEquivClass(
   BasicBlock *EntryBB = &Builder.GetInsertBlock()->getParent()->getEntryBlock();
   auto *Alloca = new AllocaInst(AccInstTy, DL.getAllocaAddrSpace(),
                                 AccInst->getName() + ".preload.s2a",
-                                &*EntryBB->getFirstInsertionPt());
+                                EntryBB->getFirstInsertionPt());
   Builder.CreateStore(PreloadVal, Alloca);
   ValueMapT PreloadedPointer;
   PreloadedPointer[PreloadVal] = AccInst;
@@ -1308,10 +1308,11 @@ void IslNodeBuilder::allocateNewArrays(BBPair StartExitBlocks) {
       auto InstIt = Builder.GetInsertBlock()
                         ->getParent()
                         ->getEntryBlock()
-                        .getTerminator();
+                        .getTerminator()
+                        ->getIterator();
 
       auto *CreatedArray = new AllocaInst(NewArrayType, DL.getAllocaAddrSpace(),
-                                          SAI->getName(), &*InstIt);
+                                          SAI->getName(), InstIt);
       if (PollyTargetFirstLevelCacheLineSize)
         CreatedArray->setAlignment(Align(PollyTargetFirstLevelCacheLineSize));
       SAI->setBasePtr(CreatedArray);
diff --git a/polly/lib/CodeGen/LoopGenerators.cpp b/polly/lib/CodeGen/LoopGenerators.cpp
index 5c99515c544405..b4f8bb8948c282 100644
--- a/polly/lib/CodeGen/LoopGenerators.cpp
+++ b/polly/lib/CodeGen/LoopGenerators.cpp
@@ -225,7 +225,7 @@ ParallelLoopGenerator::storeValuesIntoStruct(SetVector<Value *> &Values) {
   // in the entry block of the function and use annotations to denote the actual
   // live span (similar to clang).
   BasicBlock &EntryBB = Builder.GetInsertBlock()->getParent()->getEntryBlock();
-  Instruction *IP = &*EntryBB.getFirstInsertionPt();
+  BasicBlock::iterator IP = EntryBB.getFirstInsertionPt();
   StructType *Ty = StructType::get(Builder.getContext(), Members);
   AllocaInst *Struct = new AllocaInst(Ty, DL.getAllocaAddrSpace(), nullptr,
                                       "polly.par.userContext", IP);
diff --git a/polly/lib/Support/ScopHelper.cpp b/polly/lib/Support/ScopHelper.cpp
index afdb61f25b6c12..24c7011b06de93 100644
--- a/polly/lib/Support/ScopHelper.cpp
+++ b/polly/lib/Support/ScopHelper.cpp
@@ -328,8 +328,9 @@ struct ScopExpander final : SCEVVisitor<ScopExpander, const SCEV *> {
     Value *LHS = expandCodeFor(LHSScev, E->getType(), IP);
     Value *RHS = expandCodeFor(RHSScev, E->getType(), IP);
 
-    Inst = BinaryOperator::Create((Instruction::BinaryOps)Inst->getOpcode(),
-                                  LHS, RHS, Inst->getName() + Name, IP);
+    Inst =
+        BinaryOperator::Create((Instruction::BinaryOps)Inst->getOpcode(), LHS,
+                               RHS, Inst->getName() + Name, IP->getIterator());
     return SE.getSCEV(Inst);
   }
 
diff --git a/utils/bazel/llvm-project-overlay/clang-tools-extra/clangd/BUILD.bazel b/utils/bazel/llvm-project-overlay/clang-tools-extra/clangd/BUILD.bazel
new file mode 100644
index 00000000000000..f2aa64b6399850
--- /dev/null
+++ b/utils/bazel/llvm-project-overlay/clang-tools-extra/clangd/BUILD.bazel
@@ -0,0 +1,71 @@
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+package(
+    default_visibility = ["//visibility:public"],
+    features = ["layering_check"],
+)
+
+licenses(["notice"])
+
+# TODO: this is a shim to provide Features.inc as needed by Feature.h.
+# Replace this with something that parses Features.inc.in.
+genrule(
+    name = "gen_features_inc",
+    outs = ["Features.inc"],
+    cmd = "\n".join([
+        "echo '// IWYU pragma: private, include \"Feature.h\"' >> $@",
+        "echo '#define CLANGD_BUILD_XPC 0' >> $@",
+        "echo '#define CLANGD_ENABLE_REMOTE 1' >> $@",
+        "echo '#define ENABLE_GRPC_REFLECTION 0' >> $@",
+        "echo '#define CLANGD_MALLOC_TRIM 0' >> $@",
+        "echo '#define CLANGD_TIDY_CHECKS 1' >> $@",
+        "echo '#define CLANGD_DECISION_FOREST 1' >> $@",
+    ]),
+)
+
+# TODO: Pick up other files for more complete functionality, to match
+# clangd/CMakeLists.txt. This might look something like
+# glob(["*.cpp", "dir/**/*.cpp", ...]).
+cc_library(
+    name = "ClangDaemon",
+    srcs = [
+        "Feature.cpp",
+        "Features.inc",
+        "JSONTransport.cpp",
+        "Protocol.cpp",
+        "URI.cpp",
+        "index/SymbolID.cpp",
+        "support/Cancellation.cpp",
+        "support/Context.cpp",
+        "support/Logger.cpp",
+        "support/MemoryTree.cpp",
+        "support/Shutdown.cpp",
+        "support/ThreadCrashReporter.cpp",
+        "support/Trace.cpp",
+    ],
+    hdrs = [
+        "Feature.h",
+        "LSPBinder.h",
+        "Protocol.h",
+        "Transport.h",
+        "URI.h",
+        "index/SymbolID.h",
+        "support/Cancellation.h",
+        "support/Context.h",
+        "support/Function.h",
+        "support/Logger.h",
+        "support/MemoryTree.h",
+        "support/Shutdown.h",
+        "support/ThreadCrashReporter.h",
+        "support/Trace.h",
+    ],
+    includes = ["."],
+    deps = [
+        "//clang:basic",
+        "//clang:index",
+        "//llvm:Support",
+        "//llvm:TargetParser",
+    ],
+)
diff --git a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel
index d6b124f9d8e4c9..865cafbf50c63e 100644
--- a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel
@@ -62,6 +62,7 @@ gentbl(
         "CrossTU",
         "Driver",
         "Frontend",
+        "InstallAPI",
         "Lex",
         "Parse",
         "Refactoring",
@@ -2068,6 +2069,7 @@ cc_library(
         ":frontend",
         ":support",
         "//llvm:Core",
+        "//llvm:Demangle",
         "//llvm:Support",
         "//llvm:TextAPI",
     ],
@@ -2405,6 +2407,7 @@ cc_binary(
     stamp = 0,
     deps = [
         ":basic",
+        ":diagnostic_defs_gen",
         ":frontend",
         ":lex",
         ":sema",
diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
index 073353a89c8907..59b0bbbda2f529 100644
--- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
@@ -62,7 +62,42 @@ config_setting(
     flag_values = {":mpfr": "system"},
 )
 
-############################## Support libraries #############################
+################################# Include Files ################################
+
+libc_support_library(
+    name = "llvm_libc_macros_math_macros",
+    hdrs = ["include/llvm-libc-macros/math-macros.h"],
+    deps = [":llvm_libc_macros_limits_macros"],
+)
+
+libc_support_library(
+    name = "llvm_libc_macros_limits_macros",
+    hdrs = ["include/llvm-libc-macros/limits-macros.h"],
+)
+
+libc_support_library(
+    name = "llvm_libc_macros_float_macros",
+    hdrs = ["include/llvm-libc-macros/float-macros.h"],
+)
+
+libc_support_library(
+    name = "llvm_libc_macros_stdint_macros",
+    hdrs = ["include/llvm-libc-macros/stdint-macros.h"],
+)
+
+libc_support_library(
+    name = "llvm_libc_macros_stdfix_macros",
+    hdrs = ["include/llvm-libc-macros/stdfix-macros.h"],
+    deps = [":llvm_libc_macros_float_macros"],
+)
+
+libc_support_library(
+    name = "llvm_libc_types_float128",
+    hdrs = ["include/llvm-libc-types/float128.h"],
+    deps = [":llvm_libc_macros_float_macros"],
+)
+
+############################### Support libraries ##############################
 
 libc_support_library(
     name = "__support_macros_properties_architectures",
@@ -151,10 +186,20 @@ libc_support_library(
     ],
 )
 
+libc_support_library(
+    name = "__support_cpp_iterator",
+    hdrs = ["src/__support/CPP/iterator.h"],
+    deps = [
+        ":__support_cpp_type_traits",
+        ":__support_macros_attributes",
+    ],
+)
+
 libc_support_library(
     name = "__support_cpp_array",
     hdrs = ["src/__support/CPP/array.h"],
     deps = [
+        ":__support_cpp_iterator",
         ":__support_macros_attributes",
     ],
 )
@@ -528,6 +573,7 @@ libc_support_library(
         ":__support_cpp_type_traits",
         ":__support_ctype_utils",
         ":__support_str_to_num_result",
+        ":__support_uint128",
         ":errno",
     ],
 )
@@ -670,6 +716,7 @@ libc_support_library(
         ":__support_macros_properties_architectures",
         ":__support_macros_sanitizer",
         ":errno",
+        ":llvm_libc_macros_math_macros",
     ],
 )
 
@@ -727,7 +774,10 @@ libc_support_library(
 libc_support_library(
     name = "__support_fputil_manipulation_functions",
     hdrs = ["src/__support/FPUtil/ManipulationFunctions.h"],
-    textual_hdrs = ["src/__support/FPUtil/x86_64/NextAfterLongDouble.h"],
+    textual_hdrs = [
+        "src/__support/FPUtil/x86_64/NextAfterLongDouble.h",
+        "src/__support/FPUtil/x86_64/NextUpDownLongDouble.h",
+    ],
     deps = [
         ":__support_common",
         ":__support_cpp_bit",
@@ -739,6 +789,7 @@ libc_support_library(
         ":__support_fputil_normal_float",
         ":__support_macros_optimization",
         ":__support_uint128",
+        ":llvm_libc_macros_math_macros",
     ],
 )
 
@@ -752,6 +803,7 @@ libc_support_library(
         ":__support_fputil_fp_bits",
         ":__support_fputil_rounding_mode",
         ":__support_macros_attributes",
+        ":llvm_libc_macros_math_macros",
     ],
 )
 
@@ -980,33 +1032,6 @@ libc_support_library(
     ],
 )
 
-libc_support_library(
-    name = "llvm_libc_macros_limits_macros",
-    hdrs = ["include/llvm-libc-macros/limits-macros.h"],
-)
-
-libc_support_library(
-    name = "llvm_libc_macros_float_macros",
-    hdrs = ["include/llvm-libc-macros/float-macros.h"],
-)
-
-libc_support_library(
-    name = "llvm_libc_macros_stdint_macros",
-    hdrs = ["include/llvm-libc-macros/stdint-macros.h"],
-)
-
-libc_support_library(
-    name = "llvm_libc_macros_stdfix_macros",
-    hdrs = ["include/llvm-libc-macros/stdfix-macros.h"],
-    deps = [":llvm_libc_macros_float_macros"],
-)
-
-libc_support_library(
-    name = "llvm_libc_types_float128",
-    hdrs = ["include/llvm-libc-types/float128.h"],
-    deps = [":llvm_libc_macros_float_macros"],
-)
-
 ############################### errno targets ################################
 
 libc_function(
@@ -1173,6 +1198,7 @@ libc_support_library(
         "__support_cpp_type_traits",
         ":__support_common",
         ":errno",
+        ":llvm_libc_macros_math_macros",
     ],
 )
 
@@ -1234,13 +1260,9 @@ libc_support_library(
     hdrs = ["src/math/generic/inv_trigf_utils.h"],
     deps = [
         ":__support_common",
-        ":__support_fputil_fenv_impl",
         ":__support_fputil_fma",
-        ":__support_fputil_fp_bits",
         ":__support_fputil_multiply_add",
-        ":__support_fputil_nearest_integer",
         ":__support_fputil_polyeval",
-        ":math_utils",
     ],
 )
 
diff --git a/utils/bazel/llvm-project-overlay/libc/test/UnitTest/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/test/UnitTest/BUILD.bazel
index 4a94916e1205e0..2a0c071f228683 100644
--- a/utils/bazel/llvm-project-overlay/libc/test/UnitTest/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/test/UnitTest/BUILD.bazel
@@ -84,6 +84,7 @@ libc_support_library(
         "//libc:__support_fputil_fp_bits",
         "//libc:__support_fputil_fpbits_str",
         "//libc:__support_fputil_rounding_mode",
+        "//libc:llvm_libc_macros_math_macros",
     ],
 )
 
diff --git a/utils/bazel/llvm-project-overlay/libc/test/src/__support/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/test/src/__support/BUILD.bazel
index 5434761a9b1392..4f976122967c4d 100644
--- a/utils/bazel/llvm-project-overlay/libc/test/src/__support/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/test/src/__support/BUILD.bazel
@@ -64,8 +64,8 @@ libc_test(
     name = "integer_to_string_test",
     srcs = ["integer_to_string_test.cpp"],
     deps = [
-        "//libc:__support_cpp_span",
         "//libc:__support_cpp_limits",
+        "//libc:__support_cpp_span",
         "//libc:__support_cpp_string_view",
         "//libc:__support_integer_literals",
         "//libc:__support_integer_to_string",
@@ -89,6 +89,7 @@ libc_test(
         "//libc:__support_cpp_optional",
         "//libc:__support_macros_properties_types",
         "//libc:__support_uint",
+        "//libc:llvm_libc_macros_math_macros",
     ],
 )
 
diff --git a/utils/bazel/llvm-project-overlay/libc/test/src/math/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/test/src/math/BUILD.bazel
index 63e18b83710918..15e367f0aca2dc 100644
--- a/utils/bazel/llvm-project-overlay/libc/test/src/math/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/test/src/math/BUILD.bazel
@@ -178,6 +178,7 @@ libc_support_library(
     deps = [
         "//libc:__support_fputil_basic_operations",
         "//libc:__support_fputil_fp_bits",
+        "//libc:llvm_libc_macros_math_macros",
         "//libc/test/UnitTest:LibcUnitTest",
         "//libc/test/UnitTest:fp_test_helpers",
         "//libc/utils/MPFRWrapper:mpfr_wrapper",
@@ -296,6 +297,7 @@ libc_support_library(
         "//libc:__support_cpp_limits",
         "//libc:__support_fputil_fp_bits",
         "//libc:__support_fputil_manipulation_functions",
+        "//libc:llvm_libc_macros_math_macros",
         "//libc/test/UnitTest:LibcUnitTest",
     ],
 )
@@ -322,6 +324,7 @@ libc_support_library(
         "//libc:__support_fputil_basic_operations",
         "//libc:__support_fputil_fenv_impl",
         "//libc:__support_fputil_fp_bits",
+        "//libc:llvm_libc_macros_math_macros",
         "//libc/test/UnitTest:LibcUnitTest",
         "//libc/test/UnitTest:fp_test_helpers",
     ],
@@ -349,6 +352,7 @@ libc_support_library(
         "//libc:__support_cpp_limits",
         "//libc:__support_fputil_fp_bits",
         "//libc:__support_fputil_normal_float",
+        "//libc:llvm_libc_macros_math_macros",
         "//libc/test/UnitTest:LibcUnitTest",
         "//libc/test/UnitTest:fp_test_helpers",
     ],
@@ -375,6 +379,7 @@ libc_support_library(
     deps = [
         "//libc:__support_fputil_fenv_impl",
         "//libc:__support_fputil_fp_bits",
+        "//libc:llvm_libc_macros_math_macros",
         "//libc/test/UnitTest:LibcUnitTest",
         "//libc/test/UnitTest:fp_test_helpers",
         "//libc/utils/MPFRWrapper:mpfr_wrapper",
@@ -411,6 +416,7 @@ libc_support_library(
     deps = [
         "//libc:__support_fputil_fenv_impl",
         "//libc:__support_fputil_fp_bits",
+        "//libc:llvm_libc_macros_math_macros",
         "//libc/test/UnitTest:LibcUnitTest",
         "//libc/test/UnitTest:fp_test_helpers",
         "//libc/utils/MPFRWrapper:mpfr_wrapper",
@@ -522,6 +528,7 @@ libc_support_library(
         "//libc:__support_cpp_type_traits",
         "//libc:__support_fputil_basic_operations",
         "//libc:__support_fputil_fp_bits",
+        "//libc:llvm_libc_macros_math_macros",
         "//libc/test/UnitTest:LibcUnitTest",
         "//libc/test/UnitTest:fp_test_helpers",
     ],
diff --git a/utils/bazel/llvm-project-overlay/libc/test/src/math/libc_math_test_rules.bzl b/utils/bazel/llvm-project-overlay/libc/test/src/math/libc_math_test_rules.bzl
index aba259ba6401a5..1a5868d242e80a 100644
--- a/utils/bazel/llvm-project-overlay/libc/test/src/math/libc_math_test_rules.bzl
+++ b/utils/bazel/llvm-project-overlay/libc/test/src/math/libc_math_test_rules.bzl
@@ -34,6 +34,7 @@ def math_test(name, hdrs = [], deps = [], **kwargs):
             "//libc:__support_math_extras",
             "//libc:__support_uint128",
             "//libc/test/UnitTest:fp_test_helpers",
+            "//libc:llvm_libc_macros_math_macros",
         ] + deps,
         **kwargs
     )
diff --git a/utils/bazel/llvm-project-overlay/llvm/unittests/BUILD.bazel b/utils/bazel/llvm-project-overlay/llvm/unittests/BUILD.bazel
index 99b924626c2afc..c8863af43c400d 100644
--- a/utils/bazel/llvm-project-overlay/llvm/unittests/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/llvm/unittests/BUILD.bazel
@@ -312,6 +312,7 @@ cc_test(
         "//llvm:AllTargetsAsmParsers",
         "//llvm:AllTargetsCodeGens",
         "//llvm:AsmParser",
+        "//llvm:BinaryFormat",
         "//llvm:Core",
         "//llvm:ExecutionEngine",
         "//llvm:IRReader",
diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
index cda28543f1e3ee..6b2a563c319fcd 100644
--- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
@@ -1936,17 +1936,14 @@ cc_library(
     includes = ["include"],
     deps = [
         ":ArithDialect",
-        ":ArmNeonIncGen",
         ":ArmNeonDialect",
+        ":DialectUtils",
         ":FuncDialect",
         ":IR",
         ":LLVMDialect",
-        ":SideEffectInterfaces",
         ":Support",
-        ":VectorDialect",
         ":Transforms",
-        "//llvm:Core",
-        "//llvm:Support",
+        ":VectorDialect",
     ],
 )
 
@@ -2852,6 +2849,7 @@ cc_library(
         ":ArithDialect",
         ":ArithUtils",
         ":BufferizationDialect",
+        ":BufferizationInterfaces",
         ":BufferizationTransforms",
         ":DestinationStyleOpInterface",
         ":DialectUtils",
@@ -3125,6 +3123,7 @@ cc_library(
     includes = ["include"],
     deps = [
         ":ArithDialect",
+        ":BufferizationInterfaces",
         ":DialectUtils",
         ":IR",
         ":InferTypeOpInterface",
@@ -3197,6 +3196,7 @@ cc_library(
         ":ArithDialect",
         ":ArithUtils",
         ":BufferizationDialect",
+        ":BufferizationInterfaces",
         ":BufferizationTransforms",
         ":ComplexDialect",
         ":DialectUtils",
@@ -3669,6 +3669,8 @@ td_library(
     deps = [
         ":BuiltinDialectTdFiles",
         ":OpBaseTdFiles",
+        ":ShapedOpInterfacesTdFiles",
+        ":ViewLikeInterfaceTdFiles",
     ],
 )
 
@@ -3752,7 +3754,10 @@ cc_library(
     hdrs = ["include/mlir/Dialect/XeGPU/IR/XeGPU.h"],
     includes = ["include"],
     deps = [
+        ":DialectUtils",
         ":IR",
+        ":ShapedOpInterfaces",
+        ":ViewLikeInterface",
         ":XeGPUIncGen",
         "//llvm:Core",
         "//llvm:Support",
@@ -3886,9 +3891,11 @@ cc_library(
         ":AffineMemoryOpInterfacesIncGen",
         ":AffineOpsIncGen",
         ":ArithDialect",
+        ":BufferizationInterfaces",
         ":ControlFlowInterfaces",
         ":DialectUtils",
         ":IR",
+        ":InliningUtils",
         ":LoopLikeInterface",
         ":MemRefDialect",
         ":ShapedOpInterfaces",
@@ -4290,6 +4297,7 @@ cc_library(
     deps = [
         ":ArithDialect",
         ":ArithUtils",
+        ":BufferizationInterfaces",
         ":ControlFlowDialect",
         ":ControlFlowInterfaces",
         ":DestinationStyleOpInterface",
@@ -4297,6 +4305,7 @@ cc_library(
         ":FunctionInterfaces",
         ":IR",
         ":InferTypeOpInterface",
+        ":InliningUtils",
         ":LoopLikeInterface",
         ":MemRefDialect",
         ":ParallelCombiningOpInterface",
@@ -4367,6 +4376,18 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "InliningUtils",
+    srcs = ["lib/Transforms/Utils/InliningUtils.cpp"],
+    hdrs = ["include/mlir/Transforms/InliningUtils.h"],
+    includes = ["include"],
+    deps = [
+        ":CallOpInterfaces",
+        ":IR",
+        "//llvm:Support",
+    ],
+)
+
 cc_library(
     name = "LoopLikeInterface",
     srcs = ["lib/Interfaces/LoopLikeInterface.cpp"],
@@ -4537,6 +4558,7 @@ cc_library(
     includes = ["include"],
     deps = [
         ":ArithDialect",
+        ":BufferizationInterfaces",
         ":CastInterfaces",
         ":ControlFlowInterfaces",
         ":Dialect",
@@ -4544,6 +4566,7 @@ cc_library(
         ":FunctionInterfaces",
         ":IR",
         ":InferTypeOpInterface",
+        ":InliningUtils",
         ":MLIRShapeCanonicalizationIncGen",
         ":ShapeOpsIncGen",
         ":SideEffectInterfaces",
@@ -4622,6 +4645,7 @@ cc_library(
     deps = [
         ":ArithDialect",
         ":BufferizationDialect",
+        ":BufferizationInterfaces",
         ":BufferizationTransforms",
         ":FuncDialect",
         ":IR",
@@ -4702,11 +4726,13 @@ cc_library(
     includes = ["include"],
     deps = [
         ":ArithDialect",
+        ":BufferizationInterfaces",
         ":CommonFolders",
         ":ControlFlowInterfaces",
         ":ControlFlowOpsIncGen",
         ":ConvertToLLVMInterface",
         ":IR",
+        ":InliningUtils",
         ":SideEffectInterfaces",
         ":Support",
         "//llvm:Support",
@@ -4724,6 +4750,7 @@ cc_library(
     includes = ["include"],
     deps = [
         ":BufferizationDialect",
+        ":BufferizationInterfaces",
         ":BufferizationTransforms",
         ":ControlFlowDialect",
         ":IR",
@@ -4743,10 +4770,11 @@ cc_library(
     hdrs = glob([
         "include/mlir/Dialect/Func/IR/*.h",
         "include/mlir/Dialect/Func/Utils/*.h",
-    ]) + ["include/mlir/Transforms/InliningUtils.h"],
+    ]),
     includes = ["include"],
     deps = [
         ":ArithDialect",
+        ":BufferizationInterfaces",
         ":CallOpInterfaces",
         ":CastInterfaces",
         ":CommonFolders",
@@ -4757,6 +4785,7 @@ cc_library(
         ":FunctionInterfaces",
         ":IR",
         ":InferTypeOpInterface",
+        ":InliningUtils",
         ":SideEffectInterfaces",
         ":Support",
         "//llvm:Support",
@@ -4773,6 +4802,7 @@ cc_library(
         ":FuncDialect",
         ":IR",
         ":InferTypeOpInterface",
+        ":InliningUtils",
         ":MeshShardingInterface",
     ],
 )
@@ -4923,16 +4953,19 @@ cc_library(
         ":AffineDialect",
         ":ArithDialect",
         ":ArithUtils",
+        ":BufferizationInterfaces",
         ":ControlFlowInterfaces",
         ":DataLayoutInterfaces",
         ":DestinationStyleOpInterface",
         ":DialectUtils",
         ":IR",
         ":InferTypeOpInterface",
+        ":InliningUtils",
         ":MaskableOpInterface",
         ":MaskingOpInterface",
         ":MemRefDialect",
         ":SideEffectInterfaces",
+        ":SubsetOpInterface",
         ":Support",
         ":TensorDialect",
         ":ValueBoundsOpInterface",
@@ -5035,6 +5068,7 @@ cc_library(
         ":ArithTransforms",
         ":ArithUtils",
         ":BufferizationDialect",
+        ":BufferizationInterfaces",
         ":BufferizationTransforms",
         ":DialectUtils",
         ":FuncDialect",
@@ -5288,6 +5322,14 @@ gentbl_cc_library(
             ["-gen-op-interface-defs"],
             "include/mlir/Dialect/LLVMIR/LLVMInterfaces.cpp.inc",
         ),
+        (
+            ["-gen-attr-interface-decls"],
+            "include/mlir/Dialect/LLVMIR/LLVMAttrInterfaces.h.inc",
+        ),
+        (
+            ["-gen-attr-interface-defs"],
+            "include/mlir/Dialect/LLVMIR/LLVMAttrInterfaces.cpp.inc",
+        ),
         (
             ["-gen-type-interface-decls"],
             "include/mlir/Dialect/LLVMIR/LLVMTypeInterfaces.h.inc",
@@ -5337,7 +5379,6 @@ cc_library(
             "include/mlir/Dialect/LLVMIR/*X86Vector*.h",
         ],
     ) + [
-        "include/mlir/Transforms/InliningUtils.h",
         "include/mlir/Transforms/Mem2Reg.h",
     ],
     includes = ["include"],
@@ -5348,6 +5389,7 @@ cc_library(
         ":FunctionInterfaces",
         ":IR",
         ":InferTypeOpInterface",
+        ":InliningUtils",
         ":LLVMDialectInterfaceIncGen",
         ":LLVMIntrinsicOpsIncGen",
         ":LLVMOpsIncGen",
@@ -5545,6 +5587,7 @@ cc_library(
     includes = ["include"],
     deps = [
         ":ArithDialect",
+        ":BufferizationInterfaces",
         ":ControlFlowInterfaces",
         ":DLTIDialect",
         ":FunctionInterfaces",
@@ -5554,6 +5597,7 @@ cc_library(
         ":IR",
         ":InferIntRangeInterface",
         ":InferTypeOpInterface",
+        ":InliningUtils",
         ":LLVMDialect",
         ":MemRefDialect",
         ":SCFDialect",
@@ -5642,6 +5686,7 @@ cc_library(
         ":AsmParser",
         ":AsyncDialect",
         ":BufferizationDialect",
+        ":BufferizationInterfaces",
         ":ControlFlowDialect",
         ":DLTIDialect",
         ":DialectUtils",
@@ -6888,7 +6933,7 @@ cc_library(
     srcs = glob([
         "lib/Dialect/SPIRV/IR/*.cpp",
         "lib/Dialect/SPIRV/IR/*.h",
-    ]) + ["include/mlir/Transforms/InliningUtils.h"],
+    ]),
     hdrs = glob([
         "include/mlir/Dialect/SPIRV/IR/*.h",
     ]),
@@ -6900,6 +6945,7 @@ cc_library(
         ":GPUDialect",
         ":IR",
         ":InferTypeOpInterface",
+        ":InliningUtils",
         ":Parser",
         ":SPIRVAttrUtilsGen",
         ":SPIRVAttributesIncGen",
@@ -7304,7 +7350,6 @@ gentbl_cc_library(
 cc_library(
     name = "TensorDialect",
     srcs = [
-        "include/mlir/Transforms/InliningUtils.h",
         "lib/Dialect/Tensor/IR/TensorDialect.cpp",
         "lib/Dialect/Tensor/IR/TensorOps.cpp",
         "lib/Dialect/Tensor/IR/ValueBoundsOpInterfaceImpl.cpp",
@@ -7318,6 +7363,7 @@ cc_library(
         ":AffineDialect",
         ":ArithDialect",
         ":ArithUtils",
+        ":BufferizationInterfaces",
         ":CastInterfaces",
         ":ComplexDialect",
         ":ControlFlowInterfaces",
@@ -7325,13 +7371,16 @@ cc_library(
         ":DialectUtils",
         ":IR",
         ":InferTypeOpInterface",
+        ":InliningUtils",
         ":LoopLikeInterface",
         ":ParallelCombiningOpInterface",
         ":ShapedOpInterfaces",
         ":SideEffectInterfaces",
+        ":SubsetOpInterface",
         ":Support",
         ":TensorOpsIncGen",
         ":TilingInterface",
+        ":TransformDialectInterfaces",
         ":ValueBoundsOpInterface",
         ":ViewLikeInterface",
         "//llvm:Support",
@@ -7425,6 +7474,7 @@ cc_library(
         ":ArithDialect",
         ":ArithUtils",
         ":BufferizationDialect",
+        ":BufferizationInterfaces",
         ":BufferizationTransforms",
         ":DialectUtils",
         ":FuncDialect",
@@ -7515,15 +7565,19 @@ cc_library(
 
 cc_library(
     name = "TransformUtils",
-    srcs = glob([
-        "lib/Transforms/Utils/*.cpp",
-        "lib/Transforms/Utils/*.h",
-    ]),
+    srcs = glob(
+        include = [
+            "lib/Transforms/Utils/*.cpp",
+            "lib/Transforms/Utils/*.h",
+        ],
+        exclude = ["lib/Transforms/Utils/InliningUtils.cpp"],
+    ),
     hdrs = glob(
-        [
-            "include/mlir/Transforms/*.h",
+        include = ["include/mlir/Transforms/*.h"],
+        exclude = [
+            "include/mlir/Transforms/InliningUtils.h",
+            "include/mlir/Transforms/Passes.h",
         ],
-        exclude = ["include/mlir/Transforms/Passes.h"],
     ),
     includes = ["include"],
     deps = [
@@ -7531,6 +7585,7 @@ cc_library(
         ":ControlFlowInterfaces",
         ":FunctionInterfaces",
         ":IR",
+        ":InliningUtils",
         ":LoopLikeInterface",
         ":MemorySlotInterfaces",
         ":Pass",
@@ -7860,6 +7915,7 @@ cc_library(
         ":ControlFlowInterfaces",
         ":FunctionInterfaces",
         ":IR",
+        ":InliningUtils",
         ":LoopLikeInterface",
         ":MemorySlotInterfaces",
         ":Pass",
@@ -10973,6 +11029,7 @@ cc_library(
         ":ArithUtils",
         ":AsmParser",
         ":BufferizationDialect",
+        ":BufferizationInterfaces",
         ":ComplexDialect",
         ":ControlFlowInterfaces",
         ":CopyOpInterface",
@@ -10982,6 +11039,7 @@ cc_library(
         ":FunctionInterfaces",
         ":IR",
         ":InferTypeOpInterface",
+        ":InliningUtils",
         ":LinalgEnumsIncGen",
         ":LinalgInterfacesIncGen",
         ":LinalgNamedStructuredOpsYamlIncGen",
@@ -10989,11 +11047,12 @@ cc_library(
         ":LinalgStructuredOpsIncGen",
         ":MathDialect",
         ":MemRefDialect",
+        ":MeshShardingInterface",
         ":Parser",
         ":SCFDialect",
-        ":MeshShardingInterface",
         ":SideEffectInterfaces",
         ":SparseTensorDialect",
+        ":SubsetOpInterface",
         ":Support",
         ":TensorDialect",
         ":TilingInterface",
@@ -11127,6 +11186,7 @@ cc_library(
         ":ArithTransforms",
         ":ArithUtils",
         ":BufferizationDialect",
+        ":BufferizationInterfaces",
         ":BufferizationTransforms",
         ":ComplexDialect",
         ":ControlFlowDialect",
@@ -11146,12 +11206,12 @@ cc_library(
         ":MemRefDialect",
         ":MemRefTransforms",
         ":MeshDialect",
+        ":MeshShardingInterface",
         ":MeshTransforms",
         ":Pass",
         ":SCFDialect",
         ":SCFTransforms",
         ":SCFUtils",
-        ":MeshShardingInterface",
         ":SparseTensorDialect",
         ":SubsetOpInterface",
         ":Support",
@@ -11728,6 +11788,7 @@ cc_library(
         ":FuncDialect",
         ":IR",
         ":InferTypeOpInterface",
+        ":InliningUtils",
         ":LoopLikeInterface",
         ":MeshDialect",
         ":MeshShardingInterface",
@@ -12027,6 +12088,26 @@ gentbl_cc_library(
     deps = [":TransformDialectTdFiles"],
 )
 
+cc_library(
+    name = "TransformDialectInterfaces",
+    # FIXME: Change this once https://github.com/llvm/llvm-project/pull/85221 lands
+    hdrs = [
+        "include/mlir/Dialect/Transform/IR/TransformInterfaces.h",
+        "include/mlir/Dialect/Transform/IR/TransformTypes.h",
+    ],
+    deps = [
+        ":CastInterfaces",
+        ":IR",
+        ":Rewrite",
+        ":Support",
+        ":TransformDialectInterfacesIncGen",
+        ":TransformDialectUtils",
+        ":TransformTypesIncGen",
+        ":Transforms",
+        "//llvm:Support",
+    ],
+)
+
 cc_library(
     name = "TransformDialect",
     srcs = glob(["lib/Dialect/Transform/IR/*.cpp"]),
@@ -12342,6 +12423,7 @@ cc_library(
         ":ConvertToLLVMInterface",
         ":IR",
         ":InferTypeOpInterface",
+        ":InliningUtils",
         ":SideEffectInterfaces",
         "//llvm:Support",
     ],
@@ -12587,13 +12669,15 @@ gentbl_cc_library(
 cc_library(
     name = "ArithDialect",
     srcs = [
+        "include/mlir/Interfaces/ValueBoundsOpInterface.h",
         "lib/Dialect/Arith/IR/ArithDialect.cpp",
         "lib/Dialect/Arith/IR/ArithOps.cpp",
         "lib/Dialect/Arith/IR/InferIntRangeInterfaceImpls.cpp",
-    ],
+    ] + glob([
+        "include/mlir/Analysis/**/*.h",
+    ]),
     hdrs = [
         "include/mlir/Dialect/Arith/IR/Arith.h",
-        "include/mlir/Transforms/InliningUtils.h",
     ],
     includes = ["include"],
     deps = [
@@ -12601,15 +12685,21 @@ cc_library(
         ":ArithCanonicalizationIncGen",
         ":ArithOpsIncGen",
         ":ArithOpsInterfacesIncGen",
+        ":BufferizationInterfaces",
         ":CastInterfaces",
         ":CommonFolders",
+        ":ControlFlowInterfaces",
         ":ConvertToLLVMInterface",
+        ":DestinationStyleOpInterface",
         ":IR",
         ":InferIntRangeCommon",
         ":InferIntRangeInterface",
         ":InferTypeOpInterface",
+        ":InliningUtils",
+        ":Pass",
         ":Support",
         ":UBDialect",
+        ":ValueBoundsOpInterfaceIncGen",
         ":VectorInterfaces",
         "//llvm:Support",
     ],
@@ -12647,6 +12737,7 @@ cc_library(
         ":ArithPassIncGen",
         ":ArithUtils",
         ":BufferizationDialect",
+        ":BufferizationInterfaces",
         ":BufferizationTransforms",
         ":FuncDialect",
         ":FuncTransforms",
@@ -12769,6 +12860,7 @@ cc_library(
         ":ConvertToLLVMInterface",
         ":IR",
         ":InferTypeOpInterface",
+        ":InliningUtils",
         ":MathBaseIncGen",
         ":MathOpsIncGen",
         ":SideEffectInterfaces",
@@ -12907,8 +12999,10 @@ cc_library(
     ],
     includes = ["include"],
     deps = [
+        ":AllocationOpInterface",
         ":ArithDialect",
         ":ArithUtils",
+        ":BufferizationInterfaces",
         ":CastInterfaces",
         ":ComplexDialect",
         ":ControlFlowInterfaces",
@@ -12917,9 +13011,11 @@ cc_library(
         ":DialectUtils",
         ":IR",
         ":InferTypeOpInterface",
+        ":InliningUtils",
         ":MemRefBaseIncGen",
         ":MemRefOpsIncGen",
         ":MemorySlotInterfaces",
+        ":RuntimeVerifiableOpInterface",
         ":ShapedOpInterfaces",
         ":Support",
         ":ValueBoundsOpInterface",
@@ -13170,11 +13266,13 @@ cc_library(
         ":ControlFlowInterfaces",
         ":FunctionInterfaces",
         ":IR",
+        ":InliningUtils",
         ":MLProgramAttributesIncGen",
         ":MLProgramOpsIncGen",
         ":MLProgramTypesIncGen",
         ":Pass",
         ":Support",
+        ":Transforms",
         "//llvm:Support",
     ],
 )
@@ -13190,6 +13288,7 @@ cc_library(
     includes = ["include"],
     deps = [
         ":BufferizationDialect",
+        ":BufferizationInterfaces",
         ":FuncDialect",
         ":IR",
         ":MLProgramDialect",
@@ -13473,6 +13572,7 @@ cc_library(
     deps = [
         ":BufferizationDialect",
         ":BufferizationEnumsIncGen",
+        ":BufferizationInterfaces",
         ":BufferizationTransformOpsIncGen",
         ":BufferizationTransforms",
         ":FunctionInterfaces",
@@ -13506,6 +13606,26 @@ gentbl_cc_library(
     ],
 )
 
+cc_library(
+    name = "BufferizationInterfaces",
+    srcs = [
+        "include/mlir/Analysis/Liveness.h",
+    ],
+    hdrs = [
+        "include/mlir/Dialect/Bufferization/IR/BufferDeallocationOpInterface.h",
+        "include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h",
+    ],
+    includes = ["include"],
+    deps = [
+        ":BufferDeallocationOpInterfaceIncGen",
+        ":BufferizableOpInterfaceIncGen",
+        ":BufferizationEnumsIncGen",
+        ":IR",
+        ":Support",
+        "//llvm:Support",
+    ],
+)
+
 cc_library(
     name = "BufferizationDialect",
     srcs = [
@@ -13516,8 +13636,6 @@ cc_library(
         "lib/Dialect/Bufferization/IR/UnstructuredControlFlow.cpp",
     ],
     hdrs = [
-        "include/mlir/Dialect/Bufferization/IR/BufferDeallocationOpInterface.h",
-        "include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h",
         "include/mlir/Dialect/Bufferization/IR/Bufferization.h",
         "include/mlir/Dialect/Bufferization/IR/DstBufferizableOpInterfaceImpl.h",
         "include/mlir/Dialect/Bufferization/IR/UnstructuredControlFlow.h",
@@ -13531,7 +13649,7 @@ cc_library(
         ":BufferDeallocationOpInterfaceIncGen",
         ":BufferizableOpInterfaceIncGen",
         ":BufferizationBaseIncGen",
-        ":BufferizationEnumsIncGen",
+        ":BufferizationInterfaces",
         ":BufferizationOpsIncGen",
         ":ControlFlowInterfaces",
         ":CopyOpInterface",
@@ -13540,6 +13658,7 @@ cc_library(
         ":FunctionInterfaces",
         ":IR",
         ":InferTypeOpInterface",
+        ":InliningUtils",
         ":MemRefDialect",
         ":SparseTensorDialect",
         ":SubsetOpInterface",
@@ -13580,7 +13699,7 @@ cc_library(
         ":Analysis",
         ":ArithDialect",
         ":BufferizationDialect",
-        ":BufferizationEnumsIncGen",
+        ":BufferizationInterfaces",
         ":BufferizationPassIncGen",
         ":ControlFlowDialect",
         ":ControlFlowInterfaces",
@@ -13632,6 +13751,7 @@ cc_library(
     includes = ["include"],
     deps = [
         ":BufferizationDialect",
+        ":BufferizationInterfaces",
         ":BufferizationToMemRef",
         ":BufferizationTransforms",
         ":FuncDialect",
@@ -13937,7 +14057,6 @@ gentbl_cc_library(
 cc_library(
     name = "UBDialect",
     srcs = [
-        "include/mlir/Transforms/InliningUtils.h",
         "lib/Dialect/UB/IR/UBOps.cpp",
     ],
     hdrs = ["include/mlir/Dialect/UB/IR/UBOps.h"],
@@ -13945,6 +14064,7 @@ cc_library(
     deps = [
         ":ConvertToLLVMInterface",
         ":IR",
+        ":InliningUtils",
         ":SideEffectInterfaces",
         ":UBOpsIncGen",
         ":UBOpsInterfacesIncGen",
diff --git a/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel
index 91706af935ac5e..ccfef3f243409d 100644
--- a/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel
@@ -399,6 +399,7 @@ cc_library(
         "//mlir:IR",
         "//mlir:InferIntRangeInterface",
         "//mlir:InferTypeOpInterface",
+        "//mlir:InliningUtils",
         "//mlir:LLVMDialect",
         "//mlir:LinalgDialect",
         "//mlir:LoopLikeInterface",
@@ -407,7 +408,6 @@ cc_library(
         "//mlir:SideEffectInterfaces",
         "//mlir:Support",
         "//mlir:TensorDialect",
-        "//mlir:TransformUtils",
         "//mlir:Transforms",
         "//mlir:ViewLikeInterface",
     ],
@@ -532,7 +532,7 @@ cc_library(
         "//mlir:PDLInterpDialect",
         "//mlir:Pass",
         "//mlir:Support",
-        "//mlir:TransformUtils",
+        "//mlir:Transforms",
     ],
 )
 
@@ -570,7 +570,7 @@ cc_library(
         "//mlir:SCFDialect",
         "//mlir:SPIRVDialect",
         "//mlir:Support",
-        "//mlir:TransformUtils",
+        "//mlir:Transforms",
     ],
 )
 
@@ -600,7 +600,7 @@ cc_library(
         "//mlir:Pass",
         "//mlir:SCFDialect",
         "//mlir:SCFTransforms",
-        "//mlir:TransformUtils",
+        "//mlir:Transforms",
     ],
 )
 
@@ -695,7 +695,6 @@ cc_library(
         "//mlir:SCFToControlFlow",
         "//mlir:SPIRVDialect",
         "//mlir:ToLLVMIRTranslation",
-        "//mlir:TransformUtils",
         "//mlir:Transforms",
         "//mlir:VectorDialect",
         "//mlir:VectorToLLVM",
@@ -728,7 +727,6 @@ cc_library(
         "//mlir:SCFTransforms",
         "//mlir:TensorDialect",
         "//mlir:TensorTransforms",
-        "//mlir:TransformUtils",
         "//mlir:Transforms",
         "//mlir:VectorDialect",
         "//mlir:VectorToSCF",
@@ -770,7 +768,7 @@ cc_library(
         "//mlir:MathTransforms",
         "//mlir:Pass",
         "//mlir:SCFDialect",
-        "//mlir:TransformUtils",
+        "//mlir:Transforms",
         "//mlir:VectorDialect",
         "//mlir:X86VectorDialect",
     ],
@@ -786,7 +784,7 @@ cc_library(
         "//mlir:IR",
         "//mlir:MathDialect",
         "//mlir:Pass",
-        "//mlir:TransformUtils",
+        "//mlir:Transforms",
         "//mlir:VCIXDialect",
         "//mlir:VectorDialect",
     ],
@@ -850,7 +848,7 @@ cc_library(
         "//mlir:Pass",
         "//mlir:SCFDialect",
         "//mlir:Support",
-        "//mlir:TransformUtils",
+        "//mlir:Transforms",
     ],
 )
 
@@ -869,7 +867,7 @@ cc_library(
         "//mlir:SCFDialect",
         "//mlir:SCFTransforms",
         "//mlir:SCFUtils",
-        "//mlir:TransformUtils",
+        "//mlir:Transforms",
     ],
 )
 
@@ -994,7 +992,7 @@ cc_library(
         "//mlir:FuncTransforms",
         "//mlir:IR",
         "//mlir:Pass",
-        "//mlir:TransformUtils",
+        "//mlir:Transforms",
     ],
 )
 
@@ -1033,7 +1031,7 @@ cc_library(
         "//mlir:SCFDialect",
         "//mlir:Support",
         "//mlir:TensorDialect",
-        "//mlir:TransformUtils",
+        "//mlir:Transforms",
         "//mlir:VectorDialect",
         "//mlir:VectorToSCF",
         "//mlir:VectorTransforms",
@@ -1119,6 +1117,6 @@ cc_library(
         "//mlir:Parser",
         "//mlir:Pass",
         "//mlir:Support",
-        "//mlir:TransformUtils",
+        "//mlir:Transforms",
     ],
 )
diff --git a/utils/bazel/llvm-project-overlay/mlir/unittests/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/unittests/BUILD.bazel
index a749a11096c372..252b9ec951f6bd 100644
--- a/utils/bazel/llvm-project-overlay/mlir/unittests/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/unittests/BUILD.bazel
@@ -61,8 +61,8 @@ cc_test(
     deps = [
         "//llvm:Support",
         "//llvm:TestingSupport",
-        "//mlir:BytecodeReader",
         "//mlir:ArithDialect",
+        "//mlir:BytecodeReader",
         "//mlir:ControlFlowInterfaces",
         "//mlir:DLTIDialect",
         "//mlir:DataLayoutInterfaces",